示例#1
0
def similar_tweeters(user1, user2, num_tweets=NUM_TWEETS):
    """Return the similarity score computed for two users' tweets.

    For each user, in order, a ``UserTweets(user, num_tweets)`` object is
    built and its ``tweets`` attribute is run through ``tokenize_tweets``.
    The two token collections are then handed to ``calc_similarity_score``,
    whose result is returned unchanged.

    Steps covered (all done):
        - [x] Retrieve last n tweets for each user
        - [x] Tokenize tweet words and filter for stop words, URLs, digits,
              punctuation, etc.
        - [x] Extract main subjects each user tweets about
        - [x] Compare subjects and calculate a similarity score
    """
    # user1 is processed before user2, exactly as the two sequential
    # statements of the straightforward version would do.
    token_sets = [
        tokenize_tweets(UserTweets(handle, num_tweets).tweets)
        for handle in (user1, user2)
    ]
    return calc_similarity_score(*token_sets)
def similar_tweeters(user1, user2):
    """Print pairs of tweets from two users whose cleaned text is similar.

    Every tweet's text is cleaned by replacing @mentions, characters other
    than ASCII letters/digits/space/tab, URLs, and the literal words
    ``RT``/``Python`` with spaces and collapsing whitespace. Each cross-user
    pair in which BOTH cleaned texts have more than three words is scored
    with a TF-IDF cosine similarity; pairs scoring above 0.5 are printed.

    Args:
        user1: Passed to ``UserTweets`` to load the first user's tweets.
        user2: Passed to ``UserTweets`` to load the second user's tweets.

    Returns:
        None. All results are written to stdout.
    """
    user_tweets1 = UserTweets(user1)
    user_tweets2 = UserTweets(user2)

    nltk.download('punkt')  # if necessary...

    stemmer = nltk.stem.porter.PorterStemmer()
    # Translation table mapping every punctuation code point to None (delete).
    remove_punctuation_map = str.maketrans('', '', string.punctuation)

    def normalize(text):
        """Remove punctuation, lowercase, tokenize and stem *text*."""
        words = nltk.word_tokenize(
            text.lower().translate(remove_punctuation_map))
        return [stemmer.stem(word) for word in words]

    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

    def cosine_sim(text1, text2):
        """Return the TF-IDF cosine similarity of the two texts."""
        tfidf = vectorizer.fit_transform([text1, text2])
        # Off-diagonal entry of the 2x2 product is the cross-text score.
        return ((tfidf * tfidf.T).A)[0, 1]

    # Raw string: same pattern as before, without invalid "\w"-style escapes
    # in a normal string literal. Compiled once instead of per tweet pair.
    noise = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT|Python)")

    def prepare(tweets):
        """Pair raw and cleaned text, keeping tweets with > 3 cleaned words."""
        pairs = []
        for tweet in tweets:
            cleaned = ' '.join(noise.sub(" ", tweet.text).split())
            # Both sides must pass this length filter; the previous
            # `len(a) and len(b) > 3` only applied "> 3" to the second text.
            if len(cleaned.split()) > 3:
                pairs.append((tweet.text, cleaned))
        return pairs

    # Clean each tweet once up front rather than once per pair.
    candidates1 = prepare(user_tweets1._tweets)
    candidates2 = prepare(user_tweets2._tweets)

    for raw1, clean_text1 in candidates1:
        for raw2, clean_text2 in candidates2:
            try:
                score = cosine_sim(clean_text1, clean_text2)
            except ValueError as err:
                # Report the pair that could not be vectorized and move on.
                print(err)
                print(f'Before {raw1}, after {clean_text1}')
                print(f'Before {raw2}, after {clean_text2}')
                continue

            if score > 0.5:
                print(
                    f'Score was {score}\nUserText1 is {clean_text1}\nUserText2 is {clean_text2}'
                )
示例#3
0
def similar_tweeters(user1, user2):
    """Tokenize both users' tweets and pass them to ``__find_similarities``.

    Loads a ``UserTweets`` object per user, tokenizes element ``[2]`` of
    every entry in its ``_tweets`` with a ``TweetTokenizer`` (handles
    stripped, repeated characters reduced), and calls
    ``__find_similarities`` with the two lists of token lists.

    Returns:
        None; the result of ``__find_similarities`` is not returned.
    """
    first_user = UserTweets(user1)
    second_user = UserTweets(user2)
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

    # Index 2 of each stored tweet is what gets tokenized -- presumably the
    # tweet text; confirm against UserTweets.
    first_tokens = [
        tokenizer.tokenize(entry[2]) for entry in first_user._tweets
    ]
    second_tokens = [
        tokenizer.tokenize(entry[2]) for entry in second_user._tweets
    ]

    # A filtering pass through __filter_crap used to be sketched here but is
    # disabled; the unfiltered tokens are compared directly.
    __find_similarities(first_tokens, second_tokens)
示例#4
0
    def get(self):
        """Render the login, sign-up, or home page for the current visitor.

        - No signed-in user: renders ``loginpage.html`` with a login URL.
        - Signed-in user with no stored ``MyUser`` entity: creates and stores
          one keyed by the user id, then renders ``signup.html``.
        - Otherwise: renders ``twitterhome.html`` with the user's tweets,
          the user object, the ``MyUser`` entity and a logout URL.
        """
        self.response.headers['Content-Type'] = 'text/html'

        user = users.get_current_user()

        # `is None` rather than `== None`: identity is the intended check and
        # does not go through any custom __eq__.
        if user is None:
            template_values = {
                'login_url': users.create_login_url(self.request.url)
            }

            template = JINJA_ENVIRONMENT.get_template('loginpage.html')
            self.response.write(template.render(template_values))
            return

        myuser = ndb.Key('MyUser', user.user_id()).get()
        if myuser is None:
            # First visit: create a bare entity so later requests find it.
            myuser = MyUser(id=user.user_id())
            myuser.put()

            template_values = {
                'logout_url': users.create_logout_url(self.request.url)
            }

            template = JINJA_ENVIRONMENT.get_template('signup.html')
            self.response.write(template.render(template_values))
            return

        # `user` and `myuser` fetched above are reused here; the unused
        # UserTweets query and unused `name` lookup were dropped.
        template_values = {
            'logout_url': users.create_logout_url(self.request.url),
            'usertweets': myuser.usertweets,
            'user': user,
            'myuser': myuser
        }

        template = JINJA_ENVIRONMENT.get_template('twitterhome.html')
        self.response.write(template.render(template_values))
示例#5
0
import csv
import datetime
import unittest

from usertweets import UserTweets
from usertweets import NUM_TWEETS

# Expected ``created_at`` of the first item of USER (checked in
# TestUserTweets.test_first_tweet_returned_by_api).
DT = datetime.datetime(2017, 4, 15, 18, 34, 35)
# First positional argument given to UserTweets -- presumably the Twitter
# handle to load; confirm against the usertweets module.
HANDLE = 'pybites'
# Passed to UserTweets as ``max_id``; the tests expect the first item's
# ``id_str`` to equal it.
MAX_ID = '853315632087085057'
# Two sample tweet texts. The doubled quotes are literal characters of the
# first string (presumably CSV-style escaping carried over -- confirm).
TWEETS = (
    """RT @dbader_org: ""You're flying! How?"" ""Python!"" 😊  https://t.co/MYA4YIn9nF https://t.co/4vqM97UHxs""",
    """How to create a nice-looking HTML page of your #Kindle book highlights (notes) https://t.co/HKFK7inhUa #python""",
)
# Shared fixture, constructed once when this module is imported.
USER = UserTweets(HANDLE, max_id=MAX_ID)
# NOTE(review): not referenced by the code visible here -- read_csv() reads
# USER.output_file instead. Confirm whether this is still needed.
output_file = './data/test.csv'

def read_csv():
    """Return the data rows of the CSV file at ``USER.output_file``.

    The first row is treated as a header and skipped.

    Returns:
        A list with one list of column strings per remaining row; empty
        when the file has no rows after the header (or no rows at all).
    """
    # newline='' hands line-ending handling to the csv module, which is
    # required for newlines embedded in quoted fields to be read correctly.
    with open(USER.output_file, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the headers
        return list(reader)

class TestUserTweets(unittest.TestCase):
    """Checks against the module-level ``USER`` fixture."""

    def test_num_tweets(self):
        # The fixture must hold exactly NUM_TWEETS items.
        tweet_count = len(USER)
        self.assertEqual(tweet_count, NUM_TWEETS)

    def test_first_tweet_returned_by_api(self):
        # The first item must be the tweet identified by MAX_ID, created
        # at the timestamp DT.
        first_tweet = USER[0]
        self.assertEqual(first_tweet.id_str, MAX_ID)
        self.assertEqual(first_tweet.created_at, DT)
示例#6
0
 def setUp(self):
     """Create ``self.user`` while ``tweepy.API.user_timeline`` is patched.

     The patch makes the timeline call return ``TWEETS`` and is only active
     for the duration of the ``UserTweets`` construction.
     """
     super().setUp()
     # Configure the replacement's return value directly in the patch call.
     with patch('tweepy.API.user_timeline', return_value=TWEETS):
         self.user = UserTweets(HANDLE, max_id=MAX_ID)
示例#7
0
def similar_tweeters(user1, user2):
    """Build a dictionary and bag-of-words corpus from two users' tweets.

    Each user's tweets are loaded by iterating ``UserTweets(user)`` and
    passing every tweet's ``text`` through ``tweet_parser``. The two
    resulting lists are treated as two documents: a ``corpora.Dictionary``
    is built over them and each is converted with ``doc2bow``.

    NOTE(review): the visible code computes ``corpus`` but neither returns
    nor uses it, so the function currently returns None -- the similarity
    step appears to be missing; confirm against the full source.
    """
    tweets1 = [tweet_parser(t.text) for t in UserTweets(user1)]
    tweets2 = [tweet_parser(t.text) for t in UserTweets(user2)]
    # The leftover ipdb.set_trace() breakpoint was removed: it halted every
    # call waiting for an interactive debugger.
    documents = [tweets1, tweets2]
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(text) for text in documents]
示例#8
0
    def post(self):
        """Handle form submissions, dispatching on the ``button`` field.

        Supported values of ``button``:
            - ``SignUp``: store a ``MyUser`` built from the ``name``,
              ``dateofbirth`` (``YYYY-MM-DD``) and ``bio`` fields, then
              redirect to ``/``.
            - ``Search``: redirect to ``/`` when the ``Search`` field is
              empty; otherwise render ``twitterhome.html`` with the users
              whose name equals the query and with the stored tweets plus
              every tweet text that matched the query.
            - ``Tweet``: create a ``UserTweets`` entity from the ``tweets``
              field, append it to the current user's ``usertweets``, store
              both, then redirect to ``/``.
            - ``Delete tweet``: delete ``usertweets[index]`` for the current
              user, store the user, then redirect to ``/``.

        A signed-out visitor is redirected to ``/``. Any other ``button``
        value writes nothing beyond the Content-Type header.
        """
        self.response.headers['Content-Type'] = 'text/html'
        action = self.request.get('button')
        user = users.get_current_user()

        if user is None:
            # Every action below needs user.user_id(); without a signed-in
            # user that raised AttributeError. Send the visitor to "/".
            self.redirect('/')
            return

        if action == 'SignUp':
            name = self.request.get('name')
            dateofbirth = self.request.get('dateofbirth')
            bio = self.request.get('bio')

            # The entity is always rebuilt from the form, so the earlier
            # datastore reads whose results were discarded are gone.
            myuser = MyUser(id=user.user_id(),
                            name=name,
                            dateofbirth=datetime.strptime(
                                dateofbirth, '%Y-%m-%d').date(),
                            bio=bio,
                            followers=[],
                            following=[])
            myuser.put()
            self.redirect('/')
        elif action == 'Search':
            result = self.request.get('Search')

            if result == '':
                self.redirect('/')
                return

            name_search = MyUser.query(MyUser.name == result).fetch()
            search_tweet = UserTweets.query().fetch()

            # NOTE(review): when both lists are empty nothing is written to
            # the response -- confirm whether an empty page is intended.
            if name_search or search_tweet:
                # Snapshot all tweet texts first; search_tweet itself is
                # extended with the matches below.
                tweet_texts = []
                for entity in search_tweet:
                    tweet_texts.extend(entity.tweets)

                for text in tweet_texts:
                    # NOTE(review): this is a substring test of each word
                    # against the query (so an empty word always matches),
                    # not a test of the query against the tweet -- confirm
                    # the intended direction.
                    if any(word in result for word in text.split(" ")):
                        search_tweet.append(text)

                template_values = {
                    'search_tweet': search_tweet,
                    'name_search': name_search
                }
                template = JINJA_ENVIRONMENT.get_template(
                    'twitterhome.html')
                self.response.write(template.render(template_values))
        elif action == 'Tweet':
            tweets = self.request.get('tweets')

            myuser = ndb.Key('MyUser', user.user_id()).get()
            new_tweet = UserTweets(name=myuser.name, tweets=tweets)
            myuser.usertweets.append(new_tweet)
            myuser.put()
            new_tweet.put()
            self.redirect('/')
        elif action == 'Delete tweet':
            # int() raises ValueError when ``index`` is missing or not an
            # integer, and the del raises IndexError when out of range.
            index = int(self.request.get('index'))
            myuser = ndb.Key('MyUser', user.user_id()).get()
            del myuser.usertweets[index]
            myuser.put()
            self.redirect('/')