def similar_tweeters(user1, user2, num_tweets=NUM_TWEETS):
    """Score how similar the tweets of two users are.

    For each user, in order, the ``tweets`` of
    ``UserTweets(user, num_tweets)`` are run through ``tokenize_tweets``.
    The two token collections are then handed to
    ``calc_similarity_score`` and its result is returned unchanged.

    Args:
        user1: first user, passed straight to ``UserTweets``.
        user2: second user, passed straight to ``UserTweets``.
        num_tweets: second argument given to ``UserTweets`` for both users.
    """
    # The list is built left to right, so user1 is processed before user2.
    tokens_per_user = [
        tokenize_tweets(UserTweets(handle, num_tweets).tweets)
        for handle in (user1, user2)
    ]
    return calc_similarity_score(*tokens_per_user)
def similar_tweeters(user1, user2):
    """Print pairs of tweets from two users whose texts are similar.

    Every tweet of ``user1`` is compared with every tweet of ``user2``
    using the cosine similarity of their TF-IDF vectors. Pairs scoring
    above 0.5 are printed. Nothing is returned.

    Tweets are cleaned before comparison, and a pair is only compared
    when BOTH cleaned texts contain more than three words.

    Args:
        user1: value passed to ``UserTweets`` for the first user.
        user2: value passed to ``UserTweets`` for the second user.
    """
    userTweets1 = UserTweets(user1)
    userTweets2 = UserTweets(user2)

    nltk.download('punkt')  # if necessary...
    stemmer = nltk.stem.porter.PorterStemmer()
    remove_punctuation_map = dict(
        (ord(char), None) for char in string.punctuation)

    # Raw string: the pattern contains regex escapes (\w, \S, \/) that are
    # invalid escape sequences in a normal string literal.
    noise = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT|Python)")
    min_words = 3

    def stem_tokens(tokens):
        return [stemmer.stem(item) for item in tokens]

    def normalize(text):
        """Remove punctuation, lowercase, stem."""
        return stem_tokens(
            nltk.word_tokenize(
                text.lower().translate(remove_punctuation_map)))

    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

    def cosine_sim(text1, text2):
        tfidf = vectorizer.fit_transform([text1, text2])
        # .toarray() instead of the .A shorthand, which newer SciPy
        # releases no longer provide on sparse matrices.
        return (tfidf * tfidf.T).toarray()[0, 1]

    def clean(text):
        """Replace everything the pattern matches and squeeze whitespace."""
        return ' '.join(noise.sub(" ", text).split())

    def long_enough(cleaned):
        return len(cleaned.split()) > min_words

    # Clean the second user's tweets once instead of once per tweet of the
    # first user, and drop the ones that are too short up front.
    candidates2 = [
        (tweet.text, cleaned)
        for tweet in userTweets2._tweets
        for cleaned in [clean(tweet.text)]
        if long_enough(cleaned)
    ]

    for tweet1 in userTweets1._tweets:
        clean_text1 = clean(tweet1.text)
        # The original test `len(a) and len(b) > 3` only checked that the
        # first text was non-empty; both texts must exceed the minimum.
        if not long_enough(clean_text1):
            continue
        for text2, clean_text2 in candidates2:
            try:
                score = cosine_sim(clean_text1, clean_text2)
            except ValueError as err:
                print(err)
                print(f'Before {tweet1.text}, after {clean_text1}')
                print(f'Before {text2}, after {clean_text2}')
                continue
            if score > 0.5:
                print(
                    f'Score was {score}\n'
                    f'UserText1 is {clean_text1}\n'
                    f'UserText2 is {clean_text2}'
                )
def similar_tweeters(user1, user2):
    """Tokenize both users' tweets and hand them to ``__find_similarities``.

    Each entry of a ``UserTweets`` object's ``_tweets`` is indexed with
    ``[2]`` and that field is tokenized (presumably the tweet text --
    confirm against ``UserTweets``). The tokenizer strips handles and
    shortens repeated characters. Nothing is returned.

    Args:
        user1: value passed to ``UserTweets`` for the first user.
        user2: value passed to ``UserTweets`` for the second user.
    """
    first_timeline = UserTweets(user1)
    second_timeline = UserTweets(user2)
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

    first_tokens = [
        tokenizer.tokenize(entry[2]) for entry in first_timeline._tweets
    ]
    second_tokens = [
        tokenizer.tokenize(entry[2]) for entry in second_timeline._tweets
    ]

    # NOTE: no filtering is applied to the token lists; an earlier filter
    # pass (``__filter_crap``) was disabled in this function.
    __find_similarities(first_tokens, second_tokens)
def get(self):
    """Render the page that matches the visitor's sign-in state.

    * Not signed in: ``loginpage.html`` with a login link.
    * Signed in but no ``MyUser`` entity stored yet: create and store the
      entity, then render ``signup.html`` with a logout link.
    * Otherwise: ``twitterhome.html`` with the user's stored tweets.
    """
    self.response.headers['Content-Type'] = 'text/html'

    user = users.get_current_user()
    if user is None:
        template_values = {
            'login_url': users.create_login_url(self.request.url)
        }
        template = JINJA_ENVIRONMENT.get_template('loginpage.html')
        self.response.write(template.render(template_values))
        return

    myuser = ndb.Key('MyUser', user.user_id()).get()
    if myuser is None:
        # First visit of this account: store an empty profile so the
        # sign-up form has an entity to fill in.
        myuser = MyUser(id=user.user_id())
        myuser.put()
        template_values = {
            'logout_url': users.create_logout_url(self.request.url)
        }
        template = JINJA_ENVIRONMENT.get_template('signup.html')
        self.response.write(template.render(template_values))
        return

    # `user` and `myuser` are already loaded above; the second lookup of
    # both, the unused `name` read and the unused query over all
    # UserTweets entities were dropped.
    template_values = {
        'logout_url': users.create_logout_url(self.request.url),
        'usertweets': myuser.usertweets,
        'user': user,
        'myuser': myuser
    }
    template = JINJA_ENVIRONMENT.get_template('twitterhome.html')
    self.response.write(template.render(template_values))
import csv
import datetime
import unittest

from usertweets import UserTweets
from usertweets import NUM_TWEETS

# Expected values for the first tweet returned when fetching up to MAX_ID.
DT = datetime.datetime(2017, 4, 15, 18, 34, 35)
HANDLE = 'pybites'
MAX_ID = '853315632087085057'
TWEETS = (
    """RT @dbader_org: ""You're flying! How?"" ""Python!"" 😊 https://t.co/MYA4YIn9nF https://t.co/4vqM97UHxs""",
    """How to create a nice-looking HTML page of your #Kindle book highlights (notes) https://t.co/HKFK7inhUa #python""",
)
# Built once at import time and shared by every test in this module.
USER = UserTweets(HANDLE, max_id=MAX_ID)
output_file = './data/test.csv'


def read_csv():
    """Return the data rows (header skipped) of ``USER.output_file``."""
    with open(USER.output_file) as csv_handle:
        reader = csv.reader(csv_handle)
        _header = next(reader, None)  # skip the headers
        return list(reader)


class TestUserTweets(unittest.TestCase):
    """Checks on the module-level ``USER`` object."""

    def test_num_tweets(self):
        """``USER`` holds exactly ``NUM_TWEETS`` tweets."""
        self.assertEqual(len(USER), NUM_TWEETS)

    def test_first_tweet_returned_by_api(self):
        """The first tweet has the expected id and creation time."""
        first_tweet = USER[0]
        self.assertEqual(first_tweet.id_str, MAX_ID)
        self.assertEqual(first_tweet.created_at, DT)
def setUp(self):
    """Create ``self.user`` while ``tweepy.API.user_timeline`` is mocked.

    The mocked method returns ``TWEETS`` for the duration of the
    ``UserTweets`` construction; the patch is removed again before the
    test body runs.
    """
    super().setUp()
    timeline_patch = patch('tweepy.API.user_timeline', return_value=TWEETS)
    with timeline_patch:
        self.user = UserTweets(HANDLE, max_id=MAX_ID)
def similar_tweeters(user1, user2):
    """Build a bag-of-words corpus from the parsed tweets of two users.

    The text of every tweet yielded by ``UserTweets`` is run through
    ``tweet_parser``. The two resulting lists are treated as two
    documents: a ``corpora.Dictionary`` is built from them and each
    document is converted with ``doc2bow``.

    Args:
        user1: value passed to ``UserTweets`` for the first user.
        user2: value passed to ``UserTweets`` for the second user.
    """
    tweets1 = [tweet_parser(t.text) for t in UserTweets(user1)]
    tweets2 = [tweet_parser(t.text) for t in UserTweets(user2)]

    # The leftover `ipdb.set_trace()` breakpoint was removed here: it
    # stopped every call in the debugger.
    documents = [tweets1, tweets2]
    dictionary = corpora.Dictionary(documents)
    # NOTE(review): `corpus` is computed but neither used nor returned by
    # this function as written -- confirm what the caller expects.
    corpus = [dictionary.doc2bow(text) for text in documents]
def post(self):
    """Handle the form submissions of the home and sign-up pages.

    The submitted ``button`` field selects the action:

    * ``SignUp`` -- store name, date of birth (``YYYY-MM-DD``) and bio on
      the ``MyUser`` entity of the signed-in user, then redirect to ``/``.
    * ``Search`` -- look up users whose name equals the search text and
      tweets that share a word with it, then render ``twitterhome.html``.
      An empty search text redirects to ``/``.
    * ``Tweet`` -- store a new tweet for the signed-in user and redirect.
    * ``Delete tweet`` -- remove the tweet at the submitted list index
      from the user's tweets and redirect.

    Any other value of ``button`` does nothing.

    Raises:
        ValueError: if the SignUp date of birth is not ``YYYY-MM-DD``.
    """
    self.response.headers['Content-Type'] = 'text/html'
    action = self.request.get('button')
    user = users.get_current_user()

    if action == 'SignUp':
        name = self.request.get('name')
        dateofbirth = self.request.get('dateofbirth')
        bio = self.request.get('bio')
        # The entity is written from scratch, so the previous reads of
        # the stored MyUser / UserTweets entities (whose results were
        # never used) were dropped.
        myuser = MyUser(
            id=user.user_id(),
            name=name,
            dateofbirth=datetime.strptime(dateofbirth, '%Y-%m-%d').date(),
            bio=bio,
            followers=[],
            following=[])
        myuser.put()
        self.redirect('/')

    elif action == 'Search':
        result = self.request.get('Search')
        if result == '':
            self.redirect('/')
            return

        name_search = MyUser.query(MyUser.name == result).fetch()
        search_tweet = UserTweets.query().fetch()
        if not (name_search or search_tweet):
            # NOTE(review): as before, nothing is written to the response
            # when there are no users and no tweets at all.
            return

        # Collect every stored tweet text first; `search_tweet` is only
        # extended after this loop has finished iterating over it.
        texts = []
        for tweet in search_tweet:
            texts.extend(tweet.tweets)

        # A tweet text matches when any of its space-separated words is a
        # substring of the search text. Matches are appended to the
        # fetched entities, exactly as the template received them before.
        for text in texts:
            if any(word in result for word in text.split(" ")):
                search_tweet.append(text)

        template_values = {
            'search_tweet': search_tweet,
            'name_search': name_search
        }
        template = JINJA_ENVIRONMENT.get_template('twitterhome.html')
        self.response.write(template.render(template_values))

    elif action == 'Tweet':
        tweets = self.request.get('tweets')
        myuser = ndb.Key('MyUser', user.user_id()).get()
        new_tweet = UserTweets(name=myuser.name, tweets=tweets)
        myuser.usertweets.append(new_tweet)
        myuser.put()
        new_tweet.put()
        self.redirect('/')

    elif action == 'Delete tweet':
        myuser = ndb.Key('MyUser', user.user_id()).get()
        try:
            index = int(self.request.get('index'))
            del myuser.usertweets[index]
        except (ValueError, IndexError):
            # Missing, non-numeric or stale index (e.g. a double submit):
            # there is nothing to delete, so just return to the home page
            # instead of failing the request.
            pass
        else:
            myuser.put()
        self.redirect('/')