class Test_Data(unittest.TestCase): def setUp(self): self.tweets_data_path = 'test/sample.json' self.db = './test.sqlite' self.feels_db = TweetData(self.db) def tearDown(self): os.remove(self.db) def test_file_creation(self): self.assertTrue(os.path.exists(self.db)) def test_fields(self): f = self.feels_db.fields self.assertTrue(isinstance(f, tuple)) self.assertTrue(len(f) >= 11) def test_scrub(self): data = {'a': 1, 'b': 2} scrubbed = self.feels_db.scrub(data) self.assertTrue(isinstance(scrubbed, str)) def test_data_operation(self): twt = { 'created_at': 'Sun Feb 19 19:14:18 +0000 2017', 'id_str': '833394296418082817', 'text': 'All the feels!' } t = Tweet(twt) self.assertEqual(len(t.keys()), 3) self.feels_db.insert_tweet(t) df = self.feels_db.queue self.assertEqual(len(df), 1) df.sentiment = 0.9 for row in df.itertuples(): self.feels_db.update_tweet({ 'id_str': row.id_str, 'sentiment': row.sentiment }) self.assertEqual(len(self.feels_db.queue), 0) self.assertEqual(len(self.feels_db.all), 1)
class Test_Data(unittest.TestCase): def setUp(self): self.tweets_data_path = 'test/sample.json' self.db = './test.sqlite' self.feels_db = TweetData(self.db) self.tweets = [ { 'created_at': 'Sun Feb 19 19:14:18 +0000 2017', 'id_str': '833394296418082817', 'text': 'Tweetfeels is tremendous! Believe me. I know.', 'user': { 'followers_count': '100', 'friends_count': '200', 'location': None } }, # sentiment value = 0 { 'created_at': 'Sun Feb 20 19:14:19 +0000 2017', 'id_str': '833394296418082818', 'text': 'Fake news. Sad!', 'user': { 'followers_count': '200', 'friends_count': '200', 'location': None } }, # sentiment value = -0.7351 { 'created_at': 'Sun Feb 21 19:14:20 +0000 2017', 'id_str': '833394296418082819', 'text': 'I hate it.', 'user': { 'followers_count': '200', 'friends_count': '200', 'location': None } } # sentiment value = -0.5719 ] self.mock_tweets = [Tweet(t) for t in self.tweets] def tearDown(self): os.remove(self.db) def test_file_creation(self): self.assertTrue(os.path.exists(self.db)) def test_fields(self): f = self.feels_db.fields self.assertTrue(isinstance(f, tuple)) self.assertTrue(len(f) >= 11) def test_start(self): self.assertTrue(isinstance(self.feels_db.start, datetime)) def test_dates(self): for t in self.mock_tweets: self.feels_db.insert_tweet(t) self.assertEqual(len(self.feels_db.tweet_dates), 3) tweets = [] with open(self.tweets_data_path) as tweets_file: lines = filter(None, (line.rstrip() for line in tweets_file)) for line in lines: try: tweets.append(Tweet(json.loads(line))) except KeyError: pass for t in tweets: self.feels_db.insert_tweet(t) self.assertEqual(len(self.feels_db.tweet_dates), 105) df = self.feels_db.tweet_dates timebox = timedelta(seconds=60) second = timedelta(seconds=1) df = df.groupby(pd.TimeGrouper(freq=f'{int(timebox/second)}S')).size() df = df[df != 0] print(df) self.assertEqual(len(df), 3) self.assertEqual(df.iloc[0], 103) def test_fetch(self): tweets = [] with open(self.tweets_data_path) as tweets_file: lines = 
filter(None, (line.rstrip() for line in tweets_file)) for line in lines: try: tweets.append(Tweet(json.loads(line))) except KeyError: pass for t in tweets: self.feels_db.insert_tweet(t) for t in self.mock_tweets: self.feels_db.insert_tweet(t) it = self.feels_db.fetchbin(binsize=timedelta(minutes=30)) cur = next(it) self.assertEqual(cur.end - cur.start, timedelta(minutes=30)) self.assertEqual(len(cur), 103) cur = next(it) self.assertEqual(len(cur), 1) cur = next(it) self.assertEqual(len(cur), 1) def test_empty(self): for t in self.mock_tweets: self.feels_db.insert_tweet(t) it = self.feels_db.fetchbin(binsize=timedelta(hours=12), empty=True) cur = next(it) self.assertEqual(len(cur), 1) cur = next(it) self.assertEqual(len(cur), 0) cur = next(it) self.assertEqual(len(cur), 1) cur = next(it) cur = next(it) self.assertEqual(len(cur), 1) def test_bin(self): for t in self.mock_tweets: self.feels_db.insert_tweet(t) it = self.feels_db.fetchbin(binsize=timedelta(hours=12), empty=True) cur = next(it) self.assertEqual(cur.influence, 300) cur = next(it) self.assertEqual(cur.influence, 0) cur = next(it) self.assertEqual(cur.influence, 400) cur = next(it) cur = next(it) self.assertEqual(cur.influence, 400) def test_data_operation(self): twt = { 'created_at': 'Sun Feb 19 19:14:18 +0000 2017', 'id_str': '833394296418082817', 'text': 'All the feels!' } t = Tweet(twt) self.assertEqual(len(t.keys()), 7) self.feels_db.insert_tweet(t) b = self.feels_db.tweets_since(datetime.now()) self.assertEqual(len(b), 0) b = self.feels_db.tweets_since(0) self.assertEqual(len(b), 1) b.df.sentiment = 0.9 for row in b.df.itertuples(): self.feels_db.update_tweet({ 'id_str': row.id_str, 'sentiment': row.sentiment }) start = datetime(2017, 2, 17, 0, 0, 0) before = datetime(2017, 2, 18, 0, 0, 0) after = datetime(2017, 2, 20, 0, 0, 0) b = self.feels_db.tweets_between(start, before) self.assertEqual(len(b), 0) b = self.feels_db.tweets_between(start, after) self.assertEqual(len(b), 1)
class TweetFeels(object):
    """
    The controller.

    :param credentials: A list of your 4 credential components.
    :param tracking: A list of keywords to track. Defaults to an empty list.
    :param db: A sqlite database to store data. Will be created if it doesn't
        already exist. Will append if it exists.
    :ivar calc_every_n: Won't calculate new sentiment until there are more
        than n records in the queue.
    :ivar lang: A list of languages to include in tweet gathering.
    """

    # Ordering of the ``filter_level`` values, least to most restrictive.
    _FILTER_RANK = {'none': 0, 'low': 1, 'medium': 2}

    def __init__(self, credentials, tracking=None, db='feels.sqlite'):
        self._listener = TweetListener(self.on_data, self.on_error)
        self._feels = TweetData(db)
        _auth = OAuthHandler(credentials[0], credentials[1])
        _auth.set_access_token(credentials[2], credentials[3])
        self._stream = Stream(_auth, self._listener)
        # A ``[]`` default would be one list shared by every instance.
        self.tracking = [] if tracking is None else tracking
        self.lang = ['en']
        self._sentiment = 0
        self._filter_level = 'low'
        self.calc_every_n = 10
        self._analyzer = None  # created on first use by _intensity

    def _stream_async_keyword(self):
        """
        Return the keyword ``Stream.filter`` uses to request a background
        thread.

        tweepy renamed ``async`` to ``is_async`` once ``async`` became a
        reserved word in Python 3.7, so the name is read from the installed
        signature rather than hard-coded.
        """
        import inspect

        params = inspect.signature(self._stream.filter).parameters
        return 'is_async' if 'is_async' in params else 'async'

    def start(self, seconds=None):
        """
        Start streaming tweets that match ``self.tracking``.

        Prints a message and returns without connecting (and without arming
        the timer) when there is nothing to track.

        :param seconds: If given, disconnect automatically after this many
            seconds. The wait happens on a separate thread.
        """
        def delayed_stop():
            time.sleep(seconds)
            print('Timer completed. Disconnecting now...')
            self.stop()

        if not self.tracking:
            print('Nothing to track!')
            return

        # ``filter_level`` is deliberately not passed: tweepy 3.5.0 mishandles
        # it (fixed upstream in https://github.com/tweepy/tweepy/pull/783), so
        # the filtering is done in ``on_data`` instead.
        self._stream.filter(
            track=self.tracking, languages=self.lang,
            **{self._stream_async_keyword(): True}
        )
        if seconds is not None:
            Thread(target=delayed_stop).start()

    def stop(self):
        """Disconnect the stream."""
        self._stream.disconnect()

    def on_data(self, data):
        """
        Store ``data`` if its ``filter_level`` is at least ``_filter_level``.

        Note: Due to upstream bug in tweepy for python3, it cannot handle the
        `filter_level` parameter in the `Stream.filter` function. Therefore,
        we'll take care of it here. The problem has been identified and fixed
        by the tweepy team here: https://github.com/tweepy/tweepy/pull/783

        :param data: A mapping with a ``filter_level`` key.
        """
        rank = self._FILTER_RANK
        if rank[data['filter_level']] >= rank[self._filter_level]:
            self._feels.insert_tweet(data)

    def on_error(self, status):
        """Ignore stream errors; ``status`` is not inspected."""
        pass

    def _intensity(self, tweet):
        """Return the ``compound`` polarity score of the cleaned tweet text."""
        # Build the analyzer once and reuse it instead of constructing a new
        # one for every tweet.
        if self._analyzer is None:
            self._analyzer = SentimentIntensityAnalyzer()
        return self._analyzer.polarity_scores(clean(tweet))['compound']

    @property
    def sentiment(self):
        """
        The running sentiment value.

        When the queue holds more than ``calc_every_n`` tweets, each one is
        scored and written back to the database. The non-zero scores are
        averaged per ``created_at`` value, weighted by ``followers_count``,
        and blended into the running value in timestamp order with weights
        0.99 (old) and 0.01 (new). Otherwise the previous value is returned
        unchanged.
        """
        df = self._feels.queue
        if len(df) <= self.calc_every_n:
            return self._sentiment

        df['sentiment'] = df.text.apply(self._intensity)
        for row in df.itertuples():
            self._feels.update_tweet(
                {'id_str': row.id_str, 'sentiment': row.sentiment}
            )

        scored = df.loc[df.sentiment != 0]  # drop rows having 0 sentiment
        if scored.empty:
            # Nothing to aggregate; keep the running value as it is.
            return self._sentiment

        # Select only the columns the aggregation reads so the grouping
        # column is never handed to the lambda.
        by_timestamp = (
            scored.groupby('created_at')[['sentiment', 'followers_count']]
            .apply(
                lambda x: np.average(x.sentiment, weights=x.followers_count)
            )
            .sort_index()
        )
        # ``Series.items`` replaces ``iteritems``, removed in pandas 2.0.
        for _, value in by_timestamp.items():
            self._sentiment = self._sentiment * 0.99 + value * 0.01
        return self._sentiment