def test_dump_to_csv_dumps_with_top_level(self):
    """dump_to_csv with top_level=True must produce a non-empty CSV file.

    Fixes two issues in the original: the output path was rebuilt four
    times (mixing abspath and realpath), and the trailing cleanup never
    ran when the assertion failed, leaving data/output.csv behind for
    later test runs.
    """
    output_path = os.path.dirname(
        os.path.abspath(__file__)) + '/data/output.csv'
    # start from a clean slate in case a previous run left the file behind
    if os.path.exists(output_path):
        os.remove(output_path)
    field_list = [
        'id_str', 'coordinates.coordinates.0', 'coordinates.coordinates.1',
        'user.id_str', 'user.lang', 'lang', 'text', 'user.screen_name',
        'user.location', 'user.description', 'created_at',
        'user.friends_count', 'user.followers_count', 'retweet_count',
        'entities.urls.0.expanded_url', 'entities.urls.1.expanded_url',
        'entities.urls.2.expanded_url', 'entities.urls.3.expanded_url',
        'entities.urls.4.expanded_url', 'entities.hashtags.0.text',
        'entities.hashtags.1.text'
    ]
    collection = CsvCollection(
        os.path.dirname(os.path.realpath(__file__)) + '/' +
        config['csv']['valid'])
    try:
        collection.dump_to_csv(output_path, field_list, top_level=True)
        self.assertTrue(os.path.getsize(output_path) > 0)
    finally:
        # remove the dump even when the assertion fails
        if os.path.exists(output_path):
            os.remove(output_path)
def test_dump_to_sqlite_db_dumps_the_right_stuff_with_top_level(self):
    """dump_to_sqlite_db with top_level=True writes the expected first row.

    Fixes resource handling in the original: the sqlite connection was
    not closed and the output DB was not removed when an assertion
    failed; both are now guaranteed via try/finally.
    """
    output_path = os.path.dirname(
        os.path.abspath(__file__)) + '/data/output.db'
    # start from a clean slate in case a previous run left the file behind
    if os.path.exists(output_path):
        os.remove(output_path)
    field_list = [
        'id_str', 'entities.hashtags.0', 'entities.hashtags.1', 'source',
        'user.id', 'timestamp.$date', 'text', 'coordinates.coordinates.0',
        'coordinates.coordinates.1', 'user.id_str', 'user.lang', 'lang',
        'user.screen_name', 'user.location', 'user.description',
        'created_at', 'user.friends_count', 'user.followers_count',
        'retweet_count', 'entities.urls.0.expanded_url',
        'entities.urls.1.expanded_url', 'entities.urls.2.expanded_url',
        'entities.urls.3.expanded_url', 'entities.urls.4.expanded_url',
        'entities.hashtags.0.text', 'entities.hashtags.1.text'
    ]
    collection = CsvCollection(
        os.path.dirname(os.path.realpath(__file__)) + '/' +
        config['csv']['valid'])
    try:
        collection.dump_to_sqlite_db(output_path, field_list, top_level=True)
        con = sqlite3.connect(output_path)
        try:
            cur = con.cursor()
            # flatten the single fetched row into a list of column values
            row = [
                elem for db_row in cur.execute("SELECT * FROM data LIMIT 1;")
                for elem in db_row
            ]
        finally:
            con.close()
        self.assertTrue(len(row) > 0)
        self.assertEqual(
            set(row),
            set([
                '661275583813431296',
                '{"indices": [74, 83], "text": "jadehelm"}',
                '{"indices": [84, 98], "text": "newworldorder"}',
                '<a href="https://twitter.com/Col_Connaughton" rel="nofollow">Colin\'s Autotweeterpro5.3</a>',
                '379851447', '1446495359000',
                'Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat',
                'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL',
                'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL',
                'NULL', 'NULL', 'NULL', 'NULL', 'NULL'
            ]))
    finally:
        # remove the dump even when an assertion fails
        if os.path.exists(output_path):
            os.remove(output_path)
def test_strip_tweets_keeps_fields(self):
    """strip_tweets should retain the requested fields on every tweet."""
    parser = TweetParser()
    valid_path = os.path.dirname(
        os.path.realpath(__file__)) + '/' + config['csv']['valid']
    collection = CsvCollection(valid_path)
    self.maxDiff = None
    iterator = collection.strip_tweets(
        ['source', 'text', 'id_str']).get_iterator()

    def has_all_fields(tweets, wanted):
        # every tweet must expose each wanted (flattened) key
        for tweet in tweets:
            flat_keys = [k for k, _ in parser.flatten_dict(tweet)]
            if any(field not in flat_keys for field in wanted):
                return False
        return True

    self.assertTrue(
        has_all_fields(iterator, [['source'], ['text'], ['id_str']]))
def test_json_collection_custom_filter_filters(self):
    """Retweet and non-retweet counts must partition the full collection."""
    source_path = os.path.dirname(
        os.path.realpath(__file__)) + '/' + config['csv']['valid']

    first = CsvCollection(source_path)
    total = len(list(first.get_iterator()))

    def is_tweet_a_retweet(tweet):
        return bool('retweeted' in tweet and tweet['retweeted'])

    retweets = len(
        list(first.set_custom_filter(is_tweet_a_retweet).get_iterator()))

    second = CsvCollection(source_path)

    def is_not_a_retweet(tweet):
        return not ('retweeted' in tweet and tweet['retweeted'])

    non_retweets = len(
        list(second.set_custom_filter(is_not_a_retweet).get_iterator()))

    # the number of retweets and non-retweets should add up to the whole collection
    self.assertEqual(retweets + non_retweets, total)
def test_strip_tweets_keeps_fields(self):
    """Stripped tweets must still contain source, text and id_str."""
    parser = TweetParser()
    collection = CsvCollection(
        os.path.dirname(os.path.realpath(__file__)) + '/' +
        config['csv']['valid'])
    self.maxDiff = None
    stripped = collection.strip_tweets(
        ['source', 'text', 'id_str']).get_iterator()

    def keeps_wanted_fields(iterator, wanted_fields):
        # fail fast as soon as one tweet is missing a wanted flattened key
        for tweet in iterator:
            present = [key for key, _value in parser.flatten_dict(tweet)]
            missing = [field for field in wanted_fields
                       if field not in present]
            if missing:
                return False
        return True

    self.assertTrue(
        keeps_wanted_fields(stripped, [['source'], ['text'], ['id_str']]))
def test_json_collection_custom_filter_filters(self):
    """Custom retweet / non-retweet filters split the collection exactly."""
    valid_path = os.path.dirname(
        os.path.realpath(__file__)) + '/' + config['csv']['valid']

    collection_a = CsvCollection(valid_path)
    full_count = len(list(collection_a.get_iterator()))

    def is_tweet_a_retweet(tweet):
        if 'retweeted' in tweet and tweet['retweeted']:
            return True
        return False

    retweet_count = sum(
        1 for _ in collection_a.set_custom_filter(
            is_tweet_a_retweet).get_iterator())

    collection_b = CsvCollection(valid_path)

    def is_not_a_retweet(tweet):
        if 'retweeted' in tweet and tweet['retweeted']:
            return False
        return True

    non_retweet_count = sum(
        1 for _ in collection_b.set_custom_filter(
            is_not_a_retweet).get_iterator())

    # retweets plus non-retweets should account for every tweet
    self.assertEqual(retweet_count + non_retweet_count, full_count)
def test_iterator_returns_tweets(self):
    """The iterator over the valid CSV fixture must yield at least one tweet."""
    valid_csv = os.path.dirname(
        os.path.realpath(__file__)) + '/' + config['csv']['valid']
    tweets = list(CsvCollection(valid_csv).get_iterator())
    self.assertTrue(len(tweets) > 0)
def test_dump_to_csv_dumps_right_stuff_with_top_level(self):
    """dump_to_csv with top_level=True writes the expected column values.

    Fixes a dead-assertion bug in the original: `everything_in_order`
    was reset to True at the top of every column iteration, clobbering
    the False recorded for hashtag mismatches, and the final check was
    the no-op `assertTrue(True)` — so columns 1 and 2 could never fail.
    All seven checked columns now use direct assertEqual.

    NOTE(review): the hashtag expectations ('jadehelm',
    'newworldorder') were previously unenforced; if they fail now, the
    dumped hashtag format should be reviewed against these values.
    Cleanup is also guaranteed via try/finally.
    """
    output_path = os.path.dirname(
        os.path.abspath(__file__)) + '/data/output.csv'
    # start from a clean slate in case a previous run left the file behind
    if os.path.exists(output_path):
        os.remove(output_path)
    field_list = [
        'id_str', 'entities.hashtags.0', 'entities.hashtags.1', 'source',
        'user.id', 'timestamp.$date', 'text', 'coordinates.coordinates.0',
        'coordinates.coordinates.1', 'user.id_str', 'user.lang', 'lang',
        'user.screen_name', 'user.location', 'user.description',
        'created_at', 'user.friends_count', 'user.followers_count',
        'retweet_count', 'entities.urls.0.expanded_url',
        'entities.urls.1.expanded_url', 'entities.urls.2.expanded_url',
        'entities.urls.3.expanded_url', 'entities.urls.4.expanded_url',
        'entities.hashtags.0.text', 'entities.hashtags.1.text'
    ]
    collection = CsvCollection(
        os.path.dirname(os.path.realpath(__file__)) + '/' +
        config['csv']['valid'])
    # expected values for the first seven columns of every data row
    expected_columns = [
        '661275583813431296',
        'jadehelm',
        'newworldorder',
        '<a href="https://twitter.com/Col_Connaughton" rel="nofollow">Colin\'s Autotweeterpro5.3</a>',
        '379851447',
        '1446495359000',
        'Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat',
    ]
    try:
        collection.dump_to_csv(output_path, field_list, top_level=True)
        with open(output_path, 'r') as filehandle:
            reader = csv.reader(filehandle)
            next(reader)  # skip the header row
            for line in reader:
                # compare only the columns the original test checked
                for col, value in enumerate(line[:len(expected_columns)]):
                    self.assertEqual(value, expected_columns[col])
    finally:
        # remove the dump even when an assertion fails
        if os.path.exists(output_path):
            os.remove(output_path)
def test_iterator_returns_tweets(self):
    """get_iterator on the valid CSV collection yields a non-empty sequence."""
    base_dir = os.path.dirname(os.path.realpath(__file__))
    collection = CsvCollection(base_dir + '/' + config['csv']['valid'])
    tweet_count = sum(1 for _tweet in collection.get_iterator())
    self.assertTrue(tweet_count > 0)