def test_mongo_collection_custom_filter_filters(self):
    """Check that a custom filter and its negation partition a limited collection.

    Counts the first 10 tweets unfiltered, then with a "retweeted" filter and
    with the opposite filter, and asserts the two filtered counts add up to
    the unfiltered count.
    """
    mongo_cfg = config['mongo']

    def count(iterator):
        # Exhaust the iterator and report how many items it produced.
        return len(list(iterator))

    def is_tweet_a_retweet(tweet):
        return bool('retweeted' in tweet and tweet['retweeted'])

    def is_not_a_retweet(tweet):
        return not ('retweeted' in tweet and tweet['retweeted'])

    collectionone = MongoCollection(
        mongo_cfg['host'],
        mongo_cfg['port'],
        mongo_cfg['user'],
        mongo_cfg['password'],
        mongo_cfg['database'],
        mongo_cfg['collection']
    )
    full_collection_len = count(collectionone.set_limit(10).get_iterator())

    retweet_iterator = collectionone.set_limit(10).set_custom_filter(is_tweet_a_retweet).get_iterator()
    num_retweets = count(retweet_iterator)

    collectiontwo = MongoCollection(
        mongo_cfg['host'],
        mongo_cfg['port'],
        mongo_cfg['user'],
        mongo_cfg['password'],
        mongo_cfg['database'],
        mongo_cfg['collection']
    )
    non_retweet_iterator = collectiontwo.set_limit(10).set_custom_filter(is_not_a_retweet).get_iterator()
    num_non_retweets = count(non_retweet_iterator)

    # Retweets plus non-retweets must account for every tweet that was read.
    self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
def test_pass_in_mongo(self):
    """Check that a MongoCollection built around a pre-made client yields tweets.

    A pymongo client is created from the configured host/port and handed to
    MongoCollection through ``passed_mongo``; the test passes when a
    10-item-limited iteration returns at least one item.
    """
    mongo_cfg = config['mongo']
    client = pymongo.MongoClient(mongo_cfg['host'], int(mongo_cfg['port']))
    collection = MongoCollection(
        mongo_cfg['user'],
        mongo_cfg['password'],
        mongo_cfg['database'],
        mongo_cfg['collection'],
        passed_mongo=client
    )
    tweets = list(collection.set_limit(10).get_iterator())
    self.assertTrue(len(tweets) > 0)
def test_iterator_returns_tweets(self):
    """Check that iterating a 10-item-limited MongoCollection yields at least one item."""
    mongo_cfg = config['mongo']
    # Arguments sit inside parentheses, so no line-continuation backslashes are needed.
    collection = MongoCollection(
        mongo_cfg['host'],
        mongo_cfg['port'],
        mongo_cfg['user'],
        mongo_cfg['password'],
        mongo_cfg['database'],
        mongo_cfg['collection']
    )
    tweets = list(collection.set_limit(10).get_iterator())
    self.assertTrue(len(tweets) > 0)
def test_pass_in_mongo(self):
    """Verify that MongoCollection accepts an existing client via ``passed_mongo``.

    The client is built from the configured host and (integer) port; the
    assertion holds when limiting to 10 items still produces a non-empty list.
    """
    settings = config['mongo']
    existing_client = pymongo.MongoClient(settings['host'], int(settings['port']))
    collection = MongoCollection(
        settings['user'],
        settings['password'],
        settings['database'],
        settings['collection'],
        passed_mongo=existing_client
    )
    limited_iterator = collection.set_limit(10).get_iterator()
    fetched = list(limited_iterator)
    self.assertTrue(len(fetched) > 0)
def test_strip_tweets_keeps_fields(self):
    """Check that ``strip_tweets`` leaves the requested fields on every tweet.

    Each returned tweet is flattened with ``TweetParser.flatten_dict`` and the
    resulting keys are compared against the expected key paths.
    """
    tweet_parser = TweetParser()
    mongo_cfg = config['mongo']
    collection = MongoCollection(
        mongo_cfg['host'],
        mongo_cfg['port'],
        mongo_cfg['user'],
        mongo_cfg['password'],
        mongo_cfg['database'],
        mongo_cfg['collection']
    )
    self.maxDiff = None

    requested_fields = ['id', 'entities.user_mentions', 'user.profile_image_url_https']
    stripped_iterator = collection.set_limit(10).strip_tweets(requested_fields).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        # Fail on the first tweet that lacks any of the expected key paths.
        for tweet in iterator:
            present = [key for key, _ in tweet_parser.flatten_dict(tweet)]
            if any(field not in present for field in fields):
                return False
        return True

    expected_key_paths = [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]
    self.assertTrue(tweets_have_right_keys(stripped_iterator, expected_key_paths))
def query_dump_database(hostname, port, dbname, username, password, authdb, authusername, authpassword, output_path):
    """Dump every collection of ``dbname`` to a JSON file under ``output_path``/``dbname``.

    Connects to ``hostname``:``port`` and, when both ``username`` and
    ``password`` are truthy, authenticates against ``authdb`` with
    ``authusername``/``authpassword``. Returns early, after printing and
    logging a message, when the database has no collections or when the dump
    folder already exists. Otherwise creates the folder and writes one
    ``<collection>.json`` file per collection via ``MongoCollection.dump_to_json``.
    """
    # Connect to the database server.
    client = pymongo.MongoClient(hostname, int(port))
    if username and password:
        # NOTE(review): the guard tests username/password but the call uses
        # authusername/authpassword -- confirm this is intended.
        client[authdb].authenticate(authusername, authpassword)
    database = client[dbname]

    # Order collection names by length, alphabetically within equal lengths.
    db_collection_names = database.collection_names()
    db_collection_names.sort(key=lambda name: (len(name), name))

    if not db_collection_names:
        empty_message = "Database for {} is empty".format(dbname)
        print(empty_message)
        logging.info(empty_message)
        return

    # Build the dump folder path, adding a separator only when one is missing.
    output_separator = '' if output_path[-1:] == '/' else '/'
    dump_folder_path = output_path + output_separator + dbname

    if os.path.exists(dump_folder_path):
        exists_message = "Dump folder for {} already exists".format(dbname)
        print(exists_message)
        logging.info(exists_message)
        return
    os.makedirs(dump_folder_path)

    print("Dumping database {}".format(dbname))
    logging.info("Dumping {} database".format(dbname))

    # Dump each collection to its own JSON file with MongoCollection.dump_to_json.
    folder_separator = '' if dump_folder_path[-1:] == '/' else '/'
    for collection_name in db_collection_names:
        collection_dump_path = dump_folder_path + folder_separator + collection_name + '.json'
        collection_to_dump = MongoCollection(hostname, port, username, password, dbname, collection_name)
        collection_to_dump.dump_to_json(collection_dump_path)
def test_strip_tweets_keeps_fields(self):
    """Verify that tweets returned after ``strip_tweets`` still carry the kept fields.

    The flattened keys of every tweet (from ``TweetParser.flatten_dict``) must
    contain each expected key path; the check stops at the first miss.
    """
    tweet_parser = TweetParser()
    settings = config['mongo']
    collection = MongoCollection(settings['host'], settings['port'],
                                 settings['user'], settings['password'],
                                 settings['database'], settings['collection'])
    self.maxDiff = None

    limited = collection.set_limit(10)
    stripped = limited.strip_tweets(
        ['id', 'entities.user_mentions', 'user.profile_image_url_https'])
    it = stripped.get_iterator()

    def tweets_have_right_keys(iterator, fields):
        for tweet in iterator:
            flattened_keys = [pair[0] for pair in
                              ((key, value) for key, value in tweet_parser.flatten_dict(tweet))]
            missing = [wanted for wanted in fields if wanted not in flattened_keys]
            if missing:
                return False
        return True

    all_present = tweets_have_right_keys(
        it,
        [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']])
    self.assertTrue(all_present)
def query_dump_database(hostname, port, dbname, username, password, authdb, authusername, authpassword, output_path):
    """Write each collection of database ``dbname`` to ``<output_path>/<dbname>/<collection>.json``.

    A pymongo client is opened on ``hostname``:``port``. If ``username`` and
    ``password`` are both truthy, ``authdb`` is authenticated with
    ``authusername``/``authpassword``. Nothing is dumped (a message is printed
    and logged instead) when the database has no collections or when the
    target folder already exists.
    """
    mongo = pymongo.MongoClient(hostname, int(port))
    if username and password:
        # NOTE(review): condition checks username/password while the
        # credentials passed are authusername/authpassword -- verify intent.
        mongo[authdb].authenticate(authusername, authpassword)

    # Alphabetical first, then a stable sort by length: shortest names come
    # first and ties stay in alphabetical order.
    db_collection_names = mongo[dbname].collection_names()
    db_collection_names.sort()
    db_collection_names.sort(key=len)

    def announce(message):
        # Send the same text to stdout and to the log.
        print(message)
        logging.info(message)

    if len(db_collection_names) == 0:
        announce("Database for {} is empty".format(dbname))
        return

    # Create the dump folder, refusing to reuse an existing one.
    if output_path[-1:] == '/':
        dump_folder_path = output_path + dbname
    else:
        dump_folder_path = output_path + '/' + dbname

    if os.path.exists(dump_folder_path):
        announce("Dump folder for {} already exists".format(dbname))
        return
    os.makedirs(dump_folder_path)

    print("Dumping database {}".format(dbname))
    logging.info("Dumping {} database".format(dbname))

    # Use MongoCollection.dump_to_json to dump each collection in the database.
    for collection_name in db_collection_names:
        file_name = collection_name + '.json'
        if dump_folder_path[-1:] == '/':
            collection_dump_path = dump_folder_path + file_name
        else:
            collection_dump_path = dump_folder_path + '/' + file_name
        MongoCollection(hostname, port, username, password, dbname, collection_name).dump_to_json(collection_dump_path)
def test_mongo_collection_custom_filter_filters(self):
    """Verify that complementary custom filters split a limited collection exactly.

    The first 10 tweets are counted with no filter, with a filter that accepts
    tweets whose ``retweeted`` value is truthy, and with the inverse filter;
    the two filtered counts must sum to the unfiltered count.
    """
    settings = config['mongo']
    connection_args = (settings['host'], settings['port'], settings['user'],
                       settings['password'], settings['database'],
                       settings['collection'])

    def is_tweet_a_retweet(tweet):
        if 'retweeted' not in tweet:
            return False
        return True if tweet['retweeted'] else False

    def is_not_a_retweet(tweet):
        if 'retweeted' not in tweet:
            return True
        return False if tweet['retweeted'] else True

    collectionone = MongoCollection(*connection_args)
    everything = list(collectionone.set_limit(10).get_iterator())
    full_collection_len = len(everything)

    retweets = list(
        collectionone.set_limit(10)
        .set_custom_filter(is_tweet_a_retweet)
        .get_iterator())
    num_retweets = len(retweets)

    collectiontwo = MongoCollection(*connection_args)
    non_retweets = list(
        collectiontwo.set_limit(10)
        .set_custom_filter(is_not_a_retweet)
        .get_iterator())
    num_non_retweets = len(non_retweets)

    # The number of retweets and non-retweets should add up to the whole collection.
    self.assertEqual(num_retweets + num_non_retweets, full_collection_len)