Exemplo n.º 1
0
	def test_mongo_collection_custom_filter_filters(self):
		"""A custom filter and its negation must split the limited collection.

		Reads the collection three times with a limit of 10: unfiltered,
		filtered to retweets, and filtered to non-retweets. The two filtered
		counts are expected to add up to the unfiltered count.
		"""
		def open_collection():
			# Build a MongoCollection from the 'mongo' section of the config.
			settings = config['mongo']
			return MongoCollection(
				settings['host'],
				settings['port'],
				settings['user'],
				settings['password'],
				settings['database'],
				settings['collection']
			)

		def count_items(iterator):
			# Exhaust the iterator and report how many items it produced.
			return len(list(iterator))

		def is_tweet_a_retweet(tweet):
			# True only when 'retweeted' is present and truthy.
			return bool('retweeted' in tweet and tweet['retweeted'])

		def is_not_a_retweet(tweet):
			# Logical complement of is_tweet_a_retweet.
			return not ('retweeted' in tweet and tweet['retweeted'])

		collectionone = open_collection()
		full_collection_len = count_items(collectionone.set_limit(10).get_iterator())
		num_retweets = count_items(
			collectionone.set_limit(10).set_custom_filter(is_tweet_a_retweet).get_iterator()
		)

		collectiontwo = open_collection()
		num_non_retweets = count_items(
			collectiontwo.set_limit(10).set_custom_filter(is_not_a_retweet).get_iterator()
		)

		#the number of retweets and non retweets should add up to the whole collection
		self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
Exemplo n.º 2
0
 def test_pass_in_mongo(self):
     """A MongoCollection built around an existing client must yield tweets.

     Creates a pymongo.MongoClient from the configured host/port, hands it
     to MongoCollection through ``passed_mongo`` and checks that iterating
     with a limit of 10 produces at least one item.
     """
     settings = config['mongo']
     mongo_to_pass = pymongo.MongoClient(settings['host'],
                                         int(settings['port']))
     collection = MongoCollection(
         settings['user'],
         settings['password'],
         settings['database'],
         settings['collection'],
         passed_mongo=mongo_to_pass
     )
     fetched = list(collection.set_limit(10).get_iterator())
     self.assertTrue(len(fetched) > 0)
Exemplo n.º 3
0
 def test_iterator_returns_tweets(self):
     """Iterating the configured collection with a limit of 10 yields items."""
     settings = config['mongo']
     collection = MongoCollection(
         settings['host'],
         settings['port'],
         settings['user'],
         settings['password'],
         settings['database'],
         settings['collection']
     )
     limited = collection.set_limit(10)
     fetched = list(limited.get_iterator())
     self.assertTrue(len(fetched) > 0)
Exemplo n.º 4
0
	def test_iterator_returns_tweets(self):
		"""The collection iterator must produce at least one item (limit 10)."""
		mongo_cfg = config['mongo']
		# Positional order expected by MongoCollection in this test.
		connection_args = [
			mongo_cfg['host'],
			mongo_cfg['port'],
			mongo_cfg['user'],
			mongo_cfg['password'],
			mongo_cfg['database'],
			mongo_cfg['collection'],
		]
		collection = MongoCollection(*connection_args)
		tweet_count = len(list(collection.set_limit(10).get_iterator()))
		self.assertTrue(tweet_count > 0)
Exemplo n.º 5
0
	def test_pass_in_mongo(self):
		"""MongoCollection must work when given an already-created client.

		The client is built from the configured host and port and supplied via
		the ``passed_mongo`` keyword; the test passes when iterating with a
		limit of 10 returns at least one item.
		"""
		mongo_cfg = config['mongo']
		host = mongo_cfg['host']
		port = int(mongo_cfg['port'])
		mongo_to_pass = pymongo.MongoClient(host, port)
		collection = MongoCollection(
			mongo_cfg['user'],
			mongo_cfg['password'],
			mongo_cfg['database'],
			mongo_cfg['collection'],
			passed_mongo=mongo_to_pass
		)
		tweet_count = len(list(collection.set_limit(10).get_iterator()))
		self.assertTrue(tweet_count > 0)
Exemplo n.º 6
0
	def test_strip_tweets_keeps_fields(self):
		"""Tweets stripped to a field list must still contain those fields.

		Requests three dotted field names through ``strip_tweets`` and then
		verifies, for every returned tweet, that each field is among the keys
		produced by ``TweetParser.flatten_dict``.
		"""
		tweet_parser = TweetParser()
		settings = config['mongo']
		collection = MongoCollection(
			settings['host'],
			settings['port'],
			settings['user'],
			settings['password'],
			settings['database'],
			settings['collection']
		)
		self.maxDiff = None
		requested_fields = ['id', 'entities.user_mentions', 'user.profile_image_url_https']
		it = collection.set_limit(10).strip_tweets(requested_fields).get_iterator()

		def tweets_have_right_keys(iterator, fields):
			# Stop at the first tweet that lacks any of the expected keys.
			for tweet in iterator:
				keys = [pair[0] for pair in tweet_parser.flatten_dict(tweet)]
				if not all(elem in keys for elem in fields):
					return False
			return True

		# Same fields as requested above, written as lists of path components
		# (presumably the key format flatten_dict emits -- confirm).
		expected_keys = [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]
		self.assertTrue(tweets_have_right_keys(it, expected_keys))
Exemplo n.º 7
0
def query_dump_database(hostname, port, dbname, username, password, authdb,
                        authusername, authpassword, output_path):
    """Dump every collection of one MongoDB database to its own JSON file.

    Connects to ``hostname:port``, optionally authenticates against
    ``authdb``, creates a folder named ``dbname`` under ``output_path`` and
    calls ``MongoCollection.dump_to_json`` once per collection with a path
    of the form ``<folder>/<collection>.json``.

    Returns None in all cases. Returns early, after printing and logging a
    message, when the database has no collections or when the dump folder
    already exists.
    """
    # connect to the db
    mongo = pymongo.MongoClient(hostname, int(port))
    # NOTE(review): the guard tests username/password while the credentials
    # sent are authusername/authpassword -- confirm this is intended.
    if username and password:
        mongo[authdb].authenticate(authusername, authpassword)

    db = mongo[dbname]

    # Order collections by name length, ties broken alphabetically.
    db_collection_names = sorted(db.collection_names(),
                                 key=lambda name: (len(name), name))

    if not db_collection_names:
        print("Database for {} is empty".format(dbname))
        logging.info("Database for {} is empty".format(dbname))
        return

    # Create dump folder; insert a '/' only when output_path lacks one.
    folder_separator = '' if output_path[-1:] == '/' else '/'
    dump_folder_path = output_path + folder_separator + dbname
    if os.path.exists(dump_folder_path):
        # Never write into a folder left behind by a previous dump.
        print("Dump folder for {} already exists".format(dbname))
        logging.info("Dump folder for {} already exists".format(dbname))
        return
    os.makedirs(dump_folder_path)

    print("Dumping database {}".format(dbname))
    logging.info("Dumping {} database".format(dbname))

    # Use MongoCollection with dump_to_json to dump each collection
    if dump_folder_path[-1:] == '/':
        file_prefix = dump_folder_path
    else:
        file_prefix = dump_folder_path + '/'
    for collection_name in db_collection_names:
        collection_dump_path = file_prefix + collection_name + '.json'
        MongoCollection(hostname, port, username, password, dbname,
                        collection_name).dump_to_json(collection_dump_path)
Exemplo n.º 8
0
    def test_strip_tweets_keeps_fields(self):
        """Every tweet returned after ``strip_tweets`` keeps the named fields.

        Strips the collection (limit 10) down to three dotted field names and
        asserts that each one appears among the keys yielded by
        ``TweetParser.flatten_dict`` for every tweet.
        """
        tweet_parser = TweetParser()
        mongo_cfg = config['mongo']
        collection = MongoCollection(
            mongo_cfg['host'],
            mongo_cfg['port'],
            mongo_cfg['user'],
            mongo_cfg['password'],
            mongo_cfg['database'],
            mongo_cfg['collection'],
        )
        self.maxDiff = None
        stripped = collection.set_limit(10).strip_tweets([
            'id',
            'entities.user_mentions',
            'user.profile_image_url_https',
        ])
        it = stripped.get_iterator()

        def tweets_have_right_keys(iterator, fields):
            # False as soon as one tweet is missing one expected key.
            for tweet in iterator:
                keys = [key for key, _ in tweet_parser.flatten_dict(tweet)]
                missing = [elem for elem in fields if elem not in keys]
                if missing:
                    return False
            return True

        # Expected keys as lists of path components (presumably the format
        # flatten_dict produces -- confirm against TweetParser).
        expected_keys = [
            ['id'],
            ['entities', 'user_mentions'],
            ['user', 'profile_image_url_https'],
        ]
        self.assertTrue(tweets_have_right_keys(it, expected_keys))
Exemplo n.º 9
0
def query_dump_database(hostname, port, dbname, username, password, authdb,
                        authusername, authpassword, output_path):
    """Write each collection of the database ``dbname`` to a JSON file.

    A client is opened on ``hostname:port`` and, when ``username`` and
    ``password`` are both truthy, authenticated against ``authdb``. The
    collections are dumped through ``MongoCollection.dump_to_json`` into a
    newly created folder ``<output_path>/<dbname>``, one
    ``<collection>.json`` file each.

    Nothing is dumped (a message is printed and logged instead) when the
    database has no collections or the target folder already exists.
    """
    # connect to the db
    mongo = pymongo.MongoClient(hostname, int(port))
    # NOTE(review): authentication uses authusername/authpassword although
    # the condition checks username/password -- verify with the caller.
    if username and password:
        mongo[authdb].authenticate(authusername, authpassword)

    # Get the collection names, shortest first and alphabetical within a length
    names_by_length = sorted(
        mongo[dbname].collection_names(),
        key=lambda name: (len(name), name),
    )

    if not names_by_length:
        empty_message = "Database for {} is empty".format(dbname)
        print(empty_message)
        logging.info(empty_message)
        return

    # Create dump folder
    if output_path[-1:] == '/':
        dump_folder_path = output_path + dbname
    else:
        dump_folder_path = output_path + '/' + dbname

    if os.path.exists(dump_folder_path):
        exists_message = "Dump folder for {} already exists".format(dbname)
        print(exists_message)
        logging.info(exists_message)
        return
    os.makedirs(dump_folder_path)

    print("Dumping database {}".format(dbname))
    logging.info("Dumping {} database".format(dbname))

    # Use MongoCollection with dump_to_json to dump each collection in the database
    joiner = '' if dump_folder_path[-1:] == '/' else '/'
    for collection_name in names_by_length:
        collection_dump_path = dump_folder_path + joiner + collection_name + '.json'
        collection_to_dump = MongoCollection(
            hostname, port, username, password, dbname, collection_name)
        collection_to_dump.dump_to_json(collection_dump_path)
Exemplo n.º 10
0
    def test_mongo_collection_custom_filter_filters(self):
        """Retweet and non-retweet filters must partition the limited read.

        The collection is iterated with a limit of 10 three times: with no
        filter, with a filter accepting tweets whose 'retweeted' value is
        truthy, and with the opposite filter. The filtered counts have to sum
        to the unfiltered count.
        """
        mongo_cfg = config['mongo']
        # Positional arguments shared by both MongoCollection instances.
        connection_args = [
            mongo_cfg[key]
            for key in ('host', 'port', 'user', 'password', 'database',
                        'collection')
        ]

        def is_tweet_a_retweet(tweet):
            return True if 'retweeted' in tweet and tweet['retweeted'] else False

        def is_not_a_retweet(tweet):
            return False if 'retweeted' in tweet and tweet['retweeted'] else True

        collectionone = MongoCollection(*connection_args)
        unfiltered = collectionone.set_limit(10).get_iterator()
        full_collection_len = len(list(unfiltered))

        retweets_only = collectionone.set_limit(10).set_custom_filter(
            is_tweet_a_retweet).get_iterator()
        num_retweets = len(list(retweets_only))

        collectiontwo = MongoCollection(*connection_args)
        non_retweets_only = collectiontwo.set_limit(10).set_custom_filter(
            is_not_a_retweet).get_iterator()
        num_non_retweets = len(list(non_retweets_only))

        #the number of retweets and non retweets should add up to the whole collection
        self.assertEqual(num_retweets + num_non_retweets, full_collection_len)