Example #1
    def buy_item_from_store(self, user_id, item_id):

        item_info = self.get_item_info(item_id)

        if item_info is None:
            return f"There is no item found for item id {item_id}"

        print("Item info ", item_info)
        user_coins = self.get_user_coins(user_id)
        print("User Coins", user_coins)
        item_cost = item_info['cost']

        if user_coins >= item_cost:
            mongo = MongoClient(MONGO_CONNECTION_STRING)
            mongo.beanbot.users.update_one(
                {'user_id': user_id},
                {"$inc": {
                    "beano_coin_count": -item_cost
                }},
                upsert=True)
            self.give_item_to_player(item_info, user_id)
            mongo.close()
            return f"You purchased a {item_info['name']}. You have {user_coins - item_cost } BC remaining!"
        else:
            return f"You do not have enough coins to purchase this time.  It costs {item_cost} and you have {user_coins}."
Example #2
def stash(results):
    """
    暂存到mongo数据库中。
    """
    summary = {}
    mongo = MongoClient(**config.mongo)
    try:
        for item_model, objs in results:
            collection_name = item_model['name']
            db = mongo.get_database('theforce')
            collection = db.get_collection(collection_name)
            collection.insert_many(objs)
            summary[collection_name] = (summary.get(collection_name, 0)
                                        + len(objs))

        print
        print "=" * 40
        print ' ' * 15, u'Stash'
        print "=" * 40
        print
        print u"数据已成功保存到MongoDB的theforce库中,其中新增数据:"
        for name, length in summary.items():
            print name, length
    finally:
        mongo.close()
Example #3
    def test_ipv6(self):
        c = MongoClient("mongodb://[::1]:%d" % (port,), replicaSet=self.name)

        # Client switches to IPv4 once it has first ismaster response.
        msg = 'discovered primary with IPv4 address "%r"' % (self.primary,)
        wait_until(lambda: c.primary == self.primary, msg)

        # Same outcome with both IPv4 and IPv6 seeds.
        c = MongoClient("[::1]:%d,localhost:%d" % (port, port),
                        replicaSet=self.name)

        wait_until(lambda: c.primary == self.primary, msg)

        if client_context.auth_enabled:
            auth_str = "%s:%s@" % (db_user, db_pwd)
        else:
            auth_str = ""

        uri = "mongodb://%slocalhost:%d,[::1]:%d" % (auth_str, port, port)
        client = MongoClient(uri, replicaSet=self.name)
        client.pymongo_test.test.insert_one({"dummy": u("object")})
        client.pymongo_test_bernie.test.insert_one({"dummy": u("object")})

        dbs = client.database_names()
        self.assertTrue("pymongo_test" in dbs)
        self.assertTrue("pymongo_test_bernie" in dbs)
        client.close()
Example #4
 def get_store_inventory(self):
     mongo = MongoClient(MONGO_CONNECTION_STRING)
     # Materialize the cursor before closing the client; a lazy cursor
     # would fail once the connection is closed.
     items = list(mongo.beanbot.store_items.find({
         "active": True
     }).sort("item_id"))
     mongo.close()
     return items
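
A hedged variant of the same query that lets a with-block close the client even if the query raises (MONGO_CONNECTION_STRING assumed from the surrounding examples):

from pymongo import MongoClient

def get_store_inventory():
    with MongoClient(MONGO_CONNECTION_STRING) as client:
        # list(...) drains the cursor while the connection is still open.
        return list(client.beanbot.store_items.find({"active": True})
                    .sort("item_id"))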
Example #5
class AssignmentPipeline(object):

    db_name = 'jd'
    collection_name = 'meidi'

    def open_spider(self, spider):
        self.client = MongoClient()
        self.db = self.client[self.db_name]

        file = './comments.csv'
        # newline='' prevents blank rows in the CSV on Windows
        self.fp = open(file, 'w+', newline='')
        headers = ['creationTime', 'user', 'referenceName', 'content']
        self.csv_file = csv.DictWriter(self.fp, headers, extrasaction='ignore')
        self.csv_file.writeheader()

    def process_item(self, item, spider):
        item_dict = dict(item)

        self.db[self.collection_name].insert_one(item_dict)
        self.csv_file.writerow(item_dict)
        return item

    def close_spider(self, spider):
        self.client.close()
        self.fp.close()
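
A hedged sketch of the same pipeline driven by Scrapy settings instead of hard-coded class attributes (the MONGO_URI / MONGO_DB setting names are assumptions, not from the original project):

from pymongo import MongoClient

class SettingsDrivenPipeline:
    @classmethod
    def from_crawler(cls, crawler):
        pipe = cls()
        pipe.mongo_uri = crawler.settings.get('MONGO_URI',
                                              'mongodb://localhost:27017')
        pipe.db_name = crawler.settings.get('MONGO_DB', 'jd')
        return pipe

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.db_name]

    def process_item(self, item, spider):
        self.db['meidi'].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()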
Example #7
    def update_beano_stats(self, update_json):

        print("Pulling Stats from Mongo")
        mongo = MongoClient(MONGO_CONNECTION_STRING)
        stats = mongo.beanbot.beano_data.insert(update_json)
        mongo.close()
        print(stats)
        return stats
Example #8
    def get_beano_stats(self):

        print("Pulling Stats from Mongo")
        mongo = MongoClient(MONGO_CONNECTION_STRING)
        stats = defaultdict(int, mongo.beanbot.beano_data.find_one() or {})
        mongo.close()
        print(stats)
        return stats
Example #9
    def get_user_coins(self, user_id):

        print("Pulling Coin Amount for ", user_id)
        mongo = MongoClient(MONGO_CONNECTION_STRING)
        stats = defaultdict(
            int, mongo.beanbot.users.find_one({"user_id": user_id}) or {})
        mongo.close()
        return stats['beano_coin_count']
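
The helpers above (Examples #7-#9) open and close a fresh MongoClient on every call. MongoClient maintains its own connection pool and is safe to share across threads, so a hedged refactoring sketch (names are assumptions) keeps one client for the object's lifetime:

from pymongo import MongoClient

class BeanbotStore:
    def __init__(self, connection_string):
        # One pooled client, reused by every helper method.
        self.mongo = MongoClient(connection_string)

    def get_user_coins(self, user_id):
        doc = self.mongo.beanbot.users.find_one({"user_id": user_id}) or {}
        return doc.get("beano_coin_count", 0)

    def close(self):
        self.mongo.close()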
Example #10
 def get_item_info(self, item_id):
     mongo = MongoClient(MONGO_CONNECTION_STRING)
     print("Pulling info for Item ID", item_id)
     response = mongo.beanbot.store_items.find_one({
         "active": True,
         "item_id": int(item_id)
     })
     mongo.close()
     return response
Example #11
def doEverything():
#     certfile = '/home/bryan/Downloads/baratheon.pem'
    conn = MongoClient(url)
    db = conn[database]
    
    commands = []
    collectionName = "pythonMongo"
    commands.append("Creating collection " + collectionName)
    collection = db[collectionName]
    
    #insert 1
    commands.append("# 1 Inserts")
    commands.append("# 1.1 Insert a single document to a collection")
    collection.insert({"name": "test1", "value": 1})
    commands.append("Inserted {\"name\": \"test1\", \"value\": 1}")
    
    #insert many
    commands.append("#1.2 Inserting multiple entries into collection")
    multiPost = [{"name": "test1", "value": 1},{"name": "test2", "value": 2}, {"name": "test3", "value": 3}] 
    collection.insert(multiPost)
    commands.append("Inserted \n {\"name\": \"test1\", \"value\": 1} \n {\"name\": \"test2\", \"value\": 2} \n {\"name\": \"test3\", \"value\": 3}")
     
    # Find 
    commands.append("#2 Queries")
    commands.append("#2.1 Find one that matches a query condition")
    commands.append(collection.find_one({"name": "test1"}))
     
    # Find all 
    commands.append("#2.2 Find all that match a query condition")
    for doc in collection.find({"name": "test1"}):
        commands.append(doc)
    
    # Display all documents
    commands.append( "#2.3 Find all documents in collection")
    for doc in collection.find():
        commands.append(doc)   
    
    # update document
    commands.append("#3 Updating Documents")
    collection.update({"name": "test3"}, {"$set": { "value": 4}})
    commands.append("Updated test3 with value 4")
     
    # delete document
    commands.append("#4 Delete Documents")
    collection.remove({"name": "test2"})  
    commands.append("Deleted all with name test2")
    
    # Display all collection names
    commands.append("#5 Get a list of all of the collections")
    commands.append( db.collection_names())
    
    commands.append("#6 Drop a collection")
    db.drop_collection(collectionName)
    conn.close()
    commands.append("Connection to database has been closed")
    return commands
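
The insert/update/remove/collection_names calls above are pymongo 2.x-era names that were removed in pymongo 4. A hedged sketch of the same flow with the modern equivalents:

from pymongo import MongoClient

def do_everything_modern(url, database):
    with MongoClient(url) as conn:
        db = conn[database]
        coll = db["pythonMongo"]
        coll.insert_one({"name": "test1", "value": 1})        # was insert(doc)
        coll.insert_many([{"name": "test2", "value": 2},
                          {"name": "test3", "value": 3}])     # was insert(list)
        coll.update_one({"name": "test3"}, {"$set": {"value": 4}})  # was update()
        coll.delete_many({"name": "test2"})                   # was remove()
        names = db.list_collection_names()                    # was collection_names()
        db.drop_collection("pythonMongo")
        return names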
Example #12
    def give_user_coins(self, user_id, coin_amount):

        print("Giving {} coins to {}".format(coin_amount, user_id))
        mongo = MongoClient(MONGO_CONNECTION_STRING)
        # Credit the requested amount, not a hard-coded 1.
        mongo.beanbot.users.update_one({'user_id': user_id},
                                       {"$inc": {
                                           "beano_coin_count": coin_amount
                                       }},
                                       upsert=True)
        mongo.close()
        return None
Example #13
    def test_properties(self):
        c = client_context.rs_client
        c.admin.command('ping')

        wait_until(lambda: c.primary == self.primary, "discover primary")
        wait_until(lambda: c.arbiters == self.arbiters, "discover arbiters")
        wait_until(lambda: c.secondaries == self.secondaries,
                   "discover secondaries")

        self.assertEqual(c.primary, self.primary)
        self.assertEqual(c.secondaries, self.secondaries)
        self.assertEqual(c.arbiters, self.arbiters)
        self.assertEqual(c.max_pool_size, 100)

        # Make sure MongoClient's properties are copied to Database and
        # Collection.
        for obj in c, c.pymongo_test, c.pymongo_test.test:
            self.assertEqual(obj.codec_options, CodecOptions())
            self.assertEqual(obj.read_preference, ReadPreference.PRIMARY)
            self.assertEqual(obj.write_concern, WriteConcern())

        cursor = c.pymongo_test.test.find()
        self.assertEqual(ReadPreference.PRIMARY,
                         cursor._Cursor__read_preference)

        tag_sets = [{'dc': 'la', 'rack': '2'}, {'foo': 'bar'}]
        secondary = Secondary(tag_sets=tag_sets)
        c = MongoClient(pair,
                        replicaSet=self.name,
                        maxPoolSize=25,
                        document_class=SON,
                        tz_aware=True,
                        read_preference=secondary,
                        localThresholdMS=77,
                        j=True)

        self.assertEqual(c.max_pool_size, 25)

        for obj in c, c.pymongo_test, c.pymongo_test.test:
            self.assertEqual(obj.codec_options, CodecOptions(SON, True))
            self.assertEqual(obj.read_preference, secondary)
            self.assertEqual(obj.write_concern, WriteConcern(j=True))

        cursor = c.pymongo_test.test.find()
        self.assertEqual(secondary, cursor._Cursor__read_preference)

        nearest = Nearest(tag_sets=[{'dc': 'ny'}, {}])
        cursor = c.pymongo_test.get_collection("test",
                                               read_preference=nearest).find()

        self.assertEqual(nearest, cursor._Cursor__read_preference)
        self.assertEqual(c.max_bson_size, 16777216)
        c.close()
Example #14
class MovieSpider(CrawlSpider):
    # Called when the spider object is initialized
    def __init__(self):
        # Call the parent class constructor
        super().__init__()
        # Open a database client
        self.client = MongoClient('localhost', 27017)
        self.url_connection = self.client['moviedb']['urls']

    # Called back when the spider object is destroyed
    def __del__(self):
        self.client.close()

    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/frim/index1.html']
    link_1 = LinkExtractor(allow=r'http://www\.4567kan\.com/frim/index1\.html')
    link = LinkExtractor(
        allow=r'http://www\.4567kan\.com/frim/index1-\d+\.html')

    rules = (
        Rule(link_1, callback='parse_item', follow=False),
        Rule(link, callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # print(response.request.url)
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            self.detail_url = "http://www.4567kan.com" + li.xpath(
                './div/a/@href').extract_first()
            self.title = li.xpath('./div/a/@title').extract_first()
            # print('Title: ' + self.title, 'URL: ' + self.detail_url)

            # count_documents replaces the cursor.count() removed in pymongo 4
            if self.url_connection.count_documents(
                    {'url': self.detail_url}) == 0:
                print('%s has not been visited yet; crawling it...' %
                      self.detail_url)
                self.url_connection.insert_one({"url": self.detail_url})
                # Issue a new request for the movie's detail page
                yield scrapy.Request(url=self.detail_url,
                                     callback=self.parse_detail)
            else:
                print("%s has already been visited; skipping" %
                      self.detail_url)

    def parse_detail(self, response):
        item = MovieprojectItem()
        item['name'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        item['desc'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()'
        ).extract()
        item['desc'] = ''.join(item['desc'])
        yield item
Example #15
def checkmongodbconnection():
    try:
        c = MongoClient(MONGO_URI,
                        server_api=ServerApi('1'),
                        serverSelectionTimeoutMS=5000)
        c.admin.command('ismaster')
        time.sleep(2)
        c.close()
        return True
    except Exception:
        print('\nCould not connect to MongoDB.\n\n')
        return False
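
A hedged variant that pings with a specific exception type instead of a bare except, which keeps the failure reason visible (MONGO_URI assumed from the example above):

from pymongo import MongoClient
from pymongo.errors import ServerSelectionTimeoutError

def check_mongodb_connection(uri):
    client = MongoClient(uri, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command('ping')
        return True
    except ServerSelectionTimeoutError as exc:
        print('Could not connect to MongoDB: %s' % exc)
        return False
    finally:
        client.close()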
Example #16
 def process_request(self, request: scrapy.http.Request,
                     spider: scrapy.spiders.Spider) -> None:
     client = MongoClient(spider.settings.get('MONGODB_URI'))
     db = client[spider.settings.get('MONGODB_DATABASE')]
     collection = db[spider.settings.get('MONGODB_COLLECTION')]
     res = collection.find_one({'url': request.url})
     client.close()
     if res is not None:
         logger.info("MongoDBDupeFilter filtered request to: %(request)s",
                     {'request': request.url},
                     extra={'spider': spider})
         raise IgnoreRequest()
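
Opening a MongoClient per request is expensive; the driver pools connections, so a hedged sketch (setting names as in the example, the rest assumed) builds the client once when the middleware is created:

from pymongo import MongoClient
from scrapy.exceptions import IgnoreRequest

class MongoDBDupeFilterMiddleware:
    def __init__(self, uri, database, collection):
        self.client = MongoClient(uri)
        self.seen = self.client[database][collection]

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(s.get('MONGODB_URI'), s.get('MONGODB_DATABASE'),
                   s.get('MONGODB_COLLECTION'))

    def process_request(self, request, spider):
        # Drop the request if its URL has already been stored.
        if self.seen.find_one({'url': request.url}) is not None:
            raise IgnoreRequest()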
Example #17
    def test_disconnect(self):
        c = MongoClient(host, port)
        coll = c.pymongo_test.bar

        c.close()
        c.close()

        coll.count()

        c.close()
        c.close()

        coll.count()
Example #19
async def disconnect_mongo(client: MongoClient):
    """Close the MongoDB connection.

    Arguments:
        client(MongoClient) : MongoDB client
    """
    try:
        client.close()
    except errors.PyMongoError:
        # Re-raise the original driver error; raising the class would
        # discard the actual exception details.
        raise
Example #20
 def give_item_to_player(self, item, user_id):
     mongo = MongoClient(MONGO_CONNECTION_STRING)
     mongo.beanbot.users.update_one(
         {'user_id': user_id},
         {"$push": {
             "inventory": {
                 "id": item['item_id'],
                 "name": item['name'],
                 "effect": item['effect'],
                 "description": item['description']
             }
         }},
         upsert=True)
     mongo.close()
     print("Gave {} to {}".format(item['item_id'], user_id))
Example #21
class Day10CrawlspiderDoubanPipeline(object):
    def open_spider(self, spider):
        """连接mongo数据库"""
        if spider.name == DoubanSpider.name:
            self.client = MongoClient()
            self.collection = self.client['douban']['top250']

    def process_item(self, item, spider):
        """保存数据"""
        if spider.name == DoubanSpider.name:
            self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """关闭连接数据库"""
        if spider.name == DoubanSpider.name:
            self.client.close()
Example #22
class MongoDBPipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient("localhost",27017)

    def process_item(self, item, spider):
        title = item['title']
        content = item['content']
        url = item['url']
        dict1 = {
            "url":url,
            "title":title,
            "content":content
        }
        # Collection.save() was removed in pymongo 3+; insert_one is the
        # equivalent for documents without an _id.
        self.client['wangyi']['news'].insert_one(dict1)
        return item

    def close_spider(self, spider):
        self.client.close()
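
If the intent of the original save() call was "insert or update by key", a hedged pymongo 3/4 equivalent is an upsert keyed on the url (assuming url is the natural key here):

from pymongo import MongoClient

def save_news(client, doc):
    # Re-crawled pages update in place instead of piling up duplicates.
    client['wangyi']['news'].replace_one({'url': doc['url']}, doc, upsert=True)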


# import pymysql
# class mysqlPileLine(object):
#     def __init__(self):
#         print("mysqlPileLine管道对象初始化...")
#         self.cursor = None
#         self.db = None
#
#     def open_spider(self,spider):
#         self.db = pymysql.Connect(host='127.0.0.1',port=3306,user='******',password='******',db='qiubai',charset='utf8')
#
#     def process_item(self,item,spider):
#         self.cursor = self.db.cursor()
#
#         try:
#             self.cursor.execute('insert into tb_qiubai(title,content) values("%s","%s")'%(item["title"],item["content"]))
#             self.db.commit()
#         except Exception as e:
#             print(e)
#             self.db.rollback()
#         return item
#
#     def close_spider(self,spider):
#         print("mysqlPileLine管道对象销毁...")
#         self.cursor.close()
#         self.db.close()
Example #23
def save(metas, batch_num=100):
    """
    读取配置,把Mongo数据同步到mysql中。
    """
    mongo = MongoClient(**config.mongo)
    db = MySQLdb.connect(**config.mysql)
    cursor = db.cursor()
    print
    print "=" * 40
    print ' ' * 15, u'Mongo --> MySQL'
    print "=" * 40
    print
    try:
        mongo_db = mongo.get_database('theforce')
        for meta in metas:
            for model_name, item_model in meta.iter_model():
                collection_name = item_model['name']
                table_name = item_model['table']
                attrs = meta.get_model_persist_attr_names(item_model)

                collection = mongo_db.get_collection(collection_name)
                results = [obj for obj in collection.find({})]
                sql = "insert into {0}({1}) values({2})".format(
                    table_name, ','.join(attrs),
                    ','.join(itertools.repeat('%s', len(attrs))))

                print
                print '-' * 40
                print u'Processing {0}@mongo --> {1}@mysql: {2} records total, in batches of {3}:'.format(
                    collection_name, table_name, len(results), batch_num)
                # Process in batches
                results2 = itertools.izip(itertools.count(), results)
                for group_key, group_it in itertools.groupby(
                        results2, lambda item: item[0] / batch_num):
                    print '.',
                    values = [[obj[attr] for attr in attrs]
                              for index, obj in group_it]
                    cursor.executemany(sql, values)
                print u'[done]'
    finally:
        mongo.close()
        cursor.close()
        db.close()
Example #24
    def test_kill_cursors_warning(self):
        # If kill_cursors is called while the client is disconnected, it
        # can't risk taking the lock to reconnect, in case it's being called
        # from Cursor.__del__, see PYTHON-799. Test that it shows a warning
        # in this case.
        client = MongoClient(host, port)
        collection = client.pymongo_test.test
        collection.insert({} for _ in range(4))
        cursor = collection.find().batch_size(1)
        cursor.next()
        client.close()
        with catch_warnings():
            warnings.simplefilter("error", UserWarning)
            self.assertRaises(UserWarning, cursor.close)

        # Reconnect.
        collection.find_one()
        cursor.close()
Example #25
    def pat_beano(self, user_id):
        print("Patting Beano")
        current_timestamp = datetime.now()
        response = ("idle", "There was some issue. Beano is confused.")
        mongo = MongoClient(MONGO_CONNECTION_STRING)

        user_stats = mongo.beanbot.users.find_one({'user_id': user_id})

        if user_stats is None:
            print("{} not in mongo. Adding to mongo.".format(user_id))
            self.create_new_user(user_id)
            user_stats = mongo.beanbot.users.find_one({'user_id': user_id})

        print("User Stats : ", user_stats)

        last_pat_ts = user_stats['last_pat_timestamp']
        pat_cd = (current_timestamp - last_pat_ts).total_seconds() / 60

        print(pat_cd, " VS ", self.pat_cooldown)

        if pat_cd < self.pat_cooldown:
            response = ("idle",
                        "You cannot pat Beano for another {} minutes.".format(
                            self.pat_cooldown - pat_cd))
        else:
            mongo.beanbot.users.update_one(
                {'user_id': user_id},
                {"$set": {
                    "last_pat_timestamp": current_timestamp
                }},
                upsert=True)
            response = ("happy", "You pat Beano!")

        mongo.close()

        return response
Example #26
            '$set': {
                'attachUrl_uu': None,
                'attach_download_user': None,
                'attachTask': Constant.TODO
            }
        })
    except Exception as e:
        print(attach['attachUrl_uu'], e)
        continue

# error
attachs = db.component_original.find({"attachTask": Constant.ERROR}, {
    "_id": True,
    "attachUrl_uu": True
})

for attach in attachs:
    try:
        requests.get(fs_api_delete % attach['attachUrl_uu'])
        db.component_original.update_one(
            {'_id': attach["_id"]},
            {'$set': {
                'attachUrl_uu': None,
                'attach_download_user': None
            }})
    except Exception as e:
        print(attach['attachUrl_uu'], e)
        continue

cli.close()
Example #27
class ZhihuPipeline(object):

    config = {
        'uri': 'mongodb://localhost:27017',
        'fsync': False,
        'write_concern': 1,
        'database': 'zhihu_userdb',
        'collection': 'zhihu_userdb',
        'separate_collections': False,
        'replica_set': None,
        'unique_key': None,
        'buffer': None,
        'append_timestamp': False,
        'stop_on_duplicate': 0,
    }

    current_item = 0

    item_buffer = []
    duplicate_key_count = 0


    def __init__(self, mongo_uri, mongo_db, mongo_replSet_name, mongo_replSet_uri):
        self.config["uri"] = mongo_uri
        self.config["database"] = mongo_db
        if mongo_replSet_name:
            #self.mongo_replSet_name = mongo_replSet_name
            self.config["replica_set"] = mongo_replSet_name
        if mongo_replSet_uri:
            #self.mongo_replSet_uri = mongo_replSet_uri
            self.config["uri"] = mongo_replSet_uri
        self.logger = logging.getLogger("scrapy-Zhihu-logger")
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
                crawler.settings.get("MONGO_URI", "mongodb://localhost:27017/zhihu_userdb"),
                crawler.settings.get("MONGO_DATABASE", "zhihu_userdb"),
                crawler.settings.get("MONGO_REPLSET_NAME", "None"),
                crawler.settings.get("MONGO_REPLSET_URI", "None")
                )

    def open_spider(self, spider):
        self.crawler = spider.crawler
        self.settings = spider.settings
        if self.config["replica_set"] is not None:
            self.conncetion = MongoReplicaSetClient(
                self.config["uri"],
                replicaSet = self.config["replica_set"],
                w = self.config["write_concern"],
                fsync = self.config["fsync"],
                read_preferences = ReadPreference.PRIMARY_PREFERRED
            )

        else:
            self.connection = MongoClient(
                self.config["uri"],
                fsync = self.config["fsync"],
                read_preferences = ReadPreference.PRIMARY
            )
        self.database = self.conncetion[self.config["database"]]
        self.collections = {'default': self.database[self.config['collection']]}
        self.logger.info('Connected dbpath: {0}, database: {1}'.format(
            self.config["uri"],
            self.config["database"]
        ))
        if self.config['stop_on_duplicate']:
            tmpValue = self.config['stop_on_duplicate']

            if tmpValue < 0:
                msg = (
                    u'Negative values are not allowed for'
                    u' MONGODB_STOP_ON_DUPLICATE option.'
                )

                self.logger.error(msg)
                raise SyntaxError(msg)

            self.stop_on_duplicate = self.config['stop_on_duplicate']

        else:
            self.stop_on_duplicate = 0

    def close_spider(self, spider):
        if self.item_buffer:
            self.insert_item(self.item_buffer, spider)
        self.connection.close()

    def process_item(self, item, spider):
        item = dict((k, v) for k, v in item.iteritems() if v is not None and v != "")
        if self.config['buffer']:
            self.current_item += 1

            if self.config['append_timestamp']:
                item['scrapy-mongodb'] = {'ts': datetime.datetime.utcnow()}

            self.item_buffer.append(item)

            if self.current_item == self.config['buffer']:
                self.current_item = 0

                try:
                    return self.insert_item(self.item_buffer, spider)
                finally:
                    self.item_buffer = []

            return item

        return self.insert_item(item, spider)

    def insert_item(self, item, spider):
        if not isinstance(item, list):
            item = dict(item)

            if self.config['append_timestamp']:
                item['scrapy-mongodb'] = {'ts': datetime.datetime.utcnow()}

        collection_name, collection = self.get_collection(spider.name)

        if self.config['unique_key'] is None:
            try:
                collection.insert(item, continue_on_error=True)
                self.logger.debug(u'Stored item(s) in MongoDB {0}/{1}'.format(
                    self.config['database'], collection_name))

            except errors.DuplicateKeyError:
                self.logger.debug(u'Duplicate key found')
                if (self.stop_on_duplicate > 0):
                    self.duplicate_key_count += 1
                    if (self.duplicate_key_count >= self.stop_on_duplicate):
                        self.crawler.engine.close_spider(
                            spider,
                            'Number of duplicate key insertion exceeded'
                        )

        else:
            key = {}

            if isinstance(self.config['unique_key'], list):
                for k in dict(self.config['unique_key']).keys():
                    key[k] = item[k]
            else:
                key[self.config['unique_key']] = item[self.config['unique_key']]

            collection.update(key, item, upsert=True)

            self.logger.debug(u'Stored item(s) in MongoDB {0}/{1}'.format(
                self.config['database'], collection_name))
        return item

    def get_collection(self, name):
        if self.config['separate_collections']:
            collection = self.collections.get(name)
            collection_name = name

            if not collection:
                collection = self.database[name]
                self.collections[name] = collection
        else:
            collection = self.collections.get('default')
            collection_name = self.config['collection']
        if self.config['unique_key']:
            collection.ensure_index(self.config['unique_key'], unique=True)
            self.logger.info(u'Ensuring index for key {0}'.format(
                self.config['unique_key']))
        return (collection_name, collection)
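
The continue_on_error flag used in insert_item() is pymongo 2.x API. A hedged modern equivalent for the buffered path is insert_many with ordered=False, which keeps writing past duplicate keys and reports the failures afterwards:

from pymongo.errors import BulkWriteError

def insert_buffer(collection, buffer):
    try:
        collection.insert_many(buffer, ordered=False)
    except BulkWriteError as err:
        # err.details['writeErrors'] lists the documents that failed;
        # code 11000 is a duplicate key.
        return len([e for e in err.details.get('writeErrors', [])
                    if e.get('code') == 11000])
    return 0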
Example #28
class Mongo(DatabaseManager):
    
    def __init__(self, options=None):
        self.config = {
            'HOST':'localhost',
            'PORT':27017,
            'COLLECTION': 'default'
        }
        
        # options=None avoids the mutable-default-argument pitfall
        self.config.update(options or {})
        print self.config 
    
        self.connect()
       

    def connect(self):
        self.client = MongoClient(
            self.config['HOST'],
            self.config['PORT']
        )
        
        self.db = self.client[self.config['COLLECTION']]
        
    
    def iter(self):
        return self.db.collection.find()
    
    def close(self):
        if self.client:
            self.client.close()
    
    def count(self):
        return self.db.collection.count()
        
        
    def put(self,data):
        
        if isinstance(data, types.ListType):
            # For bulk insert, inject timestamp
            for x in data:
                x[TIMESTAMP_CREATED] = datetime.datetime.utcnow()
        else:
            # For single insert, inject timestamp
            data[TIMESTAMP_CREATED] = datetime.datetime.utcnow()
       
        try:
            self.db.collection.insert(data)
            return True
        except Exception as e:
            print e
            return False
        
    
    def update(self):
        pass
    
    def get(self, query=False):
        
        if not query:
            # Get all documents in collection
            return list(self.db.collection.find())
    
    def delete(self):
        pass
Example #29
def update_preseries(target_db_uri, target_collection, source_db_uri,
                     source_collection, interval, is_prediction,
                     computed_aggregations, resource_type):
    """
        Update aggregated collection in the target system

    :param target_db_uri: Target DB URI, i.e. mongodb://localhost/databasename
    :param target_collection: Output collection name
    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param is_prediction: Boolean with True or False
    :param computed_aggregations: Document with results
    :param resource_type: Resource Type (company, person, ...)
    """

    tgt_client = MongoClient(target_db_uri)
    tgt_db = tgt_client.get_default_database()
    tgt_col = tgt_db[target_collection]

    src_client = MongoClient(source_db_uri)
    src_db = src_client.get_default_database()
    src_col = src_db[source_collection]

    project_clause = {
        "_id": 0,
        "company_name": "$company_name",
        "company_foundation_date": "$foundation_date"
    }
    if resource_type == 'person':
        project_clause = {
            "_id": 0,
            "first_name": "$first_name",
            "last_name": "$last_name",
            "gender": "$gender"
        }
    if resource_type == 'investor':
        project_clause = {
            "_id": 0,
            "investor_name": "$investor_name",
            "investor_foundation_date": "$foundation_date"
        }

    bulk_counter = 0
    full_counter = 0
    block_size = 10000
    aggregation_num = len(computed_aggregations)
    tries_num = 3
    bulk = tgt_col.initialize_ordered_bulk_op()

    while full_counter < aggregation_num:

        comp_aggregation = computed_aggregations[full_counter]

        fields_to_insert, fields_to_update = \
            prepare_fields(src_col, project_clause, resource_type,
                           comp_aggregation, interval, is_prediction)

        find_pipeline = get_find_pipeline(resource_type, comp_aggregation,
                                          interval, fields_to_insert,
                                          is_prediction)

        bulk.find(find_pipeline).upsert().update({
            "$setOnInsert": fields_to_insert,
            "$set": fields_to_update
        })

        bulk_counter += 1
        full_counter += 1

        # Manage a page of block_size records
        if bulk_counter == block_size:
            try:
                bulk.execute()
                tries_num = 3
                bulk = tgt_col.initialize_ordered_bulk_op()
                bulk_counter = 0
                print "%d records processed" % full_counter
            except BulkWriteError as ex:  # give a second chance to the execute
                if tries_num == 0:
                    print "bulk.execute() failed 3 times..."
                    print "ERROR processing Task. Exception: [%s]" % ex
                    traceback.print_exc()
                    raise ex
                sleep(0.5)
                bulk = tgt_col.initialize_ordered_bulk_op()
                bulk_counter = 0
                full_counter -= block_size
                tries_num -= 1
            except Exception as ex2:
                print "ERROR processing Task. Exception: [%s]" % ex2
                traceback.print_exc()
                raise ex2

    # Manage rest of records from the latest complete page to the end
    if bulk_counter > 0:
        try:
            bulk.execute()
            print "%d records processed. Finished" % full_counter
        except BulkWriteError as ex:  # give a second chance to the execute
            sleep(1)
            bulk = tgt_col.initialize_ordered_bulk_op()
            full_counter = aggregation_num - bulk_counter
            for comp_aggr_inx in range(full_counter, aggregation_num):
                comp_aggregation = computed_aggregations[comp_aggr_inx]
                if len(comp_aggregation['result']) == 0:
                    continue

                fields_to_insert, fields_to_update = \
                    prepare_fields(src_col, project_clause, resource_type,
                           comp_aggregation, interval, is_prediction)

                find_pipeline = get_find_pipeline(resource_type,
                                                  comp_aggregation, interval,
                                                  fields_to_insert,
                                                  is_prediction)

                bulk.find(find_pipeline).upsert().update({
                    "$setOnInsert":
                    fields_to_insert,
                    "$set":
                    fields_to_update
                })

                full_counter += 1

            bulk.execute()
            print "%d records processed. Finished" % full_counter
        except Exception as ex:
            print "ERROR processing Task. Exception: [%s]" % ex
            traceback.print_exc()
            raise ex

    tgt_client.close()
    src_client.close()
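
initialize_ordered_bulk_op() was removed in pymongo 4. A hedged sketch of the same upsert batching with bulk_write and UpdateOne (the pending-tuple shape is an assumption):

from pymongo import UpdateOne

def flush_upserts(collection, pending):
    # pending: list of (find_pipeline, fields_to_insert, fields_to_update)
    ops = [UpdateOne(f, {"$setOnInsert": ins, "$set": upd}, upsert=True)
           for f, ins, upd in pending]
    if ops:
        collection.bulk_write(ops, ordered=True)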
Example #30
class Connection:
    _graph_map: dict[str, Connection] = {}
    _initialized_map: dict[str, bool] = {}

    def __new__(cls: type[Connection], graph_name: str) -> Connection:
        if not cls._graph_map.get(graph_name):
            cls._graph_map[graph_name] = super(Connection, cls).__new__(cls)
        return cls._graph_map[graph_name]

    def __init__(self: Connection, graph_name: str) -> None:
        if self.__class__._initialized_map.get(graph_name):
            return
        self._graph_name: str = graph_name
        self._url: Optional[str] = None
        self._client: Optional[MongoClient] = None
        self._database: Optional[Database] = None
        self._collections: dict[str, Collection] = {}
        self._connection_callbacks: dict[str, ConnectedCallback] = {}
        self._connected: bool = False
        self.__class__._initialized_map[graph_name] = True
        return None

    @property
    def graph_name(self: Connection) -> str:
        return self._graph_name

    @property
    def url(self: Connection) -> str:
        if self._url:
            return self._url
        return self._generate_default_url()

    def set_url(self: Connection, url: str) -> None:
        self._url = url

    def _generate_default_url(self: Connection) -> str:
        if self.graph_name == 'default':
            user_url = uconf()['pymongo.url'] or uconf()['pymongo.default.url']
        else:
            user_url = uconf()[f'pymongo.{self.graph_name}.url']
        if user_url is not None:
            self._url = user_url
            return user_url
        base = 'mongodb://localhost:27017/'
        proj = camelize(parameterize(path.basename(getcwd()))).lower()
        self._url = base + proj
        return self._url

    @property
    def client(self: Connection) -> MongoClient:
        if self._client is not None:
            return self._client
        self.connect()
        return self._client

    @property
    def database(self: Connection) -> Database:
        if self._database is not None:
            return self._database
        self.connect()
        return self._database

    def connect(self: Connection) -> None:
        self._client = MongoClient(self.url)
        self._database = self._client.get_database()
        self._connected = True
        for name, callback in self._connection_callbacks.items():
            self._call_callback(name, callback)

    def disconnect(self: Connection) -> None:
        if self._client is not None:
            self._client.close()
            self._client = None
            self._database = None
            self._collections = {}
            self._connected = False

    @property
    def connected(self: Connection) -> bool:
        return self._connected

    def collection(self: Connection,
                   name: str,
                   index_keys: list[str] | None = None) -> Collection:
        if self._collections.get(name) is not None:
            return self._collections[name]
        coll = self.database.get_collection(name)
        if index_keys is not None:
            ukeys = [(k, 1) for k in index_keys]
            coll.create_index(ukeys, name='ref', unique=True)
        self._collections[name] = coll
        return coll

    def add_connected_callback(self: Connection, name: str,
                               callback: ConnectedCallback) -> None:
        self._connection_callbacks[name] = callback
        if self._client:
            self._call_callback(name, callback)

    def _call_callback(self: Connection, name: str,
                       callback: ConnectedCallback) -> None:
        callback(self.collection(name))

    def collection_from(self: Connection, cls: type[T]) -> Collection:
        coll_name = cls.pconf.collection_name
        return self.collection(coll_name)

    default: ClassVar[Connection]

    @classmethod
    def get_collection(cls: type[Connection], pmcls: type[T]) -> Collection:
        graph = pmcls.cdef.jconf.cgraph.name
        connection = Connection(graph)
        return connection.collection_from(pmcls)

    @classmethod
    def from_class(cls: type[Connection], pmcls: type[T]) -> Connection:
        return Connection(pmcls.cdef.jconf.cgraph.name)
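
A hedged usage sketch for the Connection registry above (URL, collection, and index names are made up):

conn = Connection('default')
conn.set_url('mongodb://localhost:27017/testdb')
users = conn.collection('users', index_keys=['email'])
users.insert_one({'email': 'a@example.com'})
conn.disconnect()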
Example #31
def get_changed_intervals(source_db_uri, source_collection, last_execution,
                          curr_date, process_field, interval, resource_type):
    """
        Generation of the pipeline aggregation string, execution, and returns
        the periods with changes

        Ex.

        [{"$match":
            {"metric_name": {"$in": ["twitter_followers", "twitter_following"]},
             "updated": {"$gte": last_execution, "$lt": curr_date}}},
         { "$project":
             {"company_id": "$_id",
               "date": "$date",
               "year": {"$year": "$date"},
               "interval": {"$month": "$date"}}},
         {"$group":
             {"_id": { "company_id": "$company_id","year":"$year",
                        "interval": "$interval"}}},
         {"$project":  {"_id": 0, "company_id": "$_id.company_id",
                         "year": "$_id.year", "interval": "$_id.interval"}}]

         Returns a list of changed intervals with this structure:

         [{"company_id": ObjectID("2341231231"), "year": 2013, "interval": 7},
          {"company_id": ObjectID("2341231231"), "year": 2013, "interval": 8},
          {"company_id": ObjectID("2341231444"), "year": 2015, "interval": 9},
          {"company_id": ObjectID("2341231444"), "year": 2015, "interval": 11}
          ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param last_execution: Lower date
    :param curr_date: Upper date
    :param process_field: metric to process: "twitter_followers"
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param resource_type: Resource Type (company, person, ...)
    """

    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    pipeline_periods = [{
        "$unwind": "$%s_ts" % process_field
    }, {
        "$project": {
            "%s_id" % resource_type: "$%s_id" % resource_type,
            "value": "$%s_ts.value" % process_field,
            "date": "$%s_ts.date" % process_field,
            "updated": "$%s_ts.updated" % process_field,
            "year": {
                "$year": "$%s_ts.date" % process_field
            },
            "interval": set_field_interval("%s_ts.date" % process_field,
                                           interval)
        }
    }, {
        "$match": {
            "updated": {
                "$lt": curr_date
            }
        }
    }, {
        "$group": {
            "_id": {
                "%s_id" % resource_type: "$%s_id" % resource_type,
                "year": "$year",
                "interval": "$interval"
            }
        },
    }, {
        "$project": {
            "_id": 0,
            "%s_id" % resource_type: "$_id.%s_id" % resource_type,
            "year": "$_id.year",
            "interval": "$_id.interval"
        }
    }]

    # If last_execution is informed, add the $gte clause to the "$match" elem.
    if last_execution is not None:
        pipeline_periods[2]['$match']["updated"]['$gte'] = last_execution

    changed_intervals = []

    for changed_interval in collection.aggregate(pipeline_periods,
                                                 allowDiskUse=True):
        changed_intervals.append(changed_interval)

    client.close()

    return changed_intervals
Example #32
class DBServer():
    def __init__(self, collection_name, table):
        self.collection_name = collection_name
        self.table = table
        cfg = DBUtils.read_config()
        url = "mongodb://{0}:{1}/".format(cfg.host, cfg.port)
        logger.debug("The db url", url)
        self._client = MongoClient(url)
        # credentials, when configured, are checked against the admin database
        username, password = getattr(cfg, 'username',
                                     ''), getattr(cfg, 'password', '')
        if username and password:
            self._client.admin.authenticate(cfg.username, cfg.password)
        db = self._client[collection_name]
        self.table = db[table]

    @log
    def query(self, pattern=None, **condition):
        """
        :param pattern: dict pattern suitable
        :param condition: condition to pagination
        :return:suitable information
        """
        if pattern is None:
            # no pattern: return the first 20 documents
            values = self.table.find().limit(20)
            return convert_list(values)
        if condition is None or len(condition) <= 0:
            # if condition is empty
            values = self.table.find(pattern)
            return convert_list(values)
        dcu = DefaultConditionUtil(**condition)
        if dcu.get_complex_expression() is not None:
            # dict.update() returns None, so don't reassign the result
            pattern.update(dcu.get_complex_expression())
        ret = self.table.find(pattern).skip(dcu.get_offset()).limit(
            dcu.get_limit()).sort(dcu.sort_style())
        return convert_list(ret)

    @log
    def find_one(self, **condition):
        one = self.table.find_one(condition)
        return convert_dict(one)

    @log
    def find_all(self):
        """
        :return: all information
        """
        values = self.table.find()
        return convert_list(values)

    @log
    def insert(self, obj):
        self.table.insert(obj)

    @log
    def delete(self, pattern=None, **condition):
        """
        :param id: remove obj id
        :param pattern: pattern data
        :param condition:suitable data
        :return:nothing
        """
        if pattern is None:
            return
        self.table.remove(pattern)

    @log
    def update(self, pattern=None, **new):
        """
        :param id: update id
        :param other: condition
        :param new: new data
        :return:
        """
        self.table.update(pattern, new)

    @log
    def query_by_complex_condition(self, code):
        """
        :param code: query information via complex condition use javascript code
        :return:
        """
        values = self.table.find().where(code)
        return convert_list(values)

    @log
    def count(self, id=None, **conditions):
        # **conditions is always a dict, so test for emptiness, not None
        if id is None and not conditions:
            return {'count': self.table.find().count()}
        if not conditions:
            return {'count': self.table.find(dict(id=id)).count()}
        if id is None:
            return {'count': self.table.find(conditions).count()}
        conditions['id'] = id
        return {'count': self.table.find(conditions).count()}

    @log
    def close(self):
        self._client.close()

    def __del__(self):
        self._client.close()
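
Database.authenticate() was removed in pymongo 4. A hedged sketch of the modern form, which hands the credentials straight to MongoClient (config field names assumed from the class above):

from pymongo import MongoClient

def make_client(cfg):
    return MongoClient(
        host=cfg.host,
        port=int(cfg.port),
        username=getattr(cfg, 'username', '') or None,
        password=getattr(cfg, 'password', '') or None,
        authSource='admin')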
Example #33
def compute_ts_aggregations_1toN(source_db_uri,
                                 source_collection,
                                 changed_intervals,
                                 process_field,
                                 date_field,
                                 interval,
                                 operators_list,
                                 field_names_list,
                                 resource_type,
                                 withinTheInterval=True):
    """
        Generation of the pipeline aggregation string, execution
        (1 aggregated result), and returns the changed_intervals
        dictionary upgraded with the result document

        Ex.

        [{"$match": {
            "company_id": "2341231231",
            "date" {"$gte": lower_date, "$lt": upper_date}}},
         {"$project": {
              "date": "$date",
              "value": "$value"}},
         {"$sort": {"date": 1}},
         {"$group": {"_id": {"company_id": "$_id"},
                     "aggr_field_name_1": { "$last": "$value" },
                     "aggr_field_name_2": { "$sum": "$value" },
                     "aggr_field_name_3": { "$avg": "$value" }}},
         {"$project":
             {"_id": 0,
              "company_id": "$_id.company_id",
              "aggr_field_name_1": "aggr_field_name_1",
              "aggr_field_name_2": "aggr_field_name_2"
              "aggr_field_name_3": "aggr_field_name_3"}}]

         Returns a result document attached to the changed_intervals document:

         [{"company_id": "2341231231", "year": 2013, "interval": 7},
            "result": {"twitter_followers_last": 456,
                       "twitter_followers_first": 123,
                       "twitter_followers_count": 34
                       ...},
          {"company_id": "2341231231", "year": 2013, "interval": 8},
             "result": {"twitter_followers_last": 135},
          {"company_id": "2341231444", "year": 2015, "interval": 9},
             "result": {"twitter_followers_last": 1023},
          {"company_id": "2341231444", "year": 2015, "interval": 11},
             "result": {"twitter_followers_last": 1050},
         ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param process_field: field to process "twitter_followers", "twitter_bio",...
    :param changed_intervals: List of documents with the changed intervals
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param operators_list: aggregator operator list i.e. "last sum avg"
    :param field_names_list: aggregator field name list i.e
    "twitter_followers_last twitter_followers_sum twitter_followers_avg"
    :param resource_type: Resource Type (company, person, ...)
    :param withinTheInterval: within Interval
    """

    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    company_processed = 0
    for changed_interval in changed_intervals:
        lower_date, upper_date = \
            interval_date_range(changed_interval, interval)

        # Base pipeline aggregation list
        pipeline_aggregated = [{
            "$match": {
                "%s_id" % resource_type:
                changed_interval['%s_id' % resource_type],
                date_field: {
                    "$lt": upper_date
                }
            }
        }, {
            "$project": {
                "id": "$%s_id" % resource_type,
                "date": "$%s" % date_field,
                "value": "$%s" % process_field
            }
        }, {
            "$sort": {
                "date": 1
            }
        }, {
            "$group": {
                "_id": {
                    "id": "$id"
                }
            }
        }, {
            "$project": {
                "_id": 0,
                'id': '$_id.id'
            }
        }]

        if withinTheInterval:
            pipeline_aggregated[0]['$match'][date_field]['$gte'] = lower_date

        # Upgrade pipeline aggregation list with aggregation operators
        for i in range(len(operators_list)):
            pipeline_aggregated[3]['$group'][field_names_list[i]] = \
                {"$" + operators_list[i]: "$value"} \
                    if (operators_list[i] != 'count') else {"$sum": 1}
            pipeline_aggregated[4]['$project'][field_names_list[i]] = \
                "$" + field_names_list[i]

        # Upgrade changed_interval document with result
        for aggregated_value in collection.aggregate(pipeline_aggregated,
                                                     allowDiskUse=True):
            changed_interval['result'] = {}
            for i in range(len(operators_list)):
                changed_interval['result'][field_names_list[i]] = \
                    aggregated_value[field_names_list[i]]

        company_processed += 1

        if company_processed % 100 == 0:
            print "Processed %d/%d companies" % (company_processed,
                                                 len(changed_intervals))

    client.close()

    return changed_intervals
Example #34
def compute_aggregations(source_db_uri, source_collection, changed_intervals,
                         process_field, interval, operators_list,
                         field_names_list, resource_type):
    """
        Generation of the pipeline aggregation string, execution
        (1 aggregated result), and returns the changed_intervals
        dictionary upgraded with the result document

        Ex.

        [{"$match": {
            "metric_name":"twitter_followers",
            "company_id": "2341231231",
            "date" {"$gte": lower_date, "$lt": upper_date}}},
         {"$project": {
              "date": "$date",
              "value": "$value"}},
         {"$sort": {"date": 1}},
         {"$group": {"_id": {"company_id": "$_id"},
                     "aggr_field_name_1": { "$last": "$value" },
                     "aggr_field_name_2": { "$sum": "$value" },
                     "aggr_field_name_3": { "$avg": "$value" }}},
         {"$project":
             {"_id": 0,
              "company_id": "$_id.company_id",
              "aggr_field_name_1": "aggr_field_name_1",
              "aggr_field_name_2": "aggr_field_name_2"
              "aggr_field_name_3": "aggr_field_name_3"}}]

         Returns a result document attached to the changed_intervals document:

         [{"company_id": "2341231231", "year": 2013, "interval": 7},
            "result": {"twitter_followers_last": 456,
                       "twitter_followers_first": 123,
                       "twitter_followers_count": 34
                       ...},
          {"company_id": "2341231231", "year": 2013, "interval": 8},
             "result": {"twitter_followers_last": 135},
          {"company_id": "2341231444", "year": 2015, "interval": 9},
             "result": {"twitter_followers_last": 1023},
          {"company_id": "2341231444", "year": 2015, "interval": 11},
             "result": {"twitter_followers_last": 1050},
         ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param process_field: field to process "twitter_followers", "twitter_bio",...
    :param changed_intervals: List of documents with the changed intervals
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param operators_list: aggregator operator list i.e. "last sum avg"
    :param field_names_list: aggregator field name list i.e
    "twitter_followers_last twitter_followers_sum twitter_followers_avg"
    """

    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    print "Processing %d changed intervals...." % len(changed_intervals)
    NUM_OF_INTERVALS_TO_INFORM = 1000
    for idx, changed_interval in enumerate(changed_intervals):
        lower_date, upper_date = \
            interval_date_range(changed_interval, interval)

        # Base pipeline aggregation list
        pipeline_aggregated = [{
            "$match": {
                "%s_id" % resource_type:
                changed_interval['%s_id' % resource_type]
            }
        }, {
            "$unwind": "$%s_ts" % process_field
        }, {
            "$project": {
                "%s_id" % resource_type:
                "$%s_id" % resource_type,
                "value":
                "$%s_ts.value" % process_field,
                "date":
                "$%s_ts.date" % process_field,
                "updated":
                "$%s_ts.updated" % process_field,
                "year": {
                    "$year": "$%s_ts.date" % process_field
                },
                "interval":
                set_field_interval("%s_ts.date" % process_field, interval)
            }
        }, {
            "$match": {
                "date": {
                    "$gte": lower_date,
                    "$lt": upper_date
                }
            }
        }, {
            "$project": {
                "date": "$date",
                "value": "$value",
                "%s_id" % resource_type: "$%s_id" % resource_type
            }
        }, {
            "$sort": {
                "date": 1
            }
        }, {
            "$group": {
                "_id": {
                    "%s_id" % resource_type: "%s_id" % resource_type
                }
            }
        }, {
            "$project": {
                "_id": 0,
                "%s_id" % resource_type: "$_id.%s_id" % resource_type
            }
        }]

        # Upgrade the $group and $project stages (indexes 6 and 7 of the
        # pipeline) with the aggregation operators and output field names
        for op, field_name in zip(operators_list, field_names_list):
            pipeline_aggregated[6]['$group'][field_name] = \
                {"$" + op: "$value"} if op != 'count' else {"$sum": 1}
            pipeline_aggregated[7]['$project'][field_name] = "$" + field_name

        # Attach the aggregation result to the changed_interval document
        for aggregated_value in collection.aggregate(pipeline_aggregated,
                                                     allowDiskUse=True):
            changed_interval['result'] = {}
            for field_name in field_names_list:
                changed_interval['result'][field_name] = \
                    aggregated_value[field_name]

        if idx != 0 and idx % NUM_OF_INTERVALS_TO_INFORM == 0:
            print "%d intervals processed" % idx

    client.close()
    return changed_intervals
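
The loop above fills in the $group and $project stages dynamically from the operator and field-name lists. A standalone sketch of that technique, with illustrative names (the URI and the "metrics" collection are assumptions, not taken from the original):

from pymongo import MongoClient

# Illustrative inputs mirroring operators_list / field_names_list above.
operators = ["last", "sum", "avg", "count"]
field_names = ["followers_last", "followers_sum",
               "followers_avg", "followers_count"]

group_stage = {"$group": {"_id": "$company_id"}}
project_stage = {"$project": {"_id": 0, "company_id": "$_id"}}
for op, name in zip(operators, field_names):
    # "count" has no accumulator of its own, so express it as {"$sum": 1}
    group_stage["$group"][name] = \
        {"$" + op: "$value"} if op != "count" else {"$sum": 1}
    project_stage["$project"][name] = "$" + name

client = MongoClient("mongodb://localhost/databasename")
collection = client.get_default_database()["metrics"]  # assumed name
# $last depends on document order, hence the $sort before $group
pipeline = [{"$sort": {"date": 1}}, group_stage, project_stage]
results = list(collection.aggregate(pipeline, allowDiskUse=True))
client.close()
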
def doEverything():
    # Get database connectivity information
    database, url = getDatabaseInfo()

    # Run test
    client = None  # so the finally block is safe if the connection fails
    try:
        client = MongoClient(url)
        db = client[database]
        
        output = []
        output.append("Starting database test.... ")
        collectionName = "pythonMongo"
        output.append("Creating collection " + collectionName)
        collection = db[collectionName]
        
        #insert 1
        output.append("# 1 Inserts")
        output.append("# 1.1 Insert a single document to a collection")
        collection.insert({"name": "test1", "value": 1})
        output.append("Inserted {\"name\": \"test1\", \"value\": 1}")
        
        #insert many
        output.append("#1.2 Inserting multiple entries into collection")
        multiPost = [{"name": "test1", "value": 1},{"name": "test2", "value": 2}, {"name": "test3", "value": 3}] 
        collection.insert_many(multiPost)
        output.append("Inserted \n {\"name\": \"test1\", \"value\": 1} \n {\"name\": \"test2\", \"value\": 2} \n {\"name\": \"test3\", \"value\": 3}")
         
        # Find 
        output.append("#2 Queries")
        output.append("#2.1 Find one that matches a query condition")
        output.append(collection.find_one({"name": "test1"}))
         
        # Find all 
        output.append("#2.2 Find all that match a query condition")
        for doc in collection.find({"name": "test1"}):
            output.append(doc)
        
        # Display all documents
        output.append( "#2.3 Find all documents in collection")
        for doc in collection.find():
            output.append(doc)   
        
        # update document
        output.append("#3 Updating Documents")
        collection.update({"name": "test3"}, {"$set": { "value": 4}})
        output.append("Updated test3 with value 4")
         
        # delete document
        output.append("#4 Delete Documents")
        collection.remove({"name": "test2"})  
        output.append("Deleted all with name test2")
        
        # Display all collection names
        output.append("#5 Get a list of all of the collections")
        output.append(db.list_collection_names())
        
        output.append("#6 Drop a collection")
        db.drop_collection(collectionName)
    
    except Exception as e:
        logging.exception(e) 
        output.append("EXCEPTION (see log for details): " + str(e))
    finally:
        if client is not None:
            client.close()
            output.append("Connection to database has been closed")

    return output
Exemplo n.º 36
0
class DBServer():
    def __init__(self, collection_name, table):
        # note: `collection_name` is used as the *database* name below
        self.collection_name = collection_name
        self.table_name = table
        cfg = DBUtils.read_config()
        url = "mongodb://{0}:{1}/".format(cfg.host, cfg.port)
        logger.debug("The db url: %s", url)
        self._client = MongoClient(url)
        # can only authenticate against the admin database
        username, password = getattr(cfg, 'username', ''), getattr(cfg, 'password', '')
        if username and password:
            self._client.admin.authenticate(username, password)
        db = self._client[collection_name]
        self.table = db[table]

    @log
    def query(self, pattern=None, **condition):
        """
        :param pattern: dict pattern suitable
        :param condition: condition to pagination
        :return:suitable information
        """
        if pattern is None:
            # if pattern is None,the conditions must be empty
            values = self.table.find().limit(20)
            return convert_list(values)
        if condition is None or len(condition) <= 0:
            # if condition is empty
            values = self.table.find(pattern)
            return convert_list(values)
        dcu = DefaultConditionUtil(**condition)
        if dcu.get_complex_expression() is not None:
            pattern = pattern.update(**dcu.get_complex_expression())
        ret = self.table.find(pattern).skip(dcu.get_offset()).limit(dcu.get_limit()).sort(dcu.sort_style())
        return convert_list(ret)

    @log
    def find_one(self, **condition):
        one = self.table.find_one(condition)
        return convert_dict(one)

    @log
    def find_all(self):
        """
        :return: all information
        """
        values = self.table.find()
        return convert_list(values)

    @log
    def insert(self, obj):
        self.table.insert(obj)

    @log
    def delete(self, pattern=None, **condition):
        """
        :param id: remove obj id
        :param pattern: pattern data
        :param condition:suitable data
        :return:nothing
        """
        if pattern is None:
            return
        self.table.remove(pattern)

    @log
    def update(self, pattern=None, **new):
        """
        :param id: update id
        :param other: condition
        :param new: new data
        :return:
        """
        self.table.update(pattern, new)

    @log
    def query_by_complex_condition(self, code):
        """
        :param code: query information via complex condition use javascript code
        :return:
        """
        values = self.table.find().where(code)
        return convert_list(values)

    @log
    def count(self, id=None, **conditions):
        if id is None and not conditions:
            return {'count': self.table.find().count()}
        if not conditions:
            return {'count': self.table.find(dict(id=id)).count()}
        if id is None:
            return {'count': self.table.find(conditions).count()}
        conditions['id'] = id
        return {'count': self.table.find(conditions).count()}

    @log
    def close(self):
        self._client.close()

    def __del__(self):
        self._client.close()
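
A minimal usage sketch for DBServer, assuming a config file readable by DBUtils.read_config(); the database/collection names are hypothetical, and the offset/limit keywords are inferred from the get_offset()/get_limit() calls above, not confirmed:

server = DBServer('moviedb', 'movies')  # hypothetical names
try:
    server.insert({'title': 'Alien', 'year': 1979})
    one = server.find_one(title='Alien')
    # assumed pagination keywords understood by DefaultConditionUtil
    page = server.query({'year': {'$gte': 1970}}, offset=0, limit=10)
finally:
    server.close()
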
Exemplo n.º 37
0
class GridFSOperations(Operations):

    def __init__(self, host, db_name='test', collection_name='fs'):
        self.client = MongoClient(host)
        self.db = Database(self.client, db_name)
        self.fs = GridFS(self.db, collection_name)

    def _new_file(self, name):
        return self.fs.new_file(
            filename=name,
            aliases=[],
            length=0,
            upload_date=datetime.now())

    @logmethod
    def init(self):
        pass

    @logmethod
    def access(self, inode, mode, ctx):
        return True

    @logmethod
    def getattr(self, inode):
        if inode == 1:
            return Operations.getattr(self, inode)
        else:
            return grid2attrs(self.fs.get(int2oid(inode)))

    @logmethod
    def lookup(self, parent_inode, name):

        if parent_inode != 1:
            raise FUSEError(errno.ENOENT)

        try:
            gridout = self.fs.get_last_version(filename=name.decode())
        except NoFile:
            raise FUSEError(errno.ENOENT)

        return grid2attrs(gridout)

    @logmethod
    def create(self, inode_parent, name, mode, flags, ctx):
        gridin = self._new_file(name.decode())
        fh = oid2int(gridin._id)
        grid_cache[fh] = gridin
        return (fh, grid2attrs(gridin))

    @logmethod
    def flush(self, fh):
        grid = grid_cache[fh]
        grid.close()

    @logmethod
    def setattr(self, inode, attr):
        gridout = self.fs.get(int2oid(inode))
        return grid2attrs(gridout)

    @logmethod
    def release(self, fh):
        del grid_cache[fh]

    @logmethod
    def forget(self, inode_list):

        for inode in inode_list:
            if inode in oid_cache.ints:
                del oid_cache.ints[inode]

    @logmethod
    def destroy(self):
        self.client.close()

    @logmethod
    def open(self, inode, flags):
        gridout = self.fs.get(int2oid(inode))
        grid_cache[inode] = gridout
        return inode

    @logmethod
    def read(self, fh, off, size):
        grid = grid_cache[fh]

        if isinstance(grid, GridIn):
            grid.close()
            grid = self.fs.get(int2oid(fh))
            grid_cache[fh] = grid

        grid.seek(off)
        return grid.read(size)

    @logmethod
    def write(self, fh, off, buf):
        grid = grid_cache[fh]

        # GridFS files are immutable once written, so emulate an in-place
        # write by copying the existing prefix into a fresh file version
        if isinstance(grid, GridOut):
            offbuf = grid.read(off)
            grid = self._new_file(name=grid.name)
            grid_cache[fh] = grid
            grid.write(offbuf)
            del offbuf

        # a closed GridIn cannot accept more data; start a new version
        if grid.closed:
            grid = self._new_file(name=grid.name)
            grid_cache[fh] = grid

        grid.write(buf)
        return len(buf)

    @logmethod
    def unlink(self, parent_inode, name):

        if parent_inode != 1:
            Operations.unlink(self, parent_inode, name)
        else:
            for gridout in self.fs.find({'filename': name.decode()}):
                self.fs.delete(gridout._id)

    @logmethod
    def fsync(self, fh, datasync):
        Operations.fsync(self, fh, datasync)

    @logmethod
    def fsyncdir(self, fh, datasync):
        Operations.fsyncdir(self, fh, datasync)

    @logmethod
    def getxattr(self, inode, name):
        Operations.getxattr(self, inode, name)

    @logmethod
    def link(self, inode, new_parent_inode, new_name):
        Operations.link(self, inode, new_parent_inode, new_name)

    @logmethod
    def listxattr(self, inode):
        Operations.listxattr(self, inode)

    @logmethod
    def mkdir(self, parent_inode, name, mode, ctx):
        Operations.mkdir(self, parent_inode, name, mode, ctx)

    @logmethod
    def mknod(self, parent_inode, name, mode, rdev, ctx):
        Operations.mknod(self, parent_inode, name, mode, rdev, ctx)

    @logmethod
    def opendir(self, inode):
        Operations.opendir(self, inode)

    @logmethod
    def readdir(self, fh, off):
        Operations.readdir(self, fh, off)

    @logmethod
    def readlink(self, inode):
        Operations.readlink(self, inode)

    @logmethod
    def releasedir(self, fh):
        Operations.releasedir(self, fh)

    @logmethod
    def removexattr(self, inode, name):
        Operations.removexattr(self, inode, name)

    @logmethod
    def rename(self, inode_parent_old, name_old, inode_parent_new, name_new):
        Operations.rename(self,
            inode_parent_old, name_old, inode_parent_new, name_new)

    @logmethod
    def rmdir(self, inode_parent, name):
        Operations.rmdir(self, inode_parent, name)

    @logmethod
    def setxattr(self, inode, name, value):
        Operations.setxattr(self, inode, name, value)

    @logmethod
    def statfs(self):
        Operations.statfs(self)

    @logmethod
    def symlink(self, inode_parent, name, target, ctx):
        Operations.symlink(self, inode_parent, name, target, ctx)
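
A short standalone sketch of the GridFS behaviour the class above relies on: files are immutable and versioned, so "updating" a file means writing a new version and reading back the latest (the connection string and names are illustrative):

from gridfs import GridFS
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
fs = GridFS(client['test'], collection='fs')

# each put() stores a new, independent version of "hello.txt"
fs.put(b'first version', filename='hello.txt')
fs.put(b'second version', filename='hello.txt')

# get_last_version() resolves a filename to its newest version,
# which is exactly what lookup() above uses
print(fs.get_last_version('hello.txt').read())  # b'second version'

client.close()
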
Exemplo n.º 38
0
class MovieSpider(CrawlSpider):
    # called when the spider object is initialized
    def __init__(self):
        # call the parent class constructor
        super().__init__()
        # connect to the mongodb database
        self.client = MongoClient("localhost", 27017)
        # create or open the urls collection
        self.url_connection = self.client['moviedb']['urls']

    # called back when the spider object is destroyed
    def __del__(self):
        self.client.close()

    name = 'mv'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/frim/index1.html']
    link = LinkExtractor(allow=r'/frim/index1-\d+\.html')
    rules = (Rule(link, callback='parse_item', follow=False), )

    # parse each page of the listing and extract the movie detail links
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            detail_url = "http://www.4567kan.com" + li.xpath(
                './div/a/@href').extract_first()
            # print(detail_url)

            # check whether the urls collection already contains this detail url
            if self.url_connection.count_documents({"url": detail_url}) == 0:
                # this url has not been visited yet
                print("This url has not been visited; crawling its data...")
                # save the current url into the urls collection
                self.url_connection.insert_one({"url": detail_url})
                # issue a new request to extract the movie detail page
                yield scrapy.Request(url=detail_url,
                                     callback=self.parse_detail)
            else:
                # this url has already been visited
                print("This url has already been visited; skipping it")
            # yield scrapy.Request(url=detail_url, callback=self.parse_detail)

    # parse the movie detail page: extract the movie name and description
    def parse_detail(self, response):
        # get the movie name
        name = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()

        # get the movie synopsis; the text is split across several nodes,
        # so extract them all and join into one string
        desc = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()'
        ).extract()
        desc = ''.join(desc)
        print(f"Movie name: {name}\nMovie synopsis: {desc}")

        item = MovieprojectItem()
        item['name'] = name
        item['desc'] = desc
        yield item
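
The check-then-insert in parse_item has a race window between the lookup and the insert. A common alternative (a sketch, not part of the original spider) is to put a unique index on "url" and let MongoDB enforce deduplication atomically:

from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

client = MongoClient("localhost", 27017)
urls = client['moviedb']['urls']
# a unique index makes duplicate inserts fail atomically
urls.create_index("url", unique=True)

def mark_seen(url):
    """Return True if url was new, False if it was already stored."""
    try:
        urls.insert_one({"url": url})
        return True
    except DuplicateKeyError:
        return False
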
Exemplo n.º 39
0
class MongoController:
    """
    The main MongoDB controller class.
    Attributes:
    port - the port for the MongoDB service.
    temp_dir - the location of the MongoDB data and logs.
    client - a pymongo client pointed at the server.
    db_version - the version of the mongod executable.
    index_version - the version of the indexes created by the mongod executable - 1 for < 3.4.0,
        2 otherwise.
    includes_system_indexes - true if system indexes will be included when listing database
        indexes, false otherwise.
    """

    def __init__(self, mongoexe: Path, root_temp_dir: Path, use_wired_tiger: bool=False) -> None:
        '''
        Create and start a new MongoDB database. An unused port will be selected for the server.
        :param mongoexe: The path to the MongoDB server executable (e.g. mongod) to run.
        :param root_temp_dir: A temporary directory in which to store MongoDB data and log files.
            The files will be stored inside a child directory that is unique per invocation.
        :param use_wired_tiger: For MongoDB versions > 3.0, specify that the Wired Tiger storage
            engine should be used. Setting this to true for other versions will cause an error.
        '''
        if not mongoexe or not os.access(mongoexe, os.X_OK):
            raise test_util.TestException('mongod executable path {} does not exist or is not executable.'
                                .format(mongoexe))
        if not root_temp_dir:
            raise ValueError('root_temp_dir is None')

        # make temp dirs
        root_temp_dir = root_temp_dir.absolute()
        os.makedirs(root_temp_dir, exist_ok=True)
        self.temp_dir = Path(tempfile.mkdtemp(prefix='MongoController-', dir=str(root_temp_dir)))
        data_dir = self.temp_dir.joinpath('data')
        os.makedirs(data_dir)

        self.port = test_util.find_free_port()

        command = [str(mongoexe), '--port', str(self.port), '--dbpath', str(data_dir),
                   '--nojournal']
        if use_wired_tiger:
            command.extend(['--storageEngine', 'wiredTiger'])

        self._outfile = open(self.temp_dir.joinpath('mongo.log'), 'w')

        self._proc = subprocess.Popen(command, stdout=self._outfile, stderr=subprocess.STDOUT)
        time.sleep(1)  # wait for server to start up
        self.client = MongoClient('localhost', self.port)
        # check that the server is up. See
        # https://api.mongodb.com/python/3.7.0/api/pymongo/mongo_client.html
        #    #pymongo.mongo_client.MongoClient
        self.client.admin.command('ismaster')

        # get some info about the db
        self.db_version = self.client.server_info()['version']
        self.index_version = 2 if (semver.compare(self.db_version, '3.4.0') >= 0) else 1
        self.includes_system_indexes = (semver.compare(self.db_version, '3.2.0') < 0
                                        and not use_wired_tiger)

    def destroy(self, delete_temp_files: bool) -> None:
        """
        Shut down the MongoDB server.
        :param delete_temp_files: delete all the MongoDB data files and logs generated during the
            test.
        """
        if self.client:
            self.client.close()
        if self._proc:
            self._proc.terminate()
        if self._outfile:
            self._outfile.close()
        if delete_temp_files and self.temp_dir:
            shutil.rmtree(self.temp_dir)

    def clear_database(self, db_name, drop_indexes=False):
        '''
        Remove all data from a database.
        :param db_name: the name of the db to clear.
        :param drop_indexes: drop all indexes if true, retain indexes (which will be empty) if
            false.
        '''
        if drop_indexes:
            self.client.drop_database(db_name)
        else:
            db = self.client[db_name]
            for name in db.list_collection_names():
                if not name.startswith('system.'):
                    # don't drop collection since that drops indexes
                    db.get_collection(name).delete_many({})
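
A minimal usage sketch for MongoController; the mongod path, temp directory, and database name are illustrative:

from pathlib import Path

controller = MongoController(Path('/usr/bin/mongod'), Path('/tmp/mongo-tests'))
try:
    print('mongod', controller.db_version, 'on port', controller.port)
    db = controller.client['test_db']  # hypothetical database name
    db['test_collection'].insert_one({'x': 1})
    controller.clear_database('test_db')
finally:
    controller.destroy(delete_temp_files=True)
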
# -*- coding:utf-8 -*-
from time import sleep

from pymongo.mongo_client import MongoClient


try:
    con = MongoClient("192.168.0.88")
    db = con.xe
    f = open("d:/lee/naverNews.txt", "a", encoding="utf-8")
    
    for nn in db.naverNews.find():
        f.write(str(nn["m"]) + "\t")
        f.write(str(nn["d"]) + "\t")
        f.write(str(nn["h"]) + "\t")
        f.write(nn["t"] + "\t")
        f.write(nn["desc"] + "\n")
    
    print("끝")
    f.close()
    con.close()
except Exception as e:
    print(e)
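
The export above writes tab-separated fields by hand; a sketch of the same dump using the csv module instead (the field names m/d/h/t/desc and the output path are taken from the loop above):

import csv

from pymongo.mongo_client import MongoClient

con = MongoClient("192.168.0.88")
try:
    with open("d:/lee/naverNews.txt", "a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        for nn in con.xe.naverNews.find():
            writer.writerow([nn["m"], nn["d"], nn["h"], nn["t"], nn["desc"]])
finally:
    con.close()
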
def doEverything():
    # Get database connectivity information
    database, url = getDatabaseInfo()

    # Run test
    client = None  # so the finally block is safe if the connection fails
    try:
        client = MongoClient(url)
        db = client[database]
        
        output = []
        collectionName = "pythonMongoGalaxy"
        joinCollectionName = "pyJoin"
        cityTableName = "cityTable"
        codeTableName = "codeTable"
        
        output.append("# 1 Data Structures")
        output.append("# 1.1 Create a collection")
        output.append("Creating collection " + collectionName + " " + joinCollectionName)
        collection = db[collectionName]
        joinCollection = db[joinCollectionName]
        
        output.append("# 1.2 Create a table")
        output.append("Creating tables " + codeTableName + " " + cityTableName)
         
        db.command({"create" : codeTableName, "columns":[{"name":"countryCode","type":"int"},
                                                                    {"name": "countryName", "type": "varchar(50)"}]})
                  
        db.command({"create" : cityTableName, "columns":[{"name":"name","type":"varchar(50)"},
                                                                    {"name": "population", "type": "int"}, {"name": "longitude", "type": "decimal(8,4)"},
                                                                    {"name": "latitude", "type": "decimal(8,4)"}, {"name": "countryCode", "type": "int"}]})
        
        
        #insert 1
        output.append("# 1 Inserts")
        output.append("# 1.1 Insert a single document to a collection")
        collection.insert(kansasCity.toJSON())
        output.append("Inserted" )
        output.append(kansasCity.toJSON())
        
        #insert many
        output.append("#1.2 Inserting multiple entries into collection")
        multiPost = [seattle.toJSON(), newYork.toJSON(), london.toJSON(), tokyo.toJSON(), madrid.toJSON()] 
        collection.insert(multiPost)
        output.append("Inserted \n%s \n%s \n%s \n%s \n%s" % (seattle.toJSON(), newYork.toJSON(), london.toJSON(), tokyo.toJSON(), madrid.toJSON()))
        
        # # Find 
        output.append("\n#2 Queries")
        output.append("#2.1 Find one that matches a query condition")
        output.append(collection.find_one({"name": kansasCity.name}))
         
        # Find all 
        output.append("#2.2 Find all that match a query condition")
        for doc in collection.find({"longitude": {"$gt" : "40.0"}}):
            output.append(doc)
         
        # Display all documents
        output.append("#2.3 Find all documents in collection")
        for doc in collection.find():
            output.append(doc)
            
        #Count     
        output.append("#2.4 Count documents in collection")
        num = collection.find({"population": {"$lt" : 8000000}}).count()
        output.append("There are %d documents with a population less than 8 million" % num)
        
        #Order 
        output.append("#2.5 Order documents in collection")
        for doc in collection.find().sort("population", -1):
            output.append(doc)    
         
        # Distinct
        output.append("#2.6 Find distinct codes in collection")
        for doc in collection.distinct("countryCode"):
            output.append(doc)
            
        #Joins
        output.append("#2.7 Joins")
        sys = db["system.join"]
         
        joinCollection.insert({"countryCode": 1, "countryName": "United States of America" })
        joinCollection.insert({"countryCode": 44, "countryName": "United Kingdom" })
        joinCollection.insert({"countryCode": 81, "countryName": "Japan" })
        joinCollection.insert({"countryCode": 34, "countryName": "Spain" })
        joinCollection.insert({"countryCode": 61, "countryName": "Australia" })
         
        codeTable = db[codeTableName]
        codeTable.insert({"countryCode": 1}, {"countryName": "United State of America"})
        codeTable.insert({"countryCode": 44 }, {"countryName": "United Kingdom"})
        codeTable.insert({"countryCode": 81 }, {"countryName": "Japan"})
        codeTable.insert({"countryCode": 34 }, {"countryName": "Spain"})
        codeTable.insert({"countryCode": 61 }, {"countryName": "Australia"})
         
        cityTable = db[cityTableName]
        cityTable.insert(kansasCity.toJSON())
        cityTable.insert(multiPost)
         
        output.append("#2.7a Join collection-collection")
        joinCollectionCollection = { "$collections" : { collectionName : { "$project" : { "name" : 1 , "population" : 1 , "longitude" : 1 , "latitude" : 1}} , 
                                                       joinCollectionName : { "$project" : { "countryCode" : 1 , "countryName" : 1}}} , 
                                    "$condition" : { "pythonMongoGalaxy.countryCode": "pyJoin.countryCode"}}
        for doc in sys.find(joinCollectionCollection):
            output.append(doc)
              
        output.append("#2.7b Join table-collection")
        joinTableCollection = { "$collections" : { cityTableName : { "$project" : { "name" : 1 , "population" : 1 , "longitude" : 1 , "latitude" : 1}} , 
                                                  joinCollectionName : { "$project" : { "countryCode" : 1 , "countryName" : 1}}} , 
                               "$condition" : { "cityTable.countryCode": "pyJoin.countryCode"}}
        for doc in sys.find(joinTableCollection):
            output.append(doc)
              
        output.append("#2.7c Join table-table")
        joinTableTable= { "$collections" : { cityTableName : { "$project" : { "name" : 1 , "population" : 1 , "longitude" : 1 , "latitude" : 1}} ,
                                                   codeTableName : { "$project" : { "countryCode" : 1 , "countryName" : 1}}} , 
                               "$condition" : { "cityTable.countryCode": "codeTable.countryCode"}}
          
        for doc in sys.find(joinTableTable):
            output.append(doc)
         
        
        output.append("#2.8 Changed Batch Size")
        # docs = collection.find().batch_size(2)
        # for doc in docs:
        #     output.append(doc)
        
        output.append("#2.9 Projection clause")
        output.append("Displaying results without longitude and latitude:")
        for doc in collection.find({"countryCode" : 1}, {"longitude":0, "latitude" : 0}):
            output.append(doc)
        
        # update document
        output.append("\n#3 Update Documents")
        collection.update({"name": seattle.name}, {"$set": { "countryCode": 999}})
        output.append("Updated %s with countryCode 999" % seattle.name)
        
        # delete document
        output.append("\n#4 Delete Documents")
        collection.remove({"name": tokyo.name})  
        output.append("Deleted all with name %s" % tokyo.name)
        
        # Display all collection names
        output.append("\n#5 Get a list of all of the collections")
        output.append( db.collection_names())
        
        #SQL Passthrough
        output.append("\n#6 SQL passthrough")
        sql = db["system.sql"]
        query = {"$sql": "create table town (name varchar(255), countryCode int)"}
        for doc in sql.find(query):
            output.append(doc)
        
        query = {"$sql": "insert into town values ('Lawrence', 1)"}
        for doc in sql.find(query):
            output.append(doc)
        
        query = {"$sql": "drop table town"}
        for doc in sql.find(query):
            output.append(doc)
        
        #Transactions
        output.append("\n#7 Transactions")
        db.command({"transaction": "enable"})
        collection.insert(sydney.toJSON())
        db.command({"transaction": "commit"})
          
        collection.insert(melbourne.toJSON())
        db.command({"transaction": "rollback"})
        db.command({"transaction": "disable"})
        
        for doc in collection.find():
            output.append(doc)
        
        output.append("\n#8 output")
        
        output.append("#8.1 Count")
        count = db.command("count", collectionName)
        output.append("There are %d documents in the collection" % count['n'])
        
        output.append("#8.2 Distinct")
        distinct = db.command("distinct", collectionName, key="countryCode")
        output.append("The distinct country codes are %s" % distinct['values'])
        
        
        output.append("#8.3 collection names ")
        output.append(db.collection_names())
        
        
        output.append("#8.3 Database stats")
        output.append(db.command("dbstats"))
        
        output.append("#8.4 Collection stats")
        output.append(db.command("collstats", collectionName))
        
        
        output.append("\n#9 Drop a collection")
        db.drop_collection(collectionName)
        db.drop_collection(joinCollectionName)
        db.drop_collection(cityTableName)
        db.drop_collection(codeTableName)
    except Exception as e:
        logging.exception(e) 
        output.append("EXCEPTION (see log for details): " + str(e))
    finally:
        if client is not None:
            client.close()
            output.append("Connection to database has been closed")

    return output