def buy_item_from_store(self, user_id, item_id):
    item_info = self.get_item_info(item_id)
    if item_info is None:
        return f"There is no item found for item id {item_id}"
    print("Item info ", item_info)
    user_coins = self.get_user_coins(user_id)
    print("User Coins", user_coins)
    item_cost = item_info['cost']
    if user_coins >= item_cost:
        mongo = MongoClient(MONGO_CONNECTION_STRING)
        mongo.beanbot.users.update_one(
            {'user_id': user_id},
            {"$inc": {"beano_coin_count": -item_cost}},
            upsert=True)
        self.give_item_to_player(item_info, user_id)
        mongo.close()
        return f"You purchased a {item_info['name']}. You have {user_coins - item_cost} BC remaining!"
    else:
        return f"You do not have enough coins to purchase this item. It costs {item_cost} and you have {user_coins}."
def stash(results):
    """Stash the results in the MongoDB database."""
    summary = {}
    mongo = MongoClient(**config.mongo)
    try:
        for item_model, objs in results:
            collection_name = item_model['name']
            db = mongo.get_database('theforce')
            collection = db.get_collection(collection_name)
            collection.insert_many(objs)
            summary[collection_name] = (
                len(objs) if collection_name not in summary
                else len(objs) + summary[collection_name])
        print
        print "=" * 40
        print ' ' * 15, u'Stash'
        print "=" * 40
        print
        print u"Data saved successfully to the 'theforce' database in MongoDB. New records:"
        for name, length in summary.items():
            print name, length
    finally:
        mongo.close()
def test_ipv6(self):
    c = MongoClient("mongodb://[::1]:%d" % (port,), replicaSet=self.name)

    # Client switches to IPv4 once it has first ismaster response.
    msg = 'discovered primary with IPv4 address "%r"' % (self.primary,)
    wait_until(lambda: c.primary == self.primary, msg)

    # Same outcome with both IPv4 and IPv6 seeds.
    c = MongoClient("[::1]:%d,localhost:%d" % (port, port),
                    replicaSet=self.name)
    wait_until(lambda: c.primary == self.primary, msg)

    if client_context.auth_enabled:
        auth_str = "%s:%s@" % (db_user, db_pwd)
    else:
        auth_str = ""

    uri = "mongodb://%slocalhost:%d,[::1]:%d" % (auth_str, port, port)
    client = MongoClient(uri, replicaSet=self.name)
    client.pymongo_test.test.insert_one({"dummy": u("object")})
    client.pymongo_test_bernie.test.insert_one({"dummy": u("object")})

    dbs = client.database_names()
    self.assertTrue("pymongo_test" in dbs)
    self.assertTrue("pymongo_test_bernie" in dbs)
    client.close()
def get_store_inventory(self):
    mongo = MongoClient(MONGO_CONNECTION_STRING)
    # Materialize the cursor before closing the client; iterating a live
    # cursor after close() would fail.
    response = list(mongo.beanbot.store_items.find(
        {"active": True}).sort("item_id"))
    mongo.close()
    return response
class AssignmentPipeline(object):
    db_name = 'jd'
    collection_name = 'meidi'

    def open_spider(self, spider):
        self.client = MongoClient()
        self.db = self.client[self.db_name]
        file = './comments.csv'
        self.fp = open(file, 'w+')
        headers = ['creationTime', 'user', 'referenceName', 'content']
        self.csv_file = csv.DictWriter(self.fp, headers, extrasaction='ignore')
        self.csv_file.writeheader()

    def process_item(self, item, spider):
        item_dict = dict(item)
        self.db[self.collection_name].insert_one(item_dict)
        self.csv_file.writerow(item_dict)
        return item

    def close_spider(self, spider):
        self.client.close()
        self.fp.close()
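# Usage sketch: Scrapy only invokes the pipeline above when it is registered
# in settings.py. The module path "jd.pipelines" is an assumption for
# illustration (it is not shown in the original code); 300 is an arbitrary
# middle priority.
ITEM_PIPELINES = {
    'jd.pipelines.AssignmentPipeline': 300,
}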
def update_beano_stats(self, update_json):
    print("Updating Beano stats in Mongo")
    mongo = MongoClient(MONGO_CONNECTION_STRING)
    # insert_one is the PyMongo 3+ replacement for the removed insert().
    stats = mongo.beanbot.beano_data.insert_one(update_json)
    mongo.close()
    print(stats)
    return stats
def get_beano_stats(self):
    print("Pulling Stats from Mongo")
    mongo = MongoClient(MONGO_CONNECTION_STRING)
    # Guard against a missing document: defaultdict(int, None) would raise.
    stats = defaultdict(int, mongo.beanbot.beano_data.find_one() or {})
    mongo.close()
    print(stats)
    return stats
def get_user_coins(self, user_id):
    print("Pulling Coin Amount for ", user_id)
    mongo = MongoClient(MONGO_CONNECTION_STRING)
    # Guard against an unknown user: defaultdict(int, None) would raise.
    stats = defaultdict(
        int, mongo.beanbot.users.find_one({"user_id": user_id}) or {})
    mongo.close()
    return stats['beano_coin_count']
def get_item_info(self, item_id):
    mongo = MongoClient(MONGO_CONNECTION_STRING)
    print("Pulling info for Item ID", item_id)
    response = mongo.beanbot.store_items.find_one({
        "active": True,
        "item_id": int(item_id)
    })
    mongo.close()
    return response
def doEverything():
    # certfile = '/home/bryan/Downloads/baratheon.pem'
    conn = MongoClient(url)
    db = conn[database]
    commands = []
    collectionName = "pythonMongo"
    commands.append("Creating collection " + collectionName)
    collection = db[collectionName]

    # insert 1
    commands.append("# 1 Inserts")
    commands.append("# 1.1 Insert a single document to a collection")
    collection.insert({"name": "test1", "value": 1})
    commands.append("Inserted {\"name\": \"test1\", \"value\": 1}")

    # insert many
    commands.append("#1.2 Inserting multiple entries into collection")
    multiPost = [{"name": "test1", "value": 1},
                 {"name": "test2", "value": 2},
                 {"name": "test3", "value": 3}]
    collection.insert(multiPost)
    commands.append("Inserted \n {\"name\": \"test1\", \"value\": 1} \n {\"name\": \"test2\", \"value\": 2} \n {\"name\": \"test3\", \"value\": 3}")

    # Find
    commands.append("#2 Queries")
    commands.append("#2.1 Find one that matches a query condition")
    commands.append(collection.find_one({"name": "test1"}))

    # Find all
    commands.append("#2.2 Find all that match a query condition")
    for doc in collection.find({"name": "test1"}):
        commands.append(doc)

    # Display all documents
    commands.append("#2.3 Find all documents in collection")
    for doc in collection.find():
        commands.append(doc)

    # update document
    commands.append("#3 Updating Documents")
    collection.update({"name": "test3"}, {"$set": {"value": 4}})
    commands.append("Updated test3 with value 4")

    # delete document
    commands.append("#4 Delete Documents")
    collection.remove({"name": "test2"})
    commands.append("Deleted all with name test2")

    # Display all collection names
    commands.append("#5 Get a list of all of the collections")
    commands.append(db.collection_names())

    commands.append("#6 Drop a collection")
    db.drop_collection(collectionName)

    conn.close()
    commands.append("Connection to database has been closed")
    return commands
def give_user_coins(self, user_id, coin_amount):
    print("Giving {} coins to {}".format(coin_amount, user_id))
    mongo = MongoClient(MONGO_CONNECTION_STRING)
    # Increment by the requested amount; the original hard-coded 1, which
    # contradicted the print statement above.
    mongo.beanbot.users.update_one(
        {'user_id': user_id},
        {"$inc": {"beano_coin_count": coin_amount}},
        upsert=True)
    mongo.close()
    return None
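# Usage sketch tying the beanbot store helpers together. `bot` is assumed to
# be an instance of the (unnamed) class that defines these methods, and the
# user/item IDs are illustrative:
#
#     bot.give_user_coins("U123", 50)            # credit 50 BC
#     print(bot.get_user_coins("U123"))          # -> 50
#     print(bot.buy_item_from_store("U123", 1))  # debits cost, grants item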
def test_properties(self):
    c = client_context.rs_client
    c.admin.command('ping')

    wait_until(lambda: c.primary == self.primary, "discover primary")
    wait_until(lambda: c.arbiters == self.arbiters, "discover arbiters")
    wait_until(lambda: c.secondaries == self.secondaries,
               "discover secondaries")

    self.assertEqual(c.primary, self.primary)
    self.assertEqual(c.secondaries, self.secondaries)
    self.assertEqual(c.arbiters, self.arbiters)
    self.assertEqual(c.max_pool_size, 100)

    # Make sure MongoClient's properties are copied to Database and
    # Collection.
    for obj in c, c.pymongo_test, c.pymongo_test.test:
        self.assertEqual(obj.codec_options, CodecOptions())
        self.assertEqual(obj.read_preference, ReadPreference.PRIMARY)
        self.assertEqual(obj.write_concern, WriteConcern())

    cursor = c.pymongo_test.test.find()
    self.assertEqual(ReadPreference.PRIMARY,
                     cursor._Cursor__read_preference)

    tag_sets = [{'dc': 'la', 'rack': '2'}, {'foo': 'bar'}]
    secondary = Secondary(tag_sets=tag_sets)
    c = MongoClient(pair, replicaSet=self.name, maxPoolSize=25,
                    document_class=SON, tz_aware=True,
                    read_preference=secondary,
                    localThresholdMS=77, j=True)

    self.assertEqual(c.max_pool_size, 25)

    for obj in c, c.pymongo_test, c.pymongo_test.test:
        self.assertEqual(obj.codec_options, CodecOptions(SON, True))
        self.assertEqual(obj.read_preference, secondary)
        self.assertEqual(obj.write_concern, WriteConcern(j=True))

    cursor = c.pymongo_test.test.find()
    self.assertEqual(secondary, cursor._Cursor__read_preference)

    nearest = Nearest(tag_sets=[{'dc': 'ny'}, {}])
    cursor = c.pymongo_test.get_collection(
        "test", read_preference=nearest).find()
    self.assertEqual(nearest, cursor._Cursor__read_preference)

    self.assertEqual(c.max_bson_size, 16777216)
    c.close()
class MovieSpider(CrawlSpider):
    # Called when the spider object is initialized.
    def __init__(self):
        # Call the parent class initializer (the original passed self twice
        # via super().__init__(self)).
        super().__init__()
        # Connect to the database and create a client object.
        self.client = MongoClient('localhost', 27017)
        self.url_connection = self.client['moviedb']['urls']

    # Called back when the spider object is destroyed.
    def __del__(self):
        self.client.close()

    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/frim/index1.html']

    link_1 = LinkExtractor(allow=r'http://www\.4567kan\.com/frim/index1\.html')
    link = LinkExtractor(
        allow=r'http://www\.4567kan\.com/frim/index1-\d+\.html')
    rules = (
        Rule(link_1, callback='parse_item', follow=False),
        Rule(link, callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # print(response.request.url)
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            self.detail_url = "http://www.4567kan.com" + li.xpath(
                './div/a/@href').extract_first()
            self.title = li.xpath('./div/a/@title').extract_first()
            # print('Title: ' + self.title, 'URL: ' + self.detail_url)
            cursor = self.url_connection.find({'url': self.detail_url})
            if cursor.count() == 0:
                print('%s has not been visited; crawling its data...'
                      % self.detail_url)
                self.url_connection.insert_one({"url": self.detail_url})
                # Issue a new request for this url's movie detail page.
                yield scrapy.Request(url=self.detail_url,
                                     callback=self.parse_detail)
            else:
                print("%s has already been visited; skipping"
                      % self.detail_url)

    def parse_detail(self, response):
        item = MovieprojectItem()
        item['name'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        item['desc'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()'
        ).extract()
        item['desc'] = ''.join(item['desc'])
        yield item
def checkmongodbconnection():
    try:
        c = MongoClient(MONGO_URI,
                        server_api=ServerApi('1'),
                        serverSelectionTimeoutMS=5000)
        c.admin.command('ismaster')
        time.sleep(2)
        c.close()
        return True
    except Exception:
        print('\nCould not connect to MongoDB.\n\n')
        return False
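# Usage sketch: gate startup on connectivity, reusing the helper above.
# Assumes MONGO_URI is configured elsewhere in the same module.
import sys

if not checkmongodbconnection():
    sys.exit(1)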
def process_request(self, request: scrapy.http.Request,
                    spider: scrapy.spiders.Spider) -> None:
    client = MongoClient(spider.settings.get('MONGODB_URI'))
    db = client[spider.settings.get('MONGODB_DATABASE')]
    collection = db[spider.settings.get('MONGODB_COLLECTION')]
    res = collection.find_one({'url': request.url})
    client.close()
    if res is not None:
        logger.info("MongoDBDupeFilter filtered request to: %(request)s",
                    {'request': request.url},
                    extra={'spider': spider})
        raise IgnoreRequest()
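# Usage sketch: the dupe-filter middleware above is driven entirely by
# settings. The class and module names below are assumptions (only the
# process_request method is shown in the original), as are the values:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MongoDBDupeFilterMiddleware': 543,
}
MONGODB_URI = 'mongodb://localhost:27017'
MONGODB_DATABASE = 'crawler'
MONGODB_COLLECTION = 'seen_urls'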
def test_disconnect(self):
    c = MongoClient(host, port)
    coll = c.pymongo_test.bar

    c.close()
    c.close()

    coll.count()

    c.close()
    c.close()

    coll.count()
async def disconnect_mongo(client: MongoClient):
    """Close the MongoDB connection.

    Arguments:
        client (MongoClient): MongoDB client
    """
    try:
        client.close()
    except errors.PyMongoError:
        # Re-raise the original error; the old code raised the bare
        # exception class, discarding the failure details.
        raise
def give_item_to_player(self, item, user_id):
    mongo = MongoClient(MONGO_CONNECTION_STRING)
    mongo.beanbot.users.update_one(
        {'user_id': user_id},
        {"$push": {
            "inventory": {
                "id": item['item_id'],
                "name": item['name'],
                "effect": item['effect'],
                "description": item['description']
            }
        }},
        upsert=True)
    mongo.close()
    print("Gave {} to {}".format(item['item_id'], user_id))
class Day10CrawlspiderDoubanPipeline(object):
    def open_spider(self, spider):
        """Connect to the MongoDB database."""
        if spider.name == DoubanSpider.name:
            self.client = MongoClient()
            self.collection = self.client['douban']['top250']

    def process_item(self, item, spider):
        """Save the data."""
        if spider.name == DoubanSpider.name:
            self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the database connection."""
        if spider.name == DoubanSpider.name:
            self.client.close()
class MongoDBPipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient("localhost", 27017)

    def process_item(self, item, spider):
        title = item['title']
        content = item['content']
        url = item['url']
        dict1 = {
            "url": url,
            "title": title,
            "content": content
        }
        # insert_one replaces the long-deprecated Collection.save().
        self.client['wangyi']['news'].insert_one(dict1)
        return item

    def close_spider(self, spider):
        self.client.close()

# import pymysql
# class mysqlPileLine(object):
#     def __init__(self):
#         print("Initializing mysqlPileLine pipeline object...")
#         self.cursor = None
#         self.db = None
#
#     def open_spider(self, spider):
#         self.db = pymysql.Connect(host='127.0.0.1', port=3306,
#                                   user='******', password='******',
#                                   db='qiubai', charset='utf8')
#
#     def process_item(self, item, spider):
#         self.cursor = self.db.cursor()
#         try:
#             self.cursor.execute(
#                 'insert into tb_qiubai(title,content) values("%s","%s")'
#                 % (item["title"], item["content"]))
#             self.db.commit()
#         except Exception as e:
#             print(e)
#             self.db.rollback()
#         return item
#
#     def close_spider(self, spider):
#         print("Destroying mysqlPileLine pipeline object...")
#         self.cursor.close()
#         self.db.close()
def save(metas, batch_num=100):
    """Read the configuration and sync Mongo data into MySQL."""
    mongo = MongoClient(**config.mongo)
    db = MySQLdb.connect(**config.mysql)
    cursor = db.cursor()
    print
    print "=" * 40
    print ' ' * 15, u'Mongo --> MySQL'
    print "=" * 40
    print
    try:
        mongo_db = mongo.get_database('theforce')
        for meta in metas:
            for model_name, item_model in meta.iter_model():
                collection_name = item_model['name']
                table_name = item_model['table']
                attrs = meta.get_model_persist_attr_names(item_model)
                collection = mongo_db.get_collection(collection_name)
                results = [obj for obj in collection.find({})]
                sql = "insert into {0}({1}) values({2})".format(
                    table_name,
                    ','.join(attrs),
                    ','.join(itertools.repeat('%s', len(attrs))))
                print
                print '-' * 40
                print u'Processing {0}@mongo --> {1}@mysql, {2} rows total, migrating in batches of {3}:'.format(
                    collection_name, table_name, len(results), batch_num)
                # Group the rows for batched inserts.
                results2 = itertools.izip(itertools.count(), results)
                for group_key, group_it in itertools.groupby(
                        results2, lambda item: item[0] / batch_num):
                    print '.',
                    values = [[obj[attr] for attr in attrs]
                              for index, obj in group_it]
                    cursor.executemany(sql, values)
                print u'[done]'
    finally:
        mongo.close()
        cursor.close()
        db.close()
def test_kill_cursors_warning(self):
    # If kill_cursors is called while the client is disconnected, it
    # can't risk taking the lock to reconnect, in case it's being called
    # from Cursor.__del__, see PYTHON-799. Test that it shows a warning
    # in this case.
    client = MongoClient(host, port)
    collection = client.pymongo_test.test
    collection.insert({} for _ in range(4))
    cursor = collection.find().batch_size(1)
    cursor.next()
    client.close()

    # catch_warnings() is a context manager; the original called a
    # nonexistent ctx.exit(), so use a with-block instead.
    with catch_warnings():
        warnings.simplefilter("error", UserWarning)
        self.assertRaises(UserWarning, cursor.close)

    # Reconnect.
    collection.find_one()
    cursor.close()
def pat_beano(self, user_id):
    print("Patting Beano")
    current_timestamp = datetime.now()
    response = ("idle", "There was some issue. Beano is confused.")
    mongo = MongoClient(MONGO_CONNECTION_STRING)
    user_stats = mongo.beanbot.users.find_one({'user_id': user_id})
    if user_stats is None:
        print("{} not in mongo. Adding to mongo.".format(user_id))
        self.create_new_user(user_id)
        user_stats = mongo.beanbot.users.find_one({'user_id': user_id})
    print("User Stats : ", user_stats)
    last_pat_ts = user_stats['last_pat_timestamp']
    pat_cd = (current_timestamp - last_pat_ts).total_seconds() / 60
    print(pat_cd, " VS ", self.pat_cooldown)
    if pat_cd < self.pat_cooldown:
        response = ("idle",
                    "You cannot pat Beano for another {} minutes.".format(
                        self.pat_cooldown - pat_cd))
    else:
        mongo.beanbot.users.update_one(
            {'user_id': user_id},
            {"$set": {"last_pat_timestamp": current_timestamp}},
            upsert=True)
        response = ("happy", "You pat Beano!")
    mongo.close()
    return response
            '$set': {
                'attachUrl_uu': None,
                'attach_download_user': None,
                'attachTask': Constant.TODO
            }
        })
    except Exception as e:
        print(attach['attachUrl_uu'], e)
        continue

# error
attachs = db.component_original.find(
    {"attachTask": Constant.ERROR},
    {"_id": True, "attachUrl_uu": True})
for attach in attachs:
    try:
        requests.get(fs_api_delete % attach['attachUrl_uu'])
        db.component_original.update_one(
            {'_id': attach["_id"]},
            {'$set': {
                'attachUrl_uu': None,
                'attach_download_user': None
            }})
    except Exception as e:
        print(attach['attachUrl_uu'], e)
        continue

cli.close()
class ZhihuPipeline(object):
    config = {
        'uri': 'mongodb://localhost:27017',
        'fsync': False,
        'write_concern': 1,
        'database': 'zhihu_userdb',
        'collection': 'zhihu_userdb',
        'separate_collections': False,
        'replica_set': None,
        'unique_key': None,
        'buffer': None,
        'append_timestamp': False,
        'stop_on_duplicate': 0,
    }
    current_item = 0
    item_buffer = []
    duplicate_key_count = 0

    def __init__(self, mongo_uri, mongo_db, mongo_replSet_name,
                 mongo_replSet_uri):
        self.config["uri"] = mongo_uri
        self.config["database"] = mongo_db
        if mongo_replSet_name:
            self.config["replica_set"] = mongo_replSet_name
        if mongo_replSet_uri:
            self.config["uri"] = mongo_replSet_uri
        self.logger = logging.getLogger("scrapy-Zhihu-logger")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            crawler.settings.get("MONGO_URI",
                                 "mongodb://localhost:27017/zhihu_userdb"),
            crawler.settings.get("MONGO_DATABASE", "zhihu_userdb"),
            crawler.settings.get("MONGO_REPLSET_NAME", "None"),
            crawler.settings.get("MONGO_REPLSET_URI", "None")
        )

    def open_spider(self, spider):
        self.crawler = spider.crawler
        self.settings = spider.settings

        # The original misspelled "conncetion" here, which broke the
        # database lookup below; use self.connection consistently. The
        # keyword is read_preference (singular) in both clients.
        if self.config["replica_set"] is not None:
            self.connection = MongoReplicaSetClient(
                self.config["uri"],
                replicaSet=self.config["replica_set"],
                w=self.config["write_concern"],
                fsync=self.config["fsync"],
                read_preference=ReadPreference.PRIMARY_PREFERRED)
        else:
            self.connection = MongoClient(
                self.config["uri"],
                fsync=self.config["fsync"],
                read_preference=ReadPreference.PRIMARY)

        self.database = self.connection[self.config["database"]]
        self.collections = {
            'default': self.database[self.config['collection']]}
        self.logger.info('Connected dbpath: {0}, database: {1}'.format(
            self.config["uri"], self.config["database"]))

        if self.config['stop_on_duplicate']:
            tmpValue = self.config['stop_on_duplicate']
            if tmpValue < 0:
                msg = (
                    u'Negative values are not allowed for'
                    u' MONGODB_STOP_ON_DUPLICATE option.'
                )
                self.logger.error(msg)
                raise SyntaxError(msg)
            self.stop_on_duplicate = self.config['stop_on_duplicate']
        else:
            self.stop_on_duplicate = 0

    def close_spider(self, spider):
        if self.item_buffer:
            self.insert_item(self.item_buffer, spider)
        self.connection.close()

    def process_item(self, item, spider):
        item = dict((k, v) for k, v in item.iteritems()
                    if v is not None and v != "")
        if self.config['buffer']:
            self.current_item += 1
            if self.config['append_timestamp']:
                item['scrapy-mongodb'] = {'ts': datetime.datetime.utcnow()}
            self.item_buffer.append(item)
            if self.current_item == self.config['buffer']:
                self.current_item = 0
                try:
                    return self.insert_item(self.item_buffer, spider)
                finally:
                    self.item_buffer = []
            return item
        return self.insert_item(item, spider)

    def insert_item(self, item, spider):
        if not isinstance(item, list):
            item = dict(item)
            if self.config['append_timestamp']:
                item['scrapy-mongodb'] = {'ts': datetime.datetime.utcnow()}
        collection_name, collection = self.get_collection(spider.name)
        if self.config['unique_key'] is None:
            try:
                collection.insert(item, continue_on_error=True)
                self.logger.debug(u'Stored item(s) in MongoDB {0}/{1}'.format(
                    self.config['database'], collection_name))
            except errors.DuplicateKeyError:
                self.logger.debug(u'Duplicate key found')
                if self.stop_on_duplicate > 0:
                    self.duplicate_key_count += 1
                    if self.duplicate_key_count >= self.stop_on_duplicate:
                        self.crawler.engine.close_spider(
                            spider,
                            'Number of duplicate key insertion exceeded')
        else:
            key = {}
            if isinstance(self.config['unique_key'], list):
                for k in dict(self.config['unique_key']).keys():
                    key[k] = item[k]
            else:
                key[self.config['unique_key']] = \
                    item[self.config['unique_key']]
            collection.update(key, item, upsert=True)
            self.logger.debug(u'Stored item(s) in MongoDB {0}/{1}'.format(
                self.config['database'], collection_name))
        return item

    def get_collection(self, name):
        if self.config['separate_collections']:
            collection = self.collections.get(name)
            collection_name = name
            if not collection:
                collection = self.database[name]
                self.collections[name] = collection
        else:
            collection = self.collections.get('default')
            collection_name = self.config['collection']
        if self.config['unique_key']:
            collection.ensure_index(self.config['unique_key'], unique=True)
            self.logger.info(u'Ensuring index for key {0}'.format(
                self.config['unique_key']))
        return (collection_name, collection)
class Mongo(DatabaseManager):
    def __init__(self, options={}):
        self.config = {
            'HOST': 'localhost',
            'PORT': 27017,
            'COLLECTION': 'default'
        }
        self.config.update(options)
        print self.config
        self.connect()

    def connect(self):
        self.client = MongoClient(
            self.config['HOST'],
            self.config['PORT']
        )
        # Note: despite the key name, 'COLLECTION' selects the database here.
        self.db = self.client[self.config['COLLECTION']]

    def iter(self):
        return self.db.collection.find()

    def close(self):
        if self.client:
            self.client.close()

    def count(self):
        return self.db.collection.count()

    def put(self, data):
        if isinstance(data, types.ListType):
            # For bulk insert, inject timestamp
            for x in data:
                x[TIMESTAMP_CREATED] = datetime.datetime.utcnow()
        else:
            # For single insert, inject timestamp
            data[TIMESTAMP_CREATED] = datetime.datetime.utcnow()
        try:
            self.db.collection.insert(data)
            return True
        except Exception as e:
            print e
            return False

    def update(self):
        pass

    def get(self, query=False):
        if not query:
            # Get all documents in collection
            return list(self.db.collection.find())

    def delete(self):
        pass
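# Usage sketch for the Mongo wrapper above (Python 2, matching its dialect).
# The option values are illustrative; note that 'COLLECTION' is actually
# used as the *database* name, as flagged in connect().
db = Mongo({'HOST': 'localhost', 'PORT': 27017, 'COLLECTION': 'metrics'})
db.put({'value': 42})
print db.count()
db.close()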
def update_preseries(target_db_uri, target_collection, source_db_uri,
                     source_collection, interval, is_prediction,
                     computed_aggregations, resource_type):
    """
    Update aggregated collection in the target system

    :param target_db_uri: Target DB URI, i.e. mongodb://localhost/databasename
    :param target_collection: Output collection name
    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param is_prediction: Boolean with True or False
    :param computed_aggregations: Document with results
    """
    tgt_client = MongoClient(target_db_uri)
    tgt_db = tgt_client.get_default_database()
    tgt_col = tgt_db[target_collection]

    src_client = MongoClient(source_db_uri)
    src_db = src_client.get_default_database()
    src_col = src_db[source_collection]

    project_clause = {
        "_id": 0,
        "company_name": "$company_name",
        "company_foundation_date": "$foundation_date"
    }
    if resource_type == 'person':
        project_clause = {
            "_id": 0,
            "first_name": "$first_name",
            "last_name": "$last_name",
            "gender": "$gender"
        }
    if resource_type == 'investor':
        project_clause = {
            "_id": 0,
            "investor_name": "$investor_name",
            "investor_foundation_date": "$foundation_date"
        }

    bulk_counter = 0
    full_counter = 0
    block_size = 10000
    aggregation_num = len(computed_aggregations)
    tries_num = 3
    bulk = tgt_col.initialize_ordered_bulk_op()
    while full_counter < aggregation_num:
        comp_aggregation = computed_aggregations[full_counter]
        fields_to_insert, fields_to_update = \
            prepare_fields(src_col, project_clause, resource_type,
                           comp_aggregation, interval, is_prediction)
        find_pipeline = get_find_pipeline(resource_type, comp_aggregation,
                                          interval, fields_to_insert,
                                          is_prediction)
        bulk.find(find_pipeline).upsert().update({
            "$setOnInsert": fields_to_insert,
            "$set": fields_to_update
        })
        bulk_counter += 1
        full_counter += 1

        # Manage a page of block_size records
        if bulk_counter == block_size:
            try:
                bulk.execute()
                tries_num = 3
                bulk = tgt_col.initialize_ordered_bulk_op()
                bulk_counter = 0
                print "%d records processed" % full_counter
            except BulkWriteError as ex:
                # give a second chance to the execute
                if tries_num == 0:
                    print "bulk.execute() failed 3 times..."
                    print "ERROR processing Task. Exception: [%s]" % ex
                    traceback.print_exc()
                    raise ex
                sleep(0.5)
                bulk = tgt_col.initialize_ordered_bulk_op()
                bulk_counter = 0
                full_counter -= block_size
                tries_num -= 1
            except Exception as ex2:
                print "ERROR processing Task. Exception: [%s]" % ex2
                traceback.print_exc()
                raise ex2

    # Manage rest of records from the latest complete page to the end
    if bulk_counter > 0:
        try:
            bulk.execute()
            print "%d records processed. Finished" % full_counter
        except BulkWriteError as ex:
            # give a second chance to the execute
            sleep(1)
            bulk = tgt_col.initialize_ordered_bulk_op()
            full_counter = aggregation_num - bulk_counter
            for comp_aggr_inx in range(full_counter, aggregation_num):
                comp_aggregation = computed_aggregations[comp_aggr_inx]
                if len(comp_aggregation['result']) == 0:
                    continue
                fields_to_insert, fields_to_update = \
                    prepare_fields(src_col, project_clause, resource_type,
                                   comp_aggregation, interval, is_prediction)
                find_pipeline = get_find_pipeline(resource_type,
                                                  comp_aggregation, interval,
                                                  fields_to_insert,
                                                  is_prediction)
                bulk.find(find_pipeline).upsert().update({
                    "$setOnInsert": fields_to_insert,
                    "$set": fields_to_update
                })
                full_counter += 1
            bulk.execute()
            print "%d records processed. Finished" % full_counter
        except Exception as ex:
            print "ERROR processing Task. Exception: [%s]" % ex
            traceback.print_exc()
            raise ex

    tgt_client.close()
    src_client.close()
class Connection:
    _graph_map: dict[str, Connection] = {}
    _initialized_map: dict[str, bool] = {}

    def __new__(cls: type[Connection], graph_name: str) -> Connection:
        if not cls._graph_map.get(graph_name):
            cls._graph_map[graph_name] = super(Connection, cls).__new__(cls)
        return cls._graph_map[graph_name]

    def __init__(self: Connection, graph_name: str) -> None:
        if self.__class__._initialized_map.get(graph_name):
            return
        self._graph_name: str = graph_name
        self._url: Optional[str] = None
        self._client: Optional[MongoClient] = None
        self._database: Optional[Database] = None
        self._collections: dict[str, Collection] = {}
        self._connection_callbacks: dict[str, ConnectedCallback] = {}
        self._connected: bool = False
        self.__class__._initialized_map[graph_name] = True
        return None

    @property
    def graph_name(self: Connection) -> str:
        return self._graph_name

    @property
    def url(self: Connection) -> str:
        if self._url:
            return self._url
        return self._generate_default_url()

    def set_url(self: Connection, url: str) -> None:
        self._url = url

    def _generate_default_url(self: Connection) -> str:
        if self.graph_name == 'default':
            user_url = uconf()['pymongo.url'] or uconf()['pymongo.default.url']
        else:
            user_url = uconf()[f'pymongo.{self.graph_name}.url']
        if user_url is not None:
            self._url = user_url
            return user_url
        base = 'mongodb://localhost:27017/'
        proj = camelize(parameterize(path.basename(getcwd()))).lower()
        self._url = base + proj
        return self._url

    @property
    def client(self: Connection) -> MongoClient:
        if self._client is not None:
            return self._client
        self.connect()
        return self._client

    @property
    def database(self: Connection) -> Database:
        if self._database is not None:
            return self._database
        self.connect()
        return self._database

    def connect(self: Connection) -> None:
        self._client = MongoClient(self.url)
        self._database = self._client.get_database()
        self._connected = True
        for name, callback in self._connection_callbacks.items():
            self._call_callback(name, callback)

    def disconnect(self: Connection) -> None:
        if self._client is not None:
            self._client.close()
            self._client = None
            self._database = None
            self._collections = {}
            self._connected = False

    @property
    def connected(self: Connection) -> bool:
        # The original annotated self as Collection here by mistake.
        return self._connected

    def collection(self: Connection, name: str,
                   index_keys: list[str] | None = None) -> Collection:
        if self._collections.get(name) is not None:
            return self._collections[name]
        coll = self.database.get_collection(name)
        if index_keys is not None:
            ukeys = [(k, 1) for k in index_keys]
            coll.create_index(ukeys, name='ref', unique=True)
        self._collections[name] = coll
        return coll

    def add_connected_callback(self: Connection, name: str,
                               callback: ConnectedCallback) -> None:
        self._connection_callbacks[name] = callback
        if self._client:
            self._call_callback(name, callback)

    def _call_callback(self: Connection, name: str,
                       callback: ConnectedCallback) -> None:
        callback(self.collection(name))

    def collection_from(self: Connection, cls: type[T]) -> Collection:
        coll_name = cls.pconf.collection_name
        return self.collection(coll_name)

    default: ClassVar[Connection]

    @classmethod
    def get_collection(cls: type[Connection], pmcls: type[T]) -> Collection:
        graph = pmcls.cdef.jconf.cgraph.name
        connection = Connection(graph)
        return connection.collection_from(pmcls)

    @classmethod
    def from_class(cls: type[Connection], pmcls: type[T]) -> Connection:
        return Connection(pmcls.cdef.jconf.cgraph.name)
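# Usage sketch for the Connection registry above. Connections are singletons
# per graph name, so repeated lookups return the same object; the URL and
# collection name below are illustrative assumptions.
conn = Connection('default')
conn.set_url('mongodb://localhost:27017/myapp')
users = conn.collection('users', index_keys=['email'])
assert Connection('default') is conn  # same instance per graph name
conn.disconnect()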
def get_changed_intervals(source_db_uri, source_collection, last_execution,
                          curr_date, process_field, interval, resource_type):
    """
    Generation of the pipeline aggregation string, execution, and returns
    the periods with changes

    Ex.
    [{"$match": {"metric_name": {"$in": ["twitter_followers",
                                         "twitter_following"]},
                 "updated": {"$gte": last_execution, "$lt": curr_date}}},
     {"$project": {"company_id": "$_id", "date": "$date",
                   "year": {"$year": "$date"},
                   "interval": {"$month": "$date"}}},
     {"$group": {"_id": {"company_id": "$company_id", "year": "$year",
                         "interval": "$interval"}}},
     {"$project": {"_id": 0, "company_id": "$_id.company_id",
                   "year": "$_id.year", "interval": "$_id.interval"}}]

    Returns a list of changed intervals with this structure:
    [{"company_id": ObjectID("2341231231"), "year": 2013, "interval": 7},
     {"company_id": ObjectID("2341231231"), "year": 2013, "interval": 8},
     {"company_id": ObjectID("2341231444"), "year": 2015, "interval": 9},
     {"company_id": ObjectID("2341231444"), "year": 2015, "interval": 11}
     ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param last_execution: Lower date
    :param curr_date: Upper date
    :param process_field: metric to process: "twitter_followers"
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param resource_type: Resource Type (company, person, ...)
    """
    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    pipeline_periods = [{
        "$unwind": "$%s_ts" % process_field
    }, {
        "$project": {
            "%s_id" % resource_type: "$%s_id" % resource_type,
            "value": "$%s_ts.value" % process_field,
            "date": "$%s_ts.date" % process_field,
            "updated": "$%s_ts.updated" % process_field,
            "year": {"$year": "$%s_ts.date" % process_field},
            "interval": set_field_interval("%s_ts.date" % process_field,
                                           interval)
        }
    }, {
        "$match": {"updated": {"$lt": curr_date}}
    }, {
        "$group": {
            "_id": {
                "%s_id" % resource_type: "$%s_id" % resource_type,
                "year": "$year",
                "interval": "$interval"
            }
        },
    }, {
        "$project": {
            "_id": 0,
            "%s_id" % resource_type: "$_id.%s_id" % resource_type,
            "year": "$_id.year",
            "interval": "$_id.interval"
        }
    }]

    # If last_execution is informed, add the $gte clause to the "$match" elem.
    if last_execution is not None:
        pipeline_periods[2]['$match']["updated"]['$gte'] = last_execution

    changed_intervals = []
    for changed_interval in collection.aggregate(pipeline_periods,
                                                 allowDiskUse=True):
        changed_intervals.append(changed_interval)

    client.close()
    return changed_intervals
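# Usage sketch for get_changed_intervals() above; every argument value here
# is an illustrative assumption, not taken from the original code.
from datetime import datetime, timedelta

intervals = get_changed_intervals(
    source_db_uri='mongodb://localhost/databasename',
    source_collection='company_metrics_ts',
    last_execution=datetime.utcnow() - timedelta(days=1),
    curr_date=datetime.utcnow(),
    process_field='twitter_followers',
    interval='month',
    resource_type='company')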
class DBServer():
    def __init__(self, collection_name, table):
        self.collection_name = collection_name
        self.table = table
        cfg = DBUtils.read_config()
        url = "mongodb://{0}:{1}/".format(cfg.host, cfg.port)
        logger.debug("The db url", url)
        self._client = MongoClient(url)
        # can not connect to any db except admin server
        username, password = (getattr(cfg, 'username', ''),
                              getattr(cfg, 'password', ''))
        if not (len(username) <= 0 or len(password) <= 0):
            self._client.admin.authenticate(cfg.username, cfg.password)
        db = self._client[collection_name]
        self.table = db[table]

    @log
    def query(self, pattern=None, **condition):
        """
        :param pattern: dict pattern suitable
        :param condition: condition to pagination
        :return: suitable information
        """
        if pattern is None:
            # if pattern is None, the conditions must be empty
            values = self.table.find().limit(20)
            return convert_list(values)
        if condition is None or len(condition) <= 0:
            # if condition is empty
            values = self.table.find(pattern)
            return convert_list(values)
        dcu = DefaultConditionUtil(**condition)
        if dcu.get_complex_expression() is not None:
            # dict.update() returns None; the original reassigned pattern
            # to the result, which discarded the query.
            pattern.update(dcu.get_complex_expression())
        ret = self.table.find(pattern).skip(dcu.get_offset()).limit(
            dcu.get_limit()).sort(dcu.sort_style())
        return convert_list(ret)

    @log
    def find_one(self, **condition):
        one = self.table.find_one(condition)
        return convert_dict(one)

    @log
    def find_all(self):
        """
        :return: all information
        """
        values = self.table.find()
        return convert_list(values)

    @log
    def insert(self, obj):
        self.table.insert(obj)

    @log
    def delete(self, pattern=None, **condition):
        """
        :param pattern: pattern data
        :param condition: suitable data
        :return: nothing
        """
        if pattern is None:
            return
        self.table.remove(pattern)

    @log
    def update(self, pattern=None, **new):
        """
        :param pattern: condition
        :param new: new data
        """
        self.table.update(pattern, new)

    @log
    def query_by_complex_condition(self, code):
        """
        :param code: query information via complex condition using
            JavaScript code
        """
        values = self.table.find().where(code)
        return convert_list(values)

    @log
    def count(self, id=None, **conditions):
        # conditions is a dict built from **kwargs, so test for emptiness
        # rather than None (the original None checks never fired).
        if id is None and not conditions:
            return {'count': self.table.find().count()}
        if not conditions:
            return {'count': self.table.find(dict(id=id)).count()}
        if id is None:
            return {'count': self.table.find(conditions).count()}
        conditions['id'] = id
        # The original omitted .count() on this branch.
        return {'count': self.table.find(conditions).count()}

    @log
    def close(self):
        self._client.close()

    def __del__(self):
        self._client.close()
def compute_ts_aggregations_1toN(source_db_uri, source_collection,
                                 changed_intervals, process_field,
                                 date_field, interval, operators_list,
                                 field_names_list, resource_type,
                                 withinTheInterval=True):
    """
    Generation of the pipeline aggregation string, execution (1 aggregated
    result), and returns the changed_intervals dictionary upgraded with the
    result document

    Ex.
    [{"$match": {"company_id": "2341231231",
                 "date": {"$gte": lower_date, "$lt": upper_date}}},
     {"$project": {"date": "$date", "value": "$value"}},
     {"$sort": {"date": 1}},
     {"$group": {"_id": {"company_id": "$_id"},
                 "aggr_field_name_1": {"$last": "$value"},
                 "aggr_field_name_2": {"$sum": "$value"},
                 "aggr_field_name_3": {"$avg": "$value"}}},
     {"$project": {"_id": 0,
                   "company_id": "$_id.company_id",
                   "aggr_field_name_1": "aggr_field_name_1",
                   "aggr_field_name_2": "aggr_field_name_2",
                   "aggr_field_name_3": "aggr_field_name_3"}}]

    Returns a result document attached to the changed_intervals document:
    [{"company_id": "2341231231", "year": 2013, "interval": 7,
      "result": {"twitter_followers_last": 456,
                 "twitter_followers_first": 123,
                 "twitter_followers_count": 34 ...}},
     {"company_id": "2341231231", "year": 2013, "interval": 8,
      "result": {"twitter_followers_last": 135}},
     {"company_id": "2341231444", "year": 2015, "interval": 9,
      "result": {"twitter_followers_last": 1023}},
     {"company_id": "2341231444", "year": 2015, "interval": 11,
      "result": {"twitter_followers_last": 1050}},
     ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param process_field: field to process "twitter_followers",
        "twitter_bio", ...
    :param changed_intervals: List of documents with the changed intervals
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param operators_list: aggregator operator list i.e. "last sum avg"
    :param field_names_list: aggregator field name list i.e.
        "twitter_followers_last twitter_followers_sum twitter_followers_avg"
    :param resource_type: Resource Type (company, person, ...)
    :param withinTheInterval: within Interval
    """
    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    company_processed = 0
    for changed_interval in changed_intervals:
        lower_date, upper_date = \
            interval_date_range(changed_interval, interval)

        # Base pipeline aggregation list
        pipeline_aggregated = [{
            "$match": {
                "%s_id" % resource_type:
                    changed_interval['%s_id' % resource_type],
                date_field: {"$lt": upper_date}
            }
        }, {
            "$project": {
                "id": "$%s_id" % resource_type,
                "date": "$%s" % date_field,
                "value": "$%s" % process_field
            }
        }, {
            "$sort": {"date": 1}
        }, {
            "$group": {"_id": {"id": "$id"}}
        }, {
            "$project": {"_id": 0, 'id': '$_id.id'}
        }]

        if withinTheInterval:
            pipeline_aggregated[0]['$match'][date_field]['$gte'] = lower_date

        # Upgrade pipeline aggregation list with aggregation operators
        for i in range(len(operators_list)):
            pipeline_aggregated[3]['$group'][field_names_list[i]] = \
                {"$" + operators_list[i]: "$value"} \
                if (operators_list[i] != 'count') else {"$sum": 1}
            pipeline_aggregated[4]['$project'][field_names_list[i]] = \
                "$" + field_names_list[i]

        # Upgrade changed_interval document with result
        for aggregated_value in collection.aggregate(pipeline_aggregated,
                                                     allowDiskUse=True):
            changed_interval['result'] = {}
            for i in range(len(operators_list)):
                changed_interval['result'][field_names_list[i]] = \
                    aggregated_value[field_names_list[i]]

        company_processed += 1
        if company_processed % 100 == 0:
            print "Processed %d/%d companies" % (company_processed,
                                                 len(changed_intervals))

    client.close()
    return changed_intervals
def compute_aggregations(source_db_uri, source_collection, changed_intervals,
                         process_field, interval, operators_list,
                         field_names_list, resource_type):
    """
    Generation of the pipeline aggregation string, execution (1 aggregated
    result), and returns the changed_intervals dictionary upgraded with the
    result document

    Ex.
    [{"$match": {"metric_name": "twitter_followers",
                 "company_id": "2341231231",
                 "date": {"$gte": lower_date, "$lt": upper_date}}},
     {"$project": {"date": "$date", "value": "$value"}},
     {"$sort": {"date": 1}},
     {"$group": {"_id": {"company_id": "$_id"},
                 "aggr_field_name_1": {"$last": "$value"},
                 "aggr_field_name_2": {"$sum": "$value"},
                 "aggr_field_name_3": {"$avg": "$value"}}},
     {"$project": {"_id": 0,
                   "company_id": "$_id.company_id",
                   "aggr_field_name_1": "aggr_field_name_1",
                   "aggr_field_name_2": "aggr_field_name_2",
                   "aggr_field_name_3": "aggr_field_name_3"}}]

    Returns a result document attached to the changed_intervals document:
    [{"company_id": "2341231231", "year": 2013, "interval": 7,
      "result": {"twitter_followers_last": 456,
                 "twitter_followers_first": 123,
                 "twitter_followers_count": 34 ...}},
     {"company_id": "2341231231", "year": 2013, "interval": 8,
      "result": {"twitter_followers_last": 135}},
     {"company_id": "2341231444", "year": 2015, "interval": 9,
      "result": {"twitter_followers_last": 1023}},
     {"company_id": "2341231444", "year": 2015, "interval": 11,
      "result": {"twitter_followers_last": 1050}},
     ...]

    :param source_db_uri: Source DB URI, i.e. mongodb://localhost/databasename
    :param source_collection: Input collection name
    :param process_field: field to process "twitter_followers",
        "twitter_bio", ...
    :param changed_intervals: List of documents with the changed intervals
    :param interval: period type --> year, quarter, month, dayOfYear, week
    :param operators_list: aggregator operator list i.e. "last sum avg"
    :param field_names_list: aggregator field name list i.e.
        "twitter_followers_last twitter_followers_sum twitter_followers_avg"
    """
    client = MongoClient(source_db_uri)
    database = client.get_default_database()
    collection = database[source_collection]

    print "Processing %d changed intervals...." % len(changed_intervals)
    NUM_OF_INTERVALS_TO_INFORM = 1000
    for idx, changed_interval in enumerate(changed_intervals):
        lower_date, upper_date = \
            interval_date_range(changed_interval, interval)

        # Base pipeline aggregation list
        pipeline_aggregated = [{
            "$match": {
                "%s_id" % resource_type:
                    changed_interval['%s_id' % resource_type]
            }
        }, {
            "$unwind": "$%s_ts" % process_field
        }, {
            "$project": {
                "%s_id" % resource_type: "$%s_id" % resource_type,
                "value": "$%s_ts.value" % process_field,
                "date": "$%s_ts.date" % process_field,
                "updated": "$%s_ts.updated" % process_field,
                "year": {"$year": "$%s_ts.date" % process_field},
                "interval": set_field_interval("%s_ts.date" % process_field,
                                               interval)
            }
        }, {
            "$match": {"date": {"$gte": lower_date, "$lt": upper_date}}
        }, {
            "$project": {
                "date": "$date",
                "value": "$value",
                "%s_id" % resource_type: "$%s_id" % resource_type
            }
        }, {
            "$sort": {"date": 1}
        }, {
            "$group": {
                # The original grouped on "%s_id" without the leading "$",
                # which groups on a constant string instead of the field.
                "_id": {"%s_id" % resource_type: "$%s_id" % resource_type}
            }
        }, {
            "$project": {
                "_id": 0,
                "%s_id" % resource_type: "$_id.%s_id" % resource_type
            }
        }]

        # Upgrade pipeline aggregation list with aggregation operators
        for i in range(len(operators_list)):
            pipeline_aggregated[6]['$group'][field_names_list[i]] = \
                {"$" + operators_list[i]: "$value"} \
                if (operators_list[i] != 'count') else {"$sum": 1}
            pipeline_aggregated[7]['$project'][field_names_list[i]] = \
                "$" + field_names_list[i]

        # Upgrade changed_interval document with result
        for aggregated_value in collection.aggregate(pipeline_aggregated,
                                                     allowDiskUse=True):
            changed_interval['result'] = {}
            for i in range(len(operators_list)):
                changed_interval['result'][field_names_list[i]] = \
                    aggregated_value[field_names_list[i]]

        if idx != 0 and idx % NUM_OF_INTERVALS_TO_INFORM == 0:
            print "%d intervals processed" % \
                ((idx / NUM_OF_INTERVALS_TO_INFORM) *
                 NUM_OF_INTERVALS_TO_INFORM)

    client.close()
    return changed_intervals
def doEverything():
    # Get database connectivity information
    database, url = getDatabaseInfo()
    # Run test. Bind client and output before the try so the finally block
    # can reference them even if MongoClient() itself fails.
    client = None
    output = []
    try:
        client = MongoClient(url)
        db = client[database]
        output.append("Starting database test.... ")
        collectionName = "pythonMongo"
        output.append("Creating collection " + collectionName)
        collection = db[collectionName]

        # insert 1
        output.append("# 1 Inserts")
        output.append("# 1.1 Insert a single document to a collection")
        collection.insert({"name": "test1", "value": 1})
        output.append("Inserted {\"name\": \"test1\", \"value\": 1}")

        # insert many
        output.append("#1.2 Inserting multiple entries into collection")
        multiPost = [{"name": "test1", "value": 1},
                     {"name": "test2", "value": 2},
                     {"name": "test3", "value": 3}]
        collection.insert(multiPost)
        output.append("Inserted \n {\"name\": \"test1\", \"value\": 1} \n {\"name\": \"test2\", \"value\": 2} \n {\"name\": \"test3\", \"value\": 3}")

        # Find
        output.append("#2 Queries")
        output.append("#2.1 Find one that matches a query condition")
        output.append(collection.find_one({"name": "test1"}))

        # Find all
        output.append("#2.2 Find all that match a query condition")
        for doc in collection.find({"name": "test1"}):
            output.append(doc)

        # Display all documents
        output.append("#2.3 Find all documents in collection")
        for doc in collection.find():
            output.append(doc)

        # update document
        output.append("#3 Updating Documents")
        collection.update({"name": "test3"}, {"$set": {"value": 4}})
        output.append("Updated test3 with value 4")

        # delete document
        output.append("#4 Delete Documents")
        collection.remove({"name": "test2"})
        output.append("Deleted all with name test2")

        # Display all collection names
        output.append("#5 Get a list of all of the collections")
        output.append(db.collection_names())

        output.append("#6 Drop a collection")
        db.drop_collection(collectionName)
    except Exception as e:
        logging.exception(e)
        output.append("EXCEPTION (see log for details): " + str(e))
    finally:
        if client is not None:
            client.close()
            output.append("Connection to database has been closed")
    return output
class GridFSOperations(Operations):
    def __init__(self, host, db_name='test', collection_name='fs'):
        self.client = MongoClient(host)
        self.db = Database(self.client, db_name)
        self.fs = GridFS(self.db, collection_name)

    def _new_file(self, name):
        return self.fs.new_file(
            filename=name,
            aliases=[],
            length=0,
            upload_date=datetime.now())

    @logmethod
    def init(self):
        pass

    @logmethod
    def access(self, inode, mode, ctx):
        return True

    @logmethod
    def getattr(self, inode):
        if inode == 1:
            return Operations.getattr(self, inode)
        else:
            return grid2attrs(self.fs.get(int2oid(inode)))

    @logmethod
    def lookup(self, parent_inode, name):
        if parent_inode != 1:
            raise FUSEError(errno.ENOENT)
        try:
            gridout = self.fs.get_last_version(filename=name.decode())
        except NoFile:
            raise FUSEError(errno.ENOENT)
        return grid2attrs(gridout)

    @logmethod
    def create(self, inode_parent, name, mode, flags, ctx):
        gridin = self._new_file(name.decode())
        fh = oid2int(gridin._id)
        grid_cache[fh] = gridin
        return (fh, grid2attrs(gridin))

    @logmethod
    def flush(self, fh):
        grid = grid_cache[fh]
        grid.close()

    @logmethod
    def setattr(self, inode, attr):
        gridout = self.fs.get(int2oid(inode))
        return grid2attrs(gridout)

    @logmethod
    def release(self, fh):
        del grid_cache[fh]

    @logmethod
    def forget(self, inode_list):
        for inode in inode_list:
            if inode in oid_cache.ints:
                del oid_cache.ints[inode]

    @logmethod
    def destroy(self):
        self.client.close()

    @logmethod
    def open(self, inode, flags):
        gridout = self.fs.get(int2oid(inode))
        grid_cache[inode] = gridout
        return inode

    @logmethod
    def read(self, fh, off, size):
        grid = grid_cache[fh]
        if isinstance(grid, GridIn):
            grid.close()
            grid = self.fs.get(int2oid(fh))
            grid_cache[fh] = grid
        grid.seek(off)
        return grid.read(size)

    @logmethod
    def write(self, fh, off, buf):
        grid = grid_cache[fh]
        if isinstance(grid, GridOut):
            offbuf = grid.read(off)
            grid = self._new_file(name=grid.name)
            grid_cache[fh] = grid
            grid.write(offbuf)
            del offbuf
        if grid.closed:
            grid = self._new_file(name=grid.name)
            grid_cache[fh] = grid
        grid.write(buf)
        return len(buf)

    @logmethod
    def unlink(self, parent_inode, name):
        if parent_inode != 1:
            Operations.unlink(self, parent_inode, name)
        else:
            for gridout in self.fs.find({'filename': name.decode()}):
                self.fs.delete(gridout._id)

    @logmethod
    def fsync(self, fh, datasync):
        Operations.fsync(self, fh, datasync)

    @logmethod
    def fsyncdir(self, fh, datasync):
        Operations.fsyncdir(self, fh, datasync)

    @logmethod
    def getxattr(self, inode, name):
        Operations.getxattr(self, inode, name)

    @logmethod
    def link(self, inode, new_parent_inode, new_name):
        Operations.link(self, inode, new_parent_inode, new_name)

    @logmethod
    def listxattr(self, inode):
        Operations.listxattr(self, inode)

    @logmethod
    def mkdir(self, parent_inode, name, mode, ctx):
        Operations.mkdir(self, parent_inode, name, mode, ctx)

    @logmethod
    def mknod(self, parent_inode, name, mode, rdev, ctx):
        Operations.mknod(self, parent_inode, name, mode, rdev, ctx)

    @logmethod
    def opendir(self, inode):
        Operations.opendir(self, inode)

    @logmethod
    def readdir(self, fh, off):
        Operations.readdir(self, fh, off)

    @logmethod
    def readlink(self, inode):
        Operations.readlink(self, inode)

    @logmethod
    def releasedir(self, fh):
        Operations.releasedir(self, fh)

    @logmethod
    def removexattr(self, inode, name):
        Operations.removexattr(self, inode, name)

    @logmethod
    def rename(self, inode_parent_old, name_old, inode_parent_new, name_new):
        Operations.rename(self, inode_parent_old, name_old,
                          inode_parent_new, name_new)

    @logmethod
    def rmdir(self, inode_parent, name):
        Operations.rmdir(self, inode_parent, name)

    @logmethod
    def setxattr(self, inode, name, value):
        Operations.setxattr(self, inode, name, value)

    @logmethod
    def statfs(self):
        Operations.statfs(self)

    @logmethod
    def symlink(self, inode_parent, name, target, ctx):
        Operations.symlink(self, inode_parent, name, target, ctx)
class MovieSpider(CrawlSpider):
    # Called when the spider object is initialized.
    def __init__(self):
        # Call the parent class initializer (the original passed self twice
        # via super().__init__(self)).
        super().__init__()
        # Connect to the MongoDB database.
        self.client = MongoClient("localhost", 27017)
        # Create or open the urls collection.
        self.url_connection = self.client['moviedb']['urls']

    # Called back when the spider object is destroyed.
    def __del__(self):
        self.client.close()

    name = 'mv'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/frim/index1.html']

    link = LinkExtractor(allow=r'/frim/index1-\d+\.html')
    rules = (Rule(link, callback='parse_item', follow=False), )

    # Parse each paginated page and collect the movie detail links.
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            detail_url = "http://www.4567kan.com" + li.xpath(
                './div/a/@href').extract_first()
            # print(detail_url)
            # Check whether the urls collection already contains this
            # detail url.
            cursor = self.url_connection.find({"url": detail_url})
            if cursor.count() == 0:
                # This url has not been visited yet; crawl its data.
                print("This url has not been visited; crawling its data...")
                # Save the current url to the urls collection.
                self.url_connection.insert_one({"url": detail_url})
                # Issue a new request to extract the movie detail page.
                yield scrapy.Request(url=detail_url,
                                     callback=self.parse_detail)
            else:
                # The url has already been visited.
                print("This url has already been visited; skipping")

    # Parse the movie detail page for the movie name and description.
    def parse_detail(self, response):
        # Movie name.
        name = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/h1/text()').extract_first()
        # Movie description.
        desc = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]//text()'
        ).extract_first()
        print(f"Movie name: {name}\nMovie description: {desc}")

        item = MovieprojectItem()
        item['name'] = name
        item['desc'] = desc
        yield item
class MongoController:
    """
    The main MongoDB controller class.

    Attributes:
    port - the port for the MongoDB service.
    temp_dir - the location of the MongoDB data and logs.
    client - a pymongo client pointed at the server.
    db_version - the version of the mongod executable.
    index_version - the version of the indexes created by the mongod
        executable - 1 for < 3.4.0, 2 otherwise.
    includes_system_indexes - true if system indexes will be included when
        listing database indexes, false otherwise.
    """

    def __init__(self, mongoexe: Path, root_temp_dir: Path,
                 use_wired_tiger: bool = False) -> None:
        '''
        Create and start a new MongoDB database. An unused port will be
        selected for the server.

        :param mongoexe: The path to the MongoDB server executable
            (e.g. mongod) to run.
        :param root_temp_dir: A temporary directory in which to store MongoDB
            data and log files. The files will be stored inside a child
            directory that is unique per invocation.
        :param use_wired_tiger: For MongoDB versions > 3.0, specify that the
            Wired Tiger storage engine should be used. Setting this to true
            for other versions will cause an error.
        '''
        if not mongoexe or not os.access(mongoexe, os.X_OK):
            raise test_util.TestException(
                'mongod executable path {} does not exist or is not '
                'executable.'.format(mongoexe))
        if not root_temp_dir:
            raise ValueError('root_temp_dir is None')

        # make temp dirs
        root_temp_dir = root_temp_dir.absolute()
        os.makedirs(root_temp_dir, exist_ok=True)
        self.temp_dir = Path(tempfile.mkdtemp(prefix='MongoController-',
                                              dir=str(root_temp_dir)))
        data_dir = self.temp_dir.joinpath('data')
        os.makedirs(data_dir)

        self.port = test_util.find_free_port()

        command = [str(mongoexe), '--port', str(self.port),
                   '--dbpath', str(data_dir), '--nojournal']
        if use_wired_tiger:
            command.extend(['--storageEngine', 'wiredTiger'])

        self._outfile = open(self.temp_dir.joinpath('mongo.log'), 'w')

        self._proc = subprocess.Popen(command, stdout=self._outfile,
                                      stderr=subprocess.STDOUT)
        time.sleep(1)  # wait for server to start up
        self.client = MongoClient('localhost', self.port)
        # check that the server is up. See
        # https://api.mongodb.com/python/3.7.0/api/pymongo/mongo_client.html
        # #pymongo.mongo_client.MongoClient
        self.client.admin.command('ismaster')

        # get some info about the db
        self.db_version = self.client.server_info()['version']
        self.index_version = 2 if (
            semver.compare(self.db_version, '3.4.0') >= 0) else 1
        self.includes_system_indexes = (
            semver.compare(self.db_version, '3.2.0') < 0
            and not use_wired_tiger)

    def destroy(self, delete_temp_files: bool) -> None:
        """
        Shut down the MongoDB server.

        :param delete_temp_files: delete all the MongoDB data files and logs
            generated during the test.
        """
        if self.client:
            self.client.close()
        if self._proc:
            self._proc.terminate()
        if self._outfile:
            self._outfile.close()
        if delete_temp_files and self.temp_dir:
            shutil.rmtree(self.temp_dir)

    def clear_database(self, db_name, drop_indexes=False):
        '''
        Remove all data from a database.

        :param db_name: the name of the db to clear.
        :param drop_indexes: drop all indexes if true, retain indexes (which
            will be empty) if false.
        '''
        if drop_indexes:
            self.client.drop_database(db_name)
        else:
            db = self.client[db_name]
            for name in db.list_collection_names():
                if not name.startswith('system.'):
                    # don't drop collection since that drops indexes
                    db.get_collection(name).delete_many({})
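# Usage sketch for MongoController above, e.g. inside a test fixture. The
# mongod path and temp dir below are assumptions for illustration.
from pathlib import Path

mc = MongoController(Path('/usr/bin/mongod'), Path('/tmp/mongotests'))
try:
    mc.client['testdb'].things.insert_one({'x': 1})
    mc.clear_database('testdb')
finally:
    mc.destroy(delete_temp_files=True)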
# -*- coding:utf-8 -*-
from time import sleep

from pymongo.mongo_client import MongoClient

try:
    con = MongoClient("192.168.0.88")
    db = con.xe

    f = open("d:/lee/naverNews.txt", "a", encoding="utf-8")
    for nn in db.naverNews.find():
        f.write(str(nn["m"]) + "\t")
        f.write(str(nn["d"]) + "\t")
        f.write(str(nn["h"]) + "\t")
        f.write(nn["t"] + "\t")
        f.write(nn["desc"] + "\n")

    print("done")
    f.close()
    con.close()
except Exception as e:
    print(e)
import logging

from pymongo import MongoClient

# NOTE: this sample targets a MongoDB-compatible API layer (it appears to be
# written for the IBM Informix wire listener): the relational "create ... columns"
# commands, the system.join/system.sql virtual collections, and the transaction
# commands below are not part of standard MongoDB. getDatabaseInfo() and the
# city objects (kansasCity, seattle, etc.) are defined elsewhere in the sample.
# The insert/update/remove calls use the pymongo 2.x-era API; modern equivalents
# are sketched after this function.


def doEverything():
    # Get database connectivity information
    database, url = getDatabaseInfo()

    # Run test
    client = None
    output = []  # initialized before the try block so the except path can append to it
    try:
        client = MongoClient(url)
        db = client[database]
        collectionName = "pythonMongoGalaxy"
        joinCollectionName = "pyJoin"
        cityTableName = "cityTable"
        codeTableName = "codeTable"

        output.append("# 1 Data Structures")
        output.append("# 1.1 Create a collection")
        output.append("Creating collection " + collectionName + " " + joinCollectionName)
        collection = db[collectionName]
        joinCollection = db[joinCollectionName]

        output.append("# 1.2 Create a table")
        output.append("Creating tables " + codeTableName + " " + cityTableName)
        db.command({"create": codeTableName,
                    "columns": [{"name": "countryCode", "type": "int"},
                                {"name": "countryName", "type": "varchar(50)"}]})
        db.command({"create": cityTableName,
                    "columns": [{"name": "name", "type": "varchar(50)"},
                                {"name": "population", "type": "int"},
                                {"name": "longitude", "type": "decimal(8,4)"},
                                {"name": "latitude", "type": "decimal(8,4)"},
                                {"name": "countryCode", "type": "int"}]})

        # insert one
        output.append("# 1 Inserts")
        output.append("# 1.1 Insert a single document to a collection")
        collection.insert(kansasCity.toJSON())
        output.append("Inserted")
        output.append(kansasCity.toJSON())

        # insert many
        output.append("#1.2 Inserting multiple entries into collection")
        multiPost = [seattle.toJSON(), newYork.toJSON(), london.toJSON(),
                     tokyo.toJSON(), madrid.toJSON()]
        collection.insert(multiPost)
        output.append("Inserted \n%s \n%s \n%s \n%s \n%s" %
                      (seattle.toJSON(), newYork.toJSON(), london.toJSON(),
                       tokyo.toJSON(), madrid.toJSON()))

        # Find one
        output.append("\n#2 Queries")
        output.append("#2.1 Find one that matches a query condition")
        output.append(collection.find_one({"name": kansasCity.name}))

        # Find all
        output.append("#2.2 Find all that match a query condition")
        # numeric comparison (the original compared against the string "40.0")
        for doc in collection.find({"longitude": {"$gt": 40.0}}):
            output.append(doc)

        # Display all documents
        output.append("#2.3 Find all documents in collection")
        for doc in collection.find():
            output.append(doc)

        # Count
        output.append("#2.4 Count documents in collection")
        num = collection.find({"population": {"$lt": 8000000}}).count()
        output.append("There are %d documents with a population less than 8 million" % num)

        # Order
        output.append("#2.5 Order documents in collection")
        for doc in collection.find().sort("population", -1):
            output.append(doc)

        # Distinct
        output.append("#2.6 Find distinct codes in collection")
        for doc in collection.distinct("countryCode"):
            output.append(doc)

        # Joins (system.join is a virtual collection provided by the listener)
        output.append("#2.7 Joins")
        sysJoin = db["system.join"]  # renamed from "sys" to avoid shadowing the sys module
        joinCollection.insert({"countryCode": 1, "countryName": "United States of America"})
        joinCollection.insert({"countryCode": 44, "countryName": "United Kingdom"})
        joinCollection.insert({"countryCode": 81, "countryName": "Japan"})
        joinCollection.insert({"countryCode": 34, "countryName": "Spain"})
        joinCollection.insert({"countryCode": 61, "countryName": "Australia"})

        # each row inserted as a single document (the original passed two dicts,
        # which pymongo would misread as insert(doc, manipulate))
        codeTable = db[codeTableName]
        codeTable.insert({"countryCode": 1, "countryName": "United States of America"})
        codeTable.insert({"countryCode": 44, "countryName": "United Kingdom"})
        codeTable.insert({"countryCode": 81, "countryName": "Japan"})
        codeTable.insert({"countryCode": 34, "countryName": "Spain"})
        codeTable.insert({"countryCode": 61, "countryName": "Australia"})

        cityTable = db[cityTableName]
        cityTable.insert(kansasCity.toJSON())
        cityTable.insert(multiPost)

        output.append("#2.7a Join collection-collection")
        joinCollectionCollection = {
            "$collections": {
                collectionName: {"$project": {"name": 1, "population": 1,
                                              "longitude": 1, "latitude": 1}},
                joinCollectionName: {"$project": {"countryCode": 1, "countryName": 1}}},
            "$condition": {"pythonMongoGalaxy.countryCode": "pyJoin.countryCode"}}
        for doc in sysJoin.find(joinCollectionCollection):
            output.append(doc)

        output.append("#2.7b Join table-collection")
        joinTableCollection = {
            "$collections": {
                cityTableName: {"$project": {"name": 1, "population": 1,
                                             "longitude": 1, "latitude": 1}},
                joinCollectionName: {"$project": {"countryCode": 1, "countryName": 1}}},
            "$condition": {"cityTable.countryCode": "pyJoin.countryCode"}}
        for doc in sysJoin.find(joinTableCollection):
            output.append(doc)

        output.append("#2.7c Join table-table")
        joinTableTable = {
            "$collections": {
                cityTableName: {"$project": {"name": 1, "population": 1,
                                             "longitude": 1, "latitude": 1}},
                codeTableName: {"$project": {"countryCode": 1, "countryName": 1}}},
            "$condition": {"cityTable.countryCode": "codeTable.countryCode"}}
        for doc in sysJoin.find(joinTableTable):
            output.append(doc)

        output.append("#2.8 Changed Batch Size")
        # docs = collection.find().batch_size(2)
        # for doc in docs:
        #     output.append(doc)

        output.append("#2.9 Projection clause")
        output.append("Displaying results without longitude and latitude:")
        for doc in collection.find({"countryCode": 1}, {"longitude": 0, "latitude": 0}):
            output.append(doc)

        # update document
        output.append("\n#3 Update Documents")
        collection.update({"name": seattle.name}, {"$set": {"countryCode": 999}})
        output.append("Updated %s with countryCode 999" % seattle.name)

        # delete document
        output.append("\n#4 Delete Documents")
        collection.remove({"name": tokyo.name})
        output.append("Deleted all with name %s" % tokyo.name)

        # Display all collection names
        output.append("\n#5 Get a list of all of the collections")
        output.append(db.collection_names())

        # SQL passthrough (system.sql is a virtual collection provided by the listener)
        output.append("\n#6 SQL passthrough")
        sql = db["system.sql"]
        query = {"$sql": "create table town (name varchar(255), countryCode int)"}
        for doc in sql.find(query):
            output.append(doc)
        query = {"$sql": "insert into town values ('Lawrence', 1)"}
        for doc in sql.find(query):
            output.append(doc)
        query = {"$sql": "drop table town"}
        for doc in sql.find(query):
            output.append(doc)

        # Transactions (listener-specific commands)
        output.append("\n#7 Transactions")
        db.command({"transaction": "enable"})
        collection.insert(sydney.toJSON())
        db.command({"transaction": "commit"})
        collection.insert(melbourne.toJSON())
        db.command({"transaction": "rollback"})
        db.command({"transaction": "disable"})
        for doc in collection.find():
            output.append(doc)

        output.append("\n#8 output")
        output.append("#8.1 Count")
        count = db.command("count", collectionName)
        output.append("There are %d documents in the collection" % count['n'])
        output.append("#8.2 Distinct")
        distinct = db.command("distinct", collectionName, key="countryCode")
        output.append("The distinct country codes are %s" % distinct['values'])
        output.append("#8.3 Collection names")
        output.append(db.collection_names())
        output.append("#8.4 Database stats")
        output.append(db.command("dbstats"))
        output.append("#8.5 Collection stats")
        output.append(db.command("collstats", collectionName))

        output.append("\n#9 Drop a collection")
        db.drop_collection(collectionName)
        db.drop_collection(joinCollectionName)
        db.drop_collection(cityTableName)
        db.drop_collection(codeTableName)
    except Exception as e:
        logging.exception(e)
        output.append("EXCEPTION (see log for details): " + str(e))
    finally:
        if client is not None:
            client.close()
            output.append("Connection to database has been closed")

    return output
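# The CRUD calls above use the pymongo 2.x-era API, which was deprecated in
# pymongo 3 and removed in pymongo 4. Rough modern equivalents are sketched
# below for the standard MongoDB operations only (the listener-specific join,
# SQL passthrough, and transaction commands have no direct counterpart here);
# collection, db, and the city objects are reused from the snippet above.

collection.insert_one(kansasCity.toJSON())       # was collection.insert(doc)
collection.insert_many(multiPost)                # was collection.insert(list_of_docs)
num = collection.count_documents({"population": {"$lt": 8000000}})  # was find(...).count()
collection.update_one({"name": seattle.name},
                      {"$set": {"countryCode": 999}})               # was collection.update(...)
collection.delete_many({"name": tokyo.name})     # was collection.remove(...)
names = db.list_collection_names()               # was db.collection_names()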