def save_to_mongodb(collection: Collection, items: List[dict]):
    """
    Save a list of items to a MongoDB collection.
    """
    # Rewrite the items before saving so they are easier to work with later.
    for item in items:
        # Use each item's id attribute as MongoDB's _id attribute.
        item['_id'] = item['id']
        # Values such as the viewCount property inside statistics are strings,
        # so convert them to numbers.
        for key, value in item['statistics'].items():
            item['statistics'][key] = int(value)
    # Simply using collection.insert_many() raises an error when an _id is
    # duplicated. Instead, use collection.bulk_write() to perform multiple
    # upserts (insert or update) in one batch.
    operations = [
        ReplaceOne({'_id': item['_id']}, item, upsert=True) for item in items
    ]
    result = collection.bulk_write(operations)
    logging.info(f'Upserted {result.upserted_count} documents.')
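A minimal usage sketch for the function above, assuming a local MongoDB instance; the database and collection names and the sample item are hypothetical:

from typing import List

from pymongo import MongoClient
from pymongo.collection import Collection

client = MongoClient('localhost', 27017)  # assumed local instance
collection = client.youtube.videos  # hypothetical database/collection names
# Sample item shaped like the expected input: an 'id' key plus string-valued statistics.
items = [{'id': 'abc123', 'statistics': {'viewCount': '42', 'likeCount': '7'}}]
save_to_mongodb(collection, items)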
def set_mongo_track_data(df_name, info_df):
    ops_list = []
    if df_name == "liked_track_features":
        db_coll = mongo_spotify_tracks
        id_col = "track_spid"
    elif df_name == "liked_track_artist_features":
        db_coll = mongo_spotify_artists
        id_col = "artist_spid"
        info_df["artist_genres"] = info_df["artist_genres"].map(list)
    else:
        return False
    feature_records = info_df.to_dict("records")
    for record in feature_records:
        ops_list.append(ReplaceOne({id_col: record[id_col]}, record, upsert=True))
    chunked_ops = chunk(ops_list, 1000)
    for ops in chunked_ops:
        db_coll.bulk_write(ops, ordered=False)
    return True
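The `chunk` helper is not shown in this snippet; a minimal sketch of what it presumably does, yielding slices of at most `size` operations so each bulk_write stays bounded:

def chunk(items, size):
    # Yield successive slices of at most `size` elements.
    for i in range(0, len(items), size):
        yield items[i:i + size]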
async def update_players():
    """Updates all players in the database."""
    logger.info("Updating FPL players in database.")
    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        players = await fpl.get_players(include_summary=True, return_json=True)
    for player in players:
        player["team"] = team_converter(player["team"])

    requests = [
        ReplaceOne({"id": player["id"]}, player, upsert=True)
        for player in players
    ]
    database.players.bulk_write(requests)

    logger.info("Adding Understat data to players in database.")
    understat_players = await get_understat_players()
    for player in understat_players:
        # Only update FPL player with desired attributes
        understat_attributes = {
            attribute: value
            for attribute, value in player.items()
            if attribute in desired_attributes
        }

        # Use player's full name and team to try and find the correct player
        search_string = f"{player['player_name']} {player['team_title']}"
        players = database.players.find(
            {"$text": {"$search": search_string}},
            {"score": {"$meta": "textScore"}}
        ).sort([("score", {"$meta": "textScore"})])

        relevant_player = list(players)[0]
        database.players.update_one(
            {"id": relevant_player["id"]},
            {"$set": understat_attributes})
def save_update_areas(areas):
    saved_areas = list(
        map(lambda d: dict_to_area(d), area_repo.get_all_areas(0)))
    # Need to create update objects for both new objects and updated ones.
    new_or_updated = [item for item in areas if item not in saved_areas]
    print("new or updated = " + str(new_or_updated))
    if len(new_or_updated) == 0:
        return
    for area in new_or_updated:
        area.last_update = datetime.datetime.now()
    db.areas.bulk_write(
        list(
            map(lambda r: ReplaceOne({'_id': r.id}, r.to_dict(), upsert=True),
                new_or_updated)))
def save_update_regions(regions):
    saved_regions = list(
        map(lambda d: dict_to_region(d), region_repo.get_all_regions(0)))
    # Need to create update objects for both new objects and updated ones.
    new_or_updated = [item for item in regions if item not in saved_regions]
    print("new or updated = " + str(new_or_updated))
    if len(new_or_updated) == 0:
        return
    for region in new_or_updated:
        region.last_update = datetime.datetime.now()
    db.regions.bulk_write(
        list(
            map(lambda r: ReplaceOne({'_id': r.id}, r.to_dict(), upsert=True),
                new_or_updated)))
def process_bib_records(self, chunk_no, no_of_chunks, lbibs, itpp_bib_fields):
    chunk_size = len(lbibs) // (no_of_chunks - 1)
    if chunk_no == (no_of_chunks - 1):
        end_rec = len(lbibs)
    else:
        end_rec = chunk_no * chunk_size
    for bib in lbibs[(chunk_no - 1) * chunk_size:end_rec]:
        # print(f"bib id is {bib.id}")
        bib_dict = {}
        if "ITS" in bib.get_values('930', 'a'):
            bib_dict["record_type"] = "ITS"
        elif "VOT" in bib.get_values('930', 'a'):
            bib_dict["record_type"] = "VOT"
        else:
            bib_dict["record_type"] = "BIB"
        bib_dict["record_id"] = bib.id
        bib_dict["bodysession"] = self.body + '/' + self.session
        bib_dict["snapshot_id"] = str(bib.id) + self.body + self.session
        dt = datetime.now(timezone.utc)
        time_string = dt.strftime(self.TIME)
        bib_dict["snapshottime"] = time_string
        for itpp_field_subfields in itpp_bib_fields:
            sbflds = []
            for elem in itpp_field_subfields:
                field = elem[0]
                sbflds.extend(elem[1])
            temp_dict = {}
            temp_dict[field] = self.list_of_subfields(bib, field, sbflds)
            if len(temp_dict[field]) > 1:
                bib_dict[field] = temp_dict[field]
            elif len(temp_dict[field]) == 1:
                bib_dict[field] = temp_dict[field][0]
            else:
                bib_dict[field] = ""
        # snapshot_list_bibs.append(bib_dict)
        # query = {"record_id": bib_dict["record_id"]}
        query = {"snapshot_id": bib_dict["snapshot_id"]}
        self.replace_list_recs.append(ReplaceOne(query, bib_dict, upsert=True))
    return len(self.replace_list_recs)
def _put_batch(self, keys: List[bytes], values: List[bytes],
               expire_time_mss: List[Optional[int]]):
    """
    Batch insert.

    :param keys: List[bytes].
    :param values: List[bytes].
    :param expire_time_mss: List[Optional[int]]. The expiration time for
        each item in ``values``.
    """
    keys = [base64.b16encode(key).decode() for key in keys]
    replaces = []
    for key, value, expire_time_ms in zip(keys, values, expire_time_mss):
        replaces.append(
            ReplaceOne({'key': key}, {
                'key': key,
                'value': value,
                'expire_time_ms': expire_time_ms,
            }, upsert=True))
    self.c_collection.bulk_write(replaces, ordered=False)
def batch_update(self, data_list):
    """
    Batch operation: update documents that already exist, insert those
    that do not.

    :param data_list:
    :return:
    """
    if not data_list:
        return 0
    update_operations = list()
    try:
        for data in data_list:
            op = ReplaceOne({"ip": data["ip"]}, replacement=data, upsert=True)
            update_operations.append(op)
        self.get_conn().bulk_write(update_operations, ordered=False)
        return 1
    except Exception:
        logger.error(traceback.format_exc())
        return 0
def write(self, documents):
    if self.client is None:
        self.client = MongoClient(host=self.uri, **self.spec)

    requests = []
    for doc in documents:
        # Match the document on its _id field; if it is not found in the
        # current collection, insert a new one, otherwise overwrite it.
        requests.append(
            ReplaceOne(
                filter={'_id': doc.get('_id', None)},
                replacement=doc,
                upsert=True))
    resp = self.client[self.db][self.coll].bulk_write(requests)
    _LOGGER.debug(
        'BulkWrite to MongoDB result in nModified:%d, nUpserted:%d, '
        'nMatched:%d, Errors:%s' % (
            resp.modified_count, resp.upserted_count, resp.matched_count,
            resp.bulk_api_result.get('writeErrors')))
def replace_one(self, instance, **kwargs):
    '''
    Replace a single instance's document entirely::

        person1 = Person.new(name='joe', age=30)
        person2 = Person.new(name='jill', age=40)

        bulk = Person.bulk()
        bulk.replace_one(person1, name='joejoe', age=50)
        bulk.replace_one(person2, name='jilly', age=60)
        bulk.save()

        Person.refresh_all_from_db([person1, person2])
        # Now their documents are overwritten entirely

    :param instance: the instance to add a replace op to
    :return: the ``pymongo.ReplaceOne`` op
    '''
    query = _inst_to_query(instance)
    rep = ReplaceOne(query, kwargs, upsert=False)
    self.ops.append(rep)
    return rep
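`_inst_to_query` is defined elsewhere; a plausible minimal sketch, assuming each instance exposes its primary key as `id`:

def _inst_to_query(instance):
    # Hypothetical helper: build a filter matching the instance's document.
    return {'_id': instance.id}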
def replace(self, col_name, query_builder, data, count, is_finish=False):
    """
    :param col_name: collection name
    :param data: target data
    :param count: running count of processed documents
    """
    col = self.db[col_name]
    cur_time = datetime.datetime.utcnow()
    del data['_id']
    data['updateTime'] = cur_time
    if not is_finish:
        self.bulk.append(ReplaceOne(query_builder, data, upsert=True))
    if len(self.bulk) >= self.MONGOBULK or is_finish:
        s = time.time()
        col.bulk_write(self.bulk)
        e = time.time()
        self.bulk = []
        print("*** %ss ***, replaced %s docs, %s processed so far"
              % (e - s, self.MONGOBULK, count))
async def __import__(chat_id, data):
    if not data:
        return
    new = []
    for note in data:
        # File ver 1 to 2
        if 'name' in note:
            note['names'] = [note['name']]
            del note['name']
        for item in [i for i in note if i not in ALLOWED_COLUMNS_NOTES]:
            del note[item]
        note['chat_id'] = chat_id
        note['created_date'] = datetime.fromisoformat(note['created_date'])
        if 'edited_date' in note:
            note['edited_date'] = datetime.fromisoformat(note['edited_date'])
        new.append(ReplaceOne(
            {'chat_id': note['chat_id'], 'names': {'$in': [note['names'][0]]}},
            note,
            upsert=True))
    await db.notes.bulk_write(new)
async def upsert_many(self, events: List[Event]) -> int:
    if len(events) == 0:
        return 0
    requests = []
    for event in events:
        existing_event = await self.event_collection.find_one(
            {"url": event.url})
        if existing_event is None:
            requests.append(InsertOne(event.dict(by_alias=True)))
        else:
            event.event_id = existing_event["_id"]
            event.date_published = existing_event["date_published"]
            if (event.description != existing_event["description"]
                    or event.title != existing_event["title"]):
                event.date_published = datetime.now(tz=pytz.utc)
            requests.append(
                ReplaceOne({"_id": event.event_id},
                           event.dict(by_alias=True),
                           upsert=True))
    response = await self.event_collection.bulk_write(requests)
    return response.upserted_count + response.inserted_count
def delete_expired_tokens(self):
    """Delete expired tokens. Also, remove docs with no tokens."""
    now = datetime.utcnow()
    bulk_requests = []
    docs = list(
        self.mgdb.tokens.find({
            "$or": [
                {"link.expires": {"$lte": now}},
                {"fetch.expires": {"$lte": now}},
                {"link": [], "fetch": []},
            ]
        }))
    for d in docs:
        if not d["link"] and not d["fetch"]:
            bulk_requests.append(DeleteOne(dict(_id=d["_id"])))
            continue
        link = [t for t in d["link"] if t["expires"] > now]
        fetch = [t for t in d["fetch"] if t["expires"] > now]
        if not link and not fetch:
            bulk_requests.append(DeleteOne(dict(_id=d["_id"])))
        else:
            bulk_requests.append(
                ReplaceOne(dict(_id=d["_id"]),
                           dict(email=d["email"], link=link, fetch=fetch)))
    if bulk_requests:
        self.mgdb.tokens.bulk_write(bulk_requests)
def bulk_write_data_to_document_db_internal(cluster_name, namespace,
                                            database_name, collection_name,
                                            data):
    logger.info("About to apply %d bulk operations on the namespace: %s",
                len(data), namespace)
    connection_string = get_cluster_connection_string(cluster_name)
    bulk_ops = []
    for item in data:
        op = ReplaceOne({"_id": item["_id"]}, item, upsert=True)
        bulk_ops.append(op)
    logger.info(
        "Completed creating the %d replace_one bulk operations for namespace: %s",
        len(data), namespace)
    try:
        with MongoClient(connection_string) as client:
            collection = client.get_database(database_name).get_collection(
                collection_name)
            result = collection.bulk_write(bulk_ops)
            logger.info(
                "Successfully wrote %d documents to namespace %s on Document DB.",
                len(data), namespace)
            return True
    except BulkWriteError as bwe:
        if 'writeErrors' in bwe.details:
            write_errors = bwe.details['writeErrors']
            error_count = len(write_errors)
            dupe_count = len([
                we for we in write_errors
                if "E11000 duplicate key error" in we["errmsg"]
            ])
            if error_count == dupe_count:
                logger.info(
                    "Ignoring the duplicate key errors while writing on cluster: %s, namespace: %s",
                    cluster_name, namespace)
                return True
        # TODO: have a retry logic BulkWriteError: batch op errors occurred
        logger.exception("Exception while doing bulk operations. %s",
                         bwe.details, exc_info=True)
        raise
def getScore(self):
    userConn = self.db.users
    conn = self.db.score
    user, id = self.queryUser()
    if not user:
        return False
    login = urpLogin(user)
    login.login()
    time.sleep(0.1)
    urpScore = getScore(user)
    score = urpScore.getAll()
    if score:
        scoreInsert = {'num': user[0], 'score': score}
        # Match on the student number only, so the existing score document
        # is replaced instead of a duplicate being inserted when scores change.
        scoreList = [ReplaceOne({'num': user[0]}, scoreInsert, upsert=True)]
        result = conn.bulk_write(scoreList)
        userConn.find_one_and_update({'_id': id}, {'$set': {'status': 1}})
        return user[0] + ' -- success'
    else:
        userConn.find_one_and_update({'_id': id}, {'$set': {'status': 0}})
        return user[0] + ' -- fail'
def write_data(self, doc: dict, doc_key: str = None, force_timestamp=True):
    """write document with _ts (timestamp) included

    :Parameters:
      - `doc`: A document to be written
      - `doc_key` (optional): Document key (_id) to be used for document
        replacement/upsert
    """
    if force_timestamp:
        doc['_ts'] = datetime.now()
    else:
        doc['_ts'] = doc.get('_ts', datetime.now())
    if doc_key is not None:
        doc['_id'] = '%s' % doc_key
        self._statements.append(
            ReplaceOne(filter={'_id': doc['_id']},
                       replacement=doc,
                       upsert=True))
    else:
        self._statements.append(InsertOne(document=doc))
    if len(self._statements) > self._threshold:
        self._write_to_server()
    return self._write_counter
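`_write_to_server` is not part of the snippet; a minimal flush sketch under the assumption that `self._collection` holds the target pymongo collection and `self._write_counter` accumulates totals:

def _write_to_server(self):
    # Flush buffered operations in a single bulk_write, then reset the buffer.
    if not self._statements:
        return
    result = self._collection.bulk_write(self._statements, ordered=False)
    self._write_counter += (result.inserted_count + result.upserted_count
                            + result.modified_count)
    self._statements = []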
def update(self, docs, update_lu=True, key=None, ordered=True, **kwargs):
    """
    Function to update associated MongoStore collection.

    Args:
        docs: list of documents
    """
    requests = []
    for d in docs:
        d = jsanitize(d, allow_bson=True)
        # Document-level validation is optional
        validates = True
        if self.validator:
            validates = self.validator.is_valid(d)
            if not validates:
                if self.validator.strict:
                    raise ValueError('Document failed to validate: {}'.format(d))
                else:
                    self.logger.error('Document failed to validate: {}'.format(d))
        if validates:
            if isinstance(key, list):
                search_doc = {k: d[k] for k in key}
            elif key:
                search_doc = {key: d[key]}
            else:
                search_doc = {self.key: d[self.key]}
            if update_lu:
                d[self.lu_field] = datetime.utcnow()
            requests.append(ReplaceOne(search_doc, d, upsert=True))
    self.collection.bulk_write(requests, ordered=ordered)
def replace_records(self, mongo, docs):
    operations = []
    for doc in docs:
        mongo_replacement_filter = dict()
        if isinstance(self.mongo_replacement_filter, str):
            mongo_replacement_filter = {
                self.mongo_replacement_filter:
                doc.get(self.mongo_replacement_filter, False)
            }
        elif isinstance(self.mongo_replacement_filter, dict):
            for k, v in self.mongo_replacement_filter.items():
                if k == v:
                    mongo_replacement_filter[k] = doc.get(k, False)
                else:
                    mongo_replacement_filter[k] = \
                        self.mongo_replacement_filter.get(k, False)
        operations.append(
            ReplaceOne(mongo_replacement_filter, doc, upsert=True))

        # Send a batch once every 1000 operations
        if len(operations) == 1000:
            logging.info('Making Request....')
            mongo.bulk_write(self.mongo_collection,
                             operations,
                             mongo_db=self.mongo_db,
                             ordered=False)
            operations = []
            logging.info('Request successfully finished....')

    if len(operations) > 0:
        logging.info('Making Final Request....')
        mongo.bulk_write(self.mongo_collection,
                         operations,
                         mongo_db=self.mongo_db,
                         ordered=False)
        logging.info('Final Request Finished.')
def queue_replace(self, filter_document, update_document, *args, **kwargs):
    self.batch.append(
        ReplaceOne(filter_document, update_document, *args, **kwargs))
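The companion method that drains the queue is not shown; a minimal sketch, assuming `self.collection` is the target pymongo collection:

def flush_batch(self):
    # Hypothetical companion method: submit queued ops in one bulk_write.
    if not self.batch:
        return None
    result = self.collection.bulk_write(self.batch, ordered=False)
    self.batch = []
    return result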
# print(post1)
# post1 = posts.find_one({"_id": post_id})
# print(post1)
# post1 = posts.find_one({'_id': ObjectId("5a07a674dcfba13028c7022b")})
# print(post1)
# posts.remove({"author": "Mike"})

# bulk insert
# ids = posts.insert_many([{'i': i} for i in range(10000)]).inserted_ids
# print("count of posts:", posts.count())

posts.delete_many({})
result = posts.bulk_write([
    DeleteMany({}),  # Remove all documents from the previous example.
    InsertOne({'_id': 1}),
    InsertOne({'_id': 2}),
    InsertOne({'_id': 3}),
    UpdateOne({'_id': 1}, {'$set': {'foo': 'bar'}}),
    UpdateOne({'_id': 4}, {'$inc': {'j': 1}}, upsert=True),
    ReplaceOne({'j': 1}, {'j': 2}),
])
pprint(result.bulk_api_result)

for post in posts.find():
    pprint(post)
def _serialize_graph(session_id, req_id, sequence_graph):
    db = get_db()
    if db.sessions.find_one({'_id': session_id}) is None:
        raise SessionNotInitializedException()
    if db.requests.find_one({'_id': req_id}) is not None:
        return

    updates = []
    functions = {}
    for k, func in sequence_graph.graph['functions'].items():
        id, update = _serialize_func(db, session_id, k, func)
        functions[k] = id
        if update is not None:
            updates.append(update)
    if len(updates) > 0:
        db.functions.bulk_write(updates)
        logger.info("Registered %i functions." % len(updates))

    updates = []
    nodes = {}
    for n in sequence_graph:
        node = sequence_graph.nodes[n]
        nodes[n] = _serialize_node(db, session_id, n, node, functions)
    saves = _merge_graph(sequence_graph, nodes)
    for n in saves:
        entry = nodes[n]
        if entry is None:
            continue
        updates.append(
            ReplaceOne({'_id': entry['_id']}, entry, upsert=True))
    if len(updates) > 0:
        db.function_graph.bulk_write(updates)
        logger.info("Updated %i nodes in session graph." % len(updates))

    start = sequence_graph.graph['start']
    stop = sequence_graph.graph['stop']
    start_ids = []
    stop_ids = []
    for _, v in sequence_graph.out_edges(start):
        start_ids.append(nodes[v]['_id'])
    for u, _ in sequence_graph.in_edges(stop):
        stop_ids.append(nodes[u]['_id'])

    entry = {
        '_id': req_id,
        'session_id': session_id,
        'start_time': str(datetime.datetime.now()),
        'initials': start_ids,
        'endpoints': stop_ids
    }
    db.requests.insert_one(entry)

    session = db.sessions.find_one({'_id': session_id})
    start_points = set(session['start_points'])
    start_points = start_points.union(set(start_ids))
    session['start_points'] = list(start_points)
    db.sessions.replace_one({'_id': session_id}, session)
    logger.info("Created execution request [session: %s, request_id: %s]"
                % (session_id, req_id))
def copy(source_params, dest_params, start_date, end_date):
    source_db = connect_mongodb(source_params)
    source_tables = get_table_names(source_params)
    dest_db = connect_mongodb(dest_params)
    dest_tables = get_table_names(dest_params)

    # records = []
    # for record in source_db[source_tables['topics_table']].find():
    #     records.append(record)
    # print("total records {}".format(len(records)))
    # dest_db[dest_tables['topics_table']].insert_many(records)
    #
    # records = []
    # for record in source_db[source_tables['meta_table']].find():
    #     records.append(record)
    # print("total records {}".format(len(records)))
    # dest_db[dest_tables['meta_table']].insert_many(records)

    # This is probably the most inefficient way of copying a subset of a
    # collection to another database, but it is the one that requires the
    # minimum access. Wish this feature request is closed soon:
    # https://jira.mongodb.org/browse/SERVER-13201
    # Aggregation is the fastest way to get a subset of data from a
    # collection; next would be map reduce. Map reduce can write output to
    # another db, but it would only generate docs of schema
    # id:<object id>, value:<search/mapreduce result>
    dest_db[dest_tables['data_table']].create_index(
        [('topic_id', pymongo.DESCENDING), ('ts', pymongo.DESCENDING)],
        unique=True,
        background=False)
    records = []
    i = 0
    print("start obj:{}".format(ObjectId.from_datetime(start_date)))
    print("end obj:{}".format(ObjectId.from_datetime(end_date)))
    cursor = source_db[source_tables['data_table']].find({
        '$and': [
            {'_id': {'$gte': ObjectId.from_datetime(start_date)}},
            {'_id': {'$lte': ObjectId.from_datetime(end_date)}}
        ]
    })
    print("Record count from cursor {}".format(cursor.count()))
    for record in cursor:
        i += 1
        records.append(
            ReplaceOne(
                {'ts': record['ts'], 'topic_id': record['topic_id']},
                {'ts': record['ts'], 'topic_id': record['topic_id'],
                 'value': record['value']},
                upsert=True))
        if i == 2000:
            print("total records {}".format(len(records)))
            dest_db[dest_tables['data_table']].bulk_write(records)
            i = 0
            records = []
    # Flush any remaining operations that did not fill a full batch.
    if records:
        print("total records {}".format(len(records)))
        dest_db[dest_tables['data_table']].bulk_write(records)
def replace_one(self, criteria, document, upsert=False):
    self._batch.append(
        ReplaceOne(criteria, to_mongo(document), upsert=upsert))
def publish_to_historian(self, to_publish_list):
    _log.debug("publish_to_historian number of items: {}".format(
        len(to_publish_list)))

    # Use the db instance to insert/update the topics
    # and data collections
    db = self._client.get_default_database()

    bulk_publish = []
    for x in to_publish_list:
        ts = x['timestamp']
        topic = x['topic']
        value = x['value']
        meta = x['meta']

        # Look at the topics that are stored in the database already
        # to see if this topic has a value.
        topic_lower = topic.lower()
        topic_id = self._topic_id_map.get(topic_lower, None)
        db_topic_name = self._topic_name_map.get(topic_lower, None)
        if topic_id is None:
            row = db[self._topic_collection].insert_one(
                {'topic_name': topic})
            topic_id = row.inserted_id
            self._topic_id_map[topic_lower] = topic_id
            self._topic_name_map[topic_lower] = topic
        elif db_topic_name != topic:
            _log.debug('Updating topic: {}'.format(topic))
            result = db[self._topic_collection].update_one(
                {'_id': ObjectId(topic_id)},
                {'$set': {'topic_name': topic}})
            assert result.matched_count
            self._topic_name_map[topic_lower] = topic

        old_meta = self._topic_meta.get(topic_id, {})
        if set(old_meta.items()) != set(meta.items()):
            _log.debug('Updating meta for topic: {} {}'.format(topic, meta))
            db[self._meta_collection].insert_one({
                'topic_id': topic_id,
                'meta': meta
            })
            self._topic_meta[topic_id] = meta

        # Reformat into the filter/replacement pair the bulk upsert expects.
        bulk_publish.append(
            ReplaceOne(
                {'ts': ts, 'topic_id': topic_id},
                {'ts': ts, 'topic_id': topic_id, 'value': value},
                upsert=True))
        # bulk_publish.append(InsertOne(
        #     {'ts': ts, 'topic_id': topic_id, 'value': value}))

    try:
        # http://api.mongodb.org/python/current/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
        result = db[self._data_collection].bulk_write(bulk_publish)
    except BulkWriteError as bwe:
        _log.error("{}".format(bwe.details))
    else:
        # No write errors occurred if this branch is reached.
        if not result.bulk_api_result['writeErrors']:
            self.report_all_handled()
        else:
            # TODO handle when something happens during writing of data.
            _log.error('SOME THINGS DID NOT WORK')
"dims": { "width": 80, "length": 30, "height": 30 }, "age": 40 }), ReplaceOne ({ "id": 2, "name": "Ann", "dims": { "width": 60, "length": 80, "height": 25 }, "age": 33 }, { "id": 2, "name": "Annie", "dims": { "width": 70, "length": 50, "height": 25 }, "age": 41 }, upsert=True)] result = qtest.bulk_write (requests) result.inserted_count result.deleted_count result.modified_count result.upserted_ids
def put(self):
    """
    Replace existing Product objects
    ---
    parameters:
      - in: body
        description: List of Product objects to be inserted to database
        required: true
        schema:
          type: array
          items:
            properties:
              _id:
                type: string
              data:
                type: object
                properties:
                  name:
                    type: string
                  brand_name:
                    type: string
                  regular_price_value:
                    type: number
                    format: float
                  offer_price_value:
                    type: number
                    format: float
                  currency:
                    type: string
                  classification_l1:
                    type: string
                  classification_l2:
                    type: string
                  classification_l3:
                    type: string
                  classification_l4:
                    type: string
                  image_url:
                    type: string
    responses:
      '201':
        description: Bulk Write result object from MongoDB
        content:
          application/json:
            schema:
              type: object
              properties:
                acknowledged:
                  type: boolean
                matched_count:
                  type: string
                modified_count:
                  type: integer
                deleted_count:
                  type: integer
                upserted_ids:
                  type: array
                  items:
                    type: object
                    properties:
                      _id:
                        type: string
                inserted_count:
                  type: integer
      '500':
        description: Server encountered an error while performing bulk operation
        content:
          application/json:
            schema:
              type: object
              properties:
                message:
                  type: string
    """
    requests = []
    for q in request.json:
        requests.append(ReplaceOne({"_id": ObjectId(q["_id"])}, q["data"]))
    return perform_bulk(collection, requests), 201
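`perform_bulk` is defined elsewhere in this service; a minimal sketch of what it presumably does, executing the batch and serializing the pymongo BulkWriteResult into the documented response shape:

def perform_bulk(collection, requests):
    # Hypothetical helper: run the batch and return a JSON-friendly summary.
    result = collection.bulk_write(requests)
    return {
        "acknowledged": result.acknowledged,
        "matched_count": result.matched_count,
        "modified_count": result.modified_count,
        "deleted_count": result.deleted_count,
        "upserted_ids": {str(k): str(v) for k, v in result.upserted_ids.items()},
        "inserted_count": result.inserted_count,
    }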
def insert_new_records(input_collection, output_collection, collection_str):
    try:
        # Get the last insertion time in the input collection.
        last_inserted_doc_input = input_collection.find({}).sort(
            'insertion_datetime', pymongo.DESCENDING).limit(1)
        last_insertion_time_input = last_inserted_doc_input[0][
            'insertion_datetime']

        # Log last insertion time.
        log_message = "Last insertion datetime in " + collection_str + ": " + str(
            last_insertion_time_input)
        logging.info(log_message)

        # print for testing
        # print dumps(last_inserted_doc[0], indent=2, default=json_util.default)

        # Get the last insertion time in the output collection.
        if "ies" not in collection_str:
            m_type = str(collection_str[:-1])
        else:
            m_type = str(collection_str[:-3] + "y")
        last_inserted_doc_output = output_collection.find({
            "type": m_type
        }).sort('insertion_datetime', pymongo.DESCENDING).limit(1)
        last_insertion_time_output = last_inserted_doc_output[0][
            'insertion_datetime']

        # Log last insertion time.
        log_message = ("Last insertion datetime in roadSensorValue for type "
                       + m_type + ": " + str(last_insertion_time_output))
        logging.info(log_message)

        # Get the datetime window.
        end_date_time = last_insertion_time_input - datetime.timedelta(
            minutes=5)
        start_date_time = last_insertion_time_output - datetime.timedelta(
            minutes=5)

        # print for testing
        # print collection_str
        # print str(last_insertion_time_input)
        # print str(last_insertion_time_output)
        # print str(start_date_time)
        # print str(end_date_time)
        # print '***************************'

        # Retrieve the records within the time window from the input collection.
        query = {
            'insertion_datetime': {
                '$gte': start_date_time,
                '$lte': end_date_time
            }
        }
        cursor_count = int(input_collection.find(query).count())
        log_message = str(
            cursor_count) + " records to be inserted from " + collection_str
        logging.info(log_message)
        cursor = input_collection.find(query)

        # Bulk write to the output collection.
        requests = []
        for doc in cursor:
            requests.append(ReplaceOne({"_id": doc["_id"]}, doc, upsert=True))
        result = output_collection.bulk_write(requests)

        # Log counts.
        log_message = str(
            result.upserted_count
        ) + " records were upserted from " + collection_str + " to roadSensorValue"
        logging.info(log_message)
        log_message = str(
            result.matched_count) + " records were matched in roadSensorValue"
        logging.info(log_message)
    except Exception as e:
        print(e)
        sys.exit()
def _sync_collection(self, src_dbname, src_collname, dst_dbname, dst_collname):
    """Sync a collection through batch writes."""
    self._logger.info(
        "[%s] sync collection '%s.%s'"
        % (self._current_process_name, src_dbname, src_collname))
    while True:
        try:
            n = 0
            reqs = []
            batchsize = 1000
            cursor = self._src_mc[src_dbname][src_collname].find(
                filter=None,
                cursor_type=pymongo.cursor.CursorType.EXHAUST,
                no_cursor_timeout=True,
                modifiers={'$snapshot': True})
            count = cursor.count()
            if count == 0:
                self._logger.info('[%s] \t skip empty collection'
                                  % (self._current_process_name))
                return
            for doc in cursor:
                reqs.append(ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
                if len(reqs) == batchsize:
                    self._bulk_write(dst_dbname, dst_collname, reqs,
                                     ordered=False)
                    reqs = []
                n += 1
                if n % 10000 == 0:
                    self._logger.info(
                        '[%s] \t %s.%s %d/%d (%.2f%%)'
                        % (self._current_process_name, src_dbname,
                           src_collname, n, count, float(n) / count * 100))
            if len(reqs) > 0:
                self._bulk_write(dst_dbname, dst_collname, reqs, ordered=False)
            self._logger.info(
                '[%s] \t %s.%s %d/%d (%.2f%%)'
                % (self._current_process_name, src_dbname, src_collname,
                   n, count, float(n) / count * 100))
            return
        except pymongo.errors.AutoReconnect:
            self._src_mc.close()
            self._src_mc = self.reconnect(self._src_host,
                                          self._src_port,
                                          username=self._src_username,
                                          password=self._src_password,
                                          w=self._w)
def process(self, instance):
    self.log.debug(
        "--- Integration of Master version for subset `{}` begins.".format(
            instance.data.get("subset", str(instance))
        )
    )
    published_repres = instance.data.get("published_representations")
    if not published_repres:
        self.log.debug(
            "*** There are no published representations on the instance."
        )
        return

    project_name = api.Session["AVALON_PROJECT"]

    # TODO raise error if master not set?
    anatomy = instance.context.data["anatomy"]
    if "master" not in anatomy.templates:
        self.log.warning("!!! Anatomy does not have set `master` key!")
        return

    if "path" not in anatomy.templates["master"]:
        self.log.warning((
            "!!! There is not set `path` template in `master` anatomy"
            " for project \"{}\"."
        ).format(project_name))
        return

    master_template = anatomy.templates["master"]["path"]
    self.log.debug("`Master` template check was successful. `{}`".format(
        master_template
    ))

    master_publish_dir = self.get_publish_dir(instance)

    src_version_entity = instance.data.get("versionEntity")
    filtered_repre_ids = []
    for repre_id, repre_info in published_repres.items():
        repre = repre_info["representation"]
        if repre["name"].lower() in self.ignored_representation_names:
            self.log.debug(
                "Filtering representation with name: `{}`".format(
                    repre["name"].lower()
                )
            )
            filtered_repre_ids.append(repre_id)

    for repre_id in filtered_repre_ids:
        published_repres.pop(repre_id, None)

    if not published_repres:
        self.log.debug(
            "*** All published representations were filtered by name."
        )
        return

    if src_version_entity is None:
        self.log.debug((
            "Published version entity was not sent in representation data."
            " Querying entity from database."
        ))
        src_version_entity = (
            self.version_from_representations(published_repres)
        )

    if not src_version_entity:
        self.log.warning((
            "!!! Can't find origin version in database."
            " Skipping Master version publish."
        ))
        return

    all_copied_files = []
    transfers = instance.data.get("transfers", list())
    for _src, dst in transfers:
        dst = os.path.normpath(dst)
        if dst not in all_copied_files:
            all_copied_files.append(dst)

    hardlinks = instance.data.get("hardlinks", list())
    for _src, dst in hardlinks:
        dst = os.path.normpath(dst)
        if dst not in all_copied_files:
            all_copied_files.append(dst)

    all_repre_file_paths = []
    for repre_info in published_repres.values():
        published_files = repre_info.get("published_files") or []
        for file_path in published_files:
            file_path = os.path.normpath(file_path)
            if file_path not in all_repre_file_paths:
                all_repre_file_paths.append(file_path)

    # TODO this is not best practice of getting resources for publish
    # WARNING due to this we must remove all files from master publish dir
    instance_publish_dir = os.path.normpath(
        instance.data["publishDir"]
    )
    other_file_paths_mapping = []
    for file_path in all_copied_files:
        # Check if it is from publishDir
        if not file_path.startswith(instance_publish_dir):
            continue

        if file_path in all_repre_file_paths:
            continue

        dst_filepath = file_path.replace(
            instance_publish_dir, master_publish_dir
        )
        other_file_paths_mapping.append((file_path, dst_filepath))

    # Current version
    old_version, old_repres = (
        self.current_master_ents(src_version_entity)
    )

    old_repres_by_name = {
        repre["name"].lower(): repre for repre in old_repres
    }

    if old_version:
        new_version_id = old_version["_id"]
    else:
        new_version_id = io.ObjectId()

    new_master_version = {
        "_id": new_version_id,
        "version_id": src_version_entity["_id"],
        "parent": src_version_entity["parent"],
        "type": "master_version",
        "schema": "pype:master_version-1.0"
    }

    schema.validate(new_master_version)

    # Don't make changes in database until everything is O.K.
    bulk_writes = []

    if old_version:
        self.log.debug("Replacing old master version.")
        bulk_writes.append(
            ReplaceOne(
                {"_id": new_master_version["_id"]},
                new_master_version
            )
        )
    else:
        self.log.debug("Creating first master version.")
        bulk_writes.append(
            InsertOne(new_master_version)
        )

    # Separate old representations into `to replace` and `to delete`
    old_repres_to_replace = {}
    old_repres_to_delete = {}
    for repre_info in published_repres.values():
        repre = repre_info["representation"]
        repre_name_low = repre["name"].lower()
        if repre_name_low in old_repres_by_name:
            old_repres_to_replace[repre_name_low] = (
                old_repres_by_name.pop(repre_name_low)
            )

    if old_repres_by_name:
        old_repres_to_delete = old_repres_by_name

    archived_repres = list(io.find({
        "type": "archived_representation",
        "parent": new_version_id
    }))
    archived_repres_by_name = {}
    for repre in archived_repres:
        repre_name_low = repre["name"].lower()
        archived_repres_by_name[repre_name_low] = repre

    backup_master_publish_dir = None
    if os.path.exists(master_publish_dir):
        backup_master_publish_dir = master_publish_dir + ".BACKUP"
        max_idx = 10
        idx = 0
        _backup_master_publish_dir = backup_master_publish_dir
        while os.path.exists(_backup_master_publish_dir):
            self.log.debug((
                "Backup folder already exists."
                " Trying to remove \"{}\""
            ).format(_backup_master_publish_dir))

            try:
                shutil.rmtree(_backup_master_publish_dir)
                backup_master_publish_dir = _backup_master_publish_dir
                break
            except Exception:
                self.log.info((
                    "Could not remove previous backup folder."
                    " Trying to add index to folder name"
                ))

            _backup_master_publish_dir = (
                backup_master_publish_dir + str(idx)
            )
            if not os.path.exists(_backup_master_publish_dir):
                backup_master_publish_dir = _backup_master_publish_dir
                break

            if idx > max_idx:
                raise AssertionError((
                    "Backup folders are fully occupied to max index \"{}\""
                ).format(max_idx))

            idx += 1

        self.log.debug("Backup folder path is \"{}\"".format(
            backup_master_publish_dir
        ))
        try:
            os.rename(master_publish_dir, backup_master_publish_dir)
        except PermissionError:
            raise AssertionError((
                "Could not create master version because it is not"
                " possible to replace current master files."
            ))

    try:
        src_to_dst_file_paths = []
        for repre_info in published_repres.values():

            # Skip if new repre does not have published repre files
            published_files = repre_info["published_files"]
            if len(published_files) == 0:
                continue

            # Prepare anatomy data
            anatomy_data = repre_info["anatomy_data"]
            anatomy_data.pop("version", None)

            # Get filled path to repre context
            anatomy_filled = anatomy.format(anatomy_data)
            template_filled = anatomy_filled["master"]["path"]

            repre_data = {
                "path": str(template_filled),
                "template": master_template
            }
            repre_context = template_filled.used_values
            for key in self.db_representation_context_keys:
                if (
                    key in repre_context or
                    key not in anatomy_data
                ):
                    continue

                repre_context[key] = anatomy_data[key]

            # Prepare new repre
            repre = copy.deepcopy(repre_info["representation"])
            repre["parent"] = new_master_version["_id"]
            repre["context"] = repre_context
            repre["data"] = repre_data
            repre.pop("_id", None)

            schema.validate(repre)

            repre_name_low = repre["name"].lower()
            # Replace current representation
            if repre_name_low in old_repres_to_replace:
                old_repre = old_repres_to_replace.pop(repre_name_low)
                repre["_id"] = old_repre["_id"]
                bulk_writes.append(
                    ReplaceOne(
                        {"_id": old_repre["_id"]},
                        repre
                    )
                )

            # Unarchive representation
            elif repre_name_low in archived_repres_by_name:
                archived_repre = archived_repres_by_name.pop(
                    repre_name_low
                )
                old_id = archived_repre["old_id"]
                repre["_id"] = old_id
                bulk_writes.append(
                    ReplaceOne(
                        {"old_id": old_id},
                        repre
                    )
                )

            # Create representation
            else:
                repre["_id"] = io.ObjectId()
                bulk_writes.append(
                    InsertOne(repre)
                )

            # Prepare paths of source and destination files
            if len(published_files) == 1:
                src_to_dst_file_paths.append(
                    (published_files[0], template_filled)
                )
                continue

            collections, remainders = clique.assemble(published_files)
            if remainders or not collections or len(collections) > 1:
                raise Exception((
                    "Integrity error. Files of published representation "
                    "are a combination of frame collections and single files. "
                    "Collections: `{}` Single files: `{}`"
                ).format(str(collections), str(remainders)))

            src_col = collections[0]

            # Get head and tail for collection
            frame_splitter = "_-_FRAME_SPLIT_-_"
            anatomy_data["frame"] = frame_splitter
            _anatomy_filled = anatomy.format(anatomy_data)
            _template_filled = _anatomy_filled["master"]["path"]
            head, tail = _template_filled.split(frame_splitter)
            padding = int(
                anatomy.templates["render"].get(
                    "frame_padding",
                    anatomy.templates["render"].get("padding")
                )
            )

            dst_col = clique.Collection(
                head=head, padding=padding, tail=tail
            )
            dst_col.indexes.clear()
            dst_col.indexes.update(src_col.indexes)
            for src_file, dst_file in zip(src_col, dst_col):
                src_to_dst_file_paths.append(
                    (src_file, dst_file)
                )

        self.path_checks = []

        # Copy (hardlink) paths of source and destination files
        # TODO should we *only* create hardlinks?
        # TODO should we keep files for deletion until this is successful?
        for src_path, dst_path in src_to_dst_file_paths:
            self.copy_file(src_path, dst_path)

        for src_path, dst_path in other_file_paths_mapping:
            self.copy_file(src_path, dst_path)

        # Archive not replaced old representations
        for repre_name_low, repre in old_repres_to_delete.items():
            # Replace archived representation (This is backup)
            # - should not happen to have both repre and archived repre
            if repre_name_low in archived_repres_by_name:
                archived_repre = archived_repres_by_name.pop(
                    repre_name_low
                )
                repre["old_id"] = repre["_id"]
                repre["_id"] = archived_repre["_id"]
                repre["type"] = archived_repre["type"]
                bulk_writes.append(
                    ReplaceOne(
                        {"_id": archived_repre["_id"]},
                        repre
                    )
                )

            else:
                repre["old_id"] = repre["_id"]
                repre["_id"] = io.ObjectId()
                repre["type"] = "archived_representation"
                bulk_writes.append(
                    InsertOne(repre)
                )

        if bulk_writes:
            io._database[io.Session["AVALON_PROJECT"]].bulk_write(
                bulk_writes
            )

        # Remove backed-up previous master
        if (
            backup_master_publish_dir is not None and
            os.path.exists(backup_master_publish_dir)
        ):
            shutil.rmtree(backup_master_publish_dir)

    except Exception:
        if (
            backup_master_publish_dir is not None and
            os.path.exists(backup_master_publish_dir)
        ):
            os.rename(backup_master_publish_dir, master_publish_dir)
        self.log.error((
            "!!! Creating of Master version failed."
            " Previous master version may have lost some data!"
        ))
        raise

    self.log.debug((
        "--- Master version integration for subset `{}`"
        " seems to be successful."
    ).format(
        instance.data.get("subset", str(instance))
    ))