def save_to_mongodb(collection: Collection, items: List[dict]):
    """
    Save a list of items to a MongoDB collection.
    """
    # Rewrite each item before saving so it is easier to work with later.
    for item in items:
        item['_id'] = item['id']  # Use each item's id attribute as MongoDB's _id attribute.

        # Values such as viewCount inside statistics arrive as strings, so convert them to integers.
        for key, value in item['statistics'].items():
            item['statistics'][key] = int(value)

    # Using collection.insert_many() directly raises an error when an _id already exists.
    # Instead, collection.bulk_write() performs multiple upserts (insert or update) in one batch.
    operations = [
        ReplaceOne({'_id': item['_id']}, item, upsert=True) for item in items
    ]
    result = collection.bulk_write(operations)
    logging.info(f'Upserted {result.upserted_count} documents.')
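A minimal usage sketch for the function above; the connection string, database and collection names, and the sample item are hypothetical, and the imports shown are the ones the snippet relies on.

import logging
from typing import List

from pymongo import MongoClient, ReplaceOne
from pymongo.collection import Collection

client = MongoClient('mongodb://localhost:27017')  # hypothetical connection string
videos = client.example_db.videos                  # hypothetical database/collection

items = [{'id': 'abc123', 'statistics': {'viewCount': '42', 'likeCount': '7'}}]
save_to_mongodb(videos, items)  # upserts one document whose _id is 'abc123'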
Example No. 2
def set_mongo_track_data(df_name, info_df):
    ops_list = []
    if df_name == "liked_track_features":
        db_coll = mongo_spotify_tracks
        id_col = "track_spid"
    elif df_name == "liked_track_artist_features":
        db_coll = mongo_spotify_artists
        id_col = "artist_spid"
        info_df["artist_genres"] = info_df["artist_genres"].map(list)
    else:
        return False
    feature_records = info_df.to_dict("records")

    for record in feature_records:
        ops_list.append(ReplaceOne({id_col: record[id_col]}, record, upsert=True))
    chunked_ops = chunk(ops_list, 1000)
    for ops in chunked_ops:
        db_coll.bulk_write(ops, ordered=False)
    return True
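The chunk helper (and the mongo_spotify_tracks / mongo_spotify_artists collections) are not shown in this snippet. A plausible stand-in for the chunking step, assuming ops_list is a plain list, would be:

def chunk(items, size):
    """Yield successive slices of at most `size` elements (a sketch of the assumed helper)."""
    for start in range(0, len(items), size):
        yield items[start:start + size]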
Example No. 3
async def update_players():
    """Updates all players in the database."""
    logger.info("Updating FPL players in database.")
    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        players = await fpl.get_players(include_summary=True, return_json=True)
        for player in players:
            player["team"] = team_converter(player["team"])

    requests = [
        ReplaceOne({"id": player["id"]}, player, upsert=True)
        for player in players
    ]
    database.players.bulk_write(requests)

    logger.info("Adding Understat data to players in database.")
    understat_players = await get_understat_players()

    for player in understat_players:
        # Only update FPL player with desired attributes
        understat_attributes = {
            attribute: value
            for attribute, value in player.items()
            if attribute in desired_attributes
        }

        # Use player's full name and team to try and find the correct player
        search_string = f"{player['player_name']} {player['team_title']}"
        players = database.players.find({
            "$text": {
                "$search": search_string
            }
        }, {
            "score": {
                "$meta": "textScore"
            }
        }).sort([("score", {
            "$meta": "textScore"
        })])
        relevant_player = list(players)[0]

        database.players.update_one({"id": relevant_player["id"]},
                                    {"$set": understat_attributes})
Example No. 4
def save_update_areas(areas):
    saved_areas = list(
        map(lambda d: dict_to_area(d), area_repo.get_all_areas(0)))
    new_or_updated = [
        item for item in areas if item not in saved_areas
    ]  # need to create update objects for both new objects and updated ones

    print("new or updated = " + str(new_or_updated))

    if (len(new_or_updated) == 0):
        return

    for area in new_or_updated:
        area.last_update = datetime.datetime.now()

    db.areas.bulk_write(
        list(
            map(lambda r: ReplaceOne({'_id': r.id}, r.to_dict(), upsert=True),
                new_or_updated)))
Example No. 5
def save_update_regions(regions):
    saved_regions = list(
        map(lambda d: dict_to_region(d), region_repo.get_all_regions(0)))
    new_or_updated = [
        item for item in regions if item not in saved_regions
    ]  # need to create update objects for both new objects and updated ones

    print("new or updated = " + str(new_or_updated))

    if (len(new_or_updated) == 0):
        return

    for region in new_or_updated:
        region.last_update = datetime.datetime.now()

    db.regions.bulk_write(
        list(
            map(lambda r: ReplaceOne({'_id': r.id}, r.to_dict(), upsert=True),
                new_or_updated)))
Example No. 6
    def process_bib_records(self,chunk_no,no_of_chunks,lbibs,itpp_bib_fields):
        chunk_size=len(lbibs)//(no_of_chunks-1)
        if chunk_no==(no_of_chunks-1):
            end_rec=len(lbibs)
        else:
            end_rec=(chunk_no)*chunk_size

        for bib in lbibs[(chunk_no-1)*chunk_size:end_rec]:
            #print(f"bib id is {bib.id}")
            bib_dict={}
            if "ITS" in bib.get_values('930','a'):
                bib_dict["record_type"]="ITS"
            elif "VOT" in bib.get_values('930','a'):
                bib_dict["record_type"]="VOT"
            else:
                bib_dict["record_type"]="BIB"

            bib_dict["record_id"]=bib.id
            bib_dict["bodysession"]=self.body+'/'+self.session
            bib_dict["snapshot_id"]=str(bib.id)+self.body+self.session
            dt = datetime.now(timezone.utc)
            time_string = dt.strftime(self.TIME)
            bib_dict["snapshottime"]=time_string

            for itpp_field_subfields in itpp_bib_fields:
                sbflds=[]
                for elem in itpp_field_subfields:
                    field=elem[0]
                    sbflds.extend(elem[1])
                temp_dict={}
                temp_dict[field]=self.list_of_subfields(bib,field,sbflds)
                if len(temp_dict[field])>1:
                    bib_dict[field]=temp_dict[field]
                elif len(temp_dict[field])==1:
                    bib_dict[field]=temp_dict[field][0]
                else:
                    bib_dict[field]=""
            #snapshot_list_bibs.append(bib_dict)
            #query={"record_id":bib_dict["record_id"]}
            query={"snapshot_id":bib_dict["snapshot_id"]}
            self.replace_list_recs.append(ReplaceOne(query, bib_dict, upsert=True))
        return len(self.replace_list_recs)
Example No. 7
    def _put_batch(self, keys: List[bytes], values: List[bytes],
                   expire_time_mss: List[Optional[int]]):
        """
        Batch insert.

        :param keys: List[bytes].
        :param values: List[bytes].
        :param expire_time_mss: List[Optional[int]]. The expiration time for each entry in ``values``.
        """
        keys = [base64.b16encode(key).decode() for key in keys]
        replaces = []
        for key, value, expire_time_ms in zip(keys, values, expire_time_mss):
            replaces.append(
                ReplaceOne({'key': key}, {
                    'key': key,
                    'value': value,
                    'expire_time_ms': expire_time_ms,
                },
                           upsert=True))
        self.c_collection.bulk_write(replaces, ordered=False)
Example No. 8
 def batch_update(self, data_list):
     """
     Bulk upsert: replace the document if it already exists, otherwise insert it.
     :param data_list: list of documents, each keyed by "ip"
     :return: 1 on success, 0 on failure
     """
     if not data_list:
         return 0
     update_operations = list()
     try:
         for data in data_list:
             op = ReplaceOne({"ip": data["ip"]},
                             replacement=data,
                             upsert=True)
             update_operations.append(op)
         self.get_conn().bulk_write(update_operations, ordered=False)
         return 1
     except Exception:
         logger.error(traceback.format_exc())
         return 0
Example No. 9
 def write(self, documents):
   if self.client is None:
     self.client = MongoClient(host=self.uri, **self.spec)
   requests = []
   for doc in documents:
     # match document based on _id field, if not found in current collection,
     # insert new one, otherwise overwrite it.
     requests.append(
         ReplaceOne(
             filter={'_id': doc.get('_id', None)},
             replacement=doc,
             upsert=True))
   resp = self.client[self.db][self.coll].bulk_write(requests)
   _LOGGER.debug(
       'BulkWrite to MongoDB result in nModified:%d, nUpserted:%d, '
       'nMatched:%d, Errors:%s' % (
           resp.modified_count,
           resp.upserted_count,
           resp.matched_count,
           resp.bulk_api_result.get('writeErrors')))
Example No. 10
    def replace_one(self, instance, **kwargs):
        '''
        Replace a single instance's document entirely::

            person1 = Person.new(name='joe', age=30)
            person2 = Person.new(name='jill', age=40)
            bulk = Person.bulk()
            bulk.replace_one(person1, name='joejoe', age=50)
            bulk.replace_one(person2, name='jilly', age=60)
            bulk.save()
            Person.refresh_all_from_db([person1, person2])
            # Now their documents are overwritten entirely

        :param instance: the instance to add a replace op to
        :return: the queued ``pymongo.ReplaceOne`` operation
        '''
        query = _inst_to_query(instance)
        rep = ReplaceOne(query, kwargs, upsert=False)
        self.ops.append(rep)
        return rep
Example No. 11
    def replace(self, col_name, query_builder, data, count, is_finish=False):
        """
        :param col_name: 表名
        :param data: 目标数据
        :param count: 计数
        """
        col = self.db[col_name]
        cur_time = datetime.datetime.utcnow()
        del data['_id']
        data['updateTime'] = cur_time

        if not is_finish:
            self.bulk.append(ReplaceOne(query_builder, data, upsert=True))

        if len(self.bulk) >= self.MONGOBULK or is_finish:  # self.bulk is a list, not a dict
            s = time.time()
            col.bulk_write(self.bulk)
            e = time.time()
            self.bulk = []
            print("***%s***, 替换%s个, 当前已操作 %s 个" %
                  (e - s, self.MONGOBULK, count))
Example No. 12
async def __import__(chat_id, data):
    if not data:
        return

    new = []
    for note in data:

        # File ver 1 to 2
        if 'name' in note:
            note['names'] = [note['name']]
            del note['name']

        for item in [i for i in note if i not in ALLOWED_COLUMNS_NOTES]:
            del note[item]

        note['chat_id'] = chat_id
        note['created_date'] = datetime.fromisoformat(note['created_date'])
        if 'edited_date' in note:
            note['edited_date'] = datetime.fromisoformat(note['edited_date'])
        new.append(ReplaceOne({'chat_id': note['chat_id'], 'names': {'$in': [note['names'][0]]}}, note, upsert=True))

    await db.notes.bulk_write(new)
Example No. 13
    async def upsert_many(self, events: List[Event]) -> int:
        if len(events) == 0:
            return 0
        requests = []
        for event in events:
            existing_event = await self.event_collection.find_one(
                {"url": event.url})
            if existing_event is None:
                requests.append(InsertOne(event.dict(by_alias=True)))
            else:
                event.event_id = existing_event["_id"]
                event.date_published = existing_event["date_published"]
                if event.description != existing_event[
                        "description"] or event.title != existing_event[
                            "title"]:
                    event.date_published = datetime.now(tz=pytz.utc)
                requests.append(
                    ReplaceOne({"_id": event.event_id},
                               event.dict(by_alias=True), True))

        response = await self.event_collection.bulk_write(requests)
        return response.upserted_count + response.inserted_count
Example No. 14
 def delete_expired_tokens(self):
     """Delete expired tokens. Also, remove docs with no tokens."""
     now = datetime.utcnow()
     bulk_requests = []
     docs = list(
         self.mgdb.tokens.find({
             "$or": [{
                 "link.expires": {
                     "$lte": now
                 }
             }, {
                 "fetch.expires": {
                     "$lte": now
                 }
             }, {
                 "link": [],
                 "fetch": []
             }]
         }))
     for d in docs:
         if not d["link"] and not d["fetch"]:
             bulk_requests.append(DeleteOne(dict(_id=d["_id"])))
             continue
         link = []
         for t in d["link"]:
             if t["expires"] > now:
                 link.append(t)
         fetch = []
         for t in d["fetch"]:
             if t["expires"] > now:
                 fetch.append(t)
         if not link and not fetch:
             bulk_requests.append(DeleteOne(dict(_id=d["_id"])))
         else:
             bulk_requests.append(
                 ReplaceOne(dict(_id=d["_id"]),
                            dict(email=d["email"], link=link, fetch=fetch)))
     if bulk_requests:
         self.mgdb.tokens.bulk_write(bulk_requests)
Example No. 15
def bulk_write_data_to_document_db_internal(cluster_name, namespace,
                                            database_name, collection_name,
                                            data):
    logger.info("About to apply %d bulk operations on the namespace: %s ",
                len(data), namespace)
    connection_string = get_cluster_connection_string(cluster_name)
    bulk_ops = []
    for item in data:
        op = ReplaceOne({"_id": item["_id"]}, item, upsert=True)
        bulk_ops.append(op)
    logger.info(
        "Completed creating the %d replace_one bulk operations for namespace: %s ",
        len(data), namespace)
    try:
        with MongoClient(connection_string) as client:
            collection = client.get_database(database_name).get_collection(
                collection_name)
            result = collection.bulk_write(bulk_ops)
            logger.info(
                "Successfully wrote %d documents to namespace %s on Document DB.",
                len(data), namespace)
        return True
    except BulkWriteError as bwe:
        write_errors = bwe.details.get('writeErrors', [])
        if write_errors:
            error_count = len(write_errors)
            dupe_count = len([
                we for we in write_errors
                if "E11000 duplicate key error" in we["errmsg"]
            ])
            if error_count == dupe_count:
                logger.info(
                    "Ignoring the duplicate key errors while writing on cluster: %s, namespace: %s",
                    cluster_name, namespace)
                return True
        # TODO: have a retry logic BulkWriteError: batch op errors occurred
        logger.exception("Exception while doing bulk operations. %s",
                         bwe.details,
                         exc_info=True)
        raise
Example No. 16
 def getScore(self):
     userConn = self.db.users
     conn = self.db.score
     user, id = self.queryUser()
     if not user:
         return False
     login = urpLogin(user)
     login.login()
     time.sleep(0.1)
     urpScore = getScore(user)
     score = urpScore.getAll()
     if score:
         scoreInsert = {}
         scoreList = []
         scoreInsert['num'] = user[0]
         scoreInsert['score'] = score
         # Filter on the student number so repeated runs replace the existing document.
         scoreList.append(ReplaceOne({'num': user[0]}, scoreInsert, upsert=True))
         result = conn.bulk_write(scoreList)
         userConn.find_one_and_update({'_id': id}, {'$set': {'status': 1}})
         return user[0] + ' -- success'
     else:
         userConn.find_one_and_update({'_id': id}, {'$set': {'status': 0}})
         return user[0] + ' -- fail'
Example No. 17
    def write_data(self, doc: dict, doc_key: str = None, force_timestamp=True):
        """write document with _ts (timestamp) included

        :Parameters:
         - `doc`: A document to be written
         - `doc_key` (optional): Document key (_id) to be used for 
         document replacement/upsert
        """
        if force_timestamp:
            doc['_ts'] = datetime.now()
        else:
            doc['_ts'] = doc.get('_ts', datetime.now())
        if doc_key is not None:
            doc['_id'] = '%s' % doc_key
            self._statements.append(
                ReplaceOne(filter={'_id': doc['_id']},
                           replacement=doc,
                           upsert=True))
        else:
            self._statements.append(InsertOne(document=doc))
        if len(self._statements) > self._threshold:
            self._write_to_server()
        return self._write_counter
Example No. 18
    def update(self, docs, update_lu=True, key=None, ordered=True, **kwargs):
        """
        Function to update associated MongoStore collection.

        Args:
            docs: list of documents
        """

        requests = []

        for d in docs:

            d = jsanitize(d, allow_bson=True)

            # document-level validation is optional
            validates = True
            if self.validator:
                validates = self.validator.is_valid(d)
                if not validates:
                    if self.validator.strict:
                        raise ValueError('Document failed to validate: {}'.format(d))
                    else:
                        self.logger.error('Document failed to validate: {}'.format(d))

            if validates:
                if isinstance(key, list):
                    search_doc = {k: d[k] for k in key}
                elif key:
                    search_doc = {key: d[key]}
                else:
                    search_doc = {self.key: d[self.key]}
                if update_lu:
                    d[self.lu_field] = datetime.utcnow()

                requests.append(ReplaceOne(search_doc, d, upsert=True))

        self.collection.bulk_write(requests, ordered=ordered)
Example No. 19
    def replace_records(self, mongo, docs):
        operations = []
        for doc in docs:
            mongo_replacement_filter = dict()
            if isinstance(self.mongo_replacement_filter, str):
                mongo_replacement_filter = {
                    self.mongo_replacement_filter:
                    doc.get(self.mongo_replacement_filter, False)
                }
            elif isinstance(self.mongo_replacement_filter, dict):
                for k, v in self.mongo_replacement_filter.items():
                    if k == v:
                        mongo_replacement_filter[k] = doc.get(k, False)
                    else:
                        mongo_replacement_filter[
                            k] = self.mongo_replacement_filter.get(k, False)

            operations.append(
                ReplaceOne(mongo_replacement_filter, doc, upsert=True))

            # Send once every 1000 in batch
            if (len(operations) == 1000):
                logging.info('Making Request....')
                mongo.bulk_write(self.mongo_collection,
                                 operations,
                                 mongo_db=self.mongo_db,
                                 ordered=False)
                operations = []
                logging.info('Request successfully finished....')

        if (len(operations) > 0):
            logging.info('Making Final Request....')
            mongo.bulk_write(self.mongo_collection,
                             operations,
                             mongo_db=self.mongo_db,
                             ordered=False)
            logging.info('Final Request Finished.')
Example No. 20
 def queue_replace(self, filter_document, update_document, *args, **kwargs):
     self.batch.append(
         ReplaceOne(filter_document, update_document, *args, **kwargs))
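Only the queuing half is shown above; a possible flush counterpart, assuming the class also holds a `collection` attribute (an assumption, not part of the original snippet), might look like this:

 def flush_batch(self, ordered=False):
     """Send any queued operations in one bulk_write and reset the batch (sketch)."""
     if not self.batch:
         return None
     result = self.collection.bulk_write(self.batch, ordered=ordered)
     self.batch = []
     return result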
Example No. 21
# print(post1)
# post1 = posts.find_one({"_id": post_id})
# print(post1)
# post1 = posts.find_one({'_id': ObjectId("5a07a674dcfba13028c7022b")})
# print(post1)

# posts.remove({"author": "Mike"})
# bulk insert
# ids = posts.insert_many([{'i': i} for i in range(10000)]).inserted_ids
# print("count of posts:", posts.count())

posts.delete_many({})  # Collection.remove() is deprecated; delete_many() clears the collection.
result = posts.bulk_write([
    DeleteMany({}),  # Remove all documents from the previous example.
    InsertOne({'_id': 1}),
    InsertOne({'_id': 2}),
    InsertOne({'_id': 3}),
    UpdateOne({'_id': 1}, {'$set': {
        'foo': 'bar'
    }}),
    UpdateOne({'_id': 4}, {'$inc': {
        'j': 1
    }}, upsert=True),
    ReplaceOne({'j': 1}, {'j': 2})
])

pprint(result.bulk_api_result)

for post in posts.find():
    pprint(post)
Example No. 22
def _serialize_graph(session_id, req_id, sequence_graph):
    db = get_db()

    if db.sessions.find_one({'_id': session_id}) is None:
        raise SessionNotInitializedException()

    if db.requests.find_one({'_id': req_id}) is not None:
        return

    updates = []

    functions = {}

    for k, func in sequence_graph.graph['functions'].items():
        id, update = _serialize_func(
            db, session_id, k, func
        )
        functions[k] = id
        if update is not None:
            updates.append(update)

    if len(updates) > 0:
        db.functions.bulk_write(updates)
        logger.info("Registered %i functions." % len(updates))
    updates = []

    nodes = {}

    for n in sequence_graph:
        node = sequence_graph.nodes[n]
        nodes[n] = _serialize_node(
            db, session_id, n, node, functions
        )

    saves = _merge_graph(sequence_graph, nodes)

    for n in saves:
        entry = nodes[n]
        if entry is None:
            continue
        updates.append(
            ReplaceOne(
                {'_id': entry['_id']},
                entry,
                upsert=True
            )
        )

    if len(updates) > 0:
        db.function_graph.bulk_write(updates)
        logger.info("Updated %i nodes in session graph." % len(updates))

    start = sequence_graph.graph['start']
    stop = sequence_graph.graph['stop']

    start_ids = []
    stop_ids = []

    for _, v in sequence_graph.out_edges(start):
        start_ids.append(
            nodes[v]['_id']
        )

    for u, _ in sequence_graph.in_edges(stop):
        stop_ids.append(
            nodes[u]['_id']
        )

    entry = {
        '_id': req_id,
        'session_id': session_id,
        'start_time': str(datetime.datetime.now()),
        'initials': start_ids,
        'endpoints': stop_ids
    }
    db.requests.insert_one(entry)

    session = db.sessions.find_one({'_id': session_id})
    start_points = set(session['start_points'])
    start_points = start_points.union(set(start_ids))
    session['start_points'] = list(start_points)
    db.sessions.replace_one({'_id': session_id}, session)

    logger.info("Created execution request [session: %s, request_id: %s]" % (session_id, req_id))
Example No. 23
def copy(source_params, dest_params, start_date, end_date):
    source_db = connect_mongodb(source_params)
    source_tables = get_table_names(source_params)

    dest_db = connect_mongodb(dest_params)
    dest_tables = get_table_names(dest_params)

    # records = []
    # for record in source_db[source_tables['topics_table']].find():
    #     records.append(record)
    # print("total records {}".format(len(records)))
    # dest_db[dest_tables['topics_table']].insert_many(
    #     records)
    #
    # records = []
    # for record in source_db[source_tables['meta_table']].find():
    #     records.append(record)
    # print("total records {}".format(len(records)))
    # dest_db[dest_tables['meta_table']].insert_many(
    #     records)

    # This is probably the most inefficient way of doing a copying a subset
    # of a
    # collection to another database but this is the one that requires the
    # minimum access
    # Wish this feature request is closed soon
    # https://jira.mongodb.org/browse/SERVER-13201
    # Aggregation is the fastest way to get a subset of data from a collection,
    # next would be map reduce. map reduce can write output to another db
    # but it would only generate doc of schema
    # id:<object id>, value:<search/mapreduce result>

    dest_db[dest_tables['data_table']].create_index(
        [('topic_id', pymongo.DESCENDING), ('ts', pymongo.DESCENDING)],
        unique=True,
        background=False)
    records = []
    i = 0
    print("start obj:{}".format(ObjectId.from_datetime(start_date)))
    print("end obj:{}".format(ObjectId.from_datetime(end_date)))
    cursor = source_db[source_tables['data_table']].find({
        '$and': [{
            '_id': {
                '$gte': ObjectId.from_datetime(start_date)
            }
        }, {
            '_id': {
                '$lte': ObjectId.from_datetime(end_date)
            }
        }]
    })
    print("Record count from cursor {}".format(cursor.count()))
    for record in cursor:
        i += 1
        records.append(
            ReplaceOne({
                'ts': record['ts'],
                'topic_id': record['topic_id']
            }, {
                'ts': record['ts'],
                'topic_id': record['topic_id'],
                'value': record['value']
            },
                       upsert=True))
        if i == 2000:
            print("total records {}".format(len(records)))
            dest_db[dest_tables['data_table']].bulk_write(records)
            i = 0
            records = []
    # Flush any remaining records that did not fill a complete batch of 2000.
    if records:
        print("total records {}".format(len(records)))
        dest_db[dest_tables['data_table']].bulk_write(records)
Example No. 24
 def replace_one(self, cliteria, document, upsert=False):
     self._batch.append(
         ReplaceOne(cliteria, to_mongo(document), upsert=upsert))
Example No. 25
        def publish_to_historian(self, to_publish_list):
            _log.debug("publish_to_historian number of items: {}".format(
                len(to_publish_list)))

            # Use the db instance to insert/update the topics
            # and data collections
            db = self._client.get_default_database()

            bulk_publish = []
            for x in to_publish_list:
                ts = x['timestamp']
                topic = x['topic']
                value = x['value']
                meta = x['meta']

                # look at the topics that are stored in the database already
                # to see if this topic has a value
                topic_lower = topic.lower()
                topic_id = self._topic_id_map.get(topic_lower, None)
                db_topic_name = self._topic_name_map.get(topic_lower, None)
                if topic_id is None:
                    row = db[self._topic_collection].insert_one(
                        {'topic_name': topic})
                    topic_id = row.inserted_id
                    self._topic_id_map[topic_lower] = topic_id
                    self._topic_name_map[topic_lower] = topic
                elif db_topic_name != topic:
                    _log.debug('Updating topic: {}'.format(topic))

                    result = db[self._topic_collection].update_one(
                        {'_id': ObjectId(topic_id)},
                        {'$set': {
                            'topic_name': topic
                        }})
                    assert result.matched_count
                    self._topic_name_map[topic_lower] = topic

                old_meta = self._topic_meta.get(topic_id, {})
                if set(old_meta.items()) != set(meta.items()):
                    _log.debug('Updating meta for topic: {} {}'.format(
                        topic, meta))
                    db[self._meta_collection].insert_one({
                        'topic_id': topic_id,
                        'meta': meta
                    })
                    self._topic_meta[topic_id] = meta

                # Reformat into the filter/replacement pair used by the bulk upsert below.
                bulk_publish.append(
                    ReplaceOne({
                        'ts': ts,
                        'topic_id': topic_id
                    }, {
                        'ts': ts,
                        'topic_id': topic_id,
                        'value': value
                    },
                               upsert=True))

            #                bulk_publish.append(InsertOne(
            #                    {'ts': ts, 'topic_id': topic_id, 'value': value}))

            try:
                # http://api.mongodb.org/python/current/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
                result = db[self._data_collection].bulk_write(bulk_publish)
            except BulkWriteError as bwe:
                _log.error("{}".format(bwe.details))

            else:  # No write errors occurred
                if not result.bulk_api_result['writeErrors']:
                    self.report_all_handled()
                else:
                    # TODO handle when something happens during writing of data.
                    _log.error('SOME THINGS DID NOT WORK')
Example No. 26
    "dims": {
        "width": 80,
        "length": 30,
        "height": 30
    },
    "age": 40
}), ReplaceOne ({
    "id": 2,
    "name": "Ann",
    "dims": {
        "width": 60,
        "length": 80,
        "height": 25
    },
    "age": 33
}, {
    "id": 2,
    "name": "Annie",
    "dims": {
        "width": 70,
        "length": 50,
        "height": 25
    },
    "age": 41
}, upsert=True)]
result = qtest.bulk_write (requests)
result.inserted_count
result.deleted_count
result.modified_count
result.upserted_ids
Example No. 27
 def put(self):
     """
     Replace existing Product objects
     ---
     parameters:
         -   in: body
             description: List of Product objects to be inserted to database
             required: true
             schema:
                 type: array
                 items:
                     properties:
                         _id:
                             type: string
                         data:
                             type: object
                             properties:
                                 name:
                                     type: string
                                 brand_name:
                                     type: string
                                 regular_price_value:
                                     type: number
                                     format: float
                                 offer_price_value:
                                     type: number
                                     format: float
                                 currency:
                                     type: string
                                 classification_l1:
                                     type: string
                                 classification_l2:
                                     type: string
                                 classification_l3:
                                     type: string
                                 classification_l4:
                                     type: string
                                 image_url:
                                     type: string
     responses:
         '200':
             description: Bulk Write result object from MongoDB
             content:
                 application/json:
                     schema:
                         type: object
                         properties:
                             acknowledged:
                                 type: boolean
                             matched_count:
                                 type: string
                             modified_count:
                                 type: integer
                             deleted_count:
                                 type: integer
                             upserted_ids:
                                 type: array
                                 items:
                                     type: object
                                     properties:
                                         _id:
                                             type: string
                             inserted_count:
                                 type: integer
         '500':
             description: Server encountered an error while performing bulk operation
             content:
                 application/json:
                     schema:
                         type: object
                         properties:
                             message:
                                 type: string
     """
     requests = []
     for q in request.json:
         requests.append(ReplaceOne({"_id": ObjectId(q["_id"])}, q["data"]))
     return perform_bulk(collection, requests), 201
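The perform_bulk helper referenced above is not included in this snippet; a minimal sketch of what it might return, based on the response schema documented in the docstring and pymongo's BulkWriteResult, is:

def perform_bulk(collection, requests):
    """Run the bulk operations and return a JSON-friendly summary (a sketch; errors such
    as BulkWriteError are left to propagate so the framework can answer with the
    documented 500 response)."""
    result = collection.bulk_write(requests, ordered=False)
    return {
        "acknowledged": result.acknowledged,
        "matched_count": result.matched_count,
        "modified_count": result.modified_count,
        "deleted_count": result.deleted_count,
        "upserted_ids": {str(k): str(v) for k, v in result.upserted_ids.items()},
        "inserted_count": result.inserted_count,
    }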
Example No. 28
def insert_new_records(input_collection, output_collection, collection_str):
    try:
        #get the last insertion time in the input collection
        last_inserted_doc_input = input_collection.find({}).sort(
            'insertion_datetime', pymongo.DESCENDING).limit(1)
        last_insertion_time_input = last_inserted_doc_input[0][
            'insertion_datetime']
        #log last insertion time
        log_message = "Last insertion datetime in " + collection_str + ": " + str(
            last_insertion_time_input)
        logging.info(log_message)
        #print for testing
        #        print dumps(last_inserted_doc[0], indent=2, default=json_util.default)

        #get the last insertion time in the output collection
        if "ies" not in collection_str:
            m_type = str(collection_str[:-1])
        else:
            m_type = str(collection_str[:-3] + "y")

        last_inserted_doc_output = output_collection.find({
            "type": m_type
        }).sort('insertion_datetime', pymongo.DESCENDING).limit(1)
        last_insertion_time_output = last_inserted_doc_output[0][
            'insertion_datetime']
        #log last insertion time
        log_message = "Last insertion datetime in roadSensorValue for type " + m_type + ": " + str(
            last_insertion_time_output)
        logging.info(log_message)

        #get the datetime window
        end_date_time = last_insertion_time_input - datetime.timedelta(
            minutes=5)
        start_date_time = last_insertion_time_output - datetime.timedelta(
            minutes=5)
        #        print for testing
        #        print collection_str
        #        print str(last_insertion_time_input)
        #        print str(last_insertion_time_output)
        #        print str(start_date_time)
        #        print str(end_date_time)
        #        print '***************************'

        #retrieve the records within the time window from the input collection
        query = {
            'insertion_datetime': {
                '$gte': start_date_time,
                '$lte': end_date_time
            }
        }
        cursor_count = int(input_collection.find(query).count())
        log_message = str(
            cursor_count) + " records to be inserted from " + collection_str
        logging.info(log_message)
        cursor = input_collection.find(query)

        #bulk write to the output collection
        requests = []
        for doc in cursor:
            requests.append(ReplaceOne({"_id": doc["_id"]}, doc, upsert=True))

        #write the new records to the output collection
        result = output_collection.bulk_write(requests)

        #log counts
        log_message = str(
            result.upserted_count
        ) + " records were upserted from " + collection_str + " to roadSensorValue"
        logging.info(log_message)
        log_message = str(
            result.matched_count) + " records were matched in roadSensorValue"
        logging.info(log_message)

    except Exception as e:
        print(e)
        sys.exit()
Example No. 29
 def _sync_collection(self, src_dbname, src_collname, dst_dbname,
                      dst_collname):
     """ Sync a collection through batch write.
     """
     self._logger.info(
         "[%s] sync collection '%s.%s'" %
         (self._current_process_name, src_dbname, src_collname))
     while True:
         try:
             n = 0
             #docs = []
             reqs = []
             batchsize = 1000
             cursor = self._src_mc[src_dbname][src_collname].find(
                 filter=None,
                 cursor_type=pymongo.cursor.CursorType.EXHAUST,
                 no_cursor_timeout=True,
                 modifiers={'$snapshot': True})
             count = cursor.count()
             if count == 0:
                 self._logger.info('[%s] \t skip empty collection' %
                                   (self._current_process_name))
                 return
             for doc in cursor:
                 #docs.append(doc)
                 #if len(docs) == batchsize:
                 #    self._dst_mc[dst_dbname][dst_collname].insert_many(docs)
                 #    docs = []
                 reqs.append(
                     ReplaceOne({'_id': doc['_id']}, doc, upsert=True))
                 if len(reqs) == batchsize:
                     self._bulk_write(dst_dbname,
                                      dst_collname,
                                      reqs,
                                      ordered=False)
                     reqs = []
                 n += 1
                 if n % 10000 == 0:
                     self._logger.info(
                         '[%s] \t %s.%s %d/%d (%.2f%%)' %
                         (self._current_process_name, src_dbname,
                          src_collname, n, count, float(n) / count * 100))
             #if len(docs) > 0:
             #    self._dst_mc[dst_dbname][dst_collname].insert_many(docs)
             if len(reqs) > 0:
                 self._bulk_write(dst_dbname,
                                  dst_collname,
                                  reqs,
                                  ordered=False)
                 self._logger.info(
                     '[%s] \t %s.%s %d/%d (%.2f%%)' %
                     (self._current_process_name, src_dbname, src_collname,
                      n, count, float(n) / count * 100))
             return
         except pymongo.errors.AutoReconnect:
             self._src_mc.close()
             self._src_mc = self.reconnect(self._src_host,
                                           self._src_port,
                                           username=self._src_username,
                                           password=self._src_password,
                                           w=self._w)
Example No. 30
    def process(self, instance):
        self.log.debug(
            "--- Integration of Master version for subset `{}` begins.".format(
                instance.data.get("subset", str(instance))
            )
        )
        published_repres = instance.data.get("published_representations")
        if not published_repres:
            self.log.debug(
                "*** There are not published representations on the instance."
            )
            return

        project_name = api.Session["AVALON_PROJECT"]

        # TODO raise error if master not set?
        anatomy = instance.context.data["anatomy"]
        if "master" not in anatomy.templates:
            self.log.warning("!!! Anatomy does not have set `master` key!")
            return

        if "path" not in anatomy.templates["master"]:
            self.log.warning((
                "!!! There is not set `path` template in `master` anatomy"
                " for project \"{}\"."
            ).format(project_name))
            return

        master_template = anatomy.templates["master"]["path"]
        self.log.debug("`Master` template check was successful. `{}`".format(
            master_template
        ))

        master_publish_dir = self.get_publish_dir(instance)

        src_version_entity = instance.data.get("versionEntity")
        filtered_repre_ids = []
        for repre_id, repre_info in published_repres.items():
            repre = repre_info["representation"]
            if repre["name"].lower() in self.ignored_representation_names:
                self.log.debug(
                    "Filtering representation with name: `{}`".format(
                        repre["name"].lower()
                    )
                )
                filtered_repre_ids.append(repre_id)

        for repre_id in filtered_repre_ids:
            published_repres.pop(repre_id, None)

        if not published_repres:
            self.log.debug(
                "*** All published representations were filtered by name."
            )
            return

        if src_version_entity is None:
            self.log.debug((
                "Published version entity was not sent in representation data."
                " Querying entity from database."
            ))
            src_version_entity = (
                self.version_from_representations(published_repres)
            )

        if not src_version_entity:
            self.log.warning((
                "!!! Can't find origin version in database."
                " Skipping Master version publish."
            ))
            return

        all_copied_files = []
        transfers = instance.data.get("transfers", list())
        for _src, dst in transfers:
            dst = os.path.normpath(dst)
            if dst not in all_copied_files:
                all_copied_files.append(dst)

        hardlinks = instance.data.get("hardlinks", list())
        for _src, dst in hardlinks:
            dst = os.path.normpath(dst)
            if dst not in all_copied_files:
                all_copied_files.append(dst)

        all_repre_file_paths = []
        for repre_info in published_repres.values():
            published_files = repre_info.get("published_files") or []
            for file_path in published_files:
                file_path = os.path.normpath(file_path)
                if file_path not in all_repre_file_paths:
                    all_repre_file_paths.append(file_path)

        # TODO this is not best practice of getting resources for publish
        # WARNING due to this we must remove all files from master publish dir
        instance_publish_dir = os.path.normpath(
            instance.data["publishDir"]
        )
        other_file_paths_mapping = []
        for file_path in all_copied_files:
            # Check if it is from publishDir
            if not file_path.startswith(instance_publish_dir):
                continue

            if file_path in all_repre_file_paths:
                continue

            dst_filepath = file_path.replace(
                instance_publish_dir, master_publish_dir
            )
            other_file_paths_mapping.append((file_path, dst_filepath))

        # Current version
        old_version, old_repres = (
            self.current_master_ents(src_version_entity)
        )

        old_repres_by_name = {
            repre["name"].lower(): repre for repre in old_repres
        }

        if old_version:
            new_version_id = old_version["_id"]
        else:
            new_version_id = io.ObjectId()

        new_master_version = {
            "_id": new_version_id,
            "version_id": src_version_entity["_id"],
            "parent": src_version_entity["parent"],
            "type": "master_version",
            "schema": "pype:master_version-1.0"
        }
        schema.validate(new_master_version)

        # Don't make changes in database until everything is O.K.
        bulk_writes = []

        if old_version:
            self.log.debug("Replacing old master version.")
            bulk_writes.append(
                ReplaceOne(
                    {"_id": new_master_version["_id"]},
                    new_master_version
                )
            )
        else:
            self.log.debug("Creating first master version.")
            bulk_writes.append(
                InsertOne(new_master_version)
            )

        # Separate old representations into `to replace` and `to delete`
        old_repres_to_replace = {}
        old_repres_to_delete = {}
        for repre_info in published_repres.values():
            repre = repre_info["representation"]
            repre_name_low = repre["name"].lower()
            if repre_name_low in old_repres_by_name:
                old_repres_to_replace[repre_name_low] = (
                    old_repres_by_name.pop(repre_name_low)
                )

        if old_repres_by_name:
            old_repres_to_delete = old_repres_by_name

        archived_repres = list(io.find({
            # Check what is type of archived representation
            "type": "archived_repsentation",
            "parent": new_version_id
        }))
        archived_repres_by_name = {}
        for repre in archived_repres:
            repre_name_low = repre["name"].lower()
            archived_repres_by_name[repre_name_low] = repre

        backup_master_publish_dir = None
        if os.path.exists(master_publish_dir):
            backup_master_publish_dir = master_publish_dir + ".BACKUP"
            max_idx = 10
            idx = 0
            _backup_master_publish_dir = backup_master_publish_dir
            while os.path.exists(_backup_master_publish_dir):
                self.log.debug((
                    "Backup folder already exists."
                    " Trying to remove \"{}\""
                ).format(_backup_master_publish_dir))

                try:
                    shutil.rmtree(_backup_master_publish_dir)
                    backup_master_publish_dir = _backup_master_publish_dir
                    break
                except Exception:
                    self.log.info((
                        "Could not remove previous backup folder."
                        " Trying to add index to folder name"
                    ))

                _backup_master_publish_dir = (
                    backup_master_publish_dir + str(idx)
                )
                if not os.path.exists(_backup_master_publish_dir):
                    backup_master_publish_dir = _backup_master_publish_dir
                    break

                if idx > max_idx:
                    raise AssertionError((
                        "Backup folders are fully occupied to max index \"{}\""
                    ).format(max_idx))

                idx += 1

            self.log.debug("Backup folder path is \"{}\"".format(
                backup_master_publish_dir
            ))
            try:
                os.rename(master_publish_dir, backup_master_publish_dir)
            except PermissionError:
                raise AssertionError((
                    "Could not create master version because it is not"
                    " possible to replace current master files."
                ))
        try:
            src_to_dst_file_paths = []
            for repre_info in published_repres.values():

                # Skip if new repre does not have published repre files
                published_files = repre_info["published_files"]
                if len(published_files) == 0:
                    continue

                # Prepare anatomy data
                anatomy_data = repre_info["anatomy_data"]
                anatomy_data.pop("version", None)

                # Get filled path to repre context
                anatomy_filled = anatomy.format(anatomy_data)
                template_filled = anatomy_filled["master"]["path"]

                repre_data = {
                    "path": str(template_filled),
                    "template": master_template
                }
                repre_context = template_filled.used_values
                for key in self.db_representation_context_keys:
                    if (
                        key in repre_context or
                        key not in anatomy_data
                    ):
                        continue

                    repre_context[key] = anatomy_data[key]

                # Prepare new repre
                repre = copy.deepcopy(repre_info["representation"])
                repre["parent"] = new_master_version["_id"]
                repre["context"] = repre_context
                repre["data"] = repre_data
                repre.pop("_id", None)

                schema.validate(repre)

                repre_name_low = repre["name"].lower()
                # Replace current representation
                if repre_name_low in old_repres_to_replace:
                    old_repre = old_repres_to_replace.pop(repre_name_low)
                    repre["_id"] = old_repre["_id"]
                    bulk_writes.append(
                        ReplaceOne(
                            {"_id": old_repre["_id"]},
                            repre
                        )
                    )

                # Unarchive representation
                elif repre_name_low in archived_repres_by_name:
                    archived_repre = archived_repres_by_name.pop(
                        repre_name_low
                    )
                    old_id = archived_repre["old_id"]
                    repre["_id"] = old_id
                    bulk_writes.append(
                        ReplaceOne(
                            {"old_id": old_id},
                            repre
                        )
                    )

                # Create representation
                else:
                    repre["_id"] = io.ObjectId()
                    bulk_writes.append(
                        InsertOne(repre)
                    )

                # Prepare paths of source and destination files
                if len(published_files) == 1:
                    src_to_dst_file_paths.append(
                        (published_files[0], template_filled)
                    )
                    continue

                collections, remainders = clique.assemble(published_files)
                if remainders or not collections or len(collections) > 1:
                    raise Exception((
                        "Integrity error. Files of published representation "
                        "is combination of frame collections and single files."
                        "Collections: `{}` Single files: `{}`"
                    ).format(str(collections), str(remainders)))

                src_col = collections[0]

                # Get head and tail for collection
                frame_splitter = "_-_FRAME_SPLIT_-_"
                anatomy_data["frame"] = frame_splitter
                _anatomy_filled = anatomy.format(anatomy_data)
                _template_filled = _anatomy_filled["master"]["path"]
                head, tail = _template_filled.split(frame_splitter)
                padding = int(
                    anatomy.templates["render"].get(
                        "frame_padding",
                        anatomy.templates["render"].get("padding")
                    )
                )

                dst_col = clique.Collection(
                    head=head, padding=padding, tail=tail
                )
                dst_col.indexes.clear()
                dst_col.indexes.update(src_col.indexes)
                for src_file, dst_file in zip(src_col, dst_col):
                    src_to_dst_file_paths.append(
                        (src_file, dst_file)
                    )

            self.path_checks = []

            # Copy(hardlink) paths of source and destination files
            # TODO should we *only* create hardlinks?
            # TODO should we keep files for deletion until this is successful?
            for src_path, dst_path in src_to_dst_file_paths:
                self.copy_file(src_path, dst_path)

            for src_path, dst_path in other_file_paths_mapping:
                self.copy_file(src_path, dst_path)

            # Archive not replaced old representations
            for repre_name_low, repre in old_repres_to_delete.items():
                # Replace archived representation (This is backup)
                # - should not happen to have both repre and archived repre
                if repre_name_low in archived_repres_by_name:
                    archived_repre = archived_repres_by_name.pop(
                        repre_name_low
                    )
                    repre["old_id"] = repre["_id"]
                    repre["_id"] = archived_repre["_id"]
                    repre["type"] = archived_repre["type"]
                    bulk_writes.append(
                        ReplaceOne(
                            {"_id": archived_repre["_id"]},
                            repre
                        )
                    )

                else:
                    repre["old_id"] = repre["_id"]
                    repre["_id"] = io.ObjectId()
                    repre["type"] = "archived_representation"
                    bulk_writes.append(
                        InsertOne(repre)
                    )

            if bulk_writes:
                io._database[io.Session["AVALON_PROJECT"]].bulk_write(
                    bulk_writes
                )

            # Remove the backed-up previous master
            if (
                backup_master_publish_dir is not None and
                os.path.exists(backup_master_publish_dir)
            ):
                shutil.rmtree(backup_master_publish_dir)

        except Exception:
            if (
                backup_master_publish_dir is not None and
                os.path.exists(backup_master_publish_dir)
            ):
                os.rename(backup_master_publish_dir, master_publish_dir)
            self.log.error((
                "!!! Creating of Master version failed."
                " Previous master version maybe lost some data!"
            ))
            raise

        self.log.debug((
            "--- Master version integration for subset `{}`"
            " seems to be successful."
        ).format(
            instance.data.get("subset", str(instance))
        ))