Example #1
    def execute(self):
        config = self.config
        coll = config.tests.test_collection
        self.set_source("dirname/test.txt")
        coll.insert_one({"hello": "world"})
        data = []
        for doc in coll.find():
            data.append(doc)
        assert len(data) == 1
        assert data[0]["_job_id"] == self._id
        assert data[0]["_src"] == "test.txt"

        coll.insert_many([
            {"hello": 1},
            {"hello": 2},
        ])
        from pymongo import InsertOne, DeleteOne, ReplaceOne, UpdateOne, \
            UpdateMany, DeleteMany

        requests = [InsertOne({'hello': 3}),
                    DeleteOne({'x': 1}),
                    ReplaceOne({'hello': 1},
                               {'hello': 1.234},
                               upsert=True),
                    UpdateOne({'hello': 2},
                              {'$set': {'hello': 2.234}},
                              upsert=True),
                    UpdateMany({'hello': 3}, {'$set': {'hello': 3.234}},
                               upsert=True),
                    DeleteMany({'x': 1})
                    ]
        coll.bulk_write(requests)

        requests = [InsertOne({'hello': 4}),
                    DeleteOne({'x': 1}),
                    ReplaceOne({'hello': 1.234},
                               {'zz': 1},
                               upsert=True),
                    UpdateOne({'hello': 2.234},
                              {'$set': {'zz': 2}},
                              upsert=True),
                    UpdateMany({'hello': 3.234},
                               {'$set': {'zz': 3}},
                               upsert=True),
                    DeleteMany({'x': 1})]

        coll.bulk_write(requests)
        coll.update_one({"hello": 5},
                        {"$set": {"ua": 1}},
                        upsert=True)

        coll.update_many({"hello": "5"},
                         {"$set": {"ua": 2}},
                         upsert=True)
        data = []
        for doc in coll.find():
            data.append(doc)
        assert len(data) == 7
        assert set([d["_job_id"] for d in data]) == {self._id}
        assert set([d["_src"] for d in data]) == {"test.txt"}
Example #2
def cmd_purge(fetcher=None, dataset=None, purge_all=False, **kwargs):
    """Purge one or more dataset"""
    """
    dlstats fetchers purge -f INSEE --purge-all
    dlstats fetchers purge -f INSEE -d IPCH-2015-FR-COICOP
    dlstats fetchers purge -f INSEE -d IPCH-2015-FR-COICOP -d IPC-2015-COICOP
    """

    ctx = client.Context(**kwargs)

    ctx.log("START purge for [%s]" % fetcher)

    if ctx.silent or click.confirm('Do you want to continue?', abort=True):

        start = time.time()

        db = ctx.mongo_database()

        from pymongo import DeleteMany

        if purge_all:
            query = {"name": fetcher}
            result = db[constants.COL_PROVIDERS].bulk_write(
                [DeleteMany(query)], ordered=False)
            ctx.log("Provider [%s] deleted" % fetcher)

            query = {"provider_name": fetcher}
            result = db[constants.COL_CATEGORIES].bulk_write(
                [DeleteMany(query)], ordered=False)

            ctx.log("Categories deleted: %s" % result.deleted_count)

        query = {"provider_name": fetcher}
        if not purge_all and dataset:
            query["dataset_code"] = {"$in": dataset}

        bulk_requests = [DeleteMany(query)]

        result = db[constants.COL_DATASETS].bulk_write(bulk_requests,
                                                       ordered=False)
        ctx.log("Datasets deleted: %s" % result.deleted_count)

        result = db[constants.COL_SERIES].bulk_write(bulk_requests,
                                                     ordered=False)
        ctx.log("Series deleted: %s" % result.deleted_count)

        result = db[constants.COL_SERIES_ARCHIVES].bulk_write(bulk_requests,
                                                              ordered=False)
        ctx.log("Series archives deleted: %s" % result.deleted_count)

        end = time.time() - start

        ctx.log("END purge for [%s] - time[%.3f]" % (fetcher, end))
Example #3
def func2():
    """
    批量插入文档函数
    """
    client = MongoClient('mongodb://127.0.0.1:27017/')  # 建立连接
    collection = client['blogdb'].get_collection(
        'posts', write_concern=WriteConcern(w=1, j=True, wtimeout=1))  # 选择集合
    # write_concern控制何时调用getLastError()
    # write_concern=1:mongod在写入内存之后,返回响应
    # write_concern=1 & journal:true:mongod在写入内存、journal日志之后,返回响应
    # write_concern=2:在集群模式生效,2时表示只有secondary从primary完成复制之后,返回响应
    try:
        insertData = [InsertOne({'title': i}) for i in range(4)]  # 插入文档
        otherData = [
            DeleteMany({}),  # Remove all documents.
            InsertOne({'_id': 1}),
            InsertOne({'_id': 2}),
            InsertOne({'_id': 3}),
            UpdateOne({'_id': 1}, {'$set': {
                'foo': 'bar'
            }}),
            UpdateOne({'_id': 4}, {'$inc': {
                'j': 1
            }}, upsert=True),
            ReplaceOne({'j': 1}, {'j': 2}),
            DeleteOne({'_id': 2})
        ]
        collection.bulk_write(otherData + insertData, ordered=True)
    except BulkWriteError as bwe:
        print(bwe.details)
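
For reference, a minimal sketch of the three write-concern levels described in the comments above, using pymongo's WriteConcern directly (this assumes a local mongod and reuses the 'blogdb'/'posts' names from the example; it is illustrative only, not part of the original source):

from pymongo import MongoClient
from pymongo.write_concern import WriteConcern

client = MongoClient('mongodb://127.0.0.1:27017/')
db = client['blogdb']

# w=1: acknowledged once the primary has applied the write in memory.
posts_w1 = db.get_collection('posts', write_concern=WriteConcern(w=1))
# w=1, j=True: additionally wait until the write has reached the on-disk journal.
posts_journaled = db.get_collection('posts', write_concern=WriteConcern(w=1, j=True))
# w=2: in a replica set, wait until one secondary has also replicated the write.
posts_w2 = db.get_collection('posts', write_concern=WriteConcern(w=2, wtimeout=5000))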
Example #4
 def delete(self, date):
     if isinstance(date, string_types) or isinstance(date, datetime):
         return self.collection.delete_many(date_range(date)).deleted_count
     else:
         if len(date):
             return self.collection.bulk_write(
                 [DeleteMany(date_range(d)) for d in date]).deleted_count
Example #5
    def start_requests(self):

        password = input("GameLog Password: "******"mongodb+srv://kevin:" + password + "@cluster0-focx3.mongodb.net/test?retryWrites=true&w=majority"
        client = pymongo.MongoClient(uri)
        db = client.NBA_Match_Ups
        dbTest = db.test

        dbTest.bulk_write([
            DeleteMany({}),  # Remove all documents
        ])

        players = db.Players.find({}, {
            'basketballreference_page': 1,
            "_id": 0
        })  #Project only the url of basketball reference page
        listOfUrls = []  #Empty List
        for document in players:  #For loop to go through the entire Cursor
            listOfUrls.append(
                document['basketballreference_page'])  #Add the url to the list

        # List of Urls to go through
        urls = listOfUrls
        #urls = [#"https://www.basketball-reference.com/players/l/lillada01/gamelog/2013"]
        #   "https://www.basketball-reference.com/players/l/lillada01.html"]
        for url in urls:  # Run parse for each url in urls
            yield scrapy.Request(url=url, callback=self.parsePlayers)
Example #6
    async def delete(self, models: ModuleType, indexer: str = "slug"):
        children = {}
        parent = await self.ancestors(models, True)
        if parent:
            self_class = self.__class__.__name__
            for field in fields(parent):
                if "model" in field.metadata and field.metadata[
                        "model"] == self_class:
                    childs = getattr(parent, field.name)
                    childs.remove(
                        getattr(
                            self, "_id"
                            if field.type == List[ObjectId] else indexer))
                    children[field.name] = childs

        actions = [
            DeleteOne({"_id": self._id}),
            DeleteMany({"path": {
                "$regex": f"^{self.get_url()}"
            }})
        ]
        if children:
            actions.append(UpdateOne({"_id": parent._id}, {"$set": children}))
        async with await self._table.database.client.start_session() as s:
            async with s.start_transaction():
                await self._table.bulk_write(actions)

        self.id_ = None
Example #7
 def update_orbits(self, username: str, fleets: List[Fleet]):
     requests = [
         DeleteMany({'username': username, 'sourceType': SourceType.planet.name}),
     ]
     for fleet in fleets:
         requests.append(InsertOne(serialize(fleet)))
     self.collection.bulk_write(requests)
Example #8
def gen_bulk_operations(iter, turncate=False):
    '''
    @param iter: The iterable of documents that will be inserted into the DB.
    @param turncate: If True, a DeleteMany operation is prepended before the other operations.
    @return: A list of bulk write operation objects.
    '''

    return ([DeleteMany({})] if turncate else []) + \
        [InsertOne(r) for r in iter]
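
A hypothetical usage sketch for gen_bulk_operations (the client, database and collection names below are illustrative assumptions, not from the original source):

from pymongo import MongoClient

coll = MongoClient('mongodb://127.0.0.1:27017/')['mydb']['snapshots']
docs = [{'seq': i, 'value': i * i} for i in range(3)]
# turncate=True prepends DeleteMany({}), so the collection is emptied first.
ops = gen_bulk_operations(docs, turncate=True)
result = coll.bulk_write(ops, ordered=True)  # ordered, so the delete runs before the inserts
print(result.deleted_count, result.inserted_count)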
Example #9
 def callback(client):
     client.db.collection.bulk_write([
         InsertOne({}),
         InsertOne({}),
         UpdateOne({}, {'$inc': {
             'x': 1
         }}),
         DeleteMany({})
     ])
Example #10
def save_schedule(specialty, schedules):
    requests = []
    specialty = specialties_coll.find_one_and_update(
        specialty, {'$set': specialty},
        upsert=True,
        return_document=ReturnDocument.AFTER)
    requests.append(DeleteMany({'specialty': specialty['_id']}))
    for s in schedules:
        doc = s.copy()
        doc['specialty'] = specialty['_id']
        requests.append(InsertOne(doc))
    return schedule_coll.bulk_write(requests)
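
Since bulk_write defaults to ordered=True, the DeleteMany above always runs before the InsertOne operations, so each call replaces a specialty's full schedule. A hypothetical invocation (the specialty and schedule documents are made up for illustration):

specialty = {'name': 'Cardiology'}
schedules = [
    {'doctor': 'Dr. A', 'weekday': 1, 'start': '09:00'},
    {'doctor': 'Dr. B', 'weekday': 3, 'start': '14:00'},
]
result = save_schedule(specialty, schedules)
print(result.deleted_count, 'old entries removed,', result.inserted_count, 'inserted')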
Example #11
def admin_create_new_api_key(email):
    api_key = tokening.generate_random_key(length=64)

    operations = [
        DeleteMany({'email': email}),
        InsertOne({
            'email': email,
            'api_key': api_key
        })
    ]
    admin_api_keys_collection.bulk_write(operations, ordered=True)

    return api_key
Example #12
File: events.py Project: 0pt1on/noc
 def handle_clean(self, options, events):
     before = options.get("before")
     if before:
          before = datetime.datetime.strptime(before, "%Y-%m-%d")
      else:
          self.print("'before' is not set, using the default")
         before = datetime.datetime.now() - DEFAULT_CLEAN
     force = options.get("force")
     aa = ActiveAlarm._get_collection()
     ah = ArchivedAlarm._get_collection()
     ae = ActiveEvent._get_collection()
     event_ts = ae.find_one({"timestamp": {"$lte": before}}, limit=1, sort=[("timestamp", 1)])
     event_ts = event_ts["timestamp"]
     print("[%s] Cleaned before %s ... \n" % ("events", before), end="")
     bulk = []
     window = CLEAN_WINDOW
     while event_ts < before:
         refer_event_ids = []
         for e in [aa, ah]:
             for ee in e.find(
                 {"timestamp": {"$gte": event_ts, "$lte": event_ts + CLEAN_WINDOW}},
                 {"opening_event": 1, "closing_event": 1},
             ):
                 if "opening_event" in ee:
                     refer_event_ids += [ee["opening_event"]]
                 if "closing_event" in ee:
                     refer_event_ids += [ee["closing_event"]]
         try:
             clear_qs = {
                 "timestamp": {"$gte": event_ts, "$lte": event_ts + CLEAN_WINDOW},
                 "_id": {"$nin": refer_event_ids},
             }
             self.print(
                 "Interval: %s, %s; Count: %d"
                 % (event_ts, event_ts + CLEAN_WINDOW, ae.count(clear_qs))
             )
             bulk += [DeleteMany(clear_qs)]
             event_ts += window
             if window != CLEAN_WINDOW:
                 window = CLEAN_WINDOW
         except DocumentTooLarge:
             window = window // 2
             if window < datetime.timedelta(hours=1):
                 self.die("Too many events for delete in interval %s" % window)
             event_ts -= window
     if force:
         self.print("All data before %s from active events will be Remove..\n" % before)
         for i in reversed(range(1, 10)):
             self.print("%d\n" % i)
             time.sleep(1)
         ae.bulk_write(bulk)
Example #13
    def start_requests(self):
        password = input("Password: "******"mongodb+srv://kevin:"+password+"@cluster0-focx3.mongodb.net/test?retryWrites=true&w=majority"
        client = pymongo.MongoClient(uri)
        db = client.NBA_Match_Ups.Teams

        db.bulk_write([
            DeleteMany({}),  # Remove all documents
        ])

        urls = [  # List of Urls to go through
            "https://www.basketball-reference.com/teams/"
        ]
        for url in urls:  # Run parse for each url in urls
            yield scrapy.Request(url=url, callback=self.parseTeamIndex)
Example #14
def clean_toc(db):
    logger.info('cleaning table of contents')
    out_of_sync_tocs = db.toc.find({'synced': False}, {
        '_id': False,
        'collection': True,
        'path': True
    })

    for doc in out_of_sync_tocs:
        db[doc['collection']].delete_many({'path': doc['path']})

    bulk = [DeleteMany({'synced': False})]
    try:
        db.toc.bulk_write(bulk)
    except BulkWriteError as e:
        logger.error(e)
Example #15
    def start_requests(self):

        uri = "mongodb+srv://kevin:" + password + "@cluster0-focx3.mongodb.net/test?retryWrites=true&w=majority"
        client = pymongo.MongoClient(uri)
        db = client.NBA_Match_Ups.Players

        db.bulk_write([
            DeleteMany({}),  # Remove all documents
        ])

        urls = [  # List of Urls to go through
            "https://www.basketball-reference.com/leagues/NBA_2019.html"
            #"https://www.basketball-reference.com/players/l/lillada01.html"
        ]
        for url in urls:  # Run parse for each url in urls
            yield scrapy.Request(url=url, callback=self.parseLeague)
Example #16
def trigger(email):
    # this function can be used for the initial send as well as resending

    helper_account = helper_accounts_collection.find_one({'email': email})

    if helper_account['account']['email_verified']:
        return formatting.status('email already verified')

    # Generate new token
    verification_token = tokening.generate_random_key(length=64)
    helper_id = ObjectId(helper_account["_id"])

    # Create new token record
    record = {'helper_id': helper_id, 'token': verification_token}
    operations = [DeleteMany({'helper_id': helper_id}), InsertOne(record)]
    email_tokens_collection.bulk_write(operations, ordered=True)

    # Trigger token-email
    return send(email, verification_token)
Example #17
    def delete_many(self, **kwargs):
        '''
        Delete many documents.

            person1 = Person.new(name='joe', age=30)
            person2 = Person.new(name='jill', age=31)
            person3 = Person.new(name='bob', age=50)
            bulk = Person.bulk()
            bulk.delete_many(age__gt=30)
            bulk.save()
            # Now jill and bob are gone, having age > 30

        :param **kwargs: the query to run the delete_many with
        :return: the queued ``pymongo.DeleteMany`` operation
        '''
        query, _ = self.klass._build_query(kwargs)
        delete = DeleteMany(query)
        self.ops.append(delete)
        return delete
Example #18
def upload_meetings(db, documents):
    deleted_set = set()
    write_ops = []
    # Delete the old meetings first
    for doc in documents:
        course_marker = (doc['semester'], doc['year'], doc['courseId'])
        if course_marker not in deleted_set:
            write_ops.append(DeleteMany(
                {
                    'courseId': doc['courseId'],
                    'semester': doc['semester'],
                    'year': doc['year'],
                }
            ))
            deleted_set.add(course_marker)

    # Then add the new documents
    result = db['meetings'].bulk_write(write_ops)
    print('[Worker] Deleted', result.deleted_count, 'in meetings')
    result = db['meetings'].insert_many(documents)
    print('[Worker] Added', len(result.inserted_ids), 'in meetings')
Example #19
def writeFullXch(cnc, prd=False):
    if (prd):
        xdays = dtm.utcnow() - dtm(2016, 1, 1, 1, 1, 1, 111)
        xdays = xdays.days
    else:
        xdays = 5
    xdict = scrapeXchSet(cnc, ndays=xdays)
    if (xdict):
        odo = cnxCH.bulk_write(
            [DeleteMany({'_id': xdict['_id']}),
             InsertOne(xdict)])
    else:
        odo = False
    cnxLog.insert_one({
        'env': prd,
        'module': 'writeFullXch',
        'epoch': dtm.utcnow(),
        'gist': str(odo),
        'service': 'xch',
        'app': 'S-Ticker'
    })
    return odo
Example #20
    def delete_many(self, collection: str, doc_ids: List[ObjectId]):
        """
        Delete documents from the database.

        Parameters
        ----------
        collection: str
            The db collection to delete document from.
        doc_ids: List[ObjectId]
            The list of document ids to delete.
        """
        delete_request = DeleteMany({"_id": {"$in": doc_ids}})
        delete_many_results = self._db.get_collection(collection).bulk_write(
            [delete_request]
        )

        delete_msg = (
            f"Deleted {delete_many_results.deleted_count}/{len(doc_ids)} {collection}."
        )
        if delete_many_results.deleted_count != len(doc_ids):
            log.error(delete_msg)
        else:
            log.info(delete_msg)
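
A hypothetical call site for the delete_many() helper above (the wrapper instance name, collection name and ids are illustrative assumptions):

from bson import ObjectId

stale_ids = [ObjectId('64f0c2a9e4b0a1b2c3d4e5f6'), ObjectId('64f0c2a9e4b0a1b2c3d4e5f7')]
store.delete_many('molecules', stale_ids)  # logs an error if fewer documents were deleted than requested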
Example #21
    def test_update_organisations(self, get_collection_mock):
        task = Mock()
        records = [{
            ORG_UNIQUE_FIELD: "1",
            "text": "data"
        }, {
            ORG_UNIQUE_FIELD: "12",
            "text": "data data"
        }]

        update_organisations(task, records)

        get_collection_mock.assert_called_once_with(
            collection_name=TREASURY_ORG_COLLECTION)
        get_collection_mock.return_value.bulk_write.assert_called_once_with([
            UpdateOne({ORG_UNIQUE_FIELD: "1"}, {"$set": records[0]},
                      upsert=True),
            UpdateOne({ORG_UNIQUE_FIELD: "12"}, {"$set": records[1]},
                      upsert=True),
            DeleteMany({'edrpou_code': {
                '$nin': ['1', '12']
            }})
        ])
Example #22
def update_organisations(task, records):
    collection = get_collection(collection_name=TREASURY_ORG_COLLECTION)
    operations = []
    codes = []
    for org in records:
        code = org[ORG_UNIQUE_FIELD]
        codes.append(code)
        operations.append(
            UpdateOne({ORG_UNIQUE_FIELD: code}, {"$set": org}, upsert=True))

    if codes:
        operations.append(
            DeleteMany({ORG_UNIQUE_FIELD: {
                "$nin": codes
            }})  # delete codes not on the list
        )

    try:
        result = collection.bulk_write(operations)
    except PyMongoError as e:
        logger.exception(e, extra={"MESSAGE_ID": "MONGODB_ACCESS_ERROR"})
        raise task.retry()
    else:
        return result.bulk_api_result
Example #23
# Search
# post1 = posts.find_one({"author": "Mike"})
# print(post1)
# post1 = posts.find_one({"_id": post_id})
# print(post1)
# post1 = posts.find_one({'_id': ObjectId("5a07a674dcfba13028c7022b")})
# print(post1)

# posts.remove({"author": "Mike"})
# bulk insert
# ids = posts.insert_many([{'i': i} for i in range(10000)]).inserted_ids
# print("count of posts:", posts.count())

posts.remove({})
result = posts.bulk_write([
    DeleteMany({}),  # Remove all documents from the previous example.
    InsertOne({'_id': 1}),
    InsertOne({'_id': 2}),
    InsertOne({'_id': 3}),
    UpdateOne({'_id': 1}, {'$set': {
        'foo': 'bar'
    }}),
    UpdateOne({'_id': 4}, {'$inc': {
        'j': 1
    }}, upsert=True),
    ReplaceOne({'j': 1}, {'j': 2})
])

pprint(result.bulk_api_result)

for post in posts.find():
    pprint(post)
Example #24
async def importfbans_func(message, fed, strings, document=None):
    global user_id
    file_type = os.path.splitext(document['file_name'])[1][1:]

    if file_type == 'json':
        if document['file_size'] > 1000000:
            await message.reply(strings['big_file_json'].format(num='1'))
            return
    elif file_type == 'csv':
        if document['file_size'] > 52428800:
            await message.reply(strings['big_file_csv'].format(num='50'))
            return
    else:
        await message.reply(strings['wrong_file_ext'])
        return

    f = await bot.download_file_by_id(document.file_id, io.BytesIO())
    msg = await message.reply(strings['importing_process'])

    data = None
    if file_type == 'json':
        try:
            data = rapidjson.load(f).items()
        except ValueError:
            return await message.reply(strings['invalid_file'])
    elif file_type == 'csv':
        data = csv.DictReader(io.TextIOWrapper(f))

    real_counter = 0

    queue_del = []
    queue_insert = []
    current_time = datetime.now()
    for row in data:
        if file_type == 'json':
            user_id = row[0]
            data = row[1]
        elif file_type == 'csv':
            if 'user_id' in row:
                user_id = int(row['user_id'])
            elif 'id' in row:
                user_id = int(row['id'])
            else:
                continue
        else:
            raise NotImplementedError

        new = {
            'fed_id': fed['fed_id'],
            'user_id': user_id
        }

        if 'reason' in row:
            new['reason'] = row['reason']

        if 'by' in row:
            new['by'] = int(row['by'])
        else:
            new['by'] = message.from_user.id

        if 'time' in row:
            new['time'] = datetime.fromtimestamp(int(row['time']))
        else:
            new['time'] = current_time

        if 'banned_chats' in row and type(row['banned_chats']) is list:
            new['banned_chats'] = row['banned_chats']

        queue_del.append(DeleteMany(
            {'fed_id': fed['fed_id'], 'user_id': user_id}))
        queue_insert.append(InsertOne(new))

        if len(queue_insert) == 1000:
            real_counter += len(queue_insert)

            # Run the delete operations before inserting the new bans.
            if queue_del:
                await db.fed_bans.bulk_write(queue_del, ordered=False)
            await db.fed_bans.bulk_write(queue_insert, ordered=False)

            queue_del = []
            queue_insert = []

    # Process last bans
    real_counter += len(queue_insert)
    if queue_del:
        await db.fed_bans.bulk_write(queue_del, ordered=False)
    if queue_insert:
        await db.fed_bans.bulk_write(queue_insert, ordered=False)

    await msg.edit_text(strings['import_done'].format(num=real_counter))
Example #25
    def reducer(self, key, values):
        """
        Cleans the metering data:
            - gets "acumulated or instant" values
            - removes negative and outliers
            - detects gaps
            - generates daily dataframe
        :param key: the device
        :param values: the information
        :return:
        """
        #create dataframe with the values:
        df = pd.DataFrame.from_records(
            values,
            columns=["ts", "value", "accumulated", "energytype", "source"])
        # group it by source and energyType
        source_group = df.groupby('source')

        for source, df_source_group in source_group:
            etype_group = df_source_group.groupby('energytype')
            for etype, df_etype_group in etype_group:
                df_etype_group = df_etype_group.set_index('ts')
                df_etype_group = df_etype_group.sort_index()
                df_etype_group['ts'] = df_etype_group.index
                # save billing information in raw_data
                raw_data = df_etype_group[["ts", "value",
                                           "accumulated"]].to_dict('records')
                for r in raw_data:
                    r.update({
                        "device": key,
                        "source": source,
                        "energy_type": etype,
                        "data_type": "metering",
                        "freq": "D"
                    })

                ops = [InsertOne(x) for x in raw_data]
                result = self.mongo['raw_data'].bulk_write([
                    DeleteMany({
                        "device": key,
                        "source": source,
                        "energy_type": etype,
                        "data_type": "metering",
                        "freq": "D"
                    }),
                ] + ops)

                # self.mongo['raw_data'].update({"device": key, "source": source, "energy_type": etype, "data_type": "metering"}, { "$set" : {
                #         "device": key, "source": source, "energy_type": etype, "companyId": self.companyId,
                #         "raw_data":df_etype_group[["ts","value","accumulated"]].to_dict('records')
                #     }
                # }, upsert=True)

                # self.mongo['raw_data'].update(
                #     {"device": key, "source": source, "energy_type": etype, "data_type": "metering"},
                #     {"$unset": {"errors": 1}},
                #     upsert=True)
                # check if metering is acumulated or instant:
                duplicated_index = df_etype_group.index.duplicated(keep='last')
                duplicated_values = df_etype_group[
                    duplicated_index].index.values.tolist()
                df_etype_group = df_etype_group[~duplicated_index]

                freq = calculate_frequency(df_etype_group)
                if not freq:
                    self.mongo['clean_data'].update(
                        {
                            "device": key,
                            "source": source,
                            "energy_type": etype,
                            "data_type": "metering",
                            "freq": "D"
                        }, {"$set": {
                            "errors": "can't infere frequency"
                        }},
                        upsert=True)
                    continue

                day_delta = timedelta(days=1)

                if df_etype_group.value.isnull().all():  # accumulated
                    df_etype_group = df_etype_group[['accumulated']]
                    if freq < day_delta:  # sub-daily frequency
                        df_etype_group = df_etype_group.resample(
                            "D").max().interpolate().diff(
                                1, 0).rename(columns={"accumulated": "value"})
                    else:  # super-daily frequency
                        df_etype_group = df_etype_group.resample(
                            "D").interpolate().diff(
                                1, 0).rename(columns={"accumulated": "value"})

                elif df_etype_group.accumulated.isnull().all():  #instant
                    df_etype_group = df_etype_group[['value']]
                    if freq < day_delta:  # sub-daily frequency
                        df_etype_group = df_etype_group.resample("D").sum()
                    else:  # super-daily frequency
                        df_etype_group.value = df_etype_group.value.cumsum()
                        df_etype_group = df_etype_group.resample(
                            "D").interpolate().diff(1, 0)
                else:
                    self.mongo['clean_data'].update(
                        {
                            "device": key,
                            "source": source,
                            "energy_type": etype,
                            "data_type": "metering",
                            "freq": "D"
                        }, {
                            "$set": {
                                "errors":
                                "device with accumulated and instant values at the same metering"
                            }
                        },
                        upsert=True)
                    continue
                df_etype_group['ts'] = df_etype_group.index

                #max_threshold = self.config['max_threshold'][etype] * 24 if etype in self.config['max_threshold'] else self.config['max_threshold']['default'] * 24
                #max_outlier_bool = dc.detect_max_threshold_outliers(df_etype_group['value'], max_threshold)
                #df_etype_group['value'] = dc.clean_series(df_etype_group['value'], max_outlier_bool)
                negative_values_bool = dc.detect_min_threshold_outliers(
                    df_etype_group['value'], 0)
                df_etype_group['value'] = dc.clean_series(
                    df_etype_group['value'], negative_values_bool)
                #znorm_bool = dc.detect_znorm_outliers(df_etype_group['value'], 30, mode="global")
                #df_etype_group['value'] = dc.clean_series(df_etype_group['value'], znorm_bool)

                #max_outliers = list(df_etype_group[max_outlier_bool].index)
                negative_outliers = list(
                    df_etype_group[negative_values_bool].index)
                #znorm_outliers = list(df_etype_group[znorm_bool].index)
                missing_values = list(
                    df_etype_group[df_etype_group.value.isnull()].index)

                clean_data = df_etype_group[['ts', 'value']].to_dict('records')
                for r in clean_data:
                    r.update({
                        "device": key,
                        "source": source,
                        "energy_type": etype,
                        "data_type": "metering",
                        "freq": "D"
                    })

                ops = [InsertOne(x) for x in clean_data]
                result = self.mongo['clean_data'].bulk_write([
                    DeleteMany({
                        "device": key,
                        "source": source,
                        "energy_type": etype,
                        "data_type": "metering",
                        "freq": "D"
                    }),
                ] + ops)

                self.mongo['data_quality'].update(
                    {
                        "device": key,
                        "source": source,
                        "energy_type": etype,
                        "data_type": "metering",
                        "freq": "D"
                    }, {
                        "$set": {
                            "duplicated_values": duplicated_values,
                            "frequency": freq.resolution,
                            "gaps": missing_values,
                            "negative_values": negative_outliers
                        }
                    },
                    upsert=True)

                #
                # self.mongo['raw_data'].update({"device": key, "source": source, "energy_type": etype, "data_type": "metering"},
                #                                 {"$set":
                #                                    {
                #                                     "clean_data": df_etype_group[['ts','value']].to_dict('records'),
                #                                     "negative_values": negative_outliers,
                #                                     "znorm_outliers": znorm_outliers,
                #                                     "max_outliers": max_outliers,
                #                                     "gaps": missing_values,
                #                                     "frequency": freq.resolution,
                #                                     "duplicated_values": duplicated_values
                #                                    }
                #                                 }, upsert=True)

                for row in df_etype_group.iterrows():
                    yield None, "\t".join([
                        str(row[1]['ts'].timestamp()), key,
                        str(row[1]['value']), etype, source
                    ])
Example #26
async def retry_messages_job(shared_stats):
    """ Each few minutes, try to handle message that were added to the
    pending queue (Unavailable messages)."""

    seen_ids = {}
    actions = []
    messages_actions = []
    gtasks: List[Coroutine] = []
    tasks = []
    loop = asyncio.get_event_loop()
    i = 0
    j = 0
    find_params = {}
    # if await PendingTX.collection.count_documents({}) > 500:
    #     find_params = {'message.item_type': 'inline'}

    while await PendingMessage.collection.count_documents(find_params):
        async for pending in PendingMessage.collection.find(find_params).sort([
            ('message.time', 1)
        ]).batch_size(256):
            LOGGER.debug(f"retry_message_job len_seen_ids={len(seen_ids)} "
                         f"len_gtasks={len(gtasks)} len_tasks={len(tasks)}")

            if shared_stats is not None:
                shared_stats['retry_messages_job_seen_ids'] = len(seen_ids)
                shared_stats['retry_messages_job_gtasks'] = len(gtasks)
                shared_stats['retry_messages_job_tasks'] = len(tasks)
                shared_stats['retry_messages_job_actions'] = len(actions)
                shared_stats['retry_messages_job_messages_actions'] = len(
                    messages_actions)
                shared_stats['retry_messages_job_i'] = i
                shared_stats['retry_messages_job_j'] = j

            if pending['message']['item_type'] == 'ipfs' or pending['message'][
                    'type'] == 'STORE':
                i += 15
                j += 100
            else:
                i += 1
                j += 1

            tasks.append(
                asyncio.create_task(
                    handle_pending_message(pending, seen_ids, actions,
                                           messages_actions)))

            if (j >= 20000):
                # Group tasks using asyncio.gather in `gtasks`.
                # await join_pending_message_tasks(tasks, actions_list=actions, messages_actions_list=messages_actions)
                gtasks.append(
                    asyncio.create_task(
                        join_pending_message_tasks(
                            tasks,
                            actions_list=actions,
                            messages_actions_list=messages_actions)))
                tasks = []
                actions = []
                messages_actions = []
                i = 0
                j = 0

            if (i >= 1024):
                await join_pending_message_tasks(tasks)
                # gtasks.append(asyncio.create_task(join_pending_message_tasks(tasks)))
                tasks = []
                i = 0

        gtasks.append(
            asyncio.create_task(
                join_pending_message_tasks(
                    tasks,
                    actions_list=actions,
                    messages_actions_list=messages_actions)))

        await asyncio.gather(*gtasks, return_exceptions=True)
        gtasks = []

        if await PendingMessage.collection.count_documents(find_params
                                                           ) > 100000:
            LOGGER.info('Cleaning messages')
            clean_actions = []
            # big collection, try to remove dups.
            for key, height in seen_ids.items():
                clean_actions.append(
                    DeleteMany({
                        'message.item_hash': key[0],
                        'message.sender': key[1],
                        'source.chain_name': key[2],
                        'source.height': {
                            '$gt': height
                        }
                    }))
            result = await PendingMessage.collection.bulk_write(clean_actions)
            LOGGER.info(repr(result))

        await asyncio.sleep(5)
Example #27
    def reducer(self, key, values):
        """
        Cleans the metering data:
            - gets "acumulated or instant" values
            - removes negative and outliers
            - detects gaps
            - generates daily dataframe
        :param key: the device
        :param values: the information
        :return:
        """
        #create dataframe with the values:
        df = pd.DataFrame.from_records(values)
        # group it by source and energyType
        columns = [x[0] for x in self.config['output']['fields']]
        columns.remove("stationId")
        df = df.set_index('ts')
        df = df.sort_index()
        df['ts'] = df.index

        raw_data = df[columns].to_dict('records')
        for r in raw_data:
            r.update({"stationId": key})

        ops = [InsertOne(x) for x in raw_data]
        result = self.mongo['meteo_raw_data'].bulk_write(
            [
                DeleteMany(
                    {"stationId": key}),
            ] + ops
        )

        # check if duplicated meteo data
        duplicated_index = df.index.duplicated(keep='last')
        duplicated_values = df[duplicated_index].index.values.tolist()
        df = df[~duplicated_index]

        max_threshold = self.config['threshold']['max']
        max_outlier_bool = dc.detect_max_threshold_outliers(df['temperature'], max_threshold)
        df['temperature'] = dc.clean_series(df['temperature'], max_outlier_bool)

        min_threshold = self.config['threshold']['min']
        min_threshold_bool = dc.detect_min_threshold_outliers(df['temperature'], min_threshold)
        df['temperature'] = dc.clean_series(df['temperature'], min_threshold_bool)

        #znorm_bool = dc.detect_znorm_outliers(df['temperature'], 30, mode="global")
        #df['temperature'] = dc.clean_series(df['temperature'], znorm_bool)

        max_outliers = list(df[max_outlier_bool].index)
        negative_outliers = list(df[min_threshold_bool].index)
        #znorm_outliers = list(df[znorm_bool].index)
        missing_values = list(df[df.temperature.isnull()].index)

        clean_data = df[columns].to_dict('records')
        for r in clean_data:
            r.update({"stationId": key})

        ops = [InsertOne(x) for x in clean_data]
        result = self.mongo['meteo_clean_data'].bulk_write(
            [
                DeleteMany(
                    {"stationId": key}),
            ] + ops
        )

        self.mongo['meteo_data_quality'].update(
            {"stationId": key},
            {"$set":
                {
                    "overlapings": duplicated_values,
                    "gaps": missing_values,
                    "negative_values": negative_outliers,
         #           "znorm_outliers": znorm_outliers,
                    "max_outliers": max_outliers}
            }, upsert=True)

        all = [x[0] for x in self.config['output']['fields']]
        for row in df.iterrows():
            return_list = []
            for f in all:
                if f == "ts":
                    return_list.append(str(row[1]['ts'].timestamp()))
                elif f == "stationId":
                    return_list.append(key)
                else:
                    return_list.append(str(row[1][f]))

            yield None, "\t".join(return_list)
Example #28
 def delete_many(self, cliteria):
     self._batch.append(DeleteMany(cliteria))
Example #29
 def remove(self, collection: str, col_filter: dict):
     if collection not in self.transactions:
         self.transactions[collection] = []
     self.transactions[collection].append(DeleteMany(col_filter))
     return Remove(self.transactions)
Example #30
 def delete(self):
     """
     Delete multiple object(s) based on a filter
     ---
     parameters:
         -   in: body
             description: List of filter fields and value, on basis of which the object(s) will be deleted
             schema:
                 type: array
                 items:
                     anyOf:
                         - schema:
                             properties:
                                 _id:
                                 type: string
                         - schema:
                             properties:
                                 name:
                                 type: string
                         - schema:
                             properties:
                                 brand_name:
                                 type: string
                         - schema:
                             properties:
                                 regular_price_value:
                                 type: number
                                 format: float
                         - schema:
                             properties:
                                 offer_price_value:
                                 type: number
                                 format: float
                         - schema:
                             properties:
                                 currency:
                                 type: string
                         - schema:
                             properties:
                                 classification_l1:
                                 type: string
                         - schema:
                             properties:
                                 classification_l2:
                                 type: string
                         - schema:
                             properties:
                                 classification_l3:
                                 type: string
                         - schema:
                             properties:
                                 classification_l4:
                                 type: string
                         - schema:
                             properties:
                                 image_url:
                                 type: string
     responses:
         '200':
             description: Bulk Write result object from MongoDB
             content:
                 application/json:
                     schema:
                         type: object
                         properties:
                             acknowledged:
                                 type: boolean
                             matched_count:
                                 type: string
                             modified_count:
                                 type: integer
                             deleted_count:
                                 type: integer
                             upserted_ids:
                                 type: array
                                 items:
                                     type: object
                                     properties:
                                         _id:
                                             type: string
                             inserted_count:
                                 type: integer
         '500':
             description: Server encountered an error while performing bulk operation
             content:
                 application/json:
                     schema:
                         type: object
                         properties:
                             message:
                                 type: string
     """
     requests = []
     for q in request.json:
         requests.append(DeleteMany(q))
     return perform_bulk(collection, requests), 200
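
A hypothetical client-side call for the bulk-delete endpoint documented above (the base URL and filter values are illustrative assumptions; each element of the JSON array becomes one DeleteMany filter):

import requests

filters = [{'brand_name': 'Acme'}, {'currency': 'EUR'}]
resp = requests.delete('http://localhost:5000/products', json=filters)
print(resp.status_code, resp.json())  # bulk write result as described for the 200 response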