Example #1
    async def test_clear_scroll(self, async_client):
        bulk = []
        for x in range(4):
            bulk.append({"index": {"_index": "test_index"}})
            bulk.append({"value": x})
        await async_client.bulk(bulk, refresh=True)

        with patch.object(
            async_client, "clear_scroll", wraps=async_client.clear_scroll
        ) as spy:
            _ = [
                x
                async for x in helpers.async_scan(
                    async_client, index="test_index", size=2
                )
            ]
            spy.assert_called_once()

            spy.reset_mock()
            _ = [
                x
                async for x in helpers.async_scan(
                    async_client, index="test_index", size=2, clear_scroll=True
                )
            ]
            spy.assert_called_once()

            spy.reset_mock()
            _ = [
                x
                async for x in helpers.async_scan(
                    async_client, index="test_index", size=2, clear_scroll=False
                )
            ]
            spy.assert_not_called()
Example #2
    async def test_scroll_error(self, async_client):
        bulk = []
        for x in range(4):
            bulk.append({"index": {"_index": "test_index"}})
            bulk.append({"value": x})
        await async_client.bulk(bulk, refresh=True)

        with patch.object(async_client, "scroll", MockScroll()):
            data = [
                x
                async for x in helpers.async_scan(
                    async_client,
                    index="test_index",
                    size=2,
                    raise_on_error=False,
                    clear_scroll=False,
                )
            ]
            assert len(data) == 3
            assert data[-1] == {"scroll_data": 42}

        with patch.object(async_client, "scroll", MockScroll()):
            with pytest.raises(ScanError):
                data = [
                    x
                    async for x in helpers.async_scan(
                        async_client,
                        index="test_index",
                        size=2,
                        raise_on_error=True,
                        clear_scroll=False,
                    )
                ]
            assert len(data) == 3
            assert data[-1] == {"scroll_data": 42}
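Examples #2, #3, and #7 patch in MockScroll and MockResponse helpers that are defined elsewhere in the test module. The following is a rough sketch of what such helpers could look like, inferred only from the assertions in these examples; it is an assumption, not the suite's actual implementation.

class MockResponse:
    """Assumed stand-in for a client call that always returns a fixed response dict."""

    def __init__(self, resp):
        self.resp = resp

    async def __call__(self, *args, **kwargs):
        return self.resp


class MockScroll:
    """Assumed stand-in for client.scroll that records calls and reports a failed shard."""

    def __init__(self):
        self.calls = []

    async def __call__(self, *args, **kwargs):
        self.calls.append((args, kwargs))
        if len(self.calls) == 1:
            # First scroll page: one hit, but only 4 of 5 shards succeeded,
            # which is what triggers the warning (or ScanError) in the tests.
            return {
                "_scroll_id": "dummy_id",
                "_shards": {"successful": 4, "failed": 1, "total": 5, "skipped": 0},
                "hits": {"hits": [{"scroll_data": 42}]},
            }
        # Later pages: no hits, so the scan terminates.
        return {
            "_scroll_id": "dummy_id",
            "_shards": {"successful": 5, "failed": 0, "total": 5, "skipped": 0},
            "hits": {"hits": []},
        }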
Example #3
    async def test_initial_search_error(self, async_client):
        with patch.object(async_client, "clear_scroll", new_callable=AsyncMock):
            with patch.object(
                async_client,
                "search",
                MockResponse(
                    {
                        "_scroll_id": "dummy_id",
                        "_shards": {"successful": 4, "total": 5, "skipped": 0},
                        "hits": {"hits": [{"search_data": 1}]},
                    }
                ),
            ):
                with patch.object(async_client, "scroll", MockScroll()):

                    data = [
                        x
                        async for x in helpers.async_scan(
                            async_client,
                            index="test_index",
                            size=2,
                            raise_on_error=False,
                        )
                    ]
                    assert data == [{"search_data": 1}, {"scroll_data": 42}]

            with patch.object(
                async_client,
                "search",
                MockResponse(
                    {
                        "_scroll_id": "dummy_id",
                        "_shards": {"successful": 4, "total": 5, "skipped": 0},
                        "hits": {"hits": [{"search_data": 1}]},
                    }
                ),
            ):
                with patch.object(async_client, "scroll", MockScroll()) as mock_scroll:

                    with pytest.raises(ScanError):
                        data = [
                            x
                            async for x in helpers.async_scan(
                                async_client,
                                index="test_index",
                                size=2,
                                raise_on_error=True,
                            )
                        ]
                        assert data == [{"search_data": 1}]
                        assert mock_scroll.calls == []
Example #4
async def async_scan_types() -> None:
    async for _ in async_scan(
        es,
        query={"query": {"match_all": {}}},
        request_timeout=10,
        clear_scroll=True,
        scroll_kwargs={"request_timeout": 10},
    ):
        pass
    async for _ in async_scan(
        es,
        raise_on_error=False,
        preserve_order=False,
        scroll="10m",
        size=10,
        request_timeout=10.0,
    ):
        pass
Example #5
    async def scan(self, index, query):

        _generator = helpers.async_scan(
            self._client,
            query=query,
            index=index,
        )

        async for doc in _generator:
            yield doc
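A minimal usage sketch for the wrapper method above, assuming the enclosing class is instantiated as `store`; the variable name, index, and query here are illustrative assumptions.

async def consume(store):
    # Iterate over every document the scan() wrapper yields.
    async for doc in store.scan("test_index", {"query": {"match_all": {}}}):
        print(doc["_id"], doc["_source"])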
Example #6
async def main():
    async for doc in async_scan(
            client=es,
            query={"query": {
                "match": {
                    "title": "python"
                }
            }},
            index="orders-*"):
        print(doc)
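The example above assumes an `es` client created elsewhere; a minimal sketch of the setup and entry point it implies might look like this (the address is a placeholder, not part of the original).

import asyncio

from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_scan

es = AsyncElasticsearch(["http://localhost:9200"])  # placeholder address

async def run():
    try:
        await main()
    finally:
        await es.close()  # release the underlying HTTP session

if __name__ == "__main__":
    asyncio.run(run())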
Example #7
    async def test_logger(self, logger_mock, async_client):
        bulk = []
        for x in range(4):
            bulk.append({"index": {"_index": "test_index"}})
            bulk.append({"value": x})
        await async_client.bulk(bulk, refresh=True)

        with patch.object(async_client, "scroll", MockScroll()):
            _ = [
                x
                async for x in helpers.async_scan(
                    async_client,
                    index="test_index",
                    size=2,
                    raise_on_error=False,
                    clear_scroll=False,
                )
            ]
            logger_mock.warning.assert_called()

        with patch.object(async_client, "scroll", MockScroll()):
            try:
                _ = [
                    x
                    async for x in helpers.async_scan(
                        async_client,
                        index="test_index",
                        size=2,
                        raise_on_error=True,
                        clear_scroll=False,
                    )
                ]
            except ScanError:
                pass
            logger_mock.warning.assert_called_with(
                "Scroll request has only succeeded on %d (+%d skipped) shards out of %d.",
                4,
                0,
                5,
            )
Example #8
    async def test_no_scroll_id_fast_route(self, async_client, scan_teardown):
        with patch.object(async_client, "search",
                          MockResponse({"no": "_scroll_id"})):
            with patch.object(async_client, "scroll") as scroll_mock:
                with patch.object(async_client, "clear_scroll") as clear_mock:
                    data = [
                        x async for x in helpers.async_scan(async_client,
                                                            index="test_index")
                    ]

                    assert data == []
                    scroll_mock.assert_not_called()
                    clear_mock.assert_not_called()
Example #9
    async def test_all_documents_are_read(self, async_client):
        bulk = []
        for x in range(100):
            bulk.append({"index": {"_index": "test_index", "_id": x}})
            bulk.append({"answer": x, "correct": x == 42})
        await async_client.bulk(bulk, refresh=True)

        docs = [
            x
            async for x in helpers.async_scan(async_client, index="test_index", size=2)
        ]

        assert 100 == len(docs)
        assert set(map(str, range(100))) == set(d["_id"] for d in docs)
        assert set(range(100)) == set(d["_source"]["answer"] for d in docs)
Example #10
async def keywordSearch(keywords, myindex):
    """
    Search the index and collect every hit.
    """
    mysearch = {
        "query": {
            "match_all": {},
            # "constant_score": {"filter": {"term": {"cityid": keywords}}}
        },
        "size": 10000,
    }
    # Plain search (without helpers):
    # res = es.search(index=myindex, body=mysearch)
    # total = res["hits"]["total"]["value"]
    # resc = [item for item in res["hits"]["hits"]]
    # searchres = []
    # for item in resc:
    #     tmp = item["_source"]
    #     searchres.append((tmp["hospitalid"], tmp["hospitalname"]))
    # print("Found %d documents in total" % total)
    # print(f"Search results: {searchres} {len(searchres)}")

    # Search with helpers.async_scan
    res = []
    async for doc in helpers.async_scan(
            client=es,
            query=mysearch,
            scroll="5m",  # keep each scroll context alive in ES for 5 minutes
            index=myindex,
            timeout="10m",
    ):
        res.append(doc)

    searchres = []
    for item in res:
        tmp = item["_source"]
        searchres.append((tmp["hospitalid"], tmp["hospitalname"]))
    print("Found %d documents in total" % len(res))
    print(searchres)
Example #11
    async def test_order_can_be_preserved(self, async_client, scan_teardown):
        bulk = []
        for x in range(100):
            bulk.append({"index": {"_index": "test_index", "_id": x}})
            bulk.append({"answer": x, "correct": x == 42})
        await async_client.bulk(bulk, refresh=True)

        docs = [
            doc async for doc in helpers.async_scan(
                async_client,
                index="test_index",
                query={"sort": "answer"},
                preserve_order=True,
            )
        ]

        assert 100 == len(docs)
        assert list(map(str, range(100))) == list(d["_id"] for d in docs)
        assert list(range(100)) == list(d["_source"]["answer"] for d in docs)
Example #12
async def query(
    session: plugins.session.SessionObject,
    query_defuzzed,
    query_limit=10000,
    shorten=False,
):
    """
    Advanced query and grab for stats.py
    """
    docs = []
    hits = 0
    assert session.database, "Database not connected!"
    async for hit in async_scan(
            client=session.database.client,
            query={
                "query": {
                    "bool": query_defuzzed
                },
                "sort": [{
                    "epoch": {
                        "order": "desc"
                    }
                }],
            },
    ):
        doc = hit["_source"]
        doc["id"] = doc["mid"]
        if plugins.aaa.can_access_email(session, doc):
            if not session.credentials:
                doc = anonymize(doc)
            if shorten:
                doc["body"] = (doc["body"] or "")[:200]
            trim_email(doc)
            docs.append(doc)
            hits += 1
            if hits > query_limit:
                break
    return docs
Example #13
async def main():
    print("Welcome to the Apache Pony Mail -> Foal migrator.")
    print("This will copy your old database, adjust the structure, and insert the emails into your new foal database.")
    print("------------------------------------")
    old_es_url = input("Enter the full URL (including http/https) of your old ES server: ") or "http://localhost:9200/"
    new_es_url = input("Enter the full URL (including http/https) of your NEW ES server: ") or "http://localhost:9200/"
    if old_es_url == new_es_url:
        print("Old and new DB should not be the same, assuming error in input and exiting!")
        return
    old_es = AsyncElasticsearch([old_es_url])
    new_es = AsyncElasticsearch([new_es_url])

    old_dbname = input("What is the database name for the old Pony Mail emails? [ponymail]: ") or "ponymail"
    new_dbprefix = input("What is the database prefix for the new Pony Mail emails? [ponymail]: ") or "ponymail"

    do_dkim = True
    dkim_txt = input("Do you wish to perform DKIM re-indexing of all emails? This will still preserve old permalinks "
                     "(y/n) [y]: ") or "y"
    if dkim_txt.lower() == 'n':
        do_dkim = False

    # Define index names for new ES
    dbname_mbox = new_dbprefix + "-mbox"
    dbname_source = new_dbprefix + "-source"
    dbname_attachment = new_dbprefix + "-attachment"

    # Let's get started..!
    start_time = time.time()
    now = start_time
    processed = 0
    count = await old_es.count(index=old_dbname, doc_type="mbox")
    no_emails = count['count']

    print("------------------------------------")
    print("Starting migration of %u emails, this may take quite a while..." % no_emails)

    bulk_array = []

    async for doc in async_scan(
            client=old_es,
            query={"query": {"match_all": {}}},
            doc_type="mbox",
            index=old_dbname,
    ):
        list_id = doc['_source']['list_raw'].strip("<>")
        try:
            source = await old_es.get(index=old_dbname, doc_type="mbox_source", id=doc['_id'])
        # If we hit a 404 on a source, we have to fake an empty document, as we don't know the source.
        except Exception:
            print("Source for %s was not found, faking it..." % doc['_id'])
            source = {
                '_source': {
                    'source': ""
                }
            }
        source_text: str = source['_source']['source']
        if ':' not in source_text:  # Base64
            source_text = base64.b64decode(source_text)
        else:  # bytify
            source_text = source_text.encode('utf-8', 'ignore')
        if do_dkim:
            dkim_id = generators.dkimid(None, None, list_id, None, source_text)
            old_id = doc['_id']
            doc['_source']['mid'] = dkim_id
            doc['_source']['permalinks'] = [
                dkim_id,
                old_id
            ]
        else:
            doc['_source']['permalinks'] = [
                doc['_id']
            ]

        source['_source']['permalinks'] = doc['_source']['permalinks']
        doc['_source']['dbid'] = hashlib.sha3_256(source_text).hexdigest()

        # Append migration details to notes field in doc
        notes = doc['_source'].get('_notes', [])
        # We want a list, not a single string
        if isinstance(notes, str):
            notes = [notes]  # wrap the single string rather than splitting it into characters
        notes.append("MIGRATE: Document migrated from Pony Mail to Pony Mail Foal at %u, "
                     "using foal migrator v/%s" % (now, MIGRATION_MAGIC_NUMBER))
        # If we re-indexed the document, make a note of that as well.
        if do_dkim:
            notes.append("REINDEX: Document re-indexed with DKIM_ID at %u, "
                         "from %s to %s" % (now, dkim_id, old_id))
        doc['_source']['_notes'] = notes

        # Copy to new DB
        bulk_array.append({
            'index': dbname_mbox,
            'id': doc['_id'],
            'body': doc['_source']
        })
        bulk_array.append({
            'index': dbname_source,
            'id': doc['_source']['dbid'],
            'body': source['_source']
        })

        if len(bulk_array) > 100:
            await bulk_push(bulk_array, new_es)
            bulk_array[:] = []

        processed += 1
        if processed % 500 == 0:
            now = time.time()
            time_spent = now - start_time
            docs_per_second = processed / time_spent
            time_left = (no_emails - processed) / docs_per_second

            # stringify time left
            time_left_str = "%u seconds" % time_left
            if time_left > 60:
                time_left_str = "%u minute(s), %u second(s)" % (int(time_left / 60), time_left % 60)
            if time_left > 3600:
                time_left_str = "%u hour(s), %u minute(s), %u second(s)" % (
                int(time_left / 3600), int(time_left % 3600 / 60), time_left % 60)

            print("Processed %u emails, %u remain. ETA: %s (at %u emails per second)" %
                  (processed, (no_emails - processed), time_left_str, docs_per_second)
                  )

    # There may be some docs left over to push
    if bulk_array:
        await bulk_push(bulk_array, new_es)

    start_time = time.time()
    processed = 0
    count = await old_es.count(index=old_dbname, doc_type="attachment")
    no_att = count['count']
    print("Transferring %u attachments..." % no_att)
    async for doc in async_scan(
            client=old_es,
            query={"query": {"match_all": {}}},
            doc_type="attachment",
            index=old_dbname,
    ):
        # Copy to new DB
        await new_es.index(index=dbname_attachment, doc_type='_doc', id=doc['_id'], body=doc['_source'])

        processed += 1
        if processed % 500 == 0:
            now = time.time()
            time_spent = now - start_time
            docs_per_second = processed / time_spent
            time_left = (no_att - processed) / docs_per_second

            # stringify time left
            time_left_str = "%u seconds" % time_left
            if time_left > 60:
                time_left_str = "%u minute(s), %u second(s)" % (int(time_left / 60), time_left % 60)
            if time_left > 3600:
                time_left_str = "%u hour(s), %u minute(s), %u second(s)" % (
                int(time_left / 3600), int(time_left % 3600 / 60), time_left % 60)

            print("Processed %u emails, %u remain. ETA: %s (at %u attachments per second)" %
                  (processed, (no_att - processed), time_left_str, docs_per_second)
                  )

    await old_es.close()
    await new_es.close()
    print("All done, enjoy!")
Example #14
async def get_public_activity(
        database: plugins.configuration.DBConfig) -> dict:
    """

    :param database: a PyPony database configuration
    :return: A dictionary with activity stats
    """
    client = AsyncElasticsearch([
        {
            "host": database.hostname,
            "port": database.port,
            "url_prefix": database.url_prefix or "",
            "use_ssl": database.secure,
        },
    ])

    # Fetch aggregations of all public emails
    s = (Search(using=client, index=database.db_prefix + "-mbox").query(
        "match", private=False).filter("range",
                                       date={
                                           "lt": "now+1d",
                                           "gt": "now-14d"
                                       }))

    s.aggs.bucket("number_of_lists", "cardinality", field="list_raw")
    s.aggs.bucket("number_of_senders", "cardinality", field="from_raw")
    s.aggs.bucket("daily_emails",
                  "date_histogram",
                  field="date",
                  calendar_interval="1d")

    res = await client.search(index=database.db_prefix + "-mbox",
                              body=s.to_dict(),
                              size=0)

    no_emails = res["hits"]["total"]["value"]
    no_lists = res["aggregations"]["number_of_lists"]["value"]
    no_senders = res["aggregations"]["number_of_senders"]["value"]
    daily_emails = []
    for entry in res["aggregations"]["daily_emails"]["buckets"]:
        daily_emails.append((entry["key"], entry["doc_count"]))

    # Now the nitty gritty thread count
    seen_emails = {}
    seen_topics = []
    thread_count = 0

    s = (Search(using=client, index=database.db_prefix + "-mbox").query(
        "match", private=False).filter("range",
                                       date={
                                           "lt": "now+1d",
                                           "gt": "now-14d"
                                       }))
    async for doc in async_scan(
            index=database.db_prefix + "-mbox",
            client=client,
            query=s.to_dict(),
            _source_includes=[
                "message-id",
                "in-reply-to",
                "subject",
                "references",
                "epoch",
                "list_raw",
            ],
    ):

        found = False
        message_id = doc["_source"].get("message-id")
        irt = doc["_source"].get("in-reply-to")
        references = doc["_source"].get("references")
        list_raw = doc["_source"].get("list_raw", "_")
        subject = doc["_source"].get("subject", "_")
        if irt and irt in seen_emails:
            seen_emails[message_id] = irt
            found = True
        elif references:
            for refid in re.split(r"\s+", references):
                if refid in seen_emails:
                    seen_emails[message_id] = refid
                    found = True
        if not found:
            subject = PYPONY_RE_PREFIX.sub("", subject)
            subject += list_raw
            if subject in seen_topics:
                seen_emails[message_id] = subject
            else:
                seen_topics.append(subject)
                thread_count += 1

    await client.close()

    activity = {
        "hits": no_emails,
        "no_threads": thread_count,
        "no_active_lists": no_lists,
        "participants": no_senders,
        "activity": daily_emails,
    }

    return activity
Example #15
async def main(args):
    no_jobs = args.jobs
    graceful = args.graceful
    print("Welcome to the Apache Pony Mail -> Foal migrator.")
    print("This will copy your old database, adjust the structure, and insert the emails into your new foal database.")
    print("We will be utilizing %u cores for this operation." % no_jobs)
    print("------------------------------------")
    old_es_url = args.old_url or input("Enter the full URL (including http/https) of your old ES server: ") or "http://localhost:9200/"
    new_es_url = args.new_url or input("Enter the full URL (including http/https) of your NEW ES server: ") or "http://localhost:9200/"
    if old_es_url == new_es_url:
        print("Old and new DB should not be the same, assuming error in input and exiting!")
        return
    ols_es_async = AsyncElasticsearch([old_es_url])

    old_dbname = args.old_name or input("What is the database name for the old Pony Mail emails? [ponymail]: ") or "ponymail"
    new_dbprefix = args.new_prefix or input("What is the database prefix for the new Pony Mail emails? [ponymail]: ") or "ponymail"

    do_dkim = True
    dkim_txt = (
        input(
            "Do you wish to perform DKIM re-indexing of all emails? This will NOT preserve all old permalinks currently "
            "(y/n) [y]: "
        )
        or "y"
    )
    if dkim_txt.lower() == "n":
        do_dkim = False

    # Define index names for new ES
    dbname_mbox = new_dbprefix + "-mbox"
    dbname_source = new_dbprefix + "-source"
    dbname_attachment = new_dbprefix + "-attachment"

    # Let's get started..!
    # start_time = time.time()
    count = await ols_es_async.count(index=old_dbname, doc_type="mbox")
    no_emails = count["count"]

    print("------------------------------------")
    print("Starting migration of %u emails, this may take quite a while..." % no_emails)

    processes = MultiDocProcessor(old_es_url, new_es_url, process_document, no_jobs)

    docs_read = 0
    async for doc in async_scan(
        client=ols_es_async,
        query={"query": {"match_all": {}}},
        doc_type="mbox",
        index=old_dbname,
    ):
        docs_read += 1
        processes.feed(doc, old_dbname, dbname_source, dbname_mbox, do_dkim)
        # Don't speed too far ahead of processing...
        processed = processes.processed.value
        while docs_read - processed > 100 * no_jobs:
            await asyncio.sleep(0.01)
            processed = processes.processed.value + 0

        processes.status(no_emails)

    # There may be some docs left over to push
    processes.sighup()
    while processed < no_emails:  # Wait for all documents to have been processed.
        await asyncio.sleep(1)
        print(f"Waiting for bulk push to complete ({processed} out of {no_emails} done...)")
        processed = processes.processed.value

    processes.stop()

    # Process attachments
    # start_time = time.time()
    processes = MultiDocProcessor(old_es_url, new_es_url, process_attachment, no_jobs, graceful)
    docs_read = 0
    count = await ols_es_async.count(index=old_dbname, doc_type="attachment")
    no_att = count["count"]
    print("Transferring %u attachments..." % no_att)
    async for doc in async_scan(
        client=ols_es_async,
        query={"query": {"match_all": {}}},
        doc_type="attachment",
        index=old_dbname,
    ):
        processes.feed(doc, dbname_attachment)
        docs_read += 1

        # Don't speed ahead
        processed = processes.processed.value + 0
        while docs_read - processed > 10 * no_jobs:
            await asyncio.sleep(0.01)
            processed = processes.processed.value + 0

        processes.status(no_att)

    # There may be some docs left over to push
    processes.sighup()
    while processed < no_att:  # Wait for all attachments to have been processed.
        await asyncio.sleep(1)
        print(f"Waiting for bulk push to complete ({processed} out of {no_att} done...)")
        processed = processes.processed.value

    processes.stop()
    await ols_es_async.close()
    print("All done, enjoy!")
Example #16
    async def scroll(self, model: Type[Model], **kwargs):
        async for i in async_scan(self._client, index=model.index, **kwargs):
            yield model(_id=i["_id"], _typecheck=False, **i["_source"])
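A hedged usage sketch for the repository method above; the `Article` model class and `repo` instance are assumptions made for illustration, not part of the original example.

async def list_articles(repo):
    # Stream every document in Article.index as typed model instances.
    async for article in repo.scroll(Article, query={"query": {"match_all": {}}}):
        print(article)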