Example #1
def migrate_from_mirror_run_step(disable_orcid_push=True,
                                 disable_references_processing=True,
                                 step_no=1):
    """Allows to easily run step by step migration only for valid records """
    if step_no == 0:
        query = LegacyRecordsMirror.query.with_entities(
            LegacyRecordsMirror.recid).filter(
                LegacyRecordsMirror.valid.is_(True))
        recids_chunked = chunker(
            (str(res.recid) for res in query.yield_per(CHUNK_SIZE)),
            CHUNK_SIZE)
    elif 0 < step_no < 3:
        query = (PersistentIdentifier.query.with_entities(
            PersistentIdentifier.object_uuid).filter_by(
                pid_provider="recid").distinct())
        recids_chunked = chunker(
            (str(res.object_uuid) for res in query.yield_per(CHUNK_SIZE)),
            CHUNK_SIZE)
    else:
        echo("Wrong step number!")
        return

    task = migrate_recids_from_mirror(
        list(recids_chunked),
        disable_orcid_push=disable_orcid_push,
        disable_references_processing=disable_references_processing,
        step_no=step_no,
        one_step=True,
    )
    echo("All migration tasks have been scheduled.")
    return task
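A hedged usage sketch for the helper above; the step numbers come straight from its branches, everything else is illustrative:

# step 0 chunks the recids of valid mirror records;
# steps 1 and 2 chunk record UUIDs from the PID table instead
migrate_from_mirror_run_step(step_no=0)
migrate_from_mirror_run_step(step_no=2, disable_orcid_push=False)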
Example #2
def get_bais_for_author_recids(recids):
    result = {}
    recids_chunked = chunker(recids, 100)
    for chunk in recids_chunked:
        query = _get_bais_for_author_recids_query(chunk)
        result.update({aut: bai for (aut, bai) in query.all()})
    return result
Example #3
def test_chunker():
    iterable = range(10)

    expected = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    result = chunker(iterable, 3)

    assert list(result) == expected
Example #4
def test_chunker_with_min_num_chunks():
    iterable = range(5)

    expected = [[0, 1], [2, 3], [4]]
    result = chunker(iterable, 10, 3)

    assert list(result) == expected
Example #5
def migrate_from_mirror(also_migrate=None, disable_orcid_push=True):
    """Migrate legacy records from the local mirror.
    By default, only the records that have not been migrated yet are migrated.

    Args:
        also_migrate (Optional[str]): if set to ``'broken'``, broken records
            will also be migrated. If set to ``'all'``, all records will be
            migrated.
        disable_orcid_push (bool): flag indicating whether the orcid_push
            should be disabled (if True) or executed at the end of migrations
            (if False).
    """
    disable_references_processing = False
    query = LegacyRecordsMirror.query.with_entities(LegacyRecordsMirror.recid)

    if also_migrate is None:
        query = query.filter(LegacyRecordsMirror.valid.is_(None))
    elif also_migrate == "broken":
        query = query.filter(LegacyRecordsMirror.valid.isnot(True))
    elif also_migrate == "all":
        disable_references_processing = True
    else:
        raise ValueError(
            '"also_migrate" should be either None, "all" or "broken"')

    recids_chunked = chunker(
        (res.recid for res in query.yield_per(CHUNK_SIZE)), CHUNK_SIZE)

    task = migrate_recids_from_mirror(
        list(recids_chunked),
        disable_orcid_push=disable_orcid_push,
        disable_references_processing=disable_references_processing,
    )
    LOGGER.info("All migration tasks have been scheduled.")
    return task
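A short usage sketch, with argument values taken only from the documented options above:

migrate_from_mirror()                          # only records not migrated yet
migrate_from_mirror(also_migrate="broken")     # also retry broken records
migrate_from_mirror(also_migrate="all", disable_orcid_push=False)  # remigrate everything, push to ORCID at the end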
Example #6
def test_chunker_ignores_min_num_chunks_when_iterable_not_sized():
    iterable = iter(range(5))

    expected = [[0, 1, 2, 3, 4]]
    result = chunker(iterable, 10, 3)

    assert list(result) == expected
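Examples #3, #4 and #6 (plus #18 below) pin down how chunker behaves closely enough to sketch a plausible implementation. The sketch below is only an inference from those tests, not the actual inspirehep helper:

from itertools import islice


def chunker(iterable, max_chunk_size, min_num_chunks=0):
    """Yield lists of at most max_chunk_size items from iterable.

    When min_num_chunks is given and the iterable has a length, the chunk
    size shrinks so that roughly that many chunks come out; plain iterators
    ignore the argument (see Example #6).
    """
    chunk_size = max_chunk_size
    if min_num_chunks:
        try:
            # ceil(len(iterable) / min_num_chunks) without importing math
            chunk_size = min(max_chunk_size, -(-len(iterable) // min_num_chunks))
        except TypeError:
            pass  # not a sized iterable: keep max_chunk_size
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk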
Example #7
    def get_records_ids_by_pids(cls, pids, max_batch=100):
        """If query is too big (~5000 pids) SQL refuses to run it,
        so it has to be split"""

        for batch in chunker(pids, max_chunk_size=max_batch):
            query = cls._get_records_ids_by_pids(batch)
            for data in query.yield_per(100):
                yield data.object_uuid
Example #8
    def get_records_batched(cls, ids, with_deleted=False, max_batch=100):
        for batch in chunker(ids, max_chunk_size=max_batch):
            query = cls.model_cls.query.filter(cls.model_cls.id.in_(batch))
            if not with_deleted:
                query = query.filter(
                    (cls.model_cls.json.op("->")("deleted") == None)  # noqa
                    | (cls.model_cls.json["deleted"] != cast("True", JSONB)))
            for data in query.yield_per(100):
                yield cls(data.json, model=data)
Example #9
def test_migrate_recids_from_mirror_external_push_enabled(inspire_app, enable_hal_push):
    with patch("inspirehep.migrator.tasks.run_hal_push.run") as mock_run_hal_push:
        migrate_recids_from_mirror(
            list(chunker([123], 1, 1)),
            disable_external_push=False,
            step_no=4,
            one_step=True,
        )
        mock_run_hal_push.assert_called_once()
Example #10
    def fix_entries_by_update_date(cls,
                                   before=None,
                                   after=None,
                                   max_chunk=100):
        from inspirehep.records.tasks import regenerate_author_records_table_entries

        uuids_to_regenerate = LiteratureRecord.get_recids_by_updated_datetime(
            before, after)
        for batch in chunker(uuids_to_regenerate, max_chunk):
            regenerate_author_records_table_entries.delay(batch)
Example #11
def assign_to_author(from_author_recid, to_author_recid, literature_recids):
    author_record = AuthorsRecord.get_record_by_pid_value(to_author_recid)
    num_workers = count_consumers_for_queue("assign")
    for batch in chunker(literature_recids, 10, num_workers):
        current_celery_app.send_task(
            "inspirehep.assign.tasks.assign_papers",
            kwargs={
                "from_author_recid": from_author_recid,
                "to_author_record": author_record,
                "author_papers_recids": batch,
            },
        )
    unstub_author_by_recid(to_author_recid)
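Here the third argument is the number of consumers on the assign queue, so even a small list of recids is split across the available workers instead of all going to one of them. A tiny illustration, assuming the behaviour shown in the tests above:

# hypothetical numbers: 6 papers, 3 consumers on the "assign" queue
list(chunker([1, 2, 3, 4, 5, 6], 10, 3))
# -> [[1, 2], [3, 4], [5, 6]]: three batches, one per worker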
Example #12
def references(batch_size, db_batch_size):
    literature_uuids_query = PersistentIdentifier.query.filter(
        PersistentIdentifier.pid_type == "lit",
        PersistentIdentifier.status == PIDStatus.REGISTERED,
    ).with_entities(PersistentIdentifier.object_uuid)

    matcher_tasks = []
    result_chunks = chunker(literature_uuids_query.yield_per(db_batch_size), batch_size)
    for chunk in result_chunks:
        serialized_uuids = [str(uuid) for (uuid,) in chunk]
        matcher_task = match_references_by_uuids.s(serialized_uuids)
        matcher_tasks.append(matcher_task)

    matcher_task_group = group(matcher_tasks)
    group_result = matcher_task_group()
    group_result.join()  # waits for all tasks to be finished
Example #13
def create_sitemap():
    page_size = current_app.config["SITEMAP_PAGE_SIZE"]
    sitemap_items = generate_sitemap_items()
    page = 1
    pages = chunker(sitemap_items, page_size)
    for page_items in pages:
        page_content = render_template("sitemap/page.xml", urlset=page_items)
        write_sitemap_page_content(page, page_content)
        page += 1

    page_range = range(1, page)
    index_items = [
        {"loc": get_sitemap_page_absolute_url(page_number)}
        for page_number in page_range
    ]
    index_content = render_template("sitemap/index.xml", urlset=index_items)
    write_sitemap_page_content("", index_content)
Example #14
def assign_to_new_stub_author(from_author_recid, literature_recids):
    # TODO: differentiate from BEARD created stub author
    author_papers = get_literature_records_by_recid(literature_recids)
    author_signatures = get_author_signatures(from_author_recid, author_papers)
    stub_author_data = update_author_names({"name": {}}, author_signatures)
    to_author = create_new_stub_author(**stub_author_data)
    num_workers = count_consumers_for_queue("assign")
    for batch in chunker(literature_recids, 10, num_workers):
        current_celery_app.send_task(
            "inspirehep.assign.tasks.assign_papers",
            kwargs={
                "from_author_recid": from_author_recid,
                "to_author_record": to_author,
                "author_papers_recids": batch,
                "is_stub_author": True,
            },
        )
    return to_author["control_number"]
Example #15
    def run(self):
        """Make changes to the records that need them."""
        checked_count, modified_count = 0, 0
        self.logger.info("Starting search, check & do job",
                         reason=self.__doc__)
        for chunk in chunker(self.search(), self.size):
            uuids = [r.meta.id for r in chunk]
            self.logger.info("Received record IDs from ES",
                             num_records=len(uuids))
            records = InspireRecord.get_records(uuids)
            self.logger.info("Fetched chunk of records from DB",
                             num_records=len(records))

            for record in records:
                state = {}
                logger = self.logger.bind(recid=record["control_number"])
                checked_count += 1
                record = InspireRecord.get_class_for_record(record)(
                    record, model=record.model)
                if not self.check(record, logger=logger, state=state):
                    logger.info("Not modifying record, check negative")
                    continue
                modified_count += 1
                logger.info("Modifying record, check positive")
                self.do(record, logger=logger, state=state)
                record.update(dict(record))

            if self.commit_after_each_batch:
                db.session.commit()

        db.session.commit()
        self.logger.info(
            "Search, check & do job finished successfully.",
            num_records_checked=checked_count,
            num_records_modified=modified_count,
        )
Example #16
def populate_mirror_from_file(source):
    for i, chunk in enumerate(
            chunker(split_stream(read_file(source)), CHUNK_SIZE)):
        insert_into_mirror(chunk)
        inserted_records = i * CHUNK_SIZE + len(chunk)
        echo(f"Inserted {inserted_records} records into mirror")
Example #17
    def get_records_by_pids(cls, pids, max_batch=100):
        for batch in chunker(pids, max_chunk_size=max_batch):
            query = cls.get_record_metadata_by_pids(batch)
            for data in query.yield_per(100):
                yield cls(data.json, model=data)
Example #18
def test_chunker_doesnt_make_chunks_larger_than_max_chunk_size():
    iterable = range(5)
    expected = [[0, 1], [2, 3], [4]]
    result = chunker(iterable, 2, 2)

    assert list(result) == expected