def migrate_from_mirror_run_step(disable_orcid_push=True, disable_references_processing=True, step_no=1):
    """Run a single migration step over records from the legacy mirror.

    Step 0 selects recids of valid mirror records; steps 1-2 select the
    distinct record UUIDs registered under the ``recid`` pid provider.
    Any other step number is rejected.
    """
    if step_no == 0:
        valid_records = LegacyRecordsMirror.query.with_entities(
            LegacyRecordsMirror.recid
        ).filter(LegacyRecordsMirror.valid.is_(True))
        ids = (str(row.recid) for row in valid_records.yield_per(CHUNK_SIZE))
    elif 0 < step_no < 3:
        registered_pids = (
            PersistentIdentifier.query.with_entities(PersistentIdentifier.object_uuid)
            .filter_by(pid_provider="recid")
            .distinct()
        )
        ids = (str(row.object_uuid) for row in registered_pids.yield_per(CHUNK_SIZE))
    else:
        echo("Wrong step number!")
        return
    task = migrate_recids_from_mirror(
        list(chunker(ids, CHUNK_SIZE)),
        disable_orcid_push=disable_orcid_push,
        disable_references_processing=disable_references_processing,
        step_no=step_no,
        one_step=True,
    )
    echo("All migration tasks have been scheduled.")
    return task
def get_bais_for_author_recids(recids):
    """Return a mapping from author recid to BAI, queried in chunks of 100."""
    bais_by_author = {}
    for recid_chunk in chunker(recids, 100):
        chunk_query = _get_bais_for_author_recids_query(recid_chunk)
        # query rows are (author, bai) pairs, so dict() builds the mapping directly
        bais_by_author.update(dict(chunk_query.all()))
    return bais_by_author
def test_chunker():
    chunks = chunker(range(10), 3)
    assert list(chunks) == [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
def test_chunker_with_min_num_chunks():
    # min_num_chunks=3 forces smaller chunks than max_chunk_size=10 would allow
    chunks = chunker(range(5), 10, 3)
    assert list(chunks) == [[0, 1], [2, 3], [4]]
def migrate_from_mirror(also_migrate=None, disable_orcid_push=True):
    """Migrate legacy records from the local mirror.

    By default, only the records that have not been migrated yet are migrated.

    Args:
        also_migrate(Optional[string]): if set to ``'broken'``, also broken
            records will be migrated. If set to ``'all'``, all records will
            be migrated.
        disable_orcid_push (bool): flag indicating whether the orcid_push
            should be disabled (if True) or executed at the end of migrations
            (if False).
    """
    if also_migrate not in (None, "broken", "all"):
        raise ValueError(
            '"also_migrate" should be either None, "all" or "broken"')
    query = LegacyRecordsMirror.query.with_entities(LegacyRecordsMirror.recid)
    if also_migrate is None:
        # not-yet-migrated records have a NULL "valid" flag
        query = query.filter(LegacyRecordsMirror.valid.is_(None))
    elif also_migrate == "broken":
        query = query.filter(LegacyRecordsMirror.valid.isnot(True))
    # reference processing is only skipped for a full re-migration
    disable_references_processing = also_migrate == "all"
    recids_chunked = chunker(
        (row.recid for row in query.yield_per(CHUNK_SIZE)), CHUNK_SIZE)
    task = migrate_recids_from_mirror(
        list(recids_chunked),
        disable_orcid_push=disable_orcid_push,
        disable_references_processing=disable_references_processing,
    )
    LOGGER.info("All migration tasks have been scheduled.")
    return task
def test_chunker_ignores_min_num_chunks_when_iterable_not_sized():
    # a bare iterator has no len(), so min_num_chunks cannot be honoured
    unsized = iter(range(5))
    assert list(chunker(unsized, 10, 3)) == [[0, 1, 2, 3, 4]]
def get_records_ids_by_pids(cls, pids, max_batch=100):
    """Yield record UUIDs for the given pids.

    The pid list is split into batches of at most ``max_batch`` because
    a single query with too many pids (~5000) is refused by SQL.
    """
    for pid_batch in chunker(pids, max_chunk_size=max_batch):
        batch_query = cls._get_records_ids_by_pids(pid_batch)
        for row in batch_query.yield_per(100):
            yield row.object_uuid
def get_records_batched(cls, ids, with_deleted=False, max_batch=100):
    """Yield record instances for the given ids, fetched in DB batches.

    Unless ``with_deleted`` is set, rows whose JSON marks them as deleted
    are filtered out.
    """
    for id_batch in chunker(ids, max_chunk_size=max_batch):
        query = cls.model_cls.query.filter(cls.model_cls.id.in_(id_batch))
        if not with_deleted:
            # keep rows where "deleted" is absent or not the JSON true value
            not_deleted = (cls.model_cls.json.op("->")("deleted") == None) | (  # noqa
                cls.model_cls.json["deleted"] != cast("True", JSONB)
            )
            query = query.filter(not_deleted)
        for row in query.yield_per(100):
            yield cls(row.json, model=row)
def test_migrate_recids_from_mirror_external_push_enabled(inspire_app, enable_hal_push):
    recid_chunks = list(chunker([123], 1, 1))
    with patch("inspirehep.migrator.tasks.run_hal_push.run") as hal_push_mock:
        migrate_recids_from_mirror(
            recid_chunks,
            disable_external_push=False,
            step_no=4,
            one_step=True,
        )
        hal_push_mock.assert_called_once()
def fix_entries_by_update_date(cls, before=None, after=None, max_chunk=100):
    """Schedule regeneration of author-records table entries for records
    updated in the given datetime window, in batches of ``max_chunk``."""
    # imported locally, presumably to avoid a circular import — keep it here
    from inspirehep.records.tasks import regenerate_author_records_table_entries

    stale_uuids = LiteratureRecord.get_recids_by_updated_datetime(before, after)
    for uuid_batch in chunker(stale_uuids, max_chunk):
        regenerate_author_records_table_entries.delay(uuid_batch)
def assign_to_author(from_author_recid, to_author_recid, literature_recids):
    """Schedule moving the given papers from one author to another,
    then unstub the target author."""
    to_author_record = AuthorsRecord.get_record_by_pid_value(to_author_recid)
    # spread the work across however many "assign" consumers are available
    workers = count_consumers_for_queue("assign")
    for papers_batch in chunker(literature_recids, 10, workers):
        task_kwargs = {
            "from_author_recid": from_author_recid,
            "to_author_record": to_author_record,
            "author_papers_recids": papers_batch,
        }
        current_celery_app.send_task(
            "inspirehep.assign.tasks.assign_papers", kwargs=task_kwargs
        )
    unstub_author_by_recid(to_author_recid)
def references(batch_size, db_batch_size):
    """Match references for all registered literature records.

    Record UUIDs are streamed from the DB in ``db_batch_size`` pages,
    grouped into ``batch_size`` chunks, and one matcher task is spawned
    per chunk; the call blocks until every task has finished.
    """
    uuid_query = PersistentIdentifier.query.filter(
        PersistentIdentifier.pid_type == "lit",
        PersistentIdentifier.status == PIDStatus.REGISTERED,
    ).with_entities(PersistentIdentifier.object_uuid)
    matcher_tasks = [
        match_references_by_uuids.s([str(uuid) for (uuid,) in chunk])
        for chunk in chunker(uuid_query.yield_per(db_batch_size), batch_size)
    ]
    group_result = group(matcher_tasks)()
    group_result.join()  # waits for all tasks to be finished
def create_sitemap():
    """Write one sitemap page per chunk of items, then a sitemap index
    referencing every page written."""
    page_size = current_app.config["SITEMAP_PAGE_SIZE"]
    last_page = 0  # stays 0 when there are no items, yielding an empty index
    for last_page, page_items in enumerate(
            chunker(generate_sitemap_items(), page_size), 1):
        page_content = render_template("sitemap/page.xml", urlset=page_items)
        write_sitemap_page_content(last_page, page_content)
    index_items = [
        {"loc": get_sitemap_page_absolute_url(page_number)}
        for page_number in range(1, last_page + 1)
    ]
    index_content = render_template("sitemap/index.xml", urlset=index_items)
    write_sitemap_page_content("", index_content)
def assign_to_new_stub_author(from_author_recid, literature_recids):
    """Create a stub author from the source author's signatures and schedule
    moving the given papers to it; returns the stub's control number."""
    # TODO: differentiate from BEARD created stub author
    papers = get_literature_records_by_recid(literature_recids)
    signatures = get_author_signatures(from_author_recid, papers)
    stub_data = update_author_names({"name": {}}, signatures)
    stub_author = create_new_stub_author(**stub_data)
    workers = count_consumers_for_queue("assign")
    for papers_batch in chunker(literature_recids, 10, workers):
        current_celery_app.send_task(
            "inspirehep.assign.tasks.assign_papers",
            kwargs={
                "from_author_recid": from_author_recid,
                "to_author_record": stub_author,
                "author_papers_recids": papers_batch,
                "is_stub_author": True,
            },
        )
    return stub_author["control_number"]
def run(self): """Make changes to the records that need them.""" checked_count, modified_count = 0, 0 self.logger.info("Starting search, check & do job", reason=self.__doc__) for chunk in chunker(self.search(), self.size): uuids = [r.meta.id for r in chunk] self.logger.info("Received record IDs from ES", num_records=len(uuids)) records = InspireRecord.get_records(uuids) self.logger.info("Fetched chunk of records from DB", num_records=len(records)) for record in records: state = {} logger = self.logger.bind(recid=record["control_number"]) checked_count += 1 record = InspireRecord.get_class_for_record(record)( record, model=record.model) if not self.check(record, logger=logger, state=state): logger.info("Not modifying record, check negative") continue modified_count += 1 logger.info("Modifying record, check positive") self.do(record, logger=logger, state=state) record.update(dict(record)) if self.commit_after_each_batch: db.session.commit() db.session.commit() self.logger.info( "Search, check & do job finished successfully.", num_records_checked=checked_count, num_records_modified=modified_count, )
def populate_mirror_from_file(source):
    """Insert records read from ``source`` into the legacy mirror,
    echoing a running total after each chunk."""
    record_chunks = chunker(split_stream(read_file(source)), CHUNK_SIZE)
    for chunk_index, chunk in enumerate(record_chunks):
        insert_into_mirror(chunk)
        # all chunks before the current one are full CHUNK_SIZE chunks
        total_inserted = chunk_index * CHUNK_SIZE + len(chunk)
        echo(f"Inserted {total_inserted} records into mirror")
def get_records_by_pids(cls, pids, max_batch=100):
    """Yield record instances for the given pids, queried in batches of
    at most ``max_batch`` pids."""
    for pid_batch in chunker(pids, max_chunk_size=max_batch):
        metadata_query = cls.get_record_metadata_by_pids(pid_batch)
        for row in metadata_query.yield_per(100):
            yield cls(row.json, model=row)
def test_chunker_doesnt_make_chunks_larger_than_max_chunk_size():
    chunks = chunker(range(5), 2, 2)
    assert list(chunks) == [[0, 1], [2, 3], [4]]