def test_should_load_file_successfull(self):
    """The journals fixture must parse into a 3-item list with the expected
    first journal title.

    NOTE(review): the "successfull" typo in the name is kept on purpose —
    renaming would change the test id reported by the runner.
    """
    data = reading.read_json_file(self.journals_json_path)
    # BUG FIX: the original `self.assertTrue(type(data), list)` always passed
    # because the second argument of assertTrue is the failure *message*, not
    # an expected value. assertIsInstance performs the intended check.
    self.assertIsInstance(data, list)
    self.assertEqual(
        data[0].get("v140")[0]["_"],
        "Colégio Brasileiro de Cirurgia Digestiva - CBCD",
    )
    self.assertEqual(len(data), 3)
def import_documents_bundles_link_with_journal(file_path: str, session: Session):
    """Facade that reads the journal/documents-bundles link file and updates
    each journal with its bundle identifiers.

    The expected link-file format is::

        {
            "journal_id": [
                {
                    "id": "issue-2",
                    "order": "0002",
                    "number": "02",
                    "volume": "02",
                    "year": "2019",
                    "supplement": "supplement",
                },
                ...
            ]
        }

    Journals missing from the store are skipped (logged at DEBUG); bundles
    already linked are likewise skipped.
    """
    links = reading.read_json_file(file_path)

    for journal_id, bundle_entries in links.items():
        # Narrowed try: only `fetch` is expected to raise DoesNotExist.  The
        # original wrapped the whole body, which would also silently swallow
        # a DoesNotExist raised by `update`/`changes.add`.
        try:
            _journal = session.journals.fetch(journal_id)
        except DoesNotExist:
            # Lazy %-args instead of eager string interpolation.
            logger.debug(
                "Journal %s does not exists, cannot link bundles." % journal_id
            )
            continue

        # Each entry is a dict describing the bundle; the original loop
        # variable name `bundle_id` was misleading.
        for bundle_entry in bundle_entries:
            try:
                _journal.add_issue(bundle_entry)
            except AlreadyExists:
                logger.debug(
                    "Bundle %s already exists in journal %s",
                    bundle_entry["id"],
                    journal_id,
                )

        session.journals.update(_journal)
        session.changes.add(
            {"timestamp": utcnow(), "entity": "Journal", "id": _journal.id()}
        )
def link_documents_bundles_with_journals(issue_path: str, output_path: str):
    """Find the journal/issue relationship from JSON files extracted from an
    MST database.

    The result is written to ``output_path`` as a JSON object (dict) whose
    keys are journal identifiers and whose values are arrays of issue entries
    that belong to each journal.
    """
    journals_bundles = {}
    extract_isis.create_output_dir(output_path)
    issues_as_json = reading.read_json_file(issue_path)
    issues = conversion.conversion_issues_to_xylose(issues_as_json)
    issues = filter_issues(issues)

    for issue in issues:
        # v35 holds the journal's ISSN-based identifier in the ISIS record.
        journal_id = issue.data["issue"]["v35"][0]["_"]
        journals_bundles.setdefault(journal_id, [])
        _issue_id = issue_to_kernel(issue)["_id"]
        # Skip issues already collected for this journal (any() replaces the
        # original len(list(filter(lambda ...))) existence test).
        exist_item = any(
            d["id"] == _issue_id for d in journals_bundles[journal_id]
        )
        if not exist_item:
            _creation_date = parse_date(issue.publication_date)
            _supplement = ""
            # BUG FIX: the original used `issue.type is "supplement"` — an
            # identity comparison with a string literal that only worked by
            # CPython interning accident. Equality is the correct test.
            if issue.type == "supplement":
                # "0" marks an unnumbered supplement; a volume/number
                # qualifier overrides it when present.
                _supplement = "0"

                if issue.supplement_volume:
                    _supplement = issue.supplement_volume
                elif issue.supplement_number:
                    _supplement = issue.supplement_number

            journals_bundles[journal_id].append(
                {
                    "id": _issue_id,
                    "order": issue.order,
                    "number": issue.number,
                    "volume": issue.volume,
                    "year": str(date_to_datetime(_creation_date).year),
                    "supplement": _supplement,
                }
            )

    with open(output_path, "w") as output:
        output.write(json.dumps(journals_bundles, indent=4, sort_keys=True))
def import_documents_bundles_link_with_journal(file_path: str, session: Session):
    """Facade that reads the journal/documents-bundles link file and updates
    the journals with their bundle identifiers.

    The expected link-file format is::

        {
            "journal_id": [
                {
                    "id": "issue-2",
                    "order": "0002",
                    "number": "02",
                    "volume": "02",
                    "year": "2019",
                    "supplement": "supplement",
                },
                ...
            ]
        }

    Journals absent from the store are skipped with a DEBUG message, as are
    bundles that are already linked.
    """
    links = reading.read_json_file(file_path)

    for journal_id, bundles_entries in links.items():
        try:
            journal = session.journals.fetch(journal_id)
        except DoesNotExist:
            logger.debug('Journal "%s" does not exists, cannot link bundles.', journal_id)
            continue

        for bundle_entry in bundles_entries:
            # `bundle_entry` is a dict stored on the Journal relating it to
            # a particular bundle.
            try:
                journal.add_issue(bundle_entry)
            except AlreadyExists:
                logger.debug(
                    'Bundle "%s" already exists in journal "%s"',
                    bundle_entry["id"],
                    journal_id,
                )

        update_journal(session, journal)
def link_documents_bundles_with_journals(
    journal_path: str, issue_path: str, output_path: str
):
    """Find the journal/issue relationship from JSON files extracted from an
    MST database.

    The result is written to ``output_path`` as a JSON object (dict) whose
    keys are journal identifiers and whose values are arrays of the ids of
    the issues composing each journal.
    """
    extract_isis.create_output_dir(output_path)

    raw_journals = reading.read_json_file(journal_path)
    raw_issues = reading.read_json_file(issue_path)

    kernel_journals = conversion.conversion_journals_to_kernel(raw_journals)
    xylose_issues = filter_issues(
        conversion.conversion_issues_to_xylose(raw_issues)
    )

    journals_bundles = {
        journal["id"]: find_documents_bundles(journal, xylose_issues)
        for journal in kernel_journals
    }

    with open(output_path, "w") as output:
        output.write(json.dumps(journals_bundles, indent=4, sort_keys=True))
def import_issues(json_file: str, session: Session):
    """Facade with the step-by-step processing and loading of issues from
    JSON format into the Kernel store.

    Issues that already exist are logged at INFO and skipped.
    """
    raw_issues = reading.read_json_file(json_file)
    xylose_issues = filter_issues(
        conversion.conversion_issues_to_xylose(raw_issues)
    )

    for manifest in conversion.conversion_issues_to_kernel(xylose_issues):
        issue = DocumentsBundle(manifest=manifest)
        try:
            add_bundle(session, issue)
        except AlreadyExists as exc:
            logger.info(exc)
def import_journals(json_file: str, session: Session):
    """Facade with the step-by-step processing and loading of journals from
    JSON format into the Kernel store.

    A missing or malformed input file is logged at DEBUG and the import is
    aborted; journals that already exist are logged at INFO and skipped.
    """
    # The outer try intentionally spans the whole body, mirroring the
    # original scope: FileNotFoundError/ValueError from any step is handled.
    try:
        journals_json = reading.read_json_file(json_file)
        kernel_manifests = conversion.conversion_journals_to_kernel(
            journals=journals_json
        )

        for manifest in kernel_manifests:
            try:
                add_journal(session, Journal(manifest=manifest))
            except AlreadyExists as exc:
                logger.info(exc)
    except (FileNotFoundError, ValueError) as exc:
        logger.debug(exc)
def import_issues(json_file: str, session: Session):
    """Facade with the step-by-step processing and loading of issues from
    JSON format into the Kernel store.

    Issues already present in the store are logged at INFO and skipped.
    """
    raw_issues = reading.read_json_file(json_file)
    xylose_issues = conversion.conversion_issues_to_xylose(raw_issues)
    xylose_issues = filter_issues(xylose_issues)

    for kernel_issue in conversion.conversion_issues_to_kernel(xylose_issues):
        adapter = ManifestDomainAdapter(manifest=kernel_issue)
        try:
            session.documents_bundles.add(adapter)
            # Record the change event so downstream consumers see the add.
            session.changes.add(
                {
                    "timestamp": utcnow(),
                    "entity": "DocumentsBundle",
                    "id": adapter.id(),
                }
            )
        except AlreadyExists as exc:
            logger.info(str(exc))
def import_journals(json_file: str, session: Session):
    """Facade with the step-by-step processing and loading of journals from
    JSON format into the Kernel store.

    A missing or malformed input file is logged at DEBUG and the import is
    aborted; journals already present are logged at INFO and skipped.
    """
    # The outer try intentionally covers the whole body, matching the
    # original scope for FileNotFoundError/ValueError.
    try:
        raw_journals = reading.read_json_file(json_file)
        kernel_journals = conversion.conversion_journals_to_kernel(
            journals=raw_journals
        )

        for kernel_journal in kernel_journals:
            adapter = ManifestDomainAdapter(manifest=kernel_journal)
            try:
                session.journals.add(data=adapter)
                # Record the change event so downstream consumers see the add.
                session.changes.add(
                    {"timestamp": utcnow(), "entity": "Journal", "id": adapter.id()}
                )
            except AlreadyExists as exc:
                logger.info(str(exc))
    except (FileNotFoundError, ValueError) as exc:
        logger.debug(str(exc))
def register_documents_in_documents_bundle(session_db, file_documents: str, file_journals: str) -> None:
    """Group documents by their bundle (issue or ahead-of-print) and link
    each group's items into the corresponding DocumentsBundle.

    Bundles that cannot be fetched are appended to the
    ``insert_documents_in_bundle.err`` error file.
    """
    err_filename = os.path.join(config.get("ERRORS_PATH"), "insert_documents_in_bundle.err")

    not_registered = []

    journals = reading.read_json_file(file_journals)
    documents = reading.read_json_file(file_documents)

    # Map every ISSN variant (print / electronic / scielo) to the journal's
    # ISSN ID so documents can be matched regardless of which variant they
    # carry.
    data_journal = {}
    for journal in journals:
        o_journal = Journal(journal)
        if o_journal.print_issn:
            data_journal[o_journal.print_issn] = o_journal.scielo_issn
        if o_journal.electronic_issn:
            data_journal[o_journal.electronic_issn] = o_journal.scielo_issn
        if o_journal.scielo_issn:
            data_journal[o_journal.scielo_issn] = o_journal.scielo_issn

    documents_bundles = {}
    for scielo_id, document in documents.items():
        # A document with a volume or number belongs to a regular issue;
        # otherwise it goes into the journal's ahead-of-print bundle.
        is_issue = bool(document.get("volume") or document.get("number"))

        # First ISSN variant present in the document wins.
        issn = ""
        for issn_type in ("eissn", "pissn", "issn"):
            issn = document.get(issn_type)
            if issn:
                break

        if is_issue:
            bundle_id = scielo_ids_generator.issue_id(
                data_journal[issn],
                document.get("year"),
                document.get("volume"),
                document.get("number"),
                document.get("supplement"),
            )
        else:
            bundle_id = scielo_ids_generator.aops_bundle_id(data_journal[issn])

        documents_bundles.setdefault(bundle_id, {})
        documents_bundles[bundle_id].setdefault("items", [])
        documents_bundles[bundle_id]["items"].append(
            {"id": scielo_id, "order": document.get("order", "")}
        )
        documents_bundles[bundle_id]["data"] = {
            "is_issue": is_issue,
            "bundle_id": bundle_id,
            # BUG FIX: was data_journal[document.get("issn")], which raised
            # KeyError (or mapped the wrong journal) whenever the ISSN was
            # resolved from "eissn"/"pissn" above. Use the resolved `issn`.
            "issn": data_journal[issn],
        }

    for documents_bundle in documents_bundles.values():
        data = documents_bundle["data"]
        items = documents_bundle["items"]
        try:
            documents_bundle = get_documents_bundle(session_db, data["bundle_id"], data["is_issue"], data["issn"])
        except ValueError:
            # Unused `as error` binding removed; the id is recorded instead.
            files.write_file(err_filename, data["bundle_id"] + "\n", "a")
            not_registered.append(data["bundle_id"])
        else:
            link_documents_bundles_with_documents(documents_bundle, items, session_db)
def register_documents_in_documents_bundle(
    session_db, file_documents: str, file_journals: str
) -> None:
    """Group documents by their bundle (issue or ahead-of-print) and link
    each group's items into the corresponding DocumentsBundle.

    ``file_documents`` is read as JSON Lines (one document per line).
    Documents whose ISSN cannot be resolved, and bundles that cannot be
    fetched, are appended to the ``insert_documents_in_bundle.err`` file.
    """
    journals = reading.read_json_file(file_journals)

    # Map every ISSN variant (print / electronic / scielo) to the journal's
    # ISSN ID so documents match regardless of which variant they carry.
    data_journal = {}
    for journal_entry in journals:
        journal_obj = Journal(journal_entry)
        issn_variants = (
            journal_obj.print_issn,
            journal_obj.electronic_issn,
            journal_obj.scielo_issn,
        )
        for variant in issn_variants:
            if variant:
                data_journal[variant] = journal_obj.scielo_issn

    def get_issn(document, data_journal=data_journal):
        """Return the ISSN ID of the journal the document belongs to, or
        None when no variant resolves."""
        for key in ("eissn", "pissn", "issn"):
            raw_value = document.get(key)
            if raw_value is not None:
                candidate = raw_value.strip()
                # Only accept a variant that is actually known; otherwise
                # keep trying the remaining keys.
                if data_journal.get(candidate) is not None:
                    return data_journal[candidate]

    def get_bundle_info(issn, document):
        """Return ``(is_issue, bundle_id)`` for the document.

        A document is an issue member exactly when its bundle id differs
        from the journal's ahead-of-print bundle id.
        """
        bundle_id = scielo_ids_generator.any_bundle_id(
            issn,
            document.get("year"),
            document.get("volume"),
            document.get("number"),
            document.get("supplement"),
        )
        is_issue = bundle_id != scielo_ids_generator.aops_bundle_id(issn)
        return is_issue, bundle_id

    err_filename = os.path.join(
        config.get("ERRORS_PATH"), "insert_documents_in_bundle.err"
    )

    with open(file_documents) as f:
        document_lines = f.readlines()

    documents_bundles = {}
    for line in document_lines:
        document = json.loads(line)
        issn_id = get_issn(document)
        if issn_id is None:
            logger.error("No ISSN in document '%s'", document["pid_v3"])
            files.write_file(err_filename, document["pid_v3"] + "\n", "a")
            continue

        is_issue, bundle_id = get_bundle_info(issn_id, document)
        bundle = documents_bundles.setdefault(bundle_id, {})
        bundle.setdefault("items", []).append(
            {"id": document.pop("pid_v3"), "order": document.get("order", "")}
        )
        bundle["data"] = {
            "is_issue": is_issue,
            "bundle_id": bundle_id,
            "issn": issn_id,
        }

    for documents_bundle in documents_bundles.values():
        data = documents_bundle["data"]
        items = documents_bundle["items"]
        try:
            documents_bundle = get_documents_bundle(
                session_db, data["bundle_id"], data["is_issue"], data["issn"]
            )
        except ValueError as exc:
            logger.error(
                "The bundle '%s' was not updated. During executions "
                "this following exception was raised '%s'.",
                data["bundle_id"],
                exc,
            )
            content = json.dumps({"issue": data["bundle_id"], "items": items})
            files.write_file(err_filename, content + "\n", "a")
        else:
            link_documents_bundles_with_documents(documents_bundle, items, session_db)