def pack_article_xml(file_xml_path):
    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    bad_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)

    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.info("%s has %s digital assets", file_xml_path, len(asset_replacements))

    renditions, renditions_metadata = sps_package.get_renditions_metadata()
    logger.info("%s has %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        bad_pkg_path,
        sps_package.package_name,
    )

    files.write_file(
        os.path.join(package_path, "manifest.json"), json.dumps(renditions_metadata)
    )
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)), obj_xml
    )

def packing_assets(asset_replacements, pkg_path, bad_pkg_path, pkg_name):
    """Returns the package path (``pkg_path`` or ``bad_pkg_path``)."""
    errors = []
    if not os.path.isdir(pkg_path):
        files.make_empty_dir(pkg_path)

    for old_path, new_fname in asset_replacements:
        error = download_asset(old_path, new_fname, pkg_path)
        if error:
            errors.append((old_path, new_fname, error))

    if len(errors) > 0:
        # ensure complete and incomplete packages live in different folders
        if pkg_path == bad_pkg_path:
            bad_pkg_path += "_INCOMPLETE"
        # move the incomplete package to the incomplete-packages folder
        files.make_empty_dir(bad_pkg_path)
        for item in os.listdir(pkg_path):
            shutil.move(os.path.join(pkg_path, item), bad_pkg_path)
        shutil.rmtree(pkg_path)
        # write the error report
        errors_filename = os.path.join(bad_pkg_path, "%s.err" % pkg_name)
        error_messages = "\n".join(["%s %s %s" % _err for _err in errors])
        files.write_file(errors_filename, error_messages)
        return bad_pkg_path
    return pkg_path

def register_documents(session_db, storage, documents_sorter, folder) -> None:
    """Runs the SPS package import process over the given directory.

    The import proceeds in stages: registering assets/renditions in the
    given object storage, registering the manifest in the given Kernel
    database, and sorting the documents into a ``documents_sorter`` for
    later association with their respective issues."""

    err_filename = os.path.join(config.get("ERRORS_PATH"), "insert_documents.err")

    for path, _, sps_files in os.walk(folder):
        if not sps_files:
            continue

        try:
            xml = list(filter(lambda f: f.endswith(".xml"), sps_files))[0]
            xml_path = os.path.join(path, xml)
            constructor.article_xml_constructor(xml_path, path, False)
            registration_result = register_document(path, session_db, storage)

            if registration_result:
                document_xml, document_id = registration_result
                documents_sorter.insert_document(document_id, document_xml)

        except (IndexError, ValueError, TypeError, exceptions.XMLError) as ex:
            msg = "Could not register document %s: %s" % (path, ex)
            logger.error(msg)
            files.write_file(err_filename, msg, "a")

def article_html_generator(file_xml_path: str, dest_path: str) -> None:
    logger.debug("file: %s", file_xml_path)
    parsed_xml = XML(file_xml_path, no_network=False)
    html_generator = HTMLGenerator.parse(
        parsed_xml,
        valid_only=False,
        css="https://new.scielo.br/static/css/scielo-article.css",
        print_css="https://new.scielo.br/static/css/scielo-bundle-print.css",
        js="https://new.scielo.br/static/js/scielo-article-min.js",
    )

    for lang, trans_result in html_generator:
        fpath, fname = os.path.split(file_xml_path)
        fname, fext = fname.rsplit(".", 1)
        out_fname = ".".join([fname, lang, "html"])
        new_file_html_path = os.path.join(dest_path, out_fname)
        files.write_file(
            new_file_html_path,
            etree.tostring(
                trans_result,
                doctype=u"<!DOCTYPE html>",
                pretty_print=True,
                encoding="utf-8",
                method="html",
            ).decode("utf-8"),
        )

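# A minimal usage sketch for ``article_html_generator`` (the file name is taken
# from the docstring example further below; the destination path is
# hypothetical): it writes one ``<name>.<lang>.html`` file per language
# yielded by the HTMLGenerator.
def _example_article_html_generator():
    article_html_generator("S0044-59672003000300002.xml", "/tmp/htmls")
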
def extract_all_data(list_documents_pids: List[str]): """Extrai documentos XML a partir de uma lista de PIDS de entrada""" pids_to_extract, pids_extracteds, stage_path = files.fetch_stages_info( list_documents_pids, __name__) logger.info("Iniciando extração dos Documentos") count = 0 try: for documents_pid in tqdm( iterable=pids_to_extract, initial=len(pids_extracteds), total=len(list_documents_pids), ): documents_pid = documents_pid.strip() logger.debug("\t coletando dados do Documento '%s'", documents_pid) xml_article = article.ext_article_txt(documents_pid) if xml_article: count += 1 file_path = os.path.join(config.get("SOURCE_PATH"), "%s.xml" % documents_pid) logger.debug("\t Salvando arquivo '%s'", file_path) files.write_file(file_path, xml_article) files.register_latest_stage(stage_path, documents_pid) except KeyboardInterrupt: ... logger.info("\t Total de %s artigos", count)
def packing_assets(
    asset_replacements, pkg_path, incomplete_pkg_path, pkg_name, scielo_pid_v2
):
    """Packs the digital assets and returns the package path.

    Args:
        asset_replacements: list of assets
        pkg_path: package path
        incomplete_pkg_path: path for incomplete packages
        pkg_name: package name
        scielo_pid_v2: PID v2

    Returns:
        The ``pkg_path`` or ``incomplete_pkg_path`` path.

    Raises:
        Does not raise exceptions.
    """
    errors = []
    if not os.path.isdir(pkg_path):
        files.make_empty_dir(pkg_path)

    for old_path, new_fname in asset_replacements:
        try:
            get_asset(old_path, new_fname, pkg_path)
        except AssetNotFoundError as e:
            logger.error(
                "%s",
                {
                    "pid": scielo_pid_v2,
                    "pkg_name": pkg_name,
                    "old_path": old_path,
                    "new_fname": new_fname,
                    "msg": str(e),
                },
            )
            errors.append((old_path, new_fname, str(e)))

    if len(errors) > 0:
        # ensure complete and incomplete packages live in different folders
        if pkg_path == incomplete_pkg_path:
            incomplete_pkg_path += "_INCOMPLETE"
        # move the incomplete package to the incomplete-packages folder
        files.make_empty_dir(incomplete_pkg_path)
        for item in os.listdir(pkg_path):
            shutil.move(os.path.join(pkg_path, item), incomplete_pkg_path)
        shutil.rmtree(pkg_path)
        # write the error report
        errors_filename = os.path.join(incomplete_pkg_path, "%s.err" % pkg_name)
        error_messages = "\n".join(["%s %s %s" % _err for _err in errors])
        files.write_file(errors_filename, error_messages)
        return incomplete_pkg_path
    return pkg_path

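# A minimal usage sketch for ``packing_assets`` above. All values are
# hypothetical; in the real flow they come from ``config`` and from the
# ``SPS_Package`` being processed (see ``pack_article_xml`` below).
def _example_packing_assets():
    asset_replacements = [
        ("/img/revistas/aa/v33n3/a01img01.gif", "a01img01.gif"),
    ]
    final_path = packing_assets(
        asset_replacements,
        pkg_path="/tmp/sps_packages/S0044-59672003000300002",
        incomplete_pkg_path="/tmp/incomplete_sps_packages/S0044-59672003000300002",
        pkg_name="package-example",
        scielo_pid_v2="S0044-59672003000300002",
    )
    # final_path is the incomplete-package directory when any asset could not
    # be fetched, otherwise the regular package directory.
    return final_path
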
def extrated_journal_data(obj_journal):
    logger.info("\t collecting data for journal '%s'", obj_journal.title)
    list_articles = article.get_all_articles_notXML(obj_journal.scielo_issn)

    for name_article, xml_article in list_articles:
        logger.info("\t saving file '%s'", name_article)
        files.write_file(
            os.path.join(config.get("SOURCE_PATH"), "%s.xml" % name_article),
            xml_article,
        )
    logger.info("\t %s articles in total", len(list_articles))

def test_write_file(self):
    expected_text = "<a><b>bar</b></a>"
    filename = "foo_test.txt"

    try:
        files.write_file(filename, expected_text)
        with open(filename, "r") as f:
            text = f.read()
    finally:
        os.remove(filename)

    self.assertEqual(expected_text, text)

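# A hedged companion test: several callers in this section pass "a" as a
# third argument to ``files.write_file`` when appending to error reports.
# Assuming that argument is a file open-mode flag, an append-mode test could
# look like:
def test_write_file_append(self):
    expected_text = "line one\nline two"
    filename = "foo_test_append.txt"

    try:
        files.write_file(filename, "line one\n")
        files.write_file(filename, "line two", "a")
        with open(filename, "r") as f:
            text = f.read()
    finally:
        os.remove(filename)

    self.assertEqual(expected_text, text)
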
def get_and_write(pid, stage_path, poison_pill):
    if poison_pill.poisoned:
        return
    documents_pid = pid.strip()

    logger.debug("\t collecting data for document '%s'", documents_pid)
    xml_article = article.ext_article_txt(documents_pid)

    if xml_article:
        file_path = os.path.join(config.get("SOURCE_PATH"), "%s.xml" % documents_pid)
        logger.debug("\t saving file '%s'", file_path)
        files.write_file(file_path, xml_article)
        files.register_latest_stage(stage_path, documents_pid)

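# A hedged sketch of how ``get_and_write`` might be driven in parallel,
# assuming ``PoisonPill`` is a flag object whose ``poisoned`` attribute can be
# set to make pending workers return early (as checked above):
def _example_parallel_extract(pids, stage_path):
    import concurrent.futures

    poison_pill = PoisonPill()
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        try:
            futures = [
                executor.submit(get_and_write, pid, stage_path, poison_pill)
                for pid in pids
            ]
            for future in concurrent.futures.as_completed(futures):
                future.result()
        except KeyboardInterrupt:
            # stop the remaining workers before propagating the interrupt
            poison_pill.poisoned = True
            raise
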
def conversion_article_xml(file_xml_path):
    article = files.read_file(file_xml_path)
    obj_xml = etree.fromstring(article)

    obj_html_body = xml.parser_body_xml(obj_xml)

    # overwrite the previously escaped HTML with the newly processed XML
    remove = obj_xml.find("body/p")
    remove.getparent().replace(remove, obj_html_body)

    new_file_xml_path = os.path.join(
        config.get("CONVERSION_PATH"), os.path.split(file_xml_path)[1]
    )

    files.write_file(new_file_xml_path, etree.tostring(obj_xml).decode("utf-8"))

def extract_all_data(list_documents_pids: List[str]):
    logger.info("Starting extraction of the documents")
    count = 0
    for documents_pid in tqdm(list_documents_pids):
        documents_pid = documents_pid.strip()

        logger.debug("\t collecting data for document '%s'", documents_pid)
        xml_article = article.ext_article_txt(documents_pid)

        if xml_article:
            count += 1
            file_path = os.path.join(
                config.get("SOURCE_PATH"), "%s.xml" % documents_pid
            )
            logger.debug("\t saving file '%s'", file_path)
            files.write_file(file_path, xml_article)

    logger.info("\t %s articles in total", count)

def register_documents_in_documents_bundle(
    session_db, documents_sorted_in_bundles
) -> None:
    err_filename = os.path.join(
        config.get("ERRORS_PATH"), "insert_documents_in_bundle.err"
    )

    not_registered = []
    for key, documents_bundle in documents_sorted_in_bundles.items():
        data = documents_bundle["data"]
        items = documents_bundle["items"]
        try:
            documents_bundle = get_documents_bundle(session_db, data)
        except ValueError as error:
            files.write_file(err_filename, key + "\n", "a")
            not_registered.append(key)
        else:
            link_documents_bundles_with_documents(documents_bundle, items, session_db)

def register_documents(session_db, storage, documents_sorter) -> None: logger.info("Iniciando Envio dos do xmls") list_folders = files.list_files(config.get("SPS_PKG_PATH")) err_filename = os.path.join(config.get("ERRORS_PATH"), "insert_documents.err") for folder in list_folders: try: document_path = os.path.join(config.get("SPS_PKG_PATH"), folder) registration_result = register_document(document_path, session_db, storage) if registration_result: document_xml, document_id = registration_result documents_sorter.insert_document(document_id, document_xml) except Exception as ex: msg = "Falha ao registrar documento %s: %s" % (document_path, ex) logger.error(msg) files.write_file(err_filename, msg, "a")
def manage_error_file(errors, err_file, converted_file):
    if os.path.isfile(err_file):
        try:
            os.unlink(err_file)
        except OSError:
            pass
    if errors:
        msg = []
        for err, data in errors.items():
            msg.append(err)
            msg.extend(
                [
                    "{}:{}".format(ln, text)
                    for ln, text in zip(data["lineno"], data["message"])
                ]
            )
        files.write_file(
            err_file,
            "%s %s\n%s"
            % (files.read_file(converted_file), "=" * 80, "\n".join(msg)),
        )

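# A minimal sketch of the ``errors`` structure ``manage_error_file`` consumes:
# each key is an error label, and each value carries parallel ``lineno`` and
# ``message`` lists. The error messages and file paths below are hypothetical,
# and ``converted_file`` must exist on disk.
def _example_manage_error_file():
    errors = {
        "dtd-validation": {
            "lineno": [12, 40],
            "message": ["element not allowed here", "missing attribute"],
        }
    }
    manage_error_file(
        errors,
        err_file="/tmp/converted/example.xml.err",
        converted_file="/tmp/converted/example.xml",
    )
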
def pack_article_xml(file_xml_path, poison_pill=PoisonPill()):
    """Packs an XML and its digital assets.

    Args:
        file_xml_path: path to the XML
        poison_pill: injects a PoisonPill()

    Returns:
        Nothing. Persists the XML in ``package_path``.

    Example:
        packing.pack_article_xml(
            os.path.join("S0044-59672003000300002.xml")
        )

    Raises:
        Does not raise exceptions.
    """
    if poison_pill.poisoned:
        return

    original_filename, ign = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)
    sps_package.fix(
        "article_id_which_id_type_is_other",
        sps_package.scielo_pid_v2 and sps_package.scielo_pid_v2[-5:],
        silently=True,
    )
    new_issns = ISSNs and ISSNs.get(sps_package.scielo_pid_v2[1:10])
    if new_issns:
        sps_package.fix("issns", new_issns, silently=True)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    incomplete_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)

    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.debug(
        "%s has %s digital assets", file_xml_path, len(asset_replacements)
    )

    source_json = get_source_json(sps_package.scielo_pid_v2)
    renditions, renditions_metadata = source_json.get_renditions_metadata()
    logger.debug("%s has %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        incomplete_pkg_path,
        sps_package.package_name,
        sps_package.scielo_pid_v2,
    )

    files.write_file(
        os.path.join(package_path, "manifest.json"), json.dumps(renditions_metadata)
    )
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)), obj_xml
    )

def register_documents_in_documents_bundle(
    session_db, file_documents: str, file_journals: str
) -> None:
    err_filename = os.path.join(
        config.get("ERRORS_PATH"), "insert_documents_in_bundle.err"
    )

    not_registered = []

    journals = reading.read_json_file(file_journals)
    documents = reading.read_json_file(file_documents)

    data_journal = {}
    for journal in journals:
        o_journal = Journal(journal)
        if o_journal.print_issn:
            data_journal[o_journal.print_issn] = o_journal.scielo_issn
        if o_journal.electronic_issn:
            data_journal[o_journal.electronic_issn] = o_journal.scielo_issn
        if o_journal.scielo_issn:
            data_journal[o_journal.scielo_issn] = o_journal.scielo_issn

    documents_bundles = {}
    for scielo_id, document in documents.items():
        is_issue = bool(document.get("volume") or document.get("number"))
        issn = ""
        for issn_type in ("eissn", "pissn", "issn"):
            issn = document.get(issn_type)
            if issn:
                break

        if is_issue:
            bundle_id = scielo_ids_generator.issue_id(
                data_journal[issn],
                document.get("year"),
                document.get("volume"),
                document.get("number"),
                document.get("supplement"),
            )
        else:
            bundle_id = scielo_ids_generator.aops_bundle_id(data_journal[issn])

        documents_bundles.setdefault(bundle_id, {})
        documents_bundles[bundle_id].setdefault("items", [])
        documents_bundles[bundle_id]["items"].append(
            {"id": scielo_id, "order": document.get("order", "")}
        )
        documents_bundles[bundle_id]["data"] = {
            "is_issue": is_issue,
            "bundle_id": bundle_id,
            # use the ISSN resolved above; document.get("issn") may be absent
            # when only eissn/pissn is present
            "issn": data_journal[issn],
        }

    for documents_bundle in documents_bundles.values():
        data = documents_bundle["data"]
        items = documents_bundle["items"]
        try:
            documents_bundle = get_documents_bundle(
                session_db, data["bundle_id"], data["is_issue"], data["issn"]
            )
        except ValueError as error:
            files.write_file(err_filename, data["bundle_id"] + "\n", "a")
            not_registered.append(data["bundle_id"])
        else:
            link_documents_bundles_with_documents(documents_bundle, items, session_db)

def register_documents_in_documents_bundle(
    session_db, file_documents: str, file_journals: str
) -> None:
    journals = reading.read_json_file(file_journals)
    data_journal = {}
    for journal in journals:
        o_journal = Journal(journal)
        for _issn in (
            o_journal.print_issn,
            o_journal.electronic_issn,
            o_journal.scielo_issn,
        ):
            if _issn:
                data_journal[_issn] = o_journal.scielo_issn

    def get_issn(document, data_journal=data_journal):
        """Retrieves the ISSN ID of the journal the document belongs to."""
        for issn_type in ("eissn", "pissn", "issn"):
            if document.get(issn_type) is not None:
                issn_value = document[issn_type].strip()
                if data_journal.get(issn_value) is not None:
                    return data_journal[issn_value]

    def get_bundle_info(issn, document):
        """Builds and returns the ``bundle`` data: its ID and whether it is an issue.

        Args:
            issn (str): ISSN
            document (dict): document data

        Returns:
            tuple (bool, str): whether the bundle is an issue, and the
            ``bundle`` ID (issue or AOP)
        """
        bundle_id = scielo_ids_generator.any_bundle_id(
            issn,
            document.get("year"),
            document.get("volume"),
            document.get("number"),
            document.get("supplement"),
        )
        aops_bundle_id = scielo_ids_generator.aops_bundle_id(issn)
        is_issue = bundle_id != aops_bundle_id
        return is_issue, bundle_id

    err_filename = os.path.join(
        config.get("ERRORS_PATH"), "insert_documents_in_bundle.err"
    )
    with open(file_documents) as f:
        documents = f.readlines()

    documents_bundles = {}
    for document in documents:
        document = json.loads(document)
        issn_id = get_issn(document)
        if issn_id is None:
            logger.error("No ISSN in document '%s'", document["pid_v3"])
            files.write_file(err_filename, document["pid_v3"] + "\n", "a")
            continue
        is_issue, bundle_id = get_bundle_info(issn_id, document)
        documents_bundles.setdefault(bundle_id, {})
        documents_bundles[bundle_id].setdefault("items", [])
        documents_bundles[bundle_id]["items"].append(
            {"id": document.pop("pid_v3"), "order": document.get("order", "")}
        )
        documents_bundles[bundle_id]["data"] = {
            "is_issue": is_issue,
            "bundle_id": bundle_id,
            "issn": issn_id,
        }

    for documents_bundle in documents_bundles.values():
        data = documents_bundle["data"]
        items = documents_bundle["items"]
        try:
            documents_bundle = get_documents_bundle(
                session_db, data["bundle_id"], data["is_issue"], data["issn"]
            )
        except ValueError as exc:
            logger.error(
                "The bundle '%s' was not updated. During execution "
                "the following exception was raised: '%s'.",
                data["bundle_id"],
                exc,
            )
            content = json.dumps({"issue": data["bundle_id"], "items": items})
            files.write_file(err_filename, content + "\n", "a")
        else:
            link_documents_bundles_with_documents(documents_bundle, items, session_db)

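# A small illustration of the rule inside ``get_bundle_info`` above: a document
# that carries volume/number data yields a bundle ID different from the
# journal's AOP bundle ID, so ``is_issue`` is True. The ISSN ID and document
# values are hypothetical.
def _example_bundle_info():
    issn_id = "0044-5967"
    document = {"year": "2003", "volume": "33", "number": "3", "supplement": None}
    bundle_id = scielo_ids_generator.any_bundle_id(
        issn_id,
        document.get("year"),
        document.get("volume"),
        document.get("number"),
        document.get("supplement"),
    )
    is_issue = bundle_id != scielo_ids_generator.aops_bundle_id(issn_id)
    return is_issue, bundle_id
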
def save_file(stage_path, file_path, documents_pid, article_content):
    logger.debug("\t saving file '%s'", file_path)
    files.write_file(file_path, article_content)
    files.register_latest_stage(stage_path, documents_pid)