def pack_article_xml(file_xml_path):
    """Build the SPS package for one XML file.

    Loads the XML, renames its assets through ``SPS_Package``, hands the
    assets and renditions to ``packing_assets`` and finally writes
    ``manifest.json`` and ``<package_name>.xml`` into the directory that
    ``packing_assets`` returns.

    Args:
        file_xml_path: Path to the XML file to be packed.
    """
    base_name, _ = files.extract_filename_ext_by_path(file_xml_path)
    xml_root = xml.file2objXML(file_xml_path)
    package = SPS_Package(xml_root, base_name)

    # Destination for complete packages and for incomplete ones.
    complete_root = config.get("SPS_PKG_PATH")
    incomplete_root = config.get("INCOMPLETE_SPS_PKG_PATH")
    complete_dir = os.path.join(complete_root, base_name)
    incomplete_dir = os.path.join(incomplete_root, base_name)

    # The set() round-trip drops duplicated asset entries.
    assets = list(set(package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(assets))

    renditions, renditions_metadata = package.get_renditions_metadata()
    logger.info("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        assets + renditions,
        complete_dir,
        incomplete_dir,
        package.package_name,
    )

    manifest_path = os.path.join(package_path, "manifest.json")
    files.write_file(manifest_path, json.dumps(renditions_metadata))

    xml_output_path = os.path.join(package_path, "%s.xml" % package.package_name)
    xml.objXML2file(xml_output_path, xml_root)
def convert_article_xml(file_xml_path):
    """Convert one XML file to SPS 1.9 and save it under ``CONVERSION_PATH``.

    The output file name is ``<name>.<languages>.<ext>``, where
    ``<languages>`` is the package languages joined by ``-``.

    Args:
        file_xml_path: Path to the XML file to be converted.
    """
    tree = xml.loadToXML(file_xml_path)
    root = tree.getroot()
    for attribute, value in (("specific-use", "sps-1.9"), ("dtd-version", "1.1")):
        root.set(attribute, value)

    package = SPS_Package(tree)
    # Convert the body to SPS.
    package.transform_body()
    # Convert pub-date to SPS 1.9.
    package.transform_pubdate()
    # Build the scielo-id in the converted XML.
    package.create_scielo_id()
    # Remove the <counts> tag from the XML.
    package.transform_article_meta_count()

    joined_languages = "-".join(package.languages)
    stem, extension = os.path.basename(file_xml_path).rsplit(".", 1)
    target_name = "%s.%s.%s" % (stem, joined_languages, extension)
    target_path = os.path.join(config.get("CONVERSION_PATH"), target_name)

    xml.objXML2file(target_path, package.xmltree, pretty=True)
def update_xml_with_alternatives(self, assets_alternatives, sps_package, xml_target_path):
    """Add alternative image references to a copy of the package XML and save it.

    For every element whose ``xlink:href`` equals an asset file name, one
    sibling element per alternative name is added inside an
    ``<alternatives>`` element. The package tree itself is not modified:
    the changes are made on a deep copy, which is written to
    ``xml_target_path``.

    Args:
        assets_alternatives: Mapping of asset file name to an iterable of
            alternative file names.
        sps_package: Object exposing the XML tree as ``xmltree``.
        xml_target_path: Path where the updated XML is written.
    """

    def add_alternative_to_alternatives_tag(image_element, image_filename):
        """Register ``image_filename`` as an alternative of ``image_element``."""
        image_parent = image_element.getparent()
        new_alternative = etree.Element(image_element.tag)
        new_alternative.set("{http://www.w3.org/1999/xlink}href", image_filename)
        if image_parent.tag == "alternatives":
            image_parent.append(new_alternative)
            return

        alternative_node = etree.Element("alternatives")
        # The text that followed the image must now follow the wrapper.
        alternative_node.tail = image_element.tail
        image_element.tail = None
        # Swap the wrapper in at the image's own position. Appending it to
        # the parent would move the image (and its tail text) after every
        # remaining sibling, changing the document order.
        image_parent.replace(image_element, alternative_node)
        alternative_node.append(image_element)
        alternative_node.append(new_alternative)

    _xmltree = deepcopy(sps_package.xmltree)
    for asset_filename, alternatives in assets_alternatives.items():
        for new_name in alternatives:
            logger.debug(
                'New alternative name for asset "%s": "%s"', asset_filename, new_name
            )
            asset_elems = _xmltree.findall(
                f'.//*[@xlink:href="{asset_filename}"]',
                namespaces={"xlink": "http://www.w3.org/1999/xlink"},
            )
            for elem in asset_elems:
                add_alternative_to_alternatives_tag(elem, new_name)

    # Save the XML with the changes.
    xml.objXML2file(xml_target_path, _xmltree, pretty=True)
def article_xml_constructor(file_xml_path: str, dest_path: str, pid_database_engine, in_place: bool) -> None:
    """Ensure the XML carries a PID v3 and write it out.

    If the PID database already has a PID v3 for the document's PID v2,
    that value is set on the package. Otherwise a new scielo-id is created
    in the XML and the PID v2/v3 pair is registered in the database.

    Args:
        file_xml_path: Path to the XML file to process.
        dest_path: Output directory, used only when ``in_place`` is false.
        pid_database_engine: Handle passed through to ``pid_manager``.
        in_place: When true, the source file itself is overwritten.
    """
    logger.debug("file: %s", file_xml_path)

    package = SPS_Package(xml.loadToXML(file_xml_path))
    pid_v2 = package.scielo_pid_v2

    if pid_manager.check_pid_v3_by_v2(pid_database_engine, pid_v2):
        # A PID v3 is already registered: it has to be added to the XML.
        package.scielo_pid_v3 = pid_manager.get_pid_v3_by_v2(
            pid_database_engine, pid_v2
        )
    else:
        # Build the scielo-id in the XML, then register both PIDs.
        package.create_scielo_id()
        pid_manager.create_pid(pid_database_engine, pid_v2, package.scielo_pid_v3)

    output_path = (
        file_xml_path
        if in_place
        else os.path.join(dest_path, os.path.basename(file_xml_path))
    )
    xml.objXML2file(output_path, package.xmltree, pretty=True)
def pack_article_xml(file_xml_path):
    """Build the SPS package for one XML file.

    Empties the package directory, renames the assets through
    ``SPS_Package``, hands them to ``packing_assets`` and writes
    ``<package_name>.xml`` into the directory that ``packing_assets``
    returns.

    Args:
        file_xml_path: Path to the XML file to be packed.
    """
    base_name, _ = files.extract_filename_ext_by_path(file_xml_path)
    xml_root = xml.file2objXML(file_xml_path)
    package = SPS_Package(xml_root, base_name)

    # Destination for complete packages and for incomplete ones.
    complete_root = config.get("SPS_PKG_PATH")
    incomplete_root = config.get("INCOMPLETE_SPS_PKG_PATH")
    complete_dir = os.path.join(complete_root, base_name)
    incomplete_dir = os.path.join(incomplete_root, base_name)

    files.make_empty_dir(complete_dir)

    # The set() round-trip drops duplicated asset entries.
    assets = list(set(package.replace_assets_names()))
    logger.info("%s possui %s ativos digitais", file_xml_path, len(assets))

    package_path = packing_assets(
        assets, complete_dir, incomplete_dir, package.package_name
    )

    xml_output_path = os.path.join(package_path, "%s.xml" % package.package_name)
    xml.objXML2file(xml_output_path, xml_root)
def test_objXML2file(self):
    """``objXML2file`` writes a UTF-8 declaration and keeps accented text."""
    xml_obj = etree.fromstring("""<root> <p>TEXTO é ç á à è</p> </root>""")

    # TemporaryDirectory removes the directory afterwards, so repeated
    # runs do not leave files behind.
    with tempfile.TemporaryDirectory() as test_dir:
        file_name = os.path.join(test_dir, "test.xml")
        xml.objXML2file(file_name, xml_obj)

        # Decode explicitly as UTF-8 (the encoding asserted below) so the
        # result does not depend on the platform default encoding.
        with open(file_name, encoding="utf-8") as f:
            text = f.read()

    self.assertIn("<?xml version='1.0' encoding='utf-8'?>", text)
    self.assertIn("é ç á à è", text)
def article_xml_constructor(file_xml_path: str, dest_path: str) -> None:
    """Create the scielo-id in an XML file and save the result in ``dest_path``.

    Args:
        file_xml_path: Path to the XML file to process.
        dest_path: Directory that receives the file, under the same base name.
    """
    logger.debug("file: %s", file_xml_path)

    package = SPS_Package(xml.loadToXML(file_xml_path))
    # Build the scielo-id in the XML.
    package.create_scielo_id()

    output_name = os.path.basename(file_xml_path)
    output_path = os.path.join(dest_path, output_name)
    xml.objXML2file(output_path, package.xmltree, pretty=True)
def update_xml_file(self, xml_target_path, row, pack_name):
    """Read the package XML, update it with the article data and save it.

    The update itself is delegated to ``self._update_sps_package_obj``;
    the resulting tree overwrites ``xml_target_path``.

    Args:
        xml_target_path: Path of the XML file, read and then overwritten.
        row: Article data forwarded to ``_update_sps_package_obj``
            (presumably one CSV row; confirm against the caller).
        pack_name: Package name forwarded to ``_update_sps_package_obj``.

    Returns:
        The object returned by ``_update_sps_package_obj``.
    """
    tree = xml.loadToXML(xml_target_path)
    logger.debug('Updating XML "%s" with CSV info', xml_target_path)

    updated_package = self._update_sps_package_obj(
        SPS_Package(tree), pack_name, row, xml_target_path
    )

    # Save the XML with the changes.
    xml.objXML2file(xml_target_path, updated_package.xmltree, pretty=True)
    return updated_package
def article_xml_constructor(file_xml_path: str, dest_path: str, pid_database_engine, in_place: bool) -> None:
    """Register the PID v3 of an XML file and write the file out.

    Args:
        file_xml_path: Path to the XML file to process.
        dest_path: Output directory, used only when ``in_place`` is false.
        pid_database_engine: Handle passed through to ``register_pid_v3``.
        in_place: When true, the source file itself is overwritten.
    """
    logger.debug("file: %s", file_xml_path)

    package = SPS_Package(xml.loadToXML(file_xml_path))
    register_pid_v3(pid_database_engine, package)

    output_path = (
        file_xml_path
        if in_place
        else os.path.join(dest_path, os.path.basename(file_xml_path))
    )
    xml.objXML2file(output_path, package.xmltree, pretty=True)
def convert_article_xml(file_xml_path: str, spy=False, poison_pill=PoisonPill()):
    """Convert one XML file to SPS 1.9 and save it under ``CONVERSION_PATH``.

    Publication dates are completed from the ``<pid v2>.json`` file found
    in ``SOURCE_PATH``. The output file name is ``<name>.<languages>.<ext>``.

    Args:
        file_xml_path: Path to the XML file to be converted.
        spy: Flag forwarded to ``SPS_Package.transform_body``.
        poison_pill: When its ``poisoned`` attribute is truthy the function
            returns without doing anything.
    """
    if poison_pill.poisoned:
        return

    logger.info(os.path.basename(file_xml_path))

    tree = xml.loadToXML(file_xml_path)
    root = tree.getroot()
    for attribute, value in (("specific-use", "sps-1.9"), ("dtd-version", "1.1")):
        root.set(attribute, value)

    package = SPS_Package(tree)
    # Convert the body to SPS.
    package.transform_body(spy)
    # Turn the XML into SPS 1.9.
    package.transform_content()

    # Complete the dates that the article record has but the XML lacks.
    source_dir = Path(config.get("SOURCE_PATH"))
    json_file_path = source_dir / Path(package.scielo_pid_v2 + ".json")
    article = xylose_converter.json_file_to_xylose_article(json_file_path)
    document_pubdate, issue_pubdate = get_article_dates(article)
    package.complete_pub_date(document_pubdate, issue_pubdate)

    # Remove the <counts> tag from the XML.
    package.transform_article_meta_count()

    joined_languages = "-".join(package.languages)
    stem, extension = os.path.basename(file_xml_path).rsplit(".", 1)
    target_name = "%s.%s.%s" % (stem, joined_languages, extension)
    target_path = os.path.join(config.get("CONVERSION_PATH"), target_name)

    xml.objXML2file(target_path, package.xmltree, pretty=True)
def update_articles_mixed_citations(
    source: str,
    output_folder: str = None,
    override: bool = False,
    disable_bar: bool = False,
):
    """Update the ``mixed-citation`` elements of one or more XML files.

    The result can be saved over the source XML itself or, when
    ``output_folder`` is given, as a file with the same name in that
    directory.

    Set ``override`` to ``True`` to overwrite the mixed citations of all
    references; otherwise only the references without a mixed citation
    are updated (default).

    Args:
        source: Path searched for ``.xml`` files.
        output_folder: Optional existing directory that receives the output.
        override: Forwarded to ``SPS_Package.update_mixed_citations``.
        disable_bar: When true the progress bar is disabled.

    Raises:
        FileNotFoundError: If ``source`` does not exist, or ``output_folder``
            is given and does not exist.
    """
    CACHE_DIR = config.get("PARAGRAPH_CACHE_PATH")

    if not os.path.exists(source):
        raise FileNotFoundError("Source path '%s' does not exist" % source)
    elif output_folder is not None and not os.path.exists(output_folder):
        raise FileNotFoundError("Output folder '%s' does not exist" % output_folder)

    def get_references_text_from_paragraphs(paragraphs: list, pid: str) -> dict:
        """Filter the references out of a list of paragraphs.

        A record counts as a reference when it has an index/order
        (``v888``) and its article PID (``v880``) equals ``pid``.

        Params:
            paragraphs (List[dict]): Paragraph records.
            pid (str): Document identifier (PID v2).

        Returns:
            references (Dict[str, str]): Cleaned reference text (``v704``)
                keyed by order, e.g. {"order": "text"}.
        """
        references = {}

        for paragraph in paragraphs:
            article_pid = get_nested(paragraph, "v880", 0, "_", default=None)
            index = get_nested(paragraph, "v888", 0, "_", default=-1)

            if index != -1 and article_pid == pid:
                references[index] = XMLUtils.cleanup_mixed_citation_text(
                    get_nested(paragraph, "v704", 0, "_")
                )

        return references

    def get_output_file_path(original_file, output_folder=None):
        """Return the full path of the output file."""
        if output_folder is None:
            return original_file

        return os.path.join(output_folder, os.path.basename(original_file))

    def get_paragraphs_from_cache(file) -> list:
        """Return the paragraphs stored in a file with one JSON document per line."""
        with open(file, "r") as f:
            # Iterate the file lazily instead of loading every line first.
            return [json.loads(line) for line in f]

    xmls = get_files_in_path(source, extension=".xml")

    with tqdm(total=len(xmls), disable=disable_bar) as pbar:
        # Named ``xml_path`` so the loop variable does not shadow ``xml``.
        for xml_path in xmls:
            try:
                package = SPS_Package(etree.parse(xml_path))
                pid = package.scielo_pid_v2

                if pid is None:
                    logger.error(
                        "Could not update file '%s' because its PID is unknown.",
                        xml_path,
                    )
                    continue

                paragraph_file = f"{CACHE_DIR}/{pid}.json"
                paragraphs = get_paragraphs_from_cache(paragraph_file)
                references = get_references_text_from_paragraphs(paragraphs, pid=pid)
                updated = package.update_mixed_citations(references, override=override)
                output_file = get_output_file_path(xml_path, output_folder)
                XMLUtils.objXML2file(output_file, package.xmltree, pretty=True)

                if len(updated) > 0:
                    logger.debug(
                        "Updated %0.3d references from '%s' file.",
                        len(updated),
                        xml_path,
                    )
            except etree.XMLSyntaxError as e:
                logger.error(e)
            except FileNotFoundError as e:
                logger.error(
                    "Could not update file '%s' " "the exception '%s' occurred.",
                    xml_path,
                    e,
                )
            finally:
                # Tick once per file on every path, including the
                # ``continue`` above, so the bar can reach its total.
                pbar.update(1)
def pack_article_xml(file_xml_path, poison_pill=PoisonPill()):
    """Pack an XML file together with its digital assets.

    Args:
        file_xml_path: Path to the XML file.
        poison_pill: When its ``poisoned`` attribute is truthy the function
            returns without doing anything.

    Returns:
        None. ``manifest.json`` and ``<package_name>.xml`` are written to
        the directory returned by ``packing_assets``.

    Example:
        packing.pack_article_xml(
            os.path.join("S0044-59672003000300002.xml")
        )

    NOTE(review): nothing here catches exceptions, so errors raised by the
    helpers it calls propagate to the caller.
    """
    if poison_pill.poisoned:
        return

    original_filename, _ = files.extract_filename_ext_by_path(file_xml_path)

    obj_xml = xml.file2objXML(file_xml_path)

    sps_package = SPS_Package(obj_xml, original_filename)
    sps_package.fix(
        "article_id_which_id_type_is_other",
        sps_package.scielo_pid_v2 and sps_package.scielo_pid_v2[-5:],
        silently=True,
    )

    # Guard the PID here as the call above does: slicing ``None`` would
    # raise TypeError for a document without a PID v2.
    pid_v2 = sps_package.scielo_pid_v2
    new_issns = ISSNs.get(pid_v2[1:10]) if (ISSNs and pid_v2) else None
    if new_issns:
        sps_package.fix("issns", new_issns, silently=True)

    SPS_PKG_PATH = config.get("SPS_PKG_PATH")
    INCOMPLETE_SPS_PKG_PATH = config.get("INCOMPLETE_SPS_PKG_PATH")

    pkg_path = os.path.join(SPS_PKG_PATH, original_filename)
    incomplete_pkg_path = os.path.join(INCOMPLETE_SPS_PKG_PATH, original_filename)

    # The set() round-trip drops duplicated asset entries.
    asset_replacements = list(set(sps_package.replace_assets_names()))
    logger.debug(
        "%s possui %s ativos digitais", file_xml_path, len(asset_replacements)
    )

    source_json = get_source_json(sps_package.scielo_pid_v2)
    renditions, renditions_metadata = source_json.get_renditions_metadata()
    logger.debug("%s possui %s renditions", file_xml_path, len(renditions))

    package_path = packing_assets(
        asset_replacements + renditions,
        pkg_path,
        incomplete_pkg_path,
        sps_package.package_name,
        sps_package.scielo_pid_v2,
    )

    files.write_file(
        os.path.join(package_path, "manifest.json"),
        json.dumps(renditions_metadata),
    )
    xml.objXML2file(
        os.path.join(package_path, "%s.xml" % (sps_package.package_name)), obj_xml
    )
def optimise_xml_to_web(self, target_path, xml_target_path, pid):
    """Optimise a package XML and its image assets for the web.

    Builds a ``packtools.XMLWebOptimiser`` for the XML, overwrites
    ``xml_target_path`` with the optimised XML and writes the optimised
    assets and thumbnails into ``target_path``. If the optimiser cannot be
    created because of an XML syntax or serialisation error, the error is
    logged and nothing is written.

    Args:
        target_path: Package directory holding the XML and its assets.
        xml_target_path: Path of the XML file, overwritten with the result.
        pid: Document identifier, used only in log and error messages.

    Raises:
        packtools.exceptions.SPPackageError: If a file requested by the
            optimiser cannot be read.
    """
    xml_filename = os.path.basename(xml_target_path)

    def read_file(filename):
        """Return the bytes of ``filename`` located in ``target_path``."""
        file_source_path = os.path.join(target_path, filename)
        try:
            with open(file_source_path, "rb") as file_obj:
                file_bytes = file_obj.read()
        except OSError as exc:
            # Use ``{}`` for every placeholder: the original mixed ``%s``
            # with ``str.format``, which shifted the arguments and dropped
            # the exception text from the message.
            raise packtools.exceptions.SPPackageError(
                "[{}] - Error reading file {} during {} optimization: {}".format(
                    pid, filename, xml_filename, str(exc)
                )
            ) from exc
        logger.debug(
            'File "%s" reading %s bytes', file_source_path, len(file_bytes)
        )
        return file_bytes

    def save_assets(assets):
        """Write each ``(file name, bytes)`` pair into ``target_path``."""
        for asset_filename, asset_bytes in assets:
            if asset_bytes is None:
                logger.error(
                    '[%s] - Error saving image file "%s" referenced in "%s": '
                    "no file bytes",
                    pid,
                    asset_filename,
                    xml_filename,
                )
                continue
            image_target_path = os.path.join(target_path, asset_filename)
            logger.debug('Saving image file "%s"', image_target_path)
            files.write_file_binary(image_target_path, asset_bytes)

    logger.debug("Optimizing XML file %s", xml_filename)
    try:
        xml_web_optimiser = packtools.XMLWebOptimiser(
            xml_filename, os.listdir(target_path), read_file, target_path
        )
    except (etree.XMLSyntaxError, etree.SerialisationError) as exc:
        logger.error(
            '[%s] - Error creating XMLWebOptimiser for "%s": %s',
            pid,
            xml_target_path,
            str(exc),
        )
        return

    optimised_xml = xml_web_optimiser.get_xml_file()
    logger.debug("Saving optimised XML file %s", xml_filename)
    xml.objXML2file(xml_target_path, etree.fromstring(optimised_xml), pretty=True)

    # Save the optimised digital assets, then their thumbnails.
    save_assets(xml_web_optimiser.get_optimised_assets())
    save_assets(xml_web_optimiser.get_assets_thumbnails())