def setUp(self):
    """Load the two sample SPS packages and open a session for the tests.

    For each sample the fixture stores the package directory, the path of
    its XML file, the tree returned by ``loadToXML`` and a list of package
    file names.
    """
    first_dir = os.path.join(SAMPLES_PATH, "0034-8910-rsp-47-02-0231")
    first_xml = os.path.join(first_dir, "0034-8910-rsp-47-02-0231.xml")
    self.package_path = first_dir
    self.xml_path = first_xml
    self.xml_etree = loadToXML(first_xml)
    self.package_files = [
        "0034-8910-rsp-47-02-0231-en.pdf",
        "0034-8910-rsp-47-02-0231-gf01-en.jpg",
        "0034-8910-rsp-47-02-0231-gf01-en.tif",
        "0034-8910-rsp-47-02-0231-gf01.jpg",
        "0034-8910-rsp-47-02-0231-gf01.tif",
        "0034-8910-rsp-47-02-0231.pdf",
        "0034-8910-rsp-47-02-0231.xml",
    ]

    second_dir = os.path.join(SAMPLES_PATH, "0034-8910-rsp-47-02-0403")
    second_xml = os.path.join(second_dir, "0034-8910-rsp-47-02-0403.xml")
    self.second_package_path = second_dir
    self.second_xml_path = second_xml
    self.second_xml_etree = loadToXML(second_xml)
    self.second_package_files = [
        "0034-8910-rsp-47-02-0403-gf01.jpg",
        "0034-8910-rsp-47-02-0403-gf01.tif",
        "0034-8910-rsp-47-02-0403.pdf",
        "0034-8910-rsp-47-02-0403.xml",
    ]

    self.session = Session()
def convert_article_xml(file_xml_path):
    """Convert the article XML at ``file_xml_path`` and write the result.

    The root element gets ``specific-use="sps-1.9"`` and
    ``dtd-version="1.1"``, the tree goes through the ``SPS_Package``
    transformations, and the result is written pretty-printed to the
    directory configured as ``CONVERSION_PATH`` under the name
    ``<name>.<languages>.<extension>``, where ``<languages>`` is the
    package languages joined by ``-``.
    """
    tree = xml.loadToXML(file_xml_path)
    root = tree.getroot()
    for attribute, value in (("specific-use", "sps-1.9"), ("dtd-version", "1.1")):
        root.set(attribute, value)

    package = SPS_Package(tree)
    # Convert the AM body to SPS.
    package.transform_body()
    # Convert pub-date to SPS 1.9.
    package.transform_pubdate()
    # Build the scielo-id in the converted XML.
    package.create_scielo_id()
    # Remove the <counts> tag from the XML.
    package.transform_article_meta_count()

    joined_languages = "-".join(package.languages)
    # Split on the last dot only, so dots inside the name are preserved.
    stem, extension = os.path.basename(file_xml_path).rsplit(".", 1)
    target_dir = config.get("CONVERSION_PATH")
    output_path = os.path.join(
        target_dir, "%s.%s.%s" % (stem, joined_languages, extension)
    )
    xml.objXML2file(output_path, package.xmltree, pretty=True)
def article_xml_constructor(file_xml_path: str, dest_path: str, pid_database_engine, in_place: bool) -> None:
    """Ensure the XML carries a PID v3 and write it back to disk.

    Args:
        file_xml_path: path of the XML file to load.
        dest_path: directory that receives the XML when ``in_place`` is false.
        pid_database_engine: handle passed to every ``pid_manager`` call.
        in_place: when true, the source file itself is overwritten.
    """
    logger.debug("file: %s", file_xml_path)
    package = SPS_Package(xml.loadToXML(file_xml_path))
    v2 = package.scielo_pid_v2

    # Look up, through the PID v2, whether a PID v3 is already registered.
    if pid_manager.check_pid_v3_by_v2(pid_database_engine, v2):
        # A PID v3 already exists in the database: copy it into the XML.
        package.scielo_pid_v3 = pid_manager.get_pid_v3_by_v2(pid_database_engine, v2)
    else:
        # Build the scielo-id in the XML, then store the v2/v3 pair.
        package.create_scielo_id()
        pid_manager.create_pid(pid_database_engine, v2, package.scielo_pid_v3)

    output_path = (
        file_xml_path
        if in_place
        else os.path.join(dest_path, os.path.basename(file_xml_path))
    )
    xml.objXML2file(output_path, package.xmltree, pretty=True)
def register_document(folder: str, session_db, storage) -> tuple:
    """Register the SPS package found in ``folder`` and its files.

    The package must contain exactly one XML file. The XML, the static
    assets, the additional files and the renditions are sent to ``storage``
    under the package media prefix; a document manifest is then added to
    ``session_db`` together with a change record.

    Args:
        folder: directory holding the SPS package.
        session_db: object exposing ``documents.add`` and ``changes.add``.
        storage: object exposing ``register(path, prefix)``.

    Returns:
        A tuple ``(obj_xml, document_id)`` with the loaded XML tree and the
        id reported by the manifest.

    Raises:
        exceptions.XMLError: when the package has no XML file or more than one.
    """
    logger.info("Processando a Pasta %s", folder)

    list_files = files.list_files(folder)
    xml_files = files.xml_files_list(folder)
    _renditions = [
        file for file in list_files if ".pdf" in file or ".html" in file
    ]

    # Exceptions do not interpolate extra arguments the way logging calls do,
    # so the message is formatted before being handed to XMLError.
    if len(xml_files) > 1:
        raise exceptions.XMLError(
            "Existe %s xmls no pacote SPS" % len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        raise exceptions.XMLError(
            "Não existe XML no pacote SPS: %s" % ex) from ex

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)
    xml_sps = SPS_Package(obj_xml)

    # TODO: some articles may not have self.acron
    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    static_assets, static_additionals = get_document_assets_path(
        obj_xml, list_files, folder)
    registered_assets = put_static_assets_into_storage(
        static_assets, prefix, storage)

    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    # obj_xml is always bound at this point (every earlier failure raises),
    # so the manifest is built unconditionally and the return below can
    # never hit an unbound manifest_data.
    renditions = get_document_renditions(folder, _renditions, prefix, storage)
    manifest_data = ManifestDomainAdapter(
        manifest=manifest.get_document_manifest(
            obj_xml, url_xml, registered_assets, renditions))

    try:
        session_db.documents.add(data=manifest_data)
        session_db.changes.add({
            "timestamp": utcnow(),
            "entity": "Document",
            "id": manifest_data.id()
        })
        logger.info("Document-store save: %s", manifest_data.id())
    except AlreadyExists as exc:
        # A document that already exists is logged, not treated as fatal.
        logger.exception(exc)

    return obj_xml, manifest_data.id()
def register_document(folder: str, session_db, storage) -> tuple:
    """Register the SPS package found in ``folder`` and its media files.

    The package must contain exactly one XML file. The XML and every other
    file of the folder are sent to ``storage`` under the package media
    prefix; a document manifest is then added to ``session_db`` together
    with a change record.

    Args:
        folder: directory holding the SPS package.
        session_db: object exposing ``documents.add`` and ``changes.add``.
        storage: object exposing ``register(path, prefix)``.

    Returns:
        A tuple ``(obj_xml, document_id)`` with the loaded XML tree and the
        id reported by the manifest.

    Raises:
        exceptions.XMLError: when the package has no XML file or more than one.
    """
    logger.info("Processando a Pasta %s", folder)

    list_files = files.list_files(folder)
    xml_files = files.xml_files_list(folder)
    # Sorted so assets are registered and listed in the same order on every
    # run; plain set iteration order changes between processes.
    medias_files = sorted(set(list_files) - set(xml_files))

    # Exceptions do not interpolate extra arguments the way logging calls do,
    # so the message is formatted before being handed to XMLError.
    if len(xml_files) > 1:
        raise exceptions.XMLError(
            "Existe %s xmls no pacote SPS" % len(xml_files))
    try:
        x_file = xml_files[0]
    except IndexError as ex:
        raise exceptions.XMLError(
            "Não existe XML no pacote SPS: %s" % ex) from ex

    xml_path = os.path.join(folder, x_file)
    obj_xml = xml.loadToXML(xml_path)
    xml_sps = SPS_Package(obj_xml)

    prefix = xml_sps.media_prefix
    url_xml = storage.register(xml_path, prefix)

    assets = [
        {
            "asset_id": m_file,
            "asset_url": storage.register(os.path.join(folder, m_file), prefix),
        }
        for m_file in medias_files
    ]

    # obj_xml is always bound at this point (every earlier failure raises),
    # so the manifest is built unconditionally and the return below can
    # never hit an unbound manifest_data.
    manifest_data = ManifestDomainAdapter(
        manifest=manifest.get_document_manifest(obj_xml, url_xml, assets))

    try:
        session_db.documents.add(data=manifest_data)
        session_db.changes.add({
            "timestamp": utcnow(),
            "entity": "Document",
            "id": manifest_data.id()
        })
        logger.info("Document-store save: %s", manifest_data.id())
    except AlreadyExists as exc:
        # A document that already exists is logged, not treated as fatal.
        logger.exception(exc)

    return obj_xml, manifest_data.id()
def article_xml_constructor(file_xml_path: str, dest_path: str) -> None:
    """Add a scielo-id to the XML and write it into ``dest_path``.

    Args:
        file_xml_path: path of the XML file to load.
        dest_path: directory that receives the resulting file, which keeps
            the base name of ``file_xml_path``.
    """
    logger.debug("file: %s", file_xml_path)

    package = SPS_Package(xml.loadToXML(file_xml_path))
    # Build the scielo-id in the converted XML.
    package.create_scielo_id()

    file_name = os.path.basename(file_xml_path)
    output_path = os.path.join(dest_path, file_name)
    xml.objXML2file(output_path, package.xmltree, pretty=True)
def update_xml_file(self, xml_target_path, row, pack_name):
    """Read the package XML and update it with the article data.

    The XML at ``xml_target_path`` is loaded, updated through
    ``_update_sps_package_obj`` using ``row`` and ``pack_name``, and written
    back to the same path.

    Returns:
        The package object returned by ``_update_sps_package_obj``.
    """
    tree = xml.loadToXML(xml_target_path)
    logger.debug('Updating XML "%s" with CSV info', xml_target_path)

    updated_package = self._update_sps_package_obj(
        SPS_Package(tree), pack_name, row, xml_target_path
    )

    # Save the XML with the changes applied.
    xml.objXML2file(xml_target_path, updated_package.xmltree, pretty=True)
    return updated_package
def article_xml_constructor(file_xml_path: str, dest_path: str, pid_database_engine, in_place: bool) -> None:
    """Register the PID v3 for the XML and write the file back to disk.

    Args:
        file_xml_path: path of the XML file to load.
        dest_path: directory that receives the XML when ``in_place`` is false.
        pid_database_engine: handle forwarded to ``register_pid_v3``.
        in_place: when true, the source file itself is overwritten.
    """
    logger.debug("file: %s", file_xml_path)

    package = SPS_Package(xml.loadToXML(file_xml_path))
    register_pid_v3(pid_database_engine, package)

    output_path = (
        file_xml_path
        if in_place
        else os.path.join(dest_path, os.path.basename(file_xml_path))
    )
    xml.objXML2file(output_path, package.xmltree, pretty=True)
def convert_article_xml(file_xml_path: str, spy=False, poison_pill=PoisonPill()):
    """Convert the article XML at ``file_xml_path`` and write the result.

    Nothing is done when ``poison_pill.poisoned`` is true. Otherwise the
    root element gets ``specific-use="sps-1.9"`` and ``dtd-version="1.1"``,
    the tree goes through the ``SPS_Package`` transformations, the
    publication dates are completed from the ``<pid v2>.json`` file found in
    the directory configured as ``SOURCE_PATH``, and the result is written
    pretty-printed to the ``CONVERSION_PATH`` directory under the name
    ``<name>.<languages>.<extension>``.

    Args:
        file_xml_path: path of the XML file to convert.
        spy: forwarded to ``SPS_Package.transform_body``.
        poison_pill: object whose ``poisoned`` attribute aborts the call.
            NOTE(review): the default instance is created once, when the
            function is defined, and shared by every call that omits it.
    """
    if poison_pill.poisoned:
        return

    logger.info(os.path.basename(file_xml_path))

    tree = xml.loadToXML(file_xml_path)
    root = tree.getroot()
    for attribute, value in (("specific-use", "sps-1.9"), ("dtd-version", "1.1")):
        root.set(attribute, value)

    package = SPS_Package(tree)
    # Convert the AM body to SPS.
    package.transform_body(spy)
    # Turn the XML into SPS 1.9.
    package.transform_content()

    # Fill in dates present in the article database but missing in the XML.
    source_dir = Path(config.get("SOURCE_PATH"))
    json_path = source_dir.joinpath(Path(package.scielo_pid_v2 + ".json"))
    xylose_article = xylose_converter.json_file_to_xylose_article(json_path)
    doc_date, issue_date = get_article_dates(xylose_article)
    package.complete_pub_date(doc_date, issue_date)

    # Remove the <counts> tag from the XML.
    package.transform_article_meta_count()

    joined_languages = "-".join(package.languages)
    # Split on the last dot only, so dots inside the name are preserved.
    stem, extension = os.path.basename(file_xml_path).rsplit(".", 1)
    target_dir = config.get("CONVERSION_PATH")
    output_path = os.path.join(
        target_dir, "%s.%s.%s" % (stem, joined_languages, extension)
    )
    xml.objXML2file(output_path, package.xmltree, pretty=True)
def register_document(folder: str, session, storage, pid_database_engine, poison_pill=PoisonPill()) -> None:
    """Register an SPS package in a Kernel instance and its digital assets
    in an object storage.

    Nothing is done when ``poison_pill.poisoned`` is true. The first XML of
    the package is first passed to ``constructor.article_xml_constructor``
    and then loaded. If ``session.documents.fetch`` finds its PID v3, the
    article result is returned without registering anything. Otherwise the
    XML, assets, additional files and renditions are sent to ``storage`` and
    the document is added to ``session``.

    Returns:
        ``None`` when the poison pill is set, otherwise the value of
        ``get_article_result_dict`` for the package.

    Raises:
        exceptions.XMLError: when the package has no XML file or the XML
            cannot be parsed.
    """
    if poison_pill.poisoned:
        return

    logger.debug("Starting the import step for '%s' package.", folder)

    package_files = files.list_files(folder)
    xmls = files.xml_files_list(folder)

    has_xml = xmls is not None and len(xmls) > 0
    if not has_xml:
        raise exceptions.XMLError(
            "There is no XML file into package '%s'. Please verify and try later."
            % folder
        ) from None

    xml_path = os.path.join(folder, xmls[0])
    # The constructor runs before the XML is loaded below.
    constructor.article_xml_constructor(xml_path, folder, pid_database_engine, False)

    try:
        obj_xml = xml.loadToXML(xml_path)
    except lxml.etree.ParseError:
        raise exceptions.XMLError(
            "Could not parse the '%s' file, please validate"
            " this file before then try to import again." % xml_path,
        ) from None

    xml_sps = SPS_Package(obj_xml)
    pid_v3 = xml_sps.scielo_pid_v3

    try:
        session.documents.fetch(id=pid_v3)
    except DoesNotExist:
        already_registered = False
    else:
        already_registered = True

    if already_registered:
        logger.debug(
            "Document '%s' already exist in kernel. Returning article result information",
            pid_v3,
        )
        return get_article_result_dict(xml_sps)

    prefix = xml_sps.media_prefix or ""
    url_xml = storage.register(xml_path, prefix)

    static_assets, static_additionals = get_document_assets_path(
        obj_xml, package_files, folder
    )
    registered_assets = put_static_assets_into_storage(static_assets, prefix, storage)

    for additional_path in static_additionals.values():
        storage.register(os.path.join(additional_path), prefix)

    renditions = get_document_renditions(folder, prefix, storage)
    document_manifest = manifest.get_document_manifest(
        xml_sps, url_xml, registered_assets, renditions
    )
    document = Document(manifest=document_manifest)

    try:
        add_document(session, document)
        if renditions:
            add_renditions(session, document)
    except AlreadyExists as error:
        # An existing document is logged; the result is still returned below.
        logger.error(error)
    else:
        logger.debug("Document with id '%s' was imported.", document.id())

    return get_article_result_dict(xml_sps)