def setUp(cls):
    cls.dj = DataJson(cls.get_sample("full_data.json"))
    cls.catalog = readers.read_catalog(
        cls.get_sample("full_data.json"))
    cls.maxDiff = None
    cls.longMessage = True
    # Mock outgoing HTTP: let GET requests pass through to the network,
    # but stub every HEAD request with a 200 OK.
    cls.requests_mock = requests_mock.Mocker()
    cls.requests_mock.start()
    cls.requests_mock.get(requests_mock.ANY, real_http=True)
    cls.requests_mock.head(requests_mock.ANY, status_code=200)
def process_catalog(org, datajson):
    """Downloads and processes the catalog of the given organization."""
    logger.info('=== Catálogo %s ===', org.upper())
    os.chdir(org)
    try:
        config = ORGANISMS[org]
        logger.info('- Lectura de catálogo')
        # For XLSX catalogs, create the corresponding JSON
        file_ext = config["formato"]
        if file_ext == 'xlsx':
            res = requests.get(config['url'])
            # Write in binary mode: res.content is bytes
            with open('data.xlsx', 'wb') as xlsx_file:
                xlsx_file.write(res.content)
            logger.info('- Transformación de XLSX a JSON')
            catalog = read_catalog('data.xlsx')
        elif file_ext == 'json':
            catalog = read_catalog(config['url'])
        elif file_ext == 'ckan':
            catalog = read_ckan_catalog(config['url'])
        else:
            raise ValueError(
                '%s no es una extension valida para un catalogo.' % file_ext)
        logger.info('- Escritura de catálogo')
        write_json_catalog(catalog, 'data.json')
        # Create README and auxiliary reports
        logger.info('- Generación de reportes')
        datajson.generate_catalog_readme(catalog, export_path='README.md')
        datajson.generate_datasets_summary(catalog, export_path='datasets.csv')
    except Exception:
        logger.error('Error al procesar el catálogo de %s', org, exc_info=True)
    finally:
        os.chdir('..')  # Return to the parent dir.
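# A minimal driver sketch for process_catalog(). The helper name
# process_all_catalogs() is hypothetical; it assumes ORGANISMS maps
# organization names to {"formato": ..., "url": ...} entries as read above,
# and that DataJson() can be instantiated without arguments (as the test
# setup in this module suggests). Each organization gets its own working
# directory, which process_catalog() chdirs into.
def process_all_catalogs():
    datajson = DataJson()
    for org in ORGANISMS:
        if not os.path.isdir(org):
            os.mkdir(org)
        process_catalog(org, datajson)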
def __init__(self, catalog, validator=None, verify_ssl=True,
             url_check_timeout=1, threads_count=1):
    self.download_url_ok = None
    self.catalog = read_catalog(catalog)
    self.summary = generate_datasets_summary(self.catalog,
                                             validator=validator,
                                             verify_ssl=verify_ssl)
    self.verify_url = verify_ssl
    self.url_check_timeout = url_check_timeout
    self.threads_count = threads_count
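# Instantiation sketch for the constructor above. The enclosing class name
# is not shown in this excerpt; "StatusIndicatorsGenerator" is assumed here
# purely for illustration, as are the argument values.
generator = StatusIndicatorsGenerator(
    "data.json",            # path, URL or dict accepted by read_catalog()
    verify_ssl=False,       # skip certificate validation, e.g. internal hosts
    url_check_timeout=5,    # seconds before a URL check gives up
    threads_count=4,        # parallel URL checks
)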
def generate_readme(catalog, export_path=None):
    """Generates a textual description in Markdown format of a catalog's
    general metadata (title, publisher, publication date, et cetera),
    together with:
        - the status of the catalog-level metadata,
        - the global status of the metadata,
        - the number of federated and non-federated datasets,
        - details of the non-federated datasets,
        - the number of datasets and distributions included.

    It is used by the daily routine of `libreria-catalogos` to generate a
    README with basic information about the maintained catalogs.

    Args:
        catalog (str or dict): Path to a catalog in any format (JSON, XLSX)
            or a Python dictionary.
        export_path (str): Path where the generated text (in Markdown
            format) is exported. If specified, the method returns nothing.

    Returns:
        str: Text of the generated description.
    """
    # If a path or URL was passed, keep it for the report
    if isinstance(catalog, string_types):
        catalog_path_or_url = catalog
    else:
        catalog_path_or_url = None

    catalog = read_catalog(catalog)
    validation = validate_catalog(catalog)
    # Indicators are only needed for a single catalog
    indicators = generate_catalogs_indicators(catalog, CENTRAL_CATALOG)[0][0]

    with io.open(os.path.join(TEMPLATES_PATH, 'catalog_readme.txt'),
                 'r', encoding='utf-8') as template_file:
        readme_template = template_file.read()

    not_federated_datasets_list = "\n".join([
        "- [{}]({})".format(dataset[0], dataset[1])
        for dataset in indicators["datasets_no_federados"]
    ])
    federated_removed_datasets_list = "\n".join([
        "- [{}]({})".format(dataset[0], dataset[1])
        for dataset in indicators["datasets_federados_eliminados"]
    ])
    federated_datasets_list = "\n".join([
        "- [{}]({})".format(dataset[0], dataset[1])
        for dataset in indicators["datasets_federados"]
    ])
    non_federated_pct = 1.0 - indicators["datasets_federados_pct"] if \
        indicators["datasets_federados_pct"] is not None else \
        indicators["datasets_federados_pct"]

    content = {
        "title": catalog.get("title"),
        "publisher_name": traverse_dict(catalog, ["publisher", "name"]),
        "publisher_mbox": traverse_dict(catalog, ["publisher", "mbox"]),
        "catalog_path_or_url": catalog_path_or_url,
        "description": catalog.get("description"),
        "global_status": validation["status"],
        "catalog_status": validation["error"]["catalog"]["status"],
        "no_of_datasets": len(catalog["dataset"]),
        "no_of_distributions": sum([
            len(dataset["distribution"]) for dataset in catalog["dataset"]
        ]),
        "federated_datasets": indicators["datasets_federados_cant"],
        "not_federated_datasets": indicators["datasets_no_federados_cant"],
        "not_federated_datasets_pct": non_federated_pct,
        "not_federated_datasets_list": not_federated_datasets_list,
        "federated_removed_datasets_list": federated_removed_datasets_list,
        "federated_datasets_list": federated_datasets_list,
    }
    catalog_readme = readme_template.format(**content)

    if export_path:
        with io.open(export_path, 'w+', encoding='utf-8') as target:
            target.write(catalog_readme)
    else:
        return catalog_readme
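# Usage sketch for generate_readme(), assuming a data.json catalog file is
# available locally (the path is illustrative). With export_path the Markdown
# is written to disk and nothing is returned; without it, the text comes back.
readme_text = generate_readme("data.json")
generate_readme("data.json", export_path="README.md")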
def __init__(self, central_catalog, catalog):
    self.central_catalog = read_catalog(central_catalog)
    self.catalog = read_catalog(catalog)
    # Keep only the central datasets whose publisher likely matches
    # this catalog's datasets
    self.filtered_central = filter_by_likely_publisher(
        self.central_catalog.get('dataset', []),
        self.catalog.get('dataset', []))
def setUp(cls):
    cls.dj = DataJson(cls.get_sample("full_data.json"))
    cls.catalog = readers.read_catalog(cls.get_sample("full_data.json"))
    cls.maxDiff = None
    cls.longMessage = True