Example #1
@classmethod
def setUpClass(cls):
    cls.dj = DataJson(cls.get_sample("full_data.json"))
    cls.catalog = readers.read_catalog(
        cls.get_sample("full_data.json"))
    cls.maxDiff = None
    cls.longMessage = True
    # Mock all HTTP traffic: GET requests pass through to the real
    # network, while every HEAD request is answered with a 200.
    cls.requests_mock = requests_mock.Mocker()
    cls.requests_mock.start()
    cls.requests_mock.get(requests_mock.ANY, real_http=True)
    cls.requests_mock.head(requests_mock.ANY, status_code=200)
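For context, the two requests_mock matchers above are catch-alls: GET traffic passes through to the real network, while every HEAD request is answered with a 200, so URL availability checks in the tests always succeed. A minimal standalone sketch of the same pattern (the URL is a placeholder, not one the tests actually use):

import requests
import requests_mock

# Every HEAD request gets a mocked 200; no network involved.
with requests_mock.Mocker() as m:
    m.head(requests_mock.ANY, status_code=200)
    print(requests.head("https://example.org/data.json").status_code)  # 200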
Example #2
def process_catalog(org, datajson):
    """Descarga y procesa el catálogo correspondiente a la organización."""
    logger.info('=== Catálogo %s ===', org.upper())
    os.chdir(org)
    try:
        config = ORGANISMS[org]

        logger.info('- Reading catalog')
        # For XLSX catalogs, creates corresponding JSON
        file_ext = config["formato"]
        if file_ext == 'xlsx':
            res = requests.get(config['url'])
            with open('data.xlsx', 'wb') as xlsx_file:
                xlsx_file.write(res.content)
            logger.info('- Transforming XLSX to JSON')
            catalog = read_catalog('data.xlsx')

        elif file_ext == 'json':
            catalog = read_catalog(config['url'])

        elif file_ext == 'ckan':
            catalog = read_ckan_catalog(config['url'])

        else:
            raise ValueError(
                '{} is not a valid extension for a catalog.'.format(file_ext))

        logger.info('- Writing catalog')
        write_json_catalog(catalog, 'data.json')

        # Creates README and auxiliary reports
        logger.info('- Generating reports')
        datajson.generate_catalog_readme(catalog, export_path='README.md')
        datajson.generate_datasets_summary(catalog, export_path='datasets.csv')
    except Exception:
        logger.error('Error processing catalog for %s', org, exc_info=True)
    finally:
        os.chdir('..')  # Returns to parent dir.
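The function assumes a module-level ORGANISMS mapping (not shown in this excerpt) with at least a "formato" and a "url" key per organization. A hypothetical sketch of one entry, just to make the lookup concrete:

# Hypothetical sketch; the real mapping lives elsewhere in the script.
# "formato" and "url" are the only keys the excerpt actually reads.
ORGANISMS = {
    "example-org": {
        "formato": "json",                       # one of "xlsx", "json", "ckan"
        "url": "https://example.org/data.json",  # placeholder URL
    },
}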
Example #3
def __init__(self,
             catalog,
             validator=None,
             verify_ssl=True,
             url_check_timeout=1,
             threads_count=1):
    # Read the catalog and precompute a per-dataset summary; URL checks
    # are configured through the remaining keyword arguments.
    self.download_url_ok = None
    self.catalog = read_catalog(catalog)
    self.summary = generate_datasets_summary(self.catalog,
                                             validator=validator,
                                             verify_ssl=verify_ssl)
    self.verify_url = verify_ssl
    self.url_check_timeout = url_check_timeout
    self.threads_count = threads_count
Example #4
def generate_readme(catalog, export_path=None):
    """Genera una descripción textual en formato Markdown sobre los
    metadatos generales de un catálogo (título, editor, fecha de
    publicación, et cetera), junto con:
        - estado de los metadatos a nivel catálogo,
        - estado global de los metadatos,
        - cantidad de datasets federados y no federados,
        - detalles de los datasets no federados
        - cantidad de datasets y distribuciones incluidas

    Es utilizada por la rutina diaria de `libreria-catalogos` para generar
    un README con información básica sobre los catálogos mantenidos.

    Args:
        catalog (str o dict): Path a un catálogo en cualquier formato,
            JSON, XLSX, o diccionario de python.
        export_path (str): Path donde exportar el texto generado (en
            formato Markdown). Si se especifica, el método no devolverá
            nada.

    Returns:
        str: Texto de la descripción generada.
    """
    # If a path was passed, keep a reference to it
    if isinstance(catalog, string_types):
        catalog_path_or_url = catalog
    else:
        catalog_path_or_url = None

    catalog = read_catalog(catalog)
    validation = validate_catalog(catalog)
    # Only the indicators for a single catalog are needed
    indicators = generate_catalogs_indicators(catalog, CENTRAL_CATALOG)[0][0]

    with io.open(os.path.join(TEMPLATES_PATH, 'catalog_readme.txt'),
                 'r',
                 encoding='utf-8') as template_file:
        readme_template = template_file.read()

        not_federated_datasets_list = "\n".join([
            "- [{}]({})".format(dataset[0], dataset[1])
            for dataset in indicators["datasets_no_federados"]
        ])
        federated_removed_datasets_list = "\n".join([
            "- [{}]({})".format(dataset[0], dataset[1])
            for dataset in indicators["datasets_federados_eliminados"]
        ])
        federated_datasets_list = "\n".join([
            "- [{}]({})".format(dataset[0], dataset[1])
            for dataset in indicators["datasets_federados"]
        ])
        federated_pct = indicators["datasets_federados_pct"]
        non_federated_pct = (1.0 - federated_pct
                             if federated_pct is not None else None)
        content = {
            "title": catalog.get("title"),
            "publisher_name": traverse_dict(catalog, ["publisher", "name"]),
            "publisher_mbox": traverse_dict(catalog, ["publisher", "mbox"]),
            "catalog_path_or_url": catalog_path_or_url,
            "description": catalog.get("description"),
            "global_status": validation["status"],
            "catalog_status": validation["error"]["catalog"]["status"],
            "no_of_datasets": len(catalog["dataset"]),
            "no_of_distributions": sum(
                len(dataset["distribution"])
                for dataset in catalog["dataset"]),
            "federated_datasets": indicators["datasets_federados_cant"],
            "not_federated_datasets": indicators["datasets_no_federados_cant"],
            "not_federated_datasets_pct": non_federated_pct,
            "not_federated_datasets_list": not_federated_datasets_list,
            "federated_removed_datasets_list": federated_removed_datasets_list,
            "federated_datasets_list": federated_datasets_list,
        }

        catalog_readme = readme_template.format(**content)

    if export_path:
        with io.open(export_path, 'w+', encoding='utf-8') as target:
            target.write(catalog_readme)
    else:
        return catalog_readme
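Following the docstring, usage could look like this (the paths are hypothetical):

# With export_path set, the Markdown is written to disk and nothing is returned.
generate_readme("data.json", export_path="README.md")

# Without it, the generated Markdown text is returned instead.
readme_text = generate_readme("data.json")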
Example #5
def __init__(self, central_catalog, catalog):
    self.central_catalog = read_catalog(central_catalog)
    self.catalog = read_catalog(catalog)
    self.filtered_central = filter_by_likely_publisher(
        self.central_catalog.get('dataset', []),
        self.catalog.get('dataset', []))
Example #6
@classmethod
def setUpClass(cls):
    cls.dj = DataJson(cls.get_sample("full_data.json"))
    cls.catalog = readers.read_catalog(cls.get_sample("full_data.json"))
    cls.maxDiff = None
    cls.longMessage = True