Example #1
def process_catalog(org, datajson):
    """Descarga y procesa el catálogo correspondiente a la organización."""
    logger.info('=== Catálogo %s ===', org.upper())
    os.chdir(org)
    try:
        config = ORGANISMS[org]

        logger.info('- Reading catalog')
        # For XLSX catalogs, create the corresponding JSON
        file_ext = config["formato"]
        if file_ext == 'xlsx':
            res = requests.get(config['url'], verify=False)
            # res.content is bytes, so the file must be opened in binary mode
            with open('data.xlsx', 'wb') as xlsx_file:
                xlsx_file.write(res.content)
            logger.info('- Converting XLSX to JSON')
            catalog = DataJson('data.xlsx')

        elif file_ext == 'json':
            catalog = read_catalog(config['url'])

        elif file_ext == 'ckan':
            catalog = read_ckan_catalog(config['url'])

        else:
            raise ValueError(
                '{} is not a valid extension for a catalog.'.format(file_ext))

        logger.info('- Writing catalog')
        if catalog:
            write_json_catalog(catalog, 'data.json')
        else:
            raise Exception("Catalog {} could not be generated".format(org))

        # Create the README and auxiliary reports
        logger.info('- Generating reports')
        datajson.generate_catalog_readme(catalog, export_path='README.md')
        datajson.generate_datasets_summary(catalog, export_path='datasets.csv')
    except Exception:
        logger.error(
            'Error processing the catalog for %s', org, exc_info=True)
    finally:
        os.chdir('..')  # Returns to parent dir.
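
A minimal sketch of how process_catalog might be driven, assuming a module-level ORGANISMS dict (its keys and config entries below are illustrative, not from the original module) and that DataJson() can be instantiated without arguments, as in pydatajson:

import os
from pydatajson import DataJson

# Hypothetical config: each entry names the catalog's format and source URL.
ORGANISMS = {
    "acumar": {"formato": "json", "url": "https://example.org/acumar/data.json"},
    "enacom": {"formato": "xlsx", "url": "https://example.org/enacom/catalogo.xlsx"},
}

datajson = DataJson()
for org in ORGANISMS:
    # process_catalog() chdirs into a per-organization folder, so create it first.
    os.makedirs(org, exist_ok=True)
    process_catalog(org, datajson)
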
def main(catalog_id,
         replace=True,
         debug_mode=False,
         debug_distribution_ids=None,
         do_scraping=True,
         do_distributions=True):
    server_environment = helpers.get_general_config()["environment"]

    catalog_json_path = get_catalog_path(catalog_id)
    catalog_sources_dir = get_catalog_scraping_sources_dir(catalog_id)
    catalog_datasets_dir = get_catalog_datasets_dir(catalog_id)

    # in a production environment, ALWAYS replace with the new option
    if server_environment == "prod":
        replace = True

    try:
        catalog = TimeSeriesDataJson(catalog_json_path)
    except Exception:
        logger.error(
            "Error trying to load catalog {}:".format(catalog_id))
        for line in traceback.format_exc().splitlines():
            logger.error(line)
        logger.error("Skipping to the next catalog...")
        return

    logger.info("Datasets: {}".format(len(catalog.get_datasets())))
    logger.info("Distributions: {}".format(
        len(catalog.get_distributions(only_time_series=True))))
    logger.info("Fields: {}".format(len(catalog.get_fields())))

    # build the paths to the IED excel files
    scraping_urls = set(
        catalog.get_distributions(
            only_time_series=True, meta_field="scrapingFileURL"))
    scraping_xlsx_filenames = [os.path.basename(x) for x in scraping_urls]
    scraping_xlsx_paths = [
        os.path.join(catalog_sources_dir, filename)
        for filename in scraping_xlsx_filenames
    ]

    # initialize the lists that hold the reports
    report_files = []
    all_report_datasets = []
    all_report_distributions = []

    # FIRST: walk the catalog looking for distributions with time series
    if do_distributions:
        try:
            report_datasets, report_distributions = analyze_catalog(
                catalog_id,
                catalog,
                catalog_datasets_dir,
                replace=replace,
                debug_mode=debug_mode,
                debug_distribution_ids=debug_distribution_ids)
            all_report_datasets.extend(report_datasets)
            all_report_distributions.extend(report_distributions)

        # KeyboardInterrupt does not inherit from Exception, so handle it explicitly
        except KeyboardInterrupt:
            raise
        except Exception:
            logger.error(
                'Error processing the time series distributions:')
            for line in traceback.format_exc().splitlines():
                logger.error(line)

            if debug_mode:
                raise

    # SECOND: organize the scraping by downloaded Excel file
    if do_scraping:
        msg = "File {}: {} ({})"
        for scraping_xlsx_path in scraping_xlsx_paths:
            logger.info("Scraping: {}".format(scraping_xlsx_path))

            try:
                report_datasets, report_distributions = scrape_file(
                    scraping_xlsx_path,
                    catalog,
                    catalog_datasets_dir,
                    replace=replace,
                    debug_mode=debug_mode,
                    debug_distribution_ids=debug_distribution_ids,
                    catalog_id=catalog_id)

                all_report_datasets.extend(report_datasets)
                all_report_distributions.extend(report_distributions)

                report_files.append({
                    "file_name": scraping_xlsx_path,
                    "file_status": "OK",
                    "file_notes": ""
                })

            except KeyboardInterrupt:
                raise
            except Exception as e:
                report_files.append({
                    "file_name": scraping_xlsx_path,
                    "file_status": "ERROR",
                    "file_notes": repr(e)
                })

                trace_string = traceback.format_exc()
                logger.error(
                    msg.format(scraping_xlsx_path, "ERROR", repr(e)))
                for line in trace_string.splitlines():
                    logger.error(line)
                if debug_mode:
                    raise

    cols_rep_files = ("file_name", "file_status", "file_notes")

    cols_rep_dataset = ("distribution_scrapingFileURL", "dataset_identifier",
                        "dataset_status")

    cols_rep_distribution = ("distribution_scrapingFileURL",
                             "dataset_identifier", "distribution_identifier",
                             "distribution_status", "distribution_notes")

    # concatenate all the reports
    complete_report_files = pd.DataFrame(report_files, columns=cols_rep_files)
    complete_report_datasets = pd.DataFrame(
        all_report_datasets, columns=cols_rep_dataset)
    complete_report_distributions = pd.DataFrame(
        all_report_distributions, columns=cols_rep_distribution)

    # save the files report as Excel
    complete_report_files.to_excel(
        os.path.join(REPORTES_DIR, catalog_id,
                     SCRAPING_MAIL_CONFIG["attachments"]["files_report"]),
        index=False)

    # save the datasets report as Excel
    complete_report_datasets.to_excel(
        os.path.join(REPORTES_DIR, catalog_id,
                     SCRAPING_MAIL_CONFIG["attachments"]["datasets_report"]),
        index=False)

    # save the distributions report as Excel
    complete_report_distributions.to_excel(
        os.path.join(
            REPORTES_DIR, catalog_id,
            SCRAPING_MAIL_CONFIG["attachments"]["distributions_report"]),
        index=False)

    # print the results to the terminal
    indicators = generate_summary_indicators(complete_report_files,
                                             complete_report_datasets,
                                             complete_report_distributions)
    subject, message = generate_summary_message(catalog_id, indicators)

    with open(
            os.path.join(REPORTES_DIR, catalog_id,
                         SCRAPING_MAIL_CONFIG["subject"]), "wb") as f:
        if isinstance(subject, str):
            f.write(subject.encode("utf-8"))
        else:
            f.write(subject)
    with open(
            os.path.join(REPORTES_DIR, catalog_id,
                         SCRAPING_MAIL_CONFIG["message"]), "wb") as f:
        if isinstance(message, str):
            f.write(message.encode("utf-8"))
        else:
            f.write(message)

    logger.info("Escribiendo nueva version de {}".format(catalog_json_path))
    write_json_catalog(catalog, catalog_json_path)

    logger.info("Indicadores:")
    for line in message.splitlines():
        logger.info(line)
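
A hedged sketch of how main might be invoked from the command line; the argparse wrapper and flag names below are assumptions for illustration, not part of the original module:

import argparse

if __name__ == "__main__":
    # Hypothetical CLI wrapper; flag names are illustrative assumptions.
    parser = argparse.ArgumentParser(
        description="Scrape the time series distributions of one catalog.")
    parser.add_argument("catalog_id")
    parser.add_argument("--no-replace", action="store_true")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()

    main(args.catalog_id,
         replace=not args.no_replace,
         debug_mode=args.debug)
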
Example #3
    def write_json_metadata(self):
        file_path = self.get_json_metadata_path()

        self.ensure_dir_exists(os.path.dirname(file_path))
        writers.write_json_catalog(self.metadata, file_path)
    def _generate_json_file_into_model(self, catalog):
        write_json_catalog(catalog, self.json_catalog_dir)
        self._save_generated_file_to_model(self.json_catalog_dir, 'data.json')
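
Both methods delegate to pydatajson's write_json_catalog, which serializes a catalog dict to a JSON file. A minimal round trip with a toy in-memory catalog (the field values are illustrative only):

from pydatajson.readers import read_catalog
from pydatajson.writers import write_json_catalog

# Toy catalog with a few DCAT-style fields; contents are illustrative only.
catalog = {
    "title": "Example catalog",
    "description": "A tiny catalog used only to demonstrate serialization.",
    "dataset": [],
}

write_json_catalog(catalog, "data.json")
assert read_catalog("data.json")["title"] == "Example catalog"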