示例#1
0
    def test_validate_all_zero_series(self):
        """A time series containing only zero values is considered valid."""
        sample_path = os.path.join(SAMPLES_DIR, 'ts_all_zero_series.json')
        catalog = DataJson(sample_path)
        first_ts_distribution = catalog.get_distributions(
            only_time_series=True)[0]

        self.assertTrue(self.scrapper.run(first_ts_distribution, catalog))
示例#2
0
    def test_validate_all_null_series(self):
        """Scraping an all-null series completes without raising.

        NOTE(review): no assertion here — presumably the test only checks
        that the scrapper does not raise; confirm that is the intent.
        """
        sample_path = os.path.join(SAMPLES_DIR, 'ts_all_null_series.json')
        catalog = DataJson(sample_path)
        mocked = MockDistribution(
            catalog.get_distributions(only_time_series=True)[0])

        self.scrapper.run(mocked, catalog)
示例#3
0
    def test_scrapper(self):
        """The scrapper accepts a fully valid time-series distribution."""
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        ts_distribution = catalog.get_distributions(only_time_series=True)[0]
        wrapped = MockDistribution(ts_distribution)

        outcome = self.scrapper.run(wrapped, catalog)
        self.assertTrue(outcome)
示例#4
0
    def test_missing_dataframe_column(self):
        """If a column declared in the metadata is missing from the data,
        the distribution is not scraped.
        """
        sample_path = os.path.join(SAMPLES_DIR,
                                   'distribution_missing_column.json')
        catalog = DataJson(sample_path)
        first_ts_distribution = catalog.get_distributions(
            only_time_series=True)[0]

        self.scrapper.run(first_ts_distribution, catalog)
示例#5
0
    def test_missing_metadata_field(self):
        """A field absent from the metadata does not block scraping:
        the distribution is scraped anyway so every possible series
        is collected.
        """
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'missing_field.json'))
        first_ts_distribution = catalog.get_distributions(
            only_time_series=True)[0]

        self.assertTrue(self.scrapper.run(first_ts_distribution, catalog))
def process_catalog(org, datajson):
    """Download and process the catalog belonging to one organization.

    Reads the organization's catalog (XLSX, JSON or CKAN, according to its
    ``ORGANISMS`` config entry), writes it to ``data.json`` inside the
    organization's directory, and generates a README plus a datasets
    summary. Any failure is logged and swallowed (best-effort batch step);
    the working directory is always restored afterwards.

    Args:
        org (str): organization key into the ``ORGANISMS`` config dict;
            also the name of the subdirectory to work in.
        datajson: helper object used to generate the reports
            (``generate_catalog_readme`` / ``generate_datasets_summary``).
    """
    logger.info('=== Catálogo %s ===', org.upper())
    os.chdir(org)
    try:
        config = ORGANISMS[org]

        logger.info('- Lectura de catálogo')
        # For XLSX catalogs, creates the corresponding JSON first.
        file_ext = config["formato"]
        if file_ext == 'xlsx':
            logger.info('- Transformación de XLSX a JSON')
            catalog = DataJson(read_xlsx_catalog(config['url']))

        elif file_ext == 'json':
            # NOTE(review): verify=False disables TLS certificate checks;
            # confirm this is intentional for these endpoints.
            catalog = DataJson(
                requests.get(config['url'], verify=False).json())

        elif file_ext == 'ckan':
            catalog = DataJson(read_ckan_catalog(config['url']))

        else:
            # Bug fix: the message was passed logger-style ('%s', arg) to
            # ValueError, which never interpolates; format it explicitly.
            raise ValueError(
                '{} no es una extension valida para un catalogo.'.format(
                    file_ext))

        # Side effect: these calls add dataset_identifier and
        # distribution_identifier fields to the catalog in place.
        catalog.get_datasets()
        catalog.get_distributions()
        catalog.get_time_series()

        logger.info('- Escritura de catálogo')
        if catalog and len(catalog) > 0:
            catalog.to_json('data.json')
        else:
            raise Exception("El catálogo {} no se pudo generar".format(org))

        # Creates README and auxiliary reports.
        logger.info('- Generación de reportes')
        datajson.generate_catalog_readme(catalog, export_path='README.md')
        datajson.generate_datasets_summary(catalog, export_path='datasets.csv')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; errors remain logged-and-ignored.
        logger.error('Error al procesar el catálogo de %s', org, exc_info=True)
    finally:
        os.chdir('..')  # Returns to parent dir.
示例#7
0
def get_time_series_distributions(catalog):
    """Return the distributions that contain a time-series index field.

    Args:
        catalog (str or dict): a DataJson, or a path/URL to a data.json.

    Returns:
        list: the distribution dicts (not just their identifiers — the
            previous docstring was wrong about this) that have at least
            one field with ``specialType == "time_index"``.
    """
    dj = DataJson(catalog)

    distributions = dj.get_distributions()

    def has_time_index(distribution):
        # A distribution is a time series iff any of its fields is
        # marked as the time index.
        return any(field.get("specialType") == "time_index"
                   for field in distribution.get("field", []))

    return list(filter(has_time_index, distributions))