def test_validate_all_zero_series(self):
    """A series consisting entirely of zeros is considered valid."""
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'ts_all_zero_series.json'))
    first_distribution = catalog.get_distributions(only_time_series=True)[0]
    self.assertTrue(self.scrapper.run(first_distribution, catalog))
def test_validate_all_null_series(self):
    """Scraping a distribution whose series is all nulls must not raise."""
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'ts_all_null_series.json'))
    ts_distributions = catalog.get_distributions(only_time_series=True)
    self.scrapper.run(MockDistribution(ts_distributions[0]), catalog)
def test_scrapper(self):
    """The scrapper accepts a well-formed time-series distribution."""
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
    ts_distributions = catalog.get_distributions(only_time_series=True)
    wrapped = MockDistribution(ts_distributions[0])
    self.assertTrue(self.scrapper.run(wrapped, catalog))
def test_missing_dataframe_column(self):
    """If a column declared in the metadata is missing, the distribution
    is not scraped.
    """
    sample_path = os.path.join(SAMPLES_DIR, 'distribution_missing_column.json')
    catalog = DataJson(sample_path)
    first_distribution = catalog.get_distributions(only_time_series=True)[0]
    self.scrapper.run(first_distribution, catalog)
def test_missing_metadata_field(self):
    """A field absent from the metadata does not block scraping: the
    distribution is scraped anyway so every possible series is obtained.
    """
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'missing_field.json'))
    first_distribution = catalog.get_distributions(only_time_series=True)[0]
    self.assertTrue(self.scrapper.run(first_distribution, catalog))
def process_catalog(org, datajson):
    """Download and process the catalog for the given organization.

    Reads the catalog in the format configured for ``org`` in ORGANISMS
    (``xlsx``, ``json`` or ``ckan``), writes it as ``data.json`` inside the
    organization's subdirectory, and generates the README and datasets
    summary reports there.

    Args:
        org (str): organization key; must exist in ORGANISMS and match a
            subdirectory of the current working directory.
        datajson: object providing ``generate_catalog_readme`` and
            ``generate_datasets_summary`` (pydatajson-style).

    Errors are logged and swallowed so a failing catalog does not stop the
    processing of the remaining organizations; the working directory is
    always restored.
    """
    logger.info('=== Catálogo %s ===', org.upper())
    os.chdir(org)
    try:
        config = ORGANISMS[org]

        logger.info('- Lectura de catálogo')
        # For XLSX catalogs, creates corresponding JSON
        file_ext = config["formato"]
        if file_ext == 'xlsx':
            logger.info('- Transformación de XLSX a JSON')
            catalog = DataJson(read_xlsx_catalog(config['url']))
        elif file_ext == 'json':
            # NOTE(review): verify=False disables TLS certificate checks;
            # confirm this is intentional for these sources.
            catalog = DataJson(
                requests.get(config['url'], verify=False).json())
        elif file_ext == 'ckan':
            catalog = DataJson(read_ckan_catalog(config['url']))
        else:
            # BUG FIX: ValueError does not apply %-style formatting to extra
            # args (that is a logging-only feature), so the extension was
            # never interpolated. Format the message explicitly.
            raise ValueError(
                '%s no es una extension valida para un catalogo.' % file_ext)

        # agrega dataset_identifier y distribution_identifier
        catalog.get_datasets()
        catalog.get_distributions()
        catalog.get_time_series()

        logger.info('- Escritura de catálogo')
        if catalog and len(catalog) > 0:
            catalog.to_json('data.json')
        else:
            raise Exception("El catálogo {} no se pudo generar".format(org))

        # Creates README and auxiliary reports
        logger.info('- Generación de reportes')
        datajson.generate_catalog_readme(catalog, export_path='README.md')
        datajson.generate_datasets_summary(catalog,
                                           export_path='datasets.csv')
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Catch Exception so those still propagate; the
        # error is logged (with traceback) and deliberately not re-raised so
        # other catalogs keep processing.
        logger.error('Error al procesar el catálogo de %s', org,
                     exc_info=True)
    finally:
        os.chdir('..')  # Returns to parent dir.
def get_time_series_distributions(catalog):
    """List the catalog's distributions that include a time-index field.

    Args:
        catalog (str or dict): a DataJson, or a path/URL to a data.json.

    Returns:
        list: the distributions that have at least one field whose
            "specialType" is "time_index".
    """
    distributions = DataJson(catalog).get_distributions()
    return [
        distribution for distribution in distributions
        if any(field.get("specialType") == "time_index"
               for field in distribution.get("field", []))
    ]