def test_dataset_issued_no_inference(self):
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
    self.loader.run(catalog, self.catalog_id)
    issued = Dataset.objects.first().issued
    self.assertEqual(
        issued.date(),
        iso8601.parse_date(catalog.get_datasets()[0]['issued']).date())

def test_validate_all_null_series(self):
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'ts_all_null_series.json'))
    distribution = catalog.get_distributions(only_time_series=True)[0]
    distribution = MockDistribution(distribution)
    # No assertion: the test passes as long as scraping an all-null series
    # raises no exception.
    self.scrapper.run(distribution, catalog)

def test_validate_all_zero_series(self):
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'ts_all_zero_series.json'))
    valid = self.scrapper.run(
        catalog.get_distributions(only_time_series=True)[0], catalog)
    self.assertTrue(valid)

def get_distribution_metadata(resource_id):
    # 'datajson_actions' is imported inside the function to avoid a circular
    # dependency with 'config_controller'. (Import restored from this
    # comment; the exact module path may differ in the real project.)
    from datajson_actions import get_data_json_contents
    json_dict = get_data_json_contents()
    html_parser = HTMLParser()
    json_dict = html_parser.unescape(json_dict)
    datajson = DataJson(json_dict)
    dist = datajson.get_distribution(resource_id)
    return dist

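# Illustrative aside: the unescape step above turns HTML entities cached
# inside the data.json back into plain characters before pydatajson parses
# the metadata. html.unescape is the Python 3 counterpart of
# HTMLParser().unescape; the sample string below is hypothetical.
import html

assert html.unescape('T&iacute;tulo') == 'Título'
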
def test_scrapper(self):
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
    distribution = catalog.get_distributions(only_time_series=True)[0]
    distribution = MockDistribution(distribution)
    result = self.scrapper.run(distribution, catalog)
    self.assertTrue(result)

def test_missing_dataframe_column(self):
    """If a column indicated by the metadata is missing from the data,
    the distribution is not scraped.
    """
    catalog = DataJson(
        os.path.join(SAMPLES_DIR, 'distribution_missing_column.json'))
    self.scrapper.run(
        catalog.get_distributions(only_time_series=True)[0], catalog)

def test_missing_metadata_field(self):
    """A field missing from the metadata does not matter: the distribution
    is scraped anyway, to pick up every possible series.
    """
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'missing_field.json'))
    result = self.scrapper.run(
        catalog.get_distributions(only_time_series=True)[0], catalog)
    self.assertTrue(result)

def test_validate(self):
    catalog = os.path.join(SAMPLES_DIR, "data.json")
    catalog = DataJson(catalog)
    distrib_meta = catalog.get_distribution(identifier="125.1")
    df = pd.read_csv(distrib_meta["downloadURL"],
                     parse_dates=["indice_tiempo"]).set_index("indice_tiempo")
    dataset_meta = catalog.get_dataset(
        identifier=distrib_meta["dataset_identifier"])
    validate_distribution(df, catalog, dataset_meta, distrib_meta)

def test_central_node_default(self, mock_indic, mock_load):
    mock_load.return_value = self.catalogs
    mock_indic.return_value = (self.indicators, self.network_indicators)
    task = IndicatorsGenerationTask.objects.create()
    generate_indicators(task)
    mock_indic.assert_any_call(DataJson(), self.catalogs,
                               identifier_search=True)
    mock_indic.assert_any_call(DataJson(), self.catalogs, CENTRAL,
                               identifier_search=True)

def test_repeated_field_id(self):
    catalog = os.path.join(SAMPLES_DIR, "repeated_field_id.json")
    catalog = DataJson(catalog)
    identifier = "125.1"
    distribution = catalog.get_distribution(identifier=identifier)
    dataset = catalog.get_dataset(
        identifier=distribution["dataset_identifier"])
    df = pd.read_csv(distribution["downloadURL"],
                     parse_dates=["indice_tiempo"]).set_index("indice_tiempo")
    validate_distribution(df, catalog, dataset, distribution)

def update_catalog():
    from pydatajson import writers, DataJson
    # Make sure the data.json cache exists before passing its path along
    if not os.path.isfile(CACHE_FILENAME):
        # It does not exist, so generate it
        update_datajson_cache()
    catalog = DataJson(CACHE_FILENAME)
    catalog['themeTaxonomy'] = catalog.get('themeTaxonomy', [])
    new_catalog_filename = '%s/catalog.xlsx' % tempfile.mkdtemp(
        dir=CACHE_DIRECTORY)
    writers.write_xlsx_catalog(catalog, new_catalog_filename)
    os.rename(new_catalog_filename, XLSX_FILENAME)
    # Remove the now-empty temporary directory
    os.rmdir(new_catalog_filename.replace('/catalog.xlsx', ''))

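# A minimal sketch of the XLSX export step in isolation, assuming pydatajson
# is installed; both paths are illustrative.
from pydatajson import DataJson, writers

catalog = DataJson('data.json')  # local path or URL to a catalog
writers.write_xlsx_catalog(catalog, '/tmp/catalog.xlsx')
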
def get_or_init_catalog_themes(self, catalog_id):
    """Returns an ID -> label dict with the catalog's themes."""
    if catalog_id in self.catalog_themes:
        return self.catalog_themes[catalog_id]

    # Not cached yet: parse the stored data.json
    catalog = DataJson(
        json.loads(Node.objects.get(catalog_id=catalog_id).catalog))
    self.catalog_themes[catalog_id] = {}
    for theme in catalog.get_themes():
        self.catalog_themes[catalog_id][theme['id']] = theme['label']
    return self.catalog_themes[catalog_id]

def test_undefined_central_node_uses_default(self, mock_indic, mock_load):
    mock_load.return_value = self.catalogs
    mock_indic.return_value = (self.indicators, self.network_indicators)
    CentralNode.objects.create()
    task = IndicatorsGenerationTask.objects.create()
    generate_indicators(task)
    mock_indic.assert_any_call(DataJson(), self.catalogs,
                               identifier_search=True,
                               broken_links=False,
                               broken_links_threads=1)
    mock_indic.assert_any_call(DataJson(), self.catalogs, CENTRAL,
                               identifier_search=True,
                               broken_links=False,
                               broken_links_threads=1)

def index_distribution(distribution_id, node_id, task_id,
                       read_local=False, index=settings.TS_INDEX,
                       force=False):
    node = Node.objects.get(id=node_id)
    task = ReadDataJsonTask.objects.get(id=task_id)
    catalog = DataJson(json.loads(node.catalog))
    distribution_model = Distribution.objects.get(
        identifier=distribution_id,
        dataset__catalog__identifier=node.catalog_id)
    try:
        Scraper(read_local).run(distribution_model, catalog)

        changed = True
        _hash = distribution_model.enhanced_meta.filter(
            key=meta_keys.LAST_HASH)
        if _hash:
            changed = _hash[0].value != distribution_model.data_hash
        if changed or force:
            DistributionIndexer(index=index).run(distribution_model)

        distribution_model.enhanced_meta.update_or_create(
            key=meta_keys.LAST_HASH,
            defaults={'value': distribution_model.data_hash})
        distribution_model.enhanced_meta.update_or_create(
            key=meta_keys.CHANGED, defaults={'value': str(changed)})
    except Exception as e:
        _handle_exception(distribution_model.dataset, distribution_id, e,
                          node, task)

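# A standalone sketch of the change detection used above, assuming the stored
# hash is a hex digest of the distribution's raw bytes (an assumption: the
# real value comes from distribution_model.data_hash).
import hashlib

def has_changed(new_bytes, last_hash):
    # Re-index only when the content digest differs from the stored one.
    return hashlib.sha256(new_bytes).hexdigest() != last_hash
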
def daily_routine():
    """Routine to be run by cron every morning."""
    logger.info('>>> COMIENZO DE LA RUTINA <<<')

    # Create the DataJson object used to validate the organisms' catalogs
    logger.info('Instanciación DataJson')
    datajson = DataJson()

    logger.info('Creación de carpetas necesarias (de archivo y versionadas).')
    for org in ORGANISMS:
        ensure_dir_exists(org)
        ensure_dir_exists(os.path.join(TODAY_DIR, org))

    logger.info('Procesamiento de cada organismo:')
    os.chdir(TODAY_DIR)
    for org in ORGANISMS:
        process_catalog(org, datajson)
    os.chdir(ROOT_DIR)

    logger.info('Actualizo los archivos bajo control de versiones:')
    files_of_day = glob.glob('{}/*/*'.format(TODAY_DIR))
    for filename in files_of_day:
        logger.debug('- %s', filename)
        update_versioning(filename)

    logger.info('Push de los cambios encontrados.')
    GIT.push('origin', 'master')

    logger.info('>>> FIN DE LA RUTINA <<<')

def test_catalog_issued_infers_as_oldest_dataset(self):
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'two_datasets.json'))
    self.loader.run(catalog, self.catalog_id)
    issued = Catalog.objects.first().issued
    dataset_issued = Dataset.objects.aggregate(
        Min('issued'))['issued__min']
    self.assertEqual(issued.date(), dataset_issued.date())

def index_catalog(node: Node, task, read_local=False, force=False):
    """Runs the read, save and index pipeline over the data and metadata
    of every distribution in the given catalog.
    """
    try:
        catalog = DataJson(node.catalog_url,
                           catalog_format=node.catalog_format)
        node.catalog = json.dumps(catalog)
        node.save()
    except Exception as e:
        IndexDataTask.info(task, READ_ERROR.format(node.catalog_id, e))
        return

    distributions = Distribution.objects.filter(
        present=True,
        dataset__indexable=True,
        dataset__catalog__identifier=node.catalog_id)
    for distribution in distributions:
        api_index_enqueue(index_distribution, distribution.identifier,
                          node.id, task.id, read_local, force=force)

def test_dataset_issued_infers_as_oldest_distribution(self):
    catalog = DataJson(os.path.join(SAMPLES_DIR, 'two_datasets.json'))
    self.loader.run(catalog, self.catalog_id)
    dataset = Dataset.objects.first()
    distribution_issued = Distribution.objects.filter(dataset=dataset) \
        .aggregate(Min('issued'))['issued__min']
    self.assertEqual(dataset.issued.date(), distribution_issued.date())

def __init__(self, node: Node, task: IndexMetadataTask, index: str):
    self.node = node
    self.task = task
    self.index_name = index
    self.elastic: Elasticsearch = connections.get_connection()
    if not self.elastic.indices.exists(self.index_name):
        init_index(self.index_name)

    self.fields_meta = {}
    self.init_fields_meta_cache()
    try:
        data_json = DataJson(node.catalog_url)
        themes = data_json.get('themeTaxonomy', [])
        self.themes = self.get_themes(themes)
    except Exception:
        raise ValueError("Error de lectura de los themes del catálogo")

def get_catalog_errors(self):
    catalog = DataJson(catalog=self.catalog_url,
                       catalog_format=self.catalog_format)
    all_errors = catalog.validate_catalog(only_errors=True)
    error_messages = []
    catalog_validation = all_errors['error']['catalog']
    if catalog_validation['errors']:
        error_messages.append(f"En catálogo {catalog_validation['title']}:"
                              f" {catalog_validation['errors']}")
    for dataset_validation in all_errors['error']['dataset']:
        for error in dataset_validation['errors']:
            error_messages.append(f"En dataset {dataset_validation['title']}:"
                                  f" {error['message']}")
    return error_messages

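# Sketch of the validation report that get_catalog_errors walks (values are
# illustrative; only the keys read above are shown):
# {
#     "error": {
#         "catalog": {"title": "...", "errors": [{"message": "..."}]},
#         "dataset": [{"title": "...", "errors": [{"message": "..."}]}]
#     }
# }
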
def test_defined_central_node_catalog(self, mock_indic, mock_load):
    mock_load.return_value = self.catalogs
    mock_indic.return_value = (self.indicators, self.network_indicators)
    harvesting = HarvestingNode.objects.create(name='aName',
                                               url='harvest_url/',
                                               apikey='apikey',
                                               enabled=True)
    CentralNode.objects.create(node=harvesting)
    task = IndicatorsGenerationTask.objects.create()
    generate_indicators(task)
    mock_indic.assert_any_call(DataJson(), self.catalogs,
                               identifier_search=True)
    mock_indic.assert_any_call(DataJson(), self.catalogs,
                               'harvest_url/data.json',
                               identifier_search=True)

def generate(self):
    node = self.node
    try:
        data_json = DataJson(node.catalog_url)
        data_json.get_fields(only_time_series=True)
        catalog = Catalog.objects.get(identifier=node.catalog_id)
    except Exception as e:
        self.task.info(
            self.task,
            "Error en la lectura del data.json de {}: {}".format(
                node.catalog_id, e))
        return

    self.calculate_catalog_indicators(node, catalog)
    self.calculate_series_indicators(node, data_json, catalog)
    self.calculate_distribution_indicators(node, data_json, catalog)
    self.calculate_dataset_indicators(node, data_json, catalog)

def setUpTestData(cls):
    cls.node = Node(catalog_id=cls.catalog_id,
                    catalog_url=os.path.join(dir_path, 'full_data.json'),
                    catalog_format='json',
                    indexable=True)
    cls.node.catalog = json.dumps(DataJson(cls.node.catalog_url))
    cls.node.save()
    cls.task = IndicatorsGenerationTask.objects.create()
    cls.catalogs = load_catalogs(cls.task, Node.objects.all())

def validate_format(self, url, file, _format):
    path = file.temporary_file_path() if file else url
    try:
        DataJson(path, catalog_format=_format)
    except NonParseableCatalog:
        raise ValidationError("El catálogo ingresado no es válido")
    except Exception as e:
        logging.getLogger(__file__).error(e)
        raise ValidationError("El catálogo ingresado no es válido")

def setUpTestData(cls):
    HarvestingNode.objects.create(
        name='aName', url='harvest_url', apikey='apikey', enabled=True)
    Node.objects.create(catalog_id='id1',
                        catalog_url=cls.get_sample('full_data.json'),
                        indexable=True)
    Node.objects.create(catalog_id='id2',
                        catalog_url=cls.get_sample('minimum_data.json'),
                        indexable=True)
    HarvestingNode.objects.create(
        catalog_id='idx1', name='indexador1',
        url=cls.get_sample('catalogo_justicia.json'),
        apikey='apikey', enabled=True)
    HarvestingNode.objects.create(
        catalog_id='idx2', name='indexador2',
        url=cls.get_sample('full_data.json'),
        apikey='apikey', enabled=True)
    task = IndicatorsGenerationTask.objects.create()
    cls.catalogs = load_catalogs(task, Node.objects.all())
    # Load these by path, not as URLs, by keeping harvesting=False
    # (the default)
    cls.indexing_catalogs = load_catalogs(task, HarvestingNode.objects.all())
    central = DataJson(cls.get_sample('full_data.json'))
    cls.indicators, cls.network_indicators = \
        DataJson().generate_catalogs_indicators(cls.catalogs,
                                                central_catalog=central,
                                                identifier_search=True,
                                                broken_links=True)
    cls.indexing_indicators, _ = \
        DataJson().generate_catalogs_indicators(cls.indexing_catalogs,
                                                identifier_search=True,
                                                broken_links=True)
    config = TasksConfig.get_solo()
    config.indicators_url_check = True
    config.save()
    cls.dj = DataJson()
    with patch('monitoreo.apps.dashboard.indicators_tasks.CENTRAL',
               cls.get_sample('full_data.json')):
        call_command('indicadores')

def load_catalogs(task, nodes, harvesting=False):
    catalogs = []
    for node in nodes:
        try:
            if harvesting:
                url = urljoin(node.url, 'data.json')
                catalog = DataJson(url)
            else:
                catalog = DataJson(node.catalog_url,
                                   catalog_format=node.catalog_format,
                                   verify_ssl=node.verify_ssl)
        except Exception as e:
            msg = f'Error accediendo al catálogo {node.catalog_id}: {str(e)}'
            IndicatorsGenerationTask.info(task, msg)
            continue
        catalog['identifier'] = node.catalog_id
        catalogs.append(catalog)
    return catalogs

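# Why the trailing slash matters in the harvesting branch above: urljoin only
# appends to a base URL that ends in '/'; otherwise it replaces the last path
# segment (the URLs below are illustrative).
from urllib.parse import urljoin

assert urljoin('http://node.example/portal/', 'data.json') == \
    'http://node.example/portal/data.json'
assert urljoin('http://node.example/portal', 'data.json') == \
    'http://node.example/data.json'
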
def validate(self):
    error_messages = []
    file_field = self.json_file if self.json_file else self.xlsx_file
    file_path = os.path.join(settings.MEDIA_ROOT, file_field.name)
    try:
        data_json = DataJson(file_path)
    except KeyError:
        return ["No se puede validar el catálogo ingresado"]

    if not data_json.is_valid_catalog():
        error_report = data_json.validate_catalog()
        errors = error_report['error']['catalog']['errors']
        for dataset in error_report['error']['dataset']:
            errors += dataset['errors']
        error_messages = [error['message'] for error in errors]
    return error_messages

def get_time_series_distributions(catalog):
    """Returns the distributions that contain a time series field.

    Args:
        catalog (str or dict): DataJson, or a string with the path or URL
            to a data.json

    Returns:
        list: the distributions that have a time index field
    """
    dj = DataJson(catalog)
    distributions = dj.get_distributions()

    def has_time_index(distribution):
        for field in distribution.get("field", []):
            if field.get("specialType") == "time_index":
                return True
        return False

    return list(filter(has_time_index, distributions))

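# Usage sketch (hypothetical path): list the identifiers of the time series
# distributions found in a catalog.
for distribution in get_time_series_distributions('data.json'):
    print(distribution['identifier'])
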
def setUpClass(cls):
    super(AttachmentTests, cls).setUpClass()
    ReadDataJsonTask.objects.all().delete()
    Node.objects.all().delete()
    Catalog.objects.all().delete()
    cls.node = Node(catalog_id=cls.catalog_id,
                    catalog_url=cls.catalog,
                    indexable=True,
                    catalog=json.dumps(DataJson(cls.catalog)))
    cls.node.save()
    call_command('read_datajson', whitelist=True, read_local=True)

def index(self, node, task):
    self._reset_catalog_if_exists(node)
    try:
        catalog = DataJson(node.catalog_url,
                           catalog_format=node.catalog_format,
                           verify_ssl=self.indexing_config.verify_ssl)
        catalog.generate_distribution_ids()
        node.catalog = json.dumps(catalog)
        node.save()
    except NonParseableCatalog as e:
        self._set_catalog_as_errored(node)
        ReadDataJsonTask.info(task, READ_ERROR.format(node.catalog_id, e))
        return

    self.reset_fields(node)
    self._index_catalog(catalog, node, task)
    file_generator = CatalogFileGenerator(node)
    file_generator.generate_files()