def _handle_exception(dataset_model, distribution_id, exc, node, task):
    msg = u"Excepción en distribución {} del catálogo {}: {}"
    if exc:
        e_msg = exc
    else:
        e_msg = format_exc()

    msg = msg.format(distribution_id, node.catalog_id, e_msg)
    ReadDataJsonTask.info(task, msg)
    logger.info(msg)

    with transaction.atomic():
        try:
            distribution = Distribution.objects.get(
                identifier=distribution_id,
                dataset__catalog__identifier=node.catalog_id)
            distribution.error_msg = msg
            distribution.error = True
            distribution.field_set.update(error=True)
            distribution.save()
        except Distribution.DoesNotExist:
            pass

        # We don't use a counter managed by the indicator_loader, to make sure
        # datasets are counted only once (they can fail once for each of their
        # distributions)
        dataset_model.error = True
        dataset_model.save()
        dataset_model.catalog.error = True
        dataset_model.catalog.save()

    if settings.RQ_QUEUES['indexing'].get('ASYNC', True):
        raise exc  # Django-rq / sentry logging
class ScrapperTests(TestCase):
    def setUp(self):
        self.task = ReadDataJsonTask()
        self.task.save()
        self.scrapper = Scraper(read_local=True)

    def test_scrapper(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]
        distribution = MockDistribution(distribution)
        result = self.scrapper.run(distribution, catalog)

        self.assertTrue(result)

    def test_missing_metadata_field(self):
        """It doesn't matter if a field is missing from the metadata: the
        distribution is scraped anyway, to obtain every possible series
        """
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'missing_field.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]
        distribution = MockDistribution(distribution)
        result = self.scrapper.run(distribution, catalog)

        self.assertTrue(result)

    @raises(Exception)
    def test_missing_dataframe_column(self):
        """If a column declared in the metadata is missing, the distribution
        is not scraped
        """
        catalog = DataJson(
            os.path.join(SAMPLES_DIR, 'distribution_missing_column.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]
        distribution = MockDistribution(distribution)
        self.scrapper.run(distribution, catalog)

    def test_validate_all_zero_series(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'ts_all_zero_series.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]
        distribution = MockDistribution(distribution)
        result = self.scrapper.run(distribution, catalog)

        self.assertTrue(result)

    @raises(FieldFewValuesError)
    def test_validate_all_null_series(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'ts_all_null_series.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]
        distribution = MockDistribution(distribution)
        self.scrapper.run(distribution, catalog)
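# The tests above wrap each pydatajson distribution dict in MockDistribution
# before handing it to Scraper.run. That helper is not shown in this section;
# the following is a minimal, hypothetical sketch of what such a test double
# could look like, assuming the scraper only needs the raw metadata dict plus
# an identifier and a download URL (both the attribute set and the class body
# are assumptions for illustration, not the project's actual helper).
class MockDistribution:
    """Hypothetical test double: wraps a pydatajson distribution dict so it
    can be passed where a Distribution-like object is expected."""

    def __init__(self, metadata):
        # Assumption: the scraper reads the metadata dict and the download URL
        self.metadata = metadata
        self.identifier = metadata.get('identifier')
        self.download_url = metadata.get('downloadURL')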
def schedule_api_indexing(force=False):
    if ReadDataJsonTask.objects.filter(status=ReadDataJsonTask.RUNNING):
        logger.info(u'Ya está corriendo una indexación')
        return

    task = ReadDataJsonTask()
    task.save()

    read_datajson(task, force=force)

    # If the command runs synchronously (local/testing), generate the report
    if not settings.RQ_QUEUES['indexing'].get('ASYNC', True):
        task = ReadDataJsonTask.objects.get(id=task.id)
        ReportGenerator(task).generate()
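# Both schedule_api_indexing and _handle_exception branch on
# settings.RQ_QUEUES['indexing'].get('ASYNC', True). With django-rq, setting
# 'ASYNC': False for a queue makes its jobs run eagerly in-process, which is
# what the synchronous local/testing path above relies on. A minimal settings
# sketch (the Redis host/port/db values are placeholders, not the project's
# actual configuration):
RQ_QUEUES = {
    'indexing': {
        'HOST': 'localhost',  # placeholder
        'PORT': 6379,         # placeholder
        'DB': 0,              # placeholder
        'ASYNC': False,       # run jobs synchronously (local/testing); True in production
    },
}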
def index_catalog(node, task, read_local=False, force=False):
    """Runs the reading, saving and indexing pipeline for data and metadata
    over each distribution of the given catalog
    """
    try:
        catalog = DataJson(node.catalog_url)
        node.catalog = json.dumps(catalog)
        node.save()
    except Exception as e:
        ReadDataJsonTask.info(task, READ_ERROR.format(node.catalog_id, e))
        return

    distributions = Distribution.objects.filter(
        present=True,
        dataset__indexable=True,
        dataset__catalog__identifier=node.catalog_id)

    for distribution in distributions:
        index_distribution.delay(distribution.identifier,
                                 node.id,
                                 task.id,
                                 read_local,
                                 force=force)
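# index_catalog enqueues index_distribution through .delay(), the method that
# django-rq adds to functions decorated with @job. A minimal sketch of how such
# a job could be declared, assuming django-rq; the queue name 'indexing' is
# taken from the settings check above, the timeout value is illustrative, and
# the body is omitted (this is not the repository's actual implementation):
from django_rq import job


@job('indexing', timeout=1000)  # timeout value is illustrative
def index_distribution(distribution_id, node_id, task_id,
                       read_local=False, force=False):
    ...  # read, validate and index the distribution's data (body omitted)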
class ReaderTests(TestCase):
    catalog = os.path.join(SAMPLES_DIR, 'full_ts_data.json')
    catalog_id = 'catalog_id'

    def setUp(self):
        self.task = ReadDataJsonTask.objects.create()
        self.task.save()
        self.mgmt_task = ManagementTask()
        self.mgmt_task.save()
        self.node = Node(catalog_id=self.catalog_id,
                         catalog_url=self.catalog,
                         indexable=True)
        self.node.save()

    def test_index_same_series_different_catalogs(self, *_):
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        count = Field.objects.filter(
            identifier='212.1_PSCIOS_ERN_0_0_25').count()

        self.assertEqual(count, 1)

    def test_dont_index_same_distribution_twice(self, *_):
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        distribution = Distribution.objects.get(identifier='212.1')

        # The distribution is marked as non-indexable until its data changes
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'False')

    def test_first_time_distribution_indexable(self, *_):
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        distribution = Distribution.objects.get(identifier='212.1')

        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_index_same_distribution_if_data_changed(self, *_):
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        new_catalog = os.path.join(SAMPLES_DIR, 'full_ts_data_changed.json')
        self.node.catalog_url = new_catalog
        self.node.save()

        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        distribution = Distribution.objects.get(identifier='212.1')

        # The distribution was indexed again; it is marked as indexable
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_error_distribution_logs(self, *_):
        catalog = os.path.join(SAMPLES_DIR,
                               'distribution_missing_downloadurl.json')
        self.node.catalog_url = catalog
        self.node.save()

        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        self.assertGreater(
            len(ReadDataJsonTask.objects.get(id=self.task.id).logs), 10)