Example #1
def _handle_exception(dataset_model, distribution_id, exc, node, task):
    msg = u"Excepción en distrbución {} del catálogo {}: {}"
    if exc:
        e_msg = exc
    else:
        e_msg = format_exc()
    msg = msg.format(distribution_id, node.catalog_id, e_msg)
    ReadDataJsonTask.info(task, msg)
    logger.info(msg)

    with transaction.atomic():
        try:
            distribution = Distribution.objects.get(
                identifier=distribution_id,
                dataset__catalog__identifier=node.catalog_id)
            distribution.error_msg = msg
            distribution.error = True
            distribution.field_set.update(error=True)
            distribution.save()
        except Distribution.DoesNotExist:
            pass

    # We don't use a counter managed by the indicator_loader, to ensure that
    # datasets are counted only once (they can fail once per distribution)
    dataset_model.error = True
    dataset_model.save()

    dataset_model.catalog.error = True
    dataset_model.catalog.save()

    if settings.RQ_QUEUES['indexing'].get('ASYNC', True):
        raise exc  # Django-rq / sentry logging
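
The helper above only makes sense when it runs inside an except handler: that is what lets format_exc() recover the active traceback and what makes the trailing raise exc re-raise into django-rq/sentry. A minimal sketch of such a call site; the wrapper name and processing step are illustrative assumptions, not the project's actual code:

# Hypothetical call site: safe_index_distribution and process_distribution
# are assumed names; only _handle_exception comes from the excerpt above.
def safe_index_distribution(dataset_model, distribution_id, node, task):
    try:
        process_distribution(distribution_id, node)  # hypothetical step
    except Exception as exc:
        # Called inside the except block, so format_exc() in the helper
        # still sees the active traceback even if exc were not forwarded.
        _handle_exception(dataset_model, distribution_id, exc, node, task)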
Example #2
class ScrapperTests(TestCase):
    def setUp(self):
        self.task = ReadDataJsonTask()
        self.task.save()
        self.scrapper = Scraper(read_local=True)

    def test_scrapper(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]

        distribution = MockDistribution(distribution)
        result = self.scrapper.run(distribution, catalog)

        self.assertTrue(result)

    def test_missing_metadata_field(self):
        """No importa que un field no esté en metadatos, se scrapea
        igual, para obtener todas las series posibles
        """

        catalog = DataJson(os.path.join(SAMPLES_DIR, 'missing_field.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]

        distribution = MockDistribution(distribution)
        result = self.scrapper.run(distribution, catalog)
        self.assertTrue(result)

    @raises(Exception)
    def test_missing_dataframe_column(self):
        """Si falta una columna indicada por los metadatos, no se
        scrapea la distribución
        """

        catalog = DataJson(
            os.path.join(SAMPLES_DIR, 'distribution_missing_column.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]

        distribution = MockDistribution(distribution)
        self.scrapper.run(distribution, catalog)

    def test_validate_all_zero_series(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR,
                                        'ts_all_zero_series.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]

        distribution = MockDistribution(distribution)
        result = self.scrapper.run(distribution, catalog)
        self.assertTrue(result)

    @raises(FieldFewValuesError)
    def test_validate_all_null_series(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR,
                                        'ts_all_null_series.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]

        distribution = MockDistribution(distribution)
        self.scrapper.run(distribution, catalog)
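
These tests hand the scraper a MockDistribution built from the raw catalog metadata. The wrapper itself is not part of this excerpt; a minimal sketch of what such a test double could look like, assuming the scraper only needs dict-style access to the distribution's metadata:

# Hypothetical test double; the real MockDistribution may mirror more of
# the Distribution model's interface than shown here.
class MockDistribution:
    def __init__(self, metadata):
        self.metadata = metadata  # distribution entry parsed from data.json

    def __getitem__(self, key):
        # dict-style access so the scraper can read metadata fields directly
        return self.metadata[key]

    def get(self, key, default=None):
        return self.metadata.get(key, default)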
Example #3
def schedule_api_indexing(force=False):
    if ReadDataJsonTask.objects.filter(
            status=ReadDataJsonTask.RUNNING).exists():
        logger.info(u'An indexing task is already running')
        return

    task = ReadDataJsonTask()
    task.save()

    read_datajson(task, force=force)

    # If the command runs synchronously (local/testing), generate the report
    if not settings.RQ_QUEUES['indexing'].get('ASYNC', True):
        task = ReadDataJsonTask.objects.get(id=task.id)
        ReportGenerator(task).generate()
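
As a usage sketch, schedule_api_indexing could be exposed through a Django management command so the same entry point serves cron jobs and manual runs alike. The command below is an assumption about wiring, not code from the project:

# Hypothetical management command wrapping schedule_api_indexing; place it
# under <app>/management/commands/ to invoke it through manage.py.
# schedule_api_indexing would be imported from its defining module.
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = 'Schedule a data.json read/indexing run'

    def add_arguments(self, parser):
        parser.add_argument('--force', action='store_true',
                            help='reindex even if the data is unchanged')

    def handle(self, *args, **options):
        schedule_api_indexing(force=options['force'])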
Example #4
    def setUp(self):
        self.task = ReadDataJsonTask.objects.create()
        self.mgmt_task = ManagementTask()
        self.mgmt_task.save()
        self.node = Node(catalog_id=self.catalog_id,
                         catalog_url=self.catalog,
                         indexable=True)
        self.node.save()
Example #5
def index_catalog(node, task, read_local=False, force=False):
    """Ejecuta el pipeline de lectura, guardado e indexado de datos
    y metadatos sobre cada distribución del catálogo especificado
    """

    try:
        catalog = DataJson(node.catalog_url)
        node.catalog = json.dumps(catalog)
        node.save()
    except Exception as e:
        ReadDataJsonTask.info(task, READ_ERROR.format(node.catalog_id, e))
        return

    distributions = Distribution.objects.filter(
        present=True,
        dataset__indexable=True,
        dataset__catalog__identifier=node.catalog_id)
    for distribution in distributions:
        index_distribution.delay(distribution.identifier,
                                 node.id,
                                 task.id,
                                 read_local,
                                 force=force)
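
The index_distribution.delay(...) call implies an RQ job whose positional parameters mirror the arguments at the call site. The skeleton below is inferred from that call site alone; the queue name and body are assumptions:

# Hypothetical skeleton for the index_distribution job; only the parameter
# list is inferred from the call site above, the rest is illustrative.
from django_rq import job


@job('indexing')
def index_distribution(distribution_id, node_id, task_id,
                       read_local=False, force=False):
    node = Node.objects.get(id=node_id)
    task = ReadDataJsonTask.objects.get(id=task_id)
    distribution = Distribution.objects.get(
        identifier=distribution_id,
        dataset__catalog__identifier=node.catalog_id)
    # ...read, validate and index the distribution's data here...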
Example #6
class ReaderTests(TestCase):
    catalog = os.path.join(SAMPLES_DIR, 'full_ts_data.json')
    catalog_id = 'catalog_id'

    def setUp(self):
        self.task = ReadDataJsonTask.objects.create()
        self.mgmt_task = ManagementTask()
        self.mgmt_task.save()
        self.node = Node(catalog_id=self.catalog_id,
                         catalog_url=self.catalog,
                         indexable=True)
        self.node.save()

    def test_index_same_series_different_catalogs(self, *_):
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        count = Field.objects.filter(
            identifier='212.1_PSCIOS_ERN_0_0_25').count()

        self.assertEqual(count, 1)

    def test_dont_index_same_distribution_twice(self, *_):
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        distribution = Distribution.objects.get(identifier='212.1')

        # The distribution is marked as not indexable until its data changes
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'False')

    def test_first_time_distribution_indexable(self, *_):
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(
            self.node,
            self.mgmt_task,
            read_local=True,
        )

        distribution = Distribution.objects.get(identifier='212.1')

        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_index_same_distribution_if_data_changed(self, *_):
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(
            self.node,
            self.mgmt_task,
            read_local=True,
        )
        new_catalog = os.path.join(SAMPLES_DIR, 'full_ts_data_changed.json')
        self.node.catalog_url = new_catalog
        self.node.save()
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        distribution = Distribution.objects.get(identifier='212.1')

        # The distribution was indexed again, so it is marked as indexable
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_error_distribution_logs(self, *_):
        catalog = os.path.join(SAMPLES_DIR,
                               'distribution_missing_downloadurl.json')
        self.node.catalog_url = catalog
        self.node.save()
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        self.assertGreater(
            len(ReadDataJsonTask.objects.get(id=self.task.id).logs), 10)
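
The meta_keys.CHANGED value asserted in these tests implies a change-detection step during indexing. One plausible implementation, purely as a sketch (the hashing scheme and the LAST_HASH key are assumptions, not the project's code), is to hash each distribution's downloaded data and compare it with the previous run:

# Hypothetical change check; the hashing and comparison are a sketch of how
# CHANGED could be computed, and meta_keys.LAST_HASH is an assumed key.
import hashlib


def data_changed(distribution, raw_bytes):
    new_hash = hashlib.sha256(raw_bytes).hexdigest()
    previous = distribution.enhanced_meta.filter(
        key=meta_keys.LAST_HASH).first()
    return previous is None or previous.value != new_hash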
Example #7
    def setUp(self):
        self.task = ReadDataJsonTask()
        self.task.save()
        self.scrapper = Scraper(read_local=True)