Exemplo n.º 1
0
def index_catalog(node: Node, task, read_local=False, force=False):
    """Run the read / persist / index pipeline over every distribution
    of the catalog attached to *node*.

    On any read or persistence failure the error is logged on the task
    and the function returns without enqueuing anything.
    """
    try:
        parsed = DataJson(node.catalog_url,
                          catalog_format=node.catalog_format)
        node.catalog = json.dumps(parsed)
        node.save()
    except Exception as error:
        # Reading or persisting the catalog failed: record it and bail out.
        IndexDataTask.info(task, READ_ERROR.format(node.catalog_id, error))
        return

    # Enqueue one indexing job per present, indexable distribution.
    indexable = Distribution.objects.filter(
        dataset__catalog__identifier=node.catalog_id,
        dataset__indexable=True,
        present=True)
    for dist in indexable:
        api_index_enqueue(index_distribution,
                          dist.identifier,
                          node.id,
                          task.id,
                          read_local,
                          force=force)
Exemplo n.º 2
0
 def setUp(self):
     """Create the read task, the management task and the indexable node
     shared by the tests in this case."""
     self.task = ReadDataJsonTask.objects.create()
     # NOTE(review): objects.create() already persists the task, so this
     # extra save() only issues a redundant UPDATE.
     self.task.save()
     self.mgmt_task = ManagementTask()
     self.mgmt_task.save()
     self.node = Node(catalog_id=self.catalog_id, catalog_url=self.catalog, indexable=True)
     self.node.save()
Exemplo n.º 3
0
 def test_node_with_automatic_indexation_enabled_indexes_new_datasets(self):
     """Datasets of a node flagged new_datasets_auto_indexable become indexable."""
     node = Node(catalog_id='text_id',
                 catalog_url=self.catalog,
                 indexable=True,
                 new_datasets_auto_indexable=True)
     node.save()

     index_catalog(node, self.task)

     first_dataset = Catalog.objects.get(
         identifier=node.catalog_id).dataset_set.first()
     self.assertTrue(first_dataset.indexable)
Exemplo n.º 4
0
    def test_read_datajson_several_nodes_call_index_catalog_once_per_node(
            self, index_catalog):
        """read_datajson enqueues index_catalog exactly once per node."""
        for cid, url in (('one_catalog', 'http://one_url.com'),
                         ('other_catalog', 'http://other_url.com')):
            Node(catalog_id=cid, catalog_url=url, indexable=True).save()

        read_datajson(ReadDataJsonTask.objects.create())

        self.assertEqual(index_catalog.delay.call_count, 2)
Exemplo n.º 5
0
 def test_dataset_list_returns_empty_if_no_related_datasets(self):
     """A node with no datasets in the database yields an empty 'valid' set."""
     sample = self.get_sample('full_data.json')
     fresh_node = Node(catalog_id='id4',
                       catalog_url=sample,
                       indexable=True)

     valid, _, _ = sort_datasets_by_condition(fresh_node, DataJson(sample))

     self.assertSetEqual(set(), valid)
Exemplo n.º 6
0
 def setUpTestData(cls):
     """Build an indexable node from the local full_data.json sample,
     store its serialized catalog, and load the catalogs for the
     indicators generation task."""
     cls.node = Node(catalog_id=cls.catalog_id,
                     catalog_url=os.path.join(dir_path, 'full_data.json'),
                     catalog_format='json',
                     indexable=True)
     # Persist the parsed catalog as JSON so later steps can read it
     # without re-fetching the file.
     cls.node.catalog = json.dumps(DataJson(cls.node.catalog_url))
     cls.node.save()
     cls.task = IndicatorsGenerationTask.objects.create()
     cls.catalogs = load_catalogs(cls.task, Node.objects.all())
Exemplo n.º 7
0
def migrate_nodes(apps, schema_editor):
    """Data migration: re-create every existing Node row.

    NOTE(review): source and destination use the same historical model,
    so this duplicates rows in place; presumably the intent was to copy
    nodes across apps/tables — confirm against the migration history.
    """
    # Historical model resolved through the migration state registry.
    Node = apps.get_model('management', 'Node')
    alias = schema_editor.connection.alias

    for old in Node.objects.using(alias).all():
        Node(catalog_id=old.catalog_id,
             catalog_url=old.catalog_url,
             indexable=old.indexable).save()
Exemplo n.º 8
0
def index_catalog(catalog_id, catalog_path, index, node=None):
    """Index a catalog end-to-end. Test helper.

    Builds a default indexable node when none is given, reads the data
    json, indexes every distribution of the catalog into *index*, and
    force-merges the index so documents are immediately searchable.
    """
    if not node:
        node = Node(catalog_id=catalog_id,
                    catalog_url=catalog_path,
                    indexable=True)

    node.catalog = json.dumps(DataJson(node.catalog_url))
    node.save()

    task = ReadDataJsonTask()
    task.save()
    read_datajson(task, read_local=True, whitelist=True)

    catalog_distributions = Distribution.objects.filter(
        dataset__catalog__identifier=catalog_id)
    for dist in catalog_distributions:
        DistributionIndexer(index=index).run(dist)

    # Merge segments so the freshly indexed docs are visible right away.
    ElasticInstance.get().indices.forcemerge(index=index)
Exemplo n.º 9
0
    def test_missing_field_update(self):
        """When a distribution is re-indexed and a previously indexed
        field is missing, the field's old data must not be deleted.
        """
        missing_field = '212.1_PSCIOS_ERS_0_0_22'

        node = Node(catalog_id=CATALOG_ID,
                    catalog_url=os.path.join(SAMPLES_DIR, 'full_ts_data.json'),
                    indexable=True)
        self._index_catalog('full_ts_data.json', node)
        with transaction.atomic():
            # NOTE(review): this reassigns the path the node already has;
            # _index_catalog below receives 'missing_field.json', so the
            # helper presumably swaps the catalog file itself — confirm.
            node.catalog_url = os.path.join(SAMPLES_DIR, 'full_ts_data.json')
            # Second run: 'update' of the catalog, now missing the field
            self._index_catalog('missing_field.json', node)

            results = Search(index=self.test_index) \
                .filter('match', series_id=missing_field).execute()

        self.assertTrue(len(results))
Exemplo n.º 10
0
    def setUpTestData(cls):
        """Index two sample catalogs in METADATA_ONLY mode for the case."""
        cls.task = ReadDataJsonTask.objects.create(
            indexing_mode=ReadDataJsonTask.METADATA_ONLY)
        cls.node = Node(catalog_id=cls.catalog_id, catalog_url=cls.catalog,
                        indexable=True)
        cls.node.save()
        index_catalog(cls.node, cls.task, read_local=True, whitelist=True)

        # Second catalog, created through the helper instead of by hand.
        cls.node_two = create_node(cls.catalog_two, cls.catalog_two_id)
        index_catalog(cls.node_two, cls.task, read_local=True, whitelist=True)
Exemplo n.º 11
0
 def test_read_datajson_command(self):
     """The management command behaves like calling read_datajson directly."""
     catalog_id = 'test_id'
     Node(catalog_id=catalog_id,
          catalog_url=os.path.join(dir_path, 'sample_data.json'),
          indexable=True).save()

     call_command('read_datajson', whitelist=True)

     # Fields exist for the catalog, i.e. the data json was actually read.
     self.assertTrue(
         Field.objects.filter(
             distribution__dataset__catalog__identifier=catalog_id))
Exemplo n.º 12
0
    def test_read_datajson_while_indexing(self):
        """No second task is created while another one is still RUNNING."""
        Node(catalog_id='test_id',
             catalog_url=os.path.join(dir_path, 'sample_data.json'),
             indexable=True).save()
        ReadDataJsonTask(status=ReadDataJsonTask.RUNNING).save()

        call_command('read_datajson')

        # Expected: the RUNNING task is still the only one in the table.
        self.assertEqual(ReadDataJsonTask.objects.all().count(), 1)
Exemplo n.º 13
0
 def test_read(self):
     """read_datajson creates Field rows for the node's catalog."""
     catalog_id = 'test_id'
     Node(catalog_id=catalog_id,
          catalog_url=os.path.join(dir_path, 'sample_data.json'),
          indexable=True).save()

     task = ReadDataJsonTask()
     task.save()
     read_datajson(task, whitelist=True)

     self.assertTrue(
         Field.objects.filter(
             distribution__dataset__catalog__identifier=catalog_id))
Exemplo n.º 14
0
    def test_read_datajson_one_node_only_calls_task_for_that_node(
            self, index_catalog):
        """A task bound to a single node enqueues indexing only for it."""
        Node(catalog_id='one_catalog',
             catalog_url='http://one_url.com',
             indexable=True).save()
        target_node = Node.objects.create(catalog_id='other_catalog',
                                          catalog_url='http://other_url.com',
                                          indexable=True)

        read_datajson(ReadDataJsonTask.objects.create(node=target_node))

        self.assertEqual(index_catalog.delay.call_count, 1)
Exemplo n.º 15
0
    def setUpClass(cls):
        """Reset tasks/nodes/catalogs and index the sample catalog once
        for the whole test case."""
        super(AttachmentTests, cls).setUpClass()
        ReadDataJsonTask.objects.all().delete()
        Node.objects.all().delete()
        Catalog.objects.all().delete()
        # The serialized catalog is stored on the node up front so the
        # command can read it locally instead of fetching the URL.
        cls.node = Node(catalog_id=cls.catalog_id,
                        catalog_url=cls.catalog,
                        indexable=True,
                        catalog=json.dumps(DataJson(cls.catalog)))

        cls.node.save()
        call_command('read_datajson', whitelist=True, read_local=True)
    def test_get_node(self):
        """DistributionRepository.get_node looks the node up by the
        distribution's catalog identifier."""
        distribution = Mock()
        distribution.dataset.catalog.identifier = 'test_node'

        node = Node(catalog_id='test_node')
        with patch(
                'series_tiempo_ar_api.libs.datajsonar_repositories.distribution_repository.Node'
        ) as fake_node:
            fake_node.objects.get.return_value = node
            self.assertEqual(
                DistributionRepository(distribution).get_node(), node)
            # BUG FIX: the original used `called_with(...)`, which is not a
            # Mock assertion (it returns a new, truthy Mock, so assertTrue
            # always passed) and ran it *before* get_node() was called.
            fake_node.objects.get.assert_called_with(catalog_id='test_node')
Exemplo n.º 17
0
def process_node_register_file(register_file):
    """Create/update Node objects for every entry of the register file
    marked as federated ('federado'), then mark the file as processed.
    """
    yml = register_file.indexing_file.read()
    # SECURITY NOTE(review): yaml.load without an explicit SafeLoader can
    # execute arbitrary Python from a crafted file; prefer yaml.safe_load
    # if register files can come from untrusted sources.
    nodes = yaml.load(yml)
    # FIX: the original reused the loop variable `node` both as the YAML
    # key and as the Node model instance; separate names for clarity.
    for catalog_id, values in nodes.items():
        # Explicit comparison with True avoids entering the branch for
        # merely truthy values (e.g. the string "no").
        if bool(values['federado']) is True:
            try:
                node = Node.objects.get(catalog_id=catalog_id)
            except Node.DoesNotExist:
                node = Node(catalog_id=catalog_id)

            node.catalog_url = values['url']
            node.indexable = True
            node.save()

    register_file.state = NodeRegisterFile.PROCESSED
    register_file.save()
Exemplo n.º 18
0
class ReaderTests(TestCase):
    """Integration tests for index_catalog over a local sample catalog."""

    catalog = os.path.join(SAMPLES_DIR, 'full_ts_data.json')
    catalog_id = 'catalog_id'

    def setUp(self):
        """One fresh read task and one indexable node per test."""
        self.task = ReadDataJsonTask.objects.create()
        self.node = Node(catalog_id=self.catalog_id,
                         catalog_url=self.catalog,
                         indexable=True)
        self.node.save()

    def test_index_same_series_different_catalogs(self):
        """Re-indexing the same catalog must not duplicate its fields."""
        index_catalog(self.node, self.task, read_local=True, whitelist=True)
        index_catalog(self.node, self.task, read_local=True, whitelist=True)

        count = Field.objects.filter(
            metadata__contains='212.1_PSCIOS_ERN_0_0_25').count()

        self.assertEqual(count, 1)

    def test_error_distribution_logs(self):
        """Indexing a catalog with a broken distribution produces logs."""
        catalog = os.path.join(SAMPLES_DIR,
                               'distribution_missing_downloadurl.json')
        self.node.catalog_url = catalog
        self.node.save()
        index_catalog(self.node, self.task, read_local=True, whitelist=True)

        self.assertGreater(
            len(ReadDataJsonTask.objects.get(id=self.task.id).logs), 10)

    def test_index_only_time_series_if_specified(self):
        """With TIME_SERIES_ONLY set, non-series distributions are skipped."""
        # NOTE(review): the flag is flipped directly on settings and not
        # restored if the assertions fail; consider override_settings.
        settings.DATAJSON_AR_TIME_SERIES_ONLY = True
        mixed_catalog = os.path.join(SAMPLES_DIR,
                                     'mixed_time_series_catalog.json')
        self.node.catalog_url = mixed_catalog
        self.node.save()
        index_catalog(self.node, self.task, read_local=True, whitelist=True)
        settings.DATAJSON_AR_TIME_SERIES_ONLY = False

        self.assertEqual(Distribution.objects.count(), 2)
        # Distribution ID 5.1, which is not a time series, was not created
        self.assertFalse(Distribution.objects.filter(identifier__in=["5.1"]))

    def test_catalog_is_present_on_connection_success(self):
        """A successful read marks the catalog present and error-free."""
        index_catalog(self.node, self.task, read_local=True, whitelist=True)
        catalog_model = Catalog.objects.get(identifier=self.catalog_id)
        self.assertFalse(catalog_model.error)
        self.assertTrue(catalog_model.present)

    def test_catalog_is_not_present_on_connection_failure(self):
        """A failed read marks a known catalog as errored / not present."""
        index_catalog(self.node, self.task, read_local=True, whitelist=True)
        self.node.catalog_url = 'invalid_url'
        index_catalog(self.node, self.task, read_local=False, whitelist=True)
        catalog_model = Catalog.objects.get(identifier=self.catalog_id)
        self.assertTrue(catalog_model.error)
        self.assertFalse(catalog_model.present)

    def test_node_does_not_automatically_index_new_datasets_by_default(self):
        """Without whitelist/auto flags, new datasets stay non-indexable."""
        index_catalog(self.node, self.task)
        dataset = Catalog.objects.get(
            identifier=self.catalog_id).dataset_set.first()
        self.assertFalse(dataset.indexable)

    def test_whitelisted_node_automatically_indexes_new_datasets(self):
        """whitelist=True marks new datasets as indexable."""
        index_catalog(self.node, self.task, whitelist=True)
        dataset = Catalog.objects.get(
            identifier=self.catalog_id).dataset_set.first()
        self.assertTrue(dataset.indexable)

    def test_node_with_automatic_indexation_enabled_indexes_new_datasets(self):
        """new_datasets_auto_indexable=True marks new datasets indexable."""
        auto_index_node = Node(catalog_id='text_id',
                               catalog_url=self.catalog,
                               indexable=True,
                               new_datasets_auto_indexable=True)
        auto_index_node.save()
        index_catalog(auto_index_node, self.task)
        dataset = Catalog.objects.get(
            identifier=auto_index_node.catalog_id).dataset_set.first()
        self.assertTrue(dataset.indexable)
Exemplo n.º 19
0
class ReaderTests(TestCase):
    """Tests covering the read_datajson + index_catalog pipeline and the
    'changed' bookkeeping on distributions."""

    catalog = os.path.join(SAMPLES_DIR, 'full_ts_data.json')
    catalog_id = 'catalog_id'

    def setUp(self):
        """Fresh read task, management task and indexable node per test."""
        self.task = ReadDataJsonTask.objects.create()
        # NOTE(review): objects.create() already saves; this save() is a
        # redundant UPDATE.
        self.task.save()
        self.mgmt_task = ManagementTask()
        self.mgmt_task.save()
        self.node = Node(catalog_id=self.catalog_id,
                         catalog_url=self.catalog,
                         indexable=True)
        self.node.save()

    def test_index_same_series_different_catalogs(self, *_):
        """Running the pipeline twice must not duplicate fields."""
        # *_ absorbs mock patches applied by decorators not visible here.
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        count = Field.objects.filter(
            identifier='212.1_PSCIOS_ERN_0_0_25').count()

        self.assertEqual(count, 1)

    def test_dont_index_same_distribution_twice(self, *_):
        """Unchanged data leaves the distribution flagged as unchanged."""
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        distribution = Distribution.objects.get(identifier='212.1')

        # The distribution is marked as not indexable until its data changes
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'False')

    def test_first_time_distribution_indexable(self, *_):
        """A distribution seen for the first time is marked as changed."""
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(
            self.node,
            self.mgmt_task,
            read_local=True,
        )

        distribution = Distribution.objects.get(identifier='212.1')

        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_index_same_distribution_if_data_changed(self, *_):
        """Changed catalog data re-marks the distribution as changed."""
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(
            self.node,
            self.mgmt_task,
            read_local=True,
        )
        new_catalog = os.path.join(SAMPLES_DIR, 'full_ts_data_changed.json')
        self.node.catalog_url = new_catalog
        self.node.save()
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        distribution = Distribution.objects.get(identifier='212.1')

        # The distribution was indexed again; it is marked as indexable
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_error_distribution_logs(self, *_):
        """A catalog with a broken distribution produces task logs."""
        catalog = os.path.join(SAMPLES_DIR,
                               'distribution_missing_downloadurl.json')
        self.node.catalog_url = catalog
        self.node.save()
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

        self.assertGreater(
            len(ReadDataJsonTask.objects.get(id=self.task.id).logs), 10)
Exemplo n.º 20
0
    def setUpTestData(cls):
        """Create the harvesting node, three source nodes (one with an
        invalid catalog sample), their Catalog rows, and one dataset per
        catalog plus an extra dataset on the first one."""
        # Set harvesting node
        harvesting = HarvestingNode(name='aName',
                                    url='harvest_url',
                                    apikey='apikey',
                                    enabled=True)
        harvesting.save()
        # Set nodes
        node1 = Node(catalog_id='id1',
                     catalog_url=cls.get_sample('full_data.json'),
                     indexable=True)
        node2 = Node(catalog_id='id2',
                     catalog_url=cls.get_sample('minimum_data.json'),
                     indexable=True)
        # Sample with a missing dataset title — deliberately invalid.
        invalid_node = Node(
            catalog_id='id3',
            catalog_url=cls.get_sample('missing_dataset_title.json'),
            indexable=True)
        node1.save()
        node2.save()
        invalid_node.save()
        # Set Catalogs and Datasets
        catalog1 = Catalog(title='catalog_1',
                           identifier='id1',
                           metadata='{}',
                           updated=True)
        catalog2 = Catalog(title='catalog_2',
                           identifier='id2',
                           metadata='{}',
                           updated=True)
        catalog3 = Catalog(title='catalog_3',
                           identifier='id3',
                           metadata='{}',
                           updated=True)
        catalog1.save()
        catalog2.save()
        catalog3.save()
        # Same dataset identifier on every catalog on purpose.
        for cat in [catalog1, catalog2, catalog3]:
            dataset = Dataset(
                identifier='99db6631-d1c9-470b-a73e-c62daa32c777',
                metadata='{}',
                catalog=cat,
                indexable=True,
                present=True,
                updated=True)
            dataset.save()

        # Second dataset, only on the first catalog.
        dataset2 = Dataset(identifier='99db6631-d1c9-470b-a73e-c62daa32c420',
                           metadata='{}',
                           catalog=catalog1,
                           indexable=True,
                           present=True,
                           updated=True)
        dataset2.save()
 def test_get_data_json(self, repository, fake_node):
     """get_data_json builds the catalog repository from the node that
     corresponds to the distribution."""
     distribution = Mock()
     node = Node(catalog_id='test_node')
     fake_node.objects.get.return_value = node
     DistributionRepository(distribution).get_data_json()
     # BUG FIX: `called_with` is not a Mock assertion — it returns a new
     # (truthy) Mock, so the original assertTrue could never fail.
     repository.assert_called_with(node)
Exemplo n.º 22
0
class ReaderTests(IndexingTestCase):
    """Pipeline tests (read_datajson + index_catalog) over local samples,
    covering the distribution 'changed' flag, date formats, catalog
    format forwarding, significant figures and validator options."""

    catalog = os.path.join(SAMPLES_DIR, 'full_ts_data.json')
    catalog_id = 'catalog_id'

    def setUp(self):
        """Fresh read task, management task and indexable node per test."""
        self.task = ReadDataJsonTask.objects.create()
        # NOTE(review): objects.create() already saves; this save() is a
        # redundant UPDATE.
        self.task.save()
        self.mgmt_task = ManagementTask()
        self.mgmt_task.save()
        self.node = Node(catalog_id=self.catalog_id,
                         catalog_url=self.catalog,
                         indexable=True)
        self.node.save()

    def test_index_same_series_different_catalogs(self, *_):
        """Running the pipeline twice must not duplicate fields."""
        # *_ absorbs mock patches applied by decorators not visible here.
        read_datajson(
            self.task,
            whitelist=True,
        )
        index_catalog(
            self.node,
            self.mgmt_task,
        )
        read_datajson(
            self.task,
            whitelist=True,
        )
        index_catalog(
            self.node,
            self.mgmt_task,
        )

        count = Field.objects.filter(
            identifier='212.1_PSCIOS_ERN_0_0_25').count()

        self.assertEqual(count, 1)

    def test_dont_index_same_distribution_twice(self, *_):
        """Unchanged data leaves the distribution flagged as unchanged."""
        read_datajson(
            self.task,
            whitelist=True,
        )
        index_catalog(
            self.node,
            self.mgmt_task,
        )
        read_datajson(
            self.task,
            whitelist=True,
        )
        index_catalog(
            self.node,
            self.mgmt_task,
        )

        distribution = Distribution.objects.get(identifier='212.1')

        # The distribution is marked as not indexable until its data changes
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'False')

    def test_first_time_distribution_indexable(self, *_):
        """A distribution seen for the first time is marked as changed."""
        read_datajson(
            self.task,
            whitelist=True,
        )
        index_catalog(
            self.node,
            self.mgmt_task,
        )

        distribution = Distribution.objects.get(identifier='212.1')

        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_index_same_distribution_if_data_changed(self, *_):
        """Changed catalog data re-marks the distribution as changed."""
        read_datajson(self.task, whitelist=True)
        index_catalog(self.node, self.mgmt_task)
        new_catalog = os.path.join(SAMPLES_DIR, 'full_ts_data_changed.json')
        self.node.catalog_url = new_catalog
        self.node.save()
        read_datajson(
            self.task,
            whitelist=True,
        )
        index_catalog(
            self.node,
            self.mgmt_task,
        )

        distribution = Distribution.objects.get(identifier='212.1')

        # The distribution was indexed again; it is marked as indexable
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_error_distribution_logs(self, *_):
        """A catalog with a broken distribution produces task logs."""
        catalog = os.path.join(SAMPLES_DIR,
                               'distribution_missing_downloadurl.json')
        self.node.catalog_url = catalog
        self.node.save()
        read_datajson(
            self.task,
            whitelist=True,
        )
        index_catalog(
            self.node,
            self.mgmt_task,
        )

        self.assertGreater(
            len(ReadDataJsonTask.objects.get(id=self.task.id).logs), 10)

    def test_index_YYYY_MM_distribution(self, *_):
        """Distributions with YYYY-MM dates index successfully."""
        catalog = os.path.join(SAMPLES_DIR, 'single_data_yyyy_mm.json')
        self.node.catalog_url = catalog
        self.node.save()

        read_datajson(
            self.task,
            whitelist=True,
        )
        index_catalog(
            self.node,
            self.mgmt_task,
        )

        distribution = Distribution.objects.get(identifier='102.1')

        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_index_YYYY_distribution(self, *_):
        """Distributions with YYYY dates index successfully."""
        catalog = os.path.join(SAMPLES_DIR, 'single_data_yyyy.json')
        self.node.catalog_url = catalog
        self.node.save()

        read_datajson(self.task, whitelist=True)
        index_catalog(self.node, self.mgmt_task)

        distribution = Distribution.objects.get(identifier='102.1')

        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    @mock.patch('series_tiempo_ar_api.libs.indexing.catalog_reader.DataJson')
    def test_format_is_passed_to_data_json(self, data_json, *_):
        """The node's catalog_format is forwarded to the DataJson reader."""
        read_datajson(self.task, whitelist=True)
        self.node.catalog_format = 'xlsx'
        index_catalog(self.node, self.mgmt_task)

        self.assertEqual(data_json.call_args[1]['catalog_format'],
                         self.node.catalog_format)

    def test_significant_figures(self, *_):
        """The significant_figures metadata is read from the data.json."""
        Catalog.objects.all().delete()
        catalog = os.path.join(SAMPLES_DIR, 'ipc_data.json')
        self.node.catalog_url = catalog
        self.node.save()

        read_datajson(self.task, whitelist=True)
        index_catalog(self.node, self.mgmt_task)

        field = Field.objects.get(
            identifier='serie_inflacion')  # Taken from the data.json
        self.assertEqual(
            field.enhanced_meta.get(key='significant_figures').value, '4')

    def test_custom_validation_options(self, *_):
        """A validator configured to always fail marks distributions errored."""
        # Every validation will fail (max title length of zero)
        config = DistributionValidatorConfig.get_solo()
        config.max_field_title_len = 0
        config.save()

        read_datajson(self.task, whitelist=True)
        index_catalog(self.node, self.mgmt_task)

        distribution = Distribution.objects.get(identifier='212.1')
        self.assertTrue(distribution.error)