def index_catalog(node: Node, task, read_local=False, force=False):
    """Run the read/save/index pipeline over every distribution of the
    catalog referenced by *node*.

    On any failure while reading or persisting the catalog, the error is
    logged to *task* and the function bails out without enqueuing anything.
    """
    try:
        node.catalog = json.dumps(
            DataJson(node.catalog_url, catalog_format=node.catalog_format))
        node.save()
    except Exception as error:
        IndexDataTask.info(task, READ_ERROR.format(node.catalog_id, error))
        return

    # Only distributions that are present and belong to indexable datasets
    # of this node's catalog get queued for indexing.
    candidates = Distribution.objects.filter(
        present=True,
        dataset__indexable=True,
        dataset__catalog__identifier=node.catalog_id)
    for dist in candidates:
        api_index_enqueue(index_distribution, dist.identifier,
                          node.id, task.id, read_local, force=force)
def setUp(self):
    """Create the shared read task, management task and indexable node."""
    self.task = ReadDataJsonTask.objects.create()
    self.task.save()
    self.mgmt_task = ManagementTask()
    self.mgmt_task.save()
    node = Node(catalog_id=self.catalog_id,
                catalog_url=self.catalog,
                indexable=True)
    node.save()
    self.node = node
def test_node_with_automatic_indexation_enabled_indexes_new_datasets(self):
    """A node flagged new_datasets_auto_indexable marks freshly read
    datasets as indexable."""
    auto_node = Node(catalog_id='text_id',
                     catalog_url=self.catalog,
                     indexable=True,
                     new_datasets_auto_indexable=True)
    auto_node.save()
    index_catalog(auto_node, self.task)
    first_dataset = (Catalog.objects
                     .get(identifier=auto_node.catalog_id)
                     .dataset_set.first())
    self.assertTrue(first_dataset.indexable)
def test_read_datajson_several_nodes_call_index_catalog_once_per_node(
        self, index_catalog):
    """read_datajson enqueues index_catalog exactly once per indexable node."""
    for catalog_id, url in (('one_catalog', 'http://one_url.com'),
                            ('other_catalog', 'http://other_url.com')):
        Node(catalog_id=catalog_id, catalog_url=url, indexable=True).save()
    read_datajson(ReadDataJsonTask.objects.create())
    self.assertEqual(index_catalog.delay.call_count, 2)
def test_dataset_list_returns_empty_if_no_related_datasets(self):
    """A node with no datasets stored yields an empty 'valid' set."""
    sample = self.get_sample('full_data.json')
    fresh_node = Node(catalog_id='id4', catalog_url=sample, indexable=True)
    valid, _, _ = sort_datasets_by_condition(fresh_node, DataJson(sample))
    self.assertSetEqual(set(), valid)
def setUpTestData(cls):
    """Persist one node with its parsed catalog and preload all catalogs."""
    node = Node(catalog_id=cls.catalog_id,
                catalog_url=os.path.join(dir_path, 'full_data.json'),
                catalog_format='json',
                indexable=True)
    node.catalog = json.dumps(DataJson(node.catalog_url))
    node.save()
    cls.node = node
    cls.task = IndicatorsGenerationTask.objects.create()
    cls.catalogs = load_catalogs(cls.task, Node.objects.all())
def migrate_nodes(apps, schema_editor):
    """Re-create every management.Node row via the historical model.

    NOTE(review): the original code rebound the name ``Node`` to the
    historical model, so the rows are constructed on that same model —
    renamed here to make the shadowing explicit; confirm this is intended.
    """
    NodeModel = apps.get_model('management', 'Node')
    alias = schema_editor.connection.alias
    for existing in NodeModel.objects.using(alias).all():
        NodeModel(catalog_id=existing.catalog_id,
                  catalog_url=existing.catalog_url,
                  indexable=existing.indexable).save()
def index_catalog(catalog_id, catalog_path, index, node=None):
    """Fully read and index a catalog into *index*. Test helper."""
    if node is None:
        node = Node(catalog_id=catalog_id,
                    catalog_url=catalog_path,
                    indexable=True)
    node.catalog = json.dumps(DataJson(node.catalog_url))
    node.save()

    read_task = ReadDataJsonTask()
    read_task.save()
    read_datajson(read_task, read_local=True, whitelist=True)

    to_index = Distribution.objects.filter(
        dataset__catalog__identifier=catalog_id)
    for distribution in to_index:
        DistributionIndexer(index=index).run(distribution)

    # Merge segments so the indexed documents are immediately searchable.
    ElasticInstance.get().indices.forcemerge(index=index)
def test_missing_field_update(self):
    """Updating a distribution that lost a previously indexed field
    must keep the data already indexed for that field."""
    missing_field = '212.1_PSCIOS_ERS_0_0_22'
    node = Node(catalog_id=CATALOG_ID,
                catalog_url=os.path.join(SAMPLES_DIR, 'full_ts_data.json'),
                indexable=True)
    self._index_catalog('full_ts_data.json', node)

    with transaction.atomic():
        node.catalog_url = os.path.join(SAMPLES_DIR, 'full_ts_data.json')
        # Second run: catalog "update" where the field is gone.
        self._index_catalog('missing_field.json', node)

    hits = Search(index=self.test_index) \
        .filter('match', series_id=missing_field).execute()
    self.assertTrue(len(hits))
def setUpTestData(cls):
    """Index two catalogs (metadata only) shared by every test."""
    cls.task = ReadDataJsonTask.objects.create(
        indexing_mode=ReadDataJsonTask.METADATA_ONLY)
    node = Node(catalog_id=cls.catalog_id,
                catalog_url=cls.catalog,
                indexable=True)
    node.save()
    cls.node = node
    index_catalog(cls.node, cls.task, read_local=True, whitelist=True)

    cls.node_two = create_node(cls.catalog_two, cls.catalog_two_id)
    index_catalog(cls.node_two, cls.task, read_local=True, whitelist=True)
def test_read_datajson_command(self):
    """The management command behaves like calling read_datajson directly."""
    catalog_identifier = 'test_id'
    Node(catalog_id=catalog_identifier,
         catalog_url=os.path.join(dir_path, 'sample_data.json'),
         indexable=True).save()
    call_command('read_datajson', whitelist=True)
    created_fields = Field.objects.filter(
        distribution__dataset__catalog__identifier=catalog_identifier)
    self.assertTrue(created_fields)
def test_read_datajson_while_indexing(self):
    """No second task may be created while one is already RUNNING."""
    catalog_identifier = 'test_id'
    Node(catalog_id=catalog_identifier,
         catalog_url=os.path.join(dir_path, 'sample_data.json'),
         indexable=True).save()
    ReadDataJsonTask(status=ReadDataJsonTask.RUNNING).save()
    call_command('read_datajson')
    self.assertEqual(ReadDataJsonTask.objects.all().count(), 1)
def test_read(self):
    """Reading a node's catalog creates its fields in the database."""
    catalog_identifier = 'test_id'
    Node(catalog_id=catalog_identifier,
         catalog_url=os.path.join(dir_path, 'sample_data.json'),
         indexable=True).save()
    read_task = ReadDataJsonTask()
    read_task.save()
    read_datajson(read_task, whitelist=True)
    self.assertTrue(Field.objects.filter(
        distribution__dataset__catalog__identifier=catalog_identifier))
def test_read_datajson_one_node_only_calls_task_for_that_node(
        self, index_catalog):
    """A task bound to one node must only trigger indexing for that node."""
    Node(catalog_id='one_catalog',
         catalog_url='http://one_url.com',
         indexable=True).save()
    target = Node.objects.create(catalog_id='other_catalog',
                                 catalog_url='http://other_url.com',
                                 indexable=True)
    read_datajson(ReadDataJsonTask.objects.create(node=target))
    self.assertEqual(index_catalog.delay.call_count, 1)
def setUpClass(cls):
    """Start from a clean slate, then register and read one node."""
    super(AttachmentTests, cls).setUpClass()
    for model in (ReadDataJsonTask, Node, Catalog):
        model.objects.all().delete()
    node = Node(catalog_id=cls.catalog_id,
                catalog_url=cls.catalog,
                indexable=True,
                catalog=json.dumps(DataJson(cls.catalog)))
    node.save()
    cls.node = node
    call_command('read_datajson', whitelist=True, read_local=True)
def test_get_node(self):
    """DistributionRepository.get_node must look up the Node by the
    distribution's catalog identifier and return it."""
    distribution = Mock()
    distribution.dataset.catalog.identifier = 'test_node'
    node = Node(catalog_id='test_node')
    with patch(
            'series_tiempo_ar_api.libs.datajsonar_repositories.distribution_repository.Node'
    ) as fake_node:
        fake_node.objects.get.return_value = node
        self.assertEqual(
            DistributionRepository(distribution).get_node(), node)
        # Bug fix: `called_with` is not a Mock assertion method — it merely
        # auto-creates a truthy child mock, so the previous
        # assertTrue(fake_node.objects.get.called_with(...)) could never
        # fail. Use the real assertion, after the code under test ran.
        fake_node.objects.get.assert_called_with(catalog_id='test_node')
def process_node_register_file(register_file):
    """Register (create Node objects for) the nodes marked as federated in
    *register_file*'s indexing file, then mark the file as processed.

    The file is YAML mapping catalog ids to a dict that holds at least the
    keys 'federado' and 'url'.
    """
    yml = register_file.indexing_file.read()
    # Bug fix: yaml.load without an explicit Loader is deprecated and unsafe
    # on untrusted input (it can construct arbitrary Python objects);
    # safe_load only builds plain data structures.
    nodes = yaml.safe_load(yml)
    for catalog_id, values in nodes.items():
        # Bug fix: the previous `bool(values['federado']) is True` accepted
        # ANY truthy value, contradicting its own comment ("evitar entrar al
        # branch con un valor truthy"). Only a literal True enters now.
        if values['federado'] is True:
            try:
                node = Node.objects.get(catalog_id=catalog_id)
            except Node.DoesNotExist:
                node = Node(catalog_id=catalog_id)
            node.catalog_url = values['url']
            node.indexable = True
            node.save()
    register_file.state = NodeRegisterFile.PROCESSED
    register_file.save()
class ReaderTests(TestCase):
    """End-to-end reader behavior over the full time-series sample catalog."""

    catalog = os.path.join(SAMPLES_DIR, 'full_ts_data.json')
    catalog_id = 'catalog_id'

    def setUp(self):
        self.task = ReadDataJsonTask.objects.create()
        node = Node(catalog_id=self.catalog_id,
                    catalog_url=self.catalog,
                    indexable=True)
        node.save()
        self.node = node

    def _read(self, **kwargs):
        # Shortcut: index self.node with the shared task.
        index_catalog(self.node, self.task, **kwargs)

    def test_index_same_series_different_catalogs(self):
        # Indexing the same catalog twice must not duplicate fields.
        self._read(read_local=True, whitelist=True)
        self._read(read_local=True, whitelist=True)
        matches = Field.objects.filter(
            metadata__contains='212.1_PSCIOS_ERN_0_0_25').count()
        self.assertEqual(matches, 1)

    def test_error_distribution_logs(self):
        self.node.catalog_url = os.path.join(
            SAMPLES_DIR, 'distribution_missing_downloadurl.json')
        self.node.save()
        self._read(read_local=True, whitelist=True)
        task_logs = ReadDataJsonTask.objects.get(id=self.task.id).logs
        self.assertGreater(len(task_logs), 10)

    def test_index_only_time_series_if_specified(self):
        settings.DATAJSON_AR_TIME_SERIES_ONLY = True
        self.node.catalog_url = os.path.join(
            SAMPLES_DIR, 'mixed_time_series_catalog.json')
        self.node.save()
        self._read(read_local=True, whitelist=True)
        settings.DATAJSON_AR_TIME_SERIES_ONLY = False
        self.assertEqual(Distribution.objects.count(), 2)
        # Distribution 5.1 is not a time series, so it was never created.
        self.assertFalse(Distribution.objects.filter(identifier__in=["5.1"]))

    def test_catalog_is_present_on_connection_success(self):
        self._read(read_local=True, whitelist=True)
        catalog_model = Catalog.objects.get(identifier=self.catalog_id)
        self.assertFalse(catalog_model.error)
        self.assertTrue(catalog_model.present)

    def test_catalog_is_not_present_on_connection_failure(self):
        self._read(read_local=True, whitelist=True)
        self.node.catalog_url = 'invalid_url'
        index_catalog(self.node, self.task, read_local=False, whitelist=True)
        catalog_model = Catalog.objects.get(identifier=self.catalog_id)
        self.assertTrue(catalog_model.error)
        self.assertFalse(catalog_model.present)

    def test_node_does_not_automatically_index_new_datasets_by_default(self):
        self._read()
        first_dataset = (Catalog.objects
                         .get(identifier=self.catalog_id)
                         .dataset_set.first())
        self.assertFalse(first_dataset.indexable)

    def test_whitelisted_node_automatically_indexes_new_datasets(self):
        self._read(whitelist=True)
        first_dataset = (Catalog.objects
                         .get(identifier=self.catalog_id)
                         .dataset_set.first())
        self.assertTrue(first_dataset.indexable)

    def test_node_with_automatic_indexation_enabled_indexes_new_datasets(self):
        auto_node = Node(catalog_id='text_id',
                         catalog_url=self.catalog,
                         indexable=True,
                         new_datasets_auto_indexable=True)
        auto_node.save()
        index_catalog(auto_node, self.task)
        first_dataset = (Catalog.objects
                         .get(identifier=auto_node.catalog_id)
                         .dataset_set.first())
        self.assertTrue(first_dataset.indexable)
class ReaderTests(TestCase):
    """Distribution change-detection behavior across repeated pipeline runs."""

    catalog = os.path.join(SAMPLES_DIR, 'full_ts_data.json')
    catalog_id = 'catalog_id'

    def setUp(self):
        self.task = ReadDataJsonTask.objects.create()
        self.task.save()
        self.mgmt_task = ManagementTask()
        self.mgmt_task.save()
        node = Node(catalog_id=self.catalog_id,
                    catalog_url=self.catalog,
                    indexable=True)
        node.save()
        self.node = node

    def _run_pipeline(self):
        # Read the node's data.json locally, then index its catalog.
        read_datajson(self.task, whitelist=True, read_local=True)
        index_catalog(self.node, self.mgmt_task, read_local=True)

    def test_index_same_series_different_catalogs(self, *_):
        self._run_pipeline()
        self._run_pipeline()
        matches = Field.objects.filter(
            identifier='212.1_PSCIOS_ERN_0_0_25').count()
        self.assertEqual(matches, 1)

    def test_dont_index_same_distribution_twice(self, *_):
        self._run_pipeline()
        self._run_pipeline()
        distribution = Distribution.objects.get(identifier='212.1')
        # Until its data changes, the distribution is flagged not indexable.
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'False')

    def test_first_time_distribution_indexable(self, *_):
        self._run_pipeline()
        distribution = Distribution.objects.get(identifier='212.1')
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_index_same_distribution_if_data_changed(self, *_):
        self._run_pipeline()
        self.node.catalog_url = os.path.join(SAMPLES_DIR,
                                             'full_ts_data_changed.json')
        self.node.save()
        self._run_pipeline()
        distribution = Distribution.objects.get(identifier='212.1')
        # Data changed: indexed again and marked as indexable.
        self.assertEqual(
            distribution.enhanced_meta.get(key=meta_keys.CHANGED).value,
            'True')

    def test_error_distribution_logs(self, *_):
        self.node.catalog_url = os.path.join(
            SAMPLES_DIR, 'distribution_missing_downloadurl.json')
        self.node.save()
        self._run_pipeline()
        self.assertGreater(
            len(ReadDataJsonTask.objects.get(id=self.task.id).logs), 10)
def setUpTestData(cls):
    """Build the harvesting target, three source nodes and their catalogs."""
    # Harvesting node used as the federation target.
    HarvestingNode(name='aName', url='harvest_url', apikey='apikey',
                   enabled=True).save()

    # Source nodes: two valid samples plus one with invalid metadata.
    node_specs = (('id1', 'full_data.json'),
                  ('id2', 'minimum_data.json'),
                  ('id3', 'missing_dataset_title.json'))
    for catalog_id, sample in node_specs:
        Node(catalog_id=catalog_id,
             catalog_url=cls.get_sample(sample),
             indexable=True).save()

    # One catalog per node, each holding a dataset with the same identifier.
    catalogs = []
    for title, identifier in (('catalog_1', 'id1'),
                              ('catalog_2', 'id2'),
                              ('catalog_3', 'id3')):
        cat = Catalog(title=title, identifier=identifier,
                      metadata='{}', updated=True)
        cat.save()
        catalogs.append(cat)
    for cat in catalogs:
        Dataset(identifier='99db6631-d1c9-470b-a73e-c62daa32c777',
                metadata='{}', catalog=cat, indexable=True,
                present=True, updated=True).save()

    # Extra dataset living only in the first catalog.
    Dataset(identifier='99db6631-d1c9-470b-a73e-c62daa32c420',
            metadata='{}', catalog=catalogs[0], indexable=True,
            present=True, updated=True).save()
def test_get_data_json(self, repository, fake_node):
    """get_data_json must build the (patched) repository from the node
    resolved for the distribution."""
    distribution = Mock()
    node = Node(catalog_id='test_node')
    fake_node.objects.get.return_value = node
    DistributionRepository(distribution).get_data_json()
    # Bug fix: `repository.called_with(node)` is not a Mock assertion — it
    # auto-creates a truthy child mock, so assertTrue around it always
    # passed. Use the real assertion method instead.
    repository.assert_called_with(node)
class ReaderTests(IndexingTestCase):
    """Reader/indexer behavior: change detection, date formats, validation."""

    catalog = os.path.join(SAMPLES_DIR, 'full_ts_data.json')
    catalog_id = 'catalog_id'

    def setUp(self):
        self.task = ReadDataJsonTask.objects.create()
        self.task.save()
        self.mgmt_task = ManagementTask()
        self.mgmt_task.save()
        node = Node(catalog_id=self.catalog_id,
                    catalog_url=self.catalog,
                    indexable=True)
        node.save()
        self.node = node

    def _run_pipeline(self):
        # Read the node's data.json, then index its catalog.
        read_datajson(self.task, whitelist=True)
        index_catalog(self.node, self.mgmt_task)

    def _changed_flag(self, identifier):
        # CHANGED meta value ('True'/'False') for one distribution.
        distribution = Distribution.objects.get(identifier=identifier)
        return distribution.enhanced_meta.get(key=meta_keys.CHANGED).value

    def test_index_same_series_different_catalogs(self, *_):
        self._run_pipeline()
        self._run_pipeline()
        matches = Field.objects.filter(
            identifier='212.1_PSCIOS_ERN_0_0_25').count()
        self.assertEqual(matches, 1)

    def test_dont_index_same_distribution_twice(self, *_):
        self._run_pipeline()
        self._run_pipeline()
        # Until its data changes, the distribution is flagged not indexable.
        self.assertEqual(self._changed_flag('212.1'), 'False')

    def test_first_time_distribution_indexable(self, *_):
        self._run_pipeline()
        self.assertEqual(self._changed_flag('212.1'), 'True')

    def test_index_same_distribution_if_data_changed(self, *_):
        self._run_pipeline()
        self.node.catalog_url = os.path.join(SAMPLES_DIR,
                                             'full_ts_data_changed.json')
        self.node.save()
        self._run_pipeline()
        # Data changed: the distribution was indexed again.
        self.assertEqual(self._changed_flag('212.1'), 'True')

    def test_error_distribution_logs(self, *_):
        self.node.catalog_url = os.path.join(
            SAMPLES_DIR, 'distribution_missing_downloadurl.json')
        self.node.save()
        self._run_pipeline()
        self.assertGreater(
            len(ReadDataJsonTask.objects.get(id=self.task.id).logs), 10)

    def test_index_YYYY_MM_distribution(self, *_):
        self.node.catalog_url = os.path.join(SAMPLES_DIR,
                                             'single_data_yyyy_mm.json')
        self.node.save()
        self._run_pipeline()
        self.assertEqual(self._changed_flag('102.1'), 'True')

    def test_index_YYYY_distribution(self, *_):
        self.node.catalog_url = os.path.join(SAMPLES_DIR,
                                             'single_data_yyyy.json')
        self.node.save()
        self._run_pipeline()
        self.assertEqual(self._changed_flag('102.1'), 'True')

    @mock.patch('series_tiempo_ar_api.libs.indexing.catalog_reader.DataJson')
    def test_format_is_passed_to_data_json(self, data_json, *_):
        read_datajson(self.task, whitelist=True)
        self.node.catalog_format = 'xlsx'
        index_catalog(self.node, self.mgmt_task)
        self.assertEqual(data_json.call_args[1]['catalog_format'],
                         self.node.catalog_format)

    def test_significant_figures(self, *_):
        Catalog.objects.all().delete()
        self.node.catalog_url = os.path.join(SAMPLES_DIR, 'ipc_data.json')
        self.node.save()
        self._run_pipeline()
        # Identifier taken from the data.json sample.
        field = Field.objects.get(identifier='serie_inflacion')
        self.assertEqual(
            field.enhanced_meta.get(key='significant_figures').value, '4')

    def test_custom_validation_options(self, *_):
        # With a zero max title length every validation will fail.
        config = DistributionValidatorConfig.get_solo()
        config.max_field_title_len = 0
        config.save()
        self._run_pipeline()
        distribution = Distribution.objects.get(identifier='212.1')
        self.assertTrue(distribution.error)