def test_store_data_dedups_simple(self, source_config):
    rd1 = RawDatum.objects.store_data(source_config, FetchResult('unique', 'mydatums'))
    rd2 = RawDatum.objects.store_data(source_config, FetchResult('unique', 'mydatums'))

    assert rd1.pk == rd2.pk
    assert rd1.created is True
    assert rd2.created is False
    assert rd1.date_created == rd2.date_created
    assert rd1.date_modified < rd2.date_modified
def test_store_data_dedups_complex(self, source_config):
    data = '{"providerUpdatedDateTime":"2016-08-25T11:37:40Z","uris":{"canonicalUri":"https://provider.domain/files/7d2792031","providerUris":["https://provider.domain/files/7d2792031"]},"contributors":[{"name":"Person1","email":"*****@*****.**"},{"name":"Person2","email":"*****@*****.**"},{"name":"Person3","email":"*****@*****.**"},{"name":"Person4","email":"*****@*****.**"}],"title":"ReducingMorbiditiesinNeonatesUndergoingMRIScannig"}'
    rd1 = RawDatum.objects.store_data(source_config, FetchResult('unique', data))
    rd2 = RawDatum.objects.store_data(source_config, FetchResult('unique', data))

    assert rd1.pk == rd2.pk
    assert rd1.created is True
    assert rd2.created is False
    assert rd1.date_modified < rd2.date_modified
    assert rd1.date_created == rd2.date_created
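# Note on the two dedup tests above: deduplication appears to key on the
# byte-identical payload (its sha256, per test_store_data further down), so two
# semantically-equal JSON documents that serialize differently would presumably
# NOT collapse into one row. A quick stdlib-only illustration of that assumption
# (this is not SHARE code):
import hashlib
import json

a = json.dumps({'x': 1, 'y': 2}, sort_keys=True)
b = json.dumps({'y': 2, 'x': 1}, sort_keys=True)   # same bytes once keys are sorted
c = json.dumps({'y': 2, 'x': 1})                   # different key order, different bytes

assert hashlib.sha256(a.encode()).hexdigest() == hashlib.sha256(b.encode()).hexdigest()
assert hashlib.sha256(a.encode()).hexdigest() != hashlib.sha256(c.encode()).hexdigest()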
def post(self, request, *args, **kwargs):
    try:
        jsonschema.validate(request.data, schemas.v1_push_schema)
    except jsonschema.exceptions.ValidationError as error:
        raise ParseError(detail=error.message)

    try:
        prelim_data = request.data['jsonData']
    except KeyError as error:
        # A missing key raises KeyError, not ParseError
        return Response(
            'Invalid JSON - {0}'.format(error),
            status=status.HTTP_400_BAD_REQUEST,
        )

    # Store the raw data; this endpoint accepts only one document per request
    with transaction.atomic():
        try:
            doc_id = prelim_data['uris']['canonicalUri']
        except KeyError:
            return Response({
                'errors': 'Canonical URI not found in uris.',
                'data': prelim_data,
            }, status=status.HTTP_400_BAD_REQUEST)

        config = self._get_source_config(request.user)

        raw = RawDatum.objects.store_data(config, FetchResult(
            doc_id,
            DictSerializer(pretty=False).serialize(prelim_data),
            timezone.now(),
        ))

        transformed_data = config.get_transformer().transform(raw.datum)

        serializer = BasicNormalizedDataSerializer(
            data={'data': transformed_data},
            context={'request': request},
        )
        if serializer.is_valid():
            nm_instance = serializer.save()
            async_result = disambiguate.delay(nm_instance.id)
            return Response({'task_id': async_result.id}, status=status.HTTP_202_ACCEPTED)
        return Response({'errors': serializer.errors, 'data': prelim_data}, status=status.HTTP_400_BAD_REQUEST)
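# Example request body for the v1 push endpoint above. The field names jsonData
# and uris.canonicalUri come straight from the handler; the rest is illustrative
# sample data (borrowed from test_store_data_dedups_complex) and may not satisfy
# every field v1_push_schema requires:
EXAMPLE_V1_PUSH_BODY = {
    'jsonData': {
        'providerUpdatedDateTime': '2016-08-25T11:37:40Z',
        'uris': {'canonicalUri': 'https://provider.domain/files/7d2792031'},
        'contributors': [{'name': 'Person1', 'email': '*****@*****.**'}],
        'title': 'Example title',
    },
}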
def test_swbiodiversity_transformer():
    config = SourceConfig.objects.get(label='org.swbiodiversity')
    transformer = config.get_transformer()
    fetch_result = FetchResult(
        'http://swbiodiversity.org/seinet/collections/misc/collprofiles.php?collid=187',
        data,
    )
    raw_datum = RawDatum.objects.store_data(config, fetch_result)
    result = transformer.transform(raw_datum)

    assert result['@graph'][3]['@type'] == 'dataset'
    assert result['@graph'][3]['description'] == 'Sample description'
    assert result['@graph'][3]['title'] == 'A. Michael Powell Herbarium (SRSC)'
    assert result['@graph'][3]['extra']['usage_rights'] == 'CC BY-NC (Attribution-Non-Commercial)'
    assert result['@graph'][3]['extra']['access_rights'] == 'Sul Ross University'
    assert result['@graph'][3]['extra']['collection_statistics'] == {
        "(25%) georeferenced": "1,195",
        "(59%) identified to species": "2,849",
        "(61%) with images": "2,954",
        "families": "104",
        "genera": "361",
        "species": "661",
        "specimen records": "4,868",
        "total taxa (including subsp. and var.)": "762",
    }
    assert result['@graph'][4]['uri'] == 'http://swbiodiversity.org/seinet/collections/misc/collprofiles.php?collid=187'
def setup_ingest(self, claim_job):
    assert self.datum and self._config and not (self.raw or self.job or self.async_task)

    # TODO get rid of FetchResult, or make it more sensical
    from share.harvest.base import FetchResult

    fetch_result = FetchResult(self.datum_id, self.datum, self.datestamp)
    self.raw = RawDatum.objects.store_data(self._config, fetch_result)
    self.job = IngestJob.schedule(self.raw, claim=claim_job)
    return self
def test_rawdata(self, source_config):
    work = factories.AbstractCreativeWorkFactory(
        change__change_set__normalized_data__raw=models.RawDatum.objects.store_data(
            source_config,
            FetchResult('unique', 'data'),
        ),
    )
    work.change.change_set.normalized_data.delete()

    assert models.Change.objects.count() == 0
    assert models.ChangeSet.objects.count() == 0
    assert models.NormalizedData.objects.count() == 0
    assert models.AbstractCreativeWork.objects.count() == 0
def test_store_data(self, source_config):
    rd = RawDatum.objects.store_data(source_config, FetchResult('unique', 'mydatums'))

    assert rd.date_modified is not None
    assert rd.date_created is not None
    assert rd.datum == 'mydatums'
    assert rd.suid.identifier == 'unique'
    assert rd.suid.source_config == source_config
    assert rd.sha256 == hashlib.sha256(b'mydatums').hexdigest()
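# A minimal sketch of the contract test_store_data and the dedup tests pin down:
# each (source_config, identifier) pair maps to a SUID, and the datum is
# deduplicated on its sha256. This is an illustration only, NOT SHARE's actual
# RawDatumManager.store_data implementation; store_data_sketch is hypothetical,
# and SourceUniqueIdentifier is assumed to be the SUID model the tests' rd.suid
# points at.
import hashlib

def store_data_sketch(source_config, fetch_result):
    suid, _ = SourceUniqueIdentifier.objects.get_or_create(
        source_config=source_config,
        identifier=fetch_result.identifier,
    )
    raw, created = RawDatum.objects.get_or_create(
        suid=suid,
        sha256=hashlib.sha256(fetch_result.datum.encode('utf-8')).hexdigest(),
        defaults={'datum': fetch_result.datum},
    )
    raw.created = created  # annotation the dedup tests inspect
    if not created:
        raw.save(update_fields=['date_modified'])  # re-storing bumps date_modified
    return raw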
def test_data_flow(self, source_config, monkeypatch, count, rediscovered, superfluous, limit, ingest, django_assert_num_queries):
    assert rediscovered <= count, 'Y tho'

    fake = Factory.create()
    mock_ingest_task = mock.Mock()
    monkeypatch.setattr('share.tasks.transform', mock_ingest_task)

    source_config.harvester.get_class()._do_fetch.extend([
        (fake.sentence(), str(i * 50)) for i in range(count)
    ])

    list(RawDatum.objects.store_chunk(source_config, (
        FetchResult(*tup)
        for tup in random.sample(source_config.harvester.get_class()._do_fetch, rediscovered)
    )))

    log = factories.HarvestLogFactory(source_config=source_config)

    tasks.harvest(log_id=log.id, superfluous=superfluous, limit=limit, ingest=ingest)

    log.refresh_from_db()

    assert log.completions == 1
    assert log.status == HarvestLog.STATUS.succeeded
    assert log.raw_data.count() == (count if limit is None or count < limit else limit)

    if limit is not None and rediscovered:
        assert RawDatum.objects.filter().count() >= rediscovered
        assert RawDatum.objects.filter().count() <= rediscovered + max(0, min(limit, count - rediscovered))
    else:
        assert RawDatum.objects.filter().count() == (count if limit is None or count < limit else limit)

    if ingest:
        if superfluous:
            assert mock_ingest_task.apply_async.call_count == min(count, limit or 99999)
        elif limit is not None:
            assert mock_ingest_task.apply_async.call_count <= min(limit, count)
            assert mock_ingest_task.apply_async.call_count >= min(limit, count) - rediscovered
        else:
            assert mock_ingest_task.apply_async.call_count == count - rediscovered
    else:
        assert mock_ingest_task.apply_async.call_count == 0
def test_swbiodiversity_transformer():
    config = SourceConfig.objects.get(label='org.swbiodiversity')
    transformer = config.get_transformer()
    fetch_result = FetchResult(
        'http://swbiodiversity.org/seinet/collections/misc/collprofiles.php?collid=187',
        data,
    )
    raw_datum = RawDatum.objects.store_data(config, fetch_result)
    graph = transformer.transform(raw_datum)

    dataset = graph.filter_nodes(lambda n: n.type == 'dataset')[0]
    assert dataset.type == 'dataset'
    assert dataset['description'] == 'Sample description'
    assert dataset['title'] == 'A. Michael Powell Herbarium (SRSC)'
    assert dataset['extra']['usage_rights'] == 'CC BY-NC (Attribution-Non-Commercial)'
    assert dataset['extra']['access_rights'] == 'Sul Ross University'
    assert dataset['extra']['collection_statistics'] == {
        "(25%) georeferenced": "1,195",
        "(59%) identified to species": "2,849",
        "(61%) with images": "2,954",
        "families": "104",
        "genera": "361",
        "species": "661",
        "specimen records": "4,868",
        "total taxa (including subsp. and var.)": "762",
    }

    agent_relations = dataset['agent_relations']
    assert len(agent_relations) == 1
    agent = agent_relations[0]['agent']
    assert agent['given_name'] == 'Test'
    assert agent['identifiers'][0]['uri'] == 'mailto:[email protected]'

    identifiers = dataset['identifiers']
    assert len(identifiers) == 1
    assert identifiers[0]['uri'] == 'http://swbiodiversity.org/seinet/collections/misc/collprofiles.php?collid=187'
def test_data_flow(self, source_config, monkeypatch, count, rediscovered, superfluous, limit, ingest, django_assert_num_queries):
    assert rediscovered <= count, 'Y tho'

    fake = Factory.create()

    source_config.harvester.get_class()._do_fetch.extend([
        (fake.sentence(), str(i * 50)) for i in range(count)
    ])

    list(RawDatum.objects.store_chunk(source_config, (
        FetchResult(*tup)
        for tup in random.sample(source_config.harvester.get_class()._do_fetch, rediscovered)
    )))

    job = factories.HarvestJobFactory(source_config=source_config)

    tasks.harvest(job_id=job.id, superfluous=superfluous, limit=limit, ingest=ingest)

    job.refresh_from_db()

    assert job.completions == 1
    assert job.status == HarvestJob.STATUS.succeeded
    assert job.raw_data.count() == (count if limit is None or count < limit else limit)

    if limit is not None and rediscovered:
        assert RawDatum.objects.filter().count() >= rediscovered
        assert RawDatum.objects.filter().count() <= rediscovered + max(0, min(limit, count - rediscovered))
    else:
        assert RawDatum.objects.filter().count() == (count if limit is None or count < limit else limit)

    ingest_count = IngestJob.objects.filter(status=IngestJob.STATUS.created).count()
    if ingest:
        if superfluous:
            assert ingest_count == min(count, limit or 99999)
        elif limit is not None:
            assert ingest_count <= min(limit, count)
            assert ingest_count >= min(limit, count) - rediscovered
        else:
            assert ingest_count == count - rediscovered
    else:
        assert ingest_count == 0