def do_run(self, *args, **kwargs):
    '''Build a ChangeSet from ``self.normalized`` and accept it when the source is a robot or trusted.

    Any exception raised while building or accepting the change set is
    logged and passed to ``self.retry`` with a 10 second countdown.
    '''
    # Load all relevant ContentTypes in a single query
    # NOTE(review): confirm apps.get_models('share') selects the intended
    # models for the installed Django version.
    share_models = apps.get_models('share')
    ContentType.objects.get_for_models(*share_models, for_concrete_models=False)

    logger.info(
        '%s started make JSON patches for NormalizedData %s at %s',
        self.started_by,
        self.normalized.id,
        datetime.datetime.utcnow().isoformat(),
    )

    try:
        with transaction.atomic():
            change_graph = ChangeGraph(
                self.normalized.data['@graph'],
                namespace=self.normalized.source.username,
            )
            change_graph.process()
            change_set = ChangeSet.objects.from_graph(change_graph, self.normalized.id)
            if change_set:
                if self.source.is_robot or self.source.is_trusted:
                    # TODO: verify change set is not overwriting user created object
                    change_set.accept()
    except Exception as e:
        logger.info(
            'Failed make JSON patches for NormalizedData %s with exception %s. Retrying...',
            self.normalized.id,
            e,
        )
        raise self.retry(countdown=10, exc=e)

    logger.info(
        'Finished make JSON patches for NormalizedData %s by %s at %s',
        self.normalized.id,
        self.started_by,
        datetime.datetime.utcnow().isoformat(),
    )
def _apply_changes(self, job, normalized_datum):
    '''Turn ``normalized_datum`` into a ChangeSet, accept it, and report the works to index.

    Returns:
        A list of ids of AbstractCreativeWork instances that were updated
        or matched by the graph, or None when nothing was accepted or the
        job was rescheduled after a conflict.

    Raises:
        exceptions.IngestConflict, OperationalError: re-raised once
            ``job.retries`` exceeds ``self.MAX_RETRIES``.
    '''
    updated = None

    try:
        # Load all relevant ContentTypes in a single query
        # NOTE(review): confirm apps.get_models('share') selects the intended
        # models for the installed Django version.
        ContentType.objects.get_for_models(*apps.get_models('share'), for_concrete_models=False)

        with transaction.atomic():
            cg = ChangeGraph(normalized_datum.data['@graph'], namespace=normalized_datum.source.username)
            cg.process()
            cs = ChangeSet.objects.from_graph(cg, normalized_datum.id)
            if cs and (
                normalized_datum.source.is_robot
                or normalized_datum.source.is_trusted
                or Source.objects.filter(user=normalized_datum.source).exists()
            ):
                updated = cs.accept()

    # Retry if it was just the wrong place at the wrong time
    except (exceptions.IngestConflict, OperationalError):
        job.retries = (job.retries or 0) + 1
        job.save(update_fields=('retries',))
        if job.retries > self.MAX_RETRIES:
            raise
        job.reschedule()
        return

    if not updated:
        return  # Nothing to index

    # Index works that were added or directly updated
    updated_works = {x.id for x in updated if isinstance(x, AbstractCreativeWork)}
    # and works that matched, even if they didn't change, in case any related objects did
    existing_works = {n.instance.id for n in cg.nodes if isinstance(n.instance, AbstractCreativeWork)}

    return list(updated_works | existing_works)
def test_accept_subject(self, normalized_data_id):
    '''After accepting the graph, exactly one preprint is linked to a subject named 'Felines'.'''
    models.Subject.objects.bulk_create([models.Subject(name='Felines')])
    assert models.Subject.objects.filter(name='Felines').count() == 1

    subject_node = {
        '@id': '_:987',
        '@type': 'subject',
        'name': 'Felines'
    }
    through_node = {
        '@id': '_:678',
        '@type': 'throughsubjects',
        'subject': {'@id': '_:987', '@type': 'subject'},
        'creative_work': {'@id': '_:789', '@type': 'preprint'},
    }
    work_node = {
        '@id': '_:789',
        '@type': 'preprint',
        'title': 'All About Cats',
    }

    graph = ChangeGraph([subject_node, through_node, work_node])
    graph.process()

    models.ChangeSet.objects.from_graph(graph, normalized_data_id).accept()

    feline_preprints = models.Preprint.objects.filter(subjects__name='Felines')
    assert feline_preprints.count() == 1
    assert feline_preprints.first().title == 'All About Cats'
def test_normalize_workidentifier(self, input, output, Graph):
    '''Processing a WorkIdentifier with uri ``input`` serializes to one with uri ``output``, or to [] when ``output`` is falsy.'''
    identifier = WorkIdentifier(uri=input, creative_work=None)
    graph = ChangeGraph(Graph(identifier))
    graph.process(disambiguate=False)

    serialized = graph.serialize()
    if output:
        expected = Graph(WorkIdentifier(uri=output, parse=True, creative_work=None))
    else:
        expected = []

    assert serialized == expected
def test_add_relation_related(self, normalized_data_id):
    '''
    Start with one existing work. Accept a second work that cites a work
    carrying the same identifier URI. The existing work should then expose
    the matching inverse relation back to the second work.
    '''
    uri = 'http://osf.io/special-snowflake'

    existing_graph = ChangeGraph([{
        '@id': '_:1234',
        '@type': 'article',
        'title': 'All About Cats',
        'identifiers': [{'@id': '_:2345', '@type': 'workidentifier'}]
    }, {
        '@id': '_:2345',
        '@type': 'workidentifier',
        'uri': uri,
        'creative_work': {'@id': '_:1234', '@type': 'article'}
    }])
    models.ChangeSet.objects.from_graph(existing_graph, normalized_data_id).accept()

    assert models.Article.objects.count() == 1

    citing_work = {
        '@id': '_:1234',
        '@type': 'preprint',
        'title': 'Dogs are okay too',
        'related_works': [{'@id': '_:foo', '@type': 'cites'}]
    }
    citation = {
        '@id': '_:foo',
        '@type': 'cites',
        'subject': {'@id': '_:1234', '@type': 'preprint'},
        'related': {'@id': '_:2345', '@type': 'creativework'},
    }
    cited_work = {
        '@id': '_:2345',
        '@type': 'creativework',
        'identifiers': [{'@id': '_:4567', '@type': 'workidentifier'}]
    }
    cited_identifier = {
        '@id': '_:4567',
        '@type': 'workidentifier',
        'uri': uri,
        'creative_work': {'@id': '_:2345', '@type': 'creativework'}
    }

    graph = ChangeGraph([citing_work, citation, cited_work, cited_identifier])
    graph.process()
    models.ChangeSet.objects.from_graph(graph, normalized_data_id).accept()

    assert models.Article.objects.count() == 1
    assert models.Preprint.objects.count() == 1
    assert models.CreativeWork.objects.filter(type='share.creativework').count() == 0

    cat = models.Article.objects.first()
    dog = models.Preprint.objects.first()

    outgoing = dog.outgoing_creative_work_relations
    assert outgoing.count() == 1
    assert outgoing.first()._meta.model_name == 'cites'
    assert outgoing.first().related == cat

    incoming = cat.incoming_creative_work_relations
    assert incoming.count() == 1
    assert incoming.first()._meta.model_name == 'cites'
    assert incoming.first().subject == dog
def test_is_blank(self):
    '''A node whose ``_id`` is overwritten with '1234' reports ``is_blank`` as False.'''
    graph = ChangeGraph([{'@id': '_:1234', '@type': 'person'}])
    node = graph.nodes[0]

    node._id = '1234'

    assert node.is_blank is False
def test_create_extra(self):
    '''A node created on a graph built with namespace 'testing' carries that namespace.'''
    attrs = {'name': 'Foo', 'extra': {'tag': 'Foo'}}
    graph = ChangeGraph([], namespace='testing')

    created = graph.create(None, 'tag', attrs)

    assert created.namespace == 'testing'
def test_delete_cascade(self, queryset, deltas, Graph):
    '''Deleting ``queryset`` changes each model's row count by the delta given in ``deltas``.'''
    initial_cg = ChangeGraph(Graph(*self.initial))
    initial_cg.process(disambiguate=False)
    change_set = ChangeSet.objects.from_graph(initial_cg, factories.NormalizedDataFactory().id)
    change_set.accept()

    # Snapshot the counts before the delete so the deltas can be compared.
    before = {}
    for model in deltas:
        before[model] = model.objects.count()

    queryset.delete()

    for model, delta in deltas.items():
        after = model.objects.count()
        assert after - before[model] == delta
def test_external_reference(self):
    '''Building a graph whose contributor references a person by the non-blank id 8 does not raise.'''
    contributor = {
        '@id': '_:5678',
        '@type': 'contributor',
        'person': {
            '@id': 8,
            '@type': 'person'
        }
    }
    ChangeGraph.from_jsonld({'@graph': [contributor]}, disambiguate=False)
def test_unresolved_reference(self):
    '''A reference to blank node '_:1234' that is absent from the graph raises UnresolvableReference.'''
    contributor = {
        '@id': '_:5678',
        '@type': 'contributor',
        'person': {
            '@id': '_:1234',
            '@type': 'person'
        }
    }
    jsonld = {'@graph': [contributor]}

    with pytest.raises(UnresolvableReference):
        ChangeGraph.from_jsonld(jsonld, disambiguate=False)
def test_delete_cascade(self, queryset, deltas, Graph):
    '''Deleting ``queryset`` changes each model's row count by the delta given in ``deltas``.'''
    initial_cg = ChangeGraph(Graph(*self.initial))
    initial_cg.process(disambiguate=False)
    change_set = ChangeSet.objects.from_graph(initial_cg, factories.NormalizedDataFactory().id)
    change_set.accept()

    # Snapshot the counts before the delete so the deltas can be compared.
    before = {}
    for model in deltas:
        before[model] = model.objects.count()

    queryset.delete()

    for model, delta in deltas.items():
        after = model.objects.count()
        assert after - before[model] == delta
def test_generic_creative_work(self, normalized_data_id):
    '''
    A Preprint with an Identifier exists. Accept a changeset with a
    CreativeWork that has the same Identifier and a different title. The
    Preprint's title should be updated to the new value, but its type
    should remain the same.
    '''
    old_title = 'Ambiguous Earthquakes'
    uri = 'http://osf.io/special-snowflake'

    original_change_set = models.ChangeSet.objects.from_graph(ChangeGraph([{
        '@id': '_:1234',
        '@type': 'preprint',
        'title': old_title,
        'identifiers': [{'@id': '_:2345', '@type': 'workidentifier'}]
    }, {
        '@id': '_:2345',
        '@type': 'workidentifier',
        'uri': uri,
        'creative_work': {'@id': '_:1234', '@type': 'preprint'}
    }]), normalized_data_id)

    preprint, identifier = original_change_set.accept()
    # Named work_id rather than id so the builtin id() is not shadowed.
    work_id = preprint.id

    assert identifier.uri == uri
    assert models.Preprint.objects.count() == 1
    assert models.CreativeWork.objects.filter(type='share.creativework').count() == 0
    assert models.Preprint.objects.get(id=work_id).title == old_title

    new_title = 'Ambidextrous Earthquakes'

    graph = ChangeGraph([{
        '@id': '_:1234',
        '@type': 'creativework',
        'title': new_title,
        'identifiers': [{'@id': '_:2345', '@type': 'workidentifier'}]
    }, {
        '@id': '_:2345',
        '@type': 'workidentifier',
        'uri': uri,
        'creative_work': {'@id': '_:1234', '@type': 'creativework'}
    }])
    graph.process()

    change_set = models.ChangeSet.objects.from_graph(graph, normalized_data_id)
    change_set.accept()

    assert models.Preprint.objects.count() == 1
    assert models.CreativeWork.objects.filter(type='share.creativework').count() == 0
    assert models.Preprint.objects.get(id=work_id).title == new_title
def test_no_timetraveling(self, Graph):
    '''After newer data is accepted, an older-dated graph's change contains only its title, not is_deleted.'''
    newer_publication = Publication(
        id=1,
        sparse=True,
        identifiers=[WorkIdentifier(1)],
        date_updated='2017-02-03T18:07:53.385000',
        is_deleted=False,
    )
    newer_graph = ChangeGraph(Graph(newer_publication))
    newer_graph.process()
    ChangeSet.objects.from_graph(newer_graph, NormalizedDataFactory().id).accept()

    older_publication = Publication(
        id=1,
        sparse=True,
        identifiers=[WorkIdentifier(1)],
        date_updated='2017-02-03T18:07:50.000000',
        is_deleted=True,
        title='Not Previously Changed',
    )
    older_graph = ChangeGraph(Graph(older_publication))
    older_graph.process()

    expected_change = {'title': 'Not Previously Changed'}
    assert older_graph.nodes[0].change == expected_change
def test_can_delete_work(self, john_doe, normalized_data_id):
    '''Accepting a second graph with is_deleted=True for the same identifier marks the stored preprint deleted.'''
    creation_graph = ChangeGraph([{
        '@id': '_:abc',
        '@type': 'workidentifier',
        'uri': 'http://osf.io/faq',
        'creative_work': {'@id': '_:789', '@type': 'preprint'}
    }, {
        '@id': '_:789',
        '@type': 'preprint',
        'title': 'All About Cats',
    }])
    creation_graph.process()

    creation_set = models.ChangeSet.objects.from_graph(creation_graph, normalized_data_id)
    preprint, identifier = creation_set.accept()

    assert preprint.is_deleted is False

    deletion_graph = ChangeGraph([{
        '@id': '_:abc',
        '@type': 'workidentifier',
        'uri': 'http://osf.io/faq',
        'creative_work': {'@id': '_:789', '@type': 'preprint'}
    }, {
        '@id': '_:789',
        'is_deleted': True,
        '@type': 'preprint',
    }])
    deletion_graph.process()

    deletion_set = models.ChangeSet.objects.from_graph(deletion_graph, normalized_data_id)
    deletion_set.accept()

    preprint.refresh_from_db()
    assert preprint.is_deleted is True
def test_no_merge_on_blank_value(self, Graph):
    '''Two publisher relations with an empty cited_as but different organizations stay separate.'''
    blank_cited_as = [
        Publication(
            identifiers=[WorkIdentifier(1)],
            agent_relations=[
                Publisher(cited_as='', agent=Organization(1)),
            ],
        ),
    ]
    initial_cg = ChangeGraph(Graph(*blank_cited_as))
    initial_cg.process()
    ChangeSet.objects.from_graph(initial_cg, NormalizedDataFactory().id).accept()

    assert models.Publication.objects.count() == 1
    assert models.Publisher.objects.count() == 1
    assert models.Organization.objects.count() == 1

    additional_pub = [
        Publication(
            identifiers=[WorkIdentifier(1)],
            agent_relations=[
                Publisher(cited_as='', agent=Organization(1)),
                Publisher(cited_as='', agent=Organization(2)),
            ],
        ),
    ]
    next_cg = ChangeGraph(Graph(*additional_pub))
    next_cg.process()
    ChangeSet.objects.from_graph(next_cg, NormalizedDataFactory().id).accept()

    assert models.Publication.objects.count() == 1
    assert models.Publisher.objects.count() == 2
    assert models.Organization.objects.count() == 2
def change_node():
    '''Return the first node of a one-person ChangeGraph (given name 'No', family name 'Matter').'''
    person = {
        '@id': '_:1234',
        '@type': 'person',
        'given_name': 'No',
        'family_name': 'Matter',
    }
    graph = ChangeGraph([person])
    return graph.nodes[0]
def test_subject_accept(self, normalized_data_id):
    '''After accepting the JSON-LD graph, exactly one preprint is linked to a subject named 'Felines'.'''
    Subject.objects.bulk_create([Subject(name='Felines', lineages=[])])
    assert Subject.objects.filter(name='Felines').count() == 1

    subject_node = {
        '@id': '_:987',
        '@type': 'subject',
        'name': 'Felines'
    }
    through_node = {
        '@id': '_:678',
        '@type': 'throughsubjects',
        'subject': {'@id': '_:987', '@type': 'subject'},
        'creative_work': {'@id': '_:789', '@type': 'preprint'},
    }
    work_node = {
        '@id': '_:789',
        '@type': 'preprint',
        'title': 'All About Cats',
    }
    graph = ChangeGraph.from_jsonld({'@graph': [subject_node, through_node, work_node]})

    ChangeSet.objects.from_graph(graph, normalized_data_id).accept()

    feline_preprints = Preprint.objects.filter(subjects__name='Felines')
    assert feline_preprints.count() == 1
    assert feline_preprints.first().title == 'All About Cats'
def create_graph_dependencies():
    '''Return a ChangeGraph of a person, a preprint, and a Creator relation joining them.'''
    person = {
        '@id': '_:123',
        '@type': 'person',
        'given_name': 'Jane',
        'family_name': 'Doe',
    }
    creator = {
        '@id': '_:456',
        '@type': 'Creator',
        'agent': {'@id': '_:123', '@type': 'person'},
        'creative_work': {'@id': '_:789', '@type': 'preprint'},
    }
    preprint = {
        '@id': '_:789',
        '@type': 'preprint',
        'title': 'All About Cats',
        'related_agents': [{'@id': '_:456', '@type': 'Creator'}]
    }
    return ChangeGraph([person, creator, preprint])
def test_topological_sort_many_to_many(self):
    '''The three nodes come back ordered preprint, person, contributor.'''
    preprint = {
        '@id': '_:91011',
        '@type': 'preprint',
        'contributors': [{'@id': '_:5678', '@type': 'contributor'}]
    }
    contributor = {
        '@id': '_:5678',
        '@type': 'contributor',
        'person': {'@id': '_:1234', '@type': 'person'}
    }
    person = {
        '@id': '_:1234',
        '@type': 'person',
        'given_name': 'Doe',
        'family_name': 'Jane',
    }

    graph = ChangeGraph.from_jsonld(
        {'@graph': [preprint, contributor, person]},
        disambiguate=False,
    )

    assert len(graph.nodes) == 3
    first, second, third = graph.nodes[0], graph.nodes[1], graph.nodes[2]
    assert first.id == '_:91011'
    assert second.id == '_:1234'
    assert third.id == '_:5678'
def create_graph_dependencies():
    '''Return a ChangeGraph (built from JSON-LD, not disambiguated) of a person, a contributor and a preprint.'''
    person = {
        '@id': '_:123',
        '@type': 'person',
        'given_name': 'Jane',
        'family_name': 'Doe',
    }
    contributor = {
        '@id': '_:456',
        '@type': 'contributor',
        'person': {'@id': '_:123', '@type': 'person'},
        'creative_work': {'@id': '_:789', '@type': 'preprint'},
    }
    preprint = {
        '@id': '_:789',
        '@type': 'preprint',
        'title': 'All About Cats',
    }
    jsonld = {'@graph': [person, contributor, preprint]}
    return ChangeGraph.from_jsonld(jsonld, disambiguate=False)
def test_update_dependencies_accept(self, john_doe, normalized_data_id):
    '''Accepting the graph renames ``john_doe`` to 'Jane' and links one new preprint to him as contributor.'''
    person_update = {
        '@id': john_doe.pk,
        '@type': 'person',
        'given_name': 'Jane',
    }
    contributor = {
        '@id': '_:456',
        '@type': 'contributor',
        'person': {'@id': john_doe.pk, '@type': 'person'},
        'creative_work': {'@id': '_:789', '@type': 'preprint'},
    }
    preprint = {
        '@id': '_:789',
        '@type': 'preprint',
        'title': 'All About Cats',
    }
    graph = ChangeGraph.from_jsonld({'@graph': [person_update, contributor, preprint]})

    ChangeSet.objects.from_graph(graph, normalized_data_id).accept()

    john_doe.refresh_from_db()
    assert john_doe.given_name == 'Jane'

    contributed = Preprint.objects.filter(contributor__person=john_doe)
    assert contributed.count() == 1
    assert contributed.first().title == 'All About Cats'
def test_update_dependencies_accept(self, john_doe, normalized_data_id):
    '''Accepting the graph renames ``john_doe`` to 'Jane' and links one new preprint to him via a Creator relation.'''
    person_update = {
        '@id': IDObfuscator.encode(john_doe),
        '@type': 'person',
        'given_name': 'Jane',
    }
    creator = {
        '@id': '_:456',
        '@type': 'Creator',
        'agent': {'@id': IDObfuscator.encode(john_doe), '@type': 'person'},
        'creative_work': {'@id': '_:789', '@type': 'preprint'},
    }
    preprint = {
        '@id': '_:789',
        '@type': 'preprint',
        'title': 'All About Cats',
    }
    graph = ChangeGraph([person_update, creator, preprint])

    models.ChangeSet.objects.from_graph(graph, normalized_data_id).accept()

    john_doe.refresh_from_db()
    assert john_doe.given_name == 'Jane'

    related_preprints = models.Preprint.objects.filter(agent_relations__agent=john_doe)
    assert related_preprints.count() == 1
    assert related_preprints.first().title == 'All About Cats'
def create_graph():
    '''Return a ChangeGraph holding a single blank person node named Jane Doe.'''
    jane = {
        '@id': '_:1234',
        '@type': 'person',
        'given_name': 'Jane',
        'family_name': 'Doe',
    }
    return ChangeGraph([jane])
def test_topological_sort_many_to_many(self):
    '''The graph has three nodes and the contributor node comes last.'''
    preprint = {
        '@id': '_:91011',
        '@type': 'preprint',
        'contributors': [{'@id': '_:5678', '@type': 'contributor'}]
    }
    contributor = {
        '@id': '_:5678',
        '@type': 'contributor',
        'agent': {'@id': '_:1234', '@type': 'person'},
        'creative_work': {'@id': '_:91011', '@type': 'preprint'},
    }
    person = {
        '@id': '_:1234',
        '@type': 'person',
        'given_name': 'Doe',
        'family_name': 'Jane',
    }

    graph = ChangeGraph([preprint, contributor, person])

    assert len(graph.nodes) == 3
    # NOTE(review): the checks on nodes[0] and nodes[1] are disabled; confirm
    # whether the order of the first two nodes is meant to be unspecified.
    # assert graph.nodes[0].id == '_:1234'
    # assert graph.nodes[1].id == '_:91011'
    last_node = graph.nodes[2]
    assert last_node.id == '_:5678'
def test_no_changes(self, Graph):
    '''Re-submitting the already accepted ``initial`` graph produces no ChangeSet.'''
    initial_cg = ChangeGraph(Graph(*initial))
    initial_cg.process()
    first_change_set = ChangeSet.objects.from_graph(initial_cg, NormalizedDataFactory().id)
    first_change_set.accept()

    Graph.discarded_ids.clear()

    cg = ChangeGraph(Graph(*initial))
    cg.process()
    second_change_set = ChangeSet.objects.from_graph(cg, NormalizedDataFactory().id)

    assert second_change_set is None
def test_can_delete_work(self, john_doe, normalized_data_id):
    '''Accepting a second JSON-LD graph with is_deleted=True for the preprint marks the stored preprint deleted.'''
    creation_graph = ChangeGraph.from_jsonld({
        '@graph': [{
            '@id': '_:abc',
            '@type': 'link',
            'url': 'https://share.osf.io/faq',
            'type': 'provider',
        }, {
            '@id': '_:456',
            '@type': 'throughlinks',
            'link': {'@id': '_:abc', '@type': 'link'},
            'creative_work': {'@id': '_:789', '@type': 'preprint'},
        }, {
            '@id': '_:789',
            '@type': 'preprint',
            'title': 'All About Cats',
        }]
    })

    creation_set = ChangeSet.objects.from_graph(creation_graph, normalized_data_id)
    link, preprint, _ = creation_set.accept()

    assert preprint.is_deleted is False

    deletion_graph = ChangeGraph.from_jsonld({
        '@graph': [{
            '@id': '_:abc',
            '@type': 'link',
            'type': 'provider',
            'url': 'https://share.osf.io/faq',
        }, {
            '@id': '_:456',
            '@type': 'throughlinks',
            'link': {'@id': '_:abc', '@type': 'link'},
            'creative_work': {'@id': '_:789', '@type': 'preprint'},
        }, {
            '@id': '_:789',
            'is_deleted': True,
            '@type': 'preprint',
            'links': [{'@id': '_:456', '@type': 'throughlinks'}]
        }]
    })
    ChangeSet.objects.from_graph(deletion_graph, normalized_data_id).accept()

    preprint.refresh_from_db()
    assert preprint.is_deleted is True
def disambiguate(self, normalized_id):
    '''Apply the NormalizedData with pk ``normalized_id`` as a ChangeSet and index the affected works.

    The change set is accepted only when the source is a robot, is
    trusted, or has a Source row pointing at it. Any failure while
    building or accepting the change set is handed to ``self.retry`` with
    a randomized countdown that grows with ``self.request.retries`` and is
    capped at 15 minutes before the random factor is applied.

    Returns:
        None. Returns early when accepting produced nothing to index.

    Raises:
        Exception: whatever ``SearchIndexer.index`` raises, after logging.
    '''
    normalized = NormalizedData.objects.select_related('source__source').get(pk=normalized_id)

    if self.request.id:
        self.update_state(meta={'source': normalized.source.source.long_title})

    # Load all relevant ContentTypes in a single query
    # NOTE(review): confirm apps.get_models('share') selects the intended
    # models for the installed Django version.
    ContentType.objects.get_for_models(*apps.get_models('share'), for_concrete_models=False)

    updated = None

    try:
        with transaction.atomic():
            cg = ChangeGraph(normalized.data['@graph'], namespace=normalized.source.username)
            cg.process()
            cs = ChangeSet.objects.from_graph(cg, normalized.id)
            if cs and (
                normalized.source.is_robot
                or normalized.source.is_trusted
                or Source.objects.filter(user=normalized.source).exists()
            ):
                # TODO: verify change set is not overwriting user created object
                updated = cs.accept()
    except Exception as e:
        raise self.retry(
            exc=e,
            countdown=(random.random() + 1) * min(settings.CELERY_RETRY_BACKOFF_BASE ** self.request.retries, 60 * 15)
        )

    if not updated:
        return

    # Only index creativeworks on the fly, for the moment.
    updated_works = {x.id for x in updated if isinstance(x, AbstractCreativeWork)}
    existing_works = {n.instance.id for n in cg.nodes if isinstance(n.instance, AbstractCreativeWork)}
    ids = list(updated_works | existing_works)

    try:
        SearchIndexer(self.app).index('creativework', *ids)
    except Exception:
        # logger.exception already records the active exception and traceback.
        logger.exception('Could not add results from %r to elasticqueue', normalized)
        raise
def test_all_disambiguate(self, input, Graph, normalized_data_id):
    '''After the graph is accepted, find_instances gives every node an instance of the matching model.'''
    graph = ChangeGraph(Graph(*input))
    ChangeSet.objects.from_graph(graph, normalized_data_id).accept()

    # No node has an instance before find_instances runs.
    for node in graph.nodes:
        assert node.instance is None

    GraphDisambiguator().find_instances(graph)

    for node in graph.nodes:
        assert node.instance
    for node in graph.nodes:
        assert node.instance._meta.model_name == node.type
def from_graph(self, graph, disambiguate=False):
    '''Store ``graph`` as NormalizedData and return the ChangeSet built from it via ChangeGraph.from_jsonld.'''
    nd = NormalizedData.objects.create(normalized_data=graph, source=share_source)
    change_graph = ChangeGraph.from_jsonld(graph, disambiguate=disambiguate)
    return ChangeSet.objects.from_graph(change_graph, nd.pk)
def test_change_work_type(self, normalized_data_id):
    '''
    A Project with an Identifier exists. Accept a new changeset with a
    Preprint that has the same Identifier. The preprint should
    disambiguate to the existing work, and the work's type should be
    updated to Preprint while keeping its title.
    '''
    title = 'Ambiguous Earthquakes'
    uri = 'http://osf.io/special-snowflake'

    cg = ChangeGraph([{
        '@id': '_:1234',
        '@type': 'project',
        'title': title,
        'identifiers': [{'@id': '_:2345', '@type': 'workidentifier'}]
    }, {
        '@id': '_:2345',
        '@type': 'workidentifier',
        'uri': uri,
        'creative_work': {'@id': '_:1234', '@type': 'project'}
    }])
    cg.process()

    original_change_set = models.ChangeSet.objects.from_graph(cg, normalized_data_id)
    work, identifier = original_change_set.accept()
    # Named work_id rather than id so the builtin id() is not shadowed.
    work_id = work.id

    assert identifier.uri == uri
    assert models.Project.objects.count() == 1
    assert models.Preprint.objects.count() == 0
    assert models.CreativeWork.objects.count() == 1
    assert models.Project.objects.all()[0].changes.count() == 1

    cg = ChangeGraph([{
        '@id': '_:1234',
        '@type': 'preprint',
        'identifiers': [{'@id': '_:2345', '@type': 'workidentifier'}]
    }, {
        '@id': '_:2345',
        '@type': 'workidentifier',
        'uri': uri,
        'creative_work': {'@id': '_:1234', '@type': 'preprint'}
    }])
    cg.process()

    change_set = models.ChangeSet.objects.from_graph(cg, normalized_data_id)
    change_set.accept()

    assert models.Project.objects.count() == 0
    assert models.Preprint.objects.count() == 1
    assert models.CreativeWork.objects.count() == 1
    assert models.Preprint.objects.get(id=work_id).title == title
    assert models.Preprint.objects.all()[0].changes.count() == 2
def update_graph(jane_doe):
    '''Return a ChangeGraph that sets family_name to 'Dough' on the person whose id is ``jane_doe.pk``.'''
    person_update = {
        '@id': jane_doe.pk,
        '@type': 'person',
        'family_name': 'Dough',
    }
    return ChangeGraph.from_jsonld({'@graph': [person_update]})
def test_relationships(self):
    '''The node at index 1 is the contributor, with no attrs and one relation pointing at the person.'''
    contributor = {
        '@id': '_:5678',
        '@type': 'contributor',
        'agent': {'@id': '_:1234', '@type': 'person'}
    }
    person = {
        '@id': '_:1234',
        '@type': 'person'
    }
    graph = ChangeGraph([contributor, person])
    node = graph.nodes[1]

    assert node.type == 'contributor'
    assert node.attrs == {}
    assert len(node.related()) == 1

    assert node.related('agent').related.id == '_:1234'
    assert node.related('agent').related.type == 'person'
def do_run(self, *args, **kwargs):
    '''Build a ChangeSet from ``self.normalized`` and accept it when the source is a robot or trusted.

    Any exception raised while building or accepting the change set is
    logged and passed to ``self.retry`` with a 10 second countdown.
    '''
    # Load all relevant ContentTypes in a single query
    # NOTE(review): confirm apps.get_models('share') selects the intended
    # models for the installed Django version.
    share_models = apps.get_models('share')
    ContentType.objects.get_for_models(*share_models, for_concrete_models=False)

    logger.info(
        '%s started make JSON patches for NormalizedData %s at %s',
        self.started_by,
        self.normalized.id,
        datetime.datetime.utcnow().isoformat(),
    )

    try:
        with transaction.atomic():
            change_graph = ChangeGraph(
                self.normalized.data['@graph'],
                namespace=self.normalized.source.username,
            )
            change_graph.process()
            change_set = ChangeSet.objects.from_graph(change_graph, self.normalized.id)
            if change_set:
                if self.source.is_robot or self.source.is_trusted:
                    # TODO: verify change set is not overwriting user created object
                    change_set.accept()
    except Exception as e:
        logger.info(
            'Failed make JSON patches for NormalizedData %s with exception %s. Retrying...',
            self.normalized.id,
            e,
        )
        raise self.retry(countdown=10, exc=e)

    logger.info(
        'Finished make JSON patches for NormalizedData %s by %s at %s',
        self.normalized.id,
        self.started_by,
        datetime.datetime.utcnow().isoformat(),
    )
def test_reaccept(self, input, Graph):
    '''The first submission of ``input`` yields a ChangeSet; an identical second submission yields None.'''
    initial_cg = ChangeGraph(Graph(*initial))
    initial_cg.process()
    initial_cs = ChangeSet.objects.from_graph(initial_cg, NormalizedDataFactory().id)
    initial_cs.accept()

    # Force new values to be generated
    Graph.reseed()

    first_cg = ChangeGraph(Graph(*input))
    first_cg.process()
    first_cs = ChangeSet.objects.from_graph(first_cg, NormalizedDataFactory().id)
    assert first_cs is not None
    first_cs.accept()

    second_cg = ChangeGraph(Graph(*input))
    second_cg.process()
    second_cs = ChangeSet.objects.from_graph(second_cg, NormalizedDataFactory().id)
    assert second_cs is None
def test_no_merge_on_blank_value(self, Graph):
    '''Two publisher relations with an empty cited_as but different organizations stay separate.'''
    blank_cited_as = [
        Publication(
            identifiers=[WorkIdentifier(1)],
            agent_relations=[
                Publisher(cited_as='', agent=Organization(1)),
            ],
        ),
    ]
    initial_cg = ChangeGraph(Graph(*blank_cited_as))
    initial_cg.process()
    ChangeSet.objects.from_graph(initial_cg, NormalizedDataFactory().id).accept()

    assert models.Publication.objects.count() == 1
    assert models.Publisher.objects.count() == 1
    assert models.Organization.objects.count() == 1

    additional_pub = [
        Publication(
            identifiers=[WorkIdentifier(1)],
            agent_relations=[
                Publisher(cited_as='', agent=Organization(1)),
                Publisher(cited_as='', agent=Organization(2)),
            ],
        ),
    ]
    next_cg = ChangeGraph(Graph(*additional_pub))
    next_cg.process()
    ChangeSet.objects.from_graph(next_cg, NormalizedDataFactory().id).accept()

    assert models.Publication.objects.count() == 1
    assert models.Publisher.objects.count() == 2
    assert models.Organization.objects.count() == 2
def test_no_timetraveling(self, Graph):
    '''After newer data is accepted, an older-dated graph's change contains only its title, not is_deleted.'''
    newer_publication = Publication(
        id=1,
        sparse=True,
        identifiers=[WorkIdentifier(1)],
        date_updated='2017-02-03T18:07:53.385000',
        is_deleted=False,
    )
    newer_graph = ChangeGraph(Graph(newer_publication))
    newer_graph.process()
    ChangeSet.objects.from_graph(newer_graph, NormalizedDataFactory().id).accept()

    older_publication = Publication(
        id=1,
        sparse=True,
        identifiers=[WorkIdentifier(1)],
        date_updated='2017-02-03T18:07:50.000000',
        is_deleted=True,
        title='Not Previously Changed',
    )
    older_graph = ChangeGraph(Graph(older_publication))
    older_graph.process()

    expected_change = {'title': 'Not Previously Changed'}
    assert older_graph.nodes[0].change == expected_change
def test_split_brain(self, Graph):
    '''Processing a preprint that carries identifiers 1 and 2 raises NotImplementedError with the "Multiple ... found" message.'''
    initial_cg = ChangeGraph(Graph(*initial))
    initial_cg.process()
    ChangeSet.objects.from_graph(initial_cg, NormalizedDataFactory().id).accept()

    # Multiple matches found for a thing should break
    ambiguous_preprint = Preprint(identifiers=[WorkIdentifier(1), WorkIdentifier(2)])
    cg = ChangeGraph(Graph(ambiguous_preprint))

    with pytest.raises(NotImplementedError) as excinfo:
        cg.process()

    message = excinfo.value.args[0]
    assert message == "Multiple <class 'share.models.creative.Preprint'>s found"
def test_disambiguate(self, input, model, delta, Graph):
    '''Accepting ``input`` on top of ``initial`` changes the number of ``model`` rows that have a change by ``delta``.'''
    def changed_count():
        # Nasty hack to avoid postgres' fuzzy counting
        return model.objects.exclude(change=None).count()

    initial_cg = ChangeGraph(Graph(*initial))
    initial_cg.process(disambiguate=False)
    ChangeSet.objects.from_graph(initial_cg, NormalizedDataFactory().id).accept()

    Graph.reseed()

    before = changed_count()

    cg = ChangeGraph(Graph(*input))
    cg.process()
    cs = ChangeSet.objects.from_graph(cg, NormalizedDataFactory().id)
    if cs is not None:
        cs.accept()

    assert (changed_count() - before) == delta
def test_disambiguate(self, input, model_delta, Graph):
    '''Accepting ``input`` on top of ``initial`` changes each model's exact-type row count by its entry in ``model_delta``.'''
    def typed_count(model):
        # Count only rows whose type matches this exact model.
        return model.objects.filter(type=model._meta.label_lower).count()

    initial_cg = ChangeGraph(Graph(*initial))
    initial_cg.process(disambiguate=False)
    ChangeSet.objects.from_graph(initial_cg, NormalizedDataFactory().id).accept()

    Graph.reseed()

    before_count = {model: typed_count(model) for model in model_delta.keys()}

    cg = ChangeGraph(Graph(*input))
    cg.process()
    cs = ChangeSet.objects.from_graph(cg, NormalizedDataFactory().id)
    if cs is not None:
        cs.accept()

    for model in model_delta.keys():
        assert typed_count(model) - before_count[model] == model_delta[model]
def test_normalize_contributor_creator_relation(self, input, output, Graph):
    '''A work with agent_relations ``input`` serializes, after processing, to one with agent_relations ``output``.'''
    work = CreativeWork(agent_relations=input)
    graph = ChangeGraph(Graph(work))
    graph.process(disambiguate=False)

    serialized = graph.serialize()
    expected = Graph(CreativeWork(agent_relations=output))

    assert serialized == expected
def test_normalize_person_relation(self, input, output, Graph):
    '''Processing the ``input`` graph without disambiguation serializes to the ``output`` graph.'''
    graph = ChangeGraph(Graph(*input))
    graph.process(disambiguate=False)

    serialized = graph.serialize()
    expected = Graph(*output)

    assert serialized == expected
def test_normalize_agentidentifier(self, input, output, Graph):
    '''Processing an AgentIdentifier with uri ``input`` serializes to one with uri ``output``, or to [] when ``output`` is falsy.'''
    identifier = AgentIdentifier(uri=input, agent=None)
    graph = ChangeGraph(Graph(identifier))
    graph.process(disambiguate=False)

    serialized = graph.serialize()
    if output:
        expected = Graph(AgentIdentifier(uri=output, parse=True, agent=None))
    else:
        expected = []

    assert serialized == expected
def test_create_extra(self):
    '''A node created on a graph built with namespace 'testing' carries that namespace.'''
    attrs = {'name': 'Foo', 'extra': {'tag': 'Foo'}}
    graph = ChangeGraph([], namespace='testing')

    created = graph.create(None, 'tag', attrs)

    assert created.namespace == 'testing'
def test_normalize_agentworkrelation(self, input, output, Graph):
    '''Processing a graph of ``input`` without disambiguation serializes to a graph of ``output``.'''
    graph = ChangeGraph(Graph(input))
    graph.process(disambiguate=False)

    serialized = graph.serialize()
    expected = Graph(output)

    assert serialized == expected
def test_normalize_tags_on_work(self, input, output, Graph):
    '''After normalize and prune, the nodes sorted by type and id serialize to a work tagged with ``output``.'''
    def sort_key(node):
        # Order nodes by type first, then by the string form of their id.
        return node.type + str(node.id)

    graph = ChangeGraph(Graph(CreativeWork(tags=input)))
    graph.normalize()
    graph.prune()

    ordered_nodes = sorted(graph.nodes, key=sort_key)
    serialized = [node.serialize() for node in ordered_nodes]
    expected = Graph(CreativeWork(tags=output))

    assert serialized == expected
def from_graph(self, graph, disambiguate=False):
    '''Store ``graph`` as NormalizedData, process its '@graph' nodes, and return the resulting ChangeSet.'''
    nd = NormalizedData.objects.create(data=graph, source=share_source)

    change_graph = ChangeGraph(graph['@graph'])
    change_graph.process(disambiguate=disambiguate)

    return ChangeSet.objects.from_graph(change_graph, nd.pk)
def test_normalize_organization_institution_name(self, input, output, Graph):
    '''Processing the ``input`` graph without disambiguation serializes to the ``output`` graph.'''
    graph = ChangeGraph(Graph(*input))
    graph.process(disambiguate=False)

    serialized = graph.serialize()
    expected = Graph(*output)

    assert serialized == expected
def test_normalize_agent(self, input, output, Graph):
    '''Processing a graph of ``input`` serializes to a graph of ``output``, or to [] when ``output`` is falsy.'''
    graph = ChangeGraph(Graph(input))
    graph.process(disambiguate=False)

    serialized = graph.serialize()
    if output:
        expected = Graph(output)
    else:
        expected = []

    assert serialized == expected
def test_normalize_workidentifier(self, input, output, Graph):
    '''Processing a WorkIdentifier with uri ``input`` serializes to one with uri ``output``, or to [] when ``output`` is falsy.'''
    identifier = WorkIdentifier(uri=input, creative_work=None)
    graph = ChangeGraph(Graph(identifier))
    graph.process(disambiguate=False)

    serialized = graph.serialize()
    if output:
        expected = Graph(WorkIdentifier(uri=output, parse=True, creative_work=None))
    else:
        expected = []

    assert serialized == expected
def test_no_timetraveling_many(self, Graph):
    '''After three accepted updates, a graph dated between them yields a change containing only its description.'''
    def accept(publication):
        # Process a single-publication graph and accept its change set.
        graph = ChangeGraph(Graph(publication))
        graph.process()
        ChangeSet.objects.from_graph(graph, NormalizedDataFactory().id).accept()

    # Oldest update
    accept(Publication(
        id=1,
        sparse=True,
        is_deleted=True,
        title='The first title',
        description='The first description',
        identifiers=[WorkIdentifier(1)],
        date_updated='2016-02-03T18:07:50.000000',
    ))

    # Newer update
    accept(Publication(
        id=1,
        sparse=True,
        is_deleted=False,
        identifiers=[WorkIdentifier(1)],
        date_updated='2017-02-03T18:07:50.000000',
    ))

    # Newest update
    accept(Publication(
        id=1,
        sparse=True,
        title='The final title',
        identifiers=[WorkIdentifier(1)],
        date_updated='2017-02-03T18:07:53.385000',
    ))

    # Dated after the oldest update but before the other two.
    older_graph = ChangeGraph(Graph(
        Publication(
            id=1,
            sparse=True,
            is_deleted=True,
            title='The second title',
            description='The final description',
            identifiers=[WorkIdentifier(1)],
            date_updated='2017-01-01T18:00:00.000000',
        )
    ))
    older_graph.process()

    expected_change = {'description': 'The final description'}
    assert older_graph.nodes[0].change == expected_change
def test_normalize_tag(self, input, output, Graph):
    '''A work tagged with the single tag ``input`` serializes, after processing, to a work with tags ``output``.'''
    work = CreativeWork(tags=[input])
    graph = ChangeGraph(Graph(work))
    graph.process(disambiguate=False)

    serialized = graph.serialize()
    expected = Graph(CreativeWork(tags=output))

    assert serialized == expected
def test_normalize_creativework(self, input, output, Graph):
    '''A work built from the ``input`` kwargs serializes, after processing, to one built from the ``output`` kwargs.'''
    work = CreativeWork(**input)
    graph = ChangeGraph(Graph(work))
    graph.process(disambiguate=False)

    serialized = graph.serialize()
    expected = Graph(CreativeWork(**output))

    assert serialized == expected
def test_add_work_with_existing_relation(self, normalized_data_id):
    '''
    Harvest a work that has a relation to some work identified by a DOI.
    The related work should be a CreativeWork carrying nothing but that one
    Identifier. Then harvest a work with the same DOI: it should update the
    CreativeWork's type and attributes instead of creating a new work.
    '''
    uri = 'http://osf.io/special-snowflake'

    citing_graph = ChangeGraph([{
        '@id': '_:1234',
        '@type': 'preprint',
        'title': 'Dogs are okay',
        'related_works': [{'@id': '_:foo', '@type': 'cites'}]
    }, {
        '@id': '_:foo',
        '@type': 'cites',
        'subject': {'@id': '_:1234', '@type': 'preprint'},
        'related': {'@id': '_:2345', '@type': 'creativework'},
    }, {
        '@id': '_:2345',
        '@type': 'creativework',
        'identifiers': [{'@id': '_:4567', '@type': 'workidentifier'}]
    }, {
        '@id': '_:4567',
        '@type': 'workidentifier',
        'uri': uri,
        'creative_work': {'@id': '_:2345', '@type': 'creativework'}
    }])
    models.ChangeSet.objects.from_graph(citing_graph, normalized_data_id).accept()

    assert models.CreativeWork.objects.filter(type='share.creativework').count() == 1
    assert models.Preprint.objects.count() == 1
    assert models.Article.objects.count() == 0

    change = ChangeGraph([{
        '@id': '_:1234',
        '@type': 'article',
        'title': 'All About Cats',
        'identifiers': [{'@id': '_:2345', '@type': 'workidentifier'}]
    }, {
        '@id': '_:2345',
        '@type': 'workidentifier',
        'uri': uri,
        'creative_work': {'@id': '_:1234', '@type': 'article'}
    }])
    change.process()
    models.ChangeSet.objects.from_graph(change, normalized_data_id).accept()

    assert models.CreativeWork.objects.filter(type='share.creativework').count() == 0
    assert models.Article.objects.count() == 1
    assert models.Preprint.objects.count() == 1

    cat = models.Article.objects.first()
    dog = models.Preprint.objects.first()

    outgoing = dog.outgoing_creative_work_relations
    assert outgoing.count() == 1
    assert outgoing.first()._meta.model_name == 'cites'
    assert outgoing.first().related == cat

    incoming = cat.incoming_creative_work_relations
    assert incoming.count() == 1
    assert incoming.first()._meta.model_name == 'cites'
    assert incoming.first().subject == dog