def test_store_record_inspirehep_api_author_new_wrong_response_code(workflow_app):
    """Storing a new author via the REST API raises on a non-2xx response."""
    author = {
        '$schema': 'http://localhost:5000/schemas/records/authors.json',
        'name': {'value': 'Robert Johnson'},
        '_collections': ['Authors'],
    }
    wf = workflow_object_class.create({})
    wf.extra_data['is-update'] = False
    wf.data = author

    engine = MagicMock(workflow_definition=MagicMock(data_type='authors'))
    overrides = {
        'FEATURE_FLAG_ENABLE_REST_RECORD_MANAGEMENT': True,
        'INSPIREHEP_URL': "http://web:8000",
    }
    with patch.dict(workflow_app.config, overrides):
        with requests_mock.Mocker() as mocker:
            # Fake the inspirehep authors endpoint answering 401.
            mocker.register_uri(
                'POST',
                '{url}/authors'.format(url=workflow_app.config.get("INSPIREHEP_URL")),
                headers={'content-type': 'application/json'},
                status_code=401,
                json={"message": "Something"},
            )
            # The error response must surface as a WorkflowsError.
            with pytest.raises(WorkflowsError):
                store_record(wf, engine)
def start_workflow_for_literature_submission():
    """Create a workflow object from the posted submission and start it.

    Returns a JSON response carrying the new workflow object's id.
    """
    payload = request.get_json()  # renamed: original shadowed the `json` name
    submission_data = payload['data']
    workflow_object = workflow_object_class.create(
        data={},
        id_user=submission_data['acquisition_source']['internal_uid'],
        data_type="hep",
    )
    submission_data['acquisition_source']['submission_number'] = str(workflow_object.id)
    workflow_object.data = submission_data
    workflow_object.extra_data['formdata'] = payload['form_data']
    # Snapshot data/extra_data so the workflow can be replayed from scratch.
    workflow_object.extra_data['source_data'] = {
        'extra_data': copy.deepcopy(workflow_object.extra_data),
        'data': copy.deepcopy(workflow_object.data),
    }
    workflow_object.save()
    db.session.commit()

    object_id = workflow_object.id
    start.delay("article", object_id=object_id)
    return jsonify({'workflow_object_id': object_id})
def test_regression_store_record_does_not_commit_when_error(workflow_app):
    """Regression: a failure inside ``store_record`` must not commit a record.

    The original version asserted only inside the ``except`` block, so if
    ``store_record`` stopped raising the assertions were silently skipped and
    the test passed vacuously.  Track the exception explicitly and always
    verify the record count afterwards.
    """
    data = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'document_type': ['article'],
        'titles': [{'title': 'title'}],
    }
    eng = MagicMock(workflow_definition=MagicMock(data_type='hep'))
    obj = workflow_object_class.create(data)

    assert RecordMetadata.query.count() == 0

    with patch.object(
        InspireRecord,
        'download_documents_and_figures',
        side_effect=Exception,
    ):
        raised = False
        try:
            store_record(obj, eng)
        except Exception:
            raised = True
        # The patched failure must propagate ...
        assert raised
        # ... and nothing may have been committed.
        assert RecordMetadata.query.count() == 0
def test_store_root_new_record(workflow_app):
    """``store_root`` persists the merger root for a new record."""
    feature_flags = {'FEATURE_FLAG_ENABLE_MERGER': True}
    eng = MagicMock(workflow_definition=MagicMock(data_type='hep'))
    with patch.dict(current_app.config, feature_flags):
        head = TestRecordMetadata.create_from_kwargs(index=False, has_pid=False)
        head_uuid = head.record_metadata.id
        obj = workflow_object_class.create(head.record_metadata.json)
        root = {
            'version': 'original',
            'acquisition_source': {'source': 'arXiv'},
        }
        obj.extra_data = {
            'head_uuid': str(head_uuid),
            'merger_root': root,
        }
        store_root(obj, eng)
        # The root must be retrievable under the lower-cased source name.
        stored = read_wf_record_source(head_uuid, 'arxiv')
        assert stored.json == root
def test_store_record_inspirehep_api_author_new(workflow_app):
    """A new author stored via the REST API gains control number and head uuid."""
    author = {
        '$schema': 'http://localhost:5000/schemas/records/authors.json',
        'name': {'value': 'Robert Johnson'},
        '_collections': ['Authors'],
    }
    wf = workflow_object_class.create({})
    wf.extra_data['is-update'] = False
    wf.data = author

    head_uuid = 'uuid_number_123456'
    control_number = 222
    engine = MagicMock(workflow_definition=MagicMock(data_type='authors'))
    overrides = {
        'FEATURE_FLAG_ENABLE_REST_RECORD_MANAGEMENT': True,
        'INSPIREHEP_URL': "http://web:8000",
    }
    with patch.dict(workflow_app.config, overrides):
        with requests_mock.Mocker() as mocker:
            # Fake a successful create on the inspirehep authors endpoint.
            mocker.register_uri(
                'POST',
                '{url}/authors'.format(url=workflow_app.config.get("INSPIREHEP_URL")),
                headers={'content-type': 'application/json'},
                status_code=201,
                json={
                    "metadata": {"control_number": control_number},
                    'id_': head_uuid,
                },
            )
            store_record(wf, engine)  # not throwing exception
            assert wf.data['control_number'] == control_number
            assert wf.extra_data['head_uuid'] == head_uuid
def submit():
    """Get form data and start workflow.

    Builds a workflow object from the submitted literature form, snapshots
    the source data, launches the ``article`` workflow asynchronously, and
    redirects to the appropriate success page.
    """
    form = LiteratureForm(formdata=request.form)
    visitor = DataExporter()
    visitor.visit(form)

    workflow_object = workflow_object_class.create(
        data={},
        id_user=current_user.get_id(),
        data_type="hep",
    )
    workflow_object.extra_data['formdata'] = copy.deepcopy(visitor.data)
    visitor.data = normalize_formdata(workflow_object, visitor.data)
    workflow_object.data = formdata_to_model(workflow_object, visitor.data)
    workflow_object.extra_data['source_data'] = {
        'extra_data': copy.deepcopy(workflow_object.extra_data),
        'data': copy.deepcopy(workflow_object.data),
    }
    workflow_object.save()
    db.session.commit()

    # Start workflow. delayed=True will execute the workflow in the
    # background using, for example, Celery.
    start.delay("article", object_id=workflow_object.id)

    # BUGFIX: `type_of_doc` may be absent; `.get()` would return None and
    # the `in` test would raise TypeError.  Default to an empty string.
    if 'chapter' in (visitor.data.get('type_of_doc') or '') and not visitor.data.get('parent_book'):
        return redirect(url_for('.success_book_parent'))
    return redirect(url_for('.success'))
def submitnew():
    """Form action handler for INSPIRE author new form."""
    form = AuthorUpdateForm(formdata=request.form)
    visitor = DataExporter()
    visitor.visit(form)

    workflow_object = workflow_object_class.create(
        data={},
        id_user=current_user.get_id(),
        data_type="authors",
    )
    workflow_object.extra_data['formdata'] = copy.deepcopy(visitor.data)
    workflow_object.data = formdata_to_model(workflow_object, visitor.data)
    workflow_object.save()
    db.session.commit()

    # Start workflow. delayed=True will execute the workflow in the
    # background using, for example, Celery.
    start.delay("author", object_id=workflow_object.id)

    context = {"inspire_url": get_inspire_url(visitor.data)}
    return render_template('authors/forms/new_success.html', **context)
def test_has_same_source(app, simple_record):
    """``has_same_source`` accepts only holdingpen matches from the same source."""
    obj = workflow_object_class.create(
        data=simple_record,
        status=ObjectStatus.HALTED,
        data_type='hep',
    )
    obj_id = obj.id
    obj.save()
    es.indices.refresh('holdingpen-hep')

    obj2 = WorkflowObject.create(data=simple_record, data_type='hep')
    match_non_completed_wf_in_holdingpen(obj2, None)
    same_source_func = has_same_source('holdingpen_matches')
    # Same record, same source: the predicate accepts the match.
    assert same_source_func(obj2, None)
    assert obj2.extra_data['holdingpen_matches'] == [obj_id]

    # change source and match the wf in the holdingpen
    different_source_rec = dict(simple_record)
    different_source_rec['acquisition_source'] = {'source': 'different'}
    obj3 = WorkflowObject.create(data=different_source_rec, data_type='hep')
    # It still matches in the holdingpen, but the source differs,
    # so the same-source predicate rejects it.
    assert match_non_completed_wf_in_holdingpen(obj3, None)
    assert not same_source_func(obj3, None)
def test_previously_rejected_from_not_fully_harvested_category_is_not_auto_approved(
    mocked_refextract_extract_refs,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_is_pdf_link,
    mocked_package_download,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services,
):
    """A record rejected earlier, from a category that is not fully harvested,
    must not be auto-approved when it is harvested again."""
    record, categories = core_record()
    # Put the record in a category outside the fully-harvested set.
    record["arxiv_eprints"][0]["categories"] = ["q-bio.GN"]

    obj = workflow_object_class.create(
        data=record, status=ObjectStatus.COMPLETED, data_type="hep"
    )
    obj.extra_data["approved"] = False  # reject it
    obj.save()
    es.indices.refresh("holdingpen-hep")

    extra_config = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
        "ARXIV_CATEGORIES": categories,
    }
    with workflow_app.app_context():
        with mock.patch.dict(workflow_app.config, extra_config):
            workflow_id = build_workflow(record).id
            eng_uuid = start("article", object_id=workflow_id)
            eng = WorkflowEngine.from_uuid(eng_uuid)
            obj2 = eng.processed_objects[0]
            # The earlier rejection is found, yet no auto-approval happens.
            assert not obj2.extra_data["auto-approved"]
            assert len(obj2.extra_data["previously_rejected_matches"]) > 0
            assert obj2.status == ObjectStatus.COMPLETED
def test_stop_matched_holdingpen_wfs(app, simple_record):
    """``stop_matched_holdingpen_wfs`` completes matched halted workflows."""
    # need to run a wf in order to assign to it the wf definition and a uuid
    # for it
    obj = workflow_object_class.create(
        data_type='hep',
        **simple_record
    )
    workflow_uuid = start('article', object_id=obj.id)
    eng = WorkflowEngine.from_uuid(workflow_uuid)
    obj = eng.processed_objects[0]
    obj.status = ObjectStatus.HALTED
    obj.save()
    obj_id = obj.id
    es.indices.refresh('holdingpen-hep')

    obj2 = WorkflowObject.create(data_type='hep', **simple_record)
    obj2_id = obj2.id
    match_non_completed_wf_in_holdingpen(obj2, None)
    assert obj2.extra_data['holdingpen_matches'] == [obj_id]

    stop_matched_holdingpen_wfs(obj2, None)
    stopped_wf = workflow_object_class.get(obj_id)
    # The halted workflow is now COMPLETED and records who stopped it.
    assert stopped_wf.status == ObjectStatus.COMPLETED
    assert stopped_wf.extra_data['stopped-by-wf'] == obj2_id
def test_article_workflow_stops_when_record_is_not_valid(workflow_app):
    """Starting the article workflow with an invalid record errors out."""
    invalid_record = {
        'document_type': ['article'],
        'titles': [{'title': 'A title'}],
    }
    wf = workflow_object_class.create(
        data=invalid_record,
        data_type='hep',
        id_user=1,
    )
    wf_id = wf.id

    # Schema validation must abort the workflow.
    with pytest.raises(ValidationError):
        start('article', invalid_record, wf_id)

    reloaded = workflow_object_class.get(wf_id)
    assert reloaded.status == ObjectStatus.ERROR
    assert '_error_msg' in reloaded.extra_data
    assert 'required' in reloaded.extra_data['_error_msg']
def start_author_workflow():
    """Create and launch an ``author`` workflow from the posted submission."""
    submission_data = request.get_json()['data']
    workflow_object = workflow_object_class.create(
        data={},
        # can be changed to get the user id from the current user once we
        # implement authentication
        id_user=submission_data['acquisition_source']['internal_uid'],
        data_type='authors',
    )
    submission_data['acquisition_source']['submission_number'] = str(workflow_object.id)
    workflow_object.data = submission_data
    # A control number means this submission updates an existing author.
    workflow_object.extra_data['is-update'] = bool(submission_data.get('control_number'))
    workflow_object.extra_data['source_data'] = {
        'data': copy.deepcopy(workflow_object.data),
        'extra_data': copy.deepcopy(workflow_object.extra_data),
    }
    workflow_object.save()
    db.session.commit()

    object_id = workflow_object.id
    start.delay('author', object_id=object_id)
    return jsonify({'workflow_object_id': object_id})
def test_save_roots(workflow_app):
    """``save_roots`` moves the update's roots onto the head; newest wins per source."""
    head = InspireRecord.create_or_update(fake_record('title1', 123), skip_files=False)
    head.commit()
    update = InspireRecord.create_or_update(fake_record('title2', 456), skip_files=False)
    update.commit()

    obj = workflow_object_class.create(
        data={},
        data_type='hep'
    )
    obj.extra_data['head_uuid'] = str(head.id)
    obj.extra_data['update_uuid'] = str(update.id)
    obj.save()

    # Union: keep the most recently created/updated root from each source.
    insert_wf_record_source(json={'version': 'original'}, record_uuid=head.id, source='arxiv')
    insert_wf_record_source(json={'version': 'updated'}, record_uuid=update.id, source='arxiv')
    insert_wf_record_source(json={'version': 'updated'}, record_uuid=update.id, source='publisher')

    save_roots(obj, None)

    # Both sources now live on the head record with the newer payload ...
    arxiv_rec = read_wf_record_source(head.id, 'arxiv')
    assert arxiv_rec.json == {'version': 'updated'}
    pub_rec = read_wf_record_source(head.id, 'publisher')
    assert pub_rec.json == {'version': 'updated'}
    # ... and the update record keeps no roots of its own.
    assert not read_wf_record_source(update.id, 'arxiv')
    assert not read_wf_record_source(update.id, 'publisher')
def test_store_record_does_not_raise_in_the_orcid_receiver(mock_attempt_push, app):
    """``store_record`` completes even with the ORCID push feature enabled."""
    overrides = {
        'FEATURE_FLAG_ENABLE_ORCID_PUSH': True,
        'RECORDS_SKIP_FILES': False,
    }
    engine = MagicMock(workflow_definition=MagicMock(data_type='hep'))
    # The record carries an ORCID id so the push receiver is triggered.
    record = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        '_collections': ['Literature'],
        'authors': [
            {
                'full_name': 'Patra, Asim',
                'ids': [{'schema': 'ORCID', 'value': '0000-0003-1166-2790'}],
            },
        ],
        'document_type': ['article'],
        'titles': [{'title': 'title'}],
    }
    with patch.dict(current_app.config, overrides):
        obj = workflow_object_class.create(record)
        store_record(obj, engine)  # Does not raise.
def test_save_roots(workflow_app):
    """``save_roots`` merges roots; existing head sources are not overwritten."""
    # XXX: for some reason, this must be internal.
    from inspirehep.modules.migrator.tasks import record_insert_or_replace

    head = record_insert_or_replace(fake_record('title1', 123))
    update = record_insert_or_replace(fake_record('title2', 456))

    obj = workflow_object_class.create(
        data={},
        data_type='hep'
    )
    obj.extra_data['head_uuid'] = str(head.id)
    obj.extra_data['update_uuid'] = str(update.id)
    obj.save()

    insert_wf_record_source(json={}, record_uuid=head.id, source='a')
    insert_wf_record_source(json={}, record_uuid=head.id, source='b')
    # this will not be saved because there's already an entry with source `a`
    insert_wf_record_source(json={}, record_uuid=update.id, source='a')
    insert_wf_record_source(json={}, record_uuid=update.id, source='c')

    save_roots(obj, None)

    # The head keeps its own roots and gains the update-only source `c`.
    assert read_wf_record_source(str(head.id), 'a')
    assert read_wf_record_source(str(head.id), 'b')
    assert read_wf_record_source(str(head.id), 'c')
def test_audit(small_app):
    """Audit entries persist both directly and via ``log_workflows_action``."""
    user_id = None
    workflow_id = None
    # First context: create the user and workflow the audit rows point at.
    with small_app.app_context():
        user = User(email="*****@*****.**", active=True)
        user.password = "******"
        db.session.add(user)
        workflows_object = workflow_object_class.create({}, data_type="hep")
        db.session.commit()
        user_id = user.id
        workflow_id = workflows_object.id

    # Second context: store an audit entry directly.
    with small_app.app_context():
        logging_info = {
            'object_id': workflow_id,
            'user_id': user_id,
            'score': 0.222113,
            'user_action': "Non-CORE",
            'decision': "Rejected",
            'source': "test",
            'action': "accept"
        }
        audit = WorkflowsAudit(**logging_info)
        audit.save()
        db.session.commit()

        assert WorkflowsAudit.query.count() == 1
        audit_entry = WorkflowsAudit.query.filter(
            WorkflowsAudit.object_id == workflow_id
        ).one()
        assert audit_entry
        assert audit_entry.action == "accept"
        assert audit_entry.score == 0.222113

    relevance_prediction = dict(
        max_score=0.222113,
        decision="Rejected"
    )
    # Third context: the helper derives score/decision from the prediction.
    with small_app.app_context():
        log_workflows_action(
            action="accept_core",
            relevance_prediction=relevance_prediction,
            object_id=workflow_id,
            user_id=None,
            source="test",
            user_action="accept"
        )
        db.session.commit()

        assert WorkflowsAudit.query.count() == 2
        audit_entry = WorkflowsAudit.query.filter(
            WorkflowsAudit.action == "accept_core"
        ).one()
        assert audit_entry
        assert audit_entry.action == "accept_core"
        assert audit_entry.score == 0.222113
def test_is_stale_data_is_true(workflow_app):
    """Data is stale when the stored head version lags the record's version."""
    head = TestRecordMetadata.create_from_kwargs(index=False, has_pid=False)
    wf = workflow_object_class.create({})
    wf.extra_data['is-update'] = True
    wf.extra_data['head_uuid'] = head.record_metadata.id
    # One version behind the database record -> considered stale.
    wf.extra_data['head_version_id'] = head.record_metadata.version_id - 1
    assert is_stale_data(wf, None)
def create_wf(arxiv_id, control_number):
    """Create and persist a COMPLETED hep workflow for the given identifiers."""
    payload = {
        'arxiv_eprints': [{'value': arxiv_id}],
        'control_number': control_number,
    }
    wf = workflow_object_class.create(data_type='hep', data=payload)
    wf.status = ObjectStatus.COMPLETED
    wf.save()
def test_match_wf_in_error_goes_in_initial_state(workflow_app):
    """A matching holdingpen workflow in INITIAL state makes the run fail."""
    record = generate_record()

    existing = workflow_object_class.create(data=record, data_type="hep")
    existing.status = ObjectStatus.INITIAL
    existing.save()
    es.indices.refresh("holdingpen-hep")

    # Starting a second workflow for the same record must raise.
    with pytest.raises(WorkflowsError):
        workflow_id = build_workflow(record).id
        start("article", object_id=workflow_id)
def build_workflow(workflow_data, data_type='hep', **kwargs):
    """Create a workflow object whose ``source_data`` snapshots the input."""
    snapshot = {
        'data': deepcopy(workflow_data),
        'extra_data': {},
    }
    return workflow_object_class.create(
        data_type=data_type,
        data=workflow_data,
        extra_data={'source_data': snapshot},
        **kwargs
    )
def start_merger(head_id, update_id, current_user_id=None):
    """Start a new ManualMerge workflow to merge two records manually.

    Args:
        head_id: the id of the first record to merge. This record is the
            one that will be updated with the new information.
        update_id: the id of the second record to merge. This record is
            the one that is going to be deleted and replaced by `head`.
        current_user_id: Id of the current user provided by the Flask app.

    Returns:
        (int): the current workflow object's id.
    """
    data = {
        'pid_type': 'lit',  # TODO: support
        'recid_head': head_id,
        'recid_update': update_id,
    }
    head = get_db_record('lit', head_id)
    update = get_db_record('lit', update_id)

    workflow_object = workflow_object_class.create(
        data=None,
        id_user=current_user_id,
        data_type='hep'
    )
    wf_id = workflow_object.id  # to retrieve it later
    workflow_object.extra_data.update(data)

    # Fall back to 'arxiv' when the update record carries no source.
    update_source = LiteratureReader(update).source
    update_source = update_source if update_source else 'arxiv'
    workflow_object.extra_data['update_source'] = update_source.lower()

    workflow_object.extra_data['head_control_number'] = head_id
    workflow_object.extra_data['update_control_number'] = update_id
    workflow_object.extra_data['head_uuid'] = str(head.id)
    workflow_object.extra_data['update_uuid'] = str(update.id)
    workflow_object.extra_data['head'] = head
    workflow_object.extra_data['update'] = update
    workflow_object.save()

    # Runs synchronously (no .delay): the caller gets the id afterwards.
    start('manual_merge', object_id=wf_id)

    return wf_id
def workflow():
    """Yield a saved hep workflow object; delete it on teardown."""
    wf = workflow_object_class.create(
        data={},
        id_user=1,
        data_type="hep",
    )
    wf.save()
    db.session.commit()
    # Pretend the engine always lets the workflow continue.
    wf.continue_workflow = lambda **args: True
    yield wf
    WorkflowObjectModel.query.filter_by(id=wf.id).delete()
    db.session.commit()
def test_inspect_merge_view_returns_400(workflow_app):
    """/workflows/inspect_merge answers 400 when no merger data is present."""
    factory = TestRecordMetadata.create_from_kwargs(
        json={'titles': [{'title': 'Curated version'}]}
    )
    wf = workflow_object_class.create(
        data=factory.record_metadata.json,
        data_type='hep',
    )
    wf.save()
    db.session.commit()

    with workflow_app.test_client() as client:
        response = client.get('/workflows/inspect_merge/{}'.format(wf.id))
        assert response.status_code == 400
def test_workflow_loads_from_source_data_fails_on_no_source_data(
    load_from_source_data_workflow,
    workflow_app,
    record_from_db,
):
    """Starting ``load_source_data`` without ``source_data`` raises ValueError."""
    workflow_id = workflow_object_class.create(
        data_type='hep',
        data=record_from_db,
        extra_data={},  # deliberately missing 'source_data'
    ).id
    with pytest.raises(ValueError) as exc:
        start('load_source_data', object_id=workflow_id)
    assert exc.match(r'source_data.*missing')
def test_normalize_journal_titles_known_journals_with_ref(workflow_app, insert_journals_in_db):
    """Known journal titles are normalized while existing ``$ref``s are kept."""
    record = {
        "_collections": [
            "Literature"
        ],
        "titles": [
            "A title"
        ],
        "document_type": [
            "book",
            "note",
            "report"
        ],
        "publication_info": [
            {
                "journal_title": "A Test Journal1",
                "journal_record": {
                    "$ref": "http://localhost:5000/api/journals/1936475"
                }
            },
            {
                # Entry without a journal_title: must be skipped untouched.
                "cnum": "C01-01-01"
            },
            {
                "journal_title": "Test.Jou.2",
                "journal_record": {
                    "$ref": "http://localhost:5000/api/journals/1936476"
                }
            }
        ]
    }
    obj = workflow_object_class.create(
        data=record,
        id_user=1,
        data_type='hep'
    )
    normalize_journal_titles(obj, None)
    # Titles are replaced by their normalized short forms.
    assert obj.data['publication_info'][0]['journal_title'] == 'Test.Jou.1'
    assert obj.data['publication_info'][2]['journal_title'] == 'Test.Jou.2'
    # The pre-existing journal_record references are unchanged.
    assert obj.data['publication_info'][0]['journal_record'] == {'$ref': 'http://localhost:5000/api/journals/1936475'}
    assert obj.data['publication_info'][2]['journal_record'] == {'$ref': 'http://localhost:5000/api/journals/1936476'}
def start_workflow_for_submission(self, endpoint, submission_data, control_number=None):
    """Create, persist and asynchronously start a submission workflow.

    :param endpoint: submission endpoint name; selects the data type,
        serializer and workflow name via the instance mappings.
    :param submission_data: payload submitted by the user; mutated in place
        (acquisition_source is added).
    :param control_number: present when this submission updates an
        existing record.
    :returns: id of the created workflow object.
    """
    workflow_object = workflow_object_class.create(
        data={},
        id_user=current_user.get_id(),
        data_type=self.endpoint_to_data_type[endpoint]
    )

    submission_data['acquisition_source'] = dict(
        email=current_user.email,
        datetime=datetime.datetime.utcnow().isoformat(),
        method='submitter',
        submission_number=str(workflow_object.id),
        internal_uid=int(workflow_object.id_user),
    )

    orcid = self._get_user_orcid()
    if orcid:
        submission_data['acquisition_source']['orcid'] = orcid

    serializer = self._get_serializer_from_endpoint(endpoint)
    serialized_data = serializer().load(submission_data).data

    if control_number:
        serialized_data['control_number'] = int(control_number)

    workflow_object.data = serialized_data
    # A control number marks this as an update of an existing record.
    workflow_object.extra_data['is-update'] = bool(control_number)
    # Snapshot data/extra_data so the workflow can be replayed from scratch.
    workflow_object.extra_data['source_data'] = {
        'data': copy.deepcopy(workflow_object.data),
        'extra_data': copy.deepcopy(workflow_object.extra_data)
    }
    workflow_object.save()
    db.session.commit()

    workflow_object_id = workflow_object.id

    start.delay(
        self.endpoint_to_workflow_name[endpoint],
        object_id=workflow_object.id)

    return workflow_object_id
def test_inspect_merge_view(workflow_app):
    """/workflows/inspect_merge exposes root, head, update and merged views."""
    factory = TestRecordMetadata.create_from_kwargs(
        json={'titles': [{'title': 'Curated version'}]}
    )
    obj = workflow_object_class.create(
        data=factory.record_metadata.json,
        data_type='hep',
    )
    obj.save()
    db.session.commit()

    # Snapshot the head *before* it is curated further below.
    head = deepcopy(factory.record_metadata.json)
    factory.record_metadata.json['titles'][0]['title'] = 'second curated version'
    db.session.add(factory.record_metadata)
    db.session.commit()

    obj.extra_data['merger_root'] = {
        'titles': [{'title': 'Second version'}],
        'document_type': ['article'],
        '_collections': ['Literature'],
    }
    obj.extra_data['merger_original_root'] = {
        'titles': [{'title': 'First version'}],
        'document_type': ['article'],
        '_collections': ['Literature'],
    }
    obj.extra_data['merger_head_revision'] = factory.inspire_record.revision_id

    expected = {
        'root': obj.extra_data['merger_original_root'],
        'head': head,
        'update': obj.extra_data['merger_root'],
        'merged': factory.record_metadata.json
    }

    with workflow_app.test_client() as client:
        response = client.get('/workflows/inspect_merge/{}'.format(obj.id))
        assert response.status_code == 200
        assert json.loads(response.data) == expected
def test_authors_workflow_continues_when_record_is_valid(workflow_app, mocked_external_services):
    """A valid author record halts (awaiting approval) instead of erroring."""
    valid_record = {
        '_collections': ['Authors'],
        'name': {
            'preferred_name': 'John Smith',
            'value': 'Smith, John',
        },
    }
    wf = workflow_object_class.create(
        data=valid_record,
        data_type='authors',
        id_user=1,
    )
    start('author', valid_record, wf.id)

    reloaded = workflow_object_class.get(wf.id)
    assert reloaded.status == ObjectStatus.HALTED
    assert '_error_msg' not in reloaded.extra_data
def submit():
    """Get form data and start workflow."""
    form = LiteratureForm(formdata=request.form)
    visitor = DataExporter()
    visitor.visit(form)

    wf = workflow_object_class.create(
        data={},
        id_user=current_user.get_id(),
        data_type="hep",
    )
    wf.extra_data['formdata'] = copy.deepcopy(visitor.data)
    wf.data = formdata_to_model(wf, visitor.data)
    wf.save()
    db.session.commit()

    # Start workflow. delayed=True will execute the workflow in the
    # background using, for example, Celery.
    start.delay("article", object_id=wf.id)

    return redirect(url_for('.success'))
def test_pending_holdingpen_matches_wf_if_not_completed(app, simple_record):
    """Holdingpen matching only considers non-COMPLETED workflows."""
    obj = workflow_object_class.create(
        data=simple_record,
        status=ObjectStatus.HALTED,
        data_type='hep',
    )
    obj_id = obj.id
    obj.save()
    es.indices.refresh('holdingpen-hep')

    obj2 = WorkflowObject.create(data=simple_record, data_type='hep')
    # The HALTED workflow is a valid match.
    assert match_non_completed_wf_in_holdingpen(obj2, None)
    assert obj2.extra_data['holdingpen_matches'] == [obj_id]

    obj = workflow_object_class.get(obj_id)
    obj.status = ObjectStatus.COMPLETED
    obj.save()
    es.indices.refresh('holdingpen-hep')

    # doesn't match anymore because obj is COMPLETED
    assert not match_non_completed_wf_in_holdingpen(obj2, None)
def test_store_record_does_not_raise_in_the_orcid_receiver(
        mock_attempt_push, app):
    """``store_record`` succeeds with ORCID push enabled (receiver fires)."""
    config = {
        'FEATURE_FLAG_ENABLE_ORCID_PUSH': True,
        'RECORDS_SKIP_FILES': False,
    }
    eng = MagicMock(workflow_definition=MagicMock(data_type='hep'))
    with patch.dict(current_app.config, config):
        obj = workflow_object_class.create({
            '$schema': 'http://localhost:5000/schemas/records/hep.json',
            '_collections': [
                'Literature',
            ],
            'authors': [
                {
                    'full_name': 'Patra, Asim',
                    'ids': [
                        {
                            # An ORCID id makes the push receiver trigger.
                            'schema': 'ORCID',
                            'value': '0000-0003-1166-2790',
                        },
                    ],
                },
            ],
            'document_type': [
                'article',
            ],
            'titles': [
                {
                    'title': 'title'
                },
            ],
        })
        store_record(obj, eng)  # Does not raise.
def test_normalize_journal_titles_known_journals_no_ref(workflow_app, insert_journals_in_db):
    """Known journal titles get normalized and a ``journal_record`` $ref added."""
    record = {
        "_collections": [
            "Literature"
        ],
        "titles": [
            "A title"
        ],
        "document_type": [
            "book",
            "note",
            "report"
        ],
        "publication_info": [
            {
                "journal_title": "A Test Journal1"
            },
            {
                # Entry without a journal_title: must be skipped untouched.
                "cnum": "C01-01-01"
            },
            {
                "journal_title": "Test.Jou.2"
            }
        ]
    }
    obj = workflow_object_class.create(
        data=record,
        id_user=1,
        data_type='hep'
    )
    normalize_journal_titles(obj, None)
    # Titles are replaced by their normalized short forms ...
    assert obj.data['publication_info'][0]['journal_title'] == 'Test.Jou.1'
    assert obj.data['publication_info'][2]['journal_title'] == 'Test.Jou.2'
    # ... and journal_record refs are filled in from the journals DB.
    assert obj.data['publication_info'][0]['journal_record'] == {'$ref': 'http://localhost:5000/api/journals/1936475'}
    assert obj.data['publication_info'][2]['journal_record'] == {'$ref': 'http://localhost:5000/api/journals/1936476'}
def test_normalize_journal_titles_unknown_journals_no_ref(workflow_app, insert_journals_in_db):
    """Unknown journal titles stay untouched and gain no ``journal_record``."""
    record = {
        "_collections": ["Literature"],
        "titles": ["A title"],
        "document_type": ["book", "note", "report"],
        "publication_info": [
            {"journal_title": "Unknown1"},
            {"cnum": "C01-01-01"},
            {"journal_title": "Unknown2"},
        ],
    }
    wf = workflow_object_class.create(
        data=record,
        id_user=1,
        data_type='hep',
    )
    normalize_journal_titles(wf, None)

    pub_info = wf.data['publication_info']
    assert pub_info[0]['journal_title'] == 'Unknown1'
    assert pub_info[2]['journal_title'] == 'Unknown2'
    assert 'journal_record' not in pub_info[0]
    assert 'journal_record' not in pub_info[2]
def test_store_record_inspirehep_api_literature_new(workflow_app):
    """New literature stored via the REST API sets control number and head uuid."""
    record_data = {
        '$schema': 'http://localhost:5000/schemas/records/hep.json',
        'titles': [{
            'title': 'Follow hour including staff wrong.'
        }],
        'document_type': ['article'],
        '_collections': ['Literature']
    }
    workflow = workflow_object_class.create({})
    workflow.extra_data['is-update'] = False
    workflow.data = record_data
    expected_head_uuid = 'uuid_number_123456'
    expected_control_number = 111
    eng = MagicMock(workflow_definition=MagicMock(data_type='hep'))
    with patch.dict(
            workflow_app.config, {
                'FEATURE_FLAG_ENABLE_REST_RECORD_MANAGEMENT': True,
                'INSPIREHEP_URL': "http://web:8000"
            }):
        with requests_mock.Mocker() as requests_mocker:
            # Fake a successful create on the inspirehep literature endpoint.
            requests_mocker.register_uri(
                'POST', '{url}/literature'.format(
                    url=workflow_app.config.get("INSPIREHEP_URL")),
                headers={'content-type': 'application/json'},
                status_code=201,
                json={
                    "metadata": {
                        "control_number": expected_control_number
                    },
                    'uuid': expected_head_uuid
                })
            store_record(workflow, eng)  # not throwing exception
            assert workflow.data['control_number'] == expected_control_number
            assert workflow.extra_data['head_uuid'] == expected_head_uuid
def test_authors_workflow_stops_when_record_is_not_valid(workflow_app):
    """Starting the author workflow with an invalid record errors out."""
    invalid_record = {
        'name': {
            'preferred_name': 'John Smith',
            'value': 'Smith, John',
        },
    }
    wf = workflow_object_class.create(
        data=invalid_record,
        data_type='authors',
        id_user=1,
    )
    wf_id = wf.id

    # The record misses required fields, so validation must abort the run.
    with pytest.raises(ValidationError):
        start('author', invalid_record, wf_id)

    reloaded = workflow_object_class.get(wf_id)
    assert reloaded.status == ObjectStatus.ERROR
    assert '_error_msg' in reloaded.extra_data
    assert 'required' in reloaded.extra_data['_error_msg']
def test_store_record_inspirehep_api_author_new(workflow_app):
    """A new author stored via the REST API gains control number and head uuid."""
    record_data = {
        '$schema': 'http://localhost:5000/schemas/records/authors.json',
        'name': {
            'value': 'Robert Johnson'
        },
        '_collections': ['Authors']
    }
    workflow = workflow_object_class.create({})
    workflow.extra_data['is-update'] = False
    workflow.data = record_data
    expected_head_uuid = 'uuid_number_123456'
    expected_control_number = 222
    eng = MagicMock(workflow_definition=MagicMock(data_type='authors'))
    with patch.dict(
            workflow_app.config, {
                'FEATURE_FLAG_ENABLE_REST_RECORD_MANAGEMENT': True,
                'INSPIREHEP_URL': "http://web:8000"
            }):
        with requests_mock.Mocker() as requests_mocker:
            # Fake a successful create on the inspirehep authors endpoint.
            requests_mocker.register_uri(
                'POST', '{url}/authors'.format(
                    url=workflow_app.config.get("INSPIREHEP_URL")),
                headers={'content-type': 'application/json'},
                status_code=201,
                json={
                    "metadata": {
                        "control_number": expected_control_number
                    },
                    'id_': expected_head_uuid
                })
            store_record(workflow, eng)  # not throwing exception
            assert workflow.data['control_number'] == expected_control_number
            assert workflow.extra_data['head_uuid'] == expected_head_uuid
def test_save_roots(workflow_app):
    """``save_roots`` moves the update's roots onto the head; newest wins per source."""
    head = InspireRecord.create_or_update(fake_record('title1', 123), skip_files=False)
    head.commit()
    update = InspireRecord.create_or_update(fake_record('title2', 456), skip_files=False)
    update.commit()
    obj = workflow_object_class.create(data={}, data_type='hep')
    obj.extra_data['head_uuid'] = str(head.id)
    obj.extra_data['update_uuid'] = str(update.id)
    obj.save()

    # Union: keep the most recently created/updated root from each source.
    insert_wf_record_source(json={'version': 'original'}, record_uuid=head.id, source='arxiv')
    insert_wf_record_source(json={'version': 'updated'}, record_uuid=update.id, source='arxiv')
    insert_wf_record_source(json={'version': 'updated'}, record_uuid=update.id, source='publisher')

    save_roots(obj, None)

    # Both sources end up on the head with the newer payload ...
    arxiv_rec = read_wf_record_source(head.id, 'arxiv')
    assert arxiv_rec.json == {'version': 'updated'}
    pub_rec = read_wf_record_source(head.id, 'publisher')
    assert pub_rec.json == {'version': 'updated'}
    # ... and the update record keeps no roots of its own.
    assert not read_wf_record_source(update.id, 'arxiv')
    assert not read_wf_record_source(update.id, 'publisher')
def start_workflow_for_author_submission():
    """Create an ``authors`` workflow from the posted submission and start it."""
    submission_data = request.get_json()['data']
    wf = workflow_object_class.create(
        data={},
        id_user=submission_data['acquisition_source']['internal_uid'],
        data_type='authors')
    submission_data['acquisition_source']['submission_number'] = str(wf.id)
    wf.data = submission_data
    # A control number marks this as an update of an existing author.
    wf.extra_data['is-update'] = bool(submission_data.get('control_number'))
    wf.extra_data['source_data'] = {
        'data': copy.deepcopy(wf.data),
        'extra_data': copy.deepcopy(wf.extra_data),
    }
    wf.save()
    db.session.commit()

    object_id = wf.id
    start.delay('author', object_id=object_id)
    return jsonify({'workflow_object_id': object_id})
def test_save_roots(workflow_app):
    """``save_roots`` merges roots; existing head sources are not overwritten."""
    # XXX: for some reason, this must be internal.
    from inspirehep.modules.migrator.tasks import record_insert_or_replace

    head = record_insert_or_replace(fake_record('title1', 123))
    update = record_insert_or_replace(fake_record('title2', 456))
    obj = workflow_object_class.create(data={}, data_type='hep')
    obj.extra_data['head_uuid'] = str(head.id)
    obj.extra_data['update_uuid'] = str(update.id)
    obj.save()

    insert_wf_record_source(json={}, record_uuid=head.id, source='a')
    insert_wf_record_source(json={}, record_uuid=head.id, source='b')
    # this will not be saved because there's already an entry with source `a`
    insert_wf_record_source(json={}, record_uuid=update.id, source='a')
    insert_wf_record_source(json={}, record_uuid=update.id, source='c')

    save_roots(obj, None)

    # The head keeps its own roots and gains the update-only source `c`.
    assert read_wf_record_source(str(head.id), 'a')
    assert read_wf_record_source(str(head.id), 'b')
    assert read_wf_record_source(str(head.id), 'c')
def test_previously_rejected_from_not_fully_harvested_category_is_not_auto_approved(
    mocked_refextract_extract_refs,
    mocked_api_request_magpie,
    mocked_api_request_beard,
    mocked_is_pdf_link,
    mocked_package_download,
    mocked_arxiv_download,
    workflow_app,
    mocked_external_services,
):
    """A record rejected earlier, from a category that is not fully harvested,
    must not be auto-approved when it is harvested again."""
    record, categories = core_record()
    # Put the record in a category outside the fully-harvested set.
    record['arxiv_eprints'][0]['categories'] = ['q-bio.GN']

    obj = workflow_object_class.create(
        data=record,
        status=ObjectStatus.COMPLETED,
        data_type='hep',
    )
    obj.extra_data['approved'] = False  # reject it
    obj.save()
    es.indices.refresh('holdingpen-hep')

    extra_config = {
        "BEARD_API_URL": "http://example.com/beard",
        "MAGPIE_API_URL": "http://example.com/magpie",
        'ARXIV_CATEGORIES': categories,
    }
    with workflow_app.app_context():
        with mock.patch.dict(workflow_app.config, extra_config):
            workflow_id = build_workflow(record).id
            eng_uuid = start('article', object_id=workflow_id)
            eng = WorkflowEngine.from_uuid(eng_uuid)
            obj2 = eng.processed_objects[0]
            # The earlier rejection is found, yet no auto-approval happens.
            assert not obj2.extra_data['auto-approved']
            assert len(obj2.extra_data['previously_rejected_matches']) > 0
            assert obj2.status == ObjectStatus.COMPLETED
def test_is_stale_data_returns_false_if_is_update_is_falsy(workflow_app):
    """Without the ``is-update`` flag the data is never considered stale."""
    TestRecordMetadata.create_from_kwargs(index=False, has_pid=False)
    wf = workflow_object_class.create({})
    # No 'is-update' in extra_data: the check must short-circuit to False.
    assert is_stale_data(wf, None) is False
def submit_results(job_id, errors, log_file, results_uri, results_data=None):
    """Receive the submission of the results of a crawl job.

    Then it spawns the appropiate workflow according to whichever workflow
    the crawl job specifies.

    :param job_id: Id of the crawler job.
    :param errors: Errors that happened, if any (seems ambiguous)
    :param log_file: Path to the log file of the crawler job.
    :param results_uri: URI to the file containing the results of the crawl
        job, namely the records extracted.
    :param results_data: Optional data payload with the results list, to skip
        retrieving them from the `results_uri`, useful for slow or unreliable
        storages.
    """
    results_path = urlparse(results_uri).path
    job = CrawlerJob.get_by_job(job_id)
    job.logs = log_file
    job.results = results_uri

    if errors:
        # Record the failure on the job before propagating it.
        job.status = JobStatus.ERROR
        job.save()
        db.session.commit()
        raise CrawlerJobError(str(errors))

    if results_data is None:
        results_data = _extract_results_data(results_path)

    for crawl_result in results_data:
        # Work on a copy so the caller's payload is never mutated.
        crawl_result = copy.deepcopy(crawl_result)
        try:
            _check_crawl_result_format(crawl_result)
        except KeyError as e:
            # Malformed result: convert the format error into a crawl error.
            crawl_result = _crawl_result_from_exception(e, crawl_result)

        record = crawl_result.pop('record')
        crawl_errors = crawl_result['errors']

        current_app.logger.debug('Parsing record: {}'.format(record))

        engine = WorkflowEngine.with_name(job.workflow)
        engine.save()
        obj = workflow_object_class.create(data=record)
        obj.id_workflow = str(engine.uuid)

        if crawl_errors:
            # Keep the broken result around in ERROR state for inspection.
            obj.status = ObjectStatus.ERROR
            obj.extra_data['crawl_errors'] = crawl_result
        else:
            extra_data = {
                'crawler_job_id': job_id,
                'crawler_results_path': results_path,
            }
            record_extra = record.pop('extra_data', {})
            if record_extra:
                extra_data['record_extra'] = record_extra
            # Snapshot so the workflow can be replayed from scratch.
            obj.extra_data['source_data'] = {
                'data': copy.deepcopy(record),
                'extra_data': copy.deepcopy(extra_data),
            }
            obj.extra_data.update(extra_data)

        obj.data_type = current_app.config['CRAWLER_DATA_TYPE']
        obj.save()
        db.session.commit()

        crawler_object = CrawlerWorkflowObject(job_id=job_id, object_id=obj.id)
        db.session.add(crawler_object)
        queue = current_app.config['CRAWLER_CELERY_QUEUE']

        if not crawl_errors:
            # Only well-formed results trigger the asynchronous workflow.
            start.apply_async(
                kwargs={
                    'workflow_name': job.workflow,
                    'object_id': obj.id,
                },
                queue=queue,
            )

    current_app.logger.info('Parsed {} records.'.format(len(results_data)))

    job.status = JobStatus.FINISHED
    job.save()
    db.session.commit()