def test_build_publication(record):
    """Check that ``Publication.build(record)`` equals the expected publication."""
    expected_fields = {
        "abstract": "2 curated authors with recid",
        "authors": ["Doe, John"],
        "collaborations": ["ATLAS"],
        "keywords": ["effective action", "approximation: semiclassical"],
        "publication_id": 374836,
        "title": "Title",
        "topics": ["Theory-HEP"],
    }

    built = Publication.build(record)

    assert built == Publication(**expected_fields)
def test_get_signatures_only_curated(
    scan_mock,
    es_record_with_2_curated_authors,
    es_record_with_curated_author_and_no_recid,
    es_record_with_non_curated_author,
):
    """Check ``get_signatures(only_curated=True)`` against three scanned records.

    The scan yields three records, but only the two signatures flagged with
    ``is_curated_author_id=True`` are expected back. Order is not significant:
    both sides are sorted by ``signature_uuid`` before comparing.
    """

    def two_author_publication():
        # Both expected signatures belong to the same publication; a fresh
        # instance is built for each so no objects are shared between them.
        return Publication(
            abstract="2 curated authors with recid",
            authors=["Seiberg, N.", "Jimmy"],
            collaborations=[],
            keywords=["effective action", "approximation: semiclassical"],
            publication_id=374836,
            title="Title",
            topics=["Theory-HEP"],
        )

    scanned_records = [
        es_record_with_2_curated_authors,
        es_record_with_curated_author_and_no_recid,
        es_record_with_non_curated_author,
    ]
    # A single scan call returns all three records at once.
    scan_mock.side_effect = [scanned_records]

    signatures = get_signatures(only_curated=True)

    seiberg = Signature(
        author_affiliation="Rutgers U., Piscataway",
        author_id=989440,
        author_name="Seiberg, N.",
        publication=two_author_publication(),
        signature_block="SABARGn",
        signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
        is_curated_author_id=True,
    )
    jimmy = Signature(
        author_affiliation="UAIC",
        author_id=989440,
        author_name="Jimmy",
        publication=two_author_publication(),
        signature_block="JANa",
        signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e55",
        is_curated_author_id=True,
    )
    expected_signatures = [seiberg, jimmy]

    by_uuid = itemgetter("signature_uuid")
    assert sorted(signatures, key=by_uuid) == sorted(
        expected_signatures, key=by_uuid
    )
def non_curated_signature():
    """Return a ``Signature`` with no author id and ``is_curated_author_id=False``."""
    publication = Publication(
        abstract="Author not curated",
        authors=["Weinberg, Steven"],
        collaborations=[],
        keywords=["book"],
        publication_id=406190,
        title="The Quantum theory of fields. Vol. 1: Foundations",
        topics=["Theory-HEP", "General Physics"],
    )
    return Signature(
        author_affiliation="Texas U.",
        author_id=None,
        author_name="Weinberg, Steven",
        publication=publication,
        signature_block="WANBARGs",
        signature_uuid="5e550ded-e955-4a22-b906-8af5aaa9f1e2",
        is_curated_author_id=False,
    )
def curated_signature():
    """Return a ``Signature`` with an author id and ``is_curated_author_id=True``."""
    publication = Publication(
        abstract="abstract",
        authors=["Seiberg, N.", "Jimmy"],
        collaborations=["ATLAS", "CMS"],
        keywords=["effective action", "approximation: semiclassical"],
        publication_id=374836,
        title="Title",
        topics=["Theory-HEP", "Physics"],
    )
    return Signature(
        author_affiliation="Rutgers U., Piscataway",
        author_id=989440,
        author_name="Seiberg, Nana",
        publication=publication,
        signature_block="SABARGn",
        signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
        is_curated_author_id=True,
    )
def test_build_signature_with_non_curated_author(non_curated_author, record):
    """Check ``Signature.build`` for a non-curated author.

    The expected signature carries ``author_id=None`` and
    ``is_curated_author_id=False``.
    """
    built = Signature.build(non_curated_author, record)

    expected_publication = Publication(
        abstract="2 curated authors with recid",
        authors=["Doe, John"],
        collaborations=["ATLAS"],
        keywords=["effective action", "approximation: semiclassical"],
        publication_id=374836,
        title="Title",
        topics=["Theory-HEP"],
    )
    expected_signature = Signature(
        author_affiliation="Rutgers U., Piscataway",
        author_id=None,
        author_name="Doe, John",
        publication=expected_publication,
        signature_block="JOhn",
        signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
        is_curated_author_id=False,
    )

    assert built == expected_signature
def test_train_and_save_distance_model(
    scan_mock,
    sample_mock,
    choices_mock,
    fit_mock,
    score_mock,
    tmpdir,
    ethnicity_path,
    es_record_with_many_curated_authors,
):
    """Train a distance model with mocked sampling and check a file is written.

    The mocks are primed with fixed signatures, choices and score, then
    ``train_and_save_distance_model`` is run with 4 as its third argument.
    The test only asserts that the saved model file is non-empty.
    """
    scan_mock.side_effect = [[es_record_with_many_curated_authors]]

    # (affiliation, author_id, author_name, signature_block, signature_uuid);
    # every signature is curated and belongs to the same publication.
    signature_rows = [
        ("Rutgers U., Piscataway", 1, "Doe, John", "JOhn",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e52"),
        ("Rutgers U., Piscataway", 1, "Doe, John", "JOhn",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e53"),
        ("Rutgers U., Piscataway", 1, "Doe, J", "JOhn",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e54"),
        ("Rutgers U., Piscataway", 2, "Doe, John", "JOhn",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e55"),
        ("Rutgers U., Piscataway", 2, "Doe, John", "JOhn",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e56"),
        ("", 3, "Doe, John", "JOhn",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e57"),
        ("", 6, "Doe, John", "JOhn",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e58"),
        ("Rutgers U., Piscataway", 7, "Jamie", "Jana",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e59"),
    ]
    signatures = [
        Signature(
            author_affiliation=affiliation,
            author_id=author_id,
            author_name=author_name,
            # A fresh Publication per signature, as in a literal listing.
            publication=Publication(
                abstract="Many curated authors",
                authors=[
                    "Doe, John",
                    "Doe, J",
                    "Doe, John",
                    "Doe, John",
                    "Doe, John",
                    "Doe, John",
                    "Jamie",
                    "Jamie",
                ],
                collaborations=[],
                keywords=["keyword"],
                publication_id=1,
                title="Title",
                topics=["category"],
            ),
            signature_block=signature_block,
            signature_uuid=signature_uuid,
            is_curated_author_id=True,
        )
        for (
            affiliation,
            author_id,
            author_name,
            signature_block,
            signature_uuid,
        ) in signature_rows
    ]
    by_uuid = {entry.signature_uuid: entry for entry in signatures}
    # The same mapping object is handed out twice.
    sample_mock.return_value = [by_uuid, by_uuid]

    choices = [
        # same cluster, different name
        ("JOhn", "94fc2b0a-dc17-42c2-bae3-ca0024079e52"),
        "94fc2b0a-dc17-42c2-bae3-ca0024079e53",
        # same cluster, same name
        ("JOhn", "94fc2b0a-dc17-42c2-bae3-ca0024079e54"),
        "94fc2b0a-dc17-42c2-bae3-ca0024079e55",
        # different cluster, different name
        ("JOhn", "94fc2b0a-dc17-42c2-bae3-ca0024079e56"),
        "94fc2b0a-dc17-42c2-bae3-ca0024079e57",
        # different cluster, same name
        ("JOhn", "94fc2b0a-dc17-42c2-bae3-ca0024079e52"),
        "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
    ]
    # The whole sequence of choices is served twice in a row.
    choices_mock.side_effect = choices * 2
    score_mock.return_value = 0.85

    distance_model_path = tmpdir.join("distance.pkl")
    train_and_save_distance_model(ethnicity_path, distance_model_path, 4)

    assert os.path.getsize(distance_model_path) > 0
def test_distance_estimator_load_data(scan_mock, es_record_with_many_curated_authors):
    """Check the ``X``/``y`` arrays produced by ``DistanceEstimator.load_data``.

    Four signature pairs are loaded (two marked same-cluster, two not). The
    expected ``X`` holds one row of two signatures per pair and the expected
    ``y`` is ``[0, 0, 1, 1]``.
    """

    def expected_signature(affiliation, author_id, name, block, uuid):
        # All expected signatures are curated and share one publication
        # description; a new Publication is built for every signature.
        return Signature(
            author_affiliation=affiliation,
            author_id=author_id,
            author_name=name,
            publication=Publication(
                abstract="Many curated authors",
                authors=[
                    "Doe, John",
                    "Doe, J",
                    "Doe, John",
                    "Doe, John",
                    "Doe, John",
                    "Doe, John",
                    "Jamie",
                    "Jamie",
                ],
                collaborations=[],
                keywords=["keyword"],
                publication_id=1,
                title="Title",
                topics=["category"],
            ),
            signature_block=block,
            signature_uuid=uuid,
            is_curated_author_id=True,
        )

    scan_mock.side_effect = [[es_record_with_many_curated_authors]]
    signatures = get_signatures()

    pair_specs = [
        (True, "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e53"),
        (True, "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e55"),
        (False, "94fc2b0a-dc17-42c2-bae3-ca0024079e56",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e57"),
        (False, "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
         "94fc2b0a-dc17-42c2-bae3-ca0024079e54"),
    ]
    pairs = [
        {"same_cluster": same_cluster, "signature_uuids": [first, second]}
        for same_cluster, first, second in pair_specs
    ]

    distance_estimator = DistanceEstimator(None)
    distance_estimator.load_data(signatures, pairs, 4)

    rutgers = "Rutgers U., Piscataway"
    expected_rows = [
        [
            expected_signature(
                rutgers, 1, "Doe, John", "JOhn",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
            ),
            expected_signature(
                rutgers, 1, "Doe, J", "JOhn",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e53",
            ),
        ],
        [
            expected_signature(
                rutgers, 2, "Doe, John", "JOhn",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
            ),
            expected_signature(
                rutgers, 2, "Doe, John", "JOhn",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e55",
            ),
        ],
        [
            expected_signature(
                "", 6, "Doe, John", "JOhn",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e56",
            ),
            expected_signature(
                rutgers, 7, "Jamie", "Jana",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e57",
            ),
        ],
        [
            expected_signature(
                rutgers, 1, "Doe, John", "JOhn",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
            ),
            expected_signature(
                rutgers, 2, "Doe, John", "JOhn",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
            ),
        ],
    ]
    expected_X = array(expected_rows, dtype=object)
    expected_y = array([0, 0, 1, 1])

    assert (distance_estimator.X == expected_X).all()
    assert (distance_estimator.y == expected_y).all()
def test_clusterer_load_data(
    scan_mock,
    distance_estimator_mock,
    es_record_with_curated_author,
    es_record_with_non_curated_author,
):
    """Check the ``X``/``y`` arrays produced by ``Clusterer.load_data``.

    Signatures and input clusters are derived from two scanned records. The
    expected ``X`` has one signature per row and the expected ``y`` is
    ``[0, -1]``.
    """
    scanned_records = [
        es_record_with_curated_author,
        es_record_with_non_curated_author,
    ]
    scan_mock.side_effect = [scanned_records]

    signatures = get_signatures()
    input_clusters = get_input_clusters(signatures)
    clusterer = Clusterer(distance_estimator_mock)
    clusterer.load_data(signatures, input_clusters)

    curated = Signature(
        author_affiliation="Rutgers U., Piscataway",
        author_id=989441,
        author_name="Doe, John",
        publication=Publication(
            abstract="2 curated authors with recid",
            authors=["Doe, John"],
            collaborations=["ATLAS"],
            keywords=["effective action", "approximation: semiclassical"],
            publication_id=374836,
            title="Title",
            topics=["Theory-HEP"],
        ),
        signature_block="JOhn",
        signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
        is_curated_author_id=True,
    )
    not_curated = Signature(
        author_affiliation="Rutgers U., Piscataway",
        author_id=989443,
        author_name="Seiberg, Nana.",
        publication=Publication(
            abstract="Author curated no recid",
            authors=["Seiberg, Nana."],
            collaborations=[],
            keywords=["thesis", "string model"],
            publication_id=421404,
            title="Black holes in string theory",
            topics=["Theory-HEP"],
        ),
        signature_block="SABARGn",
        signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e51",
        is_curated_author_id=False,
    )
    expected_X = array([[curated], [not_curated]], dtype=object)
    expected_y = array([0, -1])

    assert (clusterer.X == expected_X).all()
    assert (clusterer.y == expected_y).all()
def test_get_signatures_for_all(
    scan_mock,
    es_record_with_2_curated_authors,
    es_record_with_curated_author_and_no_recid,
    es_record_with_non_curated_author,
):
    """Check ``get_signatures()`` with default arguments on three scanned records.

    Four signatures are expected, two with ``is_curated_author_id=True`` and
    two with ``False``. Order is not significant: both sides are sorted by
    ``signature_uuid`` before comparing.
    """

    def two_author_publication():
        # Used by the two signatures of the same publication; built anew for
        # each so that no instance is shared between signatures.
        return Publication(
            abstract="2 curated authors with recid",
            authors=["Seiberg, N.", "Jimmy"],
            collaborations=[],
            keywords=["effective action", "approximation: semiclassical"],
            publication_id=374836,
            title="Title",
            topics=["Theory-HEP"],
        )

    scanned_records = [
        es_record_with_2_curated_authors,
        es_record_with_curated_author_and_no_recid,
        es_record_with_non_curated_author,
    ]
    # A single scan call returns all three records at once.
    scan_mock.side_effect = [scanned_records]

    signatures = get_signatures()

    seiberg = Signature(
        author_affiliation="Rutgers U., Piscataway",
        author_id=989440,
        author_name="Seiberg, N.",
        publication=two_author_publication(),
        signature_block="SABARGn",
        signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
        is_curated_author_id=True,
    )
    jimmy = Signature(
        author_affiliation="UAIC",
        author_id=989440,
        author_name="Jimmy",
        publication=two_author_publication(),
        signature_block="JANa",
        signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e55",
        is_curated_author_id=True,
    )
    weinberg = Signature(
        author_affiliation="Texas U.",
        author_id=None,
        author_name="Weinberg, Steven",
        publication=Publication(
            abstract="Author not curated",
            authors=["Weinberg, Steven"],
            collaborations=[],
            keywords=["book"],
            publication_id=406190,
            title="The Quantum theory of fields. Vol. 1: Foundations",
            topics=["Theory-HEP", "General Physics"],
        ),
        signature_block="WANBARGs",
        signature_uuid="5e550ded-e955-4a22-b906-8af5aaa9f1e2",
        is_curated_author_id=False,
    )
    nana = Signature(
        author_affiliation="Rutgers U., Piscataway",
        author_id=989443,
        author_name="Seiberg, Nana.",
        publication=Publication(
            abstract="Author curated no recid",
            authors=["Seiberg, Nana."],
            collaborations=[],
            keywords=["thesis", "string model"],
            publication_id=421404,
            title="Black holes in string theory",
            topics=["Theory-HEP"],
        ),
        signature_block="SABARGn",
        signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e51",
        is_curated_author_id=False,
    )
    expected_signatures = [seiberg, jimmy, weinberg, nana]

    by_uuid = itemgetter("signature_uuid")
    assert sorted(signatures, key=by_uuid) == sorted(
        expected_signatures, key=by_uuid
    )
def test_process_clustering_output_signatures_without_author_id():
    """Check ``process_clustering_output`` for signatures lacking an author id.

    Two signatures with ``author_id=None`` carry the same label; a single
    cluster with both signatures and an empty ``authors`` list is expected.
    """

    def unclaimed_signature(uuid):
        # Both rows differ only in their signature uuid.
        return Signature(
            author_affiliation="Rutgers U., Piscataway",
            author_id=None,
            author_name="Doe, John",
            publication=Publication(
                abstract="Many curated authors",
                authors=[
                    "Doe, John",
                    "Doe, J",
                    "Doe, John",
                    "Doe, John",
                    "Doe, John",
                    "Doe, John",
                    "Jamie",
                    "Jamie",
                ],
                collaborations=[],
                keywords=["keyword"],
                publication_id=1,
                title="Title",
                topics=["category"],
            ),
            signature_block="JOhn",
            signature_uuid=uuid,
            is_curated_author_id=False,
        )

    clusterer_mock = MagicMock()
    clusterer_mock.clusterer.labels_ = numpy.array([1, 1])
    clusterer_mock.X = numpy.array(
        [
            [unclaimed_signature("94fc2b0a-dc17-42c2-bae3-ca0024079e52")],
            [unclaimed_signature("94fc2b0a-dc17-42c2-bae3-ca0024079e54")],
        ],
        dtype=object,
    )

    # NOTE(review): signatures are expected as tuples here (presumably
    # (publication_id, signature_uuid)), whereas
    # test_process_clustering_output_signatures_multiple_curated_author_ids
    # expects dicts -- confirm which shape process_clustering_output returns.
    expected_cluster = {
        "signatures": [
            (1, "94fc2b0a-dc17-42c2-bae3-ca0024079e52"),
            (1, "94fc2b0a-dc17-42c2-bae3-ca0024079e54"),
        ],
        "authors": [],
    }
    expected_output = [expected_cluster]

    output = process_clustering_output(clusterer_mock)

    assert not DeepDiff(output, expected_output, ignore_order=True)
def test_process_clustering_output_signatures_multiple_curated_author_ids():
    """Check ``process_clustering_output`` when clusters mix curated author ids.

    Five signatures are labelled ``[0, 0, 1, 1, 1]``. The first expected
    cluster lists author 1; the second lists authors 3 and 5. Signatures with
    ``author_id=None`` contribute no author entry.
    """

    def clustered_signature(author_id, publication_id, uuid, is_curated):
        # Rows differ only in author id, publication id, uuid and curation flag.
        return Signature(
            author_affiliation="Rutgers U., Piscataway",
            author_id=author_id,
            author_name="Doe, John",
            publication=Publication(
                abstract="Many curated authors",
                authors=[
                    "Doe, John",
                    "Doe, J",
                    "Doe, John",
                    "Doe, John",
                    "Doe, John",
                    "Doe, John",
                    "Jamie",
                    "Jamie",
                ],
                collaborations=[],
                keywords=["keyword"],
                publication_id=publication_id,
                title="Title",
                topics=["category"],
            ),
            signature_block="JOhn",
            signature_uuid=uuid,
            is_curated_author_id=is_curated,
        )

    def signature_ref(publication_id, uuid):
        return {"publication_id": publication_id, "signature_uuid": uuid}

    clusterer_mock = MagicMock()
    clusterer_mock.clusterer.labels_ = numpy.array([0, 0, 1, 1, 1])
    rows = [
        [clustered_signature(
            1, 11, "94fc2b0a-dc17-42c2-bae3-ca0024079e52", True)],
        [clustered_signature(
            None, 12, "94fc2b0a-dc17-42c2-bae3-ca0024079e53", False)],
        [clustered_signature(
            3, 13, "94fc2b0a-dc17-42c2-bae3-ca0024079e54", True)],
        [clustered_signature(
            None, 14, "94fc2b0a-dc17-42c2-bae3-ca0024079e55", False)],
        [clustered_signature(
            5, 15, "94fc2b0a-dc17-42c2-bae3-ca0024079e56", True)],
    ]
    clusterer_mock.X = numpy.array(rows, dtype=object)

    first_cluster = {
        "signatures": [
            signature_ref(11, "94fc2b0a-dc17-42c2-bae3-ca0024079e52"),
            signature_ref(12, "94fc2b0a-dc17-42c2-bae3-ca0024079e53"),
        ],
        "authors": [{"author_id": 1, "has_claims": True}],
    }
    second_cluster = {
        "signatures": [
            signature_ref(13, "94fc2b0a-dc17-42c2-bae3-ca0024079e54"),
            signature_ref(14, "94fc2b0a-dc17-42c2-bae3-ca0024079e55"),
            signature_ref(15, "94fc2b0a-dc17-42c2-bae3-ca0024079e56"),
        ],
        "authors": [
            {"author_id": 3, "has_claims": True},
            {"author_id": 5, "has_claims": True},
        ],
    }
    expected_output = [first_cluster, second_cluster]

    output = process_clustering_output(clusterer_mock)

    assert not DeepDiff(output, expected_output, ignore_order=True)