def test_cosine_similarity():
    """Test for CosineSimilarity."""
    X = np.array([[1, 0, 0, 0, 0, 0],
                  [1, 0, 1, 1, 0, 0],
                  [1, 0, 0, 1, 0, 0],
                  [0, 0, 0, 0, 0, 0],
                  [1, 1, 1, 1, 1, 1]])

    Xt = CosineSimilarity().fit_transform(X)
    assert_array_almost_equal(Xt, [[0.], [2 ** -0.5], [1.], [0.], [1.]])

    Xt = CosineSimilarity().fit_transform(sp.csr_matrix(X))
    assert_array_almost_equal(Xt, [[0.], [2 ** -0.5], [1.], [0.], [1.]])
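# A minimal sketch of the values asserted above, assuming CosineSimilarity
# treats each row as the concatenation of two equal-length element vectors
# (here two 3-dimensional halves) and returns their cosine, with 0 for
# all-zero vectors. The helper below is illustrative, not part of the library.
import numpy as np

def _cosine(u, v):
    nu, nv = np.linalg.norm(u), np.linalg.norm(v)
    return 0.0 if nu == 0 or nv == 0 else float(np.dot(u, v) / (nu * nv))

_X = np.array([[1, 0, 0, 0, 0, 0],
               [1, 0, 1, 1, 0, 0],
               [1, 0, 0, 1, 0, 0],
               [0, 0, 0, 0, 0, 0],
               [1, 1, 1, 1, 1, 1]])
print([round(_cosine(row[:3], row[3:]), 4) for row in _X])
# [0.0, 0.7071, 1.0, 0.0, 1.0]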
def fit(self):
    """Fit the distance estimator on the stored training pairs."""

    def character_tfidf_branch(name, func):
        """Cosine similarity of character n-gram TF-IDF vectors of one field."""
        return Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                (name, FuncTransformer(func=func)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4),
                                           dtype=np.float32, decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])

    def word_tfidf_branch(name, func):
        """Cosine similarity of word-level TF-IDF vectors of one field."""
        return Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                (name, FuncTransformer(func=func)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(dtype=np.float32, decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])

    def string_distance_branch(func, **kwargs):
        """String distance between one extracted field of the two signatures."""
        return Pipeline([
            ("pairs", PairTransformer(element_transformer=FuncTransformer(func=func),
                                      groupby=group_by_signature)),
            ("combiner", StringDistance(**kwargs)),
        ])

    transformer = FeatureUnion([
        ("author_full_name_similarity",
         character_tfidf_branch("full_name", get_author_full_name)),
        ("author_second_initial_similarity",
         string_distance_branch(get_second_initial,
                                similarity_function="character_equality")),
        ("author_first_given_name_similarity",
         string_distance_branch(get_first_given_name)),
        ("author_second_given_name_similarity",
         string_distance_branch(get_second_given_name)),
        ("author_other_names_similarity",
         character_tfidf_branch("other_names", get_author_other_names)),
        ("affiliation_similarity",
         character_tfidf_branch("affiliation", get_normalized_affiliation)),
        ("coauthors_similarity",
         word_tfidf_branch("coauthors", get_coauthors_neighborhood)),
        ("abstract_similarity", word_tfidf_branch("abstract", get_abstract)),
        ("keywords_similarity", word_tfidf_branch("keywords", get_keywords)),
        ("collaborations_similarity",
         word_tfidf_branch("collaborations", get_collaborations)),
        ("subject_similarity", word_tfidf_branch("keywords", get_topics)),
        ("title_similarity", character_tfidf_branch("title", get_title)),
        ("author_ethnicity", Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                ("name", FuncTransformer(func=get_author_full_name)),
                ("shaper", Shaper(newshape=(-1,))),
                ("classifier",
                 EstimatorTransformer(self.ethnicity_estimator.estimator)),
            ]), groupby=group_by_signature)),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])),
    ])

    classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)

    self.distance_estimator = Pipeline([("transformer", transformer),
                                        ("classifier", classifier)])
    self.distance_estimator.fit(self.X, self.y)
def _build_distance_estimator(X, y, verbose=0, ethnicity_estimator=None, fast=False):
    """Build a vector representation of a pair of signatures."""

    def character_tfidf_branch(name, func):
        """Cosine similarity of character n-gram TF-IDF vectors of one field."""
        return Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                (name, FuncTransformer(func=func)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4),
                                           dtype=np.float32, decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])

    def word_tfidf_branch(name, func):
        """Cosine similarity of word-level TF-IDF vectors of one field."""
        return Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                (name, FuncTransformer(func=func)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(dtype=np.float32, decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])

    def string_distance_branch(func, **kwargs):
        """String distance between one extracted field of the two signatures."""
        return Pipeline([
            ("pairs", PairTransformer(element_transformer=FuncTransformer(func=func),
                                      groupby=group_by_signature)),
            ("combiner", StringDistance(**kwargs)),
        ])

    year_diff_branch = Pipeline([
        ("pairs", FuncTransformer(func=get_year, dtype=int)),
        ("combiner", AbsoluteDifference()),
    ])

    if not fast:
        transformer = FeatureUnion([
            ("author_full_name_similarity",
             character_tfidf_branch("full_name", get_author_full_name)),
            ("author_second_initial_similarity",
             string_distance_branch(get_second_initial,
                                    similarity_function="character_equality")),
            ("author_first_given_name_similarity",
             string_distance_branch(get_first_given_name)),
            ("author_second_given_name_similarity",
             string_distance_branch(get_second_given_name)),
            ("author_other_names_similarity",
             character_tfidf_branch("other_names", get_author_other_names)),
            ("affiliation_similarity",
             character_tfidf_branch("affiliation", get_author_affiliation)),
            ("coauthors_similarity",
             word_tfidf_branch("coauthors", get_coauthors_from_range)),
            ("title_similarity", character_tfidf_branch("title", get_title)),
            ("journal_similarity", character_tfidf_branch("journal", get_journal)),
            ("abstract_similarity", word_tfidf_branch("abstract", get_abstract)),
            ("keywords_similarity", word_tfidf_branch("keywords", get_keywords)),
            ("collaborations_similarity",
             word_tfidf_branch("collaborations", get_collaborations)),
            ("subject_similarity", word_tfidf_branch("keywords", get_topics)),
            ("year_diff", year_diff_branch),
        ])
    else:
        transformer = FeatureUnion([
            ("author_full_name_similarity",
             character_tfidf_branch("full_name", get_author_full_name)),
            ("author_other_names_similarity",
             character_tfidf_branch("other_names", get_author_other_names)),
            ("affiliation_similarity",
             character_tfidf_branch("affiliation", get_author_affiliation)),
            ("coauthors_similarity",
             word_tfidf_branch("coauthors", get_coauthors_from_range)),
            ("title_similarity", character_tfidf_branch("title", get_title)),
            ("year_diff", year_diff_branch),
        ])

    if ethnicity_estimator is not None:
        transformer.transformer_list.append(("author_ethnicity", Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                ("name", FuncTransformer(func=get_author_full_name)),
                ("shaper", Shaper(newshape=(-1,))),
                ("classifier", EstimatorTransformer(ethnicity_estimator)),
            ]), groupby=group_by_signature)),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])))

    # Train a classifier on these vectors
    classifier = GradientBoostingClassifier(n_estimators=500, max_depth=9,
                                            max_features=10, learning_rate=0.125,
                                            verbose=verbose)
    # classifier = RandomForestClassifier(n_estimators=500,
    #                                     verbose=verbose,
    #                                     n_jobs=8)

    # Return the whole pipeline
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator
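# Hedged usage sketch (X_pairs and y_pairs are hypothetical placeholders for
# the signature-pair records and 0/1 same-author labels this builder expects):
#
#     distance_estimator = _build_distance_estimator(X_pairs, y_pairs,
#                                                    verbose=1, fast=True)
#     pair_scores = distance_estimator.predict_proba(X_pairs)[:, 1]
#
# The result is an ordinary scikit-learn Pipeline, so predict / predict_proba
# behave as they do for any fitted classifier pipeline.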
def fit(self):
    """Fit the distance estimator on the stored training pairs."""

    def character_tfidf_branch(name, func):
        """Cosine similarity of character n-gram TF-IDF vectors of one field."""
        return Pipeline([
            ('pairs', PairTransformer(element_transformer=Pipeline([
                (name, FuncTransformer(func=func)),
                ('shaper', Shaper(newshape=(-1,))),
                ('tf-idf', TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4),
                                           dtype=np.float32, decode_error='replace')),
            ]), groupby=group_by_signature)),
            ('combiner', CosineSimilarity()),
        ])

    def word_tfidf_branch(name, func):
        """Cosine similarity of word-level TF-IDF vectors of one field."""
        return Pipeline([
            ('pairs', PairTransformer(element_transformer=Pipeline([
                (name, FuncTransformer(func=func)),
                ('shaper', Shaper(newshape=(-1,))),
                ('tf-idf', TfidfVectorizer(dtype=np.float32, decode_error='replace')),
            ]), groupby=group_by_signature)),
            ('combiner', CosineSimilarity()),
        ])

    def string_distance_branch(func, **kwargs):
        """String distance between one extracted field of the two signatures."""
        return Pipeline([
            ('pairs', PairTransformer(element_transformer=FuncTransformer(func=func),
                                      groupby=group_by_signature)),
            ('combiner', StringDistance(**kwargs)),
        ])

    transformer = FeatureUnion([
        ('author_full_name_similarity',
         character_tfidf_branch('full_name', get_author_full_name)),
        ('author_second_initial_similarity',
         string_distance_branch(get_second_initial,
                                similarity_function='character_equality')),
        ('author_first_given_name_similarity',
         string_distance_branch(get_first_given_name)),
        ('author_second_given_name_similarity',
         string_distance_branch(get_second_given_name)),
        ('author_other_names_similarity',
         character_tfidf_branch('other_names', get_author_other_names)),
        ('affiliation_similarity',
         character_tfidf_branch('affiliation', get_author_affiliation)),
        ('coauthors_similarity',
         word_tfidf_branch('coauthors', get_coauthors_neighborhood)),
        ('abstract_similarity', word_tfidf_branch('abstract', get_abstract)),
        ('keywords_similarity', word_tfidf_branch('keywords', get_keywords)),
        ('collaborations_similarity',
         word_tfidf_branch('collaborations', get_collaborations)),
        ('subject_similarity', word_tfidf_branch('keywords', get_topics)),
        ('title_similarity', character_tfidf_branch('title', get_title)),
        ('author_ethnicity', Pipeline([
            ('pairs', PairTransformer(element_transformer=Pipeline([
                ('name', FuncTransformer(func=get_author_full_name)),
                ('shaper', Shaper(newshape=(-1,))),
                ('classifier',
                 EstimatorTransformer(self.ethnicity_estimator.estimator)),
            ]), groupby=group_by_signature)),
            ('sigmoid', FuncTransformer(func=expit)),
            ('combiner', ElementMultiplication()),
        ])),
    ])

    classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)

    self.distance_estimator = Pipeline([('transformer', transformer),
                                        ('classifier', classifier)])
    self.distance_estimator.fit(self.X, self.y)
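# Hedged sketch of the 'author_ethnicity' branch above, assuming the
# EstimatorTransformer yields one decision score per signature and
# ElementMultiplication multiplies the two scores of each pair element-wise:
# the feature is close to 1 only when both signatures receive confident,
# positive ethnicity scores. Numbers below are illustrative only.
import numpy as np
from scipy.special import expit

pair_scores = np.array([[2.3, 1.9],    # both signatures scored confidently positive
                        [-0.4, 3.1]])  # one weak score drags the product down
feature = expit(pair_scores[:, 0]) * expit(pair_scores[:, 1])
print(feature.round(2))  # [0.79 0.38]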
def _build_distance_estimator(X, y, w2v, PoS, NER, regressor, verbose=1):
    """Build a vector representation of a pair of sentences."""
    # Note: the NER argument is currently unused.

    # Select the word-vector backend.
    if w2v == 'glove':
        PairVecTransformer = PairGloveTransformer
    elif w2v == 'spacy':
        PairVecTransformer = PairSpacyVecTransformer
    elif w2v == 'polyglot':
        PairVecTransformer = PairPolyglotVecTransformer
    else:
        raise ValueError("Unknown w2v backend: %r" % w2v)

    # Select the part-of-speech extraction functions.
    if PoS == 'polyglot':
        get_nouns = polyglot_nouns
        get_verbs = polyglot_verbs
        get_words = polyglot_words
        get_particle = polyglot_particle
        get_interjection = polyglot_interjection
        get_symbol = polyglot_symbol
        get_numbers = polyglot_numbers
        get_proper_nouns = polyglot_proper_nouns
        get_pronouns = polyglot_pronouns
        get_auxiliary_verbs = polyglot_auxiliary_verbs
        get_adjectives = polyglot_adjectives
        get_adverbs = polyglot_adverbs
        get_punctuation = polyglot_punctuation
        get_determiner = polyglot_determiner
        get_coordinating_conjunction = polyglot_coordinating_conjunction
        get_adpositions = polyglot_adpositions
        get_others = polyglot_others
        get_subordinating_conjunctions = polyglot_subordinating_conjunctions
    elif PoS == 'spacy':
        get_nouns = spacy_noun
        get_verbs = spacy_verb
        get_words = spacy_tokens
        get_particle = spacy_part
        get_interjection = spacy_intj
        get_symbol = spacy_sym
        get_numbers = spacy_num
        get_proper_nouns = spacy_propn
        get_pronouns = spacy_pron
        get_auxiliary_verbs = spacy_aux
        get_adjectives = spacy_adj
        get_adverbs = spacy_adv
        get_punctuation = spacy_punct
        get_determiner = spacy_det
        get_coordinating_conjunction = spacy_conj
        get_adpositions = spacy_adp
        get_others = spacy_x
        get_subordinating_conjunctions = spacy_sconj
    else:
        raise ValueError("Unknown PoS backend: %r" % PoS)

    def word_vector_branch(func):
        """Pair the tokens extracted by `func`, embed them with the selected
        word-vector backend, and combine their pairwise cosine matches into
        one score."""
        return Pipeline(steps=[
            ('pairtransformer', PairTransformer(
                element_transformer=FuncTransformer(dtype=None, func=func),
                groupby=None)),
            ('sop', SmallerOtherParing()),
            ('pgt', PairVecTransformer()),
            ('rgpc', RefGroupPairCosine()),
            ('gm', GetMatches()),
            ('sd', SolveDuplicate()),
            ('ac', AvgPOSCombiner()),
        ])

    transformer = FeatureUnion([
        ("get_nouns", word_vector_branch(get_nouns)),
        ("get_verbs", word_vector_branch(get_verbs)),
        ("get_words", word_vector_branch(get_words)),
        ("get_particle", word_vector_branch(get_particle)),
        ("get_symbol", word_vector_branch(get_symbol)),
        ("num_diff", Pipeline(steps=[
            ('pairtransformer', PairTransformer(element_transformer=Pipeline([
                ("rsn", FuncTransformer(func=replace_spelled_numbers)),
                ("get_num", FuncTransformer(func=get_numbers)),
                ("to_num", FuncTransformer(func=to_numeric)),
            ]), groupby=None)),
            ('1st_nm_comb', NumCombiner()),
        ])),
        ("get_proper_nouns", word_vector_branch(get_proper_nouns)),
        ("get_pronouns", word_vector_branch(get_pronouns)),
        ("get_auxiliary_verbs", word_vector_branch(get_auxiliary_verbs)),
        ("adjectives_glove", word_vector_branch(get_adjectives)),
        ("adverbs_glove", word_vector_branch(get_adverbs)),
        ("get_punctuation", word_vector_branch(get_punctuation)),
        ("get_determiner", word_vector_branch(get_determiner)),
        ("get_coordinating_conjunction",
         word_vector_branch(get_coordinating_conjunction)),
        ("get_adpositions", word_vector_branch(get_adpositions)),
        ("get_subordinating_conjunctions",
         word_vector_branch(get_subordinating_conjunctions)),
        ("spacy_organizations", word_vector_branch(spacy_organizations)),
        ("spacy_persons", word_vector_branch(spacy_persons)),
        ("spacy_locations", word_vector_branch(spacy_locations)),
        ("spacy_groups", word_vector_branch(spacy_groups)),
        ("spacy_geo_locations", word_vector_branch(spacy_geo_locations)),
        ("sent_tfidf", Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                ("1st_verb", FuncTransformer(func=get_text)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 3),
                                           dtype=np.float32, decode_error="replace",
                                           stop_words="english")),
            ]))),
            ("combiner", CosineSimilarity()),
        ])),
        ("sent_len_diff", Pipeline(steps=[
            ('pairtransformer', PairTransformer(
                element_transformer=FuncTransformer(dtype=None, func=len),
                groupby=None)),
            ('abs_diff', AbsoluteDifference()),
        ])),
    ])

    # Train a regressor on these vectors
    if regressor == 'lasso':
        classifier = LassoLarsCV(cv=5, max_iter=512, n_jobs=-1)
    elif regressor == 'RF':
        classifier = RandomForestRegressor(n_jobs=-1, max_depth=8, n_estimators=500)
    else:
        raise ValueError("Unknown regressor type: %r" % regressor)

    # Return the whole pipeline
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator
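# Hedged sketch of the "sent_len_diff" branch above, assuming each sample is a
# pair of raw sentence strings: the branch reduces to the absolute difference
# of the two element lengths. The sentences below are made up for illustration.
pairs = [("the cat sat on the mat", "a cat sat on a mat"),
         ("short", "a considerably longer sentence")]
print([abs(len(a) - len(b)) for a, b in pairs])  # [4, 25]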
def build_distance_estimator(X, y):
    """Build a vector representation of a pair of signatures."""

    def character_tfidf_branch(name, func):
        """Cosine similarity of character n-gram TF-IDF vectors of one field."""
        return Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                (name, FuncTransformer(func=func)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4),
                                           dtype=np.float32, decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])

    def word_tfidf_branch(name, func):
        """Cosine similarity of word-level TF-IDF vectors of one field."""
        return Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                (name, FuncTransformer(func=func)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(dtype=np.float32, decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])

    transformer = FeatureUnion([
        ("author_full_name_similarity",
         character_tfidf_branch("full_name", get_author_full_name)),
        ("author_other_names_similarity",
         character_tfidf_branch("other_names", get_author_other_names)),
        ("author_initials_similarity", Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                ("initials", FuncTransformer(func=get_author_initials)),
                ("shaper", Shaper(newshape=(-1,))),
                ("count", CountVectorizer(analyzer="char_wb", ngram_range=(1, 1),
                                          binary=True, decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])),
        ("affiliation_similarity",
         character_tfidf_branch("affiliation", get_author_affiliation)),
        ("coauthors_similarity", word_tfidf_branch("coauthors", get_coauthors)),
        ("title_similarity", character_tfidf_branch("title", get_title)),
        ("journal_similarity", character_tfidf_branch("journal", get_journal)),
        ("abstract_similarity", word_tfidf_branch("abstract", get_abstract)),
        ("keywords_similarity", word_tfidf_branch("keywords", get_keywords)),
        ("collaborations_similarity",
         word_tfidf_branch("collaborations", get_collaborations)),
        ("references_similarity",
         word_tfidf_branch("references", get_references)),
        ("year_diff", Pipeline([
            ("pairs", FuncTransformer(func=get_year, dtype=int)),
            ("combiner", AbsoluteDifference()),  # FIXME: when one is missing
        ])),
    ])

    # Train a classifier on these vectors
    classifier = GradientBoostingClassifier(n_estimators=500, max_depth=9,
                                            max_features=10, learning_rate=0.125,
                                            verbose=3)

    # Return the whole pipeline
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator
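# Hedged sketch of the "year_diff" feature and the FIXME above: the branch is
# just the absolute difference of the two publication years. Handling a missing
# year via a sentinel, as below, is an illustrative workaround and not the
# project's actual behaviour.
def _year_diff(y1, y2, missing=-1, penalty=100):
    if y1 == missing or y2 == missing:
        return penalty  # treat pairs with an unknown year as maximally distant
    return abs(y1 - y2)

print(_year_diff(2004, 2007))  # 3
print(_year_diff(2004, -1))    # 100 (sentinel path)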
def _build_distance_estimator(X, y, verbose=0, ethnicity_estimator=None):
    """Build a vector representation of a pair of signatures."""
    transformer = FeatureUnion([
        # ("author_full_name_similarity", Pipeline([
        #     ("pairs", PairTransformer(element_transformer=Pipeline([
        #         ("full_name", FuncTransformer(func=get_author_full_name)),
        #         ("shaper", Shaper(newshape=(-1,))),
        #         ("tf-idf", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4),
        #                                    dtype=np.float32,
        #                                    decode_error="replace")),
        #     ]), groupby=group_by_signature)),
        #     ("combiner", CosineSimilarity())
        # ])),
        # ("author_second_initial_similarity", Pipeline([
        #     ("pairs", PairTransformer(element_transformer=FuncTransformer(
        #         func=get_second_initial
        #     ), groupby=group_by_signature)),
        #     ("combiner", StringDistance(
        #         similarity_function="character_equality"))
        # ])),
        # ("mesh_similarity", Pipeline([
        #     ("pairs", PairTransformer(element_transformer=Pipeline([
        #         ("mesh_terms", FuncTransformer(func=get_mesh_terms)),
        #         ("shaper", Shaper(newshape=(-1,))),
        #         ("tf-idf", TfidfVectorizer(dtype=np.float32,
        #                                    decode_error="replace")),
        #     ]), groupby=group_by_signature)),
        #     ("combiner", CosineSimilarity())
        # ])),
        ("mesh_word2vec", Pipeline([
            ("pairs", FuncTransformer(func=get_mesh_word2vec)),
            ("combiner", MyCosineSimilarity())
        ])),
        ("affiliation_similarity", Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                ("affiliation", FuncTransformer(func=get_author_affiliation)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4),
                                           dtype=np.float32, decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity())
        ])),
        ("title_similarity", Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                ("title", FuncTransformer(func=get_title)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4),
                                           dtype=np.float32, decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity())
        ])),
        ("journal_similarity", Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                ("journal", FuncTransformer(func=get_journal)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4),
                                           dtype=np.float32, decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity())
        ])),
        ("abstract_similarity", Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                ("abstract", FuncTransformer(func=get_abstract)),
                ("shaper", Shaper(newshape=(-1,))),
                ("tf-idf", TfidfVectorizer(dtype=np.float32,
                                           decode_error="replace")),
            ]), groupby=group_by_signature)),
            ("combiner", CosineSimilarity())
        ]))
        # ("keywords_similarity", Pipeline([
        #     ("pairs", PairTransformer(element_transformer=Pipeline([
        #         ("keywords", FuncTransformer(func=get_keywords)),
        #         ("shaper", Shaper(newshape=(-1,))),
        #         ("tf-idf", TfidfVectorizer(dtype=np.float32,
        #                                    decode_error="replace")),
        #     ]), groupby=group_by_signature)),
        #     ("combiner", CosineSimilarity())
        # ])),
        # ("year_diff", Pipeline([
        #     ("pairs", FuncTransformer(func=get_year, dtype=int)),
        #     ("combiner", AbsoluteDifference())
        # ]))
    ])

    if ethnicity_estimator is not None:
        transformer.transformer_list.append(("author_ethnicity", Pipeline([
            ("pairs", PairTransformer(element_transformer=Pipeline([
                ("name", FuncTransformer(func=get_author_full_name)),
                ("shaper", Shaper(newshape=(-1,))),
                ("classifier", EstimatorTransformer(ethnicity_estimator)),
            ]), groupby=group_by_signature)),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication())
        ])))

    # Train a classifier on these vectors
    classifier = GradientBoostingClassifier(n_estimators=2000, max_depth=9,
                                            max_features=5, learning_rate=0.125,
                                            verbose=verbose)

    # Return the whole pipeline
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator
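# Hedged usage sketch (X_pairs, y_pairs and ethnicity_model are hypothetical
# placeholders for the pair records, labels and a fitted ethnicity estimator):
#
#     est_plain = _build_distance_estimator(X_pairs, y_pairs, verbose=1)
#     est_ethnic = _build_distance_estimator(X_pairs, y_pairs, verbose=1,
#                                            ethnicity_estimator=ethnicity_model)
#
# Both calls return fitted scikit-learn Pipelines; the second one has an extra
# FeatureUnion branch ("author_ethnicity") feeding the gradient boosting model.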