def fit(self):
    """Fit the pairwise distance estimator on ``self.X`` and ``self.y``.

    A ``FeatureUnion`` of thirteen per-pair features is assembled and
    chained with a ``RandomForestClassifier`` (500 trees, 8 jobs).  The
    pipeline is stored in ``self.distance_estimator`` *before* fitting, so
    the attribute is set even if fitting raises.

    Reads ``self.X``, ``self.y`` and ``self.ethnicity_estimator.estimator``.
    Returns ``None``.
    """

    def on_pairs(element_transformer):
        # Every per-element transformer is wrapped the same way.
        return PairTransformer(
            element_transformer=element_transformer,
            groupby=group_by_signature,
        )

    def tfidf_cosine(step_name, getter, char_ngrams=False):
        # TF-IDF of whatever ``getter`` extracts, combined with
        # CosineSimilarity.  ``char_ngrams`` switches the vectorizer to
        # "char_wb" 2-4 grams; otherwise its default analyzer is used.
        tfidf_options = {"dtype": np.float32, "decode_error": "replace"}
        if char_ngrams:
            tfidf_options.update(analyzer="char_wb", ngram_range=(2, 4))
        element = Pipeline([
            (step_name, FuncTransformer(func=getter)),
            ("shaper", Shaper(newshape=(-1, ))),
            ("tf-idf", TfidfVectorizer(**tfidf_options)),
        ])
        return Pipeline([
            ("pairs", on_pairs(element)),
            ("combiner", CosineSimilarity()),
        ])

    def string_distance(getter, **distance_options):
        # Raw extracted strings compared with StringDistance.
        return Pipeline([
            ("pairs", on_pairs(FuncTransformer(func=getter))),
            ("combiner", StringDistance(**distance_options)),
        ])

    ethnicity_element = Pipeline([
        ("name", FuncTransformer(func=get_author_full_name)),
        ("shaper", Shaper(newshape=(-1, ))),
        ("classifier",
         EstimatorTransformer(self.ethnicity_estimator.estimator)),
    ])

    # NOTE: step names are parameter keys (get_params/set_params), so they
    # are kept exactly as they were, including the "subject_similairty"
    # spelling and the inner "keywords" step that actually wraps get_topics.
    # The order of the entries fixes the column order seen by the classifier.
    transformer = FeatureUnion([
        ("author_full_name_similarity",
         tfidf_cosine("full_name", get_author_full_name, char_ngrams=True)),
        ("author_second_initial_similarity",
         string_distance(get_second_initial,
                         similarity_function="character_equality")),
        ("author_first_given_name_similarity",
         string_distance(get_first_given_name)),
        ("author_second_given_name_similarity",
         string_distance(get_second_given_name)),
        ("author_other_names_similarity",
         tfidf_cosine("other_names", get_author_other_names,
                      char_ngrams=True)),
        ("affiliation_similarity",
         tfidf_cosine("affiliation", get_normalized_affiliation,
                      char_ngrams=True)),
        ("coauthors_similarity",
         tfidf_cosine("coauthors", get_coauthors_neighborhood)),
        ("abstract_similarity", tfidf_cosine("abstract", get_abstract)),
        ("keywords_similarity", tfidf_cosine("keywords", get_keywords)),
        ("collaborations_similarity",
         tfidf_cosine("collaborations", get_collaborations)),
        ("subject_similairty", tfidf_cosine("keywords", get_topics)),
        ("title_similarity",
         tfidf_cosine("title", get_title, char_ngrams=True)),
        ("author_ethnicity", Pipeline([
            ("pairs", on_pairs(ethnicity_element)),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])),
    ])

    classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)
    self.distance_estimator = Pipeline([
        ("transformer", transformer),
        ("classifier", classifier),
    ])
    self.distance_estimator.fit(self.X, self.y)
def _build_distance_estimator(X, y, verbose=0, ethnicity_estimator=None,
                              fast=False):
    """Build and fit the estimator working on pairs of signatures.

    A ``FeatureUnion`` turns each pair into a feature vector; it is chained
    with a ``GradientBoostingClassifier`` and the whole pipeline is fitted
    on ``(X, y)``.

    :param X: training input, passed unchanged to ``Pipeline.fit``.
    :param y: training targets, passed unchanged to ``Pipeline.fit``.
    :param verbose: verbosity level forwarded to the classifier.
    :param ethnicity_estimator: when not ``None``, an extra
        ``"author_ethnicity"`` feature wrapping this estimator is appended
        after all other features.
    :param fast: when true, only six features are used (full name, other
        names, affiliation, co-authors, title, year difference) instead of
        the full set of fourteen.

    :return: the fitted ``Pipeline`` with steps ``"transformer"`` and
        ``"classifier"``.
    """

    def on_pairs(element_transformer):
        # Every per-element transformer is wrapped the same way.
        return PairTransformer(
            element_transformer=element_transformer,
            groupby=group_by_signature,
        )

    def tfidf_cosine(step_name, getter, char_ngrams=False):
        # TF-IDF of whatever ``getter`` extracts, combined with
        # CosineSimilarity.  ``char_ngrams`` switches the vectorizer to
        # "char_wb" 2-4 grams; otherwise its default analyzer is used.
        tfidf_options = {"dtype": np.float32, "decode_error": "replace"}
        if char_ngrams:
            tfidf_options.update(analyzer="char_wb", ngram_range=(2, 4))
        element = Pipeline([
            (step_name, FuncTransformer(func=getter)),
            ("shaper", Shaper(newshape=(-1, ))),
            ("tf-idf", TfidfVectorizer(**tfidf_options)),
        ])
        return Pipeline([
            ("pairs", on_pairs(element)),
            ("combiner", CosineSimilarity()),
        ])

    def string_distance(getter, **distance_options):
        # Raw extracted strings compared with StringDistance.
        return Pipeline([
            ("pairs", on_pairs(FuncTransformer(func=getter))),
            ("combiner", StringDistance(**distance_options)),
        ])

    def year_difference():
        # Builtin ``int`` is what the removed ``np.int`` alias pointed to;
        # ``np.int`` itself raises AttributeError on NumPy >= 1.24.
        return Pipeline([
            ("pairs", FuncTransformer(func=get_year, dtype=int)),
            ("combiner", AbsoluteDifference()),
        ])

    # NOTE: step names are parameter keys (get_params/set_params), so they
    # are kept exactly as they were, including the "subject_similairty"
    # spelling and the inner "keywords" step that actually wraps get_topics.
    # The order of the entries fixes the column order seen by the classifier.
    if fast:
        features = [
            ("author_full_name_similarity",
             tfidf_cosine("full_name", get_author_full_name,
                          char_ngrams=True)),
            ("author_other_names_similarity",
             tfidf_cosine("other_names", get_author_other_names,
                          char_ngrams=True)),
            ("affiliation_similarity",
             tfidf_cosine("affiliation", get_author_affiliation,
                          char_ngrams=True)),
            ("coauthors_similarity",
             tfidf_cosine("coauthors", get_coauthors_from_range)),
            ("title_similarity",
             tfidf_cosine("title", get_title, char_ngrams=True)),
            ("year_diff", year_difference()),
        ]
    else:
        features = [
            ("author_full_name_similarity",
             tfidf_cosine("full_name", get_author_full_name,
                          char_ngrams=True)),
            ("author_second_initial_similarity",
             string_distance(get_second_initial,
                             similarity_function="character_equality")),
            ("author_first_given_name_similarity",
             string_distance(get_first_given_name)),
            ("author_second_given_name_similarity",
             string_distance(get_second_given_name)),
            ("author_other_names_similarity",
             tfidf_cosine("other_names", get_author_other_names,
                          char_ngrams=True)),
            ("affiliation_similarity",
             tfidf_cosine("affiliation", get_author_affiliation,
                          char_ngrams=True)),
            ("coauthors_similarity",
             tfidf_cosine("coauthors", get_coauthors_from_range)),
            ("title_similarity",
             tfidf_cosine("title", get_title, char_ngrams=True)),
            ("journal_similarity",
             tfidf_cosine("journal", get_journal, char_ngrams=True)),
            ("abstract_similarity", tfidf_cosine("abstract", get_abstract)),
            ("keywords_similarity", tfidf_cosine("keywords", get_keywords)),
            ("collaborations_similarity",
             tfidf_cosine("collaborations", get_collaborations)),
            ("subject_similairty", tfidf_cosine("keywords", get_topics)),
            ("year_diff", year_difference()),
        ]

    if ethnicity_estimator is not None:
        ethnicity_element = Pipeline([
            ("name", FuncTransformer(func=get_author_full_name)),
            ("shaper", Shaper(newshape=(-1, ))),
            ("classifier", EstimatorTransformer(ethnicity_estimator)),
        ])
        features.append(("author_ethnicity", Pipeline([
            ("pairs", on_pairs(ethnicity_element)),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])))

    transformer = FeatureUnion(features)

    # Train a classifier on these vectors.
    # Alternative kept for reference:
    #   RandomForestClassifier(n_estimators=500, verbose=verbose, n_jobs=8)
    classifier = GradientBoostingClassifier(n_estimators=500,
                                            max_depth=9,
                                            max_features=10,
                                            learning_rate=0.125,
                                            verbose=verbose)

    # Return the whole, fitted pipeline.
    estimator = Pipeline([
        ("transformer", transformer),
        ("classifier", classifier),
    ]).fit(X, y)
    return estimator
def fit(self):
    """Build the pairwise distance pipeline and fit it on the stored data.

    Thirteen per-pair features are gathered in a ``FeatureUnion`` and fed
    to a ``RandomForestClassifier`` (500 trees, 8 jobs).  The pipeline is
    assigned to ``self.distance_estimator`` first and fitted afterwards, so
    the attribute exists even when fitting fails.

    Reads ``self.X``, ``self.y`` and ``self.ethnicity_estimator.estimator``.
    Returns ``None``.
    """

    def pairwise(element_transformer):
        # Shared PairTransformer wrapping used by every feature below.
        return PairTransformer(
            element_transformer=element_transformer,
            groupby=group_by_signature,
        )

    def text_similarity(step_name, extractor, char_ngrams=False):
        # extractor -> flatten -> TF-IDF, then CosineSimilarity on the pair.
        # With ``char_ngrams`` the vectorizer uses "char_wb" 2-4 grams,
        # otherwise its default analyzer.
        vectorizer_options = {'dtype': np.float32, 'decode_error': 'replace'}
        if char_ngrams:
            vectorizer_options.update(analyzer='char_wb', ngram_range=(2, 4))
        per_element = Pipeline([
            (step_name, FuncTransformer(func=extractor)),
            ('shaper', Shaper(newshape=(-1, ))),
            ('tf-idf', TfidfVectorizer(**vectorizer_options)),
        ])
        return Pipeline([
            ('pairs', pairwise(per_element)),
            ('combiner', CosineSimilarity()),
        ])

    def name_distance(extractor, **distance_options):
        # Extracted strings compared directly with StringDistance.
        return Pipeline([
            ('pairs', pairwise(FuncTransformer(func=extractor))),
            ('combiner', StringDistance(**distance_options)),
        ])

    ethnicity_per_element = Pipeline([
        ('name', FuncTransformer(func=get_author_full_name)),
        ('shaper', Shaper(newshape=(-1, ))),
        ('classifier',
         EstimatorTransformer(self.ethnicity_estimator.estimator)),
    ])

    # NOTE: step names double as parameter keys, so they are reproduced
    # verbatim -- including 'subject_similairty' and the inner 'keywords'
    # step that wraps get_topics.  Entry order fixes the feature columns.
    transformer = FeatureUnion([
        ('author_full_name_similarity',
         text_similarity('full_name', get_author_full_name,
                         char_ngrams=True)),
        ('author_second_initial_similarity',
         name_distance(get_second_initial,
                       similarity_function='character_equality')),
        ('author_first_given_name_similarity',
         name_distance(get_first_given_name)),
        ('author_second_given_name_similarity',
         name_distance(get_second_given_name)),
        ('author_other_names_similarity',
         text_similarity('other_names', get_author_other_names,
                         char_ngrams=True)),
        ('affiliation_similarity',
         text_similarity('affiliation', get_author_affiliation,
                         char_ngrams=True)),
        ('coauthors_similarity',
         text_similarity('coauthors', get_coauthors_neighborhood)),
        ('abstract_similarity', text_similarity('abstract', get_abstract)),
        ('keywords_similarity', text_similarity('keywords', get_keywords)),
        ('collaborations_similarity',
         text_similarity('collaborations', get_collaborations)),
        ('subject_similairty', text_similarity('keywords', get_topics)),
        ('title_similarity',
         text_similarity('title', get_title, char_ngrams=True)),
        ('author_ethnicity', Pipeline([
            ('pairs', pairwise(ethnicity_per_element)),
            ('sigmoid', FuncTransformer(func=expit)),
            ('combiner', ElementMultiplication()),
        ])),
    ])

    classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)
    self.distance_estimator = Pipeline([
        ('transformer', transformer),
        ('classifier', classifier),
    ])
    self.distance_estimator.fit(self.X, self.y)
def train(records, use_categories=True):
    """Train a classifier on the given arXiv records.

    :param records:
        Records are expected as a list of dictionaries with
        the following fields required: "title", "abstract", "categories"
        and "decision". The decision field should be either "CORE",
        "Non-CORE" or "Rejected".

        Example:
            records = [{u'decision': "CORE",
                        u'title': u'Effects of top compositeness',
                        u'abstract': u'We investigate the effects of (...)',
                        u'categories': [u'cond-mat.mes-hall',
                                        u'cond-mat.mtrl-sci']},
                       {...}, ...]

    :param use_categories:
        Whether the "categories" field is used to build the classifier.

    :return: a ``Pipeline`` made of the fitted feature transformer
        (step ``"transformer"``) and the best ``LinearSVC`` found by the
        grid search (step ``"classifier"``).
    """

    def title_abstract_steps():
        # Fresh, unfitted steps on every call: both configurations below
        # start from exactly this title/abstract TF-IDF chain.
        return [
            ("getter", FuncTransformer(func=_get_title_abstract)),
            ("shape", Shaper(newshape=(-1, ))),
            ("tfidf", TfidfVectorizer(min_df=3, max_df=0.1, norm="l2",
                                      ngram_range=(1, 1),
                                      stop_words="english",
                                      strip_accents="unicode",
                                      dtype=np.float32,
                                      decode_error="replace")),
        ]

    # One record per row.  Builtin ``object`` is what the removed
    # ``np.object`` alias pointed to; ``np.object`` itself raises
    # AttributeError on NumPy >= 1.24.
    records = np.array(records, dtype=object).reshape((-1, 1))

    if use_categories:
        categories_steps = [
            ("getter", FuncTransformer(func=_get_categories)),
            ("shape", Shaper(newshape=(-1, ))),
            ("tfidf", TfidfVectorizer(norm="l2", dtype=np.float32,
                                      decode_error="replace")),
        ]
        transformer = Pipeline([
            ("features", FeatureUnion([
                ("title_abstract", Pipeline(title_abstract_steps())),
                ("categories", Pipeline(categories_steps)),
            ])),
            ("scaling", Normalizer()),
        ])
    else:
        transformer = Pipeline(
            title_abstract_steps() + [("scaling", Normalizer())]
        )

    X = transformer.fit_transform(records)
    # Each row holds a single record dict; its "decision" is the label.
    y = np.array([r[0]["decision"] for r in records])

    # Select the regularisation strength C by 3-fold accuracy.
    grid = GridSearchCV(
        LinearSVC(),
        param_grid={"C": np.linspace(start=0.2, stop=0.5, num=20)},
        scoring="accuracy",
        cv=3,
        verbose=3,
    )
    grid.fit(X, y)

    return Pipeline([
        ("transformer", transformer),
        ("classifier", grid.best_estimator_),
    ])
def build_distance_estimator(X, y):
    """Build and fit the estimator working on pairs of signatures.

    A ``FeatureUnion`` of twelve features builds a vector representation
    of a pair of signatures; it is chained with a
    ``GradientBoostingClassifier`` and the whole pipeline is fitted on
    ``(X, y)``.

    :param X: training input, passed unchanged to ``Pipeline.fit``.
    :param y: training targets, passed unchanged to ``Pipeline.fit``.

    :return: the fitted ``Pipeline`` with steps ``"transformer"`` and
        ``"classifier"``.
    """

    def char_tfidf():
        # TF-IDF over "char_wb" 2-4 grams.
        return TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4),
                               dtype=np.float32, decode_error="replace")

    def word_tfidf():
        # TF-IDF with the vectorizer's default analyzer.
        return TfidfVectorizer(dtype=np.float32, decode_error="replace")

    def cosine_feature(step_name, getter, vectorizer_name, vectorizer):
        # getter -> flatten -> vectorizer on each element of the pair,
        # then CosineSimilarity between the two resulting vectors.
        element = Pipeline([
            (step_name, FuncTransformer(func=getter)),
            ("shaper", Shaper(newshape=(-1, ))),
            (vectorizer_name, vectorizer),
        ])
        return Pipeline([
            ("pairs", PairTransformer(element_transformer=element,
                                      groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])

    # Build a vector representation of a pair of signatures.  The order of
    # the entries fixes the column order seen by the classifier.
    transformer = FeatureUnion([
        ("author_full_name_similarity",
         cosine_feature("full_name", get_author_full_name,
                        "tf-idf", char_tfidf())),
        ("author_other_names_similarity",
         cosine_feature("other_names", get_author_other_names,
                        "tf-idf", char_tfidf())),
        ("author_initials_similarity",
         cosine_feature("initials", get_author_initials,
                        "count",
                        CountVectorizer(analyzer="char_wb",
                                        ngram_range=(1, 1),
                                        binary=True,
                                        decode_error="replace"))),
        ("affiliation_similarity",
         cosine_feature("affiliation", get_author_affiliation,
                        "tf-idf", char_tfidf())),
        ("coauthors_similarity",
         cosine_feature("coauthors", get_coauthors,
                        "tf-idf", word_tfidf())),
        ("title_similarity",
         cosine_feature("title", get_title, "tf-idf", char_tfidf())),
        ("journal_similarity",
         cosine_feature("journal", get_journal, "tf-idf", char_tfidf())),
        ("abstract_similarity",
         cosine_feature("abstract", get_abstract, "tf-idf", word_tfidf())),
        ("keywords_similarity",
         cosine_feature("keywords", get_keywords, "tf-idf", word_tfidf())),
        ("collaborations_similarity",
         cosine_feature("collaborations", get_collaborations,
                        "tf-idf", word_tfidf())),
        ("references_similarity",
         cosine_feature("references", get_references,
                        "tf-idf", word_tfidf())),
        ("year_diff", Pipeline([
            # Builtin ``int`` is what the removed ``np.int`` alias pointed
            # to; ``np.int`` raises AttributeError on NumPy >= 1.24.
            ("pairs", FuncTransformer(func=get_year, dtype=int)),
            ("combiner", AbsoluteDifference()),  # FIXME: when one is missing
        ])),
    ])

    # Train a classifier on these vectors.
    classifier = GradientBoostingClassifier(n_estimators=500,
                                            max_depth=9,
                                            max_features=10,
                                            learning_rate=0.125,
                                            verbose=3)

    # Return the whole, fitted pipeline.
    estimator = Pipeline([
        ("transformer", transformer),
        ("classifier", classifier),
    ]).fit(X, y)
    return estimator
def _build_distance_estimator(X, y, verbose=0, ethnicity_estimator=None):
    """Fit and return the pipeline scoring pairs of signatures.

    The pipeline first maps a pair of signatures to a feature vector
    (step ``"transformer"``, a ``FeatureUnion``) and then classifies that
    vector with a ``GradientBoostingClassifier`` (step ``"classifier"``).

    :param X: training input handed to ``Pipeline.fit``.
    :param y: training targets handed to ``Pipeline.fit``.
    :param verbose: verbosity level forwarded to the classifier.
    :param ethnicity_estimator: when not ``None``, an ``"author_ethnicity"``
        feature wrapping this estimator is appended to the union.

    :return: the fitted ``Pipeline``.
    """

    def grouped_pairs(per_element):
        return PairTransformer(element_transformer=per_element,
                               groupby=group_by_signature)

    def tfidf_similarity(step_name, extractor, vectorizer):
        # extractor -> flatten -> TF-IDF, then CosineSimilarity on the pair.
        per_element = Pipeline([
            (step_name, FuncTransformer(func=extractor)),
            ("shaper", Shaper(newshape=(-1,))),
            ("tf-idf", vectorizer),
        ])
        return Pipeline([
            ("pairs", grouped_pairs(per_element)),
            ("combiner", CosineSimilarity()),
        ])

    def char_vectorizer():
        return TfidfVectorizer(analyzer="char_wb",
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error="replace")

    # NOTE: the author full-name, second-initial, MeSH-term TF-IDF,
    # keywords and year-difference features are currently disabled and
    # are not part of this union.
    mesh_word2vec = Pipeline([
        ("pairs", FuncTransformer(func=get_mesh_word2vec)),
        ("combiner", MyCosineSimilarity()),
    ])
    affiliation = tfidf_similarity("affiliation", get_author_affiliation,
                                   char_vectorizer())
    title = tfidf_similarity("title", get_title, char_vectorizer())
    journal = tfidf_similarity("journal", get_journal, char_vectorizer())
    abstract = tfidf_similarity(
        "abstract", get_abstract,
        TfidfVectorizer(dtype=np.float32, decode_error="replace"),
    )

    transformer = FeatureUnion([
        ("mesh_word2vec", mesh_word2vec),
        ("affiliation_similarity", affiliation),
        ("title_similarity", title),
        ("journal_similarity", journal),
        ("abstract_similarity", abstract),
    ])

    if ethnicity_estimator is not None:
        name_classifier = Pipeline([
            ("name", FuncTransformer(func=get_author_full_name)),
            ("shaper", Shaper(newshape=(-1,))),
            ("classifier", EstimatorTransformer(ethnicity_estimator)),
        ])
        ethnicity = Pipeline([
            ("pairs", grouped_pairs(name_classifier)),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])
        transformer.transformer_list.append(("author_ethnicity", ethnicity))

    # Classifier trained on the feature vectors.
    classifier = GradientBoostingClassifier(n_estimators=2000,
                                            max_depth=9,
                                            max_features=5,
                                            learning_rate=0.125,
                                            verbose=verbose)

    # Fit the full pipeline and hand it back.
    full_pipeline = Pipeline([
        ("transformer", transformer),
        ("classifier", classifier),
    ])
    return full_pipeline.fit(X, y)