def fit(self):
        """Assemble the pairwise feature pipeline and train it.

        Every entry of the feature union pushes an element transformer
        through ``PairTransformer`` (grouped with ``group_by_signature``)
        and then reduces the result with a combiner.  The union feeds a
        ``RandomForestClassifier``; the resulting pipeline is stored in
        ``self.distance_estimator`` and fitted on ``self.X`` / ``self.y``.
        """

        def paired(per_element):
            # Run ``per_element`` through PairTransformer, grouped by
            # signature.
            return PairTransformer(
                element_transformer=per_element,
                groupby=group_by_signature,
            )

        def tfidf_cosine(step, extract, char_ngrams, shape=(-1, )):
            # extract -> reshape -> TF-IDF on each element, then
            # CosineSimilarity on the pair.
            vectorizer_options = {
                "dtype": np.float32,
                "decode_error": "replace",
            }
            if char_ngrams:
                vectorizer_options["analyzer"] = "char_wb"
                vectorizer_options["ngram_range"] = (2, 4)
            per_element = Pipeline([
                (step, FuncTransformer(func=extract)),
                ("shaper", Shaper(newshape=shape)),
                ("tf-idf", TfidfVectorizer(**vectorizer_options)),
            ])
            return Pipeline([
                ("pairs", paired(per_element)),
                ("combiner", CosineSimilarity()),
            ])

        def string_distance(extract, combiner):
            # Extract one string per element and hand the pair to
            # ``combiner``.
            return Pipeline([
                ("pairs", paired(FuncTransformer(func=extract))),
                ("combiner", combiner),
            ])

        # The ethnicity feature has an extra "sigmoid" step, so it is spelled
        # out instead of going through the helpers above.
        ethnicity_elements = Pipeline([
            ("name", FuncTransformer(func=get_author_full_name)),
            ("shaper", Shaper(newshape=(-1, ))),
            (
                "classifier",
                EstimatorTransformer(self.ethnicity_estimator.estimator),
            ),
        ])
        ethnicity = Pipeline([
            ("pairs", paired(ethnicity_elements)),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])

        # Order matters: it fixes the column order seen by the classifier.
        features = [
            (
                "author_full_name_similarity",
                tfidf_cosine("full_name", get_author_full_name, True),
            ),
            (
                "author_second_initial_similarity",
                string_distance(
                    get_second_initial,
                    StringDistance(
                        similarity_function="character_equality"),
                ),
            ),
            (
                "author_first_given_name_similarity",
                string_distance(get_first_given_name, StringDistance()),
            ),
            (
                "author_second_given_name_similarity",
                string_distance(get_second_given_name, StringDistance()),
            ),
            (
                "author_other_names_similarity",
                tfidf_cosine("other_names", get_author_other_names, True),
            ),
            (
                "affiliation_similarity",
                tfidf_cosine(
                    "affiliation", get_normalized_affiliation, True),
            ),
            (
                "coauthors_similarity",
                tfidf_cosine(
                    "coauthors", get_coauthors_neighborhood, False),
            ),
            (
                "abstract_similarity",
                tfidf_cosine("abstract", get_abstract, False),
            ),
            (
                "keywords_similarity",
                tfidf_cosine("keywords", get_keywords, False),
            ),
            (
                "collaborations_similarity",
                tfidf_cosine("collaborations", get_collaborations, False),
            ),
            (
                # Feature key, inner step name and the bare ``-1`` shape are
                # kept exactly as they were.
                "subject_similairty",
                tfidf_cosine("keywords", get_topics, False, shape=-1),
            ),
            (
                "title_similarity",
                tfidf_cosine("title", get_title, True),
            ),
            ("author_ethnicity", ethnicity),
        ]

        transformer = FeatureUnion(features)
        classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)

        self.distance_estimator = Pipeline([
            ("transformer", transformer),
            ("classifier", classifier),
        ])
        self.distance_estimator.fit(self.X, self.y)
示例#2
0
def test_pair_transformer():
    """Test for PairTransformer.

    Exercises, in order: a plain element-wise function, the same function
    with a ``groupby`` key, a dense scaler, an element transformer that
    returns a sparse matrix, and a sparse input matrix.
    """
    # ``float`` replaces the ``np.float`` alias, which newer NumPy releases
    # no longer provide; both name the builtin float type.

    # Element-wise function, no grouping.
    X = np.array([[0, 1], [2, 0], [2, 5]], dtype=float)
    tf = PairTransformer(element_transformer=FuncTransformer(lambda v: v + 1))
    Xt = tf.fit_transform(X)
    assert_array_almost_equal(Xt, X + 1)

    # Same function, rows grouped by their first column.
    X = np.array([[0, 1], [2, 0], [2, 5],
                  [0, 1], [2, 0], [2, 5]], dtype=float)
    tf = PairTransformer(element_transformer=FuncTransformer(lambda v: v + 1),
                         groupby=lambda r: r[0])
    Xt = tf.fit_transform(X)
    assert_array_almost_equal(Xt, X + 1)

    # Dense scaler: the expected values correspond to scaling all six
    # entries against the common range 0..5.
    X = np.array([[0, 1], [2, 3], [4, 5]], dtype=float)
    Xt = PairTransformer(element_transformer=MinMaxScaler()).fit_transform(X)
    assert_array_almost_equal(Xt, [[0, 0.2], [0.4, 0.6], [0.8, 1.0]])

    # Element transformer producing sparse output.  Sparse output is the
    # encoder's default; the explicit ``sparse=`` keyword is omitted because
    # it was renamed in newer scikit-learn releases.
    X = np.array([[0, 1], [2, 3]], dtype=float)
    tf = PairTransformer(element_transformer=OneHotEncoder())
    Xt = tf.fit_transform(X)
    assert sp.issparse(Xt)
    assert_array_almost_equal(Xt.todense(), [[1, 0, 0, 0, 0, 1, 0, 0],
                                             [0, 0, 1, 0, 0, 0, 0, 1]])

    # Sparse input must stay sparse after the transformation.
    X = sp.csr_matrix(np.array([[0, 1], [2, 3]], dtype=float))
    tf = PairTransformer(element_transformer=StandardScaler(with_mean=False))
    Xt = tf.fit_transform(X)
    assert sp.issparse(Xt)
    assert_array_almost_equal(Xt.todense(), [[0, 0.89442719],
                                             [1.78885438, 2.68328157]])
示例#3
0
def _signature_pairs(element_transformer):
    """Wrap ``element_transformer`` in a PairTransformer grouped by signature."""
    return PairTransformer(element_transformer=element_transformer,
                           groupby=group_by_signature)


def _tfidf_cosine_pipeline(step_name, getter, char_ngrams=False,
                           newshape=(-1, )):
    """Return a pair pipeline: extract, reshape, TF-IDF, CosineSimilarity.

    ``char_ngrams`` switches the vectorizer to the ``char_wb`` analyzer with
    2- to 4-grams; otherwise the vectorizer keeps its default analyzer.
    """
    tfidf_options = {"dtype": np.float32, "decode_error": "replace"}
    if char_ngrams:
        tfidf_options.update(analyzer="char_wb", ngram_range=(2, 4))
    per_element = Pipeline([
        (step_name, FuncTransformer(func=getter)),
        ("shaper", Shaper(newshape=newshape)),
        ("tf-idf", TfidfVectorizer(**tfidf_options)),
    ])
    return Pipeline([("pairs", _signature_pairs(per_element)),
                     ("combiner", CosineSimilarity())])


def _string_distance_pipeline(getter, combiner):
    """Return a pair pipeline: extract one value per element, then ``combiner``."""
    return Pipeline([("pairs", _signature_pairs(FuncTransformer(func=getter))),
                     ("combiner", combiner)])


def _build_distance_estimator(X,
                              y,
                              verbose=0,
                              ethnicity_estimator=None,
                              fast=False):
    """Build a vector representation of a pair of signatures and fit on it.

    Parameters
    ----------
    X, y
        Training data and labels, passed unchanged to ``Pipeline.fit``.
    verbose
        Forwarded to ``GradientBoostingClassifier``.
    ethnicity_estimator
        When not ``None``, it is wrapped in ``EstimatorTransformer`` and an
        extra ``"author_ethnicity"`` feature is appended to the union.
    fast
        When true, only the reduced feature set is used (full name, other
        names, affiliation, coauthors, title and year difference).

    Returns
    -------
    The value returned by fitting the ``transformer`` + ``classifier``
    pipeline on ``(X, y)``.
    """
    # The list is assembled incrementally so that both variants keep the
    # exact feature order of the original literals; that order defines the
    # column order seen by the classifier.
    features = [
        ("author_full_name_similarity",
         _tfidf_cosine_pipeline("full_name", get_author_full_name,
                                char_ngrams=True)),
    ]

    if not fast:
        features.extend([
            ("author_second_initial_similarity",
             _string_distance_pipeline(
                 get_second_initial,
                 StringDistance(similarity_function="character_equality"))),
            ("author_first_given_name_similarity",
             _string_distance_pipeline(get_first_given_name,
                                       StringDistance())),
            ("author_second_given_name_similarity",
             _string_distance_pipeline(get_second_given_name,
                                       StringDistance())),
        ])

    features.extend([
        ("author_other_names_similarity",
         _tfidf_cosine_pipeline("other_names", get_author_other_names,
                                char_ngrams=True)),
        ("affiliation_similarity",
         _tfidf_cosine_pipeline("affiliation", get_author_affiliation,
                                char_ngrams=True)),
        ("coauthors_similarity",
         _tfidf_cosine_pipeline("coauthors", get_coauthors_from_range)),
        ("title_similarity",
         _tfidf_cosine_pipeline("title", get_title, char_ngrams=True)),
    ])

    if not fast:
        features.extend([
            ("journal_similarity",
             _tfidf_cosine_pipeline("journal", get_journal,
                                    char_ngrams=True)),
            ("abstract_similarity",
             _tfidf_cosine_pipeline("abstract", get_abstract)),
            ("keywords_similarity",
             _tfidf_cosine_pipeline("keywords", get_keywords)),
            ("collaborations_similarity",
             _tfidf_cosine_pipeline("collaborations", get_collaborations)),
            # Feature key, inner step name and the bare ``-1`` shape are kept
            # exactly as before so estimator parameters stay unchanged.
            ("subject_similairty",
             _tfidf_cosine_pipeline("keywords", get_topics, newshape=-1)),
        ])

    # Builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide; both name the same type.
    features.append(
        ("year_diff",
         Pipeline([("pairs", FuncTransformer(func=get_year, dtype=int)),
                   ("combiner", AbsoluteDifference())])))

    if ethnicity_estimator is not None:
        ethnicity_elements = Pipeline([
            ("name", FuncTransformer(func=get_author_full_name)),
            ("shaper", Shaper(newshape=(-1, ))),
            ("classifier", EstimatorTransformer(ethnicity_estimator)),
        ])
        features.append(
            ("author_ethnicity",
             Pipeline([("pairs", _signature_pairs(ethnicity_elements)),
                       ("sigmoid", FuncTransformer(func=expit)),
                       ("combiner", ElementMultiplication())])))

    transformer = FeatureUnion(features)

    # Train a classifier on these vectors
    classifier = GradientBoostingClassifier(n_estimators=500,
                                            max_depth=9,
                                            max_features=10,
                                            learning_rate=0.125,
                                            verbose=verbose)

    # Return the whole pipeline
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator
示例#4
0
    def fit(self):
        transformer = FeatureUnion([
            ('author_full_name_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('full_name',
                           FuncTransformer(func=get_author_full_name)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('author_second_initial_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=FuncTransformer(
                          func=get_second_initial),
                      groupby=group_by_signature,
                  )),
                 ('combiner',
                  StringDistance(similarity_function='character_equality')),
             ])),
            ('author_first_given_name_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(element_transformer=FuncTransformer(
                      func=get_first_given_name),
                                  groupby=group_by_signature)),
                 ('combiner', StringDistance()),
             ])),
            ('author_second_given_name_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=FuncTransformer(
                          func=get_second_given_name),
                      groupby=group_by_signature,
                  )),
                 ('combiner', StringDistance()),
             ])),
            ('author_other_names_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('other_names',
                           FuncTransformer(func=get_author_other_names)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('affiliation_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('affiliation',
                           FuncTransformer(func=get_author_affiliation)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('coauthors_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('coauthors',
                           FuncTransformer(func=get_coauthors_neighborhood)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('abstract_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('abstract', FuncTransformer(func=get_abstract)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('keywords_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('keywords', FuncTransformer(func=get_keywords)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('collaborations_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('collaborations',
                           FuncTransformer(func=get_collaborations)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('subject_similairty',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('keywords', FuncTransformer(func=get_topics)),
                          ('shaper', Shaper(newshape=(-1))),
                          ('tf-idf',
                           TfidfVectorizer(
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('title_similarity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('title', FuncTransformer(func=get_title)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('tf-idf',
                           TfidfVectorizer(
                               analyzer='char_wb',
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error='replace',
                           )),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('combiner', CosineSimilarity()),
             ])),
            ('author_ethnicity',
             Pipeline([
                 ('pairs',
                  PairTransformer(
                      element_transformer=Pipeline([
                          ('name', FuncTransformer(func=get_author_full_name)),
                          ('shaper', Shaper(newshape=(-1, ))),
                          ('classifier',
                           EstimatorTransformer(
                               self.ethnicity_estimator.estimator)),
                      ]),
                      groupby=group_by_signature,
                  )),
                 ('sigmoid', FuncTransformer(func=expit)),
                 ('combiner', ElementMultiplication()),
             ])),
        ])
        classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)

        self.distance_estimator = Pipeline([('transformer', transformer),
                                            ('classifier', classifier)])
        self.distance_estimator.fit(self.X, self.y)
示例#5
0
def _build_distance_estimator(X, y, w2v, PoS, NER, regressor, verbose=1):
    """Build a vector reprensation of a pair of signatures."""
    if w2v == 'glove':
        PairVecTransformer = PairGloveTransformer
    elif w2v == 'spacy':
        PairVecTransformer = PairSpacyVecTransformer
    elif w2v == 'polyglot':
        PairVecTransformer = PairPolyglotVecTransformer
    else:
        print('error passing w2v argument value')

    if PoS == 'polyglot':
        get_nouns = polyglot_nouns
        get_verbs = polyglot_verbs
        get_words = polyglot_words
        get_particle = polyglot_particle
        get_interjection = polyglot_interjection
        get_symbol = polyglot_symbol
        get_numbers = polyglot_numbers
        get_proper_nouns = polyglot_proper_nouns
        get_pronouns = polyglot_pronouns
        get_auxiliary_verbs = polyglot_auxiliary_verbs
        get_adjectives = polyglot_adjectives
        get_adverbs = polyglot_adverbs
        get_punctuation = polyglot_punctuation
        get_determiner = polyglot_determiner
        get_coordinating_conjunction = polyglot_coordinating_conjunction
        get_adpositions = polyglot_adpositions
        get_others = polyglot_others
        get_subordinating_conjunctions = polyglot_subordinating_conjunctions
    elif PoS == 'spacy':
        get_nouns = spacy_noun
        get_verbs = spacy_verb
        get_words = spacy_tokens
        get_particle = spacy_part
        get_interjection = spacy_intj
        get_symbol = spacy_sym
        get_numbers = spacy_num
        get_proper_nouns = spacy_propn
        get_pronouns = spacy_pron
        get_auxiliary_verbs = spacy_aux
        get_adjectives = spacy_adj
        get_adverbs = spacy_adv
        get_punctuation = spacy_punct
        get_determiner = spacy_det
        get_coordinating_conjunction = spacy_conj
        get_adpositions = spacy_adp
        get_others = spacy_x
        get_subordinating_conjunctions = spacy_sconj
    else:
        print('error passing PoS argument value')

    transformer = FeatureUnion([
        ("get_nouns",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_nouns),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_verbs",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_verbs),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_words",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_words),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_particle",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_particle),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_symbol",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_symbol),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("num_diff",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=Pipeline([
                  ("rsn", FuncTransformer(func=replace_spelled_numbers)),
                  ("get_num", FuncTransformer(func=get_numbers)),
                  ("to_num", FuncTransformer(func=to_numeric)),
              ]),
                              groupby=None)),
             ('1st_nm_comb', NumCombiner()),
         ])),
        ("get_proper_nouns",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_proper_nouns),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_pronouns",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_pronouns),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_auxiliary_verbs",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_auxiliary_verbs),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("adjectives_glove",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_adjectives),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("adverbs_glove",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_adverbs),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_punctuation",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_punctuation),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_determiner",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_determiner),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_coordinating_conjunction",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_coordinating_conjunction),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_adpositions",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_adpositions),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("get_subordinating_conjunctions",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=get_subordinating_conjunctions),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_organizations",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_organizations),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_persons",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_persons),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_locations",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_locations),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_groups",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_groups),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("spacy_geo_locations",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(
                  dtype=None, func=spacy_geo_locations),
                              groupby=None)),
             ('sop', SmallerOtherParing()),
             ('pgt', PairVecTransformer()),
             ('rgpc', RefGroupPairCosine()),
             ('gm', GetMatches()),
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
        ("sent_tfidf",
         Pipeline([
             ("pairs",
              PairTransformer(element_transformer=Pipeline(
                  [("1st_verb",
                    FuncTransformer(
                        func=get_text)), ("shaper", Shaper(newshape=(-1, ))),
                   ("tf-idf",
                    TfidfVectorizer(analyzer="char_wb",
                                    ngram_range=(2, 3),
                                    dtype=np.float32,
                                    decode_error="replace",
                                    stop_words="english"))]))),
             ("combiner", CosineSimilarity())
         ])),
        ("sent_len_diff",
         Pipeline(steps=[
             ('pairtransformer',
              PairTransformer(element_transformer=FuncTransformer(dtype=None,
                                                                  func=len),
                              groupby=None)),
             ('abs_diff', AbsoluteDifference()),
         ])),
    ])

    # Train a classifier on these vectors
    if regressor == 'lasso':
        classifier = LassoLarsCV(cv=5, max_iter=512, n_jobs=-1)
    elif regressor == 'RF':
        classifier = RandomForestRegressor(n_jobs=-1,
                                           max_depth=8,
                                           n_estimators=500)
    else:
        print('Error passing the regressor type')

    # Return the whole pipeline
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator
示例#6
0
def build_distance_estimator(X, y):
    """Build and fit the estimator that scores a pair of signatures.

    A ``FeatureUnion`` turns each pair of signatures into a feature vector
    (one cosine similarity per textual field, plus the absolute difference
    of the publication years); a ``GradientBoostingClassifier`` is then
    trained on those vectors.

    Parameters
    ----------
    X, y :
        Training pairs and labels, passed unchanged to ``Pipeline.fit``.

    Returns
    -------
    The fitted ``Pipeline`` with steps ``"transformer"`` and ``"classifier"``.
    """

    def _char_tfidf():
        """Return a fresh TF-IDF vectorizer over character 2- to 4-grams."""
        return TfidfVectorizer(analyzer="char_wb",
                               ngram_range=(2, 4),
                               dtype=np.float32,
                               decode_error="replace")

    def _word_tfidf():
        """Return a fresh TF-IDF vectorizer with the default analyzer."""
        return TfidfVectorizer(dtype=np.float32, decode_error="replace")

    def _similarity(field, getter, vectorizer, vectorizer_step="tf-idf"):
        """Return a pipeline: extract field, vectorize, cosine similarity."""
        return Pipeline([
            ("pairs",
             PairTransformer(element_transformer=Pipeline([
                 (field, FuncTransformer(func=getter)),
                 ("shaper", Shaper(newshape=(-1, ))),
                 (vectorizer_step, vectorizer),
             ]),
                             groupby=group_by_signature)),
            ("combiner", CosineSimilarity()),
        ])

    # Build a vector representation of a pair of signatures. Feature names
    # and order are unchanged, since they fix the classifier's column layout.
    transformer = FeatureUnion([
        ("author_full_name_similarity",
         _similarity("full_name", get_author_full_name, _char_tfidf())),
        ("author_other_names_similarity",
         _similarity("other_names", get_author_other_names, _char_tfidf())),
        # Initials are compared as a binary bag of single characters.
        ("author_initials_similarity",
         _similarity("initials",
                     get_author_initials,
                     CountVectorizer(analyzer="char_wb",
                                     ngram_range=(1, 1),
                                     binary=True,
                                     decode_error="replace"),
                     vectorizer_step="count")),
        ("affiliation_similarity",
         _similarity("affiliation", get_author_affiliation, _char_tfidf())),
        ("coauthors_similarity",
         _similarity("coauthors", get_coauthors, _word_tfidf())),
        ("title_similarity",
         _similarity("title", get_title, _char_tfidf())),
        ("journal_similarity",
         _similarity("journal", get_journal, _char_tfidf())),
        ("abstract_similarity",
         _similarity("abstract", get_abstract, _word_tfidf())),
        ("keywords_similarity",
         _similarity("keywords", get_keywords, _word_tfidf())),
        ("collaborations_similarity",
         _similarity("collaborations", get_collaborations, _word_tfidf())),
        ("references_similarity",
         _similarity("references", get_references, _word_tfidf())),
        (
            "year_diff",
            Pipeline([
                # `np.int` was only an alias of the builtin `int` and has
                # been removed from NumPy (1.24+); `int` is equivalent.
                ("pairs", FuncTransformer(func=get_year, dtype=int)),
                ("combiner", AbsoluteDifference()
                 )  # FIXME: when one is missing
            ])),
    ])

    # Train a classifier on these vectors
    classifier = GradientBoostingClassifier(n_estimators=500,
                                            max_depth=9,
                                            max_features=10,
                                            learning_rate=0.125,
                                            verbose=3)

    # Return the whole fitted pipeline
    estimator = Pipeline([("transformer", transformer),
                          ("classifier", classifier)]).fit(X, y)

    return estimator
示例#7
0
def _build_distance_estimator(X, y, verbose=0, ethnicity_estimator=None):
    """Build and fit the estimator that scores a pair of signatures.

    Each pair is turned into a feature vector by a ``FeatureUnion`` and a
    ``GradientBoostingClassifier`` is trained on the result.

    Parameters
    ----------
    X, y :
        Training pairs and labels, passed unchanged to ``Pipeline.fit``.
    verbose : int
        Forwarded to ``GradientBoostingClassifier(verbose=...)``.
    ethnicity_estimator : optional
        When not ``None``, an extra ``"author_ethnicity"`` feature is added
        that runs this estimator on the authors' full names.

    Returns
    -------
    The fitted ``Pipeline`` with steps ``"transformer"`` and ``"classifier"``.
    """
    # Options shared by the character n-gram TF-IDF features.
    char_ngram_options = dict(analyzer="char_wb",
                              ngram_range=(2, 4),
                              dtype=np.float32,
                              decode_error="replace")
    # Options of the TF-IDF features that keep the default analyzer.
    word_options = dict(dtype=np.float32, decode_error="replace")

    def grouped_pairs(element_steps):
        """Wrap element-level steps in a signature-grouped PairTransformer."""
        return PairTransformer(element_transformer=Pipeline(element_steps),
                               groupby=group_by_signature)

    def tfidf_cosine(field, getter, tfidf_options):
        """Pipeline: extract `field`, TF-IDF it, cosine similarity."""
        vectorize = grouped_pairs([
            (field, FuncTransformer(func=getter)),
            ("shaper", Shaper(newshape=(-1,))),
            ("tf-idf", TfidfVectorizer(**tfidf_options)),
        ])
        return Pipeline([
            ("pairs", vectorize),
            ("combiner", CosineSimilarity()),
        ])

    # Features tried earlier and currently disabled:
    # author_full_name_similarity, author_second_initial_similarity,
    # mesh_similarity, keywords_similarity and year_diff.
    mesh_pipeline = Pipeline([
        ("pairs", FuncTransformer(func=get_mesh_word2vec)),
        ("combiner", MyCosineSimilarity()),
    ])
    feature_list = [
        ("mesh_word2vec", mesh_pipeline),
        ("affiliation_similarity",
         tfidf_cosine("affiliation", get_author_affiliation,
                      char_ngram_options)),
        ("title_similarity",
         tfidf_cosine("title", get_title, char_ngram_options)),
        ("journal_similarity",
         tfidf_cosine("journal", get_journal, char_ngram_options)),
        ("abstract_similarity",
         tfidf_cosine("abstract", get_abstract, word_options)),
    ]

    if ethnicity_estimator is not None:
        # Optional feature: classify each full name, squash the output with
        # a sigmoid and multiply the two sides of the pair element-wise.
        ethnicity_pipeline = Pipeline([
            ("pairs", grouped_pairs([
                ("name", FuncTransformer(func=get_author_full_name)),
                ("shaper", Shaper(newshape=(-1,))),
                ("classifier", EstimatorTransformer(ethnicity_estimator)),
            ])),
            ("sigmoid", FuncTransformer(func=expit)),
            ("combiner", ElementMultiplication()),
        ])
        feature_list.append(("author_ethnicity", ethnicity_pipeline))

    transformer = FeatureUnion(feature_list)

    # Classifier trained on the feature vectors.
    classifier = GradientBoostingClassifier(
        n_estimators=2000,
        max_depth=9,
        max_features=5,
        learning_rate=0.125,
        verbose=verbose,
    )

    # Assemble the complete pipeline, fit it and hand it back.
    steps = [("transformer", transformer), ("classifier", classifier)]
    return Pipeline(steps).fit(X, y)
示例#8
0
def test_pair_transformer():
    """Test for PairTransformer.

    Uses the builtin ``float`` instead of the ``np.float`` alias (removed in
    NumPy 1.24) and relies on ``OneHotEncoder``'s default sparse output
    instead of the ``sparse=`` keyword, which newer scikit-learn releases
    renamed to ``sparse_output`` and later removed.
    """
    # Stateless element-wise function, no grouping.
    X = np.array([[0, 1], [2, 0], [2, 5]], dtype=float)
    tf = PairTransformer(element_transformer=FuncTransformer(lambda v: v + 1))
    Xt = tf.fit_transform(X)
    assert_array_almost_equal(Xt, X + 1)

    # Same function with a groupby key and repeated rows.
    X = np.array([[0, 1], [2, 0], [2, 5], [0, 1], [2, 0], [2, 5]],
                 dtype=float)
    tf = PairTransformer(element_transformer=FuncTransformer(lambda v: v + 1),
                         groupby=lambda r: r[0])
    Xt = tf.fit_transform(X)
    assert_array_almost_equal(Xt, X + 1)

    # A fitted scaler: the expected values show that it is fitted over all
    # elements of all pairs together (0..5 mapped onto 0..1).
    X = np.array([[0, 1], [2, 3], [4, 5]], dtype=float)
    Xt = PairTransformer(element_transformer=MinMaxScaler()).fit_transform(X)
    assert_array_almost_equal(Xt, [[0, 0.2], [0.4, 0.6], [0.8, 1.0]])

    # Element transformer producing sparse output: the result stays sparse.
    # OneHotEncoder() returns a sparse matrix by default in every release.
    X = np.array([[0, 1], [2, 3]], dtype=float)
    tf = PairTransformer(element_transformer=OneHotEncoder())
    Xt = tf.fit_transform(X)
    assert sp.issparse(Xt)
    assert_array_almost_equal(
        Xt.todense(), [[1, 0, 0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 1]])

    # Sparse input is accepted and sparsity is preserved.
    X = sp.csr_matrix(np.array([[0, 1], [2, 3]], dtype=float))
    tf = PairTransformer(element_transformer=StandardScaler(with_mean=False))
    Xt = tf.fit_transform(X)
    assert sp.issparse(Xt)
    assert_array_almost_equal(Xt.todense(),
                              [[0, 0.89442719], [1.78885438, 2.68328157]])