示例#1
0
def test_load_ytrue():
    """Sanity-check the labels returned by ``getytrue()``.

    The labels must have one entry per row of the multi-index built from
    ``getXst()``, contain exactly two distinct values, and be a ``pd.Series``.
    """
    pair_index = createmultiindex(X=getXst())
    labels = getytrue()
    assert labels.shape[0] == pair_index.shape[0]
    distinct_values = unique(labels)
    assert distinct_values.shape[0] == 2
    # Print ten randomly sampled labels for visual inspection.
    print(labels.sample(10))
    assert isinstance(labels, pd.Series)
示例#2
0
def test_pruningpipe():
    """Run a ``PruningPipe`` backed by a ``DfConnector`` end to end.

    Loads the data with ``getXst(nrows=500)``, restricts the labels from
    ``getytrue()`` to the pairs of that data, fits the pipe, predicts on the
    same data and prints precision, recall and balanced accuracy.

    The test only prints the scores; it makes no assertion on their values.
    """
    # pd.Timestamp.now() replaces pd.datetime.now(): the ``pd.datetime`` alias
    # was deprecated in pandas 1.0 and removed in pandas 2.0.
    print('start', pd.Timestamp.now())
    n_rows = 500
    n_cluster = 25
    Xst = getXst(nrows=n_rows)
    ixc = createmultiindex(X=Xst)
    # Keep only the labels of the pairs present in the loaded data.
    y_true = getytrue()
    y_true = y_true.loc[ixc]
    print(pd.Timestamp.now(), 'data loaded')
    pipe = PruningPipe(
        connector=DfConnector(
            scorer=Pipeline(steps=[
                ('scores', FeatureUnion(_lr_score_list)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=0))]
            )
        ),
        pruningclf=Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster)),
        sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
        classifier=LogisticRegressionCV()
    )
    pipe.fit(X=Xst, y=y_true)
    y_pred = pipe.predict(X=Xst)
    precision = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    print('***\nscores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
示例#3
0
def test_build_questions_cluster_score(fixture_data, fixture_scores):
    """Smoke test: build a cluster/score table and look up the selected questions.

    Scores the pairs of ``fixture_data`` with ``fixture_scores``, clusters the
    scores with KMeans (10 clusters), joins cluster label and mean score with
    the ``DfVisualSbs`` output, then prints a sample of the rows selected by
    ``SimpleQuestions``. The final assertion is a placeholder, so the test
    only fails if one of the steps raises.
    """
    X_lr = fixture_data
    scorer = fixture_scores
    kmeans = KMeans(n_clusters=10)
    side_by_side = DfVisualSbs().fit_transform(X=X_lr)
    scores = scorer.fit_transform(X=X_lr)
    cluster_labels = kmeans.fit_predict(X=scores)
    mean_scores = np.mean(scores, axis=1)

    # One row per pair: cluster label and mean score, indexed like the pairs.
    info = pd.DataFrame(
        data=np.column_stack([cluster_labels, mean_scores]),
        index=createmultiindex(X=X_lr),
        columns=['y_cluster', 'y_score'],
    )
    combined = pd.concat([info, side_by_side], ignore_index=False, axis=1)
    combined = combined.sort_values(by=['y_score', 'y_cluster'])

    question_picker = SimpleQuestions(n_questions=10)
    clusters_by_pair = pd.Series(data=cluster_labels,
                                 index=createmultiindex(X=X_lr))
    selected_index = question_picker.fit_transform(X=clusters_by_pair)
    selected_rows = combined.loc[selected_index]
    print(selected_rows.sample(5))
    assert True
示例#4
0
def test_esconnector():
    """Run a ``PruningPipe`` backed by an ``EsConnector`` end to end.

    Loads the data with ``getXst(nrows=500)``, uses its first element as the
    input of the pipe, and scores it against the Elasticsearch index
    ``"right"`` through a default ``elasticsearch.Elasticsearch()`` client.
    Prints precision, recall and balanced accuracy from ``get_commonscores``.

    The test only prints the scores; it makes no assertion on their values.
    """
    # pd.Timestamp.now() replaces pd.datetime.now(): the ``pd.datetime`` alias
    # was deprecated in pandas 1.0 and removed in pandas 2.0.
    print('start', pd.Timestamp.now())
    n_rows = 500
    n_cluster = 25
    Xst = getXst(nrows=n_rows)
    left = Xst[0]
    esclient = elasticsearch.Elasticsearch()
    # Per-field comparison type passed to the connector.
    scoreplan = {
        'name': {
            'type': 'FreeText'
        },
        'street': {
            'type': 'FreeText'
        },
        'city': {
            'type': 'FreeText'
        },
        'duns': {
            'type': 'Exact'
        },
        'postalcode': {
            'type': 'FreeText'
        },
        'countrycode': {
            'type': 'Exact'
        }
    }
    escon = EsConnector(
        client=esclient,
        scoreplan=scoreplan,
        index="right",
        explain=False,
        size=20
    )
    ixc = createmultiindex(X=Xst)
    # Keep only the labels of the pairs present in the loaded data.
    y_true = getytrue()
    y_true = y_true.loc[ixc]
    print(pd.Timestamp.now(), 'data loaded')
    pipe = PruningPipe(
        connector=escon,
        pruningclf=Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster)),
        sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
        classifier=LogisticRegressionCV()
    )
    pipe.fit(X=left, y=y_true)
    y_pred = pipe.predict(X=left)
    scores = get_commonscores(y_pred=y_pred, y_true=y_true)
    precision = scores['precision']
    recall = scores['recall']
    accuracy = scores['balanced_accuracy']
    print('***\nscores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
示例#5
0
    def _getindex(self, X, y=None):
        """
        Build the cartesian product index of the two dataframes in X.

        Delegates to createmultiindex, passing self.ixnamepairs as the names.

        Args:
            X (list): [df_source, df_target]
            y (pd.Series/pd.DataFrame/pd.MultiIndex): ignored

        Returns:
            pd.MultiIndex
        """
        return createmultiindex(X=X, names=self.ixnamepairs)
示例#6
0
    def fit(self, X, y=None):
        """
        Store the pair index of X and two lookup Series derived from it.

        Sets:
            self.index: index built by createmultiindex from X
            self.dfnum: Series named 'ix', position -> index entry
            self.dfix: Series named 'ixnum', index entry -> position

        Args:
            X (list): [df_source, df_target]
            y: ignored

        Returns:
            self
        """
        pair_index = createmultiindex(X=X, names=self.ixnamepairs)
        self.index = pair_index
        n_pairs = len(pair_index)
        # Position -> index entry.
        self.dfnum = pd.Series(data=pair_index.values,
                               index=np.arange(0, n_pairs),
                               name='ix')
        # Index entry -> position.
        self.dfix = pd.Series(data=np.arange(0, n_pairs),
                              index=pair_index,
                              name='ixnum')
        return self
示例#7
0
def getytrue(Xst=None):
    """
    Build the label vector for every pair of the source/target dataframes.

    Every pair starts at 0; pairs that also appear in the saved 'ytrue.csv'
    file take the value of its 'y_true' column.

    Args:
        Xst: source and target dataframe for which to get the labelling;
            loaded with getXst() when None

    Returns:
        pd.Series: supervised training data, named 'y_true' and indexed by
            the pair index of Xst
    """
    frames = getXst() if Xst is None else Xst
    ix_all = createmultiindex(X=frames)
    n_pairs = ix_all.shape[0]
    labels = pd.Series(data=zeros(shape=(n_pairs, )),
                       index=ix_all,
                       name='y_true')
    labels = labels.fillna(0)
    saved = open_csv(filename='ytrue.csv',
                     foldername=_folder_companydf,
                     index_col=[0, 1])
    saved_labels = saved['y_true']
    # Only overwrite the pairs that exist in both the saved file and Xst.
    known_pairs = saved_labels.index.intersection(ix_all)
    labels.loc[known_pairs] = saved_labels
    return labels
示例#8
0
def test_ask_simple_questions_return_multiindex(fixture_data, fixture_scores):
    """Check the index returned by ``SimpleQuestions.fit_transform``.

    The index must be one-dimensional, non-empty, hold at most
    ``n_questions * n_clusters`` entries, and be usable to select rows of the
    ``DfVisualSbs`` output one-to-one.
    """
    n_clusters = 5
    n_questions = 6
    X_lr = fixture_data
    scores = fixture_scores.fit_transform(X=X_lr)
    kmeans = KMeans(n_clusters=n_clusters)
    cluster_labels = kmeans.fit_predict(X=scores)
    y_cluster = pd.Series(data=cluster_labels, index=createmultiindex(X=X_lr))

    question_picker = SimpleQuestions(n_questions=n_questions)
    ix_questions = question_picker.fit_transform(X=y_cluster)
    assert ix_questions.ndim == 1
    n_selected = ix_questions.shape[0]
    assert n_selected <= n_questions * n_clusters
    assert n_selected > 0

    # Every selected pair must be found in the side-by-side view.
    side_by_side = DfVisualSbs().fit_transform(X=X_lr)
    selected_rows = side_by_side.loc[ix_questions]
    assert selected_rows.shape[0] == ix_questions.shape[0]
示例#9
0
def test_hardquestions(fixture_data, fixture_scores):
    """Check the index returned by ``HardQuestions.fit_transform``.

    The index must be one-dimensional, non-empty, hold at most
    ``n_questions * n_clusters`` entries, and be usable to select rows of the
    ``DfVisualSbs`` output one-to-one.
    """
    n_clusters = 5
    n_questions = 6
    X_lr = fixture_data
    # NOTE(review): labels are loaded for the default data, not for X_lr;
    # confirm this is intended.
    y_true = getytrue()
    scores = fixture_scores.fit_transform(X=X_lr)
    kmeans = KMeans(n_clusters=n_clusters)
    cluster_labels = kmeans.fit_predict(X=scores)
    y_cluster = pd.Series(data=cluster_labels, index=createmultiindex(X=X_lr))

    question_picker = HardQuestions(n_questions=n_questions)
    ix_questions = question_picker.fit_transform(X=y_cluster, y=y_true)
    assert ix_questions.ndim == 1
    n_selected = ix_questions.shape[0]
    assert n_selected <= n_questions * n_clusters
    assert n_selected > 0

    # Every selected pair must be found in the side-by-side view.
    side_by_side = DfVisualSbs().fit_transform(X=X_lr)
    selected_rows = side_by_side.loc[ix_questions]
    assert selected_rows.shape[0] == ix_questions.shape[0]
示例#10
0
def test_clusterclassifier(fixture_scores, fixture_data):
    """Fit a ``ClusterClassifier`` on KMeans clusters and print its diagnostics.

    Scores the pairs of ``fixture_data``, reduces the scores to 3 PCA
    components, clusters them with KMeans (10 clusters), fits the classifier
    on the labelled pairs and prints the cluster categories and the count of
    each predicted value. Nothing is asserted; the test only fails if a step
    raises.
    """
    n_clusters = 10
    n_questions = 200
    X_lr = fixture_data
    y_true = getytrue(Xst=X_lr)
    raw_scores = fixture_scores.fit_transform(X=X_lr)
    reduced_scores = PCA(n_components=3).fit_transform(raw_scores)
    cluster = KMeans(n_clusters=n_clusters)
    cluster_labels = cluster.fit_predict(X=reduced_scores)
    y_cluster = pd.Series(data=cluster_labels, index=createmultiindex(X=X_lr))

    # The selected questions are computed but not used further below.
    question_picker = SimpleQuestions(n_questions=n_questions)
    ix_questions = question_picker.fit_transform(X=y_cluster)

    # Keep only the labels of the pairs that were clustered.
    shared_index = y_cluster.index.intersection(y_true.index)
    y_true = y_true.loc[shared_index]
    print('number of labellized rows found :{}'.format(len(y_true)))

    clf = ClusterClassifier(cluster=cluster)
    clf.fit(X=y_cluster, y=y_true)
    print('all match: {}'.format(clf.allmatch))
    print('no match: {}'.format(clf.nomatch))
    print('mixed match: {}'.format(clf.mixedmatch))
    print('not found: {}'.format(clf.notfound))

    y_pred = clf.predict(X=y_cluster)
    prediction_counts = pd.Series(y_pred).value_counts()
    print(prediction_counts)
    ('street_token', SbsApplyComparator(on='street', comparator='token')),
    ('city_fuzzy', SbsApplyComparator(on='city', comparator='simple')),
    ('postalcode_fuzzy', SbsApplyComparator(on='postalcode', comparator='simple')),
    ('postalcode_contains', SbsApplyComparator(on='postalcode', comparator='contains')),
]

# Parameters of the run.
n_rows = 500  # Number of rows to compare in each dataset
n_cluster = 10  # Number of clusters used in the exploratory step
n_simplequestions = 100  # Number of questions per cluster
n_pointedquestions = 100  # Number of additional questions for clusters with mixed matches


## Load the data
# pd.Timestamp.now() replaces pd.datetime.now(): the ``pd.datetime`` alias
# was deprecated in pandas 1.0 and removed in pandas 2.0.
print('start', pd.Timestamp.now())
Xst = getXst(nrows=n_rows)
ixc = createmultiindex(X=Xst)
# Load the label vector restricted to the pairs of Xst
y_true = getytrue().loc[ixc]
print(y_true.value_counts())
print(pd.Timestamp.now(), 'data loaded')

## Explore the data:
# The connector scores the pairs, then replaces missing scores with 0.
connector = DfConnector(
        scorer=Pipeline(steps=[
            ('scores', FeatureUnion(_lr_score_list)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=0))]
        )
    )
### Build the explorer used for the unsupervised clustering step
explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster), n_simple=n_simplequestions, n_hard=n_pointedquestions)
# From here on Xst is rebound to the output of the connector.
Xst = connector.fit_transform(X=Xst)
示例#12
0
 def getindex(self, X):
     """Return the index built from X by createmultiindex, named with self.ixnamepairs."""
     index = createmultiindex(X=X, names=self.ixnamepairs)
     return index