def test_load_ytrue():
    """Check the label vector returned by ``getytrue``.

    The labels must have as many rows as the multi-index built from
    ``getXst()``, contain exactly two distinct values, and be a
    ``pd.Series``.
    """
    pair_index = createmultiindex(X=getXst())
    labels = getytrue()

    n_labels = labels.shape[0]
    assert n_labels == pair_index.shape[0]

    n_distinct = unique(labels).shape[0]
    assert n_distinct == 2

    # Show a random sample of ten labels in the test output.
    print(labels.sample(10))
    assert isinstance(labels, pd.Series)
def test_pruningpipe():
    """Fit a ``PruningPipe`` backed by a ``DfConnector`` and print its scores.

    The pipe is fitted and evaluated on the same data; precision, recall and
    balanced accuracy of the predictions are printed (nothing is asserted).
    """
    # pd.Timestamp.now() replaces the pd.datetime alias, which was deprecated
    # in pandas 1.0 and removed in pandas 2.0.
    print('start', pd.Timestamp.now())
    n_rows = 500
    n_cluster = 25

    Xst = getXst(nrows=n_rows)
    ixc = createmultiindex(X=Xst)
    # Restrict the labels to the pairs present in the multi-index.
    y_true = getytrue().loc[ixc]
    print(pd.Timestamp.now(), 'data loaded')

    pipe = PruningPipe(
        connector=DfConnector(
            scorer=Pipeline(steps=[
                ('scores', FeatureUnion(_lr_score_list)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
            ])
        ),
        pruningclf=Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster)),
        sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
        classifier=LogisticRegressionCV()
    )
    pipe.fit(X=Xst, y=y_true)
    y_pred = pipe.predict(X=Xst)

    precision = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    print('***\nscores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
def test_esconnector():
    """Fit a ``PruningPipe`` backed by an ``EsConnector`` and print its scores.

    Only the first element of ``Xst`` is passed to the pipe; the connector is
    configured to query the Elasticsearch index named ``"right"``.
    Scores are printed, nothing is asserted.
    """
    # pd.Timestamp.now() replaces the pd.datetime alias, which was deprecated
    # in pandas 1.0 and removed in pandas 2.0.
    print('start', pd.Timestamp.now())
    n_rows = 500
    n_cluster = 25

    Xst = getXst(nrows=n_rows)
    left = Xst[0]

    esclient = elasticsearch.Elasticsearch()
    scoreplan = {
        'name': {'type': 'FreeText'},
        'street': {'type': 'FreeText'},
        'city': {'type': 'FreeText'},
        'duns': {'type': 'Exact'},
        'postalcode': {'type': 'FreeText'},
        'countrycode': {'type': 'Exact'},
    }
    escon = EsConnector(
        client=esclient,
        scoreplan=scoreplan,
        index="right",
        explain=False,
        size=20
    )

    ixc = createmultiindex(X=Xst)
    # Restrict the labels to the pairs present in the multi-index.
    y_true = getytrue().loc[ixc]
    print(pd.Timestamp.now(), 'data loaded')

    pipe = PruningPipe(
        connector=escon,
        pruningclf=Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster)),
        sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
        classifier=LogisticRegressionCV()
    )
    pipe.fit(X=left, y=y_true)
    y_pred = pipe.predict(X=left)

    scores = get_commonscores(y_pred=y_pred, y_true=y_true)
    precision = scores['precision']
    recall = scores['recall']
    accuracy = scores['balanced_accuracy']
    print('***\nscores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
def test_sbsmodel():
    """Fit a ``PipeSbsClf`` on side-by-side data and print its score.

    The side-by-side frame is restricted to the rows present in the label
    index before fitting. Nothing is asserted.
    """
    X_lr = getXst(nrows=100)
    y_true = getytrue(Xst=X_lr)

    # Build the side-by-side view, then keep only the labelled rows.
    sbs_frame = DfVisualSbs().fit_transform(X=X_lr)
    sbs_frame = sbs_frame.loc[y_true.index]

    comparators = make_union(
        SbsApplyComparator(on='name', comparator='simple'),
        SbsApplyComparator(on='name', comparator='token'),
        SbsApplyComparator(on='street', comparator='simple'),
    )
    zero_imputer = SimpleImputer(strategy='constant', fill_value=0)
    scoring_steps = make_pipeline(comparators, zero_imputer)

    model = PipeSbsClf(transformer=scoring_steps, classifier=Classifier())
    model.fit(X=sbs_frame, y=y_true)
    print(model.score(X=sbs_frame, y=y_true))
def test_lrmodel():
    """Fit a ``PipeDfClf`` on the left/right data and print its score.

    The transformer is a union of vectorizer and exact-match connectors
    followed by a constant-zero imputer. Nothing is asserted.
    """
    X_lr = getXst(nrows=100)
    y_true = getytrue(Xst=X_lr)

    connectors = make_union(
        VectorizerConnector(on='name', analyzer='char'),
        VectorizerConnector(on='street', analyzer='char'),
        ExactConnector(on='countrycode'),
        ExactConnector(on='postalcode'),
        ExactConnector(on='duns'),
    )
    zero_imputer = SimpleImputer(strategy='constant', fill_value=0)
    scoring_steps = make_pipeline(connectors, zero_imputer)

    model = PipeDfClf(transformer=scoring_steps, classifier=Classifier())

    # The transformer is run once on its own; its output is not used below.
    _ = model.transformer.fit_transform(X=X_lr)

    model.fit(X=X_lr, y=y_true)
    print(model.score(X=X_lr, y=y_true))
def test_explorer():
    """Walk through the ``Explorer`` question/fit/predict cycle and print results.

    Steps: score the pairs with a ``DfConnector``, ask simple questions,
    ask hard questions using the simple answers, fit the explorer on the
    union of both answer sets, then print the prediction value counts.
    Nothing is asserted.
    """
    # pd.Timestamp.now() replaces the pd.datetime alias, which was deprecated
    # in pandas 1.0 and removed in pandas 2.0.
    print(pd.Timestamp.now())
    n_rows = 200
    n_cluster = 10
    n_simplequestions = 200
    n_hardquestions = 200

    Xst = getXst(nrows=n_rows)
    y_true = getytrue(Xst=Xst)
    print(pd.Timestamp.now(), 'data loaded')

    connector = DfConnector(scorer=Pipeline(steps=[
        ('scores', FeatureUnion(_score_list)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ]))
    explorer = Explorer(
        clustermixin=KBinsCluster(n_clusters=n_cluster),
        n_simple=n_simplequestions,
        n_hard=n_hardquestions
    )

    connector.fit(X=Xst)
    # Xsm is the transformed output from the connector, i.e. the score matrix
    Xsm = connector.transform(X=Xst)
    print(pd.Timestamp.now(), 'score ok')
    # ixc is the index corresponding to the score matrix
    ixc = Xsm.index

    ix_simple = explorer.ask_simple(X=pd.DataFrame(data=Xsm, index=ixc), fit_cluster=True)
    print(pd.Timestamp.now(), 'length of ix_simple {}'.format(ix_simple.shape[0]))
    sbs_simple = connector.getsbs(X=Xst, on_ix=ix_simple)
    print('***** SBS SIMPLE ******')
    print(sbs_simple.sample(5))
    print('*****')

    # Labels for the simple questions drive the selection of hard questions.
    y_simple = y_true.loc[ix_simple]
    ix_hard = explorer.ask_hard(X=pd.DataFrame(data=Xsm, index=ixc), y=y_simple)
    print(pd.Timestamp.now(), 'length of ix_hard {}'.format(ix_hard.shape[0]))
    sbs_hard = connector.getsbs(X=Xst, on_ix=ix_hard)
    print(sbs_hard.sample(5))
    print('*****')

    y_train = y_true.loc[ix_simple.union(ix_hard)]
    print('length of y_train: {}'.format(y_train.shape[0]))
    explorer.fit(X=pd.DataFrame(data=Xsm, index=ixc), y=y_train, fit_cluster=True)
    print('results of pred:\n', pd.Series(explorer.predict(X=Xsm)).value_counts())
    print('****')
def test_pruning():
    """Use the ``Explorer`` as a pruning classifier and print its scores.

    The explorer is fitted on the answers to the simple and hard questions,
    then predicts on the whole score matrix. Predictions greater than 0 are
    counted as positives when computing precision, recall and balanced
    accuracy. Nothing is asserted.
    """
    # pd.Timestamp.now() replaces the pd.datetime alias, which was deprecated
    # in pandas 1.0 and removed in pandas 2.0.
    print('start', pd.Timestamp.now())
    n_rows = 200
    n_cluster = 10
    n_simplequestions = 200
    n_hardquestions = 200

    Xst = getXst(nrows=n_rows)
    y_true = getytrue(Xst=Xst)
    print(pd.Timestamp.now(), 'data loaded')

    connector = DfConnector(scorer=Pipeline(steps=[
        ('scores', FeatureUnion(_score_list)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ]))
    explorer = Explorer(
        clustermixin=KBinsCluster(n_clusters=n_cluster),
        n_simple=n_simplequestions,
        n_hard=n_hardquestions
    )

    connector.fit(X=Xst)
    # Xsm is the transformed output from the connector, i.e. the score matrix
    Xsm = connector.transform(X=Xst)
    print(pd.Timestamp.now(), 'score ok')
    # ixc is the index corresponding to the score matrix
    ixc = Xsm.index
    y_true = y_true.loc[ixc]

    ix_simple = explorer.ask_simple(X=pd.DataFrame(data=Xsm, index=ixc), fit_cluster=True)
    ix_hard = explorer.ask_hard(X=pd.DataFrame(data=Xsm, index=ixc), y=y_true.loc[ix_simple])
    ix_train = ix_simple.union(ix_hard)
    print('number of training samples:{}'.format(ix_train.shape[0]))

    X_train = pd.DataFrame(data=Xsm, index=ixc).loc[ix_train]
    y_train = y_true.loc[ix_train]
    explorer.fit(X=X_train, y=y_train, fit_cluster=True)

    y_pruning = explorer.predict(X=Xsm)
    y_pruning = pd.Series(data=y_pruning, name='y_pruning', index=ixc)
    # Any strictly positive pruning output is treated as a predicted match.
    y_pred = (y_pruning > 0).astype(int)

    precision = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    print('***\npruning scores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
def test_hardquestions(fixture_data, fixture_scores):
    """Check the index returned by ``HardQuestions.fit_transform``.

    The returned index must be one-dimensional, non-empty, hold at most
    ``n_questions * n_clusters`` entries, and select the same number of rows
    from the side-by-side frame.
    """
    n_clusters = 5
    n_questions = 6

    X_lr = fixture_data
    y_true = getytrue()

    # Score the pairs, then cluster the scores with KMeans.
    score_matrix = fixture_scores.fit_transform(X=X_lr)
    kmeans = KMeans(n_clusters=n_clusters)
    cluster_labels = kmeans.fit_predict(X=score_matrix)
    y_cluster = pd.Series(data=cluster_labels, index=createmultiindex(X=X_lr))

    asker = HardQuestions(n_questions=n_questions)
    ix_questions = asker.fit_transform(X=y_cluster, y=y_true)

    n_asked = ix_questions.shape[0]
    assert ix_questions.ndim == 1
    assert n_asked <= n_questions * n_clusters
    assert n_asked > 0

    sbs_frame = DfVisualSbs().fit_transform(X=X_lr)
    selected = sbs_frame.loc[ix_questions]
    assert selected.shape[0] == ix_questions.shape[0]
def test_pipe_df():
    """Cross-validate a gradient-boosting pipeline on ``DfConnector`` scores.

    The feature matrix is the connector's score matrix concatenated
    column-wise with the side-by-side scores from ``scorer_sbs``. The average
    test precision, recall and accuracy over 5 folds are printed; nothing is
    asserted beyond the source/target column check.
    """
    df_source = getsource(nrows=100)
    df_target = gettarget(nrows=100)
    assert df_source.columns.equals(df_target.columns)
    # pd.Timestamp.now() replaces the pd.datetime alias, which was deprecated
    # in pandas 1.0 and removed in pandas 2.0.
    print(pd.Timestamp.now(), ' | ', 'number of rows on left:{}'.format(df_source.shape[0]))
    print(pd.Timestamp.now(), ' | ', 'number of rows on right:{}'.format(df_target.shape[0]))

    scorer = FeatureUnion(transformer_list=[
        ('name_char', VectorizerConnector(on='name', analyzer='char')),
        ('street_char', VectorizerConnector(on='street', analyzer='char')),
        ('countrycode_exact', ExactConnector(on='countrycode')),
    ])
    dfcon = DfConnector(scorer=scorer)
    Xsm = dfcon.fit_transform(X=[df_source, df_target])
    ix_con = Xsm.index
    y_true = getytrue(Xst=[df_source, df_target]).loc[ix_con]

    Xsbs = dfcon.getsbs(X=[df_source, df_target], on_ix=ix_con)
    scores_further = scorer_sbs.fit_transform(X=Xsbs)
    # Column names are taken from the first element of each _sbs_score_list entry.
    scores_further = pd.DataFrame(
        data=scores_further,
        index=ix_con,
        columns=[c[0] for c in _sbs_score_list]
    )
    scores_further = pd.concat([Xsm, scores_further], axis=1, ignore_index=False)
    X = scores_further

    scoring = ['precision', 'recall', 'accuracy']
    print(pd.Timestamp.now(), ' | starting score')
    pipe = Pipeline(steps=[
        ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
        ('PCA', PCA(n_components=4)),
        ('Predictor', GradientBoostingClassifier(n_estimators=1000, max_depth=5)),
    ])
    scores = cross_validate(estimator=pipe, X=X, y=y_true, scoring=scoring, cv=5)
    for c in scoring:
        print(pd.Timestamp.now(), ' | {} score1: {}'.format(c, np.average(scores['test_' + c])))
def test_pipeModel():
    """Fit a ``PruningDfSbsClf`` built from two sub-models and print its score.

    The first sub-model is a ``PipeDfClf`` whose classifier is a hand-written
    rule (``myfunc``); the second is a ``PipeSbsClf`` using ``Classifier``.
    Nothing is asserted.
    """
    X_lr = getXst(nrows=100)
    y_true = getytrue(Xst=X_lr)

    # --- first stage: left/right scores + rule-based classifier ---
    lr_scores = make_union(
        VectorizerConnector(on='name', analyzer='word'),
        VectorizerConnector(on='street', analyzer='word'),
        ExactConnector(on='countrycode'),
        ExactConnector(on='duns'),
    )
    lr_transformer = make_pipeline(
        lr_scores,
        SimpleImputer(strategy='constant', fill_value=0),
    )

    def myfunc(X):
        """Rule: duns equals 1, or country equals 1 with name or street above 0.3.

        Columns 0..3 of ``X`` are read as name, street, country and duns,
        matching the order of the connectors in the union above.
        """
        name_score = X[:, 0]
        street_score = X[:, 1]
        country_score = X[:, 2]
        duns_score = X[:, 3]
        text_hit = np.logical_or(name_score > 0.3, street_score > 0.3)
        country_and_text = np.logical_and(country_score == 1, text_hit)
        return np.logical_or(duns_score == 1, country_and_text)

    lrmodel = PipeDfClf(
        transformer=lr_transformer,
        classifier=FunctionClassifier(func=myfunc),
    )

    # --- second stage: side-by-side comparators + Classifier ---
    sbs_scores = make_union(
        SbsApplyComparator(on='name', comparator='simple'),
        SbsApplyComparator(on='name', comparator='token'),
        SbsApplyComparator(on='street', comparator='simple'),
        SbsApplyComparator(on='city', comparator='simple'),
        SbsApplyComparator(on='postalcode', comparator='simple'),
    )
    sbs_transformer = make_pipeline(
        sbs_scores,
        SimpleImputer(strategy='constant', fill_value=0),
    )
    sbsmodel = PipeSbsClf(transformer=sbs_transformer, classifier=Classifier())

    totalpipe = PruningDfSbsClf(lrmodel=lrmodel, sbsmodel=sbsmodel)
    # The same label vector is supplied for both stages.
    totalpipe.fit(X=X_lr, y_lr=y_true, y_sbs=y_true)
    print(totalpipe.score(X=X_lr, y=y_true))
def test_clusterclassifier(fixture_scores, fixture_data):
    """Fit a ``ClusterClassifier`` on KMeans cluster labels and print its output.

    Scores are reduced to 3 PCA components before clustering. The classifier's
    ``allmatch``, ``nomatch``, ``mixedmatch`` and ``notfound`` attributes and
    the value counts of its predictions are printed. Nothing is asserted.
    """
    n_clusters = 10
    n_questions = 200

    X_lr = fixture_data
    y_true = getytrue(Xst=X_lr)

    raw_scores = fixture_scores.fit_transform(X=X_lr)
    reduced_scores = PCA(n_components=3).fit_transform(raw_scores)

    cluster = KMeans(n_clusters=n_clusters)
    cluster_labels = cluster.fit_predict(X=reduced_scores)
    y_cluster = pd.Series(data=cluster_labels, index=createmultiindex(X=X_lr))

    # The selected question index is not used further in this test.
    asker = SimpleQuestions(n_questions=n_questions)
    _ = asker.fit_transform(X=y_cluster)

    # Keep only the labels whose index also appears in the cluster series.
    shared_index = y_cluster.index.intersection(y_true.index)
    y_true = y_true.loc[shared_index]
    print('number of labellized rows found :{}'.format(len(y_true)))

    clf = ClusterClassifier(cluster=cluster)
    clf.fit(X=y_cluster, y=y_true)
    print('all match: {}'.format(clf.allmatch))
    print('no match: {}'.format(clf.nomatch))
    print('mixed match: {}'.format(clf.mixedmatch))
    print('not found: {}'.format(clf.notfound))

    predictions = clf.predict(X=y_cluster)
    print(pd.Series(predictions).value_counts())
def test_pipe_es():
    """Cross-validate a gradient-boosting pipeline on ``EsConnector`` scores.

    The feature matrix is the connector's ``es_score`` column concatenated
    column-wise with the side-by-side scores from ``scorer_sbs``. The average
    test precision, recall and accuracy over 5 folds are printed; nothing is
    asserted beyond the source/target column check.
    """
    df_source = getsource(nrows=100)
    df_target = gettarget(nrows=None)
    assert df_source.columns.equals(df_target.columns)
    # pd.Timestamp.now() replaces the pd.datetime alias, which was deprecated
    # in pandas 1.0 and removed in pandas 2.0.
    print(pd.Timestamp.now(), ' | ', 'number of rows on left:{}'.format(df_source.shape[0]))
    print(pd.Timestamp.now(), ' | ', 'number of rows on right:{}'.format(df_target.shape[0]))

    esclient = elasticsearch.Elasticsearch()
    scoreplan = {
        'name': {'type': 'FreeText'},
        'street': {'type': 'FreeText'},
        'city': {'type': 'FreeText'},
        'duns': {'type': 'Exact'},
        'postalcode': {'type': 'FreeText'},
        'countrycode': {'type': 'Exact'},
    }
    escon = EsConnector(
        client=esclient,
        scoreplan=scoreplan,
        index="right",
        explain=False,
        size=10
    )

    # Xsm is the similarity matrix
    Xsm = escon.fit_transform(X=df_source)
    ix_con = Xsm.index
    y_true = getytrue(Xst=[df_source, df_target]).loc[ix_con]

    Xsbs = escon.getsbs(X=df_source, on_ix=ix_con)
    scores_further = scorer_sbs.fit_transform(X=Xsbs)
    # Column names are taken from the first element of each _sbs_score_list entry.
    scores_further = pd.DataFrame(
        data=scores_further,
        index=ix_con,
        columns=[c[0] for c in _sbs_score_list]
    )
    scores_further = pd.concat([Xsm[['es_score']], scores_further], axis=1, ignore_index=False)
    X = scores_further

    scoring = ['precision', 'recall', 'accuracy']
    print(pd.Timestamp.now(), ' | starting score')
    pipe = Pipeline(steps=[
        ('Impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('Scaler', Normalizer()),
        ('PCA', PCA(n_components=4)),
        ('Predictor', GradientBoostingClassifier(n_estimators=1000, max_depth=5)),
    ])
    scores = cross_validate(estimator=pipe, X=X, y=y_true, scoring=scoring, cv=5)
    for c in scoring:
        print(pd.Timestamp.now(), ' | {} score1: {}'.format(c, np.average(scores['test_' + c])))
('postalcode_fuzzy', SbsApplyComparator(on='postalcode', comparator='simple')), ('postalcode_contains', SbsApplyComparator(on='postalcode', comparator='contains')), ] n_rows = 500 # Number of rows to compare in each datasets n_cluster = 10 # Number of clusters used in the exploratory step n_simplequestions = 100 # Number of questions per cluster n_pointedquestions = 100 # Number of additional questions for clusters with mixed matches ##Load the data print('start', pd.datetime.now()) Xst = getXst(nrows=n_rows) ixc = createmultiindex(X=Xst) # Load the vector corresponding to Xst y_true = getytrue().loc[ixc] print(y_true.value_counts()) print(pd.datetime.now(), 'data loaded') ## Explore the data: connector = DfConnector( scorer=Pipeline(steps=[ ('scores', FeatureUnion(_lr_score_list)), ('imputer', SimpleImputer(strategy='constant', fill_value=0))] ) ) ### Fit the cluster non-supervizes explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster), n_simple=n_simplequestions, n_hard=n_pointedquestions) Xst = connector.fit_transform(X=Xst) explorer.fit_cluster(X=Xst)
from sklearn.pipeline import make_union
from suricate.dftransformers.vectorizer import VectorizerConnector
from suricate.data.base import ix_names
from suricate.data.companies import getsource, gettarget, getXst, getytrue

# Shared data, loaded once when the module is imported.
left = getsource(nrows=100)
right = gettarget(nrows=100)
X_lr = getXst(nrows=100)
y_true = getytrue(Xst=X_lr)


def test_loaddata():
    """Print the index name and the row counts of the left and right frames."""
    print(ix_names['ixname'])
    for frame in (left, right):
        print(frame.shape[0])
    assert True


def test_tfidf():
    """Check that un-pruned vectorizer scores cover every left/right pair.

    With ``pruning=False`` on both connectors, the score matrix is expected
    to have ``len(left) * len(right)`` rows.
    """
    n_pairs = left.shape[0] * right.shape[0]
    scorer = make_union(
        VectorizerConnector(on='name', analyzer='char', pruning=False),
        VectorizerConnector(on='street', analyzer='char', pruning=False),
    )
    scorer.fit(X=X_lr)
    score_matrix = scorer.transform(X=X_lr)
    assert score_matrix.shape[0] == n_pairs