def test_pruningpipe():
    print('start', pd.datetime.now())
    n_rows = 500
    n_cluster = 25
    n_simplequestions = 50
    n_pointedquestions = 50
    Xst = getXst(nrows=n_rows)
    ixc = createmultiindex(X=Xst)
    y_true = getytrue()
    y_true = y_true.loc[ixc]
    print(pd.datetime.now(), 'data loaded')
    pipe = PruningPipe(
        connector=DfConnector(
            scorer=Pipeline(steps=[
                ('scores', FeatureUnion(_lr_score_list)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=0))]
            )
        ),
        pruningclf=Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster)),
        sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
        classifier=LogisticRegressionCV()
    )
    pipe.fit(X=Xst, y=y_true)
    y_pred = pipe.predict(X=Xst)
    precision = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    print('***\nscores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
def test_esconnector():
    print('start', pd.datetime.now())
    n_rows = 500
    n_cluster = 25
    Xst = getXst(nrows=n_rows)
    left = Xst[0]
    esclient = elasticsearch.Elasticsearch()
    scoreplan = {
        'name': {'type': 'FreeText'},
        'street': {'type': 'FreeText'},
        'city': {'type': 'FreeText'},
        'duns': {'type': 'Exact'},
        'postalcode': {'type': 'FreeText'},
        'countrycode': {'type': 'Exact'}
    }
    escon = EsConnector(
        client=esclient,
        scoreplan=scoreplan,
        index="right",
        explain=False,
        size=20
    )
    ixc = createmultiindex(X=Xst)
    y_true = getytrue()
    y_true = y_true.loc[ixc]
    print(pd.datetime.now(), 'data loaded')
    pipe = PruningPipe(
        connector=escon,
        pruningclf=Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster)),
        sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
        classifier=LogisticRegressionCV()
    )
    pipe.fit(X=left, y=y_true)
    y_pred = pipe.predict(X=left)
    scores = get_commonscores(y_pred=y_pred, y_true=y_true)
    precision = scores['precision']
    recall = scores['recall']
    accuracy = scores['balanced_accuracy']
    print('***\nscores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
def test_explorer():
    print(pd.datetime.now())
    n_rows = 200
    n_cluster = 10
    n_simplequestions = 200
    n_hardquestions = 200
    Xst = getXst(nrows=n_rows)
    y_true = getytrue(Xst=Xst)
    print(pd.datetime.now(), 'data loaded')
    connector = DfConnector(scorer=Pipeline(
        steps=[
            ('scores', FeatureUnion(_score_list)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=0))]))
    explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster),
                        n_simple=n_simplequestions,
                        n_hard=n_hardquestions)
    connector.fit(X=Xst)
    # Xsm is the transformed output from the connector, i.e. the score matrix
    Xsm = connector.transform(X=Xst)
    print(pd.datetime.now(), 'score ok')
    # ixc is the index corresponding to the score matrix
    ixc = Xsm.index
    ix_simple = explorer.ask_simple(X=pd.DataFrame(data=Xsm, index=ixc),
                                    fit_cluster=True)
    print(pd.datetime.now(), 'length of ix_simple {}'.format(ix_simple.shape[0]))
    sbs_simple = connector.getsbs(X=Xst, on_ix=ix_simple)
    print('***** SBS SIMPLE ******')
    print(sbs_simple.sample(5))
    print('*****')
    y_simple = y_true.loc[ix_simple]
    ix_hard = explorer.ask_hard(X=pd.DataFrame(data=Xsm, index=ixc), y=y_simple)
    print(pd.datetime.now(), 'length of ix_hard {}'.format(ix_hard.shape[0]))
    sbs_hard = connector.getsbs(X=Xst, on_ix=ix_hard)
    print(sbs_hard.sample(5))
    print('*****')
    y_train = y_true.loc[ix_simple.union(ix_hard)]
    print('length of y_train: {}'.format(y_train.shape[0]))
    explorer.fit(X=pd.DataFrame(data=Xsm, index=ixc), y=y_train, fit_cluster=True)
    print('results of pred:\n', pd.Series(explorer.predict(X=Xsm)).value_counts())
    print('****')
def test_pruning():
    print('start', pd.datetime.now())
    n_rows = 200
    n_cluster = 10
    n_simplequestions = 200
    n_hardquestions = 200
    Xst = getXst(nrows=n_rows)
    y_true = getytrue(Xst=Xst)
    print(pd.datetime.now(), 'data loaded')
    connector = DfConnector(scorer=Pipeline(
        steps=[
            ('scores', FeatureUnion(_score_list)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=0))]))
    explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster),
                        n_simple=n_simplequestions,
                        n_hard=n_hardquestions)
    connector.fit(X=Xst)
    # Xsm is the transformed output from the connector, i.e. the score matrix
    Xsm = connector.transform(X=Xst)
    print(pd.datetime.now(), 'score ok')
    # ixc is the index corresponding to the score matrix
    ixc = Xsm.index
    y_true = y_true.loc[ixc]
    ix_simple = explorer.ask_simple(X=pd.DataFrame(data=Xsm, index=ixc),
                                    fit_cluster=True)
    ix_hard = explorer.ask_hard(X=pd.DataFrame(data=Xsm, index=ixc),
                                y=y_true.loc[ix_simple])
    ix_train = ix_simple.union(ix_hard)
    print('number of training samples:{}'.format(ix_train.shape[0]))
    X_train = pd.DataFrame(data=Xsm, index=ixc).loc[ix_train]
    y_train = y_true.loc[ix_train]
    explorer.fit(X=X_train, y=y_train, fit_cluster=True)
    # Any cluster label > 0 is treated as a predicted match
    y_pruning = explorer.predict(X=Xsm)
    y_pruning = pd.Series(data=y_pruning, name='y_pruning', index=ixc)
    y_pred = (y_pruning > 0).astype(int)
    precision = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    print('***\npruning scores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
Xst = getXst(nrows=n_rows)
ixc = createmultiindex(X=Xst)

# Load the true-label vector corresponding to Xst
y_true = getytrue().loc[ixc]
print(y_true.value_counts())
print(pd.datetime.now(), 'data loaded')

## Explore the data:
connector = DfConnector(
    scorer=Pipeline(steps=[
        ('scores', FeatureUnion(_lr_score_list)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0))]
    )
)

### Fit the cluster (unsupervised)
explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster),
                    n_simple=n_simplequestions,
                    n_hard=n_pointedquestions)
Xst = connector.fit_transform(X=Xst)
explorer.fit_cluster(X=Xst)

### Ask simple questions
ix_simple = explorer.ask_simple(X=Xst)
Sbs_simple = connector.getsbs(X=Xst, on_ix=ix_simple)
y_simple = y_true.loc[ix_simple]

### Fit the cluster with supervised data
explorer.fit(X=Xst, y=y_simple, fit_cluster=False)

### Ask hard (pointed) questions
ix_hard = explorer.ask_hard(X=Xst, y=y_simple)
Sbs_hard = connector.getsbs(X=Xst, on_ix=ix_hard)
y_hard = y_true.loc[ix_hard]
# REBUILD Y_true
y_true = pd.read_sql(sql="SELECT * FROM y_true WHERE y_true.y_true = 1",
                     con=engine).set_index(['ix_source', 'ix_target'], drop=True)
y_truetemp = Xst[['ix']]
y_truetemp['y_true'] = 0
y_truetemp.loc[y_true.index.intersection(Xst.index), 'y_true'] = y_true.loc[
    y_true.index.intersection(Xst.index), 'y_true']
y_true = y_truetemp.copy()
del y_truetemp
### y_true now has a multiindex, an ix column, and a y_true column

## Fit the cluster on unsupervised data
exp = Explorer(n_simple=n_questions, n_hard=n_questions)
exp.fit_cluster(X=Xst[['es_score']])
y_cluster = pd.Series(data=exp.pred_cluster(X=Xst), index=Xst.index, name='y_cluster')
X_cluster = pd.DataFrame(y_cluster)
X_cluster['avg_score'] = Xst[['es_score']].mean(axis=1)
X_cluster['y_true'] = y_true['y_true']
X_cluster['ix'] = Xst['ix']
X_cluster.reset_index(inplace=True, drop=False)
X_cluster.set_index('ix', inplace=True)
X_cluster = X_cluster[[
    'ix_source', 'ix_target', 'avg_score', 'y_cluster', 'y_true'
]]
X_cluster.to_sql('cluster_output', con=engine, if_exists='replace')
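# Illustrative follow-up, not part of the original script: once cluster_output has
# been written, reading it back and looking at the share of true matches per cluster
# is one way to see which clusters the pruning step could safely discard.
# Assumption: the same `engine` and the `cluster_output` table created above are
# still available.
cluster_summary = pd.read_sql(sql="SELECT * FROM cluster_output", con=engine)
print(cluster_summary.groupby('y_cluster')['y_true'].agg(['mean', 'count']))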