def test_load_ytrue():
    """Check that ``getytrue()`` returns a label Series matching the pair index.

    The index built by ``createmultiindex`` from the default ``getXst()``
    data must have as many entries as the label vector, ``unique`` must
    report exactly two distinct label values, and the labels must be a
    ``pd.Series``. A random sample of ten labels is printed for inspection.
    """
    pair_index = createmultiindex(X=getXst())
    labels = getytrue()
    n_expected = pair_index.shape[0]
    assert labels.shape[0] == n_expected
    n_distinct = unique(labels).shape[0]
    assert n_distinct == 2
    print(labels.sample(10))
    assert isinstance(labels, pd.Series)
def test_pruningpipe():
    """Fit a ``PruningPipe`` with a ``DfConnector`` and print its scores.

    Loads ``n_rows`` rows via ``getXst``, restricts the labels returned by
    ``getytrue()`` to the pair index of that data, fits the pipe, predicts
    on the same data and prints precision, recall and balanced accuracy.
    The test makes no assertion on the scores; it only checks that the
    whole pipeline runs end to end.
    """
    # ``pd.datetime`` was deprecated in pandas 1.0 and removed in 2.0;
    # the standard-library class is the same object it used to alias.
    from datetime import datetime

    print('start', datetime.now())
    n_rows = 500
    n_cluster = 25
    Xst = getXst(nrows=n_rows)
    ixc = createmultiindex(X=Xst)
    # Keep only the labels for the pairs actually present in Xst.
    y_true = getytrue()
    y_true = y_true.loc[ixc]
    print(datetime.now(), 'data loaded')
    scorer = Pipeline(steps=[
        ('scores', FeatureUnion(_lr_score_list)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ])
    pipe = PruningPipe(
        connector=DfConnector(scorer=scorer),
        pruningclf=Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster)),
        sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
        classifier=LogisticRegressionCV()
    )
    pipe.fit(X=Xst, y=y_true)
    y_pred = pipe.predict(X=Xst)
    precision = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    print('***\nscores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
def test_build_questions_cluster_score(fixture_data, fixture_scores):
    """Build a question table from cluster labels and mean scores.

    Scores the fixture data, clusters the scores with ``KMeans`` (10
    clusters), joins cluster label and row-wise mean score with the
    side-by-side view from ``DfVisualSbs``, sorts by score then cluster,
    and selects the rows picked by ``SimpleQuestions``. Five of them are
    printed; the test only checks that the whole chain runs.

    Args:
        fixture_data: left/right data passed as ``X`` to the transformers.
        fixture_scores: scorer exposing ``fit_transform``.
    """
    X_lr = fixture_data
    kmeans = KMeans(n_clusters=10)
    X_sbs = DfVisualSbs().fit_transform(X=X_lr)
    X_score = fixture_scores.fit_transform(X=X_lr)
    y_cluster = kmeans.fit_predict(X=X_score)
    y_score = np.mean(X_score, axis=1)
    X_info = pd.DataFrame(
        data=np.column_stack([y_cluster, y_score]),
        index=createmultiindex(X=X_lr),
        columns=['y_cluster', 'y_score']
    )
    X_all = pd.concat([X_info, X_sbs], ignore_index=False, axis=1)
    X_all = X_all.sort_values(by=['y_score', 'y_cluster'])
    clusters = pd.Series(data=y_cluster, index=createmultiindex(X=X_lr))
    ix_questions = SimpleQuestions(n_questions=10).fit_transform(X=clusters)
    X_questions = X_all.loc[ix_questions]
    print(X_questions.sample(5))
    assert True
def test_esconnector():
    """Fit a ``PruningPipe`` with an ``EsConnector`` and print its scores.

    Builds an ``EsConnector`` on a default ``elasticsearch.Elasticsearch()``
    client against the index named ``"right"``. Only the first element of
    ``Xst`` is passed to ``fit`` and ``predict``. Prints precision, recall
    and balanced accuracy from ``get_commonscores``; no assertion is made
    on the scores.

    NOTE(review): presumably needs a reachable Elasticsearch instance with
    the ``right`` index already populated -- confirm against the test setup.
    """
    # ``pd.datetime`` was deprecated in pandas 1.0 and removed in 2.0;
    # the standard-library class is the same object it used to alias.
    from datetime import datetime

    print('start', datetime.now())
    n_rows = 500
    n_cluster = 25
    Xst = getXst(nrows=n_rows)
    left = Xst[0]
    esclient = elasticsearch.Elasticsearch()
    # Comparison type per field, handed to the connector as its score plan.
    scoreplan = {
        'name': {
            'type': 'FreeText'
        },
        'street': {
            'type': 'FreeText'
        },
        'city': {
            'type': 'FreeText'
        },
        'duns': {
            'type': 'Exact'
        },
        'postalcode': {
            'type': 'FreeText'
        },
        'countrycode': {
            'type': 'Exact'
        }
    }
    escon = EsConnector(
        client=esclient,
        scoreplan=scoreplan,
        index="right",
        explain=False,
        size=20
    )
    ixc = createmultiindex(X=Xst)
    # Keep only the labels for the pairs actually present in Xst.
    y_true = getytrue()
    y_true = y_true.loc[ixc]
    print(datetime.now(), 'data loaded')
    pipe = PruningPipe(
        connector=escon,
        pruningclf=Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster)),
        sbsmodel=FeatureUnion(transformer_list=_sbs_score_list),
        classifier=LogisticRegressionCV()
    )
    pipe.fit(X=left, y=y_true)
    y_pred = pipe.predict(X=left)
    scores = get_commonscores(y_pred=y_pred, y_true=y_true)
    precision = scores['precision']
    recall = scores['recall']
    accuracy = scores['balanced_accuracy']
    print('***\nscores:\n')
    print('precision score:{}\n recall score:{}\n balanced accuracy score:{}'.format(
        precision, recall, accuracy))
def _getindex(self, X, y=None):
    """
    Build the pair index for both dataframes via ``createmultiindex``,
    using ``self.ixnamepairs`` as the level names.

    Args:
        X (list): [df_source, df_target]
        y (pd.Series/pd.DataFrame/pd.MultiIndex): ignored

    Returns:
        pd.MultiIndex
    """
    return createmultiindex(X=X, names=self.ixnamepairs)
def fit(self, X, y=None):
    """
    Store the pair index of ``X`` and two lookup Series between index
    entries and their integer positions.

    Sets ``self.index`` (the pair index), ``self.dfnum`` (position ->
    index entry, named 'ix') and ``self.dfix`` (index entry -> position,
    named 'ixnum').

    Args:
        X (list): [df_source, df_target]
        y: ignored

    Returns:
        self
    """
    self.index = createmultiindex(X=X, names=self.ixnamepairs)
    n_pairs = len(self.index)
    # Forward lookup: integer position -> index entry.
    self.dfnum = pd.Series(
        data=self.index.values,
        index=np.arange(0, n_pairs),
        name='ix'
    )
    # Reverse lookup: index entry -> integer position.
    self.dfix = pd.Series(
        data=np.arange(0, n_pairs),
        index=self.index,
        name='ixnum'
    )
    return self
def getytrue(Xst=None):
    """
    Build the label vector for every pair in ``Xst``.

    All pairs start at 0; pairs that also appear in the saved
    ``ytrue.csv`` file take the value stored there.

    Args:
        Xst: source and target dataframes to label; loaded with
            ``getXst()`` when omitted

    Returns:
        pd.Series: labels named 'y_true', indexed by the pair index of Xst
    """
    Xst = getXst() if Xst is None else Xst
    ix_all = createmultiindex(X=Xst)
    n_pairs = ix_all.shape[0]
    blank = pd.Series(data=zeros(shape=(n_pairs, )), index=ix_all, name='y_true')
    y_true = blank.fillna(0)
    saved = open_csv(filename='ytrue.csv', foldername=_folder_companydf, index_col=[0, 1])
    y_saved = saved['y_true']
    # Only overwrite pairs present in both the saved labels and this index.
    known = y_saved.index.intersection(ix_all)
    y_true.loc[known] = y_saved
    return y_true
def test_ask_simple_questions_return_multiindex(fixture_data, fixture_scores):
    """Check the shape of the index returned by ``SimpleQuestions``.

    Clusters the scored fixture data with ``KMeans`` and asks
    ``SimpleQuestions`` for question rows. The result must be
    one-dimensional, non-empty, hold at most ``n_questions * n_clusters``
    entries, and select exactly that many rows from the side-by-side view.

    Args:
        fixture_data: left/right data passed as ``X`` to the transformers.
        fixture_scores: scorer exposing ``fit_transform``.
    """
    n_clusters, n_questions = 5, 6
    X_lr = fixture_data
    X_score = fixture_scores.fit_transform(X=X_lr)
    kmeans = KMeans(n_clusters=n_clusters)
    labels = kmeans.fit_predict(X=X_score)
    y_cluster = pd.Series(data=labels, index=createmultiindex(X=X_lr))
    ix_questions = SimpleQuestions(n_questions=n_questions).fit_transform(X=y_cluster)
    assert ix_questions.ndim == 1
    n_found = ix_questions.shape[0]
    assert n_found <= n_questions * n_clusters
    assert n_found > 0
    X_sbs = DfVisualSbs().fit_transform(X=X_lr)
    X_questions = X_sbs.loc[ix_questions]
    assert X_questions.shape[0] == ix_questions.shape[0]
def test_hardquestions(fixture_data, fixture_scores):
    """Check the shape of the index returned by ``HardQuestions``.

    Clusters the scored fixture data with ``KMeans`` and asks
    ``HardQuestions`` for question rows, passing the labels from
    ``getytrue()`` as ``y``. The result must be one-dimensional, non-empty,
    hold at most ``n_questions * n_clusters`` entries, and select exactly
    that many rows from the side-by-side view.

    Args:
        fixture_data: left/right data passed as ``X`` to the transformers.
        fixture_scores: scorer exposing ``fit_transform``.
    """
    n_clusters, n_questions = 5, 6
    X_lr = fixture_data
    # NOTE(review): labels are loaded for the default data, not for X_lr.
    y_true = getytrue()
    X_score = fixture_scores.fit_transform(X=X_lr)
    kmeans = KMeans(n_clusters=n_clusters)
    labels = kmeans.fit_predict(X=X_score)
    y_cluster = pd.Series(data=labels, index=createmultiindex(X=X_lr))
    picker = HardQuestions(n_questions=n_questions)
    ix_questions = picker.fit_transform(X=y_cluster, y=y_true)
    assert ix_questions.ndim == 1
    n_found = ix_questions.shape[0]
    assert n_found <= n_questions * n_clusters
    assert n_found > 0
    X_sbs = DfVisualSbs().fit_transform(X=X_lr)
    X_questions = X_sbs.loc[ix_questions]
    assert X_questions.shape[0] == ix_questions.shape[0]
def test_clusterclassifier(fixture_scores, fixture_data):
    """Fit a ``ClusterClassifier`` on clustered scores and print a summary.

    Scores the fixture data, reduces it to three PCA components, clusters
    with ``KMeans`` and fits ``ClusterClassifier`` on the cluster labels
    against the labels from ``getytrue``. Prints the classifier's
    ``allmatch``, ``nomatch``, ``mixedmatch`` and ``notfound`` attributes
    and the value counts of its predictions. No assertion is made.

    Args:
        fixture_scores: scorer exposing ``fit_transform``.
        fixture_data: left/right data passed as ``X`` to the transformers.
    """
    n_clusters, n_questions = 10, 200
    X_lr = fixture_data
    y_true = getytrue(Xst=X_lr)
    X_raw = fixture_scores.fit_transform(X=X_lr)
    X_reduced = PCA(n_components=3).fit_transform(X_raw)
    cluster = KMeans(n_clusters=n_clusters)
    labels = cluster.fit_predict(X=X_reduced)
    y_cluster = pd.Series(data=labels, index=createmultiindex(X=X_lr))
    questions = SimpleQuestions(n_questions=n_questions)
    # Result is not used below; the call is kept as in the original test.
    ix_questions = questions.fit_transform(X=y_cluster)
    common = y_cluster.index.intersection(y_true.index)
    y_true = y_true.loc[common]
    print('number of labellized rows found :{}'.format(len(y_true)))
    clf = ClusterClassifier(cluster=cluster)
    clf.fit(X=y_cluster, y=y_true)
    print('all match: {}'.format(clf.allmatch))
    print('no match: {}'.format(clf.nomatch))
    print('mixed match: {}'.format(clf.mixedmatch))
    print('not found: {}'.format(clf.notfound))
    y_pred = clf.predict(X=y_cluster)
    res = pd.Series(y_pred).value_counts()
    print(res)
('street_token', SbsApplyComparator(on='street', comparator='token')), ('city_fuzzy', SbsApplyComparator(on='city', comparator='simple')), ('postalcode_fuzzy', SbsApplyComparator(on='postalcode', comparator='simple')), ('postalcode_contains', SbsApplyComparator(on='postalcode', comparator='contains')), ] n_rows = 500 # Number of rows to compare in each datasets n_cluster = 10 # Number of clusters used in the exploratory step n_simplequestions = 100 # Number of questions per cluster n_pointedquestions = 100 # Number of additional questions for clusters with mixed matches ##Load the data print('start', pd.datetime.now()) Xst = getXst(nrows=n_rows) ixc = createmultiindex(X=Xst) # Load the vector corresponding to Xst y_true = getytrue().loc[ixc] print(y_true.value_counts()) print(pd.datetime.now(), 'data loaded') ## Explore the data: connector = DfConnector( scorer=Pipeline(steps=[ ('scores', FeatureUnion(_lr_score_list)), ('imputer', SimpleImputer(strategy='constant', fill_value=0))] ) ) ### Fit the cluster non-supervizes explorer = Explorer(clustermixin=KBinsCluster(n_clusters=n_cluster), n_simple=n_simplequestions, n_hard=n_pointedquestions) Xst = connector.fit_transform(X=Xst)
def getindex(self, X):
    """
    Build the pair index of ``X`` via ``createmultiindex``, using
    ``self.ixnamepairs`` as the level names.

    Args:
        X: input forwarded unchanged to ``createmultiindex``

    Returns:
        the index produced by ``createmultiindex``
    """
    pair_index = createmultiindex(X=X, names=self.ixnamepairs)
    return pair_index