示例#1
0
    def test_select_matcher_valid_multiple_metrics(self):
        """Check the drill-down stats tables for precision, f1 and recall.

        Runs ``select_matcher`` over six matchers on the feature vectors read
        from ``path_f`` and verifies, for each of the three metrics, that the
        drill-down dataframe starts with the ``Name`` / ``Matcher`` /
        ``Num folds`` columns and ends with ``Mean score``.  Finally checks
        that the mean precision score of the selected matcher equals the
        maximum of the precision ``Mean score`` column.
        """
        A = read_csv_metadata(path_a, key='id')
        B = read_csv_metadata(path_b, key='id')
        feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
        matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                    LinRegMatcher(), LogRegMatcher()]

        result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                                target_attr='gold', k=7)
        header = {'Name', 'Matcher', 'Num folds'}
        drill_down = result['drill_down_cv_stats']
        # Each metric's dataframe is validated against its *own* columns; the
        # recall check previously indexed the precision dataframe by mistake.
        for metric in ('precision', 'f1', 'recall'):
            stats = drill_down[metric]
            # Comparing the sets directly makes a failure show the difference.
            self.assertEqual(header, set(stats.columns[:3]), msg=metric)
            self.assertEqual('Mean score', stats.columns[-1], msg=metric)

        d = drill_down['precision'].set_index('Name')
        p_max = d.loc[result['selected_matcher'].name, 'Mean score']
        # Series.max() avoids the pd.np alias, which newer pandas no longer has.
        a_max = d['Mean score'].max()
        self.assertEqual(p_max, a_max)
    def test_select_matcher_valid_cv_stats_3(self):
        """Check the summary ``cv_stats`` table when selecting on recall.

        Calls ``select_matcher`` with ``metric_to_select_matcher='recall'`` and
        ``metrics_to_display='recall'``, then verifies that the first two
        columns of ``cv_stats`` are ``Matcher`` and ``Average recall`` and that
        the selected matcher's average recall equals the maximum ``Mean score``
        in the recall drill-down table.
        """
        A = read_csv_metadata(path_a, key='id')
        B = read_csv_metadata(path_b, key='id')
        feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
        matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                    LinRegMatcher(), LogRegMatcher()]

        result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                                metric_to_select_matcher='recall',
                                metrics_to_display='recall',
                                target_attr='gold', k=7)
        result_df = result['cv_stats']
        result_df_r = result['drill_down_cv_stats']['recall']
        # Comparing the sets directly makes a failure show the difference.
        self.assertEqual({'Matcher', 'Average recall'}, set(result_df.columns[:2]))
        d = result_df.set_index('Matcher')
        # Label-based lookup via .loc; DataFrame.ix no longer exists in pandas.
        p_max = d.loc[result['selected_matcher'].name, 'Average recall']
        # Series.max() avoids the pd.np alias, which newer pandas no longer has.
        a_max = result_df_r['Mean score'].max()
        self.assertEqual(p_max, a_max)
示例#3
0
    def test_select_matcher_invalid_no_display_drill_down(self):
        """Look up a drill-down metric that was not requested for display.

        ``select_matcher`` is asked to display only ``precision``; the test
        then indexes the drill-down stats with ``'recall'``.

        NOTE(review): nothing in this method asserts the expected failure;
        presumably an expected-exception decorator was intended -- confirm.
        """
        left = read_csv_metadata(path_a, key='id')
        right = read_csv_metadata(path_b, key='id')
        fv_table = read_csv_metadata(path_f, ltable=left, rtable=right)
        candidates = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                      LinRegMatcher(), LogRegMatcher()]
        skipped_columns = ['ltable.id', 'rtable.id', '_id', 'gold']

        selection = select_matcher(candidates, x=None, y=None, table=fv_table,
                                   exclude_attrs=skipped_columns,
                                   metrics_to_display=['precision'],
                                   target_attr='gold', k=7)
        # Only the lookup itself matters; its value is never used.
        _ = selection['drill_down_cv_stats']['recall']
示例#4
0
    def test_select_matcher_valid_1(self):
        """Check the precision drill-down table produced from a feature table.

        Runs ``select_matcher`` with the ``table`` / ``exclude_attrs`` /
        ``target_attr`` interface and verifies that the precision drill-down
        dataframe starts with ``Name`` / ``Matcher`` / ``Num folds``, ends with
        ``Mean score``, and that the selected matcher's mean score is the
        maximum of that column.
        """
        A = read_csv_metadata(path_a, key='id')
        B = read_csv_metadata(path_b, key='id')
        feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
        matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                    LinRegMatcher(), LogRegMatcher()]

        result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                                target_attr='gold', k=7)
        result_df = result['drill_down_cv_stats']['precision']
        # Comparing the sets directly makes a failure show the difference.
        self.assertEqual({'Name', 'Matcher', 'Num folds'}, set(result_df.columns[:3]))
        self.assertEqual('Mean score', result_df.columns[-1])
        d = result_df.set_index('Name')
        p_max = d.loc[result['selected_matcher'].name, 'Mean score']
        # Series.max() avoids the pd.np alias, which newer pandas no longer has.
        a_max = d['Mean score'].max()
        self.assertEqual(p_max, a_max)
 def test_select_matcher_valid_2(self):
     """Check matcher selection when features and labels are passed as x / y.

     Builds ``X`` from the feature-vector columns left after removing the
     columns reported by ``cm.get_key`` / ``cm.get_fk_ltable`` /
     ``cm.get_fk_rtable`` and the ``gold`` column, uses ``gold`` as ``Y``, and
     verifies the layout of the precision drill-down table and that the
     selected matcher's mean score is the maximum of the ``Mean score`` column.
     """
     A = read_csv_metadata(path_a, key='id')
     B = read_csv_metadata(path_b, key='id')
     feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
     matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                 LinRegMatcher(), LogRegMatcher()]
     non_feature_cols = [cm.get_key(feature_vectors),
                         cm.get_fk_ltable(feature_vectors),
                         cm.get_fk_rtable(feature_vectors),
                         'gold']
     # list_diff presumably keeps the columns not listed in non_feature_cols.
     feature_cols = list_diff(list(feature_vectors.columns), non_feature_cols)
     X = feature_vectors[feature_cols]
     Y = feature_vectors['gold']
     result = select_matcher(matchers, x=X, y=Y)
     result_df = result['drill_down_cv_stats']['precision']
     # Comparing the sets directly makes a failure show the difference.
     self.assertEqual({'Name', 'Matcher', 'Num folds'}, set(result_df.columns[:3]))
     self.assertEqual('Mean score', result_df.columns[-1])
     d = result_df.set_index('Name')
     # Label-based lookup via .loc; DataFrame.ix no longer exists in pandas.
     p_max = d.loc[result['selected_matcher'].name, 'Mean score']
     # Series.max() avoids the pd.np alias, which newer pandas no longer has.
     a_max = d['Mean score'].max()
     self.assertEqual(p_max, a_max)
示例#6
0
 def test_select_matcher_valid_2(self):
     """Check matcher selection when features and labels are passed as x / y.

     Builds ``X`` from the feature-vector columns left after removing the
     columns reported by ``cm.get_key`` / ``cm.get_fk_ltable`` /
     ``cm.get_fk_rtable`` and the ``gold`` column, uses ``gold`` as ``Y``, and
     verifies the layout of the precision drill-down table and that the
     selected matcher's mean score is the maximum of the ``Mean score`` column.
     """
     A = read_csv_metadata(path_a, key='id')
     B = read_csv_metadata(path_b, key='id')
     feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
     matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                 LinRegMatcher(), LogRegMatcher()]
     non_feature_cols = [cm.get_key(feature_vectors),
                         cm.get_fk_ltable(feature_vectors),
                         cm.get_fk_rtable(feature_vectors),
                         'gold']
     # list_diff presumably keeps the columns not listed in non_feature_cols.
     feature_cols = list_diff(list(feature_vectors.columns), non_feature_cols)
     X = feature_vectors[feature_cols]
     Y = feature_vectors['gold']
     result = select_matcher(matchers, x=X, y=Y)
     result_df = result['drill_down_cv_stats']['precision']
     # Comparing the sets directly makes a failure show the difference.
     self.assertEqual({'Name', 'Matcher', 'Num folds'}, set(result_df.columns[:3]))
     self.assertEqual('Mean score', result_df.columns[-1])
     d = result_df.set_index('Name')
     p_max = d.loc[result['selected_matcher'].name, 'Mean score']
     # Series.max() avoids the pd.np alias, which newer pandas no longer has.
     a_max = d['Mean score'].max()
     self.assertEqual(p_max, a_max)
    def test_select_matcher_target_attr_not_present(self):
        """Call ``select_matcher`` with ``target_attr='labels1'``.

        The feature table is first reduced to the columns that remain after
        removing the ones reported by ``cm.get_fk_ltable`` and
        ``cm.get_fk_rtable``; ``exclude_attrs`` is passed as the bare string
        ``'_id'`` and ``k`` as 2.

        NOTE(review): the method name suggests ``labels1`` is absent from the
        table, but nothing here asserts the expected failure -- confirm how it
        is checked.
        """
        left = read_csv_metadata(path_a, key='id')
        right = read_csv_metadata(path_b, key='id')
        fv_table = read_csv_metadata(path_f, ltable=left, rtable=right)
        candidates = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                      LinRegMatcher(), LogRegMatcher()]

        all_columns = list(fv_table.columns)
        fk_columns = [cm.get_fk_ltable(fv_table), cm.get_fk_rtable(fv_table)]
        kept_columns = list_diff(all_columns, fk_columns)
        fv_table = fv_table[kept_columns]

        select_matcher(candidates, x=None, y=None, table=fv_table,
                       exclude_attrs='_id',
                       target_attr='labels1', k=2)
    def test_select_matcher_valid_multiple_metrics(self):
        """Check the drill-down stats tables for precision, f1 and recall.

        Runs ``select_matcher`` over six matchers on the feature vectors read
        from ``path_f`` and verifies, for each of the three metrics, that the
        drill-down dataframe starts with the ``Name`` / ``Matcher`` /
        ``Num folds`` columns and ends with ``Mean score``.  Finally checks
        that the mean precision score of the selected matcher equals the
        maximum of the precision ``Mean score`` column.
        """
        A = read_csv_metadata(path_a, key='id')
        B = read_csv_metadata(path_b, key='id')
        feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
        matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                    LinRegMatcher(), LogRegMatcher()]

        result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                                target_attr='gold', k=7)
        header = {'Name', 'Matcher', 'Num folds'}
        drill_down = result['drill_down_cv_stats']
        # Each metric's dataframe is validated against its *own* columns; the
        # recall check previously indexed the precision dataframe by mistake.
        for metric in ('precision', 'f1', 'recall'):
            stats = drill_down[metric]
            # Comparing the sets directly makes a failure show the difference.
            self.assertEqual(header, set(stats.columns[:3]), msg=metric)
            self.assertEqual('Mean score', stats.columns[-1], msg=metric)

        d = drill_down['precision'].set_index('Name')
        # Label-based lookup via .loc; DataFrame.ix no longer exists in pandas.
        p_max = d.loc[result['selected_matcher'].name, 'Mean score']
        # Series.max() avoids the pd.np alias, which newer pandas no longer has.
        a_max = d['Mean score'].max()
        self.assertEqual(p_max, a_max)
    def test_select_matcher_valid_1(self):
        """Check the precision drill-down table produced from a feature table.

        Runs ``select_matcher`` with the ``table`` / ``exclude_attrs`` /
        ``target_attr`` interface and verifies that the precision drill-down
        dataframe starts with ``Name`` / ``Matcher`` / ``Num folds``, ends with
        ``Mean score``, and that the selected matcher's mean score is the
        maximum of that column.
        """
        A = read_csv_metadata(path_a, key='id')
        B = read_csv_metadata(path_b, key='id')
        feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
        matchers = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                    LinRegMatcher(), LogRegMatcher()]

        result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                                target_attr='gold', k=7)
        result_df = result['drill_down_cv_stats']['precision']
        # Comparing the sets directly makes a failure show the difference.
        self.assertEqual({'Name', 'Matcher', 'Num folds'}, set(result_df.columns[:3]))
        self.assertEqual('Mean score', result_df.columns[-1])
        d = result_df.set_index('Name')
        # Label-based lookup via .loc; DataFrame.ix no longer exists in pandas.
        p_max = d.loc[result['selected_matcher'].name, 'Mean score']
        # Series.max() avoids the pd.np alias, which newer pandas no longer has.
        a_max = d['Mean score'].max()
        self.assertEqual(p_max, a_max)
示例#10
0
 def test_select_matcher_target_attr_not_series(self):
     """Call ``select_matcher`` with ``y`` selected via ``[['gold']]``.

     ``X`` holds the columns left after removing the ones reported by
     ``cm.get_fk_ltable`` / ``cm.get_fk_rtable`` and ``gold``; ``Y`` is taken
     with a list selector rather than a single column label.

     NOTE(review): the method name suggests a non-Series ``y`` should be
     rejected, but nothing here asserts the expected failure -- confirm how
     it is checked.
     """
     left = read_csv_metadata(path_a, key='id')
     right = read_csv_metadata(path_b, key='id')
     fv_table = read_csv_metadata(path_f, ltable=left, rtable=right)
     candidates = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                   LinRegMatcher(), LogRegMatcher()]
     all_columns = list(fv_table.columns)
     dropped = [cm.get_fk_ltable(fv_table), cm.get_fk_rtable(fv_table), 'gold']
     kept_columns = list_diff(all_columns, dropped)
     X = fv_table[kept_columns]
     # Selecting with a list of labels (double brackets) is deliberate here.
     Y = fv_table[['gold']]
     select_matcher(candidates, x=X, y=Y)
示例#11
0
    def test_select_matcher_valid_cv_stats_3(self):
        """Check the summary ``cv_stats`` table when selecting on recall.

        Calls ``select_matcher`` with ``metric_to_select_matcher='recall'`` and
        ``metrics_to_display='recall'``, then verifies that the first two
        columns of ``cv_stats`` are ``Matcher`` and ``Average recall`` and that
        the selected matcher's average recall equals the maximum ``Mean score``
        in the recall drill-down table.
        """
        left = read_csv_metadata(path_a, key='id')
        right = read_csv_metadata(path_b, key='id')
        fv_table = read_csv_metadata(path_f, ltable=left, rtable=right)
        candidates = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                      LinRegMatcher(), LogRegMatcher()]
        skipped_columns = ['ltable.id', 'rtable.id', '_id', 'gold']

        selection = select_matcher(candidates, x=None, y=None, table=fv_table,
                                   exclude_attrs=skipped_columns,
                                   metric_to_select_matcher='recall',
                                   metrics_to_display='recall',
                                   target_attr='gold', k=7)
        expected_header = ['Matcher', 'Average recall']
        summary = selection['cv_stats']
        recall_details = selection['drill_down_cv_stats']['recall']

        leading_columns = set(list(summary.columns[[0, 1]]))
        self.assertTrue(set(expected_header) == leading_columns)

        by_matcher = summary.set_index('Matcher')
        chosen_name = selection['selected_matcher'].name
        chosen_score = by_matcher.loc[chosen_name, 'Average recall']
        best_score = np.max(recall_details['Mean score'])
        self.assertEqual(chosen_score, best_score)
def selector_matcher_combiner(matchers, combiners, x=None, y=None, table=None, exclude_attrs=None, target_attr=None,
                              weights=None, threshold=None, k=5):
    """Run ``select_matcher`` over the candidates built from matchers and combiners.

    ``matchers`` and ``combiners`` may each be a single object or a list; any
    non-list value is wrapped in a one-element list.  The candidate list comes
    from ``get_matcher_list(matchers, combiners, weights, threshold)`` and is
    passed to ``select_matcher`` together with ``x``, ``y``, ``table``,
    ``exclude_attrs``, ``target_attr`` and ``k``, whose result is returned.
    """
    matchers = matchers if isinstance(matchers, list) else [matchers]
    combiners = combiners if isinstance(combiners, list) else [combiners]
    candidates = get_matcher_list(matchers, combiners, weights, threshold)
    selection_kwargs = dict(x=x, y=y, table=table, exclude_attrs=exclude_attrs,
                            target_attr=target_attr, k=k)
    return select_matcher(candidates, **selection_kwargs)
    def test_select_matcher_invalid_metric_to_select_matcher(self):
        """Call ``select_matcher`` with ``metric_to_select_matcher='test'``.

        NOTE(review): the method name suggests this metric should be
        rejected, but nothing here asserts the expected failure -- confirm how
        it is checked.
        """
        left = read_csv_metadata(path_a, key='id')
        right = read_csv_metadata(path_b, key='id')
        fv_table = read_csv_metadata(path_f, ltable=left, rtable=right)
        candidates = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                      LinRegMatcher(), LogRegMatcher()]
        skipped_columns = ['ltable.id', 'rtable.id', '_id', 'gold']

        select_matcher(candidates, x=None, y=None, table=fv_table,
                       exclude_attrs=skipped_columns,
                       metric_to_select_matcher='test',
                       target_attr='gold', k=7)
示例#14
0
    def test_select_matcher_invalid_metric_to_select_matcher(self):
        """Call ``select_matcher`` with ``metric_to_select_matcher='test'``.

        NOTE(review): the method name suggests this metric should be
        rejected, but nothing here asserts the expected failure -- confirm how
        it is checked.
        """
        left = read_csv_metadata(path_a, key='id')
        right = read_csv_metadata(path_b, key='id')
        fv_table = read_csv_metadata(path_f, ltable=left, rtable=right)
        candidates = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                      LinRegMatcher(), LogRegMatcher()]
        skipped_columns = ['ltable.id', 'rtable.id', '_id', 'gold']

        select_matcher(candidates, x=None, y=None, table=fv_table,
                       exclude_attrs=skipped_columns,
                       metric_to_select_matcher='test',
                       target_attr='gold', k=7)
    def test_select_matcher_invalid_no_display_drill_down(self):
        """Look up a drill-down metric that was not requested for display.

        ``select_matcher`` is asked to display only ``precision``; the test
        then indexes the drill-down stats with ``'recall'``.

        NOTE(review): nothing in this method asserts the expected failure;
        presumably an expected-exception decorator was intended -- confirm.
        """
        left = read_csv_metadata(path_a, key='id')
        right = read_csv_metadata(path_b, key='id')
        fv_table = read_csv_metadata(path_f, ltable=left, rtable=right)
        candidates = [DTMatcher(), NBMatcher(), RFMatcher(), SVMMatcher(),
                      LinRegMatcher(), LogRegMatcher()]
        skipped_columns = ['ltable.id', 'rtable.id', '_id', 'gold']

        selection = select_matcher(candidates, x=None, y=None, table=fv_table,
                                   exclude_attrs=skipped_columns,
                                   metrics_to_display=['precision'],
                                   target_attr='gold', k=7)
        # Only the lookup itself matters; its value is never used.
        _ = selection['drill_down_cv_stats']['recall']
def selector_matcher_combiner(matchers,
                              combiners,
                              x=None,
                              y=None,
                              table=None,
                              exclude_attrs=None,
                              target_attr=None,
                              weights=None,
                              threshold=None,
                              k=5):
    """Run ``select_matcher`` over the candidates built from matchers and combiners.

    ``matchers`` and ``combiners`` may each be a single object or a list; any
    non-list value is wrapped in a one-element list.  The candidate list comes
    from ``get_matcher_list(matchers, combiners, weights, threshold)`` and is
    passed to ``select_matcher`` together with ``x``, ``y``, ``table``,
    ``exclude_attrs``, ``target_attr`` and ``k``, whose result is returned.
    """
    matchers = matchers if isinstance(matchers, list) else [matchers]
    combiners = combiners if isinstance(combiners, list) else [combiners]
    candidates = get_matcher_list(matchers, combiners, weights, threshold)
    selection_kwargs = dict(x=x,
                            y=y,
                            table=table,
                            exclude_attrs=exclude_attrs,
                            target_attr=target_attr,
                            k=k)
    return select_matcher(candidates, **selection_kwargs)
 def test_select_matcher_invalid_df(self):
     """Call ``select_matcher`` with an empty string passed as ``table``.

     NOTE(review): the method name suggests this input should be rejected,
     but nothing here asserts the expected failure -- confirm how it is
     checked.
     """
     bad_call = dict(matchers=[], table="", exclude_attrs=[], target_attr="")
     select_matcher(**bad_call)
示例#18
0
 def test_select_matcher_invalid_args(self):
     """Call ``select_matcher`` with an empty ``table`` string and no ``target_attr``.

     NOTE(review): the method name suggests these arguments should be
     rejected, but nothing here asserts the expected failure -- confirm how
     it is checked.
     """
     bad_call = dict(matchers=[], table="", exclude_attrs=[])
     select_matcher(**bad_call)
 def test_select_matcher_invalid_args(self):
     """Call ``select_matcher`` with an empty ``table`` string and no ``target_attr``.

     NOTE(review): the method name suggests these arguments should be
     rejected, but nothing here asserts the expected failure -- confirm how
     it is checked.
     """
     bad_call = dict(matchers=[], table="", exclude_attrs=[])
     select_matcher(**bad_call)
示例#20
0
 def test_select_matcher_invalid_df(self):
     """Call ``select_matcher`` with an empty string passed as ``table``.

     NOTE(review): the method name suggests this input should be rejected,
     but nothing here asserts the expected failure -- confirm how it is
     checked.
     """
     bad_call = dict(matchers=[], table="", exclude_attrs=[], target_attr="")
     select_matcher(**bad_call)