def test_select_matcher_valid_3(self):
     A = read_csv_metadata(path_a, key='id')
     B = read_csv_metadata(path_b, key='id')
     # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
     #                       fk_rtable='rtable.id', key='_id')
     # labels = [0] * 7
     # labels.extend([1] * 8)
     # C['labels'] = labels
     # feature_table = get_features_for_matching(A, B)
     # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold')
     # feature_vectors.fillna(0, inplace=True)
     feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
     dtmatcher = DTMatcher()
     nbmatcher = NBMatcher()
     rfmatcher = RFMatcher()
     svmmatcher = SVMMatcher()
     linregmatcher = LinRegMatcher()
     logregmatcher = LogRegMatcher()
     matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher]
     col_list = list(feature_vectors.columns)
     l = list_diff(col_list, [cm.get_key(feature_vectors), cm.get_fk_ltable(feature_vectors),
                              cm.get_fk_rtable(feature_vectors),
                              'gold'])
     X = feature_vectors[l]
     Y = feature_vectors['gold']
     result = select_matcher(matchers, x=X, y=Y, metric='recall')
     header = ['Name', 'Matcher', 'Num folds']
     result_df = result['cv_stats']
     self.assertEqual(set(header) == set(list(result_df.columns[[0, 1, 2]])), True)
     self.assertEqual('Mean score', result_df.columns[len(result_df.columns) - 1])
     d = result_df.set_index('Name')
     p_max = d.ix[result['selected_matcher'].name, 'Mean score']
     a_max = pd.np.max(d['Mean score'])
     self.assertEqual(p_max, a_max)
    def test_select_matcher_target_attr_not_present(self):
        A = read_csv_metadata(path_a, key='id')
        B = read_csv_metadata(path_b, key='id')
        # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
        #                       fk_rtable='rtable.id', key='_id')
        # labels = [0] * 7
        # labels.extend([1] * 8)
        # C['labels'] = labels
        # feature_table = get_features_for_matching(A, B)
        # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold')
        # feature_vectors.fillna(0, inplace=True)
        feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
        dtmatcher = DTMatcher()
        nbmatcher = NBMatcher()
        rfmatcher = RFMatcher()
        svmmatcher = SVMMatcher()
        linregmatcher = LinRegMatcher()
        logregmatcher = LogRegMatcher()
        matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher]

        col_list = list(feature_vectors.columns)
        l = list_diff(col_list, [cm.get_fk_ltable(feature_vectors),
                                 cm.get_fk_rtable(feature_vectors)
                                 ])
        feature_vectors = feature_vectors[l]

        result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                                exclude_attrs='_id',
                                target_attr='labels1', k=2)
    def test_select_matcher_valid_1(self):
        A = read_csv_metadata(path_a, key='id')
        B = read_csv_metadata(path_b, key='id')
        # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
        #                       fk_rtable='rtable.id', key='_id')
        # C['labels'] = labels
        feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)
        dtmatcher = DTMatcher()
        nbmatcher = NBMatcher()
        rfmatcher = RFMatcher()
        svmmatcher = SVMMatcher()
        linregmatcher = LinRegMatcher()
        logregmatcher = LogRegMatcher()
        matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher]

        result = select_matcher(matchers, x=None, y=None, table=feature_vectors,
                                exclude_attrs=['ltable.id', 'rtable.id', '_id', 'gold'],
                                target_attr='gold', k=7)
        header = ['Name', 'Matcher', 'Num folds']
        result_df = result['cv_stats']
        self.assertEqual(set(header) == set(list(result_df.columns[[0, 1, 2]])), True)
        self.assertEqual('Mean score', result_df.columns[len(result_df.columns) - 1])
        d = result_df.set_index('Name')
        p_max = d.ix[result['selected_matcher'].name, 'Mean score']
        a_max = pd.np.max(d['Mean score'])
        self.assertEqual(p_max, a_max)
Пример #4
0
    def test_debug_rf_matcher_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        labels = [0] * 7
        labels.extend([1] * 8)
        C['labels'] = labels

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                               attrs_after='labels')
        rf = RFMatcher()
        rf.fit(table=feature_vectors, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')

        debug_randomforest_matcher(rf.clf, A.ix[1], B.ix[2], feat_table=feature_table,
                              fv_columns=feature_vectors.columns,
                              exclude_attrs=['ltable_ID', 'rtable_ID', '_id', 'labels'])
Пример #5
0
    def test_vis_tuple_debug_rf_matcher_valid_1(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        labels = [0] * 7
        labels.extend([1] * 8)
        C['labels'] = labels

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                               attrs_after='labels')

        rf = RFMatcher()
        rf.fit(table=feature_vectors, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
               target_attr='labels')
        s = pd.DataFrame(feature_vectors.ix[0])
        s1 = s.T
        vis_tuple_debug_rf_matcher(rf, s1,
                                   exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'])
Пример #6
0
    def test_valid_names_for_matchers(self):
        matchers1 = {
            "DT": DTMatcher(),
            "LinReg": LinRegMatcher(),
            "LogReg": LogRegMatcher(),
            "NB": NBMatcher(),
            "RF": RFMatcher(),
            "SVM": SVMMatcher()
        }

        matchers2 = {
            "DT": DTMatcher(name='temp'),
            "LinReg": LinRegMatcher(name='temp'),
            "LogReg": LogRegMatcher(name='temp'),
            "NB": NBMatcher(name='temp'),
            "RF": RFMatcher(name='temp'),
            "SVM": SVMMatcher(name='temp')
        }

        for m_name, matcher in six.iteritems(matchers1):
            self.assertEqual(isinstance(matcher.name, six.string_types), True)

        for m_name, matcher in six.iteritems(matchers2):
            self.assertEqual(matcher.name, 'temp')
Пример #7
0
    def test_vis_debug_matcher_rf_label_col_wi_sp_name(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        labels = [0] * 7
        labels.extend([1] * 8)
        C['_predicted'] = labels

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                               attrs_after='_predicted')

        rf = RFMatcher()
        train_test = mu.train_test_split(feature_vectors)

        train = train_test['train']
        test = train_test['test']
        _vis_debug_rf(rf, train, test,
                      exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
                      target_attr='_predicted', show_window=False)
Пример #8
0
    def test_vis_debug_matcher_rf_ex_attrs_notin_test(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')
        C = read_csv_metadata(path_c, ltable=A, rtable=B)
        labels = [0] * 7
        labels.extend([1] * 8)
        C['labels'] = labels

        feature_table = get_features_for_matching(A, B)
        feature_vectors = extract_feature_vecs(C, feature_table=feature_table,
                                               attrs_after='labels')

        rf = RFMatcher()
        train_test = mu.train_test_split(feature_vectors)

        train = train_test['train']
        test = train_test['test']
        test.drop('_id', inplace=True, axis=1)
        _vis_debug_rf(rf, train, test,
                      exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                      target_attr='labels', show_window=False)
Пример #9
0
 def test_vis_debug_matcher_rf_invalid_tar_attr(self):
     _vis_debug_rf(RFMatcher(), pd.DataFrame(), pd.DataFrame(),
                   exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'labels'],
                   target_attr=None, show_window=False)