예제 #1
0
def extract_features(ltable_df, rtable_df, candset_df):
    """Build a feature-vector table for the candidate pairs in ``candset_df``.

    The attribute types of corresponding left/right attributes are first
    harmonised, a feature table is generated with ``get_features``, features
    on the ``id`` attribute and features whose name contains one of the
    excluded function tags are dropped, and the remaining features are
    evaluated on ``candset_df``.

    Args:
        ltable_df: Left input table.
        rtable_df: Right input table.
        candset_df: Candidate set of pairs; the ``gold`` column is carried
            over into the result (``attrs_after='gold'``).

    Returns:
        The table returned by ``em.extract_feature_vecs`` with missing
        values replaced by 0.
    """
    tokenizers = em.get_tokenizers_for_matching()
    sim_functions = em.get_sim_funs_for_matching()
    left_attr_types = em.get_attr_types(ltable_df)
    right_attr_types = em.get_attr_types(rtable_df)
    correspondences = em.get_attr_corres(ltable_df, rtable_df)

    # Attribute types ordered by rank; a higher rank wins when the two sides
    # of a correspondence disagree.
    ordered_types = ('boolean', 'numeric', 'str_eq_1w', 'str_bt_1w_5w',
                     'str_bt_5w_10w', 'str_gt_10w', 'un_determined')
    attribute_type_rank = {
        type_name: rank
        for rank, type_name in enumerate(ordered_types, start=1)
    }
    for pair in correspondences['corres']:
        left_name, right_name = pair[0], pair[1]
        left_type = left_attr_types[left_name]
        right_type = right_attr_types[right_name]
        if left_type == right_type:
            continue
        if attribute_type_rank[left_type] < attribute_type_rank[right_type]:
            left_attr_types[left_name] = right_type
        else:
            right_attr_types[right_name] = left_type

    feature_records = get_features(ltable_df, rtable_df, left_attr_types,
                                   right_attr_types, correspondences,
                                   tokenizers, sim_functions)
    # Features computed on the id attribute are often useless: drop them.
    is_not_id = feature_records.left_attribute != 'id'
    feature_records = feature_records[is_not_id].reset_index(drop=True)

    # Distance functions followed by non-normalized similarity functions;
    # any feature whose name contains one of these tags is discarded.
    excluded_tags = ("lev_dist", "rdf", "aff", "sw", "swn", "nmw")
    keep_features = [
        not any(tag in feature_name for tag in excluded_tags)
        for feature_name in feature_records["feature_name"]
    ]
    feature_records = feature_records.loc[keep_features, :]

    print("\n\nExtracting the full set of features:")
    candset_features_df = em.extract_feature_vecs(
        candset_df,
        feature_table=feature_records,
        attrs_after='gold',
        show_progress=True,
        n_jobs=-1)
    # Filled in place so the very object returned by em is handed back.
    candset_features_df.fillna(value=0, inplace=True)

    return candset_features_df
예제 #2
0
# Split the labeled pairs into a development set (train_proportion=0.7) and
# an evaluation set, then persist both splits as CSV via em.to_csv_metadata.
train_test = em.split_train_test(tbl_labeled, train_proportion=0.7)

dev_set = train_test['train']
eval_set = train_test['test']
em.to_csv_metadata(dev_set, 'datasets/dev_set.csv')
em.to_csv_metadata(eval_set, 'datasets/eval_set.csv')

# Disabled: a further 0.9 / 0.1 split of the development set into I and J.
# myset = em.split_train_test(dev_set, train_proportion=0.9)
# I_set = myset['train']
# J_set = myset['test']
# em.to_csv_metadata(I_set, 'datasets/I_set.csv')
# em.to_csv_metadata(J_set, 'datasets/J_set.csv')

# Build the feature table used for matching from the tokenizers, similarity
# functions, per-table attribute types and attribute correspondences of the
# two sampled input tables.
match_t = em.get_tokenizers_for_matching()
match_s = em.get_sim_funs_for_matching()
atypes1 = em.get_attr_types(sampled_movies)
atypes2 = em.get_attr_types(sampled_tracks)
match_c = em.get_attr_corres(sampled_movies, sampled_tracks)
match_f = em.get_features(sampled_movies, sampled_tracks, atypes1, atypes2,
                          match_c, match_t, match_s)

# Evaluate every feature on the development pairs; the 'label' column is
# carried over into H (attrs_after='label').
H = em.extract_feature_vecs(dev_set,
                            feature_table=match_f,
                            attrs_after='label',
                            show_progress=False)

# Replace missing feature values with 0, in place.
# NOTE(review): presumably in place so that H stays the object em returned
# (and any metadata em keeps for it) - confirm.
H.fillna(value=0, inplace=True)
def run_magellan(train_set,
                 valid_set,
                 test_set,
                 feature_combinations,
                 classifiers,
                 experiment_name,
                 write_test_set_for_inspection=False):
    train_path = os.path.dirname(train_set)
    train_file = os.path.basename(train_set)
    test_path = os.path.dirname(test_set)
    test_file = os.path.basename(test_set)
    report_train_name = train_file.replace('.csv', '')
    report_test_name = test_file.replace('.csv', '')

    train_set_left = train_file.replace('pairs', 'left')
    train_set_right = train_file.replace('pairs', 'right')

    test_set_left = test_file.replace('pairs', 'left')
    test_set_right = test_file.replace('pairs', 'right')

    os.makedirs(os.path.dirname(
        '../../../reports/magellan/{}/'.format(experiment_name)),
                exist_ok=True)

    try:
        os.remove('../../../reports/magellan/{}/{}_{}.csv'.format(
            experiment_name, report_train_name, report_test_name))
    except OSError:
        pass

    with open(
            '../../../reports/magellan/{}/{}_{}.csv'.format(
                experiment_name, report_train_name, report_test_name),
            "w") as f:
        f.write(
            'feature#####model#####mean_train_score#####std_train_score#####mean_valid_score#####std_valid_score#####precision_test#####recall_test#####f1_test#####best_params#####train_time#####prediction_time#####feature_importance#####experiment_name#####train_set#####test_set\n'
        )

    for run in range(1, 4):
        for feature_combination in feature_combinations:

            A_t = em.read_csv_metadata(train_path + '/' + train_set_left,
                                       key='mag_id')
            B_t = em.read_csv_metadata(train_path + '/' + train_set_right,
                                       key='mag_id')
            # Load the pre-labeled data
            S_t = em.read_csv_metadata(train_set,
                                       key='_id',
                                       ltable=A_t,
                                       rtable=B_t,
                                       fk_ltable='ltable_mag_id',
                                       fk_rtable='rtable_mag_id')

            A_gs = em.read_csv_metadata(test_path + '/' + test_set_left,
                                        key='mag_id')
            B_gs = em.read_csv_metadata(test_path + '/' + test_set_right,
                                        key='mag_id')
            # Load the pre-labeled data
            S_gs = em.read_csv_metadata(test_set,
                                        key='_id',
                                        ltable=A_gs,
                                        rtable=B_gs,
                                        fk_ltable='ltable_mag_id',
                                        fk_rtable='rtable_mag_id')

            A_t.fillna('', inplace=True)
            A_gs.fillna('', inplace=True)

            B_t.fillna('', inplace=True)
            B_gs.fillna('', inplace=True)

            S_t.fillna('', inplace=True)
            S_gs.fillna('', inplace=True)

            ## DIRTY FIX, CLEAN UP!
            if 'name' in A_t.columns:
                A_t["price"] = A_t["price"].replace(r'^\s*$',
                                                    np.nan,
                                                    regex=True)
                A_t["price"] = A_t["price"].astype('float64')
                A_gs["price"] = A_gs["price"].replace(r'^\s*$',
                                                      np.nan,
                                                      regex=True)
                A_gs["price"] = A_gs["price"].astype('float64')
                B_t["price"] = B_t["price"].replace(r'^\s*$',
                                                    np.nan,
                                                    regex=True)
                B_t["price"] = B_t["price"].astype('float64')
                B_gs["price"] = B_gs["price"].replace(r'^\s*$',
                                                      np.nan,
                                                      regex=True)
                B_gs["price"] = B_gs["price"].astype('float64')

                S_t["ltable_price"] = S_t["ltable_price"].replace(r'^\s*$',
                                                                  np.nan,
                                                                  regex=True)
                S_t["ltable_price"] = S_t["ltable_price"].astype('float64')
                S_t["rtable_price"] = S_t["rtable_price"].replace(r'^\s*$',
                                                                  np.nan,
                                                                  regex=True)
                S_t["rtable_price"] = S_t["rtable_price"].astype('float64')

                S_gs["ltable_price"] = S_gs["ltable_price"].replace(r'^\s*$',
                                                                    np.nan,
                                                                    regex=True)
                S_gs["ltable_price"] = S_gs["ltable_price"].astype('float64')
                S_gs["rtable_price"] = S_gs["rtable_price"].replace(r'^\s*$',
                                                                    np.nan,
                                                                    regex=True)
                S_gs["rtable_price"] = S_gs["rtable_price"].astype('float64')

            atypes1 = em.get_attr_types(A_t)
            atypes2 = em.get_attr_types(B_t)

            match_c = em.get_attr_corres(A_t, B_t)

            match_c['corres'] = []

            # select attributes to compare
            for feature in feature_combination:
                match_c['corres'].append((feature, feature))

            tok = em.get_tokenizers_for_matching()
            sim = em.get_sim_funs_for_matching()

            F_t = em.get_features(A_t, B_t, atypes1, atypes2, match_c, tok,
                                  sim)

            H_t = em.extract_feature_vecs(S_t,
                                          feature_table=F_t,
                                          attrs_after=['label', 'pair_id'],
                                          show_progress=False)
            H_gs = em.extract_feature_vecs(S_gs,
                                           feature_table=F_t,
                                           attrs_after='label',
                                           show_progress=False)

            H_t = H_t.fillna(-1)
            H_gs = H_gs.fillna(-1)

            validation_ids_df = pd.read_csv(valid_set)
            val_df = H_t[H_t['pair_id'].isin(
                validation_ids_df['pair_id'].values)]
            train_only_df = H_t[~H_t['pair_id'].
                                isin(validation_ids_df['pair_id'].values)]

            train_only_df = train_only_df.drop(columns='pair_id')
            val_df = val_df.drop(columns='pair_id')

            train_only_df = train_only_df.sample(frac=1, random_state=42)

            pos_neg = H_t['label'].value_counts()
            pos_neg = round(pos_neg[0] / pos_neg[1])

            train_ind = []
            val_ind = []

            for i in range(len(train_only_df) - 1):
                train_ind.append(-1)

            for i in range(len(val_df) - 1):
                val_ind.append(0)

            ps = PredefinedSplit(test_fold=np.concatenate((train_ind,
                                                           val_ind)))

            train_df = pd.concat([train_only_df, val_df])

            for k, v in classifiers.items():

                classifier = v['clf']
                if 'random_state' in classifier.get_params().keys():
                    classifier = classifier.set_params(**{'random_state': run})

                # add pos_neg ratio to XGBoost params
                if k == 'XGBoost':
                    v['params']['scale_pos_weight']: [1, pos_neg]

                model = RandomizedSearchCV(cv=ps,
                                           estimator=classifier,
                                           param_distributions=v['params'],
                                           random_state=42,
                                           n_jobs=4,
                                           scoring='f1',
                                           n_iter=500,
                                           pre_dispatch=8,
                                           return_train_score=True)

                feats_train = train_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_train = train_df['label']
                feats_gs = H_gs.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_gs = H_gs['label']

                try:
                    model.fit(feats_train, labels_train)
                except ValueError:
                    set_trace()

                parameters = model.best_params_

                score_names = [
                    'mean_train_score', 'std_train_score', 'mean_test_score',
                    'std_test_score'
                ]
                scores = {}
                score_string = ''
                for name in score_names:
                    scores[name] = model.cv_results_[name][model.best_index_]
                    score_string = score_string + name + ': ' + str(
                        scores[name]) + ' '

                feature_names = list(feats_train.columns)

                if k == 'LogisticRegression' or k == 'LinearSVC':
                    most_important_features = model.best_estimator_.coef_
                    word_importance = zip(feature_names,
                                          most_important_features[0].tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                if k == 'RandomForest' or k == 'DecisionTree':
                    most_important_features = model.best_estimator_.feature_importances_
                    word_importance = zip(feature_names,
                                          most_important_features.tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)
                if k == 'NaiveBayes':
                    word_importance = ''
                if k == 'XGBoost':
                    most_important_features = model.best_estimator_.feature_importances_
                    word_importance = zip(feature_names,
                                          most_important_features.tolist())
                    word_importance = sorted(
                        word_importance,
                        key=lambda importance: importance[1],
                        reverse=True)

                if k == 'LogisticRegression':
                    learner = LogisticRegression(random_state=run,
                                                 solver='liblinear',
                                                 **parameters)
                elif k == 'NaiveBayes':
                    learner = GaussianNB()
                elif k == 'DecisionTree':
                    learner = DecisionTreeClassifier(random_state=run,
                                                     **parameters)
                elif k == 'LinearSVC':
                    learner = LinearSVC(random_state=run,
                                        dual=False,
                                        **parameters)
                elif k == 'RandomForest':
                    learner = RandomForestClassifier(random_state=run,
                                                     n_jobs=4,
                                                     **parameters)
                elif k == 'XGBoost':
                    learner = xgb.XGBClassifier(random_state=run,
                                                n_jobs=4,
                                                **parameters)
                else:
                    print('Learner is not a valid option')
                    break

                model = learner
                feats_train = train_only_df.sample(frac=1, random_state=42)
                feats_train = train_only_df.drop(
                    ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1)
                labels_train = train_only_df['label']

                start = time.time()
                model.fit(feats_train, labels_train)
                end = time.time()

                train_time = end - start

                start = time.time()
                preds_gs = model.predict(feats_gs)

                end = time.time()

                pred_time = end - start

                gs_report = classification_report(labels_gs,
                                                  preds_gs,
                                                  output_dict=True)

                feature_report = '+'.join(feature_combination)

                if write_test_set_for_inspection:

                    out_path = '../../../data/processed/wdc-lspc/inspection/{}/magellan/'.format(
                        experiment_name)
                    os.makedirs(os.path.dirname(out_path), exist_ok=True)

                    file_name = '_'.join([
                        os.path.basename(train_set),
                        os.path.basename(test_set), k, feature_report
                    ])
                    file_name = file_name.replace('.csv', '')
                    file_name += f'_{run}.pkl.gz'

                    test_inspection_df = S_gs.copy()
                    if k == 'LinearSVC':
                        proba_gs = model.decision_function(feats_gs).tolist()
                    else:
                        proba_gs = model.predict_proba(feats_gs).tolist()
                    test_inspection_df['pred'] = preds_gs
                    test_inspection_df['Class Prob'] = proba_gs
                    test_inspection_df.to_pickle(out_path + file_name,
                                                 compression='gzip')

                with open(
                        '../../../reports/magellan/{}/{}_{}.csv'.format(
                            experiment_name, report_train_name,
                            report_test_name), "a") as f:
                    f.write(feature_report + '#####' + k + '#####' +
                            str(scores['mean_train_score']) + '#####' +
                            str(scores['std_train_score']) + '#####' +
                            str(scores['mean_test_score']) + '#####' +
                            str(scores['std_test_score']) + '#####' +
                            str(gs_report['1']['precision']) + '#####' +
                            str(gs_report['1']['recall']) + '#####' +
                            str(gs_report['1']['f1-score']) + '#####' +
                            str(parameters) + '#####' + str(train_time) +
                            '#####' + str(pred_time) + '#####' +
                            str(word_importance[0:100]) + '#####' +
                            experiment_name + '#####' + report_train_name +
                            '#####' + report_test_name + '\n')
예제 #4
0
파일: helper.py 프로젝트: hzhang0418/al4em
'''
helper.py
'''
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

import py_entitymatching as em

# Module-level tables of the tokenizers (requested with q values 2 through 5)
# and similarity functions that py_entitymatching offers for matching.
# NOTE(review): presumably dicts mapping a name to a callable, as the
# variable names suggest - confirm against the py_entitymatching docs.
tok_name2func = em.get_tokenizers_for_matching(q=[2, 3, 4, 5])
sim_name2func = em.get_sim_funs_for_matching()


def get_features_in_random_forest(rf: RandomForestClassifier) -> list:
    """Return the distinct feature indices found in any tree of ``rf``.

    Every entry of ``tree_.feature`` of every estimator in ``rf.estimators_``
    is collected, except entries equal to -2 (presumably the marker
    scikit-learn stores for nodes without a split - confirm).
    """
    used_features = {
        feature_index
        for estimator in rf.estimators_
        for feature_index in estimator.tree_.feature
        if feature_index != -2
    }
    return list(used_features)


def get_features_in_decision_tree(tree: DecisionTreeClassifier) -> list:
    """Return the distinct feature indices found in ``tree.tree_.feature``.

    Entries equal to -2 are skipped (presumably the marker scikit-learn
    stores for nodes without a split - confirm).
    """
    used_features = {
        feature_index
        for feature_index in tree.tree_.feature
        if feature_index != -2
    }
    return list(used_features)