Python RFMatcher 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: py_entitymatching

메소드/함수: RFMatcher

hotexamples.com에서의 예제들: 9

Python RFMatcher - 9개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 py_entitymatching.RFMatcher의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: matcher.py 프로젝트: is4ac/cs839-data-science

def main():
    """Cross-validate, train and evaluate six learning-based matchers.

    Reads tables A and B and the labeled pairs G from FOLDER, splits G in
    half into I (train) and J (test), writes both halves back to FOLDER,
    then prints the cross-validation statistics and one evaluation summary
    per matcher.
    """
    # Source tables and the labeled candidate pairs that reference them
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    G = em.read_csv_metadata(
        FOLDER + 'G.csv',
        key='_id',
        ltable=A,
        rtable=B,
        fk_ltable='l_id',
        fk_rtable='r_id',
    )  # labeled data

    # 50/50 split of G; both halves are persisted next to the inputs
    halves = em.split_train_test(G, train_proportion=0.5, random_state=0)
    train_pairs, test_pairs = halves['train'], halves['test']
    train_pairs.to_csv(FOLDER + 'I.csv', index=False)
    test_pairs.to_csv(FOLDER + 'J.csv', index=False)

    # Automatically generated feature set
    features = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    # Columns that are never used as model inputs
    non_feature_cols = ['_id', 'l_id', 'r_id', 'label']

    def build_feature_vectors(pairs):
        # Feature vectors for `pairs`, missing values replaced by column mean
        vectors = em.extract_feature_vecs(
            pairs,
            feature_table=features,
            attrs_after='label',
            show_progress=False,
        )
        return em.impute_table(
            vectors, exclude_attrs=non_feature_cols, strategy='mean')

    train_vectors = build_feature_vectors(train_pairs)

    # Candidate matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')

    # 10-fold cross-validation, ranked by F1
    cv_outcome = em.select_matcher(
        [dt, rf, svm, ln, lg, nb],
        table=train_vectors,
        exclude_attrs=non_feature_cols,
        k=10,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0,
    )
    print(cv_outcome['cv_stats'])  # RF is the best matcher

    # Order in which the matchers are trained, applied and reported
    run_order = [dt, rf, svm, lg, ln, nb]

    for matcher in run_order:
        matcher.fit(table=train_vectors,
                    exclude_attrs=non_feature_cols,
                    target_attr='label')

    test_vectors = build_feature_vectors(test_pairs)

    # Each matcher appends its own 'predicted' column to a copy of the table
    predicted_tables = [
        matcher.predict(
            table=test_vectors,
            exclude_attrs=non_feature_cols,
            append=True,
            target_attr='predicted',
            inplace=False,
            return_probs=False,
            probs_attr='proba',
        )
        for matcher in run_order
    ]

    # Compare every 'predicted' column against the gold 'label' column
    for predicted in predicted_tables:
        summary = em.eval_matches(predicted, 'label', 'predicted')
        em.print_eval_summary(summary)

예제 #2

파일 보기

# Feature table for matching, built from the inputs prepared earlier
match_f = em.get_features(
    sampled_movies, sampled_tracks,
    atypes1, atypes2,
    match_c, match_t, match_s,
)

# Feature vectors for the development set, gold label kept as last column
H = em.extract_feature_vecs(
    dev_set, feature_table=match_f, attrs_after='label', show_progress=False)

# Missing feature values become 0 (the table is modified in place)
H.fillna(0, inplace=True)

# Learning-based matchers to compare
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

# 5-fold cross-validation of all matchers, compared on precision
result_p = em.select_matcher([dt, svm, rf, lg, ln, nb],
                             table=H,
                             exclude_attrs=['_id', 'ltable_id', 'rtable_id',
                                            'label'],
                             k=5,
                             target_attr='label',
                             metric='precision',
                             random_state=0)

예제 #3

파일 보기

파일: stage3.py 프로젝트: lbai94/CS839

# Input tables, both keyed on 'ID'
A = em.read_csv_metadata('../Data/amazon.csv', key='ID')
B = em.read_csv_metadata('../Data/Barnob.csv', key='ID')

# Labeled pairs, linked to A and B through their foreign-key columns
G = em.read_csv_metadata(
    '../Data/Label.csv',
    key='_id',
    ltable=A,
    rtable=B,
    fk_ltable='ltable_ID',
    fk_rtable='rtable_ID',
)

# 60% of the labeled pairs go to I (train), the rest to J (test)
IJ = em.split_train_test(G, train_proportion=0.6, random_state=0)
I, J = IJ['train'], IJ['test']

# Learning-based matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
rf = em.RFMatcher(name='Random Forest', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
nb = em.NBMatcher(name='Naive Bayes')
lg = em.LogRegMatcher(name='Logistic Reg', random_state=0)
ln = em.LinRegMatcher(name='Linear Reg')

# Automatically generated features and the feature vectors of I
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
H = em.extract_feature_vecs(
    I, feature_table=F, attrs_after='gold_labels', show_progress=False)

# Replace missing feature values by the column mean
H = em.impute_table(
    H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
    strategy='mean',
)

예제 #4

파일 보기

def _load_split(sampler, split):
    """Read one split ("train", "valid" or "test") and key its two tables.

    The lhs/rhs tables are read with py_entitymatching, the labels with
    pandas. The unnamed index column of each table is renamed to
    ``id_lhs`` / ``id_rhs`` and registered as that table's key.

    Returns:
        Tuple ``(lhs_table, rhs_table, labels)``.
    """
    prefix = "../data/processed_amazon_google/amz_google_" + sampler
    em.del_catalog()
    lhs_table = em.read_csv_metadata(
        prefix + "_X_" + split + "_lhs.csv").rename(
            columns={"Unnamed: 0": "id_lhs"})
    rhs_table = em.read_csv_metadata(
        prefix + "_X_" + split + "_rhs.csv").rename(
            columns={"Unnamed: 0": "id_rhs"})
    labels = pd.read_csv(prefix + "_y_" + split + ".csv")
    # rename() returned new frames; reset the catalog and key those frames
    em.del_catalog()
    em.set_key(lhs_table, "id_lhs")
    em.set_key(rhs_table, "id_rhs")
    return lhs_table, rhs_table, labels


def _block_candidates(lhs_table, rhs_table, blocking, lsh_args,
                      sequential_args, feature_cols):
    """Run the selected blocker on one split and return its candidate pairs."""
    blocking_cols = ["title_amzn", "title_g"]
    id_names = ["id_amzn", "id_g"]
    if blocking == "lsh":
        # NOTE(review): the original comment read "[1,2] hashes on title and
        # description" while the column id passed is 1 - confirm the meaning
        # against lsh_blocking.
        return lsh_blocking(lhs_table,
                            rhs_table,
                            1,
                            5,
                            id_names,
                            char_ngram=lsh_args["char_ngram"],
                            seeds=lsh_args["seeds"],
                            bands=lsh_args["bands"])
    if blocking == "sequential":
        # Initial rough blocking on overlapped attributes
        candidates = overlapped_attribute_blocking(
            lhs_table, rhs_table, blocking_cols,
            sequential_args["min_shared_tokens"], feature_cols, id_names)
        # Fine grained blocking on edit distance
        return edit_distance_blocking(None, None, blocking_cols,
                                      sequential_args["cutoff_distance"],
                                      True, candidates)
    raise ValueError("Blocking must be lsh or sequential")


def _featurize(candidates, labels, feature_cols, columns=None):
    """Generate features for the candidate pairs and attach the label ``y``.

    Labels are left-joined on the id pair; ``1.0`` becomes ``1`` and a
    missing label becomes ``0``. When ``columns`` is given, the result is
    restricted to those columns and remaining NaNs are replaced by 0.
    """
    id_names = ["id_amzn", "id_g"]
    id_names_phrase = ["_amzn", "_g"]  # Trims these suffixes from id columns
    generated = automatic_feature_gen(candidates, feature_cols, id_names,
                                      id_names_phrase)
    generated = pd.merge(generated,
                         labels,
                         left_on=id_names,
                         right_on=id_names,
                         how="left")
    generated.y = generated.y.map({1.0: int(1), np.nan: 0})
    if columns is not None:
        generated = generated.loc[:, columns]
        # TODO: think of a better idea!! Needed because every generated data
        # set is forced to have the same columns as the training set.
        generated = generated.fillna(0)
    return generated


def run_magellan_models(sampler,
                        blocking,
                        lsh_args,
                        sequential_args,
                        return_prob_estimates=True):
    '''
    1. Loads data from processed folder of dataset choice.
    2. Performs blocking according to given hyper-parameters
    3. For every given blocking set, generate automatic features
    4. Run suite of shallow learning algorithms on candidate sets

    Inputs:
            sampler: sampling technique that was used to generate data: "iterative" or "naive"
            blocking: blocking algorithm used: "lsh" or "sequential"
            lsh_args: dictionary with keys char_ngram, seeds, bands (read when blocking is "lsh")
            sequential_args: dictionary with keys cutoff_distance, min_shared_tokens
                             (read when blocking is "sequential")
            return_prob_estimates: forwarded to every matcher's predict() as return_probs
    Outputs:
            training_pred_dict, validation_pred_dict, test_pred_dict,
            pre_blocked_all_sets_labels, post_blocked_all_sets_labels, metadata

            metadata is a copy of the blocking arguments that were used, extended
            with n_train, n_valid, n_test, sampler and blocking. The dictionaries
            passed in by the caller are not modified.
    Raises:
            ValueError: if sampler or blocking is not one of the supported values.
    '''
    # Fail fast, before the catalog is wiped or any file is read
    if sampler not in ("iterative", "naive"):
        raise ValueError(
            "Sampler should be iterative or naive (completely random).")
    if blocking not in ("lsh", "sequential"):
        raise ValueError("Blocking must be lsh or sequential")

    # NOTE(review): the training split is blocked with the manufacturer
    # columns included while validation/test are blocked with the trimmed
    # list below. Kept exactly as before - confirm whether this is intended.
    train_blocking_feature_cols = [[
        'title_amzn', 'description_amzn', 'manufacturer_amzn', 'price_amzn'
    ], ['title_g', 'description_g', 'manufacturer_g', 'price_g']]
    feature_cols = [
        [
            'title_amzn',
            'description_amzn',  # removed manufacturer due to missingness: produces features with nans
            'price_amzn'
        ],
        ['title_g', 'description_g', 'price_g']
    ]
    fit_exclude = [
        'index', 'id_amzn', 'id_g', 'index_num_lhs', 'index_num_rhs'
    ]
    predict_exclude = fit_exclude + ["y"]

    # ---- Training set ----
    lhs_table, rhs_table, y_train = _load_split(sampler, "train")
    n_train = lhs_table.shape[0]
    print("Blocking Train Set")
    candidates = _block_candidates(lhs_table, rhs_table, blocking, lsh_args,
                                   sequential_args,
                                   train_blocking_feature_cols)
    print(f"Generated Candidate size has {candidates.shape[0]} rows")
    generated_df_train = _featurize(candidates, y_train, feature_cols)

    # Store training column names. Ensures that if by chance a new column is
    # generated in the validation or test phase, it will be ignored.
    model_features = generated_df_train.columns

    # If only one class survives blocking, a matcher cannot be trained; the
    # blocker's output is then used as the prediction instead.
    train_matchers = len(generated_df_train.y.unique()) > 1
    if not train_matchers:
        print(
            "Train Candidate Pairs only consist of one class. Skipping matcher training and setting blocker as a matcher."
        )

    models = []
    if train_matchers:
        models = [
            em.RFMatcher(name='RF', random_state=0),
            em.LogRegMatcher(name='LogReg', random_state=0),
            em.XGBoostMatcher(name="Xg-Boost", random_state=0),
        ]
        for model in models:
            model.fit(table=generated_df_train,
                      exclude_attrs=list(fit_exclude),
                      target_attr='y')

        training_predictions = {}
        for model in models:
            training_predictions[model.name] = model.predict(
                table=generated_df_train,
                exclude_attrs=list(predict_exclude),
                return_probs=return_prob_estimates)

    # ---- Validation set ----
    lhs_table, rhs_table, y_valid = _load_split(sampler, "valid")
    n_valid = lhs_table.shape[0]
    print("Blocking Validation Set")
    candidates = _block_candidates(lhs_table, rhs_table, blocking, lsh_args,
                                   sequential_args, feature_cols)
    generated_df_valid = _featurize(candidates, y_valid, feature_cols,
                                    columns=model_features)

    if train_matchers:
        validation_predictions = {}
        for model in models:
            validation_predictions[model.name] = model.predict(
                table=generated_df_valid,
                exclude_attrs=list(predict_exclude),
                return_probs=return_prob_estimates)

        # Retrain on train + validation before predicting on the test set
        generated_final_train = pd.concat(
            [generated_df_train, generated_df_valid], axis=0)
        for model in models:
            model.fit(table=generated_final_train,
                      exclude_attrs=list(fit_exclude),
                      target_attr='y')

    # ---- Test set ----
    lhs_table, rhs_table, y_test = _load_split(sampler, "test")
    n_test = lhs_table.shape[0]
    print("Blocking Test Set")
    candidates = _block_candidates(lhs_table, rhs_table, blocking, lsh_args,
                                   sequential_args, feature_cols)
    generated_df_test = _featurize(candidates, y_test, feature_cols,
                                   columns=model_features)

    if train_matchers:
        test_predictions = {}
        for model in models:
            print(model.name)
            test_predictions[model.name] = model.predict(
                table=generated_df_test,
                exclude_attrs=list(predict_exclude),
                return_probs=return_prob_estimates)

    # Ground truth before blocking (all labels) and after blocking (labels of
    # the surviving candidate pairs only)
    pre_blocked_all_sets_labels = {
        "train": y_train,
        "valid": y_valid,
        "test": y_test
    }
    post_blocked_all_sets_labels = {
        "train": generated_df_train[["id_amzn", "id_g", "y"]],
        "valid": generated_df_valid[["id_amzn", "id_g", "y"]],
        "test": generated_df_test[["id_amzn", "id_g", "y"]]
    }

    # Copy so that the bookkeeping keys added below do not leak into the
    # caller's dictionary (which may be reused for the next experiment)
    metadata = dict(lsh_args if blocking == "lsh" else sequential_args)

    rule = "-----------------------------------------------------------------------------"
    print(rule)
    print(
        f"Finished Experiment using {sampler} and {blocking} with params: {metadata} where train_matchers is: {train_matchers}"
    )
    print(rule)

    # Add in sample sizes
    metadata["n_train"] = n_train
    metadata["n_valid"] = n_valid
    metadata["n_test"] = n_test
    metadata["sampler"] = sampler
    metadata["blocking"] = blocking

    # Without trained matchers, fall back to predictions via the blocker
    if not train_matchers:
        training_predictions, validation_predictions, test_predictions = blocker_as_matcher(
            n_train, n_valid, n_test)

    return (training_predictions, validation_predictions, test_predictions,
            pre_blocked_all_sets_labels, post_blocked_all_sets_labels,
            metadata)

예제 #5

파일 보기

파일: EntityMatching.py 프로젝트: ShoyiRen/cs839-project

# Save Set I
#em.to_csv_metadata(I, './TableI.csv')
# Save Set J
#em.to_csv_metadata(J, './TableJ.csv')

# Automatic feature generation
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
H = em.extract_feature_vecs(I, feature_table=F, attrs_after=['gold_labels'])

# Fill missing feature values with the column mean. The previous
# fillna(value='NaN') wrote the *string* 'NaN' into the feature columns, which
# imputes nothing and leaves non-numeric values in the matcher inputs.
H = em.impute_table(
    H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
    strategy='mean')

# Create ML matchers
dt = em.DTMatcher(name='DecisionTree')
svm = em.SVMMatcher(name='SVM')
rf = em.RFMatcher(name='RandomForest')
lg = em.LogRegMatcher(name='LogisticRegression')
ln = em.LinRegMatcher(name='LinearRegression')
nb = em.NBMatcher(name='NaiveBayes')

# Select the best matcher by 5-fold cross-validation on F1
result = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
    k=5,
    target_attr='gold_labels',
    metric_to_select_matcher='f1')
print(result['cv_stats'])
best_matcher = result['selected_matcher']

# Evaluate the matcher

예제 #6

파일 보기

파일: solution.py 프로젝트: dyyfk/cs639_spring19

def main():
    """Block, label, train a random forest and write predictions to CSV.

    Blocks ltable.csv against rtable.csv on title overlap, labels a sample
    of 450 candidate pairs, trains an RFMatcher on the sample and predicts
    every candidate pair. The candidate ids and predicted labels are written
    to ./prediction2.csv.
    """
    A = em.read_csv_metadata('ltable.csv',
                             key="ltable_id",
                             encoding='ISO-8859-1')
    B = em.read_csv_metadata('rtable.csv',
                             key="rtable_id",
                             encoding='ISO-8859-1')

    # Attributes carried from both tables into the candidate set
    table_attrs = ['title', 'category', 'brand', 'modelno', 'price']

    ob = em.OverlapBlocker()
    C = ob.block_tables(A,
                        B,
                        'title',
                        'title',
                        l_output_attrs=list(table_attrs),
                        r_output_attrs=list(table_attrs),
                        overlap_size=1,
                        show_progress=False)
    S = em.sample_table(C, 450)

    # NOTE(review): this table is read but immediately replaced by the
    # labeling of S below, so train.csv never reaches the matcher. Kept as-is;
    # confirm which labeled set is intended.
    G = em.read_csv_metadata("train.csv",
                             key='id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='ltable_id',
                             fk_rtable='rtable_id')
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    G = em.label_table(S, 'label')

    # Raw attribute columns that are kept in the feature-vector tables but
    # must not be used as model inputs
    attrs_from_table = (['ltable_' + attr for attr in table_attrs] +
                        ['rtable_' + attr for attr in table_attrs])
    key_attrs = ['_id', 'ltable_ltable_id', 'rtable_rtable_id']

    H = em.extract_feature_vecs(G,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                attrs_after='label',
                                show_progress=False)
    # Fill with the number 0, not the string '0', so features stay numeric
    H.fillna(0, inplace=True)
    #     H = em.impute_table(
    #         H, exclude_attrs=['_id', 'ltable_ltable_id', 'rtable_rtable_id','label'], strategy='mean')
    rf = em.RFMatcher()
    rf.fit(table=H,
           exclude_attrs=key_attrs + ['label'] + attrs_from_table,
           target_attr='label')

    L = em.extract_feature_vecs(C,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                show_progress=False,
                                n_jobs=-1)
    # Same missing-value treatment as the training vectors
    L.fillna(0, inplace=True)

    predictions = rf.predict(table=L,
                             exclude_attrs=key_attrs + attrs_from_table,
                             append=True,
                             target_attr='predicted',
                             inplace=False)

    # predict() stores its output under target_attr ('predicted'); the
    # candidate pairs are identified by their '_id' key column.
    dataset = pd.DataFrame({
        "id": predictions['_id'],
        'label': predictions['predicted']
    })
    dataset.to_csv("./prediction2.csv", index=False)

예제 #7

파일 보기

파일: EntityMatching_e2e.py 프로젝트: jatinarora2409/EntityMatchingResearch

# Automatically generated matching features for tables A and B
feature_table = em.get_features_for_matching(
    A, B, validate_inferred_attr_types=False)

# Raw attributes to carry along in the feature vector tables
attrs_from_table = [
    'ltable_name', 'ltable_addr', 'ltable_city', 'ltable_phone',
    'rtable_name', 'rtable_addr', 'rtable_city', 'rtable_phone',
]

# Feature vectors of the labeled pairs, gold label appended as last column
H = em.extract_feature_vecs(
    G,
    feature_table=feature_table,
    attrs_before=attrs_from_table,
    attrs_after='gold',
    show_progress=False,
)

rf = em.RFMatcher()

# Everything that is not a feature: keys, the label and the raw attributes
attrs_to_be_excluded = (['_id', 'ltable_id', 'rtable_id', 'gold'] +
                        attrs_from_table)

rf.fit(table=H, exclude_attrs=attrs_to_be_excluded, target_attr='gold')

# Feature vectors of the full candidate set C
L = em.extract_feature_vecs(
    C,
    feature_table=feature_table,
    attrs_before=attrs_from_table,
    show_progress=False,
    n_jobs=-1,
)

# Start a fresh exclusion list for the unlabeled table (no 'gold' column)
attrs_to_be_excluded = ['_id', 'ltable_id', 'rtable_id']

예제 #8

파일 보기

파일: matching_workflow.py 프로젝트: pjmartinkus/CS_839_DataScience

def workflow(path_A, path_B, path_labeled):
    """Block two tables, train a random forest and return predicted matches.

    Args:
        path_A: CSV path of the left table (key column 'ID').
        path_B: CSV path of the right table (key column 'ID').
        path_labeled: CSV path of the labeled pairs with a 'gold' column.

    Returns:
        The rows of the blocked candidate set whose 'predicted' value is 1,
        restricted to the columns '_id', 'ltable_ID', 'rtable_ID' and
        'predicted'.
    """
    # Both input tables are keyed on 'ID'
    A = em.read_csv_metadata(path_A, key='ID')
    B = em.read_csv_metadata(path_B, key='ID')

    # Attributes copied from each table into the candidate set
    carried_attrs = [
        'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
        'Hard Drive Capacity', 'Processor Type', 'Processor Speed',
        'Operating System', 'Clean Name'
    ]

    # Stage 1: keep only pairs whose 'Brand' values are equal
    brand_blocker = em.AttrEquivalenceBlocker()
    brand_candidates = brand_blocker.block_tables(
        A,
        B,
        'Brand',
        'Brand',
        l_output_attrs=list(carried_attrs),
        r_output_attrs=list(carried_attrs))

    # Stage 2: rule on the 3-gram Jaccard score of the 'Clean Name' columns
    blocking_features = em.get_features_for_blocking(
        A, B, validate_inferred_attr_types=False)
    rule_blocker = em.RuleBasedBlocker()
    rule_blocker.add_rule(
        ['Clean_Name_Clean_Name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.2'],
        blocking_features)
    rule_candidates = rule_blocker.block_candset(brand_candidates)

    # Stage 3: black-box function comparing screen size, RAM and hard drive
    spec_blocker = em.BlackBoxBlocker()
    spec_blocker.set_black_box_function(screen_ram_hd_equal)
    candidates = spec_blocker.block_candset(rule_candidates)

    # Labeled pairs used for training
    labeled = em.read_csv_metadata(path_labeled,
                                   key='_id',
                                   ltable=A,
                                   rtable=B,
                                   fk_ltable='ltable_ID',
                                   fk_rtable='rtable_ID')

    # Matching features: rows 4-9 and 40 onward of the generated table, plus
    # the custom 'refurbished' feature
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    selected_rows = np.r_[4:10, 40:len(feature_table)]
    feature_subset = feature_table.iloc[selected_rows, :]
    em.add_blackbox_feature(feature_subset, 'refurbished', refurbished)

    key_cols = ['_id', 'ltable_ID', 'rtable_ID']

    # Training vectors: extract, mean-impute, fit
    train_vectors = em.extract_feature_vecs(labeled,
                                            feature_table=feature_subset,
                                            attrs_after='gold')
    train_vectors = em.impute_table(train_vectors,
                                    exclude_attrs=key_cols + ['gold'],
                                    strategy='mean')
    matcher = em.RFMatcher(name='RF')
    matcher.fit(table=train_vectors,
                exclude_attrs=key_cols + ['gold'],
                target_attr='gold')

    # Candidate vectors: extract, mean-impute, predict
    candidate_vectors = em.extract_feature_vecs(candidates,
                                                feature_table=feature_subset)
    candidate_vectors = em.impute_table(candidate_vectors,
                                        exclude_attrs=list(key_cols),
                                        strategy='mean')
    predicted = matcher.predict(table=candidate_vectors,
                                exclude_attrs=list(key_cols),
                                append=True,
                                target_attr='predicted',
                                inplace=False)

    # Keep the identifying columns and only the pairs predicted as matches
    predicted = predicted.loc[:, key_cols + ['predicted']]
    is_match = predicted['predicted'] == 1
    return predicted[is_match]

예제 #9

파일 보기

파일: Magellan.py 프로젝트: A0L0XIIV/McMaster_Grad_Projects

def main():
    """Run an interactive, console-driven Magellan entity-matching workflow.

    The user is prompted step by step to:

    1. pick a dataset folder and the two input tables (A and B) plus their
       primary-key columns,
    2. optionally down-sample the tables,
    3. build a candidate set with any combination of attribute-equivalence,
       overlap and black-box blockers,
    4. sample and label candidate pairs,
    5. select a matcher by cross-validation, debug the random forest, and
    6. train the random forest and evaluate it on the held-out split.

    All interaction happens through ``print``/``input``; nothing is returned.
    The function stops early (after printing an error) when blocking yields
    no usable candidate set, because every later stage depends on it.

    NOTE(review): ``startTime`` and ``number_10_percent_comparision`` are not
    defined in this function; they are presumably module-level names in the
    original file -- confirm before running.
    """
    # WELCOME TO MY MAGELLAN RUN SCRIPT
    print("\n-------------WELCOME TO MY MAGELLAN RUN SCRIPT-------------\n")

    # Get the datasets directory.
    # Backslashes are escaped explicitly: the value is unchanged, but the
    # literal no longer relies on invalid escape sequences (\M, \C, \P, \D).
    datasets_dir = ('B:\\McMaster\\CAS 764 - Advance Topics in Data Management'
                    '\\Project\\Data\\')
    print("- Dataset directory: " + datasets_dir)
    print("- List of folders/files: ")
    print(os.listdir(datasets_dir))
    print("- Please enter new dataset folder name:")
    datasets_dir += input()
    print("- Dataset directory set to: " + datasets_dir)

    dateset_dir_files = os.listdir(datasets_dir)
    print("- List of files in dataset folder: ")
    print(dateset_dir_files)

    # Get the path of the input table A
    print("- Enter an index for Table A file (0-x):")
    file_index_A = input()
    filename_A = dateset_dir_files[int(file_index_A)]
    print("Table A file set to: " + filename_A)

    # Get the path of the input table
    path_A = datasets_dir + os.sep + filename_A

    # Get the path of the input table B
    print("- Enter an index for Table B file (0-x):")
    file_index_B = input()
    filename_B = dateset_dir_files[int(file_index_B)]
    print("Table B file set to: " + filename_B)

    # Get the path of the input table
    path_B = datasets_dir + os.sep + filename_B

    # Print Table A column names
    A = em.read_csv_metadata(path_A)
    print("- List of columns of Table A: ")
    print(list(A.columns))
    # Get the Table A id/primary key column name
    print('- Enter Table A primary key column index (ex. 0):')
    pk_A_index = input()
    pk_A = A.columns[int(pk_A_index)]

    # Print Table B column names
    B = em.read_csv_metadata(path_B)
    print("- List of columns of Table B: ")
    print(list(B.columns))
    # Get the Table B id/primary key column name
    print('- Enter Table B primary key column index (ex. 0):')
    pk_B_index = input()
    # Use table B's own columns and the index entered for B (the original
    # reused table A's columns and index here).
    pk_B = B.columns[int(pk_B_index)]

    # READING TABLES AND SETTING METADATA
    print("\n-------------READING TABLES AND SETTING METADATA-------------\n")

    # Register the chosen columns as the key of each table
    em.set_key(A, pk_A)
    em.set_key(B, pk_B)

    # Number of tables
    print('- Number of tuples in A: ' + str(len(A)))
    print('- Number of tuples in B: ' + str(len(B)))
    print('- Number of tuples in A X B (i.e the cartesian product): ' +
          str(len(A) * len(B)))

    # Print first 5 tuples of tables
    print(A.head())
    print(B.head())

    # Display the keys of the input tables
    print("- Table A primary key: " + em.get_key(A))
    print("- Table B primary key: " + em.get_key(B))

    # DOWNSAMPLING
    print("\n-------------DOWNSAMPING-------------\n")

    print("- Do you want to use downsampling? (y or n):")
    print("- Table A: " + str(len(A)) + ", Table B: " + str(len(B)))
    print("- NOTE: Recommended if both tables have 100K+ tuples.")
    is_downsample = input()
    if is_downsample == 'y':
        print("- Size of the downsampled tables (ex. 200):")
        # The size must be numeric; input() returns a string.
        downsample_size = int(input())
        # If the tables are large we can downsample the tables like this
        A1, B1 = em.down_sample(A, B, downsample_size, 1, show_progress=False)
        # len() returns an int, so it has to be converted before concatenation.
        print("- Length of Table A1" + str(len(A1)))
        print("- Length of Table B1" + str(len(B1)))
        # NOTE(review): A1/B1 are only reported; the rest of the workflow
        # keeps operating on the full tables A and B, as in the original.

    # BLOCKING
    print("\n-------------BLOCKING-------------\n")

    # Candidate set chosen by the user; stays None when blocking is skipped
    # or cannot be performed.
    C = None

    print("- Do you want to use blocking? (y or n):")
    is_blocking = input()
    if is_blocking == 'y':

        # Check if the 2 tables column names are the same
        if list(A.columns) == list(B.columns):
            C_attr_eq = []  # Attr Equ blocker result list
            C_overlap = []  # Overlap blocker result list
            C_blackbox = []  # BlackBox blocker result list

            # Left and right table attribute prefixes
            l_prefix = "ltable_"
            r_prefix = "rtable_"

            print("\n- List of columns: ")
            print(list(A.columns))
            # Labeling output table column selection
            print(
                "\n- Enter the indexes of columns that you want to see in labeling table (0-"
                + str(len(A.columns) - 1) + "):")
            out_attr = []
            for i in range(1, len(A.columns)):
                print("- Finish with empty character(enter+enter) " + str(i))
                add_to_attr = input()
                if add_to_attr == '':
                    break
                # Get indexes from user and add columns into out_attr list
                out_attr.append(A.columns[int(add_to_attr)])

            # Print output attributes
            print(out_attr)

            # Loop for adding/combining new blockers
            while True:
                # Blocker selection
                print(
                    "\n- Do yo want to use Attribute Equivalence[ab] (same), Overlap[ob] (similar) or Blackbox[bb] blocker:"
                )
                blocker_selection = input()

                # ----- Attribute Equivalence Blocker -----
                if blocker_selection == 'ab':
                    # Create attribute equivalence blocker
                    ab = em.AttrEquivalenceBlocker()
                    # Number of candidate sets this session can build upon;
                    # 0 means the next step must block on A and B directly.
                    attr_eq_counter = 0
                    # Check if Overlap Blocker used before
                    if C_overlap and not C_overlap[-1].empty:
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            # Seed with the last overlap-blocker output
                            C_attr_eq.append(C_overlap[-1])
                            attr_eq_counter += 1  # Skip block_tables on first pass

                    # Check if BlackBox Blocker used before
                    if C_blackbox and not C_blackbox[-1].empty:
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            # Seed with the last black-box-blocker output
                            C_attr_eq.append(C_blackbox[-1])
                            attr_eq_counter += 1  # Skip block_tables on first pass

                    # Loop for adding more columns/attributes into Attr Equ blocker
                    while True:
                        # List column names
                        print("\n- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "\n- Which column (w/ index) to use for equivalence blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input() == 'y'

                        # First time using Attr Equ blocker, use A and B
                        if attr_eq_counter == 0:
                            # Block using selected (blocking_col) attribute on A and B
                            C_attr_eq.append(
                                ab.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_attr_eq.append(
                                ab.block_candset(C_attr_eq[-1],
                                                 l_block_attr=blocking_col,
                                                 r_block_attr=blocking_col,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print(
                            "\n- Attribute Equivalence Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_attr_eq[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        attr_eq_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_attr_eq[-1])))
                        print(
                            "- Add another column into Attribute Equivalence Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ab_next_operation = input().lower()
                        # Continue using Attribute Equivalence Blocker
                        if ab_next_operation == 'a':
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif ab_next_operation == 'r':
                            del C_attr_eq[-1]
                            # Keep the counter in step with the removed
                            # output so the next pass neither indexes an
                            # empty list nor refines a stale candidate set.
                            attr_eq_counter -= 1
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Attribute Equivalence Blocker (y or n):"
                            )
                            ab_next_operation = input()
                            if ab_next_operation == 'n':
                                break
                        # Finish Attribute Equivalence Blocker
                        else:
                            break

                # ----- Overlap Blocker -----
                elif blocker_selection == 'ob':
                    # Create overlap blocker
                    ob = em.OverlapBlocker()
                    # Number of candidate sets this session can build upon
                    overlap_counter = 0
                    # Check if Attribute Equivalence Blocker used before
                    if C_attr_eq and not C_attr_eq[-1].empty:
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            # Seed with the last attribute-equivalence output
                            C_overlap.append(C_attr_eq[-1])
                            overlap_counter += 1  # Skip block_tables on first pass

                    # Check if BlackBox Blocker used before
                    if C_blackbox and not C_blackbox[-1].empty:
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            # Seed with the last black-box-blocker output
                            C_overlap.append(C_blackbox[-1])
                            overlap_counter += 1  # Skip block_tables on first pass

                    # Loop for adding more columns/attributes into Overlap blocker
                    while True:
                        # List column names
                        print("- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "- Which column (w/ index) to use for overlap blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input() == 'y'

                        print("\n- Use words as a token? (y or n):")
                        use_world_level = input() == 'y'
                        if use_world_level:
                            # Word-level tokens: no q-gram size is needed
                            q_gram_value = None
                        else:
                            print(
                                "\n- Q-gram q value (ex. 2 --> JO HN SM IT H):"
                            )
                            q_gram_value = int(input())

                        print(
                            "\n- Enter the overlap size (# of tokens that overlap):"
                        )
                        overlap_size = int(input())

                        print(
                            "\n- Do you want to remove (a, an, the) from token set? (y or n):"
                        )
                        use_stop_words = input() == 'y'

                        # First time using Overlap blocker, use A and B
                        if overlap_counter == 0:
                            # Block using selected (blocking_col) attribute on A and B
                            C_overlap.append(
                                ob.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                rem_stop_words=use_stop_words,
                                                q_val=q_gram_value,
                                                word_level=use_world_level,
                                                overlap_size=overlap_size,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_overlap.append(
                                ob.block_candset(C_overlap[-1],
                                                 l_overlap_attr=blocking_col,
                                                 r_overlap_attr=blocking_col,
                                                 rem_stop_words=use_stop_words,
                                                 q_val=q_gram_value,
                                                 word_level=use_world_level,
                                                 overlap_size=overlap_size,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- Overlap Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_overlap[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        overlap_counter += 1  # Increase the counter

                        # Continue to use Overlap Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_overlap[-1])))
                        print(
                            "- Add another column into Overlap Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ob_next_operation = input().lower()
                        # Continue using Overlap Blocker
                        if ob_next_operation == 'a':
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif ob_next_operation == 'r':
                            del C_overlap[-1]
                            # Keep the counter in step with the removed output
                            overlap_counter -= 1
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Overlap Blocker (y or n):")
                            ob_next_operation = input()
                            if ob_next_operation == 'n':
                                break
                        # Finish Overlap Blocker
                        else:
                            break

                # ----- BlackBox Blocker -----
                elif blocker_selection == 'bb':
                    # Create black-box blocker
                    bb = em.BlackBoxBlocker()
                    # Number of candidate sets this session can build upon
                    blackbox_counter = 0
                    # Check if Attribute Equivalence Blocker used before
                    if C_attr_eq and not C_attr_eq[-1].empty:
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            # Seed with the last attribute-equivalence output
                            C_blackbox.append(C_attr_eq[-1])
                            blackbox_counter += 1  # Skip block_tables on first pass

                    # Check if Overlap Blocker used before
                    if C_overlap and not C_overlap[-1].empty:
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if use_cand_set == 'y':
                            # Seed with the last overlap-blocker output
                            C_blackbox.append(C_overlap[-1])
                            blackbox_counter += 1  # Skip block_tables on first pass

                    # Loop for re-applying the BlackBox blocker
                    while True:
                        # Set function
                        bb.set_black_box_function(
                            number_10_percent_comparision)

                        # First time using BlackBox blocker, use A and B.
                        # This must test blackbox_counter: the original tested
                        # overlap_counter, which belongs to the overlap branch
                        # and may not even exist at this point.
                        if blackbox_counter == 0:
                            # Block on A and B
                            C_blackbox.append(
                                bb.block_tables(A,
                                                B,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                n_jobs=-1,
                                                show_progress=False))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block on previous (last=-1) candidate set
                            C_blackbox.append(
                                bb.block_candset(C_blackbox[-1],
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- BlackBox Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_blackbox[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        blackbox_counter += 1  # Increase the counter

                        # Continue to use BlackBox Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_blackbox[-1])))
                        print(
                            "- Add another column into BlackBox Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        bb_next_operation = input().lower()
                        # Continue using BlackBox Blocker
                        if bb_next_operation == 'a':
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif bb_next_operation == 'r':
                            del C_blackbox[-1]
                            # Keep the counter in step with the removed output
                            blackbox_counter -= 1
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use BlackBox Blocker (y or n):")
                            bb_next_operation = input()
                            if bb_next_operation == 'n':
                                break
                        # Finish BlackBox Blocker
                        else:
                            break

                print("\n- Do you want to add/use another blocker? (y or n):")
                blocker_decision = input()
                if blocker_decision == 'n':
                    break

            print(
                "\n- Which blocker output you want to use? (Attr Equ-ab, Overlap-ob, BlackBox-bb, Union-un)"
            )
            blocker_output_selection = input()
            # Map each single-blocker choice to its list of outputs
            blocker_outputs = {
                "ab": C_attr_eq,
                "ob": C_overlap,
                "bb": C_blackbox,
            }
            # Single blocker whose output list is non-empty: take the last one
            if blocker_outputs.get(blocker_output_selection):
                C = blocker_outputs[blocker_output_selection][-1]
            # Union of blockers
            elif blocker_output_selection == "un":
                # Combine/union blockers candidate sets
                print("\n- TODO: Unions Attr Equ and Overlap only!")
                if (C_attr_eq and C_overlap and not C_attr_eq[-1].empty and
                        not C_overlap[-1].empty):  # Both blocker types used
                    C = em.combine_blocker_outputs_via_union(
                        [C_attr_eq[-1], C_overlap[-1]])
                    print(
                        "\n- Blockers candidate set outputs combined via union."
                    )
                else:  # Error
                    C = []
                    print(
                        "\n- ERROR: Candidate set C is empty! Check blockers' results."
                    )
            # Unknown choice, or the chosen blocker never produced an output
            else:
                C = []
                print(
                    "\n- ERROR: Candidate set C is empty! Check blockers' results."
                )
            print("\n- Length of C: " + str(len(C)))

        else:
            print(
                "\n- 2 Tables column names are different, they must be the same"
            )
            print(list(A.columns))
            print(list(B.columns))

    # Every later stage needs a non-empty candidate set. Without this guard
    # the script either hit a NameError (blocking skipped) or could not leave
    # the sampling-size prompt with a valid value (empty C).
    # len() is used instead of truthiness because C may be a DataFrame.
    if C is None or len(C) == 0:
        print("\n- ERROR: No candidate set available, cannot continue "
              "without blocking output.")
        return

    # SAMPLING&LABELING
    print("\n-------------SAMPLING&LABELING-------------\n")

    print("- Choose sampling size (eg. 450):")
    sampling_size = input()
    while int(sampling_size) > len(C):
        print("- Sampling size cannot be bigger than " + str(len(C)))
        sampling_size = input()

    # Sample  candidate set
    S = em.sample_table(C, int(sampling_size))

    print("- New window will pop-up for " + sampling_size + " sized table.")
    print("- If there is a match, change tuple's label value to 1")

    # Label S
    G = em.label_table(S, 'label')

    # DEVELOPMENT AND EVALUATION
    print("\n-------------DEVELOPMENT AND EVALUATION-------------\n")

    # Split G into development set (I) and evaluation set (J)
    IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
    I = IJ['train']
    J = IJ['test']

    # SELECTING THE BEST MATCHER
    print("\n-------------SELECTING THE BEST MATCHER-------------\n")

    # Create a set of ML-matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')

    print(
        "\n- 6 different ML-matchers created: DL, SVM, RF, LogReg, LinReg, NB")

    print("\n- Creating features...")
    # Generate features
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    print("\n- Features list:")
    # List the names of the features generated
    print(feature_table['feature_name'])

    print("\n- Converting the development set to feature vectors...")
    # Convert the I into a set of feature vectors using feature_table
    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    print("\n- Feature table first rows:")
    # Display first few rows
    print(H.head())

    # Primary key of tables = prefix + pk = l_id, r_id
    ltable_pk = l_prefix + pk_A
    rtable_pk = r_prefix + pk_B

    # Columns that are not features and must be left out of imputation,
    # training and prediction.
    non_feature_attrs = ['_id', ltable_pk, rtable_pk, 'label']

    # Check if the feature vectors contain missing values.
    # True means at least one cell is missing. The original
    # any(pd.notnull(H)) iterated over column labels and was therefore
    # effectively always True.
    is_missing_values = bool(pd.isnull(H).values.any())
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if is_missing_values:
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=non_feature_attrs,
            strategy='mean',
            val_all_nans=0.0)
        print("- Impute table function used for missing values.")

    print("\n- Selecting the best matcher using cross-validation...")
    # Select the best ML matcher using CV
    result = em.select_matcher(
        matchers=[dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=non_feature_attrs,
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)
    print("\n- Results:")
    print(result['cv_stats'])

    # DEBUGGING THE MATCHER
    print("\n-------------DEBUGGING THE MATCHER-------------\n")

    #  Split feature vectors into train and test
    UV = em.split_train_test(H, train_proportion=0.5)
    U = UV['train']
    V = UV['test']

    # Debug random forest using GUI
    em.vis_debug_rf(rf,
                    U,
                    V,
                    exclude_attrs=non_feature_attrs,
                    target_attr='label')

    print("\n- Do you want to add another feature?")

    # Re-extract the development feature vectors (no feature is actually
    # added between the two extractions in this script).
    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    is_missing_values = bool(pd.isnull(H).values.any())
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if is_missing_values:
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=non_feature_attrs,
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(H.head())

    # Select the best ML matcher using CV
    result = em.select_matcher(
        [dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=non_feature_attrs,
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)

    print("\n- Results:")
    print(result['cv_stats'])

    # EVALUATING THE MATCHING OUTPUT
    print("\n-------------EVALUATING THE MATCHING OUTPUT-------------\n")

    print("\n- Converting the evaluation set to feature vectors...")
    # Convert J into a set of feature vectors using feature table
    L = em.extract_feature_vecs(J,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    is_missing_values = bool(pd.isnull(L).values.any())
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if is_missing_values:
        # Impute feature vectors with the mean of the column values.
        L = em.impute_table(
            L,
            exclude_attrs=non_feature_attrs,
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(L.head())

    print("\n- Training the selected matcher...")
    # Train using feature vectors from I
    rf.fit(table=H,
           exclude_attrs=non_feature_attrs,
           target_attr='label')

    print("\n- Predicting the matches...")
    # Predict on L
    predictions = rf.predict(
        table=L,
        exclude_attrs=non_feature_attrs,
        append=True,
        target_attr='predicted',
        inplace=False)

    print("\n- Evaluating the prediction...")
    # Evaluate the predictions
    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    # print_eval_summary writes the summary itself; wrapping it in print()
    # only added a stray "None" line.
    em.print_eval_summary(eval_result)

    print("\n- Time elapsed:")
    print(datetime.now() - startTime)

    print("\n-------------END-------------\n")