def compute_accuracy_J(matcher, return_probs_arg, H, J): # Train using feature vectors from I matcher.fit(table=H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'], target_attr='label') # Convert J into a set of feature vectors using F L = em.extract_feature_vecs(J, feature_table=F, attrs_after='label', show_progress=False) # Impute L L = em.impute_table( L, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'], strategy='mean') # Predict on L predictions = matcher.predict( table=L, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'], append=True, target_attr='predicted', inplace=False, return_probs=return_probs_arg, probs_attr='proba') # print(predictions.head()) # Evaluate the predictions eval_result = em.eval_matches(predictions, 'label', 'predicted') em.print_eval_summary(eval_result)
def predict_matching_tuples(A, B, C, G): # Split G into I and J for CV IJ = em.split_train_test(G, train_proportion=0.5, random_state=0) I = IJ['train'] # Generate features set F F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False) # Convert G to a set of feature vectors using F H = em.extract_feature_vecs(I, feature_table=F, attrs_after='label', show_progress=False) excluded_attributes = ['_id', 'l_id', 'r_id', 'label'] # Fill in missing values with column's average H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean') # Create and train a logistic regression - the best matcher from stage3. lg = em.LogRegMatcher(name='LogReg', random_state=0) lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label') # Convert C into a set of features using F L = em.extract_feature_vecs(C, feature_table=F, show_progress=False) # Fill in missing values with column's average L = em.impute_table(L, exclude_attrs=['_id', 'l_id', 'r_id'], strategy='mean') # Predict on L with trained matcher predictions = lg.predict(table=L, exclude_attrs=['_id', 'l_id', 'r_id'], append=True, target_attr='predicted', inplace=False, return_probs=False, probs_attr='proba') # Extract the matched pairs' ids matched_pairs = predictions[predictions.predicted == 1] matched_ids = matched_pairs[['l_id', 'r_id']] # Save matched_pairs to file so we don't have to train and predict each time the code is executed matched_ids.to_csv(FOLDER + 'predictedMatchedIDs.csv', index=False)
random_state=0) result['cv_stats'] #Read in the complete candidate set obtained after blocking (973 tuples) C = em.read_csv_metadata("/mnt/c/Users/sreya/Downloads/candidates_large1.csv", key='_id', ltable=A, rtable=B, fk_ltable='ltable_ID', fk_rtable='rtable_ID') C K = em.extract_feature_vecs(C, feature_table=F, show_progress=False) # Impute feature vectors with the mean of the column values K = em.impute_table(K, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'], strategy='mean') # fit with best matcher which was logistic regression lg.fit(table=H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'Match'], target_attr='Match') # take best classifier (logistic regression) and output predictions (i.e. matches) predictions = lg.predict(table=K, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'], append=True, target_attr='predicted', inplace=False) # save set of matches between two tables
attrs_after='label', show_progress=False) # create learners random_state = 0 dt = em.DTMatcher(name='DecisionTree', random_state=random_state) rf = em.RFMatcher(name='RF', random_state=random_state) svm = em.SVMMatcher(name='SVM', random_state=random_state) ln = em.LinRegMatcher(name='LinReg') lg = em.LogRegMatcher(name='LogReg', random_state=random_state) nb = em.NBMatcher(name='NaiveBayes') # Impute feature vectors with the mean of the column values. H = em.impute_table(H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'], strategy='mean') #initial results result = em.select_matcher( [dt, rf, svm, ln, lg, nb], table=H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'], k=5, target_attr='label', metric_to_select_matcher='f1', random_state=0) result['cv_stats'] classifiers = np.array([dt, rf, svm, ln, lg, nb])
ltable_pk = "ltable_" + pk_A rtable_pk = "rtable_" + pk_B dt = em.DTMatcher(name='DecisionTree', random_state=0) svm = em.SVMMatcher(name='SVM', random_state=0) rf = em.RFMatcher(name='RF', random_state=0) lg = em.LogRegMatcher(name='LogReg', random_state=0) ln = em.LinRegMatcher(name='LinReg') nb = em.NBMatcher(name='NaiveBayes') feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False) H = em.extract_feature_vecs(I, feature_table=feature_table, attrs_after='label', show_progress=False) H = em.impute_table(H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], strategy='mean', val_all_nans=0.0) result = em.select_matcher(matchers=[dt, rf, svm, ln, lg, nb], table=H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], k=5, target_attr='label', metric_to_select_matcher='f1', random_state=0) print("\n- Matcher selection Results:") print(result['cv_stats']) UV = em.split_train_test(H, train_proportion=0.5) U = UV['train'] V = UV['test'] em.vis_debug_rf(rf, U, V, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], target_attr='label') em.vis_debug_dt(dt, U, V,
def main(): # Read in data files A = em.read_csv_metadata(FOLDER + 'A.csv', key='id') # imdb data B = em.read_csv_metadata(FOLDER + 'B.csv', key='id') # tmdb data G = em.read_csv_metadata(FOLDER + 'G.csv', key='_id', ltable=A, rtable=B, fk_ltable='l_id', fk_rtable='r_id') # labeled data # Split G into I and J for CV IJ = em.split_train_test(G, train_proportion=0.5, random_state=0) I = IJ['train'] J = IJ['test'] # Save I and J to files I.to_csv(FOLDER + 'I.csv', index=False) J.to_csv(FOLDER + 'J.csv', index=False) # Generate features set F F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False) #print(F.feature_name) #print(type(F)) # Convert I to a set of feature vectors using F H = em.extract_feature_vecs(I, feature_table=F, attrs_after='label', show_progress=False) #print(H.head) # Check of missing values #print(any(pd.notnull(H))) excluded_attributes = ['_id', 'l_id', 'r_id', 'label'] # Fill in missing values with column's average H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean') # Create a set of matchers dt = em.DTMatcher(name='DecisionTree', random_state=0) svm = em.SVMMatcher(name='SVM', random_state=0) rf = em.RFMatcher(name='RF', random_state=0) lg = em.LogRegMatcher(name='LogReg', random_state=0) ln = em.LinRegMatcher(name='LinReg') nb = em.NBMatcher(name='NaiveBayes') # Selecting best matcher with CV using F1-score as criteria CV_result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, exclude_attrs=excluded_attributes, k=10, target_attr='label', metric_to_select_matcher='f1', random_state=0) print(CV_result['cv_stats']) # RF is the best matcher # Train matchers on H dt.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label') rf.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label') svm.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label') lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label') ln.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label') nb.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label') # Convert J into a set of features using F L = em.extract_feature_vecs(J, feature_table=F, attrs_after='label', show_progress=False) # Fill in missing values with column's average L = em.impute_table(L, exclude_attrs=excluded_attributes, strategy='mean') # Predict on L with trained matchers predictions_dt = dt.predict(table=L, exclude_attrs=excluded_attributes, append=True, target_attr='predicted', inplace=False, return_probs=False, probs_attr='proba') predictions_rf = rf.predict(table=L, exclude_attrs=excluded_attributes, append=True, target_attr='predicted', inplace=False, return_probs=False, probs_attr='proba') predictions_svm = svm.predict(table=L, exclude_attrs=excluded_attributes, append=True, target_attr='predicted', inplace=False, return_probs=False, probs_attr='proba') predictions_lg = lg.predict(table=L, exclude_attrs=excluded_attributes, append=True, target_attr='predicted', inplace=False, return_probs=False, probs_attr='proba') predictions_ln = ln.predict(table=L, exclude_attrs=excluded_attributes, append=True, target_attr='predicted', inplace=False, return_probs=False, probs_attr='proba') predictions_nb = nb.predict(table=L, exclude_attrs=excluded_attributes, append=True, target_attr='predicted', inplace=False, return_probs=False, probs_attr='proba') # Evaluate predictions dt_eval = em.eval_matches(predictions_dt, 'label', 'predicted') em.print_eval_summary(dt_eval) rf_eval = em.eval_matches(predictions_rf, 'label', 'predicted') em.print_eval_summary(rf_eval) svm_eval = em.eval_matches(predictions_svm, 'label', 'predicted') em.print_eval_summary(svm_eval) lg_eval = em.eval_matches(predictions_lg, 'label', 'predicted') em.print_eval_summary(lg_eval) ln_eval = em.eval_matches(predictions_ln, 'label', 'predicted') em.print_eval_summary(ln_eval) nb_eval = em.eval_matches(predictions_nb, 'label', 'predicted') em.print_eval_summary(nb_eval)
def automatic_feature_gen(candidate_table, feature_cols, id_names, id_names_phrase): ''' NB! The automatic function creates pairwise features. Consequently, it will convert internally the colnames in lhs and rhs portions of feature cols to the SAME name. It does this by trimming the `id_names_phrase` portion (suffix or prefix) from each column name It assumes that the id names are of the form id_{id_names_phrase} e.g. id_amzn Replaces Nans in candidate table with empty strings Takes in a single DataFrame object (lhs_table and rhs_table concatenated) and splits it into two tables then generates features on each of the sub tables. Inputs: candidate_table: single Pandas DataFrame (typically output of blocking_algorithms.py functions) Outputs: ''' em.del_catalog() candidate_table = candidate_table.reset_index() lhs_table = candidate_table.loc[:, feature_cols[0] + [id_names[0]]] rhs_table = candidate_table.loc[:, feature_cols[1] + [id_names[1]]] lhs_colnames = [] for colname in lhs_table: if colname != id_names[0]: lhs_colnames.append(re.sub(id_names_phrase[0], "", colname)) else: lhs_colnames.append(colname) rhs_colnames = [] for colname in rhs_table: if colname != id_names[1]: rhs_colnames.append(re.sub(id_names_phrase[1], "", colname)) else: rhs_colnames.append(colname) lhs_table.columns = lhs_colnames rhs_table.columns = rhs_colnames # To circumvent the same product ID coming up again (due to it being in multiple candidate comparisons) lhs_table["index_num_lhs"] = np.arange(lhs_table.shape[0]) rhs_table["index_num_rhs"] = np.arange(rhs_table.shape[0]) em.set_key(lhs_table, "index_num_lhs") # changed from id_names em.set_key(rhs_table, "index_num_rhs") # Generate List Of Features matching_features = em.get_features_for_matching( lhs_table.drop(id_names[0], axis=1), rhs_table.drop(id_names[1], axis=1), validate_inferred_attr_types=False) # Extract feature vectors and save as a DF # Set primary keys and foreign keys for candidate table candidate_table["index"] = np.arange(candidate_table.shape[0]) # Add foreign keys to candidate table candidate_table["index_num_lhs"] = np.arange(lhs_table.shape[0]) candidate_table["index_num_rhs"] = np.arange(rhs_table.shape[0]) em.set_key(candidate_table, "index") em.set_fk_ltable(candidate_table, "index_num_lhs") em.set_fk_rtable(candidate_table, "index_num_rhs") em.set_ltable(candidate_table, lhs_table) em.set_rtable(candidate_table, rhs_table) matching_features_df = em.extract_feature_vecs( candidate_table, feature_table=matching_features, show_progress=False) matching_features_df = em.impute_table( matching_features_df, exclude_attrs=['index', "index_num_lhs", "index_num_rhs"], strategy='mean') # add back the amzn and google ids matching_features_df["id_amzn"] = candidate_table.id_amzn matching_features_df["id_g"] = candidate_table.id_g matching_features_df = matching_features_df.fillna(value=0) # print(matching_features_df.describe()) # print(f"Number na {matching_features_df.isna().apply(sum)}") # print(f"Number null {matching_features_df.isnull().apply(sum)}") return matching_features_df
dt = em.DTMatcher(name='DecisionTree', random_state=0); rf = em.RFMatcher(name='Random Forest', random_state=0); svm = em.SVMMatcher(name='SVM', random_state=0); nb = em.NBMatcher(name='Naive Bayes'); lg = em.LogRegMatcher(name='Logistic Reg', random_state=0); ln = em.LinRegMatcher(name='Linear Reg'); F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False); H = em.extract_feature_vecs(I, feature_table=F, attrs_after='gold_labels', show_progress=False) H = em.impute_table(H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'], strategy='mean'); # print(any(pd.notnull(H))); result = em.select_matcher([dt, rf, svm, nb, lg, ln], table=H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'], k=5, target_attr='gold_labels', metric_to_select_matcher='f1', random_state=0); print(result['cv_stats']); Y = dt; L = em.extract_feature_vecs(J, feature_table=F,
l_attr_types = em.get_attr_types(kaggle_data) r_attr_types = em.get_attr_types(imdb_data) tok = em.get_tokenizers_for_matching() sim = em.get_sim_funs_for_matching() F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim) # Given the set of desired features **F**, we can now calculate the feature values for our training data and also impute the missing values in our data. In this case, we choose to replace the missing values with the mean of the column. # In[28]: train_features = em.extract_feature_vecs(train_data, feature_table=F, attrs_after='label', show_progress=False) train_features = em.impute_table(train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean') # Using the calculated features, we can evaluate the performance of different machine learning algorithms and select the best one for our matching task. # In[29]: result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], k=5, target_attr='label', metric='f1', random_state=0) result['cv_stats'] # We can observe based on the reported accuracy of different techniques that the "random forest (RF)" algorithm achieves the best performance. Thus, it is best to use this technique for the matching. # #### Substep F: Evaluating the quality of our matching
feature_string = """lev_sim(wspace(float(ltuple['price']) + float(ltuple['rating'])), wspace(float(rtuple['price']) + float(rtuple['rating'])))""" feature = em.get_feature_fn(feature_string, sim, tok) # Add feature to F em.add_feature(F, 'lev_ws_price+rating', feature) # Convert the sample set into a set of feature vectors using F H = em.extract_feature_vecs(G, feature_table=F, attrs_after='labe', show_progress=False) # impute missing values H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'labe'], strategy='mean') # Fit a Naive Bayes matcher matcher = em.NBMatcher(name='NaiveBayes') matcher.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'labe'], target_attr='labe') # Apply matcher to the whole dataset Ht = em.extract_feature_vecs(C, feature_table=F, show_progress=False) Ht = em.impute_table(Ht, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], strategy='mean') predictions = matcher.predict(table=Ht, exclude_attrs=['_id', 'ltable_id', 'rtable_id'],
def workflow(path_A, path_B, path_labeled): # Load csv files as dataframes and set the key attribute in the dataframe A = em.read_csv_metadata(path_A, key='ID') B = em.read_csv_metadata(path_B, key='ID') # Run attribute equivalence blocker on brand ab = em.AttrEquivalenceBlocker() C1 = ab.block_tables(A, B, 'Brand', 'Brand', l_output_attrs=[ 'Name', 'Price', 'Brand', 'Screen Size', 'RAM', 'Hard Drive Capacity', 'Processor Type', 'Processor Speed', 'Operating System', 'Clean Name' ], r_output_attrs=[ 'Name', 'Price', 'Brand', 'Screen Size', 'RAM', 'Hard Drive Capacity', 'Processor Type', 'Processor Speed', 'Operating System', 'Clean Name' ]) # Get features for rule based blocking block_f = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False) # Run rule based blocker with rule for jaccard score on Clean Name column rb = em.RuleBasedBlocker() rb.add_rule( ['Clean_Name_Clean_Name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.2'], block_f) C2 = rb.block_candset(C1) # Run black box blocker to compare screen size, ram, and hard drive capacity bb_screen = em.BlackBoxBlocker() bb_screen.set_black_box_function((screen_ram_hd_equal)) C = bb_screen.block_candset(C2) # Load the labeled data L = em.read_csv_metadata(path_labeled, key='_id', ltable=A, rtable=B, fk_ltable='ltable_ID', fk_rtable='rtable_ID') # Generate features feature_table = em.get_features_for_matching( A, B, validate_inferred_attr_types=False) feature_subset = feature_table.iloc[np.r_[4:10, 40:len(feature_table)], :] em.add_blackbox_feature(feature_subset, 'refurbished', refurbished) # Extract feature vectors feature_vectors_dev = em.extract_feature_vecs(L, feature_table=feature_subset, attrs_after='gold') # Impute feature vectors with the mean of the column values. feature_vectors_dev = em.impute_table( feature_vectors_dev, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'], strategy='mean') # Train using feature vectors from the labeled data matcher = em.RFMatcher(name='RF') matcher.fit(table=feature_vectors_dev, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'], target_attr='gold') # Extract feature vectors for the rest of the data feature_vectors = em.extract_feature_vecs(C, feature_table=feature_subset) # Impute feature vectors with the mean of the column values. feature_vectors = em.impute_table( feature_vectors, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'], strategy='mean') # Make predictions for the whole data set predictions = matcher.predict( table=feature_vectors, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'], append=True, target_attr='predicted', inplace=False) predictions = predictions.loc[:, [ '_id', 'ltable_ID', 'rtable_ID', 'predicted' ]] return predictions[predictions['predicted'] == 1]
# In[261]: H = em.extract_feature_vecs(I, feature_table=Ft, attrs_after='label', show_progress=False) # Perform matches and display results below (after performing cross-validation) # In[262]: H = em.impute_table(H, exclude_attrs=['_id', 'ltable_Id', 'rtable_Id', 'label'], strategy='mean') # In[161]: result = em.select_matcher([dt, rf, svm, ln, lg,nb], table=H, exclude_attrs=['_id', 'ltable_Id', 'rtable_Id', 'label'], k=5, target_attr='label', metric_to_select_matcher='f1', random_state=0) result['cv_stats'] # Picking Random Forest as it has the highest average F1 score. We are not adding any rule based matchers as the precision,recall and F1 scores are already above the required thresholds.
def main(): # Read in data files A = em.read_csv_metadata(FOLDER + 'A.csv', key='id') # imdb data B = em.read_csv_metadata(FOLDER + 'B.csv', key='id') # tmdb data G = em.read_csv_metadata(FOLDER + 'G.csv', key='_id', ltable=A, rtable=B, fk_ltable='l_id', fk_rtable='r_id') # labeled data # Split G into I and J for CV IJ = em.split_train_test(G, train_proportion=0.5, random_state=0) I = IJ['train'] # Generate features set F F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False) # Convert I to a set of feature vectors using F H = em.extract_feature_vecs(I, feature_table=F, attrs_after='label', show_progress=False) excluded_attributes = ['_id', 'l_id', 'r_id', 'label'] # Fill in missing values with column's average H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean') # Create and train a logistic regression - the best matcher from stage3. lg = em.LogRegMatcher(name='LogReg', random_state=0) lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label') # Read in the candidate tuple pairs. C = em.read_csv_metadata(FOLDER + 'C.csv', key='_id', ltable=A, rtable=B, fk_ltable='l_id', fk_rtable='r_id') # labeled data # Convert C into a set of features using F L = em.extract_feature_vecs(C, feature_table=F, show_progress=False) # Fill in missing values with column's average L = em.impute_table(L, exclude_attrs=['_id', 'l_id', 'r_id'], strategy='mean') # Predict on L with trained matcher predictions = lg.predict(table=L, exclude_attrs=['_id', 'l_id', 'r_id'], append=True, target_attr='predicted', inplace=False, return_probs=False, probs_attr='proba') # Output the merged table (Basically what matches). # We start with rows from A that matches. # We then merge value from B into A. matched_pairs = predictions[predictions.predicted == 1] left_ids = matched_pairs['l_id'].to_frame() left_ids.columns = ['id'] merged = pd.merge(A, left_ids, on='id') merged.set_index('id', inplace=True) B.set_index('id', inplace=True) black_list = {'a872', 'a987'} for pair in matched_pairs.itertuples(): aid = pair.l_id bid = pair.r_id if (aid in black_list): continue # Title: keep title from A, if title from B is not an exact matched # from A, append B’s title to the alternative title field if B’s title # is not already in A’s alternative title. m_title = merged.loc[aid, 'title'] a_title = merged.loc[aid, 'title'] b_title = B.loc[bid, 'title'] if (b_title != a_title): if pd.isnull(merged.loc[aid, 'alternative_titles']): merged.loc[aid, 'alternative_titles'] = b_title else: alt = set(merged.loc[aid, 'alternative_titles'].split(';')) if (b_title not in alt): merged.loc[aid, 'alternative_titles'] += ';' + b_title for col in [ 'directors', 'writers', 'cast', 'genres', 'keywords', 'languages', 'production_companies', 'production_countries' ]: merged.loc[aid, col] = merge_cell(merged.loc[aid, col], B.loc[bid, col]) # Content rating: keep A # Release year: keep A # Opening_weekend_revenue: keep A # Run time m_runtime = int( (merged.loc[aid, 'run_time'] + B.loc[bid, 'run_time']) / 2) merged.loc[aid, 'run_time'] = m_runtime # Budget and Revenue for col in ['budget', 'revenue']: merged.loc[aid, col] = merge_money(merged.loc[aid, col], B.loc[bid, col]) # Rating: take the average after converting B rating to scale 10. m_rating = (merged.loc[aid, 'rating'] + 0.1 * B.loc[bid, 'rating']) / 2 merged.loc[aid, 'rating'] = m_rating merged.to_csv(FOLDER + 'E.csv', index=True)
def main(): # WELCOME TO MY MAGELLAN RUN SCRIPT print("\n-------------WELCOME TO MY MAGELLAN RUN SCRIPT-------------\n") # Get the datasets directory datasets_dir = 'B:\McMaster\CAS 764 - Advance Topics in Data Management\Project\Data\\' print("- Dataset directory: " + datasets_dir) print("- List of folders/files: ") print(os.listdir(datasets_dir)) print("- Please enter new dataset folder name:") datasets_dir += input() print("- Dataset directory set to: " + datasets_dir) dateset_dir_files = os.listdir(datasets_dir) print("- List of files in dataset folder: ") print(dateset_dir_files) # Get the path of the input table A print("- Enter an index for Table A file (0-x):") file_index_A = input() filename_A = dateset_dir_files[int(file_index_A)] print("Table A file set to: " + filename_A) # Get the path of the input table path_A = datasets_dir + os.sep + filename_A # Get the path of the input table B print("- Enter an index for Table B file (0-x):") file_index_B = input() filename_B = dateset_dir_files[int(file_index_B)] print("Table B file set to: " + filename_B) # Get the path of the input table path_B = datasets_dir + os.sep + filename_B # Print Table A column names A = em.read_csv_metadata(path_A) print("- List of columns of Table A: ") print(list(A.columns)) # Get the Table A id/primary key column name print('- Enter Table A primary key column index (ex. 0):') pk_A_index = input() pk_A = A.columns[int(pk_A_index)] # Print Table B column names B = em.read_csv_metadata(path_B) print("- List of columns of Table B: ") print(list(B.columns)) # Get the Table B id/primary key column name print('- Enter Table B primary key column index (ex. 0):') pk_B_index = input() pk_B = A.columns[int(pk_A_index)] # READING TABLES AND SETTING METADATA print("\n-------------READING TABLES AND SETTING METADATA-------------\n") # Both read csv and set metadata id as ID column #A = em.read_csv_metadata(path_A, key=pk_A) #B = em.read_csv_metadata(path_B, key=pk_B) em.set_key(A, pk_A) em.set_key(B, pk_B) # Number of tables print('- Number of tuples in A: ' + str(len(A))) print('- Number of tuples in B: ' + str(len(B))) print('- Number of tuples in A X B (i.e the cartesian product): ' + str(len(A) * len(B))) # Print first 5 tuples of tables print(A.head()) print(B.head()) # Display the keys of the input tables print("- Table A primary key: " + em.get_key(A)) print("- Table B primary key: " + em.get_key(B)) # DOWNSAMPLING print("\n-------------DOWNSAMPING-------------\n") print("- Do you want to use downsampling? (y or n):") print("- Table A: " + str(len(A)) + ", Table B: " + str(len(B))) print("- NOTE: Recommended if both tables have 100K+ tuples.") is_downsample = input() if (is_downsample == 'y'): print("- Size of the downsampled tables (ex. 200):") downsample_size = input() # If the tables are large we can downsample the tables like this A1, B1 = em.down_sample(A, B, downsample_size, 1, show_progress=False) print("- Length of Table A1" + len(A1)) print("- Length of Table B1" + len(B1)) # BLOCKING print("\n-------------BLOCKING-------------\n") print("- Do you want to use blocking? (y or n):") is_blocking = input() if (is_blocking == 'y'): # Check if the 2 tables column names are the same if (list(A.columns) == list(B.columns)): C_attr_eq = [] # Attr Equ blocker result list C_overlap = [] # Overlap blocker result list C_blackbox = [] # BlackBox blocker result list # Left and right table attribute prefixes l_prefix = "ltable_" r_prefix = "rtable_" print("\n- List of columns: ") print(list(A.columns)) # Labeling output table column selection print( "\n- Enter the indexes of columns that you want to see in labeling table (0-" + str(len(A.columns) - 1) + "):") out_attr = [] for i in range(1, len(A.columns)): print("- Finish with empty character(enter+enter) " + str(i)) add_to_attr = input() if (add_to_attr == ''): break # Get indexes from user and add columns into out_attr list out_attr.append(A.columns[int(add_to_attr)]) # Print output attributes print(out_attr) # Loop for adding/combining new blockers while (True): # Blocker selection print( "\n- Do yo want to use Attribute Equivalence[ab] (same), Overlap[ob] (similar) or Blackbox[bb] blocker:" ) blocker_selection = input() # ----- Attribute Equivalence Blocker ----- if (blocker_selection == 'ab'): # Create attribute equivalence blocker ab = em.AttrEquivalenceBlocker() # Counter for indexes attr_eq_counter = 0 # Check if Overlap Blocker used before if (C_overlap and not C_overlap[-1].empty): print( "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_attr_eq.append( C_overlap[-1]) # Add last output of ob attr_eq_counter += 1 # For skipping block_table function in first time # Check if BlackBox Blocker used before if (C_blackbox and not C_blackbox[-1].empty): print( "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_attr_eq.append( C_blackbox[-1]) # Add last output of ob attr_eq_counter += 1 # For skipping block_table function in first time # Loop for adding more columns/attributes into Attr Equ blocker while (True): # List column names print("\n- List of columns: ") print(list(A.columns)) # Get blocking attribute/column print( "\n- Which column (w/ index) to use for equivalence blocking? (ex. 1):" ) blocking_col_index = input() blocking_col = A.columns[int(blocking_col_index)] print( "\n- Do you want to add missing values into blocking? (y or n):" ) add_missing_val = input() if (add_missing_val == 'y'): add_missing_val = True else: add_missing_val = False # First time using Attr Equ blocker, use A and B if (attr_eq_counter == 0): # Block using selected (blocking_col) attribute on A and B C_attr_eq.append( ab.block_tables(A, B, blocking_col, blocking_col, l_output_attrs=out_attr, r_output_attrs=out_attr, l_output_prefix=l_prefix, r_output_prefix=r_prefix, allow_missing=add_missing_val, n_jobs=-1)) # Not first time, add new constraint into previous candidate set else: # Block using selected (blocking_col) attribute on previous (last=-1) candidate set C_attr_eq.append( ab.block_candset(C_attr_eq[-1], l_block_attr=blocking_col, r_block_attr=blocking_col, allow_missing=add_missing_val, n_jobs=-1, show_progress=False)) # DEBUG BLOCKING print( "\n- Attribute Equivalence Blocker Debugging...\n") # Debug last blocker output dbg = em.debug_blocker(C_attr_eq[-1], A, B, output_size=200, n_jobs=-1) # Display first few tuple pairs from the debug_blocker's output print("\n- Blocking debug results:") print(dbg.head()) attr_eq_counter += 1 # Increase the counter # Continue to use Attribute Equivalence Blocker or not print("\n- Length of candidate set: " + str(len(C_attr_eq[-1]))) print( "- Add another column into Attribute Equivalence Blocker[a] OR Reset last blocker's output[r]:" ) ab_next_operation = input() if (not ab_next_operation.islower()): ab_next_operation = ab_next_operation.lower( ) # Lower case # Continue using Attribute Equivalence Blocker if (ab_next_operation == 'a'): continue # Reset/remove last blocker's output from candidate set list elif (ab_next_operation == 'r'): del C_attr_eq[-1] print("\n- Last blocker output removed!") print( "- Continue to use Attribute Equivalence Blocker (y or n):" ) ab_next_operation = input() if (ab_next_operation == 'n'): break # Finish Attribute Equivalence Blocker else: break # ----- Overlap Blocker ----- elif (blocker_selection == 'ob'): # Create attribute equivalence blocker ob = em.OverlapBlocker() # Counter for indexes overlap_counter = 0 # Check if Attribute Equivalence Blocker used before if (C_attr_eq and not C_attr_eq[-1].empty): print( "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_overlap.append( C_attr_eq[-1]) # Add last output of ab overlap_counter += 1 # For skipping block_table function in first time # Check if BlackBox Blocker used before if (C_blackbox and not C_blackbox[-1].empty): print( "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_overlap.append( C_blackbox[-1]) # Add last output of ob overlap_counter += 1 # For skipping block_table function in first time # Loop for adding more columns/attributes into Overlap blocker while (True): # List column names print("- List of columns: ") print(list(A.columns)) # Get blocking attribute/column print( "- Which column (w/ index) to use for overlap blocking? (ex. 1):" ) blocking_col_index = input() blocking_col = A.columns[int(blocking_col_index)] print( "\n- Do you want to add missing values into blocking? (y or n):" ) add_missing_val = input() if (add_missing_val == 'y'): add_missing_val = True else: add_missing_val = False print("\n- Use words as a token? (y or n):") use_world_level = input() if (use_world_level == 'y'): use_world_level = True q_gram_value = None else: use_world_level = False print( "\n- Q-gram q value (ex. 2 --> JO HN SM IT H):" ) q_gram_value = input() q_gram_value = int(q_gram_value) print( "\n- Enter the overlap size (# of tokens that overlap):" ) overlap_size = input() overlap_size = int(overlap_size) print( "\n- Do you want to remove (a, an, the) from token set? (y or n):" ) use_stop_words = input() if (use_stop_words == 'y'): use_stop_words = True else: use_stop_words = False # First time using Overlap blocker, use A and B if (overlap_counter == 0): # Block using selected (blocking_col) attribute on A and B C_overlap.append( ob.block_tables(A, B, blocking_col, blocking_col, l_output_attrs=out_attr, r_output_attrs=out_attr, l_output_prefix=l_prefix, r_output_prefix=r_prefix, rem_stop_words=use_stop_words, q_val=q_gram_value, word_level=use_world_level, overlap_size=overlap_size, allow_missing=add_missing_val, n_jobs=-1)) # Not first time, add new constraint into previous candidate set else: # Block using selected (blocking_col) attribute on previous (last=-1) candidate set C_overlap.append( ob.block_candset(C_overlap[-1], l_overlap_attr=blocking_col, r_overlap_attr=blocking_col, rem_stop_words=use_stop_words, q_val=q_gram_value, word_level=use_world_level, overlap_size=overlap_size, allow_missing=add_missing_val, n_jobs=-1, show_progress=False)) # DEBUG BLOCKING print("\n- Overlap Blocker Debugging...\n") # Debug last blocker output dbg = em.debug_blocker(C_overlap[-1], A, B, output_size=200, n_jobs=-1) # Display first few tuple pairs from the debug_blocker's output print("\n- Blocking debug results:") print(dbg.head()) overlap_counter += 1 # Increase the counter # Continue to use Attribute Equivalence Blocker or not print("\n- Length of candidate set: " + str(len(C_overlap[-1]))) print( "- Add another column into Overlap Blocker[a] OR Reset last blocker's output[r]:" ) ob_next_operation = input() if (not ob_next_operation.islower()): ob_next_operation = ob_next_operation.lower( ) # Lower case # Continue using Overlap Blocker if (ob_next_operation == 'a'): continue # Reset/remove last blocker's output from candidate set list elif (ob_next_operation == 'r'): del C_overlap[-1] print("\n- Last blocker output removed!") print( "- Continue to use Overlap Blocker (y or n):") ob_next_operation = input() if (ob_next_operation == 'n'): break # Finish Overlap Blocker else: break # ----- BlackBox Blocker ----- elif (blocker_selection == 'bb'): # Create attribute equivalence blocker bb = em.BlackBoxBlocker() # Counter for indexes blackbox_counter = 0 # Check if Overlap Blocker used before if (C_attr_eq and not C_attr_eq[-1].empty): print( "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_blackbox.append( C_attr_eq[-1]) # Add last output of ob blackbox_counter += 1 # For skipping block_table function in first time # Check if Overlap Blocker used before if (C_overlap and not C_overlap[-1].empty): print( "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):" ) use_cand_set = input() if (use_cand_set == 'y'): C_blackbox.append( C_overlap[-1]) # Add last output of ob blackbox_counter += 1 # For skipping block_table function in first time # Loop for adding more columns/attributes into BlackBox blocker while (True): # Set function bb.set_black_box_function( number_10_percent_comparision) # First time using Overlap blocker, use A and B if (overlap_counter == 0): # Block on A and B C_blackbox.append( bb.block_tables(A, B, l_output_attrs=out_attr, r_output_attrs=out_attr, l_output_prefix=l_prefix, r_output_prefix=r_prefix, n_jobs=-1, show_progress=False)) # Not first time, add new constraint into previous candidate set else: # Block on previous (last=-1) candidate set C_blackbox.append( bb.block_candset(C_blackbox[-1], n_jobs=-1, show_progress=False)) # DEBUG BLOCKING print("\n- BlackBox Blocker Debugging...\n") # Debug last blocker output dbg = em.debug_blocker(C_blackbox[-1], A, B, output_size=200, n_jobs=-1) # Display first few tuple pairs from the debug_blocker's output print("\n- Blocking debug results:") print(dbg.head()) blackbox_counter += 1 # Increase the counter # Continue to use Attribute Equivalence Blocker or not print("\n- Length of candidate set: " + str(len(C_blackbox[-1]))) print( "- Add another column into BlackBox Blocker[a] OR Reset last blocker's output[r]:" ) bb_next_operation = input() if (not bb_next_operation.islower()): bb_next_operation = bb_next_operation.lower( ) # Lower case # Continue using Overlap Blocker if (bb_next_operation == 'a'): continue # Reset/remove last blocker's output from candidate set list elif (bb_next_operation == 'r'): del C_blackbox[-1] print("\n- Last blocker output removed!") print( "- Continue to use BlackBox Blocker (y or n):") bb_next_operation = input() if (bb_next_operation == 'n'): break # Finish BlackBox Blocker else: break print("\n- Do you want to add/use another blocker? (y or n):") blocker_decision = input() if (blocker_decision == 'n'): break print( "\n- Which blocker output you want to use? (Attr Equ-ab, Overlap-ob, BlackBox-bb, Union-un)" ) blocker_output_selection = input() # Attribute Equ if (blocker_output_selection == "ab"): C = C_attr_eq[-1] # Overlap elif (blocker_output_selection == "ob"): C = C_overlap[-1] # Overlap elif (blocker_output_selection == "bb"): C = C_blackbox[-1] # Union of blockers elif (blocker_output_selection == "un"): # Combine/union blockers candidate sets print("\n- TODO: Unions Attr Equ and Overlap only!") if (C_attr_eq and C_overlap and not C_attr_eq[-1].empty and not C_overlap[-1].empty): # Both blocker types used C = em.combine_blocker_outputs_via_union( [C_attr_eq[-1], C_overlap[-1]]) print( "\n- Blockers candidate set outputs combined via union." ) else: # Error C = [] print( "\n- ERROR: Candidate set C is empty! Check blockers' results." ) # Error else: C = [] print( "\n- ERROR: Candidate set C is empty! Check blockers' results." ) print("\n- Length of C: " + str(len(C))) else: print( "\n- 2 Tables column names are different, they must be the same" ) print(list(A.columns)) print(list(B.columns)) # SAMPLING&LABELING print("\n-------------SAMPLING&LABELING-------------\n") print("- Choose sampling size (eg. 450):") sampling_size = input() while (int(sampling_size) > len(C)): print("- Sampling size cannot be bigger than " + str(len(C))) sampling_size = input() # Sample candidate set S = em.sample_table(C, int(sampling_size)) print("- New window will pop-up for " + sampling_size + " sized table.") print("- If there is a match, change tuple's label value to 1") # Label S G = em.label_table(S, 'label') #DEVELOPMENT AND EVALUATION print("\n-------------DEVELOPMENT AND EVALUATION-------------\n") # Split S into development set (I) and evaluation set (J) IJ = em.split_train_test(G, train_proportion=0.7, random_state=0) I = IJ['train'] J = IJ['test'] #SELECTING THE BEST MATCHER print("\n-------------SELECTING THE BEST MATCHER-------------\n") # Create a set of ML-matchers dt = em.DTMatcher(name='DecisionTree', random_state=0) svm = em.SVMMatcher(name='SVM', random_state=0) rf = em.RFMatcher(name='RF', random_state=0) lg = em.LogRegMatcher(name='LogReg', random_state=0) ln = em.LinRegMatcher(name='LinReg') nb = em.NBMatcher(name='NaiveBayes') print( "\n- 6 different ML-matchers created: DL, SVM, RF, LogReg, LinReg, NB") print("\n- Creating features...") # Generate features feature_table = em.get_features_for_matching( A, B, validate_inferred_attr_types=False) print("\n- Features list:") # List the names of the features generated print(feature_table['feature_name']) print("\n- Converting the development set to feature vectors...") # Convert the I into a set of feature vectors using feature_table H = em.extract_feature_vecs(I, feature_table=feature_table, attrs_after='label', show_progress=False) print("\n- Feature table first rows:") # Display first few rows print(H.head()) # Primary key of tables = prefix + pk = l_id, r_id ltable_pk = l_prefix + pk_A rtable_pk = r_prefix + pk_B # Check if the feature vectors contain missing values # A return value of True means that there are missing values is_missing_values = any(pd.notnull(H)) print("\n- Does feature vector have missing values: " + str(is_missing_values)) if (is_missing_values): # Impute feature vectors with the mean of the column values. H = em.impute_table( H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], strategy='mean', val_all_nans=0.0) #print("\n- Feature table first rows:") # Display first few rows #print(H.head()) print("- Impute table function used for missing values.") print("\n- Selecting the best matcher using cross-validation...") # Select the best ML matcher using CV result = em.select_matcher( matchers=[dt, rf, svm, ln, lg, nb], table=H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], k=5, target_attr='label', metric_to_select_matcher='f1', random_state=0) print("\n- Results:") print(result['cv_stats']) #DEBUGGING THE MATCHER print("\n-------------DEBUGGING THE MATCHER-------------\n") # Split feature vectors into train and test UV = em.split_train_test(H, train_proportion=0.5) U = UV['train'] V = UV['test'] # Debug decision tree using GUI em.vis_debug_rf(rf, U, V, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], target_attr='label') print("\n- Do you want to add another feature?") H = em.extract_feature_vecs(I, feature_table=feature_table, attrs_after='label', show_progress=False) # Check if the feature vectors contain missing values # A return value of True means that there are missing values is_missing_values = any(pd.notnull(H)) print("\n- Does feature vector have missing values: " + str(is_missing_values)) if (is_missing_values): # Impute feature vectors with the mean of the column values. H = em.impute_table( H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], strategy='mean') print("\n- Feature table first rows:") # Display first few rows print(H.head()) # Select the best ML matcher using CV result = em.select_matcher( [dt, rf, svm, ln, lg, nb], table=H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], k=5, target_attr='label', metric_to_select_matcher='f1', random_state=0) print("\n- Results:") print(result['cv_stats']) #EVALUATING THE MATCHING OUTPUT print("\n-------------EVALUATING THE MATCHING OUTPUT-------------\n") print("\n- Converting the evaluation set to feature vectors...") # Convert J into a set of feature vectors using feature table L = em.extract_feature_vecs(J, feature_table=feature_table, attrs_after='label', show_progress=False) # Check if the feature vectors contain missing values # A return value of True means that there are missing values is_missing_values = any(pd.notnull(L)) print("\n- Does feature vector have missing values: " + str(is_missing_values)) if (is_missing_values): # Impute feature vectors with the mean of the column values. L = em.impute_table( L, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], strategy='mean') print("\n- Feature table first rows:") # Display first few rows print(L.head()) print("\n- Training the selected matcher...") # Train using feature vectors from I rf.fit(table=H, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], target_attr='label') print("\n- Predicting the matches...") # Predict on L predictions = rf.predict( table=L, exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'], append=True, target_attr='predicted', inplace=False) print("\n- Evaluating the prediction...") # Evaluate the predictions eval_result = em.eval_matches(predictions, 'label', 'predicted') print(em.print_eval_summary(eval_result)) print("\n- Time elapsed:") print(datetime.now() - startTime) print("\n-------------END-------------\n")