def main():
    """Block, label, train and evaluate a decision-tree matcher on two CSVs.

    Reads ``../Data/A_imdb.csv`` and ``../Data/B_tmdb.csv``, blocks them on the
    ``directors`` attribute, samples 100 candidate pairs for labelling, splits
    the labelled pairs 50/50 into train and test, fits a depth-5 decision tree
    on the training half and prints the evaluation summary for the test half.
    """
    A = em.read_csv_metadata('../Data/A_imdb.csv', key='id')
    B = em.read_csv_metadata('../Data/B_tmdb.csv', key='id')

    # Blocking on `directors`; the shared attributes are carried into C.
    shared_attributes = ['title', 'directors', 'release_year', 'languages']
    blocker = em.AttrEquivalenceBlocker()
    C = blocker.block_tables(A, B, 'directors', 'directors',
                             l_output_attrs=shared_attributes,
                             r_output_attrs=shared_attributes)

    # Take a sample of 100 pairs and label it.
    S = em.sample_table(C, 100)
    print(S)
    G = em.label_table(S, label_column_name='gold_labels')

    split = em.split_train_test(G, train_proportion=0.5)
    train = split['train']
    test = split['test']

    # Columns that travel with the feature vectors but are not features.
    title_cols = ['ltable_title', 'rtable_title']
    non_feature_cols = ['_id', 'ltable_id', 'rtable_id'] + title_cols + ['gold_labels']

    # Feature vectors for the training half; missing values become 0.
    match_f = em.get_features_for_matching(A, B)
    H = em.extract_feature_vecs(train,
                                attrs_before=title_cols,
                                feature_table=match_f,
                                attrs_after=['gold_labels'])
    H.fillna(value=0, inplace=True)
    print(H)

    # Train a decision tree matcher.
    dt = em.DTMatcher(max_depth=5)
    dt.fit(table=H, exclude_attrs=non_feature_cols, target_attr='gold_labels')

    # Feature vectors for the test half, built the same way as H.
    F = em.extract_feature_vecs(test,
                                attrs_before=title_cols,
                                feature_table=match_f,
                                attrs_after=['gold_labels'])
    F.fillna(value=0, inplace=True)
    print(F)

    # Predictions and probabilities are appended to F in place.
    pred_table = dt.predict(table=F,
                            exclude_attrs=non_feature_cols,
                            target_attr='predicted_labels',
                            return_probs=True,
                            probs_attr='proba',
                            append=True,
                            inplace=True)
    print(pred_table)

    eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
    em.print_eval_summary(eval_summary)
def compute_accuracy_J(matcher, return_probs_arg, H, J):
    """Fit ``matcher`` on H, predict on J and print the evaluation summary.

    Args:
        matcher: matcher object exposing ``fit`` and ``predict``.
        return_probs_arg: forwarded to ``predict`` as ``return_probs``.
        H: feature-vector table used for training; must contain ``label``.
        J: labelled pair table that is converted to feature vectors here.

    Note:
        The feature table ``F`` is not a parameter and is not assigned in this
        function; it has to exist in an enclosing scope.
    """
    non_feature_cols = ['_id', 'ltable_ID', 'rtable_ID', 'label']

    # Train on the supplied feature vectors.
    matcher.fit(table=H, exclude_attrs=non_feature_cols, target_attr='label')

    # Turn J into feature vectors with F and mean-impute missing values.
    test_vectors = em.extract_feature_vecs(J, feature_table=F,
                                           attrs_after='label',
                                           show_progress=False)
    test_vectors = em.impute_table(test_vectors,
                                   exclude_attrs=non_feature_cols,
                                   strategy='mean')

    # Predict on a copy (inplace=False) with the prediction columns appended.
    predicted = matcher.predict(table=test_vectors,
                                exclude_attrs=non_feature_cols,
                                append=True,
                                target_attr='predicted',
                                inplace=False,
                                return_probs=return_probs_arg,
                                probs_attr='proba')

    # Compare predictions against the gold labels and print the summary.
    em.print_eval_summary(em.eval_matches(predicted, 'label', 'predicted'))
def predict(self, dataset, impute_value=0):
    """Return the match score of every row of ``dataset``.

    A copy of ``dataset`` is split into a left and a right table (columns
    ``self.lcolumns`` / ``self.rcolumns``, both renamed to ``self.columns``),
    registered with ``em`` as a candidate set, converted to feature vectors
    with ``self.feature_table`` and scored by ``self.model``.

    Args:
        dataset: DataFrame of pairs; it is copied, not modified.
        impute_value: value substituted for missing feature values.

    Returns:
        The ``match_score`` column of the prediction table as an array.

    Side effects:
        Stores the feature vectors in ``self.exctracted_features`` and the
        prediction table in ``self.predictions``.
    """
    pairs = dataset.copy()
    with io.capture_output() as captured:
        # Every row gets the same running number as pair id and as both
        # foreign keys, so row i of the pair table points at row i of the
        # left and right tables.
        row_ids = np.arange(pairs.shape[0])
        for id_col in ('id', 'left_id', 'right_id'):
            pairs[id_col] = row_ids

        left_table = pairs[self.lcolumns].copy()
        left_table.columns = self.columns
        right_table = pairs[self.rcolumns].copy()
        right_table.columns = self.columns

        # Register keys and table relationships in the em catalog.
        for table in (pairs, left_table, right_table):
            em.set_key(table, 'id')
        em.set_ltable(pairs, left_table)
        em.set_rtable(pairs, right_table)
        em.set_fk_ltable(pairs, 'left_id')
        em.set_fk_rtable(pairs, 'right_id')

        vectors = em.extract_feature_vecs(pairs, feature_table=self.feature_table)
        self.exctracted_features = vectors
        self.exctracted_features = self.exctracted_features.fillna(impute_value)

        # Only exclude attributes that actually exist in the feature vectors.
        present = set(self.exctracted_features.columns)
        exclude_tmp = list(set(self.exclude_attrs) & present)

        self.predictions = self.model.predict(table=self.exctracted_features,
                                              exclude_attrs=exclude_tmp,
                                              return_probs=True,
                                              target_attr='pred',
                                              probs_attr='match_score',
                                              append=True)

    # Drop the local references before forcing a collection.
    del pairs
    del captured
    gc.collect()
    return self.predictions['match_score'].values
def extract_features_auto(ltable_df, rtable_df, candset_df):
    """Build feature vectors for ``candset_df`` from auto-generated features.

    Args:
        ltable_df: left table.
        rtable_df: right table.
        candset_df: candidate pairs; must carry a ``gold`` column, which is
            appended after the features.

    Returns:
        The feature-vector table with missing values replaced by 0.
    """
    features = em.get_features_for_matching(ltable_df, rtable_df,
                                            validate_inferred_attr_types=False)

    # Features computed on the id attribute are dropped - they are often useless.
    not_on_id = features.left_attribute != 'id'
    features = features[not_on_id]

    print("\n\nExtracting the full set of features:")
    vectors = em.extract_feature_vecs(candset_df,
                                      feature_table=features,
                                      attrs_after='gold',
                                      show_progress=True)
    # Filled in place so the returned object is the one em produced.
    vectors.fillna(value=0, inplace=True)
    return vectors
def get_feature_vectors(C, feature_table, attrs_before=None, attrs_after=None):
    """Extract feature vectors for candidate set ``C`` and zero-fill NaNs.

    Args:
        C: candidate pair table.
        feature_table: features to compute.
        attrs_before: forwarded to ``em.extract_feature_vecs``.
        attrs_after: forwarded to ``em.extract_feature_vecs``.

    Returns:
        The feature-vector table, with NaNs replaced by 0 in place.
    """
    extraction_options = {
        'feature_table': feature_table,
        'attrs_before': attrs_before,
        'attrs_after': attrs_after,
        'show_progress': True,
        'n_jobs': -1,
    }
    feature_vectors = em.extract_feature_vecs(C, **extraction_options)
    feature_vectors.fillna(0, inplace=True)
    return feature_vectors
def predict_matching_tuples(A, B, C, G):
    """Train a logistic-regression matcher on half of G and apply it to C.

    The ids of the pairs predicted as matches are written to
    ``FOLDER + 'predictedMatchedIDs.csv'`` so that training and prediction do
    not have to be repeated on every run. Nothing is returned.

    Args:
        A: left table.
        B: right table.
        C: candidate pairs to classify.
        G: labelled pairs with a ``label`` column.
    """
    pair_keys = ['_id', 'l_id', 'r_id']
    excluded_attributes = pair_keys + ['label']

    # Only the training half (I) of the 50/50 split of G is used here.
    train_test = em.split_train_test(G, train_proportion=0.5, random_state=0)
    I = train_test['train']

    # Feature set F, and I converted to mean-imputed feature vectors.
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    H = em.extract_feature_vecs(I, feature_table=F, attrs_after='label',
                                show_progress=False)
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')

    # Logistic regression - the best matcher from stage3.
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')

    # Same conversion for the unlabelled candidate set C.
    L = em.extract_feature_vecs(C, feature_table=F, show_progress=False)
    L = em.impute_table(L, exclude_attrs=['_id', 'l_id', 'r_id'], strategy='mean')

    predictions = lg.predict(table=L,
                             exclude_attrs=pair_keys,
                             append=True,
                             target_attr='predicted',
                             inplace=False,
                             return_probs=False,
                             probs_attr='proba')

    # Keep only the id columns of the pairs predicted as matches.
    is_match = predictions['predicted'] == 1
    matched_ids = predictions[is_match][['l_id', 'r_id']]
    matched_ids.to_csv(FOLDER + 'predictedMatchedIDs.csv', index=False)
def extract_features(ltable_df, rtable_df, candset_df):
    """Build feature vectors for ``candset_df`` from a filtered feature set.

    Attribute types of corresponding left/right attributes are first made
    equal, then features are generated, and features on ``id`` as well as
    features whose name contains a distance or non-normalized similarity
    function are discarded.

    Args:
        ltable_df: left table.
        rtable_df: right table.
        candset_df: candidate pairs; must carry a ``gold`` column.

    Returns:
        The feature-vector table with missing values replaced by 0.
    """
    tokenizers = em.get_tokenizers_for_matching()
    sim_functions = em.get_sim_funs_for_matching()
    left_attr_types = em.get_attr_types(ltable_df)
    right_attr_types = em.get_attr_types(rtable_df)
    correspondences = em.get_attr_corres(ltable_df, rtable_df)

    attribute_type_rank = {
        'boolean': 1,
        'numeric': 2,
        'str_eq_1w': 3,
        'str_bt_1w_5w': 4,
        'str_bt_5w_10w': 5,
        'str_gt_10w': 6,
        'un_determined': 7,
    }

    # When the two sides disagree on a type, both take the higher-ranked one.
    for pair in correspondences['corres']:
        left_attr, right_attr = pair[0], pair[1]
        left_type = left_attr_types[left_attr]
        right_type = right_attr_types[right_attr]
        if left_type == right_type:
            continue
        if attribute_type_rank[left_type] < attribute_type_rank[right_type]:
            left_attr_types[left_attr] = right_type
        else:
            right_attr_types[right_attr] = left_type

    feature_records = get_features(ltable_df, rtable_df, left_attr_types,
                                   right_attr_types, correspondences,
                                   tokenizers, sim_functions)

    # Remove all features based on id - they are often useless.
    feature_records = feature_records[feature_records.left_attribute != 'id']
    feature_records = feature_records.reset_index(drop=True)

    # A feature is dropped when any of these function names occurs as a
    # substring of its feature name.
    distance_functions = ["lev_dist", "rdf"]
    non_normalized_functions = ["aff", "sw", "swn", "nmw"]
    unwanted = distance_functions + non_normalized_functions
    keep_features = [
        not any(func in feature_name for func in unwanted)
        for feature_name in feature_records["feature_name"]
    ]
    feature_records = feature_records.loc[keep_features, :]

    print("\n\nExtracting the full set of features:")
    candset_features_df = em.extract_feature_vecs(candset_df,
                                                  feature_table=feature_records,
                                                  attrs_after='gold',
                                                  show_progress=True,
                                                  n_jobs=-1)
    candset_features_df.fillna(value=0, inplace=True)
    return candset_features_df
key='_id', ltable=A, rtable=B, fk_ltable='ltable_id', fk_rtable='rtable_id') print('Length is ' + str(len(G))) rf = em.RFMatcher(name='RF', random_state=0) feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False) # Remove the id comparisons feature_table = feature_table.drop([0, 1, 2, 3], axis=0) H = em.extract_feature_vecs(G, feature_table=feature_table, attrs_after='gold', show_progress=False) def RS(proba, batch_size): return np.random.choice(range(proba.shape[0]),batch_size,replace=False) def LC(proba, batch_size): return np.argsort(np.max(proba,axis=1))[:batch_size] def BT(proba, batch_size): sorted_proba = np.sort(proba,axis=1) return np.argsort(sorted_proba[:,-1]-sorted_proba[:,-2])[:batch_size] Features = em.extract_feature_vecs(G, feature_table=feature_table, attrs_after='gold', show_progress=False)
# Benchmark script: reports memory use and wall-clock time of
# em.extract_feature_vecs on a candidate set read from CSV.
import logging

logging.basicConfig(level=logging.INFO)

# Memory reading taken before any table is loaded.
print("Mem. usage before reading:{0} (GB)".format(
    psutil.virtual_memory().used / 1e9))

# The same CSV file is read twice, once as the left and once as the right table.
A = em.read_csv_metadata('../datasets/sample_msd_100k.csv', key='id')
B = em.read_csv_metadata('../datasets/sample_msd_100k.csv', key='id')
C = em.read_csv_metadata('../datasets/candset_msd_300k.csv', key='_id', ltable=A, rtable=B, fk_ltable='l_id', fk_rtable='r_id')
print("Mem. usage after reading:{0} (GB)".format(psutil.virtual_memory().used / 1e9))
len(C)  # bare expression: the value is discarded when run as a script

# NOTE(review): these two readings are overwritten below before they are
# used, so feature-table generation is not part of the reported figures.
memUsageBefore = psutil.virtual_memory().used / 1e9
timeBefore = time.time()
feature_table = em.get_features_for_matching(A, B)

# Measurement window: only the extract_feature_vecs call.
memUsageBefore = psutil.virtual_memory().used / 1e9
timeBefore = time.time()
feature_vecs = em.extract_feature_vecs(C, feature_table=feature_table)
timeAfter = time.time()
memUsageAfter = psutil.virtual_memory().used / 1e9

# The first figure is the reading taken just before extraction (after the
# tables and the feature table exist), despite the "after reading" wording.
print(
    'Mem.usage (after reading): {0} (GB), Mem.usage (after extract featvecs): {1} (GB), diff: {2} (GB)'
    .format(memUsageBefore, memUsageAfter, memUsageAfter - memUsageBefore))
print('Time. diff: {0} (secs)'.format(timeAfter - timeBefore))
def automatic_feature_gen(candidate_table, feature_cols, id_names, id_names_phrase):
    '''
    Generate pairwise matching features for a table of candidate pairs.

    NB!
    The automatic function creates pairwise features.
    Consequently, it will convert internally the colnames in lhs and rhs
    portions of feature cols to the SAME name. It does this by removing the
    `id_names_phrase` portion (suffix or prefix) from each column name with
    `re.sub`, so the phrase is interpreted as a regular expression.
    It assumes that the id names are of the form id_{id_names_phrase}
    e.g. id_amzn

    Takes in a single DataFrame object (lhs_table and rhs_table concatenated)
    and splits it into two tables, then generates features on each of the
    sub tables. Missing feature values are imputed with the column mean and
    any value still missing afterwards is set to 0.

    The em catalog is cleared (`em.del_catalog()`) at the start of the call.
    The caller's `candidate_table` is not modified: all work happens on the
    frame returned by `reset_index()`.

    Inputs:
            candidate_table: single Pandas DataFrame (typically output of
                             blocking_algorithms.py functions)
            feature_cols: pair of lists, the feature column names of the
                          lhs (index 0) and rhs (index 1) side
            id_names: pair of id column names, lhs first, rhs second
            id_names_phrase: pair of patterns removed from the lhs / rhs
                             feature column names

    Outputs:
            DataFrame of feature vectors with the key columns "index",
            "index_num_lhs", "index_num_rhs", one column per generated
            feature, and the two id columns named in `id_names`.
    '''
    em.del_catalog()
    candidate_table = candidate_table.reset_index()

    # Explicit copies: both sub tables get renamed and extended below.
    lhs_table = candidate_table.loc[:, feature_cols[0] + [id_names[0]]].copy()
    rhs_table = candidate_table.loc[:, feature_cols[1] + [id_names[1]]].copy()

    # Strip the side-specific phrase so that both sides share column names;
    # the id columns keep their original names.
    lhs_table.columns = [
        colname if colname == id_names[0]
        else re.sub(id_names_phrase[0], "", colname)
        for colname in lhs_table
    ]
    rhs_table.columns = [
        colname if colname == id_names[1]
        else re.sub(id_names_phrase[1], "", colname)
        for colname in rhs_table
    ]

    # To circumvent the same product ID coming up again (due to it being in
    # multiple candidate comparisons) a running row number is used as key.
    lhs_table["index_num_lhs"] = np.arange(lhs_table.shape[0])
    rhs_table["index_num_rhs"] = np.arange(rhs_table.shape[0])
    em.set_key(lhs_table, "index_num_lhs")  # changed from id_names
    em.set_key(rhs_table, "index_num_rhs")

    # Generate the list of features; the id columns are left out so that no
    # feature is computed on them.
    matching_features = em.get_features_for_matching(
        lhs_table.drop(id_names[0], axis=1),
        rhs_table.drop(id_names[1], axis=1),
        validate_inferred_attr_types=False)

    # Primary key and foreign keys of the candidate table: row i of the
    # candidate table refers to row i of both sub tables.
    candidate_table["index"] = np.arange(candidate_table.shape[0])
    candidate_table["index_num_lhs"] = np.arange(lhs_table.shape[0])
    candidate_table["index_num_rhs"] = np.arange(rhs_table.shape[0])
    em.set_key(candidate_table, "index")
    em.set_fk_ltable(candidate_table, "index_num_lhs")
    em.set_fk_rtable(candidate_table, "index_num_rhs")
    em.set_ltable(candidate_table, lhs_table)
    em.set_rtable(candidate_table, rhs_table)

    # Extract feature vectors and mean-impute missing values.
    matching_features_df = em.extract_feature_vecs(
        candidate_table,
        feature_table=matching_features,
        show_progress=False)
    matching_features_df = em.impute_table(
        matching_features_df,
        exclude_attrs=['index', "index_num_lhs", "index_num_rhs"],
        strategy='mean')

    # Add back the original ids under the names the caller supplied
    # (previously hard-coded to "id_amzn" / "id_g").
    matching_features_df[id_names[0]] = candidate_table[id_names[0]]
    matching_features_df[id_names[1]] = candidate_table[id_names[1]]

    matching_features_df = matching_features_df.fillna(value=0)
    return matching_features_df
match_f.drop([13, 14, 15, 16], inplace=True) # In[114]: # List the names of the features generated match_f['feature_name'] # Converting the development set to feature vectors # ------------------ # In[116]: # Convert the I into a set of feature vectors using F H = em.extract_feature_vecs(I, feature_table=match_f, attrs_after=['gold_labels']) # In[117]: ## Display first three rows H.head(3) # Selecting the best matcher using cross-validation # ------------------ # Now, we select the best matcher using k-fold cross-validation. # For the purposes of this guide, we use ten fold cross validation and use 'precision' and 'recall' metric to select the best matcher # In[120]:
def workflow(path_A, path_B, path_labeled):
    """Block two product tables, train on labelled pairs, return the matches.

    Args:
        path_A: CSV path of the left table (key column ``ID``).
        path_B: CSV path of the right table (key column ``ID``).
        path_labeled: CSV path of the labelled pairs (key ``_id``, foreign
            keys ``ltable_ID`` / ``rtable_ID``, label column ``gold``).

    Returns:
        The rows of the prediction table with ``predicted == 1``, restricted
        to the columns ``_id``, ``ltable_ID``, ``rtable_ID`` and ``predicted``.
    """
    A = em.read_csv_metadata(path_A, key='ID')
    B = em.read_csv_metadata(path_B, key='ID')

    # --- Blocking, stage 1: attribute equivalence on Brand ---------------
    output_attrs = [
        'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
        'Hard Drive Capacity', 'Processor Type', 'Processor Speed',
        'Operating System', 'Clean Name',
    ]
    brand_blocker = em.AttrEquivalenceBlocker()
    C1 = brand_blocker.block_tables(A, B, 'Brand', 'Brand',
                                    l_output_attrs=output_attrs,
                                    r_output_attrs=list(output_attrs))

    # --- Blocking, stage 2: rule on the jaccard score of Clean Name ------
    block_f = em.get_features_for_blocking(A, B,
                                           validate_inferred_attr_types=False)
    name_rule_blocker = em.RuleBasedBlocker()
    name_rule_blocker.add_rule(
        ['Clean_Name_Clean_Name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.2'],
        block_f)
    C2 = name_rule_blocker.block_candset(C1)

    # --- Blocking, stage 3: black box on screen size, RAM and hard drive -
    spec_blocker = em.BlackBoxBlocker()
    spec_blocker.set_black_box_function(screen_ram_hd_equal)
    C = spec_blocker.block_candset(C2)

    # Labelled data used for training.
    L = em.read_csv_metadata(path_labeled, key='_id', ltable=A, rtable=B,
                             fk_ltable='ltable_ID', fk_rtable='rtable_ID')

    # Feature rows 4-9 and 40 onwards, plus the 'refurbished' black box feature.
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    selected_rows = np.r_[4:10, 40:len(feature_table)]
    feature_subset = feature_table.iloc[selected_rows, :]
    em.add_blackbox_feature(feature_subset, 'refurbished', refurbished)

    key_attrs = ['_id', 'ltable_ID', 'rtable_ID']
    labeled_exclude = key_attrs + ['gold']

    # Mean-imputed feature vectors of the labelled data.
    feature_vectors_dev = em.extract_feature_vecs(L,
                                                  feature_table=feature_subset,
                                                  attrs_after='gold')
    feature_vectors_dev = em.impute_table(feature_vectors_dev,
                                          exclude_attrs=labeled_exclude,
                                          strategy='mean')

    # Random forest trained on the labelled feature vectors.
    matcher = em.RFMatcher(name='RF')
    matcher.fit(table=feature_vectors_dev,
                exclude_attrs=list(labeled_exclude),
                target_attr='gold')

    # Mean-imputed feature vectors of the blocked candidate set.
    feature_vectors = em.extract_feature_vecs(C, feature_table=feature_subset)
    feature_vectors = em.impute_table(feature_vectors,
                                      exclude_attrs=key_attrs,
                                      strategy='mean')

    # Predict on a copy of the candidate feature vectors.
    predictions = matcher.predict(table=feature_vectors,
                                  exclude_attrs=list(key_attrs),
                                  append=True,
                                  target_attr='predicted',
                                  inplace=False)

    id_and_label = predictions.loc[:, ['_id', 'ltable_ID', 'rtable_ID',
                                       'predicted']]
    return id_and_label[id_and_label['predicted'] == 1]
IJ = em.split_train_test(G, train_proportion=0.6, random_state=0); I = IJ['train']; J = IJ['test']; # Create a set of ML-matchers dt = em.DTMatcher(name='DecisionTree', random_state=0); rf = em.RFMatcher(name='Random Forest', random_state=0); svm = em.SVMMatcher(name='SVM', random_state=0); nb = em.NBMatcher(name='Naive Bayes'); lg = em.LogRegMatcher(name='Logistic Reg', random_state=0); ln = em.LinRegMatcher(name='Linear Reg'); F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False); H = em.extract_feature_vecs(I, feature_table=F, attrs_after='gold_labels', show_progress=False) H = em.impute_table(H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'], strategy='mean'); # print(any(pd.notnull(H))); result = em.select_matcher([dt, rf, svm, nb, lg, ln], table=H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'], k=5, target_attr='gold_labels', metric_to_select_matcher='f1', random_state=0); print(result['cv_stats']);
# Initialising all ML algos dt = em.DTMatcher(name='DecisionTree', random_state=0) svm = em.SVMMatcher(name='SVM', random_state=0) rf = em.RFMatcher(name='RF', random_state=0) lg = em.LogRegMatcher(name='LogReg', random_state=0) ln = em.LinRegMatcher(name='LinReg') # Generating features for training features = em.get_features_for_matching(A, B, validate_inferred_attr_types=False) # Extracting feature vectors to train and create model H = em.extract_feature_vecs(train_set, feature_table=features, attrs_after='label', show_progress=False) H.head() # Checking if any value is null any(pd.notnull(H)) # We found null values. Hence, used impute_table to fill up the other values with strategy - mean. H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], strategy='mean') # Running select matcher step to run all possible algos and pick the best ML result = em.select_matcher( [dt, rf, svm, ln, lg],
ltable=metacriticData, rtable=wikiData, fk_ltable="ltable_ID", fk_rtable="rtable_ID") print("Reading I and J from files") print(len(I)) print(len(J)) # Generate a set of features F = em.get_features_for_matching(metacriticData, wikiData, validate_inferred_attr_types=False) # Convert the I into a set of feature vectors using F H = em.extract_feature_vecs(I, feature_table=F, attrs_after='label', show_progress=False) # create learners random_state = 0 dt = em.DTMatcher(name='DecisionTree', random_state=random_state) rf = em.RFMatcher(name='RF', random_state=random_state) svm = em.SVMMatcher(name='SVM', random_state=random_state) ln = em.LinRegMatcher(name='LinReg') lg = em.LogRegMatcher(name='LogReg', random_state=random_state) nb = em.NBMatcher(name='NaiveBayes') # Impute feature vectors with the mean of the column values. H = em.impute_table(H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'],
# # First, we obtain all the features we could use for matching. Ft is our feature table # In[260]: Ft = em.get_features_for_matching(A, B, validate_inferred_attr_types=False) # Use the system to generate feature vectors from set I. This is called set H # In[261]: H = em.extract_feature_vecs(I, feature_table=Ft, attrs_after='label', show_progress=False) # Perform matches and display results below (after performing cross-validation) # In[262]: H = em.impute_table(H, exclude_attrs=['_id', 'ltable_Id', 'rtable_Id', 'label'], strategy='mean') # In[161]:
# prepare classifiers dt = em.DTMatcher(name='DecisionTree', random_state=0) svm = em.SVMMatcher(name='SVM', kernel='linear', random_state=0) rf = em.RFMatcher(name='RF', random_state=0) lg = em.LogRegMatcher(name='LogReg', random_state=0) ln = em.LinRegMatcher(name='LinReg') nb = em.NBMatcher(name='NaiveBayes') # need A and B csv files feature_table = em.get_features_for_matching( A, B, validate_inferred_attr_types=False) print(feature_table.feature_name) H = em.extract_feature_vecs(I, feature_table=feature_table, attrs_after='label', show_progress=False) H.fillna(value=0, inplace=True) # select best matcher # precision result = em.select_matcher( [dt, svm, rf, lg, ln, nb], table=H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'label'], k=5, target_attr='label', metric_to_select_matcher='precision', random_state=0) # recall
] l_attr_types = em.get_attr_types(kaggle_data) r_attr_types = em.get_attr_types(imdb_data) tok = em.get_tokenizers_for_matching() sim = em.get_sim_funs_for_matching() F = em.get_features(kaggle_data, imdb_data, l_attr_types, r_attr_types, attr_corres, tok, sim) # Given the set of desired features **F**, we can now calculate the feature values for our training data and also impute the missing values in our data. In this case, we choose to replace the missing values with the mean of the column. # In[28]: train_features = em.extract_feature_vecs(train_data, feature_table=F, attrs_after='label', show_progress=False) train_features = em.impute_table(train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], strategy='mean') # Using the calculated features, we can evaluate the performance of different machine learning algorithms and select the best one for our matching task. # In[29]: result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=train_features, exclude_attrs=['_id', 'l_id', 'r_id', 'label'], k=5, target_attr='label', metric='f1', random_state=0) result['cv_stats'] # We can observe based on the reported accuracy of different techniques that the "random forest (RF)" algorithm achieves the best performance. Thus, it is best to use this technique for the matching.
B.iloc[:, 1:8], validate_inferred_attr_types=False) # Create a feature on the value of (price + rating), then compute Levenshtein similarity sim = em.get_sim_funs_for_matching() tok = em.get_tokenizers_for_matching() feature_string = """lev_sim(wspace(float(ltuple['price']) + float(ltuple['rating'])), wspace(float(rtuple['price']) + float(rtuple['rating'])))""" feature = em.get_feature_fn(feature_string, sim, tok) # Add feature to F em.add_feature(F, 'lev_ws_price+rating', feature) # Convert the sample set into a set of feature vectors using F H = em.extract_feature_vecs(G, feature_table=F, attrs_after='labe', show_progress=False) # impute missing values H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'labe'], strategy='mean') # Fit a Naive Bayes matcher matcher = em.NBMatcher(name='NaiveBayes') matcher.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'labe'], target_attr='labe') # Apply matcher to the whole dataset Ht = em.extract_feature_vecs(C, feature_table=F, show_progress=False)
fk_rtable='rtable_id') print('Number of tuples in Labelled: ' + str(len(G))) feature_table = em.get_features_for_matching( A, B, validate_inferred_attr_types=False) # Select the attrs. to be included in the feature vector table attrs_from_table = [ 'ltable_name', 'ltable_addr', 'ltable_city', 'ltable_phone', 'rtable_name', 'rtable_addr', 'rtable_city', 'rtable_phone' ] H = em.extract_feature_vecs(G, feature_table=feature_table, attrs_before=attrs_from_table, attrs_after='gold', show_progress=False) rf = em.RFMatcher() attrs_to_be_excluded = [] attrs_to_be_excluded.extend(['_id', 'ltable_id', 'rtable_id', 'gold']) attrs_to_be_excluded.extend(attrs_from_table) rf.fit(table=H, exclude_attrs=attrs_to_be_excluded, target_attr='gold') L = em.extract_feature_vecs(C, feature_table=feature_table, attrs_before=attrs_from_table, show_progress=False,
def main():
    """Compare six matchers by cross-validation on I and by accuracy on J.

    Reads A, B and the labelled set G from ``FOLDER``, splits G 50/50 into
    I and J (both saved as CSV), prints 10-fold cross-validation statistics
    for all matchers on I, then trains every matcher on I and prints its
    evaluation summary on J.
    """
    # Read in data files
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    G = em.read_csv_metadata(FOLDER + 'G.csv', key='_id', ltable=A, rtable=B,
                             fk_ltable='l_id', fk_rtable='r_id')  # labeled data

    # Split G into I and J and persist both halves.
    IJ = em.split_train_test(G, train_proportion=0.5, random_state=0)
    I, J = IJ['train'], IJ['test']
    I.to_csv(FOLDER + 'I.csv', index=False)
    J.to_csv(FOLDER + 'J.csv', index=False)

    # Feature set F, and I as mean-imputed feature vectors H.
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    H = em.extract_feature_vecs(I, feature_table=F, attrs_after='label',
                                show_progress=False)
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')

    # Create a set of matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')

    # Selecting best matcher with CV using F1-score as criteria
    CV_result = em.select_matcher([dt, rf, svm, ln, lg, nb],
                                  table=H,
                                  exclude_attrs=excluded_attributes,
                                  k=10,
                                  target_attr='label',
                                  metric_to_select_matcher='f1',
                                  random_state=0)
    print(CV_result['cv_stats'])  # RF is the best matcher

    # Order in which the matchers are trained, applied and reported.
    matchers = (dt, rf, svm, lg, ln, nb)

    # Train matchers on H
    for matcher in matchers:
        matcher.fit(table=H, exclude_attrs=excluded_attributes,
                    target_attr='label')

    # J as mean-imputed feature vectors L.
    L = em.extract_feature_vecs(J, feature_table=F, attrs_after='label',
                                show_progress=False)
    L = em.impute_table(L, exclude_attrs=excluded_attributes, strategy='mean')

    # Predict on L with every trained matcher; each call works on a copy.
    all_predictions = [
        matcher.predict(table=L,
                        exclude_attrs=excluded_attributes,
                        append=True,
                        target_attr='predicted',
                        inplace=False,
                        return_probs=False,
                        probs_attr='proba')
        for matcher in matchers
    ]

    # Evaluate predictions
    for predictions in all_predictions:
        summary = em.eval_matches(predictions, 'label', 'predicted')
        em.print_eval_summary(summary)
def main():
    """Predict matches between A and B and write the merged table E.csv.

    A logistic-regression matcher is trained on the training half of the
    labelled set G and applied to the candidate set C. For every predicted
    match the matched row of A is enriched with values from the matched row
    of B, and the result is written to ``FOLDER + 'E.csv'``.
    """
    # Read in data files
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    G = em.read_csv_metadata(FOLDER + 'G.csv', key='_id', ltable=A, rtable=B,
                             fk_ltable='l_id', fk_rtable='r_id')  # labeled data

    # Only the training half of the 50/50 split of G is used.
    IJ = em.split_train_test(G, train_proportion=0.5, random_state=0)
    I = IJ['train']

    # Feature set F, and I as mean-imputed feature vectors H.
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    H = em.extract_feature_vecs(I, feature_table=F, attrs_after='label',
                                show_progress=False)
    excluded_attributes = ['_id', 'l_id', 'r_id', 'label']
    H = em.impute_table(H, exclude_attrs=excluded_attributes, strategy='mean')

    # Create and train a logistic regression - the best matcher from stage3.
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    lg.fit(table=H, exclude_attrs=excluded_attributes, target_attr='label')

    # Read in the candidate tuple pairs and convert them to feature vectors.
    C = em.read_csv_metadata(FOLDER + 'C.csv', key='_id', ltable=A, rtable=B,
                             fk_ltable='l_id', fk_rtable='r_id')
    L = em.extract_feature_vecs(C, feature_table=F, show_progress=False)
    L = em.impute_table(L, exclude_attrs=['_id', 'l_id', 'r_id'],
                        strategy='mean')

    # Predict on L with the trained matcher
    predictions = lg.predict(table=L,
                             exclude_attrs=['_id', 'l_id', 'r_id'],
                             append=True,
                             target_attr='predicted',
                             inplace=False,
                             return_probs=False,
                             probs_attr='proba')

    # Output the merged table (basically what matches).
    # We start with the rows from A that match, then merge values from B
    # into them.
    matched_pairs = predictions[predictions.predicted == 1]

    # One row per matched A tuple. Without drop_duplicates an A tuple matched
    # to several B tuples appears several times in `merged`; `.loc[aid, col]`
    # then returns a Series instead of a scalar and the comparisons below
    # raise.
    left_ids = matched_pairs[['l_id']].drop_duplicates()
    left_ids.columns = ['id']
    merged = pd.merge(A, left_ids, on='id')
    merged.set_index('id', inplace=True)
    B.set_index('id', inplace=True)

    # A ids that are never merged; their rows stay as read from A.
    black_list = {'a872', 'a987'}
    list_columns = [
        'directors', 'writers', 'cast', 'genres', 'keywords', 'languages',
        'production_companies', 'production_countries'
    ]

    for pair in matched_pairs.itertuples():
        aid = pair.l_id
        bid = pair.r_id
        if aid in black_list:
            continue

        # Title: keep the title from A. If B's title differs, append it to
        # A's alternative titles unless it is already listed there.
        a_title = merged.loc[aid, 'title']
        b_title = B.loc[bid, 'title']
        if b_title != a_title:
            if pd.isnull(merged.loc[aid, 'alternative_titles']):
                merged.loc[aid, 'alternative_titles'] = b_title
            else:
                alt = set(merged.loc[aid, 'alternative_titles'].split(';'))
                if b_title not in alt:
                    merged.loc[aid, 'alternative_titles'] += ';' + b_title

        # Multi-valued columns are combined by merge_cell.
        for col in list_columns:
            merged.loc[aid, col] = merge_cell(merged.loc[aid, col],
                                              B.loc[bid, col])

        # Content rating: keep A
        # Release year: keep A
        # Opening_weekend_revenue: keep A

        # Run time: integer mean of both values.
        m_runtime = int(
            (merged.loc[aid, 'run_time'] + B.loc[bid, 'run_time']) / 2)
        merged.loc[aid, 'run_time'] = m_runtime

        # Budget and Revenue are combined by merge_money.
        for col in ['budget', 'revenue']:
            merged.loc[aid, col] = merge_money(merged.loc[aid, col],
                                               B.loc[bid, col])

        # Rating: take the average after converting B rating to scale 10.
        m_rating = (merged.loc[aid, 'rating'] + 0.1 * B.loc[bid, 'rating']) / 2
        merged.loc[aid, 'rating'] = m_rating

    merged.to_csv(FOLDER + 'E.csv', index=True)
def run_magellan(train_set, valid_set, test_set, feature_combinations, classifiers, experiment_name, write_test_set_for_inspection=False): train_path = os.path.dirname(train_set) train_file = os.path.basename(train_set) test_path = os.path.dirname(test_set) test_file = os.path.basename(test_set) report_train_name = train_file.replace('.csv', '') report_test_name = test_file.replace('.csv', '') train_set_left = train_file.replace('pairs', 'left') train_set_right = train_file.replace('pairs', 'right') test_set_left = test_file.replace('pairs', 'left') test_set_right = test_file.replace('pairs', 'right') os.makedirs(os.path.dirname( '../../../reports/magellan/{}/'.format(experiment_name)), exist_ok=True) try: os.remove('../../../reports/magellan/{}/{}_{}.csv'.format( experiment_name, report_train_name, report_test_name)) except OSError: pass with open( '../../../reports/magellan/{}/{}_{}.csv'.format( experiment_name, report_train_name, report_test_name), "w") as f: f.write( 'feature#####model#####mean_train_score#####std_train_score#####mean_valid_score#####std_valid_score#####precision_test#####recall_test#####f1_test#####best_params#####train_time#####prediction_time#####feature_importance#####experiment_name#####train_set#####test_set\n' ) for run in range(1, 4): for feature_combination in feature_combinations: A_t = em.read_csv_metadata(train_path + '/' + train_set_left, key='mag_id') B_t = em.read_csv_metadata(train_path + '/' + train_set_right, key='mag_id') # Load the pre-labeled data S_t = em.read_csv_metadata(train_set, key='_id', ltable=A_t, rtable=B_t, fk_ltable='ltable_mag_id', fk_rtable='rtable_mag_id') A_gs = em.read_csv_metadata(test_path + '/' + test_set_left, key='mag_id') B_gs = em.read_csv_metadata(test_path + '/' + test_set_right, key='mag_id') # Load the pre-labeled data S_gs = em.read_csv_metadata(test_set, key='_id', ltable=A_gs, rtable=B_gs, fk_ltable='ltable_mag_id', fk_rtable='rtable_mag_id') A_t.fillna('', inplace=True) A_gs.fillna('', 
inplace=True) B_t.fillna('', inplace=True) B_gs.fillna('', inplace=True) S_t.fillna('', inplace=True) S_gs.fillna('', inplace=True) ## DIRTY FIX, CLEAN UP! if 'name' in A_t.columns: A_t["price"] = A_t["price"].replace(r'^\s*$', np.nan, regex=True) A_t["price"] = A_t["price"].astype('float64') A_gs["price"] = A_gs["price"].replace(r'^\s*$', np.nan, regex=True) A_gs["price"] = A_gs["price"].astype('float64') B_t["price"] = B_t["price"].replace(r'^\s*$', np.nan, regex=True) B_t["price"] = B_t["price"].astype('float64') B_gs["price"] = B_gs["price"].replace(r'^\s*$', np.nan, regex=True) B_gs["price"] = B_gs["price"].astype('float64') S_t["ltable_price"] = S_t["ltable_price"].replace(r'^\s*$', np.nan, regex=True) S_t["ltable_price"] = S_t["ltable_price"].astype('float64') S_t["rtable_price"] = S_t["rtable_price"].replace(r'^\s*$', np.nan, regex=True) S_t["rtable_price"] = S_t["rtable_price"].astype('float64') S_gs["ltable_price"] = S_gs["ltable_price"].replace(r'^\s*$', np.nan, regex=True) S_gs["ltable_price"] = S_gs["ltable_price"].astype('float64') S_gs["rtable_price"] = S_gs["rtable_price"].replace(r'^\s*$', np.nan, regex=True) S_gs["rtable_price"] = S_gs["rtable_price"].astype('float64') atypes1 = em.get_attr_types(A_t) atypes2 = em.get_attr_types(B_t) match_c = em.get_attr_corres(A_t, B_t) match_c['corres'] = [] # select attributes to compare for feature in feature_combination: match_c['corres'].append((feature, feature)) tok = em.get_tokenizers_for_matching() sim = em.get_sim_funs_for_matching() F_t = em.get_features(A_t, B_t, atypes1, atypes2, match_c, tok, sim) H_t = em.extract_feature_vecs(S_t, feature_table=F_t, attrs_after=['label', 'pair_id'], show_progress=False) H_gs = em.extract_feature_vecs(S_gs, feature_table=F_t, attrs_after='label', show_progress=False) H_t = H_t.fillna(-1) H_gs = H_gs.fillna(-1) validation_ids_df = pd.read_csv(valid_set) val_df = H_t[H_t['pair_id'].isin( validation_ids_df['pair_id'].values)] train_only_df = H_t[~H_t['pair_id']. 
isin(validation_ids_df['pair_id'].values)] train_only_df = train_only_df.drop(columns='pair_id') val_df = val_df.drop(columns='pair_id') train_only_df = train_only_df.sample(frac=1, random_state=42) pos_neg = H_t['label'].value_counts() pos_neg = round(pos_neg[0] / pos_neg[1]) train_ind = [] val_ind = [] for i in range(len(train_only_df) - 1): train_ind.append(-1) for i in range(len(val_df) - 1): val_ind.append(0) ps = PredefinedSplit(test_fold=np.concatenate((train_ind, val_ind))) train_df = pd.concat([train_only_df, val_df]) for k, v in classifiers.items(): classifier = v['clf'] if 'random_state' in classifier.get_params().keys(): classifier = classifier.set_params(**{'random_state': run}) # add pos_neg ratio to XGBoost params if k == 'XGBoost': v['params']['scale_pos_weight']: [1, pos_neg] model = RandomizedSearchCV(cv=ps, estimator=classifier, param_distributions=v['params'], random_state=42, n_jobs=4, scoring='f1', n_iter=500, pre_dispatch=8, return_train_score=True) feats_train = train_df.drop( ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1) labels_train = train_df['label'] feats_gs = H_gs.drop( ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1) labels_gs = H_gs['label'] try: model.fit(feats_train, labels_train) except ValueError: set_trace() parameters = model.best_params_ score_names = [ 'mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score' ] scores = {} score_string = '' for name in score_names: scores[name] = model.cv_results_[name][model.best_index_] score_string = score_string + name + ': ' + str( scores[name]) + ' ' feature_names = list(feats_train.columns) if k == 'LogisticRegression' or k == 'LinearSVC': most_important_features = model.best_estimator_.coef_ word_importance = zip(feature_names, most_important_features[0].tolist()) word_importance = sorted( word_importance, key=lambda importance: importance[1], reverse=True) if k == 'RandomForest' or k == 'DecisionTree': most_important_features = 
model.best_estimator_.feature_importances_ word_importance = zip(feature_names, most_important_features.tolist()) word_importance = sorted( word_importance, key=lambda importance: importance[1], reverse=True) if k == 'NaiveBayes': word_importance = '' if k == 'XGBoost': most_important_features = model.best_estimator_.feature_importances_ word_importance = zip(feature_names, most_important_features.tolist()) word_importance = sorted( word_importance, key=lambda importance: importance[1], reverse=True) if k == 'LogisticRegression': learner = LogisticRegression(random_state=run, solver='liblinear', **parameters) elif k == 'NaiveBayes': learner = GaussianNB() elif k == 'DecisionTree': learner = DecisionTreeClassifier(random_state=run, **parameters) elif k == 'LinearSVC': learner = LinearSVC(random_state=run, dual=False, **parameters) elif k == 'RandomForest': learner = RandomForestClassifier(random_state=run, n_jobs=4, **parameters) elif k == 'XGBoost': learner = xgb.XGBClassifier(random_state=run, n_jobs=4, **parameters) else: print('Learner is not a valid option') break model = learner feats_train = train_only_df.sample(frac=1, random_state=42) feats_train = train_only_df.drop( ['_id', 'ltable_mag_id', 'rtable_mag_id', 'label'], axis=1) labels_train = train_only_df['label'] start = time.time() model.fit(feats_train, labels_train) end = time.time() train_time = end - start start = time.time() preds_gs = model.predict(feats_gs) end = time.time() pred_time = end - start gs_report = classification_report(labels_gs, preds_gs, output_dict=True) feature_report = '+'.join(feature_combination) if write_test_set_for_inspection: out_path = '../../../data/processed/wdc-lspc/inspection/{}/magellan/'.format( experiment_name) os.makedirs(os.path.dirname(out_path), exist_ok=True) file_name = '_'.join([ os.path.basename(train_set), os.path.basename(test_set), k, feature_report ]) file_name = file_name.replace('.csv', '') file_name += f'_{run}.pkl.gz' test_inspection_df = S_gs.copy() 
if k == 'LinearSVC': proba_gs = model.decision_function(feats_gs).tolist() else: proba_gs = model.predict_proba(feats_gs).tolist() test_inspection_df['pred'] = preds_gs test_inspection_df['Class Prob'] = proba_gs test_inspection_df.to_pickle(out_path + file_name, compression='gzip') with open( '../../../reports/magellan/{}/{}_{}.csv'.format( experiment_name, report_train_name, report_test_name), "a") as f: f.write(feature_report + '#####' + k + '#####' + str(scores['mean_train_score']) + '#####' + str(scores['std_train_score']) + '#####' + str(scores['mean_test_score']) + '#####' + str(scores['std_test_score']) + '#####' + str(gs_report['1']['precision']) + '#####' + str(gs_report['1']['recall']) + '#####' + str(gs_report['1']['f1-score']) + '#####' + str(parameters) + '#####' + str(train_time) + '#####' + str(pred_time) + '#####' + str(word_importance[0:100]) + '#####' + experiment_name + '#####' + report_train_name + '#####' + report_test_name + '\n')
I['ltable_pages'] = '' I['rtable_pages'] = '' J = train_test['test'] J['ltable_edition'] = '' J['rtable_edition'] = '' J['ltable_pages'] = '' J['rtable_pages'] = '' # Save Set I #em.to_csv_metadata(I, './TableI.csv') # Save Set J #em.to_csv_metadata(J, './TableJ.csv') # Automatic feature generation F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False) H = em.extract_feature_vecs(I, feature_table=F, attrs_after=['gold_labels']) # Fill missing values H.fillna(value='NaN', inplace=True) # Create ML matchers dt = em.DTMatcher(name='DecisionTree') svm = em.SVMMatcher(name='SVM') rf = em.RFMatcher(name='RandomForest') lg = em.LogRegMatcher(name='LogisticRegression') ln = em.LinRegMatcher(name='LinearRegression') nb = em.NBMatcher(name='NaiveBayes') # Select the best matcher result = em.select_matcher( [dt, rf, svm, ln, lg, nb], table=H, exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold_labels'],
# Candidate learning-based matchers. The SVM, random-forest and
# logistic-regression matchers are seeded with random_state=0; `dt`, used in
# the selection below, is created before this fragment.
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

# Generate the feature table for matching tuples of A against tuples of B.
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

# List the names of the generated features.
# NOTE(review): bare expression; it only displays a value in a notebook/REPL.
F['feature_name']

# Extract feature vectors for the pairs in I, carrying the 'Match' label along.
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='Match',
                            show_progress=False)

# Compare the matchers with 5-fold cross-validation, selecting on F1.
# (Logistic regression came out best in our case.)
result = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'Match'],
    k=5,
    target_attr='Match',
    metric_to_select_matcher='f1',
    random_state=0)
# Per-matcher cross-validation statistics (bare expression, see note above).
result['cv_stats']
# Persist the blocked candidate set, reload it with its metadata, and draw a
# sample of 400 pairs.
em.to_csv_metadata(D, 'datasets/tbl_blocked_8.csv')
tbl_blocked = em.read_csv_metadata('datasets/tbl_blocked_8.csv',
                                   ltable=sample_movies,
                                   rtable=sample_tracks)
S = em.sample_table(tbl_blocked, 400)
em.to_csv_metadata(S, 'datasets/sampled_8.csv')

# Write the last four values of every sampled row.
# 'wb' is the mode the Python 2 csv module expects; this fragment is
# Python 2 code.
with open('metadata_8.csv', 'wb') as data1:
    writer = csv.writer(data1, delimiter=',', quotechar='|')
    for entry in S.values:
        writer.writerow(entry[-4:])

match_f = em.get_features_for_matching(sample_movies, sample_tracks)
H = em.extract_feature_vecs(S, feature_table=match_f)

# Column names of the feature-vector table, collected once. The previous
# flag-based loop skipped the first row and then re-appended every column
# name for each later row.
names = list(H.columns)

# Dump every feature vector as one CSV row (no header line).
with open('data_8.csv', 'wb') as data:
    writer = csv.writer(data, delimiter=',', quotechar='|')
    for idx, row in H.iterrows():
        # Single-argument print(...) behaves the same as the print statement.
        print(row)
        writer.writerow(list(row))
print(names)
'left_ram_frequency', 'left_hdd_capacity', 'left_ssd_capacity', 'left_weight', 'left_dimensions', 'left_title', 'right_brand', 'right_cpu_brand', 'right_cpu_model', 'right_cpu_type', 'right_cpu_frequency', 'right_ram_capacity', 'right_ram_type', 'right_ram_frequency', 'right_hdd_capacity', 'right_ssd_capacity', 'right_weight', 'right_dimensions', 'right_title' ] attrs_to_be_excluded = [] attrs_to_be_excluded.extend( ['_id', 'left_instance_id', 'right_instance_id']) attrs_to_be_excluded.extend(attrs_from_table) # Convert the cancidate set to feature vectors using the feature table L = em.extract_feature_vecs(C, feature_table=feature_table, attrs_before=attrs_from_table, show_progress=True, n_jobs=-1) loaded_rf = joblib.load("random_forest.joblib") # Predict the matches predictions = loaded_rf.predict( table=L, exclude_attrs=attrs_to_be_excluded, append=True, target_attr='predicted', inplace=False, ) # Prepare the output
# L = K1.copy()
# print(L.columns)
print('Loading labels...')

# Mark every candidate pair in L that also appears as a positive pair in
# `exact`.
L['gold'] = 0
# .copy() so the 'temp' assignment below does not write into a slice of
# `exact`.
trues = exact[exact['gold'] == 1][['ltable.id', 'rtable.id']].copy()

# The concatenated-id helper columns are kept unchanged for any later code
# that reads them, but they are no longer used for the membership test:
# without a separator, ids (1, 23) and (12, 3) both become "123".
L['temp'] = L['ltable_id'].astype(str) + L['rtable_id'].astype(str)
trues['temp'] = trues['ltable.id'].astype(str) + trues['rtable.id'].astype(str)

# Compare (left id, right id) tuples so distinct pairs can never collide.
true_pairs = set(
    zip(trues['ltable.id'].astype(str), trues['rtable.id'].astype(str)))
is_true_pair = pd.Series(
    [
        pair in true_pairs for pair in zip(L['ltable_id'].astype(str),
                                           L['rtable_id'].astype(str))
    ],
    index=L.index,
    dtype=bool)
L.loc[is_true_pair, ['gold']] = 1

# Split the labeled pairs evenly into development and evaluation sets.
development_evaluation = em.split_train_test(L, train_proportion=0.5)
development = development_evaluation['train']
evaluation = development_evaluation['test']

print('Creating feature vectors...')
train_feature_vectors = em.extract_feature_vecs(development,
                                                attrs_after='gold',
                                                feature_table=features)
test_feature_vectors = em.extract_feature_vecs(evaluation,
                                               attrs_after='gold',
                                               feature_table=features)
# Missing feature values are replaced by 0.0.
train_feature_vectors = train_feature_vectors.fillna(0.0)
test_feature_vectors = test_feature_vectors.fillna(0.0)

print("tagged pairs:" + str(exact['gold'].value_counts()))
df = pd.DataFrame(
    columns=['instance', 'candName', 'targName', 'conf', 'realConf'])
epoch = 1
# cands = list(exact['ltable.id'].unique())
# targs = list(exact['rtable.id'].unique())
# J_set = myset['test'] # em.to_csv_metadata(I_set, 'datasets/I_set.csv') # em.to_csv_metadata(J_set, 'datasets/J_set.csv') # creating feature for matching match_t = em.get_tokenizers_for_matching() match_s = em.get_sim_funs_for_matching() atypes1 = em.get_attr_types(sampled_movies) atypes2 = em.get_attr_types(sampled_tracks) match_c = em.get_attr_corres(sampled_movies, sampled_tracks) match_f = em.get_features(sampled_movies, sampled_tracks, atypes1, atypes2, match_c, match_t, match_s) # generating feature vectors H = em.extract_feature_vecs(dev_set, feature_table=match_f, attrs_after='label', show_progress=False) # filling missing values in feature vectors H.fillna(value=0, inplace=True) # creating a set of learning-based matchers dt = em.DTMatcher(name='DecisionTree', random_state=0) svm = em.SVMMatcher(name='SVM', random_state=0) rf = em.RFMatcher(name='RF', random_state=0) lg = em.LogRegMatcher(name='LogReg', random_state=0) ln = em.LinRegMatcher(name='LinReg') nb = em.NBMatcher(name='NaiveBayes') # Selecting the best matcher using cross-validation
def main():
    """Block, label, train a random forest and write predictions to CSV.

    The left and right product tables are blocked on overlapping title
    tokens, a sample of 450 candidate pairs is labeled interactively, a
    random-forest matcher is trained on the labeled sample, and predictions
    for every candidate pair are written to ``./prediction2.csv`` with the
    columns ``id`` and ``label``.
    """
    A = em.read_csv_metadata('ltable.csv',
                             key="ltable_id",
                             encoding='ISO-8859-1')
    B = em.read_csv_metadata('rtable.csv',
                             key="rtable_id",
                             encoding='ISO-8859-1')

    output_attrs = ['title', 'category', 'brand', 'modelno', 'price']
    ob = em.OverlapBlocker()
    C = ob.block_tables(A,
                        B,
                        'title',
                        'title',
                        l_output_attrs=output_attrs,
                        r_output_attrs=output_attrs,
                        overlap_size=1,
                        show_progress=False)
    S = em.sample_table(C, 450)

    # NOTE(review): this table is replaced by the label_table() result just
    # below and never used; kept so the read of train.csv still happens.
    # Confirm which of the two label sources is intended.
    G = em.read_csv_metadata("train.csv",
                             key='id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='ltable_id',
                             fk_rtable='rtable_id')
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    G = em.label_table(S, 'label')

    # Blocker output columns carried through feature extraction; they are
    # not features and must be excluded from training and prediction.
    attrs_from_table = (['ltable_' + attr for attr in output_attrs] +
                        ['rtable_' + attr for attr in output_attrs])
    id_attrs = ['_id', 'ltable_ltable_id', 'rtable_rtable_id']

    H = em.extract_feature_vecs(G,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                attrs_after='label',
                                show_progress=False)
    # Numeric 0 rather than the string '0', so feature columns stay numeric.
    H.fillna(0, inplace=True)

    rf = em.RFMatcher()
    rf.fit(table=H,
           exclude_attrs=id_attrs + ['label'] + attrs_from_table,
           target_attr='label')

    L = em.extract_feature_vecs(C,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                show_progress=False,
                                n_jobs=-1)
    # Impute the prediction vectors the same way as the training vectors.
    L.fillna(0, inplace=True)

    predictions = rf.predict(table=L,
                             exclude_attrs=id_attrs + attrs_from_table,
                             append=True,
                             target_attr='predicted',
                             inplace=False)

    # `predictions` is L plus the 'predicted' column; it has no 'label'
    # column, and the labeled table cannot be indexed with G[0].
    # NOTE(review): '_id' (the candidate-pair key) is used as the output id;
    # confirm this is the id the consumer of prediction2.csv expects.
    dataset = pd.DataFrame({
        "id": predictions['_id'],
        'label': predictions['predicted']
    })
    dataset.to_csv("./prediction2.csv", index=False)