def fit_deepmatcher(model_args, train, validation, test, batch_size=16):
    '''Train a deepmatcher model and score all three data splits.

    Expects the train/validation/test sets produced by dm.data.process()
    (built from the temporary csv files written by prepare_data_deepmatcher).

    Returns a 5-tuple
    (train_predictions, valid_predictions, test_predictions, None,
    post_blocked_all_sets_labels); the None slot keeps the tuple in the
    shape expected by evaluation_functions.py.
    '''
    summarizer = model_args["attr_summarizer"]

    # Configure the matching network.
    model = dm.MatchingModel(attr_summarizer=summarizer)

    # Fit; deepmatcher checkpoints the epoch with the best validation
    # score to best_save_path.
    model.run_train(
        train,
        validation,
        epochs=10,
        batch_size=batch_size,
        best_save_path='../results/' + summarizer + '.pth',
        pos_neg_ratio=2)

    def _score(dataset):
        # Match scores for one split, keyed by the attr-summarizer name
        # (that name doubles as the model identifier downstream).
        return {summarizer: model.run_prediction(dataset).match_score.values}

    train_predictions = _score(train)
    valid_predictions = _score(validation)
    test_predictions = _score(test)

    # Ground-truth labels per split — the source of truth consumed by
    # evaluation_functions.py.
    post_blocked_all_sets_labels = {
        "train": train.get_raw_table().y.values,
        "valid": validation.get_raw_table().y.values,
        "test": test.get_raw_table().y.values,
    }

    return (train_predictions, valid_predictions, test_predictions, None,
            post_blocked_all_sets_labels)
def predict_and_write_for_inspection(test_path, model_path, experiment_name,
                                     gpu_id, nn_type, comp_type, features):
    """Load a saved deepmatcher model, score a test file, and persist results.

    Prints a classification report and confusion matrix to stdout, then
    writes the full prediction table (gzipped csv) into the per-experiment
    inspection directory for manual error analysis.
    """
    out_path = '../../../data/processed/inspection/{}/deepmatcher/'.format(
        experiment_name)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Start from every column in the file, then un-ignore the selected
    # feature columns on both the left and the right side.
    ignore_columns = get_features(test_path)
    keep = [prefix + feat
            for prefix in ('ltable_', 'rtable_')
            for feat in features]
    for column in keep:
        ignore_columns.remove(column)

    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id

    model = dm.MatchingModel(attr_summarizer=nn_type,
                             attr_comparator=comp_type)
    model.load_state(model_path)

    candidate = dm.data.process_unlabeled(path=test_path,
                                          trained_model=model,
                                          ignore_columns=ignore_columns)
    predictions = model.run_prediction(candidate,
                                       output_attributes=True,
                                       batch_size=8)
    # Threshold the match score at 0.5 to obtain a hard 0/1 prediction.
    predictions['pred'] = predictions['match_score'].apply(
        lambda s: int(s >= 0.5))

    print(classification_report(predictions['label'],
                                predictions['pred'],
                                digits=4))
    print(confusion_matrix(predictions['label'], predictions['pred']))

    # Output name combines model and test-file basenames, normalized.
    file_name = os.path.basename(model_path) + os.path.basename(test_path)
    file_name = file_name.replace('model.pth', '_')
    file_name = file_name.replace('.csv', '.csv.gz')
    file_name = file_name.replace('_formatted', '')

    predictions.to_csv(out_path + file_name,
                       compression='gzip',
                       header=True,
                       index=False)
def dm_train(df):
    """Split labeled pair data 3:1:1, train a hybrid deepmatcher model,
    print its test-set evaluation, and return the trained model.

    The best checkpoint (by validation score) is saved to the path in
    config.PATHS['deepmatcher_model'].
    """
    split_path = config.PATHS["deepmatcher_training_folder"]

    # Write train/valid/test csv files to disk with a 3:1:1 ratio.
    dm.data.split(df, split_path, 'train.csv', 'valid.csv', 'test.csv',
                  [3, 1, 1])

    # Tokenize/embed the splits; id columns carry no matching signal and
    # are excluded from the model input.
    train, validation, test = dm.data.process(
        path=split_path,
        cache='train_cache.pth',
        train='train.csv',
        validation='valid.csv',
        test='test.csv',
        left_prefix='ltable',
        right_prefix='rtable',
        label_attr='label',
        id_attr='_id',
        ignore_columns=('ltable_id', 'rtable_id'))

    # Hybrid attribute summarizer (RNN + attention).
    model = dm.MatchingModel(attr_summarizer='hybrid')

    # 3 epochs, batch size 16, positive-to-negative ratio 10; the best
    # checkpoint on the validation set is written to disk.
    model.run_train(
        train,
        validation,
        epochs=3,
        batch_size=16,
        best_save_path=config.PATHS['deepmatcher_model'],
        pos_neg_ratio=10)

    # Report held-out accuracy.
    print(model.run_eval(test))

    return model
cache=None, #check_cached_data=False, embeddings='fasttext.wiki.vec', embeddings_cache_path=embedding_cache_dir, train='train.csv', validation='validation.csv', test='test.csv') # parameters to keep consistent with nn_type = 'hybrid' comp_type = 'abs-diff' #epochs = 15 pos_neg_ratio = 1 batch_size = 8 lr = 0.001 lr_decay = 0.9 smoothing=0.05 model = dm.MatchingModel(attr_summarizer=nn_type, attr_comparator=comp_type) model.initialize(train) optim = dm.optim.Optimizer(method='adam', lr=lr, max_grad_norm=5, start_decay_at=1, beta1=0.9, beta2=0.999, adagrad_accum=0.0, lr_decay=lr_decay) optim.set_parameters(model.named_parameters()) start = time.time() if len(sys.argv)>4: ep=int(sys.argv[5]) else: ep=30 model.run_train( train, validation, #epochs=epochs, batch_size=batch_size, pos_neg_ratio=pos_neg_ratio,
def run_dm_model(train_set, valid_set, test_set, experiment_name, gpu_id,
                 epochs, pos_neg_ratio, batch_size, lr, lr_decay, embedding,
                 nn_type, comp_type, special_name, features, run_no,
                 smoothing=0.05):
    """Train and evaluate one deepmatcher configuration.

    All console output produced during training and evaluation is captured
    in a per-run report file under ../../../reports/deepmatcher/raw/; the
    best model checkpoint and the processed-data cache go under
    ../../../cache/deepmatcher/.

    BUGFIX: the report file opened for the stdout redirect was never
    closed, and an exception during training left sys.stdout permanently
    redirected. The redirect is now wrapped in try/finally so stdout is
    restored and the handle closed on every exit path.
    """
    # Ensure report / data-cache / model directories exist.
    os.makedirs(os.path.dirname(
        '../../../reports/deepmatcher/raw/{}/'.format(experiment_name)),
        exist_ok=True)
    os.makedirs(os.path.dirname(
        '../../../cache/deepmatcher/{}/data-cache/'.format(experiment_name)),
        exist_ok=True)
    os.makedirs(os.path.dirname(
        '../../../cache/deepmatcher/{}/models/'.format(experiment_name)),
        exist_ok=True)

    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id
    dm.data.reset_vector_cache()

    # Ignore every column except the selected features (left and right).
    ignore_columns = get_features(train_set)
    left_right_features = ['ltable_' + feat for feat in features]
    left_right_features.extend(['rtable_' + feat for feat in features])
    for feat in left_right_features:
        ignore_columns.remove(feat)

    features_filename = '-'.join(features)
    train_set_filename = os.path.basename(train_set)
    train_set_filename = train_set_filename.replace('.csv', '')

    train, valid, test = dm.data.process(
        path='',
        cache='../../../cache/deepmatcher/{}/data-cache/{}.pth'.format(
            experiment_name, train_set_filename + '_' + embedding),
        train=train_set,
        validation=valid_set,
        test=test_set,
        embeddings=embedding,
        use_magellan_convention=True,
        ignore_columns=ignore_columns)

    # Redirect stdout into the per-run report file.
    old_stdout = sys.stdout
    report_file = open(
        '../../../reports/deepmatcher/raw/{}/{}_{}_{}_epochs{}_ratio{}_batch{}_lr{}_lrdecay{}_{}_{}_{}_run{}.txt'
        .format(experiment_name, nn_type, comp_type, special_name, epochs,
                pos_neg_ratio, batch_size, lr, lr_decay, embedding,
                features_filename, train_set_filename, run_no), 'w')
    sys.stdout = report_file
    try:
        model = dm.MatchingModel(attr_summarizer=nn_type,
                                 attr_comparator=comp_type)
        model.initialize(train)

        optim = dm.optim.Optimizer(method='adam',
                                   lr=lr,
                                   max_grad_norm=5,
                                   start_decay_at=1,
                                   beta1=0.9,
                                   beta2=0.999,
                                   adagrad_accum=0.0,
                                   lr_decay=lr_decay)
        optim.set_parameters(model.named_parameters())

        start = time.time()
        model.run_train(
            train,
            valid,
            epochs=epochs,
            batch_size=batch_size,
            best_save_path=
            '../../../cache/deepmatcher/{}/models/{}_{}_{}_epochs{}_ratio{}_batch{}_lr{}_lrdecay{}_{}_{}_{}_run{}_model.pth'
            .format(experiment_name, nn_type, comp_type, special_name,
                    epochs, pos_neg_ratio, batch_size, lr, lr_decay,
                    embedding, features_filename, train_set_filename,
                    run_no),
            pos_neg_ratio=pos_neg_ratio,
            optimizer=optim,
            label_smoothing=smoothing)
        end = time.time()
        print('Training time: ' + str(end - start))

        start = time.time()
        model.run_eval(test, batch_size=batch_size)
        end = time.time()
        print('Prediction time: ' + str(end - start))
    finally:
        # Restore stdout and release the file handle even if training or
        # evaluation raised.
        sys.stdout = old_stdout
        report_file.close()
if len(neighborhood) > num_triangles: neighborhood = neighborhood.sample(n=num_triangles) neighborhood['id'] = neighborhood.index neighborhood['label'] = list( map(lambda predictions: int(round(predictions)), neighborhood.match_score.values)) neighborhood = neighborhood.drop(['match_score'], axis=1) r1r2['label'] = np.argmax(originalPrediction) dataset4explanation = pd.concat([r1r2, neighborhood], ignore_index=True) return dataset4explanation lsource = pd.read_csv('datasets/Structured/DBLP-ACM/tableA.csv') rsource = pd.read_csv('datasets/Structured/DBLP-ACM/tableB.csv') model = dm.MatchingModel(attr_summarizer='hybrid') model.load_state('da_dm.pth') def predict_fn(test_df, model, ignore_columns=['label'], outputAttributes=False, batch_size=32): data = test_df.copy().drop( [c for c in ignore_columns if c in test_df.columns], axis=1) if not ('id' in data.columns): data['id'] = np.arange(len(data)) tmp_name = "./{}.csv".format("".join( [random.choice(string.ascii_lowercase) for _ in range(10)])) data.to_csv(tmp_name, index=False)
import numpy as np np.random.seed(42) import random random.seed(42) if __name__ == "__main__": data_dir = "/home/zz/Work/data/deepmatcher_toy/sample_data/itunes-amazon" train, validation, test = \ dm.data.process(path=data_dir, check_cached_data=False, embeddings='fasttext.wiki.vec', embeddings_cache_path=data_dir+"/embedding_cache", train='train.csv', validation='validation.csv', test='test.csv') model = dm.MatchingModel() model.run_train(train, validation, best_save_path=None) model.run_eval(test) # unlabeled = dm.data.process_unlabeled(path='data_directory/unlabeled.csv', trained_model=model) # model.run_prediction(unlabeled) ''' This method is important for reading/caching embeddings The param 'embeddings' is only a name for identifying the embeddings. DM only support a limited set of these names. And when a recognisable name is supplied, it will attempt to download it. This means if you want to use a custom embedding, you need to 'hack it' by renaming your model using one of the expected names, and keep the same format. As an example, when embedding=fasttext.wiki.vec, DM will look for: https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip You can pre-download this and save it into: [embeddings_cache_path] (see the code below)
def create_model():
    """Return a fresh deepmatcher MatchingModel with the 'hybrid'
    attribute summarizer."""
    return dm.MatchingModel(attr_summarizer='hybrid')
def run_dm_model(train_set, valid_set, test_set, experiment_name, gpu_id,
                 epochs, pos_neg_ratio, batch_size, lr, lr_decay, embedding,
                 nn_type, comp_type, special_name, features, run_no,
                 smoothing=0.05, prediction_sets=None):
    """Train and evaluate one deepmatcher configuration; optionally score
    extra unlabeled files for manual inspection.

    Training/evaluation console output is captured in a per-run report
    file under ../../../reports/deepmatcher/raw/. When prediction_sets is
    given, each file in it is scored and written (gzipped csv, with a
    thresholded 'label_pred' column) into the per-experiment inspection
    directory.

    BUGFIX: the report file opened for the stdout redirect was never
    closed, and an exception during training left sys.stdout permanently
    redirected. The redirect is now wrapped in try/finally so stdout is
    restored and the handle closed on every exit path.
    """
    # Ensure report / data-cache / model directories exist.
    os.makedirs(os.path.dirname(
        '../../../reports/deepmatcher/raw/{}/'.format(experiment_name)),
        exist_ok=True)
    os.makedirs(os.path.dirname(
        '../../../cache/deepmatcher/{}/data-cache/'.format(experiment_name)),
        exist_ok=True)
    os.makedirs(os.path.dirname(
        '../../../cache/deepmatcher/{}/models/'.format(experiment_name)),
        exist_ok=True)

    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id
    dm.data.reset_vector_cache()

    # Ignore every column except the selected features (left and right).
    ignore_columns = get_features(train_set)
    left_right_features = ['ltable_' + feat for feat in features]
    left_right_features.extend(['rtable_' + feat for feat in features])
    for feat in left_right_features:
        ignore_columns.remove(feat)

    features_filename = '-'.join(features)
    train_set_filename = os.path.basename(train_set)
    train_set_filename = train_set_filename.replace('.csv', '')

    train, valid, test = dm.data.process(
        path='',
        cache='../../../cache/deepmatcher/{}/data-cache/{}.pth'.format(
            experiment_name, train_set_filename + '_' + embedding),
        train=train_set,
        validation=valid_set,
        test=test_set,
        embeddings=embedding,
        use_magellan_convention=True,
        ignore_columns=ignore_columns)

    # Redirect stdout into the per-run report file.
    old_stdout = sys.stdout
    report_file = open(
        '../../../reports/deepmatcher/raw/{}/{}_{}_{}_epochs{}_ratio{}_batch{}_lr{}_lrdecay{}_{}_{}_{}_run{}.txt'
        .format(experiment_name, nn_type, comp_type, special_name, epochs,
                pos_neg_ratio, batch_size, lr, lr_decay, embedding,
                features_filename, train_set_filename, run_no), 'w')
    sys.stdout = report_file
    try:
        model = dm.MatchingModel(attr_summarizer=nn_type,
                                 attr_comparator=comp_type)
        model.initialize(train)

        optim = dm.optim.Optimizer(method='adam',
                                   lr=lr,
                                   max_grad_norm=5,
                                   start_decay_at=1,
                                   beta1=0.9,
                                   beta2=0.999,
                                   adagrad_accum=0.0,
                                   lr_decay=lr_decay)
        optim.set_parameters(model.named_parameters())

        start = time.time()
        model.run_train(
            train,
            valid,
            epochs=epochs,
            batch_size=batch_size,
            best_save_path=
            '../../../cache/deepmatcher/{}/models/{}_{}_{}_epochs{}_ratio{}_batch{}_lr{}_lrdecay{}_{}_{}_{}_run{}_model.pth'
            .format(experiment_name, nn_type, comp_type, special_name,
                    epochs, pos_neg_ratio, batch_size, lr, lr_decay,
                    embedding, features_filename, train_set_filename,
                    run_no),
            pos_neg_ratio=pos_neg_ratio,
            optimizer=optim,
            label_smoothing=smoothing)
        end = time.time()
        print('Training time: ' + str(end - start))

        start = time.time()
        model.run_eval(test, batch_size=batch_size)
        end = time.time()
        print('Prediction time: ' + str(end - start))
    finally:
        # Restore stdout and release the file handle even if training or
        # evaluation raised.
        sys.stdout = old_stdout
        report_file.close()

    # Optionally score unlabeled candidate files for manual inspection.
    if prediction_sets is not None:
        out_path = '../../../data/processed/inspection/{}/deepmatcher/'.format(
            experiment_name)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        for prediction_set in prediction_sets:
            candidate = dm.data.process_unlabeled(
                path=prediction_set,
                trained_model=model,
                ignore_columns=ignore_columns)
            predictions = model.run_prediction(candidate,
                                               output_attributes=True,
                                               batch_size=8)
            # Hard 0/1 prediction via a 0.5 score threshold.
            predictions['label_pred'] = predictions['match_score'].apply(
                lambda score: 1 if score >= 0.5 else 0)
            file_name = os.path.basename(
                '{}_{}_{}_epochs{}_ratio{}_batch{}_lr{}_lrdecay{}_{}_{}_{}_run{}_model.pth'
                .format(nn_type, comp_type, special_name, epochs,
                        pos_neg_ratio, batch_size, lr, lr_decay, embedding,
                        features_filename, train_set_filename,
                        run_no)) + os.path.basename(prediction_set)
            file_name = file_name.replace('.csv', '.csv.gz')
            file_name = file_name.replace('model.pth', '')
            file_name = file_name.replace('_formatted', '')
            predictions.to_csv(out_path + file_name,
                               compression='gzip',
                               header=True,
                               index=False)