def __init__(self, mlflow_record):
    self.mlflow_record = mlflow_record
    self.validation_pc = 0.2
    self.X, self.y = read_data('data/train.csv', label_bool=True)
    self.transform = Transformation()
    self.learner = Learner()
    self.model_name = 'model_eval'
def test_random_spliter():
    logger.info("*" * 20)
    data = read_data(task="ud",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=None,
                     combine_models=True)
    random_spliter = Random_Spliter(data)
    splited_data = random_spliter.split()
    assert len(splited_data["all"]["train_feats"]) == 1
    assert len(splited_data["all"]["train_feats"][0]) + len(
        splited_data["all"]["test_feats"][0]) == 72 * 25
def test_k_spliter():
    logger.info("*" * 20)
    data = read_data(task="wiki",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=None,
                     combine_models=False)
    k_fold_spliter = K_Fold_Spliter(data)
    k_fold_data = k_fold_spliter.split()
    assert len(k_fold_data["BLEU"]["train_feats"]) == 5
    assert len(k_fold_data["BLEU"]["train_feats"][0]) + len(k_fold_data["BLEU"]["test_feats"][0]) == \
        len(k_fold_data["BLEU"]["train_feats"][1]) + len(k_fold_data["BLEU"]["test_feats"][1]) == 995
def test_load_data():
    logger.info("*" * 20)
    data = read_data(task="monomt",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=None,
                     combine_models=False)
    assert len(data["BLEU"]["feats"]) == 54
    assert len(data["BLEU"]["labels"]) == 54
    assert len(data["BLEU"]["langs"]) == 54
    assert list(data["BLEU"]["langs"].columns.values) == [
        "Source Language", "Target Language"
    ]

    # test feature selection
    logger.info("*" * 20)
    data = read_data(task="monomt",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=["dataset size (sent)"],
                     combine_models=False)
    assert list(data["BLEU"]["feats"].columns.values) == ["dataset size (sent)"]

    # test multi-model loading
    logger.info("*" * 20)
    data = read_data(task="bli",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=None,
                     combine_models=False)
    assert len(data) == 3
    data = read_data(task="bli",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=None,
                     combine_models=True)
    assert len(data) == 1
def Main():
    X_train, y_train, X_valid, y_valid = read_data()

    # placeholder for a single training sequence: [time_step, num_inputs]
    X = tf.placeholder(dtype=tf.float32,
                       shape=[X_train.shape[1], X_train.shape[2]])

    with tf.name_scope("network"):
        network = Q_RNN(num_inputs=X_train.shape[2],
                        num_layers=3,
                        time_step=X_train.shape[1],
                        size=1,
                        scope="generative")
        proposal = network.proposal
        param_list = network.build_network(status=X)

    with tf.name_scope("loss"):
        ops = []
        loss = network.compute_loss(param_list=param_list)
        # separate optimizers for the proposal (recognition) and generative networks
        r_optimizer = tf.train.AdamOptimizer()
        g_optimizer = tf.train.AdamOptimizer()
        r_vars = proposal.get_trainable()
        g_vars = network.get_trainable()
        r_grad = r_optimizer.compute_gradients(loss=loss, var_list=r_vars)
        g_grad = g_optimizer.compute_gradients(loss=loss, var_list=g_vars)
        ops.append(r_optimizer.apply_gradients(grads_and_vars=r_grad))
        ops.append(g_optimizer.apply_gradients(grads_and_vars=g_grad))

    with tf.name_scope("miscellaneous"):
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(init)
        index = 0
        while True:
            index = index + 1
            # sample one training sequence at random and take a gradient step
            e_idx = np.random.randint(low=0, high=X_train.shape[0] - 1)
            status = np.reshape(a=X_train[e_idx],
                                newshape=[X_train.shape[1], X_train.shape[2]])
            l, _ = sess.run([loss, ops], feed_dict={X: status})
            print("At iteration {}, loss: {}".format(index, l))
            # checkpoint the model every 100 iterations
            if index % 100 == 0:
                saver.save(sess=sess, save_path=save_path)
def test_specific_spliter():
    logger.info("*" * 20)
    data = read_data(task="ma",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=None,
                     combine_models=True)
    feats = data["all"]["feats"]
    lens = len(feats)
    train_idxs = list(feats[feats["data size"] > 200].index)
    test_idxs = list(set(feats.index) - set(train_idxs))
    specific_spliter = Specific_Spliter(data, train_idxs, test_idxs)
    splited_data = specific_spliter.split()
    assert len(splited_data["all"]["train_feats"][0]) + len(
        splited_data["all"]["test_feats"][0]) == lens
    assert len(splited_data["all"]["train_labels"][0]) + len(
        splited_data["all"]["test_labels"][0]) == lens
    assert len(splited_data["all"]["train_langs"][0]) + len(
        splited_data["all"]["test_langs"][0]) == lens
def run_ex(task, n=3, regressor="xgboost", portion=0.5):
    org_data = read_data(task, True, combine_models=True)
    feats = org_data["all"]["feats"]
    ids = feats.index
    test_rmses = {}
    baseline_rmses = {}
    models = task_eval_columns(task)
    for model in models:
        logger.info(
            "Running experiments with {} examples for a new model {}...".format(n, model))
        test_rmses[model] = []
        baseline_rmses[model] = defaultdict(list)
        model_ids = list(feats[feats[f"model_{model}"] == 1].index)
        other_model_ids = list(feats[feats[f"model_{model}"] == 0].index)
        test_lens = int(len(model_ids) * portion)
        logger.info(
            f"We use {portion} of the new model's data as the test set and sample training points "
            f"from the remaining {1 - portion} of the data. We sample the split {params.test_id_options_num} times. "
            f"There are {len(model_ids)} points for model {model} and {len(other_model_ids)} for other models."
        )
        total_exs = params.test_id_options_num * params.sample_options_num
        finished_exs = 0
        for i in range(params.test_id_options_num):
            # hold out a random test set for the new model
            test_id_option = sample(model_ids, test_lens)
            sample_ids = list(set(model_ids) - set(test_id_option))
            total_sample_options = int(comb(len(sample_ids), n))
            logger.info(
                "There are {} possible experiments for model {}, and we sample {} of them.".format(
                    total_sample_options, model, params.sample_options_num))
            finished_exs_for_one_test_set = 0
            for j in range(params.sample_options_num):
                # draw n training points for the new model and combine them with all other models' data
                sample_option = sample(sample_ids, n)
                train_ids = list(set(sample_option).union(set(other_model_ids)))
                splitter = Specific_Spliter(org_data, [train_ids], [test_id_option])
                split_data = splitter.split()["all"]
                train_rmse, train_preds, test_rmse, test_preds, train_labels, test_labels, \
                    test_upper_preds, test_lower_preds, reg = \
                    run_once(split_data["train_feats"][0],
                             split_data["train_labels"][0],
                             split_data["test_feats"][0],
                             split_data["test_labels"][0],
                             split_data["train_labels_mns"][0],
                             split_data["train_labels_sstd"][0],
                             regressor,
                             get_ci=False)
                test_rmses[model].append(test_rmse)
                these_baselines = get_baselines(org_data, other_model_ids,
                                                sample_ids, test_id_option)
                for baseline in these_baselines:
                    baseline_rmses[model][baseline].append(these_baselines[baseline])
                finished_exs_for_one_test_set += 1
                finished_exs += 1
                if finished_exs % 100 == 0:
                    logger.info(
                        "Progress: {}/{}, {:.2f}%, RMSE@{}: {:.2f}".format(
                            finished_exs, total_exs,
                            finished_exs / total_exs * 100, n,
                            np.mean(test_rmses[model])))
                    for baseline in baseline_rmses[model]:
                        logger.info(
                            f"Baseline {baseline}: {np.mean(baseline_rmses[model][baseline])}")
                if finished_exs_for_one_test_set == params.sample_options_num:
                    break
            if finished_exs == total_exs:
                logger.info("{} done! RMSE@{}: {:.2f}".format(
                    model, n, np.mean(test_rmses[model])))
                break
    logger.info("All experiments done!")
    for model in models:
        logger.info("Model: {}, ex: {} RMSE@{}: {:.2f}".format(
            model, len(test_rmses[model]), n, np.mean(test_rmses[model])))
    logger.info("All models, RMSE@{}: {:.2f}".format(
        n, np.mean([np.mean(test_rmses[model]) for model in models])))
    for baseline in baseline_rmses[models[0]]:
        logger.info(
            f"Baseline {baseline} across all models: "
            f"{np.mean([np.mean(baseline_rmses[model][baseline]) for model in models])}")
from src.read_data import read_data
from src.gradient_descent import gradient_descent
from src.data_visualisation import data_visualisation

# import libraries
import random
import numpy as np

if __name__ == '__main__':
    # initialise the model parameters and the learning rate
    theta_0 = 0
    theta_1 = 0
    alpha = 1

    # Read the data from "data.csv", drop the header row, and cast values to int
    data = read_data("data.csv")
    data.pop(0)
    data = [[int(x), int(y)] for [x, y] in data]

    # Data normalisation using maximum absolute scaling
    x_min = np.min([x for x, y in data])
    y_min = np.min([y for x, y in data])
    x_max = np.max([x for x, y in data])
    y_max = np.max([y for x, y in data])
    data_normalised = [[x / x_max, y / y_max] for x, y in data]

    # Define the maximum number of iterations (default 1000)
    max_iterations = input(
        "Enter the maximum number of iterations (1000 if not specified): ")
    if max_iterations == '' or not max_iterations.isnumeric():
        max_iterations = 1000
    else:
        max_iterations = int(max_iterations)
def __init__(self):
    self.X_train, self.y_train = read_data('data/train.csv', label_bool=True)
    self.X_test = read_data('data/test.csv', label_bool=False)
    self.transform = Transformation()
    self.learner = Learner()
    self.model_name = 'model_full'