def __init__(self, model, params, space):
    self.__model = model
    self.__params = params
    self.__space = space
    self.__evaluator = Evaluator(self.__model)
    self.__train_df = None
    self.__test_df = None
def build_ideal_window_hopping_set(windowSize, AbstractBaseSampler,
                                   filename='./data/creditcard.csv',
                                   max_window_ct=3):
    sample_sizes = [30, 40, 60]
    for sample_size in sample_sizes:
        hopper, column_names = buildFromCSV(filename, windowSize, "Time")
        windows = hopper.hopper()
        sampler = AbstractBaseSampler(sample_size, (2, 29), eta)  # sample size, column range, eta
        samples = dict()
        num_windows = 0
        for window in windows:
            samples[num_windows] = sampler.sample(window)
            num_windows += 1
            if num_windows > max_window_ct:
                break
        storage_filename = _persistentFileName(str(sampler), filename, windowSize, sample_size)
        # store the sample set and save evaluation metrics on it
        sampler.persist_sample_set(samples, storage_filename, column_names, num_windows)
        e = Evaluator(samples, sampler)
        e.save(storage_filename + "evaluator.csv")
    return
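# A minimal usage sketch for build_ideal_window_hopping_set above. The concrete
# sampler class (ReservoirSampler) and the window size of 1000 time units are
# illustrative assumptions; any class implementing the AbstractBaseSampler
# interface expected by the function should work.
build_ideal_window_hopping_set(windowSize=1000,
                               AbstractBaseSampler=ReservoirSampler,  # hypothetical sampler class
                               filename='./data/creditcard.csv',
                               max_window_ct=3)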
def evaluateWillsSamplerParallel(windowSize, filename='./data/creditcard.csv', max_window_ct=3):
    parallel_counts = [1, 2, 4, 8, 16, 32]
    sample_sizes = [30, 40, 60]
    for sample_size in sample_sizes:
        for parallel_count in parallel_counts:
            hopper, column_names = buildFromCSV(filename, windowSize, "Time")
            windows = hopper.hopper()
            sampler = WillsSampler(sample_size, (2, 29), eta, parallel_count=parallel_count)
            samples = dict()
            num_windows = 0
            for window in windows:
                samples[num_windows] = sampler.sample(window).deheapify()
                num_windows += 1
                if num_windows > max_window_ct:
                    break
            storage_filename = sampler.persistent_filename(filename, windowSize)
            # store the sample set and save evaluation metrics on it
            sampler.persist_sample_set(samples, storage_filename, column_names, num_windows)
            e = Evaluator(samples, sampler)
            e.save(storage_filename + "evaluator.csv")
    return
def evaluateWillsSamplerClusters(windowSize, filename='./data/creditcard.csv', max_window_ct=3):
    parallel_counts = [1, 4, 16]
    cluster_choices = [1, 6, 11, 21, 31, 51]
    cluster_centers_collection = loadClusters(cluster_choices)
    sample_sizes = [30, 40, 60, 100]
    for sample_size in sample_sizes:
        for num_centers, cluster_centers in cluster_centers_collection.items():
            for parallel_count in parallel_counts:
                hopper, column_names = buildFromCSV(filename, windowSize, "Time")
                windows = hopper.hopper()
                sampler = WillsSampler(sample_size, (2, 29), eta,
                                       parallel_count=parallel_count,
                                       cluster_centers=cluster_centers)
                samples = dict()
                num_windows = 0
                for window in windows:
                    samples[num_windows] = sampler.sample(window).deheapify()
                    num_windows += 1
                    if num_windows > max_window_ct:
                        break
                storage_filename = sampler.persistent_filename(filename, windowSize)
                # store the sample set and save evaluation metrics on it
                sampler.persist_sample_set(samples, storage_filename, column_names, num_windows)
                e = Evaluator(samples, sampler)
                e.save(storage_filename + "evaluator.csv")
    return
def main():
    args = parse_arguments()
    print("#" * 80)
    print("Model: ", args.model_class)
    print("Parameters: ", args.model_parameters)
    print("X: ", args.x_filepath)
    print("Y: ", args.y_filepath)
    print("Splits: ", args.n_splits)
    print("Random State: ", args.random_state)
    print("Model Filepath: ", args.model_filepath)
    print("Raw Evaluation Filepath: ", args.raw_model_score_filepath)
    print("Aggregate Evaluation Filepath: ", args.aggregated_model_score_filepath)
    model = initialize_model(args.model_class, args.model_parameters)
    X = np.load(args.x_filepath)
    Y = np.load(args.y_filepath)
    evaluator = Evaluator(args.n_splits)
    train_model(model, X, Y, evaluator, args.n_splits, args.random_state)
    evaluator.save(args.raw_model_score_filepath, args.aggregated_model_score_filepath)
    joblib.dump(model, args.model_filepath)
    print("#" * 80)
def __init__(self, crf, gibbs=False, cd=False, n_samps=5, burn=5, interval=5):
    self.crf = crf
    self.gibbs = gibbs
    self.cd = gibbs and cd
    self.E_f = self.exp_feat_gibbs if gibbs else self.exp_feat
    self.n_samples = n_samps
    self.burn = burn
    self.interval = interval
    self.ev = Evaluator()
def __init__(self, init_pop, growth_time=2 * 60, mut_prob=0.5, pop_size=30):
    self._init_pop = init_pop
    self._mut_prob = mut_prob
    self._evaluator = Evaluator(growth_time)
    self._nsgaii_sorter = NSGAII(2, None, None)
    self._pop_size = pop_size
def get_classifier_evaluation(prediction, test, classifier_name, data_name, b=2):
    """
    Get the evaluation of a classifier: print the number of errors and their
    text, plot the ROC curve, and return the measure scores.
    """
    evaluation = Evaluator(prediction, test, b)
    return evaluation.get_evaluation(classifier_name, data_name)
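# A minimal usage sketch for get_classifier_evaluation above. The fitted
# classifier, the X_test / y_test split, and the b=2 F-beta weight are
# illustrative assumptions, not taken from the original code.
predictions = knn_classifier.predict(X_test)   # hypothetical fitted classifier
scores = get_classifier_evaluation(predictions,
                                   y_test,
                                   classifier_name="KNN",
                                   data_name="creditcard",
                                   b=2)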
def label_evaluation(test_data, predicted_labels):
    gold_labels = flatten([flatten(i["gold_labels"]) for i in test_data])
    gold_labels = [1 if i else 0 for i in gold_labels]
    metric_evaluation = Evaluator()
    metric_evaluation.compute_all(gold_labels, predicted_labels)
    log.write("Confusion Matrix :")
    log.write(metric_evaluation.confusion_matrix)
    log.write("Accuracy = %f" % metric_evaluation.accuracy)
    log.write("Precision = %f" % metric_evaluation.precision)
    log.write("Recall = %f" % metric_evaluation.recall)
    log.write("F1 Score = %f" % metric_evaluation.f1_score)
class Cross_Validation:

    def __init__(self):
        self.knn_algo = KnnAlgorithm()
        self.evaluator = Evaluator()

    def cross_validation(self, dataset, k, params):
        fold_size = m.floor(len(dataset) / k)
        best_param = 1
        max_a = 0
        for i in range(k):
            folds = np.split(
                dataset,
                [i * fold_size, i * fold_size + fold_size, len(dataset)])
            test = folds[1]
            training = np.concatenate((folds[0], folds[2]))
            curr_p, acc = self.parameter_tuning(training, params, k - 1)
            if max_a < acc:
                max_a = acc
                best_param = curr_p
        print(max_a)
        return best_param

    def parameter_tuning(self, training, params, k):
        fold_size = m.floor(len(training) / k)
        max_a = 0
        best_param = 1
        for i in range(k):
            folds = np.split(
                training,
                [i * fold_size, i * fold_size + fold_size, len(training)])
            validation_data = folds[1]
            training_data = np.concatenate((folds[0], folds[2]))
            columns = int(validation_data.shape[1])
            sections = [int(columns - 1), columns]
            val_data = np.hsplit(validation_data, sections)
            ground_truth = val_data[1]
            val_data = val_data[0]
            for param in params:
                pred = self.knn_algo.predict_multiple(param, training_data, val_data)
                cm = self.evaluator.get_cm(pred, ground_truth)
                accuracy = self.evaluator.accuracy(cm)
                if accuracy > max_a:
                    max_a = accuracy
                    best_param = param
        return best_param, max_a
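# A small worked example of the fold slicing used in Cross_Validation above.
# The 12-row array and k = 3 are illustrative assumptions.
import numpy as np

data = np.arange(12).reshape(12, 1)   # 12 rows, fold_size = 12 // 3 = 4
i, fold_size = 1, 4                   # second fold (i = 1)
folds = np.split(data, [i * fold_size, i * fold_size + fold_size, len(data)])
# folds[0] -> rows 0..3, folds[1] -> rows 4..7 (held-out fold),
# folds[2] -> rows 8..11, folds[3] -> empty tail produced by the final index
held_out = folds[1]
train = np.concatenate((folds[0], folds[2]))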
def evaluate(self, name, pprint):
    """
    Evaluate the model using the predicted tracks and the actually listened tracks.

    :param name: string, name of the experiment (if None, the class name is used instead)
    :param pprint: bool, if True the scores are pretty-printed
    :return: score of the model as a tuple (precision, recall, f-score)
    """
    if name is None:
        name = self.type
    evaluation = Evaluator()
    score = evaluation.score(
        self.predicted_tracks,
        self.to_user_track_map(
            self.get_unique_user_tracks(self.testing_data).values))
    if pprint:
        evaluation.pprint_scores(score, name)
    return score
def run_experiment(args: dict[str, str]):
    if args["models"] == "all":
        args["models"] = ALL_MODEL_NAMES
    if args["datasets"] == "all":
        args["datasets"] = ALL_DATASET_NAMES
    models = setup_models(args["models"].split(), args["location"], daner_path=args["daner"])
    log(f"Successfully set up {len(models)} models")
    datasets = setup_datasets(args["datasets"].split(), wikiann_path=args["wikiann"], plank_path=args["plank"])
    log(f"Successfully acquired {len(datasets)} NER datasets")
    for model in models:
        for dataset in datasets:
            e = Evaluator(model, dataset)
            res = e.run()
            res.save(os.path.join(args["location"], "-".join((model.name, dataset.name))))
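# A minimal usage sketch for run_experiment above. The dictionary keys mirror
# the ones read inside the function; the concrete paths are illustrative
# assumptions.
run_experiment({
    "models": "all",              # or a space-separated subset of ALL_MODEL_NAMES
    "datasets": "all",            # or a space-separated subset of ALL_DATASET_NAMES
    "location": "./results",      # where each <model>-<dataset> result is saved
    "daner": "./daner",           # path passed through to setup_models (assumed)
    "wikiann": "./data/wikiann",  # path passed through to setup_datasets (assumed)
    "plank": "./data/plank",
})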
def __init__(self,
             model: Model,
             optimizer_name: str = "Adagrad",
             batch_size: int = 256,
             learning_rate: float = 1e-2,
             decay1: float = 0.9,
             decay2: float = 0.99,
             regularizer_name: str = "N3",
             regularizer_weight: float = 5e-2,
             verbose: bool = True):
    self.model = model
    self.batch_size = batch_size
    self.verbose = verbose

    # build all the supported optimizers using the passed params (learning rate and decays if Adam)
    supported_optimizers = {
        'Adagrad': optim.Adagrad(params=self.model.parameters(), lr=learning_rate),
        'Adam': optim.Adam(params=self.model.parameters(), lr=learning_rate, betas=(decay1, decay2)),
        'SGD': optim.SGD(params=self.model.parameters(), lr=learning_rate)
    }

    # build all the supported regularizers using the passed regularizer_weight
    supported_regularizers = {
        'N3': N3(weight=regularizer_weight),
        'N2': N2(weight=regularizer_weight)
    }

    # choose the Torch Optimizer object to use, based on the passed name
    self.optimizer = supported_optimizers[optimizer_name]

    # choose the regularizer
    self.regularizer = supported_regularizers[regularizer_name]

    # create the evaluator to use between epochs
    self.evaluator = Evaluator(self.model)
def get_evaluation_metrics(self, df_original, df_imputed, target, mask_missing, m_prop, verbose):
    """
    Generate evaluation metrics comparing the original and imputed datasets.

    :param df_original: dataset before values were masked
    :param df_imputed: dataset after imputation
    :param target: target column used for the classification scores
    :param mask_missing: mask marking the artificially missing values
    :param m_prop: proportion of missing values
    :param verbose: if True, log the individual scores
    :return: dict of metric scores
    """
    results = dict()
    results['prop'] = m_prop
    results['strategy'] = self.strategy_abbr
    # todo: refactor it with score factory
    if self.strategy_abbr not in ['constant', 'emb']:
        results['rmse'] = Evaluator().get_compare_metrics(df_original, df_imputed, mask_missing)
    if self.strategy_abbr not in ['emb']:
        results['uce'] = Evaluator().uce(df_original, df_imputed)
        results['silhouette'] = Evaluator().silhouette(df_imputed)
    # todo: add pipeline for regression with auto detect the target type
    sce_or = Evaluator().sce(df_original, target)
    sce_im = Evaluator().sce(df_imputed, target)
    results['sce'] = sce_im - sce_or
    results['f1'] = Evaluator().f1_score(df_imputed, target)
    # if verbose:
    #     self.logger.info(f'UCE - clustering error between original and imputed datasets = ', np.round(results['uce'], 5))
    #     self.logger.info(f'RMSE score between original values and imputed = ', np.round(results['rmse'], 5))
    #     self.logger.info(f'SCE - classification error between original and imputed datasets', np.round(results['sce'], 5))
    return results
def valid(self):
    test_iter = Clip_Iterator(c.VALID_DIR_CLIPS)
    evaluator = Evaluator(self.global_step)
    i = 0
    for data in test_iter.sample_valid(self._batch):
        in_data = data[:, :self._in_seq, ...]
        if c.IN_CHANEL == 3:
            gt_data = data[:, self._in_seq:self._in_seq + self._out_seq, :, :, 1:-1]
        elif c.IN_CHANEL == 1:
            gt_data = data[:, self._in_seq:self._in_seq + self._out_seq, ...]
        else:
            raise NotImplementedError
        if c.NORMALIZE:
            in_data = normalize_frames(in_data)
            gt_data = normalize_frames(gt_data)
        mse, mae, gdl, pred = self.g_model.valid_step(in_data, gt_data)
        evaluator.evaluate(gt_data, pred)
        self.logger.info(f"Iter {self.global_step} {i}: \n\t "
                         f"mse:{mse:.4f} \n\t "
                         f"mae:{mae:.4f} \n\t "
                         f"gdl:{gdl:.4f}")
        i += 1
    evaluator.done()
def run_benchmark(self, iter, mode="Valid"):
    if mode == "Valid":
        time_interval = c.RAINY_VALID
        stride = 20
    else:
        time_interval = c.RAINY_TEST
        stride = 1
    test_iter = Iterator(time_interval=time_interval,
                         sample_mode="sequent",
                         seq_len=c.IN_SEQ + c.OUT_SEQ,
                         stride=1)
    evaluator = Evaluator(iter)
    i = 1
    while not test_iter.use_up:
        data, date_clip, *_ = test_iter.sample(batch_size=c.BATCH_SIZE)
        in_data = np.zeros(shape=(c.BATCH_SIZE, c.IN_SEQ, c.H, c.W, c.IN_CHANEL))
        gt_data = np.zeros(shape=(c.BATCH_SIZE, c.OUT_SEQ, c.H, c.W, 1))
        if type(data) == type([]):
            break
        in_data[...] = data[:, :c.IN_SEQ, ...]
        if c.IN_CHANEL == 3:
            gt_data[...] = data[:, c.IN_SEQ:c.IN_SEQ + c.OUT_SEQ, :, :, 1:-1]
        elif c.IN_CHANEL == 1:
            gt_data[...] = data[:, c.IN_SEQ:c.IN_SEQ + c.OUT_SEQ, ...]
        else:
            raise NotImplementedError
        # in_date = date_clip[0][:c.IN_SEQ]
        if c.NORMALIZE:
            in_data = normalize_frames(in_data)
            gt_data = normalize_frames(gt_data)
        mse, mae, gdl, pred = self.model.valid_step(in_data, gt_data)
        evaluator.evaluate(gt_data, pred)
        logging.info(f"Iter {iter} {i}: \n\t mse:{mse} \n\t mae:{mae} \n\t gdl:{gdl}")
        i += 1
        if i % stride == 0:
            if c.IN_CHANEL == 3:
                in_data = in_data[:, :, :, :, 1:-1]
            for b in range(c.BATCH_SIZE):
                predict_date = date_clip[b][c.IN_SEQ]
                logging.info(f"Save {predict_date} results")
                if mode == "Valid":
                    save_path = os.path.join(c.SAVE_VALID, str(iter), predict_date.strftime("%Y%m%d%H%M"))
                else:
                    save_path = os.path.join(c.SAVE_TEST, str(iter), predict_date.strftime("%Y%m%d%H%M"))
                path = os.path.join(save_path, "in")
                save_png(in_data[b], path)
                path = os.path.join(save_path, "pred")
                save_png(pred[b], path)
                path = os.path.join(save_path, "out")
                save_png(gt_data[b], path)
    evaluator.done()
def generate_callbacks(self):
    callbacks = []
    tbpath = os.path.join(self.out_path, "tensorboard")
    symtbpath = os.path.join(args.output, "tensorboard", args.tag)
    if not os.path.exists(tbpath):
        os.makedirs(tbpath)
    if not os.path.exists(symtbpath):
        os.symlink(tbpath, symtbpath)
        print(f"Symlinked {tbpath} -> {symtbpath}")
    log_files_list = os.listdir(tbpath)
    if log_files_list != []:
        for fn in log_files_list:
            print(f"Deleting {os.path.join(tbpath, fn)}")
            shutil.rmtree(os.path.join(tbpath, fn))
    checkpath = os.path.join(self.out_path, 'checkpoint/')
    if not os.path.exists(checkpath):
        os.makedirs(checkpath)
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tbpath,
                                                 update_freq='epoch',
                                                 write_graph=True,
                                                 write_images=True)
    callbacks.append(tb_callback)
    check_name = os.path.join(checkpath, f'{args.model}_{args.tag}.hdf5')
    if self.data == 'opportunity':
        monitorname = f"out_{self.label_names[0]}_fmeasure"
        if len(self.label_names) == 1:
            monitorname = 'fmeasure'
    elif self.data == 'deap':
        monitorname = f"val_out_{self.label_names[0]}_accuracy"
    check_callback = tf.keras.callbacks.ModelCheckpoint(check_name,
                                                        monitor=monitorname,
                                                        save_best_only=True,
                                                        mode='max',
                                                        save_freq='epoch',
                                                        save_weights_only=False)
    callbacks.append(check_callback)
    if self.data == 'opportunity':
        evaluator = Evaluator(self.label_names)
        eval_dir = os.path.join(outpath, 'evaluation')
        if not os.path.isdir(eval_dir):
            os.makedirs(eval_dir)
        eval_callback = EvaluationCallback(self.val_data, self.label_names, self.num_classes, eval_dir)
        callbacks.append(eval_callback)
    return callbacks
def __init__(self, model: TuckER, batch_size: int = 128, learning_rate: float = 0.03,
             decay: float = 1.0, label_smoothing: float = 0.1, verbose: bool = True):
    self.model = model
    self.dataset = self.model.dataset
    self.batch_size = batch_size
    self.label_smoothing = label_smoothing
    self.verbose = verbose
    self.learning_rate = learning_rate
    self.decay_rate = decay
    self.loss = torch.nn.BCELoss()
    self.optimizer = optim.Adam(params=self.model.parameters(), lr=learning_rate)
    self.scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, decay)

    # create the evaluator to use between epochs
    self.evaluator = Evaluator(self.model)
def main():
    USE_GPU = True
    if USE_GPU and torch.cuda.is_available():
        torch.cuda.empty_cache()
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('using device:', device)
    dtype = torch.float32

    # =============================================================================
    # filename = 'curr_model'
    # f = open(filename, 'rb')
    # model = pickle.load(f)
    # f.close()
    # =============================================================================
    model = UMWE(dtype, device, 32, 2)
    model.build_model()
    # model.discrim_fit()
    # filename = 'curr_model'
    # f = open(filename, 'wb')
    # pickle.dump(model, f)
    # f.close()
    # =============================================================================
    model.mpsr_refine()
    # =============================================================================
    # for lang in model.src_langs.values():
    #     model.export_embeddings(lang, model.embs, "txt")
    # =============================================================================
    model.export_embeddings('es', model.embs, "txt")
    eval_ = Evaluator(model)
    print(eval_.clws('es', 'en'))
    eval_.word_translation('es', 'en')
def val_eval(model, validation_data, loss_fn):
    model.eval()
    eval = Evaluator()
    x_val = validation_data.X.to('cuda')
    y_val = validation_data.Y.to('cuda')
    z_val = validation_data.Z.to('cuda')
    yhat = model(x_val)
    val_loss = loss_fn(yhat, y_val)
    eval.update_counter(yhat, y_val, z_val)
    eval.update_loss(0, 0, 0)
    return val_loss, eval.total_percenage[0]
def __init__(self, transition_system, table_path='data/tables.json', database_dir='data/database'):
    super(Evaluator, self).__init__()
    self.transition_system = transition_system
    self.kmaps = build_foreign_key_map_from_json(table_path)
    self.database_dir = database_dir
    self.engine = Engine()
    self.checker = Checker(table_path, database_dir)
    self.acc_dict = {
        "sql": self.sql_acc,    # use golden sql as references
        "ast": self.ast_acc,    # compare ast accuracy; the ast may be incorrect when constructed from raw sql
        "beam": self.beam_acc,  # if the correct answer exists in the beam, assume the result is true
    }
def main(argv):
    del argv
    labels = read_class_labels()
    evaluator = Evaluator(labels)
    with PredictionWriter(labels, FLAGS.dest) as pwriter:
        pwriter.write_headers()
        for filepath in glob.glob(FLAGS.source + '/**/*.wav', recursive=True):
            filename = os.path.basename(filepath)
            predictions = process_file(filepath, FLAGS.ckpt, FLAGS.labels)
            true_label = read_true_label(filepath)
            evaluator.record(predictions, true_label)
            pwriter.write_row(filename, predictions)
    evaluator.print_eval()
def evaluate(self, X_test, Y_test, Y_test_classes):
    if not self.model:
        raise Exception("Load or fit new model first")
    score, acc = self.model.evaluate(X_test, Y_test, batch_size=3)
    print("Test accuracy:", acc)
    evaluator = Evaluator()
    predictions_encoded = self.model.predict(X_test)
    predictions = self.lb.inverse_transform(
        [np.argmax(pred) for pred in predictions_encoded])
    evaluator.accuracy(Y_test_classes, predictions)
    # evaluator.classification_report(Y_test_classes, predictions)
    evaluator.confusion_matrix(Y_test_classes, predictions)
def run_fetc():
    if "-exp" in sys.argv:
        exp_pos = sys.argv.index("-exp")
        param_file = sys.argv[exp_pos + 1]
    else:
        param_file = "experiment.yaml"
    exp = read_parameters(param_file=param_file)
    ovw = parse_commandline(sys.argv)
    overwrite_params(exp, ovw)
    update_parameters(exp)
    # print exp
    if exp['steps']['extract_features']:
        fe = FeatureExtractor.create(params=exp)
        fe.run()
    if exp['steps']['aggregate_features']:
        fa = FeatureAggregator.create(params=exp)
        fa.run()
    if exp['steps']['train']:
        t = ModelTrainer.create(params=exp)
        t.run()
    if exp['steps']['test']:
        t = ModelTester.create(params=exp)
        t.run()
    if exp['steps']['evaluate']:
        t = Evaluator.create(params=exp)
        t.run()
def run_predictions(input_path, output_path, thresholds_file, num_skip, check_existing):
    """Creates a thread pool which concurrently runs the prediction for every protein map in 'input_path'.

    Parameters
    ----------
    input_path: str
        Path of the input directory where the different protein directories are located
    output_path: str
        Path of the folder where all generated files will be stored
    thresholds_file: str
        Path of the JSON file which contains the threshold values for the input files
    num_skip: int
        The number of prediction steps that should be skipped
    check_existing: bool
        If set, prediction steps are only executed if their results do not yet exist in the output path
    """
    # Create the list of parameters for every prediction
    params_list = [
        (emdb_id, input_path, output_path, thresholds_file, num_skip, check_existing)
        for emdb_id in filter(lambda d: os.path.isdir(input_path + d), os.listdir(input_path))
    ]

    start_time = time()
    pool = Pool(min(cpu_count(), len(params_list)))
    results = pool.map(run_prediction, params_list)

    # Filter 'None' results
    results = filter(lambda r: r is not None, results)

    evaluator = Evaluator(input_path)
    for emdb_id, predicted_file, gt_file, execution_time in results:
        evaluator.evaluate(emdb_id, predicted_file, gt_file, execution_time)
    evaluator.create_report(output_path, time() - start_time)
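# A minimal usage sketch for run_predictions above. The directory layout, the
# thresholds.json file, and the flag values are illustrative assumptions.
if __name__ == "__main__":
    run_predictions(
        input_path="./maps/",             # one sub-directory per protein map (assumed layout)
        output_path="./predictions/",     # where generated files and the report are written
        thresholds_file="./thresholds.json",
        num_skip=0,                       # do not skip any prediction steps
        check_existing=True,              # reuse results already present in output_path
    )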
def valid_clips(self, step):
    test_iter = Clip_Iterator(c.VALID_DIR_CLIPS)
    evaluator = Evaluator(step)
    i = 0
    for data in test_iter.sample_valid(c.BATCH_SIZE):
        in_data = data[:, :c.IN_SEQ, ...]
        if c.IN_CHANEL == 3:
            gt_data = data[:, c.IN_SEQ:c.IN_SEQ + c.OUT_SEQ, :, :, 1:-1]
        elif c.IN_CHANEL == 1:
            gt_data = data[:, c.IN_SEQ:c.IN_SEQ + c.OUT_SEQ, ...]
        else:
            raise NotImplementedError
        if c.NORMALIZE:
            in_data = normalize_frames(in_data)
            gt_data = normalize_frames(gt_data)
        mse, mae, gdl, pred = self.model.valid_step(in_data, gt_data)
        evaluator.evaluate(gt_data, pred)
        logging.info(f"Iter {step} {i}: \n\t mse:{mse} \n\t mae:{mae} \n\t gdl:{gdl}")
        i += 1
    evaluator.done()
def run_evaluation():
    with open("queries.txt", "r") as queries_file:
        queries = list(map(str.strip, queries_file.readlines()))
    print(Evaluator().evaluate_to_latex(queries, "query.csv", "like.csv", relevance_cutoff=2))
dataset = IHDP(replications=args.reps)
scores = np.zeros((args.reps, 3))
scores_test = np.zeros((args.reps, 3))

M = None
d = 20          # latent space dimension
lamba = 1e-4    # weight decay
nh, h = 5, 200  # number and size of hidden layers

for i, (train, valid, test, contfeats, binfeats) in enumerate(dataset.get_train_valid_test()):
    print('\nReplication {}/{}'.format(i + 1, args.reps))
    (xtr, ttr, ytr), (y_cftr, mu0tr, mu1tr) = train
    (xva, tva, yva), (y_cfva, mu0va, mu1va) = valid
    (xte, tte, yte), (y_cfte, mu0te, mu1te) = test
    evaluator_test = Evaluator(yte, tte, y_cf=y_cfte, mu0=mu0te, mu1=mu1te)

    # Reorder features with binary first and continuous after
    perm = binfeats + contfeats
    xtr, xva, xte = xtr[:, perm], xva[:, perm], xte[:, perm]

    xalltr, talltr, yalltr = (np.concatenate([xtr, xva], axis=0),
                              np.concatenate([ttr, tva], axis=0),
                              np.concatenate([ytr, yva], axis=0))
    evaluator_train = Evaluator(yalltr, talltr,
                                y_cf=np.concatenate([y_cftr, y_cfva], axis=0),
                                mu0=np.concatenate([mu0tr, mu0va], axis=0),
                                mu1=np.concatenate([mu1tr, mu1va], axis=0))
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV score (Validation): {:<8.5f}".format(roc_auc_score(Y_train, oof)))
print("CV score (Test): {:<8.5f}".format(roc_auc_score(Y_test, predictions)))

y_pred = np.zeros(predictions.shape[0])
y_pred[predictions >= 0.1] = 1

eval = Evaluator()
eval.evaluate(Y_test, y_pred)

cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14, 26))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (averaged over folds)')
class MultiClassNLLptimizer:
    """
    This optimizer relies on the Multiclass Negative Log-Likelihood loss.
    It is heavily inspired by paper ""
    Instead of considering each training sample as the "unit" for training,
    it groups training samples into couples (h, r) -> [all t for which <h, r, t> is in the training set].
    Each couple (h, r) with the corresponding tails is treated as if it were one sample.
    When passing them to the loss...

    In our implementation, it is used by the following models:
        - TuckER
    """

    def __init__(self,
                 model: Model,
                 optimizer_name: str = "Adagrad",
                 batch_size: int = 256,
                 learning_rate: float = 1e-2,
                 decay1: float = 0.9,
                 decay2: float = 0.99,
                 regularizer_name: str = "N3",
                 regularizer_weight: float = 5e-2,
                 verbose: bool = True):
        self.model = model
        self.batch_size = batch_size
        self.verbose = verbose

        # build all the supported optimizers using the passed params (learning rate and decays if Adam)
        supported_optimizers = {
            'Adagrad': optim.Adagrad(params=self.model.parameters(), lr=learning_rate),
            'Adam': optim.Adam(params=self.model.parameters(), lr=learning_rate, betas=(decay1, decay2)),
            'SGD': optim.SGD(params=self.model.parameters(), lr=learning_rate)
        }

        # build all the supported regularizers using the passed regularizer_weight
        supported_regularizers = {
            'N3': N3(weight=regularizer_weight),
            'N2': N2(weight=regularizer_weight)
        }

        # choose the Torch Optimizer object to use, based on the passed name
        self.optimizer = supported_optimizers[optimizer_name]

        # choose the regularizer
        self.regularizer = supported_regularizers[regularizer_name]

        # create the evaluator to use between epochs
        self.evaluator = Evaluator(self.model)

    def train(self,
              train_samples: np.array,
              max_epochs: int,
              save_path: str = None,
              evaluate_every: int = -1,
              valid_samples: np.array = None):
        # extract the direct and inverse train facts
        training_samples = np.vstack(
            (train_samples, self.model.dataset.invert_samples(train_samples)))

        # batch size must be the minimum between the passed value and the number of Kelpie training facts
        batch_size = min(self.batch_size, len(training_samples))
        cur_loss = 0
        for e in range(max_epochs):
            cur_loss = self.epoch(batch_size, training_samples)

            if evaluate_every > 0 and valid_samples is not None and \
                    (e + 1) % evaluate_every == 0:
                mrr, h1 = self.evaluator.eval(samples=valid_samples, write_output=False)
                print("\tValidation Hits@1: %f" % h1)
                print("\tValidation Mean Reciprocal Rank: %f" % mrr)

                if save_path is not None:
                    print("\t saving model...")
                    torch.save(self.model.state_dict(), save_path)
                    print("\t done.")

        if save_path is not None:
            print("\t saving model...")
            torch.save(self.model.state_dict(), save_path)
            print("\t done.")

    def epoch(self, batch_size: int, training_samples: np.array):
        training_samples = torch.from_numpy(training_samples).cuda()

        # at the beginning of the epoch, shuffle all samples randomly
        actual_samples = training_samples[torch.randperm(training_samples.shape[0]), :]
        loss = nn.CrossEntropyLoss(reduction='mean')

        with tqdm.tqdm(total=training_samples.shape[0], unit='ex', disable=not self.verbose) as bar:
            bar.set_description('train loss')
            batch_start = 0
            while batch_start < training_samples.shape[0]:
                batch_end = min(batch_start + batch_size, training_samples.shape[0])
                batch = actual_samples[batch_start:batch_end].cuda()
                l = self.step_on_batch(loss, batch)
                batch_start += self.batch_size
                bar.update(batch.shape[0])
                bar.set_postfix(loss=f'{l.item():.0f}')

    def step_on_batch(self, loss, batch):
        predictions, factors = self.model.forward(batch)
        truth = batch[:, 2]

        # compute loss
        l_fit = loss(predictions, truth)
        l_reg = self.regularizer.forward(factors)
        l = l_fit + l_reg

        # compute loss gradients, and run optimization step
        self.optimizer.zero_grad()
        l.backward()
        self.optimizer.step()

        # return loss
        return l
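# A minimal usage sketch for MultiClassNLLptimizer above. The TuckER constructor
# arguments and the dataset attributes are illustrative assumptions; the real
# project wires these up through its own dataset and model classes.
model = TuckER(dataset=dataset, entity_dimension=200, relation_dimension=30)  # hypothetical args
model.cuda()

optimizer = MultiClassNLLptimizer(model=model,
                                  optimizer_name="Adagrad",
                                  batch_size=256,
                                  learning_rate=1e-2,
                                  regularizer_name="N3",
                                  regularizer_weight=5e-2)

# train_samples / valid_samples are assumed to be numpy arrays of (head, relation, tail) id triples
optimizer.train(train_samples=dataset.train_samples,
                max_epochs=500,
                save_path="./tucker.pt",
                evaluate_every=10,
                valid_samples=dataset.valid_samples)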
from sklearn.model_selection import (
    KFold,
    cross_val_predict,
    cross_val_score,
    LeaveOneOut,
    GridSearchCV,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

import utils
import random

from evaluation import Evaluator
from feature_extraction import tfidf_features_1, bag_of_words_features_1

evaluator = Evaluator()


class PopularityModel:
    def name(self):
        return "Popularity model"

    def get_most_representative_class(self, Y_train):
        """Return the most representative class."""
        item_counts = Y_train[utils.col_to_predict].value_counts()
        most_representative = item_counts.idxmax()
        return most_representative

    def predict(self, train, test):
        most_representative_class = self.get_most_representative_class(train)
        return [most_representative_class for _ in range(len(test))]
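# A small worked example of the value_counts().idxmax() pattern used by
# PopularityModel above (the column name "label" and the data are assumptions).
import pandas as pd

y_train = pd.DataFrame({"label": ["spam", "ham", "ham", "ham", "spam"]})
item_counts = y_train["label"].value_counts()  # ham: 3, spam: 2
print(item_counts.idxmax())                    # -> "ham", the majority class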