def calc_accuracy(self, path_test_file):
    """
    Calculate sentence accuracy and word accuracy on a labeled test set.

    Each sample read from ``path_test_file`` is parsed with ``self.infer``
    and the predicted head of every token except ROOT (index 0) is compared
    with the labeled head.  A sentence counts as correct only when all of
    its compared heads match.

    :param path_test_file: path to file containing labeled samples (str)
    :return: tuple ``(sentence_accuracy, word_accuracy)``; both values are
        fractions (correct / total), not percentages.  A value is 0.0 when
        its denominator is zero (e.g. an empty file) instead of raising
        ``ZeroDivisionError``.
    """
    total_words = 0
    total_sentences = 0
    correct_words = 0
    correct_sentences = 0

    # First pass over the file only counts samples so the progress bar
    # knows its total up front.
    num_samples = 0
    for _ in dep_sample_generator(path_test_file):
        num_samples += 1
    progress = ProgressBar(num_samples, fmt=ProgressBar.FULL)

    for sample in dep_sample_generator(path_test_file):
        total_sentences += 1
        # The idx of the last token is used as the sentence's word count.
        total_words += sample[-1].idx
        infered_sample = self.infer(sample)

        correct_parse = True
        # Start at 1: index 0 is ROOT and is never scored.
        for i in range(1, len(sample)):
            if sample[i].head == infered_sample[i].head:
                correct_words += 1
            else:
                correct_parse = False
        if correct_parse:
            correct_sentences += 1

        progress.current += 1
        progress()

    progress.done()
    print('\n')

    # Guard the divisions so an empty test set yields 0.0 rather than a crash.
    sentence_accuracy = (
        correct_sentences / total_sentences if total_sentences else 0.0)
    word_accuracy = correct_words / total_words if total_words else 0.0
    return sentence_accuracy, word_accuracy
def perceptron_train(self, num_iterations: int, accuracy_step=10) -> None:
    """
    Train the weight vector ``self.w`` with the structured perceptron.

    The training file is traversed ``num_iterations`` times.  For every
    sample the highest-scoring tree under the current weights is computed
    (``Digraph(...).mst()``); when it differs from the ground-truth tree the
    weights are increased on the ground-truth features and decreased on the
    features of the predicted tree.

    Side effects visible in this method:

    * ``self.w`` is reset to zeros and then updated in place.
    * Every ``accuracy_step`` iterations, if ``self.path_to_valid_file`` is
      not None, validation accuracy is computed, the weights are dumped to
      ``self.weights_file_name`` and a pickled checkpoint is written to
      ``<training_file_path>_epoch_<i>.checkpoint``.
    * After training the weights are dumped again and a pickled summary is
      written to ``<training_file_path>_<num_iterations>_epochs.results``.

    :param num_iterations: number of iterations to perform (int)
    :param accuracy_step: interval between accuracy calculation (int)
    :return: None
    """
    print("training started")
    self.w = np.zeros(self.num_of_features)

    # Count the samples once so each epoch's progress bar knows its total.
    num_samples = 0
    for _ in dep_sample_generator(self.training_file_path):
        num_samples += 1

    st_time = time.time()
    train_word_accuracies = []
    train_sentenence_accuracies = []

    for i in range(num_iterations):
        print("iteration: ", i)
        progress = ProgressBar(num_samples, fmt=ProgressBar.FULL)
        total_sentences = 0
        correct_sentences = 0
        total_words = 0
        correct_words = 0
        it_st_time = time.time()

        for idx, sample in enumerate(
                dep_sample_generator(self.training_file_path)):
            total_sentences += 1
            sample_len = sample[-1].idx
            # Candidate-edge graph looked up by sentence length
            # (presumably precomputed fully connected graphs -- confirm
            # where self.fc_graphs is built).
            successors = self.fc_graphs[sample_len]

            # Point the shared scorer at this sample and the current weights.
            self.dep_weights.update_sample(sample)
            self.dep_weights.update_weights(self.w)

            graph = Digraph(successors, self.dep_weights.get_score)
            argmax_tree = graph.mst().successors
            # Drop nodes without successors so the dict can be compared
            # with the stored ground-truth tree.
            argmax_tree = {k: v for k, v in argmax_tree.items() if v}
            ground_truth_successors = self.gt_trees[idx]

            infered_sample = successors_to_sample(deepcopy(sample),
                                                  argmax_tree)

            # Start at 1: index 0 is ROOT and is never scored.
            for j in range(1, len(sample)):
                total_words += 1
                if sample[j].head == infered_sample[j].head:
                    correct_words += 1

            # Dict equality is True only when both dicts have the same keys
            # mapped to equal values.
            if argmax_tree != ground_truth_successors:
                features_ground_truth = self.gt_global_features[idx]
                features_argmax = self.feature_extractor(
                    infered_sample, self.dicts, self.minimal,
                    use_mcdonald=self.use_mcdonald)
                # keys() and values() of a dict iterate in matching order,
                # so each value lands on its own feature index.
                self.w[list(features_ground_truth.keys())] += np.array(
                    list(features_ground_truth.values()))
                self.w[list(features_argmax.keys())] -= np.array(
                    list(features_argmax.values()))
            else:
                correct_sentences += 1

            progress.current += 1
            progress()

        # Guard the divisions so an empty training file does not crash.
        sen_acc = (
            correct_sentences / total_sentences if total_sentences else 0.0)
        word_acc = correct_words / total_words if total_words else 0.0
        train_sentenence_accuracies.append(sen_acc)
        train_word_accuracies.append(word_acc)
        progress.done()
        print('\n')
        print(
            'iteration/epoch ', i,
            "- iteration time: %.2f min" % ((time.time() - it_st_time) / 60),
            ", train accuracy:: sentence: %.3f " % sen_acc,
            " words: %.3f " % word_acc,
            ", total time: %.2f min" % ((time.time() - st_time) / 60))

        if ((i + 1) % accuracy_step == 0
                and self.path_to_valid_file is not None):
            print("validation accuracy calculation step:")
            valid_sent_acc, valid_word_acc = self.calc_accuracy(
                self.path_to_valid_file)
            print("valid accuracy:: sentence: %.3f" % valid_sent_acc,
                  " words: %.3f" % valid_word_acc)
            self.w.dump(self.weights_file_name)
            print("saved weights @ ", self.weights_file_name)

            # Save a checkpoint with the weights and both accuracy pairs.
            path = self.training_file_path + "_epoch_" + str(
                i) + ".checkpoint"
            ckpt = {}
            ckpt['weights'] = self.w.tolist()
            ckpt['train_acc'] = (sen_acc, word_acc)
            ckpt['valid_acc'] = (valid_sent_acc, valid_word_acc)
            with open(path, 'wb') as fp:
                pickle.dump(ckpt, fp)
            print("saved checkpoint @ ", path)

    self.w.dump(self.weights_file_name)
    # Use num_iterations rather than the loop variable: it equals i + 1
    # after a non-empty loop and stays defined when num_iterations is 0.
    path = (self.training_file_path + "_" + str(num_iterations)
            + "_epochs" + ".results")
    ckpt = {}
    ckpt['weights'] = self.w.tolist()
    ckpt['train_word_acc'] = train_word_accuracies
    ckpt['train_sen_acc'] = train_sentenence_accuracies
    with open(path, 'wb') as fp:
        pickle.dump(ckpt, fp)
    print("saved final results @ ", path)