def get_dataset_samples(filename: str):
    dataset_samples = []
    for name, code in import_data(filename).items():
        dataset_samples.append(''.join(code['original'][0]))
        for noisy_sample in code['noise']:
            dataset_samples.append(''.join(noisy_sample[0]))
    return set(dataset_samples)
def evaluate(self, data_filename: str, consider_only_first_n_components: int = None,
             num_nns: int = 10) -> np.array:
    data = import_data(data_filename)
    encodings = []
    equivalent_to = []
    equivalence_sets = []
    for name, code in data.items():
        idx = len(encodings)
        enc = self.__encoder.get_encoding(code['original'])
        assert not np.isnan(np.sum(enc))
        encodings.append(enc)
        equivalent_to.append(idx)
        for noisy_sample in code['noise']:
            enc = self.__encoder.get_encoding(noisy_sample)
            assert not np.isnan(np.sum(enc))
            encodings.append(enc)
            equivalent_to.append(idx)
        equivalence_sets.append(set(range(idx, len(encodings))))

    encodings = np.array(encodings)
    if consider_only_first_n_components is not None:
        encodings = encodings[:, :consider_only_first_n_components]

    all_distances = squareform(pdist(encodings, 'cosine'))  # TODO: avoid square form somehow
    assert not np.any(np.isnan(all_distances))
    identity = np.arange(all_distances.shape[0])
    # The distance to self is infinite so that the next step finds the real neighbors
    all_distances[identity, identity] = float('inf')

    k_nearest_neighbor_idxs = np.argpartition(all_distances, num_nns)[:, :num_nns]
    left_index = np.atleast_2d(identity).T
    order_of_knearest_neighbors = np.argsort(all_distances[left_index, k_nearest_neighbor_idxs])
    # From here on the array holds ordered neighbor indices, not distances
    nearest_neighbors = k_nearest_neighbor_idxs[left_index, order_of_knearest_neighbors]

    equivalent_elements = {}
    for eq_set in equivalence_sets:
        for element in eq_set:
            equivalent_elements[element] = eq_set

    k_nns_semantic_eq = np.zeros(num_nns, dtype=np.float64)
    num_k_nns = np.zeros(num_nns)
    for i in range(nearest_neighbors.shape[0]):
        semantically_eq_nns = equivalent_elements[i]
        if len(semantically_eq_nns) < 2:
            continue
        for j in range(num_nns):
            num_k_nns[j] += 1
            k_nns_semantic_eq[j] += float(len(semantically_eq_nns & set(nearest_neighbors[i, :j + 1]))) / \
                min(len(semantically_eq_nns), j + 1)
    return k_nns_semantic_eq / num_k_nns
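A standalone toy illustration (not part of the codebase) of the two-step k-NN lookup used above: np.argpartition selects the num_nns smallest distances per row in linear time, and np.argsort then orders only those selected columns.

import numpy as np

dists = np.array([[np.inf, 0.3, 0.1, 0.7],
                  [0.3, np.inf, 0.9, 0.2],
                  [0.1, 0.9, np.inf, 0.5],
                  [0.7, 0.2, 0.5, np.inf]])
k = 2
knn = np.argpartition(dists, k)[:, :k]           # k smallest per row, unordered
rows = np.atleast_2d(np.arange(dists.shape[0])).T
knn = knn[rows, np.argsort(dists[rows, knn])]    # order the k selected columns
print(knn)  # first row is [2, 1]: the two nearest neighbors of point 0, nearest first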
def __init__(self, filename: str, training_data: dict = None):
    """
    :param filename: the filename of the training data, ignored if training_data is *not* None
    :param training_data: use this training data instead of loading the filename, defaults to
        None (and thus data is loaded from the filename)
    """
    if training_data is None:
        training_data = import_data(filename)
    self.num_equivalent_classes = len(training_data)

    def get_vocabulary():
        for data in training_data.values():
            original_tree = data["original"][1]
            for node in original_tree:
                yield node.name

    self.__node_type_dict = FeatureDictionary.get_feature_dictionary_for(get_vocabulary())

    def get_top_level_symbols():
        for data in training_data.values():
            original_tree = data["original"][1]
            yield original_tree.symbol
            for noise_expr in data["noise"]:
                yield noise_expr[1].symbol

    self.__symbol_dict = FeatureDictionary.get_feature_dictionary_for(get_top_level_symbols(), 0)
    self.__empirical_symbol_dist = get_empirical_distribution(self.__symbol_dict,
                                                              get_top_level_symbols())

    def get_num_properties():
        for data in training_data.values():
            original_tree = data["original"][1]
            for node in original_tree:
                yield len(node.properties)

    self.__max_num_properties_per_node = max(get_num_properties())

    def get_start_node():
        for data in training_data.values():
            original_tree = data["original"][1]
            yield original_tree.name

    tree_roots = set(get_start_node())
    assert len(tree_roots) == 1  # Everything should be a block!
    self.__root_type = tree_roots.pop()

    self.__node_to_properties = {}
    for data in training_data.values():
        original_tree = data["original"][1]
        for node in original_tree:
            self.__node_to_properties[node.name] = node.properties
def prediction_accuracy(self, dataset_file):
    self.__compile_if_needed()
    data = import_data(dataset_file)
    dataset = list(self.__dataset_extractor.get_dataset_for_encoder(data, return_num_tokens=True))
    correct = 0
    for tree in dataset:
        all_args = list(tree[0])
        ll, logprobs = self.__compiled_methods.ll_and_logprobs(*all_args)
        if np.argmax(logprobs) == all_args[-1]:
            correct += 1
    return correct / len(dataset)
def __init__(self, filename):
    training_data = import_data(filename)
    self.num_equivalence_classes = len(training_data)

    def vocabulary():
        for data in training_data.values():
            for token in self.__add_start_end_symbols(data["original"][0]):
                yield token
            for noisy_sample in data["noise"]:
                for token in self.__add_start_end_symbols(noisy_sample[0]):
                    yield token

    self.__feature_map = FeatureDictionary.get_feature_dictionary_for(vocabulary())
    dataset = self.build_dataset(training_data)
    self.__dataset = dataset
def __init__(self, train_file):
    data = import_data(train_file)

    def document_tokens():
        for snippet in data.values():
            yield snippet['original'][0]

    all_document_tokens = [s for s in document_tokens()]
    self.__feature_dict = FeatureDictionary.get_feature_dictionary_for(chain(*all_document_tokens),
                                                                       count_threshold=10)
    # np.int/np.float have been removed from NumPy; use explicit dtypes
    self.__idfs = np.ones(len(self.__feature_dict), dtype=np.int64)  # use 1s for smoothing
    for document in all_document_tokens:
        document_word_ids = set(self.__feature_dict.get_id_or_unk(t) for t in document)
        for word_id in document_word_ids:
            self.__idfs[word_id] += 1
    self.__idfs = np.log(self.__idfs.astype(np.float64))
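A hypothetical sketch (the helper name and weighting scheme are assumptions, not part of this file) of how the stored log-IDFs could be combined with raw term counts to produce a TF-IDF vector for one document:

def tfidf_vector(feature_dict, idfs, tokens):
    # Hypothetical helper: raw term counts scaled by the precomputed log-IDFs
    vec = np.zeros(len(idfs))
    for t in tokens:
        vec[feature_dict.get_id_or_unk(t)] += 1.0  # term frequency
    return vec * idfs  # element-wise TF * IDF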
def __init__(self, training_file, hyperparameters, encoder_type='gru', use_centroid=False):
    """
    :param training_file: the filename of the training data
    :type hyperparameters: dict
    """
    self.__hyperparameters = hyperparameters
    self.dataset_extractor = TokenAutoencoderDatasetExtractor(training_file)

    empirical_distribution = get_empirical_distribution(
        self.dataset_extractor.feature_map,
        chain(*self.dataset_extractor.get_nonnoisy_samples(import_data(training_file))))
    self.__encoder = SequenceGruSupervisedEncoderModel(
        self.__hyperparameters["embedding_size"],
        len(self.dataset_extractor.feature_map),
        empirical_distribution,
        self.__hyperparameters["representation_size"],
        self.__hyperparameters,
        encoder_type=encoder_type,
        use_centroid=use_centroid)

    target_embeddings = np.random.randn(self.__hyperparameters["representation_size"],
                                        self.dataset_extractor.num_equivalence_classes) \
        * 10 ** self.__hyperparameters["log_init_noise"]
    self.__target_embeddings = theano.shared(target_embeddings.astype(theano.config.floatX),
                                             name="target_embeddings")
    self.__target_embeddings_dropout = dropout(self.__hyperparameters['dropout_rate'],
                                               self.__encoder.rng, self.__target_embeddings, True)

    self.__trained_parameters = None
    self.__compiled_methods = None
def get_representation_distance_ratio(encoder: AbstractEncoder, data_filename: str,
                                      print_stats: bool = False):
    """Compute the ratio of the avg distance between all points vs the avg distance
    of points within an equivalence class."""
    data = import_data(data_filename)
    encodings = []
    equivalence_sets = []
    for name, code in data.items():
        idx = len(encodings)
        enc = encoder.get_encoding(code['original'])
        assert not np.isnan(np.sum(enc))
        encodings.append(enc)
        for noisy_sample in code['noise']:
            enc = encoder.get_encoding(noisy_sample)
            assert not np.isnan(np.sum(enc))
            encodings.append(enc)
        equivalence_sets.append(set(range(idx, len(encodings))))

    encodings = np.array(encodings)
    all_distances = squareform(pdist(encodings, 'cosine'))  # TODO: avoid square form somehow
    assert not np.any(np.isnan(all_distances))

    # Average the lower triangle of all_distances
    avg_distance_between_all_points = np.sum(np.tril(all_distances, k=-1)) / \
        (len(encodings) * (len(encodings) - 1) / 2)

    sum_distance_within_eq_class = 0.
    num_pairs = 0
    for equiv_class_idxs in equivalence_sets:
        num_elements_in_class = len(equiv_class_idxs)
        if num_elements_in_class < 2:
            continue
        elems_in_eq_class = np.fromiter(equiv_class_idxs, dtype=np.int32)
        sum_distance_within_eq_class += np.sum(
            np.tril(all_distances[elems_in_eq_class][:, elems_in_eq_class], k=-1))
        num_pairs += num_elements_in_class * (num_elements_in_class - 1) / 2

    avg_distance_within_eq_class = sum_distance_within_eq_class / num_pairs
    if print_stats:
        print("Within Avg Dist: %s All Avg Dist: %s"
              % (avg_distance_within_eq_class, avg_distance_between_all_points))
    return avg_distance_between_all_points / avg_distance_within_eq_class
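A hypothetical usage sketch (file paths are placeholders): a ratio well above 1.0 indicates that encodings within an equivalence class lie closer together than encodings in general.

encoder = AbstractEncoder.load('encoder.pkl')  # placeholder path
ratio = get_representation_distance_ratio(encoder, 'testset.json.gz', print_stats=True)
print("Distance ratio: %.3f" % ratio)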
def __init__(self, training_file, hyperparameters, encoder_type='gru', use_centroid=False):
    """
    :param training_file: the filename of the training data
    :type hyperparameters: dict
    """
    self.__hyperparameters = hyperparameters
    self.dataset_extractor = TokenAutoencoderDatasetExtractor(training_file)

    empirical_distribution = get_empirical_distribution(
        self.dataset_extractor.feature_map,
        chain(*self.dataset_extractor.get_nonnoisy_samples(import_data(training_file))))
    self.__encoder = SequenceGruSiameseEncoderModel(
        self.__hyperparameters["embedding_size"],
        len(self.dataset_extractor.feature_map),
        empirical_distribution,
        self.__hyperparameters["representation_size"],
        self.__hyperparameters,
        encoder_type=encoder_type,
        use_centroid=use_centroid)

    self.__trained_parameters = None
    self.__compiled_methods = None
def get_dataset_samples(filename: str):
    dataset_samples = []
    for name, code in import_data(filename).items():
        dataset_samples.append((''.join(code['original'][0]), code['original'][1]))
        for noisy_sample in code['noise']:
            dataset_samples.append((''.join(noisy_sample[0]), noisy_sample[1]))
    return set(dataset_samples)


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage <encoderPkl> <dataset.json.gz> <testset.json.gz>")
        sys.exit(-1)

    testset_samples = get_dataset_samples(sys.argv[3])
    data = import_data(sys.argv[2])
    encoder = AbstractEncoder.load(sys.argv[1])

    expression_data, encodings = [], []
    eq_class_idx_to_names = {}
    eq_class_counts = defaultdict(int)

    def add_sample(data, eq_class_idx: int):
        sample_data = dict(tree=data[1], eq_class=eq_class_idx)
        expression_data.append(sample_data)
        representation = encoder.get_encoding(data)
        assert not np.isnan(np.sum(representation))
        encodings.append(representation)

    for eq_class_idx, (name, code) in enumerate(data.items()):
def evaluate_with_test(self, data_filename: str, test_filename: str,
                       consider_only_first_n_components: int = None,
                       num_nns: int = 15) -> np.array:
    test_data = import_data(test_filename)
    test_samples = defaultdict(set)  # eq_class -> tokens
    for eq_class, code in test_data.items():
        test_samples[eq_class].add(''.join(code['original'][0]))
        for sample in code['noise']:
            test_samples[eq_class].add(''.join(sample[0]))

    data = import_data(data_filename)
    encodings = []
    equivalence_classes = defaultdict(set)  # eq_class -> set(ids)
    test_samples_idx_map = OrderedDict()  # id -> eq_class
    for eq_class, code in data.items():
        encoding = self.__encoder.get_encoding(code['original'])
        assert not np.isnan(np.sum(encoding))
        encodings.append(encoding)
        equivalence_classes[eq_class].add(len(encodings) - 1)
        if ''.join(code['original'][0]) in test_samples[eq_class]:
            test_samples_idx_map[len(encodings) - 1] = eq_class
        for noisy_sample in code['noise']:
            encoding = self.__encoder.get_encoding(noisy_sample)
            assert not np.isnan(np.sum(encoding))
            encodings.append(encoding)
            equivalence_classes[eq_class].add(len(encodings) - 1)
            if ''.join(noisy_sample[0]) in test_samples[eq_class]:
                test_samples_idx_map[len(encodings) - 1] = eq_class

    test_sample_idxs = np.fromiter(test_samples_idx_map.keys(), dtype=np.int32)
    encodings = np.array(encodings)
    if consider_only_first_n_components is not None:
        encodings = encodings[:, :consider_only_first_n_components]

    nearest_neighbors = cdist(encodings[test_sample_idxs], encodings, 'cosine')
    identity = np.arange(nearest_neighbors.shape[0])
    assert nearest_neighbors.shape[0] == len(test_sample_idxs)
    # The distance to self is infinite so that the next step finds the real neighbors
    nearest_neighbors[identity, test_sample_idxs] = float('inf')

    k_nearest_neighbor_idxs = np.argpartition(nearest_neighbors, num_nns)[:, :num_nns]
    left_index = np.atleast_2d(identity).T
    order_of_knearest_neighbors = np.argsort(nearest_neighbors[left_index, k_nearest_neighbor_idxs])
    nearest_neighbors = k_nearest_neighbor_idxs[left_index, order_of_knearest_neighbors]

    k_nns_semantic_eq = np.zeros(num_nns, dtype=np.float64)
    num_k_nns = np.zeros(num_nns)
    for i in range(nearest_neighbors.shape[0]):
        test_sample_i = test_sample_idxs[i]
        semantically_eq_nns = equivalence_classes[test_samples_idx_map[test_sample_i]]
        if len(semantically_eq_nns) < 2:
            continue
        for j in range(num_nns):
            num_k_nns[j] += 1
            # Formula (4) of the paper: the proportion of the k nearest neighbors of each
            # expression (using cosine similarity) that belong to the same equivalence class
            k_nns_semantic_eq[j] += float(len(semantically_eq_nns & set(nearest_neighbors[i, :j + 1]))) / \
                min(len(semantically_eq_nns), j + 1)
    return k_nns_semantic_eq / num_k_nns
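A worked toy case (standalone, assumed values) of one term of the score accumulated above:

eq_class = {0, 1, 2}   # ids known to be semantically equivalent (incl. the sample itself)
neighbors = [1, 5, 2]  # ordered nearest-neighbor ids of the test sample
k = 2
score_k = len(eq_class & set(neighbors[:k])) / min(len(eq_class), k)
print(score_k)  # |{1}| / min(3, 2) = 0.5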
def train(self, training_file, validation_file, max_iter=5000, patience=50,
          validation_check_limit=2, additional_code_to_run=None) -> tuple:
    self.__compile_if_needed()
    minibatch_size = self.__hyperparameters["minibatch_size"]
    training_data = import_data(training_file)
    training_set = list(self.__dataset_extractor.get_dataset_for_encoder(training_data,
                                                                         return_num_tokens=True))
    validation_set = list(self.__dataset_extractor.get_dataset_for_encoder(
        import_data(validation_file), return_num_tokens=True))
    print("Num classes: %s" % self.__dataset_extractor.num_equivalent_classes)

    def compute_validation_score() -> float:
        print("Train Accuracy %s" % compute_score(training_set, False, True)[1])
        return compute_score(validation_set)

    def compute_score(dataset, print_score=True, return_accuracy=False) -> float:
        # Get all encodings
        sum_ll = 0.
        correct = 0
        for tree in dataset:
            all_args = list(tree[0])
            ll, logprobs = self.__compiled_methods.ll_and_logprobs(*all_args)
            sum_ll += ll
            if np.argmax(logprobs) == all_args[-1]:
                correct += 1
        if print_score:
            print("Accuracy: %s, LL: %s" % (correct / len(dataset) * 100, sum_ll / len(dataset)))
        if return_accuracy:
            return sum_ll / len(dataset), (correct / len(dataset) * 100)
        return correct / len(dataset) * 100

    if self.__trained_parameters is None:
        best_score = float('-inf')
    else:
        best_score = compute_validation_score()
        print("Previous best validation score: %s" % best_score)

    try:
        print("[%s] Training Started..." % time.asctime())
        ratios = np.zeros(len(self.__trainable_params))
        epochs_not_improved = 0
        historic_data = defaultdict(list)
        # Clump minibatches and disallow minibatches that are smaller than their given size,
        # since they may cause instability.
        current_max_size = self.__hyperparameters['curriculum_initial_size']
        curriculum_step = self.__hyperparameters['curriculum_step']
        for i in range(max_iter):
            sample_ordering = []
            for j, tree_data in enumerate(training_set):
                if tree_data[1] <= current_max_size:
                    sample_ordering.append(j)
            current_max_size += curriculum_step
            # Shuffle in place; shuffling a temporary np.array copy would have no effect
            sample_ordering = np.array(sample_ordering, dtype=np.int32)
            np.random.shuffle(sample_ordering)
            n_batches = 0
            sum_train_loss = 0
            num_elements = 0
            num_minibatches = max(1, min(int(np.floor(float(len(sample_ordering)) / minibatch_size)), 10))
            for j in trange(num_minibatches, desc="Minibatch"):
                for k in trange(j * minibatch_size,
                                min((j + 1) * minibatch_size, len(sample_ordering)),
                                desc="Sample", leave=False):
                    current_idx = sample_ordering[k]
                    args = list(training_set[current_idx][0]) + [i]
                    loss = self.__compiled_methods.grad_accumulate(*args)
                    sum_train_loss += loss
                    num_elements += 1
                n_batches += 1
                ratios += self.__compiled_methods.grad_step()

            if i % validation_check_limit == validation_check_limit - 1:
                print("Iteration %s Stats" % i)
                current_score = compute_validation_score()
                historic_data['validation_score'].append(current_score)
                if current_score > best_score:
                    best_score = current_score
                    self.__trained_parameters = [p.get_value() for p in self.__trainable_params]
                    print("At %s validation: current_score=%s [best so far]" % (i, current_score))
                    epochs_not_improved = 0
                else:
                    print("At %s validation: current_score=%s" % (i, current_score))
                    epochs_not_improved += 1
                for k in range(len(self.__trainable_params)):
                    print("%s: %.0e" % (self.__trainable_params[k].name, ratios[k] / n_batches))
                print("Train ll: %s" % (sum_train_loss / num_elements))
                ratios = np.zeros_like(ratios)
                if additional_code_to_run is not None:
                    additional_code_to_run(historic_data)
            if epochs_not_improved >= patience:
                print("Not improved for %s epochs. Stopping..." % patience)
                break
        print("[%s] Training Finished..." % time.asctime())
    except (InterruptedError, KeyboardInterrupt):
        print("Interrupted. Exiting training gracefully...")
    return best_score, historic_data
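A minimal sketch (the callback name is an assumption) of the additional_code_to_run hook: train() invokes it after each validation check and passes the historic_data dict, which makes it a convenient place for logging or live plotting.

def log_progress(historic_data):
    # historic_data['validation_score'] gains one entry per validation check
    scores = historic_data['validation_score']
    print("Checks: %d, best validation score so far: %s" % (len(scores), max(scores)))

# best_score, history = model.train('train.json.gz', 'valid.json.gz',
#                                   additional_code_to_run=log_progress)  # paths illustrative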
import sys

from matplotlib import pyplot as plt
from scipy.spatial.distance import squareform, pdist
from sklearn.manifold import TSNE

from data.dataimport import import_data
from encoders.baseencoder import AbstractEncoder

if __name__ == '__main__':
    if len(sys.argv) != 5:
        print("Usage <encoderPkl> <dataset.json.gz> <testset.json.gz> <neweqtestset.json.gz>")
        sys.exit(-1)

    testset_samples = []
    for name, code in import_data(sys.argv[3]).items():
        testset_samples.append(''.join(code['original'][0]))
        for noisy_sample in code['noise']:
            testset_samples.append(''.join(noisy_sample[0]))
    testset_samples = set(testset_samples)

    neweq_test_set_eq_classes = set(import_data(sys.argv[4]).keys())

    data = import_data(sys.argv[2])
    encoder = AbstractEncoder.load(sys.argv[1])

    encodings, eq_classes_idxs, test_sample_idxs, neweq_samples_idxs = [], [], [], []
    eq_class_idx_to_names = {}
    for eq_class_idx, (name, code) in enumerate(data.items()):
        eq_class_idx_to_names[eq_class_idx] = name
def train(self, training_file: str, validation_file: str, max_iter: int = 1000,
          patience: int = 25, validation_check_limit: int = 1,
          semantically_equivalent_noise: bool = False,
          additional_code_to_run=None) -> tuple:
    self.__compile_if_needed()
    minibatch_size = self.__hyperparameters["minibatch_size"]
    training_data = import_data(training_file)
    training_set = list(self.dataset_extractor.get_dataset_for_encoder(training_data,
                                                                       return_num_tokens=True))
    validation_set = list(self.dataset_extractor.get_dataset_for_encoder(
        import_data(validation_file), return_num_tokens=True))
    best_score = float('-inf')
    train_x_ent = 0
    epochs_not_improved = 0
    historic_values = []
    trainable_parameters = list(self.__encoder.parameters.values()) + [self.__target_embeddings]
    print("Num classes: %s" % self.dataset_extractor.num_equivalence_classes)

    def compute_validation_score() -> float:
        return compute_score(validation_set)

    def compute_score(dataset) -> float:
        # Get all encodings
        sum_ll = 0.
        correct = 0
        for data in dataset:
            ll, logprobs = self.__compiled_methods.ll_and_logprobs(data[0], data[2])
            sum_ll += ll
            if np.argmax(logprobs) == data[2]:
                correct += 1
        print("Accuracy: %s" % (correct / len(dataset) * 100))
        return sum_ll / len(dataset)

    # Clump minibatches
    num_minibatches = max(1, min(int(np.floor(float(len(training_set)) / minibatch_size)), 25))

    try:
        print("[%s] Training Started..." % time.asctime())
        ratios = np.zeros(len(trainable_parameters))
        n_batches = 0
        current_max_size = 3.
        curriculum_step = .2
        for i in range(max_iter):
            sample_ordering = []
            for j, tree_data in enumerate(training_set):
                if tree_data[1] <= current_max_size:
                    sample_ordering.append(j)
            current_max_size += curriculum_step
            # Shuffle in place; shuffling a temporary np.array copy would have no effect
            sample_ordering = np.array(sample_ordering, dtype=np.int32)
            np.random.shuffle(sample_ordering)
            n_batches = 0
            sum_train_loss = 0
            num_elements = 0
            for j in trange(num_minibatches, desc="Minibatch"):
                for k in trange(j * minibatch_size,
                                min((j + 1) * minibatch_size, len(sample_ordering)),
                                desc="Sample", leave=False):
                    current_idx = sample_ordering[k]
                    loss = self.__compiled_methods.grad_accumulate(training_set[current_idx][0],
                                                                   training_set[current_idx][2])
                    sum_train_loss += loss
                    num_elements += 1
                n_batches += 1
                ratios += self.__compiled_methods.grad_step()

            if i % validation_check_limit == validation_check_limit - 1:
                current_ll = compute_validation_score()
                if current_ll > best_score:
                    best_score = current_ll
                    self.__save_current_params_as_best()
                    print("At %s validation: current_ll=%s [best so far]" % (i, current_ll))
                    epochs_not_improved = 0
                else:
                    print("At %s validation: current_ll=%s" % (i, current_ll))
                    epochs_not_improved += 1
                for k in range(len(trainable_parameters)):
                    print("%s: %.0e" % (trainable_parameters[k].name, ratios[k] / n_batches))
                print("Train ll: %s" % (sum_train_loss / num_elements))
                ratios = np.zeros_like(ratios)
                if additional_code_to_run is not None:
                    additional_code_to_run()
            if epochs_not_improved >= patience:
                print("Not improved for %s epochs. Stopping..." % patience)
                break
        print("[%s] Training Finished..." % time.asctime())
    except (InterruptedError, KeyboardInterrupt, SystemExit):
        print("Interrupted. Exiting training gracefully...")
    return best_score, historic_values
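A standalone illustration (toy sizes; the threshold and step mirror the constants above) of the curriculum schedule: each epoch admits only samples whose token count is at most a threshold that grows by curriculum_step per epoch.

sizes = [2, 3, 4, 6]  # toy per-sample token counts
current_max_size, curriculum_step = 3., .2
for epoch in range(6):
    admitted = [i for i, s in enumerate(sizes) if s <= current_max_size]
    print(epoch, admitted)  # samples 0 and 1 from the start; sample 2 (size 4) joins at epoch 5
    current_max_size += curriculum_step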
import sys

import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from data.dataimport import import_data


def plot_distribution(data, title):
    data = np.array([d for d in data])
    sns.distplot(data, rug=True)
    plt.title(title)
    plt.show()


if len(sys.argv) != 2:
    print("Usage <dataset.json.gz>")
    sys.exit(-1)

data = import_data(sys.argv[1]).values()


def num_noise_samples_per_original():
    for snippet in data:
        yield len(snippet["noise"])


plot_distribution(num_noise_samples_per_original(), title="Num noise samples per original")


def num_nodes_of_original():
    for snippet in data:
        original_tree = snippet["original"][1]
        yield sum(1 for node in original_tree)
def train(self, training_file, validation_file, max_iter=1000, patience=25,
          validation_check_limit=1, additional_code_to_run=None) -> tuple:
    self.__compile_if_needed()
    minibatch_size = self.__hyperparameters["minibatch_size"]
    training_data = import_data(training_file)
    training_set = list(self.__dataset_extractor.get_dataset_for_encoder(training_data,
                                                                         return_num_tokens=True))
    validation_set = list(self.__dataset_extractor.get_dataset_for_encoder(
        import_data(validation_file), return_num_tokens=True))

    def compute_validation_score() -> float:
        return compute_score(validation_set)

    def compute_score(dataset) -> float:
        # Get all encodings
        encodings = []
        equivalents = defaultdict(set)
        for i, tree in enumerate(dataset):
            encodings.append(self.__compiled_methods.encode(*tree[0][:-1]))
            equivalents[tree[2]].add(i)
        encodings = np.array(encodings, dtype=theano.config.floatX)

        # Get all cosine similarities
        distances = pdist(encodings)
        is_similar = np.zeros_like(distances, dtype=np.int64)
        for equivalence_set in equivalents.values():
            for i, j in permutations(equivalence_set, 2):
                if i > j:
                    # Index of the (i, j) pair in the condensed distance matrix
                    is_similar[encodings.shape[0] * j - int(j * (j + 1) / 2) + i - 1 - j] = 1
        similar_score = -np.sum(np.power(distances * is_similar, 2))

        margin = self.__hyperparameters['dissimilar_margin']
        differences = margin - distances
        rectified_diffs = differences * (differences > 0)
        dissimilar_score = -np.sum(np.power(rectified_diffs * (1 - is_similar), 2))
        print("Similar Loss: %s Dissimilar Loss: %s" % (similar_score, dissimilar_score))
        return similar_score + dissimilar_score

    if self.__trained_parameters is None:
        best_score = float('-inf')
    else:
        best_score = compute_validation_score()
        print("Previous best validation score: %s" % best_score)

    try:
        print("[%s] Training Started..." % time.asctime())
        sum_similar_loss = 0.
        num_similar_loss = 0
        sum_dissimilar_loss = 0.
        num_dissimilar_loss = 0
        ratios = np.zeros(len(self.__trainable_params))
        epochs_not_improved = 0
        historic_data = defaultdict(list)
        # Clump minibatches and disallow minibatches that are smaller than their given size,
        # since they may cause instability.
        num_minibatches = max(1, min(int(np.floor(float(len(training_set)) / minibatch_size)), 10))
        current_max_size = self.__hyperparameters['curriculum_initial_size']
        curriculum_step = self.__hyperparameters['curriculum_step']
        num_examples = self.__hyperparameters['max_num_similar_examples']
        num_dissimilar_examples = self.__hyperparameters['max_num_dissimilar_examples']
        for i in range(max_iter):
            sample_ordering = []
            for j, tree_data in enumerate(training_set):
                if tree_data[1] <= current_max_size:
                    sample_ordering.append(j)
            current_max_size += curriculum_step
            # Shuffle in place; shuffling a temporary np.array copy would have no effect
            sample_ordering = np.array(sample_ordering, dtype=np.int32)
            np.random.shuffle(sample_ordering)
            n_batches = 0
            for j in trange(num_minibatches, desc="Minibatch"):
                for k in trange(j * minibatch_size,
                                min((j + 1) * minibatch_size, len(sample_ordering)),
                                desc="Sample", leave=False):
                    current_idx = sample_ordering[k]
                    # Add siamese gradients, by picking num_examples
                    similar_snippet_idxs = []
                    dissimilar_snippet_idxs = []
                    for l in range(len(sample_ordering)):
                        if l == k:
                            continue
                        other_idx = sample_ordering[l]
                        if training_set[current_idx][2] == training_set[other_idx][2]:
                            similar_snippet_idxs.append(other_idx)
                        else:
                            dissimilar_snippet_idxs.append(other_idx)
                    dissimilar_snippet_idxs = np.array(dissimilar_snippet_idxs)
                    np.random.shuffle(similar_snippet_idxs)
                    np.random.shuffle(dissimilar_snippet_idxs)
                    for other_idx in similar_snippet_idxs[:num_examples]:
                        args = list(training_set[current_idx][0]) + list(training_set[other_idx][0]) + [i]
                        loss = self.__compiled_methods.grad_accumulate(*args)
                        sum_similar_loss += loss
                        num_similar_loss += 1
                    for other_idx in dissimilar_snippet_idxs[:num_dissimilar_examples]:
                        args = list(training_set[current_idx][0]) + list(training_set[other_idx][0]) + [i]
                        loss = self.__compiled_methods.grad_accumulate(*args)
                        sum_dissimilar_loss += loss
                        num_dissimilar_loss += 1 if loss < 0 else 0
                n_batches += 1
                ratios += self.__compiled_methods.grad_step()

            if i % validation_check_limit == validation_check_limit - 1:
                print("Iteration %s Stats" % i)
                current_score = compute_validation_score()
                historic_data['validation_score'].append(current_score)
                if current_score > best_score:
                    best_score = current_score
                    self.__trained_parameters = [p.get_value() for p in self.__trainable_params]
                    print("At %s validation: current_score=%s [best so far]" % (i, current_score))
                    epochs_not_improved = 0
                else:
                    print("At %s validation: current_score=%s" % (i, current_score))
                    epochs_not_improved += 1
                for k in range(len(self.__trainable_params)):
                    print("%s: %.0e" % (self.__trainable_params[k].name, ratios[k] / n_batches))
                print("Train sum similar-loss: %s (%s samples)" % (sum_similar_loss, num_similar_loss))
                print("Train sum dissimilar-loss: %s (%s samples)"
                      % (sum_dissimilar_loss, num_dissimilar_loss))
                # print("Training Set stats: %s" % compute_score(training_set[:500]))
                sum_similar_loss = 0
                num_similar_loss = 0
                sum_dissimilar_loss = 0
                num_dissimilar_loss = 0
                ratios = np.zeros_like(ratios)
                if additional_code_to_run is not None:
                    additional_code_to_run(historic_data)
            if epochs_not_improved >= patience:
                print("Not improved for %s epochs. Stopping..." % patience)
                break
        print("[%s] Training Finished..." % time.asctime())
    except (InterruptedError, KeyboardInterrupt):
        print("Interrupted. Exiting training gracefully...")
    return best_score, historic_data
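A standalone check (toy data, not from the codebase) that the condensed-index arithmetic used in compute_score above addresses the same entry that squareform would: for i > j, pdist stores the (i, j) pair at index n*j - j*(j+1)/2 + i - 1 - j.

import numpy as np
from scipy.spatial.distance import pdist, squareform

points = np.random.randn(6, 4)
condensed = pdist(points)
square = squareform(condensed)
n = points.shape[0]
for j in range(n):
    for i in range(j + 1, n):
        idx = n * j - j * (j + 1) // 2 + i - 1 - j
        assert np.isclose(condensed[idx], square[i, j])
print("condensed indexing matches squareform")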