示例#1
0
def get_dataset_samples(filename: str):
    """Return the set of distinct token strings in a dataset.

    Each snippet contributes its original token sequence and every noisy
    variant, joined into a single string.

    :param filename: dataset file readable by ``import_data``.
    :return: set of joined token strings.
    """
    # Build the set directly instead of accumulating a list first and
    # converting at the end; the keys of the dataset dict are not needed.
    dataset_samples = set()
    for code in import_data(filename).values():
        dataset_samples.add(''.join(code['original'][0]))
        for noisy_sample in code['noise']:
            dataset_samples.add(''.join(noisy_sample[0]))
    return dataset_samples
示例#2
0
    def evaluate(self,
                 data_filename: str,
                 consider_only_first_n_components: int = None,
                 num_nns: int = 10) -> np.array:
        """Score how often the k nearest neighbors of each encoding are semantically equivalent.

        Every sample (original and its noisy variants) is encoded; samples
        derived from the same snippet form one equivalence set. For each
        k in 1..num_nns the returned vector holds the average proportion of a
        point's k nearest neighbors (cosine distance) that lie in its own
        equivalence set, normalized by the best achievable count.

        :param data_filename: dataset file readable by ``import_data``.
        :param consider_only_first_n_components: if given, truncate encodings
            to their first n components before computing distances.
        :param num_nns: number of nearest neighbors to evaluate.
        :return: array of shape (num_nns,) with the averaged proportions.
        """
        data = import_data(data_filename)
        encodings = []
        equivalent_to = []  # NOTE(review): filled below but never read afterwards — dead state?
        equivalence_sets = []
        for name, code in data.items():
            # idx is the index of this snippet's first encoding; everything
            # appended until the end of the inner loop shares its equivalence set.
            idx = len(encodings)
            enc = self.__encoder.get_encoding(code['original'])
            assert not np.isnan(np.sum(enc))
            encodings.append(enc)
            equivalent_to.append(idx)
            for noisy_sample in code['noise']:
                enc = self.__encoder.get_encoding(noisy_sample)
                assert not np.isnan(np.sum(enc))
                encodings.append(enc)
                equivalent_to.append(idx)

            equivalence_sets.append(set(range(idx, len(encodings))))

        encodings = np.array(encodings)
        if consider_only_first_n_components is not None:
            encodings = encodings[:, :consider_only_first_n_components]

        all_distances = squareform(pdist(
            encodings, 'cosine'))  # TODO: avoid square form somehow
        assert not np.any(np.isnan(all_distances))
        identity = np.arange(all_distances.shape[0])
        all_distances[identity, identity] = float(
            'inf'
        )  # The distance to self is infinite to get the real neighbors in the next step
        # Unordered indices of the num_nns closest points per row.
        k_nearest_neighbor_idxs = np.argpartition(all_distances,
                                                  num_nns)[:, :num_nns]

        left_index = np.atleast_2d(identity).T
        # Sort each row's candidates by actual distance. After the next
        # assignment, `all_distances` is reused to hold the *indices* of the
        # k nearest neighbors in increasing-distance order (not distances).
        order_of_knearest_neighbors = np.argsort(
            all_distances[left_index, k_nearest_neighbor_idxs])
        all_distances = k_nearest_neighbor_idxs[left_index,
                                                order_of_knearest_neighbors]

        # Map each encoding index to its equivalence set for O(1) lookup.
        equivalent_elements = {}
        for eq_set in equivalence_sets:
            for element in eq_set:
                equivalent_elements[element] = eq_set

        k_nns_semantic_eq = np.zeros(num_nns, dtype=np.float64)
        num_k_nns = np.zeros(num_nns)
        for i in range(all_distances.shape[0]):
            semantically_eq_nns = equivalent_elements[i]
            if len(semantically_eq_nns) < 2:
                # Singleton classes have no other equivalent point to find.
                continue
            for j in range(num_nns):
                num_k_nns[j] += 1
                # Proportion of the first j+1 neighbors that share the
                # equivalence set, normalized by the best achievable count.
                k_nns_semantic_eq[j] += float(len(semantically_eq_nns & set(all_distances[i, :j + 1]))) / \
                                        min(len(semantically_eq_nns), j + 1)

        return k_nns_semantic_eq / num_k_nns
示例#3
0
    def __init__(self, filename: str, training_data: dict = None):
        """
        :param filename: the filename of the training data, ignored if training_data is *not* None
        :param training_data: use this training data instead of loading the filename, defaults to None (and thus
        data is loaded from the filename)
        """
        if training_data is None:
            training_data = import_data(filename)

        self.num_equivalent_classes = len(training_data)

        def original_trees():
            # Yield the AST of each snippet's original (non-noisy) version.
            for sample in training_data.values():
                yield sample["original"][1]

        # Vocabulary of node type names across all original trees.
        self.__node_type_dict = FeatureDictionary.get_feature_dictionary_for(
            node.name for tree in original_trees() for node in tree)

        def all_top_level_symbols():
            # Top-level symbol of every original and noisy expression.
            for sample in training_data.values():
                yield sample["original"][1].symbol
                for noisy_expr in sample["noise"]:
                    yield noisy_expr[1].symbol

        self.__symbol_dict = FeatureDictionary.get_feature_dictionary_for(
            all_top_level_symbols(), 0)
        self.__empirical_symbol_dist = get_empirical_distribution(
            self.__symbol_dict, all_top_level_symbols())

        # Largest number of properties carried by any node in an original tree.
        self.__max_num_properties_per_node = max(
            len(node.properties)
            for tree in original_trees() for node in tree)

        root_names = {tree.name for tree in original_trees()}
        assert len(root_names) == 1  # Everything should be a block!
        self.__root_type = root_names.pop()

        # Node type name -> properties (later occurrences overwrite earlier ones).
        self.__node_to_properties = {
            node.name: node.properties
            for tree in original_trees() for node in tree}
示例#4
0
    def prediction_accuracy(self, dataset_file):
        """Compute the fraction of samples whose argmax prediction equals the target.

        :param dataset_file: path to a dataset file readable by ``import_data``.
        :return: accuracy in [0, 1].
        """
        self.__compile_if_needed()
        samples = list(self.__dataset_extractor.get_dataset_for_encoder(
            import_data(dataset_file), return_num_tokens=True))

        num_correct = 0
        for sample in samples:
            args = list(sample[0])
            # The last argument is the target class index.
            _, logprobs = self.__compiled_methods.ll_and_logprobs(*args)
            if np.argmax(logprobs) == args[-1]:
                num_correct += 1
        return num_correct / len(samples)
示例#5
0
    def __init__(self, filename):
        """Build the token feature dictionary and dataset from the training file."""
        training_data = import_data(filename)
        self.num_equivalence_classes = len(training_data)

        def all_tokens():
            # Tokens of the original snippet and of every noisy variant,
            # each sequence wrapped with start/end symbols.
            for sample in training_data.values():
                yield from self.__add_start_end_symbols(sample["original"][0])
                for noisy in sample["noise"]:
                    yield from self.__add_start_end_symbols(noisy[0])

        self.__feature_map = FeatureDictionary.get_feature_dictionary_for(all_tokens())

        self.__dataset = self.build_dataset(training_data)
示例#6
0
    def __init__(self, train_file):
        """Build a feature dictionary and smoothed log document-frequency weights.

        :param train_file: path to a dataset file readable by ``import_data``.
        """
        data = import_data(train_file)

        def document_tokens():
            # Token sequence of each snippet's original version.
            for snippet in data.values():
                yield snippet['original'][0]

        all_document_tokens = list(document_tokens())
        self.__feature_dict = FeatureDictionary.get_feature_dictionary_for(chain(*all_document_tokens),
                                                                           count_threshold=10)

        # np.int / np.float were deprecated aliases of the builtins and were
        # removed in NumPy 1.24; use the builtin types directly.
        self.__idfs = np.ones(len(self.__feature_dict), dtype=int)  # use 1s for smoothing
        for document in all_document_tokens:
            # Count each word at most once per document.
            document_word_ids = set(self.__feature_dict.get_id_or_unk(t) for t in document)
            for word_id in document_word_ids:
                self.__idfs[word_id] += 1

        # NOTE(review): this stores log(document frequency), not a classical
        # idf (no N/df ratio) — confirm that is the intended weighting.
        self.__idfs = np.log(self.__idfs.astype(float))
    def __init__(self,
                 training_file,
                 hyperparameters,
                 encoder_type='gru',
                 use_centroid=False):
        """Build the supervised GRU sequence encoder and its target embeddings.

        :param training_file: path to the training dataset (read by the dataset
            extractor and again for the empirical token distribution).
        :type hyperparameters: dict
        :param encoder_type: forwarded to the encoder model (default 'gru').
        :param use_centroid: forwarded to the encoder model.
        """
        self.__hyperparameters = hyperparameters

        self.dataset_extractor = TokenAutoencoderDatasetExtractor(
            training_file)

        # Empirical token distribution over the non-noisy training samples.
        empirical_distribution = get_empirical_distribution(
            self.dataset_extractor.feature_map,
            chain(*self.dataset_extractor.get_nonnoisy_samples(
                import_data(training_file))))
        self.__encoder = SequenceGruSupervisedEncoderModel(
            self.__hyperparameters["embedding_size"],
            len(self.dataset_extractor.feature_map),
            empirical_distribution,
            self.__hyperparameters["representation_size"],
            self.__hyperparameters,
            encoder_type=encoder_type,
            use_centroid=use_centroid)

        # One random target embedding per equivalence class, scaled by
        # 10 ** log_init_noise.
        target_embeddings = np.random.randn(self.__hyperparameters["representation_size"],
                                            self.dataset_extractor.num_equivalence_classes) * 10 ** \
                                                                                              self.__hyperparameters[
                                                                                                  "log_init_noise"]

        self.__target_embeddings = theano.shared(target_embeddings.astype(
            theano.config.floatX),
                                                 name="target_embeddings")
        # Dropout-masked view of the target embeddings used during training.
        self.__target_embeddings_dropout = dropout(
            self.__hyperparameters['dropout_rate'], self.__encoder.rng,
            self.__target_embeddings, True)

        self.__trained_parameters = None
        self.__compiled_methods = None
示例#8
0
def get_representation_distance_ratio(encoder: AbstractEncoder, data_filename: str, print_stats: bool = False):
    """Compute the ratio of the avg distance of points within an equivalence class vs the avg distance between all points

    :param encoder: encoder producing one representation per sample.
    :param data_filename: dataset file readable by ``import_data``.
    :param print_stats: if True, print the two average distances.
    :return: avg all-pairs distance / avg within-class distance (larger means
        equivalent samples are comparatively closer together).
    """
    data = import_data(data_filename)
    encodings = []
    equivalence_sets = []

    for name, code in data.items():
        # Encodings appended between idx and the end of the inner loop all
        # belong to the same equivalence class.
        idx = len(encodings)
        enc = encoder.get_encoding(code['original'])
        assert not np.isnan(np.sum(enc))
        encodings.append(enc)
        for noisy_sample in code['noise']:
            enc = encoder.get_encoding(noisy_sample)
            assert not np.isnan(np.sum(enc))
            encodings.append(enc)
        equivalence_sets.append(set(range(idx, len(encodings))))

    encodings = np.array(encodings)

    all_distances = squareform(pdist(encodings, 'cosine'))  # TODO: avoid square form somehow
    assert not np.any(np.isnan(all_distances))

    # Average the lower triangle of all_distances
    avg_distance_between_all_points = np.sum(np.tril(all_distances, k=-1)) / (len(encodings) * (len(encodings) - 1) / 2)

    sum_distance_within_eq_class = 0.
    num_pairs = 0
    for equiv_class_idxs in equivalence_sets:
        num_elements_in_class = len(equiv_class_idxs)
        if num_elements_in_class < 2:
            # A singleton class contributes no within-class pair.
            continue
        elems_in_eq_class = np.fromiter(equiv_class_idxs, dtype=np.int32)
        # Sum the lower triangle of the class's distance submatrix.
        sum_distance_within_eq_class += np.sum(np.tril(all_distances[elems_in_eq_class][:, elems_in_eq_class], k=-1))
        num_pairs += num_elements_in_class * (num_elements_in_class - 1) / 2

    # NOTE(review): divides by zero if every equivalence class is a singleton — confirm inputs.
    avg_distance_within_eq_class = sum_distance_within_eq_class / num_pairs
    if print_stats:
        print(
            "Within Avg Dist: %s  All Avg Dist: %s " % (avg_distance_within_eq_class, avg_distance_between_all_points))
    return avg_distance_between_all_points / avg_distance_within_eq_class
    def __init__(self, training_file, hyperparameters, encoder_type='gru', use_centroid=False):
        """Set up the siamese GRU sequence encoder from the training corpus.

        :param training_file: path to the training dataset.
        :type hyperparameters: dict
        :param encoder_type: forwarded to the encoder model (default 'gru').
        :param use_centroid: forwarded to the encoder model.
        """
        self.__hyperparameters = hyperparameters

        self.dataset_extractor = TokenAutoencoderDatasetExtractor(training_file)

        # Empirical token distribution over the non-noisy training samples.
        feature_map = self.dataset_extractor.feature_map
        token_stream = chain(
            *self.dataset_extractor.get_nonnoisy_samples(import_data(training_file)))
        empirical_distribution = get_empirical_distribution(feature_map, token_stream)

        self.__encoder = SequenceGruSiameseEncoderModel(
            self.__hyperparameters["embedding_size"],
            len(feature_map),
            empirical_distribution,
            self.__hyperparameters["representation_size"],
            self.__hyperparameters,
            encoder_type=encoder_type,
            use_centroid=use_centroid)

        self.__trained_parameters = None
        self.__compiled_methods = None
示例#10
0
    for name, code in import_data(filename).items():
        dataset_samples.append(
            (''.join(code['original'][0]), code['original'][1]))
        for noisy_sample in code['noise']:
            dataset_samples.append((''.join(noisy_sample[0]), noisy_sample[1]))
    return set(dataset_samples)


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage <encoderPkl> <dataset.json.gz> <testset.json.gz>")
        sys.exit(-1)

    testset_samples = get_dataset_samples(sys.argv[3])

    data = import_data(sys.argv[2])
    encoder = AbstractEncoder.load(sys.argv[1])

    expression_data, encodings = [], []
    eq_class_idx_to_names = {}
    eq_class_counts = defaultdict(int)

    def add_sample(data, eq_class_idx: int):
        sample_data = dict(tree=data[1], eq_class=eq_class_idx)
        expression_data.append(sample_data)

        representation = encoder.get_encoding(data)
        assert not np.isnan(np.sum(representation))
        encodings.append(representation)

    for eq_class_idx, (name, code) in enumerate(data.items()):
示例#11
0
    def evaluate_with_test(self,
                           data_filename: str,
                           test_filename: str,
                           consider_only_first_n_components: int = None,
                           num_nns: int = 15) -> np.array:
        """Evaluate k-NN semantic equivalence using only test-set samples as queries.

        All samples in ``data_filename`` are encoded; those whose token string
        also occurs (within the same equivalence class) in ``test_filename``
        become query points. For each k in 1..num_nns the result holds the
        average proportion of a query's k nearest neighbors (cosine) that share
        its equivalence class.

        :param data_filename: dataset file readable by ``import_data``.
        :param test_filename: test dataset used to select the query samples.
        :param consider_only_first_n_components: if given, truncate encodings
            to their first n components before computing distances.
        :param num_nns: number of nearest neighbors to evaluate.
        :return: array of shape (num_nns,).
        """
        test_data = import_data(test_filename)
        test_samples = defaultdict(set)  # eq_class -> tokens
        for eq_class, code in test_data.items():
            test_samples[eq_class].add(''.join(code['original'][0]))
            for sample in code['noise']:
                test_samples[eq_class].add(''.join(sample[0]))

        data = import_data(data_filename)
        encodings = []
        equivalence_classes = defaultdict(set)  # eq_class->set(ids)
        test_samples_idx_map = OrderedDict()  # id-> eq_class
        for eq_class, code in data.items():
            encoding = self.__encoder.get_encoding(code['original'])
            assert not np.isnan(np.sum(encoding))
            encodings.append(encoding)
            equivalence_classes[eq_class].add(len(encodings) - 1)
            # Mark this encoding as a query point if its tokens occur in the test set.
            if ''.join(code['original'][0]) in test_samples[eq_class]:
                test_samples_idx_map[len(encodings) - 1] = eq_class
            for noisy_sample in code['noise']:
                encoding = self.__encoder.get_encoding(noisy_sample)
                assert not np.isnan(np.sum(encoding))
                encodings.append(encoding)
                equivalence_classes[eq_class].add(len(encodings) - 1)
                if ''.join(noisy_sample[0]) in test_samples[eq_class]:
                    test_samples_idx_map[len(encodings) - 1] = eq_class

        test_sample_idxs = np.fromiter(test_samples_idx_map.keys(),
                                       dtype=np.int32)
        encodings = np.array(encodings)
        if consider_only_first_n_components is not None:
            encodings = encodings[:, :consider_only_first_n_components]

        # Distance from each query encoding (rows) to every encoding (columns).
        nearest_neighbors = cdist(encodings[test_sample_idxs], encodings,
                                  'cosine')  # TODO: avoid square form somehow
        identity = np.arange(nearest_neighbors.shape[0])
        assert nearest_neighbors.shape[0] == len(test_sample_idxs)
        nearest_neighbors[identity, test_sample_idxs] = float(
            'inf'
        )  # The distance to self is infinite to get the real neighbors in the next step

        # Unordered indices of the num_nns closest encodings per query row.
        k_nearest_neighbor_idxs = np.argpartition(nearest_neighbors,
                                                  num_nns)[:, :num_nns]

        left_index = np.atleast_2d(identity).T
        # Sort each row's candidates by actual distance. After the next
        # assignment, `nearest_neighbors` is reused to hold the *indices* of
        # the k nearest neighbors in increasing-distance order (not distances).
        order_of_knearest_neighbors = np.argsort(
            nearest_neighbors[left_index, k_nearest_neighbor_idxs])
        nearest_neighbors = k_nearest_neighbor_idxs[
            left_index, order_of_knearest_neighbors]

        k_nns_semantic_eq = np.zeros(num_nns, dtype=np.float64)
        num_k_nns = np.zeros(num_nns)
        for i in range(nearest_neighbors.shape[0]):
            test_sample_i = test_sample_idxs[i]
            semantically_eq_nns = equivalence_classes[
                test_samples_idx_map[test_sample_i]]
            if len(semantically_eq_nns) < 2:
                # Singleton classes have no other equivalent point to find.
                continue
            for j in range(num_nns):
                num_k_nns[j] += 1
                # Proportion of the first j+1 neighbors in the same equivalence
                # class, normalized by the best achievable count.
                k_nns_semantic_eq[j] += float(len(semantically_eq_nns & set(nearest_neighbors[i, :j + 1]))) / \
                                        min(len(semantically_eq_nns), j + 1)

        return k_nns_semantic_eq / num_k_nns
示例#12
0
    def evaluate_with_test(self, data_filename: str, test_filename: str, consider_only_first_n_components: int = None,
                           num_nns: int = 15) -> np.array:
        """Evaluate the proportion of semantically equivalent nearest neighbors for test samples.

        Samples whose token strings occur in the test set are used as query
        points; for each k in 1..num_nns the result holds the average
        proportion of a query's k nearest neighbors (cosine) that share its
        equivalence class.
        """
        test_data = import_data(test_filename)
        test_samples = defaultdict(set)  # eq_class -> tokens
        for eq_class, code in test_data.items():
            test_samples[eq_class].add(''.join(code['original'][0]))
            for sample in code['noise']:
                test_samples[eq_class].add(''.join(sample[0]))

        data = import_data(data_filename)
        encodings = []
        # set of encoding indices per equivalence class
        equivalence_classes = defaultdict(set)  # eq_class->set(ids)
        # mapping from encoding index to equivalence class
        test_samples_idx_map = OrderedDict()  # id-> eq_class
        for eq_class, code in data.items():
            # encoding: np array — the representation of the computed SemVec (64 components?)
            encoding = self.__encoder.get_encoding(code['original'])
            assert not np.isnan(np.sum(encoding))
            encodings.append(encoding)
            equivalence_classes[eq_class].add(len(encodings) - 1)
            if ''.join(code['original'][0]) in test_samples[eq_class]:
                test_samples_idx_map[len(encodings) - 1] = eq_class
            for noisy_sample in code['noise']:
                encoding = self.__encoder.get_encoding(noisy_sample)
                assert not np.isnan(np.sum(encoding))
                encodings.append(encoding)
                equivalence_classes[eq_class].add(len(encodings) - 1)
                if ''.join(noisy_sample[0]) in test_samples[eq_class]:
                    test_samples_idx_map[len(encodings) - 1] = eq_class

        test_sample_idxs = np.fromiter(test_samples_idx_map.keys(), dtype=np.int32)
        encodings = np.array(encodings)
        if consider_only_first_n_components is not None:
            encodings = encodings[:, :consider_only_first_n_components]

        nearest_neighbors = cdist(encodings[test_sample_idxs], encodings, 'cosine')  # TODO: avoid square form somehow
        identity = np.arange(nearest_neighbors.shape[0])
        assert nearest_neighbors.shape[0] == len(test_sample_idxs)
        nearest_neighbors[identity, test_sample_idxs] = float(
            'inf')  # The distance to self is infinite to get the real neighbors in the next step

        k_nearest_neighbor_idxs = np.argpartition(nearest_neighbors, num_nns)[:, :num_nns]

        left_index = np.atleast_2d(identity).T
        # After the next assignment, `nearest_neighbors` is reused to hold the
        # *indices* of the k nearest neighbors sorted by distance (not distances).
        order_of_knearest_neighbors = np.argsort(nearest_neighbors[left_index, k_nearest_neighbor_idxs])
        nearest_neighbors = k_nearest_neighbor_idxs[left_index, order_of_knearest_neighbors]

        k_nns_semantic_eq = np.zeros(num_nns, dtype=np.float64)
        num_k_nns = np.zeros(num_nns)
        for i in range(nearest_neighbors.shape[0]):
            test_sample_i = test_sample_idxs[i]
            semantically_eq_nns = equivalence_classes[test_samples_idx_map[test_sample_i]]
            if len(semantically_eq_nns) < 2:
                continue
            for j in range(num_nns):
                num_k_nns[j] += 1
                # Formula (4) on page 6 of the paper:
                #"proportion of k nearest neighbors of each expression (using cosine similarity)
                # that belong to the same equivalence class"
                k_nns_semantic_eq[j] += float(len(semantically_eq_nns & set(nearest_neighbors[i, :j + 1]))) / \
                                        min(len(semantically_eq_nns), j + 1)

        # Avg Semantically Equivalent NNs
        # i.e. the average number of semantically equivalent nearest neighbors?
        return k_nns_semantic_eq / num_k_nns
示例#13
0
    def train(self, training_file, validation_file, max_iter=5000, patience=50, validation_check_limit=2,
              additional_code_to_run=None) -> tuple:
        """Train with curriculum learning and early stopping.

        :param training_file: path to the training dataset.
        :param validation_file: path to the validation dataset.
        :param max_iter: maximum number of training iterations.
        :param patience: stop after this many validation checks without improvement.
        :param validation_check_limit: validate every this-many iterations.
        :param additional_code_to_run: optional callback invoked with the historic
            data dict after each validation check.
        :return: (best validation score, historic data dict).
        """
        self.__compile_if_needed()

        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(self.__dataset_extractor.get_dataset_for_encoder(training_data, return_num_tokens=True))
        validation_set = list(self.__dataset_extractor.get_dataset_for_encoder(import_data(validation_file),
                                                                               return_num_tokens=True))

        print("Num classes: %s" % self.__dataset_extractor.num_equivalent_classes)

        def compute_validation_score() -> float:
            # Also report training accuracy for monitoring (not used for stopping).
            print("Train Accuracy %s" % compute_score(training_set, False, True)[1])
            return compute_score(validation_set)

        def compute_score(dataset, print_score=True, return_accuracy=False):
            # Returns accuracy (in %), or (mean ll, accuracy) when return_accuracy is set.
            sum_ll = 0.
            correct = 0
            for tree in dataset:
                all_args = list(tree[0])
                ll, logprobs = self.__compiled_methods.ll_and_logprobs(*all_args)
                sum_ll += ll
                # The last argument is the target class index.
                if np.argmax(logprobs) == all_args[-1]:
                    correct += 1
            if print_score:
                print("Accuracy: %s, LL: %s" % (correct / len(dataset) * 100, sum_ll / len(dataset)))

            if return_accuracy:
                return sum_ll / len(dataset), (correct / len(dataset) * 100)
            return (correct / len(dataset) * 100)

        if self.__trained_parameters is None:
            best_score = float('-inf')
        else:
            best_score = compute_validation_score()
            print("Previous best validation score: %s" % best_score)

        # Defined before the try block so the final return can never hit an
        # unbound name if training is interrupted very early.
        historic_data = defaultdict(list)
        try:
            print("[%s] Training Started..." % time.asctime())
            ratios = np.zeros(len(self.__trainable_params))
            epochs_not_improved = 0
            # Clump minibatches and disallow minibatches that are smaller than their given size, since they may
            # cause instability.
            current_max_size = self.__hyperparameters['curriculum_initial_size']
            curriculum_step = self.__hyperparameters['curriculum_step']
            for i in range(max_iter):
                # Curriculum: only train on samples whose size is within the current limit.
                sample_ordering = []
                for j, tree_data in enumerate(training_set):
                    if tree_data[1] <= current_max_size:
                        sample_ordering.append(j)
                current_max_size += curriculum_step
                # BUGFIX: np.random.shuffle was previously applied to a temporary
                # copy (np.array(sample_ordering)), so the ordering was never
                # actually shuffled. Shuffle the array we then index in place.
                sample_ordering = np.array(sample_ordering, dtype=np.int32)
                np.random.shuffle(sample_ordering)
                n_batches = 0
                sum_train_loss = 0
                num_elements = 0

                # At most 10 full minibatches per iteration.
                num_minibatches = max(1, min(int(np.floor(float(len(sample_ordering)) / minibatch_size)), 10))

                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size, min((j + 1) * minibatch_size, len(sample_ordering)),
                                    desc="Sample", leave=False):
                        current_idx = sample_ordering[k]
                        # The current iteration index is appended as the last argument.
                        args = list(training_set[current_idx][0]) + [i]
                        loss = self.__compiled_methods.grad_accumulate(*args)
                        sum_train_loss += loss
                        num_elements += 1

                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    print("Iteration %s Stats" % i)
                    current_score = compute_validation_score()
                    historic_data['validation_score'].append(current_score)
                    if current_score > best_score:
                        best_score = current_score
                        # Snapshot the best parameters seen so far.
                        self.__trained_parameters = [p.get_value() for p in self.__trainable_params]
                        print("At %s validation: current_score=%s [best so far]" % (i, current_score))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_score=%s" % (i, current_score))
                        epochs_not_improved += 1
                    # Average update/parameter ratio per trainable parameter.
                    for k in range(len(self.__trainable_params)):
                        print("%s: %.0e" % (self.__trainable_params[k].name, ratios[k] / n_batches))

                    print("Train ll: %s" % (sum_train_loss / num_elements))

                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run(historic_data)
                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break
            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt):
            print("Interrupted. Exiting training gracefully...")

        return best_score, historic_data
示例#14
0
from matplotlib import pyplot as plt
from scipy.spatial.distance import squareform, pdist
from sklearn.manifold import TSNE

from data.dataimport import import_data
from encoders.baseencoder import AbstractEncoder

if __name__ == '__main__':
    if len(sys.argv) != 5:
        print(
            "Usage <encoderPkl> <dataset.json.gz> <testset.json.gz> <neweqtestset.json.gz>"
        )
        sys.exit(-1)

    testset_samples = []
    for name, code in import_data(sys.argv[3]).items():
        testset_samples.append(''.join(code['original'][0]))
        for noisy_sample in code['noise']:
            testset_samples.append(''.join(noisy_sample[0]))
    testset_samples = set(testset_samples)

    neweq_test_set_eq_classes = set(import_data(sys.argv[4]).keys())

    data = import_data(sys.argv[2])
    encoder = AbstractEncoder.load(sys.argv[1])

    encodings, eq_classes_idxs, test_sample_idxs, neweq_samples_idxs = [], [], [], []
    eq_class_idx_to_names = {}

    for eq_class_idx, (name, code) in enumerate(data.items()):
        eq_class_idx_to_names[eq_class_idx] = name
    def train(self,
              training_file: str,
              validation_file: str,
              max_iter: int = 1000,
              patience: int = 25,
              validation_check_limit: int = 1,
              semantically_equivalent_noise: bool = False,
              additional_code_to_run=None) -> tuple:
        """Train the supervised encoder with curriculum learning and early stopping.

        :param training_file: path to the training dataset.
        :param validation_file: path to the validation dataset.
        :param max_iter: maximum number of training iterations.
        :param patience: stop after this many validation checks without improvement.
        :param validation_check_limit: validate every this-many iterations.
        :param semantically_equivalent_noise: unused in this implementation;
            kept for interface compatibility.
        :param additional_code_to_run: optional zero-argument callback invoked
            after each validation check.
        :return: (best validation log-likelihood, historic values list).
        """
        self.__compile_if_needed()

        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(
            self.dataset_extractor.get_dataset_for_encoder(
                training_data, return_num_tokens=True))
        validation_set = list(
            self.dataset_extractor.get_dataset_for_encoder(
                import_data(validation_file), return_num_tokens=True))
        best_score = float('-inf')
        epochs_not_improved = 0
        # NOTE(review): returned but never appended to — history tracking looks unfinished.
        historic_values = []

        trainable_parameters = list(
            self.__encoder.parameters.values()) + [self.__target_embeddings]

        print("Num classes: %s" %
              self.dataset_extractor.num_equivalence_classes)

        def compute_validation_score() -> float:
            return compute_score(validation_set)

        def compute_score(dataset) -> float:
            # Mean log-likelihood over the dataset; accuracy printed as a side effect.
            sum_ll = 0.
            correct = 0
            for data in dataset:
                ll, logprobs = self.__compiled_methods.ll_and_logprobs(
                    data[0], data[2])
                sum_ll += ll
                if np.argmax(logprobs) == data[2]:
                    correct += 1
            print("Accuracy: %s" % (correct / len(dataset) * 100))
            return sum_ll / len(dataset)

        num_minibatches = max(
            1, min(int(np.floor(float(len(training_set)) / minibatch_size)),
                   25))  # Clump minibatches
        try:
            print("[%s] Training Started..." % time.asctime())
            ratios = np.zeros(len(trainable_parameters))
            n_batches = 0
            current_max_size = 3.
            curriculum_step = .2
            for i in range(max_iter):
                # Curriculum: only train on samples whose size is within the current limit.
                sample_ordering = []
                for j, tree_data in enumerate(training_set):
                    if tree_data[1] <= current_max_size:
                        sample_ordering.append(j)
                current_max_size += curriculum_step
                # BUGFIX: np.random.shuffle was previously applied to a temporary
                # copy (np.array(sample_ordering)), so the ordering was never
                # actually shuffled. Shuffle the array we then index in place.
                sample_ordering = np.array(sample_ordering, dtype=np.int32)
                np.random.shuffle(sample_ordering)
                n_batches = 0
                sum_train_loss = 0
                num_elements = 0

                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size,
                                    min((j + 1) * minibatch_size,
                                        len(sample_ordering)),
                                    desc="Sample",
                                    leave=False):
                        current_idx = sample_ordering[k]
                        loss = self.__compiled_methods.grad_accumulate(
                            training_set[current_idx][0],
                            training_set[current_idx][2])
                        sum_train_loss += loss
                        num_elements += 1

                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    current_ll = compute_validation_score()
                    if current_ll > best_score:
                        best_score = current_ll
                        self.__save_current_params_as_best()
                        print("At %s validation: current_ll=%s [best so far]" %
                              (i, current_ll))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_ll=%s" %
                              (i, current_ll))
                        epochs_not_improved += 1

                    # Average update/parameter ratio per trainable parameter.
                    for k in range(len(trainable_parameters)):
                        print("%s: %.0e" % (trainable_parameters[k].name,
                                            ratios[k] / n_batches))

                    print("Train ll: %s" % (sum_train_loss / num_elements))
                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run()

                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break

            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt, SystemExit):
            print("Interrupted. Exiting training gracefully...")

        return best_score, historic_values
示例#16
0
from data.dataimport import import_data


def plot_distribution(data, title):
    """Show a seaborn distribution plot (with rug) of the given values.

    :param data: iterable of numeric values (consumed eagerly).
    :param title: plot title.
    """
    # list() materializes any iterable directly; the comprehension copy was redundant.
    data = np.asarray(list(data))
    sns.distplot(data, rug=True)
    plt.title(title)
    plt.show()


# Require the dataset path as the single CLI argument.
if len(sys.argv) != 2:
    print("Usage <dataset.json.gz>")
    sys.exit(-1)

# Only the snippet values are needed; the equivalence-class names are unused.
data = import_data(sys.argv[1]).values()


def num_noise_samples_per_original():
    """Yield, for every snippet in the module-level ``data``, its noisy-variant count."""
    yield from (len(snippet["noise"]) for snippet in data)


# Plot how many noisy variants each original snippet has.
plot_distribution(num_noise_samples_per_original(),
                  title="Num noise samples per original")


def num_nodes_of_original():
    """Yield, per snippet in the module-level ``data``, the node count of its original tree."""
    for entry in data:
        tree = entry["original"][1]
        node_count = 0
        for _ in tree:
            node_count += 1
        yield node_count
示例#17
0
    def train(self,
              training_file,
              validation_file,
              max_iter=1000,
              patience=25,
              validation_check_limit=1,
              additional_code_to_run=None) -> tuple:
        """Train the encoder as a siamese network with a size-based curriculum.

        Equivalent snippet pairs are pulled together (squared-distance loss)
        and non-equivalent pairs pushed at least ``dissimilar_margin`` apart
        (squared hinge loss). Training stops early after ``patience``
        validation checks without improvement.

        :param training_file: path of the training dataset to import.
        :param validation_file: path of the validation dataset to import.
        :param max_iter: maximum number of training epochs.
        :param patience: validation checks without improvement before stopping.
        :param validation_check_limit: run validation every this-many epochs.
        :param additional_code_to_run: optional callback invoked with the
            historic-data dict after each validation check.
        :return: (best validation score, dict of historic values).
        """
        self.__compile_if_needed()

        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(
            self.__dataset_extractor.get_dataset_for_encoder(
                training_data, return_num_tokens=True))
        validation_set = list(
            self.__dataset_extractor.get_dataset_for_encoder(
                import_data(validation_file), return_num_tokens=True))

        def compute_validation_score() -> float:
            # Convenience wrapper: score the held-out validation set.
            return compute_score(validation_set)

        def compute_score(dataset) -> float:
            # Get all encodings
            encodings = []
            equivalents = defaultdict(set)
            for i, tree in enumerate(dataset):
                encodings.append(self.__compiled_methods.encode(*tree[0][:-1]))
                equivalents[tree[2]].add(i)

            encodings = np.array(encodings, dtype=theano.config.floatX)

            # Get all cosine similarities
            distances = pdist(encodings)

            # Mark the condensed-distance entries whose endpoints belong to
            # the same equivalence class. NOTE: `np.int` was a removed alias
            # of the builtin -- use `int` (identical behavior).
            is_similar = np.zeros_like(distances, dtype=int)
            for equivalence_set in equivalents.values():
                for i, j in permutations(equivalence_set, 2):
                    if i > j:
                        # Condensed index of pair (j, i), j < i, in pdist's
                        # flat output vector.
                        is_similar[encodings.shape[0] * j -
                                   int(j * (j + 1) / 2) + i - 1 - j] = 1

            similar_score = -np.sum(np.power(distances * is_similar, 2))
            margin = self.__hyperparameters['dissimilar_margin']
            differences = margin - distances
            # Hinge: max(0, margin - distance), applied only to dissimilar pairs.
            rectified_diffs = differences * (differences > 0)
            dissimilar_score = -np.sum(
                np.power(rectified_diffs * (1 - is_similar), 2))

            print("Similar Loss: %s  Dissimilar Loss: %s" %
                  (similar_score, dissimilar_score))
            return similar_score + dissimilar_score

        if self.__trained_parameters is None:
            best_score = float('-inf')
        else:
            # Resuming: score the current parameters so a worse run cannot
            # overwrite them.
            best_score = compute_validation_score()
            print("Previous best validation score: %s" % best_score)

        try:
            print("[%s] Training Started..." % time.asctime())
            sum_similar_loss = 0.
            num_similar_loss = 0
            sum_dissimilar_loss = 0.
            num_dissimilar_loss = 0
            ratios = np.zeros(len(self.__trainable_params))
            epochs_not_improved = 0
            historic_data = defaultdict(list)
            # Clump minibatches and disallow minibatches that are smaller than their given size, since they may
            # cause instability.
            num_minibatches = max(
                1,
                min(int(np.floor(float(len(training_set)) / minibatch_size)),
                    10))
            current_max_size = self.__hyperparameters[
                'curriculum_initial_size']
            curriculum_step = self.__hyperparameters['curriculum_step']

            num_examples = self.__hyperparameters['max_num_similar_examples']
            num_dissimilar_examples = self.__hyperparameters[
                'max_num_dissimilar_examples']

            for i in range(max_iter):
                # Curriculum: only train on samples whose size is within the
                # current cap; the cap grows by curriculum_step each epoch.
                sample_ordering = [
                    j for j, tree_data in enumerate(training_set)
                    if tree_data[1] <= current_max_size
                ]
                current_max_size += curriculum_step
                # BUGFIX: shuffle the ordering in place. The previous code
                # shuffled a throwaway np.array copy of the list, so the
                # sample ordering was never actually randomized.
                np.random.shuffle(sample_ordering)
                n_batches = 0

                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size,
                                    min((j + 1) * minibatch_size,
                                        len(sample_ordering)),
                                    desc="Sample",
                                    leave=False):
                        current_idx = sample_ordering[k]
                        # Add siamese gradients, by picking num_examples
                        similar_snippet_idxs = []
                        dissimilar_snippet_idxs = []
                        for l in range(len(sample_ordering)):
                            if l == k:
                                continue
                            other_idx = sample_ordering[l]
                            if training_set[current_idx][2] == training_set[
                                    other_idx][2]:
                                similar_snippet_idxs.append(other_idx)
                            else:
                                dissimilar_snippet_idxs.append(other_idx)
                        dissimilar_snippet_idxs = np.array(
                            dissimilar_snippet_idxs)

                        np.random.shuffle(similar_snippet_idxs)
                        np.random.shuffle(dissimilar_snippet_idxs)
                        for other_idx in similar_snippet_idxs[:num_examples]:
                            args = list(training_set[current_idx][0]) + list(
                                training_set[other_idx][0]) + [i]
                            loss = self.__compiled_methods.grad_accumulate(
                                *args)
                            sum_similar_loss += loss
                            num_similar_loss += 1

                        for other_idx in dissimilar_snippet_idxs[:
                                                                 num_dissimilar_examples]:
                            args = list(training_set[current_idx][0]) + list(
                                training_set[other_idx][0]) + [i]
                            loss = self.__compiled_methods.grad_accumulate(
                                *args)
                            sum_dissimilar_loss += loss
                            # NOTE(review): unlike the similar-pair counter,
                            # only pairs with negative loss are counted here --
                            # presumably counting margin violations; confirm.
                            num_dissimilar_loss += 1 if loss < 0 else 0

                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    print("Iteration %s Stats" % i)
                    current_score = compute_validation_score()
                    historic_data['validation_score'].append(current_score)
                    if current_score > best_score:
                        best_score = current_score
                        # Snapshot the best-so-far parameter values.
                        self.__trained_parameters = [
                            p.get_value() for p in self.__trainable_params
                        ]
                        print(
                            "At %s validation: current_score=%s [best so far]"
                            % (i, current_score))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_score=%s" %
                              (i, current_score))
                        epochs_not_improved += 1
                    # Report the per-parameter average update ratio this round.
                    for k in range(len(self.__trainable_params)):
                        print("%s: %.0e" % (self.__trainable_params[k].name,
                                            ratios[k] / n_batches))

                    print("Train sum similar-loss: %s (%s samples)" %
                          (sum_similar_loss, num_similar_loss))
                    print("Train sum dissimilar-loss: %s (%s samples)" %
                          (sum_dissimilar_loss, num_dissimilar_loss))
                    # print("Training Set stats: %s" % compute_score(training_set[:500]))
                    sum_similar_loss = 0
                    num_similar_loss = 0
                    sum_dissimilar_loss = 0
                    num_dissimilar_loss = 0
                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run(historic_data)
                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break
            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt):
            print("Interrupted. Exiting training gracefully...")

        return best_score, historic_data