def __compile_if_needed(self):
    if self.__compiled_methods is None:
        print("Compiling Methods...")
        self.__compiled_methods = Bunch()
        self.__compile_train_functions()
        self.__compile_test_functions()
        print("Compilation Finished...")
Example #2
def __compile_if_needed(self):
    if self.__compiled_methods is None:
        print("Compiling Methods...")
        if self.__trained_parameters is not None:
            self.set_parameter_values(self.__trained_parameters)
        self.__compiled_methods = Bunch()
        self.__compile_test_functions()
        self.__compile_train_functions()
        print("Compilation Finished...")
Example #3
class RecursiveNNSupervisedEncoder(AbstractEncoder):
    def __init__(self, training_filename: str, hyperparameters: dict, combination_type='eqnet'):
        self.__hyperparameters = hyperparameters

        self.__dataset_extractor = TreeDatasetExtractor(training_filename)
        self.__rng = RandomStreams()

        self.__rnn = RNN(self.__hyperparameters['memory_size'], self.__hyperparameters, self.__rng,
                         self.__dataset_extractor, combination_type=combination_type)
        check_hyperparameters(self.REQUIRED_HYPERPARAMETERS | self.__rnn.required_hyperparameters,
                              self.__hyperparameters)

        target_embeddings = (np.random.randn(self.__hyperparameters['memory_size'],
                                             self.__dataset_extractor.num_equivalent_classes)
                             * 10 ** self.__hyperparameters["log_init_scale_embedding"])
        self.__target_embeddings = theano.shared(target_embeddings.astype(theano.config.floatX),
                                                 name="target_embeddings")
        self.__target_embeddings_dropout = dropout(self.__hyperparameters['dropout_rate'], self.__rng,
                                                   self.__target_embeddings, True)

        self.__target_bias = np.log(self.__dataset_extractor.training_empirical_distribution)

        self.__trainable_params = list(self.__rnn.get_params().values()) + [self.__target_embeddings]

        self.__compiled_methods = None
        self.__trained_parameters = None

    REQUIRED_HYPERPARAMETERS = {'log_learning_rate', 'rmsprop_rho', 'momentum', 'minibatch_size', 'grad_clip',
                                'memory_size', 'log_init_scale_embedding', 'dropout_rate', 'curriculum_initial_size',
                                'curriculum_step', 'accuracy_margin'}

    @property
    def rnn(self):
        return self.__rnn

    @property
    def rng(self):
        return self.__rng

    @property
    def hyperparameters(self):
        return self.__hyperparameters

    @property
    def dataset_extractor(self):
        return self.__dataset_extractor

    @property
    def trained_parameters(self):
        params = {}
        param_names = list(self.__rnn.get_params()) + ["target_embeddings"]
        for param, value in zip(param_names, self.__trained_parameters):
            params[param] = value
        return params

    def __get_loss(self, use_dropout: bool, iteration_number=0):
        _, all_node_encodings, additional_objective = self.__rnn.get_encoding(use_dropout, iteration_number)
        target_embeddings = self.__target_embeddings_dropout if use_dropout else self.__target_embeddings

        s = T.dot(all_node_encodings, target_embeddings) + self.__target_bias
        logprobs = log_softmax(s)

        eq_symbol = self.__rnn.get_input_variables().eq_symbol
        targets = T.extra_ops.to_one_hot(eq_symbol.dimshuffle('x'), self.__dataset_extractor.num_equivalent_classes)
        correct = logprobs[-1, eq_symbol]
        rest = T.max(T.flatten(logprobs[-1, (1 - targets).nonzero()]))
        ll = -T.nnet.relu(rest - correct + self.__hyperparameters['accuracy_margin'])
        return logprobs[-1], ll + additional_objective

    def __compile_train_functions(self):
        iteration_number = T.iscalar(name="iteration_number")
        _, ll = self.__get_loss(True, iteration_number)

        grad = T.grad(ll, self.__trainable_params, add_names=True)

        grad_acc = [theano.shared(np.zeros(param.get_value().shape).astype(theano.config.floatX)) for param in
                    self.__trainable_params] + [theano.shared(0, name="sample_count")]
        inputs = list(self.__rnn.get_input_variables()) + [iteration_number]
        self.__compiled_methods.grad_accumulate = theano.function(
            inputs=inputs,
            updates=[(v, v + g) for v, g in zip(grad_acc, grad)] + [(grad_acc[-1], grad_acc[-1] + 1)],
            outputs=T.mean(ll))

        normalized_grads = [T.switch(grad_acc[-1] > 0, g / grad_acc[-1].astype(theano.config.floatX), g) for g in
                            grad_acc[:-1]]

        step_updates, ratios = nesterov_rmsprop_multiple(self.__trainable_params, normalized_grads,
                                                         learning_rate=10 ** self.__hyperparameters[
                                                             "log_learning_rate"],
                                                         rho=self.__hyperparameters["rmsprop_rho"],
                                                         momentum=self.__hyperparameters["momentum"],
                                                         grad_clip=self.__hyperparameters["grad_clip"],
                                                         output_ratios=True)
        step_updates.extend(
            [(v, T.zeros(v.shape).astype(theano.config.floatX)) for v in grad_acc[:-1]])  # Set accumulators to 0
        step_updates.append((grad_acc[-1], 0))
        self.__compiled_methods.grad_step = theano.function(inputs=[], updates=step_updates, outputs=ratios)

    def __compile_test_functions(self):
        logprobs, ll = self.__get_loss(False)
        inputs = list(self.__rnn.get_input_variables())
        self.__compiled_methods.ll_and_logprobs = theano.function(
            inputs=inputs,
            outputs=[T.mean(ll), logprobs])

        self.__compiled_methods.encode = theano.function(inputs=self.__rnn.get_input_variables()[:-1],
                                                         outputs=self.__rnn.get_encoding(False)[0])

    def __compile_if_needed(self):
        if self.__compiled_methods is None:
            print("Compiling Methods...")
            if self.__trained_parameters is not None:
                self.set_parameter_values(self.__trained_parameters)
            self.__compiled_methods = Bunch()
            self.__compile_test_functions()
            self.__compile_train_functions()
            print("Compilation Finished...")

    def set_parameter_values(self, parameter_values: list):
        for param, value in zip(self.__trainable_params, parameter_values):
            param.set_value(value)

    def save(self, filename: str):
        tmp, self.__compiled_methods = self.__compiled_methods, None
        AbstractEncoder.save(self, filename)
        self.__compiled_methods = tmp

    def get_representation_vector_size(self) -> int:
        return self.__hyperparameters['memory_size']

    def get_encoding(self, data: tuple) -> np.array:
        self.__compile_if_needed()
        converted_tree = self.__dataset_extractor.convert_tree_to_array(data[1], ignore_eq_symbols=True)[:-1]

        return self.__compiled_methods.encode(*converted_tree)

    def prediction_accuracy(self, dataset_file):
        self.__compile_if_needed()
        data = import_data(dataset_file)
        dataset = list((self.__dataset_extractor.get_dataset_for_encoder(data, return_num_tokens=True)))

        correct = 0
        for tree in dataset:
            all_args = list(tree[0])
            ll, logprobs = self.__compiled_methods.ll_and_logprobs(*all_args)
            if np.argmax(logprobs) == all_args[-1]:
                correct += 1
        return correct / len(dataset)

    def train(self, training_file, validation_file, max_iter=5000, patience=50, validation_check_limit=2,
              additional_code_to_run=None) -> tuple:
        self.__compile_if_needed()

        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(self.__dataset_extractor.get_dataset_for_encoder(training_data, return_num_tokens=True))
        validation_set = list(self.__dataset_extractor.get_dataset_for_encoder(import_data(validation_file),
                                                                               return_num_tokens=True))

        print("Num classes: %s" % self.__dataset_extractor.num_equivalent_classes)

        def compute_validation_score() -> float:
            print("Train Accuracy %s" % compute_score(training_set, False, True)[1])
            return compute_score(validation_set)

        def compute_score(dataset, print_score=True, return_accuracy=False) -> float:
            # Get all encodings
            sum_ll = 0.
            correct = 0
            for tree in dataset:
                all_args = list(tree[0])
                ll, logprobs = self.__compiled_methods.ll_and_logprobs(*all_args)
                sum_ll += ll
                if np.argmax(logprobs) == all_args[-1]:
                    correct += 1
            if print_score:
                print("Accuracy: %s, LL: %s" % (correct / len(dataset) * 100, sum_ll / len(dataset)))

            if return_accuracy:
                return sum_ll / len(dataset), (correct / len(dataset) * 100)
            return (correct / len(dataset) * 100)

        if self.__trained_parameters is None:
            best_score = float('-inf')
        else:
            best_score = compute_validation_score()
            print("Previous best validation score: %s" % best_score)

        try:
            print("[%s] Training Started..." % time.asctime())
            ratios = np.zeros(len(self.__trainable_params))
            epochs_not_improved = 0
            historic_data = defaultdict(list)
            # Clump minibatches and disallow minibatches that are smaller than their given size, since they may
            # cause instability.
            current_max_size = self.__hyperparameters['curriculum_initial_size']
            curriculum_step = self.__hyperparameters['curriculum_step']
            for i in range(max_iter):
                sample_ordering = []
                for j, tree_data in enumerate(training_set):
                    if tree_data[1] <= current_max_size:
                        sample_ordering.append(j)
                current_max_size += curriculum_step
                sample_ordering = np.array(sample_ordering, dtype=np.int32)
                np.random.shuffle(sample_ordering)  # shuffle in place so the ordering is actually randomized
                n_batches = 0
                sum_train_loss = 0
                num_elements = 0

                num_minibatches = max(1, min(int(np.floor(float(len(sample_ordering)) / minibatch_size)), 10))

                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size, min((j + 1) * minibatch_size, len(sample_ordering)),
                                    desc="Sample", leave=False):
                        current_idx = sample_ordering[k]
                        args = list(training_set[current_idx][0]) + [i]
                        loss = self.__compiled_methods.grad_accumulate(*args)
                        sum_train_loss += loss
                        num_elements += 1

                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    print("Iteration %s Stats" % i)
                    current_score = compute_validation_score()
                    historic_data['validation_score'].append(current_score)
                    if current_score > best_score:
                        best_score = current_score
                        self.__trained_parameters = [p.get_value() for p in self.__trainable_params]
                        print("At %s validation: current_score=%s [best so far]" % (i, current_score))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_score=%s" % (i, current_score))
                        epochs_not_improved += 1
                    for k in range(len(self.__trainable_params)):
                        print("%s: %.0e" % (self.__trainable_params[k].name, ratios[k] / n_batches))

                    print("Train ll: %s" % (sum_train_loss / num_elements))

                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run(historic_data)
                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break
            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt):
            print("Interrupted. Exiting training gracefully...")

        return best_score, historic_data
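The grad_accumulate/grad_step pair above implements per-sample gradient accumulation: each call to grad_accumulate adds the sample's gradients into shared accumulators and bumps a sample counter, and grad_step averages by the counter, applies the optimizer update, and zeroes the accumulators. A rough NumPy-only sketch of that bookkeeping, with a plain gradient step standing in for nesterov_rmsprop_multiple:

import numpy as np

class AccumulatingOptimizer:
    """Sum per-sample gradients, then apply one averaged update per minibatch (illustrative sketch)."""

    def __init__(self, params, learning_rate=1e-2):
        self.params = params                               # list of np.ndarray parameters
        self.grad_acc = [np.zeros_like(p) for p in params]
        self.sample_count = 0
        self.learning_rate = learning_rate

    def grad_accumulate(self, grads):
        # One call per training sample, like the compiled grad_accumulate above.
        for acc, g in zip(self.grad_acc, grads):
            acc += g
        self.sample_count += 1

    def grad_step(self):
        # One call per minibatch, like the compiled grad_step above.
        if self.sample_count > 0:
            for p, acc in zip(self.params, self.grad_acc):
                p += self.learning_rate * acc / self.sample_count   # step on the mean objective gradient
        for acc in self.grad_acc:
            acc[...] = 0.0                                           # reset accumulators to zero
        self.sample_count = 0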
Example #4
    def __init__(self,
                 memory_size: int,
                 hyperparameters: dict,
                 rng: RandomStreams,
                 treedata: TreeDatasetExtractor,
                 name: str = "TreeRnn",
                 combination_type='eqnet'):
        self.__name = name
        self.__memory_size = memory_size
        self.__hyperparameters = hyperparameters
        self.__combination_type = combination_type
        self.__rng = rng
        self.__treedata = treedata

        self.__terminal_idx = T.ivector(name + ":terminal_idx")
        self.__terminal_types = T.ivector(name + ":terminal_types")
        self.__current_idx = T.ivector(name + ":current_idx")
        self.__children_idxs = T.imatrix(name + ":child_idxs")
        self.__node_types = T.ivector(name + ":node_types")
        self.__node_symbols = T.iscalar(name + ":node_symbols")
        self.__num_nodes = T.iscalar(name + ":num_nodes")

        # Used only for the leaf node representations
        node_embeddings = np.random.randn(len(treedata.node_type_dictionary), memory_size) * \
                          10 ** self.__hyperparameters["log_init_scale_embedding"]
        self.__node_embeddings = theano.shared(
            node_embeddings.astype(theano.config.floatX),
            name=name + ":terminal_embeddings")

        self.__dropped_out_params = Bunch()
        self.__dropped_out_params.node_embeddings_with_dropout = self.__node_embeddings + self.__rng.normal(
            size=self.__node_embeddings.shape,
            std=10 ** self.__hyperparameters["log_init_scale_embedding"])

        self.__required_hyperparameters = []

        if combination_type == 'single':
            self.__parent_state_combiner = SingleLayerCombination(
                self.__memory_size, len(treedata.node_type_dictionary),
                self.__treedata.max_num_properties_per_node,
                self.__hyperparameters, self.__rng)
        elif combination_type == 'eqnet':
            hidden_layer_sizes = self.__hyperparameters['hidden_layer_sizes']
            self.__required_hyperparameters.extend([
                'hidden_layer_sizes', 'ae_representation_size', 'ae_noise',
                'constrain_intro_rate'
            ])
            self.__parent_state_combiner = ResidualWithAutoencoder(
                self.__memory_size, len(treedata.node_type_dictionary),
                self.__treedata.max_num_properties_per_node,
                self.__hyperparameters, self.__rng, hidden_layer_sizes)
        elif combination_type == 'double':
            hidden_layer_size = self.__hyperparameters['hidden_layer_size']
            self.__required_hyperparameters.extend(['hidden_layer_size'])
            self.__parent_state_combiner = TwoLayerCombination(
                self.__memory_size, len(treedata.node_type_dictionary),
                self.__treedata.max_num_properties_per_node,
                self.__hyperparameters, hidden_layer_size, self.__rng)
        else:
            raise Exception("Unrecognized state combinator '" +
                            combination_type + "'")
Example #5
class RecursiveNNSiameseEncoder(AbstractEncoder):
    def __init__(self,
                 training_filename: str,
                 hyperparameters: dict,
                 combination_type='residual_with_ae'):
        self.__hyperparameters = hyperparameters

        self.__dataset_extractor = TreeDatasetExtractor(training_filename)
        self.__rng = RandomStreams()

        self.__rnn = RNN(self.__hyperparameters['memory_size'],
                         self.__hyperparameters,
                         self.__rng,
                         self.__dataset_extractor,
                         combination_type=combination_type)
        self.__trainable_params = list(self.__rnn.get_params().values())
        check_hyperparameters(
            self.REQUIRED_HYPERPARAMETERS
            | self.__rnn.required_hyperparameters, self.__hyperparameters)

        self.__compiled_methods = None
        self.__trained_parameters = None

    @staticmethod
    def get_encoder_from_supervised(supervised_encoder,
                                    dissimilar_margin: float):
        siamese = RecursiveNNSiameseEncoder.__new__(RecursiveNNSiameseEncoder)
        siamese.__rng = supervised_encoder.rng
        siamese.__rnn = supervised_encoder.rnn
        siamese.__dataset_extractor = supervised_encoder.dataset_extractor
        siamese.__hyperparameters = supervised_encoder.hyperparameters
        siamese.__hyperparameters['dissimilar_margin'] = dissimilar_margin

        siamese.__trainable_params = list(siamese.__rnn.get_params().values())
        saved_parameters = supervised_encoder.trained_parameters
        # print(saved_parameters)
        # siamese.set_parameter_values([saved_parameters[name] for name in siamese.__rnn.get_params()]) # Ignore the target embeddings
        siamese.__trained_parameters = [
            p.get_value() for p in siamese.__trainable_params
        ]
        siamese.__compiled_methods = None
        return siamese

    REQUIRED_HYPERPARAMETERS = {
        'log_learning_rate', 'rmsprop_rho', 'momentum', 'minibatch_size',
        'grad_clip', 'memory_size', 'log_init_scale_embedding', 'dropout_rate',
        'dissimilar_margin', 'curriculum_initial_size', 'curriculum_step',
        'max_num_similar_examples', 'max_num_dissimilar_examples'
    }

    def __get_loss(self, use_dropout, iteration_number=0):
        node_encoding1, _, extra_loss1 = self.__rnn.get_encoding(
            use_dropout, iteration_number)
        node_encoding1 /= node_encoding1.norm(2)

        copy_rnn = self.__rnn.copy_full()
        node_encoding2, _, extra_loss2 = copy_rnn.get_encoding(
            use_dropout, iteration_number)
        node_encoding2 /= node_encoding2.norm(2)

        distance = (node_encoding1 - node_encoding2).norm(2)

        are_non_equivalent = (self.__rnn.get_input_variables().eq_symbol -
                              copy_rnn.get_input_variables().eq_symbol)

        margin = self.__hyperparameters['dissimilar_margin']
        siamese_loss = -T.power(
            T.switch(are_non_equivalent, T.nnet.relu(margin - distance),
                     distance), 2)
        return siamese_loss + extra_loss1 + extra_loss2, copy_rnn

    def __compile_train_functions(self):
        iteration_number = T.iscalar('iteration_number')
        prob_correct, other_rnn = self.__get_loss(True, iteration_number)

        grad = T.grad(prob_correct, self.__trainable_params, add_names=True)

        grad_acc = [
            theano.shared(
                np.zeros(param.get_value().shape).astype(theano.config.floatX))
            for param in self.__trainable_params
        ] + [theano.shared(0, name="sample_count")]
        inputs = list(self.__rnn.get_input_variables()) + list(
            other_rnn.get_input_variables()) + [iteration_number]
        self.__compiled_methods.grad_accumulate = theano.function(
            inputs=inputs,
            updates=[(v, v + g) for v, g in zip(grad_acc, grad)] +
            [(grad_acc[-1], grad_acc[-1])],
            # TODO: Remove accumulator if indeed not needed
            outputs=T.mean(prob_correct))

        normalized_grads = [
            T.switch(grad_acc[-1] > 0,
                     g / grad_acc[-1].astype(theano.config.floatX), g)
            for g in grad_acc[:-1]
        ]

        step_updates, ratios = nesterov_rmsprop_multiple(
            self.__trainable_params,
            normalized_grads,
            learning_rate=10**self.__hyperparameters["log_learning_rate"],
            rho=self.__hyperparameters["rmsprop_rho"],
            momentum=self.__hyperparameters["momentum"],
            grad_clip=self.__hyperparameters["grad_clip"],
            output_ratios=True)
        step_updates.extend([(v, T.zeros(v.shape).astype(theano.config.floatX))
                             for v in grad_acc[:-1]])  # Set accumulators to 0
        step_updates.append((grad_acc[-1], 0))
        self.__compiled_methods.grad_step = theano.function(
            inputs=[], updates=step_updates, outputs=ratios)

    def __compile_test_functions(self):
        prob_correct, other_rnn = self.__get_loss(False)
        inputs = list(self.__rnn.get_input_variables()) + list(
            other_rnn.get_input_variables())
        self.__compiled_methods.probability = theano.function(
            inputs=inputs, outputs=[prob_correct])

        encoding, _, _ = self.__rnn.get_encoding(False)
        encoding /= encoding.norm(2)
        self.__compiled_methods.encode = theano.function(
            inputs=self.__rnn.get_input_variables()[:-1], outputs=encoding)

    def __compile_if_needed(self):
        if self.__compiled_methods is None:
            print("Compiling Methods...")
            if self.__trained_parameters is not None:
                self.set_parameter_values(self.__trained_parameters)
            self.__compiled_methods = Bunch()
            self.__compile_test_functions()
            self.__compile_train_functions()
            print("Compilation Finished...")

    def set_parameter_values(self, parameter_values: list):
        for param, value in zip(self.__trainable_params, parameter_values):
            param.set_value(value)

    def save(self, filename: str):
        tmp, self.__compiled_methods = self.__compiled_methods, None
        AbstractEncoder.save(self, filename)
        self.__compiled_methods = tmp

    def get_representation_vector_size(self) -> int:
        return self.__hyperparameters['memory_size']

    def get_encoding(self, data: tuple) -> np.array:
        self.__compile_if_needed()
        converted_tree = self.__dataset_extractor.convert_tree_to_array(
            data[1])[:-1]
        return self.__compiled_methods.encode(*converted_tree)

    def train(self,
              training_file,
              validation_file,
              max_iter=1000,
              patience=25,
              validation_check_limit=1,
              additional_code_to_run=None) -> tuple:
        self.__compile_if_needed()

        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(
            self.__dataset_extractor.get_dataset_for_encoder(
                training_data, return_num_tokens=True))
        validation_set = list(
            self.__dataset_extractor.get_dataset_for_encoder(
                import_data(validation_file), return_num_tokens=True))

        def compute_validation_score() -> float:
            return compute_score(validation_set)

        def compute_score(dataset) -> float:
            # Get all encodings
            encodings = []
            equivalents = defaultdict(set)
            for i, tree in enumerate(dataset):
                encodings.append(self.__compiled_methods.encode(*tree[0][:-1]))
                equivalents[tree[2]].add(i)

            encodings = np.array(encodings, dtype=theano.config.floatX)

            # Get all cosine similarities
            distances = pdist(encodings)

            is_similar = np.zeros_like(distances, dtype=int)
            for equivalence_set in equivalents.values():
                for i, j in permutations(equivalence_set, 2):
                    if i > j:
                        is_similar[encodings.shape[0] * j -
                                   int(j * (j + 1) / 2) + i - 1 - j] = 1

            similar_score = -np.sum(np.power(distances * is_similar, 2))
            margin = self.__hyperparameters['dissimilar_margin']
            differences = margin - distances
            rectified_diffs = differences * (differences > 0)
            dissimilar_score = -np.sum(
                np.power(rectified_diffs * (1 - is_similar), 2))

            print("Similar Loss: %s  Dissimilar Loss: %s" %
                  (similar_score, dissimilar_score))
            return similar_score + dissimilar_score

        if self.__trained_parameters is None:
            best_score = float('-inf')
        else:
            best_score = compute_validation_score()
            print("Previous best validation score: %s" % best_score)

        try:
            print("[%s] Training Started..." % time.asctime())
            sum_similar_loss = 0.
            num_similar_loss = 0
            sum_dissimilar_loss = 0.
            num_dissimilar_loss = 0
            ratios = np.zeros(len(self.__trainable_params))
            epochs_not_improved = 0
            historic_data = defaultdict(list)
            # Clump minibatches and disallow minibatches that are smaller than their given size, since they may
            # cause instability.
            num_minibatches = max(
                1,
                min(int(np.floor(float(len(training_set)) / minibatch_size)),
                    10))
            current_max_size = self.__hyperparameters[
                'curriculum_initial_size']
            curriculum_step = self.__hyperparameters['curriculum_step']

            num_examples = self.__hyperparameters['max_num_similar_examples']
            num_dissimilar_examples = self.__hyperparameters[
                'max_num_dissimilar_examples']

            for i in range(max_iter):
                sample_ordering = []
                for j, tree_data in enumerate(training_set):
                    if tree_data[1] <= current_max_size:
                        sample_ordering.append(j)
                current_max_size += curriculum_step
                sample_ordering = np.array(sample_ordering, dtype=np.int32)
                np.random.shuffle(sample_ordering)  # shuffle in place so the ordering is actually randomized
                n_batches = 0

                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size,
                                    min((j + 1) * minibatch_size,
                                        len(sample_ordering)),
                                    desc="Sample",
                                    leave=False):
                        current_idx = sample_ordering[k]
                        # Add siamese gradients, by picking num_examples
                        similar_snippet_idxs = []
                        dissimilar_snippet_idxs = []
                        for l in range(len(sample_ordering)):
                            if l == k:
                                continue
                            other_idx = sample_ordering[l]
                            if training_set[current_idx][2] == training_set[
                                    other_idx][2]:
                                similar_snippet_idxs.append(other_idx)
                            else:
                                dissimilar_snippet_idxs.append(other_idx)
                        dissimilar_snippet_idxs = np.array(
                            dissimilar_snippet_idxs)

                        np.random.shuffle(similar_snippet_idxs)
                        np.random.shuffle(dissimilar_snippet_idxs)
                        for other_idx in similar_snippet_idxs[:num_examples]:
                            args = list(training_set[current_idx][0]) + list(
                                training_set[other_idx][0]) + [i]
                            loss = self.__compiled_methods.grad_accumulate(
                                *args)
                            sum_similar_loss += loss
                            num_similar_loss += 1

                        for other_idx in dissimilar_snippet_idxs[:num_dissimilar_examples]:
                            args = list(training_set[current_idx][0]) + list(
                                training_set[other_idx][0]) + [i]
                            loss = self.__compiled_methods.grad_accumulate(
                                *args)
                            sum_dissimilar_loss += loss
                            num_dissimilar_loss += 1 if loss < 0 else 0

                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    print("Iteration %s Stats" % i)
                    current_score = compute_validation_score()
                    historic_data['validation_score'].append(current_score)
                    if current_score > best_score:
                        best_score = current_score
                        self.__trained_parameters = [
                            p.get_value() for p in self.__trainable_params
                        ]
                        print(
                            "At %s validation: current_score=%s [best so far]"
                            % (i, current_score))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_score=%s" %
                              (i, current_score))
                        epochs_not_improved += 1
                    for k in range(len(self.__trainable_params)):
                        print("%s: %.0e" % (self.__trainable_params[k].name,
                                            ratios[k] / n_batches))

                    print("Train sum similar-loss: %s (%s samples)" %
                          (sum_similar_loss, num_similar_loss))
                    print("Train sum dissimilar-loss: %s (%s samples)" %
                          (sum_dissimilar_loss, num_dissimilar_loss))
                    # print("Training Set stats: %s" % compute_score(training_set[:500]))
                    sum_similar_loss = 0
                    num_similar_loss = 0
                    sum_dissimilar_loss = 0
                    num_dissimilar_loss = 0
                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run(historic_data)
                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break
            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt):
            print("Interrupted. Exiting training gracefully...")

        return best_score, historic_data
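The compute_score helper above evaluates the siamese objective directly on SciPy's condensed distance vector, marking each same-class pair (j < i) via the condensed index n*j - j*(j+1)/2 + i - j - 1. A compact, self-contained re-implementation of that scoring for reference (toy data and margin value are made up):

import numpy as np
from collections import defaultdict
from itertools import permutations
from scipy.spatial.distance import pdist

def contrastive_score(encodings, labels, margin):
    """Negative contrastive loss: same-label pairs should be close, others at least `margin` apart."""
    distances = pdist(encodings)                 # condensed pairwise Euclidean distances
    is_similar = np.zeros_like(distances)
    groups = defaultdict(set)
    for idx, label in enumerate(labels):
        groups[label].add(idx)
    n = encodings.shape[0]
    for group in groups.values():
        for i, j in permutations(group, 2):
            if i > j:                            # condensed index of the pair (j, i) with j < i
                is_similar[n * j - j * (j + 1) // 2 + i - j - 1] = 1
    similar_score = -np.sum((distances * is_similar) ** 2)
    dissimilar_score = -np.sum((np.maximum(margin - distances, 0) * (1 - is_similar)) ** 2)
    return similar_score + dissimilar_score

# Toy usage: six random encodings in three equivalence classes.
print(contrastive_score(np.random.randn(6, 8), labels=[0, 0, 1, 1, 2, 2], margin=0.5))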
class SequenceGruSiameseEncoder(AbstractEncoder):
    """
    Train an encoder
    """

    def __init__(self, training_file, hyperparameters, encoder_type='gru', use_centroid=False):
        """

        :param training_file:
        :type hyperparameters: dict
        :return:
        """
        self.__hyperparameters = hyperparameters

        self.dataset_extractor = TokenAutoencoderDatasetExtractor(training_file)

        empirical_distribution = get_empirical_distribution(self.dataset_extractor.feature_map,
                                                            chain(*self.dataset_extractor.get_nonnoisy_samples(
                                                                import_data(training_file))))
        self.__encoder = SequenceGruSiameseEncoderModel(self.__hyperparameters["embedding_size"],
                                                        len(self.dataset_extractor.feature_map),
                                                        empirical_distribution,
                                                        self.__hyperparameters["representation_size"],
                                                        self.__hyperparameters, encoder_type=encoder_type,
                                                        use_centroid=use_centroid)

        self.__trained_parameters = None
        self.__compiled_methods = None

    REQUIRED_HYPERPARAMETERS = {'log_learning_rate', 'rmsprop_rho', 'momentum', 'grad_clip', 'minibatch_size',
                                'embedding_size', 'representation_size', 'log_init_noise', 'dropout_rate'}

    def __get_siamese_loss(self, use_dropout, scale_similar=1, scale_dissimilar=1):
        encoder_copy = self.__encoder.copy_full(name="siameseEncoder")
        encoding_1 = self.__encoder.get_encoding()
        encoding_2 = encoder_copy.get_encoding()

        representation_distance = (encoding_1 - encoding_2).norm(2)
        similar_loss = -scale_similar * T.pow(representation_distance, 2)
        margin = self.__hyperparameters['dissimilar_margin']
        dissimilar_loss = -scale_dissimilar * T.pow(T.nnet.relu(margin - representation_distance), 2)
        return dissimilar_loss, similar_loss, encoder_copy, encoding_1, encoding_2

    def __compile_train_functions(self):
        dissimilar_loss, similar_loss, encoder_copy, repr1, repr2 = self.__get_siamese_loss(True)

        wrt_vars = list(self.__encoder.parameters.values())

        grad_acc = [theano.shared(np.zeros(param.get_value().shape).astype(theano.config.floatX)) for param in wrt_vars] \
                   + [theano.shared(0, name="sample_count")]

        grad = T.grad(similar_loss, wrt_vars)
        self.__compiled_methods.grad_siamese_similar = theano.function(
            inputs=[encoder_copy.input_sequence_variable, self.__encoder.input_sequence_variable],
            updates=[(v, v + g) for v, g in zip(grad_acc, grad)] + [
                (grad_acc[-1], grad_acc[-1] + 1)],
            outputs=[similar_loss, repr1, repr2])

        grad = T.grad(dissimilar_loss, wrt_vars)
        self.__compiled_methods.grad_siamese_dissimilar = theano.function(
            inputs=[encoder_copy.input_sequence_variable, self.__encoder.input_sequence_variable],
            updates=[(v, v + g) for v, g in zip(grad_acc, grad)] + [
                (grad_acc[-1], grad_acc[-1] + 1)],
            outputs=[dissimilar_loss, repr1, repr2])

        normalized_grads = [T.switch(grad_acc[-1] > 0, g / grad_acc[-1].astype(theano.config.floatX), g) for g in
                            grad_acc[:-1]]
        step_updates, ratios = nesterov_rmsprop_multiple(wrt_vars, normalized_grads,
                                                         learning_rate=10 ** self.__hyperparameters[
                                                             "log_learning_rate"],
                                                         rho=self.__hyperparameters["rmsprop_rho"],
                                                         momentum=self.__hyperparameters["momentum"],
                                                         grad_clip=self.__hyperparameters["grad_clip"],
                                                         output_ratios=True)
        step_updates.extend([(v, T.zeros(v.shape)) for v in grad_acc[:-1]])  # Set accumulators to 0
        step_updates.append((grad_acc[-1], 0))

        self.__compiled_methods.grad_step = theano.function(inputs=[], updates=step_updates, outputs=ratios)

    def __compile_test_functions(self):
        dissimilar_loss, similar_loss, encoder_copy, _, _ = self.__get_siamese_loss(False)
        self.__compiled_methods.test_similar_loss = theano.function(
            inputs=[encoder_copy.input_sequence_variable, self.__encoder.input_sequence_variable], outputs=similar_loss)
        self.__compiled_methods.test_dissimilar_loss = theano.function(
            inputs=[encoder_copy.input_sequence_variable, self.__encoder.input_sequence_variable],
            outputs=dissimilar_loss)

        self.__compiled_methods.encode = theano.function(inputs=[self.__encoder.input_sequence_variable],
                                                         outputs=self.__encoder.get_encoding())

    def __compile_if_needed(self):
        if self.__compiled_methods is None:
            print("Compiling Methods...")
            self.__compiled_methods = Bunch()
            self.__compile_train_functions()
            self.__compile_test_functions()
            print("Compilation Finished...")

    def train(self, training_file: str, validation_file: str, max_iter: int = 1000, patience: int = 25,
              validation_check_limit: int = 1, additional_code_to_run=None) -> tuple:
        self.__compile_if_needed()

        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(self.dataset_extractor.get_dataset_for_encoder(training_data, return_num_tokens=True))
        validation_set = list(
            self.dataset_extractor.get_dataset_for_encoder(import_data(validation_file), return_num_tokens=True))
        best_score = float('-inf')
        train_x_ent = 0
        epochs_not_improved = 0
        historic_values = []

        trainable_parameters = list(self.__encoder.parameters.values())

        print("Num classes: %s" % self.dataset_extractor.num_equivalence_classes)

        def compute_validation_score() -> float:
            return compute_score(validation_set)

        def compute_score(dataset) -> float:
            # Get all encodings
            encodings = []
            equivalents = defaultdict(set)
            for i, tree in enumerate(dataset):
                encodings.append(self.__compiled_methods.encode(tree[0]))
                equivalents[tree[2]].add(i)

            encodings = np.array(encodings, dtype=theano.config.floatX)
            distances = pdist(encodings, metric='euclidean')

            is_similar = np.zeros_like(distances, dtype=int)
            for equivalence_set in equivalents.values():
                for i, j in permutations(equivalence_set, 2):
                    if i > j:
                        is_similar[encodings.shape[0] * j - int(j * (j + 1) / 2) + i - 1 - j] = 1

            similar_score = -np.sum(np.power(distances * is_similar, 2))

            margin = self.__hyperparameters['dissimilar_margin']
            differences = margin - distances
            rectified_diffs = differences * (differences > 0)
            dissimilar_score = -np.sum(np.power(rectified_diffs * (1 - is_similar), 2))

            print("Similar Loss: %s  Dissimilar Loss: %s" % (-similar_score, -dissimilar_score))
            return similar_score + dissimilar_score

        if self.__trained_parameters is None:
            best_score = float('-inf')
        else:
            best_score = compute_validation_score()
            print("Previous best validation score: %s" % best_score)

        try:
            print("[%s] Training Started..." % time.asctime())
            sum_similar_loss = 0
            num_similar_loss = 0
            sum_dissimilar_loss = 0
            num_dissimilar_loss = 0
            ratios = np.zeros(len(list(self.__encoder.parameters.values())))
            epochs_not_improved = 0
            # Clump minibatches and disallow minibatches that are smaller than their given size, since they may
            # cause instability.
            num_minibatches = max(1, min(int(np.floor(float(len(training_set)) / minibatch_size)), 2))

            current_max_size = 4.
            curriculum_step = .1

            for i in range(max_iter):
                sample_ordering = []
                for j, tree in enumerate(training_set):
                    if tree[-1] <= current_max_size:
                        sample_ordering.append(j)
                current_max_size += curriculum_step
                sample_ordering = np.array(sample_ordering, dtype=np.int32)
                np.random.shuffle(sample_ordering)  # shuffle in place so the ordering is actually randomized
                n_batches = 0

                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size, min((j + 1) * minibatch_size, len(sample_ordering)),
                                    desc="Sample", leave=False):
                        current_idx = sample_ordering[k]
                        # Add siamese gradients, by picking num_examples
                        num_examples = 1  # Max number of similar examples to pick. TODO: make this a hyperparameter
                        similar_snippet_idxs = []
                        dissimilar_snippet_idxs = []
                        for l in range(len(sample_ordering)):
                            if l == k:
                                continue
                            other_idx = sample_ordering[l]
                            if training_set[current_idx][2] == training_set[other_idx][2]:
                                similar_snippet_idxs.append(other_idx)
                            else:
                                dissimilar_snippet_idxs.append(other_idx)
                        dissimilar_snippet_idxs = np.array(dissimilar_snippet_idxs)

                        np.random.shuffle(similar_snippet_idxs)
                        for other_idx in similar_snippet_idxs:
                            loss, repr1, repr2 = self.__compiled_methods.grad_siamese_similar(
                                list(training_set[current_idx][0]), list(training_set[other_idx][0]))
                            sum_similar_loss += loss
                            num_similar_loss += 1

                        for other_idx in dissimilar_snippet_idxs:
                            loss, repr1, repr2 = self.__compiled_methods.grad_siamese_dissimilar(
                                training_set[current_idx][0], training_set[other_idx][0])
                            sum_dissimilar_loss += loss
                            num_dissimilar_loss += 1 if loss < 0 else 0

                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    print("Iteration %s Stats" % i)
                    current_score = compute_validation_score()
                    if current_score > best_score:
                        best_score = current_score
                        self.__trained_parameters = [p.get_value() for p in list(self.__encoder.parameters.values())]
                        print("At %s validation: current_score=%s [best so far]" % (i, current_score))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_score=%s" % (i, current_score))
                        epochs_not_improved += 1
                    for k in range(len(list(self.__encoder.parameters.values()))):
                        print("%s: %.0e" % (list(self.__encoder.parameters.values())[k].name, ratios[k] / n_batches))

                    print("Train sum similar-loss: %s (%s samples)" % (sum_similar_loss, num_similar_loss))
                    print("Train sum dissimilar-loss: %s (%s samples)" % (sum_dissimilar_loss, num_dissimilar_loss))
                    print("Training Set stats: %s" % compute_score(training_set[:500]))

                    historic_values.append({"validation_xent": current_score})

                    sum_similar_loss = 0
                    num_similar_loss = 0
                    sum_dissimilar_loss = 0
                    num_dissimilar_loss = 0
                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run()
                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break
            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt):
            print("Interrupted. Exiting training gracefully...")

        return best_score, historic_values

    def __save_current_params_as_best(self):
        self.__trained_parameters = [p.get_value() for p in list(self.__encoder.parameters.values())]

    def save(self, filename: str):
        tmp, self.__compiled_methods = self.__compiled_methods, None
        AbstractEncoder.save(self, filename)
        self.__compiled_methods = tmp

    def get_representation_vector_size(self) -> int:
        return self.__hyperparameters["representation_size"]

    def get_encoding(self, data: tuple) -> np.array:
        self.__compile_if_needed()
        converted_tokens = self.dataset_extractor.tokens_to_array(data[0])
        return self.__compiled_methods.encode(converted_tokens)

    def decoder_loss(self, data: tuple, representation: np.array) -> float:
        raise NotImplementedError("An encoder cannot do this operation")
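All of the training loops in these examples share the same curriculum scheme: in each epoch only samples whose size is at most a running threshold are eligible, and the threshold grows by a fixed step per epoch. A compact sketch of that selection logic (function name and step values are illustrative):

import numpy as np

def curriculum_epochs(sizes, initial_size, step, num_epochs):
    """Yield, per epoch, a shuffled array of sample indices whose size fits the current limit."""
    sizes = np.asarray(list(sizes))
    limit = initial_size
    for _ in range(num_epochs):
        eligible = np.flatnonzero(sizes <= limit)
        np.random.shuffle(eligible)     # shuffle the eligible indices in place
        yield eligible
        limit += step                   # widen the curriculum for the next epoch

# Toy usage: ten samples whose sizes are 1..10, starting at size 3 and growing by 2 per epoch.
for epoch, order in enumerate(curriculum_epochs(range(1, 11), initial_size=3, step=2, num_epochs=3)):
    print(epoch, order)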
class SequenceGruSupervisedEncoder(AbstractEncoder):
    """
    Train an encoder
    """
    def __init__(self,
                 training_file,
                 hyperparameters,
                 encoder_type='gru',
                 use_centroid=False):
        """

        :param training_file:
        :type hyperparameters: dict
        :return:
        """
        self.__hyperparameters = hyperparameters

        self.dataset_extractor = TokenAutoencoderDatasetExtractor(
            training_file)

        empirical_distribution = get_empirical_distribution(
            self.dataset_extractor.feature_map,
            chain(*self.dataset_extractor.get_nonnoisy_samples(
                import_data(training_file))))
        self.__encoder = SequenceGruSupervisedEncoderModel(
            self.__hyperparameters["embedding_size"],
            len(self.dataset_extractor.feature_map),
            empirical_distribution,
            self.__hyperparameters["representation_size"],
            self.__hyperparameters,
            encoder_type=encoder_type,
            use_centroid=use_centroid)

        target_embeddings = np.random.randn(self.__hyperparameters["representation_size"],
                                            self.dataset_extractor.num_equivalence_classes) * 10 ** \
                                                                                              self.__hyperparameters[
                                                                                                  "log_init_noise"]

        self.__target_embeddings = theano.shared(target_embeddings.astype(
            theano.config.floatX),
                                                 name="target_embeddings")
        self.__target_embeddings_dropout = dropout(
            self.__hyperparameters['dropout_rate'], self.__encoder.rng,
            self.__target_embeddings, True)

        self.__trained_parameters = None
        self.__compiled_methods = None

    REQUIRED_HYPERPARAMETERS = {
        'log_learning_rate', 'rmsprop_rho', 'momentum', 'grad_clip',
        'minibatch_size', 'embedding_size', 'representation_size',
        'log_init_noise', 'dropout_rate'
    }

    def __get_loss(self, target_class, use_dropout):
        encoding = self.__encoder.get_encoding()
        target_embeddings = self.__target_embeddings_dropout if use_dropout else self.__target_embeddings
        logprobs = log_softmax(
            T.dot(encoding / encoding.norm(2),
                  target_embeddings).dimshuffle('x', 0))[0]
        return logprobs, logprobs[target_class]

    def __compile_train_functions(self):
        target_class = T.iscalar(name="target_class")
        _, ll = self.__get_loss(target_class, True)

        wrt_vars = list(
            self.__encoder.parameters.values()) + [self.__target_embeddings]
        grad = T.grad(ll, wrt_vars)

        grad_acc = [theano.shared(np.zeros(param.get_value().shape).astype(theano.config.floatX)) for param in wrt_vars] \
                   + [theano.shared(0, name="sample_count")]
        self.__compiled_methods.grad_accumulate = theano.function(
            inputs=[self.__encoder.input_sequence_variable, target_class],
            updates=[(v, v + g) for v, g in zip(grad_acc, grad)] +
            [(grad_acc[-1], grad_acc[-1] + 1)],
            outputs=ll)

        normalized_grads = [
            T.switch(grad_acc[-1] > 0,
                     g / grad_acc[-1].astype(theano.config.floatX), g)
            for g in grad_acc[:-1]
        ]
        step_updates, ratios = nesterov_rmsprop_multiple(
            wrt_vars,
            normalized_grads,
            learning_rate=10**self.__hyperparameters["log_learning_rate"],
            rho=self.__hyperparameters["rmsprop_rho"],
            momentum=self.__hyperparameters["momentum"],
            grad_clip=self.__hyperparameters["grad_clip"],
            output_ratios=True)
        step_updates.extend([(v, T.zeros(v.shape))
                             for v in grad_acc[:-1]])  # Set accumulators to 0
        step_updates.append((grad_acc[-1], 0))

        self.__compiled_methods.grad_step = theano.function(
            inputs=[], updates=step_updates, outputs=ratios)

    def __compile_test_functions(self):
        target_class = T.iscalar(name="target_class")
        logprobs, ll = self.__get_loss(target_class, False)
        self.__compiled_methods.ll_and_logprobs = theano.function(
            inputs=[self.__encoder.input_sequence_variable, target_class],
            outputs=[ll, logprobs])

        self.__compiled_methods.encode = theano.function(
            inputs=[self.__encoder.input_sequence_variable],
            outputs=self.__encoder.get_encoding())

    def __compile_if_needed(self):
        if self.__compiled_methods is None:
            print("Compiling Methods...")
            self.__compiled_methods = Bunch()
            self.__compile_train_functions()
            self.__compile_test_functions()
            print("Compilation Finished...")

    def train(self,
              training_file: str,
              validation_file: str,
              max_iter: int = 1000,
              patience: int = 25,
              validation_check_limit: int = 1,
              semantically_equivalent_noise: bool = False,
              additional_code_to_run=None) -> tuple:
        self.__compile_if_needed()

        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(
            self.dataset_extractor.get_dataset_for_encoder(
                training_data, return_num_tokens=True))
        validation_set = list(
            self.dataset_extractor.get_dataset_for_encoder(
                import_data(validation_file), return_num_tokens=True))
        best_score = float('-inf')
        train_x_ent = 0
        epochs_not_improved = 0
        historic_values = []

        trainable_parameters = list(
            self.__encoder.parameters.values()) + [self.__target_embeddings]

        print("Num classes: %s" %
              self.dataset_extractor.num_equivalence_classes)

        def compute_validation_score() -> float:
            return compute_score(validation_set)

        def compute_score(dataset) -> float:
            # Get all encodings
            sum_ll = 0.
            correct = 0
            for data in dataset:
                ll, logprobs = self.__compiled_methods.ll_and_logprobs(
                    data[0], data[2])
                sum_ll += ll
                if np.argmax(logprobs) == data[2]:
                    correct += 1
            print("Accuracy: %s" % (correct / len(dataset) * 100))
            return sum_ll / len(dataset)

        num_minibatches = max(
            1, min(int(np.floor(float(len(training_set)) / minibatch_size)),
                   25))  # Clump minibatches
        try:
            print("[%s] Training Started..." % time.asctime())
            ratios = np.zeros(len(trainable_parameters))
            n_batches = 0
            current_max_size = 3.
            curriculum_step = .2
            for i in range(max_iter):
                sample_ordering = []
                for j, tree_data in enumerate(training_set):
                    if tree_data[1] <= current_max_size:
                        sample_ordering.append(j)
                current_max_size += curriculum_step
                sample_ordering = np.array(sample_ordering, dtype=np.int32)
                np.random.shuffle(sample_ordering)  # shuffle in place so the ordering is actually randomized
                n_batches = 0
                sum_train_loss = 0
                num_elements = 0

                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size,
                                    min((j + 1) * minibatch_size,
                                        len(sample_ordering)),
                                    desc="Sample",
                                    leave=False):
                        current_idx = sample_ordering[k]
                        loss = self.__compiled_methods.grad_accumulate(
                            training_set[current_idx][0],
                            training_set[current_idx][2])
                        sum_train_loss += loss
                        num_elements += 1

                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    current_ll = compute_validation_score()
                    if current_ll > best_score:
                        best_score = current_ll
                        self.__save_current_params_as_best()
                        print("At %s validation: current_ll=%s [best so far]" %
                              (i, current_ll))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_ll=%s" %
                              (i, current_ll))
                        epochs_not_improved += 1

                    for k in range(len(trainable_parameters)):
                        print("%s: %.0e" % (trainable_parameters[k].name,
                                            ratios[k] / n_batches))

                    print("Train ll: %s" % (sum_train_loss / num_elements))
                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run()

                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break

            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt, SystemExit):
            print("Interrupted. Exiting training gracefully...")

        return best_score, historic_values

    def __save_current_params_as_best(self):
        self.__trained_parameters = [
            p.get_value() for p in list(self.__encoder.parameters.values()) +
            [self.__target_embeddings]
        ]

    def save(self, filename: str):
        tmp, self.__compiled_methods = self.__compiled_methods, None
        AbstractEncoder.save(self, filename)
        self.__compiled_methods = tmp

    def get_representation_vector_size(self) -> int:
        return self.__hyperparameters["representation_size"]

    def get_encoding(self, data: tuple) -> np.array:
        self.__compile_if_needed()
        converted_tokens = self.dataset_extractor.tokens_to_array(data[0])
        return self.__compiled_methods.encode(converted_tokens)
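The supervised variants score an encoding against a matrix of per-class target embeddings and take a log-softmax of the dot products (with unit-normalisation in SequenceGruSupervisedEncoder.__get_loss, and an empirical-prior bias in the tree-based variant). Below is a numerically stable NumPy sketch of such a classification head; the log_softmax helper imported by the examples is assumed to behave like this:

import numpy as np

def log_softmax(x):
    """Numerically stable log-softmax over the last axis."""
    shifted = x - np.max(x, axis=-1, keepdims=True)
    return shifted - np.log(np.sum(np.exp(shifted), axis=-1, keepdims=True))

def class_logprobs(encoding, target_embeddings, target_bias=None):
    """Log-probabilities over equivalence classes for a single (unit-normalised) encoding."""
    encoding = encoding / np.linalg.norm(encoding)
    scores = encoding @ target_embeddings                # shape: (num_classes,)
    if target_bias is not None:
        scores = scores + target_bias                    # e.g. log of the empirical class distribution
    return log_softmax(scores)

# Toy usage with made-up sizes: a 64-dim encoding and 10 classes.
logprobs = class_logprobs(np.random.randn(64), np.random.randn(64, 10))
print(int(np.argmax(logprobs)), float(logprobs.max()))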