Example 1
def init_wrap(init: dy.PyInitializer, size: Tuple[int, ...]) -> dy.PyInitializer:

    if init is OrthogonalInitializer:
        # materialize the custom orthogonal initializer as a NumPy array
        return dy.NumpyInitializer(init.init(size))
    elif isinstance(init, dy.PyInitializer):
        return init
    else:
        raise RuntimeError('%s is not an instance of dy.PyInitializer.' % init)
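
A minimal usage sketch (hypothetical shapes; it assumes OrthogonalInitializer exposes a static init(shape) returning a NumPy array, as the first branch above implies):

    model = dy.ParameterCollection()
    # the orthogonal initializer class is wrapped into a dy.NumpyInitializer
    w_init = init_wrap(OrthogonalInitializer, (128, 64))
    W = model.add_parameters((128, 64), init=w_init)
    # an ordinary dy.PyInitializer instance passes through unchanged
    b = model.add_parameters((128,), init=init_wrap(dy.ConstInitializer(0), (128,)))
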
Example 2
    def __init__(self, model, input_dim, output_dim):

        self.input = input_dim
        self.output = output_dim

        Saxe_initializer = Saxe.Orthogonal()
        self.W = model.add_parameters(
            (self.output, self.input),
            init=dy.NumpyInitializer(Saxe_initializer((self.output, self.input))))
        self.b = model.add_parameters((self.output), init=dy.ConstInitializer(0))
Example 3
 def initializer(self,
                 dim: Tuple[numbers.Integral],
                 is_lookup: bool = False,
                 num_shared: numbers.Integral = 1) -> 'dy.NumpyInitializer':
     if dim != self.array.shape:
         raise ValueError(
             f"the passed initializer array has different dimensions than the parameters to be initialized: : {self.array.shape} != {dim}"
         )
     return dy.NumpyInitializer(array=self.array)
Example 4
    def __init__(self,
                 model,
                 n_labels,
                 src_ctx_dim=400,
                 hidden=400,
                 dropout=0.33):

        self.src_ctx_dim = src_ctx_dim
        self.dropout = dropout
        self.n_labels = n_labels
        self.hidden = hidden
        self.dist_max = 10
        self.dist_dims = 32
        self.dlookup = model.add_lookup_parameters(
            (self.dist_max * 2, self.dist_dims), init=dy.ConstInitializer(0))
        Saxe_initializer = Saxe.Orthogonal(gain='leaky_relu', alpha=0.1)
        self.W_head = model.add_parameters(
            (self.src_ctx_dim, self.src_ctx_dim),
            init=dy.NumpyInitializer(
                Saxe_initializer((self.src_ctx_dim, self.src_ctx_dim))))
        self.b_head = model.add_parameters((self.src_ctx_dim),
                                           init=dy.ConstInitializer(0))
        self.W_mod = model.add_parameters(
            (self.src_ctx_dim, self.src_ctx_dim),
            init=dy.NumpyInitializer(
                Saxe_initializer((self.src_ctx_dim, self.src_ctx_dim))))
        self.b_mod = model.add_parameters((self.src_ctx_dim),
                                          init=dy.ConstInitializer(0))
        self.W_arc1 = model.add_parameters(
            (self.hidden, self.src_ctx_dim + self.dist_dims),
            init=dy.NumpyInitializer(
                Saxe_initializer(
                    (self.hidden, self.src_ctx_dim + self.dist_dims))))
        self.b_arc1 = model.add_parameters((self.hidden),
                                           init=dy.ConstInitializer(0))
        self.W_arc2 = model.add_parameters(
            (self.n_labels, self.hidden),
            init=dy.NumpyInitializer(
                Saxe_initializer((self.n_labels, self.hidden))))
        self.b_arc2 = model.add_parameters((self.n_labels),
                                           init=dy.ConstInitializer(0))
Example 5
    def __init__(self,
                 model,
                 num_tasks,
                 hidden_dim,
                 num_subspaces=1,
                 init_scheme=BALANCED):
        """
        Initializes a CrossStitchLayer.
        :param model: the DyNet Model
        :param num_tasks: the number of tasks
        :param hidden_dim: the # of hidden dimensions of the previous LSTM layer
        :param num_subspaces: the number of subspaces
        :param init_scheme: the initialization scheme; balanced or imbalanced
        """
        print('Using %d subspaces...' % num_subspaces, flush=True)
        alpha_params = np.full(
            (num_tasks * num_subspaces, num_tasks * num_subspaces),
            1. / (num_tasks * num_subspaces))
        if init_scheme == IMBALANCED:
            if num_subspaces == 1:
                alpha_params = np.full((num_tasks, num_tasks),
                                       0.1 / (num_tasks - 1))
                for i in range(num_tasks):
                    alpha_params[i, i] = 0.9
            else:
                # 0 1 0 1
                # 0 1 0 1
                # 1 0 1 0
                # 1 0 1 0
                for (x, y), value in np.ndenumerate(alpha_params):
                    if (y + 1) % num_subspaces == 0 and not \
                            (x in range(num_tasks, num_tasks+num_subspaces)):
                        alpha_params[x, y] = 0.95
                    elif (y + num_subspaces) % num_subspaces == 0 and x \
                            in range(num_tasks, num_tasks+num_subspaces):
                        alpha_params[x, y] = 0.95
                    else:
                        alpha_params[x, y] = 0.05

        self.alphas = model.add_parameters(
            (num_tasks * num_subspaces, num_tasks * num_subspaces),
            init=dynet.NumpyInitializer(alpha_params))
        #print('Initializing cross-stitch units to:', flush=True)
        #print(dynet.parameter(self.alphas).value(), flush=True)
        self.num_tasks = num_tasks
        self.num_subspaces = num_subspaces
        self.hidden_dim = hidden_dim
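
For intuition, a small sketch of the matrix that the imbalanced scheme above builds when num_subspaces == 1 (with a hypothetical num_tasks = 3) before it is handed to dynet.NumpyInitializer:

    import numpy as np

    num_tasks = 3
    alpha_params = np.full((num_tasks, num_tasks), 0.1 / (num_tasks - 1))
    np.fill_diagonal(alpha_params, 0.9)  # equivalent to the loop above
    # each row mixes mostly its own task (0.9) with a little of every other task (0.05):
    # [[0.9 , 0.05, 0.05],
    #  [0.05, 0.9 , 0.05],
    #  [0.05, 0.05, 0.9 ]]
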
Example 6
 def __init__(self, model, num_layers, hidden_dim, init_scheme=IMBALANCED):
     """
     Initializes a LayerStitchLayer.
     :param model: the DyNet model
     :param num_layers: the number of layers
     :param hidden_dim: the hidden dimensions of the LSTM layers
     :param init_scheme: the initialisation scheme; balanced or imbalanced
     """
     if init_scheme == IMBALANCED:
         beta_params = np.full((num_layers), 0.1 / (num_layers - 1))
         beta_params[-1] = 0.9
     elif init_scheme == BALANCED:
         beta_params = np.full((num_layers), 1. / num_layers)
     else:
         raise ValueError('Invalid initialization scheme for layer-stitch '
                          'units: %s.' % init_scheme)
     self.betas = model.add_parameters(
         num_layers, init=dynet.NumpyInitializer(beta_params))
     print('Initializing layer-stitch units to:', flush=True)
     print(dynet.parameter(self.betas).value(), flush=True)
     self.num_layers = num_layers
     self.hidden_dim = hidden_dim
Example 7
    def __init__(self, model, input_size, recur_size, forget_bias=0.0):

        self.input_size = input_size
        self.recur_size = recur_size
        self.input_drop_mask = dy.ones(self.input_size)
        self.recur_drop_mask = dy.ones(self.recur_size)
        self.forget_bias = forget_bias
        self.cell_previous = None
        self.hidden_previous = None
        self.init = False
        self.input_drop = 0
        self.recur_drop = 0

        Saxe_initializer = Saxe.Orthogonal()
        gates_init = Saxe_initializer(
            ((self.recur_size, self.input_size + self.recur_size)))
        gates_init = np.concatenate([gates_init] * 4)
        self.WXH = model.add_parameters(
            (self.recur_size * 4, self.input_size + self.recur_size),
            init=dy.NumpyInitializer(gates_init))
        self.b = model.add_parameters((self.recur_size * 4),
                                      init=dy.ConstInitializer(0))
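
A hypothetical sketch of how the stacked WXH above could be consumed in a step function; the real class presumably does this elsewhere, and the gate ordering and names here are illustrative only (self.forget_bias could be folded into the forget-gate slice of self.b):

    def step(self, x, h_prev, c_prev):
        WXH = dy.parameter(self.WXH)
        b = dy.parameter(self.b)
        gates = WXH * dy.concatenate([x, h_prev]) + b
        # slice the stacked gate pre-activations back into the four gates
        i = dy.logistic(dy.pick_range(gates, 0, self.recur_size))
        f = dy.logistic(dy.pick_range(gates, self.recur_size, 2 * self.recur_size))
        o = dy.logistic(dy.pick_range(gates, 2 * self.recur_size, 3 * self.recur_size))
        g = dy.tanh(dy.pick_range(gates, 3 * self.recur_size, 4 * self.recur_size))
        c = dy.cmult(f, c_prev) + dy.cmult(i, g)
        h = dy.cmult(o, dy.tanh(c))
        return h, c
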
Example 8
    def build(self,
              char_dim,
              char_lstm_dim,
              ch_b,
              mt_d,
              word_dim,
              word_lstm_dim,
              w_b,
              lr_method,
              pre_emb,
              cap_dim,
              training=True,
              **kwargs):
        """
        Build the network.
        """

        self.training = training

        def _create_get_representation(activation_function=lambda x: x):
            """
            Helper function to create a function which assembles a representation given an
            activation_function
            :param activation_function: activation applied to the concatenated
                final forward/backward states of each layer
            :return: a function that transduces a sequence through the BiRNN
                layers and returns one representation per layer
            """
            def f(obj, es):
                representations = []
                # for e in es:
                #     dynet.ensure_freshness(e)
                for (fb, bb) in obj.builder_layers:
                    fs = fb.initial_state().transduce(es)
                    bs = bb.initial_state().transduce(reversed(es))
                    es = [
                        dynet.concatenate([f, b])
                        for f, b in zip(fs, reversed(bs))
                    ]
                    representations.append(
                        activation_function(dynet.concatenate([fs[-1],
                                                               bs[-1]])))
                return representations

            return f

        BiRNNBuilder.get_representation = _create_get_representation(
            activation_function=dynet.rectify)
        BiRNNBuilder.get_representation_concat = _create_get_representation()

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        n_morpho_tags = len(self.id_to_morpho_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 17

        # Final input (all word features)
        word_representation_dim = 0

        def get_scale(shape):
            return np.sqrt(6 / np.sum(list(shape)))

        #
        # Word inputs
        #
        if word_dim:
            # Initialize with pretrained embeddings
            scale = get_scale((n_words, word_dim))
            new_weights = scale * np.random.uniform(-1.0, 1.0,
                                                    (n_words, word_dim))
            # new_weights = np.zeros([n_words, word_dim], dtype='float32')
            if pre_emb and training:
                print('Loading pretrained embeddings from %s...' % pre_emb)
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print('WARNING: %i invalid lines' % emb_invalid)
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in range(n_words):
                    raw_word = self.id_to_word[i]
                    if raw_word != "<UNK>":
                        # word = raw_word.split(" ")[1]
                        word = raw_word
                    else:
                        word = raw_word
                    # print word
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub(r'\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            r'\d', '0', word.lower())]
                        c_zeros += 1

                print('Loaded %i pretrained embeddings.' % len(pretrained))
                print(('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') %
                      (c_found + c_lower + c_zeros, n_words, 100. *
                       (c_found + c_lower + c_zeros) / n_words))
                print(('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') %
                      (c_found, c_lower, c_zeros))
            word_representation_dim += word_dim
            self.word_embeddings = self.model.add_lookup_parameters(
                (n_words, word_dim),
                init=dynet.NumpyInitializer(new_weights),
                name="wordembeddings")

        def create_bilstm_layer(label, input_dim, lstm_dim, bilstm=True):
            if bilstm:
                builder = BiRNNBuilder(1, input_dim, lstm_dim, self.model,
                                       CoupledLSTMBuilder)
            else:
                builder = CoupledLSTMBuilder(1, input_dim, lstm_dim,
                                             self.model)

            return builder

        # Chars inputs
        #
        if char_dim:
            self.char_embeddings = self.model.add_lookup_parameters(
                (n_chars, char_dim), name="charembeddings")

            self.char_lstm_layer = create_bilstm_layer(
                "char",
                char_dim, (2 if ch_b else 1) * char_lstm_dim,
                bilstm=True if ch_b else False)

            word_representation_dim += (2 if ch_b else 1) * char_lstm_dim

        # if self.parameters['integration_mode'] in [1, 2] or self.parameters['active_models'] in [1,
        #                                                                                          2,
        #                                                                                          3]:
        if self.parameters['active_models'] in [1, 2, 3]:

            self.char_lstm_layer_for_morph_analysis_roots = \
                create_bilstm_layer("char_for_morph_analysis_root",
                                    char_dim,
                                    2 * mt_d,
                                    bilstm=True)

            self.morpho_tag_embeddings = self.model.add_lookup_parameters(
                (n_morpho_tags, mt_d), name="charembeddings")
            self.morpho_tag_lstm_layer_for_morph_analysis_tags = \
                create_bilstm_layer("morpho_tag_for_morph_analysis_tags",
                                    mt_d,
                                    2 * mt_d,
                                    bilstm=True)

        if self.parameters[
                'use_golden_morpho_analysis_in_word_representation']:

            assert self.parameters['integration_mode'] == 0 and \
                   self.parameters['active_models'] == 0, "This feature is only meaningful when we solely target the NER task."

            self.morpho_tag_embeddings = self.model.add_lookup_parameters(
                (n_morpho_tags, mt_d), name="charembeddings")

            self.old_style_morpho_tag_lstm_layer_for_golden_morpho_analyzes = \
                create_bilstm_layer("old_style_morpho_tag_lstm_layer_for_golden_morpho_analyzes",
                                    mt_d,
                                    2 * mt_d,
                                    bilstm=True)

            word_representation_dim += 2 * mt_d

        #
        # Capitalization feature
        #
        if cap_dim:
            word_representation_dim += cap_dim
            self.cap_embeddings = self.model.add_lookup_parameters(
                (n_cap, cap_dim), name="capembeddings")

        if self.parameters['multilayer'] and self.parameters[
                'shortcut_connections']:
            shortcut_connection_addition = word_representation_dim
            self.sentence_level_bilstm_contexts_length = shortcut_connection_addition + 2 * word_lstm_dim
        else:
            self.sentence_level_bilstm_contexts_length = 2 * word_lstm_dim
        # else:
        #     self.sentence_level_bilstm_contexts_length = word_lstm_dim # TODO: Q: as the output of self.tanh_layer_W will be used. right?

        self.tanh_layer_W = self.model.add_parameters(
            (word_lstm_dim, self.sentence_level_bilstm_contexts_length))
        self.tanh_layer_b = self.model.add_parameters((word_lstm_dim))

        if self.parameters['integration_mode'] in [0, 1]:
            self.last_layer_W = self.model.add_parameters(
                (n_tags, word_lstm_dim))
        elif self.parameters['integration_mode'] == 2:
            self.last_layer_W = self.model.add_parameters(
                (n_tags, word_lstm_dim + 2 * mt_d))

        self.last_layer_b = self.model.add_parameters((n_tags))

        self.transform_context_layer_b = \
            self.model.add_parameters((2 * mt_d))
        self.transform_context_layer_W = \
            self.model.add_parameters((2 * mt_d, self.sentence_level_bilstm_contexts_length))

        # LSTM for words
        # self.sentence_level_bilstm_layer = \
        #     create_bilstm_layer("sentence_level",
        #                         word_representation_dim,
        #                         2 * word_lstm_dim,
        #                         bilstm=True if w_b else False)

        from toolkit.rnn import BiLSTMMultiLayeredWithShortcutConnections

        if self.parameters['multilayer']:
            self.num_sentence_level_bilstm_layers = 3
        else:
            self.num_sentence_level_bilstm_layers = 1

        self.sentence_level_bilstm_layer = \
            BiLSTMMultiLayeredWithShortcutConnections(self.num_sentence_level_bilstm_layers,
                                                      word_representation_dim,
                                                      2 * word_lstm_dim,
                                                      self.model,
                                                      CoupledLSTMBuilder,
                                                      self.parameters['shortcut_connections'])

        def _create_tying_method(activation_function=dynet.tanh, classic=True):
            def f(x, y):
                if classic:
                    return dynet.tanh(x + y)
                else:
                    return activation_function(self.tying_method_W *
                                               dynet.concatenate([x, y]) +
                                               self.tying_method_b)

            return f

        if self.parameters['tying_method']:
            self.tying_method_W = self.model.add_parameters(
                (word_lstm_dim, 2 * mt_d))
            self.tying_method_b = self.model.add_parameters((word_lstm_dim))

            self.f_tying_method = _create_tying_method(
                activation_function=dynet.tanh, classic=False)
        else:
            self.f_tying_method = _create_tying_method(
                activation_function=dynet.tanh, classic=True)

        self.crf_module = CRF(self.model, self.id_to_tag)

        # Training
        def process_hyperparameter_definition(x):
            tokens = x.split("@")
            subtokens = tokens[0].split("_")
            if len(subtokens) > 1 and subtokens[-1] == "float":
                return ["_".join(subtokens[:-1]), float(tokens[1])]
            else:
                return tokens

        _tokens = lr_method.split("-")
        opt_update_algorithm = _tokens[0]
        opt_hyperparameters = [
            process_hyperparameter_definition(x) for x in _tokens[1:]
        ]
        opt_update_algorithms = {
            'sgd': dynet.SimpleSGDTrainer,
            'adam': dynet.AdamTrainer,
            'adadelta': dynet.AdadeltaTrainer,
            'adagrad': dynet.AdagradTrainer,
            'momentum': dynet.MomentumSGDTrainer,
            'rmsprop': dynet.RMSPropTrainer
        }

        if opt_update_algorithm == "adam":
            opt_hyperparameters += [("sparse_updates_enabled",
                                     self.parameters['sparse_updates_enabled'])
                                    ]

        self.trainer = opt_update_algorithms[opt_update_algorithm](
            self.model,
            # sparse_updates_enabled=self.parameters['sparse_updates_enabled'],
            **{name: value
               for name, value in opt_hyperparameters})

        # self.trainer = dynet.SimpleSGDTrainer(self.model, learning_rate=0.01)

        self.saver = DynetSaver(self.model, self.model_path)

        return self
Example 9
 def initializer(self,
                 dim,
                 is_lookup: bool = False,
                 num_shared: numbers.Integral = 1) -> dy.NumpyInitializer:
     return dy.NumpyInitializer(array=self.array)
Example 10
    def __init__(self,
                 h_dim,
                 h_layers,
                 model_dir,
                 log_dir,
                 task_names,
                 languages,
                 embeds=None,
                 activation=dynet.tanh,
                 lower=False,
                 noise_sigma=0.1,
                 cross_stitch=False,
                 num_subspaces=1,
                 constraint_weight=0,
                 constrain_matrices=[1, 2],
                 cross_stitch_init_scheme=IMBALANCED,
                 layer_stitch_init_scheme=BALANCED,
                 best_train_dict={},
                 best_dev_dict={},
                 avg_train_score=0,
                 avg_dev_score=0,
                 best_epoch=-1,
                 word2id={},
                 oov_id=None):
        """
        :param h_dim: The hidden dimension of the model.
        :param h_layers: The number of hidden layers.
        :param model_dir: The directory where the model should be saved
        :param log_dir: The directory where the log should be saved
        :param task_names: the names of the tasks
        :param languages: the training languages of the model
        :param embeds: the pre-trained embedding used by the model
        :param activation: the DyNet activation function that should be used
        :param lower: whether the words should be lower-cased
        :param noise_sigma: the stddev of the Gaussian noise that should be used
                            during training if > 0.0
        :param cross_stitch: whether to use cross-stitch units

        :param num_subspaces: the number of subspaces to use (1 or 2)
        :param constraint_weight: weight of subspace orthogonality constraint
                                  (default: 0 = no constraint)
        :param constrain_matrices: indices of LSTM weight matrices that should
                                   be constrained (default: [1, 2])
        :param cross_stitch_init_scheme: initialisation scheme for cross-stitch
        :param layer_stitch_init_scheme: initialisation scheme for layer-stitch

        :param best_train_dict: dictionary storing the best scores on the training set
        :param best_dev_dict: dictionary storing the best scores on the development set
        :param avg_train_score: best unweighted average training score over all tasks and all metrics
        :param avg_dev_score: best unweighted average development score over all tasks and all metrics
        :param best_epoch: the epoch of the best performance
        :param word2id: dictionary mapping words to indices in the word embedding matrix
        :param oov_id: the index used for words that do not appear in the pre-trained word embeddings


        """
        self.word2id = word2id

        self.task_names = task_names
        self.model_dir = model_dir
        self.log_dir = log_dir
        self.w_in_dim = 0

        if len(task_names) == 1:
            if len(languages) == 1:
                self.model_file = os.path.join(
                    model_dir,
                    'STSL/{}_{}.model'.format(languages[0], task_names[0]))
                self.params_file = os.path.join(
                    model_dir, 'STSL/{}_{}.pkl'.format(languages[0],
                                                       task_names[0]))
            else:
                self.model_file = os.path.join(
                    model_dir, 'STML/{}.model'.format(task_names[0]))
                self.params_file = os.path.join(
                    model_dir, 'STML/{}.pkl'.format(task_names[0]))
        else:
            if len(languages) == 1:
                self.model_file = os.path.join(
                    model_dir, 'MTSL/{}.model'.format(languages[0]))
                self.params_file = os.path.join(
                    model_dir, 'MTSL/{}.pkl'.format(languages[0]))
            else:
                self.model_file = os.path.join(model_dir, 'MTML/MTML.model')
                self.params_file = os.path.join(model_dir, 'MTML/MTML.pkl')

        self.cross_stitch = cross_stitch
        self.num_subspaces = num_subspaces
        self.constraint_weight = constraint_weight
        self.constrain_matrices = constrain_matrices
        self.cross_stitch_init_scheme = cross_stitch_init_scheme
        self.layer_stitch_init_scheme = layer_stitch_init_scheme
        self.model = dynet.Model()  # init model
        # term to capture sum of constraints over all subspaces
        self.subspace_penalty = self.model.add_parameters(
            1, init=dynet.NumpyInitializer(np.zeros(1)))
        # weight of subspace constraint
        self.constraint_weight_param = self.model.add_parameters(
            1, init=dynet.NumpyInitializer(np.array(self.constraint_weight)))

        task2label2id = {}

        for task in task_names:
            labels = LABELS[task]
            task2label2id[task] = {}
            count = 0

            for label in LABELS[task]:
                task2label2id[task][label] = count
                count += 1

        self.task2label2id = task2label2id  # need one dictionary per task

        self.languages = languages
        self.h_dim = h_dim
        self.activation = activation
        self.lower = lower
        self.noise_sigma = noise_sigma
        self.h_layers = h_layers
        self.predictors = {}
        self.wembeds = None  # lookup: embeddings for words
        self.embeds = embeds

        self.best_train_dict = best_train_dict
        self.best_dev_dict = best_dev_dict

        self.best_epoch = best_epoch

        self.avg_train_score = avg_train_score
        self.avg_dev_score = avg_dev_score
        self.oov_id = oov_id
Example 11
 def initializer(self, dim, is_lookup=False, num_shared=1):
     return dy.NumpyInitializer(array=self.array)
Example 12
    def __init__(self,
                 in_dim,
                 h_dim,
                 c_in_dim,
                 h_layers,
                 pred_layer,
                 model_dir,
                 embeds_file=None,
                 activation=dynet.tanh,
                 lower=False,
                 noise_sigma=0.1,
                 task_names=[],
                 cross_stitch=False,
                 layer_connect=NONE,
                 num_subspaces=1,
                 constraint_weight=0,
                 constrain_matrices=[1, 2],
                 cross_stitch_init_scheme=IMBALANCED,
                 layer_stitch_init_scheme=BALANCED):
        """
        :param in_dim: The dimension of the word embeddings.
        :param h_dim: The hidden dimension of the model.
        :param c_in_dim: The dimension of the character embeddings.
        :param h_layers: The number of hidden layers.
        :param pred_layer: Indices indicating at which layer to predict each
                           task, e.g. [1, 2] indicates 1st task is predicted
                           at 1st layer, 2nd task is predicted at 2nd layer
        :param model_dir: The directory where the model should be saved
        :param embeds_file: the file containing pre-trained word embeddings
        :param activation: the DyNet activation that should be used
        :param lower: whether the words should be lower-cased
        :param noise_sigma: the stddev of the Gaussian noise that should be used
                            during training if > 0.0
        :param task_names: the names of the tasks
        :param cross_stitch: whether to use cross-stitch units
        :param layer_connect: the layer connections that are used (stitch,
                              skip, concat, or none)
        :param num_subspaces: the number of subspaces to use (1 or 2)
        :param constraint_weight: weight of subspace orthogonality constraint
                                  (default: 0 = no constraint)
        :param constrain_matrices: indices of LSTM weight matrices that should
                                   be constrained (default: [1, 2])
        :param cross_stitch_init_scheme: initialisation scheme for cross-stitch
        :param layer_stitch_init_scheme: initialisation scheme for layer-stitch
        """
        self.word2id = {}  # word to index mapping
        self.char2id = {}  # char to index mapping
        self.task_names = task_names
        self.main_task = self.task_names[0]
        print('Using the first task as main task:', self.main_task, flush=True)
        self.model_dir = model_dir
        self.model_file = os.path.join(model_dir, MODEL_FILE)
        self.params_file = os.path.join(model_dir, PARAMS_FILE)
        self.cross_stitch = cross_stitch
        self.layer_connect = layer_connect
        self.num_subspaces = num_subspaces
        self.constraint_weight = constraint_weight
        self.constrain_matrices = constrain_matrices
        self.cross_stitch_init_scheme = cross_stitch_init_scheme
        self.layer_stitch_init_scheme = layer_stitch_init_scheme
        self.model = dynet.Model()  # init model
        # term to capture sum of constraints over all subspaces
        self.subspace_penalty = self.model.add_parameters(
            1, init=dynet.NumpyInitializer(np.zeros(1)))
        # weight of subspace constraint
        self.constraint_weight_param = self.model.add_parameters(
            1, init=dynet.NumpyInitializer(np.array(self.constraint_weight)))

        self.task2tag2idx = {}  # need one dictionary per task
        self.pred_layer = pred_layer
        self.in_dim = in_dim
        self.h_dim = h_dim
        self.c_in_dim = c_in_dim
        self.activation = activation
        self.lower = lower
        self.noise_sigma = noise_sigma
        self.h_layers = h_layers
        # keep track of the inner layers and the task predictors
        self.predictors = {
            'inner': [],
            'output_layers_dict': {},
            'task_expected_at': {}
        }
        self.wembeds = None  # lookup: embeddings for words
        self.cembeds = None  # lookup: embeddings for characters
        self.embeds_file = embeds_file
        self.char_rnn = None  # RNN for character input
Example 13
    def __init__(self,
                 head_count: int,
                 model_dim: int,
                 downsample_factor: int = 1,
                 input_dim: int = None,
                 ignore_masks: bool = False,
                 plot_attention: typing.Optional[str] = None,
                 diag_gauss_mask: typing.Union[bool, numbers.Real] = False,
                 square_mask_std: bool = True,
                 cross_pos_encoding_type: typing.Optional[str] = None,
                 kq_pos_encoding_type: typing.Optional[str] = None,
                 kq_pos_encoding_size: int = 40,
                 max_len: int = 1500,
                 param_init: xnmt.param_initializers.ParamInitializer = xnmt.param_initializers.GlorotInitializer(),
                 bias_init: xnmt.param_initializers.ParamInitializer = xnmt.param_initializers.ZeroInitializer(),
                 linear_kvq=None,
                 kq_positional_embedder=None,
                 layer_norm=None,
                 res_shortcut=None,
                 desc: typing.Any = None) -> None:
        if input_dim is None: input_dim = model_dim
        self.input_dim = input_dim
        assert model_dim % head_count == 0
        self.dim_per_head = model_dim // head_count
        self.model_dim = model_dim
        self.head_count = head_count
        assert downsample_factor >= 1
        self.downsample_factor = downsample_factor
        self.plot_attention = plot_attention
        self.plot_attention_counter = 0
        self.desc = desc

        self.ignore_masks = ignore_masks
        self.diag_gauss_mask = diag_gauss_mask
        self.square_mask_std = square_mask_std

        self.kq_pos_encoding_type = kq_pos_encoding_type
        self.kq_pos_encoding_size = kq_pos_encoding_size
        self.max_len = max_len

        subcol = param_collections.ParamManager.my_params(self)

        if self.kq_pos_encoding_type is None:
            self.linear_kvq = self.add_serializable_component(
                "linear_kvq", linear_kvq,
                lambda: transforms.Linear(input_dim * downsample_factor,
                                          head_count * self.dim_per_head * 3,
                                          param_init=param_init,
                                          bias_init=bias_init))
        else:
            self.linear_kq, self.linear_v = \
              self.add_serializable_component("linear_kvq",
                                              linear_kvq,
                                              lambda: [
                                                transforms.Linear(input_dim * downsample_factor + self.kq_pos_encoding_size,
                                                                  head_count * self.dim_per_head * 2, param_init=param_init,
                                                                  bias_init=bias_init),
                                                transforms.Linear(input_dim * downsample_factor, head_count * self.dim_per_head,
                                                                  param_init=param_init, bias_init=bias_init)])
            assert self.kq_pos_encoding_type == "embedding"
            self.kq_positional_embedder = self.add_serializable_component(
                "kq_positional_embedder", kq_positional_embedder, lambda:
                embedders.PositionEmbedder(max_pos=self.max_len,
                                           emb_dim=self.kq_pos_encoding_size,
                                           param_init=param_init))

        if self.diag_gauss_mask:
            if self.diag_gauss_mask == "rand":
                rand_init = np.exp(
                    (np.random.random(size=(self.head_count, ))) *
                    math.log(1000))
                self.diag_gauss_mask_sigma = subcol.add_parameters(
                    dim=(1, 1, self.head_count),
                    init=dy.NumpyInitializer(rand_init))
            else:
                self.diag_gauss_mask_sigma = subcol.add_parameters(
                    dim=(1, 1, self.head_count),
                    init=dy.ConstInitializer(self.diag_gauss_mask))

        self.layer_norm = self.add_serializable_component(
            "layer_norm", layer_norm, lambda: norms.LayerNorm(model_dim))

        if model_dim != input_dim * downsample_factor:
            self.res_shortcut = self.add_serializable_component(
                "res_shortcut", res_shortcut,
                lambda: transforms.Linear(input_dim * downsample_factor,
                                          model_dim,
                                          param_init=param_init,
                                          bias_init=bias_init))
        self.cross_pos_encoding_type = cross_pos_encoding_type
        if cross_pos_encoding_type == "embedding":
            self.cross_pos_emb_p1 = subcol.add_parameters(
                dim=(self.max_len, self.dim_per_head, self.head_count),
                init=dy.NormalInitializer(mean=1.0, var=0.001))
            self.cross_pos_emb_p2 = subcol.add_parameters(
                dim=(self.max_len, self.dim_per_head, self.head_count),
                init=dy.NormalInitializer(mean=1.0, var=0.001))
        elif cross_pos_encoding_type is not None:
            raise NotImplementedError()
Example 14
    def __init__(self,
                 model,
                 n_labels,
                 src_ctx_dim=400,
                 n_arc_mlp_units=400,
                 n_label_mlp_units=100,
                 arc_mlp_dropout=0.33,
                 label_mlp_dropout=0.33):

        Saxe_initializer = Saxe.Orthogonal(gain='leaky_relu', alpha=0.1)
        self.src_ctx_dim = src_ctx_dim
        self.label_mlp_dropout = label_mlp_dropout
        self.arc_mlp_dropout = arc_mlp_dropout
        self.n_labels = n_labels
        self.W_arc_hidden_to_head = model.add_parameters(
            (n_arc_mlp_units, src_ctx_dim),
            init=dy.NumpyInitializer(
                Saxe_initializer((n_arc_mlp_units, src_ctx_dim))))
        self.b_arc_hidden_to_head = model.add_parameters(
            (n_arc_mlp_units, ), init=dy.ConstInitializer(0))
        self.W_arc_hidden_to_dep = model.add_parameters(
            (n_arc_mlp_units, src_ctx_dim),
            init=dy.NumpyInitializer(
                Saxe_initializer((n_arc_mlp_units, src_ctx_dim))))
        self.b_arc_hidden_to_dep = model.add_parameters(
            (n_arc_mlp_units, ), init=dy.ConstInitializer(0))

        self.W_label_hidden_to_head = model.add_parameters(
            (n_label_mlp_units, src_ctx_dim),
            init=dy.NumpyInitializer(
                Saxe_initializer((n_label_mlp_units, src_ctx_dim))))
        self.b_label_hidden_to_head = model.add_parameters(
            (n_label_mlp_units, ), init=dy.ConstInitializer(0))
        self.W_label_hidden_to_dep = model.add_parameters(
            (n_label_mlp_units, src_ctx_dim),
            init=dy.NumpyInitializer(
                Saxe_initializer((n_label_mlp_units, src_ctx_dim))))
        self.b_label_hidden_to_dep = model.add_parameters(
            (n_label_mlp_units, ), init=dy.ConstInitializer(0))

        self.U_arc_1 = model.add_parameters((n_arc_mlp_units, n_arc_mlp_units),
                                            init=dy.ConstInitializer(0))
        self.u_arc_2 = model.add_parameters((n_arc_mlp_units),
                                            init=dy.ConstInitializer(0))

        self.U_label_1 = [
            model.add_parameters((n_label_mlp_units, n_label_mlp_units),
                                 init=dy.ConstInitializer(0))
            for _ in range(n_labels)
        ]
        self.u_label_2_2 = [
            model.add_parameters((1, n_label_mlp_units),
                                 init=dy.ConstInitializer(0))
            for _ in range(n_labels)
        ]
        self.u_label_2_1 = [
            model.add_parameters((n_label_mlp_units, 1),
                                 init=dy.ConstInitializer(0))
            for _ in range(n_labels)
        ]
        self.b_label = [
            model.add_parameters((1, ), init=dy.ConstInitializer(0))
            for _ in range(n_labels)
        ]
Example 15
    def __init__(self,
                 model,
                 pos_labels, xpos_labels,
                 src_ctx_dim=400,
                 n_pos_tagger_mlp_units=200,
                 n_xpos_tagger_mlp_units=200,
                 mlps_dropout=0.33):


        self.src_ctx_dim = src_ctx_dim
        self.dropout = mlps_dropout
        self.pos_labels = pos_labels
        self.xpos_labels = xpos_labels

        Saxe_initializer = Saxe.Orthogonal(gain='leaky_relu', alpha=0.1)

        self.W_pos = model.add_parameters(
            (n_pos_tagger_mlp_units, src_ctx_dim),
            init=dy.NumpyInitializer(Saxe_initializer((n_pos_tagger_mlp_units, src_ctx_dim))))
        self.b_pos = model.add_parameters((n_pos_tagger_mlp_units, ), init=dy.ConstInitializer(0))
        self.W_xpos = model.add_parameters(
            (n_xpos_tagger_mlp_units, src_ctx_dim),
            init=dy.NumpyInitializer(Saxe_initializer((n_xpos_tagger_mlp_units, src_ctx_dim))))
        self.b_xpos = model.add_parameters((n_xpos_tagger_mlp_units, ), init=dy.ConstInitializer(0))

        self.W_affine_pos = model.add_parameters((n_pos_tagger_mlp_units, pos_labels), init=dy.ConstInitializer(0))
        self.b_affine_pos = model.add_parameters((pos_labels), init=dy.ConstInitializer(0))
        self.W_affine_xpos = model.add_parameters((n_xpos_tagger_mlp_units, xpos_labels), init=dy.ConstInitializer(0))
        self.b_affine_xpos = model.add_parameters((xpos_labels), init=dy.ConstInitializer(0))
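
A minimal forward-pass sketch for the tagger above (hypothetical: ctx stands for a src_ctx_dim-sized context expression, and dy.rectify stands in for the leaky ReLU that the Saxe gain targets):

    def pos_scores(self, ctx):
        W_pos = dy.parameter(self.W_pos)
        b_pos = dy.parameter(self.b_pos)
        W_affine = dy.parameter(self.W_affine_pos)
        b_affine = dy.parameter(self.b_affine_pos)
        hidden = dy.rectify(W_pos * ctx + b_pos)
        # W_affine_pos is stored as (n_pos_tagger_mlp_units, pos_labels), so it is
        # transposed before projecting the hidden state to label scores
        return dy.transpose(W_affine) * hidden + b_affine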