Example #1
    def __init__(self, model: dy.Model, vocab_size: int):
        self.input_lookup = model.add_lookup_parameters(
            (vocab_size, EMBEDDINGS_SIZE))
        self.enc_fwd_lstm = dy.LSTMBuilder(LSTM_NUM_OF_LAYERS, EMBEDDINGS_SIZE,
                                           LSTM_STATE_SIZE, model)
        self.enc_bwd_lstm = dy.LSTMBuilder(LSTM_NUM_OF_LAYERS, EMBEDDINGS_SIZE,
                                           LSTM_STATE_SIZE, model)
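
A minimal usage sketch added for illustration (not part of the original example): it shows how a lookup table and forward/backward LSTM builders like the ones above are typically combined in DyNet to encode a sentence. The toy sizes stand in for EMBEDDINGS_SIZE, LSTM_NUM_OF_LAYERS and LSTM_STATE_SIZE.

import dynet as dy

_model = dy.Model()
_lookup = _model.add_lookup_parameters((10, 4))   # (vocab_size, embedding_dim)
_fwd = dy.LSTMBuilder(1, 4, 8, _model)            # forward encoder
_bwd = dy.LSTMBuilder(1, 4, 8, _model)            # backward encoder

def encode_sentence(word_ids):
    dy.renew_cg()
    embedded = [_lookup[w] for w in word_ids]
    fwd_out = _fwd.initial_state().transduce(embedded)
    bwd_out = _bwd.initial_state().transduce(list(reversed(embedded)))
    # one vector per token: forward state concatenated with the matching backward state
    return [dy.concatenate([f, b]) for f, b in zip(fwd_out, reversed(bwd_out))]

print(encode_sentence([1, 2, 3])[0].dim())        # ((16,), 1)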
Example #2
    def __init__(self,
                 opts=None,
                 parameters=None,
                 models_path=None,
                 model_path=None,
                 model_epoch_dir_path=None,
                 overwrite_mappings=0):
        """
        Initialize the model. We either provide the parameters and a path where
        we store the models, or the location of a trained model.
        """

        self.training = True
        self.n_bests = 0
        self.overwrite_mappings = overwrite_mappings

        self.model = Model()

        if model_path is None:
            assert parameters and models_path and opts
            # Create a name based on the parameters
            self.parameters = parameters
            self.opts = opts
            self.name = get_name(parameters)
            # MainTaggerModel location
            available_model_subpath, model_path_id = create_a_model_subpath(
                models_path)
            # model_path = os.path.join(models_path, available_model_subpath)
            add_a_model_path_to_the_model_paths_database(
                models_path, available_model_subpath, get_name(parameters))
            self.model_path = available_model_subpath
            # model_path = os.path.join(models_path, self.name)
            # self.model_path = model_path
            self.parameters_path = os.path.join(self.model_path,
                                                'parameters.pkl')
            self.mappings_path = os.path.join(self.model_path, 'mappings.pkl')
            self.opts_path = os.path.join(self.model_path, 'opts.pkl')
            # Create directory for the model if it does not exist
            if not os.path.exists(self.model_path):
                os.makedirs(self.model_path)
            # Save the parameters to disk
            with open(self.parameters_path, 'wb') as f:
                pickle.dump(parameters, f)
            # Save the command line parameters to disk
            with open(self.opts_path, 'wb') as f:
                pickle.dump(opts, f)
        else:
            assert parameters is None and opts is None and models_path and model_path and model_epoch_dir_path
            # MainTaggerModel location
            self.model_path = model_path
            self.parameters_path = os.path.join(model_path, 'parameters.pkl')
            self.mappings_path = os.path.join(model_path, 'mappings.pkl')
            self.opts_path = os.path.join(model_path, 'opts.pkl')
            # Load the parameters and the mappings from disk
            with open(self.parameters_path, 'rb') as f:
                self.parameters = pickle.load(f)
            # Read opts from the saved file
            with open(self.opts_path, 'rb') as f:
                self.opts = pickle.load(f)
            self.reload_mappings()

        self.components = {}
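
    # --- Added note (assumption, not part of the original code) ---
    # Per the docstring, the constructor supports two modes:
    #   (a) train from scratch:
    #         MainTaggerModel(opts=opts, parameters=params, models_path="models/")
    #       allocates a fresh model directory and pickles parameters/opts into it;
    #   (b) reload a trained model:
    #         MainTaggerModel(models_path="models/", model_path="models/<id>",
    #                         model_epoch_dir_path="models/<id>/epoch-00000000")
    #       re-reads parameters.pkl, opts.pkl and mappings.pkl from disk.
    # `opts`, `params` and all directory names above are hypothetical placeholders.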
Example #3
class MainTaggerModel(object):
    """
    Network architecture.
    """
    id_to_word = {}  # type: dict
    id_to_char = {}  # type: dict
    id_to_tag = {}  # type: dict
    id_to_morpho_tag = {}  # type: dict

    saver = None  # type: DynetSaver

    def __init__(self,
                 opts=None,
                 parameters=None,
                 models_path=None,
                 model_path=None,
                 model_epoch_dir_path=None,
                 overwrite_mappings=0):
        """
        Initialize the model. We either provide the parameters and a path where
        we store the models, or the location of a trained model.
        """

        self.training = True
        self.n_bests = 0
        self.overwrite_mappings = overwrite_mappings

        self.model = Model()

        if model_path is None:
            assert parameters and models_path and opts
            # Create a name based on the parameters
            self.parameters = parameters
            self.opts = opts
            self.name = get_name(parameters)
            # MainTaggerModel location
            available_model_subpath, model_path_id = create_a_model_subpath(
                models_path)
            # model_path = os.path.join(models_path, available_model_subpath)
            add_a_model_path_to_the_model_paths_database(
                models_path, available_model_subpath, get_name(parameters))
            self.model_path = available_model_subpath
            # model_path = os.path.join(models_path, self.name)
            # self.model_path = model_path
            self.parameters_path = os.path.join(self.model_path,
                                                'parameters.pkl')
            self.mappings_path = os.path.join(self.model_path, 'mappings.pkl')
            self.opts_path = os.path.join(self.model_path, 'opts.pkl')
            # Create directory for the model if it does not exist
            if not os.path.exists(self.model_path):
                os.makedirs(self.model_path)
            # Save the parameters to disk
            with open(self.parameters_path, 'wb') as f:
                pickle.dump(parameters, f)
            # Save the command line parameters to disk
            with open(self.opts_path, 'wb') as f:
                pickle.dump(opts, f)
        else:
            assert parameters is None and opts is None and models_path and model_path and model_epoch_dir_path
            # MainTaggerModel location
            self.model_path = model_path
            self.parameters_path = os.path.join(model_path, 'parameters.pkl')
            self.mappings_path = os.path.join(model_path, 'mappings.pkl')
            self.opts_path = os.path.join(model_path, 'opts.pkl')
            # Load the parameters and the mappings from disk
            with open(self.parameters_path, 'rb') as f:
                self.parameters = pickle.load(f)
            # Read opts from the saved file
            with open(self.opts_path, 'rb') as f:
                self.opts = pickle.load(f)
            self.reload_mappings()

        self.components = {}

    def save_mappings(self, id_to_word, id_to_char, id_to_tag,
                      id_to_morpho_tag):
        """
        We need to save the mappings if we want to use the model later.
        """
        self.id_to_word = id_to_word
        self.id_to_char = id_to_char
        self.id_to_tag = id_to_tag
        self.id_to_morpho_tag = id_to_morpho_tag

        if os.path.exists(self.mappings_path) and not self.overwrite_mappings:
            print(
                "Aborting. A previous mappings file exists. You must explicitly request overwriting the mappings file."
            )
            sys.exit(1)
        else:
            with open(self.mappings_path, 'wb') as f:
                mappings = {
                    'id_to_word': self.id_to_word,
                    'id_to_char': self.id_to_char,
                    'id_to_tag': self.id_to_tag,
                    'id_to_morpho_tag': self.id_to_morpho_tag,
                }
                pickle.dump(mappings, f)

    def reload_mappings(self):
        """
        Load mappings from disk.
        """
        with open(self.mappings_path, 'rb') as f:
            mappings = pickle.load(f)
        self.id_to_word = mappings['id_to_word']
        self.id_to_char = mappings['id_to_char']
        self.id_to_tag = mappings['id_to_tag']
        self.id_to_morpho_tag = mappings['id_to_morpho_tag']

    def add_component(self, param):
        """
        Add a new parameter to the network.
        """
        if param.name in self.components:
            raise Exception('The network already has a parameter "%s"!' %
                            param.name)
        self.components[param.name] = param

    def save(self, epoch=None, best_performances=[], epoch_costs=[]):
        """
        Write components values to disk.
        """

        # for name, param in self.components.items():
        #     param_path = os.path.join(path, "%s.mat" % name)
        #     if hasattr(param, 'params'):
        #         param_values = {p.name: p.get_value() for p in param.params}
        #     else:
        #         param_values = {name: param.get_value()}
        #     scipy.io.savemat(param_path, param_values)

        self.saver.save(epoch=epoch, n_bests=self.n_bests)

        self.save_best_performances_and_costs(epoch, best_performances,
                                              epoch_costs)

    def save_best_performances_and_costs(self, epoch, best_performances,
                                         epoch_costs):

        path = self.model_path
        model_ckpt_filename = ("model-epoch-%08d" %
                               epoch) if epoch is not None else (
                                   "best-models-%08d" % self.n_bests)
        if len(best_performances) > 0:
            best_performances_path = os.path.join(
                path, "%s-%s.txt" % (model_ckpt_filename, "best_performances"))
            best_performances_f = open(best_performances_path, "w")
            best_performances_f.write(
                " ".join([str(b) for b in best_performances]) + "\n")
            best_performances_f.close()
        if len(epoch_costs) > 0:
            epoch_cost_path = os.path.join(
                path, "%s-%s.txt" % (model_ckpt_filename, "epoch_cost"))
            epoch_cost_f = open(epoch_cost_path, "w")
            epoch_cost_f.write(" ".join([str(e) for e in epoch_costs]) + "\n")
            epoch_cost_f.write(str(np.mean(epoch_costs)) + "\n")
            epoch_cost_f.close()

    def reload(self, model_epoch_dir_path=None):
        """
        Load components values from disk.
        """
        path = self.model_path
        if model_epoch_dir_path:
            path = model_epoch_dir_path
        else:
            path = os.path.join(path, "epoch-%08d" % 0)
        print(path)
        assert os.path.exists(path)

        self.saver.restore(os.path.join(path, "model.ckpt"))

    def get_last_layer_context_representations(
            self, sentence, context_representations_for_crf_loss,
            context_representations_for_md_loss):
        last_layer_context_representations = context_representations_for_crf_loss

        if self.parameters['active_models'] in [1, 2, 3]:

            if self.parameters['active_models'] == 1 and \
                   self.parameters['integration_mode'] != 0:
                assert False, "integration_mode should be set to zero when active_models == 1"

            if self.parameters['debug'] == 1:
                print(("str_words", sentence["str_words"]))
            morph_analysis_representations, morph_analysis_scores = \
                self.get_morph_analysis_representations_and_scores(sentence,
                                                                   context_representations_for_md_loss)

            selected_morph_analysis_representations = \
                self.disambiguate_morph_analyzes(morph_analysis_scores)

            if 'golden_morph_analysis_indices' in list(sentence.keys()):
                md_loss = dynet.esum([
                    dynet.pickneglogsoftmax(morph_analysis_scores_for_word,
                                            golden_idx)
                    for golden_idx, morph_analysis_scores_for_word in zip(
                        sentence['golden_morph_analysis_indices'],
                        morph_analysis_scores)
                ])
            else:
                md_loss = dynet.scalarInput(0)

            if self.parameters['integration_mode'] == 2:
                # integration_mode == 2: use two levels of context, the first for
                # morphological disambiguation, then concatenate the selected
                # morphological analysis representation before computing tag_scores
                last_layer_context_representations = \
                    [dynet.concatenate([context,
                                        morph_analysis_representations[word_pos]
                                        [selected_morph_analysis_representation_pos]])
                     for word_pos, (selected_morph_analysis_representation_pos, context) in
                     enumerate(
                         zip(selected_morph_analysis_representations, context_representations_for_crf_loss))]
            if md_loss.value() > 1000:
                logging.error("BEEP")
        else:
            # only the plain old NER model
            # we must decide whether we should implement the morphological embeddings scheme here.
            md_loss = dynet.scalarInput(0)
            selected_morph_analysis_representations = None
            last_layer_context_representations = context_representations_for_crf_loss

        assert last_layer_context_representations is not None
        return last_layer_context_representations, md_loss, selected_morph_analysis_representations

    def get_morph_analysis_scores(self, morph_analysis_representations,
                                  context_representations):

        # (10) and (11) in Shen et al. "The Role of Context ..."
        def transform_context(context):
            return dynet.tanh(
                dynet.affine_transform([
                    self.transform_context_layer_b.expr(),
                    self.transform_context_layer_W.expr(), context
                ]))
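            # i.e. tanh(b_tc + W_tc * context), matching eqs. (10)-(11) cited above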
            #return dynet.tanh(dynet.sum_cols(dynet.reshape(context, (int(self.sentence_level_bilstm_contexts_length/2), 2))))

        if self.parameters['debug'] == 1:
            print(("morph_analysis_representations",
                   morph_analysis_representations))
            print(("context_representations", context_representations))

        morph_analysis_scores = \
            [dynet.softmax(
                dynet.concatenate([dynet.dot_product(morph_analysis_representation,
                                                     transform_context(context)) # sum + tanh for context[:half] and context[half:]
                                   for morph_analysis_representation in
                                   morph_analysis_representations[word_pos]]))
                for word_pos, context in enumerate(context_representations)]
        return morph_analysis_scores

    def get_morph_analysis_representations_and_scores(self, sentence,
                                                      context_representations):

        morph_analysis_representations = self.get_morph_analysis_representations(
            sentence)

        morph_analysis_scores = self.get_morph_analysis_scores(
            morph_analysis_representations, context_representations)

        return morph_analysis_representations, morph_analysis_scores

    def disambiguate_morph_analyzes(self, morph_analysis_scores):

        selected_morph_analysis_representations = [
            np.argmax(morph_analysis_scores_for_word.npvalue())
            for morph_analysis_scores_for_word in morph_analysis_scores
        ]

        return selected_morph_analysis_representations

    def build(self,
              char_dim,
              char_lstm_dim,
              ch_b,
              mt_d,
              word_dim,
              word_lstm_dim,
              w_b,
              lr_method,
              pre_emb,
              cap_dim,
              training=True,
              **kwargs):
        """
        Build the network.
        """

        self.training = training

        def _create_get_representation(activation_function=lambda x: x):
            """
            Helper that creates a function which assembles one representation per
            BiLSTM layer using the given activation_function.
            :param activation_function: applied to the concatenated final
                forward/backward states of each layer
            :return: a function taking (builder, input expressions) and returning
                a list with one representation per layer
            """
            def f(obj, es):
                representations = []
                # for e in es:
                #     dynet.ensure_freshness(e)
                for (fb, bb) in obj.builder_layers:
                    fs = fb.initial_state().transduce(es)
                    bs = bb.initial_state().transduce(reversed(es))
                    es = [
                        dynet.concatenate([f, b])
                        for f, b in zip(fs, reversed(bs))
                    ]
                    representations.append(
                        activation_function(dynet.concatenate([fs[-1],
                                                               bs[-1]])))
                return representations

            return f

        BiRNNBuilder.get_representation = _create_get_representation(
            activation_function=dynet.rectify)
        BiRNNBuilder.get_representation_concat = _create_get_representation()
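        # Note: the two assignments above monkey-patch dynet's BiRNNBuilder at
        # runtime so that builder instances used below gain get_representation /
        # get_representation_concat methods.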

        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        n_morpho_tags = len(self.id_to_morpho_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 17

        # Final input (all word features)
        word_representation_dim = 0

        def get_scale(shape):
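            # Glorot/Xavier-style bound: sqrt(6 / (fan_in + fan_out))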
            return np.sqrt(6 / np.sum(list(shape)))

        #
        # Word inputs
        #
        if word_dim:
            # Initialize with pretrained embeddings
            scale = get_scale((n_words, word_dim))
            new_weights = scale * np.random.uniform(-1.0, 1.0,
                                                    (n_words, word_dim))
            # new_weights = np.zeros([n_words, word_dim], dtype='float32')
            if pre_emb and training:
                print('Loading pretrained embeddings from %s...' % pre_emb)
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print('WARNING: %i invalid lines' % emb_invalid)
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in range(n_words):
                    raw_word = self.id_to_word[i]
                    if raw_word != "<UNK>":
                        # word = raw_word.split(" ")[1]
                        word = raw_word
                    else:
                        word = raw_word
                    # print word
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub(r'\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[re.sub(
                            r'\d', '0', word.lower())]
                        c_zeros += 1

                print('Loaded %i pretrained embeddings.' % len(pretrained))
                print(('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') %
                      (c_found + c_lower + c_zeros, n_words, 100. *
                       (c_found + c_lower + c_zeros) / n_words))
                print(('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') %
                      (c_found, c_lower, c_zeros))
            word_representation_dim += word_dim
            self.word_embeddings = self.model.add_lookup_parameters(
                (n_words, word_dim),
                init=dynet.NumpyInitializer(new_weights),
                name="wordembeddings")

        def create_bilstm_layer(label, input_dim, lstm_dim, bilstm=True):
            if bilstm:
                builder = BiRNNBuilder(1, input_dim, lstm_dim, self.model,
                                       CoupledLSTMBuilder)
            else:
                builder = CoupledLSTMBuilder(1, input_dim, lstm_dim,
                                             self.model)

            return builder

        # Chars inputs
        #
        if char_dim:
            self.char_embeddings = self.model.add_lookup_parameters(
                (n_chars, char_dim), name="charembeddings")

            self.char_lstm_layer = create_bilstm_layer(
                "char",
                char_dim, (2 if ch_b else 1) * char_lstm_dim,
                bilstm=True if ch_b else False)

            word_representation_dim += (2 if ch_b else 1) * char_lstm_dim

        # if self.parameters['integration_mode'] in [1, 2] or self.parameters['active_models'] in [1,
        #                                                                                          2,
        #                                                                                          3]:
        if self.parameters['active_models'] in [1, 2, 3]:

            self.char_lstm_layer_for_morph_analysis_roots = \
                create_bilstm_layer("char_for_morph_analysis_root",
                                    char_dim,
                                    2 * mt_d,
                                    bilstm=True)

            self.morpho_tag_embeddings = self.model.add_lookup_parameters(
                (n_morpho_tags, mt_d), name="morphotagembeddings")
            self.morpho_tag_lstm_layer_for_morph_analysis_tags = \
                create_bilstm_layer("morpho_tag_for_morph_analysis_tags",
                                    mt_d,
                                    2 * mt_d,
                                    bilstm=True)

        if self.parameters[
                'use_golden_morpho_analysis_in_word_representation']:

            assert self.parameters['integration_mode'] == 0 and \
                   self.parameters['active_models'] == 0, "This feature is only meaningful when the model solely targets the NER task."

            self.morpho_tag_embeddings = self.model.add_lookup_parameters(
                (n_morpho_tags, mt_d), name="morphotagembeddings")

            self.old_style_morpho_tag_lstm_layer_for_golden_morpho_analyzes = \
                create_bilstm_layer("old_style_morpho_tag_lstm_layer_for_golden_morpho_analyzes",
                                    mt_d,
                                    2 * mt_d,
                                    bilstm=True)

            word_representation_dim += 2 * mt_d

        #
        # Capitalization feature
        #
        if cap_dim:
            word_representation_dim += cap_dim
            self.cap_embeddings = self.model.add_lookup_parameters(
                (n_cap, cap_dim), name="capembeddings")

        if self.parameters['multilayer'] and self.parameters[
                'shortcut_connections']:
            shortcut_connection_addition = word_representation_dim
            self.sentence_level_bilstm_contexts_length = shortcut_connection_addition + 2 * word_lstm_dim
        else:
            self.sentence_level_bilstm_contexts_length = 2 * word_lstm_dim
        # else:
        #     self.sentence_level_bilstm_contexts_length = word_lstm_dim # TODO: Q: as the output of self.tanh_layer_W will be used. right?

        self.tanh_layer_W = self.model.add_parameters(
            (word_lstm_dim, self.sentence_level_bilstm_contexts_length))
        self.tanh_layer_b = self.model.add_parameters((word_lstm_dim))

        if self.parameters['integration_mode'] in [0, 1]:
            self.last_layer_W = self.model.add_parameters(
                (n_tags, word_lstm_dim))
        elif self.parameters['integration_mode'] == 2:
            self.last_layer_W = self.model.add_parameters(
                (n_tags, word_lstm_dim + 2 * mt_d))

        self.last_layer_b = self.model.add_parameters((n_tags))

        self.transform_context_layer_b = \
            self.model.add_parameters((2 * mt_d))
        self.transform_context_layer_W = \
            self.model.add_parameters((2 * mt_d, self.sentence_level_bilstm_contexts_length))

        # LSTM for words
        # self.sentence_level_bilstm_layer = \
        #     create_bilstm_layer("sentence_level",
        #                         word_representation_dim,
        #                         2 * word_lstm_dim,
        #                         bilstm=True if w_b else False)

        from toolkit.rnn import BiLSTMMultiLayeredWithShortcutConnections

        if self.parameters['multilayer']:
            self.num_sentence_level_bilstm_layers = 3
        else:
            self.num_sentence_level_bilstm_layers = 1

        self.sentence_level_bilstm_layer = \
            BiLSTMMultiLayeredWithShortcutConnections(self.num_sentence_level_bilstm_layers,
                                                      word_representation_dim,
                                                      2 * word_lstm_dim,
                                                      self.model,
                                                      CoupledLSTMBuilder,
                                                      self.parameters['shortcut_connections'])

        def _create_tying_method(activation_function=dynet.tanh, classic=True):
            def f(x, y):
                if classic:
                    return dynet.tanh(x + y)
                else:
                    return activation_function(self.tying_method_W *
                                               dynet.concatenate([x, y]) +
                                               self.tying_method_b)

            return f

        if self.parameters['tying_method']:
            self.tying_method_W = self.model.add_parameters(
                (word_lstm_dim, 2 * mt_d))
            self.tying_method_b = self.model.add_parameters((word_lstm_dim))

            self.f_tying_method = _create_tying_method(
                activation_function=dynet.tanh, classic=False)
        else:
            self.f_tying_method = _create_tying_method(
                activation_function=dynet.tanh, classic=True)

        self.crf_module = CRF(self.model, self.id_to_tag)

        # Training
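        # lr_method is expected in the form "<algo>-<name>@<value>-...", e.g. the
        # (hypothetical) string "sgd-learning_rate_float@0.01" yields the update
        # algorithm "sgd" and the hyperparameter list [["learning_rate", 0.01]].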
        def process_hyperparameter_definition(x):
            tokens = x.split("@")
            subtokens = tokens[0].split("_")
            if len(subtokens) > 1 and subtokens[-1] == "float":
                return ["_".join(subtokens[:-1]), float(tokens[1])]
            else:
                return tokens

        _tokens = lr_method.split("-")
        opt_update_algorithm = _tokens[0]
        opt_hyperparameters = [
            process_hyperparameter_definition(x) for x in _tokens[1:]
        ]
        opt_update_algorithms = {
            'sgd': dynet.SimpleSGDTrainer,
            'adam': dynet.AdamTrainer,
            'adadelta': dynet.AdadeltaTrainer,
            'adagrad': dynet.AdagradTrainer,
            'momentum': dynet.MomentumSGDTrainer,
            'rmsprop': dynet.RMSPropTrainer
        }

        if opt_update_algorithm == "adam":
            opt_hyperparameters += [("sparse_updates_enabled",
                                     self.parameters['sparse_updates_enabled'])
                                    ]

        self.trainer = opt_update_algorithms[opt_update_algorithm](
            self.model,
            # sparse_updates_enabled=self.parameters['sparse_updates_enabled'],
            **{name: value
               for name, value in opt_hyperparameters})

        # self.trainer = dynet.SimpleSGDTrainer(self.model, learning_rate=0.01)

        self.saver = DynetSaver(self.model, self.model_path)

        return self

    def get_char_representations(self, sentence):
        # initial_state = self.char_lstm_layer.initial_state()

        char_embeddings = [[
            self.char_embeddings[char_id] for char_id in word
        ] for sentence_pos, word in enumerate(sentence['char_for_ids'])]

        char_representations = []
        for sentence_pos, char_embeddings_for_word in enumerate(
                char_embeddings):
            # print char_embeddings_for_word
            try:
                # char_representations.append(
                #     self.char_lstm_layer.transduce(char_embeddings_for_word)[-1])
                char_representations.append(
                    self.char_lstm_layer.get_representation_concat(
                        char_embeddings_for_word)[0])
            except IndexError as e:
                print(sentence)
                print(char_embeddings_for_word)
                print(e)
        return char_representations

    def get_sentence_level_bilstm_outputs(
            self, combined_word_representations,
            which_layer_to_use_for_morpho_disamb):
        """
        This function produces the context representations at each level given the word representations
        for each word and returns the last layer's output and the specific layer output which we want
        to use for morphological disambiguation.
         :param combined_word_representations: per-word input representations for the whole sentence
         :param which_layer_to_use_for_morpho_disamb: 1-based index of the BiLSTM layer whose outputs are used for morphological disambiguation
         :type which_layer_to_use_for_morpho_disamb: int
         :return: two outputs: 1) layer output to be used for NER loss, 2) layer output to be used for MD loss
        """

        last_layer_context_representations, multilayered_context_representations = \
            self.sentence_level_bilstm_layer.transduce(combined_word_representations)

        last_layer_context_representations = [dynet.tanh(dynet.affine_transform([self.tanh_layer_b.expr(),
                                                                      self.tanh_layer_W.expr(),
                                                                      context])) \
                                   for context in last_layer_context_representations]
        return last_layer_context_representations, \
               multilayered_context_representations[which_layer_to_use_for_morpho_disamb-1]

    def predict(self, sentence):

        context_representations_for_ner_loss, context_representations_for_md_loss = \
            self.get_context_representations(sentence, training=False)

        last_layer_context_representations, _, _ = \
            self.get_last_layer_context_representations(sentence,
                                                        context_representations_for_ner_loss,
                                                        context_representations_for_md_loss)

        if self.parameters['active_models'] in [0, 2, 3]:
            tag_scores = self.calculate_tag_scores(
                last_layer_context_representations)
            # _, decoded_tags = self.crf_module.viterbi_loss(tag_scores,
            #                                                   sentence['tag_ids'])
            observations = [
                dynet.concatenate([obs, dynet.inputVector([-1e10, -1e10])],
                                  d=0) for obs in tag_scores
            ]
            decoded_tags, _ = self.crf_module.viterbi_decoding(observations)
        else:
            decoded_tags = []

        # if self.parameters['integration_mode'] in [1, 2] or self.parameters['active_models'] == 1:
        if self.parameters['active_models'] in [1, 2, 3]:
            morph_analysis_representations, morph_analysis_scores = \
                self.get_morph_analysis_representations_and_scores(sentence,
                                                                   context_representations_for_md_loss)

            selected_morph_analysis_representations = \
                self.disambiguate_morph_analyzes(morph_analysis_scores)
        else:
            selected_morph_analysis_representations = []

        return selected_morph_analysis_representations, decoded_tags

    def get_loss(self,
                 sentences_in_the_batch,
                 loss_configuration_parameters=None):
        # immediate_compute=True, check_validity=True
        # read configuration

        dynet.renew_cg()
        loss_array = []
        for sentence in sentences_in_the_batch:
            """
            data.append({
                'str_words': str_words,
                'word_ids': words,
                'char_for_ids': chars,
                'char_lengths': [len(char) for char in chars],
                'cap_ids': caps,
                'tag_ids': tags,
                'morpho_tag_ids': morpho_tags,
                'morpho_analyzes_tags': morph_analyzes_tags,
                'morpho_analyzes_roots': morph_analyzes_roots,
                'golden_morph_analysis_indices': golden_analysis_indices,
                'sentence_lengths': len(s),
                'max_word_length_in_this_sample': max([len(x) for x in chars])
            })
            """
            """
                Our new approach will pick the appropriate model design according to the available labels.
                For example, if a sample contains only golden NER tags, only crf_loss will be added to the list of loss
                expressions to be updated. It is similar for other cases where only golden MD tags are available or when
                both are available. The latter one is only possible for Turkish.
            """

            context_representations_for_ner_loss, context_representations_for_md_loss = \
                self.get_context_representations(sentence)

            last_layer_context_representations, md_loss, _ = \
                self.get_last_layer_context_representations(sentence,
                                                            context_representations_for_ner_loss,
                                                            context_representations_for_md_loss)
            if self.parameters['active_models'] in [
                    0, 2, 3
            ]:  # 0: NER, 1: MD, 2: JOINT, 3: JOINT_MULTILAYER
                tag_scores = self.calculate_tag_scores(
                    last_layer_context_representations)

                if len(sentence['tag_ids']) > 0:
                    crf_loss = self.crf_module.neg_log_loss(
                        tag_scores, sentence['tag_ids'])
                else:
                    crf_loss = dynet.scalarInput(0)

                if crf_loss.value() > 1000:
                    logging.error("BEEP")
                loss_array.append(crf_loss)

            if self.parameters['active_models'] in [1, 2, 3]:
                loss_array.append(md_loss)

        return dynet.esum(loss_array)

    def calculate_tag_scores(self, context_representations):

        tag_scores = [dynet.affine_transform([self.last_layer_b.expr(),
                                              self.last_layer_W.expr(),
                                              context]) \
                      for context in context_representations]
        return tag_scores

    def get_morph_analysis_representation_in_old_style(self, sentence):
        # these morpho_tag_ids are either chars or tags depending on the morpho_tag_type
        return [self.old_style_morpho_tag_lstm_layer_for_golden_morpho_analyzes\
                    .get_representation_concat([self.morpho_tag_embeddings[morpho_tag_id] for morpho_tag_id in morpho_tag_sequence])[0]
                for morpho_tag_sequence in sentence['morpho_tag_ids']]

    def get_combined_word_representations(self, sentence, training=None):
        """
        
        :param training:
        :param sentence: whole sentence with input values as ids
        :return: word representations made up according to the user preferences
        """

        if training is None:
            training = self.training

        representations_to_be_zipped = []
        word_embedding_based_representations = \
            [self.word_embeddings[word_id] for word_id in sentence['word_ids']]
        representations_to_be_zipped.append(
            dynet.concatenate([
                dynet.transpose(x)
                for x in word_embedding_based_representations
            ]))
        char_representations = self.get_char_representations(sentence)
        representations_to_be_zipped.append(
            dynet.concatenate(
                [dynet.transpose(x) for x in char_representations]))
        if self.parameters[
                'use_golden_morpho_analysis_in_word_representation']:
            morph_tag_based_representations = self.get_morph_analysis_representation_in_old_style(
                sentence)
            representations_to_be_zipped.append(
                dynet.concatenate([
                    dynet.transpose(x) for x in morph_tag_based_representations
                ]))
        if self.parameters['cap_dim'] > 0:
            cap_embedding_based_representations = \
                [self.cap_embeddings[cap_id] for cap_id in sentence['cap_ids']]
            representations_to_be_zipped.append(
                dynet.concatenate([
                    dynet.transpose(x)
                    for x in cap_embedding_based_representations
                ]))
            # combined_word_representations = [dynet.concatenate([x, y, z, xx]) for x, y, z, xx in
            #                                  zip(*representations_to_be_zipped)]
        # else:
        # combined_word_representations = [dynet.concatenate([x, y, xx]) for x, y, xx in
        #                                  zip(*representations_to_be_zipped)]

        combined_word_representations = dynet.concatenate_cols(
            representations_to_be_zipped)
        # print combined_word_representations
        # print self.parameters

        if training:
            combined_word_representations = [
                dynet.dropout(x, p=self.parameters['dropout'])
                for x in combined_word_representations
            ]
        else:
            combined_word_representations = [
                x for x in combined_word_representations
            ]

        return combined_word_representations

    def get_context_representations(self, sentence, training=None):
        """
        
        :param training:
        :param sentence: whole sentence with input values as ids
        :return: a pair of context representations: one for the NER (CRF) loss and one for the MD loss
        """

        if training is None:
            training = self.training

        combined_word_representations = self.get_combined_word_representations(
            sentence, training=training)

        context_representations_for_ner_loss, context_representations_for_md_loss = \
            self.get_sentence_level_bilstm_outputs(combined_word_representations,
                                                   1 if self.parameters['multilayer'] else 1)
        return context_representations_for_ner_loss, context_representations_for_md_loss

    def get_morph_analysis_representations(self, sentence):

        try:
            root_representations = \
                [[self.char_lstm_layer_for_morph_analysis_roots.get_representation([self.char_embeddings[char_id]
                                                              for char_id in root_char_sequence])[0]
                for root_char_sequence in root_as_char_sequences_for_word]
                for root_as_char_sequences_for_word in sentence['morpho_analyzes_roots']]
        except IndexError as e:
            print(e)
            # comprehension variables do not leak in Python 3, so dump the raw input instead
            print(sentence['morpho_analyzes_roots'])
            raise

        try:
            morpho_tag_sequence_representations = \
                [[self.morpho_tag_lstm_layer_for_morph_analysis_tags.get_representation([self.morpho_tag_embeddings[morpho_tag_id]
                                                              for morpho_tag_id in morpho_tag_sequence])[0]
                for morpho_tag_sequence in morpho_tag_sequences_for_word]
                for morpho_tag_sequences_for_word in sentence['morpho_analyzes_tags']]
        except IndexError as e:
            print(e)
            print(sentence['morpho_analyzes_tags'])
            raise

        tyed_representations_for_every_analysis = \
            [[self.f_tying_method(root_representation, morpho_tag_representation)
             for root_representation, morpho_tag_representation in
                 zip(root_representations_for_word, morpho_tag_representations_for_word)]
             for root_representations_for_word, morpho_tag_representations_for_word in
                 zip(root_representations, morpho_tag_sequence_representations)]

        return tyed_representations_for_every_analysis
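
    # --- Added usage sketch (assumption, not part of the original file) ---
    # A typical driver for this class might look like the following; `opts`,
    # `params`, the id_to_* mappings and `train_batches` are hypothetical
    # placeholders:
    #
    #   model = MainTaggerModel(opts=opts, parameters=params, models_path="models/")
    #   model.save_mappings(id_to_word, id_to_char, id_to_tag, id_to_morpho_tag)
    #   model.build(**params)
    #   for epoch in range(max_epochs):
    #       for batch in train_batches:
    #           loss = model.get_loss(batch)
    #           loss.backward()
    #           model.trainer.update()
    #       model.save(epoch=epoch)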
Example #4
def print_data_status(fsp_dict, vocab_str):
    sys.stderr.write(
        "# {} = {}\n\tUnseen in dev/test = {}\n\tUnlearnt in dev/test = {}\n".
        format(vocab_str, fsp_dict.size(),
               fsp_dict.num_unks()[0],
               fsp_dict.num_unks()[1]))


print_data_status(VOCDICT, "Tokens")
print_data_status(POSDICT, "POS tags")
print_data_status(LUDICT, "LUs")
print_data_status(LUPOSDICT, "LU POS tags")
print_data_status(FRAMEDICT, "Frames")
sys.stderr.write("\n_____________________\n\n")

model = Model()
trainer = SimpleSGDTrainer(model)
# trainer = AdamTrainer(model, 0.0001, 0.01, 0.9999, 1e-8)

v_x = model.add_lookup_parameters((VOCDICT.size(), TOKDIM))
p_x = model.add_lookup_parameters((POSDICT.size(), POSDIM))
lu_x = model.add_lookup_parameters((LUDICT.size(), LUDIM))
lp_x = model.add_lookup_parameters((LUPOSDICT.size(), LPDIM))
if USE_WV:
    e_x = model.add_lookup_parameters((VOCDICT.size(), PRETRAINED_DIM))
    for wordid in pretrained_embeddings_map:
        e_x.init_row(wordid, pretrained_embeddings_map[wordid])

    # Embedding for unknown pretrained embedding.
    u_x = model.add_lookup_parameters((1, PRETRAINED_DIM), init='glorot')
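    # Note (assumption, not shown in this snippet): at feature-extraction time the
    # pretrained table e_x is typically consulted per token id, with u_x[0] as the
    # fallback row for tokens that had no pretrained vector, roughly:
    #   emb = e_x[tokid] if tokid in pretrained_embeddings_map else u_x[0]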