Example #1
File: rank_nn.py  Project: fooyou/tgen
class EmbNNRanker(NNRanker):
    """A ranker using MR and tree embeddings in a NN."""

    UNK_SLOT = 0
    UNK_VALUE = 1
    UNK_T_LEMMA = 2
    UNK_FORMEME = 3
    MIN_VALID = 4

    def __init__(self, cfg):
        super(EmbNNRanker, self).__init__(cfg)
        self.num_hidden_units = cfg.get('num_hidden_units', 512)
        self.initialization = cfg.get('initialization', 'uniform_glorot10')
        self.emb_size = cfg.get('emb_size', 20)

        self.dict_slot = {'UNK_SLOT': self.UNK_SLOT}
        self.dict_value = {'UNK_VALUE': self.UNK_VALUE}
        self.dict_t_lemma = {'UNK_T_LEMMA': self.UNK_T_LEMMA}
        self.dict_formeme = {'UNK_FORMEME': self.UNK_FORMEME}

        self.max_da_len = cfg.get('max_da_len', 10)
        self.max_tree_len = cfg.get('max_tree_len', 20)

        self.nn_shape = cfg.get('nn_shape', 'ff')
        self.normgrad = cfg.get('normgrad', False)

    def _init_training(self, das_file, ttree_file, data_portion):
        """Load the training data (via the superclass), build the dictionaries
        and the neural network, and precompute the embedding indices for all
        training tree-DA pairs."""
        super(EmbNNRanker, self)._init_training(das_file, ttree_file, data_portion)
        self._init_dict()
        self._init_neural_network()

        self.train_feats = [self._extract_feats(tree, da)
                            for tree, da in zip(self.train_trees, self.train_das)]

        self.w_after_iter = []
        self.update_weights_sum()

    def _init_dict(self):
        """Initialize word -> integer dictionaries, starting from a minimum
        valid value, always adding a new integer to unknown values to prevent
        clashes among different types of inputs."""
        dict_ord = self.MIN_VALID

        for da in self.train_das:
            for dai in da:
                if dai.name not in self.dict_slot:
                    self.dict_slot[dai.name] = dict_ord
                    dict_ord += 1
                if dai.value not in self.dict_value:
                    self.dict_value[dai.value] = dict_ord
                    dict_ord += 1

        for tree in self.train_trees:
            for t_lemma, formeme in tree.nodes:
                if t_lemma not in self.dict_t_lemma:
                    self.dict_t_lemma[t_lemma] = dict_ord
                    dict_ord += 1
                if formeme not in self.dict_formeme:
                    self.dict_formeme[formeme] = dict_ord
                    dict_ord += 1

        self.dict_size = dict_ord
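
    # Worked example (hypothetical data): after a single DA with two DAIs,
    # say food=Chinese and area=centre, the dictionaries would hold
    #   dict_slot  = {'UNK_SLOT': 0,  'food': 4,    'area': 6}
    #   dict_value = {'UNK_VALUE': 1, 'Chinese': 5, 'centre': 7}
    # Ids 0-3 remain reserved for the UNK_* constants (MIN_VALID == 4), and
    # all four dictionaries draw from one shared counter, so no integer ever
    # stands for two different kinds of input.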

    def _score(self, cand_embs):
        """Score a single candidate, wrapping it in a batch of one."""
        return self.nn.score([cand_embs[0]], [cand_embs[1]])[0]

    def _extract_feats(self, tree, da):
        """Extract DA and tree embeddings (return as a pair)."""

        # DA embeddings (slot - value; size == 2x self.max_da_len)
        da_emb_idxs = []
        for dai in da[:self.max_da_len]:
            da_emb_idxs.append([self.dict_slot.get(dai.name, self.UNK_SLOT),
                                self.dict_value.get(dai.value, self.UNK_VALUE)])

        # pad with "unknown"
        for _ in xrange(len(da_emb_idxs), self.max_da_len):
            da_emb_idxs.append([self.UNK_SLOT, self.UNK_VALUE])

        # tree embeddings (parent_lemma - formeme - lemma; size == 3x self.max_tree_len)
        tree_emb_idxs = []
        # indices start at 1 to skip the technical root node
        for parent_ord, (t_lemma, formeme) in zip(tree.parents[1:self.max_tree_len + 1],
                                                  tree.nodes[1:self.max_tree_len + 1]):
            tree_emb_idxs.append([self.dict_t_lemma.get(tree.nodes[parent_ord].t_lemma,
                                                        self.UNK_T_LEMMA),
                                  self.dict_formeme.get(formeme, self.UNK_FORMEME),
                                  self.dict_t_lemma.get(t_lemma, self.UNK_T_LEMMA)])

        # pad with unknown
        for _ in xrange(len(tree_emb_idxs), self.max_tree_len):
            tree_emb_idxs.append([self.UNK_T_LEMMA, self.UNK_FORMEME, self.UNK_T_LEMMA])

        return (da_emb_idxs, tree_emb_idxs)
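
    # The returned pair matches the two NN inputs: da_emb_idxs is a
    # max_da_len x 2 list of integer ids (slot, value) and tree_emb_idxs is a
    # max_tree_len x 3 list (parent lemma, formeme, lemma), both padded with
    # UNK_* ids. E.g., with max_da_len == 3 and the hypothetical ids above,
    # a two-DAI DA yields [[4, 5], [6, 7], [0, 1]] (the last row is padding).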

    def _init_neural_network(self):
        layers = [[Embedding('emb_das', self.dict_size, self.emb_size, 'uniform_005'),
                   Embedding('emb_trees', self.dict_size, self.emb_size, 'uniform_005')]]

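        # plain feed-forward net ('ff', 'ff3', 'ff4'): flatten and concatenate
        # both embedding sequences, then pass them through 2-4 tanh hidden
        # layers and a final linear scoring layer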
        if self.nn_shape.startswith('ff'):
            layers += [[Flatten('flat-da'), Flatten('flat-trees')],
                       [Concat('concat')],
                       [FeedForwardLayer('ff1',
                                         self.emb_size * 2 * self.max_da_len +
                                         self.emb_size * 3 * self.max_tree_len,
                                         self.num_hidden_units,
                                         T.tanh, self.initialization)],
                       [FeedForwardLayer('ff2', self.num_hidden_units, self.num_hidden_units,
                                         T.tanh, self.initialization)]]
            if self.nn_shape[-1] in ['3', '4']:
                layers += [[FeedForwardLayer('ff3', self.num_hidden_units, self.num_hidden_units,
                                             T.tanh, self.initialization)]]
            if self.nn_shape[-1] == '4':
                layers += [[FeedForwardLayer('ff4', self.num_hidden_units, self.num_hidden_units,
                                             T.tanh, self.initialization)]]
            layers += [[FeedForwardLayer('perc', self.num_hidden_units, 1,
                                         None, self.initialization)]]

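        # max-pooling over the token axis, optionally preceded by 1D
        # convolutions when nn_shape starts with 'conv', followed by
        # feed-forward layers and a linear scoring layer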
        elif 'maxpool-ff' in self.nn_shape:
            if self.nn_shape.startswith('conv'):
                layers += [[Conv1DLayer('conv_das', n_in=self.max_da_len, filter_length=4, stride=2,
                                        init=self.initialization, activation=T.tanh),
                            Conv1DLayer('conv_trees', n_in=self.max_tree_len, filter_length=9, stride=3,
                                        init=self.initialization, activation=T.tanh)]]
            layers += [[MaxPool1DLayer('mp_das', downscale_factor=self.max_da_len, stride=2),
                        MaxPool1DLayer('mp_trees', downscale_factor=self.max_tree_len, stride=3)],
                       [Concat('concat')],
                       [Flatten('flatten')],
                       # input width: 5 * emb_size = 2 DA + 3 tree sub-embeddings
                       [FeedForwardLayer('ff1', self.emb_size * 5, self.num_hidden_units,
                                         T.tanh, self.initialization)],
                       [FeedForwardLayer('ff2', self.num_hidden_units, self.num_hidden_units,
                                         T.tanh, self.initialization)],
                       [FeedForwardLayer('perc', self.num_hidden_units, 1,
                                         None, self.initialization)]]

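        # two separate towers for the DA and the tree, each flattened and sent
        # through one hidden layer ('dot') or two ('dot2'), joined at the top
        # by a dot product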
        elif self.nn_shape.startswith('dot'):
            layers += [[Flatten('flat-das'), Flatten('flat-trees')],
                       [FeedForwardLayer('ff-das', self.emb_size * 2 * self.max_da_len, self.num_hidden_units,
                                         T.tanh, self.initialization),
                        FeedForwardLayer('ff-trees', self.emb_size * 3 * self.max_tree_len, self.num_hidden_units,
                                         T.tanh, self.initialization)]]
            if self.nn_shape.endswith('2'):
                layers += [[FeedForwardLayer('ff2-das', self.num_hidden_units, self.num_hidden_units,
                                             T.tanh, self.initialization),
                            FeedForwardLayer('ff2-trees', self.num_hidden_units, self.num_hidden_units,
                                             T.tanh, self.initialization)]]
            layers += [[DotProduct('dot')]]

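        # max-pool each input down to one vector per sub-embedding, then one
        # hidden layer per tower, joined by a dot product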
        elif self.nn_shape == 'maxpool-dot':
            layers += [[MaxPool1DLayer('mp_das', downscale_factor=self.max_da_len, stride=2),
                        MaxPool1DLayer('mp_trees', downscale_factor=self.max_tree_len, stride=3)],
                       [Flatten('flat-das'), Flatten('flat-trees')],
                       [FeedForwardLayer('ff-das', self.emb_size * 2, self.num_hidden_units,
                                         T.tanh, self.initialization),
                        FeedForwardLayer('ff-trees', self.emb_size * 3, self.num_hidden_units,
                                         T.tanh, self.initialization)],
                       [DotProduct('dot')]]

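        # average-pooling variant: MaxPool1DLayer with pooling_func=T.mean,
        # then one hidden layer per tower and a dot product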
        elif self.nn_shape == 'avgpool-dot':
            layers += [[MaxPool1DLayer('mp_das', downscale_factor=self.max_da_len, stride=2, pooling_func=T.mean),
                        MaxPool1DLayer('mp_trees', downscale_factor=self.max_tree_len, stride=3, pooling_func=T.mean)],
                       [FeedForwardLayer('ff-das', self.emb_size * 2, self.num_hidden_units,
                                         T.tanh, self.initialization),
                        FeedForwardLayer('ff-trees', self.emb_size * 3, self.num_hidden_units,
                                         T.tanh, self.initialization)],
                       [DotProduct('dot')]]

        # input: batch * word * sub-embeddings
        self.nn = NN(layers=layers, input_num=2, input_type=T.itensor3, normgrad=self.normgrad)
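
    # Both NN inputs are Theano int32 3-D tensors (T.itensor3): a batch of DA
    # index lists and a batch of tree index lists, shaped batch x token x
    # sub-embedding exactly as produced by _extract_feats().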

    def _update_nn(self, bad_feats, good_feats, rate):
        """Perform one NN update, wrapping the features in single-element
        batches so they fit the parallelized scoring interface."""
        # TODO: this is just adding another dimension to fit the parallelized scoring
        # (even if updates are not parallelized). Make it nicer.
        bad_feats = ([bad_feats[0]], [bad_feats[1]])
        good_feats = ([good_feats[0]], [good_feats[1]])

        cost_gcost = self.nn.update(*(bad_feats + good_feats + (rate,)))
        log_debug('Cost:' + str(cost_gcost[0]))
        param_vals = [param.get_value() for param in self.nn.params]
        log_debug('Param norms : ' + str(self._l2s(param_vals)))
        log_debug('Gparam norms: ' + str(self._l2s(cost_gcost[1:])))
        # debug output: L2 norms of hard-coded row blocks of the first-layer
        # weight matrix (the 100/200/350/500 row boundaries are diagnostic only)
        l1_params = param_vals[2]
        log_debug('Layer 1 parts :' + str(self._l2s([l1_params[0:100, :], l1_params[100:200, :],
                                                     l1_params[200:350, :], l1_params[350:500, :],
                                                     l1_params[500:, :]])))
        l1_gparams = cost_gcost[3]
        log_debug('Layer 1 gparts:' + str(self._l2s([l1_gparams[0:100, :], l1_gparams[100:200, :],
                                                     l1_gparams[200:350, :], l1_gparams[350:500, :],
                                                     l1_gparams[500:, :]])))
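
    # Note: nn.update() appears to return [cost, gradient_1, ..., gradient_n]
    # with the gradients aligned to self.nn.params; this is why param_vals[2]
    # is paired with cost_gcost[3] above (offset by one for the cost).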

    def _embs_to_str(self):
        """Dump the current embedding values as CSV text, one line per
        dictionary entry (the key, then its embedding vector)."""
        out = ""
        da_emb = self.nn.layers[0][0].e.get_value()
        tree_emb = self.nn.layers[0][1].e.get_value()
        for idx, emb in enumerate(da_emb):
            for key, val in self.dict_slot.items():
                if val == idx:
                    out += key + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
            for key, val in self.dict_value.items():
                if val == idx:
                    out += key + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
        for idx, emb in enumerate(tree_emb):
            for key, val in self.dict_t_lemma.items():
                if val == idx:
                    out += str(key) + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
            for key, val in self.dict_formeme.items():
                if val == idx:
                    out += str(key) + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
        return out

    def _l2s(self, params):
        """Compute L2-norm of all members of the given list."""
        return [np.linalg.norm(param) for param in params]

    def store_iter_weights(self):
        """Remember the current weights to be used for averaged perceptron."""
        # fh = open('embs.txt', 'a')
        # print >> fh, '---', self._embs_to_str()
        # fh.close()
        self.w_after_iter.append(self.nn.get_param_values())

    def score_all(self, trees, da):
        """Score all candidate trees against the given DA in one batched NN call."""
        cand_embs = [self._extract_feats(tree, da) for tree in trees]
        score = self.nn.score([emb[0] for emb in cand_embs], [emb[1] for emb in cand_embs])
        return np.atleast_1d(score[0])
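
Usage sketch (hypothetical): the config keys are exactly those read in
EmbNNRanker.__init__ above, but the file names, the candidate trees and the
direct _init_training() call are made-up stand-ins for the full tgen pipeline.

    cfg = {'nn_shape': 'ff3',               # 3 tanh hidden layers + linear 'perc' layer
           'num_hidden_units': 512,
           'emb_size': 20,
           'max_da_len': 10,
           'max_tree_len': 20,
           'initialization': 'uniform_glorot10',
           'normgrad': False}
    ranker = EmbNNRanker(cfg)
    ranker._init_training('train-das.txt', 'train-ttrees.yaml.gz', 1.0)  # hypothetical paths
    scores = ranker.score_all(candidate_trees, da)  # scores all candidates in one NN call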