Example #1
    def __init__(self, model_path, word_dim=None, afix_dim=None, nlayers=2,
            hidden_dim=128, elu_dim=64, dep_dim=100, dropout_ratio=0.5, use_cudnn=False):
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            self.train = False
            Param.load(self, defs_file)
            self.extractor = FeatureExtractor(model_path)
        else:
            self.train = True
            p = Param(self)
            p.dep_dim = dep_dim
            p.word_dim = word_dim
            p.afix_dim = afix_dim
            p.hidden_dim = hidden_dim
            p.elu_dim = elu_dim
            p.nlayers = nlayers
            p.dump(defs_file)

        self.targets = read_model_defs(model_path + "/target.txt")
        self.words = read_model_defs(model_path + "/words.txt")
        self.suffixes = read_model_defs(model_path + "/suffixes.txt")
        self.prefixes = read_model_defs(model_path + "/prefixes.txt")
        self.in_dim = self.word_dim + 8 * self.afix_dim
        self.dropout_ratio = dropout_ratio
        super(PeepHoleLSTMParser, self).__init__(
                emb_word=L.EmbedID(len(self.words), self.word_dim, ignore_label=IGNORE),
                emb_suf=L.EmbedID(len(self.suffixes), self.afix_dim, ignore_label=IGNORE),
                emb_prf=L.EmbedID(len(self.prefixes), self.afix_dim, ignore_label=IGNORE),
                lstm_f1=DyerLSTM(self.in_dim, self.hidden_dim),
                lstm_f2=DyerLSTM(self.hidden_dim, self.hidden_dim),
                lstm_b1=DyerLSTM(self.in_dim, self.hidden_dim),
                lstm_b2=DyerLSTM(self.hidden_dim, self.hidden_dim),
                linear_cat1=L.Linear(2 * self.hidden_dim, self.elu_dim),
                linear_cat2=L.Linear(self.elu_dim, len(self.targets)),
                linear_dep=L.Linear(2 * self.hidden_dim, self.dep_dim),
                linear_head=L.Linear(2 * self.hidden_dim, self.dep_dim),
                biaffine=Biaffine(self.dep_dim)
                )
Example #2
    def __init__(self, model_path, ccgbank_path, tritrain_path, weight):
        self.model_path = model_path
        self.targets = read_model_defs(model_path + "/target.txt")
        self.extractor = FeatureExtractor(model_path)
        self.weight = weight
        self.ncopies = 15
        with open(ccgbank_path) as f:
            self.ccgbank_samples = sorted(json.load(f),
                                          key=lambda x: len(x[1][0]))
            self.ccgbank_size = len(self.ccgbank_samples)
        with open(tritrain_path) as f:
            self.tritrain_samples = sorted(json.load(f),
                                           key=lambda x: len(x[1][0]))
            self.tritrain_size = len(self.tritrain_samples)

        print >> sys.stderr, "len(ccgbank):", self.ccgbank_size
        print >> sys.stderr, "len(ccgbank) * # copies:", self.ccgbank_size * self.ncopies
        print >> sys.stderr, "len(tritrain):", self.tritrain_size
Example #3
class LSTMParserDataset(chainer.dataset.DatasetMixin):
    def __init__(self, model_path, samples_path):
        self.model_path = model_path
        self.targets = read_model_defs(model_path + "/target.txt")
        self.extractor = FeatureExtractor(model_path)
        with open(samples_path) as f:
            self.samples = sorted(json.load(f), key=lambda x: len(x[1][0]))

    def __len__(self):
        return len(self.samples)

    def get_example(self, i):
        words, [cats, deps] = self.samples[i]
        splitted = words.split(" ")
        w, s, p = self.extractor.process(splitted)
        cats = np.array([-1] + [self.targets.get(x, IGNORE)
                                for x in cats] + [-1], 'i')
        deps = np.array([-1] + deps + [-1], 'i')
        l = len(splitted) + 2
        weight = np.array(1, 'f')
        return w, s, p, l, cats, deps, weight
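
A minimal usage sketch (an assumption, not part of the original example): the dataset class plugs into Chainer's standard iterators; the "model" directory and "traindata.json" file below are placeholder paths assumed to contain the target.txt definitions and the JSON samples read above.

import chainer

dataset = LSTMParserDataset("model", "model/traindata.json")
train_iter = chainer.iterators.SerialIterator(dataset, batch_size=16)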
Example #4
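# Imports assumed by this example (added for completeness; the project-local
# helpers such as FeatureExtractor, DyerLSTM, Biaffine, Param, read_model_defs,
# read_pretrained_embeddings and the IGNORE constant come from the surrounding
# codebase and their module paths are not shown in this snippet):
import chainer
import chainer.functions as F
import chainer.links as L
from chainer.dataset.convert import concat_examples
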
class PeepHoleLSTMParser(chainer.Chain):
    def __init__(self, model_path, word_dim=None, afix_dim=None, nlayers=2,
            hidden_dim=128, elu_dim=64, dep_dim=100, dropout_ratio=0.5, use_cudnn=False):
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            self.train = False
            Param.load(self, defs_file)
            self.extractor = FeatureExtractor(model_path)
        else:
            self.train = True
            p = Param(self)
            p.dep_dim = dep_dim
            p.word_dim = word_dim
            p.afix_dim = afix_dim
            p.hidden_dim = hidden_dim
            p.elu_dim = elu_dim
            p.nlayers = nlayers
            p.dump(defs_file)

        self.targets = read_model_defs(model_path + "/target.txt")
        self.words = read_model_defs(model_path + "/words.txt")
        self.suffixes = read_model_defs(model_path + "/suffixes.txt")
        self.prefixes = read_model_defs(model_path + "/prefixes.txt")
        self.in_dim = self.word_dim + 8 * self.afix_dim
        self.dropout_ratio = dropout_ratio
        super(PeepHoleLSTMParser, self).__init__(
                emb_word=L.EmbedID(len(self.words), self.word_dim, ignore_label=IGNORE),
                emb_suf=L.EmbedID(len(self.suffixes), self.afix_dim, ignore_label=IGNORE),
                emb_prf=L.EmbedID(len(self.prefixes), self.afix_dim, ignore_label=IGNORE),
                lstm_f1=DyerLSTM(self.in_dim, self.hidden_dim),
                lstm_f2=DyerLSTM(self.hidden_dim, self.hidden_dim),
                lstm_b1=DyerLSTM(self.in_dim, self.hidden_dim),
                lstm_b2=DyerLSTM(self.hidden_dim, self.hidden_dim),
                linear_cat1=L.Linear(2 * self.hidden_dim, self.elu_dim),
                linear_cat2=L.Linear(self.elu_dim, len(self.targets)),
                linear_dep=L.Linear(2 * self.hidden_dim, self.dep_dim),
                linear_head=L.Linear(2 * self.hidden_dim, self.dep_dim),
                biaffine=Biaffine(self.dep_dim)
                )

    def load_pretrained_embeddings(self, path):
        self.emb_word.W.data = read_pretrained_embeddings(path)

    def __call__(self, ws, ss, ps, cat_ts, dep_ts):
        """
        ws, ss, ps: padded word, suffix and prefix id matrices of shape (batch, length)
        cat_ts, dep_ts: gold supertag and dependency-head id matrices of the same shape
        """
        batchsize, length = ws.shape
        cat_ys, dep_ys = self.forward(ws, ss, ps)

        cat_ts = [F.reshape(x, (batchsize,)) for x \
                in F.split_axis(F.transpose(cat_ts), length, 0)]

        dep_ts = [F.reshape(x, (batchsize,)) for x \
                in F.split_axis(F.transpose(dep_ts), length, 0)]

        cat_loss = reduce(lambda x, y: x + y,
            [F.softmax_cross_entropy(y, t) for y, t in zip(cat_ys, cat_ts)])
        cat_acc = reduce(lambda x, y: x + y,
            [F.accuracy(y, t, ignore_label=IGNORE) for y, t in zip(cat_ys, cat_ts)])

        dep_loss = reduce(lambda x, y: x + y,
            [F.softmax_cross_entropy(y, t) for y, t in zip(dep_ys, dep_ts)])
        dep_acc = reduce(lambda x, y: x + y,
            [F.accuracy(y, t, ignore_label=IGNORE) for y, t in zip(dep_ys, dep_ts)])

        cat_acc /= length
        dep_acc /= length
        chainer.report({
            "tagging_loss": cat_loss,
            "tagging_accuracy": cat_acc,
            "parsing_loss": dep_loss,
            "parsing_accuracy": dep_acc
            }, self)
        return cat_loss + dep_loss

    def forward(self, ws, ss, ps):
        batchsize, length = ws.shape
        xp = chainer.cuda.get_array_module(ws[0])
        ws = self.emb_word(ws) # (batch, length, word_dim)
        ss = F.reshape(self.emb_suf(ss), (batchsize, length, -1))
        ps = F.reshape(self.emb_prf(ps), (batchsize, length, -1))
        hs = F.transpose(F.concat([ws, ss, ps], 2), (1, 0, 2))
        hs = F.dropout(hs, self.dropout_ratio, train=self.train)
        hs = F.split_axis(hs, length, 0)
        hs_f = []
        hs_b = []
        self._init_state()
        for h_in_f, h_in_b in zip(hs, reversed(hs)):
            h_f = self.lstm_f2(self.lstm_f1(F.reshape(h_in_f, (-1, self.in_dim))))
            hs_f.append(h_f)
            h_b = self.lstm_b2(self.lstm_b1(F.reshape(h_in_b, (-1, self.in_dim))))
            hs_b.append(h_b)

        hs = zip(hs_f, reversed(hs_b))

        cat_ys = [self.linear_cat2(F.dropout(
            F.elu(self.linear_cat1(h)), 0.5, train=self.train)) for h in hs]

        dep_ys = [self.biaffine(
            F.elu(F.dropout(self.linear_dep(h), 0.32, train=self.train)),
            F.elu(F.dropout(self.linear_head(h), 0.32, train=self.train))) for h in hs]

        return cat_ys, dep_ys

    def predict(self, xs):
        """
        xs: list of tokenized sentences
        """
        batchsize = len(xs)
        # Keep the raw sentences in xs; their lengths are needed below.
        fs = [self.extractor.process(x) for x in xs]
        ws, ss, ps = concat_examples(fs, padding=IGNORE)
        cat_ys, dep_ys = self.forward(ws, ss, ps)
        cat_ys = F.transpose(F.stack(cat_ys, 2), (0, 2, 1))
        dep_ys = F.transpose(F.stack(dep_ys, 2), (0, 2, 1))

        cat_ys = [F.squeeze(y, 0).data[1:len(x) + 1] for x, y in \
                zip(xs, F.split_axis(cat_ys, batchsize, 0))]

        dep_ys = [F.log_softmax(F.squeeze(y, 0)[1:len(x) + 1, :-1]).data \
                for x, y in zip(xs, F.split_axis(dep_ys, batchsize, 0))]
        return zip(cat_ys, dep_ys)

    def predict_doc(self, doc, batchsize=16):
        """
        doc: list of tokenized sentences
        """
        res = []
        for i in range(0, len(doc), batchsize):
            res.extend([(i + j, 0, y)
                for j, y in enumerate(self.predict(doc[i:i + batchsize]))])
        return res

    def _init_state(self):
        self.lstm_f1.reset_state()
        self.lstm_f2.reset_state()
        self.lstm_b1.reset_state()
        self.lstm_b2.reset_state()

    @property
    def cats(self):
        return zip(*sorted(self.targets.items(), key=lambda x: x[1]))[0]
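
A minimal inference sketch (an assumption, not part of the original example): constructing the parser without word_dim loads the hyperparameters from tagger_defs.txt and builds the FeatureExtractor, after which predict() accepts pre-tokenized sentences; the "model" directory and parameter file name below are placeholders.

import chainer

parser = PeepHoleLSTMParser("model")
chainer.serializers.load_npz("model/tagger_model", parser)
scored = parser.predict([["This", "is", "a", "test", "."]])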
Example #5
    def __init__(self,
                 model_path,
                 word_dim=None,
                 afix_dim=None,
                 nlayers=2,
                 hidden_dim=128,
                 elu_dim=64,
                 dep_dim=100,
                 dropout_ratio=0.5,
                 use_cudnn=False):
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            self.train = False
            Param.load(self, defs_file)
            self.extractor = FeatureExtractor(model_path)
        else:
            self.train = True
            p = Param(self)
            p.dep_dim = dep_dim
            p.word_dim = word_dim
            p.afix_dim = afix_dim
            p.hidden_dim = hidden_dim
            p.elu_dim = elu_dim
            p.nlayers = nlayers
            p.n_words = len(read_model_defs(model_path + "/words.txt"))
            p.n_suffixes = len(read_model_defs(model_path + "/suffixes.txt"))
            p.n_prefixes = len(read_model_defs(model_path + "/prefixes.txt"))
            p.targets = read_model_defs(model_path + "/target.txt")
            p.dump(defs_file)

        self.in_dim = self.word_dim + 8 * self.afix_dim
        self.dropout_ratio = dropout_ratio
        super(QRNNParser,
              self).__init__(emb_word=L.EmbedID(self.n_words,
                                                self.word_dim,
                                                ignore_label=IGNORE),
                             emb_suf=L.EmbedID(self.n_suffixes,
                                               self.afix_dim,
                                               ignore_label=IGNORE),
                             emb_prf=L.EmbedID(self.n_prefixes,
                                               self.afix_dim,
                                               ignore_label=IGNORE),
                             qrnn_fs=ChainList(),
                             qrnn_bs=ChainList(),
                             arc_dep=L.Linear(2 * self.hidden_dim,
                                              self.dep_dim),
                             arc_head=L.Linear(2 * self.hidden_dim,
                                               self.dep_dim),
                             rel_dep=L.Linear(2 * self.hidden_dim,
                                              self.dep_dim),
                             rel_head=L.Linear(2 * self.hidden_dim,
                                               self.dep_dim),
                             biaffine_arc=Biaffine(self.dep_dim),
                             biaffine_tag=Bilinear(self.dep_dim, self.dep_dim,
                                                   len(self.targets)))
        in_dim = self.in_dim
        for _ in range(self.nlayers):
            self.qrnn_fs.add_link(QRNNLayer(in_dim, self.hidden_dim))
            self.qrnn_bs.add_link(QRNNLayer(in_dim, self.hidden_dim))
            in_dim = self.hidden_dim
Example #6
class QRNNParser(chainer.Chain):
    def __init__(self,
                 model_path,
                 word_dim=None,
                 afix_dim=None,
                 nlayers=2,
                 hidden_dim=128,
                 elu_dim=64,
                 dep_dim=100,
                 dropout_ratio=0.5,
                 use_cudnn=False):
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            self.train = False
            Param.load(self, defs_file)
            self.extractor = FeatureExtractor(model_path)
        else:
            self.train = True
            p = Param(self)
            p.dep_dim = dep_dim
            p.word_dim = word_dim
            p.afix_dim = afix_dim
            p.hidden_dim = hidden_dim
            p.elu_dim = elu_dim
            p.nlayers = nlayers
            p.n_words = len(read_model_defs(model_path + "/words.txt"))
            p.n_suffixes = len(read_model_defs(model_path + "/suffixes.txt"))
            p.n_prefixes = len(read_model_defs(model_path + "/prefixes.txt"))
            p.targets = read_model_defs(model_path + "/target.txt")
            p.dump(defs_file)

        self.in_dim = self.word_dim + 8 * self.afix_dim
        self.dropout_ratio = dropout_ratio
        super(QRNNParser,
              self).__init__(emb_word=L.EmbedID(self.n_words,
                                                self.word_dim,
                                                ignore_label=IGNORE),
                             emb_suf=L.EmbedID(self.n_suffixes,
                                               self.afix_dim,
                                               ignore_label=IGNORE),
                             emb_prf=L.EmbedID(self.n_prefixes,
                                               self.afix_dim,
                                               ignore_label=IGNORE),
                             qrnn_fs=ChainList(),
                             qrnn_bs=ChainList(),
                             arc_dep=L.Linear(2 * self.hidden_dim,
                                              self.dep_dim),
                             arc_head=L.Linear(2 * self.hidden_dim,
                                               self.dep_dim),
                             rel_dep=L.Linear(2 * self.hidden_dim,
                                              self.dep_dim),
                             rel_head=L.Linear(2 * self.hidden_dim,
                                               self.dep_dim),
                             biaffine_arc=Biaffine(self.dep_dim),
                             biaffine_tag=Bilinear(self.dep_dim, self.dep_dim,
                                                   len(self.targets)))
        in_dim = self.in_dim
        for _ in range(self.nlayers):
            self.qrnn_fs.add_link(QRNNLayer(in_dim, self.hidden_dim))
            self.qrnn_bs.add_link(QRNNLayer(in_dim, self.hidden_dim))
            in_dim = self.hidden_dim
            # in_dim += self.hidden_dim

    def load_pretrained_embeddings(self, path):
        self.emb_word.W.data = read_pretrained_embeddings(path)

    def __call__(self, ws, ss, ps, ls, cat_ts, dep_ts, weights):
        """
        xs [(w,s,p,y), ..., ]
        w: word, s: suffix, p: prefix, y: label
        """
        try:
            batchsize, length = ws.shape
            cat_ys, dep_ys = self.forward(ws, ss, ps, ls,
                                          dep_ts if self.train else None)

            cat_loss = reduce(lambda x, y: x + y,
                        [we * F.softmax_cross_entropy(y, t) \
                            for y, t, we in zip(cat_ys, cat_ts, weights)])
            cat_acc = reduce(lambda x, y: x + y,
                        [F.accuracy(y, t, ignore_label=IGNORE) \
                                for y, t in zip(cat_ys, cat_ts)]) / batchsize

            dep_loss = reduce(lambda x, y: x + y,
                        [we * F.softmax_cross_entropy(y, t) \
                            for y, t, we in zip(dep_ys, dep_ts, weights)])
            dep_acc = reduce(lambda x, y: x + y,
                        [F.accuracy(y, t, ignore_label=IGNORE) \
                                for y, t in zip(dep_ys, dep_ts)]) / batchsize
        except Exception:
            # Log the offending batch shapes and skip the batch with a zero loss.
            print "caught erroneous example, ignoring..."
            print [w.shape for w in ws]
            print [w.shape for w in ss]
            print [w.shape for w in ps]
            print ls
            print [w.shape for w in cat_ts]
            print [w.shape for w in dep_ts]
            xp = chainer.cuda.get_array_module(ws[0])
            return Variable(xp.array(0, 'f'))

        chainer.report(
            {
                "tagging_loss": cat_loss,
                "tagging_accuracy": cat_acc,
                "parsing_loss": dep_loss,
                "parsing_accuracy": dep_acc
            }, self)
        return cat_loss + dep_loss

    def forward(self, ws, ss, ps, ls, dep_ts=None):
        batchsize, length = ws.shape
        split = scanl(lambda x, y: x + y, 0, ls)[1:-1]
        xp = chainer.cuda.get_array_module(ws[0])
        ws = self.emb_word(ws)  # (batch, length, word_dim)
        ss = F.reshape(self.emb_suf(ss), (batchsize, length, -1))
        ps = F.reshape(self.emb_prf(ps), (batchsize, length, -1))
        hs = F.concat([ws, ss, ps], 2)
        hs = F.dropout(hs, self.dropout_ratio, train=self.train)
        fs = hs
        for qrnn_f in self.qrnn_fs:
            inp = fs
            fs = qrnn_f(inp)

        bs = hs[:, ::-1, :]
        for qrnn_b in self.qrnn_bs:
            inp = bs
            bs = qrnn_b(inp)

        # fs = [hs]
        # for qrnn_f in self.qrnn_fs:
        #     inp = F.concat(fs, 2)
        #     fs.append(F.dropout(qrnn_f(inp), 0.32, train=self.train))
        # fs = fs[-1]
        #
        # bs = [hs[:, ::-1, :]]
        # for qrnn_b in self.qrnn_bs:
        #     inp = F.concat(bs, 2)
        #     bs.append(F.dropout(qrnn_b(inp), 0.32, train=self.train))
        # bs = bs[-1]
        #
        hs = F.concat([fs, bs[:, ::-1, :]], 2)

        _, hs_len, hidden = hs.shape
        hs = [F.reshape(var, (hs_len, hidden))[:l] for l, var in \
                zip(ls, F.split_axis(hs, batchsize, 0))]

        dep_ys = [
            self.biaffine_arc(
                F.elu(F.dropout(self.arc_dep(h), 0.32, train=self.train)),
                F.elu(F.dropout(self.arc_head(h), 0.32, train=self.train)))
            for h in hs
        ]

        if dep_ts is not None:
            heads = dep_ts
        else:
            heads = [F.argmax(y, axis=1) for y in dep_ys]

        heads = F.elu(F.dropout(
            self.rel_head(
                F.vstack([F.embed_id(t, h, ignore_label=IGNORE) \
                        for h, t in zip(hs, heads)])),
            0.32, train=self.train))

        childs = F.elu(
            F.dropout(self.rel_dep(F.vstack(hs)), 0.32, train=self.train))
        cat_ys = self.biaffine_tag(childs, heads)

        cat_ys = list(F.split_axis(cat_ys, split, 0))

        return cat_ys, dep_ys

    def predict(self, xs):
        """
        xs: list of tokenized sentences
        """
        fs = [self.extractor.process(x) for x in xs]
        ws, ss, ps = concat_examples(fs)
        ls = [len(x) + 2 for x in xs]
        cat_ys, dep_ys = self.forward(ws, ss, ps, ls)
        return zip([F.log_softmax(y[1:-1]).data for y in cat_ys],
                   [F.log_softmax(y[1:-1, :-1]).data for y in dep_ys])

    def predict_doc(self, doc, batchsize=16):
        """
        doc: list of tokenized sentences
        """
        res = []
        for i in range(0, len(doc), batchsize):
            res.extend([
                (i + j, 0, y)
                for j, y in enumerate(self.predict(doc[i:i + batchsize]))
            ])
        return res

    @property
    def cats(self):
        return zip(*sorted(self.targets.items(), key=lambda x: x[1]))[0]
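
A minimal document-level inference sketch (an assumption, not part of the original example): predict_doc() batches the document internally and returns (sentence_index, 0, scores) tuples, where scores pairs the per-sentence category log-probabilities with the dependency-head log-probabilities from predict(); paths below are placeholders.

import chainer

parser = QRNNParser("model")
chainer.serializers.load_npz("model/tagger_model", parser)
results = parser.predict_doc([["Time", "flies", "."], ["It", "works", "."]], batchsize=16)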
Example #7
    def __init__(self, model_path, samples_path):
        self.model_path = model_path
        self.targets = read_model_defs(model_path + "/target.txt")
        self.extractor = FeatureExtractor(model_path)
        with open(samples_path) as f:
            self.samples = sorted(json.load(f), key=lambda x: len(x[1][0]))