Exemplo n.º 1
0
 def _create_samples(self, trees):
     """Record one sample per tree: normalized sentence -> list of category strings."""
     for t in trees:
         leaves = get_leaves(t)
         sentence = " ".join(normalize(leaf.word) for leaf in leaves)
         self.sents.append(sentence)
         self.samples[sentence] = [str(leaf.cat) for leaf in leaves]
Exemplo n.º 2
0
 def _create_samples(self, trees):
     """Record one sample per tree: sentence -> space-joined supertag sequence."""
     for t in trees:
         leaves = get_leaves(t)
         sentence = " ".join(leaf.word for leaf in leaves)
         tag_seq = " ".join(leaf.cat.without_semantics for leaf in leaves)
         self.sents.append(sentence)
         self.samples[sentence] = tag_seq
Exemplo n.º 3
0
 def _create_samples(self, trees):
     """Map each sentence to a (categories, dependencies) pair."""
     for t in trees:
         leaves = get_leaves(t)
         cat_list = [leaf.cat.without_semantics for leaf in leaves]
         dep_list = self._get_dependencies(t, len(leaves))
         sentence = " ".join(leaf.word for leaf in leaves)
         self.sents.append(sentence)
         self.samples[sentence] = cat_list, dep_list
Exemplo n.º 4
0
 def _create_samples(self, trees):
     """Append a (sentence, POS tags, (categories, dependencies)) tuple per tree."""
     for t in trees:
         leaves = get_leaves(t)
         sentence = " ".join(normalize(leaf.word) for leaf in leaves)
         pos_tags = [leaf.tag for leaf in leaves]
         cat_list = [str(leaf.cat) for leaf in leaves]
         dep_list = self._get_dependencies(t, len(leaves))
         self.sents.append(sentence)
         self.samples.append((sentence, pos_tags, (cat_list, dep_list)))
Exemplo n.º 5
0
def _worker(inp):
    """Convert one AUTO file into windowed-feature training lines.

    *inp* is an (autofile, window_size) pair. Returns a list of strings,
    one per leaf: the "|"-joined features of each context slot, a space,
    the leaf's category, and a trailing newline.
    """
    autofile, window_size = inp
    lines = []
    for tree in AutoReader(autofile).readall(suppress_error=True):
        leaves = get_leaves(tree)
        feats = map(feature_extract, [leaf.word for leaf in leaves])
        # lpad / rpad are module-level padding values shared by workers.
        contexts = get_context_by_window(
                feats, window_size, lpad=lpad, rpad=rpad)
        for leaf, context in zip(leaves, contexts):
            joined = " ".join("|".join(slot) for slot in context)
            lines.append(joined + " " + str(leaf.cat) + "\n")
    return lines
Exemplo n.º 6
0
def test():
    """Smoke test: parse every line of test.ccgbank and show short derivations."""
    sents = [l.strip().decode("utf-8") for l in open("test.ccgbank")]
    # Hard-coded sanity parse; the result is overwritten by the loop below.
    tree = JaCCGLineReader(
        "{< NP {(S\\NP){I2}_none test} {(S\\NP){I2}_none test}}".decode(
            "utf-8")).parse()
    for sent in sents:
        if not sent:
            continue
        tree = JaCCGLineReader(sent).parse()
        # Only display small (< 10 leaves), non-trivial derivations.
        if len(get_leaves(tree)) < 10 and not isinstance(tree, Leaf):
            tree.show_derivation()
Exemplo n.º 7
0
 def create_testdata(self, outdir):
     """Build windowed test samples from self.filepath and dump them to *outdir*.

     Writes testdata.json (context window -> category map) and
     testsents.txt (one UTF-8-encoded sentence per line).
     """
     for tree in JaCCGReader(self.filepath).readall():
         leaves = get_leaves(tree)
         surface = [leaf.word for leaf in leaves]
         self.sents.append(" ".join(surface))
         tag_seq = [leaf.cat.without_semantics for leaf in leaves]
         windows = get_context_by_window(surface,
                                         CONTEXT,
                                         lpad=LPAD,
                                         rpad=RPAD)
         # One context window per token, so the two sequences must align.
         assert len(windows) == len(tag_seq)
         for tag, window in zip(tag_seq, windows):
             self.samples[" ".join(window)] = tag
     with open(outdir + "/testdata.json", "w") as f:
         json.dump(self.samples, f)
     with open(outdir + "/testsents.txt", "w") as f:
         for sent in self.sents:
             f.write(sent.encode("utf-8") + "\n")
Exemplo n.º 8
0
    def create_traindata(self, outdir):
        """Build frequency-filtered training samples and dump everything to *outdir*.

        A first pass over the trees fills the frequency dictionaries via
        self._traverse; a second pass keeps only samples whose category
        count meets self.cat_freq_cut. Writes unary_rules.txt,
        seen_rules.txt, target.txt, words.txt, chars.txt, traindata.json
        and trainsents.txt.
        """
        trees = JaCCGReader(self.filepath).readall()
        # Pass 1: only populate the dictionaries (cats, words, ...).
        for tree in trees:
            self._traverse(tree)
        # Pass 2: emit samples, filtered by category frequency.
        for tree in trees:
            leaves = get_leaves(tree)
            surface = [leaf.word for leaf in leaves]
            self.sents.append(" ".join(surface))
            tag_seq = [leaf.cat.without_semantics for leaf in leaves]
            windows = get_context_by_window(surface,
                                            CONTEXT,
                                            lpad=LPAD,
                                            rpad=RPAD)
            # One context window per token, so the two sequences must align.
            assert len(windows) == len(tag_seq)
            for tag, window in zip(tag_seq, windows):
                if self.cats[tag] >= self.cat_freq_cut:
                    self.samples[" ".join(window)] = tag

        # Drop rare entries before writing the dictionaries out.
        self.cats = {c: n for (c, n) in self.cats.items()
                     if n >= self.cat_freq_cut}
        self.words = {w: n for (w, n) in self.words.items()
                      if n >= self.word_freq_cut}
        # Rule tables get their values commented out; vocab tables do not.
        for fname, table, commented in [
                ("unary_rules.txt", self.unary_rules, True),
                ("seen_rules.txt", self.seen_rules, True),
                ("target.txt", self.cats, False),
                ("words.txt", self.words, False),
                ("chars.txt", self.chars, False)]:
            with open(outdir + "/" + fname, "w") as f:
                self._write(table, f, comment_out_value=commented)
        with open(outdir + "/traindata.json", "w") as f:
            json.dump(self.samples, f)
        with open(outdir + "/trainsents.txt", "w") as f:
            for sent in self.sents:
                f.write(sent.encode("utf-8") + "\n")