Example #1
def _worker(inp):
    autofile, window_size = inp
    res = []
    for tree in AutoReader(autofile).readall(suppress_error=True):
        leaves = get_leaves(tree)
        # One feature tuple per word, then a fixed-width context window
        # over the sequence (lpad/rpad are module-level padding sentinels).
        feats = [feature_extract(leaf.word) for leaf in leaves]
        contexts = get_context_by_window(
                feats, window_size, lpad=lpad, rpad=rpad)
        for leaf, context in zip(leaves, contexts):
            res.append(" ".join("|".join(c) for c in context)
                       + " " + str(leaf.cat) + "\n")
    return res
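None of the snippets include `get_context_by_window` itself. A minimal sketch of what it presumably does, assuming a fixed-radius sliding window whose edges are filled with the `lpad`/`rpad` sentinels (the real function's behavior may differ):

def get_context_by_window(items, window_size, lpad, rpad):
    # Hypothetical reference implementation: for each position i, return
    # the window_size items on either side of items[i], padding both edges
    # with the lpad/rpad sentinels so every window has the same width
    # (2 * window_size + 1).
    items = list(items)  # accept any iterable, including map objects
    padded = [lpad] * window_size + items + [rpad] * window_size
    return [padded[i:i + 2 * window_size + 1] for i in range(len(items))]

Under this reading, the function returns exactly one window per input item, which is what the `assert len(samples) == len(cats)` checks in the examples below rely on.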
Example #2
def create_testdata(self, outdir):
    trees = JaCCGReader(self.filepath).readall()
    for tree in trees:
        tokens = get_leaves(tree)
        words = [token.word for token in tokens]
        self.sents.append(" ".join(words))
        cats = [token.cat.without_semantics for token in tokens]
        samples = get_context_by_window(words,
                                        CONTEXT,
                                        lpad=LPAD,
                                        rpad=RPAD)
        # One context window per token, so samples and cats align.
        assert len(samples) == len(cats)
        for cat, sample in zip(cats, samples):
            self.samples[" ".join(sample)] = cat
    with open(outdir + "/testdata.json", "w") as f:
        json.dump(self.samples, f)
    # Open in text mode with an explicit encoding instead of writing
    # encoded bytes (str + bytes raises TypeError in Python 3).
    with open(outdir + "/testsents.txt", "w", encoding="utf-8") as f:
        for sent in self.sents:
            f.write(sent + "\n")
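Examples #2 through #4 rely on module-level constants CONTEXT, LPAD and RPAD that the snippets do not show. A plausible set of definitions, purely as an assumption, together with the shape of the resulting testdata.json:

# Hypothetical constant definitions; the real values live elsewhere in
# the original module and are not shown in these snippets.
CONTEXT = 3        # window radius: tokens taken on each side
LPAD = "LPAD"      # left padding sentinel
RPAD = "RPAD"      # right padding sentinel

# With these values, each testdata.json entry maps a space-joined
# context window to its gold category, e.g. (illustrative only):
#   "LPAD LPAD LPAD word1 word2 word3 word4": "NP"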
Example #3
def create_traindata(self, outdir):
    trees = JaCCGReader(self.filepath).readall()
    # First pass: only populate the frequency dictionaries.
    for tree in trees:
        self._traverse(tree)
    # Second pass: build training samples, keeping only categories
    # whose frequency >= freq_cut.
    for tree in trees:
        tokens = get_leaves(tree)
        words = [token.word for token in tokens]
        self.sents.append(" ".join(words))
        cats = [token.cat.without_semantics for token in tokens]
        samples = get_context_by_window(words,
                                        CONTEXT,
                                        lpad=LPAD,
                                        rpad=RPAD)
        assert len(samples) == len(cats)
        for cat, sample in zip(cats, samples):
            if self.cats[cat] >= self.cat_freq_cut:
                self.samples[" ".join(sample)] = cat

    # Drop rare categories and words from the vocabularies as well.
    self.cats = {k: v for (k, v) in self.cats.items()
                 if v >= self.cat_freq_cut}
    self.words = {k: v for (k, v) in self.words.items()
                  if v >= self.word_freq_cut}
    with open(outdir + "/unary_rules.txt", "w") as f:
        self._write(self.unary_rules, f, comment_out_value=True)
    with open(outdir + "/seen_rules.txt", "w") as f:
        self._write(self.seen_rules, f, comment_out_value=True)
    with open(outdir + "/target.txt", "w") as f:
        self._write(self.cats, f, comment_out_value=False)
    with open(outdir + "/words.txt", "w") as f:
        self._write(self.words, f, comment_out_value=False)
    with open(outdir + "/chars.txt", "w") as f:
        self._write(self.chars, f, comment_out_value=False)
    with open(outdir + "/traindata.json", "w") as f:
        json.dump(self.samples, f)
    # Text mode with explicit encoding; writing encoded bytes plus a
    # str newline would raise TypeError in Python 3.
    with open(outdir + "/trainsents.txt", "w", encoding="utf-8") as f:
        for sent in self.sents:
            f.write(sent + "\n")
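The helper `_write` is also not shown. A guess at its behavior, assuming `comment_out_value=True` hides the frequency count behind a comment marker so downstream readers see only the keys:

def _write(self, dct, out, comment_out_value=False):
    # Hypothetical sketch of the unseen helper: one entry per line,
    # with the count commented out when comment_out_value is True.
    for key, value in dct.items():
        if comment_out_value:
            out.write("{} # {}\n".format(key, value))
        else:
            out.write("{} {}\n".format(key, value))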
Example #4
def feature_extract(self, tokens):
    # The longest word length is passed to the extractor alongside
    # each context window.
    max_len = max(len(w) for w in tokens)
    contexts = get_context_by_window(tokens, CONTEXT, lpad=LPAD, rpad=RPAD)
    return [self.extractor(c, max_len) for c in contexts]
Example #5
def feature_extract(self, tokens):
    # The call targets the module-level feature_extract, not this method.
    # Materialize the mapped features so the code also works in Python 3,
    # where map() returns a lazy iterator.
    return get_context_by_window(
            [feature_extract(t) for t in tokens], 3, lpad=lpad, rpad=rpad)
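A toy run against the hypothetical sketch from Example #1 shows the window shape all five examples depend on:

words = ["I", "saw", "her"]
for window in get_context_by_window(words, 2, lpad="LPAD", rpad="RPAD"):
    print(window)
# ['LPAD', 'LPAD', 'I', 'saw', 'her']
# ['LPAD', 'I', 'saw', 'her', 'RPAD']
# ['I', 'saw', 'her', 'RPAD', 'RPAD']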