Example #2
class RNNSegmenter(SegmenterI):
    def __init__(self, model):
        self._eos = ['!', '。', '?']
        self._pairs = {'“': "”", "「": "」"}
        self.model = model
        self.model.eval()
        self.parser = BerkeleyParser()

    def cut(self, text):
        sentences = self.cut_sent(text)
        for i, sent in enumerate(sentences):
            sentences[i] = Sentence(self.cut_edu(sent))
        return Paragraph(sentences)

    def cut_sent(self, text, sid=None):
        last_cut = 0
        sentences = []
        for i in range(0, len(text) - 1):
            if text[i] in self._eos:
                sentences.append(Sentence([TEXT(text[last_cut:i + 1])]))
                last_cut = i + 1
        if last_cut < len(text):  # keep any trailing text, even a single character
            sentences.append(Sentence([TEXT(text[last_cut:])]))
        return sentences

    def cut_edu(self, sent):
        if (not hasattr(sent, "words")) or (not hasattr(sent, "tags")):
            if hasattr(sent, "parse"):
                parse = getattr(sent, "parse")
            else:
                parse = self.parser.parse(sent.text)
            # Height-2 subtrees of the constituency parse are the preterminal
            # (POS, word) nodes; the '-NONE-' label marks empty traces to skip.
            children = list(
                parse.subtrees(
                    lambda t: t.height() == 2 and t.label() != '-NONE-'))
            setattr(sent, "words", [child[0] for child in children])
            setattr(sent, "tags", [child.label() for child in children])
        word_ids = [self.model.word_vocab[word] for word in sent.words]
        pos_ids = [self.model.pos_vocab[pos] for pos in sent.tags]
        word_ids = torch.tensor([word_ids]).long()
        pos_ids = torch.tensor([pos_ids]).long()
        if self.model.use_gpu:
            word_ids = word_ids.cuda()
            pos_ids = pos_ids.cuda()
        with torch.no_grad():  # inference only; the model was put in eval() mode
            pred = self.model(word_ids, pos_ids).squeeze(0)
        labels = [self.model.tag_label.id2label[int(t)] for t in pred.argmax(-1)]

        edus = []
        last_edu_words = []
        last_edu_tags = []
        for word, pos, label in zip(sent.words, sent.tags, labels):
            last_edu_words.append(word)
            last_edu_tags.append(pos)
            if label == "B":  # a boundary label closes the current EDU after this token
                text = "".join(last_edu_words)
                edu = EDU([TEXT(text)])
                setattr(edu, "words", last_edu_words)
                setattr(edu, "tags", last_edu_tags)
                edus.append(edu)
                last_edu_words = []
                last_edu_tags = []
        if last_edu_words:
            text = "".join(last_edu_words)
            edu = EDU([TEXT(text)])
            setattr(edu, "words", last_edu_words)
            setattr(edu, "tags", last_edu_tags)
            edus.append(edu)
        return edus
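A minimal usage sketch for the segmenter interface above. The checkpoint path, the torch.load call, and treating Paragraph/Sentence as iterable containers are assumptions for illustration, not details taken from this example:

import torch

# Hypothetical checkpoint path; the example does not show how the model is persisted.
model = torch.load("models/rnn_segmenter.pt", map_location="cpu")
segmenter = RNNSegmenter(model)
paragraph = segmenter.cut("中国经济持续增长,但仍面临挑战。专家认为结构性改革是关键。")
# Assumed: Paragraph and Sentence iterate over their Sentence / EDU children.
for sentence in paragraph:
    for edu in sentence:
        print("".join(edu.words))  # cut_edu attaches a "words" attribute to each EDU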
class SVMSegmenter(SegmenterI):
    def __init__(self, model):
        self._eos = ['!', '。', '?']
        self._pairs = {'“': "”", "「": "」"}
        self.model = model
        self.candidate = model.candidate
        self.parser = BerkeleyParser()

    def cut(self, text):
        sentences = self.cut_sent(text)
        for i, sent in enumerate(sentences):
            sentences[i] = Sentence(self.cut_edu(sent))
        return Paragraph(sentences)

    def cut_sent(self, text: str, sid=None) -> List[Sentence]:
        last_cut = 0
        sentences = []
        for i in range(0, len(text) - 1):
            if text[i] in self._eos:
                sentences.append(Sentence([TEXT(text[last_cut:i + 1])]))
                last_cut = i + 1
        if last_cut < len(text):  # keep any trailing text, even a single character
            sentences.append(Sentence([TEXT(text[last_cut:])]))
        return sentences

    def cut_edu(self, sent: Sentence) -> List[EDU]:
        if hasattr(sent, "parse"):
            parse = getattr(sent, "parse")
        else:
            # No pre-attached parse: fall back to the Berkeley parser on the raw text.
            print(sent.text)
            parse = self.parser.parse(sent.text)
        # Re-read as an nltk ParentedTree, which gives every node a parent pointer.
        parse = ParentedTree.fromstring(parse.pformat())
        children = list(
            parse.subtrees(
                lambda t: t.height() == 2 and t.label() != '-NONE-'))
        edus = []
        last_edu_words = []
        last_edu_tags = []
        offset = 0
        for child in children:
            # Map Penn Treebank bracket tokens back to literal parentheses.
            if child[0] == '-LRB-':
                child[0] = '('
            elif child[0] == '-RRB-':
                child[0] = ')'
            last_edu_words.append(child[0])
            last_edu_tags.append(child.label())
            # Cut here on sentence-final punctuation, or when the token is a candidate
            # boundary and the classifier predicts a split at this character offset.
            if child[0] in self._eos or (child[0] in self.candidate and
                                         self.model.predict(offset, parse)):
                text = "".join(last_edu_words)
                edu = EDU([TEXT(text)])
                setattr(edu, "words", last_edu_words)
                setattr(edu, "tags", last_edu_tags)
                edus.append(edu)
                last_edu_words = []
                last_edu_tags = []
            offset += len(child[0])
        if last_edu_words:
            text = "".join(last_edu_words)
            edu = EDU([TEXT(text)])
            setattr(edu, "words", last_edu_words)
            setattr(edu, "tags", last_edu_tags)
            edus.append(edu)
        return edus
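Both cut_edu implementations rely on the same NLTK idiom to recover (word, POS) pairs from a constituency parse: subtrees of height 2 are the preterminal nodes, and the '-NONE-' label marks empty traces to be skipped. A standalone sketch of just that step, using an illustrative bracketed parse rather than real parser output:

from nltk.tree import ParentedTree

# Illustrative bracketed parse string (not from the CTB data).
parse = ParentedTree.fromstring(
    "(IP (NP (NR 中国)) (VP (VV 发展) (NP (NN 经济))) (PU 。))")
children = list(
    parse.subtrees(lambda t: t.height() == 2 and t.label() != '-NONE-'))
words = [child[0] for child in children]      # ['中国', '发展', '经济', '。']
tags = [child.label() for child in children]  # ['NR', 'VV', 'NN', 'PU']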
Example #5
# coding: UTF-8
import re
import os
from nltk.tree import Tree as ParseTree
from util.berkely import BerkeleyParser
from tqdm import tqdm


if __name__ == '__main__':
    ctb_dir = "data/CTB"
    save_dir = "data/CTB_auto"
    encoding = "UTF-8"
    ctb = {}
    s_pat = re.compile(r"<S ID=(?P<sid>\S+?)>(?P<sparse>.*?)</S>", re.M | re.DOTALL)
    parser = BerkeleyParser()
    for file in tqdm(os.listdir(ctb_dir)):
        # Skip CTB files whose automatic parses have already been saved.
        if os.path.isfile(os.path.join(save_dir, file)):
            continue
        print(file)
        with open(os.path.join(ctb_dir, file), "r", encoding=encoding) as fd:
            doc = fd.read()
        parses = []
        for match in s_pat.finditer(doc):
            sid = match.group("sid")
            sparse = ParseTree.fromstring(match.group("sparse"))
            pairs = [(node[0], node.label()) for node in sparse.subtrees()
                     if node.height() == 2 and node.label() != "-NONE-"]
            words, tags = list(zip(*pairs))
            print(sid, " ".join(words))
            if sid == "5133":
                parse = sparse