def __init__(self, model):
    """Set up the segmenter around a trained model.

    :param model: a trained segmentation model; it is put into
        evaluation mode here so no dropout/grad state leaks into inference.
    """
    # Punctuation that terminates a sentence (used by cut_sent).
    self._eos = ['!', '。', '?']
    # Opening -> closing quote pairs.
    self._pairs = {'“': "”", "「": "」"}
    self.model = model
    self.model.eval()
    # Constituency parser, used when a sentence has no precomputed parse.
    self.parser = BerkeleyParser()
    # Dependency parser (LTP) — presumably consumed by feature extraction
    # elsewhere; not referenced in this chunk.
    self.dep_parser = LTPParser()
class RNNSegmenter(SegmenterI):
    """EDU segmenter that tags word/POS sequences with an RNN.

    Each word is labeled B/other by the model; a "B" closes the current EDU.
    """

    def __init__(self, model):
        """Wrap a trained RNN tagger; the model is switched to eval mode."""
        # Sentence-final punctuation used by cut_sent().
        self._eos = ['!', '。', '?']
        # Opening -> closing quote pairs.
        self._pairs = {'“': "”", "「": "」"}
        self.model = model
        self.model.eval()
        # Constituency parser for sentences lacking precomputed parses.
        self.parser = BerkeleyParser()

    def cut(self, text):
        """Segment raw text into a Paragraph of Sentences of EDUs."""
        sentences = self.cut_sent(text)
        for i, sent in enumerate(sentences):
            sentences[i] = Sentence(self.cut_edu(sent))
        return Paragraph(sentences)

    def cut_sent(self, text, sid=None):
        """Split *text* on sentence-final punctuation into Sentence objects.

        Bug fix: the original iterated over ``range(0, len(text) - 1)`` and
        guarded the tail with ``last_cut < len(text) - 1``, which silently
        dropped a trailing one-character remainder (e.g. the "b" in "a。b")
        and returned nothing at all for single-character input. We now scan
        every character and keep any non-empty remainder.
        """
        last_cut = 0
        sentences = []
        for i in range(len(text)):
            if text[i] in self._eos:
                sentences.append(Sentence([TEXT(text[last_cut:i + 1])]))
                last_cut = i + 1
        if last_cut < len(text):
            # Trailing material without a closing punctuation mark.
            sentences.append(Sentence([TEXT(text[last_cut:])]))
        return sentences

    def cut_edu(self, sent):
        """Split one Sentence into EDUs using the RNN's B-tag predictions.

        If the sentence lacks cached ``words``/``tags``, they are recovered
        from its parse tree (parsing the raw text if no parse is cached).
        """
        if (not hasattr(sent, "words")) or (not hasattr(sent, "tags")):
            if hasattr(sent, "parse"):
                parse = getattr(sent, "parse")
            else:
                parse = self.parser.parse(sent.text)
            # Height-2 subtrees are (POS word) pre-terminals; skip traces.
            children = list(
                parse.subtrees(
                    lambda t: t.height() == 2 and t.label() != '-NONE-'))
            setattr(sent, "words", [child[0] for child in children])
            setattr(sent, "tags", [child.label() for child in children])
        word_ids = [self.model.word_vocab[word] for word in sent.words]
        pos_ids = [self.model.pos_vocab[pos] for pos in sent.tags]
        # Batch dimension of 1; move to GPU only if the model lives there.
        word_ids = torch.tensor([word_ids]).long()
        pos_ids = torch.tensor([pos_ids]).long()
        if self.model.use_gpu:
            word_ids = word_ids.cuda()
            pos_ids = pos_ids.cuda()
        pred = self.model(word_ids, pos_ids).squeeze(0)
        labels = [self.model.tag_label.id2label[t] for t in pred.argmax(-1)]
        edus = []
        last_edu_words = []
        last_edu_tags = []
        for word, pos, label in zip(sent.words, sent.tags, labels):
            last_edu_words.append(word)
            last_edu_tags.append(pos)
            if label == "B":
                # "B" marks the last token of the current EDU: flush it.
                text = "".join(last_edu_words)
                edu = EDU([TEXT(text)])
                setattr(edu, "words", last_edu_words)
                setattr(edu, "tags", last_edu_tags)
                edus.append(edu)
                last_edu_words = []
                last_edu_tags = []
        if last_edu_words:
            # Flush any words left over after the final boundary.
            text = "".join(last_edu_words)
            edu = EDU([TEXT(text)])
            setattr(edu, "words", last_edu_words)
            setattr(edu, "tags", last_edu_tags)
            edus.append(edu)
        return edus
def __init__(self, model):
    """Store the boundary-classification model and its parsing backend.

    :param model: trained classifier exposing a ``candidate`` set of
        tokens that may end an EDU.
    """
    # Punctuation that terminates a sentence (used by cut_sent).
    self._eos = ['!', '。', '?']
    # Opening -> closing quote pairs.
    self._pairs = {'“': "”", "「": "」"}
    self.model = model
    # Cache the candidate boundary-token set for quick membership tests.
    self.candidate = model.candidate
    # Constituency parser, used when a sentence has no precomputed parse.
    self.parser = BerkeleyParser()
class SVMSegmenter(SegmenterI):
    """EDU segmenter that queries an SVM at candidate boundary tokens.

    A boundary is placed after sentence-final punctuation, or after a
    candidate token when ``model.predict`` accepts it.
    """

    def __init__(self, model):
        """Store the boundary classifier and create the parsing backend."""
        # Sentence-final punctuation used by cut_sent() and cut_edu().
        self._eos = ['!', '。', '?']
        # Opening -> closing quote pairs.
        self._pairs = {'“': "”", "「": "」"}
        self.model = model
        # Tokens at which the SVM is consulted for a boundary decision.
        self.candidate = model.candidate
        self.parser = BerkeleyParser()

    def cut(self, text):
        """Segment raw text into a Paragraph of Sentences of EDUs."""
        sentences = self.cut_sent(text)
        for i, sent in enumerate(sentences):
            sentences[i] = Sentence(self.cut_edu(sent))
        return Paragraph(sentences)

    def cut_sent(self, text: str, sid=None) -> List[Sentence]:
        """Split *text* on sentence-final punctuation into Sentence objects.

        Bug fix: the original iterated over ``range(0, len(text) - 1)`` and
        guarded the tail with ``last_cut < len(text) - 1``, which silently
        dropped a trailing one-character remainder (e.g. the "b" in "a。b")
        and returned nothing at all for single-character input. We now scan
        every character and keep any non-empty remainder.
        """
        last_cut = 0
        sentences = []
        for i in range(len(text)):
            if text[i] in self._eos:
                sentences.append(Sentence([TEXT(text[last_cut:i + 1])]))
                last_cut = i + 1
        if last_cut < len(text):
            # Trailing material without a closing punctuation mark.
            sentences.append(Sentence([TEXT(text[last_cut:])]))
        return sentences

    def cut_edu(self, sent: Sentence) -> List[EDU]:
        """Split one Sentence into EDUs via SVM decisions at candidates.

        Uses the sentence's cached parse when available, otherwise parses
        its raw text.
        """
        if not hasattr(sent, "parse"):
            # NOTE(review): a leftover debug `print(sent.text)` was removed
            # from this branch.
            parse = self.parser.parse(sent.text)
        else:
            parse = getattr(sent, "parse")
        # Re-root as a ParentedTree so the model can walk parent links.
        parse = ParentedTree.fromstring(parse.pformat())
        # Height-2 subtrees are (POS word) pre-terminals; skip traces.
        children = list(
            parse.subtrees(
                lambda t: t.height() == 2 and t.label() != '-NONE-'))
        edus = []
        last_edu_words = []
        last_edu_tags = []
        offset = 0
        for child in children:
            # Undo PTB bracket escaping so offsets match the raw text.
            if child[0] == '-LRB-':
                child[0] = '('
            if child[0] == '-RRB-':
                child[0] = ')'
            last_edu_words.append(child[0])
            last_edu_tags.append(child.label())
            # Always break after sentence-final punctuation; otherwise ask
            # the SVM only at candidate tokens.
            if child[0] in self._eos or (child[0] in self.candidate
                                         and self.model.predict(offset, parse)):
                text = "".join(last_edu_words)
                edu = EDU([TEXT(text)])
                setattr(edu, "words", last_edu_words)
                setattr(edu, "tags", last_edu_tags)
                edus.append(edu)
                last_edu_words = []
                last_edu_tags = []
            offset += len(child[0])
        if last_edu_words:
            # Flush any words left over after the final boundary.
            text = "".join(last_edu_words)
            edu = EDU([TEXT(text)])
            setattr(edu, "words", last_edu_words)
            setattr(edu, "tags", last_edu_tags)
            edus.append(edu)
        return edus
# coding: UTF-8 import re import os from nltk.tree import Tree as ParseTree from util.berkely import BerkeleyParser from tqdm import tqdm if __name__ == '__main__': ctb_dir = "data/CTB" save_dir = "data/CTB_auto" encoding = "UTF-8" ctb = {} s_pat = re.compile(r"<S ID=(?P<sid>\S+?)>(?P<sparse>.*?)</S>", re.M | re.DOTALL) parser = BerkeleyParser() for file in tqdm(os.listdir(ctb_dir)): if os.path.isfile(os.path.join(save_dir, file)): continue print(file) with open(os.path.join(ctb_dir, file), "r", encoding=encoding) as fd: doc = fd.read() parses = [] for match in s_pat.finditer(doc): sid = match.group("sid") sparse = ParseTree.fromstring(match.group("sparse")) pairs = [(node[0], node.label()) for node in sparse.subtrees() if node.height() == 2 and node.label() != "-NONE-"] words, tags = list(zip(*pairs)) print(sid, " ".join(words)) if sid == "5133": parse = sparse