def gen_instances(dataset, parses, model):
    """Build classification instances for EDU-boundary detection.

    For every sentence, each occurrence of a candidate character
    (``model.candidate``) is turned into a feature vector via
    ``model.extract_features``; the label is 1 when the character
    offset coincides with an EDU boundary offset, else 0.

    :param dataset: iterable of collections of paragraphs (flattened with chain)
    :param parses: mapping from sentence id to a constituency tree
                   exposing ``pformat()`` (e.g. an nltk Tree)
    :param model: provides ``candidate`` (characters forming the candidate
                  set) and ``extract_features(offset, parse)``
    :return: ``(instances, labels)`` parallel lists
    """
    instances = []
    labels = []
    # NOTE(review): model.candidate is interpolated raw into a character
    # class; assumes it contains no regex metacharacters like "]" — confirm.
    candidate_re = re.compile("[%s]" % model.candidate)
    for paragraph in chain(*dataset):
        root = paragraph.root_relation()
        if root:
            sentences = list(root.iterfind(filter=node_type_filter(Sentence)))
            for sentence in sentences:
                segments = set()  # character offsets on either side of each split point
                edus = list(sentence.iterfind(filter=node_type_filter(EDU)))
                offset = 0
                for edu in edus:
                    # record both the first and last character offset of each EDU
                    segments.add(offset)
                    segments.add(offset + len(edu.text) - 1)
                    offset += len(edu.text)
                # convert tree into a ParentedTree for feature extraction
                parse = ParentedTree.fromstring(parses[sentence.sid].pformat())
                for m in candidate_re.finditer(sentence.text):
                    candidate = m.start()
                    instances.append(model.extract_features(candidate, parse))
                    labels.append(1 if candidate in segments else 0)
    return instances, labels
def gen_train_instances(dataset):
    """Produce sequence-labeling training data with dependency graphs.

    Each sentence containing EDUs yields one instance
    ``(words, pos_tags, graph)`` plus a parallel BIO-style tag sequence:
    the final word of every EDU except the last is tagged ``'B'``
    (boundary), all other words ``'O'``. The graph encodes the sentence's
    dependency arcs as ``(src, dst, relation)`` triples, including a
    self-loop for every token.

    :param dataset: iterable of collections of paragraphs
    :return: ``(instances, tags)`` parallel lists
    """
    instances, tags = [], []
    for paragraph in chain(*dataset):
        for sentence in paragraph.sentences():
            edus = list(sentence.iterfind(node_type_filter(EDU)))
            if not edus:
                continue
            words_seq, pos_seq, tag_seq = [], [], []
            last = len(edus) - 1
            for idx, edu in enumerate(edus):
                words_seq.extend(edu.words)
                pos_seq.extend(edu.tags)
                # only the last token of a non-final EDU marks a boundary
                boundary = 'B' if idx < last else 'O'
                tag_seq.extend(['O'] * (len(edu.words) - 1) + [boundary])
            arcs = []
            for pos, token in enumerate(sentence.dependency):
                arcs.append((pos, pos, "self"))
                if token.head > 0:
                    head = token.head - 1  # heads are 1-based; 0 means root
                    arcs.append((pos, head, "head"))
                    arcs.append((head, pos, "dep"))
            instances.append((words_seq, pos_seq, arcs))
            tags.append(tag_seq)
    return instances, tags
def evaluate(dataset, model):
    """Score a segmentation model against gold-standard EDU boundaries.

    Runs the model (wrapped in an ``RNNSegmenter``) over every sentence
    that has gold EDUs, re-assembles the predicted segments into
    paragraphs, and compares them to the originals with ``edu_eval``.

    :param dataset: iterable of collections of gold paragraphs
    :param model: a trained segmentation model; switched to eval mode here
    :return: whatever ``edu_eval`` reports for (predicted, gold)
    """
    model.eval()
    segmenter = RNNSegmenter(model)
    golds, segs = [], []
    for paragraph in chain(*dataset):
        predicted = [
            Sentence(segmenter.cut_edu(sentence))
            for sentence in paragraph.sentences()
            # skip sentences without gold EDUs
            if list(sentence.iterfind(node_type_filter(EDU)))
        ]
        if predicted:
            segs.append(Paragraph(predicted))
            golds.append(paragraph)
    return edu_eval(segs, golds)
def build_vocab(dataset):
    """Collect vocabulary and label statistics from a treebank.

    Walks every paragraph's EDU and Relation nodes, counting word and
    part-of-speech frequencies from EDUs and nuclearity / relation-type
    frequencies from Relations.

    :param dataset: iterable of collections of paragraphs
    :return: ``(word_vocab, pos_vocab, nuc_label, rel_label)``
    """
    words = Counter()
    poses = Counter()
    nucs = Counter()
    rels = Counter()
    for paragraph in chain(*dataset):
        for node in paragraph.iterfind(filter=node_type_filter([EDU, Relation])):
            if isinstance(node, EDU):
                words.update(node.words)
                poses.update(node.tags)
            elif isinstance(node, Relation):
                nucs[node.nuclear] += 1
                rels[node.ftype] += 1
    return (
        Vocab("word", words),
        Vocab("part of speech", poses),
        Label("nuclear", nucs),
        Label("relation", rels),
    )
def gen_train_instances(dataset):
    """Produce sequence-labeling training data for EDU segmentation.

    Each sentence containing EDUs yields one ``(words, pos_tags)``
    instance plus a parallel tag sequence: the final word of every EDU
    except the last is tagged ``'B'`` (boundary), all other words ``'O'``.

    :param dataset: iterable of collections of paragraphs
    :return: ``(instances, tags)`` parallel lists
    """
    instances, tags = [], []
    for paragraph in chain(*dataset):
        for sentence in paragraph.sentences():
            edus = list(sentence.iterfind(node_type_filter(EDU)))
            if not edus:
                continue
            words_seq, pos_seq, tag_seq = [], [], []
            last = len(edus) - 1
            for idx, edu in enumerate(edus):
                words_seq.extend(edu.words)
                pos_seq.extend(edu.tags)
                # only the last token of a non-final EDU marks a boundary
                boundary = 'B' if idx < last else 'O'
                tag_seq.extend(['O'] * (len(edu.words) - 1) + [boundary])
            instances.append((words_seq, pos_seq))
            tags.append(tag_seq)
    return instances, tags