def create_doc_from_edu_file(edu_file, annotate_func):
    """Read an EDU file, annotate it sentence by sentence and build a Doc.

    The file content is split on ``'<P>'`` into paragraphs, each paragraph on
    ``'<S>'`` into sentences, and each sentence on newlines into EDUs (one
    non-blank line per EDU). The EDUs of a sentence are re-joined with single
    spaces and passed to ``annotate_func``; every returned token becomes a
    ``Token`` carrying its paragraph, sentence and EDU index, and the
    dependency arcs are then copied onto those tokens.

    Args:
        edu_file: Path of the file to read.
        annotate_func: Callable taking the sentence text and returning a
            mapping whose ``['sentences'][0]`` entry provides ``'tokens'``
            (items with ``'index'``, ``'word'``, ``'lemma'``, ``'pos'`` and
            ``'characterOffsetEnd'``) and ``'basicDependencies'`` (items
            with ``'dependent'``, ``'governor'`` and ``'dep'``).

    Returns:
        A ``Doc`` initialised through ``init_from_tokens`` with the tokens
        of all sentences, in file order.

    NOTE(review): only the first entry of ``'sentences'`` is used; if the
    annotator splits the text further, the remaining entries are ignored.
    NOTE(review): ``sidx`` restarts at 0 inside every paragraph while
    ``pidx`` is 1-based - confirm that this asymmetry is intended.
    """
    all_tokens = []
    # EDUs consumed by earlier sentences, so EDU indices run document-wide.
    edus_seen = 0

    with open(edu_file, 'r') as handle:
        raw_paragraphs = (chunk.strip() for chunk in handle.read().split('<P>'))
        paragraphs = [chunk for chunk in raw_paragraphs if chunk]

        for para_no, paragraph in enumerate(paragraphs):
            raw_sentences = (piece.strip() for piece in paragraph.split('<S>'))
            sentences = [piece for piece in raw_sentences if piece]

            for sent_no, sentence in enumerate(sentences):
                # Each EDU keeps a trailing space so the joined text has the
                # same character offsets as the cumulative EDU lengths.
                edus = [
                    line.strip() + ' '
                    for line in sentence.split('\n')
                    if line.strip()
                ]
                annotation = annotate_func(''.join(edus))['sentences'][0]

                # Cumulative end offset of every EDU inside the joined text.
                edu_ends = []
                running_length = 0
                for edu in edus:
                    running_length += len(edu)
                    edu_ends.append(running_length)

                sent_tokens = []
                for tok_info in annotation['tokens']:
                    token = Token()
                    token.tidx = tok_info['index']
                    token.word = tok_info['word']
                    token.lemma = tok_info['lemma']
                    token.pos = tok_info['pos']
                    token.pidx = para_no + 1
                    token.sidx = sent_no

                    # The token belongs to the first EDU whose end offset
                    # lies beyond the token's own end offset; eduidx is left
                    # untouched when no EDU qualifies.
                    token_end = tok_info['characterOffsetEnd']
                    for edu_no, edu_end in enumerate(edu_ends):
                        if edu_end > token_end:
                            token.eduidx = edus_seen + edu_no + 1
                            break
                    sent_tokens.append(token)

                # 'dependent' is used as a 1-based position in the sentence.
                for arc in annotation['basicDependencies']:
                    child = sent_tokens[arc['dependent'] - 1]
                    child.hidx = arc['governor']
                    child.dep_label = arc['dep']

                all_tokens.extend(sent_tokens)
                edus_seen += len(edus)

    doc = Doc()
    doc.init_from_tokens(all_tokens)
    return doc
def _parse_fmerge_line(line):
    """Turn one tab-separated line of a *.merge file into a Token.

    Column layout read here (0-based): 0 = sentence index, 1 = token index,
    2 = word, 9 = EDU index, last column = paragraph index.

    Args:
        line: One line of the merge file.

    Returns:
        A ``Token`` with ``pidx``, ``sidx``, ``tidx`` and ``word`` set, and
        ``eduidx`` set when column 9 holds an integer.

    Raises:
        ValueError: If the paragraph, sentence or token index is not an
            integer.
        IndexError: If the line has too few columns.

    NOTE(review): SOURCE contains a second definition with this name that
    reads more columns; if both share a scope, the later one replaces this
    one - confirm which is intended.
    """
    fields = line.split("\t")
    token = Token()

    token.pidx = int(fields[-1])
    token.sidx = int(fields[0])
    token.tidx = int(fields[1])

    # The word is stored exactly as written, without changing the case.
    token.word = fields[2]

    try:
        token.eduidx = int(fields[9])
    except ValueError:
        # Best effort: report the gap and return the token without eduidx.
        message = "EDU index for {} is missing in fmerge file"
        print(message.format(token.word))
    return token
def _parse_fmerge_line(line):
    """Turn one tab-separated line of a *.merge file into a Token.

    Column layout read here (0-based): 0 = sentence index, 1 = token index,
    2 = word, 3 = lemma, 4 = POS tag, 5 = dependency label, 6 = head index,
    7 = NER tag, 8 = partial parse, 9 = EDU index, last column = paragraph
    index.

    Args:
        line: One line of the merge file.

    Returns:
        A ``Token`` filled from the columns above. ``hidx`` and ``eduidx``
        are only assigned when their columns hold an integer.

    Raises:
        ValueError: If the paragraph, sentence or token index is not an
            integer.
        IndexError: If the line has too few columns.

    NOTE(review): SOURCE also contains an earlier, shorter definition with
    this name; if both share a scope, this later one is the one in effect -
    confirm that the duplicate is intended.
    """
    fields = line.split("\t")
    token = Token()

    token.pidx = int(fields[-1])
    token.sidx = int(fields[0])
    token.tidx = int(fields[1])

    # Word and lemma are stored exactly as written, without changing the case.
    token.word = fields[2]
    token.lemma = fields[3]
    token.pos = fields[4]
    token.dep_label = fields[5]

    try:
        token.hidx = int(fields[6])
    except ValueError:
        # A non-integer head column is tolerated; hidx is simply not set here.
        pass

    token.ner = fields[7]
    token.partial_parse = fields[8]

    try:
        token.eduidx = int(fields[9])
    except ValueError:
        # Best effort: report the gap and return the token without eduidx.
        message = "EDU index for {} is missing in fmerge file"
        print(message.format(token.word))
    return token