def read_ptb_sec(ptb_sec_dir):
    """Read every treebank file found directly inside ``ptb_sec_dir``.

    Only directory entries whose path ends in ``parse`` or ``mrg`` are read.
    The text of each such file is cut into pieces with ``read_ptb.split`` and
    each piece is handed to ``read_ptb.parse``; the words that come back are
    passed through ``_reform_ptb_word`` and joined with single spaces.

    Returns a list with one item per file read. Each item is a list of
    ``(filename, sentence)`` tuples, where ``filename`` is the last path
    component truncated at its first ``.``.
    """
    section = Path(ptb_sec_dir)
    per_file = []
    for entry in section.iterdir():
        # Anything that is not a .parse / .mrg style file is ignored.
        if not str(entry).endswith(('parse', 'mrg')):
            continue
        stem = entry.parts[-1].split('.')[0]
        with entry.open() as handle:
            contents = handle.read()
        sentences = []
        for chunk in read_ptb.split(contents):
            # The second element of the parse result is not needed here.
            tokens, _ = read_ptb.parse(chunk, strip_bad_periods=True)
            sentence = ' '.join(map(_reform_ptb_word, tokens))
            sentences.append((stem, sentence))
        per_file.append(sentences)
    return per_file
def do_web(src_dir, onto_dir, out_dir):
    """Report which mapped English documents have both annotation and source.

    ``onto_dir/map.txt`` is read as whitespace-separated pairs of
    ``annotation_name source_name``; lines that do not split into exactly two
    fields are ignored, and a repeated annotation name keeps its last source.
    For every annotation name starting with ``eng`` the function looks for
    ``onto_dir/<annotation_name>.parse`` and ``src_dir/<source_name>.sgm``.
    When both exist they are loaded (via ``sgml_extract`` and ``read_ptb``)
    and ``Found`` is printed; otherwise ``Miss`` is printed.

    ``out_dir`` is accepted but not used by the current implementation, and
    the loaded documents are not returned or written anywhere.

    NOTE(review): another definition of ``do_web`` appears later in this
    file; if both are kept, the later one is the one callers get.
    """
    mapping = {}
    # Close the map file deterministically instead of leaking the handle.
    with open(path.join(onto_dir, 'map.txt')) as map_file:
        for line in map_file:
            fields = line.split()
            if len(fields) == 2:
                mapping[fields[0]] = fields[1]

    for annot_fn, src_fn in mapping.items():
        if not annot_fn.startswith('eng'):
            continue
        ptb_loc = path.join(onto_dir, annot_fn + '.parse')
        src_loc = path.join(src_dir, src_fn + '.sgm')
        if path.exists(ptb_loc) and path.exists(src_loc):
            with open(src_loc) as src_file:
                src_doc = sgml_extract(src_file.read())
            with open(ptb_loc) as ptb_file:
                ptb_text = ptb_file.read()
            # Only the first element of each parse result is kept.
            ptb_doc = [read_ptb.parse(parse_str, strip_bad_periods=True)[0]
                       for parse_str in read_ptb.split(ptb_text)]
            print('Found')
        else:
            print('Miss')
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
    """Build the dictionary describing one paragraph and its sentences.

    The three sentence sequences must have the same length (checked with an
    ``assert``) and are walked in lockstep. For each position the dependency
    text is parsed with ``read_conll.parse``; the sentence is skipped when
    any parsed token has ``'tag'`` equal to ``'VERB'``. Otherwise the NER
    text (when not ``None``) and the bracketing text are parsed and the
    sentence is formatted with ``format_sentence``.

    Returns ``{'raw': raw_text, 'sentences': [...]}``.
    """
    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
    formatted = []
    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
        _, deps = read_conll.parse(dep_text, strip_bad_periods=True)
        if deps:
            tags = [token['tag'] for token in deps]
            if 'VERB' in tags:
                continue
        if ner_text is None:
            # No NER annotation for this sentence: one placeholder per token.
            ner = ['-'] * len(deps)
        else:
            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
        # Necessary because the ClearNLP converter deletes EDITED words.
        if len(ner) != len(deps):
            ner = ['-'] * len(deps)
        formatted.append(format_sentence(deps, ner, brackets))
    return {'raw': raw_text, 'sentences': formatted}
def do_web(src_dir, onto_dir, out_dir):
    """Check each mapped English document for its annotation and source file.

    Reads ``onto_dir/map.txt``, keeping only lines that split on whitespace
    into exactly two fields (annotation name, source name). For each
    annotation name beginning with ``eng``, the paths
    ``onto_dir/<annotation>.parse`` and ``src_dir/<source>.sgm`` are tested.
    If both exist, the source is run through ``sgml_extract`` and the parse
    file through ``read_ptb.split`` / ``read_ptb.parse``, then ``Found`` is
    printed; if either is missing, ``Miss`` is printed.

    ``out_dir`` is currently unused, and the extracted documents are computed
    but not returned or stored.

    NOTE(review): an earlier definition of ``do_web`` exists in this file and
    is shadowed by this one.
    """
    # Use context managers throughout so no file handle is left open.
    with open(path.join(onto_dir, 'map.txt')) as map_file:
        rows = (line.split() for line in map_file)
        mapping = {row[0]: row[1] for row in rows if len(row) == 2}

    for annot_fn, src_fn in mapping.items():
        if not annot_fn.startswith('eng'):
            continue
        ptb_loc = path.join(onto_dir, annot_fn + '.parse')
        src_loc = path.join(src_dir, src_fn + '.sgm')
        if not (path.exists(ptb_loc) and path.exists(src_loc)):
            print('Miss')
            continue
        with open(src_loc) as src_file:
            src_doc = sgml_extract(src_file.read())
        with open(ptb_loc) as ptb_file:
            ptb_text = ptb_file.read()
        # Keep just the first element of every parse result.
        ptb_doc = [
            read_ptb.parse(parse_str, strip_bad_periods=True)[0]
            for parse_str in read_ptb.split(ptb_text)
        ]
        print('Found')