Пример #1
0
def read_ptb_sec(ptb_sec_dir):
    """Read every treebank file in a directory into per-file sentence lists.

    Only directory entries whose path ends with ``parse`` or ``mrg`` are
    read. Each file's text is divided with ``read_ptb.split`` and every piece
    is parsed with ``read_ptb.parse(..., strip_bad_periods=True)``. The words
    from each parse are passed through ``_reform_ptb_word`` and joined with
    single spaces.

    Args:
        ptb_sec_dir: Directory to scan (anything ``Path`` accepts).

    Returns:
        A list with one entry per file read, in sorted path order. Each entry
        is a list of ``(filename, sentence)`` tuples, where ``filename`` is
        the last path component up to its first ``.``.
    """
    ptb_sec_dir = Path(ptb_sec_dir)
    files = []
    # iterdir() yields entries in arbitrary order; sorting makes the result
    # reproducible across runs and platforms.
    for loc in sorted(ptb_sec_dir.iterdir()):
        if not str(loc).endswith(('parse', 'mrg')):
            continue
        filename = loc.parts[-1].split('.')[0]
        with loc.open() as file_:
            text = file_.read()
        sents = []
        for parse_str in read_ptb.split(text):
            # Only the words are needed here; the bracketing is discarded.
            words, _ = read_ptb.parse(parse_str, strip_bad_periods=True)
            string = ' '.join(_reform_ptb_word(word) for word in words)
            sents.append((filename, string))
        files.append(sents)
    return files
Пример #2
0
def read_ptb_sec(ptb_sec_dir):
    """Collect the sentences of each ``parse``/``mrg`` file in a directory.

    Entries of ``ptb_sec_dir`` whose path does not end with ``parse`` or
    ``mrg`` are ignored. For the rest, the file text is split with
    ``read_ptb.split``, each piece is parsed with
    ``read_ptb.parse(..., strip_bad_periods=True)``, and the parsed words are
    normalised with ``_reform_ptb_word`` before being joined by spaces.

    Args:
        ptb_sec_dir: Directory to scan; converted with ``Path``.

    Returns:
        A list of lists, one inner list per file in sorted path order. Each
        inner list holds ``(filename, sentence)`` tuples; ``filename`` is the
        final path component cut at its first ``.``.
    """
    wanted_suffixes = ('parse', 'mrg')
    section = Path(ptb_sec_dir)
    files = []
    # Directory listing order is not guaranteed, so sort for a stable result.
    for loc in sorted(section.iterdir()):
        if not str(loc).endswith(wanted_suffixes):
            continue
        filename = loc.parts[-1].split('.')[0]
        with loc.open() as handle:
            text = handle.read()
        sents = []
        for parse_str in read_ptb.split(text):
            # The second element of the parse result is not needed here.
            words, _ = read_ptb.parse(parse_str, strip_bad_periods=True)
            sentence = ' '.join(_reform_ptb_word(word) for word in words)
            sents.append((filename, sentence))
        files.append(sents)
    return files
Пример #3
0
def do_web(src_dir, onto_dir, out_dir):
    """Report which mapped ``eng*`` documents have both input files present.

    Reads ``map.txt`` from ``onto_dir``. Every line that splits into exactly
    two whitespace-separated fields is taken as an
    ``(annotation name, source name)`` pair; other lines are ignored. For each
    annotation name starting with ``eng`` the function looks for
    ``<onto_dir>/<annotation name>.parse`` and ``<src_dir>/<source name>.sgm``.
    If both exist they are loaded (via ``sgml_extract`` and ``read_ptb``) and
    ``Found`` is printed; otherwise ``Miss`` is printed.

    Args:
        src_dir: Directory containing the ``.sgm`` files.
        onto_dir: Directory containing ``map.txt`` and the ``.parse`` files.
        out_dir: Not used by this function.

    Returns:
        None. The only visible output is the ``Found``/``Miss`` lines.
    """
    mapping = {}
    # Use context managers so each file is closed as soon as it has been read
    # instead of staying open until garbage collection.
    with open(path.join(onto_dir, 'map.txt')) as map_file:
        for line in map_file:
            fields = line.split()
            if len(fields) == 2:
                mapping[fields[0]] = fields[1]

    for annot_fn, src_fn in mapping.items():
        if not annot_fn.startswith('eng'):
            continue

        ptb_loc = path.join(onto_dir, annot_fn + '.parse')
        src_loc = path.join(src_dir, src_fn + '.sgm')

        if not (path.exists(ptb_loc) and path.exists(src_loc)):
            print('Miss')
            continue

        # The loaded documents are not used further here; they are still
        # built so that any load/parse error surfaces as it did before.
        with open(src_loc) as src_file:
            src_doc = sgml_extract(src_file.read())
        with open(ptb_loc) as ptb_file:
            ptb_doc = [read_ptb.parse(parse_str, strip_bad_periods=True)[0]
                       for parse_str in read_ptb.split(ptb_file.read())]
        print('Found')
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
    """Build a paragraph record from parallel per-sentence annotations.

    The three sentence lists are walked in lockstep. Each dependency text is
    parsed with ``read_conll.parse``; each NER text (when not ``None``) with
    ``read_ner.parse``; each treebank text with ``read_ptb.parse``. The parsed
    pieces are combined by ``format_sentence`` and appended to the result.

    A sentence is skipped when any of its parsed dependency tokens has the
    tag ``'VERB'``.

    Args:
        raw_text: Stored unchanged under the ``'raw'`` key.
        ptb_sents: Treebank text, one item per sentence.
        dep_sents: Dependency text, one item per sentence.
        ner_sents: NER text, one item per sentence; an item may be ``None``.

    Returns:
        A dict ``{'raw': raw_text, 'sentences': [...]}``.

    Raises:
        AssertionError: If the three lists differ in length.
    """
    para = {'raw': raw_text, 'sentences': []}
    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
        _, deps = read_conll.parse(dep_text, strip_bad_periods=True)
        # any() stops at the first match and avoids building a throwaway list.
        if deps and any(t['tag'] == 'VERB' for t in deps):
            continue
        if ner_text is not None:
            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
        else:
            # No NER annotation: use one '-' placeholder per token.
            ner = ['-' for _ in deps]
        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
        # Necessary because the ClearNLP converter deletes EDITED words.
        if len(ner) != len(deps):
            ner = ['-' for _ in deps]
        para['sentences'].append(format_sentence(deps, ner, brackets))
    return para
Пример #5
0
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
    """Combine aligned treebank, dependency and NER sentences into one dict.

    For every position in the three equally long input lists, the dependency
    text is parsed with ``read_conll.parse``, the NER text (unless ``None``)
    with ``read_ner.parse`` and the treebank text with ``read_ptb.parse``.
    ``format_sentence(deps, ner, brackets)`` turns the parsed parts into the
    entry appended under ``'sentences'``.

    Sentences whose parsed dependency tokens include a ``'VERB'`` tag are
    left out of the result.

    Args:
        raw_text: Value stored under ``'raw'`` as-is.
        ptb_sents: Per-sentence treebank text.
        dep_sents: Per-sentence dependency text.
        ner_sents: Per-sentence NER text; individual items may be ``None``.

    Returns:
        A dict with keys ``'raw'`` and ``'sentences'``.

    Raises:
        AssertionError: If the input lists do not all have the same length.
    """
    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
    para = {'raw': raw_text, 'sentences': []}
    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
        _, deps = read_conll.parse(dep_text, strip_bad_periods=True)
        # Short-circuiting check; no intermediate list of tags is needed.
        if deps and any(tok['tag'] == 'VERB' for tok in deps):
            continue
        if ner_text is not None:
            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
        else:
            # Missing NER annotation: fill with one '-' per token.
            ner = ['-' for _ in deps]
        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
        # Necessary because the ClearNLP converter deletes EDITED words.
        if len(ner) != len(deps):
            ner = ['-' for _ in deps]
        para['sentences'].append(format_sentence(deps, ner, brackets))
    return para
Пример #6
0
def do_web(src_dir, onto_dir, out_dir):
    """Print ``Found`` or ``Miss`` for each ``eng*`` entry of the map file.

    ``<onto_dir>/map.txt`` is read line by line; lines with exactly two
    whitespace-separated fields become ``annotation name -> source name``
    entries. For each annotation name that starts with ``eng``, the function
    checks for ``<onto_dir>/<annotation name>.parse`` and
    ``<src_dir>/<source name>.sgm``. When both files exist, the source is run
    through ``sgml_extract`` and the parse file through ``read_ptb.split`` /
    ``read_ptb.parse``, then ``Found`` is printed. Otherwise ``Miss`` is
    printed.

    Args:
        src_dir: Directory holding the ``.sgm`` files.
        onto_dir: Directory holding ``map.txt`` and the ``.parse`` files.
        out_dir: Not used by this function.

    Returns:
        None.
    """
    # Files are opened with ``with`` so their handles are released promptly
    # rather than being left for the garbage collector.
    with open(path.join(onto_dir, 'map.txt')) as map_file:
        pairs = (line.split() for line in map_file)
        mapping = dict(pair for pair in pairs if len(pair) == 2)

    for annot_fn, src_fn in mapping.items():
        if not annot_fn.startswith('eng'):
            continue

        ptb_loc = path.join(onto_dir, annot_fn + '.parse')
        src_loc = path.join(src_dir, src_fn + '.sgm')

        if path.exists(ptb_loc) and path.exists(src_loc):
            # The results are not used afterwards; the calls are kept so that
            # load/parse failures are raised exactly as before.
            with open(src_loc) as src_file:
                src_doc = sgml_extract(src_file.read())
            with open(ptb_loc) as ptb_file:
                ptb_doc = [
                    read_ptb.parse(parse_str, strip_bad_periods=True)[0]
                    for parse_str in read_ptb.split(ptb_file.read())
                ]
            print('Found')
        else:
            print('Miss')