Пример #1
0
def get_udparse(sent, tool):
    """Build a ``UDParse`` for one sentence taken from a Communication.

    The dependency triples are read from the first entry of
    ``sent.tokenization.dependencyParseList`` whose ``metadata.tool`` equals
    `tool`. When no entry matches, the parse is created without triples.
    Lemmas are not attached to the returned parse.
    """
    tokenization = sent.tokenization

    # Pick the dependency parse produced by the requested tool (first match).
    chosen = next(
        (candidate for candidate in tokenization.dependencyParseList
         if candidate.metadata.tool == tool),
        None)
    if chosen is None:
        edges = []
    else:
        edges = [DepTriple(d.edgeType, d.gov, d.dep)
                 for d in chosen.dependencyList]

    # Token strings, in sentence order.
    words = [tok.text for tok in tokenization.tokenList.tokenList]

    # Tags are fetched through the `get_tags` helper with the 'POS' key.
    pos_tags = get_tags(tokenization, 'POS')

    return UDParse(tokens=words, tags=pos_tags, triples=edges)
Пример #2
0
 def fresh(self, parse):
     """Convert a constituency parse into a `UDParse`.

     `parse` must be a string; it is handed to ``self.sd.convert_tree`` and
     the resulting entries supply the token forms, tags and dependency
     triples of the returned `UDParse`.
     """
     assert isinstance(parse, basestring)
     entries = self.sd.convert_tree(parse)
     words = [entry.form for entry in entries]
     # Translate each entry's `cpos` through the `ptb2universal` table.
     universal_tags = [ptb2universal[entry.cpos] for entry in entries]
     # PyStanfordDependencies numbers tokens from one, whereas we want
     # zero-based positions, so both head and index are shifted down by one.
     edges = [
         DepTriple(rel=entry.deprel, gov=entry.head - 1, dep=entry.index - 1)
         for entry in entries
     ]
     return UDParse(tokens=words, tags=universal_tags, triples=edges)
Пример #3
0
def convert_parse(parse, ud):
    """Lift an integer-indexed dependency parse onto `Token` objects.

    A `Token` is created for every position in ``parse.tokens``; each token
    then receives its governor (``gov``), the relation to that governor
    (``gov_rel``) and its outgoing edges (``dependents``). The result is a
    new `UDParse` whose tokens and triples refer to those `Token` objects.
    """
    nodes = [Token(position, word, parse.tags[position], ud)
             for position, word in enumerate(parse.tokens)]

    def as_token_edge(edge):
        # Same relation, but with both endpoints replaced by Token objects.
        return DepTriple(gov=nodes[edge.gov], dep=nodes[edge.dep], rel=edge.rel)

    for position, node in enumerate(nodes):
        if position in parse.governor:
            incoming = parse.governor[position]
            # A governor index of -1 means there is no governing token.
            node.gov = None if incoming.gov == -1 else nodes[incoming.gov]
            node.gov_rel = incoming.rel
        else:
            # No incoming edge recorded: no governor, relation set to 'root'.
            node.gov = None
            node.gov_rel = 'root'
        node.dependents = [as_token_edge(edge)
                           for edge in parse.dependents[position]]

    token_triples = [as_token_edge(edge) for edge in parse.triples]
    return UDParse(nodes, parse.tags, token_triples, ud)
Пример #4
0
def load_conllu(filename):
    """Load CoNLL-U style data (e.g., the Universal Dependencies treebank).

    `filename` is either the path of an existing file, which is read as
    UTF-8, or -- when no such file exists -- the CoNLL-U text itself.

    This is a generator yielding ``(sent_id, parse)`` pairs, one per
    blank-line-separated block that contains at least one regular token row.
    ``sent_id`` defaults to ``'sent_<n>'`` (counting yielded sentences from
    1) and is replaced by the block's comment lines when present. ``parse``
    is a `UDParse` built from the FORM, UPOS, HEAD and DEPREL columns, with
    heads shifted to zero-based indices.

    Blocks holding only comments (or only multiword / empty-node rows) are
    skipped. A token row that does not have exactly 10 tab-separated columns
    raises ``AssertionError``.
    """
    if os.path.isfile(filename):
        with codecs.open(filename, encoding='utf-8') as f:
            content = f.read()
    else:
        content = filename

    # codecs.open performs no newline translation, so CRLF input would never
    # split on '\n\n' below; fold it to LF first.
    content = content.replace('\r\n', '\n').strip()

    sent_num = 1
    for block in content.split('\n\n'):
        block = block.strip()
        if not block:
            continue
        rows = []
        sent_id = 'sent_%s' % sent_num
        has_sent_id = False
        for line in block.split('\n'):
            if line.startswith('#'):
                if line.startswith('# sent_id'):
                    # NOTE(review): for comments written as
                    # '# sent_id = X' this slice keeps the leading '= ';
                    # left unchanged because callers may rely on it.
                    sent_id = line[10:].strip()
                    has_sent_id = True
                elif not has_sent_id:
                    # don't take subsequent comments as sent_id
                    sent_id = line[1:].strip()
                continue
            fields = line.split('\t')  # data appears to use '\t'
            if '-' in fields[0]:  # skip multi-tokens, e.g., on Spanish UD bank
                continue
            if '.' in fields[0]:  # skip ellipsis empty tokens for enhanced UD
                continue
            assert len(fields) == 10, fields
            rows.append(fields)

        if not rows:
            # Nothing to parse here (e.g. a stand-alone comment block);
            # unpacking the columns below would fail on an empty zip.
            continue

        # Transpose the rows into the ten CoNLL-U columns.
        _, tokens, _, tags, _, _, head, gov_rel, _, _ = zip(*rows)
        triples = [
            DepTriple(rel, int(gov) - 1, dep)
            for dep, (rel, gov) in enumerate(zip(gov_rel, head))
        ]
        yield sent_id, UDParse(list(tokens), tags, triples)
        sent_num += 1
Пример #5
0
def test():
    """Run the markdown doctests and print a pass/fail report.

    The file given by ``--filename`` (default ``doc/DOCTEST.md``) is split
    into entries of the form ``> sentence`` followed by a chunk of text.
    Lines of the chunk indented by four spaces form the expected output.
    If the chunk embeds ``<!--parse=...-->`` and ``<!--tags=...-->``
    comments, the parse is rebuilt from them; otherwise the sentence is
    parsed with ``Parser.get_instance()``. The extractions printed by
    `PredPatt` (with bracketed rule annotations removed) are compared with
    the expected text; failures are printed in detail, followed by a summary
    line.

    NOTE(review): `ppattopts` is not defined in this function and is
    presumably a module-level options object -- confirm.
    """
    from argparse import ArgumentParser
    p = ArgumentParser()
    p.add_argument('--filename', default='doc/DOCTEST.md')
    args = p.parse_args()

    # Read inside a context manager so the file handle is closed promptly.
    with codecs.open(args.filename, encoding='utf-8') as f:
        text = f.read()

    # The '<END>' sentinel lets the lookahead terminate the final chunk.
    sentences = re.findall(
        r'^> (.*)\n([\w\W]*?)(?=^>|<END>)',
        text + '\n<END>',
        re.MULTILINE)

    # Patterns used for every entry; compiled once, outside the loop.
    parse_re = re.compile(r'<\!--parse=([\w\W]+?)-->')
    tags_re = re.compile(r'<\!--tags=([\w\W]+?)-->')
    triple_re = re.compile(r'(\S+)\(\S+?/(\d+), \S+?/(\d+)\)')
    tag_re = re.compile(r'\S+/(\S+)')
    rule_re = re.compile(r'\s*\[.*\]')

    # TODO: Use PredPatt.from_string instead of duplicating code here.
    parser = Parser.get_instance()

    passed = 0
    failed = 0
    blank = 0
    for s, chunk in sentences:
        s = s.strip()
        if not s:
            continue

        # use cached parse listed in doctest chunk.
        parse_chunk = parse_re.findall(chunk)
        if parse_chunk:
            from predpatt.UDParse import DepTriple, UDParse
            [parse_chunk] = parse_chunk
            # Each match is rel(x/a, y/b); the second index is passed to
            # DepTriple before the first.
            triples = [
                DepTriple(r, int(b), int(a))
                for r, a, b in triple_re.findall(parse_chunk)
            ]
            tokens = s.split()
            [tags_chunk] = tags_re.findall(chunk)
            tags = tag_re.findall(tags_chunk)
            parse = UDParse(tokens, tags, triples)
        else:
            parse = parser(s)

        P = PredPatt(parse, ppattopts)
        relations = P.pprint(track_rule=True)
        tags = ' '.join('%s/%s' % x for x in zip(parse.tokens, parse.tags))
        parse = parse.pprint(K=4)

        # Expand tabs and drop the first four columns of each output line.
        relations = relations.replace('\t', '    ')
        relations = '\n'.join(line[4:].rstrip()
                              for line in relations.split('\n'))

        # Expected output: the chunk's lines that are indented by four
        # spaces, with that indent removed.
        chunk = chunk.replace('\t', '    ')
        expected = '\n'.join(line[4:].rstrip()
                             for line in chunk.split('\n')
                             if line.startswith('    '))

        if not expected.strip():
            blank += 1

        got = relations.strip() or '<empty>'
        # Strip the bracketed rule annotations before comparing.
        got = rule_re.sub('', got)

        if expected.strip() == got.strip():
            passed += 1
        else:
            print()
            print(colored('> ' + s, 'yellow'))
            print(colored('fail', 'red'))
            print('expected:')
            for line in expected.split('\n'):
                print('   ', colored(line, 'blue'))
            print('got:')
            for line in got.split('\n'):
                print('   ', line)
            print()
            print(colored(tags, 'magenta'))
            print()
            print(colored(parse, 'magenta'))
            failed += 1

    total = passed + failed
    # Guard against a file without any doctest sentence (total == 0), which
    # would otherwise raise ZeroDivisionError.
    percent = passed * 100.0 / total if total else 0.0
    msg = '[doctest] %.f%% (%s/%s) passed' % (percent, passed, total)
    if failed == 0:
        print(msg)
    else:
        print()
        print(msg)
        print()
        if blank:
            print('blank:', blank)