def process(fname, chunk, fmt='turboparser'):
    """Process a memmapped chunk of a large file.

    The comparison detection logic is called from here.

    Parameters
    ----------

    fname : string
        The path to the file to be opened.

    chunk : tuple (int, int)
        Starting offset and length of the slice, in bytes.  The code
        essentially processes ``f.seek(chunk[0]); f.read(chunk[1])``.

    fmt : ('turboparser'|'wacky')
        CoNLL dependency format to use.

    Returns
    -------
    chunk_matches : list
        List of tuples (sentence, matches) where the second element is a list
        of (pattern_no, dict) containing the slots matched by the pattern.

    """

    global filemap, fileobj
    chunk_matches = []
    if filemap is None or fileobj.name != fname:
        if filemap is not None:
            # release the mapping of the previously processed file
            filemap.close()
            fileobj.close()
        fileobj = open(fname, 'rb')  # mmap needs a binary-mode file object
        filemap = mmap.mmap(fileobj.fileno(), os.path.getsize(fname),
                            access=mmap.ACCESS_READ)

    filemap.seek(chunk[0])
    lines = filemap.read(chunk[1]).splitlines()
    sents = get_sents_wacky(lines) if fmt == 'wacky' else get_sents(lines)
    for sent in sents:
        try:
            for s, root in read(sent + ["\n"], return_tree=True):
                matches = [(pat_no, m)
                           for pat_no, pat in enumerate(patterns)
                           for m in match(root, pat)]
                if matches:
                    matches = deduplicate(matches)
                    chunk_matches.append((str(s), matches))
        except ValueError:
            pass  # sentence without root
    return chunk_matches
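A minimal driver sketch (not part of the source; the chunk count and the
use of the bundled 'data/hanks_tp_lemma.conll' corpus are assumptions) for
computing offsets that end on blank lines, so each slice passed to
process() holds whole CoNLL sentences:

import os

filemap, fileobj = None, None  # module globals consumed by process()


def make_chunks(fname, n_chunks=4):
    """Split fname into (offset, length) pairs aligned on blank lines."""
    size = os.path.getsize(fname)
    step = size // n_chunks
    offsets = [0]
    with open(fname, 'rb') as f:
        for i in range(1, n_chunks):
            f.seek(i * step)
            f.readline()                 # drop the partial line at the cut
            while f.readline().strip():  # scan to the next blank line,
                pass                     # i.e. a CoNLL sentence boundary
            offsets.append(f.tell())
    offsets.append(size)
    return [(lo, hi - lo) for lo, hi in zip(offsets, offsets[1:]) if hi > lo]


corpus = 'data/hanks_tp_lemma.conll'
results = [hit for chunk in make_chunks(corpus)
           for hit in process(corpus, chunk)]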
Example #3
"""This script shows a simple way of using this package to extract
comparisons from a parsed English corpus.  For example, you can run it
against the 'data/hanks_tp_lemma.conll' file provided.

By default this prints the dependency root of each comparison slot (topic,
vehicle, etc.), but the entire subtrees are extracted and available.
"""
from __future__ import print_function
import fileinput

from compattern.dependency import match
from compattern.dependency.seed_patterns import patterns


def _lemma_or_form(tok):
    return tok.form.lower() if tok.lemma == '_' else tok.lemma.lower()


if __name__ == '__main__':
    from compattern.dependency.conll import read

    sents = read(fileinput.input(), return_tree=True)
    for sent, root in sents:
        print(sent)
        for pat in patterns:
            for m in match(root, pat):
                print("\n".join("{}: {}".format(key, val.form)
                                for key, val in m.items()))
                print()
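Note that _lemma_or_form is defined above but unused in the printing loop;
a small variation (a sketch, not part of the source) would print the
normalized lemma of each slot instead of its surface form:

for sent, root in read(fileinput.input(), return_tree=True):
    print(sent)
    for pat in patterns:
        for m in match(root, pat):
            # lemma when the parser provided one, lowercased form otherwise
            print("\n".join("{}: {}".format(key, _lemma_or_form(val))
                            for key, val in m.items()))
            print()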
Example #4
        print '.'
        if f:
            f.close()
        # open() must accept an encoding= kwarg here (e.g. io.open on Python 2)
        f = open('bnc_similes/{}/{:03d}.txt'.format(sys.argv[1], ii / 20),
                 'w',
                 encoding='utf-8')
    try:
        tree = GlarfTree.glarf_parse(gf, gt)
        # GLARF comparison candidates, keeping only those whose 'C' slot is
        # the target word and whose 'V' slot is present and non-empty
        args = [get_args(*node) for node in find_comparison_nodes(tree)]
        args = [
            arg_dict for arg_dict in args
            if arg_dict['C'].lower() == sys.argv[1] and 'V' in arg_dict
            and arg_dict['V'].strip() != ""
        ]
    except ValueError:
        # GLARF failed to parse this sentence: record nothing and skip it
        args = []
        continue

    # dependency matches from the first two seed patterns whose 'C' slot is
    # the target word given on the command line
    dep_args = [
        m for pat in patterns[:2] for m in match(dep, pat)
        if m['C'].form.lower() == sys.argv[1]
    ]

    if args:
        matches += 1
    if dep_args:
        dep_matches += 1

    # every sentence goes to the main output file; sentences caught by only
    # one of the two systems additionally go to a per-system file
    print_to = [f]
    if args and not dep_args:
        print_to.append(only_glarf)
    elif dep_args and not args:
        print_to.append(only_dep)

    for dest in print_to:
        print >> dest, sent
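The excerpt assumes several names bound earlier in the original script; a
hypothetical setup (every path and counter here is an assumption, not from
the source) might look like:

from io import open  # gives open() an encoding= kwarg on Python 2
import sys

matches = dep_matches = 0  # sentences caught by GLARF / by the dep patterns
f = None                   # current output file, rotated per the ii / 20 naming
# sentences found by only one of the two systems, kept for error analysis
only_glarf = open('bnc_similes/only_glarf.txt', 'w', encoding='utf-8')
only_dep = open('bnc_similes/only_dep.txt', 'w', encoding='utf-8')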
Example #5
def test_as():
    sent, root = read(example_as, return_tree=True)[0]
    matches = match(root, seed_patterns.as_1)
    assert_greater(len(matches), 0)
Example #7
def test_like_t2():
    sent, root = read(example_like_t2, return_tree=True)[0]
    matches = match(root, seed_patterns.like_t2)
    assert_greater(len(matches), 0)
Example #8
def test_than():
    sent, root = read(example_rbr, return_tree=True)[0]
    matches = match(root, seed_patterns.than_2)
    assert_greater(len(matches), 0)
Example #9
def test_aussi_lemma():
    sent, root = read(ex_aussi, return_tree=True)[0]
    matches = match(root, aussi)
    assert_greater(len(matches), 0)
Example #10
def test_like():
    sent, root = read(example_like, return_tree=True)[0]
    matches = match(root, seed_patterns.like)
    assert_greater(len(matches), 0)
    assert_in('T', list(matches[0].keys()))
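A sketch (not from the test suite) of inspecting a match directly, using
only the API visible in these examples. match() returns a list of dicts
mapping slot labels such as 'T' to the matched tokens:

def show_matches(conll_lines, pattern):
    """Print every slot of every match of pattern in one sentence."""
    sent, root = read(conll_lines, return_tree=True)[0]
    for m in match(root, pattern):
        for slot, tok in sorted(m.items()):
            print('{}: {}'.format(slot, tok.form))
        print()


show_matches(example_like, seed_patterns.like)  # expect at least a 'T' slot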