def process(fname, chunk, fmt='turboparser'):
    """Process a memmapped chunk of a large file.

    The comparison detection logic is called from here.

    The memory map is cached in the module-level ``filemap`` / ``fileobj``
    globals and reused for as long as ``fname`` stays the same.  When a
    different file is requested, the previously cached map and file object
    are closed once the new ones have been opened.

    Parameters
    ----------
    fname : string
        The path to the file to be opened.

    chunk : tuple (int, int)
        Start offset and size of the slice to process.  The code seeks to
        ``chunk[0]`` and then calls ``read(chunk[1])``, so the second
        element is a byte count rather than an absolute end offset.

    fmt : ('turboparser'|'wacky')
        CONLL dependency format to use.  ``'wacky'`` splits sentences with
        ``get_sents_wacky``; any other value uses ``get_sents``.

    Returns
    -------
    chunk_matches : list
        List of tuples (sentence, matches) where the first element is
        ``str()`` of the parsed sentence and the second is a deduplicated
        list of (pattern_no, dict) containing the slots matched by the
        pattern.  Sentences without any match are left out, and sentences
        for which ``read`` raises ``ValueError`` are skipped.
    """
    global filemap, fileobj

    if filemap is None or fileobj.name != fname:
        # Open the replacement first so that a failure leaves the cached
        # pair untouched, then release the old handles instead of leaking
        # one descriptor and one mapping per file switch.
        new_fileobj = open(fname)
        try:
            new_filemap = mmap.mmap(new_fileobj.fileno(),
                                    os.path.getsize(fname),
                                    access=mmap.ACCESS_READ)
        except Exception:
            new_fileobj.close()
            raise
        if filemap is not None:
            filemap.close()
            fileobj.close()
        fileobj, filemap = new_fileobj, new_filemap

    filemap.seek(chunk[0])
    lines = filemap.read(chunk[1]).splitlines()
    sents = get_sents_wacky(lines) if fmt == 'wacky' else get_sents(lines)

    chunk_matches = []
    for sent in sents:
        try:
            for s, root in read(sent + ["\n"], return_tree=True):
                matches = [(pat_no, m)
                           for pat_no, pat in enumerate(patterns)
                           for m in match(root, pat)]
                if matches:
                    matches = deduplicate(matches)
                    chunk_matches.append((str(s), matches))
        except ValueError:
            pass  # sentence without root
    return chunk_matches
This script shows the simple way of using this package to extract comparisons from a parsed English corpus. For example, you can run it against the 'data/hanks_tp_lemma.conll' file provided. By default this prints the dependency root of each comparison slot (topic, vehicle, etc) but the entire subtrees are extracted and available. """ from __future__ import print_function import fileinput from compattern.dependency import match from compattern.dependency.seed_patterns import patterns def _lemma_or_form(tok): return tok.form.lower() if tok.lemma == '_' else tok.lemma.lower() if __name__ == '__main__': from compattern.dependency.conll import read sents = read(fileinput.input(), return_tree=True) for sent, root in sents: print(sent) for pat in patterns: for m in match(root, pat): print("\n".join("{}: {}".format(key, val.form) for key, val in m.items())) print()
'w', encoding='utf-8') try: tree = GlarfTree.glarf_parse(gf, gt) args = [get_args(*node) for node in find_comparison_nodes(tree)] args = [ arg_dict for arg_dict in args if arg_dict['C'].lower() == sys.argv[1] and 'V' in arg_dict and arg_dict['V'].strip() != "" ] except ValueError: args = [] continue dep_args = [ m for pat in patterns[:2] for m in match(dep, pat) if m['C'].form.lower() == sys.argv[1] ] if args: matches += 1 if dep_args: dep_matches += 1 print_to = [f] if args and not dep_args: print_to.append(only_glarf) elif dep_args and not args: print_to.append(only_dep) for dest in print_to:
def test_as():
    # The `as_1` seed pattern must match the first tree parsed from
    # `example_as` at least once.
    parsed = read(example_as, return_tree=True)
    _, tree = parsed[0]
    hits = match(tree, seed_patterns.as_1)
    assert_greater(len(hits), 0)
def test_like_t2():
    # The `like_t2` seed pattern must match the first tree parsed from
    # `example_like_t2` at least once.
    parsed = read(example_like_t2, return_tree=True)
    _, tree = parsed[0]
    hits = match(tree, seed_patterns.like_t2)
    assert_greater(len(hits), 0)
def test_than():
    # The `than_2` seed pattern must match the first tree parsed from
    # `example_rbr` at least once.
    parsed = read(example_rbr, return_tree=True)
    _, tree = parsed[0]
    hits = match(tree, seed_patterns.than_2)
    assert_greater(len(hits), 0)
def test_aussi_lemma():
    # The `aussi` pattern must match the first tree parsed from `ex_aussi`
    # at least once.
    parsed = read(ex_aussi, return_tree=True)
    _, tree = parsed[0]
    hits = match(tree, aussi)
    assert_greater(len(hits), 0)
def test_like():
    # The `like` seed pattern must match the first tree parsed from
    # `example_like`, and the first match must expose a 'T' slot.
    parsed = read(example_like, return_tree=True)
    _, tree = parsed[0]
    hits = match(tree, seed_patterns.like)
    assert_greater(len(hits), 0)
    slot_names = list(hits[0].keys())
    assert_in('T', slot_names)
print '.' if f: f.close() f = open('bnc_similes/{}/{:03d}.txt'.format(sys.argv[1], ii / 20), 'w', encoding='utf-8') try: tree = GlarfTree.glarf_parse(gf, gt) args = [get_args(*node) for node in find_comparison_nodes(tree)] args = [arg_dict for arg_dict in args if arg_dict['C'].lower() == sys.argv[1] and 'V' in arg_dict and arg_dict['V'].strip() != ""] except ValueError: args = [] continue dep_args = [m for pat in patterns[:2] for m in match(dep, pat) if m['C'].form.lower() == sys.argv[1]] if args: matches += 1 if dep_args: dep_matches += 1 print_to = [f] if args and not dep_args: print_to.append(only_glarf) elif dep_args and not args: print_to.append(only_dep) for dest in print_to: print >> dest, sent
def test_like():
    # The `like` seed pattern must match the first tree parsed from
    # `example_like`, and the first match must expose a 'T' slot.
    parsed = read(example_like, return_tree=True)
    _, tree = parsed[0]
    hits = match(tree, seed_patterns.like)
    assert_greater(len(hits), 0)
    slot_names = hits[0].keys()
    assert_in('T', slot_names)