def parse_ccgbank_tree(s):
    t = Tree.parse(s,
                   parse_node=parse_ccgbank_node,
                   parse_leaf=parse_ccgbank_leaf,
                   node_pattern=ccgbank_node_pattern,
                   leaf_pattern=ccgbank_leaf_pattern)
    return excise_empty_nodes(t)
def tag(self, input_tree):
    """
    Tag an input tree using the rules in the parsed grammars.
    """
    # clean the input tree:
    input_tree = self.clean(input_tree)
    text = self.from_tree_to_text(input_tree)
    #print "INPUT TEXT: "+text
    for rule in self.rules:
        rule_name = rule.keys()[0]
        rule = rule.values()[0]
        matches = re.finditer(rule, text, re.I)
        for match in matches:
            match_text = match.group(rule_name)
            # strip leading and trailing spaces from the matched text, so that
            # each <NAME> subtree is properly delimited in the resulting text
            # (optional surrounding spaces are not swallowed):
            match_text = match_text.strip()
            text = string.replace(text, match_text, "<" + rule_name + ">")
            #print "TEXT = "+text
            self.stack.append(match_text)
    #print "OUTPUT TEXT : "+text
    output_tree_str = "(S " + self.from_text_to_tree_str(text) + " )"
    #print "OUTPUT TREE STR: "+output_tree_str
    output_tree = Tree.parse(output_tree_str,
                             parse_leaf=self.from_string_token_to_tuple)
    return output_tree
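# Usage sketch for RuleParser.tag above (module name and input format taken
# from the tests further down: the input is a list of (word, [tags]) pairs):
rp = ruleparser.RuleParser('ANIMAL : {<ANIMAL>}')
tagged_tree = rp.tag([('el', ['DT']), ('perro', ['NN', 'ANIMAL'])])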
def negra_tree_iter(corpus_root):
    pieces = []
    for line in open(corpus_root):
        if line.startswith('%'):
            s = ''.join(pieces).strip()
            if len(s):
                yield Tree.parse(s)
            pieces = []
        else:
            pieces.append(line)
    if len(pieces):
        s = ''.join(pieces).strip()
        yield Tree.parse(s)
def test_simple_tags(self):
    grammar = "ANIMAL : {<ANIMAL>}"
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S el/DT (ANIMAL perro/NN/ANIMAL) ladra/VB al/DT (ANIMAL gato/NN/ANIMAL))", parse_leaf=rp.from_string_token_to_tuple)
    result = rp.tag(self.text)
    self.assertEqual(result, expected)
def treebank_bracket_parse(t):
    try:
        return Tree.parse(t, remove_empty_top_bracketing=True)
        # return tree.bracket_parse(t)
    except IndexError:
        # in case it's the real treebank format,
        # strip first and last brackets before parsing
        return tree.bracket_parse(t.strip()[1:-1])
def test_cascaded_rules_2(self):
    grammar = """
        EQUIPOS : {<Equipo_Futbol> <CONJ> <Equipo_Futbol>}
        PARTIDO : {<EQUIPOS> <VB>}
    """
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S (PARTIDO (EQUIPOS Real_Madrid/NN/NE/Equipo_Futbol y/CONJ F.C._Barcelona/NN/NE/Equipo_Futbol) disputan/VB) hoy/ADV la/DT final/NN de/PP la/DT Copa_del_Rey/NN/NE/Evento)", parse_leaf=rp.from_string_token_to_tuple)
    result = rp.tag(self.text)
    self.assertEqual(result, expected)
def test_cascaded_rules(self):
    grammar = """
        NP : {<DT>? <NN>+}
        VP : {<VB> <ADV>}
    """
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S (NP Real_Madrid/NN/NE/Equipo_Futbol) y/CONJ (NP F.C._Barcelona/NN/NE/Equipo_Futbol) (VP disputan/VB hoy/ADV) (NP la/DT final/NN) de/PP (NP la/DT Copa_del_Rey/NN/NE/Evento))", parse_leaf=rp.from_string_token_to_tuple)
    result = rp.tag(self.text)
    self.assertEqual(result, expected)
def test_simple_words(self):
    grammar = """
        PERRO : {"el" "perro"}
        GATO : {"al" "gato"}
    """
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S (PERRO el/DT perro/NN/ANIMAL) ladra/VB (GATO al/DT gato/NN/ANIMAL))", parse_leaf=rp.from_string_token_to_tuple)
    result = rp.tag(self.text)
    self.assertEqual(result, expected)
def test_context_rules(self):
    self.text = [('He', ['VB']), ('estudiado', ['VB']), ('en', ['ADV']),
                 ('la', ['DT']), ('Universidad', ['NN']), ('Complutense', ['NN']),
                 ('y', ['CONJ']), ('he', ['VB']), ('trabajado', ['VB']),
                 ('en', ['ADV']), ('Yahoo!', ['NN']), ('durante', ['ADV']),
                 ('2', ['NN']), ('años', ['NN'])]
    grammar = """
        EMPRESA : "trabajado" "en" {<NN>+}
        UNIVERSIDAD : "estudiado" "en" <DT>? {<NN>+}
        TECNOLOGIA : "trabajado" "con" {<.*>}
    """
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S He/VB estudiado/VB en/ADV la/DT (UNIVERSIDAD Universidad/NN Complutense/NN) y/CONJ he/VB trabajado/VB en/ADV (EMPRESA Yahoo!/NN) durante/ADV 2/NN años/NN)", parse_leaf=rp.from_string_token_to_tuple)
    result = rp.tag(self.text)
    self.assertEqual(result, expected)
def parse(self):
    """
    Accesses the parse tree based on the S-expression parse string in the XML.

    :getter: Returns the NLTK parse tree
    :type: nltk.Tree
    """
    if self.parse_string is not None and self._parse is None:
        self._parse = Tree.parse(self._parse_string)
    return self._parse
def test_repetitive_rules(self):
    self.text = [('He', ['VB']), ('estudiado', ['VB']), ('en', ['ADV']),
                 ('la', ['DT']), ('Universidad', ['NN']), ('Complutense', ['NN']),
                 ('y', ['CONJ']), ('he', ['VB']), ('trabajado', ['VB']),
                 ('en', ['ADV']), ('Yahoo!', ['NN']), ('durante', ['ADV']),
                 ('2', ['NN']), ('años', ['NN'])]
    grammar = """
        UNIVERSIDAD : {"universidad"}
        UNIVERSIDAD : {"complutense"}
        UNIVERSIDAD : {<UNIVERSIDAD> <UNIVERSIDAD>}
    """
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S He/VB estudiado/VB en/ADV la/DT (UNIVERSIDAD (UNIVERSIDAD Universidad/NN) (UNIVERSIDAD Complutense/NN)) y/CONJ he/VB trabajado/VB en/ADV Yahoo!/NN durante/ADV 2/NN años/NN)", parse_leaf=rp.from_string_token_to_tuple)
    result = rp.tag(self.text)
    self.assertEqual(result, expected)
def parse_rst_dt_tree(tstr, context=None):
    """
    Read a single RST tree from its RST DT string representation.
    If context is set, align the tree with it. You should really try to
    pass in a context (see `RSTContext`) if you can; the None case is
    really intended for testing, or for cases where you don't have the
    original text.
    """
    pstr = _preprocess(tstr)
    tree_ = Tree.parse(pstr, leaf_pattern=_LEAF_PATTERN)
    tree_ = _postprocess(tree_)
    if context:
        tree_ = _align_with_context(tree_, context)
    return tree_
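# Usage sketch for parse_rst_dt_tree above; the .dis file name is hypothetical,
# and context=None is the testing / no-source-text case mentioned in the docstring.
with open('example.out.dis') as dis_file:
    rst_tree = parse_rst_dt_tree(dis_file.read(), context=None)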
def parse_lightweight_tree(tstr):
    """
    Parse lightweight RST debug syntax into SimpleRSTTree, eg. ::

        (R:attribution
            (N:elaboration (N foo) (S bar))
            (S quux))

    This is mostly useful for debugging or for knocking out quick examples
    """
    _lw_type_re = re.compile(r'(?P<nuc>[RSN])(:(?P<rel>.*)|$)')
    _lw_nuc_map = dict((nuc[0], nuc)
                       for nuc in ["Root", "Nucleus", "Satellite"])
    # pylint: disable=C0103
    PosInfo = collections.namedtuple("PosInfo", "text edu")
    # pylint: enable=C0103

    def walk(subtree, posinfo=PosInfo(text=0, edu=0)):
        """
        walk down first-cut tree, counting span info and returning a
        fancier tree along the way
        """
        if isinstance(subtree, Tree):
            start = copy.copy(posinfo)
            children = []
            for kid in subtree:
                tree, posinfo = walk(kid, posinfo)
                children.append(tree)
            match = _lw_type_re.match(treenode(subtree))
            if not match:
                raise RSTTreeException("Missing nuclearity annotation in ",
                                       subtree)
            nuclearity = _lw_nuc_map[match.group("nuc")]
            rel = match.group("rel") or "leaf"
            edu_span = (start.edu, posinfo.edu - 1)
            span = Span(start.text, posinfo.text)
            node = Node(nuclearity, edu_span, span, rel)
            return SimpleRSTTree(node, children), posinfo
        else:
            text = subtree
            start = posinfo.text
            end = start + len(text)
            posinfo2 = PosInfo(text=end, edu=posinfo.edu + 1)
            return EDU(posinfo.edu, Span(start, end), text), posinfo2

    return walk(Tree.parse(tstr))[0]
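# Usage sketch for parse_lightweight_tree above, using the debug syntax from
# its docstring (assumes the function and the classes it builds are in scope):
stree = parse_lightweight_tree('(R:elaboration (N foo) (S bar))')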
def read_trees(iterable):
    """Reads an iterable in order to mount a syntactic tree."""
    from nltk import Tree

    tree_strings = []
    trees = []

    for line in iterable:
        uline = unicode(line, 'utf-8')
        data = uline.split()

        if len(data) <= 1:
            tree = Tree.parse(' '.join(tree_strings), brackets='[]')
            trees.append(tree)
            tree_strings = []
            continue

        word = data[ConllPos.word]
        pos = data[ConllPos.pos]
        parse = data[ConllPos.parse]

        # A little workaround: to avoid confusing the nltk.Tree string parser,
        # we use [] as tree brackets instead of the default (). This is done
        # because "(" and ")" appear as separate tokens, while "[" and "]" do not.
        tree_string = parse.replace('(', '[').replace(')', ']')

        # treat "broken" constituents like VP- and -VP as normal VPs
        tree_string = tree_string.replace('-', '')

        # treat multiwords and concatenate their POS with #
        words = [' %s#%s ' % (part, pos) for part in word.split('_')]
        words_string = ' '.join(words)
        tree_string = tree_string.replace('*', words_string)

        tree_strings.append(tree_string)

    return trees
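# Usage sketch for read_trees above; 'corpus.conll' is a hypothetical path to a
# UTF-8 encoded CoNLL-format file. One nltk.Tree is returned per sentence.
with open('corpus.conll') as conll_file:
    trees = read_trees(conll_file)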
def ctb_tree_iter_f(corpus_root):
    in_s = False
    pieces = []
    print >>sys.stderr, corpus_root
    for line in open(corpus_root):
        lowered = line.strip().lower()
        if lowered.startswith('<s '):
            in_s = True
        elif lowered.startswith('</s>'):
            s = ''.join(pieces).strip()
            if len(s):
                # In a couple instances of the CTB, there are two sentences
                # contained in a single <S> node. Deal with that here.
                for s1 in split_separate_setences(s):
                    yield Tree.parse(s1)
            in_s = False
            pieces = []
        elif in_s:
            pieces.append(line)
def read_conll(iterable, read_srl=True):
    """
    Reads a sentence from a sequence of lines in a CoNLL format file.

    :returns: if read_srl is True, returns a list of tuples, where each one
        has the format:
        ([token1, token2, ...],
         [[tag-for-pred1, tag-for-pred1, ...], [tag-for-pred2, tag-for-pred2, ...]],
         [index-of-pred1, index-of-pred2, ...])
        Tags are repeated, NOT in IOBES format.
        If read_srl is False, returns a list of sentences.
    """
    from nltk import Tree

    sentences = []
    sentence = []
    instances = None
    num_preds = None
    predicates = []
    token_number = 0

    # used to build syntactic trees
    tree_strings = []

    for line in iterable:
        uline = unicode(line, 'utf-8')
        data = uline.split()

        if len(data) <= 1:
            # this is an empty line after a sentence
            # build the syntactic tree and attribute each token's chunk
            tree = Tree.parse(' '.join(tree_strings), brackets='[]')
            token_chunks = get_chunks(tree)
            for j, (token, (word, chunk)) in enumerate(izip(sentence, token_chunks)):
                assert token.word == word, \
                    "Syntactic and semantic analyses got different words: %s and %s" % (token.word, word)
                token.chunk = chunk
                sentence[j] = token

            if read_srl:
                sentences.append((sentence, instances, predicates))
                instances = None
                predicates = []
                token_number = 0
            else:
                sentences.append(sentence)

            num_preds = None
            tree_strings = []
            sentence = []
            continue

        if instances is None and read_srl:
            # initializes each instance as an empty list
            num_preds = len(data) - ConllPos.pred - 1
            instances = [[] for _ in xrange(num_preds)]
            expected_role = ['O'] * num_preds

        word = data[ConllPos.word]
        lemma = data[ConllPos.lemma].lower()
        pos = data[ConllPos.pos].lower()
        parse = data[ConllPos.parse]
        is_predicate = data[ConllPos.pred] != '-'

        # lemmas for punctuation are listed as -
        if lemma == '-':
            lemma = word

        # Syntactic tree:
        # to avoid messing up the nltk.Tree string parser, we use [] as tree
        # brackets instead of the default (). This is done because "(" and ")"
        # appear as separate tokens, while "[" and "]" do not.
        tree_string = parse.replace('(', '[').replace(')', ']')
        # treat "broken" constituents like VP- and -VP as normal VPs
        tree_string = tree_string.replace('-', '')
        tree_string = tree_string.replace('*', ' %s ' % word)
        tree_strings.append(tree_string)

        # if it's a predicate, add it to the list of predicates;
        # we must check it before appending the tokens
        # because multiword tokens may mess up the count
        if read_srl and is_predicate:
            predicates.append(token_number)

        # split multiwords
        splitted = zip(word.split('_'), lemma.split('_'))
        num_parts = len(splitted)
        for word_part, lemma_part in splitted:
            token = Token(word_part, pos=pos, lemma=lemma_part)
            sentence.append(token)
            token_number += 1

        # SRL
        if read_srl:
            # read the roles for each predicate
            for i, role in enumerate(data[ConllPos.pred + 1:]):
                role, expected_role[i] = read_role(role, expected_role[i])
                # repeat the tag if the word was split
                for _ in range(num_parts):
                    instances[i].append(role)

    assert instances is None
    return sentences
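# Usage sketch for read_conll above; 'propbank.conll' is a hypothetical path.
# With read_srl=True, each element of the returned list is a
# (tokens, per-predicate role lists, predicate indices) tuple as described in
# the docstring.
with open('propbank.conll') as conll_file:
    for tokens, role_lists, pred_indices in read_conll(conll_file, read_srl=True):
        pass  # e.g. inspect tokens[i].word, tokens[i].chunk, role_lists[j][i]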
def test_context_left(self):
    grammar = ' PERRO : <DT> {"perro"}'
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S el/DT (PERRO perro/NN/ANIMAL) ladra/VB al/DT gato/NN/ANIMAL)", parse_leaf=rp.from_string_token_to_tuple)
    result = rp.tag(self.text)
    self.assertEqual(result, expected)
def test_context_both(self):
    grammar = 'LADRA : "perro" {"ladra"} <DT>'
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S el/DT perro/NN/ANIMAL (LADRA ladra/VB) al/DT gato/NN/ANIMAL)", parse_leaf=rp.from_string_token_to_tuple)
    result = rp.tag(self.text)
    self.assertEqual(result, expected)
def test_operator_interrog_word(self):
    grammar = 'ANIMAL : {"el"? <ANIMAL>}'
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S (ANIMAL el/DT perro/NN/ANIMAL) ladra/VB al/DT (ANIMAL gato/NN/ANIMAL))", parse_leaf=rp.from_string_token_to_tuple)
    # Assumed completion, mirroring the other tests in this suite:
    result = rp.tag(self.text)
    self.assertEqual(result, expected)
def test_operator_interrog_tag(self):
    text = [('Spike', ['NN', 'ANIMAL']), ('ladra', ['VB']), ('al', ['DT']),
            ('gato', ['NN', 'ANIMAL'])]
    grammar = 'ANIMAL : {"el"? <ANIMAL>}'
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S (ANIMAL el/DT perro/NN/ANIMAL) ladra/VB al/DT (ANIMAL gato/NN/ANIMAL))", parse_leaf=rp.from_string_token_to_tuple)
def build(cls, str):
    tstr = cls._preprocess(str)
    t_ = Tree.parse(tstr, leaf_pattern=leaf_pattern)
    return cls._postprocess(t_)
def test_numerals(self):
    text = [('esto', ['DT']), ('es', ['VB']), ('muy', ['ADV']), ('muy', ['ADV']),
            ('muy', ['ADV']), ('bonito', ['ADJ'])]
    grammar = 'MUYx3 : {"muy"#3-3}'
    rp = ruleparser.RuleParser(grammar)
    expected = Tree.parse("(S esto/DT es/VB (MUYx3 muy/ADV muy/ADV muy/ADV) bonito/ADJ)", parse_leaf=rp.from_string_token_to_tuple)
    # Assumed completion, mirroring the other tests in this suite:
    result = rp.tag(text)
    self.assertEqual(result, expected)