def find_entities_spans(self, question):
    """Return the surface spans of candidate entities found in *question*.

    The question is syntactically parsed; if the question word itself heads
    the tree, the search is re-rooted at its nominal subject.  Each child
    subtree that neither contains the question word nor is ignored yields
    one span: its tokens joined in sentence order.
    """
    wh_word = self._extract_question_word(question)
    tree_root = parse_tree(self._get_synt_parse(question))[0]

    # When the wh-word heads the parse, descend to the nominal subject.
    if tree_root.token['form'] == wh_word:
        for child in tree_root.children:
            if child.token['deprel'] == 'nsubj':
                tree_root = child
                break

    candidate_token_sets = []
    for child in tree_root.children:
        subtree = SynTree(child)
        # Skip subtrees containing the wh-word and explicitly ignored ones.
        if subtree.has_token(wh_word) or subtree.ignored():
            continue
        candidate_token_sets.append(subtree.get_ordered_tokens())

    # Each token is a (position, form) pair: sort by position, join the forms.
    # A comprehension over an empty list naturally yields [].
    return [
        ' '.join(list(zip(*sorted(tokens, key=itemgetter(0))))[1])
        for tokens in candidate_token_sets
    ]
def test_parse_tree(self):
    """parse_tree() turns one CoNLL-U sentence into a TokenTree rooted at 'jumps'."""
    sentences = parse_tree(data)
    # The fixture holds exactly one sentence.
    self.assertEqual(len(sentences), 1)
    root = sentences[0]
    self.assertEqual(text(root), "TokenTree<token={id=5, form=jumps}, children=[...]>")
    # The root token carries the full CoNLL-U annotation of "jumps".
    self.assertEqual(
        root.token,
        OrderedDict([
            ('id', 5),
            ('form', 'jumps'),
            ('lemma', 'jump'),
            ('upostag', 'VERB'),
            ('xpostag', 'VBZ'),
            ('feats', OrderedDict([
                ("Mood", "Ind"),
                ("Number", "Sing"),
                ("Person", "3"),
                ("Tense", "Pres"),
                ("VerbForm", "Fin"),
            ])),
            ('head', 0),
            ('deprel', 'root'),
            ('deps', None),
            ('misc', None)
        ])
    )
    # Direct dependents of the root: subject, oblique nominal, final punctuation.
    self.assertEqual(
        [text(child) for child in root.children],
        [
            "TokenTree<token={id=4, form=fox}, children=[...]>",
            "TokenTree<token={id=9, form=dog}, children=[...]>",
            "TokenTree<token={id=10, form=.}, children=None>",
        ]
    )
    # Sentence-level metadata survives the tree conversion.
    self.assertEqual(
        root.metadata["text"],
        "The quick brown fox jumps over the lazy dog."
    )
    # Serializing the tree reproduces the original CoNLL-U input.
    self.assertEqual(root.serialize(), data)
    # print_tree() renders the dependency structure with depth-based indent.
    # NOTE(review): indentation inside the dedent block reconstructed from the
    # tree structure (4 spaces per depth level) — confirm against upstream.
    self.assertEqual(
        capture_print(root.print_tree),
        dedent("""\
            (deprel:root) form:jumps lemma:jump upostag:VERB [5]
                (deprel:nsubj) form:fox lemma:fox upostag:NOUN [4]
                    (deprel:det) form:The lemma:the upostag:DET [1]
                    (deprel:amod) form:quick lemma:quick upostag:ADJ [2]
                    (deprel:amod) form:brown lemma:brown upostag:ADJ [3]
                (deprel:nmod) form:dog lemma:dog upostag:NOUN [9]
                    (deprel:case) form:over lemma:over upostag:ADP [6]
                    (deprel:det) form:the lemma:the upostag:DET [7]
                    (deprel:amod) form:lazy lemma:lazy upostag:ADJ [8]
                (deprel:punct) form:. lemma:. upostag:PUNCT [10]
        """)
    )
def case_fox():
    """Fixture: a noun phrase whose head ("fox") governs three modifiers.

    Returns a (TokenSNGram, expectations) pair; the dict holds the values
    the sn-gram tests assert against (length, str/repr forms, profiles).
    """
    # NOTE(review): CoNLL-U columns must be tab-separated for conllu.parse_tree;
    # tabs reconstructed here — confirm against the original fixture file.
    data = """
# text = The quick brown fox
1	The	the	DET	DT	Definite=Def|PronType=Art	4	det	_	_
2	quick	quick	ADJ	JJ	Degree=Pos	4	amod	_	_
3	brown	brown	ADJ	JJ	Degree=Pos	4	amod	_	_
4	fox	fox	NOUN	NN	Number=Sing	0	nsubj	_	_
"""
    return TokenSNGram(conllu.parse_tree(data)[0]), {
        "length": 4,
        "str": "fox [The, quick, brown]",
        # Expected pattern: head followed by its bracketed modifier list.
        "repr": [
            PatternElement('fox', 'form', 4),
            SNGram.LEFT_BRACKET,
            PatternElement('The', 'form', 1),
            SNGram.COMMA,
            PatternElement('quick', 'form', 2),
            SNGram.COMMA,
            PatternElement('brown', 'form', 3),
            SNGram.RIGHT_BRACKET
        ],
        "profiles": set(["form [ form , form , form ]"])
    }
def case_apples():
    """Fixture: coordination — "apples" heads three conjuncts with punctuation.

    Returns a (TokenSNGram, expectations) pair for the sn-gram tests.
    """
    # NOTE(review): CoNLL-U columns must be tab-separated for conllu.parse_tree;
    # tabs reconstructed here — confirm against the original fixture file.
    data = """
# text = apples, pears, oranges, and bananas.
1	apples	apple	NOUN	NN	Number=Plur	0	obj	_	_
2	,	,	PUNCT	,	_	3	punct	_	_
3	pears	pear	NOUN	NN	Number=Plur	1	conj	_	_
4	,	,	PUNCT	,	_	5	punct	_	_
5	oranges	orange	NOUN	NN	Number=Plur	1	conj	_	_
6	,	,	PUNCT	,	_	8	punct	_	_
7	and	and	SCONJ	CC	_	8	cc	_	_
8	bananas	banana	NOUN	NN	Number=Plur	1	conj	_	_
"""
    return TokenSNGram(conllu.parse_tree(data)[0]), {
        "length": 8,
        "str": "apples [pears,, oranges,, bananas [,, and]]",
        # Commas attach to the following conjunct; "bananas" gets its own
        # bracketed dependents (",", "and").
        "repr": [
            PatternElement('apples', 'form', 1),
            SNGram.LEFT_BRACKET,
            PatternElement('pears', 'form', 3),
            PatternElement(',', 'form', 2),
            SNGram.COMMA,
            PatternElement('oranges', 'form', 5),
            PatternElement(',', 'form', 4),
            SNGram.COMMA,
            PatternElement('bananas', 'form', 8),
            SNGram.LEFT_BRACKET,
            PatternElement(',', 'form', 6),
            SNGram.COMMA,
            PatternElement('and', 'form', 7),
            SNGram.RIGHT_BRACKET,
            SNGram.RIGHT_BRACKET
        ],
        "profiles": set(["form [ form form , form form , form [ form , form ] ]"])
    }
def __call__(self, text):
    """Run the full pipeline on *text* and return its dependency trees.

    Tokenizes the text into sentences, tags and parses each one in place,
    serializes the result to CoNLL-U, and parses that into TokenTrees.

    :param text: raw input text
    :returns: result of parse_tree() over the pipeline's CoNLL-U output
    """
    sentences = self.tokenize(text)
    for sentence in sentences:
        self.tag(sentence)
        self.parse(sentence)
    # Renamed from `conllu`: that name shadows the `conllu` module used
    # elsewhere in this codebase, which invites subtle bugs.
    conllu_output = self.write(sentences, "conllu")
    return parse_tree(conllu_output)
def case_sidorov2():
    """Fixture after Sidorov: chain "y di" followed by a bracketed dependent list.

    Only ids, forms, and heads are annotated; the other columns are blank.
    Returns a (TokenSNGram, expectations) pair for the sn-gram tests.
    """
    # NOTE(review): CoNLL-U columns must be tab-separated for conllu.parse_tree;
    # tabs reconstructed here — confirm against the original fixture file.
    data = """
# text = y le di un par de vueltas de_mala_gana
1	y	_	_	_	_	0	_	_	_
2	le	_	_	_	_	3	_	_	_
3	di	_	_	_	_	1	_	_	_
4	par	_	_	_	_	3	_	_	_
5	de_mala_gana	_	_	_	_	3	_	_	_
"""
    return TokenSNGram(conllu.parse_tree(data)[0]), {
        "length": 5,
        "str": "y di [le, par, de_mala_gana]",
        # "di" is the only child of "y", so no brackets around it; its three
        # children form the bracketed list.
        "repr": [
            PatternElement('y', 'form', 1),
            PatternElement('di', 'form', 3),
            SNGram.LEFT_BRACKET,
            PatternElement('le', 'form', 2),
            SNGram.COMMA,
            PatternElement('par', 'form', 4),
            SNGram.COMMA,
            PatternElement('de_mala_gana', 'form', 5),
            SNGram.RIGHT_BRACKET
        ],
        "profiles": set(["form form [ form , form , form ]"])
    }
def case_changed_special():
    """Fixture: same tree as case_fox but with customized bracket/comma symbols.

    Exercises the TokenSNGram constructor's left_bracket/right_bracket/comma
    overrides; expectations use the literal override strings instead of the
    SNGram class constants.
    """
    # NOTE(review): CoNLL-U columns must be tab-separated for conllu.parse_tree;
    # tabs reconstructed here — confirm against the original fixture file.
    data = """
# text = The quick brown fox
1	The	the	DET	DT	Definite=Def|PronType=Art	4	det	_	_
2	quick	quick	ADJ	JJ	Degree=Pos	4	amod	_	_
3	brown	brown	ADJ	JJ	Degree=Pos	4	amod	_	_
4	fox	fox	NOUN	NN	Number=Sing	0	nsubj	_	_
"""
    return TokenSNGram(conllu.parse_tree(data)[0],
                       left_bracket="(", right_bracket=")", comma="_"), {
        "length": 4,
        "str": "fox (The_ quick_ brown)",
        "repr": [
            PatternElement('fox', 'form', 4),
            "(",
            PatternElement('The', 'form', 1),
            "_",
            PatternElement('quick', 'form', 2),
            "_",
            PatternElement('brown', 'form', 3),
            ")"
        ],
        "profiles": set(["form ( form _ form _ form )"])
    }
def load_data_file(file_name, tree=False):
    """Load and parse a CoNLL-U file.

    :param file_name: path to the CoNLL-U file
    :param tree: when True, additionally build dependency trees
    :returns: parsed token lists, or a (token_lists, token_trees) tuple
        when ``tree`` is True
    """
    # CoNLL-U files are UTF-8 by specification; the original relied on the
    # locale-dependent default encoding, which breaks on some platforms.
    with open(file_name, encoding="utf-8") as f:
        text = f.read()
    if tree:
        return (parse(text), parse_tree(text))
    return parse(text)
def test_parse_tree_and_serialize(self):
    """Serializing a parsed tree matches the input minus range/elided tokens."""
    from tests.fixtures import TESTCASES
    for testcase in TESTCASES:
        data = parse(testcase)
        # parse_tree() keeps only tokens with plain integer ids, so drop
        # range ids ("1-2") and elided ids ("1.1") from the expectation.
        testcase_without_range_and_elided = TokenList(
            [token for token in data[0] if isinstance(token["id"], int)])
        # NOTE(review): serialize() presumably returns a CoNLL-U string while
        # the right-hand side is a TokenList — confirm TokenList.__eq__
        # supports this comparison, or whether `.serialize()` is missing on
        # the expected value.
        self.assertEqual(
            parse_tree(testcase)[0].serialize(),
            testcase_without_range_and_elided)
def process_conllu(inp):
    """Parse a CoNLL-U string and summarize its first sentence.

    Returns a pair: a mapping from word form to its dependency relation
    (later occurrences of a repeated form overwrite earlier ones), and the
    list of nodes in depth-first order.
    """
    sentence_root = parse_tree(inp)[0]
    nodes = list(depth_first(sentence_root))
    relations = {node['form']: node['deprel'] for node in nodes}
    return relations, nodes
def read_data(filename):
    """Read a CoNLL-U file and return its parsed annotations and trees.

    :param filename: path to the CoNLL-U file
    :returns: (data, tree) where data[i] is the dependency annotation of the
        i-th sentence and tree is the list of dependency trees
    """
    # Context manager guarantees the handle is closed even on error; the
    # original opened/closed manually and leaked the handle on exceptions.
    with open(filename, 'r') as f:
        text = f.read()
    data = parse(text)
    tree = parse_tree(text)
    return (data, tree)
def parse_tree_conll(path_to_file: str) -> list:
    """
    Read a CoNLL file and return a list of sentences as TokenTree objects
    (TokenTree == arborised hierarchical structures).

    :param path_to_file: Path to the conll file
    :returns: a list of sentences as TokenTree objects
    """
    return parse_tree(load_conll(path_to_file))
def process_conllu(inp):
    """Map each word form of the first sentence to its dependency relation.

    The CoNLL-U input is parsed into trees and the first sentence is walked
    depth-first; for repeated forms the last visit wins.
    """
    sentence_root = parse_tree(inp)[0]
    return {
        node.token['form']: node.token['deprel']
        for node in depth_first(sentence_root)
    }
def analyze(self, themes, filename, encoding='utf8'):
    """Train the model on *filename*, then evaluate each line against *themes*.

    :param themes: theme specification; validated by __checkerThemes__
    :param filename: path to the input text file — each line is processed as
        its own unit (presumably one sentence per line; TODO confirm)
    :param encoding: text encoding of the input file
    :raises RuntimeError: if the UDPipe pipeline reports an error
    """
    self.__checkerThemes__(themes)
    print('Updating model with text... ', end='')
    self.__srem__.trainFile(filename, encoding=encoding)
    print('[OK]')
    print('Parsing sentences... ', end='')
    with open(filename, 'r', encoding=encoding) as file:
        # NOTE(review): `index` is currently unused in the loop body.
        for index, line in enumerate(file, start=1):
            # UDPipe writes its status into __uderror__ rather than raising.
            processed_conllu = self.__pipeline__.process(line, self.__uderror__)
            if self.__uderror__.occurred():
                raise RuntimeError('UDPipe error: ' + self.__uderror__.message)
            # Only the first tree of each processed line is evaluated.
            sentence_root = parse_tree(processed_conllu)[0]
            self.__evalTreeSentence__(themes, sentence_root)
    print('[OK]')
def test_deptree():
    """Tree-kernel similarity over two conllu dependency trees.

    Token ids are deliberately non-contiguous; the kernel only looks at
    labels and children, not at ids.
    """
    # NOTE(review): CoNLL-U columns must be tab-separated and sentences
    # blank-line separated; both reconstructed here — confirm against the
    # original file.
    data = """# text = the cat chases the mouse
1	The	the	DET	DT	Definite=Def|PronType=Art	4	det	_	_
4	cat	cat	NOUN	NN	Number=Sing	5	nsubj	_	_
5	chases	chase	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_
7	the	the	DET	DT	Definite=Def|PronType=Art	9	det	_	_
9	mouse	mouse	NOUN	NN	Number=Sing	5	dobj	_	SpaceAfter=No

# text = the cat sleeps
1	The	the	DET	DT	Definite=Def|PronType=Art	4	det	_	_
4	cat	cat	NOUN	NN	Number=Sing	5	nsubj	_	_
5	sleeps	sleep	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_
"""
    examples = conllu.parse_tree(data)
    # Kernel parameterized by the project's dependency-tree accessors.
    kernel = Kernel(label=DT.label, children=DT.children)
    assert kernel(examples[1], examples[1]) == 3.0
    # Default-constructed kernel on an equivalent nltk-style tree must agree
    # with the conllu-backed kernel on the first sentence.
    kernel_0 = Kernel()
    tree = Tree.fromstring("(root (nsubj det) (dobj det))")
    assert kernel_0(tree, tree) == kernel(examples[0], examples[0])
def case_jumps():
    """Fixture: full sentence — root "jumps" with nested subject/oblique subtrees.

    Returns a (TokenSNGram, expectations) pair for the sn-gram tests.
    """
    # NOTE(review): CoNLL-U columns must be tab-separated for conllu.parse_tree;
    # tabs reconstructed here — confirm against the original fixture file.
    data = """
# text = The quick brown fox jumps over the lazy dog.
1	The	the	DET	DT	Definite=Def|PronType=Art	4	det	_	_
2	quick	quick	ADJ	JJ	Degree=Pos	4	amod	_	_
3	brown	brown	ADJ	JJ	Degree=Pos	4	amod	_	_
4	fox	fox	NOUN	NN	Number=Sing	5	nsubj	_	_
5	jumps	jump	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_
6	over	over	ADP	IN	_	9	case	_	_
7	the	the	DET	DT	Definite=Def|PronType=Art	9	det	_	_
8	lazy	lazy	ADJ	JJ	Degree=Pos	9	amod	_	_
9	dog	dog	NOUN	NN	Number=Sing	5	nmod	_	SpaceAfter=No
10	.	.	PUNCT	.	_	5	punct	_	_
"""
    return TokenSNGram(conllu.parse_tree(data)[0]), {
        "length": 10,
        "str": "jumps [fox [The, quick, brown], dog [over, the, lazy], .]",
        # Nested brackets mirror the two internal subtrees ("fox", "dog").
        "repr": [
            PatternElement('jumps', 'form', 5),
            SNGram.LEFT_BRACKET,
            PatternElement('fox', 'form', 4),
            SNGram.LEFT_BRACKET,
            PatternElement('The', 'form', 1),
            SNGram.COMMA,
            PatternElement('quick', 'form', 2),
            SNGram.COMMA,
            PatternElement('brown', 'form', 3),
            SNGram.RIGHT_BRACKET,
            SNGram.COMMA,
            PatternElement('dog', 'form', 9),
            SNGram.LEFT_BRACKET,
            PatternElement('over', 'form', 6),
            SNGram.COMMA,
            PatternElement('the', 'form', 7),
            SNGram.COMMA,
            PatternElement('lazy', 'form', 8),
            SNGram.RIGHT_BRACKET,
            SNGram.COMMA,
            PatternElement('.', 'form', 10),
            SNGram.RIGHT_BRACKET
        ],
        "profiles": set([
            "form [ form [ form , form , form ] , form [ form , form , form ] , form ]"
        ])
    }
def case_dog():
    """Fixture: prepositional phrase headed by "dog" with three dependents.

    Returns a (TokenSNGram, expectations) pair for the sn-gram tests.
    """
    # NOTE(review): CoNLL-U columns must be tab-separated for conllu.parse_tree;
    # tabs reconstructed here — confirm against the original fixture file.
    data = """
# text = over the lazy dog
6	over	over	ADP	IN	_	9	case	_	_
7	the	the	DET	DT	Definite=Def|PronType=Art	9	det	_	_
8	lazy	lazy	ADJ	JJ	Degree=Pos	9	amod	_	_
9	dog	dog	NOUN	NN	Number=Sing	0	nmod	_	SpaceAfter=No
"""
    return TokenSNGram(conllu.parse_tree(data)[0]), {
        "length": 4,
        "str": "dog [over, the, lazy]",
        "repr": [
            PatternElement('dog', 'form', 9),
            SNGram.LEFT_BRACKET,
            PatternElement('over', 'form', 6),
            SNGram.COMMA,
            PatternElement('the', 'form', 7),
            SNGram.COMMA,
            PatternElement('lazy', 'form', 8),
            SNGram.RIGHT_BRACKET
        ],
        "profiles": set(["form [ form , form , form ]"])
    }
# quick sentence.filter(feats__Degree="Pos") # TokenList<quick, brown, lazy> sentence.metadata ### Turn a TokenList back into CoNLL-U sentence.serialize() # The format is not desirable ### Turn a Tokenlist into a TokenTree sentence.to_tree() ### Use parse_tree() to parse into a list of dependency trees from conllu import parse_tree sentences = parse_tree(data) sentences from conllu import parse_tree_incr for tokentree in parse_tree_incr(data_file): print(tokentree) root = sentences[0] root root.print_tree() root.token children = root.children children
def test_parse_tree_incr(self):
    """Incremental parsing from a stream matches one-shot parse_tree()."""
    expected_trees = parse_tree(data)
    streamed_trees = list(parse_tree_incr(StringIO(data)))
    self.assertEqual(expected_trees, streamed_trees)
def test_parse_tree_incr(self):
    """Incremental parsing from a file object matches one-shot parse_tree()."""
    expected_trees = parse_tree(data)
    streamed_trees = list(parse_tree_incr(string_to_file(data)))
    self.assertEqual(expected_trees, streamed_trees)