def assign_slots(tokens, tag_tree, word_tree):
    """Tag every token and fold the tagged tokens into a slot dictionary.

    Each token is tagged with ``finalize_tags`` using the category-annotated
    versions of ``tag_tree`` and ``word_tree``.  The first token tagged
    ``USER``, ``MEDIA`` or ``NETWORK`` is re-tagged ``SEARCH``; tokens tagged
    ``UNK`` become ``KEYWORD``.

    Returns a dict mapping each tag to the space-joined words carrying it,
    plus a ``'KEYWORDS'`` list holding keyword tokens that are neither
    stopwords nor punctuation.  Tokens tagged ``SKIP`` are ignored.
    """
    stopword_list = stopwords.words('english')

    word_tree = ParentedTree.convert(word_tree)
    tag_tree = ParentedTree.convert(tag_tree)
    word_tree_with_cats = tag_words_with_categories(word_tree)
    tag_tree_with_cats = tag_words_with_categories(tag_tree)

    tokens_with_slot_tags = [
        (word, finalize_tags(i, word, tag_tree_with_cats, word_tree_with_cats))
        for i, word in enumerate(tokens)
    ]

    # Only the first focus-like tag is promoted to SEARCH.
    found_query_focus = False
    for position, (word, tag) in enumerate(tokens_with_slot_tags):
        if tag in ('USER', 'MEDIA', 'NETWORK') and not found_query_focus:
            tokens_with_slot_tags[position] = (word, 'SEARCH')
            found_query_focus = True
        elif tag == UNK:
            tokens_with_slot_tags[position] = (word, 'KEYWORD')

    slots = {}
    for word, tag in tokens_with_slot_tags:
        if tag == 'SKIP':
            continue
        if tag == 'KEYWORD':
            # The list is created on the first keyword even if that keyword
            # itself ends up filtered out.
            keywords = slots.setdefault('KEYWORDS', [])
            if word not in stopword_list and word not in PUNCTUATION:
                keywords.append(word)
        elif tag in slots:
            slots[tag] = ' '.join([slots[tag], word])
        else:
            slots[tag] = word
    return slots
def lappinleasse(parsetree, i):
    """Resolve the pronouns of one sentence against the global entity set.

    For every ``NP`` subtree of ``parsetree``:

    * if its first child is a pronoun (label contains ``PRP``), the most
      salient entity from the last few sentences that agrees with it is
      chosen as referent, its salience/gender/phrases are updated and the
      pronoun node is replaced by a ``SUB`` node naming the referent;
    * if its first child is labelled ``EX`` it is skipped;
    * otherwise the NP is added to ``entitySet`` as a new entity.

    ``halve()`` is called once after all NPs have been processed.

    :param parsetree: parented parse tree of the sentence (modified in place).
    :param i: index of the sentence in the discourse.
    """
    global entitySet
    for np in parsetree.subtrees(lambda x: x.label() == 'NP'):
        head_label = np[0].label()
        if 'PRP' in head_label:
            if np[0, 0].lower() == 'it' and ispleonastic(np, parsetree):
                continue

            # Captured up front so every failure message can name the
            # pronoun; previously it was only bound inside the ``try`` and
            # was undefined (or stale) when no referent existed.
            orig = np[0, 0]

            maxsalience = -1
            referent = None
            e = Entity(np, parsetree, i)
            for entity in entitySet:
                if (entity.sentencenum >= i - 4
                        and e.agreeswith(entity)
                        and maxsalience < entity.salience):
                    maxsalience = entity.salience
                    referent = entity

            if referent is None:
                print('No substitution found for ', orig)
                continue

            try:
                referent.salience += e.salience
                referent.gender = e.gender
                referent.phrases.add(orig + str(i))
                if head_label == 'PRP$':
                    np[0] = ParentedTree.fromstring(
                        '(SUB <' + referent.name + "'s>)")
                    print('PRP$ substitution', orig, '-->', referent.name)
                else:
                    np[0] = ParentedTree.fromstring(
                        '(SUB <' + referent.name + '>)')
                    print('PRP substitution', orig, '-->', referent.name)
            except Exception:
                # Substitution stays best-effort, but no longer swallows
                # KeyboardInterrupt/SystemExit.
                print('No substitution found for ', orig)
                continue
        elif head_label == 'EX':
            continue
        else:
            entitySet.add(Entity(np, parsetree, i))
    halve()
def merge_tree_nnps(tree):
    """
    Merge consecutive NNP leaves of a parse tree.

    For example, the segment:
        (NP
            (JJ old)
            (NNP Pierre)
            (NNP Vinken)
        )
    is returned as:
        (NP
            (JJ old)
            (NNP PierreVinken)
        )

    The merging itself is delegated to ``merge_tagged_nnps``; the result is
    converted back to a plain ``Tree``.
    """
    # A parented tree is needed so each subtree knows its own tree position.
    parented = ParentedTree.convert(tree)

    # Height-3 subtrees are the phrases whose children are POS nodes that
    # lead directly to the words.
    for phrase in parented.subtrees(filter=lambda s: s.height() == 3):
        # List form of the children: [(POS, word), ...]
        tagged_children = [(child.label(), child[0]) for child in phrase]
        merged_children = merge_tagged_nnps(tagged_children)

        rendered = ["(%s %s)" % (pos, word) for pos, word in merged_children]
        child_str = " ".join(rendered)

        # Build the replacement phrase and swap it in at the same position.
        replacement = ParentedTree.fromstring(
            "(%s %s)" % (phrase.label(), child_str))
        parented[phrase.treeposition()] = replacement

    return Tree.convert(parented)
def test_labeled_nodes(self):
    '''
    Test labeled nodes.

    Test case from Emily M. Bender.
    '''
    # The pattern must keep its line structure: the comment lines and each
    # macro definition sit on their own line, and the macro block is
    # separated from the query by one blank line, which is what
    # ``search.split('\n\n')`` below relies on.
    search = '''
        # macros
        @ SBJ /SBJ/;
        @ VP /VP/;
        @ VB /VB/;
        @ VPoB /V[PB]/;
        @ OBJ /OBJ/;

        # 1 svo
        S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
    sent1 = ParentedTree.fromstring(
        '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))')
    sent2 = ParentedTree.fromstring(
        '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))')
    # Same macros, but the query drops the node labels and the ordering
    # constraint between subject and verb phrase.
    search_firsthalf = (search.split('\n\n')[0] +
                        'S < @SBJ < (@VP < (@VB $.. @OBJ))')
    # Macro-free equivalent of the full labeled query.
    search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'

    # sent1 has the subject before the verb phrase: every variant matches.
    self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0])
    self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0])
    self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0])
    self.assertEqual(list(tgrep.tgrep_positions(search, [sent1])),
                     list(tgrep.tgrep_positions(search_rewrite, [sent1])))

    # sent2 has the subject after the verb phrase: only the unordered
    # first-half query matches.
    self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0])
    self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0])
    self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0])
    self.assertEqual(list(tgrep.tgrep_positions(search, [sent2])),
                     list(tgrep.tgrep_positions(search_rewrite, [sent2])))
def syntax_similarity_two_documents(self, doc1, doc2, average=False):
    """Syntax similarity of two single documents.

    Both documents are split into sentences and parsed; every sentence pair
    gets a tree edit distance normalised by the node counts of the two
    converted trees.  With ``average == True`` the result is one minus the
    mean of that cost matrix; otherwise sentences are matched with
    ``su.linear_assignment``, unmatched rows/columns contribute their
    cheapest cost, and the total is normalised by the larger sentence count.

    Returns the string ``"NA"`` when any sentence has more than 100
    whitespace-separated tokens or when parsing raises.
    """
    # ``convert_mytree`` is expected to count nodes into this module global
    # (NOTE(review): inferred from how the counter is read back - confirm).
    global numnodes
    doc1sents = self.sent_detector.tokenize(doc1.strip())
    doc2sents = self.sent_detector.tokenize(doc2.strip())

    # Unusually long sentences are not parsed at all.
    for sentences in (doc1sents, doc2sents):
        if any(len(sentence.split()) > 100 for sentence in sentences):
            return "NA"

    # Parser errors can happen e.g. when a sentence contains an unusually
    # long word.
    try:
        doc1parsed = self.parser.raw_parse_sents(doc1sents)
        doc2parsed = self.parser.raw_parse_sents(doc2sents)
    except Exception as e:
        sys.stderr.write(str(e))
        return "NA"

    # Keep the first parse of every sentence.
    doc1parsed = [list(parses)[0] for parses in list(doc1parsed)]
    doc2parsed = [list(parses)[0] for parses in list(doc2parsed)]

    costMatrix = []
    for parse1 in doc1parsed:
        numnodes = 0
        sentencedoc1 = ParentedTree.convert(parse1)
        tempnode = Node(sentencedoc1.root().label())
        new_sentencedoc1 = self.convert_mytree(sentencedoc1, tempnode)
        sen1nodes = numnodes

        cost_row = []
        for parse2 in doc2parsed:
            # Float counter so the normalisation below is a true division.
            numnodes = 0.0
            sentencedoc2 = ParentedTree.convert(parse2)
            tempnode = Node(sentencedoc2.root().label())
            new_sentencedoc2 = self.convert_mytree(sentencedoc2, tempnode)
            ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
            cost_row.append(ED / (numnodes + sen1nodes))
        costMatrix.append(cost_row)
    costMatrix = np.array(costMatrix)

    if average == True:
        return 1 - np.mean(costMatrix)

    indexes = su.linear_assignment(costMatrix)
    total = 0
    rowMarked = [0] * len(doc1parsed)
    colMarked = [0] * len(doc2parsed)
    for row, column in indexes:
        total += costMatrix[row][column]
        rowMarked[row] = 1
        colMarked[column] = 1

    # Sentences left without a partner are charged their cheapest match.
    for k, marked in enumerate(rowMarked):
        if marked == 0:
            total += np.min(costMatrix[k])
    for c, marked in enumerate(colMarked):
        if marked == 0:
            total += np.min(costMatrix[:, c])

    maxlengraph = max(len(doc1parsed), len(doc2parsed))
    return 1 - (total / maxlengraph)
def test_exact_match():
    """Exact string search returns every subtree whose leaves spell the query."""
    tree = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN cat)) (VP bit) (NP (DT a) (NN cat)))')

    # (query, expected number of hits, expected first hit)
    cases = [
        ('cat', 2, '(NN cat)'),
        ('a cat', 1, '(NP (DT a) (NN cat))'),
    ]
    for query, expected_count, expected_first in cases:
        node = search_by_exact_string_matching(tree, query)
        assert_equal(len(node), expected_count)
        assert_equal(node[0], ParentedTree.fromstring(expected_first))
def __init__(self, nlp_sent):
    """
    Build the sentence representation from one parsed sentence.

    :param nlp_sent: sentence extracted from parse from stanford corenlp
                     parser; must provide the keys
                     "tokens", "parse" and "enhancedPlusPlusDependencies"
    """
    tokens = nlp_sent['tokens']
    # Keyed by surface form, so a repeated word keeps only its last token.
    self.raw_dict = {token['word']: token for token in tokens}

    const_parse = ParentedTree.fromstring(nlp_sent['parse'])
    self.clause_trees = parse_to_clauses(const_parse)

    # dependencies: dependent word -> (relation, governor word); unknown
    # words fall back to (None, None).
    deps = nlp_sent['enhancedPlusPlusDependencies']
    dep_dict = collections.defaultdict(lambda: (None, None))
    try:
        dep_dict.update({
            dep['dependentGloss']: (dep['dep'], dep['governorGloss'])
            for dep in deps
        })
    except Exception:
        # Malformed dependency data is tolerated on purpose (every word
        # then gets the default), but interpreter-level signals such as
        # KeyboardInterrupt are no longer swallowed.
        pass

    # create the word list for this sentence
    self.word_list = self.make_words(tokens, dep_dict)
    self.word_dict = dict(
        zip([token['word'] for token in tokens], self.word_list))
    self.clauses = self.integrate_tokens_to_clauses()
def get_candidates(treestring: str, verb_idx: int) -> List[List[str]]:
    """Collect multi-token candidate spans around a predicate.

    Starting from the predicate's non-terminal node and walking up through
    its ancestors, the leaves of every sister constituent are collected.
    Coordinating conjunctions (``CC``) are skipped, the immediate children
    of a ``PP`` sister are collected in addition to the PP itself, and
    punctuation sisters contribute nothing.  Single-token candidates are
    dropped from the result.
    """
    tree = ParentedTree.fromstring(treestring)
    punctuation_labels = (".", "``", ",", ":")

    collected = []
    # Designate the predicate as the current node.
    node = get_verbs_non_terminal_node(tree, verb_idx=verb_idx)
    while node is not None:
        # Sisters are the constituents attached at the same level.
        for sister in get_sisters(node):
            label = sister.label()
            if label == "CC":
                # Sisters coordinated with the predicate are not candidates.
                continue
            if label == "PP":
                collected.extend(
                    child.leaves() for child in get_children(sister))
            if label not in punctuation_labels:
                collected.append(sister.leaves())
        node = node.parent()

    # Single tokens always respect the constraint, so they are not needed.
    return [cand for cand in collected if len(cand) != 1]
def getConsituentTreeDistribution(core_nlp_files):
    """Count grammar productions per file and over all files.

    :param core_nlp_files: iterable of ``(path, name)`` pairs.  Each file
        must hold exactly one line containing a Python literal dictionary
        whose ``SENTENCES`` entry lists sentences with a ``PARSE_TREE``
        bracketed parse.
    :return: ``(production_dict_for_files, diff_productions)`` where the
        first maps each file name (``'_corenlp1000.txt'`` replaced by
        ``'.txt'``) to its production counts and the second holds the
        counts accumulated over all files.  Counts are floats.
    :raises AssertionError: if a file does not contain exactly one line.
    """
    import ast

    diff_productions = dict()
    production_dict_for_files = dict()
    for genre_file_path, genre_file_name in core_nlp_files:
        production_dict = dict()
        with open(genre_file_path) as f:
            lines = f.readlines()
        assert len(lines) == 1

        # The line used to be run through ``exec('dictionary=' + line)``.
        # That executes arbitrary code from the file and, on Python 3,
        # cannot rebind a function-local variable, leaving ``dictionary``
        # empty.  Parsing the literal directly avoids both problems.
        dictionary = ast.literal_eval(lines[0].strip())

        for sent in dictionary[SENTENCES]:
            t = ParentedTree.fromstring(sent[PARSE_TREE])
            for prod in t.productions():
                diff_productions[prod] = diff_productions.get(prod, 0.0) + 1.0
                production_dict[prod] = production_dict.get(prod, 0.0) + 1.0

        file_key = genre_file_name.replace('_corenlp1000.txt', '.txt')
        production_dict_for_files[file_key] = production_dict
    return production_dict_for_files, diff_productions
def disfile2tree(dis_filepath):
    """Convert a *.dis file into a ParentedTree (NLTK) instance.

    The file content is stripped, passed through
    ``fix_rst_treebank_tree_str`` and ``convert_parens_in_rst_tree_str``
    (in that order) and then parsed as a bracketed tree string.
    """
    with open(dis_filepath) as dis_file:
        raw_text = dis_file.read()
    fixed = fix_rst_treebank_tree_str(raw_text.strip())
    return ParentedTree.fromstring(convert_parens_in_rst_tree_str(fixed))
def get_triples(self, sentence):
    """Return the ``(subject, predicate, object)`` triple of a sentence.

    The sentence is parsed, the first parse is converted to a
    ``ParentedTree`` and handed to ``find_subject``, ``find_predicate`` and
    ``find_object`` in that order.
    """
    parses = list(self.parser.raw_parse(sentence))
    tree = ParentedTree.convert(parses[0])
    return (
        self.find_subject(tree),
        self.find_predicate(tree),
        self.find_object(tree),
    )
def create_tree(tree):
    """Rebuild the children of ``tree`` as a list of ``ParentedTree`` nodes.

    A child that contains further subtrees is rebuilt recursively under its
    own label.  A child without subtrees becomes a node holding a single
    ``(decoded word, decoded tag)`` pair; that node is labelled with the
    label of the child's parent, or ``None`` when there is no parent or the
    parent is ``S``/``ROOT``.
    """
    nodes = []
    for n in tree:
        proper_subtrees = list(n.subtrees(filter=lambda k: k != n))
        if len(proper_subtrees) > 0:
            subnodes = create_tree(n)
            nodes.append(ParentedTree(n.label(), subnodes))
            continue

        parent = n.parent()
        parent_label = None
        if parent is not None and parent.label() not in ('S', 'ROOT'):
            parent_label = parent.label()

        word_and_tag = (self.__decode_(n[0]), self.__decode_(n.label()))
        nodes.append(ParentedTree(parent_label, [word_and_tag]))
    return nodes
def extract_parse_actions(tree):
    """
    Extract a list of ``ShiftReduceAction`` objects for the given tree.

    An empty root label is first replaced by ``"ROOT"``; any other root
    label fails the assertion.

    Parameters
    ----------
    tree : nltk.tree.ParentedTree
        The RST tree from which to extract the actions.

    Returns
    -------
    actseq : list
        List of ``ShiftReduceAction`` objects extracted from the tree.
    """
    if tree.label() == '':
        tree.set_label("ROOT")
    assert tree.label() == "ROOT"

    # The constituent stack starts with a placeholder tree.
    placeholder = ParentedTree.fromstring("(DUMMY0 (DUMMY1 DUMMY3))")
    collected_actions = []
    _extract_parse_actions_helper(tree, [], [placeholder], collected_actions)
    return _merge_constituent_end_shifts(collected_actions)
def test_node_printing(self):
    '''Test that the tgrep print operator ' is properly ignored.'''
    tree = ParentedTree.fromstring('(S (n x) (N x))')
    # Prefixing a pattern with the print operator must not change matches.
    for pattern in ('N', '/[Nn]/'):
        without_operator = list(tgrep.tgrep_positions(pattern, [tree]))
        with_operator = list(tgrep.tgrep_positions('\'' + pattern, [tree]))
        self.assertEqual(without_operator, with_operator)
def get_modparse(sentence):
    """Return ``(parsetree, modparsetree)`` for a sentence.

    The stored parse is used when the lookup succeeds.  Otherwise the
    sentence is parsed and modified afresh, the variant containing a
    ``LANDMARK-PHRASE`` is preferred (falling back to the first variant),
    and the result is stored on a best-effort basis.

    Raises ``ParseError`` when parsing yields nothing or when the modified
    parse contains no landmark phrase.
    """
    sp_db = SentenceParse.get_sentence_parse(sentence)
    try:
        stored = sp_db.all()[0]
        parsetree = stored.original_parse
        modparsetree = stored.modified_parse
    except:
        # Any failure of the lookup leads to a fresh parse.
        print("parse.py: 103: " + sentence)
        parses = parse_sentences([sentence])
        if len(parses) == 0:
            raise ParseError(printcolors.WARNING + ('ParseError: a sentence was empty'))

        modparses = modify_parses(parses)
        for i, chunk in enumerate(modparses[:]):
            # Prefer the first variant that contains a landmark phrase.
            for j, modparse in enumerate(chunk):
                if 'LANDMARK-PHRASE' in modparse:
                    modparses[i], parses[i] = modparse, parses[i][j]
                    break
            # Still a list: nothing matched, so take the first variant.
            if isinstance(modparses[i], list):
                modparses[i], parses[i] = modparses[i][0], parses[i][0]

        parsetree, modparsetree = parses[0], modparses[0]
        try:
            SentenceParse.add_sentence_parse(sentence, parsetree, modparsetree)
        except Exception as e:
            print(e)

    if count_lmk_phrases(ParentedTree.parse(modparsetree)) < 1:
        raise ParseError(printcolors.WARNING + ('ParseError: Parse contained no Landmark phrase.\nSentence: %s\nParse: %s\nModparse: %s' % (sentence,parsetree,modparsetree)))

    return parsetree, modparsetree
def get_tree_part(sentence, part):
    """Fetch the top-level NP or VP of a sentence from the tregex service.

    :param sentence: raw sentence text, sent as the request body.
    :param part: ``"NP"`` selects the noun-phrase pattern; any other value
        selects the verb-phrase pattern.
    :return: the first match, parsed into a ``ParentedTree``.
    :raises Exception: when the request or JSON decoding fails, or when the
        response does not contain a usable match.  The underlying error is
        attached as ``__cause__``.
    """
    url = "http://corenlp.run:80/tregex"
    request_paramsN = {
        "pattern":
        "(NP[$VP]>S)|(NP[$VP]>S\\n)|(NP\\n[$VP]>S)|(NP\\n[$VP]>S\\n)|(NP[$VP]>SQ)"
    }
    request_paramsV = {
        "pattern":
        "(VP[$NP]>S)|(VP[$NP]>S\\n)|(VP\\n[$NP]>S)|(VP\\n[$NP]>S\\n)|(VP[$NP]>SQ)"
    }
    select = request_paramsN if part == "NP" else request_paramsV

    try:
        response = requests.post(url, data=sentence, params=select)
        # Named ``payload`` so the standard ``json`` module is not shadowed.
        payload = response.json()
        if print_switch:
            print(payload)
    except Exception as err:
        print("Cannot connect to coreNLP server. Try again later.")
        raise Exception("Cannot connect to coreNLP server") from err

    try:
        string = str(dict(payload['sentences'][0])['0']['match'])
        return ParentedTree.fromstring(string)
    except Exception as err:
        print("Parsing issue in sentence:", sentence)
        print("Recieved parse:", nlp.parse(sentence))
        raise Exception("Could not extract a match from the response") from err
def test_rel_precedence(self):
    """
    Test matching nodes based on precedence relations.
    """
    tree = ParentedTree.fromstring("(S (NP (NP (PP x)) (NP (AP x)))"
                                   " (VP (AP (X (PP x)) (Y (AP x))))"
                                   " (NP (RC (NP (AP x)))))")
    # (pattern, expected tree positions), checked in this order.
    expectations = [
        ("* . X", [[(0, ), (0, 1), (0, 1, 0)]]),
        ("* . Y", [[(1, 0, 0), (1, 0, 0, 0)]]),
        ("* .. X", [[(0, ), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]]),
        ("* .. Y", [[(0, ), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
                     (1, 0, 0), (1, 0, 0, 0)]]),
        ("* , X", [[(1, 0, 1), (1, 0, 1, 0)]]),
        ("* , Y", [[(2, ), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
        ("* ,, X", [[(1, 0, 1), (1, 0, 1, 0), (2, ), (2, 0), (2, 0, 0),
                     (2, 0, 0, 0)]]),
        ("* ,, Y", [[(2, ), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
    ]
    for pattern, expected in expectations:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])),
                         expected)
def test_bad_operator(self):
    """
    Test error handling of undefined tgrep operators.
    """
    tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
    # The search object is created outside the checked region; only
    # consuming it is expected to raise.
    positions = tgrep.tgrep_positions("* >>> S", [tree])
    with self.assertRaises(tgrep.TgrepException):
        list(positions)
def parse(sentence, use_cache=True, parser='stanford'):
    """Parse a sentence into a list of ``ParentedTree`` objects.

    ``parser`` selects the backend (``"stanford"`` or ``"malt"``); any
    other value yields an empty list.  With ``use_cache`` the bracketed
    parse lines are looked up in, and written back to, the cache under a
    parser-specific key.
    """
    cache_key = "parse_trees_{0}".format(parser)

    valid_lines = None
    if use_cache:
        cached = cache_get(cache_key, sentence)
        if cached:
            valid_lines = cached

    if valid_lines is None:
        if parser == "stanford":
            response = parse_stanford(sentence, use_cache=use_cache)
        elif parser == "malt":
            response = parse_malt(sentence, use_cache=use_cache)
        else:
            return []

        # Throw away everything in the parser's response that is not a
        # bracketed line.  This can hide parser errors.
        valid_lines = []
        for line in response.split("\n"):
            if len(line) > 2 and line.startswith("(") and line.endswith(")"):
                valid_lines.append(line)

        if use_cache:
            cache_set(cache_key, sentence, valid_lines)

    return [ParentedTree.parse(line) for line in valid_lines]
def add_indices_to_terminals(treestring):
    """Suffix terminals of a bracketed tree with their leaf index.

    For leaf number ``idx`` the first child of the leaf's parent node gets
    ``"_<idx>"`` appended.  The modified tree is returned as a string.
    """
    tree = ParentedTree.fromstring(treestring)
    leaf_count = len(tree.leaves())
    for idx in range(leaf_count):
        leaf_position = tree.leaf_treeposition(idx)
        # Dropping the last step of the leaf position addresses its parent.
        parent_node = tree[leaf_position[:-1]]
        parent_node[0] = parent_node[0] + "_" + str(idx)
    return str(tree)
def get_example(self,  # type: ignore
                tree: ParentedTree,
                ancestor: str):
    """
    Given a ParentedTree, extract the labels of the parents, grandparents,
    or greatgrandparents.

    Parameters
    ----------
    tree: ParentedTree
        ParentedTree to extract the example from.
    ancestor: str
        Whether the labels should be the parent, grandparent, or
        great-grandparent of each leaf.

    Returns
    -------
    A ``(tokens, labels)`` pair: the leaves of ``tree`` and the labels
    collected from its ``ParentedTree`` children.
    """
    tokens = tree.leaves()
    labels: List[str] = []
    for child in tree:
        if isinstance(child, ParentedTree):
            if len(list(child.subtrees())) > 1:
                # Honour the caller's ``ancestor``; it used to be ignored in
                # favour of ``self._ancestor``.
                labels.extend(self.get_example(child, ancestor)[1])
            else:
                labels.append(self._get_label(child, ancestor))
    return tokens, labels
def gen_instances(dataset, parses, model):
    """Build feature instances and labels for candidate split points.

    Every character of a sentence that matches one of the characters in
    ``model.candidate`` is a candidate.  Its features come from
    ``model.extract_features`` and its label is 1 when the candidate's
    offset coincides with the first or last character offset of an EDU in
    that sentence, otherwise 0.

    Returns ``(instances, labels)`` as two parallel lists.
    """
    instances, labels = [], []
    candidate_re = re.compile("[%s]" % model.candidate)

    for paragraph in chain(*dataset):
        root = paragraph.root_relation()
        if not root:
            continue

        for sentence in list(root.iterfind(filter=node_type_filter(Sentence))):
            # Offsets on both sides of every split point.
            segments = set()
            offset = 0
            for edu in list(sentence.iterfind(filter=node_type_filter(EDU))):
                edu_length = len(edu.text)
                segments.add(offset)
                segments.add(offset + edu_length - 1)
                offset += edu_length

            # Convert the tree into a parented tree for feature extraction.
            parse = ParentedTree.fromstring(parses[sentence.sid].pformat())
            for match in candidate_re.finditer(sentence.text):
                candidate = match.start()
                instances.append(model.extract_features(candidate, parse))
                labels.append(1 if candidate in segments else 0)

    return instances, labels
def test_reconstruct_training_examples():
    """Check extracted actions for entire training data."""
    # Go through the training data and make sure that the actions extracted
    # from the trees can be used to reconstruct those trees from a list of
    # EDUs.

    # Skip the test when the training data file is not available.
    file_path = Path('rst_discourse_tb_edus_TRAINING_TRAIN.json')
    if not file_path.exists():
        raise SkipTest("training data JSON file not found")

    with open(file_path) as train_data_file:
        documents = json.load(train_data_file)

    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)

    for doc_dict in documents:
        original_tree = ParentedTree.fromstring(doc_dict['rst_tree'])
        gold_actions = extract_parse_actions(original_tree)

        # Replay the gold actions through the parser and compare the first
        # result with the tree they were extracted from.
        first_result = next(rst_parser.parse(doc_dict,
                                             gold_actions=gold_actions,
                                             make_features=False))
        eq_(first_result['tree'], original_tree)
def get_all_parts_of_ctree(self, cparse, clabeldict, learn_features):
    """Index a constituency parse and precompute constituent information.

    A copy of ``cparse`` is stored on ``self.cparse``.  Its leaves are
    replaced by consecutive ids starting at 0; every subtree label is then
    registered with ``clabeldict`` and replaced by the next free id, with
    the id-to-label mapping kept in ``self.idxlabelmap``.  Constituent
    spans are computed afterwards.

    When ``learn_features`` is true, the pre-terminal nodes, the paths to
    the root and the lowest common ancestor of every leaf pair ``(j, k)``
    with ``j <= k`` are computed as well.

    Raises ``Exception`` when the parse and ``self.tokens`` differ in
    length, or when the pre-terminals are not in leaf order.
    """
    self.cparse = ParentedTree.fromstring(str(cparse))
    if len(cparse.leaves()) != len(self.tokens):
        raise Exception("sentences do not line up!")

    # Replace leaves with node-ids.
    leaf_positions = list(self.cparse.treepositions('leaves'))
    for leaf_id, position in enumerate(leaf_positions):
        self.cparse[position] = leaf_id

    # Replace internal nodes with node-ids, continuing after the leaf ids.
    next_id = len(leaf_positions)
    for subtree in self.cparse.subtrees():
        self.idxlabelmap[next_id] = clabeldict.addstr(subtree.label())
        subtree.set_label(next_id)
        next_id += 1

    self.get_all_constit_spans()

    if not learn_features:
        return

    # Get stuff for constit features.
    self.leafnodes = list(self.cparse.subtrees(lambda t: t.height() == 2))
    for expected_id, leafnode in enumerate(self.leafnodes):
        if leafnode[0] != expected_id:
            raise Exception("order mixup!")

    self.get_cpath_to_root()

    # Get all lowest common ancestors.
    for j in xrange(len(self.leafnodes)):
        for k in xrange(j, len(self.leafnodes)):
            lca, lcaid = self.get_lca(self.leafnodes[j], self.leafnodes[k])
            self.lca[(j, k)] = (lca, lcaid)
def test_rel_precedence(self):
    '''
    Test matching nodes based on precedence relations.
    '''
    tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
                                   ' (VP (AP (X (PP x)) (Y (AP x))))'
                                   ' (NP (RC (NP (AP x)))))')

    def positions(pattern):
        # All matches of ``pattern`` in the single test tree.
        return list(tgrep.tgrep_positions(pattern, [tree]))

    # Immediate precedence.
    self.assertEqual(positions('* . X'), [[(0, ), (0, 1), (0, 1, 0)]])
    self.assertEqual(positions('* . Y'), [[(1, 0, 0), (1, 0, 0, 0)]])
    # Precedence.
    self.assertEqual(
        positions('* .. X'),
        [[(0, ), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]],
    )
    self.assertEqual(
        positions('* .. Y'),
        [[(0, ), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0),
          (1, 0, 0, 0)]],
    )
    # Immediately following.
    self.assertEqual(positions('* , X'), [[(1, 0, 1), (1, 0, 1, 0)]])
    self.assertEqual(
        positions('* , Y'),
        [[(2, ), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
    )
    # Following.
    self.assertEqual(
        positions('* ,, X'),
        [[(1, 0, 1), (1, 0, 1, 0), (2, ), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
    )
    self.assertEqual(
        positions('* ,, Y'),
        [[(2, ), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
    )
def get_sentence_posteriors(sentence, iterations=1, extra_meaning=None):
    """Estimate normalised probabilities of sampled meanings for a sentence.

    The sentence's modified parse is loaded, ``iterations`` meanings are
    sampled with ``get_meaning`` and each distinct meaning is scored with
    the product of the tree probabilities returned by ``get_tree_probs``.

    :param sentence: sentence to analyse.
    :param iterations: number of meanings to sample.
    :param extra_meaning: optional ``(lmk, rel)`` pair that is scored in
        addition to the sampled meanings.
    :return: the items of a dict mapping each meaning string to its share
        of the total probability mass.
    """
    meaning_probs = {}

    # parse sentence with charniak and apply surgeries
    print('parsing ...')
    modparse = get_modparse(sentence)
    t = ParentedTree.parse(modparse)
    print('\n%s\n' % t.pprint())
    num_ancestors = count_lmk_phrases(t) - 1

    for _ in range(iterations):
        (lmk, _, _), (rel, _, _) = get_meaning(num_ancestors=num_ancestors)
        meaning = m2s(lmk, rel)
        if meaning not in meaning_probs:
            ps = get_tree_probs(t, lmk, rel)[0]
            meaning_probs[meaning] = np.prod(ps)
        print('.')

    if extra_meaning:
        meaning = m2s(*extra_meaning)
        if meaning not in meaning_probs:
            # Score the extra meaning itself.  This used to reuse the
            # ``lmk``/``rel`` left over from the last sampled meaning, which
            # scored the wrong meaning and was undefined for
            # ``iterations == 0``.
            ps = get_tree_probs(t, *extra_meaning)[0]
            meaning_probs[meaning] = np.prod(ps)
        print('.')

    summ = sum(meaning_probs.values())
    for key in meaning_probs:
        meaning_probs[key] /= summ
    return meaning_probs.items()
def __init__(self,
             corpus_path,
             pos_path=None,
             parse_path=None,
             dep_path=None):
    """Load the token corpus and the optional annotation corpora.

    ``corpus_path`` is always loaded; the POS, parse and dependency
    corpora are loaded only when their path is given and are empty dicts
    otherwise.  Parse entries are turned into ``ParentedTree`` objects.
    EDU offsets are derived for every corpus entry, and each dependency
    entry must have as many items as the matching EDU entry.
    """
    def to_parented_tree(entry):
        return ParentedTree.fromstring(entry)

    self.corpus = load_corpus(corpus_path)

    self.pos_corpus = {}
    if pos_path is not None:
        self.pos_corpus = load_corpus(pos_path)

    self.parse_corpus = {}
    if parse_path is not None:
        self.parse_corpus = load_corpus(parse_path, to_parented_tree)

    self.edu_corpus = {}
    for key, tokens in self.corpus.items():
        self.edu_corpus[key] = argument.get_EDU_offsets(tokens)

    self.dep_corpus = {}
    if dep_path is not None:
        self.dep_corpus = load_corpus(
            dep_path,
            postprocess_dep_entry,
            preprocess_dep_entry,
        )

    # Dependency entries and EDU entries must line up one to one.
    for key, dep_entry in self.dep_corpus.items():
        assert (len(dep_entry) == len(self.edu_corpus[key]))
def get_pp_old(text):
    """Collect the prepositions that head the PPs of a text.

    Returns a dict whose keys are the collected phrases (every value is
    ``True``).  For a PP preceded by a verb or adjective sibling the key
    is ``"<base form> <preposition>"``; for a PP preceded by a noun
    sibling the key is the preposition alone.  PPs without a left sibling,
    or with a sibling of another kind, contribute nothing.
    """
    phrases = {}
    for structure in parser.parse(nltk.word_tokenize(text)):
        tree = ParentedTree.convert(structure)
        for subtree in tree.subtrees():
            if subtree.label() != "PP":
                continue

            preposition = subtree.leaves()[0]
            left_sibling = subtree.left_sibling()
            if left_sibling is None:
                continue

            sibling_label = left_sibling.label()
            if is_noun(sibling_label):
                phrases[preposition] = True
            elif is_verb(sibling_label):
                sibling_text = " ".join(left_sibling.leaves())
                verb = convert_to_base_form(sibling_text, 'v')
                phrases[verb + " " + preposition] = True
            elif is_adj(sibling_label):
                sibling_text = " ".join(left_sibling.leaves())
                adj = convert_to_base_form(sibling_text, 'a')
                phrases[adj + " " + preposition] = True
    return phrases
def _parse_trees(self, file): with open(file, 'r') as f: lines = ''.join(map(str.strip, f.readlines())) s_expressions = self._tokenizer.tokenize(lines) trees = [ParentedTree.fromstring(s_expr) for s_expr in s_expressions] return trees
def test_bad_operator(self):
    '''
    Test error handling of undefined tgrep operators.
    '''
    tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
    undefined_operator_search = tgrep.tgrep_positions('* >>> S', [tree])
    # Consuming the search results is what must raise.
    self.assertRaises(tgrep.TgrepException, list, undefined_operator_search)
def findSentencePTreeToken(sentence, keyword):
    """Find the parent nodes of all leaves matching a keyword.

    The text is parsed with ``proc.parse_doc``; for every leaf of every
    sentence whose ``_stem_`` equals ``_lemma_(keyword)``, the leaf's
    parent node is collected.  The matches of all sentences are returned
    in one list.
    """
    import nltk
    from nltk.tree import ParentedTree

    stemmed = _lemma_(keyword)
    parsed_doc = proc.parse_doc(sentence)

    matches = []
    for sentence_info in parsed_doc['sentences']:
        ptree = ParentedTree.fromstring(sentence_info['parse'])
        for leaf_index in range(len(ptree.leaves())):
            leaf_position = ptree.leaf_treeposition(leaf_index)
            if _stem_(ptree[leaf_position]) == stemmed:
                # Dropping the last step addresses the leaf's parent.
                matches.append(ptree[leaf_position[:-1]])
    return matches
def conll2tree(arr):
    """Build a tree from CoNLL rows.

    Row format: idx, word, _, pos, pos, _, head, _, _, _ - only column 0
    (token index) and column 6 (head index) are read.  One ``ParentedTree``
    per head, labelled with the head index and holding its dependents'
    indices, is created and passed to ``updateTree`` together with head 0.
    """
    # dependent index -> head index
    dep2headHash = {int(wArr[0]): int(wArr[6]) for wArr in arr}

    # head index -> list of dependent indices
    head2depsHash = {}
    for dep, head in dep2headHash.items():
        head2depsHash.setdefault(head, []).append(dep)

    # head index -> tree over its direct dependents
    treeheadHash = {}
    for head, deps in head2depsHash.items():
        treeheadHash[head] = ParentedTree(head, deps)

    return updateTree(treeheadHash, 0)
def parse_sentences(self, filename, num_sentences):
    """Parses each one-line sentence into a syntax tree

    :param filename: path of a text file with one sentence per line.
    :param num_sentences: number of leading sentences to process, or the
        string ``'all'`` for the whole file.
    """
    # ``None`` as the slice bound keeps every line.  The previous ``-1``
    # sentinel for 'all' silently dropped the last sentence of the file.
    limit = None if num_sentences == 'all' else num_sentences

    # The context manager closes the file, which was previously left open.
    with open(filename, 'r') as f:
        sentences = f.readlines()[:limit]

    count = 0
    for sentence in sentences:
        if count % 10 == 0:
            print("Number of sentences trained: ", count)

        # Get possible parse trees
        trees = self.parser.raw_parse(sentence.lower())
        for tree in trees:
            self.nonterminal_counts['ROOT'] += 1
            tokenized_sentence = self.tokenize_sentence(sentence)

            # Only extract rules from sentences with greater than 8 tokens,
            # to avoid adding rules that generate short, ungrammatical
            # sentences
            if len(tokenized_sentence) > 8:
                self.extract_rules(tree)

            # Convert the tree into a ParentedTree,
            # which is an NLTK tree that keeps pointers to each node's parent
            ptree = ParentedTree.convert(tree)

            # Calculate the bigram counts for this sentence
            self.get_bigram(ptree, tokenized_sentence)
        count += 1
def check(sent):
    """Extract subject, predicate and object from a sentence.

    The sentence is parsed with a fresh ``StanfordParser`` and the first
    parse is searched with ``find_subject``, ``find_predicate`` and
    ``find_object``.  A part whose extraction fails is returned as an
    empty list.

    :return: the tuple ``(subj, pred, obj)``.
    """
    parser = StanfordParser()
    # Parse the example sentence and keep the first parse.
    t = list(parser.raw_parse(sent))[0]
    t = ParentedTree.convert(t)

    # Each extraction stays best-effort, but ``except Exception`` no longer
    # swallows KeyboardInterrupt/SystemExit as the bare ``except`` did.
    try:
        subj = find_subject(t)
    except Exception:
        subj = []
    try:
        pred = find_predicate(t)
    except Exception:
        pred = []
    try:
        obj = find_object(t)
    except Exception:
        obj = []
    return subj, pred, obj
def parse_text(self, text):
    """Annotate ``text`` with the server and yield one triple per sentence.

    Each yielded triple is ``(tree, deps, raw_sentence)``: the constituency
    parse as a ``ParentedTree`` (leaves lower-cased, node labels cut at the
    first ``-``), the result of ``make_deps`` on the fourth dependency
    block, and the result of ``create_raw_sentence`` on the tokens.

    An HTTP error status raises via ``response.raise_for_status()``.
    """
    default_properties = {
        'outputFormat': 'xml',
        'annotators': 'tokenize,pos,lemma,ssplit,parse,depparse'
    }
    request_params = {'properties': json.dumps(default_properties)}
    response = self.session.post(self.url,
                                 params=request_params,
                                 data=text.encode(self.encoding),
                                 timeout=60)
    response.raise_for_status()

    parsed_data = xml.parse(response.text)
    sentences = parsed_data['root']['document']['sentences']['sentence']
    if not isinstance(sentences, list):
        # Normalise a lone sentence entry to a one-element list.
        sentences = [sentences]

    for sentence in sentences:
        tree = ParentedTree.fromstring(
            sentence['parse'],
            read_leaf=lambda leaf: leaf.lower(),
            read_node=lambda node: node.split("-")[0])
        deps = self.make_deps(sentence['dependencies'][3]['dep'])
        raw_sentence = self.create_raw_sentence(sentence['tokens']['token'])
        yield (tree, deps, raw_sentence)
def test_use_macros(self):
    '''
    Test defining and using tgrep2 macros.
    '''
    tree = ParentedTree.fromstring(
        '(VP (VB sold) (NP (DET the) (NN heiress)) '
        '(NP (NN deed) (PREP to) (NP (DET the) (NN school) (NN house))))'
    )
    macro_definitions = '@ NP /^NP/;\n@ NN /^NN/;\n'

    matches = list(
        tgrep.tgrep_positions(macro_definitions + '@NP !< @NP !$.. @NN',
                              [tree]))
    self.assertEqual(matches, [[(1,), (2, 2)]])

    # use undefined macro @CNP
    undefined_macro_search = tgrep.tgrep_positions(
        macro_definitions + '@CNP !< @NP !$.. @NN', [tree])
    self.assertRaises(tgrep.TgrepException, list, undefined_macro_search)
def test_node_nocase(self):
    '''
    Test selecting nodes using case insensitive node names.
    '''
    tree = ParentedTree.fromstring('(S (n x) (N x))')
    # (pattern, expected positions): quoted name first, then the same name
    # with the case-insensitive prefix.
    cases = [
        ('"N"', [[(1,)]]),
        ('i@"N"', [[(0,), (1,)]]),
    ]
    for pattern, expected in cases:
        self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])),
                         expected)
def _is_leaf(tree: ParentedTree):
    """
    Tell whether the given tree counts as a leaf, i.e. has height 2.

    :param tree: a ParentedTree instance
    :return: true if it is a leaf
    """
    height = tree.height()
    return height == 2
def add_tree(self, datum):
    """Binarize a tree, write its parent pointers and words, and store it.

    ``datum["raw_tree"]`` is parsed, put into Chomsky normal form and has
    its unary chains collapsed.  For every non-root subtree the index of
    its parent is written to ``self.parents_file``; the first child of
    every subtree is written to ``self.sents_file`` when it is a string.
    ``datum`` is appended to ``self.trees``.

    Returns the position of ``datum`` in ``self.trees`` (its ID).
    """
    # parse tree and binarize
    tree = Tree.fromstring(datum["raw_tree"])
    tree.chomsky_normal_form()
    tree.collapse_unary(collapsePOS=True)
    tree = ParentedTree.convert(tree)

    # assign indices to subtrees, in traversal order
    indices = {}
    for number, subtree in enumerate(tree.subtrees()):
        indices[subtree.treeposition()] = number

    # generate parent pointers and labels
    # (labels = one instance of sent in sents by treelstm terminology)
    parents = [0] * (len(indices) - 1)
    labels = []
    slot = 0
    for subtree in tree.subtrees():
        parent = subtree.parent()
        if parent is not None:
            parents[slot] = indices[parent.treeposition()]
            slot += 1
        if type(subtree[0]) is str or type(subtree[0]) is unicode:
            labels.append(subtree[0])

    self.parents_file.write(" ".join(map(str, parents)) + "\n")
    self.sents_file.write(" ".join(labels) + "\n")
    self.trees.append(datum)
    return len(self.trees) - 1  # ID
def gen(files):
    """Yield one row per file.

    Each file is read as a bracketed tree; the row is built by ``makeRow``
    from the local context of the tree's ``(word, tag)`` pairs and the file
    name.
    """
    for path in files:
        with open(path) as handle:
            tagged_words = ParentedTree.parse(handle.read()).pos()
            yield makeRow(getLocalContext(tagged_words), path)
def j_is_subject(feats):
    """Feature string telling whether the referring token is a subject.

    The sentence tree is looked up in ``TREES_DICTIONARY`` by article and
    ``sentence_ref``; the verdict of ``__is_subject__`` for ``token_ref`` is
    rendered as ``"j_is_subject=<value>"``.
    """
    article_trees = TREES_DICTIONARY[feats.article + ".raw"]
    ptree = ParentedTree.convert(article_trees[int(feats.sentence_ref)])
    parent = __get_parent_tree__(feats.token_ref, ptree)
    verdict = __is_subject__(ptree, feats.token_ref, parent, ptree)
    return "j_is_subject={}".format(verdict)
def _preprosess(root: ParentedTree):
    """
    Preprocess the lexical tree: clean the syntactic tags and replace each
    token value with its index number (starting at 1).

    The input tree is left untouched; a deep copy is modified.

    :param root: the root of the lexical tree
    :return: a tuple of a processed tree and a sequence of (tag, token)
    """
    root: ParentedTree = root.copy(deep=True)
    next_index = 1

    def _walk(tree: ParentedTree):
        nonlocal next_index
        # Keep only the part of a tag in front of the first '-'.
        label = tree.label()
        if '-' in label:
            tree.set_label(label.split('-')[0])

        if _is_leaf(tree):
            yield tree.label(), tree[0]  # (tag, token)
            # Replace the token with its index number.
            tree[0] = next_index
            next_index += 1
            return

        for subtree in tree:
            for pair in _walk(subtree):
                yield pair

    # i.e. [('NR', '上海'), ('NR', '浦东'), ('NN', '开发'), ('CC', '与'), ...]
    sequences = list(_walk(root))
    return root, sequences
def is_pred_nominal(feats):
    """Feature string telling whether the pair is a predicate nominal.

    Returns ``"is_pred_nominal=False"`` right away when the two mentions
    are in different sentences.  Otherwise the sentence tree is searched
    for the constituent of ``feats.token`` followed by a ``VP`` sibling
    whose first leaf is a copula and which has the maximal projection of
    ``feats.token_ref`` among its children.
    """
    if feats.sentence != feats.sentence_ref:
        return "is_pred_nominal={}".format(False)
    else:
        s_tree = ParentedTree.convert(TREES_DICTIONARY[feats.article+".raw"][int(feats.sentence)])
        NP_i = __get_parent_tree__(feats.token, s_tree)
        NP_j = __get_parent_tree__(feats.token_ref,s_tree)
        nominal= __get_max_projection__(s_tree,NP_j)
        copula_verbs = ["is","are","were","was","am"]
        def check_nominal_construction(tree):
            # Depth-first search that stops at the first match.
            found = False
            for t in tree:
                if found:
                    break
                elif isinstance(t, ParentedTree):
                    if t == NP_i:
                        brother = t.right_sibling()
                        # NOTE(review): ``.node`` is the label accessor of
                        # old NLTK releases - confirm the NLTK version in use.
                        if isinstance(brother,ParentedTree) and brother.node == "VP":
                            verb = brother.leaves()[0]
                            if verb in copula_verbs:
                                for subtree in brother:
                                    if subtree == nominal:
                                        found = True
                                        break
                    else:
                        # NOTE(review): assumed to pair with ``if t == NP_i``
                        # (recurse into subtrees that are not the mention
                        # itself) - verify against the original layout.
                        found = check_nominal_construction(t)
            return found
        return "is_pred_nominal={}".format(check_nominal_construction(s_tree))
def syntax_similarity_conversation(self, documents1, average=False):
    """Score the syntactic similarity of each document with the one after it.

    Every document is split into sentences with ``self.sent_detector`` and
    parsed with ``self.parser``; a document containing a sentence of more
    than 100 whitespace-separated words is recorded as ``"NA"`` instead.
    For each adjacent pair of parsed documents a cost matrix of tree edit
    distances (``simple_distance``) between their sentences is built, each
    entry divided by the sum of the two node counts read from the
    module-level ``numnodes``.

    :param documents1: sequence of document strings, in conversation order
    :param average: when true, return ``1 - mean(cost matrix)`` for the first
        comparable pair instead of the per-pair mapping
    :return: an ``OrderedDict`` mapping ``(d1, d1 + 1)`` to a similarity
        score, or a single number when ``average`` is true and a comparable
        pair exists
    """
    global numnodes
    documents1parsed = []
    for d1 in range(len(documents1)):
        # Progress trace: one document index per line on stderr.
        sys.stderr.write(str(d1)+"\n")
        # print documents1[d1]
        tempsents = (self.sent_detector.tokenize(documents1[d1].strip()))
        for s in tempsents:
            if len(s.split())>100:
                # Overlong sentence: mark the whole document as unusable.
                documents1parsed.append("NA")
                break
        else:
            # Runs only when no sentence triggered the break above.
            temp = list(self.parser.raw_parse_sents((tempsents)))
            for i in range(len(temp)):
                # Keep the first parse of each sentence.
                temp[i] = list(temp[i])[0]
                temp[i] = ParentedTree.convert(temp[i])
            documents1parsed.append(list(temp))
    results = OrderedDict()
    for d1 in range(len(documents1parsed)):
        d2 = d1+1
        if d2 == len(documents1parsed):
            # The last document has no successor to compare with.
            break
        if documents1parsed[d1] == "NA" or documents1parsed[d2]=="NA":
            continue
        costMatrix = []
        for i in range(len(documents1parsed[d1])):
            # presumably convert_mytree increments the global numnodes while
            # converting, hence the reset before each call -- TODO confirm
            numnodes = 0
            tempnode = Node(documents1parsed[d1][i].root().label())
            new_sentencedoc1 = self.convert_mytree(documents1parsed[d1][i],tempnode)
            temp_costMatrix = []
            sen1nodes = numnodes
            for j in range(len(documents1parsed[d2])):
                numnodes=0.0
                tempnode = Node(documents1parsed[d2][j].root().label())
                new_sentencedoc2 = self.convert_mytree(documents1parsed[d2][j],tempnode)
                ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
                # Normalise the edit distance by the two node counts.
                ED = ED / (numnodes + sen1nodes)
                temp_costMatrix.append(ED)
            costMatrix.append(temp_costMatrix)
        costMatrix = np.array(costMatrix)
        if average==True:
            # NOTE(review): this returns from inside the pair loop, so only
            # the first comparable pair is scored and a scalar is returned
            # rather than `results` -- confirm this is intended.
            return 1-np.mean(costMatrix)
        else:
            indexes = su.linear_assignment(costMatrix)
            total = 0
            rowMarked = [0] * len(documents1parsed[d1])
            colMarked = [0] * len(documents1parsed[d2])
            # Sum the cost of every assigned (row, column) pair.
            for row, column in indexes:
                total += costMatrix[row][column]
                rowMarked[row] = 1
                colMarked [column] = 1
            # Sentences left unassigned contribute their cheapest match.
            for k in range(len(rowMarked)):
                if rowMarked[k]==0:
                    total+= np.min(costMatrix[k])
            for c in range(len(colMarked)):
                if colMarked[c]==0:
                    total+= np.min(costMatrix[:,c])
            maxlengraph = max(len(documents1parsed[d1]),len(documents1parsed[d2]))
            results[(d1,d2)] = 1-total/maxlengraph  # similarity = 1 - mean cost over the longer document
    return results
def test_rel_precedence(self):
    '''
    Test matching nodes based on precedence relations.

    Each case pairs a tgrep pattern with the tree positions it is expected
    to match in the single test tree.
    '''
    tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
                                   ' (VP (AP (X (PP x)) (Y (AP x))))'
                                   ' (NP (RC (NP (AP x)))))')
    cases = [
        ('* . X', [(0,), (0, 1), (0, 1, 0)]),
        ('* . Y', [(1, 0, 0), (1, 0, 0, 0)]),
        ('* .. X', [(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]),
        ('* .. Y', [(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
                    (1, 0, 0), (1, 0, 0, 0)]),
        ('* , X', [(1, 0, 1), (1, 0, 1, 0)]),
        ('* , Y', [(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]),
        ('* ,, X', [(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0),
                    (2, 0, 0, 0)]),
        ('* ,, Y', [(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]),
    ]
    for pattern, positions in cases:
        found = list(tgrep.tgrep_positions(pattern, [tree]))
        # One result list per input tree, and only one tree is searched.
        self.assertEqual(found, [positions])
def apposition(feats):
    """Build the ``apposition`` feature string for a mention pair.

    Mentions in different sentences are never appositions. Otherwise the
    sentence tree is searched for a child of an ``NP`` that contains a word
    of ``feats.token_ref``, is immediately preceded by a ``","`` node, and
    whose comma is in turn preceded by a subtree containing a word of
    ``feats.token``.

    :param feats: object exposing ``article``, ``sentence``, ``sentence_ref``,
        ``token`` and ``token_ref`` (the last two are ``"_"``-joined words)
    :return: ``"apposition=True"`` or ``"apposition=False"``
    """
    if feats.sentence != feats.sentence_ref:
        return "apposition={}".format(False)

    article_trees = TREES_DICTIONARY[feats.article + ".raw"]
    ptree = ParentedTree.convert(article_trees[int(feats.sentence_ref)])
    token_ref = set(feats.token_ref.split("_"))
    token = set(feats.token.split("_"))

    def follows_comma_after_antecedent(candidate):
        # NOTE(review): `.node` is the NLTK 2 spelling of the label accessor;
        # NLTK 3 uses `.label()` -- confirm which version this module targets.
        comma = candidate.left_sibling()
        if not (isinstance(comma, ParentedTree) and comma.node == ","):
            return False
        antecedent = comma.left_sibling()
        if not isinstance(antecedent, ParentedTree):
            return False
        return bool(token & set(antecedent.leaves()))

    def is_j_apposition(curr_tree):
        for child in curr_tree:
            if not isinstance(child, ParentedTree):
                continue
            # A child holding a referent word directly under an NP is tested
            # in place; any other child is searched recursively.
            if token_ref & set(child.leaves()) and curr_tree.node == "NP":
                hit = follows_comma_after_antecedent(child)
            else:
                hit = is_j_apposition(child)
            if hit:
                return True
        return False

    return "apposition={}".format(is_j_apposition(ptree))
def get_right_sibling(tree, pos, ct):
    """Return the tokens that follow (and include) a connective.

    The connective is the leaf at index ``pos``. Three cases are handled:

    * its pre-terminal has a right sibling labelled ``'S'``: the leaves of
      that ``S`` are returned;
    * its pre-terminal has some other right sibling: the leaves of the
      pre-terminal's parent, from the connective onwards, are returned;
    * its pre-terminal has no right sibling: the leaves of the parent plus
      those of the parent's right sibling (when there is one), from the
      connective onwards, are returned.

    :param tree: NLTK tree of the sentence
    :param pos: index of the connective among the leaves of ``tree``
    :param ct: object whose ``token`` attribute is the connective's surface form
    :return: list of leaf tokens, or ``None`` when ``pos`` is not a leaf index
    :raises ValueError: if ``ct.token`` is not among the collected leaves
    """
    if pos not in range(len(tree.pos())):
        return None

    nodepos = tree.leaf_treeposition(pos)
    pt = ParentedTree.convert(tree)
    preterminal = pt[nodepos[:-1]]
    rs = preterminal.right_sibling()

    if rs and rs.label() == 'S':
        # the conn is connecting one or two S-es, take the right sibling S as int arg
        return rs.leaves()

    parent = preterminal.parent()
    leaves = parent.leaves()
    if not rs:
        # The connective is on the same level as its argument, which is not
        # an S-clause; it may sit at the end of the clause, so the parent's
        # right sibling is included as well. At the very end of the sentence
        # there is no such sibling (right_sibling() returns None), in which
        # case only the parent's own leaves are used.
        right_sibling = parent.right_sibling()
        if right_sibling is not None:
            leaves = leaves + right_sibling.leaves()

    # assuming that there are no duplicates of the connective anymore at this level of detail
    connindex = leaves.index(ct.token)
    return leaves[connindex:]
def test_bad_operator(self):
    '''
    Test error handling of undefined tgrep operators.
    '''
    tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
    # The search is created outside the context manager on purpose: the
    # assertion covers only what is raised while the results are consumed.
    search = tgrep.tgrep_positions('* >>> S', [tree])
    with self.assertRaises(tgrep.TgrepException):
        list(search)
def test_node_regex(self):
    '''
    Test regex matching on nodes.
    '''
    tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
    # The pattern below is a regular expression matching every node whose
    # name begins with NP, which includes NP-SBJ.
    matched = list(tgrep.tgrep_positions('/^NP/', [tree]))
    expected = [[(0,), (1,)]]
    self.assertEqual(matched, expected)
def test_node_quoted(self):
    '''
    Test selecting nodes using quoted node names.

    Each case pairs a quoted tgrep pattern with the single tree position
    it is expected to select.
    '''
    tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
    cases = [
        ('"N"', ()),
        ('"\\"N\\""', (0,)),
        ('"N\\""', (1,)),
        ('"\\"\\\\\\""', (2,)),
    ]
    for pattern, position in cases:
        selected = list(tgrep.tgrep_positions(pattern, [tree]))
        self.assertEqual(selected, [[position]])
def get_sentence_meaning_likelihood(sentence, lmk, rel):
    """Score how likely ``sentence`` is to express the given meaning.

    The sentence is turned into a modified parse by ``get_modparse``, read
    into a ``ParentedTree`` (which is also printed), and scored by
    ``get_tree_probs``. A zero probability product is reported through
    ``logger`` but still returned.

    :param sentence: the sentence to score
    :param lmk: landmark part of the meaning, forwarded to ``get_tree_probs``
    :param rel: relation part of the meaning, forwarded to ``get_tree_probs``
    :return: tuple ``(product of probs, sum of entropies, lrpc, tps)`` where
        the last two are passed through unchanged from ``get_tree_probs``
    """
    modparse = get_modparse(sentence)
    # NOTE(review): `ParentedTree.parse` and a string-returning `pprint()`
    # are the NLTK 2 API (later releases use `fromstring` / `pformat`) --
    # confirm the NLTK version this module is pinned to.
    t = ParentedTree.parse(modparse)
    # Parenthesised form prints the same text on Python 2 and is valid
    # syntax on Python 3, unlike the bare print statement.
    print('\n%s\n' % t.pprint())

    probs, entropies, lrpc, tps = get_tree_probs(t, lmk, rel)
    likelihood = np.prod(probs)
    if likelihood == 0.0:
        logger('ERROR: Probability product is 0 for sentence: %s, lmk: %s, rel: %s, probs: %s' % (sentence, lmk, rel, str(probs)))
    return likelihood, sum(entropies), lrpc, tps
def vertical_imbalance(furcation_node_dict):
    """Return the largest spread of child heights over the given nodes.

    Every key of ``furcation_node_dict`` is parsed as a bracketed tree
    string; the standard deviation of the heights of that node's direct
    children is computed, and the maximum over all nodes is returned.

    :param furcation_node_dict: mapping (or other iterable) whose items are
        bracketed tree strings
    :return: the largest standard deviation found, or ``0`` when there are
        no nodes or none exceeds zero
    """
    largest = 0
    for tree_text in furcation_node_dict:
        subtree = ParentedTree.fromstring(tree_text)
        heights = numpy.array([kid.height() for kid in subtree])
        spread = numpy.std(heights)
        # Strict comparison: ties and NaN leave the running maximum as is.
        largest = spread if spread > largest else largest
    return largest
def span(feats):
    """Build the ``span`` feature string for a mention pair.

    Mentions in different sentences never share a span. Otherwise the
    feature is true when ``__get_parent_tree__`` yields equal subtrees for
    ``feats.token`` and ``feats.token_ref`` in the sentence tree.

    :param feats: object exposing ``article``, ``sentence``, ``sentence_ref``,
        ``token`` and ``token_ref``
    :return: the string ``"span=<value>"``
    """
    if feats.sentence != feats.sentence_ref:
        return "span={}".format(False)

    article_trees = TREES_DICTIONARY[feats.article + ".raw"]
    s_tree = ParentedTree.convert(article_trees[int(feats.sentence)])
    i_parent = __get_parent_tree__(feats.token, s_tree)
    j_parent = __get_parent_tree__(feats.token_ref, s_tree)
    same_parent = i_parent == j_parent
    return "span={}".format(same_parent)
def test_node_noleaves(self):
    '''
    Test node name matching with the search_leaves flag set to False.
    '''
    tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')

    # Default call: the leaves named 'x' are found.
    with_leaves = list(tgrep.tgrep_positions('x', [tree]))
    self.assertEqual(with_leaves, [[(0, 0, 0), (1, 0, 0)]])

    # Third positional argument False: nothing matches in the tree.
    without_leaves = list(tgrep.tgrep_positions('x', [tree], False))
    self.assertEqual(without_leaves, [[]])