def simp_syn_sent(sent):
    """Apply the syntactic simplification rules in a fixed cascade and
    return the first non-empty rewrite ("" if no rule fires)."""
    strs = ""
    if len(sent) == 0:
        return strs

    # Prepend a dummy token so token indices line up with the 1-based
    # node indices of the dependency parse.
    tokens = StanfordTokenizer().tokenize(sent)
    tokens.insert(0, '')

    result = list(eng_parser.raw_parse(sent))[0]

    # TODO: use the tree structure (result.tree()) instead of the flat list.
    # Each entry is a tuple like (4, u'said', u'VBD', u'root', [[18], [22], [16], [3]]):
    # (index, word, POS tag, dependency relation, dependents).
    node_list = [base.get_triples(node) for node in result.nodes.items()]

    # Try each rule in turn; stop at the first one that produces output.
    for rule in (simp_coordi_sent, simp_subordi_sent, simp_advcl_sent,
                 simp_parti_sent, simp_adjec_sent, simp_appos_sent,
                 simp_passive_sent):
        strs = rule(tokens, node_list)
        if len(strs) > 0:
            return strs
    return strs
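# A minimal usage sketch for the cascade above, assuming the Stanford
# tokenizer/parser are configured as elsewhere in this module. The sample
# sentence and the expected split are illustrative only; actual output
# depends on the loaded models.
def _demo_simp_syn_sent():
    sent = "John ate an apple and Mary drank tea."
    # For a coordination like this, the expected shape is
    # "John ate an apple. Mary drank tea."
    print(simp_syn_sent(sent))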
def simp_syn_sent_(sent):
    """Debugging entry point: run a single simplification rule in isolation
    (currently the adjectival-clause rule)."""
    tokens = StanfordTokenizer().tokenize(str(sent))
    tokens.insert(0, '')

    result = list(eng_parser.raw_parse(sent))[0]

    # Each entry: (index, word, POS tag, dependency relation, dependents).
    node_list = [base.get_triples(node) for node in result.nodes.items()]

    # Swap in simp_coordi_sent, simp_subordi_sent, simp_advcl_sent,
    # simp_parti_sent, simp_appos_sent, or simp_passive_sent to debug
    # the other rules in isolation.
    return simp_adjec_sent(tokens, node_list)
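# Usage sketch for the single-rule entry point, reusing the example that the
# original debugging stub hard-coded; the expected output below is the one
# that stub returned, not a guaranteed parser result.
def _demo_simp_syn_sent_():
    sent = "Peter, who liked fruits, ate an apple."
    # Expected shape: "Peter liked fruits. Peter ate an apple."
    print(simp_syn_sent_(sent))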
def simp_syn_sent_relcl(sent):
    """Debugging entry point: run only the relative-clause rule."""
    tokens = StanfordTokenizer().tokenize(str(sent))
    tokens.insert(0, '')

    result = list(eng_parser.raw_parse(sent))[0]

    # Each entry: (index, word, POS tag, dependency relation, dependents).
    node_list = [base.get_triples(node) for node in result.nodes.items()]

    return simp_relcl_sent(tokens, node_list)
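# Usage sketch for the node-list-based relative-clause entry point; the
# sentence is illustrative and the output depends on simp_relcl_sent.
def _demo_simp_syn_sent_relcl():
    sent = "The car, which was red, sped away."
    print(simp_syn_sent_relcl(sent))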
def _simp_syn_sent(sent, _algs=range(1, 10)):
    """Run the full rule cascade and return (rewrite, name of the rule that
    fired). The rules are ordered for the best precision/recall; index-based
    rule selection via _algs was disabled for the experiments, so the
    parameter is kept only for compatibility."""
    strs = ""
    alg = ""
    if len(sent) == 0:
        return strs, alg

    # Prepend a dummy token so token indices line up with the 1-based
    # node indices of the dependency parse.
    tokens = StanfordTokenizer().tokenize(sent)
    tokens.insert(0, '')

    result = list(eng_parser.raw_parse(sent))[0]

    # Each entry is a tuple like (4, u'said', u'VBD', u'root', [[18], [22], [16], [3]]):
    # (index, word, POS tag, dependency relation, dependents).
    node_list = [base.get_triples(node) for node in result.nodes.items()]

    # Ordered for the best precision/recall in the experiments.
    rules = [
        ("paratax", paratax.simp_paratax_sent),
        ("punct", punct.simp_punct_sent),
        ("subordi", subordi.simp_subordi_sent),
        ("adverb", adverb.simp_adverb_sent),
        ("parti", parti.simp_parti_sent),
        ("appos", appos.simp_appos_sent),
        ("adjec", adjec.simp_adjec_sent),
        ("coordi", coordi.simp_coordi_sent),
        ("passive", passive.simp_passive_sent),
        ("relcl", relcl.simp_relcl_sent),
    ]
    for name, rule in rules:
        strs = rule(tokens, node_list)
        if len(strs) > 0:
            return strs, name
    return strs, alg
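# Usage sketch for the experiment driver above: it reports which rule
# produced the rewrite. The sentence and the rule name shown are
# illustrative assumptions.
def _demo__simp_syn_sent():
    sent = "While it was raining, we stayed inside."
    strs, alg = _simp_syn_sent(sent)
    print(alg)   # e.g. "subordi" if the subordination rule fires first
    print(strs)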
def simp_relcl(sent):
    """Standalone relative-clause splitter: break a sentence with an
    acl:relcl dependency into the matrix clause and a new sentence for
    the relative clause."""
    strs = ""
    tokens = StanfordTokenizer().tokenize(str(sent))
    tokens.insert(0, '')

    dep = next(eng_parser.raw_parse(sent))

    # Find the head noun of the relative clause from the dependency triples,
    # e.g. ((u'author', u'NN'), u'acl:relcl', (u'wrote', u'VBD')).
    nsubj = ""
    for head, rel, dependent in dep.triples():
        if 'acl:relcl' in rel:
            nsubj = head[0]

    if not nsubj:
        return strs  # no relative clause found

    nsubj_ind = tokens.index(nsubj)

    # First sentence: everything up to and including the head noun.
    str1 = ' '.join(tokens[:nsubj_ind + 1]) + " . "

    # Second sentence: the remainder, with the leading comma and the relative
    # pronoun ("which"/"who") stripped, and the head noun re-inserted as subject.
    _str2 = tokens[nsubj_ind + 1:]
    if _str2[0] in PUNCTUATION:
        _str2.pop(0)
    if ('which' in _str2[0]) or ('who' in _str2[0]):
        _str2.pop(0)
    str2 = base.replace_nsubj(tokens, nsubj) + ' '.join(_str2)

    strs = str1 + str2
    return strs
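# Usage sketch for the standalone splitter. The subject of the second
# sentence is produced by base.replace_nsubj, so the pronoun below is an
# assumption about that helper's output.
def _demo_simp_relcl():
    sent = "I met the author who wrote this book."
    # Expected shape: "I met the author . He wrote this book ."
    print(simp_relcl(sent))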