Exemplo n.º 1
0
Arquivo: alg.py Projeto: muyun/dev.nlp
def simp_syn_sent(sent):
    """Simplify a sentence with the first syntactic rule that produces output.

    The sentence is tokenized and dependency-parsed once; the tokens and the
    per-node triples are then passed to each rule, in this order:
    ``simp_coordi_sent``, ``simp_subordi_sent``, ``simp_advcl_sent``,
    ``simp_parti_sent``, ``simp_adjec_sent``, ``simp_appos_sent``,
    ``simp_passive_sent``.

    :param sent: the raw sentence text.
    :return: the result of the first rule whose result is non-empty. If no
        rule produces output, the (empty) result of the last rule is
        returned. An empty ``sent`` yields ``""`` without invoking the
        tokenizer or the parser.
    """
    # Nothing to simplify: bail out before the tokenizer/parser calls.
    if not sent:
        return ""

    # The original tokens of the sentence, with a placeholder at index 0
    # (presumably so token positions line up with 1-based dependency node
    # addresses -- TODO confirm against base.get_triples).
    tokens = StanfordTokenizer().tokenize(sent)
    tokens.insert(0, '')

    # Only the first parse returned by the parser is used.
    result = list(eng_parser.raw_parse(sent))[0]

    # TODO: use the tree structure, check again
    # Author's example of one entry:
    #   (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    node_list = [base.get_triples(node) for node in result.nodes.items()]

    # Rules in priority order; the first non-empty result wins.
    rules = (
        simp_coordi_sent,
        simp_subordi_sent,
        simp_advcl_sent,
        simp_parti_sent,
        simp_adjec_sent,
        simp_appos_sent,
        simp_passive_sent,
    )

    strs = ""
    for rule in rules:
        strs = rule(tokens, node_list)
        if len(strs) > 0:
            return strs

    # No rule applied: hand back the last (empty) result unchanged.
    return strs
Exemplo n.º 2
0
def simp_syn_sent_(sent):
    """Apply only the ``simp_adjec_sent`` rule to a sentence.

    Single-rule variant used for experiments: the sentence is tokenized and
    dependency-parsed, then handed to ``simp_adjec_sent``. Another
    ``simp_*_sent`` rule can be swapped in at the final call.

    :param sent: the sentence to simplify; it is converted with ``str()``
        for the tokenizer and passed as-is to the parser.
    :return: whatever ``simp_adjec_sent`` returns for this sentence.
    """
    # Original tokens, preceded by an empty placeholder at index 0.
    padded_tokens = StanfordTokenizer().tokenize(str(sent))
    padded_tokens.insert(0, '')

    # Keep only the first parse produced by the parser.
    graph = list(eng_parser.raw_parse(sent))[0]
    root_word = graph.root['word']  # not used further below

    # TODO: use the tree structure, check again
    # Author's example of one entry:
    #   (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    triples = [base.get_triples(item) for item in graph.nodes.items()]

    return simp_adjec_sent(padded_tokens, triples)
Exemplo n.º 3
0
def simp_syn_sent_(sent):
    """Apply only the ``simp_relcl_sent`` rule to a sentence.

    The sentence is tokenized and dependency-parsed, and the resulting
    tokens and per-node triples are handed to ``simp_relcl_sent``.

    :param sent: the sentence to simplify; it is converted with ``str()``
        for the tokenizer and passed as-is to the parser.
    :return: whatever ``simp_relcl_sent`` returns for this sentence.
    """
    # Original tokens, preceded by an empty placeholder at index 0.
    padded_tokens = StanfordTokenizer().tokenize(str(sent))
    padded_tokens.insert(0, '')

    # Keep only the first parse produced by the parser.
    graph = list(eng_parser.raw_parse(sent))[0]
    root_word = graph.root['word']  # not used further below

    # Author's example of one entry:
    #   (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    triples = [base.get_triples(item) for item in graph.nodes.items()]

    return simp_relcl_sent(padded_tokens, triples)
Exemplo n.º 4
0
def _simp_syn_sent(sent, _algs=range(1,10)):
    """Simplify a sentence and report which rule produced the result.

    The sentence is tokenized and dependency-parsed once; the tokens and the
    per-node triples are then passed to each rule in a fixed priority order
    (paratax, punct, subordi, adverb, parti, appos, adjec, coordi, passive,
    relcl). The first rule with a non-empty result wins.

    :param sent: the raw sentence text.
    :param _algs: accepted for backward compatibility; the live code does
        not read it (the rule order is fixed below).
    :return: a ``(strs, alg)`` tuple. ``strs`` is the first non-empty rule
        result and ``alg`` the label of that rule. If no rule produces
        output, ``strs`` is the (empty) result of the last rule and ``alg``
        is ``""``. An empty ``sent`` yields ``("", "")`` without invoking
        the tokenizer or the parser.
    """
    # Nothing to simplify: bail out before the tokenizer/parser calls.
    if not sent:
        return "", ""

    # The original tokens of the sentence, with a placeholder at index 0
    # (presumably so token positions line up with 1-based dependency node
    # addresses -- TODO confirm against base.get_triples).
    tokens = StanfordTokenizer().tokenize(sent)
    tokens.insert(0, '')

    # Only the first parse returned by the parser is used.
    result = list(eng_parser.raw_parse(sent))[0]

    # TODO: use the tree structure, check again
    # Author's example of one entry:
    #   (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    node_list = [base.get_triples(node) for node in result.nodes.items()]

    # Use the most robust functions for the experiments, ordered for better
    # precision/recall. The first non-empty result wins.
    rules = (
        ("paratax", paratax.simp_paratax_sent),
        ("punct", punct.simp_punct_sent),
        ("subordi", subordi.simp_subordi_sent),
        ("adverb", adverb.simp_adverb_sent),
        ("parti", parti.simp_parti_sent),
        ("appos", appos.simp_appos_sent),
        ("adjec", adjec.simp_adjec_sent),
        ("coordi", coordi.simp_coordi_sent),
        ("passive", passive.simp_passive_sent),
        ("relcl", relcl.simp_relcl_sent),
    )

    strs = ""
    for alg, rule in rules:
        strs = rule(tokens, node_list)
        if len(strs) > 0:
            return strs, alg

    # No rule applied: last (empty) result and an empty rule label.
    return strs, ""
Exemplo n.º 5
0
def relcl(sent):
    """Split a sentence at its relative clause into two sentences.

    The dependency parse is searched for an ``acl:relcl`` relation. The
    governor word of that relation (the last one, if several are found) is
    the split point: the first output sentence is everything up to and
    including the first occurrence of that word in the token list; the
    second is ``base.replace_nsubj(tokens, nsubj)`` followed by the
    remaining tokens, after dropping one leading punctuation token and one
    leading token containing ``which`` or ``who``.

    :param sent: the sentence to simplify; it is converted with ``str()``
        for the tokenizer and passed as-is to the parser.
    :return: the two sentences joined by ``" . "``, or ``""`` when the
        input is empty, no ``acl:relcl`` relation is found, or the governor
        word cannot be located among the tokens.
    """
    # Nothing to simplify: bail out before the tokenizer/parser calls.
    if not sent:
        return ""

    # Original tokens, preceded by an empty placeholder at index 0.
    tokens = StanfordTokenizer().tokenize(str(sent))
    tokens.insert(0, '')

    # Parse once and reuse the first dependency graph for the triples.
    graph = list(eng_parser.raw_parse(sent))[0]

    # Each triple is ((governor word, tag), relation, (dependent word, tag)).
    # No early break: the last matching relation wins.
    nsubj = ""
    for governor, relation, _dependent in graph.triples():
        if 'acl:relcl' in relation:
            nsubj = governor[0]

    # Without a match nsubj stays "", and tokens.index("") would hit the
    # placeholder at index 0 and fabricate a split. Report "no result".
    if not nsubj:
        return ""

    try:
        nsubj_ind = tokens.index(nsubj)
    except ValueError:
        # Parser and tokenizer disagree on the word form; nothing to split.
        return ""

    head_tokens = tokens[:nsubj_ind + 1]
    str1 = ' '.join(head_tokens) + " . "

    tail_tokens = tokens[nsubj_ind + 1:]
    # Drop the punctuation and the relative pronoun that open the clause.
    if tail_tokens and tail_tokens[0] in PUNCTUATION:
        tail_tokens.pop(0)
    if tail_tokens and ('which' in tail_tokens[0] or 'who' in tail_tokens[0]):
        tail_tokens.pop(0)

    str2 = base.replace_nsubj(tokens, nsubj) + ' '.join(tail_tokens)

    return str1 + str2