Python parsed_sents示例，nltk.corpus.treebank.parsed_sents Python示例

示例#1

0

显示文件

文件： grammar.py 项目： willowjar/scratchNLP

def pcfg_demo():
    """
    A demonstration showing how C{WeightedGrammar}s can be created and used.

    The demo prints one production of ``toy_pcfg1``, summarises
    ``toy_pcfg2``, induces a PCFG from treebank trees and finally parses
    the first sentence of ``wsj_0001.mrg`` with the induced grammar.

    This block is Python 2 only: it relies on ``print`` statements and on
    backtick ``repr`` syntax.  ``toy_pcfg1``, ``toy_pcfg2`` and
    ``Nonterminal`` are not defined here and must be provided by the
    enclosing module.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    # Show the third production together with its left-hand side,
    # right-hand side and probability.
    pcfg_prod = pcfg_prods[2]
    print 'A PCFG production:', ` pcfg_prod `
    print '    pcfg_prod.lhs()  =>', ` pcfg_prod.lhs() `
    print '    pcfg_prod.rhs()  =>', ` pcfg_prod.rhs() `
    print '    pcfg_prod.prob() =>', ` pcfg_prod.prob() `
    print

    grammar = toy_pcfg2
    print 'A PCFG grammar:', ` grammar `
    print '    grammar.start()       =>', ` grammar.start() `
    print '    grammar.productions() =>',
    # Break the repr after every comma and indent the continuation lines
    # by 26 spaces so that the production list is line-wrapped.
    print ` grammar.productions() `.replace(',', ',\n' + ' ' * 26)
    print

    print 'Coverage of input words by a grammar:'
    print grammar.covers(['a', 'boy'])
    print grammar.covers(['a', 'girl'])

    # Collect the productions of every tree found in the first two
    # treebank items, then induce the PCFG from them.
    print "Induce PCFG grammar from treebank data:"

    productions = []
    # NOTE(review): `treebank.items` is presumably the legacy corpus-reader
    # attribute listing the corpus files -- confirm against the NLTK
    # version this file targets.
    for item in treebank.items[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print grammar
    print

    print "Parse sentence using induced grammar:"

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    #sent = treebank.tokenized('wsj_0001.mrg')[0]

    # Take the tokens from the gold tree itself so that they agree with
    # the terminals of the induced grammar.
    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print sent
    for parse in parser.nbest_parse(sent):
        print parse

示例#2

0

显示文件

文件： grammar.py 项目： ggosline/taxonparser

def pcfg_demo():
    """
    A demonstration showing how probabilistic grammars can be created and used.

    The demo prints one production of ``toy_pcfg1``, summarises
    ``toy_pcfg2``, induces a PCFG from the trees of the first two treebank
    files and finally parses the first sentence of ``wsj_0001.mrg`` with
    the induced grammar.

    ``toy_pcfg1``, ``toy_pcfg2`` and ``Nonterminal`` are not defined here
    and must be provided by the enclosing module.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    # Show the third production together with its left-hand side,
    # right-hand side and probability.
    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    grammar = toy_pcfg2
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # Break the repr after every comma and indent the continuation lines
    # by 26 spaces so that the production list is line-wrapped.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
    print()

    print('Coverage of input words by a grammar:')
    print(grammar.covers(['a', 'boy']))
    print(grammar.covers(['a', 'girl']))

    # Collect the productions of every tree in the first two treebank
    # files, then induce the PCFG from them.
    print("Induce PCFG grammar from treebank data:")

    productions = []
    # `fileids()` is the public NLTK 3 accessor; the old `items` attribute
    # no longer exists on corpus readers.
    for item in treebank.fileids()[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # Take the tokens from the gold tree itself so that they agree with
    # the terminals of the induced grammar.
    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print(sent)
    # NLTK 3 parsers expose `parse()` (an iterator over parses) instead of
    # the removed `nbest_parse()`.
    for parse in parser.parse(sent):
        print(parse)

示例#3

0

显示文件

def grammar_development_with_treebank():
    """
    Print a sample treebank tree and the subtrees picked by ``_grammar_filter``.

    Python 2 only (``print`` statements).  ``_grammar_filter`` is defined
    elsewhere in the module and is handed to ``Tree.subtrees`` as the
    selection predicate.
    """
    from nltk.corpus import treebank
    # First parsed sentence of wsj_0001.mrg, shown as an example tree.
    t = treebank.parsed_sents("wsj_0001.mrg")[0]
    print t
    # Scan every parsed sentence of the corpus and keep each subtree that
    # the filter accepts.
    print "identify verbs for SV in VP -> SV S", [
        subtree for tree in treebank.parsed_sents()
        for subtree in tree.subtrees(_grammar_filter)
    ]

示例#4

0

显示文件

文件： ch08.py 项目： prashiyn/nltk-examples

def grammar_development_with_treebank():
    """
    Show one treebank parse, then list subtrees accepted by ``_grammar_filter``.

    Written for Python 2 (``print`` statements).  The predicate
    ``_grammar_filter`` comes from the surrounding module.
    """
    from nltk.corpus import treebank

    # Example tree: the first sentence of wsj_0001.mrg.
    t = treebank.parsed_sents("wsj_0001.mrg")[0]
    print t
    # Walk all parsed sentences and collect the subtrees for which the
    # filter returns a true value.
    print "identify verbs for SV in VP -> SV S", [
        subtree for tree in treebank.parsed_sents() for subtree in tree.subtrees(_grammar_filter)
    ]

示例#5

0

显示文件

def pcfg_demo():
    """
    A demonstration showing how a ``PCFG`` can be induced and used.

    The productions of the first three trees of the first treebank file
    are collected and printed, a PCFG is induced from them, and the first
    sentence of that file is parsed with an ``InsideChartParser`` built on
    the induced grammar.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.grammar import Nonterminal
    from nltk.parse import pchart

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    # Use the public accessor rather than the private `_fileids` attribute.
    item = treebank.fileids()[0]
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)

        productions += tree.productions()

    # The parser below needs a grammar; without this induction step the
    # name `grammar` would be undefined and the demo would raise NameError.
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(productions)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # Take the tokens from the gold tree itself so that they agree with
    # the terminals of the induced grammar.
    sent = treebank.parsed_sents(item)[0].leaves()
    print(sent)
    for parse in parser.parse(sent):
        print(parse)

示例#6

0

显示文件

文件： learn_pcfg.py 项目： salmanahmad/6.863

def learn_treebank(files=None, markov_order=None):
    """
    Train a PCFG on Penn Treebank parses and hand it back.

    When ``files`` is None the trees come from ``treebank.parsed_sents()``
    without arguments (described upstream as NLTK's 10% sample of the Penn
    Treebank).  Otherwise ``files`` is forwarded to
    ``treebank.parsed_sents``; upstream notes that 'wsj-02-21.mrg' covers
    the entire training section.

    ``markov_order`` is passed through to ``learn_trees`` unchanged, and
    ``collapse`` is always True.
    """
    if files is not None:
        trees = treebank.parsed_sents(files)
    else:
        trees = treebank.parsed_sents()
    return learn_trees(trees, collapse=True, markov_order=markov_order)

示例#7

0

显示文件

文件： learn_pcfg.py 项目： sevinjyolchuyeva/6.863

def learn_treebank(files=None, markov_order=None):
    """
    Return a PCFG learned from Penn Treebank trees.

    ``files`` selects the treebank file(s) to read; leaving it as None
    reads whatever ``treebank.parsed_sents()`` yields by default (per the
    upstream note, NLTK's 10% Penn Treebank sample, while 'wsj-02-21.mrg'
    would be the full training section).  ``markov_order`` is handed to
    ``learn_trees`` as is.
    """
    # Only pass an argument to the corpus reader when one was supplied.
    reader_args = () if files is None else (files,)
    return learn_trees(
        treebank.parsed_sents(*reader_args),
        collapse=True,
        markov_order=markov_order,
    )

示例#8

0

显示文件

文件： chapter8.py 项目： hbdhj/python

def grammarDevelopmen():
    """
    Print a sample treebank tree and every VP subtree that has an S child.

    Python 2 only (``print`` statements).  ``nltk`` must already be
    imported by the enclosing module because the nested filter refers to
    ``nltk.Tree``.
    """
    print "page 315 8.6  Grammar Developmen"
    print "=============== Treebanks and Grammars ==============="
    from nltk.corpus import treebank
    # Example tree: the first sentence of wsj_0001.mrg.
    t = treebank.parsed_sents('wsj_0001.mrg')[0]
    print t

    # Predicate for Tree.subtrees(): true for a node labelled 'VP' with at
    # least one direct child tree labelled 'S'.
    # NOTE(review): this local name shadows the builtin `filter`, and
    # `.node` looks like the pre-NLTK-3 label attribute -- confirm the
    # NLTK version in use.
    def filter(tree):
        child_nodes = [child.node for child in tree if isinstance(child, nltk.Tree)]
        return  (tree.node == 'VP') and ('S' in child_nodes)

    # Apply the predicate across all parsed sentences of the corpus.
    print [subtree for tree in treebank.parsed_sents() for subtree in tree.subtrees(filter)]

示例#9

0

显示文件

文件： pcyk.py 项目： gitvivekgupta/pcyk

def sentences():
    """
    Generate ``(tree, leaves)`` pairs for every treebank sentence.

    Each tree is transformed in place with ``chomsky_normal_form`` and then
    ``collapse_unary`` (in that order) before it is yielded together with
    its leaves.
    """
    for fileid in treebank.fileids():
        for tree in treebank.parsed_sents(fileid):
            tree.chomsky_normal_form(horzMarkov=1)
            tree.collapse_unary(collapsePOS=True)
            tokens = tree.leaves()
            yield (tree, tokens)

示例#10

0

显示文件

文件： convert_boundary.py 项目： pramodkaushik/nlu-hmrnn

def gen_corpus(path, threshold):
    """
    Write boundary and sentence files derived from the treebank parses.

    src: http://www.nltk.org/_modules/nltk/tree.html
    The corpus runs from wsj_0001.mrg to wsj_0199.mrg; a single tree can be
    fetched with ``treebank.parsed_sents('wsj_0001.mrg')[0]`` and shown
    with ``t.draw()``.

    Every tree is passed to ``_flatten_tree``; trees for which it returns a
    falsy value are dropped.  For the rest, the leaves are joined with
    spaces, translated with ``PUNC_TRANS``, lower-cased, stripped of
    repeated spaces, and every digit is replaced by 'x'.

    :param path: directory prefix; "/boundaries.txt" and "/sentences.txt"
        are appended to it
    :param threshold: forwarded to ``_flatten_tree`` (minimum length of a
        sentence to keep)
    :return: none
    """
    kept_boundaries = []
    kept_sentences = []
    for tree in treebank.parsed_sents(treebank.fileids()):
        flattened = _flatten_tree(tree, threshold)
        if not flattened:
            continue
        kept_boundaries.append(flattened)
        text = ' '.join(tree.leaves())
        text = text.translate(PUNC_TRANS).lower()
        text = re.sub(r' +', ' ', text)
        # replace digit(s) as 'x'(s)
        text = re.sub(r'\d', 'x', text)
        kept_sentences.append(text.strip())
    _check_length_match(kept_boundaries, kept_sentences)

    outputs = (
        ("/boundaries.txt", '1', kept_boundaries),
        ("/sentences.txt", ' ', kept_sentences),
    )
    for suffix, separator, items in outputs:
        with open(path + suffix, 'w') as out:
            out.write(separator.join(items))

示例#11

0

显示文件

def CKY_parser():
    '''
    Print the most probable parse of each sentence in ``wsj_1964.mrg``.

    A ``ViterbiParser`` is built from the grammar returned by
    ``make_PCFG_grammar()``.  Sentences containing words the grammar does
    not cover are skipped and counted; the count is printed at the end.
    '''
    PCFG_grammar = make_PCFG_grammar()
    # Use the ViterbiParser with the induced PCFG rules
    parser = ViterbiParser(PCFG_grammar)

    # Sample sentences to parse
    sentences = treebank.parsed_sents('wsj_1964.mrg')

    skipped_sentences = 0

    for sentence in sentences:
        tokens = sentence.leaves()
        try:
            # check_coverage raises ValueError when a token is missing from
            # the grammar; only that case counts as a skipped sentence.
            PCFG_grammar.check_coverage(tokens)
            for parse in parser.parse(tokens):
                print(parse)
        except ValueError:
            # A bare `except:` here would also hide KeyboardInterrupt and
            # genuine programming errors.
            skipped_sentences += 1

    print("Total skipped sentences:", skipped_sentences)

示例#12

0

显示文件

文件： syntax_and_parsing.py 项目： dmuiruri/nlp

def ex6(symbol='S', display=5):
    """
    PCFG: Probabilistic CFGs

    Return the probability distribution over the expansions of ``symbol``
    observed in the treebank.

    The probability of an expansion is its count divided by the number of
    all productions whose left-hand side is ``symbol``.  For a condensed
    display, expansions seen fewer than ``display`` times are left out of
    the result, although they still contribute to the denominator.

    :param symbol: left-hand-side symbol whose expansions are counted
    :param display: minimum number of occurrences for an expansion to be
        included in the result
    :return: dict mapping each retained right-hand side to its probability
    """
    from collections import Counter

    # One pass over the corpus; Counter replaces the repeated list.count()
    # calls, which were quadratic in the number of productions.
    rhs_counts = Counter(
        p.rhs()
        for tree in treebank.parsed_sents()
        for p in tree.productions()
        if p.lhs().symbol() == symbol
    )
    sym_count = sum(rhs_counts.values())
    return {
        rhs: count / sym_count
        for rhs, count in rhs_counts.items()
        if count >= display
    }

示例#13

0

显示文件

文件： pcyk.py 项目： brucespang/pcyk

def sentences():
    """
    Yield a ``(tree, leaves)`` tuple per parsed sentence of each treebank file.

    The tree is modified in place first: ``chomsky_normal_form`` is applied,
    followed by ``collapse_unary``.
    """
    def transformed(tree):
        # Order matters: normal form first, unary collapsing second.
        tree.chomsky_normal_form(horzMarkov=1)
        tree.collapse_unary(collapsePOS=True)
        return tree

    for name in treebank.fileids():
        for raw in treebank.parsed_sents(name):
            parsed = transformed(raw)
            yield (parsed, parsed.leaves())

示例#14

0

显示文件

def sequence_matching(input):
    """
    Find the first treebank sentence whose tag sequence contains ``input``.

    ``input`` is a sequence of pairs whose second element is a POS tag.
    Each treebank sentence is scanned left to right; for every input item
    the search resumes at the position of the previous match.  On each
    match ``UpdateTree`` is called on the sentence's parse tree.  The parse
    tree is returned as soon as the last input item has matched; ``None``
    is returned when no sentence matches.
    """
    tagged = treebank.tagged_sents()
    trees = treebank.parsed_sents()
    for idx in range(len(tagged)):
        candidate = tagged[idx]
        tree = trees[idx]
        cursor = 0  # position in `candidate` where the next search starts
        for pos, token in enumerate(input):
            hit = None
            for j in range(cursor, len(candidate)):
                if candidate[j][1] == token[1]:  # POS tags agree
                    hit = j
                    break

            if hit is None:
                # Nothing left in this sentence matches; try the next one.
                print("Sentence does not match")
                break

            cursor = hit
            UpdateTree(tree, hit, token[1])
            if pos == len(input) - 1:
                # Whole input consumed: this sentence is the match.
                return tree

    return None

示例#15

0

显示文件

def test_GrammarParser():
    """
    Run a regexp chunk grammar over the first 11 treebank files.

    Every parsed sentence of those files is passed to an
    ``nltk.RegexpParser`` built from the NP grammar below, and each
    subtree of the result whose label is "VP" is printed.
    """
    import nltk
    from nltk.corpus import treebank
    grammar = r"""NP:
    {<DT>*(<NN>|<NNP>|<NNS>)+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
    """
    # Build the chunker once; it does not depend on the tree being parsed.
    chunker = nltk.RegexpParser(grammar)

    # The first 11 files, i.e. indices 0..10 of the file list.
    for fileid in treebank.fileids()[:11]:
        for tree in treebank.parsed_sents(fileid):
            tree_Gram = chunker.parse(tree)
            for subtree in tree_Gram.subtrees():
                if subtree.label() == "VP":
                    # Single-argument print() behaves the same on
                    # Python 2 and Python 3.
                    print(subtree)

示例#16

0

显示文件

def test():
    """Exercise the tree pretty-printer on a few sample trees."""
    def show(n, tree, sentence=None, ansi=True, **xargs):
        # Header line: the identifier plus the sentence (explicit tokens if
        # given, otherwise the tree's own leaves).
        words = sentence or tree.leaves()
        print()
        print('{0}: "{1}"'.format(n, ' '.join(words)))
        print(tree)
        print()
        printer = TreePrettyPrinter(tree, sentence)
        try:
            print(printer.text(unicodelines=ansi, ansi=ansi, **xargs))
        except (UnicodeDecodeError, UnicodeEncodeError):
            # Fall back to plain output when the text cannot be
            # encoded/decoded.
            print(printer.text(unicodelines=False, ansi=False, **xargs))

    from nltk.corpus import treebank
    sample_indices = [0, 1440, 1591, 2771, 2170]
    for n in sample_indices:
        tree = treebank.parsed_sents()[n]
        show(n, tree, nodedist=2, maxwidth=8)
    print()
    print('ASCII version:')
    # `tree` is still the last tree drawn by the loop above.
    ascii_art = TreePrettyPrinter(tree).text(nodedist=2)
    print(ascii_art)

    # A tree whose leaves are indices into the separate token list below.
    tree = Tree.fromstring(
        '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
        '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
        '(vg 10) (inf (verb 11)))))) (punct 12))',
        read_leaf=int)
    sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
                ' zwemmen of terrassen .'.split())
    show('Discontinuous tree', tree, sentence, nodedist=2)

示例#17

0

显示文件

文件： parse.py 项目： xiaohan2012/capitalization-restoration-train

def main(transform_func=None, n=10):
    """
    Parse the last ``n`` treebank sentences and report precision and recall.

    :param transform_func: optional callable applied to every word of the
        test sentences before parsing
    :param n: number of sentences, taken from the end of the corpus
    """
    parser = StanfordParser(
        path_to_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser.jar",
        path_to_models_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar",
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )

    test_sents = treebank.sents()[-n:]

    # Single-argument print() calls produce the same output on Python 2
    # and Python 3.
    print("len(test_sents) = %d" % (len(test_sents)))

    if transform_func and callable(transform_func):
        print("transforming it using  %s" % (transform_func,))
        test_sents = [[transform_func(w) for w in s]
                      for s in test_sents]  # transform it

    print(test_sents[:10])

    print("predicting")
    pred_parses = parser.parse_sents(test_sents)

    # The gold trees must be the same last-n slice as the test sentences;
    # zipping against the whole corpus would compare each prediction with
    # the gold tree of an unrelated sentence from the start of the corpus.
    gold_parses = treebank.parsed_sents()[-n:]

    print("evaluating")

    correct_n = gold_n = predicted_n = 0.0

    for gparse, pparse in zip(gold_parses, pred_parses):
        cn, gn, pn = precision_and_recall_stat(get_nodes_with_range(gparse),
                                               get_nodes_with_range(pparse))
        correct_n += cn
        gold_n += gn
        predicted_n += pn

    # correct / predicted is precision; correct / gold is recall.
    print("Precision: %f, Recall: %f" % (correct_n / predicted_n, correct_n / gold_n))

示例#18

0

显示文件

def pcfg(train_idx=None, smoothing=None):
    """
    Induce a PCFG from the leading treebank files, pickle it and return it.

    :param train_idx: number of leading entries of ``treebank.fileids()``
        to train on; defaults to three quarters of the files
    :param smoothing: ``None`` builds the grammar with ``learn_pcfg``;
        ``'L1'`` builds it with ``smoothing_pcfg``
    :return: the grammar, which is also written to ``grammar.pkl``
    :raises ValueError: if ``smoothing`` is neither ``None`` nor ``'L1'``
    """
    # Reject unsupported modes before the expensive corpus pass; previously
    # such a value left `grammar` unbound and failed only at pickling time.
    if smoothing is not None and smoothing != 'L1':
        raise ValueError("unsupported smoothing: %r" % (smoothing,))

    if train_idx is None:
        train_idx = (len(treebank.fileids()) * 3) // 4
    productions = []
    for item in treebank.fileids()[0:train_idx]:
        for tree in treebank.parsed_sents(item):
            # Remove unary production rules
            tree.collapse_unary(collapsePOS=False)
            # Convert into Chomsky normal form,
            # i.e. A->(B,C,D) into A->(B,E) E->(C,D)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

    S = Nonterminal('S')
    if smoothing is None:
        grammar = learn_pcfg(S, productions)
    else:
        grammar = smoothing_pcfg(S, productions)

    with open('grammar.pkl', 'wb') as f:
        pickle.dump(grammar, f)

    return grammar

示例#19

0

显示文件

文件： main.py 项目： syrix78/IFT6285_Devoir8

def train_grammar(unknown_words=None, nb_reduced_production=6000):
    """
    Induce a PCFG from the training files, keeping only frequent productions.

    The productions of every tree in the files listed in ``train`` are
    collected after ``collapse_unary`` and ``chomsky_normal_form``.  Only
    the ``nb_reduced_production`` most common distinct productions are
    kept (each repeated as often as it occurred).  All lexical productions
    are then added back, and for each of them one extra production with
    the same left-hand side is added per unknown word.

    :param unknown_words: words to attach to every lexical left-hand side;
        defaults to no words
    :param nb_reduced_production: number of most common distinct
        productions to keep
    :return: the grammar returned by ``induce_pcfg``
    """
    # Avoid a shared mutable default argument.
    if unknown_words is None:
        unknown_words = []

    productions = []

    for item in train:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()

    counter = collections.Counter(productions)
    # Expand each kept production back to its original multiplicity so the
    # induced probabilities still reflect the observed frequencies.
    n_comms = [
        item
        for item, count in counter.most_common(nb_reduced_production)
        for _ in range(count)
    ]

    # Add unknown words and terminal rules back into the reduced set
    unknown_words_prods = []
    for p in productions:
        # Use the public accessors rather than the private _rhs/_lhs fields.
        if isinstance(p.rhs()[0], str):
            unknown_words_prods.append(p)
            for u in unknown_words:
                unknown_words_prods.append(Production(p.lhs(), [u]))

    n_comms += unknown_words_prods
    S = Nonterminal('S')
    grammar = induce_pcfg(S, n_comms)

    return grammar

示例#20

0

显示文件

文件： demo.py 项目： sushengyang/NLP-project

def convert_wsj(file_obj):
    """
    Feed every treebank parse to a ``TreebankConverter`` and write the result.

    A progress line goes to stderr first; the converter's output is written
    to ``file_obj``.
    """
    from nltk.corpus import treebank
    sys.stderr.write("Converting Penn Treebank sampler...\n")
    converter = TreebankConverter()
    for parsed in treebank.parsed_sents():
        converter.add_sentence(parsed)
    converter.write(file_obj)

示例#21

0

显示文件

def test():
    """
    Build a tree for the first sentence of wsj_0003.mrg and draw it.

    Each lower-cased token is embedded with ``get_embed`` and labelled by
    taking the argmax of ``LeafNet``'s output.  ``calculate_score`` is then
    applied repeatedly with the loaded model until a single item remains.
    That item's ``str`` form is rewritten into bracket notation, parsed
    with ``Tree.fromstring`` and drawn, followed by the preprocessed gold
    tree.
    """
    model = torch.load('./ckpt/model0.pt')
    leafmodel = LeafNet()
    x = treebank.sents('wsj_0003.mrg')[0]
    y = treebank.parsed_sents('wsj_0003.mrg')[0]
    preprocess(y)

    embed_x = []  # embedding vector per token
    x_list = []   # [token, predicted label] per token
    for i in range(len(x)):
        x[i] = x[i].lower()
        vector = torch.Tensor(get_embed(x[i]))
        embed_x.append(vector)
        label = torch.argmax(leafmodel(vector)).item()
        x_list.append([x[i], label])

    # Merge items until only one remains, accumulating the step scores.
    xscore = 0.0
    while len(x_list) != 1:
        x_list, embed_x, tscore = calculate_score(x_list, embed_x, model)
        xscore = xscore + tscore

    # Turn the nested-list repr into bracket notation: square brackets
    # become parentheses, quotes and commas are dropped.
    bracketed = str(x_list)
    for old, new in (('[', '('), (']', ')'), ('\'', ''), (',', '')):
        bracketed = bracketed.replace(old, new)
    x_list_tree = Tree.fromstring(bracketed)

    draw_trees(x_list_tree)
    draw_trees(y)

示例#22

0

显示文件

文件： treeprettyprinter.py 项目： CaptainAL/Spyder

def test():
    """Run the tree drawing checks on sample treebank trees."""
    def render(n, tree, sentence=None, ansi=True, **xargs):
        # Title line, raw tree, then the drawn tree.
        title = '{0}: "{1}"'.format(n, ' '.join(sentence or tree.leaves()))
        print()
        print(title)
        print(tree)
        print()
        drawing = TreePrettyPrinter(tree, sentence)
        try:
            print(drawing.text(unicodelines=ansi, ansi=ansi, **xargs))
        except (UnicodeDecodeError, UnicodeEncodeError):
            # Retry without unicode line drawing and ANSI codes.
            print(drawing.text(unicodelines=False, ansi=False, **xargs))

    from nltk.corpus import treebank
    for n in (0, 1440, 1591, 2771, 2170):
        tree = treebank.parsed_sents()[n]
        render(n, tree, nodedist=2, maxwidth=8)
    print()
    print('ASCII version:')
    # Uses the tree left over from the final loop iteration.
    print(TreePrettyPrinter(tree).text(nodedist=2))

    bracketed = (
        '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
        '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
        '(vg 10) (inf (verb 11)))))) (punct 12))'
    )
    tree = Tree.fromstring(bracketed, read_leaf=int)
    tokens = ('Ze had met haar moeder kunnen gaan winkelen ,'
              ' zwemmen of terrassen .'.split())
    render('Discontinuous tree', tree, tokens, nodedist=2)

示例#23

0

显示文件

文件： symbolic.py 项目： SkittlePox/2951-Final-Project

def PCFG_Section():
    """
    Show a toy PCFG production, induce a PCFG from treebank data and parse.

    The grammar is induced from the productions of every tree in the first
    two treebank files.  The first sentence of ``wsj_0001.mrg`` is then
    parsed with an ``InsideChartParser`` and the probability of each
    returned parse is printed.
    """
    toy_pcfg1 = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
        Det -> 'the' [0.8] | 'my' [0.2]
        N -> 'man' [0.5] | 'telescope' [0.5]
        VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
        V -> 'ate' [0.35] | 'saw' [0.65]
        PP -> P NP [1.0]
        P -> 'with' [0.61] | 'under' [0.39]
    """)

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', pcfg_prod)
    print('pcfg_prod.lhs()  =>', pcfg_prod.lhs())
    print('pcfg_prod.rhs()  =>', pcfg_prod.rhs())
    print('pcfg_prod.prob() =>', pcfg_prod.prob())

    # Collect the productions of every tree in the first two treebank
    # files and induce the PCFG from them.
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for item in treebank.fileids()[:2]:
        for tree in treebank.parsed_sents(item):
            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)

    ### Parsing section below ###

    print("\nParse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(1)

    sent = treebank.parsed_sents('wsj_0001.mrg')[0]
    # A plain treebank Tree has no prob() method; probabilities exist only
    # on the trees produced by the probabilistic parser, so parse the
    # sentence's tokens and report the probability of each parse.
    for parse in parser.parse(sent.leaves()):
        print(parse.prob())

示例#24

0

显示文件

文件： pcfg.py 项目： yiyouls/Natural-Language-Processing-A-Machine-Learning-Perspective

def train():
    """
    Estimate PCFG rule probabilities from the treebank and index the rules.

    Every parsed sentence is converted with ``chomsky_normal_form``.  Rule
    probabilities are relative frequencies per left-hand side, with
    terminals lower-cased, and are then passed through ``katz_smooth``
    together with the vocabulary.

    :return: dict with the keys ``'vocab'``, ``'left_rules'``,
        ``'right_rules'``, ``'unary_rules'``, ``'rules_to_prob'`` and
        ``'terminal_nonterms'``
    """
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")

    # prepare parse trees, extracted from treebank
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)

    # build vocabulary list, extracted from treebank
    vocab_size = 10000  # set vocabulary size to 10000
    # NOTE(review): the vocabulary is counted on the original-case words
    # while rule terminals below are lower-cased -- confirm whether the
    # vocabulary should be lower-cased as well.
    vocab = [wrd for wrd, freq in Counter(treebank.words()).most_common(vocab_size)]

    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)

    # count rules and left-hand sides, lower-casing terminal symbols
    for sent in tbank_trees:
        for production in sent.productions():
            if len(production.rhs()) == 1 and not isinstance(production.rhs()[0], Nonterminal):
                production = Production(production.lhs(), [production.rhs()[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    # turn the counts into relative frequencies per left-hand side
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]

    # use Katz smoothing
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)

    # index rules by their first / second right-hand symbol; rules with a
    # single right-hand symbol go to unary_rules
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)

    # count, per left-hand side, the rules that rewrite to a single string
    terminal_nonterms = defaultdict(int)
    for rule in rules_to_prob:
        if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str):
            terminal_nonterms[rule.lhs()] += 1

    # Built once after the loop: when constructed inside the loop body the
    # result was unbound whenever there were no terminal rules.
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms
    }
    return pcfg_parser

示例#25

0

显示文件

文件： data_ptb.py 项目： drewjel/CS6316-Final-Project

 def add_words(self, file_ids):
     """
     Add the words of every parsed sentence in ``file_ids`` to ``self.dict``.

     The words come from ``Corpus._filter_words`` and are wrapped in a
     leading and a trailing '<eos>' marker before being added.
     """
     for file_id in file_ids:
         for sen_tree in ptb.parsed_sents(file_id):
             tokens = ['<eos>'] + Corpus._filter_words(sen_tree) + ['<eos>']
             for token in tokens:
                 self.dict.add(token)

示例#26

0

显示文件

文件： demo.py 项目： Sandy4321/nltk_contrib

def convert_wsj(file_obj):
    """
    Convert the treebank parses and write them to ``file_obj``.

    Announces the conversion on stderr, adds every parsed sentence to a
    fresh ``TreebankConverter`` and lets the converter write to
    ``file_obj``.
    """
    from nltk.corpus import treebank

    sys.stderr.write("Converting Penn Treebank sampler...\n")

    tb = TreebankConverter()
    add_sentence = tb.add_sentence
    for tree in treebank.parsed_sents():
        add_sentence(tree)

    tb.write(file_obj)

示例#27

0

显示文件

文件： induce_pcfg.py 项目： pramodkaushik/nlu-hmrnn

 def _induce_grammar(self):
     """
     Fill ``self.productions`` with the productions of all treebank trees.

     Each tree is first transformed in place with ``collapse_unary`` and
     ``chomsky_normal_form``.
     """
     self.productions = []
     collected = self.productions  # same list object, extended in place
     all_files = treebank.fileids()
     for tree in treebank.parsed_sents(all_files):
         # perform optional tree transformations, e.g.:
         # Remove branches A-B-C into A-B+C
         tree.collapse_unary(collapsePOS=False)
         # Remove A->(B,C,D) into A->B,C+D->D
         tree.chomsky_normal_form(horzMarkov=2)
         collected.extend(tree.productions())

示例#28

0

显示文件

文件： Assignment_2.py 项目： adics116/ChatBot_IO

def nltk_parse(s):
    """
    Print tokenisation, tagging and NE chunking of ``s``, then draw a tree.

    The drawn tree is the first parsed sentence of wsj_0001.mrg; it does
    not depend on ``s``.
    """
    words = nltk.word_tokenize(s)
    print(words)

    tagged_words = nltk.pos_tag(words)
    print(tagged_words[:6])

    print(nltk.chunk.ne_chunk(tagged_words))

    treebank.parsed_sents('wsj_0001.mrg')[0].draw()

示例#29

0

显示文件

文件： PennTreeReader.py 项目： aniltrue/MLProject

def read_data():
    """
    Build prefix-window training data for POS tagging from the treebank.

    For every word position ``j`` of every sentence one sample is created:
    the input row holds the ids of the words up to and including ``j``,
    right-aligned and left-padded with 0, and the target is the one-hot
    encoding of the tag at ``j``.  Rows have ``min(max_words, seq_length)``
    entries; longer prefixes keep only their most recent words.

    :return: ``(X_data, Y_data, unique_labels, words, word2id, id2word)``
    """
    treebank_tagged_sents = [
        tree.pos()
        for pf in treebank.fileids()
        for tree in treebank.parsed_sents(pf)
    ]

    words_list = [[tag[0] for tag in sent] for sent in treebank_tagged_sents]
    labels = [[tag[1] for tag in sent] for sent in treebank_tagged_sents]

    max_words = max((len(sent) for sent in words_list), default=0)

    print("Max. Words:", max_words)

    seq_length = 100

    print("Seq. Length:", seq_length)

    # Sorted so that word ids are reproducible between runs; plain set
    # iteration order depends on string hash randomisation.
    words = sorted({word for sent in words_list for word in sent})

    print("Number of Words:", len(words))

    unique_labels = sorted({label for sent in labels for label in sent})

    print("Number of Unique Labels:", len(unique_labels))

    # Id 0 is reserved for padding, so real words start at 1.
    word2id = {word: i + 1 for i, word in enumerate(words)}
    id2word = {i + 1: word for i, word in enumerate(words)}

    X_data = []
    Y_data = []

    width = min(max_words, seq_length)
    for sent_words, sent_labels in zip(words_list, labels):
        ids = [word2id[word] for word in sent_words]
        for j, label in enumerate(sent_labels):
            # Most recent `width` words ending at position j; a slice
            # replaces the per-position rebuild of the whole prefix.
            window = ids[max(0, j + 1 - width):j + 1]
            X_data.append([0] * (width - len(window)) + window)
            Y_data.append(one_hot(label, unique_labels))

    X_data = np.array(X_data, dtype=np.int32)
    Y_data = np.array(Y_data, dtype=np.float32)

    print(X_data.shape)
    print(Y_data.shape)

    return X_data, Y_data, unique_labels, words, word2id, id2word

示例#30

0

显示文件

 def read_wsj_from_treebank(self, index):
     """
     Load the sentences and parse trees of one treebank WSJ file.

     The instance is reset first; the file name, the plain sentences and
     the parsed sentences are then stored on the instance.  When the
     verbose flag is set, everything is printed.

     :param index: number of the WSJ file, e.g. 1 for 'wsj_0001.mrg'
     :return: the parsed sentences of that file
     """
     from nltk.corpus import treebank
     self.__reset()
     # Zero-pad to four digits so that indices of 10 and above map to names
     # such as 'wsj_0010.mrg' rather than 'wsj_00010.mrg'.
     self.__input_text = 'wsj_' + str(index).zfill(4) + '.mrg'
     self.__sents = treebank.sents(self.__input_text)
     self.__tagged_sents = treebank.parsed_sents(self.__input_text)
     if self.__verbose:
         self.__print_all()
     return self.__tagged_sents

示例#31

0

显示文件

def get_processed_data():
    """
    Split the treebank parses 80/20 and convert both parts.

    Both halves returned by ``train_test_split`` are materialised as lists
    and passed through ``convert_to_base_category``.

    :return: ``(train_bank, test_bank)``
    """
    train_part, test_part = train_test_split(
        treebank.parsed_sents(), test_size=0.2
    )
    train_trees = list(train_part)
    test_trees = list(test_part)
    return (
        convert_to_base_category(train_trees),
        convert_to_base_category(test_trees),
    )

示例#32

0

显示文件

def extract_simple_productions(n):
    """
    Return the simplified productions of the first ``n`` treebank trees.

    Every production is passed through ``simple_rule``; results whose
    string form contains "EMPTY" are dropped.
    """
    rules = []
    for t in treebank.parsed_sents()[:n]:
        # extend() avoids copying the whole accumulated list per tree.
        rules.extend(t.productions())
    simplified = (simple_rule(r) for r in rules)
    return [r for r in simplified if "EMPTY" not in str(r)]

示例#33

0

显示文件

文件： pset4.py 项目： limz10/NLP

def TreebankNoTraces():
    """
    Return the cleaned treebank trees whose root label is "S".

    Each kept tree is passed to ``RemoveFunctionTags`` and
    ``RemoveTraces``, then transformed in place with ``collapse_unary``
    and ``chomsky_normal_form``.
    """
    cleaned = []
    for tree in treebank.parsed_sents():
        if tree.label() == "S":
            RemoveFunctionTags(tree)
            RemoveTraces(tree)
            tree.collapse_unary(collapsePOS=True, collapseRoot=True)
            tree.chomsky_normal_form()
            cleaned.append(tree)
    return cleaned

示例#34

0

显示文件

def getTrees(source, size):
    '''Return the first SIZE treebank trees, or an empty list for any other source.'''
    if source != 'treebank':
        return []
    # Imported lazily so other sources never touch the corpus.
    from nltk.corpus import treebank
    return treebank.parsed_sents()[:size]

示例#35

0

显示文件

文件： PCFG_util.py 项目： Jsalim/NLP-Stuff

def getTrees(source,size):
    '''Load trees from SOURCE and return the first SIZE of them.

    Only the 'treebank' source is known; anything else gives an empty list.
    '''
    result = list()
    if source == 'treebank':
        from nltk.corpus import treebank
        result = treebank.parsed_sents()[:size]
    return result

示例#36

0

显示文件

def learn_treebank(trees=None):
    """
    Train a PCFG with ``learn_trees`` and return it.

    ``trees`` is the collection of trees to learn from.  When it is None,
    ``treebank.parsed_sents()`` is used instead (described upstream as
    NLTK's 10% sample of the Penn Treebank).
    """
    if trees is not None:
        return learn_trees(trees, collapse=True)
    return learn_trees(treebank.parsed_sents(), collapse=True)

示例#37

0

显示文件

def main(phrase_level, sanitize):
    """
    Print the first subtree labelled ``phrase_level`` of each sentence.

    Files wsj_0001.mrg through wsj_0199.mrg are read.  When ``sanitize``
    equals True the subtree is passed to ``sanitize_tree`` before printing.
    """
    def has_wanted_label(t):
        return t.label() == phrase_level

    for number in range(1, 200):
        tree_file = "wsj_{:04d}.mrg".format(number)
        for parsed in treebank.parsed_sents(tree_file):
            # Only the first matching subtree of a sentence is used.
            first_match = next(parsed.subtrees(has_wanted_label), None)
            if first_match is None:
                continue
            if sanitize == True:
                sanitize_tree(first_match)
            print(first_match.pformat(100000))

示例#38

0

显示文件

文件： learn_pcfg.py 项目： JakeBrawer/org

def learn_treebank(trees=None):
    """
    Return a PCFG learned from Penn Treebank trees.

    Pass your own collection of trees, or leave ``trees`` as None to learn
    from the parsed sentences of the treebank sample shipped with NLTK.
    Learning itself is done by ``learn_trees`` with ``collapse=True``.
    """
    if trees is None:
        trees = treebank.parsed_sents()
    return learn_trees(trees, collapse=True)

示例#39

0

显示文件

文件： pcfg.py 项目： XsongyangX/ift6285-hw8

def parse_treebank(parser: ViterbiParser, sentences):
    """Parse the leaves of treebank sentences and print every parse found.

    Only the first three entries of ``sentences`` are handed to
    ``treebank.parsed_sents`` (presumably treebank fileids -- confirm against
    the caller).  After each printed parse the time elapsed since this
    function started is reported; the clock is never reset, so the figure is
    cumulative across sentences.
    """
    start_time = time.time()
    parser.trace(trace=1)
    selection = sentences[:3]
    for gold_tree in treebank.parsed_sents(selection):
        tokens = gold_tree.leaves()
        for parse in parser.parse(tokens):
            print(parse)
            print(f"Time elapsed for sentence of length {len(tokens)}: {time.time() - start_time}")

示例#40

0

显示文件

文件： pset4.py 项目： weitongruan/Comp150NLP

def TreebankNoTraces():
    """Collect the "S"-rooted treebank trees in cleaned, binarized form.

    A tree is kept only when its root label is "S".  Kept trees go through
    RemoveFunctionTags and RemoveTraces, then unary collapsing (with
    collapsePOS and collapseRoot enabled) and Chomsky-normal-form conversion.
    """
    kept_trees = []
    for tree in treebank.parsed_sents():
        rooted_at_s = tree.label() == "S"
        if not rooted_at_s:
            continue
        RemoveFunctionTags(tree)
        RemoveTraces(tree)
        tree.collapse_unary(collapsePOS=True, collapseRoot=True)
        tree.chomsky_normal_form()
        kept_trees.append(tree)
    return kept_trees

示例#41

0

显示文件

文件： nltk_test.py 项目： HideOnHouse/Study

def main():
    """Run a short NLTK tour: tokenize, POS-tag, NE-chunk, load a treebank tree.

    Prints the tokens, the first six tagged tokens and the chunked entities.
    The first parsed sentence of wsj_0001.mrg is loaded last and not used.
    """
    # NOTE(review): the "..." at the start of lines 2 and 3 is part of the
    # text and gets tokenized too -- looks like pasted doctest continuation
    # prompts; confirm whether that is intended.
    sentence = """I saw a man with a telescope.
    ... Colorless green ideas sleep furiously.
    ... The horse raced past the barn fell."""

    word_tokens = nltk.word_tokenize(sentence)
    print(word_tokens)

    pos_tagged = nltk.pos_tag(word_tokens)
    print(pos_tagged[:6])

    print(nltk.chunk.ne_chunk(pos_tagged))

    first_tree = treebank.parsed_sents('wsj_0001.mrg')[0]

示例#42

0

显示文件

文件： funtag.py 项目： EddieNejadi/Machine_Learning

def write_example_tree(features, f):
    """Write the sentence tree of one example to ``f`` with its phrase marked.

    ``features`` must provide '_filename', '_sentence_id' and '_phrase_id'.
    The phrase is the node at index ``_phrase_id`` of the tree's preorder
    positions.  Its label is wrapped in '***' while the tree is written
    (followed by a newline) and then set back to the original label.
    """
    filename, sentence_idx, phrase_idx = (
        features[key] for key in ('_filename', '_sentence_id', '_phrase_id')
    )
    tree = treebank.parsed_sents(filename)[sentence_idx]
    position = tree.treepositions('preorder')[phrase_idx]
    phrase = tree[position]

    original_label = treebank_helper.get_label(phrase)
    highlighted = '***' + original_label + '***'
    treebank_helper.set_label(phrase, highlighted)
    f.write(str(tree))
    f.write('\n')
    # Undo the temporary highlighting.
    treebank_helper.set_label(phrase, original_label)

示例#43

0

显示文件

文件： helper.py 项目： barbaragabriela/inlp-probabilistic-parsing

def treebank_accessor():
  '''
  Gather the parsed sentences of the first TREEBANK_FILES treebank files.

  Files are addressed as 'wsj_0NNN.mrg' with NNN running from 001 up to
  TREEBANK_FILES; the trees of all files are returned in one flat list.
  '''
  trees = []

  for file_index in range(1, TREEBANK_FILES + 1):
    fileid = 'wsj_0%03d.mrg' % file_index
    # Append every sentence tree of this file, in file order.
    trees.extend(treebank.parsed_sents(fileid))

  return trees

示例#44

0

显示文件

文件： syntactic_formation.py 项目： snyderp/cs412-scorer

def get_treebank_rules(cutoff=0, include_counts=False):
    """Return the lexical rules seen more than ``cutoff`` times in the treebank.

    Rule counts are read from the cache entry ('treebank_rules', 'rules').
    When that entry is missing or empty, the counts are rebuilt by summing
    ``lexical_rules(tree)`` over every parsed treebank sentence and stored
    back in the cache.

    With ``include_counts`` true the result is a dict of rule -> count;
    otherwise it is a set of rules.
    """
    rule_counts = cache_utils.cache_get('treebank_rules', 'rules')
    if not rule_counts:
        log('Generating lexical rules from Penn Treebank', 4)
        from nltk.corpus import treebank

        rule_counts = {}
        for tree in treebank.parsed_sents():
            for rule, seen in lexical_rules(tree).items():
                rule_counts[rule] = seen + rule_counts.get(rule, 0)

        cache_utils.cache_set('treebank_rules', 'rules', rule_counts)

    frequent = [(rule, seen) for rule, seen in rule_counts.items() if seen > cutoff]
    if include_counts:
        return dict(frequent)
    return {rule for rule, _ in frequent}

示例#45

0

显示文件

文件： funtag.py 项目： EddieNejadi/Machine_Learning

def read_treebank_files(files, extractor,fe):
    """Collect function-tagging examples from every tree of the given files.

    Each parsed sentence is converted to a ParentedTree, post-processed by
    ``treebank_helper.postprocess`` and handed to ``find_examples_in_tree``
    together with the two accumulator lists, ``extractor``, ``fe``, the file
    name, the sentence's index within its file and a starting value of 0.

    Returns the two accumulator lists ``(X, Y)``: extracted feature dicts and
    the corresponding true function tags.
    """
    X, Y = [], []
    for filename in files:
        # The sentence index restarts at 0 for every file.
        for scount, raw_tree in enumerate(treebank.parsed_sents(filename)):
            parented = ParentedTree.convert(raw_tree)
            treebank_helper.postprocess(parented)
            find_examples_in_tree(parented, X, Y, extractor, fe, filename, scount, 0)
    return X, Y

示例#46

0

显示文件

文件： main.py 项目： hmc-cs159-spring2016/banana

def get_trees(fileids=None, verbose=False):
	"""
	Return the converted trees for the given treebank fileids.

	When fileids is None or empty, every fileid of the treebank corpus is
	used.  Each sentence tree is passed through ctc.convert_tree (the CNF
	conversion, going by the names used here); progress counts are printed
	when verbose is true.
	"""
	selected = fileids if fileids else treebank.fileids()

	# Flatten the per-file sentence lists into one list of trees.
	trees = [sent for file_id in selected for sent in treebank.parsed_sents(file_id)]
	if verbose:
		print("obtained", len(trees), "trees from the corpus.")

	cnf_trees = list(map(ctc.convert_tree, trees))
	if verbose:
		print("converted", len(trees), "trees to cnf.")

	return cnf_trees

示例#47

0

显示文件

文件： Document.py 项目： jpurma/Kataja

    def create_forests(self, filename=None, treelist=None, clear=False):
        """ Build one Forest per input tree or sentence and return them as a list.

        Inputs come from the first available source: the given ``treelist``,
        NLTK's treebank files (when ``has_nltk`` is true), or the text file
        ``filename`` read one entry per non-empty line.

        :param filename: file read when neither ``treelist`` nor NLTK is
            available; defaults to ``Document.get_default_treeset_file()``
        :param treelist: optional ready-made inputs; a list item is used as
            an input tree, anything else as input text
        :param clear: not used by this method
        :return: list of Forest objects, one per input
        """
        filename = filename or Document.get_default_treeset_file()

        forests = []
        input_trees = []

        shared_lexicon = load_lexicon(Document.get_default_lexicon_file())
        print('loaded shared_lexicon: ', shared_lexicon)
        if treelist:
            input_trees = treelist
        elif has_nltk:
            print(f"reading trees {NLTK_TREE_RANGE[0]}-{NLTK_TREE_RANGE[1]} from NLTK's treebank")
            for i in range(*NLTK_TREE_RANGE):  # 199
                trees = treebank.parsed_sents(f'wsj_0{str(i).rjust(3, "0")}.mrg')
                for tree in trees:
                    tree.chomsky_normal_form()
                    tree.collapse_unary()
                    input_trees.append(as_list(tree))
        else:
            # The context manager closes the file even if a line fails to
            # parse; the handle used to be left open.
            with open(filename, 'r') as readfile:
                for line in readfile:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith('[') and line.endswith(']'):
                        # Bracketed lines hold a Python-literal list tree.
                        input_trees.append(ast.literal_eval(line))
                    else:
                        input_trees.append(line)

        for input_tree in input_trees:
            syn = classes.SyntaxAPI()
            # Every forest shares the same lexicon object.
            syn.lexicon = shared_lexicon
            if isinstance(input_tree, list):
                syn.input_tree = input_tree
            else:
                syn.input_text = input_tree
            forest = Forest(heading_text=str(input_tree), syntax=syn)
            forests.append(forest)
        return forests

示例#48

0

显示文件

文件： write.py 项目： owenst/geotweets

def train_pcfg():
    """Induce and return a PCFG from the first 20 files of the NLTK treebank.

    Every parsed sentence has its unary chains collapsed (POS level left
    untouched) and is binarized into Chomsky normal form before its
    productions are collected.  The grammar is induced with start symbol 'S'.
    Progress messages are printed before and after training.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('training grammar')
    productions = []
    # Only the first 20 fileids are used: a smaller grammar trains faster.
    for fileid in treebank.fileids()[:20]:
        for tree in treebank.parsed_sents(fileid):
            # Collapse unary chains so the grammar has no unary cycles.
            tree.collapse_unary(collapsePOS=False)
            # Binarize (needed by CKY-style parsers).  Default settings are
            # used; horizontal/vertical Markovization is left off because it
            # makes the grammar much larger.
            tree.chomsky_normal_form()
            productions.extend(tree.productions())
    start = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(start, productions)
    print("grammar trained!")
    return grammar

示例#49

0

显示文件

文件： fetch-senseval3-aw.py 项目： ai-ku/uwsd

#! /usr/bin/python
# -*- coding: utf-8 -*-

__author__ = "Osman Baskaya"

from nltk.corpus import treebank

# Treebank files whose sentences are written out as plain text.
files = "cl23.mrg wsj_1695.mrg wsj_1778.mrg".split()

# One line per sentence: its surface tokens joined by single spaces.
for f in files:
    for sentence in treebank.parsed_sents(f):
        # Leaves tagged '-NONE-' are skipped so only real tokens remain.
        s = [word for word, p in sentence.pos() if p != '-NONE-']
        # Single-argument print(...) works on both Python 2 and Python 3.
        print(' '.join(s))


#f = '../data/senseval3/english-all-words.xml'

#soup = BeautifulSoup(open(f), 'xml')
#texts = soup.find_all('text')
#sentences = []
#quot_set = set(['"', ])
#quot = False
#sentence = []
#for t in texts:
    #tokens = t.text.split()
    #for token in tokens:
        #if token in quot_set:
            #quot = not quot

示例#50

0

显示文件

文件： rulefinder.py 项目： johncadigan/Coursework

    rules = []
    results = re.findall("(\({0}\ {1}\))".format(rule,word), sent)
    for res in results:
        x = res.split(" ")
        if len(x) == 2:
            p,c = x
            rules.append("{0} -> '{1}'".format(p[1:], c[:-1]))
    return rules

def check(productions, rules):
    """Count how many productions appear, as strings, in ``rules``.

    Every production whose ``str()`` form is not found in ``rules`` is
    printed.  Returns ``(matched, total)`` where ``total`` is
    ``len(productions)``.
    """
    matched = 0
    for production in productions:
        if str(production) in rules:
            matched += 1
        else:
            # Single-argument print(...) works on both Python 2 and Python 3.
            print(production)
    return (matched, len(productions))

if __name__=="__main__":
    # For every treebank sentence, compare the tree's own productions with
    # the rules recovered from its bracketed string, and report the overall
    # fraction that was recovered.
    total, recall = 0, 0
    for s in treebank.parsed_sents():
        # Put the bracketed tree on a single line before pattern matching.
        sent = str(s).replace("\n", "")
        unaries = find_unary(sent)
        nonunaries = find_nonunary(sent)
        rules = unaries + nonunaries
        r, t = check(s.productions(), rules)
        recall += r
        total += t
    # Single-argument print(...) works on both Python 2 and Python 3.
    print("{0} out of {1}: {2}".format(recall, total, float(recall)/total))

示例#51

0

显示文件

文件： file_tester.py 项目： treyfeldman/Hobb-s-Algorithm

def find_pronouns(tree):
    """Return (pronoun, None) pairs for every pronoun leaf under ``tree``.

    A direct child counts when it is exactly a ``unicode`` or ``str`` object
    whose lower-cased form is in PRONOUNS; the lower-cased form is what gets
    reported.  ParentedTree children are searched recursively.  The second
    element of every pair is always None.
    """
    found = []
    for node in tree:
        is_text = type(node) in [unicode, str]
        if is_text and node.lower() in PRONOUNS:
            found.append((node.lower(), None))

        if isinstance(node, ParentedTree):
            found.extend(find_pronouns(node))

    return found

# Pronoun statistics per treebank file.  `total` is a running count over all
# files; `stats` holds the counters of the file currently being processed and
# a snapshot of it is appended to `files` after each file.
total = 0
for file in treebank.fileids():
    stats['name'] = file
    for tree in treebank.parsed_sents(file):
        tree = ParentedTree.convert(tree)
        # np_node is not used here.
        for pronoun, np_node in find_pronouns(tree):
            if pronoun in gendered:
                stats['gendered'] += 1
            if pronoun in itits:
                stats['itits'] += 1
            stats['total'] += 1
            total += 1
            # Recomputed after every pronoun so it is current when the file ends.
            stats['pct_gendered'] = stats['gendered']/float(stats['total'])
    # Prints the file name with the cumulative (not per-file) pronoun count.
    print file, total


    files.append(stats.copy())
    # Start the next file with the same keys, all reset to 0 ('name'
    # included; it is assigned again at the top of the loop).
    stats = dict.fromkeys(stats, 0)

示例#52

0

显示文件

文件： treebanks.py 项目： anderscui/nlpy

import nltk
from nltk.corpus import treebank

# Load a sample: the first parsed sentence of treebank file wsj_0001.mrg.
t = treebank.parsed_sents('wsj_0001.mrg')[0]
# print(t)

# filter sentential complements
def filter(tree):
    """Return True for a 'VP' node with an 'S' node among its direct children.

    Only children that are ``nltk.Tree`` instances are considered; plain
    string leaves are ignored.
    """
    if tree.label() != 'VP':
        return False
    return any(
        isinstance(child, nltk.Tree) and child.label() == 'S'
        for child in tree
    )


# Gather every subtree accepted by `filter` across the whole treebank,
# then print them one after another.
subtrees = []
for tree in treebank.parsed_sents():
    subtrees.extend(tree.subtrees(filter))

for st in subtrees:
    print(st)

示例#53

0

显示文件

文件： penn_treebank_extractor.py 项目： satwantrana/list-extractor

# Extracts Penn Treebank from NLTK.
from nltk.corpus import treebank
from operator import itemgetter
import codecs
words = treebank.sents()
# POS tags only (second element of each tagged token), one list per sentence.
tagged_words = [[pair[1] for pair in sent] for sent in treebank.tagged_sents()]
parsed_sents = treebank.parsed_sents()

total_sents = len(parsed_sents)

# Output layout: the sentence count, then for each sentence its length, the
# space-joined words, the space-joined tags, the number of lines of the
# printed tree, and the tree lines themselves.
# The context manager closes (and flushes) the file; it was never closed before.
with codecs.open('../data/penn_treebank','w','utf-8') as f:
	assert (len(words) == len(tagged_words) and len(words) == len(parsed_sents)), ' '.join(map(str, [len(words), len(tagged_words), len(parsed_sents)]))
	f.write(str(total_sents) + '\n')
	# range() instead of xrange() so the script also runs on Python 3.
	for i in range(total_sents):
		sent_len = len(words[i])
		f.write(str(sent_len) + '\n')

		sent = ' '.join(words[i])
		pos = ' '.join(tagged_words[i])
		# Tokens and tags must contain no newline or space, otherwise the
		# line-based format could not be read back unambiguously.
		assert(sent.count('\n') == 0 and pos.count('\n') == 0 and len(sent.split(' ')) == sent_len and len(pos.split(' ')) == sent_len)
		f.write(sent + '\n')
		f.write(pos + '\n')

		tree = str(parsed_sents[i]).split('\n')
		f.write(str(len(tree)) + '\n')
		f.write('\n'.join(tree) + '\n')

示例#54

0

显示文件

文件： hw4_02.py 项目： thedansimonson/StatNLP_HW4

from nltk.corpus import treebank
from nltk import Tree, Nonterminal
from nltk.parse.viterbi import ViterbiParser
from nltk.grammar import induce_pcfg
from os import getcwd, walk
from pickle import dump

###############################
# 2) Remove numerical indices #
###############################

# Render every treebank tree as a string so indices can be stripped textually.
print "Loading treebank."
sentenceStrings = map(lambda x: x.pprint(), treebank.parsed_sents())

# Candidate index numbers as strings, "166" down to "1"; the longest numbers
# come first in the replacement order.
indexStrings = map(str, range(166,0,-1))

# (prefix, replacement) pairs: the characters that precede an index and the
# text that should be left behind once prefix+index is removed.
indexPrefixes = [("-",""), ("=","")]
# For one (prefix, replacement) pair, build a (prefix+index, replacement)
# tuple for every candidate index.
indiceRemedy = lambda n: map(lambda x: (n[0]+x,n[1]), indexStrings)
# Concatenate those tuples for all prefixes into one flat list.  The list
# concatenation relies on map() returning a list (Python 2).
fixingTuples = reduce(lambda x,y: x+indiceRemedy(y), [[]]+indexPrefixes)

# Apply every (target, replacement) pair in order to a string; the reduce is
# seeded with the string itself.
print "Cleaning POS tags."
removeTargets = lambda x: reduce(lambda y,z: y.replace(z[0],z[1]), \
		[x]+fixingTuples)
sentenceStrings = map(removeTargets, sentenceStrings)
# Rebuild Tree objects from the cleaned strings.
sentenceTrees = map(lambda x: Tree(x), sentenceStrings)

###################################################

示例#55

0

显示文件

文件： ch5_2.py 项目： PacktPublishing/Mastering-Natural-Language-Processing-with-Python

import nltk
from nltk.corpus import treebank
# Show the third parsed sentence (index 2) of treebank file wsj_0007.mrg.
third_tree = treebank.parsed_sents('wsj_0007.mrg')[2]
print(third_tree)

示例#56

0

显示文件

文件： GrammarImporter.py 项目： AlejandroAcosta/Parser

from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal
from nltk.treetransforms import chomsky_normal_form


'''
tbank_productions = set(production for sent in treebank.parsed_sents()
                        for production in sent.productions())

'''

# induce_pcfg is imported directly: this module never imports the `nltk`
# package itself, so the former `nltk.induce_pcfg(...)` call raised NameError.
from nltk.grammar import induce_pcfg

# Collect the productions of every treebank sentence after binarizing it.
treebank_prods = []
# fileids() is the public accessor; it replaces indexing the private
# treebank._fileids list with a hard-coded count of 199 files.
for fileid in treebank.fileids():
    for tree in treebank.parsed_sents(fileid):
        tree.chomsky_normal_form()

        treebank_prods += tree.productions()



# Induce the PCFG with start symbol 'S' from the collected productions.
tTCpcfg = induce_pcfg(Nonterminal('S'), treebank_prods)

# induce pcfg

# PTCpcfg = nltk.induce_pcfg(tbank_grammar)

# treetransforms: chomsky_normal_form

print("done! You have your WeightedGrammar")

示例#57

0

显示文件

文件： Prediction_script.py 项目： AlexanderERoss/Sylf

# Location of the training data (a CSV with 'question1'/'question2' columns).
master_path = "./Data/"
train_filepath = master_path + "train.csv"
train_data = pd.read_csv(train_filepath)

dup_prob = []

row_count = 0

for row in train_data.iterrows():
    # iterrows() yields (index, Series) pairs; row[1] is the row's data.
    row_count += 1
    q1 = row[1]['question1']
    q2 = row[1]['question2']

    # Debug output for the first 18 rows only.  This used to be a `while`
    # whose condition never changed inside its body, so it never terminated.
    # NOTE(review): parsed_sents() is given the question text where NLTK
    # expects corpus fileids -- confirm this is what was meant.
    if row_count < 19:
        print(treebank.parsed_sents(q1)[0])
        print(treebank.parsed_sents(q2)[0])

    # Missing questions are treated as having no words.
    if pd.isnull(q1):
        q1_words = []
    else:
        q1_words = q1.split(' ')

    if pd.isnull(q2):
        q2_words = []
    else:
        q2_words = q2.split(' ')

    wd_counter = 0
    sim_counter = 0

示例#58

0

显示文件

文件： avg+viterbi.py 项目： albertomh/py-nltk-parsing

import nltk
from nltk.corpus import treebank
from nltk.probability import *
from nltk.grammar import *

### LOAD EVERY PARSED SENTENCE OF THE TREEBANK, THEN KEEP ONLY THE FIRST 100 TREES.
all_trees = treebank.parsed_sents()
trees_100 = all_trees[:100]

### FUNCTION EXTRACTING LEAVES OF NODES WITH LABEL AS A PARAMETER OF getAvgNodeLength().
def getAvgNodeLength(label):

    l_leaves = list()
    for tree in trees_100:
        for node in tree:
            if node.label() == label:
                l_leaves.append(node.leaves())

### CREATED OWN LIST OF PUNCTUATION TO EXCLUDE SINCE USING string.punctuation WOULD
### HAVE DELETED WORDS SUCH AS "Dr.", "World-Wide", "U.S.", etc. WHICH ARE OF INTEREST.
    punct = [u"*", u",", u"&", u"'"]

    for wordlist in l_sbj:
        for word in wordlist:
            for i in punct:
                if i in word:
                    wordlist.remove(word)

### CREATE LIST OF LENGTHS (IN WORDS) OF NODES.
    l_len = list()
    for wordlist in l_leaves:

示例#59

0

显示文件

文件： soluciones_texto.py 项目： tgquintela/ProgrammingExercises

# Tally how often each item of each sentence occurs.
count = {}
for sentence in sentences:
    for word in sentence:
        count[word] = count.get(word, 0) + 1


## 3. Word-transition statistics (2-gram model)
from sklearn.feature_extraction.text import CountVectorizer


# Split the text on '.' and drop the piece after the final period.
sentences = texto.strip().split('.')[:-1]
bigram_vectorizer = CountVectorizer(ngram_range=(1,2), min_df=1)
bigram_counts = bigram_vectorizer.fit_transform(sentences)
X_2 = bigram_counts.toarray()


## 4. Using NLTK do a Part of Speech tagging (POS tagging)
# NOTE: `sentence` is whatever the counting loop above left bound, i.e. the
# last item it iterated over.
tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)

## 5. Draw a lexical-grammatical tree with the help of NLTK
from nltk.corpus import treebank

t = treebank.parsed_sents('wsj_0001.mrg')[0]
t.draw()

示例#60

-1

显示文件

文件： HobbsImplementation.py 项目： treyfeldman/Hobb-s-Algorithm

def main():
    # Run the Hobbs resolver over every pronoun in the FILENAMES treebank
    # files, compare each proposal with the next line of coref_key.txt, and
    # print running, per-file and per-pronoun-class accuracy figures.
    #
    # The key file is opened here and not closed by this function.
    answers = open('coref_key.txt', 'r')
    this_correct = 0
    correct = 0
    total = 0
    # Trees of the sentences already seen in the current file; passed to
    # hobbs() for every pronoun.
    prev_sentences = deque()
    for file in FILENAMES:
        # Per-file counters and sentence history start fresh for each file.
        this_correct = 0
        this_total = 0
        prev_sentences.clear()
        for tree in treebank.parsed_sents(file):


            tree = ParentedTree.convert(tree)

            for pronoun, np_node in find_pronouns(tree):

                # i = 0
                # for t in list(prev_sentences)[-3:]:
                #     t.pretty_print()
                #     print("-"*25)
                #     i = i + 1
                #     if i == 3: break
                proposed = hobbs_to_string(hobbs(np_node, pronoun.lower(), prev_sentences))
                tree.pretty_print()

                # One key line is consumed per pronoun, in encounter order.
                actual = answers.readline()

                # actual[:-1] drops the last character of the key line
                # (its newline, when the line has one) before comparing.
                if  proposed == actual[:-1]:
                    update_pronoun_results(pronoun, 1)
                    correct += 1
                    this_correct += 1

                # Called for every pronoun, whether or not it was correct.
                update_pronoun_results(pronoun, 0)
                total += 1
                this_total += 1

                # `actual` is printed as read, i.e. without the [:-1] trim.
                print "Pronoun: '" + pronoun + "'   Proposed: '" + proposed + "'   Actual: '" + actual + "'"

                if total: print "Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct/float(total), "\n"


                print("*"*100)
                print("*"*100)
            # The sentence joins the history only after its own pronouns
            # have been processed.
            prev_sentences.append(tree)
        print("-"*50)
        # The per-file line is printed only when at least one pronoun in the
        # file was resolved correctly.
        if this_correct: print file,":\tCorrect:", this_correct, "\tTotal:", this_total, "\tPercentage:", this_correct/float(this_total), "\n"
        if total: print "Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct/float(total), "\n"
        print("-"*50)

    # Final per-class summary, read from the PRONOUN_RESULTS mapping.
    print "Male correct:", PRONOUN_RESULTS['male'], "\tMale total:", PRONOUN_RESULTS['male_total'], "\tPercent correct:", PRONOUN_RESULTS['male_pct']
    print "Female correct:", PRONOUN_RESULTS['female'], "\tFemale total:", PRONOUN_RESULTS['female_total'], "\tPercent correct:", PRONOUN_RESULTS['female_pct']
    print "Neutral correct:", PRONOUN_RESULTS['neutral'], "\tNeutral total:", PRONOUN_RESULTS['neutral_total'], "\tPercent correct:", PRONOUN_RESULTS['neutral_pct']
    print "Plural correct:", PRONOUN_RESULTS['they'], "\tPlural total:", PRONOUN_RESULTS['they_total'], "\tPercent correct:", PRONOUN_RESULTS['they_pct']
    print "Reflexive correct:", PRONOUN_RESULTS['reflexive'], "\tReflexive total:", PRONOUN_RESULTS['reflexive_total'], "\tPercent correct:", PRONOUN_RESULTS['reflexive_pct']
    # Unlike the guarded prints above, this division is unguarded: it raises
    # ZeroDivisionError when no pronoun was processed at all.
    print "Total correct:", correct, "\tTotal:", total, "\tPercent correct:", correct/float(total)