Пример #1
0
def description_length(pcfg, sequences):
    """Return the description length of ``sequences`` under ``pcfg``.

    Every production, whether it belongs to the grammar or to a parse, is
    charged ``(1 + len(rhs)) * log2(N)``, where ``N`` is the number of
    distinct right-hand-side symbols of the grammar (compared by their
    ``str()`` form). The result is the cost of all grammar productions plus
    the cost of the productions used by the most probable parse of each
    sequence. Both partial costs are printed before the total is returned.

    Args:
        pcfg: grammar object exposing ``productions()``; it is also handed
            to ``InsideChartParser``.
        sequences: iterable of token sequences to parse.

    Returns:
        The grammar cost plus the sequence cost.

    Raises:
        IndexError: if a sequence has no parse under ``pcfg``.
    """
    symbols = {str(sym) for prod in pcfg.productions() for sym in prod.rhs()}
    logN = log2(len(symbols))

    pcfg_dl = sum((1 + len(prod.rhs())) * logN for prod in pcfg.productions())
    print(pcfg_dl)

    # The parser only depends on the grammar, so one instance serves every
    # sequence instead of being rebuilt per sequence.
    parser = InsideChartParser(pcfg)
    best_parses = []
    for seq in sequences:
        candidates = parser.parse_all(seq)
        if not candidates:
            # Same exception type as indexing an empty parse list would
            # raise, but with a message that names the offending sequence.
            raise IndexError('no parse found for sequence %r' % (seq,))
        # Most probable parse; on ties the first one returned wins.
        best_parses.append(max(candidates, key=lambda tree: tree.prob()))

    seq_dl = sum((1 + len(prod.rhs())) * logN
                 for tree in best_parses
                 for prod in tree.productions())
    print(seq_dl)
    return pcfg_dl + seq_dl
Пример #2
0
def to_grammar(sequences, sections):
    """Induce a PCFG from integer sequences and print its parse statistics.

    Steps, in order:
      1. Drop negative entries from every sequence and append an end-state
         symbol (one more than the largest value over all sequences).
      2. Turn the sequences into rules with ``to_productions`` and build one
         tree per rule via ``to_tree``; ``rule[0]`` is passed as the third
         argument and the remaining items as the first.
      3. Induce a PCFG rooted at ``S`` from the productions of those trees,
         round-trip it through its string form, and print it.
      4. Parse the leaves of every sequence's tree (end state excluded) and
         print the result of ``mean_probs``.

    ``to_productions``, ``to_tree``, ``multiprocess``, ``flatten`` and
    ``mean_probs`` are defined elsewhere; their exact contracts are not
    visible here.

    Args:
        sequences: iterable of numpy integer arrays.
        sections: forwarded unchanged to ``to_tree``.

    Returns:
        None. The grammar and the probabilities are only printed.
    """
    end_state = np.max(np.hstack(sequences)) + 1
    # Negative entries (-1) are simply dropped for now; they still need
    # proper handling later.
    terminated = [np.append(seq[seq >= 0], end_state) for seq in sequences]
    rules = to_productions(terminated, end_state)

    rule_trees = [
        Tree.fromstring(to_tree(rule[1:], sections, rule[0]))
        for rule in rules
    ]
    tree_productions = []
    for rule_tree in rule_trees:
        tree_productions.extend(rule_tree.productions())
    induced = induce_pcfg(Nonterminal('S'), tree_productions).productions()

    grammar_text = '\n'.join(str(prod) for prod in induced)
    # Rule heads other than 'S' appear quoted in the rendered productions;
    # strip the quotes before re-reading the grammar (presumably so that
    # PCFG.fromstring treats them as nonterminals -- confirm).
    heads = {rule[0] for rule in rules if rule[0] != 'S'}
    for head in heads:
        bare = str(head)
        grammar_text = grammar_text.replace("'" + bare + "'", bare)

    grammar = PCFG.fromstring(grammar_text)
    print(grammar)

    parser = InsideChartParser(grammar)
    # Sentences are the leaves of each sequence's tree, end state excluded.
    sentences = []
    for seq in terminated:
        sentence_tree = Tree.fromstring(to_tree(seq[:-1], sections))
        sentences.append(sentence_tree.leaves())

    parsed = multiprocess('parsing', parser.parse_all, sentences)
    parses = flatten(parsed, 1)
    probs = mean_probs(parses, grammar)
    print(probs)
Пример #3
0
# Keep only the transitions observed at least 5 times, then express each
# surviving count as a share of the surviving total. The accumulator must be
# filled before normalising: dividing by a never-updated zero (or dividing an
# already-built tuple) would fail as soon as one transition passes the filter.
filt_trans = {k: v for k, v in transitions.items() if v >= 5}
over_5 = sum(filt_trans.values())

# Maps transition -> (count, count / total count of the kept transitions).
filt_trans = {k: (v, v / over_5) for k, v in filt_trans.items()}

filt_trans

from nltk import induce_pcfg
from nltk import InsideChartParser

# Collect every production occurrence from the treebank. Duplicates are kept
# on purpose: induce_pcfg estimates each rule's probability from how often the
# production occurs, so de-duplicating first would make every distribution
# uniform (and a set would also make the order non-deterministic).
prods = [
    production
    for sent in treebank.parsed_sents()
    for production in sent.productions()
]
g_pfcg = induce_pcfg(Nonterminal('S'), prods)

# beam_size bounds the parser's edge queue.
p_parser = InsideChartParser(g_pfcg, beam_size=400)

sents = [
    'Mr. Vinken is chairman .'.split(),
    'Stocks rose .'.split(),
    'Alan introduced a plan .'.split(),
]

# Print each sentence followed by every parse the beam search yields for it.
for sent in sents:
    print(sent)
    for p in p_parser.parse(sent):
        print(p)
# NOTE(review): `parse` is not bound anywhere in this section -- confirm it is
# defined earlier, otherwise this line raises NameError.
list(parse)
list(p_parser.parse(['you', 'are', 'sleeping']))