Пример #1
0
def description_length(pcfg, sequences):
    """Return the description length of a grammar plus the data it encodes.

    Each production is charged ``(1 + len(rhs)) * log2(N)``, where ``N`` is
    the number of distinct right-hand-side symbols of ``pcfg`` (compared by
    their ``str`` form).  The result is the sum of that cost over all grammar
    productions, plus the same cost over the productions of the most
    probable parse of every sequence.  Both partial sums are printed.

    A sequence without any parse makes the ``[0]`` lookup raise IndexError.
    """
    rhs_symbols = [
        str(symbol)
        for production in pcfg.productions()
        for symbol in production.rhs()
    ]
    bits_per_symbol = log2(len(np.unique(rhs_symbols)))

    def cost(production):
        # one slot for the left-hand side plus one per right-hand-side symbol
        return (1 + len(production.rhs())) * bits_per_symbol

    grammar_bits = sum([cost(production) for production in pcfg.productions()])
    print(grammar_bits)

    # Parse every sequence first, then keep the most probable tree of each.
    candidates = []
    for sequence in sequences:
        candidates.append(InsideChartParser(pcfg).parse_all(sequence))
    best_trees = []
    for trees in candidates:
        ranked = sorted(trees, key=lambda tree: tree.prob(), reverse=True)
        best_trees.append(ranked[0])

    data_bits = sum([
        cost(production)
        for tree in best_trees
        for production in tree.productions()
    ])
    print(data_bits)
    return grammar_bits + data_bits
Пример #2
0
def to_grammar(sequences, sections):
    """Induce a PCFG from integer sequences, parse them and print the results.

    Steps, in order:
      1. negative entries are dropped from every sequence and an end state
         (one more than the largest value found) is appended;
      2. ``to_productions`` / ``to_tree`` turn the sequences into bracketed
         trees, whose productions are fed to ``induce_pcfg`` with start
         symbol ``S``;
      3. in the textual grammar, the first element of every derived
         sequence (other than ``'S'``) is un-quoted, and the text is read
         back with ``PCFG.fromstring``;
      4. the original sequences (without the end state) are parsed with an
         ``InsideChartParser`` through the ``multiprocess`` helper, and the
         value of ``mean_probs`` is printed.

    Nothing is returned; the grammar and the probabilities are only printed.
    """
    end_state = np.max(np.hstack(sequences)) + 1
    # For now the -1 entries are simply removed (deal with them later!).
    sequences = [np.append(seq[seq >= 0], end_state) for seq in sequences]

    new_seqs = to_productions(sequences, end_state)
    trees = [
        Tree.fromstring(to_tree(seq[1:], sections, seq[0])) for seq in new_seqs
    ]
    productions = []
    for tree in trees:
        productions.extend(tree.productions())
    weighted = induce_pcfg(Nonterminal('S'), productions).productions()

    grammar_string = '\n'.join(map(str, weighted))
    # Strip the quotes around these symbols in the grammar text before it is
    # read back.
    heads = {seq[0] for seq in new_seqs if seq[0] != 'S'}
    for head in heads:
        quoted = "'" + str(head) + "'"
        grammar_string = grammar_string.replace(quoted, str(head))
    grammar = PCFG.fromstring(grammar_string)
    print(grammar)

    parser = InsideChartParser(grammar)
    sentences = []
    for seq in sequences:
        # seq[:-1] leaves out the end state appended above
        sentences.append(Tree.fromstring(to_tree(seq[:-1], sections)).leaves())
    parses = flatten(multiprocess('parsing', parser.parse_all, sentences), 1)
    probs = mean_probs(parses, grammar)
    print(probs)
Пример #3
0
# Keep only the transitions observed at least 5 times.
filt_trans = {k: v for k, v in transitions.items() if v >= 5}
# over_5 is the summed count of the retained transitions, so the fractions
# stored below add up to 1 over filt_trans.
# NOTE(review): assumes over_5 was meant to be this sum (not the number of
# retained keys) -- confirm.
over_5 = sum(filt_trans.values())
# Each entry becomes (count, share of the retained count mass).  When nothing
# passes the threshold the dict is empty and no division takes place.
filt_trans = {k: (v, v / over_5) for k, v in filt_trans.items()}

filt_trans

from nltk import induce_pcfg
from nltk import InsideChartParser

# Collect every production occurrence of the treebank: induce_pcfg estimates
# rule probabilities from how often each production occurs, so feeding it a
# de-duplicated list would make all rules sharing a left-hand side equally
# likely.
all_prods = [
    production
    for sent in treebank.parsed_sents() for production in sent.productions()
]
# The distinct productions remain available under the original name.
prods = list(set(all_prods))
g_pfcg = induce_pcfg(Nonterminal('S'), all_prods)

# beam_size caps the length of the parser's edge queue.
p_parser = InsideChartParser(g_pfcg, beam_size=400)

sents = [
    'Mr. Vinken is chairman .'.split(), 'Stocks rose .'.split(),
    'Alan introduced a plan .'.split()
]

# Print each test sentence followed by the parses found for it.
for sent in sents:
    print(sent)
    for p in p_parser.parse(sent):
        print(p)
# NOTE(review): 'parse' is not bound anywhere in the visible code; presumably
# it comes from an earlier cell -- confirm, otherwise this raises NameError.
list(parse)
list(p_parser.parse(['you', 'are', 'sleeping']))
Пример #4
0
	def __init__(self, treebank, rootsymbol='S', wrap=False, cnf=True,
				cleanup=True, normalize=False, extratags=(),
				parser=InsideChartParser, **parseroptions):
		""" initialize a DOP model given a treebank. uses the Goodman
		reduction of a STSG to a PCFG.  after initialization,
		self.parser will contain a parser for that reduction.

		>>> tree = Tree("(S (NP mary) (VP walks))")
		>>> d = GoodmanDOP([tree])
		>>> print d.grammar
		    Grammar with 8 productions (start state = S)
			NP -> 'mary' [1.0]
			NP@1 -> 'mary' [1.0]
			S -> NP VP [0.25]
			S -> NP VP@2 [0.25]
			S -> NP@1 VP [0.25]
			S -> NP@1 VP@2 [0.25]
			VP -> 'walks' [1.0]
			VP@2 -> 'walks' [1.0]
		>>> print d.parser.parse("mary walks".split())
		(S (NP mary) (VP@2 walks)) (p=0.25)

		@param treebank: a list of Tree objects. Caveat lector:
			terminals may not have (non-terminals as) siblings.
		@param rootsymbol: start symbol of the grammar; also the label of
			the extra root node added when wrap is true
		@param wrap: boolean specifying whether to add the start symbol
			to each tree
		@param cnf: whether to call chomsky_normal_form() on every tree;
			the conversion is applied to the given Tree objects themselves
		@param cleanup: passed on to the BitPar parser
		@param normalize: whether to normalize frequencies (BitPar only)
		@param extratags: pairs that are added as extra lexical rules for
			BitPar; presumably (word, tag) -- TODO confirm
		@param parser: a class which will be instantiated with the DOP
			model as its grammar. Supports BitParChartParser. An instance
			is tolerated as well: a BitParChartParser instance selects
			BitPar, any other instance selects InsideChartParser.
		@param parseroptions: keyword arguments passed on to the parser
			class when it is instantiated

		instance variables:
		- self.grammar a WeightedGrammar containing the PCFG reduction
		  (not set when BitPar is used)
		- self.fcfg a list of strings containing the PCFG reduction
		  with frequencies instead of probabilities (BitPar only)
		- self.parser the parser object
		- self.exemplars dictionary of known parse trees (memoization)"""
		from bitpar import BitParChartParser
		nonterminalfd, subtreefd = FreqDist(), FreqDist()
		ids = count(1)
		self.exemplars = {}
		if wrap:
			# wrap trees in a common root symbol (eg. for morphology)
			treebank = [Tree(rootsymbol, [a]) for a in treebank]
		if cnf:
			# CNF conversion is destructive: list() only copies the
			# container, the trees themselves are converted in place
			treebank = list(treebank)
			for a in treebank:
				a.chomsky_normal_form() #todo: sibling annotation necessary?

		# add unique IDs to nodes
		utreebank = [(tree, decorate_with_ids(tree, ids)) for tree in treebank]

		# count node frequencies
		for tree, utree in utreebank:
			nodefreq(tree, utree, subtreefd, nonterminalfd)

		# parser is documented to be a class, for which isinstance() is
		# always false; test with issubclass() and fall back to isinstance()
		# so that callers passing an instance keep working.
		try:
			usebitpar = issubclass(parser, BitParChartParser)
			parserisclass = True
		except TypeError:
			usebitpar = isinstance(parser, BitParChartParser)
			parserisclass = False

		if usebitpar:
			lexicon = set(x for a, b in utreebank for x in a.pos() + b.pos())
			# this takes the most time, produce CFG rules:
			cfg = FreqDist(chain(*(self.goodman(tree, utree)
								for tree, utree in utreebank)))
			# NOTE(review): lexicon presumably holds (word, tag) pairs, in
			# which case "w not in lexicon" compares a bare word against
			# pairs and never filters anything -- confirm intent.
			cfg.update("%s\t%s" % (t, w) for w, t in extratags
								if w not in lexicon)
			lexicon.update(a for a in extratags if a not in lexicon)
			# annotate rules with frequencies
			self.fcfg = frequencies(cfg, subtreefd, nonterminalfd, normalize)
			parsercls = parser if parserisclass else BitParChartParser
			self.parser = parsercls(self.fcfg, lexicon, rootsymbol,
									cleanup=cleanup, **parseroptions)
		else:
			cfg = FreqDist(chain(*(self.goodman(tree, utree, False)
							for tree, utree in utreebank)))
			probs = probabilities(cfg, subtreefd, nonterminalfd)
			#for a in probs: print a
			self.grammar = WeightedGrammar(Nonterminal(rootsymbol), probs)
			if parserisclass:
				# honour the requested parser class and its options
				self.parser = parser(self.grammar, **parseroptions)
			else:
				self.parser = InsideChartParser(self.grammar)

		#stuff for self.mccparse
		#the highest id
		#self.addresses = ids.next()
		#a list of interior + exterior nodes,
		#ie., non-terminals with and without ids
		#self.nonterminals = nonterminalfd.keys()
		#a mapping of ids to nonterminals without their IDs
		#self.nonterminal = dict(a.split("@")[::-1] for a in
		#	nonterminalfd.keys() if "@" in a)
Пример #5
0
class GoodmanDOP:
	def __init__(self, treebank, rootsymbol='S', wrap=False, cnf=True,
				cleanup=True, normalize=False, extratags=(),
				parser=InsideChartParser, **parseroptions):
		""" initialize a DOP model given a treebank. uses the Goodman
		reduction of a STSG to a PCFG.  after initialization,
		self.parser will contain a parser for that reduction.

		>>> tree = Tree("(S (NP mary) (VP walks))")
		>>> d = GoodmanDOP([tree])
		>>> print d.grammar
		    Grammar with 8 productions (start state = S)
			NP -> 'mary' [1.0]
			NP@1 -> 'mary' [1.0]
			S -> NP VP [0.25]
			S -> NP VP@2 [0.25]
			S -> NP@1 VP [0.25]
			S -> NP@1 VP@2 [0.25]
			VP -> 'walks' [1.0]
			VP@2 -> 'walks' [1.0]
		>>> print d.parser.parse("mary walks".split())
		(S (NP mary) (VP@2 walks)) (p=0.25)

		@param treebank: a list of Tree objects. Caveat lector:
			terminals may not have (non-terminals as) siblings.
		@param rootsymbol: start symbol of the grammar; also the label of
			the extra root node added when wrap is true
		@param wrap: boolean specifying whether to add the start symbol
			to each tree
		@param cnf: whether to call chomsky_normal_form() on every tree;
			the conversion is applied to the given Tree objects themselves
		@param cleanup: passed on to the BitPar parser
		@param normalize: whether to normalize frequencies (BitPar only)
		@param extratags: pairs that are added as extra lexical rules for
			BitPar; presumably (word, tag) -- TODO confirm
		@param parser: a class which will be instantiated with the DOP
			model as its grammar. Supports BitParChartParser. An instance
			is tolerated as well: a BitParChartParser instance selects
			BitPar, any other instance selects InsideChartParser.
		@param parseroptions: keyword arguments passed on to the parser
			class when it is instantiated

		instance variables:
		- self.grammar a WeightedGrammar containing the PCFG reduction
		  (not set when BitPar is used)
		- self.fcfg a list of strings containing the PCFG reduction
		  with frequencies instead of probabilities (BitPar only)
		- self.parser the parser object
		- self.exemplars dictionary of known parse trees (memoization)"""
		from bitpar import BitParChartParser
		nonterminalfd, subtreefd = FreqDist(), FreqDist()
		ids = count(1)
		self.exemplars = {}
		if wrap:
			# wrap trees in a common root symbol (eg. for morphology)
			treebank = [Tree(rootsymbol, [a]) for a in treebank]
		if cnf:
			# CNF conversion is destructive: list() only copies the
			# container, the trees themselves are converted in place
			treebank = list(treebank)
			for a in treebank:
				a.chomsky_normal_form() #todo: sibling annotation necessary?

		# add unique IDs to nodes
		utreebank = [(tree, decorate_with_ids(tree, ids)) for tree in treebank]

		# count node frequencies
		for tree, utree in utreebank:
			nodefreq(tree, utree, subtreefd, nonterminalfd)

		# parser is documented to be a class, for which isinstance() is
		# always false; test with issubclass() and fall back to isinstance()
		# so that callers passing an instance keep working.
		try:
			usebitpar = issubclass(parser, BitParChartParser)
			parserisclass = True
		except TypeError:
			usebitpar = isinstance(parser, BitParChartParser)
			parserisclass = False

		if usebitpar:
			lexicon = set(x for a, b in utreebank for x in a.pos() + b.pos())
			# this takes the most time, produce CFG rules:
			cfg = FreqDist(chain(*(self.goodman(tree, utree)
								for tree, utree in utreebank)))
			# NOTE(review): lexicon presumably holds (word, tag) pairs, in
			# which case "w not in lexicon" compares a bare word against
			# pairs and never filters anything -- confirm intent.
			cfg.update("%s\t%s" % (t, w) for w, t in extratags
								if w not in lexicon)
			lexicon.update(a for a in extratags if a not in lexicon)
			# annotate rules with frequencies
			self.fcfg = frequencies(cfg, subtreefd, nonterminalfd, normalize)
			parsercls = parser if parserisclass else BitParChartParser
			self.parser = parsercls(self.fcfg, lexicon, rootsymbol,
									cleanup=cleanup, **parseroptions)
		else:
			cfg = FreqDist(chain(*(self.goodman(tree, utree, False)
							for tree, utree in utreebank)))
			probs = probabilities(cfg, subtreefd, nonterminalfd)
			#for a in probs: print a
			self.grammar = WeightedGrammar(Nonterminal(rootsymbol), probs)
			if parserisclass:
				# honour the requested parser class and its options
				self.parser = parser(self.grammar, **parseroptions)
			else:
				self.parser = InsideChartParser(self.grammar)

		#stuff for self.mccparse
		#the highest id
		#self.addresses = ids.next()
		#a list of interior + exterior nodes,
		#ie., non-terminals with and without ids
		#self.nonterminals = nonterminalfd.keys()
		#a mapping of ids to nonterminals without their IDs
		#self.nonterminal = dict(a.split("@")[::-1] for a in
		#	nonterminalfd.keys() if "@" in a)

	def goodman(self, tree, utree, bitparfmt=True):
		""" given a parsetree from a treebank, yield a goodman
		reduction of eight rules per node (in the case of a binary tree).

		>>> tree = Tree("(S (NP mary) (VP walks))")
		>>> d = GoodmanDOP([tree])
		>>> utree = decorate_with_ids(tree, count(1))
		>>> sorted(d.goodman(tree, utree, False))
		[(NP, ('mary',)), (NP@1, ('mary',)), (S, (NP, VP)), (S, (NP, VP@2)),
		(S, (NP@1, VP)), (S, (NP@1, VP@2)), (VP, ('walks',)),
		(VP@2, ('walks',))]
		"""
		# linear: nr of nodes
		sep = "\t"
		for p, up in zip(tree.productions(), utree.productions()):
			if len(p.rhs()) == 0: raise ValueError
			if len(p.rhs()) == 1:
				if not isinstance(p.rhs()[0], Nonterminal): rhs = (p.rhs(), )
				else: rhs = (p.rhs(), up.rhs())
			#else: rhs = product(*zip(p.rhs(), up.rhs()))
			else:
				if all(isinstance(a, Nonterminal) for a in up.rhs()):
					rhs = set(product(*zip(p.rhs(), up.rhs())))
				else: rhs = product(*zip(p.rhs(), up.rhs()))

			# constant factor: 8
			#for l, r in product(*((p.lhs(), up.lhs()), rhs)):
			for l, r in product(set((p.lhs(), up.lhs())), rhs):
				#yield Production(l, r)
				if bitparfmt:
					yield "%s%s%s" % (l, sep, sep.join(map(unicode, r)))
				else:
					yield l, r
				# yield a delayed computation that also gives the frequencies
				# given a distribution of nonterminals
				#yield (lambda fd: WeightedProduction(l, r, prob= 
				#	reduce(mul, map(lambda z: '@' in z and
				#	fd[z] or 1, r)) / float(fd[l])))
	
	def parse(self, sent):
		"""most probable derivation (not very good)."""
		return self.parser.parse(sent)

	def mostprobableparse(self, sent, sample=None):
		"""add up the probabilities of all derivations that yield the same
		tree once their IDs are removed, and return the tree with the
		largest total as a ProbabilisticTree.

		warning: this problem is NP-complete. using an unsorted
		chart parser avoids unnecessary sorting (since we need all
		derivations anyway).

		@param sent: a sequence of terminals
		@param sample: None or int; if int then sample that many parses
		@raise ValueError: if no parse is found"""
		mass = FreqDist()
		for derivation in self.parser.nbest_parse(sent, sample):
			# frozen trees are hashable and can serve as FreqDist samples
			mass.inc(removeids(derivation).freeze(), derivation.prob())
		if not mass.max():
			raise ValueError("no parse")
		best = mass.max()
		return ProbabilisticTree(best.node, best, prob=mass[best])

	def mostconstituentscorrect(self, sent):
		""" not working yet. almost verbatim translation of Goodman's (1996)
		most constituents correct parsing algorithm, except for python's
		zero-based indexing. needs to be modified to return the actual parse
		tree. expects a pcfg in the form of a dictionary from productions to
		probabilities

		@param sent: a sequence of terminals; it is sliced and concatenated
			with lists below, so presumably a list -- TODO confirm
		@return: the table entry maxc[(1, len(sent) + 1)], i.e. a score
			and not a parse tree """
		# NOTE(review): this method reads self.pcfg, self.nonterminals,
		# self.addresses and self.nonterminal. The visible __init__ only has
		# commented-out assignments for the last three and none for
		# self.pcfg -- confirm where they are supposed to be set.
		# NOTE(review): rootsymbol and n are neither parameters nor locals
		# of this method; unless they exist at module level, g() raises
		# NameError -- confirm.
		def g(s, t, x):
			# product of the two table lookups f and e for label x over
			# (s, t), divided by e for the root symbol
			def f(s, t, x):
				# lookup for the sentence with x spliced in at position s
				return self.pcfg[Production(rootsymbol,
					sent[1:s] + [x] + sent[s+1:])]
			def e(s, t, x):
				# lookup for x rewriting to the words sent[s:t+1]
				return self.pcfg[Production(x, sent[s:t+1])]
			return f(s, t, x) * e(s, t, x ) / e(1, n, rootsymbol)

		# both tables default to 0 for keys that were never stored
		sumx = defaultdict(int) #zero
		maxc = defaultdict(int) #zero
		for length in range(2, len(sent)+1):
			# NOTE(review): s runs up to len(sent) + length - 1, so t can
			# exceed len(sent) -- confirm the intended bound.
			for s in range(1, len(sent) + length):
				t = s + length - 1
				for x in self.nonterminals:
					sumx[x] = g(s, t, x)
				for k in range(self.addresses):
					#ordered dictionary here
					x = self.nonterminal[k]
					sumx[x] += g(s, t, "%s@%d" % (x, k))
				# NOTE(review): max_x is the largest *value* in sumx, yet it
				# is used as a *key* in sumx[max_x] below; a missing key
				# silently yields 0 because sumx is a defaultdict.
				max_x = max(sumx[x] for x in self.nonterminals)
				#for x in self.nonterminals:
				#	max_x = argmax(sumx, x) #???
				# best way to split (s, t) into two adjacent spans
				best_split = max(maxc[(s,r)] + maxc[(r+1,t)]
									for r in range(s, t))
				#for r in range(s, t):
				#	best_split = max(maxc[(s,r)] + maxc[(r+1,t)])
				maxc[(s,t)] = sumx[max_x] + best_split
		
		return maxc[(1, len(sent) + 1)]