def generate(grammar: PCFG, fitness_fn):
    """Generate an utterance from the supplied grammar fitted to the fitness function.

    The sentence is expanded one nonterminal at a time.  For each nonterminal we
    either sample a production at random (when the alternatives carry
    probabilities that sum to 1) or greedily pick the production whose expansion
    scores best against ``fitness_fn``.

    Here's an example usage to generate a simple stack of three blocks:

    >>> def fitness(_, prefix):
    ...     return len(prefix) < 3
    >>> grammar = load_grammar("block_top -> 'block' block_top|")
    >>> generate(grammar, fitness)
    ('block', 'block', 'block')
    """
    sentence = [grammar.start()]
    for i in next_nonterm(sentence):
        productions = grammar.productions(lhs=sentence[i])
        try:
            # Attempt random selection if we are dealing with probabilistic rules
            best = np.random.choice(productions, p=[p.prob() for p in productions])
        except ValueError:
            # Probabilities do not sum to 1, so we're checking against a fitness function.
            # The prefix is invariant while we compare candidates, so build it once.
            best = _fittest_production(grammar, fitness_fn, productions,
                                       tuple(sentence[:i]))
        # Splice the chosen expansion in place of the nonterminal at position i.
        sentence[i:i + 1] = best.rhs()
    return tuple(sentence)


def _fittest_production(grammar, fitness_fn, productions, prefix):
    """Return the production whose expansion scores best under ``fitness_fn``.

    Ties are broken in favour of the production encountered last; a score of
    1.0 or higher short-circuits the search.  Raises IndexError if every
    candidate scores below 0.0 (no production ever qualifies).
    """
    best_fitness = 0.0
    best_prods = []
    for prod in productions:
        fitness = fitness_fn(terminated(grammar, prod.rhs()), prefix=prefix)
        if fitness > best_fitness:
            # Strictly better than everything so far: discard older candidates.
            best_prods = []
        if fitness >= best_fitness:
            best_fitness = fitness
            best_prods.append(prod)
        if fitness >= 1.0:
            # A perfect score cannot be beaten; stop scanning.
            break
    return best_prods[-1]
class QuestionEvaluator(object):
    """Score how grammatical a sentence is against a PCFG.

    The grammar is either induced from a file of questions via the Stanford
    parser (``generate_pcfg_productions``) or loaded from a previously saved
    productions file (``read_productions``).  ``evaluate`` then parses a
    sentence and returns its average log-probability under that grammar.
    """

    def __init__(self, productions_filename=None):
        # productions_filename: optional path to a file previously written by
        # write_productions_to_file(); when given, the grammar is loaded now.
        super(QuestionEvaluator, self).__init__()
        self.parser = stanford.StanfordParser(encoding='utf8')
        if (productions_filename != None):
            self.read_productions(productions_filename)
        else:
            # No grammar yet: callers must run read_productions() or
            # generate_pcfg_productions() before traverse()/evaluate().
            self.grammar = None

    def write_productions_to_file(self, productions_filename):
        """Persist the induced grammar, one production per line.

        Line format is ``LHS+RHS+PROB`` with literal ``+`` separators and the
        RHS symbols space-joined.
        NOTE(review): a symbol containing '+' or a space would corrupt this
        format — confirm against the symbol inventory actually produced.
        """
        assert (self.grammar != None)
        productions = self.grammar.productions()
        with io.open(productions_filename, 'w', encoding='utf8') as f:
            for p in productions:
                lhs = "%s" % p.lhs()
                rhs = " ".join(["%s" % s for s in p.rhs()])
                prob = "%f" % p.prob()
                f.write(lhs + u"+" + rhs + u"+" + prob + u"\n")

    def read_productions(self, productions_filename):
        """Rebuild ``self.grammar`` from a file written by write_productions_to_file().

        NOTE(review): every RHS symbol is reconstructed as a Nonterminal, so a
        lexical (terminal) production would be mangled on round-trip.  This is
        presumably fine because generate_pcfg_productions() drops lexical
        productions before induction — verify.
        """
        productions = []
        with io.open(productions_filename, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                # Split back into the LHS / RHS / probability fields.
                components = line.split(u'+')
                lhs = Nonterminal(components[0])
                rhs = tuple([
                    Nonterminal(nt.strip())
                    for nt in components[1].split(u' ')
                ])
                prob = float(components[2])
                pp = ProbabilisticProduction(lhs, rhs, prob=prob)
                productions.append(pp)
        self.grammar = PCFG(Nonterminal('S'), productions)

    def generate_pcfg_productions(self, questionbank):
        """Induce ``self.grammar`` from a text file of questions.

        Each line of *questionbank* is sentence-tokenized, parsed with the
        Stanford parser, stripped of its ROOT node, converted to Chomsky
        normal form, and its non-lexical productions are pooled before
        running induce_pcfg with start symbol S.
        """
        productions = []
        with io.open(questionbank, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                sent_text = nltk.sent_tokenize(line)
                for sentence in sent_text:
                    # raw_parse_sents takes an iterable of sentences; we feed
                    # a one-element tuple and unwrap the nested iterators.
                    ss = self.parser.raw_parse_sents((sentence, ))
                    for k in ss:
                        for s in k:
                            # Round-trip through a string to get a plain
                            # nltk.Tree we are free to mutate.
                            buf = "%s" % s
                            buf = six.text_type(buf)
                            s1 = Tree.fromstring(buf)
                            # get rid of the ROOT: descend to the first
                            # non-ROOT child and treat it as the tree.
                            for node in s1:
                                if node.label() == 'ROOT':
                                    continue
                                else:
                                    s1 = node
                                    break
                            s1.chomsky_normal_form(horzMarkov=2)
                            pdc = []
                            for p in s1.productions():
                                # remove the lexical productions: only
                                # nonterminal rewrites feed the grammar.
                                if not p.is_lexical():
                                    pdc.append(p)
                            productions += pdc
        S = Nonterminal('S')
        self.grammar = induce_pcfg(S, productions)

    def traverse(self, node):
        """Recursively score *node* against the grammar.

        Returns a ``(log_prob_sum, production_count)`` pair covering every
        internal node of the subtree.  Unknown productions are smoothed with
        ``eps`` (a module-level constant defined elsewhere — presumably a
        small floor probability; confirm).
        """
        assert (self.grammar != None)
        prob = 0.0
        length = 0
        # Height 2 means the node directly dominates a leaf (a preterminal);
        # it contributes nothing to the score.
        if node.height() == 2:
            return (prob, length)
        lhs = Nonterminal(node.label())
        # Candidate productions whose left-hand side matches this node.
        productions = self.grammar.productions(lhs)
        flag = False
        rhs_list = []
        for c in node:
            rhs_list.append(Nonterminal(c.label()))
        tuple_rhs = tuple(rhs_list)
        # Look for the production actually used at this node.
        for p in productions:
            if p.lhs() == lhs and p.rhs() == tuple_rhs:
                flag = True
                prob += math.log(p.prob())
                break
        if not flag:
            # Production unseen in training data: apply eps smoothing.
            prob += math.log(eps)
        length += 1
        # Accumulate the scores of all child subtrees.
        for c in node:
            ret = self.traverse(c)
            prob += ret[0]
            length += ret[1]
        return (prob, length)

    '''
    @param: sentence that needs to be evaluate
    @output: a grammartical probability
    '''

    def evaluate(self, sentence):
        """Return the average log-probability per production of *sentence*.

        The sentence is parsed, the ROOT node stripped, the tree converted
        to Chomsky normal form, and the summed log-probability from
        traverse() is normalized by the number of productions scored.
        NOTE(review): a tree of height 2 yields length == 0 and would raise
        ZeroDivisionError — confirm inputs always parse to deeper trees.
        """
        ss = self.parser.raw_parse_sents((sentence, ))
        tree = None
        for k in ss:
            for s in k:
                buf = "%s" % s
                buf = six.text_type(buf)
                tree = Tree.fromstring(buf)
        # Strip the ROOT wrapper, mirroring generate_pcfg_productions().
        for node in tree:
            if node.label() == 'ROOT':
                continue
            else:
                tree = node
                break
        tree.chomsky_normal_form(horzMarkov=2)
        (prob, length) = self.traverse(tree)
        return prob / length