Example #1
import numpy as np
from nltk import PCFG

# next_nonterm, terminated and load_grammar are helpers assumed to be
# defined elsewhere in the original module.


def generate(grammar: PCFG, fitness_fn):
    """ 
  Generate an utterance from the supplied grammar fitted to the fitness function

  Here's an example usage to generate a simple stack of three blocks:
  
  >>> def fitness(_, prefix):
  ...   return len(prefix) < 3
  >>> grammar = load_grammar("block_top -> 'block' block_top|")
  >>> generate(grammar, fitness)
  ('block', 'block', 'block')
  """
    sentence = [grammar.start()]

    for i in next_nonterm(sentence):
        productions = grammar.productions(lhs=sentence[i])

        try:
            # Attempt random selection if we are dealing with probabilistic rules
            best = np.random.choice(productions,
                                    p=[p.prob() for p in productions])
        except ValueError:
            # Probabilities do not sum to 1, so we're checking against a fitness function
            best_fitness = 0.0
            best_prods = []

            for prod in productions:
                fitness = fitness_fn(terminated(grammar, prod.rhs()),
                                     prefix=tuple(sentence[0:i]))

                if fitness > best_fitness:
                    # Strictly better candidate: discard previously tied productions
                    best_prods = []

                if fitness >= best_fitness:
                    best_fitness = fitness
                    best_prods.append(prod)

                    if fitness >= 1.0:
                        # Perfect score: stop scanning the remaining productions
                        break

            # Fall back to the last of the best-scoring productions
            best = best_prods[-1]

        # Replace the nonterminal at position i with the chosen right-hand side
        sentence.pop(i)
        for symbol in reversed(best.rhs()):
            sentence.insert(i, symbol)

    return tuple(sentence)
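
The generate function above depends on two helpers that are not shown here, next_nonterm and terminated (and its doctest uses load_grammar). As a rough guide to how the loop works, below is a minimal sketch of what next_nonterm presumably does, inferred from how it is used above rather than taken from the original module: it yields the index of the left-most remaining Nonterminal, rescanning the sentence after each in-place expansion, and stops once only terminals remain.

from nltk.grammar import Nonterminal


def next_nonterm(sentence):
    """Yield the index of the left-most Nonterminal until none remain.

    The caller rewrites sentence[i] in place between iterations, so the
    list is rescanned from the start after every yield.
    """
    while True:
        for i, symbol in enumerate(sentence):
            if isinstance(symbol, Nonterminal):
                yield i
                break
        else:
            # No nonterminal left: the sentence is fully expanded
            return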
Example #2
import io
import math

import nltk
import six
from nltk import Tree
from nltk.grammar import Nonterminal, PCFG, ProbabilisticProduction, induce_pcfg
from nltk.parse import stanford

# Smoothing constant for productions missing from the induced grammar; the
# original module defines eps elsewhere, so this value is an assumption.
eps = 1e-6


class QuestionEvaluator(object):
    """Scores how grammatical a sentence is under a PCFG induced from a question bank."""
    def __init__(self, productions_filename=None):
        super(QuestionEvaluator, self).__init__()
        self.parser = stanford.StanfordParser(encoding='utf8')
        if productions_filename is not None:
            self.read_productions(productions_filename)
        else:
            self.grammar = None

    def write_productions_to_file(self, productions_filename):
        # Serialise one production per line as "LHS+RHS+PROB",
        # e.g. "S+NP VP+0.500000"; read_productions is the inverse.
        assert self.grammar is not None
        productions = self.grammar.productions()
        with io.open(productions_filename, 'w', encoding='utf8') as f:
            for p in productions:
                lhs = "%s" % p.lhs()
                rhs = " ".join(["%s" % s for s in p.rhs()])
                prob = "%f" % p.prob()
                f.write(lhs + u"+" + rhs + u"+" + prob + u"\n")

    def read_productions(self, productions_filename):
        productions = []
        with io.open(productions_filename, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                components = line.split(u'+')
                lhs = Nonterminal(components[0])
                # Lexical productions are filtered out before the grammar is
                # induced, so every RHS symbol is treated as a nonterminal.
                rhs = tuple(
                    Nonterminal(nt.strip()) for nt in components[1].split(u' ')
                )
                prob = float(components[2])
                pp = ProbabilisticProduction(lhs, rhs, prob=prob)
                productions.append(pp)
        self.grammar = PCFG(Nonterminal('S'), productions)

    def generate_pcfg_productions(self, questionbank):
        productions = []

        with io.open(questionbank, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                sent_text = nltk.sent_tokenize(line)

                for sentence in sent_text:
                    # Parse the sentence with the Stanford parser
                    ss = self.parser.raw_parse_sents((sentence, ))
                    for k in ss:
                        for s in k:
                            buf = "%s" % s
                            buf = six.text_type(buf)
                            s1 = Tree.fromstring(buf)

                            # Drop the ROOT wrapper: keep its first child
                            # (the actual clause) as the working tree.
                            for node in s1:
                                if node.label() == 'ROOT':
                                    continue
                                else:
                                    s1 = node
                                    break
                            s1.chomsky_normal_form(horzMarkov=2)
                            # Keep only the non-lexical (syntactic) productions
                            pdc = [p for p in s1.productions()
                                   if not p.is_lexical()]

                            productions += pdc

        S = Nonterminal('S')
        self.grammar = induce_pcfg(S, productions)

    def traverse(self, node):
        """Return (log-probability, production count) for the subtree rooted at node."""
        assert self.grammar is not None
        prob = 0.0
        length = 0
        # Height 2 means a preterminal (a POS tag over a word), which
        # contributes no syntactic production.
        if node.height() == 2:
            return (prob, length)
        lhs = Nonterminal(node.label())
        productions = self.grammar.productions(lhs)

        # Look up the production lhs -> (labels of the children) in the grammar
        flag = False
        rhs_list = []
        for c in node:
            rhs_list.append(Nonterminal(c.label()))
        tuple_rhs = tuple(rhs_list)
        for p in productions:
            if p.lhs() == lhs and p.rhs() == tuple_rhs:
                flag = True
                prob += math.log(p.prob())
                break
        if not flag:
            # Production not in the induced grammar: back off to a small eps
            prob += math.log(eps)
        length += 1
        for c in node:
            ret = self.traverse(c)
            prob += ret[0]
            length += ret[1]
        return (prob, length)

    def evaluate(self, sentence):
        """
        @param: sentence to be evaluated
        @output: a grammaticality score (average log-probability per production)
        """
        ss = self.parser.raw_parse_sents((sentence, ))
        tree = None
        for k in ss:
            for s in k:
                buf = "%s" % s
                buf = six.text_type(buf)
                tree = Tree.fromstring(buf)
                # Drop the ROOT wrapper, as in generate_pcfg_productions
                for node in tree:
                    if node.label() == 'ROOT':
                        continue
                    else:
                        tree = node
                        break
                tree.chomsky_normal_form(horzMarkov=2)

        (prob, length) = self.traverse(tree)
        # Average log-probability per production, so longer sentences are
        # not penalised simply for using more rules.
        return prob / length
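
A rough usage sketch of the class above, assuming the Stanford parser jars are discoverable in the environment (as nltk's StanfordParser requires) and using hypothetical file names that are not part of the original code:

# "questions.txt" and "productions.txt" are placeholder names.
evaluator = QuestionEvaluator()
evaluator.generate_pcfg_productions('questions.txt')    # induce a PCFG from a question bank
evaluator.write_productions_to_file('productions.txt')  # cache the grammar for later runs

# A later run can reload the cached productions instead of re-parsing everything.
scorer = QuestionEvaluator(productions_filename='productions.txt')
print(scorer.evaluate('What is the capital of France ?'))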