Example #1
from nltk import Tree
from nltk.grammar import Nonterminal, ProbabilisticProduction
from nltk.probability import DictionaryProbDist

productions_corpus = []  # records every production used during generation


def pcfg_generate(grammar):

    def non_terminal_into_terminal(non_terminal):
        # Sample one right-hand side for this nonterminal, weighted by
        # the production probabilities in the grammar.
        nt_productions = grammar.productions(Nonterminal(str(non_terminal)))
        probs = {}
        for pr in nt_productions:
            probs[pr.rhs()] = pr.prob()
        generated = DictionaryProbDist(probs).generate()
        return list(generated)

    def nts_into_ts(generated_nts):
        # Recursively expand every nonterminal until only terminals remain.
        for index in range(len(generated_nts)):
            old_nt = generated_nts[index]
            try:
                t = non_terminal_into_terminal(old_nt)
            except Exception:
                # old_nt is a terminal: it has no productions, so leave it.
                continue
            productions_corpus.append(
                ProbabilisticProduction(Nonterminal(str(old_nt)), tuple(t), prob=0))
            generated_nts[index] = nts_into_ts(Tree(old_nt, t))
        return generated_nts

    # Sample an expansion of the start symbol S, weighted by probability.
    productions = grammar.productions(Nonterminal('S'))
    probs = {}
    for pr in productions:
        probs[pr.rhs()] = pr.prob()
    generated = DictionaryProbDist(probs).generate()
    productions_corpus.append(
        ProbabilisticProduction(Nonterminal('S'), generated, prob=0))
    return nts_into_ts(Tree('S', list(generated)))
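A minimal usage sketch, assuming the imports above; the toy grammar here is illustrative, not from the original project:

from nltk import PCFG

toy_grammar = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [1.0]
    VP -> V NP [0.5] | V [0.5]
    Det -> 'the' [1.0]
    N -> 'dog' [0.5] | 'cat' [0.5]
    V -> 'saw' [0.6] | 'chased' [0.4]
""")

tree = pcfg_generate(toy_grammar)
print(tree)                # one randomly sampled derivation of S
print(productions_corpus)  # the productions used along the way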
Example #2
def non_terminal_into_terminal(non_terminal):
    # `grammar` is a PCFG taken from the enclosing scope.
    # Sample one right-hand side for the given nonterminal, weighted by
    # the production probabilities recorded in the grammar.
    nt_productions = grammar.productions(Nonterminal(str(non_terminal)))
    probs = {}
    for pr in nt_productions:
        probs[pr.rhs()] = pr.prob()
    generated = DictionaryProbDist(probs).generate()
    return list(generated)
Example #3
from nltk.grammar import Nonterminal
from nltk.probability import DictionaryProbDist
from nltk.parse.generate import _generate_all  # NLTK's private helper


def generate_one_sample(grammar, item, depth):
    if depth > 0:
        if isinstance(item, Nonterminal):

            # get all productions whose left-hand side is item
            np_productions = grammar.productions(lhs=item)
            probs = {}
            # record the probability of each right-hand side
            for pr in np_productions:
                probs[pr.rhs()] = pr.prob()
            np_prob_dist = DictionaryProbDist(probs)

            # np_prob_dist.generate() samples one probable expansion,
            # in contrast to the exhaustive iteration over every
            # production that nltk.parse.generate performs.
            for frag in _generate_all(grammar, np_prob_dist.generate(),
                                      depth - 1):
                yield frag
        else:
            yield [item]
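A usage sketch, reusing the illustrative toy_grammar defined under Example #1. Note that _generate_all is a private NLTK helper, so depending on it is an assumption about NLTK internals:

from nltk.grammar import Nonterminal

# The top-level expansion of S is sampled; everything below it is
# enumerated exhaustively by _generate_all.
for sent in generate_one_sample(toy_grammar, Nonterminal('S'), depth=5):
    print(' '.join(sent))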
Example #4
import re

from nltk.probability import DictionaryProbDist

# NGram, InvalidSourceException, reconcile_old_style_source, and the token
# constants (TERMINAL_PUNCTUATION, SYMMETRICAL_TOKENS, NO_TRAILING_SPACE_TOKENS,
# NO_LEADING_SPACE_TOKENS, REGEX_REPLACEMENTS) are defined elsewhere in
# the project.


class NovelParagraph:
    def __init__(self, *args, **kwargs):
        self.strategy = kwargs.get('strategy', 'best')

        self.events = []
        self.sentences = []
        self.source_probability = {}
        self.querysets = {}
        self.sources = []
        self.symmetrical_tokens = []
        for source, probability in args:
            self.source_probability[source] = probability
            self.querysets[source] = NGram.objects.filter(
                **reconcile_old_style_source(source)
            )
            self.sources.append(source)
            if self.querysets[source].count() == 0:
                raise InvalidSourceException("No NGrams with this source")
        self.source_probability = DictionaryProbDist(self.source_probability)

    def pick_queryset(self):
        # Sample a source according to its configured probability.
        return self.querysets[self.source_probability.generate()]
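    # For reference, the weighted choice in isolation (a minimal sketch;
    # the source names and weights below are illustrative):
    #
    #     DictionaryProbDist({'alice@twitter': 0.7,
    #                         'document:moby_dick': 0.3}).generate()
    #     # -> 'alice@twitter' about 70% of the time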

    def append_sentence(self):
        # Seed the sentence with a random sentence-starting trigram, then
        # extend it until a terminal punctuation token ends it.
        self.current_sentence = []
        starter = self.pick_queryset().filter(
            sentence_starter=True
        ).order_by('?').first()
        self.current_sentence.append((starter.token_one, starter.tag_one))
        self.current_sentence.append((starter.token_two, starter.tag_two))
        self.current_sentence.append((starter.token_three, starter.tag_three))
        while self.current_sentence[-1][0] not in TERMINAL_PUNCTUATION:
            new_word = self.new_word()
            self.current_sentence.append(new_word)
        self.sentences.append(self.current_sentence)

    def _get_others(self, original):
        # Querysets for every source except the one just drawn.
        sources = self.sources.copy()
        sources.remove(original)
        return [
            NGram.objects.filter(
                **reconcile_old_style_source(source)
            ) for source in sources
        ]

    def _account_for_symmetrical_tokens(self, token):
        # When an opening token (e.g. a quote or bracket) appears, queue its
        # closing counterpart so the sentence can be balanced later.
        if token in SYMMETRICAL_TOKENS:
            self.symmetrical_tokens.append(
                (SYMMETRICAL_TOKENS[token], SYMMETRICAL_TOKENS[token])
            )

    def new_word(self):
        # Try the sampled source first, then fall back to the other sources.
        queryset = self.pick_queryset()
        ordered_querysets = [queryset]

        if len(self.sources) > 1:
            first = queryset.first()
            if first.twitter_user:
                source = first.twitter_user.twitter_id + '@twitter'
            else:
                source = 'document:' + first.document.name
            ordered_querysets = ordered_querysets + self._get_others(source)

        for qs in ordered_querysets:
            new_word = self.new_word_from_queryset(qs)
            if new_word:
                self._account_for_symmetrical_tokens(new_word[0])
                if new_word[0] in TERMINAL_PUNCTUATION:
                    if len(self.symmetrical_tokens) > 0:
                        # Close any open quote or bracket before ending.
                        return self.symmetrical_tokens.pop()
                return new_word

        if len(self.symmetrical_tokens) > 0:
            return self.symmetrical_tokens.pop()

        # No source can extend the sentence; force an ending.
        return ('.', '.')

    def _best_matching_word(self, queryset):
        # 'grammar_only' matches on part-of-speech tags alone; the default
        # strategy matches tokens and tags, falling back to tokens only.
        if self.strategy == 'grammar_only':
            return queryset.filter(
                tag_one=self.current_sentence[-2][1],
                tag_two=self.current_sentence[-1][1],
            ).order_by('?').first()
        else:
            nxt = queryset.filter(
                token_one__iexact=self.current_sentence[-2][0],
                token_two__iexact=self.current_sentence[-1][0],
                tag_one=self.current_sentence[-2][1],
                tag_two=self.current_sentence[-1][1],
            ).order_by('?').first()
            if not nxt:
                nxt = queryset.filter(
                    token_one__iexact=self.current_sentence[-2][0],
                    token_two__iexact=self.current_sentence[-1][0],
                ).order_by('?').first()
            return nxt

    def new_word_from_queryset(self, queryset):
        nxt = self._best_matching_word(queryset)
        if nxt:
            return (nxt.token_three, nxt.tag_three)
        else:
            return None

    @classmethod
    def _needs_space(cls, token, previous_token, index):
        if index == 0:
            return False
        if previous_token in NO_TRAILING_SPACE_TOKENS:
            return False
        if token in NO_LEADING_SPACE_TOKENS:
            return False
        return True

    @classmethod
    def _join_and_postprocess_sentences(cls, sentences):
        sentences = [''.join(sentence) for sentence in sentences]
        text = ' '.join(sentences)
        for pattern, replacement in REGEX_REPLACEMENTS:
            text = re.sub(pattern, replacement, text)
        return text

    def human_readable_sentences(self):
        final_output = []
        for sent in self.sentences:
            output = []
            for i, token in enumerate(sent):
                if NovelParagraph._needs_space(token[0], sent[i-1][0], i):
                    output.append(' ')
                output.append(token[0])
            final_output.append(output)
        return NovelParagraph._join_and_postprocess_sentences(final_output)
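For reference, the spacing rule that _needs_space implements, in isolation. This is a minimal sketch with assumed, illustrative values for the token constants; the project defines its own:

# Assumed, illustrative values; the project defines its own constants.
NO_TRAILING_SPACE_TOKENS = {'('}
NO_LEADING_SPACE_TOKENS = {'.', ',', '!', '?', ';', ':', ')'}

def needs_space(token, previous_token, index):
    # No space before the first token, after an opener, or before
    # closing punctuation.
    if index == 0:
        return False
    if previous_token in NO_TRAILING_SPACE_TOKENS:
        return False
    if token in NO_LEADING_SPACE_TOKENS:
        return False
    return True

tokens = ['Hello', ',', 'world', '(', 'again', ')', '.']
out = []
for i, tok in enumerate(tokens):
    if needs_space(tok, tokens[i - 1], i):
        out.append(' ')
    out.append(tok)
print(''.join(out))  # Hello, world (again).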