# Example #3
# File: ngram.py  Project: NISH1001/anuwadak
class Ngram(object):
    """N-gram scoring and sentence re-ordering on top of a count backend.

    The class delegates raw n-gram counting to one of two backends chosen at
    construction time and builds smoothed probabilities and greedy word
    re-ordering heuristics on top of those counts.
    """

    def __init__(self, data_path="../data/ngrams/", load_type="memory"):
        """Set up the count backend.

        Args:
            data_path: Directory prefix; ``"ngrams.db"`` is appended to it to
                locate the database file, and it is passed to ``NgramMem``.
            load_type: ``"memory"`` to count through ``NgramMem``; any other
                value counts through the ``NgramDB`` handler.
        """
        self.load_type = load_type
        self.unigrams = OrderedDict()
        self.bigrams = OrderedDict()
        self.trigrams = OrderedDict()
        self.quadgrams = OrderedDict()

        # The database handler is always created, whichever backend counts.
        self.ngramdb = NgramDB(data_path + "ngrams.db")
        if load_type == "memory":
            ngram_mem = NgramMem(data_path=data_path)
            self.__count = ngram_mem.count
            self.__count_vocab = ngram_mem.count_vocab
        else:
            # Without this branch the DB counter unconditionally replaced the
            # in-memory one and no counter existed at all in DB mode.
            self.__count = self.ngramdb.count
            # NOTE(review): assumes NgramDB exposes count_vocab with the same
            # signature as NgramMem.count_vocab -- confirm against NgramDB.
            self.__count_vocab = self.ngramdb.count_vocab

    def close_ngramdb(self):
        """Release the n-gram database handler."""
        # NOTE(review): the body was missing in the reviewed source; this
        # assumes NgramDB provides a close() method -- confirm.
        self.ngramdb.close()

    def count(self, seq, total=False):
        """Return the integer count of a word sequence.

        Args:
            seq: Tuple of words.
            total: When True the backend is asked for the total count and
                ``seq`` is ignored (any placeholder value may be passed).

        Returns:
            Number of occurrences reported by the active backend.
        """
        return self.__count(seq, total)

    def probability(self, seq):
        """Return the smoothed conditional probability of an n-gram.

        The probability of the last word given the preceding ones is
        ``(count(seq) + 1) / (count(prefix) + vocab)``, where ``prefix`` is
        ``seq`` without its last word and ``vocab`` comes from the backend's
        ``count_vocab(n=len(seq) - 1)``.

        Returns:
            ``1e-9`` when either the n-gram or its prefix was never seen,
            otherwise the smoothed ratio described above.
        """
        length = len(seq)
        count = self.count(seq, total=False)

        # The (n-1)-gram prefix is the conditioning context.
        lower = tuple(seq[:-1])
        count_lower = self.count(lower, total=False)

        if not count or not count_lower:
            # Unseen events get a tiny constant instead of zero so products
            # over a sentence do not collapse to exactly 0.
            return 1e-9
        return (count + 1) / (count_lower + self.__count_vocab(n=length - 1))

    def probability_sentence(self, seq, n=3):
        """Return the probability of a sentence as a product of n-grams.

        Slides a window of ``n`` words over ``seq`` and multiplies the
        probability of every window (Markov-chain approximation). When the
        sentence is shorter than ``n`` the window size falls back to 2.
        """
        if len(seq) < n:
            n = 2
        prob = 1
        for i in range(len(seq) - (n - 1)):
            prob *= self.probability(tuple(seq[i:i + n]))
        return prob

    def __generate_probability_table(self, seq):
        """Build the n x n table of bigram probabilities for ``seq``.

        ``table[x][y]`` is the probability of ``seq[y]`` following ``seq[x]``.
        Cells pairing a word with itself (same position or same word) hold
        ``-1`` so they can never win a max() selection.
        """
        # NOTE(review): the sentinel value was lost in the reviewed source;
        # -1 matches the "used column" marker written by generate_sentence.
        n = len(seq)
        return [
            [
                -1 if x == y or seq[x] == seq[y]
                else self.probability((seq[x], seq[y]))
                for y in range(n)
            ]
            for x in range(n)
        ]

    def generate_sentence(self, seq):
        """Greedily order the words of ``seq`` using bigram probabilities.

        Starts from ``seq[0]`` and repeatedly appends the word with the
        highest bigram probability after the current one.

        Returns:
            Tuple of words in the generated order; an empty tuple for empty
            input.
        """
        n = len(seq)
        if n == 0:
            # Nothing to order; previously this raised IndexError on seq[0].
            return tuple()

        table = self.__generate_probability_table(seq)

        index_row = 0       # row of the word we are currently extending from
        res = [seq[0]]

        for _ in range(n - 1):
            # Most likely successor of the current word.
            index_col, _max_prob = max(
                enumerate(table[index_row]), key=lambda cell: cell[1]
            )
            res.append(seq[index_col])

            # Strike out the current word's column so that no later word can
            # select it again.
            for row in table:
                row[index_row] = -1

            index_row = index_col
        return tuple(res)

    def __overlapper(self, seq, start_with, trie):
        """Chain overlapping trigrams, starting from ``start_with``.

        At each step the next trigram must begin with the last two words of
        the previous one and end in a word not used yet; among the
        candidates the one with the highest probability wins.

        Args:
            seq: Original word sequence (only its length is used).
            start_with: First trigram of the chain.
            trie: Dict mapping trigram tuples to probabilities. It is mutated:
                every candidate that became "best so far" is removed.

        Returns:
            List of chained trigrams, beginning with ``start_with``.
        """
        overlap_list = [start_with]
        closed_set = set(start_with)

        for _ in range(1, len(seq) - 2):
            prev_key = overlap_list[-1]
            prefix = (prev_key[1], prev_key[2])
            next_tuple = None
            best_prob = -1

            # Iterate over a snapshot of the keys because entries are deleted
            # from the dict inside the loop.
            for key in list(trie):
                if (key[0], key[1]) == prefix and key[2] not in closed_set:
                    if trie[key] > best_prob:
                        best_prob, next_tuple = trie[key], key
                        del trie[key]

            if next_tuple is None:
                # Dead end: nothing can extend the chain, and nothing will on
                # later iterations either. Indexing overlap_list by the loop
                # counter here used to raise IndexError.
                break

            closed_set.add(next_tuple[2])
            overlap_list.append(next_tuple)

        return overlap_list

    def generate_sentence2(self, seq):
        """Order the words of ``seq`` by chaining overlapping trigrams.

        Scores every trigram of pairwise-distinct words drawn from ``seq``,
        picks the best one starting with ``seq[0]`` and extends it through
        the overlapping chain built by the overlapper.

        Returns:
            Tuple of words in the generated order, or ``seq`` unchanged when
            it has fewer than three words or no starting trigram exists.
        """
        n = len(seq)
        if n < 3:
            # No trigram can be formed; also avoids IndexError on empty input.
            return seq

        # Trigram -> probability. Despite the name this is a plain dict.
        trie = {}
        for i in range(n):
            for j in range(n):
                for k in range(n):
                    key = (seq[i], seq[j], seq[k])
                    if len(key) != len(set(key)):
                        # Skip trigrams that repeat a word (covers i == j == k).
                        continue
                    if key in trie:
                        # Repeated words in seq yield the same key again;
                        # no need to score it twice.
                        continue
                    trie[key] = self.probability(key)

        # Best trigram that starts with the first word of the input.
        start_word = seq[0]
        start_with = None
        prev = -1
        for key in trie:
            if key[0] == start_word and trie[key] > prev:
                prev, start_with = trie[key], key

        if not start_with:
            return seq

        overlap_list = self.__overlapper(seq, start_with, trie)

        # Each later trigram overlaps the previous one by two words, so only
        # its last word is new.
        final = list(start_with)
        final.extend(tup[2] for tup in overlap_list[1:])
        return tuple(final)

    def generate_sentences_from_list(self, seq_list):
        """Apply ``generate_sentence2`` to every sequence in ``seq_list``."""
        return [self.generate_sentence2(seq) for seq in seq_list]

    def generate_sentence_best(self, seq_list):
        """Return the sentence in ``seq_list`` with the highest probability.

        Ties go to the later sentence. Returns ``None`` for an empty list.
        """
        prev = 0
        best = None
        for sentence in seq_list:
            prob = self.probability_sentence(sentence)
            if prob >= prev:
                prev, best = prob, sentence
        return best
