Example #1
    def rawForVector(self, f, min_word=2):
        """ Word level vector """
        tok = tokenize()
        t0 = time()

        # Python 2: empty translation table, used below to delete punctuation
        table = string.maketrans("", "")

        # Split the raw text f into sentences on the regex pattern[0]
        words = re.split(pattern[0], f)
        # Strip punctuation from each sentence
        words = [z.translate(table, string.punctuation) for z in words]
        # Keep only sentences with at least min_word tokens
        words = filter(lambda x: len(tok.WordTokenize(x)) >= min_word, words)
        # Tokenize each remaining sentence into a list of words
        words = [tok.WordTokenize(z) for z in words]

        return words
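The method above builds word-level vectors by splitting raw text into sentences, stripping punctuation, dropping short sentences, and tokenizing the rest. A minimal standalone Python 3 sketch of the same pipeline, with a simple punctuation-based splitter and a whitespace tokenizer standing in for the unknown pattern[0] and tokenize().WordTokenize:

import re
import string

SENT_SPLIT = r'[.?!]'       # stand-in for pattern[0]
word_tokenize = str.split   # stand-in for tokenize().WordTokenize

def raw_for_vector(text, min_word=2):
    sentences = re.split(SENT_SPLIT, text)
    # Python 3 equivalent of the two-argument translate used above
    table = str.maketrans('', '', string.punctuation)
    sentences = [s.translate(table) for s in sentences]
    # keep sentences with at least min_word tokens, then tokenize them
    return [word_tokenize(s) for s in sentences
            if len(word_tokenize(s)) >= min_word]

print(raw_for_vector("saya sedang makan. enak! nasi goreng di warung depan."))
# [['saya', 'sedang', 'makan'], ['nasi', 'goreng', 'di', 'warung', 'depan']]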
Example #2
    def rawForLangmodel(self, f, punct_remove=False, to_token=True, min_word=2):
        tok = tokenize()
        # Python 2: empty translation table, used below to delete punctuation
        table = string.maketrans("", "")

        # Split the text into sentences: first on newlines, then on the regex pattern[0]
        words = [w for line in re.split(r'\n', f) for w in re.split(pattern[0], line)]
        
        if punct_remove:
            words = [z.translate(table, string.punctuation) for z in words]

        if to_token:
            words = [tok.WordTokenize(z) for z in words]
            words = filter(lambda x: len(x) >= min_word, words)
        else:
            words = filter(lambda x: len(tok.WordTokenize(x)) >= min_word, words)
        return words
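For contrast, the flags here control whether the result is a list of token lists (to_token=True, ready for n-gram counting) or a list of cleaned sentence strings (to_token=False). A tiny Python 3 illustration of the two modes under the same stand-in assumptions as the sketch above:

import re
import string

SENT_SPLIT = r'[.?!\n]'      # stand-in for splitting on '\n' and pattern[0]
word_tokenize = str.split    # stand-in for tokenize().WordTokenize

text = "saya makan nasi.\nnasi goreng enak! ya."
sentences = [s for s in re.split(SENT_SPLIT, text) if s.strip()]

# to_token=True: token lists, sentences shorter than min_word dropped
print([word_tokenize(s) for s in sentences if len(word_tokenize(s)) >= 2])
# [['saya', 'makan', 'nasi'], ['nasi', 'goreng', 'enak']]

# to_token=False with punct_remove=True: cleaned sentence strings
table = str.maketrans('', '', string.punctuation)
print([s.translate(table).strip() for s in sentences
       if len(word_tokenize(s)) >= 2])
# ['saya makan nasi', 'nasi goreng enak']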
Example #3
    def train(self,
              vect,
              optimizer=None,
              separate=True,
              njump=0,
              verbose=False):
        """
        In the study of probability, given at least two random variables X, Y, ...,that are defined on a probability space S,       
        the joint probability distribution for X, Y, ... is a probability distribution
            that gives the probability that each of X, Y, ... falls in any particular range
            or discrete set of values specified for that variable

        #####################################################################################################################
            #############################################################################################################
        demikian jika diberikan kalimat S: "saya sedang makan nasi goreng di warung depan"
        berarti p(w1,w2,...,wn) disebut sebagai distribusi probabilitas(dimana w1,w2,...wn sebagai random variables), 
            kata (w1,w2,...,wn), over the kalimat S
        """
        t0 = time()
        print "Begin training language model..."

        if optimizer == 'modkn':
            """NOTE:
                Untuk sementara penerapan njump parameter belum dapat digunakan
                dalam pengimplementasian Modified Kneser-Ney optimizer ini.
            """
            print "Using optimizer: ", 'Modified Kneser-Ney'
            modkn = ModifiedKneserNey(order=self.nforgram,
                                      sb=self.sb,
                                      se=self.se)
            modkn.kneser_ney_discounting(vect)
            modkn.train()

            tmpN = 1
            for k, v in sorted(modkn.mKNeyEstimate.items(),
                               key=lambda x: x[1][3]):
                if len(k.split(' ')) != tmpN:
                    self.finalmodel[tmpN] = self.vocab
                    self.vocab = {}
                    tmpN = len(k.split(' '))

                if self.normalize_logprob:
                    self.vocab[k] = SimpleVocab(count=int(v[2]),
                                                estimator=exp(v[0]),
                                                discount=exp(v[1]))
                else:
                    self.vocab[k] = SimpleVocab(count=int(v[2]),
                                                estimator=v[0],
                                                discount=v[1])

            self.finalmodel[tmpN] = self.vocab
            del self.vocab

        else:
            if optimizer == 'sgt':
                print "Using optimizer: ", 'Simple Good-Turing'
            elif optimizer == 'ls':
                print "Using optimizer: ", 'Laplace'
            else:
                print "Using optimizer: ", 'Maximum Likelihood Estimation'

            tok = tokenize()
            for i in range(1, self.nforgram + 1):
                self.nforgram = i
                self.raw_vocab, self.total_word = constructVocab(vect, self.total_word,
                                                                 nforgram=self.nforgram,
                                                                 separate=separate,
                                                                 njump=njump)

                if optimizer == 'sgt':
                    sgtN = float(
                        functools.reduce(operator.add,
                                         self.raw_vocab.values()))
                    sgt = SimpleGoodTuring(self.raw_vocab, sgtN)
                    sgtSmoothProb, p0 = sgt.train(self.raw_vocab)
                else:
                    mle = MLE()

                for k, v in self.raw_vocab.iteritems():
                    # Compute:
                    # P(Wi) = C(Wi) / N  <= for unigrams, estimated via MLE
                    if i == 1:
                        # Unigrams do not use any history
                        # WARNING: when SGT smoothing is used, MLE is not applied
                        if optimizer == 'ls':
                            V = len(self.raw_vocab)  # vocabulary size, used for Laplace smoothing
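                            # Laplace (add-one) smoothing: P(Wi) = (C(Wi) + 1) / (N + V)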
                            if self.normalize_logprob:
                                # http://stats.stackexchange.com/questions/66616/converting-normalizing-very-small-likelihood-values-to-probability
                                logprob = exp(
                                    mle.train(v, self.total_word, ls=True,
                                              V=V))
                            else:
                                logprob = mle.train(v,
                                                    self.total_word,
                                                    ls=True,
                                                    V=V)
                            self.vocab[k] = SimpleVocab(count=v,
                                                        estimator=logprob,
                                                        discount=1)
                        elif optimizer == 'sgt':
                            if self.normalize_logprob:
                                logprob = exp(sgtSmoothProb[k])
                            else:
                                logprob = sgtSmoothProb[k]
                            self.vocab[k] = SimpleVocab(count=v,
                                                        estimator=logprob,
                                                        discount=p0)
                        else:
                            if self.normalize_logprob:
                                logprob = exp(mle.train(v, self.total_word))
                            else:
                                logprob = mle.train(v, self.total_word)
                            self.vocab[k] = SimpleVocab(count=v,
                                                        estimator=logprob,
                                                        discount=1)
                    elif i == 2:
                        """
                        Perlu diingat motivasi dibalik NGram LM adalah:
                        begin with the task of computing P(w|h), the probability of a word w given some history h
                        """
                        # P(Wi, Wj) = C(Wi, Wj) / N <= untuk BiGram <= dicari melalui MLE
                        #   tetapi yang perlu kita cari adalah P(Wj | Wi) == conditional distribution untuk
                        #       seberapa kemungkinan kata Wj muncul diberikan kata Wi sebelumnya
                        # P(Wj | Wi) = P(Wi, Wj) / P(Wi) = C(Wi, Wj) / C(Wi)
                        #       C(Wi)=> adalah unigram count
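                        # e.g. with counts C("saya") = 3 and C("saya sedang") = 2,
                        #      P("sedang" | "saya") = 2 / 3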
                        CWi = self.finalmodel[i -
                                              1][tok.WordTokenize(k)[0]].count
                        if optimizer == 'ls':
                            V = len(self.raw_vocab)  # vocabulary size, used for Laplace smoothing
                            if self.normalize_logprob:
                                logprob = exp(mle.train(v, CWi, ls=True, V=V))
                            else:
                                logprob = mle.train(v, CWi, ls=True, V=V)
                            self.vocab[k] = SimpleVocab(count=v,
                                                        estimator=logprob,
                                                        discount=1)
                        elif optimizer == 'sgt':
                            if self.normalize_logprob:
                                logprob = exp(sgtSmoothProb[k])
                            else:
                                logprob = sgtSmoothProb[k]
                            self.vocab[k] = SimpleVocab(count=v,
                                                        estimator=logprob,
                                                        discount=p0)
                        else:
                            if self.normalize_logprob:
                                logprob = exp(mle.train(v, CWi))
                            else:
                                logprob = mle.train(v, CWi)
                            self.vocab[k] = SimpleVocab(count=v,
                                                        estimator=logprob,
                                                        discount=1)
                    else:
                        # P(Wi, Wj, Wk) = C(Wi, Wj, Wk) / N  <= for trigrams and higher orders,
                        #   but what we actually need is P(Wk | Wi, Wj), the conditional distribution
                        #       of how likely the word Wk is to appear given the preceding words Wi, Wj
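                        # P(Wk | Wi, Wj) = C(Wi, Wj, Wk) / C(Wi, Wj),
                        #       where C(Wi, Wj) is looked up in the (n-1)-gram model
                        #       built in the previous iteration (self.finalmodel[i - 1])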
                        CWi = self.finalmodel[i - 1][' '.join(
                            tok.WordTokenize(k)[:self.nforgram -
                                                (i + 1)])].count
                        if optimizer == 'ls':
                            V = len(self.raw_vocab)  # vocabulary size, used for Laplace smoothing
                            if self.normalize_logprob:
                                logprob = exp(mle.train(v, CWi, ls=True, V=V))
                            else:
                                logprob = mle.train(v, CWi, ls=True, V=V)
                            self.vocab[k] = SimpleVocab(count=v,
                                                        estimator=logprob,
                                                        discount=1)
                        elif optimizer == 'sgt':
                            if self.normalize_logprob:
                                logprob = exp(sgtSmoothProb[k])
                            else:
                                logprob = sgtSmoothProb[k]
                            self.vocab[k] = SimpleVocab(count=v,
                                                        estimator=logprob,
                                                        discount=p0)
                        else:
                            if self.normalize_logprob:
                                logprob = exp(mle.train(v, CWi))
                            else:
                                logprob = mle.train(v, CWi)
                            self.vocab[k] = SimpleVocab(count=v,
                                                        estimator=logprob,
                                                        discount=1)

                self.finalmodel[i] = self.vocab
                self.vocab = {}
                del self.raw_vocab

        self.perplexity(self.finalmodel, verbose=verbose)
        print("Training language model done in %fs" % (time() - t0))
        if verbose:
            print "token\tcount\tproba\n",
            for k, v in self.finalmodel.iteritems():
                print "######################################################################"
                print k, "-Gram", "\n",
                print "######################################################################"
                for ke, va in v.iteritems():
                    print("%s\t %d\t %0.5f" %
                          (ke, va.count, exp(va.estimator)))

        return self.finalmodel
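The comments in the loop above describe the probabilities being assigned. A small self-contained Python 3 sketch of just that arithmetic (plain MLE and add-one/Laplace on a toy corpus; it does not reproduce the SimpleGoodTuring or Modified Kneser-Ney paths):

from collections import Counter

corpus = [["saya", "sedang", "makan", "nasi"],
          ["saya", "makan", "nasi", "goreng"]]

unigrams = Counter(w for sent in corpus for w in sent)
bigrams = Counter((w1, w2) for sent in corpus for w1, w2 in zip(sent, sent[1:]))
N = sum(unigrams.values())   # total number of tokens
V = len(unigrams)            # vocabulary size

# Unigram MLE:          P(Wi)      = C(Wi) / N
# Unigram Laplace (ls): P(Wi)      = (C(Wi) + 1) / (N + V)
# Bigram MLE:           P(Wj | Wi) = C(Wi, Wj) / C(Wi)
print(unigrams["saya"] / N)                           # 2 / 8  = 0.25
print((unigrams["saya"] + 1) / (N + V))               # 3 / 13 ≈ 0.231
print(bigrams[("saya", "makan")] / unigrams["saya"])  # 1 / 2  = 0.5

The class stores the value returned by mle.train as the estimator field and exponentiates it only when self.normalize_logprob is set.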
Example #4
 def wordTokenizer(self, sent, simple=False):
     if not simple:
         tok = tokenize()
         return tok.WordTokenize(sent)
     else:
         # simple fallback: split on non-word runs, keeping them as tokens
         return [x.strip() for x in re.split(r'(\W+)?', sent) if x.strip()]
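A quick illustration of the simple branch (the default branch delegates to tokenize().WordTokenize). The pattern r'(\W+)?' works under Python 2, where re.split skips empty matches; the Python 3 sketch below drops the redundant ? so the split only matches non-empty runs of non-word characters, which the capturing group keeps as tokens:

import re

def simple_word_tokenizer(sent):
    # split on runs of non-word characters and keep them as tokens
    return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]

print(simple_word_tokenizer("saya makan nasi, lalu pulang."))
# ['saya', 'makan', 'nasi', ',', 'lalu', 'pulang', '.']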