Example #1
def TextTokenizer(sen):
    # Topic: syntactic process - text tokenizing
    # http://blog.pantaw.com/syntatic-proses-text-tokenizing/
    # Note: this stopword list is defined here but not actually applied in this function
    stopwords = ['kah','lah','pun','jah','jeh','mu','ku','ke','di','tapi','saya','kamu','mereka','dia', \
          'kita','adalah','dan','jika','kalau','sama','yang', \
          'sekarang','nanti','besok','kemarin','kemaren','nya','na',\
          'at','apa','ini','itu','juga','ketika','namun',\
          'sebab','oleh','malah','memang']

    tok = tokenize()
    kata = tok.WordTokenize(sen, removepunct=False)
    if kata:
        print "sentence after tokenizing: ", kata, "\n"
    return kata
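The tokenize class and its WordTokenize method come from the surrounding project and are not shown in these examples. As a rough, hypothetical stand-in for what such a word-level tokenizer returns, here is a minimal self-contained sketch; the regexes and the remove_punct flag are assumptions, not the project's actual implementation:

import re

def simple_word_tokenize(sentence, remove_punct=True):
    # Hypothetical stand-in for tok.WordTokenize(): split a sentence into
    # word tokens, optionally dropping punctuation.
    if remove_punct:
        return re.findall(r"\w+", sentence)
    # keep punctuation marks as separate tokens
    return re.findall(r"\w+|[^\w\s]", sentence)

print(simple_word_tokenize("saya sedang makan nasi goreng, di warung depan!"))
# ['saya', 'sedang', 'makan', 'nasi', 'goreng', 'di', 'warung', 'depan']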
Example #2
    def rawForLangmodel(self, f, punct_remove=False, to_token=True, min_word=2):
        tok = tokenize()
        table = string.maketrans("","")

        # Split into sentences: first on new lines, then on the regex pattern[0]
        words = re.split(r'\n', f)
        words = [w for line in words for w in re.split(pattern[0], line)]
        
        # Optionally strip punctuation (Python 2 str.translate with deletechars)
        if punct_remove: words = [z.translate(table, string.punctuation) for z in words]

        if to_token:
            words = [tok.WordTokenize(z) for z in words]
            # keep only sentences with at least min_word tokens
            words = filter(lambda x: len(x) >= min_word, words)
        else:
            words = filter(lambda x: len(tok.WordTokenize(x)) >= min_word, words)
        return words
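pattern[0] above is a sentence-boundary regex defined elsewhere in the project and not shown here. A minimal self-contained sketch of the same preprocessing idea, with an assumed boundary pattern standing in for pattern[0] and plain whitespace splitting standing in for WordTokenize:

import re

SENT_BOUNDARY = r"[.!?]+"   # assumed stand-in for the project's pattern[0]

def sentences_for_langmodel(text, punct_remove=False, min_word=2):
    # Split on newlines first, then on the sentence-boundary pattern
    pieces = [p for line in text.split("\n") for p in re.split(SENT_BOUNDARY, line)]
    if punct_remove:
        pieces = [re.sub(r"[^\w\s]", "", p) for p in pieces]
    # Tokenize naively and keep only sentences with at least min_word tokens
    tokenized = [p.split() for p in pieces]
    return [t for t in tokenized if len(t) >= min_word]

print(sentences_for_langmodel("saya makan nasi. enak! ya."))
# [['saya', 'makan', 'nasi']]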
Example #3
    def rawForVector(self, f, min_word=2):
        """ Word level vector """
        tok = tokenize()
        t0 = time()
        #print "Splitting sentence for vector processing..."
        table = string.maketrans("", "")

        # Split into sentences on the regex pattern[0], strip punctuation,
        # drop sentences shorter than min_word tokens, then tokenize each one
        words = re.split(pattern[0], f)
        words = [z.translate(table, string.punctuation) for z in words]
        words = filter(lambda x: len(tok.WordTokenize(x)) >= min_word, words)
        words = [tok.WordTokenize(z) for z in words]

        #print "total sentence for process: ", len(words)
        #print "total unique words(vocabulary): ", len(self.word_constructor(words))
        #print("Splitting sentence for vector done in %fs" % (time() - t0))
        #print "\n"

        return words
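The commented-out diagnostics refer to self.word_constructor, which is not shown; it presumably collects the unique vocabulary from the tokenized sentences that rawForVector returns. A minimal sketch of that idea (the function name here is hypothetical):

def build_vocabulary(token_lists):
    # Collect the set of unique words across all tokenized sentences
    vocab = set()
    for tokens in token_lists:
        vocab.update(tokens)
    return vocab

sents = [['saya', 'makan', 'nasi'], ['saya', 'makan', 'ayam']]
print(sorted(build_vocabulary(sents)))
# ['ayam', 'makan', 'nasi', 'saya']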
Example #4
    def train(self, vect, optimizer=None, separate=True, njump=0, verbose=False):
        """
        In the study of probability, given at least two random variables X, Y, ...,that are defined on a probability space S,       
        the joint probability distribution for X, Y, ... is a probability distribution
            that gives the probability that each of X, Y, ... falls in any particular range
            or discrete set of values specified for that variable

        #####################################################################################################################
            #############################################################################################################
        demikian jika diberikan kalimat S: "saya sedang makan nasi goreng di warung depan"
        berarti p(w1,w2,...,wn) disebut sebagai distribusi probabilitas(dimana w1,w2,...wn sebagai random variables), 
            kata (w1,w2,...,wn), over the kalimat S
        """
        t0 = time()
        print "Begin training language model..."

        if optimizer == 'modkn':
            """NOTE:
                For now the njump parameter cannot be used with this
                Modified Kneser-Ney optimizer implementation.
            """
            print "Using optimizer: ", 'Modified Kneser-Ney'
            modkn = ModifiedKneserNey()
            modkn.kneser_ney_discounting(vect)
            modkn.train()
            #print "proba\t\ttoken\t\tbow\t\tcount"
            tmpN=1
            for k,v in sorted(modkn.mKNeyEstimate.items(),key=lambda x: x[1][3]):
                #print ("%0.7f\t%s\t\t%0.7f\t\t%d"%(exp(v[0]),k,exp(v[1]),v[2]))
                if len(k.split(' ')) != tmpN:
                    self.finalmodel[tmpN]=self.vocab
                    self.vocab={}
                    tmpN = len(k.split(' '))

                self.vocab[k] = SimpleVocab(count=int(v[2]), estimator=v[0])

            self.finalmodel[tmpN]=self.vocab
            del self.vocab

        else:
            if optimizer =='sgt':
                print "Using optimizer: ", 'Simple Good-Turing'
            elif optimizer == 'ls':
                print "Using optimizer: ", 'Laplace'
            else:
                print "Using optimizer: ",'Maximum Likelihood Estimation'
                
            tok = tokenize()
            for i in range(1,self.nforgram+1):            
                self.nforgram=i
                self.raw_vocab,self.total_word = constructVocab(vect, self.total_word, \
                                                                nforgram=self.nforgram, separate=separate, \
                                                                njump=njump)
                if optimizer=='sgt':
                    sgtN= float(functools.reduce(operator.add,self.raw_vocab.values()))
                    sgt = SimpleGoodTuring(self.raw_vocab, sgtN)
                    sgtSmoothProb,p0 = sgt.train(self.raw_vocab)
                    
                for k, v in self.raw_vocab.iteritems():
                    # Compute:
                    # P(Wi) = C(Wi) / N  <= for the unigram, estimated via MLE
                    if i==1:
                        # Unigram do not use history
                        """ WARNING!!! Kalau menggunakan SGT smoothing, MLE tidak digunakan """
                        if optimizer=='ls':
                            V=len(self.raw_vocab) #<= only needed for Laplace smoothing
                            self.vocab[k] = SimpleVocab(count=v, estimator=self.MLE(v,self.total_word,ls=True,V=V))
                        elif optimizer =='sgt':
                            self.vocab[k] = SimpleVocab(count=v, estimator=sgtSmoothProb[k])
                        else:
                            self.vocab[k] = SimpleVocab(count=v, estimator=self.MLE(v,self.total_word))
                    elif i ==2:
                        """
                        Perlu diingat motivasi dibalik NGram LM adalah:
                        begin with the task of computing P(w|h), the probability of a word w given some history h
                        """
                        # P(Wi, Wj) = C(Wi, Wj) / N <= untuk BiGram <= dicari melalui MLE
                        #   tetapi yang perlu kita cari adalah P(Wj | Wi) == conditional distribution untuk
                        #       seberapa kemungkinan kata Wj muncul diberikan kata Wi sebelumnya
                        # P(Wj | Wi) = P(Wi, Wj) / P(Wi) = C(Wi, Wj) / C(Wi)
                        #       C(Wi)=> adalah unigram count
                        CWi = self.finalmodel[i-1][tok.WordTokenize(k)[0]].count
                        if optimizer=='ls':
                            #functools.reduce(operator.add,self.raw_vocab.values())
                            V=len(self.raw_vocab) #<= only needed for Laplace smoothing
                            self.vocab[k] = SimpleVocab(count=v, estimator=self.MLE(v,CWi,ls=True,V=V))
                        elif optimizer =='sgt':
                            self.vocab[k] = SimpleVocab(count=v, estimator=sgtSmoothProb[k])
                        else:
                            self.vocab[k] = SimpleVocab(count=v, estimator=self.MLE(v,CWi))
                    elif i ==3:
                        # P(Wi, Wj, Wk) = C(Wi, Wj, Wk) / N  <= for the trigram
                        #   but what we actually need is P(Wk | Wi, Wj), the conditional distribution for
                        #       how likely the word Wk is to appear given the preceding words Wi, Wj
                        CWi = self.finalmodel[i-1][' '.join(tok.WordTokenize(k)[:-1])].count
                        if optimizer=='ls':
                            V=len(self.raw_vocab) #<= only needed for Laplace smoothing
                            self.vocab[k] = SimpleVocab(count=v, estimator=self.MLE(v,CWi,ls=True,V=V))
                        elif optimizer =='sgt':
                            self.vocab[k] = SimpleVocab(count=v, estimator=sgtSmoothProb[k])
                        else:
                            self.vocab[k] = SimpleVocab(count=v, estimator=self.MLE(v,CWi))

                self.finalmodel[i]=self.vocab                
                self.vocab={}
                del self.raw_vocab

        self.perplexity(self.finalmodel, verbose=verbose)
        print ("Training language model done in %fs" % (time() - t0))
        if verbose:
            print "token \t count \t proba \n",
            for k, v in self.finalmodel.iteritems():
                print "######################################################################"
                print k, " - Gram", "\n",
                print "######################################################################"
                for ke,va in v.iteritems():
                    print ("%s\t %d\t %0.5f"%(ke,va.count,exp(va.estimator)))
                    
        return self.finalmodel
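The comments in train spell out the estimates being computed: P(Wi) = C(Wi) / N for unigrams and P(Wj | Wi) = C(Wi, Wj) / C(Wi) for bigrams, with a Laplace variant when optimizer='ls'. Below is a tiny self-contained sketch of those formulas on a toy corpus, independent of the SimpleVocab / constructVocab machinery used above; the exact Laplace form inside self.MLE is not shown, so the add-one version here is an assumption:

from collections import Counter

corpus = [['saya', 'makan', 'nasi'], ['saya', 'makan', 'ayam']]

unigrams = Counter(w for sent in corpus for w in sent)
bigrams = Counter((sent[i], sent[i + 1]) for sent in corpus for i in range(len(sent) - 1))
N = sum(unigrams.values())   # total word count
V = len(unigrams)            # vocabulary size, as in V=len(self.raw_vocab)

def p_unigram(w, laplace=False):
    # P(Wi) = C(Wi) / N, or (C(Wi) + 1) / (N + V) with add-one (Laplace) smoothing
    return (unigrams[w] + 1.0) / (N + V) if laplace else unigrams[w] / float(N)

def p_bigram(wi, wj, laplace=False):
    # P(Wj | Wi) = C(Wi, Wj) / C(Wi), or (C + 1) / (C(Wi) + V) with Laplace smoothing
    c = bigrams[(wi, wj)]
    return (c + 1.0) / (unigrams[wi] + V) if laplace else c / float(unigrams[wi])

print(p_unigram('saya'))           # 2/6 ~ 0.333
print(p_bigram('saya', 'makan'))   # 2/2 = 1.0
print(p_bigram('makan', 'nasi'))   # 1/2 = 0.5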
Example #5
    def wordTokenizer(self, sent, simple=False):
        if not simple:
            tok = tokenize()
            return tok.WordTokenize(sent)
        else:
            # Simple fallback: split on runs of non-word characters and drop empty pieces
            return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]
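For reference, the simple=True branch just splits on runs of non-word characters and keeps those runs as their own tokens. A standalone sketch of what that branch returns:

import re

def simple_split(sent):
    # Same idea as the simple=True branch: split on non-word runs, keep them
    # as separate tokens, and drop empty pieces
    return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]

print(simple_split("saya makan, nasi goreng!"))
# ['saya', 'makan', ',', 'nasi', 'goreng', '!']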