예제 #1
0
 def loadSentences(self, corpus):
   """Build unigram, bigram and trigram frequency tables from a corpus.

   The corpus is read through ``UnicodeHelper.readlinesSingleColumn``; each
   returned sentence is split on whitespace, and every word, every pair of
   adjacent words and every run of three adjacent words is counted.

   N-grams are only formed from words that really precede the current word
   inside the same sentence. Sentence-initial words contribute no bigram,
   and the first two words contribute no trigram.

   Results are stored on the instance:
     * ``uniFreq`` / ``biFreq`` / ``triFreq``    -- dict: n-gram -> count
     * ``unigrams`` / ``bigrams`` / ``trigrams`` -- set of distinct n-grams
     * ``uniTotal`` / ``biTotal`` / ``triTotal`` -- sum of all counts

   Afterwards ``self.computeOnceOccured()`` is invoked and a status line is
   printed.

   Args:
     corpus: handed unchanged to ``UnicodeHelper.readlinesSingleColumn``.
       NOTE(review): presumably a path to a one-sentence-per-line file --
       confirm against that helper.
   """
   sentences = UnicodeHelper.readlinesSingleColumn(corpus)
   tempUni = dd(int)
   tempBi = dd(int)
   tempTri = dd(int)
   for sentence in sentences:
     words = sentence.split()
     for index, word in enumerate(words):
       tempUni[word] += 1
       # Explicit bounds checks are required here: a negative index does not
       # raise IndexError, it silently wraps around to the end of the
       # sentence and would produce n-grams that never occur in the text.
       if index >= 1:
         tempBi[(words[index - 1], word)] += 1
       if index >= 2:
         tempTri[(words[index - 2], words[index - 1], word)] += 1

   # Freeze the counters into plain dicts so later lookups of unseen n-grams
   # cannot insert new zero-count entries.
   self.uniFreq = dict(tempUni)
   self.unigrams = set(self.uniFreq)
   self.uniTotal = sum(self.uniFreq.values())
   self.biFreq = dict(tempBi)
   self.bigrams = set(self.biFreq)
   self.biTotal = sum(self.biFreq.values())
   self.triFreq = dict(tempTri)
   self.trigrams = set(self.triFreq)
   self.triTotal = sum(self.triFreq.values())

   self.computeOnceOccured()

   print("Trigram Model Trained")
예제 #2
0
 def loadSentences(self, corpus):
   print 'In Load Sentences'
   lines = UnicodeHelper.readlinesSingleColumn(corpus)
   print 'lines loaded'
   self.sentences = [tuple(line.split()) for line in lines]