def loadSentences(self, corpus):
    """Count unigrams, bigrams and trigrams in ``corpus`` and store them.

    ``corpus`` is handed unchanged to ``UnicodeHelper.readlinesSingleColumn``
    (defined elsewhere; presumably it yields one sentence string per
    entry -- confirm against that helper).  Every sentence is split on
    whitespace and its n-grams are tallied.

    Sets on ``self``:
        uniFreq / biFreq / triFreq    -- dict mapping n-gram -> count
                                         (unigram keys are the word itself,
                                         bigram/trigram keys are tuples)
        unigrams / bigrams / trigrams -- set of the distinct n-grams seen
        uniTotal / biTotal / triTotal -- sum of all counts per order

    Finally calls ``self.computeOnceOccured()`` and prints a status line.
    """
    sentences = UnicodeHelper.readlinesSingleColumn(corpus)
    tempUni = dd(int)
    tempBi = dd(int)
    tempTri = dd(int)

    for sentence in sentences:
        words = sentence.split()
        for index, word in enumerate(words):
            tempUni[word] += 1
            # Guard explicitly on the index: words[index - 1] with index 0
            # does NOT raise, it wraps around to the last word, which used
            # to inject bogus n-grams spanning the end and the start of
            # the sentence.
            if index >= 1:
                tempBi[(words[index - 1], word)] += 1
            if index >= 2:
                tempTri[(words[index - 2], words[index - 1], word)] += 1

    # Freeze the counters into plain dicts so later lookups of unseen
    # n-grams do not silently insert zero entries.
    self.uniFreq = dict(tempUni)
    self.unigrams = set(self.uniFreq)
    self.uniTotal = sum(self.uniFreq.values())

    self.biFreq = dict(tempBi)
    self.bigrams = set(self.biFreq)
    self.biTotal = sum(self.biFreq.values())

    self.triFreq = dict(tempTri)
    self.trigrams = set(self.triFreq)
    self.triTotal = sum(self.triFreq.values())

    self.computeOnceOccured()
    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement.
    print("Trigram Model Trained")
def loadSentences(self, corpus):
    """Read ``corpus`` and keep its tokenised sentences on ``self``.

    ``corpus`` is passed straight to
    ``UnicodeHelper.readlinesSingleColumn`` (defined elsewhere).  Each
    returned line is split on whitespace and stored as a tuple of tokens;
    the resulting list is assigned to ``self.sentences``.  Two progress
    messages are printed along the way.
    """
    # Parenthesised single-argument prints emit the same text under the
    # Python 2 print statement.
    print('In Load Sentences')
    rawLines = UnicodeHelper.readlinesSingleColumn(corpus)
    print('lines loaded')

    tokenized = []
    for rawLine in rawLines:
        tokenized.append(tuple(rawLine.split()))
    self.sentences = tokenized