def __init__(self):
    """Set up empty sentence stores and one Ngrammodel per language.

    The statements are placed on separate lines so that the inline
    comments no longer swallow the assignments that follow them.
    """
    # Each sentence is a list of (word, tag, lang) tuples.
    self.sentences = []
    # Each scored sentence is a list of
    # (word, tag, lang, hiScore, enScore) tuples.
    self.scoredSentences = []
    # Freshly constructed models; nothing is loaded or trained here.
    self.enTrigramModel = Ngrammodel()
    self.hiTrigramModel = Ngrammodel()
class SentenceSplitter:
    """Score every word of tagged sentences under two language models.

    Sentences are lists of (word, tag, lang) tuples.  Scoring appends the
    probability each word receives from the model held in
    ``hiTrigramModel`` and from the one held in ``enTrigramModel``, giving
    (word, tag, lang, hiScore, enScore) tuples.

    NOTE(review): the attribute names say "trigram", but the order actually
    trained is whatever ``n`` is passed to ``trainLMs``; scoring always
    passes at most two words of context.
    """

    # Value substituted when a model reports a probability of exactly 0.
    _ZERO_PROB_FLOOR = 0.001
    # Number of preceding words handed to the model as context.
    _CONTEXT_SIZE = 2

    def __init__(self):
        """Create empty sentence stores and two Ngrammodel instances."""
        self.sentences = []  # [[(word, tag, lang), ...], ...]
        self.scoredSentences = []  # [[(word, tag, lang, hiScore, enScore), ...], ...]
        self.enTrigramModel = Ngrammodel()
        self.hiTrigramModel = Ngrammodel()

    def trainLMs(self, enCorpus, hiCorpus, n):
        """Load a corpus into each model and train both with order ``n``.

        ``enCorpus`` and ``hiCorpus`` are passed straight to
        ``Ngrammodel.loadSentences``; ``n`` is passed straight to
        ``Ngrammodel.trainNgramModel``.
        """
        self.enTrigramModel.loadSentences(enCorpus)
        self.enTrigramModel.trainNgramModel(n)
        self.hiTrigramModel.loadSentences(hiCorpus)
        self.hiTrigramModel.trainNgramModel(n)

    @staticmethod
    def _groupRows(rows):
        """Split a flat sequence of rows into sentences.

        A row that is empty, or whose first cell is the empty string, ends
        the current sentence.  Every other row becomes one tuple of the
        current sentence.  Returns the list of sentences.
        """
        groups = []
        current = []
        for row in rows:
            # An empty row would make row[0] raise IndexError; treat it
            # like the explicit empty-first-cell separator.
            if not row or row[0] == u'':
                groups.append(current)
                current = []
            else:
                current.append(tuple(row))
        # Keep a final sentence that is not followed by a separator row;
        # previously it was silently discarded.
        if current:
            groups.append(current)
        return groups

    def loadSentences(self, sentencesCSV):
        """Read ``sentencesCSV`` and append its sentences to ``self.sentences``.

        The rows come from ``readlinesFromCSV``.  The first row is skipped
        (presumably a header -- confirm against the data files).
        """
        csvLines = readlinesFromCSV(sentencesCSV)
        self.sentences.extend(self._groupRows(csvLines[1:]))

    def scoreWithTrigamLM(self, word, context, lm):
        """Return ``lm.prob(word, context)`` using at most the last two context words.

        ``context`` is the sequence of words preceding ``word``; it is
        converted to a tuple before being given to the model.  A
        probability of exactly 0 is replaced by a small floor value so that
        callers never receive 0.
        """
        # Slicing keeps shorter contexts intact and trims longer ones.
        context = tuple(context[-self._CONTEXT_SIZE:])
        prob = lm.prob(word, context)
        if prob == 0:
            prob = self._ZERO_PROB_FLOOR
        return prob

    def scoreSentence(self, sentIndex):
        """Score the sentence stored at ``self.sentences[sentIndex]``.

        Returns a new list of (word, tag, lang, hiScore, enScore) tuples;
        the stored sentence is left unchanged.
        """
        sentence = self.sentences[sentIndex]
        context = []  # words seen so far in this sentence
        newSentence = []
        for word, tag, lang in sentence:
            hiScore = self.scoreWithTrigamLM(word, context, self.hiTrigramModel)
            enScore = self.scoreWithTrigamLM(word, context, self.enTrigramModel)
            newSentence.append((word, tag, lang, hiScore, enScore))
            context.append(word)
        return newSentence

    def scoreSentences(self):
        """Rebuild ``self.scoredSentences`` by scoring every loaded sentence."""
        self.scoredSentences = [
            self.scoreSentence(sentIndex)
            for sentIndex in range(len(self.sentences))
        ]

    def sanityCheck(self):
        """Print the number of scored sentences and, if any, the first one."""
        print(len(self.scoredSentences))
        # Guard against IndexError when nothing has been scored yet.
        if self.scoredSentences:
            print(self.scoredSentences[0])

    def analyzeSentences(self):
        """Print each scored sentence, one word per line, pausing after each.

        The fields of a word are joined with single spaces.  After every
        sentence the method waits for a line on standard input.
        """
        # raw_input exists only on Python 2; fall back to input on Python 3.
        try:
            waitForLine = raw_input
        except NameError:
            waitForLine = input
        for sentence in self.scoredSentences:
            rows = [' '.join(str(field) for field in scored)
                    for scored in sentence]
            print('\n'.join(rows) + '\n')
            waitForLine()