示例#1
0
 def __init__(self):
   """Start with no sentences loaded and two fresh Ngrammodel instances."""
   # Loaded input, laid out as [(word, tag, lang), ...]
   self.sentences = []
   # Scored output, laid out as [(word, tag, lang, hiScore, enScore), ...]
   self.scoredSentences = []
   # One language model per language.
   # NOTE(review): "en"/"hi" presumably stand for English/Hindi -- confirm.
   self.enTrigramModel = Ngrammodel()
   self.hiTrigramModel = Ngrammodel()
示例#2
0
class SentenceSplitter:
  """Scores every word of tagged sentences under two n-gram language models.

  Sentences are read from a CSV file (loadSentences), the two models are
  trained from corpora (trainLMs), and each word is then annotated with the
  probability each model assigns to it given the preceding words
  (scoreSentences).

  NOTE(review): "en"/"hi" presumably stand for English/Hindi -- confirm.
  """

  # Probability substituted when a language model reports exactly 0 for a
  # word, so that callers never receive a zero score.
  UNSEEN_PROB = 0.001

  def __init__(self):
    # One entry per sentence; each sentence is a list of row tuples,
    # i.e. [(word, tag, lang), ...].
    self.sentences = []
    # Same layout as self.sentences with both scores appended to every row:
    # [(word, tag, lang, hiScore, enScore), ...].
    self.scoredSentences = []
    self.enTrigramModel = Ngrammodel()
    self.hiTrigramModel = Ngrammodel()

  def trainLMs(self, enCorpus, hiCorpus, n):
    """Load a corpus into each language model and train it with order n.

    enCorpus and hiCorpus are handed unchanged to Ngrammodel.loadSentences;
    n is handed unchanged to Ngrammodel.trainNgramModel. Despite the
    attribute names, the order is whatever n the caller supplies.
    """
    self.enTrigramModel.loadSentences(enCorpus)
    self.enTrigramModel.trainNgramModel(n)
    self.hiTrigramModel.loadSentences(hiCorpus)
    self.hiTrigramModel.trainNgramModel(n)

  def loadSentences(self, sentencesCSV):
    """Append the sentences found in sentencesCSV to self.sentences.

    The first row is skipped (presumably a header -- confirm). Every other
    row becomes one tuple of the current sentence. A row whose first field
    is empty, or a row with no fields at all, closes the current sentence.
    A final sentence that is not followed by such a separator row is kept
    as well instead of being silently dropped.
    """
    csvLines = readlinesFromCSV(sentencesCSV)
    sent = []
    for line in csvLines[1:]:
      # `not line` covers rows with no fields, where line[0] would raise.
      if not line or line[0] == u'':
        self.sentences.append(sent)
        sent = []
      else:
        sent.append(tuple(line))
    # Flush the last sentence when the file has no trailing separator row.
    if sent:
      self.sentences.append(sent)

  def scoreWithTrigamLM(self, word, context, lm):
    """Return lm.prob(word, context) using at most the last two context words.

    word    -- the word to score.
    context -- sequence of the words preceding `word`; only its last two
               items are passed on, as a tuple (shorter contexts are passed
               whole, without padding).
    lm      -- object exposing prob(word, context).

    A probability of exactly 0 is replaced by UNSEEN_PROB. The misspelled
    method name is kept because callers may depend on it.
    """
    # Slicing a sequence shorter than two items simply returns all of it.
    context = tuple(context[-2:])
    prob = lm.prob(word, context)
    if prob == 0:
      prob = self.UNSEEN_PROB
    return prob

  def scoreSentence(self, sentIndex):
    """Return sentence number sentIndex with both model scores on every row.

    The result is a new list of (word, tag, lang, hiScore, enScore) tuples;
    self.sentences is not modified. Each word is scored against the words
    that precede it in the same sentence.
    """
    sentence = self.sentences[sentIndex]
    context = []
    newSentence = []
    for word, tag, lang in sentence:
      hiScore = self.scoreWithTrigamLM(word, context, self.hiTrigramModel)
      enScore = self.scoreWithTrigamLM(word, context, self.enTrigramModel)
      newSentence.append((word, tag, lang, hiScore, enScore))
      # The word just scored becomes context for the following words.
      context.append(word)
    return newSentence

  def scoreSentences(self):
    """Rebuild self.scoredSentences from scratch for all loaded sentences."""
    self.scoredSentences = [self.scoreSentence(sentIndex)
                            for sentIndex, _ in enumerate(self.sentences)]

  def sanityCheck(self):
    """Print the number of scored sentences and, if there is one, the first."""
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print(len(self.scoredSentences))
    if self.scoredSentences:
      print(self.scoredSentences[0])

  def analyzeSentences(self):
    """Print scored sentences one at a time, waiting for Enter after each.

    Every row is printed on its own line with its fields separated by
    single spaces, followed by a blank line.
    """
    for sentence in self.scoredSentences:
      # NOTE(review): under Python 2, str() may fail on non-ASCII unicode
      # words -- confirm the type of the row fields before relying on this.
      rows = [' '.join(str(field) for field in token) for token in sentence]
      print('\n'.join(rows) + '\n')
      # Pause until the user presses Enter; the typed text is ignored.
      raw_input()