# Example 1
 def __init__(self):
   """Initialize empty sentence stores and one trigram language model per language."""
   self.sentences = []  # [(word, tag, lang), ...]
   self.scoredSentences = []  # [(word, tag, lang, lmScore), ...]
   self.splittedSentences = []  # [([sentence], tag)]
   # NOTE(review): "WB" presumably means Witten-Bell smoothing -- confirm in WBTrigramModel.
   self.enTrigramModel = WBTrigramModel()
   self.hiTrigramModel = WBTrigramModel()
# Example 2
class SentenceSplitter:
  """Splits code-mixed English/Hindi sentences into same-language fragments.

  A sentence is a list of (word, tag, lang) tuples where lang is 'E' or 'H'.
  Each word is scored by the language model matching its own language tag;
  the longest positively-scoring run of majority-language words becomes the
  middle fragment of the split, with prefix/suffix attributed to the other
  language.
  """

  def __init__(self):
    self.sentences = []          # [(word, tag, lang), ...] per sentence
    self.scoredSentences = []    # [(word, tag, lang, lmScore), ...] per sentence
    self.splittedSentences = []  # [(words, lang, sentIndex, startOffset), ...]
    # NOTE(review): "WB" presumably means Witten-Bell smoothing -- confirm.
    self.enTrigramModel = WBTrigramModel()
    self.hiTrigramModel = WBTrigramModel()

  def trainLMsDefault(self):
    """Train both language models from hard-coded developer-machine corpora."""
    # Escapes fixed: the old '\U...' literal only parsed because Python 2
    # passes unrecognized escapes through; under Python 3 it is a SyntaxError.
    # The runtime string values are unchanged (mixed separators included).
    enCorpus = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\forLM/en-ICE-India.txt'
    hiCorpus = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\forLM/hi-TB.txt'
    self.enTrigramModel.loadSentences(enCorpus)
    self.hiTrigramModel.loadSentences(hiCorpus)

  def trainLMsWithOpts(self, enCorpus, hiCorpus, n):
    """Train both language models from the given corpus files.

    NOTE(review): the n-gram order argument `n` is accepted but never used --
    confirm whether WBTrigramModel supports a configurable order.
    """
    self.enTrigramModel.loadSentences(enCorpus)
    self.hiTrigramModel.loadSentences(hiCorpus)

  def loadSentencesSingleColCSV(self, sentencesCSV):
    """Load sentences from a CSV where a blank first column separates sentences.

    The header row is skipped. Fixes: the final sentence is now kept even when
    the file has no terminating blank row, and runs of blank rows no longer
    append empty sentences.
    """
    csvLines = readlinesFromCSV(sentencesCSV)
    sent = []
    for line in csvLines[1:]:
      if line[0] == u'':
        if sent:  # guard: consecutive blank rows used to append empty lists
          self.sentences.append(sent)
        sent = []
      else:
        sent.append(tuple(line))
    if sent:  # flush a trailing sentence with no terminating blank row
      self.sentences.append(sent)

  def loadSentences(self, sentences):
    """Replace the sentence store with a shallow copy of `sentences`."""
    self.sentences = list(sentences)

  def scoreSentence(self, sentIndex):
    """Score each word of sentence `sentIndex` with its own language's model.

    Uses the trigram probability when two predecessors exist, the bigram with
    one, and the unigram otherwise (the redundant always-computed unigram call
    is gone). The probability p is mapped to -1/log(p) after clamping p into
    (0, 1): the lower bound 1e-6 reproduces the original zero-probability
    floor, and the upper bound fixes the ZeroDivisionError the original hit
    when a model returned exactly 1.0.

    Returns a new list of (word, tag, lang, lmScore) tuples.
    """
    sentence = self.sentences[sentIndex]
    newSentence = []
    for index in range(len(sentence)):
      word, tag, lang = sentence[index]
      lm = self.enTrigramModel if lang == 'E' else self.hiTrigramModel
      if index >= 2:  # original also ignored whether context words share lang
        prob = lm.scoreTrigram((sentence[index - 2][0], sentence[index - 1][0], word))
      elif index >= 1:
        prob = lm.scoreBigram((sentence[index - 1][0], word))
      else:
        prob = lm.scoreUnigram(word)
      prob = min(max(prob, 0.000001), 1.0 - 1e-9)  # keep log(prob) finite, nonzero
      newSentence.append((word, tag, lang, -1.0 / log(prob)))
    return newSentence

  def scoreSentences(self):
    """Re-score every loaded sentence into self.scoredSentences."""
    self.scoredSentences = [self.scoreSentence(i) for i in range(len(self.sentences))]

  def sanityCheck(self):
    """Print the scored-sentence count and the first scored sentence."""
    # Single-argument print() behaves identically under Python 2 and 3.
    print(len(self.scoredSentences))
    print(self.scoredSentences[0])

  def analyzeSentences(self):
    """Dump each scored sentence one token per line, pausing for Enter between."""
    for sentence in self.scoredSentences:
      print('\n'.join(' '.join(str(field) for field in token) for token in sentence) + '\n')
      raw_input()  # NOTE(review): Python 2 only; would be input() on Python 3

  def splitSentences(self):
    """Split each scored sentence into up to three same-language fragments.

    The majority language is the `origin`; origin-language word scores count
    positive and the rest negative, and longestPositive picks the best-scoring
    run [start, end) as the origin fragment. Appends
    (words, lang, sentIndex, startOffset) tuples to self.splittedSentences,
    where words is a list of (word, tag) pairs and sentIndex is 1-based.
    """
    index = 0
    for sentence in self.scoredSentences:
      index += 1
      origins = Counter(token[2] for token in sentence)
      origin = 'E' if origins['H'] < origins['E'] else 'H'  # ties go to Hindi, as before
      otherOrigin = 'H' if origin == 'E' else 'E'
      originedScores = [score if lang == origin else -score
                        for _, _, lang, score in sentence]
      start, end = longestPositive(originedScores)
      # List comprehension (not map) so slicing below also works on Python 3.
      words = [token[0:2] for token in sentence]
      if start > 0:
        self.splittedSentences.append((words[0:start], otherOrigin, index, 0))
      if end > start:
        self.splittedSentences.append((words[start:end], origin, index, start))
      if len(words) > end:
        self.splittedSentences.append((words[end:len(words)], otherOrigin, index, end))