# Example 1
 def __init__(self):
   """Initialize empty sentence stores and one trigram language model per language."""
   self.sentences = []  # [(word, tag, lang), ...]
   self.scoredSentences = []  # [(word, tag, lang, lmScore), ...]
   self.splittedSentences = []  # [([sentence], tag)]
   # NOTE(review): "WB" presumably means Witten-Bell smoothing -- confirm in WBTrigramModel.
   self.enTrigramModel = WBTrigramModel()
   self.hiTrigramModel = WBTrigramModel()
# Example 2
class SentenceSplitter:
  """Splits code-mixed English/Hindi sentences into same-language fragments.

  A sentence is a list of (word, tag, lang) tuples where lang is 'E' or 'H'.
  Each word is scored by the language model matching its own language tag;
  the longest positively-scoring run of majority-language words becomes the
  middle fragment of the split, with prefix/suffix attributed to the other
  language.
  """

  def __init__(self):
    self.sentences = []          # [(word, tag, lang), ...] per sentence
    self.scoredSentences = []    # [(word, tag, lang, lmScore), ...] per sentence
    self.splittedSentences = []  # [(words, lang, sentIndex, startOffset), ...]
    # NOTE(review): "WB" presumably means Witten-Bell smoothing -- confirm.
    self.enTrigramModel = WBTrigramModel()
    self.hiTrigramModel = WBTrigramModel()

  def trainLMsDefault(self):
    """Train both language models from hard-coded developer-machine corpora."""
    # Escapes fixed: the old '\U...' literal only parsed because Python 2
    # passes unrecognized escapes through; under Python 3 it is a SyntaxError.
    # The runtime string values are unchanged (mixed separators included).
    enCorpus = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\forLM/en-ICE-India.txt'
    hiCorpus = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\forLM/hi-TB.txt'
    self.enTrigramModel.loadSentences(enCorpus)
    self.hiTrigramModel.loadSentences(hiCorpus)

  def trainLMsWithOpts(self, enCorpus, hiCorpus, n):
    """Train both language models from the given corpus files.

    NOTE(review): the n-gram order argument `n` is accepted but never used --
    confirm whether WBTrigramModel supports a configurable order.
    """
    self.enTrigramModel.loadSentences(enCorpus)
    self.hiTrigramModel.loadSentences(hiCorpus)

  def loadSentencesSingleColCSV(self, sentencesCSV):
    """Load sentences from a CSV where a blank first column separates sentences.

    The header row is skipped. Fixes: the final sentence is now kept even when
    the file has no terminating blank row, and runs of blank rows no longer
    append empty sentences.
    """
    csvLines = readlinesFromCSV(sentencesCSV)
    sent = []
    for line in csvLines[1:]:
      if line[0] == u'':
        if sent:  # guard: consecutive blank rows used to append empty lists
          self.sentences.append(sent)
        sent = []
      else:
        sent.append(tuple(line))
    if sent:  # flush a trailing sentence with no terminating blank row
      self.sentences.append(sent)

  def loadSentences(self, sentences):
    """Replace the sentence store with a shallow copy of `sentences`."""
    self.sentences = list(sentences)

  def scoreSentence(self, sentIndex):
    """Score each word of sentence `sentIndex` with its own language's model.

    Uses the trigram probability when two predecessors exist, the bigram with
    one, and the unigram otherwise (the redundant always-computed unigram call
    is gone). The probability p is mapped to -1/log(p) after clamping p into
    (0, 1): the lower bound 1e-6 reproduces the original zero-probability
    floor, and the upper bound fixes the ZeroDivisionError the original hit
    when a model returned exactly 1.0.

    Returns a new list of (word, tag, lang, lmScore) tuples.
    """
    sentence = self.sentences[sentIndex]
    newSentence = []
    for index in range(len(sentence)):
      word, tag, lang = sentence[index]
      lm = self.enTrigramModel if lang == 'E' else self.hiTrigramModel
      if index >= 2:  # original also ignored whether context words share lang
        prob = lm.scoreTrigram((sentence[index - 2][0], sentence[index - 1][0], word))
      elif index >= 1:
        prob = lm.scoreBigram((sentence[index - 1][0], word))
      else:
        prob = lm.scoreUnigram(word)
      prob = min(max(prob, 0.000001), 1.0 - 1e-9)  # keep log(prob) finite, nonzero
      newSentence.append((word, tag, lang, -1.0 / log(prob)))
    return newSentence

  def scoreSentences(self):
    """Re-score every loaded sentence into self.scoredSentences."""
    self.scoredSentences = [self.scoreSentence(i) for i in range(len(self.sentences))]

  def sanityCheck(self):
    """Print the scored-sentence count and the first scored sentence."""
    # Single-argument print() behaves identically under Python 2 and 3.
    print(len(self.scoredSentences))
    print(self.scoredSentences[0])

  def analyzeSentences(self):
    """Dump each scored sentence one token per line, pausing for Enter between."""
    for sentence in self.scoredSentences:
      print('\n'.join(' '.join(str(field) for field in token) for token in sentence) + '\n')
      raw_input()  # NOTE(review): Python 2 only; would be input() on Python 3

  def splitSentences(self):
    """Split each scored sentence into up to three same-language fragments.

    The majority language is the `origin`; origin-language word scores count
    positive and the rest negative, and longestPositive picks the best-scoring
    run [start, end) as the origin fragment. Appends
    (words, lang, sentIndex, startOffset) tuples to self.splittedSentences,
    where words is a list of (word, tag) pairs and sentIndex is 1-based.
    """
    index = 0
    for sentence in self.scoredSentences:
      index += 1
      origins = Counter(token[2] for token in sentence)
      origin = 'E' if origins['H'] < origins['E'] else 'H'  # ties go to Hindi, as before
      otherOrigin = 'H' if origin == 'E' else 'E'
      originedScores = [score if lang == origin else -score
                        for _, _, lang, score in sentence]
      start, end = longestPositive(originedScores)
      # List comprehension (not map) so slicing below also works on Python 3.
      words = [token[0:2] for token in sentence]
      if start > 0:
        self.splittedSentences.append((words[0:start], otherOrigin, index, 0))
      if end > start:
        self.splittedSentences.append((words[start:end], origin, index, start))
      if len(words) > end:
        self.splittedSentences.append((words[end:len(words)], otherOrigin, index, end))