예제 #1
0
def tagBigboss(unsegSentences):
  """Split sentences by language, tag each chunk, and write the merged tags to CSV.

  Trains one learner on the WSJ training file and one on the Hindi training
  file, splits ``unsegSentences`` with ``SentenceSplitter``, tags chunks
  flagged 'E' with the WSJ-trained tagger and chunks flagged 'H' with the
  Hindi-trained tagger, prints accuracy figures, re-assembles the chunks per
  sentence id and writes the result to ``longPosAutoTags.csv``.

  Args:
    unsegSentences: sentences handed to ``SentenceSplitter.loadSentences``.

  Returns:
    None. Results are printed and written to disk.
  """
  enLearner = learner.Learner()
  enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
  enTrainSents = Reader.readTaggsedSentences(enTrainData)
  enLearner.train(enTrainSents)

  hiLearner = learner.Learner()
  hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
  hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
  hiLearner.train(hiTrainSents)

  SS = SentenceSplitter()
  SS.trainLMsDefault()
  SS.loadSentences(unsegSentences)
  SS.scoreSentences()
  SS.splitSentences()
  testSents = SS.splittedSentences

  # Each split entry is used positionally: [0] is the chunk given to the
  # tagger, [1] the language flag ('E' or 'H'), [2] the key chunks are grouped
  # by, and [3] the value chunks are ordered by within a group.
  enTestSents = [(s[0], s[2], s[3]) for s in testSents if s[1] == 'E']
  enTagger = tagger.Tagger(enLearner)
  enTagger.loadTestSentences([s[0] for s in enTestSents])
  enTagger.tag()

  hiTestSents = [(s[0], s[2], s[3]) for s in testSents if s[1] == 'H']
  hiTagger = tagger.Tagger(hiLearner)
  hiTagger.loadTestSentences([s[0] for s in hiTestSents])
  hiTagger.tag()

  print "English Accuracy:", enTagger.accuracy()
  print "Hindi Accuracy:", hiTagger.accuracy()
  enCorrect, enTotal = enTagger.getAccuCounts()
  hiCorrect, hiTotal = hiTagger.getAccuCounts()
  print "EN Total:", enTotal, "EN Correct:", enCorrect
  print "HI Total:", hiTotal, "HI Correct:", hiCorrect
  print "Total Accuracy:", (enCorrect+hiCorrect)*100.0/(enTotal+hiTotal)

  taggedSentneces = dd(list)

  # Group every tagged chunk under its sentence id, remembering its position.
  assert len(enTestSents) == len(enTagger.outputs)
  for sentIndex, output in enumerate(enTagger.outputs):
    taggedSentneces[enTestSents[sentIndex][1]].append((output, enTestSents[sentIndex][2]))

  assert len(hiTestSents) == len(hiTagger.outputs)
  for sentIndex, output in enumerate(hiTagger.outputs):
    taggedSentneces[hiTestSents[sentIndex][1]].append((output, hiTestSents[sentIndex][2]))

  # Restore the original chunk order and flatten each group into one sentence.
  # Only values of existing keys are replaced, so iterating the dict is safe.
  for sentId in taggedSentneces:
    sent = []
    taggedChunks = sorted(taggedSentneces[sentId], key=lambda chunk: int(chunk[1]))
    for chunk in taggedChunks:
      sent.extend(chunk[0])
    taggedSentneces[sentId] = sent

  taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\Bigboss\\longPosAutoTags.csv'
  # The context manager guarantees the CSV is flushed and closed, even if a
  # row fails to serialise.
  with open(taggedFB, 'w') as outFile:
    writer = UnicodeWriter(outFile, lineterminator='\n')
    for sent in taggedSentneces.itervalues():
      for line in sent:
        writer.writerow(line)
      # Blank row separates consecutive sentences.
      writer.writerow(["", ""])
예제 #2
0
def tagPure(unsegSentences):
  """Tag language-split chunks with top-5 tagging and return them grouped per sentence.

  The input is normalised with ``normalizeManually`` and split with
  ``purelySplitSentences``. Chunks flagged 'E' go to a tagger trained on the
  WSJ file, chunks flagged 'H' to one trained on the Hindi file; both use
  ``tagTopK(5)``. Accuracy figures are printed along the way.

  Args:
    unsegSentences: sentences passed to ``normalizeManually``.

  Returns:
    A defaultdict mapping each sentence id to the flat list of tagger outputs
    for that sentence, with chunks concatenated in ascending position order.
  """
  unsegSentences = normalizeManually(unsegSentences)
  testSents = purelySplitSentences(unsegSentences)

  enLearner = learner.Learner()
  enLearner.train(Reader.readTaggsedSentences(
      'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'))

  hiLearner = learner.Learner()
  hiLearner.train(Reader.readTaggsedSentences(
      'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'))

  # Entries are used positionally: [0] chunk, [1] language flag,
  # [2] grouping key, [3] ordering value.
  enTestSents = [(entry[0], entry[2], entry[3]) for entry in testSents if entry[1] == 'E']
  enTagger = tagger.Tagger(enLearner)
  enTagger.loadTestSentences([triple[0] for triple in enTestSents])
  # Top-5 tagging is used here rather than the plain tag() call.
  enTagger.tagTopK(5)

  hiTestSents = [(entry[0], entry[2], entry[3]) for entry in testSents if entry[1] == 'H']
  hiTagger = tagger.Tagger(hiLearner)
  hiTagger.loadTestSentences([triple[0] for triple in hiTestSents])
  hiTagger.tagTopK(5)

  print "English Accuracy:", enTagger.accuracy()
  print "Hindi Accuracy:", hiTagger.accuracy()
  enCorrect, enTotal = enTagger.getAccuCounts()
  hiCorrect, hiTotal = hiTagger.getAccuCounts()
  print "EN Total:", enTotal, "EN Correct:", enCorrect
  print "HI Total:", hiTotal, "HI Correct:", hiCorrect
  print "Total Accuracy:", (enCorrect + hiCorrect) * 100.0 / (enTotal + hiTotal)

  grouped = dd(list)

  # File every tagged chunk under its sentence id together with its position.
  assert len(enTestSents) == len(enTagger.outputs)
  for slot, tagged in enumerate(enTagger.outputs):
    meta = enTestSents[slot]
    grouped[meta[1]].append((tagged, meta[2]))

  assert len(hiTestSents) == len(hiTagger.outputs)
  for slot, tagged in enumerate(hiTagger.outputs):
    meta = hiTestSents[slot]
    grouped[meta[1]].append((tagged, meta[2]))

  # Order each sentence's chunks by position and flatten them in place.
  for sentId in grouped:
    ordered = sorted(grouped[sentId], key=lambda pair: int(pair[1]))
    merged = []
    for pair in ordered:
      merged.extend(pair[0])
    grouped[sentId] = merged
  return grouped
예제 #3
0
def tagWithBigbossTransitions(unsegSentences):
  taggedSentences = tagPure(unsegSentences)

  # # Context Viterbi Decoding ##
  bigbossLearner = learner.Learner()
  bigbossTrainData = 'C:\\Users\\t-phgad\\Documents\\Project\\Data\\Bigboss\\FromDOcs\\bigbossDev.uni.txt'
  bigbossSents = Reader.readTaggsedSentences(bigbossTrainData)
  bigbossLearner.train(bigbossSents)
  bigbossLearner.laplaceSmoothTransitions()
  vitDecoder = tagger.ViteriDecoder(bigbossLearner)
  outputs = []
  for _, topKOutput in taggedSentences.iteritems():
    sentence = map(lambda x:x[0], topKOutput)
    topKTags = map(lambda x:x[1], topKOutput)
    tags = vitDecoder.decodeTopK(topKTags, sentence)
    output = zip(sentence, tags)
    outputs.append(output)
  print "Context Decoding Accuracy:", accuracy(unsegSentences, outputs)
  return
  # # Writing
  taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBContextTagged.csv'
  writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n')
  # for sent in taggedSentneces.itervalues():
  for sent in outputs:
    for line in sent:
      writer.writerow(line)
    writer.writerow(["", ""])
예제 #4
0
def tagTaggedSentences(testSents, taggerTags):
    """Tag each sentence with the tagger selected by its language flag and print it.

    Sentences whose flag in ``taggerTags`` is "E" are tagged by a tagger
    trained on the WSJ file; those flagged "H" by one trained on the Hindi
    file. Accuracy figures are printed, followed by every tagged sentence in
    input order, one tab-joined item per line.

    Args:
        testSents: sequence of sentences to tag.
        taggerTags: sequence parallel to ``testSents`` holding "E" or "H".

    Returns:
        None. All results go to standard output.
    """
    enLearner = learner.Learner()
    enLearner.train(
        Reader.readTaggsedSentences(
            "C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt"
        )
    )

    hiLearner = learner.Learner()
    hiLearner.train(
        Reader.readTaggsedSentences(
            "C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt"
        )
    )

    # Keep the original position next to each sentence so the outputs can be
    # put back in input order afterwards.
    enTestSents = [(sent, pos) for pos, sent in enumerate(testSents) if taggerTags[pos] == "E"]
    enTagger = tagger.Tagger(enLearner)
    enTagger.loadTestSentences([pair[0] for pair in enTestSents])
    enTagger.tag()

    hiTestSents = [(sent, pos) for pos, sent in enumerate(testSents) if taggerTags[pos] == "H"]
    hiTagger = tagger.Tagger(hiLearner)
    hiTagger.loadTestSentences([pair[0] for pair in hiTestSents])
    hiTagger.tag()

    print "English Accuracy:", enTagger.accuracy()
    print "Hindi Accuracy:", hiTagger.accuracy()
    enCorrect, enTotal = enTagger.getAccuCounts()
    hiCorrect, hiTotal = hiTagger.getAccuCounts()
    print "EN Total:", enTotal, "EN Correct:", enCorrect
    print "HI Total:", hiTotal, "HI Correct:", hiCorrect
    print "Total Accuracy:", (enCorrect + hiCorrect) * 100.0 / (enTotal + hiTotal)

    byPosition = {}

    assert len(enTagger.outputs) == len(enTestSents)
    for slot, tagged in enumerate(enTagger.outputs):
        byPosition[enTestSents[slot][1]] = tagged

    assert len(hiTagger.outputs) == len(hiTestSents)
    for slot, tagged in enumerate(hiTagger.outputs):
        byPosition[hiTestSents[slot][1]] = tagged

    # Every position must have been claimed by one of the two taggers.
    for position in range(len(testSents)):
        rows = ["\t".join(item) for item in byPosition[position]]
        print "\n".join(rows) + "\n"
예제 #5
0
def tagBigboss():
  """Read the Bigboss CSV, convert it with map2UniWithLtag, and tag the result."""
  csvPath = 'C:\Users\\t-phgad\Documents\Project\Data\Bigboss\FromDocs\CSSentsWithIDs.csv'
  rawSentences = Reader.readTaggsedSentencesCSV(csvPath, 2, 5)
  convertedSentences = map2UniWithLtag(rawSentences)
  # NOTE(review): called with a single argument; confirm this matches the
  # tagTaggedSentences definition that is in scope for this module.
  tagTaggedSentences(convertedSentences)
예제 #6
0
def tagUntaggedSentences(unsegSentences):
  """Tag untagged, language-split sentences and write the merged tags to CSV.

  Splits the input with ``purelySplitUntaggedSentences``, tags chunks flagged
  'E' with a tagger trained on the WSJ file and chunks flagged 'H' with one
  trained on the Hindi file, re-assembles the chunks per sentence id and
  writes the result to ``cominedWordsLangsTagged.csv``.

  Args:
    unsegSentences: sentences passed to ``purelySplitUntaggedSentences``.

  Returns:
    None. The result is written to disk.
  """
  testSents = purelySplitUntaggedSentences(unsegSentences)
  enLearner = learner.Learner()
  enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
  enTrainSents = Reader.readTaggsedSentences(enTrainData)
  enLearner.train(enTrainSents)

  hiLearner = learner.Learner()
  hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
  hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
  hiLearner.train(hiTrainSents)

  # Entries are used positionally: [0] chunk, [1] language flag,
  # [2] grouping key, [3] ordering value.
  enTestSents = [(s[0], s[2], s[3]) for s in testSents if s[1] == 'E']
  enTagger = tagger.Tagger(enLearner)
  enTagger.loadTestSentences([s[0] for s in enTestSents])
  enTagger.tag()

  hiTestSents = [(s[0], s[2], s[3]) for s in testSents if s[1] == 'H']
  hiTagger = tagger.Tagger(hiLearner)
  hiTagger.loadTestSentences([s[0] for s in hiTestSents])
  hiTagger.tag()

  taggedSentneces = dd(list)

  # Group every tagged chunk under its sentence id, remembering its position.
  assert len(enTestSents) == len(enTagger.outputs)
  for sentIndex, output in enumerate(enTagger.outputs):
    taggedSentneces[enTestSents[sentIndex][1]].append((output, enTestSents[sentIndex][2]))

  assert len(hiTestSents) == len(hiTagger.outputs)
  for sentIndex, output in enumerate(hiTagger.outputs):
    taggedSentneces[hiTestSents[sentIndex][1]].append((output, hiTestSents[sentIndex][2]))

  # Restore the chunk order and flatten each group into one sentence. Only
  # values of existing keys are replaced, so iterating the dict is safe.
  for sentId in taggedSentneces:
    sent = []
    taggedChunks = sorted(taggedSentneces[sentId], key=lambda chunk: int(chunk[1]))
    for chunk in taggedChunks:
      sent.extend(chunk[0])
    taggedSentneces[sentId] = sent

  taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\cominedWordsLangsTagged.csv'
  # The context manager guarantees the CSV is flushed and closed.
  with open(taggedFB, 'w') as outFile:
    writer = UnicodeWriter(outFile, lineterminator='\n')
    for sent in taggedSentneces.itervalues():
      for line in sent:
        writer.writerow(line)
      # Blank row separates consecutive sentences.
      writer.writerow(["", ""])
예제 #7
0
def tagBigboss():
    """Tag the Bigboss dev sentences using tagger tags taken from the segmented CSV.

    Reads the test sentences from the uni-format dev file, obtains the tagger
    tags and the inferred tagger tags from the segmented CSV, runs
    ``tagTaggedSentences`` with the former, and prints the length of both tag
    lists.
    """
    segmentedCsv = "C:\Users\\t-phgad\Documents\Project\Data\Bigboss\FromDocs\CSSentsSegWithIDs.csv"
    devUniFile = "C:\Users\\t-phgad\Documents\Project\Data\Bigboss\FromDocs\\bigbossDev.uni.txt"
    # The dev file is read as-is; the prepareTestDataUni step is not run here.
    devSentences = Reader.readTaggsedSentences(devUniFile)
    csvTags = getTaggerTags(segmentedCsv)
    # Inferred tags are only reported below, not used for tagging.
    inferredTags = inferTaggerTags(segmentedCsv)
    tagTaggedSentences(devSentences, csvTags)
    print "Length of tags:", len(csvTags)
    print "Length of inferred tags:", len(inferredTags)
예제 #8
0
def tagWithManualTaggerTags(unsegSentences):
  """Re-decode top-k tagger output using a hand-assigned language per sentence.

  Runs ``tagPure`` on the input, then decodes every sentence with a Viterbi
  decoder built from either the WSJ-trained or the Hindi-trained learner
  (both Laplace-smoothed), chosen by the hard-coded ``taggerTags`` list. The
  decoding accuracy is printed and the result is written to
  ``FBContextMLTagged.csv``.

  Args:
    unsegSentences: sentences passed to ``tagPure`` and to ``accuracy``.

  Returns:
    None. Results are printed and written to disk.

  Raises:
    ValueError: if a tagger tag is neither "E" nor "H".
  """
  taggedSentences = tagPure(unsegSentences)
  # NOTE(review): tags are matched to sentences by the iteration order of the
  # mapping returned by tagPure - confirm that order is the intended one.
  taggerTags = ["H", "H", "E", "H", "H", "H", "H", "H", "H", "H", "E", "H", "E", "E", "H", "E", "E", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H"]
  assert len(taggedSentences) == len(taggerTags)

  enLearner = learner.Learner()
  enTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
  enTrainSents = Reader.readTaggsedSentences(enTrainData)
  enLearner.train(enTrainSents)
  enLearner.laplaceSmoothTransitions()

  hiLearner = learner.Learner()
  hiTrainData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
  hiTrainSents = Reader.readTaggsedSentences(hiTrainData)
  hiLearner.train(hiTrainSents)
  hiLearner.laplaceSmoothTransitions()

  hiVitDecoder = tagger.ViteriDecoder(hiLearner)
  enVitDecoder = tagger.ViteriDecoder(enLearner)
  outputs = []
  for index, topKOutput in enumerate(taggedSentences.itervalues()):
    # Each entry is split positionally: [0] goes into the sentence, [1] into
    # the candidate list given to the decoder.
    sentence = [entry[0] for entry in topKOutput]
    topKTags = [entry[1] for entry in topKOutput]
    if taggerTags[index] == "E":
      tags = enVitDecoder.decodeTopK(topKTags, sentence)
    elif taggerTags[index] == "H":
      tags = hiVitDecoder.decodeTopK(topKTags, sentence)
    else:
      # Previously an unknown tag silently reused the previous sentence's tags
      # (or hit a NameError on the first sentence); fail loudly instead.
      raise ValueError("Unknown tagger tag: %r" % (taggerTags[index],))
    outputs.append(zip(sentence, tags))
  print "Context Decoding Accuracy:", accuracy(unsegSentences, outputs)

  taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBContextMLTagged.csv'
  # The context manager guarantees the CSV is flushed and closed.
  with open(taggedFB, 'w') as outFile:
    writer = UnicodeWriter(outFile, lineterminator='\n')
    for sent in outputs:
      for line in sent:
        writer.writerow(line)
      # Blank row separates consecutive sentences.
      writer.writerow(["", ""])
예제 #9
0
  
  assert len(enTestSents) == len(enTagger.outputs)
  for sentIndex in range(len(enTagger.outputs)):
    output = enTagger.outputs[sentIndex]
    taggedSentneces[enTestSents[sentIndex][1]].append((output, enTestSents[sentIndex][2]))
  
  assert len(hiTestSents) == len(hiTagger.outputs)
  for sentIndex in range(len(hiTagger.outputs)):
    output = hiTagger.outputs[sentIndex]
    taggedSentneces[hiTestSents[sentIndex][1]].append((output, hiTestSents[sentIndex][2]))
  
  for sentId in taggedSentneces:
    sent = []
    taggedChunks = sorted(taggedSentneces[sentId], cmp=lambda x, y:int(x[1]) - int(y[1]))
    for chunk in taggedChunks:
      sent.extend(chunk[0])
    taggedSentneces[sentId] = sent
  taggedFB = 'C:\Users\\t-phgad\Documents\Project\Data\\Bigboss\\longPosAutoTags.csv'
  writer = UnicodeWriter(open(taggedFB, 'w'), lineterminator='\n')
  for sent in taggedSentneces.itervalues():
    for line in sent:
      writer.writerow(line)
    writer.writerow(["", ""])

# Script entry point: read the Bigboss CSV, convert it, and run the tagger.
if __name__ == '__main__':
  unsegBigBoss = 'C:\Users\\t-phgad\Documents\Project\Data\Bigboss\FromDocs\CSSentsWithIDs.csv'
  # The numeric arguments 2 and 5 are passed straight to the CSV reader.
  # NOTE(review): presumably column indices - confirm against Reader.
  unsegSentences = Reader.readTaggsedSentencesCSV(unsegBigBoss, 2, 5)
  # Convert the sentences read from the CSV before tagging.
  unsegSentences = map2UniWithLtag(unsegSentences)
  # prepareTestDataUni(bigBossData, uniTestData)
  tagBigboss(unsegSentences)
  
예제 #10
0
def tagFB():
  """Read the annotated FB CSV and pass its sentences to tagTaggedSentences."""
  fbCsvPath = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBPOSAnnotated.csv'
  fbSentences = Reader.readTaggsedSentencesCSV(fbCsvPath, 0, 3)
  # tagUntaggedSentences is the alternative entry point; it is not used here.
  tagTaggedSentences(fbSentences)
예제 #11
0
def tagFB():
  """Read the annotated FB CSV and tag it with manually assigned tagger tags."""
  fbCsvPath = 'C:\Users\\t-phgad\Documents\Project\Data\\FB\\FBPOSAnnotated.csv'
  fbSentences = Reader.readTaggsedSentencesCSV(fbCsvPath, 0, 3)
  # tagWithBigbossTransitions is the alternative decoder; it is not used here.
  tagWithManualTaggerTags(fbSentences)
예제 #12
0
 def loadDictionaries(self):
   """Load the English and Hindi tagged corpora into the guesser's dictionaries.

   Reads the WSJ training file as the English word/tag data and the Hindi
   training file as the Hindi word/tag data, then hands both to
   ``self.learner.guesser.loadDicts``.
   """
   # English data comes from the WSJ corpus and Hindi data from the Hindi
   # corpus; the two paths were previously swapped.
   enTaggedData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\WSJ\\train.uni.txt'
   hiTaggedData = 'C:\Users\\t-phgad\Documents\Project\Data\POSAnnotated\Hindi\\train.uni.txt'
   enWordTags = Reader.readTaggsedSentences(enTaggedData)
   hiWordTags = Reader.readTaggsedSentences(hiTaggedData)
   # NOTE(review): assumes loadDicts expects (English, Hindi) in that order - confirm.
   self.learner.guesser.loadDicts(enWordTags, hiWordTags)
예제 #13
0
 def loadTestSentencesFromFile(self, inputFile):
   """Read tagged sentences from ``inputFile`` and store them as the test set."""
   loadedSentences = Reader.readTaggsedSentences(inputFile)
   self.testSentences = loadedSentences