示例#1
0
    def __init__(self, corpus, epochCount=10,
                 indexAlignmentPath="c:/Users/Administrator/Desktop/a.txt",
                 wordAlignmentPath="c:/Users/Administrator/Desktop/a.str.txt"):
        """Estimate word translation probabilities with EM, then align the corpus.

        The constructor fills ``self.srcTgtProbs`` with one probability per
        (source word, target word) pair, reports the strongest entries through
        ``printTopResultsInDic`` and finally picks, for every source word of
        every sentence pair, the target word of the same pair with the highest
        probability.  Both views of the alignment (by position and by word)
        are printed and written to a file.

        ``self.srcTgtProbs``, ``self.srcTgtCounts``, ``self.srcCounts`` and
        ``self.tgtNormTerms`` are used as dictionaries but are not created
        here, so they must already exist on the instance.
        NOTE(review): presumably they are class-level dicts -- confirm.

        Args:
            corpus: object providing ``getEntryCount()``, ``getEntryAt(i)``
                (an indexable entry whose items 0 and 1 are the whitespace
                separated source and target sentences),
                ``getTargetLanguageWordCount()``,
                ``getAllSourceLanguageWords()`` and
                ``getAllTargetLanguageWords()``.
            epochCount: number of EM iterations; the default is the value
                that used to be hard-coded.
            indexAlignmentPath: file that receives one line per sentence pair
                made of ``srcIndex-tgtIndex`` tokens.
            wordAlignmentPath: file that receives the same alignments spelled
                with the words themselves (``srcWord-tgtWord``).
        """
        tgtLangVocabCount = corpus.getTargetLanguageWordCount()

        # Fetch and tokenise every entry once; training and decoding below
        # all iterate over this list instead of re-splitting the sentences.
        sentencePairs = []
        for pairIdx in range(corpus.getEntryCount()):
            curPair = corpus.getEntryAt(pairIdx)
            sentencePairs.append((curPair[0].split(), curPair[1].split()))

        # INIT: uniform translation probability for every co-occurring pair.
        for srcWords, tgtWords in sentencePairs:
            for srcWord in srcWords:
                for tgtWord in tgtWords:
                    self.srcTgtProbs[(srcWord, tgtWord)] = 1.0 / tgtLangVocabCount

        uniqueWordsInSrcVocab = corpus.getAllSourceLanguageWords()
        uniqueWordsInTgtVocab = corpus.getAllTargetLanguageWords()

        print("Number of unique source words = " + str(len(uniqueWordsInSrcVocab)))
        print("Number of unique target words = " + str(len(uniqueWordsInTgtVocab)))

        # LOOP: epochs (a fixed number of iterations, no convergence test).
        for _ in range(epochCount):
            # Reset both count tables.  This per-epoch reset is the only one
            # needed: a separate zeroing pass before the loop would be
            # overwritten here before any count is read.
            for srcWord in uniqueWordsInSrcVocab:
                self.srcCounts[srcWord] = 0.0
                for tgtWord in uniqueWordsInTgtVocab:
                    self.srcTgtCounts[(srcWord, tgtWord)] = 0.0

            # E-step: distribute one unit of count per target token over the
            # source words of the same sentence pair.
            for srcWords, tgtWords in sentencePairs:
                for tgtWord in tgtWords:
                    # Normalisation term: total probability mass this target
                    # word receives from the source words of the pair.
                    normTerm = sum(self.srcTgtProbs[(srcWord, tgtWord)]
                                   for srcWord in srcWords)
                    self.tgtNormTerms[tgtWord] = normTerm
                    for srcWord in srcWords:
                        inc = self.srcTgtProbs[(srcWord, tgtWord)] / normTerm
                        self.srcTgtCounts[(srcWord, tgtWord)] += inc
                        self.srcCounts[srcWord] += inc

            print("now estimating trans probs")
            # M-step: renormalise the collected counts per source word.
            for srcWord in uniqueWordsInSrcVocab:
                srcTotal = self.srcCounts[srcWord]
                for tgtWord in uniqueWordsInTgtVocab:
                    if srcTotal > 0:
                        prob = self.srcTgtCounts[(srcWord, tgtWord)] / srcTotal
                    else:
                        # A source word that collected no counts (e.g. it only
                        # occurs next to empty target sentences) would divide
                        # by zero; give it zero probability instead.
                        prob = 0.0
                    self.srcTgtProbs[(srcWord, tgtWord)] = prob
            print("estimation done")
        printTopResultsInDic(self.srcTgtProbs, 0.5)

        # DECODING: one pass yields both the index-based and the word-based
        # alignment of every sentence pair.
        alignments = []
        alignmentsText = []
        for srcWords, tgtWords in sentencePairs:
            indexAlignment = []
            wordAlignment = []
            for srcWordIdx, srcWord in enumerate(srcWords):
                bestTgtWordIdx = None
                bestSrcTgtProb = 0.0
                for tgtWordIdx, tgtWord in enumerate(tgtWords):
                    transProb = self.srcTgtProbs[(srcWord, tgtWord)]
                    # Strict '>' keeps the first of several equally good words.
                    if transProb > bestSrcTgtProb:
                        bestSrcTgtProb = transProb
                        bestTgtWordIdx = tgtWordIdx
                if bestTgtWordIdx is None:
                    # No target word with positive probability: keep the
                    # historical fallbacks (index 0 / empty word).
                    indexAlignment.append([srcWordIdx, 0])
                    wordAlignment.append([srcWord, ""])
                else:
                    indexAlignment.append([srcWordIdx, bestTgtWordIdx])
                    wordAlignment.append([srcWord, tgtWords[bestTgtWordIdx]])
            alignments.append(indexAlignment)
            alignmentsText.append(wordAlignment)

        print(alignments)
        outputStr = "".join(
            "".join(str(word[0]) + "-" + str(word[1]) + " " for word in sentence) + "\n"
            for sentence in alignments)
        # 'with' closes the file, so the data is flushed even on errors.
        with open(indexAlignmentPath, "w") as outputFile:
            outputFile.write(outputStr)

        print(alignmentsText)
        outputStr2 = "".join(
            "".join(word[0] + "-" + word[1] + " " for word in sentence) + "\n"
            for sentence in alignmentsText)
        with open(wordAlignmentPath, "w") as outputFile:
            outputFile.write(outputStr2)
示例#2
0
    def __init__(self, corpus, epochCount=10):
        """Train word translation probabilities on ``corpus`` with timing output.

        The cyclic garbage collector is switched off for the duration of the
        training (presumably to avoid collector passes while the large
        dictionaries are filled -- confirm) and is switched back on afterwards
        if it was on before, also when training raises.

        ``self.srcTgtProbs``, ``self.srcTgtCounts``, ``self.srcCounts`` and
        ``self.tgtNormTerms`` are used as dictionaries but are not created
        here, so they must already exist on the instance.
        NOTE(review): presumably they are class-level dicts -- confirm.

        Args:
            corpus: object providing ``getEntryCount()``, ``getEntryAt(i)``
                (an indexable entry whose items 0 and 1 are the whitespace
                separated source and target sentences),
                ``getTargetLanguageWordCount()``,
                ``getAllSourceLanguageWords()`` and
                ``getAllTargetLanguageWords()``.
            epochCount: number of EM iterations; the default is the value
                that used to be hard-coded.
        """
        gcWasEnabled = gc.isenabled()
        gc.disable()
        try:
            self._trainTranslationProbs(corpus, epochCount)
        finally:
            # gc.disable() is process-wide; do not leave it off for the caller.
            if gcWasEnabled:
                gc.enable()

    def _resetCounts(self, srcVocab, tgtVocab):
        """Set every pair count and every per-source-word total back to 0.0."""
        for srcWord in srcVocab:
            self.srcCounts[srcWord] = 0.0
            for tgtWord in tgtVocab:
                self.srcTgtCounts[(srcWord, tgtWord)] = 0.0

    def _trainTranslationProbs(self, corpus, epochCount):
        """Run ``epochCount`` EM iterations and print progress and timings."""
        tgtLangVocabCount = corpus.getTargetLanguageWordCount()

        # Fetch and tokenise every entry once instead of once per epoch.
        sentencePairs = []
        for pairIdx in range(corpus.getEntryCount()):
            curPair = corpus.getEntryAt(pairIdx)
            sentencePairs.append((curPair[0].split(), curPair[1].split()))

        # INIT: uniform translation probability for every co-occurring pair.
        for srcWords, tgtWords in sentencePairs:
            for srcWord in srcWords:
                for tgtWord in tgtWords:
                    self.srcTgtProbs[(srcWord, tgtWord)] = 1.0 / tgtLangVocabCount

        uniqueWordsInSrcVocab = corpus.getAllSourceLanguageWords()
        uniqueWordsInTgtVocab = corpus.getAllTargetLanguageWords()

        print("Number of unique source words = " + str(len(uniqueWordsInSrcVocab)))
        print("Number of unique target words = " + str(len(uniqueWordsInTgtVocab)))

        # Timed zeroing pass.  The results do not depend on it (each epoch
        # resets the counts again); it is kept for its timing report.
        print("loop start")
        t1 = time.time()
        self._resetCounts(uniqueWordsInSrcVocab, uniqueWordsInTgtVocab)
        print("loop end")
        t2 = time.time()
        print(str(t2 - t1) + "\n")

        # LOOP: epochs (a fixed number of iterations, no convergence test).
        for epochIdx in range(epochCount):
            self._resetCounts(uniqueWordsInSrcVocab, uniqueWordsInTgtVocab)

            t3 = time.time()
            # E-step: distribute one unit of count per target token over the
            # source words of the same sentence pair.
            for srcWords, tgtWords in sentencePairs:
                for tgtWord in tgtWords:
                    # Normalisation term: total probability mass this target
                    # word receives from the source words of the pair.
                    normTerm = sum(self.srcTgtProbs[(srcWord, tgtWord)]
                                   for srcWord in srcWords)
                    self.tgtNormTerms[tgtWord] = normTerm
                    for srcWord in srcWords:
                        inc = self.srcTgtProbs[(srcWord, tgtWord)] / normTerm
                        self.srcTgtCounts[(srcWord, tgtWord)] += inc
                        self.srcCounts[srcWord] += inc
            t4 = time.time()
            print(str(epochIdx) + "th epoch ran for " + str(t4 - t3) + "\n")

            t5 = time.time()
            print("now estimating trans probs")
            # M-step: renormalise the collected counts per source word.
            for srcWord in uniqueWordsInSrcVocab:
                srcTotal = self.srcCounts[srcWord]
                for tgtWord in uniqueWordsInTgtVocab:
                    if srcTotal > 0:
                        prob = self.srcTgtCounts[(srcWord, tgtWord)] / srcTotal
                    else:
                        # A source word that collected no counts would divide
                        # by zero; give it zero probability instead.
                        prob = 0.0
                    self.srcTgtProbs[(srcWord, tgtWord)] = prob
            print("estimation done")
            t6 = time.time()
            print(str(t6 - t5) + "\n")
        printTopResultsInDic(self.srcTgtProbs, 0.09)