# Shared imports for the model excerpts below. NOTE: `Task`, `Base`,
# `AlignerIBM1` and `evaluate` are this project's own classes/functions;
# their import paths are not visible in this excerpt, so the commented
# imports are placeholders rather than the actual module layout.
import sys
import time
import logging
from math import log
from copy import deepcopy
from collections import defaultdict

import numpy as np

# from models.modelBase import Base, Task                 # placeholder path
# from models.IBM1 import AlignmentModel as AlignerIBM1   # placeholder path
# from evaluators.evaluator import evaluate               # placeholder path


# EM training loop of the IBM Model 1 base model. Subclasses provide the
# _beginningOfIteration / _updateCount / _updateEndOfIteration / endOfEM
# hooks that accumulate and renormalise the fractional counts.
def EM(self, dataset, iterations, modelName="IBM1Base", index=0):
    task = Task("Aligner", modelName + str(iterations))
    self.logger.info("Starting Training Process")
    self.logger.info("Training size: " + str(len(dataset)))
    start_time = time.time()

    for iteration in range(iterations):
        self._beginningOfIteration()
        self.logger.info("Starting Iteration " + str(iteration))
        counter = 0

        for item in dataset:
            f, e = item[0:2]
            counter += 1
            task.progress(modelName + " iter %d, %d of %d" % (
                iteration, counter, len(dataset),))

            for fWord in f:
                # z normalises t(f|e) over all candidate target words.
                z = 0
                for eWord in e:
                    z += self.tProbability(fWord, eWord)
                for eWord in e:
                    self._updateCount(fWord, eWord, z, index)

        self._updateEndOfIteration()

    end_time = time.time()
    self.logger.info("Training Complete, total time(seconds): %f" %
                     (end_time - start_time,))
    self.endOfEM()
    return
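
# ---------------------------------------------------------------------------
# A minimal sketch of the hooks EM() above delegates to. This is NOT the
# repository's actual IBM1 subclass; the class and attribute names
# (`IBM1HooksSketch`, `fe_count`, `eWordCount`) are illustrative assumptions.
# It implements standard IBM Model 1 count collection: in the E-step each
# pair (fWord, eWord) receives fractional count t(f|e) / z, and the M-step
# renormalises the counts per target word. Tokens are assumed to be
# (FORM, POS) tuples, as in the classes below.
# ---------------------------------------------------------------------------
class IBM1HooksSketch(object):
    def __init__(self):
        self.t = defaultdict(lambda: 1.0)   # translation table t(f|e)
        self.fe_count = defaultdict(float)  # fractional counts c(f, e)
        self.eWordCount = defaultdict(float)

    def tProbability(self, fWord, eWord, index=0):
        return self.t[(fWord[index], eWord[index])]

    def _beginningOfIteration(self):
        # Reset accumulators before collecting counts for this iteration.
        self.fe_count.clear()
        self.eWordCount.clear()

    def _updateCount(self, fWord, eWord, z, index=0):
        # E-step: distribute one unit of count for fWord across all
        # candidate target words, proportionally to t(f|e).
        c = self.t[(fWord[index], eWord[index])] / z
        self.fe_count[(fWord[index], eWord[index])] += c
        self.eWordCount[eWord[index]] += c

    def _updateEndOfIteration(self):
        # M-step: t(f|e) = c(f, e) / c(e).
        for (fw, ew) in self.fe_count:
            self.t[(fw, ew)] = self.fe_count[(fw, ew)] / self.eWordCount[ew]

    def endOfEM(self):
        # Nothing to finalise in this sketch.
        pass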
# HMM alignment model extended with alignment types: sProbability() mixes
# FORM-level and POS-level statistics with the lambda interpolation
# weights, and train() runs two stages (POS tags first, then surface form).
class AlignmentModel(Base):
    def __init__(self):
        self.modelName = "HMMWithAlignmentType"
        self.version = "0.1b"
        self.logger = logging.getLogger('HMM')
        self.p0H = 0.3
        self.nullEmissionProb = 0.000005
        self.smoothFactor = 0.1
        self.task = None
        self.evaluate = evaluate
        self.fe = ()

        self.s = defaultdict(list)
        self.sTag = defaultdict(list)
        self.index = 0
        self.typeList = []
        self.typeIndex = {}
        self.typeDist = []
        self.lambd = 1 - 1e-20
        self.lambda1 = 0.9999999999
        self.lambda2 = 9.999900827395436E-11
        self.lambda3 = 1.000000082740371E-15

        self.loadTypeDist = {"SEM": .401, "FUN": .264, "PDE": .004,
                             "CDE": .004, "MDE": .012, "GIS": .205,
                             "GIF": .031, "COI": .008, "TIN": .003,
                             "NTR": .086, "MTA": .002}

        self.modelComponents = ["t", "pi", "a", "eLengthSet",
                                "s", "sTag", "typeList", "typeIndex",
                                "typeDist", "lambd", "lambda1", "lambda2",
                                "lambda3"]
        Base.__init__(self)
        return

    def _beginningOfIteration(self, dataset):
        self.lenDataset = len(dataset)
        self.c_feh = defaultdict(
            lambda: [0.0 for h in range(len(self.typeList))])
        return

    def _updateGamma(self, f, e, gamma, alpha, beta, alphaScale):
        for i in range(len(f)):
            for j in range(len(e)):
                tmpGamma = alpha[i][j] * beta[i][j] / alphaScale[i]
                gamma[i][j] = tmpGamma
                c_feh = self.c_feh[(f[i][self.index], e[j][self.index])]
                for h in range(len(self.typeList)):
                    c_feh[h] += tmpGamma * self.sProbability(f[i], e[j], h)

    def _updateEndOfIteration(self, maxE, delta, gammaSum_0, gammaBiword):
        # Update a
        for targetLen in self.eLengthSet:
            a = self.a[targetLen]
            for prev_j in range(len(a)):
                for j in range(len(a[prev_j])):
                    a[prev_j][j] = 0.0
        for Len in self.eLengthSet:
            for prev_j in range(Len):
                deltaSum = 0.0
                for j in range(Len):
                    deltaSum += delta[Len][prev_j][j]
                for j in range(Len):
                    self.a[Len][prev_j][j] = delta[Len][prev_j][j] /\
                        (deltaSum + 1e-37)
        # Update pi
        for i in range(maxE):
            self.pi[i] = gammaSum_0[i] * (1.0 / self.lenDataset)
        # Update t
        gammaEWord = defaultdict(float)
        for f, e in gammaBiword:
            gammaEWord[e] += gammaBiword[(f, e)]
        self.t.clear()
        for f, e in gammaBiword:
            self.t[(f, e)] = gammaBiword[(f, e)] / (gammaEWord[e] + 1e-37)
        # Update s (FORM statistics) or sTag (POS statistics), depending
        # on the current training stage.
        s = self.s if self.index == 0 else self.sTag
        for (f, e) in self.c_feh:
            c_feh = self.c_feh[(f, e)]
            sTmp = s[(f, e)]
            gammaTmp = gammaBiword[(f, e)]
            for h in range(len(self.typeList)):
                sTmp[h] = c_feh[h] / gammaTmp
        self.fe = ()
        return

    def endOfBaumWelch(self):
        # Smoothing for target sentences of unencountered length
        for targetLen in self.eLengthSet:
            a = self.a[targetLen]
            for prev_j in range(targetLen):
                for j in range(targetLen):
                    a[prev_j][j] *= 1 - self.p0H
        for targetLen in self.eLengthSet:
            a = self.a[targetLen]
            for prev_j in range(targetLen):
                for j in range(targetLen):
                    a[prev_j][prev_j + targetLen] = self.p0H
                    a[prev_j + targetLen][prev_j + targetLen] = self.p0H
                    a[prev_j + targetLen][j] = a[prev_j][j]
        return

    def sProbability(self, f, e, h):
        fWord, fTag = f
        eWord, eTag = e
        # Cache the s/sTag lookups for the current (f, e) pair.
        if self.fe != (f, e):
            self.fe, sKey, sTagKey = (f, e), (f[0], e[0]), (f[1], e[1])
            self.sTmp = self.s[sKey] if sKey in self.s else None
            self.sTagTmp = self.sTag[sTagKey] if sTagKey in self.sTag \
                else None
        sTmp = self.sTmp[h] if self.sTmp else 0
        sTagTmp = self.sTagTmp[h] if self.sTagTmp else 0
        if self.index == 0:
            p1 = (1 - self.lambd) * self.typeDist[h] + self.lambd * sTmp
            p2 = (1 - self.lambd) * self.typeDist[h] + self.lambd * sTagTmp
            p3 = self.typeDist[h]
            return self.lambda1 * p1 + self.lambda2 * p2 + self.lambda3 * p3
        else:
            return (1 - self.lambd) * self.typeDist[h] + self.lambd * sTagTmp

    def trainWithIndex(self, dataset, iterations, index):
        self.index = index
        alignerIBM1 = AlignerIBM1()
        alignerIBM1.initialiseBiwordCount(dataset, index)
        alignerIBM1.EM(dataset, iterations, 'IBM1', index)
        self.task.progress("IBM model Trained")
        self.logger.info("IBM model Trained")
        self.logger.info("Initialising HMM")
        self.initialiseBiwordCount(dataset, index)
        if self.index == 1:
            self.sTag = self.calculateS(dataset, self.fe_count, index)
        else:
            self.s = self.calculateS(dataset, self.fe_count, index)
        self.t = alignerIBM1.t
        self.logger.info("HMM Initialised, start training")
        self.baumWelch(dataset, iterations=iterations, index=index)
        self.task.progress("HMM finalising")
        return

    def train(self, dataset, iterations=5):
        self.task = Task("Aligner", "HMMOI" + str(iterations))
        self.logger.info("Loading alignment type distribution")
        self.initialiseAlignTypeDist(dataset, self.loadTypeDist)
        self.logger.info("Alignment type distribution loaded")

        self.task.progress("Stage 1 Training With POS Tags")
        self.logger.info("Stage 1 Training With POS Tags")
        self.trainWithIndex(dataset, iterations, 1)

        self.task.progress("Stage 2 Training With FORM")
        self.logger.info("Stage 2 Training With FORM")
        self.trainWithIndex(dataset, iterations, 0)

        self.logger.info("Training Complete")
        self.task = None
        return

    def logViterbi(self, f, e):
        fLen, eLen = len(f), len(e)
        # Extend e with null states so that f words can align to nothing.
        e = deepcopy(e)
        for i in range(eLen):
            e.append(("null", "null"))
        score = np.zeros((fLen, eLen * 2, len(self.typeList)))
        prev_j = np.zeros((fLen, eLen * 2, len(self.typeList)))
        prev_h = np.zeros((fLen, eLen * 2, len(self.typeList)))

        for j in range(len(e)):
            tPr = log(self.tProbability(f[0], e[j]))
            for h in range(len(self.typeList)):
                score[0][j][h] = log(self.sProbability(f[0], e[j], h)) + tPr
                if j < len(self.pi) and self.pi[j] != 0:
                    score[0][j][h] += log(self.pi[j])
                else:
                    score[0][j][h] = -sys.maxsize - 1

        for i in range(1, fLen):
            for j in range(len(e)):
                maxScore = -sys.maxsize - 1
                jPrevBest = -sys.maxsize - 1
                hPrevBest = 0
                tPr = log(self.tProbability(f[i], e[j]))
                for jPrev in range(len(e)):
                    aPrPreLog = self.aProbability(jPrev, j, eLen)
                    if aPrPreLog == 0:
                        continue
                    aPr = log(aPrPreLog)
                    for h in range(len(self.typeList)):
                        temp = score[i - 1][jPrev][h] + aPr + tPr
                        if temp > maxScore:
                            maxScore = temp
                            jPrevBest = jPrev
                            hPrevBest = h
                for h in range(len(self.typeList)):
                    s = self.sProbability(f[i], e[j], h)
                    if s != 0:
                        temp_s = log(s)
                        score[i][j][h] = maxScore + temp_s
                        prev_j[i][j][h] = jPrevBest
                        prev_h[i][j][h] = hPrevBest

        maxScore = -sys.maxsize - 1
        best_j = best_h = 0
        for j in range(len(e)):
            for h in range(len(self.typeList)):
                if score[fLen - 1][j][h] > maxScore:
                    maxScore = score[fLen - 1][j][h]
                    best_j, best_h = j, h

        # Backtrack the best path.
        trace = [(best_j + 1, best_h), ]
        j, h = best_j, best_h
        i = fLen - 1
        while i > 0:
            j, h = int(prev_j[i][j][h]), int(prev_h[i][j][h])
            trace = [(j + 1, h)] + trace
            i = i - 1
        return trace
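
# ---------------------------------------------------------------------------
# Usage sketch for the alignment-type model. Assumptions: each sentence is
# an (f, e, alignment) triple, each token is a (FORM, POS) tuple (this is
# what the index-0/index-1 lookups above imply), and the tiny dataset below
# is fabricated for illustration only.
#
#   model = AlignmentModel()
#   dataset = [([("le", "DET"), ("chat", "NOUN")],
#               [("the", "DET"), ("cat", "NOUN")],
#               [(1, 1, "SEM"), (2, 2, "SEM")])]
#   model.train(dataset, iterations=5)
#   model.decodeSentence(dataset[0])
#   # decodeSentence (defined on the base class below) returns 1-based
#   # (source position, target position, alignment type) triples.
# ---------------------------------------------------------------------------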
# Base class shared by the HMM aligners: scaled forward-backward, the
# Baum-Welch outer loop, Viterbi decoding, and the fallback probability
# lookups. The model-specific E/M-step hooks raise NotImplementedError and
# are supplied by the subclasses.
class AlignmentModelBase(Base):
    def __init__(self):
        if "nullEmissionProb" not in vars(self):
            self.nullEmissionProb = 0.000005
        if "task" not in vars(self):
            self.task = None

        if "t" not in vars(self):
            self.t = defaultdict(float)
        if "eLengthSet" not in vars(self):
            self.eLengthSet = defaultdict(int)
        if "a" not in vars(self):
            self.a = [[[]]]
        if "pi" not in vars(self):
            self.pi = []
        if "logger" not in vars(self):
            self.logger = logging.getLogger('HMMBASE')
        if "modelComponents" not in vars(self):
            self.modelComponents = ["t", "pi", "a", "eLengthSet"]
        Base.__init__(self)
        return

    def initialiseParameter(self, Len):
        doubleLen = 2 * Len
        tmp = 1.0 / Len
        for z in range(Len):
            for y in range(Len):
                for x in range(Len + 1):
                    self.a[x][z][y] = tmp
        tmp = 1.0 / doubleLen
        for x in range(Len):
            self.pi[x] = tmp
        return

    def forwardBackward(self, f, e, tSmall, a):
        alpha = [[0.0 for x in range(len(e))] for y in range(len(f))]
        alphaScale = [0.0 for x in range(len(f))]
        alphaSum = 0

        for j in range(len(e)):
            alpha[0][j] = self.pi[j] * tSmall[0][j]
            alphaSum += alpha[0][j]

        alphaScale[0] = 1.0 / alphaSum
        for j in range(len(e)):
            alpha[0][j] *= alphaScale[0]

        for i in range(1, len(f)):
            alphaSum = 0
            for j in range(len(e)):
                total = 0
                for prev_j in range(len(e)):
                    total += alpha[i - 1][prev_j] * a[prev_j][j]
                alpha[i][j] = tSmall[i][j] * total
                alphaSum += alpha[i][j]

            alphaScale[i] = 1.0 / alphaSum
            for j in range(len(e)):
                alpha[i][j] = alphaScale[i] * alpha[i][j]

        beta = [[0.0 for x in range(len(e))] for y in range(len(f))]
        for j in range(len(e)):
            beta[len(f) - 1][j] = alphaScale[len(f) - 1]

        for i in range(len(f) - 2, -1, -1):
            for j in range(len(e)):
                total = 0
                for next_j in range(len(e)):
                    total += (beta[i + 1][next_j] *
                              a[j][next_j] *
                              tSmall[i + 1][next_j])
                beta[i][j] = alphaScale[i] * total

        return alpha, alphaScale, beta

    def maxTargetSentenceLength(self, dataset):
        maxLength = 0
        eLengthSet = defaultdict(int)
        for (f, e, alignment) in dataset:
            tempLength = len(e)
            if tempLength > maxLength:
                maxLength = tempLength
            eLengthSet[tempLength] += 1
        return (maxLength, eLengthSet)

    def baumWelch(self, dataset, iterations=5, index=0):
        if not self.task:
            self.task = Task("Aligner", "HMMBaumWelchOI" + str(iterations))
        self.logger.info("Starting Training Process")
        self.logger.info("Training size: " + str(len(dataset)))
        startTime = time.time()

        maxE, self.eLengthSet = self.maxTargetSentenceLength(dataset)
        self.logger.info("Maximum Target sentence length: " + str(maxE))
        self.a = [[[0.0 for x in range(maxE * 2)] for y in range(maxE * 2)]
                  for z in range(maxE + 1)]
        self.pi = [0.0 for x in range(maxE * 2)]

        for iteration in range(iterations):
            self.logger.info("BaumWelch Iteration " + str(iteration))
            logLikelihood = 0

            gamma = [[0.0 for x in range(maxE)] for y in range(maxE * 2)]
            gammaBiword = defaultdict(float)
            gammaSum_0 = [0.0 for x in range(maxE)]
            delta = [[[0.0 for x in range(maxE)] for y in range(maxE)]
                     for z in range(maxE + 1)]
            self._beginningOfIteration(dataset)

            counter = 0
            for (f, e, alignment) in dataset:
                self.task.progress("BaumWelch iter %d, %d of %d" %
                                   (iteration, counter, len(dataset),))
                counter += 1
                if iteration == 0:
                    self.initialiseParameter(len(e))

                fLen, eLen = len(f), len(e)
                a = self.a[eLen]
                tSmall = [[self.t[(f[i][index], e[j][index])]
                           for j in range(eLen)] for i in range(fLen)]

                alpha, alphaScale, beta = \
                    self.forwardBackward(f, e, tSmall, a)

                # Update logLikelihood
                for i in range(fLen):
                    logLikelihood -= log(alphaScale[i])

                # Setting gamma
                self._updateGamma(f, e, gamma, alpha, beta, alphaScale)

                for i in range(fLen):
                    for j in range(eLen):
                        gammaBiword[(f[i][index], e[j][index])] += \
                            gamma[i][j]

                for j in range(eLen):
                    gammaSum_0[j] += gamma[0][j]

                # Update delta; c accumulates expected counts for each
                # jump width (j - prev_j).
                c = [0.0 for i in range(eLen * 2)]
                for i in range(1, fLen):
                    for prev_j in range(eLen):
                        for j in range(eLen):
                            c[eLen - 1 + j - prev_j] += \
                                (alpha[i - 1][prev_j] *
                                 beta[i][j] *
                                 a[prev_j][j] *
                                 tSmall[i][j])

                for prev_j in range(eLen):
                    for j in range(eLen):
                        delta[eLen][prev_j][j] += c[eLen - 1 + j - prev_j]
            # end of loop over dataset

            self.logger.info("likelihood " + str(logLikelihood))
            # M-Step
            self._updateEndOfIteration(maxE, delta, gammaSum_0, gammaBiword)

        self.endOfBaumWelch()
        endTime = time.time()
        self.logger.info("Training Complete, total time(seconds): %f" %
                         (endTime - startTime,))
        return

    def _beginningOfIteration(self, dataset):
        # self.lenDataset = len(dataset)
        # return
        raise NotImplementedError

    def _updateGamma(self, f, e, gamma, alpha, beta, alphaScale):
        # for i in range(len(f)):
        #     for j in range(len(e)):
        #         gamma[i][j] = alpha[i][j] * beta[i][j] / alphaScale[i]
        raise NotImplementedError

    def _updateEndOfIteration(self, maxE, delta, gammaSum_0, gammaBiword):
        # self.t.clear()
        # for Len in self.eLengthSet:
        #     for prev_j in range(Len):
        #         deltaSum = 0.0
        #         for j in range(Len):
        #             deltaSum += delta[Len][prev_j][j]
        #         for j in range(Len):
        #             self.a[Len][prev_j][j] = delta[Len][prev_j][j] /\
        #                 (deltaSum + 1e-37)
        # for i in range(maxE):
        #     self.pi[i] = gammaSum_0[i] * (1.0 / self.lenDataset)
        # gammaEWord = defaultdict(float)
        # for f, e in gammaBiword:
        #     gammaEWord[e] += gammaBiword[(f, e)]
        # for f, e in gammaBiword:
        #     self.t[(f, e)] = gammaBiword[(f, e)] / (gammaEWord[e] + 1e-37)
        # return
        raise NotImplementedError

    def endOfBaumWelch(self):
        # Apply final smoothing here
        raise NotImplementedError

    def tProbability(self, f, e, index=0):
        # Fallback for unseen pairs is uniform over a fixed
        # vocabulary-size constant.
        v = 163303
        if (f[index], e[index]) in self.t:
            return self.t[(f[index], e[index])]
        if e[index] == "null":
            return self.nullEmissionProb
        return 1.0 / v

    def aProbability(self, prev_j, j, targetLength):
        if targetLength in self.eLengthSet:
            return self.a[targetLength][prev_j][j]
        return 1.0 / targetLength

    def logViterbi(self, f, e):
        # Extend e with null states so that f words can align to nothing.
        e = deepcopy(e)
        fLen, eLen = len(f), len(e)
        for i in range(eLen):
            e.append(("null", "null"))
        score = np.zeros((fLen, eLen * 2))
        prev_j = np.zeros((fLen, eLen * 2))

        for i in range(fLen):
            for j in range(eLen * 2):
                score[i][j] = log(self.tProbability(f[i], e[j]))
                if i == 0:
                    if j < len(self.pi) and self.pi[j] != 0:
                        score[i][j] += log(self.pi[j])
                    else:
                        score[i][j] = -sys.maxsize - 1
                else:
                    # Find the best alignment for f[i - 1]
                    maxScore = -sys.maxsize - 1
                    bestPrev_j = -sys.maxsize - 1
                    for jPrev in range(eLen * 2):
                        aPr = self.aProbability(jPrev, j, eLen)
                        if aPr == 0:
                            continue
                        temp = score[i - 1][jPrev] + log(aPr)
                        if temp > maxScore:
                            maxScore = temp
                            bestPrev_j = jPrev

                    score[i][j] += maxScore
                    prev_j[i][j] = bestPrev_j

        maxScore = -sys.maxsize - 1
        best_j = 0
        for j in range(eLen * 2):
            if score[fLen - 1][j] > maxScore:
                maxScore = score[fLen - 1][j]
                best_j = j

        # Backtrack the best path.
        trace = [(best_j + 1, )]
        i = fLen - 1
        j = best_j
        while i > 0:
            j = int(prev_j[i][j])
            trace = [(j + 1, )] + trace
            i = i - 1
        return trace

    def decodeSentence(self, sentence):
        f, e, alignment = sentence
        sentenceAlignment = []
        bestAlign = self.logViterbi(f, e)

        for i in range(len(bestAlign)):
            # Positions beyond len(e) are the extended null states; skip.
            if bestAlign[i][0] <= len(e):
                if len(bestAlign[i]) > 1 and "typeList" in vars(self):
                    sentenceAlignment.append(
                        (i + 1, bestAlign[i][0],
                         self.typeList[bestAlign[i][1]]))
                else:
                    sentenceAlignment.append((i + 1, bestAlign[i][0]))
        return sentenceAlignment
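
# ---------------------------------------------------------------------------
# A standalone sanity-check sketch for the scaling trick used in
# forwardBackward() and baumWelch() above: with
# alphaScale[i] = 1 / sum_j(alpha[i][j]), the per-sentence log-likelihood
# is -sum_i(log(alphaScale[i])). The helper name and toy numbers below are
# fabricated for illustration; only the identity itself is being checked.
# ---------------------------------------------------------------------------
def _scalingSanityCheck():
    from itertools import product

    pi = [0.6, 0.4]                      # initial distribution, 2 states
    a = [[0.7, 0.3], [0.4, 0.6]]         # transition probabilities
    tSmall = [[0.5, 0.1], [0.2, 0.3], [0.1, 0.4]]  # emissions, 3 steps

    # Scaled forward pass, mirroring forwardBackward().
    alpha = [pi[j] * tSmall[0][j] for j in range(2)]
    logLikelihood = 0.0
    for i in range(3):
        if i > 0:
            alpha = [tSmall[i][j] *
                     sum(alpha[p] * a[p][j] for p in range(2))
                     for j in range(2)]
        scale = 1.0 / sum(alpha)
        alpha = [x * scale for x in alpha]
        logLikelihood -= log(scale)

    # Brute force: sum the joint probability over all state sequences.
    total = 0.0
    for path in product(range(2), repeat=3):
        p = pi[path[0]] * tSmall[0][path[0]]
        for i in range(1, 3):
            p *= a[path[i - 1]][path[i]] * tSmall[i][path[i]]
        total += p

    assert abs(logLikelihood - log(total)) < 1e-12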
class AlignmentModel(Base):
    def __init__(self):
        self.modelName = "HMM"
        self.version = "0.1b"
        self.logger = logging.getLogger('HMM')
        self.p0H = 0.3
        self.nullEmissionProb = 0.000005
        self.smoothFactor = 0.1
        self.task = None
        self.evaluate = evaluate

        self.modelComponents = ["t", "pi", "a", "eLengthSet"]
        Base.__init__(self)
        return

    def _beginningOfIteration(self, dataset):
        self.lenDataset = len(dataset)
        return

    def _updateGamma(self, f, e, gamma, alpha, beta, alphaScale):
        for i in range(len(f)):
            for j in range(len(e)):
                gamma[i][j] = alpha[i][j] * beta[i][j] / alphaScale[i]

    def _updateEndOfIteration(self, maxE, delta, gammaSum_0, gammaBiword):
        # Update a
        for Len in self.eLengthSet:
            for prev_j in range(Len):
                deltaSum = 0.0
                for j in range(Len):
                    deltaSum += delta[Len][prev_j][j]
                for j in range(Len):
                    self.a[Len][prev_j][j] = delta[Len][prev_j][j] /\
                        (deltaSum + 1e-37)
        # Update pi
        for i in range(maxE):
            self.pi[i] = gammaSum_0[i] * (1.0 / self.lenDataset)
        # Update t
        gammaEWord = defaultdict(float)
        for f, e in gammaBiword:
            gammaEWord[e] += gammaBiword[(f, e)]
        self.t.clear()
        for f, e in gammaBiword:
            self.t[(f, e)] = gammaBiword[(f, e)] / (gammaEWord[e] + 1e-37)
        return

    def endOfBaumWelch(self):
        # Smoothing for target sentences of unencountered length
        for targetLen in self.eLengthSet:
            a = self.a[targetLen]
            for prev_j in range(targetLen):
                for j in range(targetLen):
                    a[prev_j][j] *= 1 - self.p0H
        for targetLen in self.eLengthSet:
            a = self.a[targetLen]
            for prev_j in range(targetLen):
                for j in range(targetLen):
                    a[prev_j][prev_j + targetLen] = self.p0H
                    a[prev_j + targetLen][prev_j + targetLen] = self.p0H
                    a[prev_j + targetLen][j] = a[prev_j][j]
        return

    def train(self, dataset, iterations):
        self.task = Task("Aligner", "HMMOI" + str(iterations))
        self.task.progress("Training IBM model 1")
        self.logger.info("Training IBM model 1")
        alignerIBM1 = AlignerIBM1()
        alignerIBM1.train(dataset, iterations)
        self.t = alignerIBM1.t
        self.task.progress("IBM model Trained")
        self.logger.info("IBM model Trained")
        self.baumWelch(dataset, iterations=iterations)
        self.task.progress("finalising")
        self.task = None
        return
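
# ---------------------------------------------------------------------------
# Usage sketch for the plain HMM model. Assumptions: as above, tokens are
# (FORM, POS) tuples and each sentence is an (f, e, alignment) triple; the
# alignment field is unpacked by baumWelch() but not used by it, so it can
# be empty at decode time. `trainDataset` / `testDataset` are placeholders.
#
#   model = AlignmentModel()
#   model.train(trainDataset, iterations=5)
#   for sentence in testDataset:
#       print(model.decodeSentence(sentence))  # [(i, j), ...], 1-based
# ---------------------------------------------------------------------------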