def train(sentences):
    """Train the HMM-style word-boundary segmentation model.

    Builds four count tables over character bigrams, per state
    ('B' = boundary follows, 'C' = continuation):

    - obs:     state -> bigram -> count (add-one smoothed via default 1)
    - tr:      state -> next-state transition counts
    - prevObs: state -> bigram -> previous-bigram counts
    - uniObs:  state -> "block::block" unicode-block pair key -> count
               (keys built with uniBlock.block on each bigram character)

    Parameters
    ----------
    sentences : iterable of (raw, indices) pairs — ``raw`` is the character
        string, ``indices`` an increasing sequence of boundary positions
        (a boundary after character ``i`` is recorded as ``i + 1``).

    Returns
    -------
    list : ``[obs, tr, prevObs, uniObs]`` — the model consumed by
        ``nextProbas``.
    """
    print("Training the model...")
    obs = defaultdict(lambda: defaultdict(lambda: 1))
    prevObs = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 1)))
    tr = defaultdict(lambda: defaultdict(lambda: 0))
    uniObs = defaultdict(lambda: defaultdict(lambda: 0))
    # NOTE(review): prevBigram deliberately carries over across sentences in
    # the original code (it is not reset to "start" per sentence) — kept
    # as-is; confirm that is intended.
    prevBigram = "start"
    for raw, indices in sentences:
        j = 0  # cursor into the boundary-position list
        cstate = 'B'
        for i in range(len(raw) - 1):
            bigram = raw[i:i + 2]
            # Fix: guard j against running off the end of `indices` — the
            # original raised IndexError once every boundary was consumed.
            if j < len(indices) and indices[j] == i + 1:
                obs['B'][bigram] += 1
                j += 1
                tr[cstate]['B'] += 1
                cstate = 'B'
                prevObs['B'][bigram][prevBigram] += 1
            else:
                obs['C'][bigram] += 1
                tr[cstate]['C'] += 1
                cstate = 'C'
                prevObs['C'][bigram][prevBigram] += 1
            prevBigram = bigram
            uniObs[cstate][uniBlock.block(bigram[0]) + '::' + uniBlock.block(bigram[1])] += 1
    return [obs, tr, prevObs, uniObs]
def nextProbas(model, cstate, bigram, prevBigram):
    """Score candidate next states ('B' vs 'C') for ``bigram``.

    When the bigram is unseen in either state's emission table, a backoff
    estimate is computed from bigrams sharing the same final character
    (add-one smoothed) weighted by the transition counts. If that backoff
    is decisive — one state scores at least 10x the other — it is returned
    directly; otherwise the full transition * emission * unicode-block
    product is used.

    Parameters
    ----------
    model : list
        ``[obs, tr, prevObs, uniObs]`` as produced by ``train``.
    cstate : str
        Current state, 'B' or 'C'.
    bigram : str
        Two-character window being scored.
    prevBigram : str
        Previous bigram (currently unused here).

    Returns
    -------
    dict : ``{'B': score, 'C': score}`` — unnormalised scores.

    NOTE(review): this definition is shadowed by a later redefinition of
    ``nextProbas`` in this file — confirm which version should be active.
    """
    observations = model[0]
    transitions = model[1]
    uniObs = model[3]  # model[2] (prevObs) is unused by this scorer

    decisive = False
    bPb = cPb = 0
    if bigram not in observations['B'] or bigram not in observations['C']:
        # Backoff: add-one count of known bigrams sharing the final char.
        bCoeff = 1 + sum(1 for bg in observations['B'] if bg[1] == bigram[1])
        cCoeff = 1 + sum(1 for bg in observations['C'] if bg[1] == bigram[1])
        bPb = transitions[cstate].get('B', 0) * bCoeff
        cPb = transitions[cstate].get('C', 0) * cCoeff
        denom = max(bPb, cPb)
        # Fix: guard the division — the original raised ZeroDivisionError
        # when both backoff scores were 0 (no transitions seen for cstate).
        if denom == 0 or float(min(bPb, cPb)) / denom < 0.1:
            decisive = True
    if not decisive:
        key = uniBlock.block(bigram[0]) + '::' + uniBlock.block(bigram[1])
        # Fix: use .get (with the training defaultdicts' default values:
        # obs -> 1, tr/uniObs -> 0) instead of indexing, so that scoring a
        # bigram never inserts new entries into the trained model.
        bPb = transitions[cstate].get('B', 0) * observations['B'].get(bigram, 1) * uniObs['B'].get(key, 0)
        cPb = transitions[cstate].get('C', 0) * observations['C'].get(bigram, 1) * uniObs['C'].get(key, 0)
    return {'B': bPb, 'C': cPb}
def nextProbas(model, cstate, bigram, prevBigram):
    """Score candidate next states ('B' vs 'C') for ``bigram``.

    score(s) = tr[cstate][s] * obs[s][bigram] * uniObs[s][block-pair key]
    where the key is ``uniBlock.block(c1) + '::' + uniBlock.block(c2)``.

    NOTE(review): this redefinition shadows the earlier, smoothed
    ``nextProbas`` defined above in the same file — confirm the duplicate
    is intentional and delete whichever version is not meant to be active.

    Parameters
    ----------
    model : list
        ``[obs, tr, prevObs, uniObs]`` as produced by ``train``.
    cstate : str
        Current state, 'B' or 'C'.
    bigram : str
        Two-character window being scored.
    prevBigram : str
        Previous bigram (unused by this version).

    Returns
    -------
    dict : ``{'B': score, 'C': score}`` — unnormalised scores.
    """
    observations = model[0]
    transitions = model[1]
    uniObs = model[3]  # model[2] (prevObs) is unused by this scorer

    key = uniBlock.block(bigram[0]) + '::' + uniBlock.block(bigram[1])
    # Fix: use .get (with the training defaultdicts' default values:
    # obs -> 1, tr/uniObs -> 0) instead of indexing, so that scoring a
    # bigram never inserts new default entries into the trained model.
    bPb = transitions[cstate].get('B', 0) * observations['B'].get(bigram, 1) * uniObs['B'].get(key, 0)
    cPb = transitions[cstate].get('C', 0) * observations['C'].get(bigram, 1) * uniObs['C'].get(key, 0)
    return {'B': bPb, 'C': cPb}