Exemplo n.º 1
0
def train(sentences):
	"""Collect the count tables used by the 'B'/'C' bigram tagger.

	Each item of `sentences` is a `(raw, indices)` pair. `raw` is scanned
	one character bigram at a time (`raw[i:i+2]`). The bigram at offset `i`
	is labelled 'B' when the next unconsumed entry of `indices` equals
	`i + 1`, otherwise 'C'.
	NOTE(review): presumably `indices` holds ascending boundary offsets and
	'B'/'C' mean boundary/continuation -- confirm against the caller.

	Once `indices` is exhausted, every remaining bigram is labelled 'C'
	(previously this raised IndexError).

	Returns a list `[obs, tr, prevObs, uniObs]`:
	  obs[state][bigram]            -- label counts, unseen entries read as 1
	  tr[prev_state][state]         -- transition counts, unseen read as 0
	  prevObs[state][bigram][prev]  -- counts keyed by the preceding bigram,
	                                   unseen entries read as 1
	  uniObs[state][blockA::blockB] -- counts keyed by the `uniBlock.block`
	                                   value of each character, unseen read as 0
	"""
	print("Training the model...")
	obs = defaultdict(lambda: defaultdict(lambda: 1))
	prevObs = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 1)))
	tr = defaultdict(lambda: defaultdict(lambda: 0))
	uniObs = defaultdict(lambda: defaultdict(lambda: 0))

	# NOTE(review): the previous-bigram context is initialised once and is
	# carried over from one sentence to the next -- confirm this is intended.
	prevBigram = "start"
	for raw, indices in sentences:
		j = 0
		cstate = 'B'  # every sentence starts in state 'B'
		for i in range(len(raw) - 1):
			bigram = raw[i:i + 2]
			# Guard against running past the end of `indices`.
			if j < len(indices) and indices[j] == i + 1:
				label = 'B'
				j += 1
			else:
				label = 'C'

			obs[label][bigram] += 1
			tr[cstate][label] += 1
			prevObs[label][bigram][prevBigram] += 1
			uniObs[label][uniBlock.block(bigram[0]) + '::' + uniBlock.block(bigram[1])] += 1

			cstate = label
			prevBigram = bigram

	return [obs, tr, prevObs, uniObs]
Exemplo n.º 2
0
def nextProbas(model,cstate,bigram,prevBigram):
	"""Score the candidate next states 'B' and 'C' for `bigram`.

	`model` is indexed positionally: `model[0]` observation counts,
	`model[1]` transition counts, `model[3]` unicode-block counts.
	`prevBigram` is accepted but not used by this function.

	If `bigram` is missing from the 'B' or the 'C' observation table, a
	back-off score is computed first: the transition count out of `cstate`
	times (1 + the number of known bigrams of that state sharing the same
	second character). When the smaller back-off score is under 10% of the
	larger one, the back-off scores are returned as they are.

	When the larger back-off score is zero, the ratio is undefined and the
	back-off scores are returned as they are instead of dividing by zero.
	With non-negative counts both scores are then zero, and the full score
	below multiplies by those same zero transition counts.

	Otherwise each score is
	`transitions[cstate][s] * observations[s][bigram] * uniObs[s][key]`,
	where `key` joins the `uniBlock.block` value of both characters
	with '::'.

	Returns a dict `{'B': score, 'C': score}`; the scores are unnormalised.
	"""
	observations = model[0]
	transitions = model[1]
	uniObs = model[3]

	if bigram not in observations['B'] or bigram not in observations['C']:
		# Back off to bigrams that end with the same character.
		bCoeff = 1 + sum(1 for bg in observations['B'] if bigram[1] == bg[1])
		cCoeff = 1 + sum(1 for bg in observations['C'] if bigram[1] == bg[1])
		bPb = transitions[cstate]['B'] * bCoeff
		cPb = transitions[cstate]['C'] * cCoeff

		strongest = max(bPb, cPb)
		# strongest == 0: nothing to compare, and the ratio would divide by zero.
		if strongest == 0 or abs(float(min(bPb, cPb)) / strongest) < 0.1:
			return {'B': bPb, 'C': cPb}

	blockKey = uniBlock.block(bigram[0]) + '::' + uniBlock.block(bigram[1])
	bPb = transitions[cstate]['B'] * observations['B'][bigram] * uniObs['B'][blockKey]
	cPb = transitions[cstate]['C'] * observations['C'][bigram] * uniObs['C'][blockKey]

	return {'B': bPb, 'C': cPb}
Exemplo n.º 3
0
def nextProbas(model,cstate,bigram,prevBigram):
	"""Score the candidate next states 'B' and 'C' for `bigram`.

	`model` is indexed positionally: `model[0]` observation counts,
	`model[1]` transition counts, `model[3]` unicode-block counts.
	`prevBigram` is accepted but not used by this function.

	Each score is the product of the transition count from `cstate` to the
	state, the observation count of `bigram` in the state, and the count of
	the bigram's block pair in the state. The block pair is the
	`uniBlock.block` value of both characters joined with '::'.

	Returns a dict `{'B': score, 'C': score}`; the scores are unnormalised.
	"""
	emissions, moves, blockCounts = model[0], model[1], model[3]
	blockKey = uniBlock.block(bigram[0]) + '::' + uniBlock.block(bigram[1])

	scores = {}
	for state in ('B', 'C'):
		scores[state] = moves[cstate][state] * emissions[state][bigram] * blockCounts[state][blockKey]
	return scores