def calculate_posterior(sentence, label):
    # Accumulate in log space: log(p1 * p2 * ...) == log(p1) + log(p2) + ...
    # This avoids floating-point underflow on long sentences, which the
    # original product-then-log formulation was prone to.
    log_posterior = 0.0
    for index in range(len(sentence)):
        log_posterior += log(Probabilities.get_posterior_word_probability(
            sentence[index], label[index]))
        if index == 0:
            log_posterior += log(Probabilities.get_first_speech_prob(label[index]))
        else:
            log_posterior += log(Probabilities.get_transition_prob(
                label[index], label[index - 1]))
    return log_posterior
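# Why the log-space rewrite above matters: a minimal, self-contained sketch
# (not part of the tagger) showing that a product-then-log approach underflows
# on long sentences while the equivalent sum of logs stays finite.
from math import log

raw_product = 1.0
log_sum = 0.0
for _ in range(500):
    raw_product *= 1e-3   # 500 small factors underflow to exactly 0.0
    log_sum += log(1e-3)  # the equivalent log-space sum stays finite

print(raw_product)  # 0.0 -> log(raw_product) would raise a ValueError
print(log_sum)      # approx -3453.88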
def Qc(train_set, test_set, laplace=False):
    """Handles the tasks of question c.

    Arguments:
        train_set
        test_set

    Keyword Arguments:
        laplace {bool} -- whether to use Laplace smoothing (default: {False})
    """
    gen_error_vec = []
    known_error_vec = []
    unknown_error_vec = []
    viterbi_results = []
    train_set = clean_POS(train_set)
    test_set = clean_POS(test_set)
    S = initialize_S(train_set)
    probs = Probabilities(S, train_set=train_set, test_set=test_set)
    for xy_tup in test_set:
        x = [t[0] for t in xy_tup]
        y = [t[1] for t in xy_tup]
        viterbi_tags = viterbi(x, probs, laplace)
        viterbi_results.append(viterbi_tags)
        err_vec, known_0, unknown_0 = calculate_error(viterbi_tags, y, x, probs)
        gen_error_vec.append(err_vec[0])
        if not known_0:
            known_error_vec.append(err_vec[1])
        if not unknown_0:
            unknown_error_vec.append(err_vec[2])
    gen_error = statistics.mean(gen_error_vec)
    known_error = statistics.mean(known_error_vec)
    unknown_error = statistics.mean(unknown_error_vec)
    return [gen_error, known_error, unknown_error]
def get_part_of_speech(sentence):
    pos = []
    for word in sentence:
        best_prob = 0
        best_speech = ""
        for speech in Probabilities.speech_prob.keys():
            word_prob = (Probabilities.get_naive_word_probability(word, speech)
                         * Probabilities.speech_prob[speech])
            # choose the best possible tag for this word
            if word_prob > best_prob:
                best_prob = word_prob
                best_speech = speech
        # fall back to a heuristic if the word never occurred in training
        if best_prob == 0:
            best_speech = Probabilities.get_best_possible_speech(word)
        pos.append(best_speech)
    # store result in the result cache for future use
    result_cache.naive_result = pos
    return [[pos], []]
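# Hypothetical toy illustration of the naive per-word rule above: every name
# and number here is invented, not taken from the trained model. The rule
# simply picks the tag maximizing P(word | tag) * P(tag) for each word alone.
word_given_tag = {("can", "noun"): 0.01, ("can", "verb"): 0.03, ("can", "modal"): 0.20}
tag_prior = {"noun": 0.35, "verb": 0.30, "modal": 0.05}

best = max(tag_prior, key=lambda t: word_given_tag.get(("can", t), 0.0) * tag_prior[t])
print(best)  # "modal": 0.20 * 0.05 = 0.010 beats "verb" (0.009) and "noun" (0.0035)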
def Qe(train_set, test_set, laplace=False):
    """Handles the tasks of question e.

    Arguments:
        train_set
        test_set

    Keyword Arguments:
        laplace {bool} -- whether to use Laplace smoothing (default: {False})
    """
    # initializations
    viterbi_results = []
    gen_error_vec = []
    known_error_vec = []
    unknown_error_vec = []
    # "clean" the train and test sets of complex tags
    train_set = clean_POS(train_set)
    test_set = clean_POS(test_set)
    S = initialize_S(train_set)
    probs = Probabilities(S, train_set, test_set)
    # generate pseudo train/test sets and a matching probability object
    pseudo_train = probs.generate_pseudo_set(train_set)
    pseudo_test = probs.generate_pseudo_set(test_set)
    pseudo_probs = Probabilities(S, pseudo_train, pseudo_test)
    for xy_tup in pseudo_test:
        x = [t[0] for t in xy_tup]
        y = [t[1] for t in xy_tup]
        viterbi_tags = viterbi(x, pseudo_probs, laplace)
        viterbi_results.append(viterbi_tags)
        err_vec, _, _ = calculate_error(viterbi_tags, y, x, probs, True)
        gen_error_vec.append(err_vec[0])
        # update confusion values
        pseudo_probs.update_confusion_matrix(y, viterbi_tags)
    gen_error = statistics.mean(gen_error_vec)
    print(gen_error)
    # print results and statistics
    if laplace:
        print(DataFrame(confusion_matrix(S, pseudo_probs)))
    return gen_error
def get_part_of_speech(sentence):
    # all the speeches (tags) seen in the training data; materialize as a
    # list so it can be indexed (dict views are not indexable in Python 3)
    speeches = list(Probabilities.speech_prob.keys())
    no_words = len(sentence)
    no_speech = len(speeches)
    # holds the backtrack path for the trace back
    back_tracks = [[0] * (no_words + 1) for _ in range(no_speech)]
    # maximum probability of reaching each speech at each position
    max_probabilities = [[0] * (no_words + 1) for _ in range(no_speech)]
    first_word = sentence[0]
    # basic step: initial probability for the first word
    for i in range(no_speech):
        max_probabilities[i][0] = (Probabilities.get_first_speech_prob(speeches[i])
                                   * Probabilities.get_word_probability(first_word, speeches[i]))
        back_tracks[i][0] = ""
    # recursive step
    for word_index in range(1, no_words):
        curr_word = sentence[word_index]
        for tag_index in range(no_speech):
            # float("-inf") replaces the Python-2-only -sys.maxint sentinel
            arg_max = float("-inf")
            arg_bt = ""
            max_total_prob = float("-inf")
            curr_tag = speeches[tag_index]
            word_prob = Probabilities.get_word_probability(curr_word, curr_tag)
            for prev_tag_index in range(no_speech):
                prev_tag = speeches[prev_tag_index]
                # probability of arriving at curr_tag from prev_tag
                transition_prob = (Probabilities.get_transition_prob(curr_tag, prev_tag)
                                   * max_probabilities[prev_tag_index][word_index - 1])
                if transition_prob > arg_max:
                    arg_max = transition_prob
                    arg_bt = prev_tag
                # total probability including the emission
                total_prob = transition_prob * word_prob
                if total_prob > max_total_prob:
                    max_total_prob = total_prob
            back_tracks[tag_index][word_index] = arg_bt
            max_probabilities[tag_index][word_index] = max_total_prob
    # terminal step: pick the best speech for the last word
    max_probabilities[no_speech - 1][no_words] = -1
    for i in range(no_speech):
        tag = speeches[i]
        last_prob = (max_probabilities[i][no_words - 1]
                     * Probabilities.get_last_speech_prob(tag))
        if max_probabilities[no_speech - 1][no_words] < last_prob:
            max_probabilities[no_speech - 1][no_words] = last_prob
            back_tracks[no_speech - 1][no_words] = tag
    # backtrack to recover the best path
    last_tag = back_tracks[no_speech - 1][no_words]
    solution = [last_tag]
    for word_index in range(no_words - 1, 0, -1):
        prev_tag_index = speeches.index(last_tag)
        last_tag = back_tracks[prev_tag_index][word_index]
        solution.append(last_tag)
    # the backtrack builds the path in reverse, so flip it
    solution.reverse()
    # store result in the result cache for future use
    result_cache.viterbi_result = solution[:]
    return [[solution], []]
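# Self-contained toy run of the same dynamic program on an invented two-tag
# HMM, useful for sanity-checking the recursion; all probabilities here are
# made up for illustration and are not the trained model's values.
def toy_viterbi(obs, tags, start, trans, emit):
    # V[i][t] = best probability of any tag sequence ending in t at position i
    V = [{t: start[t] * emit[t].get(obs[0], 1e-6) for t in tags}]
    back = [{}]
    for i in range(1, len(obs)):
        V.append({})
        back.append({})
        for t in tags:
            prev = max(tags, key=lambda p: V[i - 1][p] * trans[p][t])
            back[i][t] = prev
            V[i][t] = V[i - 1][prev] * trans[prev][t] * emit[t].get(obs[i], 1e-6)
    last = max(tags, key=lambda t: V[-1][t])
    path = [last]
    for i in range(len(obs) - 1, 0, -1):
        path.append(back[i][path[-1]])
    return list(reversed(path))

toy_tags = ["noun", "verb"]
toy_start = {"noun": 0.7, "verb": 0.3}
toy_trans = {"noun": {"noun": 0.3, "verb": 0.7}, "verb": {"noun": 0.8, "verb": 0.2}}
toy_emit = {"noun": {"dogs": 0.4, "bark": 0.1}, "verb": {"dogs": 0.05, "bark": 0.5}}
print(toy_viterbi(["dogs", "bark"], toy_tags, toy_start, toy_trans, toy_emit))  # ['noun', 'verb']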
def get_part_of_speech(sentence):
    # print(result_cache.results)
    result_cache.results = ([result_cache.naive_result]
                            + [result_cache.max_marginal]
                            + [result_cache.viterbi_result])
    Probabilities.convert_algo_results(result_cache.results, sentence)
    pos = []
    previous_word = ""
    previous_speech = ""
    for index in range(len(sentence)):
        best_prob = float("-inf")  # replaces the Python-2-only -sys.maxint
        best_speech = ""
        word = sentence[index]
        for speech in Probabilities.speech_prob.keys():
            prob = Probabilities.get_word_probability(word, speech)
            # check the single-word case first; otherwise the index == 0
            # branch would read sentence[1] and raise an IndexError
            if len(sentence) == 1:
                prob *= Probabilities.get_first_speech_prob(speech)
            elif index == 0:
                next_word = sentence[index + 1]
                prob *= (Probabilities.get_next_word_speech_probability(next_word, speech)
                         * Probabilities.get_first_speech_prob(speech))
            elif index == len(sentence) - 1:
                prob *= (Probabilities.get_transition_prob(speech, previous_speech)
                         * Probabilities.get_prev_word_speech_probability(previous_word, speech))
            else:
                next_word = sentence[index + 1]
                prob *= (Probabilities.get_transition_prob(speech, previous_speech)
                         * Probabilities.get_prev_word_speech_probability(previous_word, speech)
                         * Probabilities.get_next_word_speech_probability(next_word, speech))
            if best_prob < prob:
                best_prob = prob
                best_speech = speech
        # condition the next position on the tag actually chosen here, not on
        # whichever speech the inner loop happened to end on (stale-variable bug)
        previous_speech = best_speech
        previous_word = word
        pos.append(best_speech)
    return pos
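# Toy sketch (invented probabilities) of the greedy left-to-right rule above:
# unlike Viterbi, each position commits to the locally best tag immediately,
# conditioning only on the tag already chosen for the previous word.
greedy_tags = ["noun", "verb"]
greedy_emit = {"noun": {"dogs": 0.4, "bark": 0.1}, "verb": {"dogs": 0.05, "bark": 0.5}}
greedy_trans = {"noun": {"noun": 0.3, "verb": 0.7}, "verb": {"noun": 0.8, "verb": 0.2}}

chosen = ["noun"]  # assume the first word has already been tagged
for w in ["bark"]:
    prev = chosen[-1]
    chosen.append(max(greedy_tags,
                      key=lambda t: greedy_emit[t].get(w, 0.0) * greedy_trans[prev][t]))
print(chosen)  # ['noun', 'verb']: 0.5 * 0.7 beats 0.1 * 0.3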
def get_samples(sentence, sample_count):
    '''Gibbs sampler.

    1. Generate an initial sample, i.e. assign the sentence random parts of
       speech [uniform or random or EM, not sure]
       1.1 Optional burn-in or thinning: throw away the first few samples [TO-DO]
    Repeat sample_count times:
    2. Pick the last sample, x[t] <- x[t-1]
    3. Repeat the following steps for all unobserved/non-evidence words
    4. For each word picked, sample its tag from the posterior probability,
       keeping all other variables fixed as evidence
    5. Add the sample to the sample list
    '''
    sampleMap = Counter()
    samples = []
    # step 1 - generate the initial sample
    initSample = generateInitSamples(sentence)
    previousSample = initSample
    for i in range(1, Constants.gibbs_max_iteration):
        # step 2 - copy the last sample (a real copy, so it is never mutated in place)
        modifiedSample = previousSample[:]
        # step 3 - resample every position in turn
        for j in range(len(sentence)):
            speechTags = []
            probWeights = []
            sumProbWeights = 0
            # step 4 - posterior of this word's tag given its neighbours
            for speech in Probabilities.speech_prob.keys():
                word_prob = Probabilities.get_word_probability(sentence[j], speech)
                if len(sentence) == 1:
                    prob1 = Probabilities.get_first_speech_prob(speech)
                elif j == 0:
                    # first word: no prior tag, but a following one
                    prob1 = (Probabilities.get_first_speech_prob(speech)
                             * Probabilities.get_transition_prob(modifiedSample[j + 1], speech))
                elif j == len(sentence) - 1:
                    prob1 = Probabilities.get_transition_prob(speech, modifiedSample[j - 1])
                else:
                    prob1 = (Probabilities.get_transition_prob(speech, modifiedSample[j - 1])
                             * Probabilities.get_transition_prob(modifiedSample[j + 1], speech))
                prob = word_prob * prob1
                sumProbWeights += prob
                speechTags.append(speech)
                probWeights.append(prob)
            probWeights = [x / sumProbWeights for x in probWeights]
            # turn the weights into a cumulative distribution and draw from it
            # (loop variables renamed to k so they no longer shadow the outer i)
            cumsum = 0
            randomWeight = random()
            for k in range(len(probWeights)):
                cumsum += probWeights[k]
                probWeights[k] = cumsum
            randomIndex = -1
            for k in range(1, len(probWeights)):
                if probWeights[k - 1] <= randomWeight <= probWeights[k]:
                    randomIndex = k
            if randomIndex == -1:
                randomIndex = 0
            modifiedSample[j] = speechTags[randomIndex]
        previousSample = modifiedSample[:]
        samples.append(previousSample)
        sampleMap['_'.join(previousSample)] += 1
    # step 1.1 - discard the burn-in samples
    samples = samples[Constants.gibbs_burn_in_count:]
    return samples, sampleMap
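# Minimal sketch of the categorical draw the sampler above performs by hand:
# normalize the weights, accumulate them into a CDF, and return the first
# bucket whose cumulative mass exceeds a uniform random number. On
# Python >= 3.6, random.choices(tags, weights=weights) does the same in one call.
from random import random

def draw_tag(tags, weights):
    total = sum(weights)
    r = random()
    cumulative = 0.0
    for tag, w in zip(tags, weights):
        cumulative += w / total
        if r < cumulative:
            return tag
    return tags[-1]  # guard against floating-point round-off at the top end

print(draw_tag(["noun", "verb", "adj"], [0.2, 0.5, 0.3]))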
def train(self, data):
    # create all the probabilities required by the taggers
    Probabilities.train_from_data(data)