def SimilarityViterbi(emission_probs, state_init_probs, state_trans_probs,
                      test_subseq, low_frequency_probabilities, smooth):
    """Viterbi decoding over bigram states with feature-class backoff.

    For now we're ignoring the <UNK> tokens that were inserted. If we look
    up an emission that doesn't exist, it would have probability 0 (Counter
    semantics), so unseen emissions fall back to their feature class.

    Parameters:
        emission_probs: state -> {emission: probability}
        state_init_probs: (prev, state) bigram -> initial probability
        state_trans_probs: (prev, state) bigram -> {state: probability};
            "<UNK>" rows/entries provide smoothing fallbacks
        test_subseq: sequence of observed emissions to decode
        low_frequency_probabilities: state -> {feature class: probability}
        smooth: when truthy, fall back to "<UNK>" transition entries for
            unseen prev-state/curr-state pairs

    Returns:
        The most probable state sequence (list of states) for test_subseq.
    """
    # Initialize one path (and its probability) per initial bigram state.
    path_dict = dict((state, [state[1]]) for state in state_init_probs.keys())
    prev_probs = {}
    for state in state_init_probs.keys():
        prev_probs[state] = (state_init_probs[state]
                             * emission_probs[state[1]][test_subseq[0]])

    # All reachable current states, excluding the smoothing placeholder.
    all_states = set(state2 for state1 in state_trans_probs
                     for state2 in state_trans_probs[state1])
    all_states.remove("<UNK>")

    # Iterate over the rest of the sentence.
    for emission in test_subseq[1:]:
        new_path_dict = {}
        curr_state_probs = {}
        for curr_state in all_states:
            temp_state_probs = {}
            for prev_state in path_dict:
                # Emission is unseen: find probability from its feature class.
                if emission not in emission_probs[curr_state]:
                    feature_class = findFeatureClass(emission)
                    emission_probability = \
                        low_frequency_probabilities[curr_state][feature_class]
                else:
                    emission_probability = emission_probs[curr_state][emission]
                if smooth:
                    # Back off to "<UNK>" for unseen transitions.
                    if prev_state not in state_trans_probs:
                        this_trans_prob = state_trans_probs["<UNK>"][curr_state]
                    elif curr_state not in state_trans_probs[prev_state]:
                        this_trans_prob = state_trans_probs[prev_state]["<UNK>"]
                    else:
                        this_trans_prob = \
                            state_trans_probs[prev_state][curr_state]
                else:
                    this_trans_prob = state_trans_probs[prev_state][curr_state]
                temp_state_probs[prev_state] = (prev_probs[prev_state]
                                                * this_trans_prob
                                                * emission_probability)
            # BUG FIX: dict views are not indexable in Python 3, so the old
            # np.argmax(values())/keys()[idx] pattern crashed; max() over
            # items preserves the same first-maximum tie-breaking.
            max_state, max_prob = max(temp_state_probs.items(),
                                      key=lambda kv: kv[1])
            curr_bigram = (max_state[1], curr_state)
            curr_state_probs[curr_bigram] = max_prob
            new_path_dict[curr_bigram] = path_dict[max_state] + [curr_state]
        prev_probs = curr_state_probs.copy()
        path_dict = new_path_dict.copy()

    # Identify overall most probable path.
    overall_max_state, _ = max(prev_probs.items(), key=lambda kv: kv[1])
    return path_dict[overall_max_state]
def SimilarityViterbi(emission_probs, state_init_probs, state_trans_probs,
                      test_subseq, low_frequency_probabilities, smooth):
    """Viterbi decoding over bigram states with feature-class backoff.

    For now we're ignoring the <UNK> tokens that were inserted. If we look
    up an emission that doesn't exist, it would have probability 0 (Counter
    semantics), so unseen emissions fall back to their feature class.

    Parameters:
        emission_probs: state -> {emission: probability}
        state_init_probs: (prev, state) bigram -> initial probability
        state_trans_probs: (prev, state) bigram -> {state: probability};
            "<UNK>" rows/entries provide smoothing fallbacks
        test_subseq: sequence of observed emissions to decode
        low_frequency_probabilities: state -> {feature class: probability}
        smooth: when truthy, fall back to "<UNK>" transition entries for
            unseen prev-state/curr-state pairs

    Returns:
        The most probable state sequence (list of states) for test_subseq.
    """
    # Initialize one path (and its probability) per initial bigram state.
    path_dict = dict((state, [state[1]]) for state in state_init_probs.keys())
    prev_probs = {}
    for state in state_init_probs.keys():
        prev_probs[state] = (state_init_probs[state]
                             * emission_probs[state[1]][test_subseq[0]])

    # All reachable current states, excluding the smoothing placeholder.
    all_states = set(state2 for state1 in state_trans_probs
                     for state2 in state_trans_probs[state1])
    all_states.remove("<UNK>")

    # Iterate over the rest of the sentence.
    for emission in test_subseq[1:]:
        new_path_dict = {}
        curr_state_probs = {}
        for curr_state in all_states:
            temp_state_probs = {}
            for prev_state in path_dict:
                # Emission is unseen: find probability from its feature class.
                if emission not in emission_probs[curr_state]:
                    feature_class = findFeatureClass(emission)
                    emission_probability = \
                        low_frequency_probabilities[curr_state][feature_class]
                else:
                    emission_probability = emission_probs[curr_state][emission]
                if smooth:
                    # Back off to "<UNK>" for unseen transitions.
                    if prev_state not in state_trans_probs:
                        this_trans_prob = state_trans_probs["<UNK>"][curr_state]
                    elif curr_state not in state_trans_probs[prev_state]:
                        this_trans_prob = state_trans_probs[prev_state]["<UNK>"]
                    else:
                        this_trans_prob = \
                            state_trans_probs[prev_state][curr_state]
                else:
                    this_trans_prob = state_trans_probs[prev_state][curr_state]
                temp_state_probs[prev_state] = (prev_probs[prev_state]
                                                * this_trans_prob
                                                * emission_probability)
            # BUG FIX: dict views are not indexable in Python 3, so the old
            # np.argmax(values())/keys()[idx] pattern crashed; max() over
            # items preserves the same first-maximum tie-breaking.
            max_state, max_prob = max(temp_state_probs.items(),
                                      key=lambda kv: kv[1])
            curr_bigram = (max_state[1], curr_state)
            curr_state_probs[curr_bigram] = max_prob
            new_path_dict[curr_bigram] = path_dict[max_state] + [curr_state]
        prev_probs = curr_state_probs.copy()
        path_dict = new_path_dict.copy()

    # Identify overall most probable path.
    overall_max_state, _ = max(prev_probs.items(), key=lambda kv: kv[1])
    return path_dict[overall_max_state]
def Viterbi(emission_probs, state_init_probs, state_trans_probs, test_subseq,
            low_frequency_probabilities, smooth, similarity_based, pos_subseq):
    """Viterbi decoding over unigram states with several unseen-emission modes.

    For now we're ignoring the <UNK> tokens that were inserted. If we look
    up an emission that doesn't exist, it would have probability 0 (Counter
    semantics), so unseen emissions are handled by one of three fallbacks.

    Parameters:
        emission_probs: state -> {emission: probability}; may include an
            '<UNK>' entry per state when smoothing was applied
        state_init_probs: state -> initial probability
        state_trans_probs: state -> {state: probability}
        test_subseq: sequence of observed emissions to decode
        low_frequency_probabilities: feature class -> {state: probability}
        smooth: 'Laplacian' or 'Good-Turing' to use the smoothed '<UNK>'
            emission probability for unseen emissions; anything else falls
            through to the feature-class fallbacks
        similarity_based: when truthy (and not smoothing), derive the feature
            class from the emission itself via findFeatureClass
        pos_subseq: per-position feature classes (e.g. POS tags), used as the
            fallback when neither smoothing nor similarity is selected

    Returns:
        The most probable state sequence (list of states) for test_subseq.
    """
    # Initialize one path (and its probability) per initial state.
    path_dict = dict((state, [state]) for state in state_init_probs.keys())
    prev_probs = {}
    for state in state_init_probs.keys():
        prev_probs[state] = (state_init_probs[state]
                             * emission_probs[state][test_subseq[0]])

    # All reachable current states.
    all_states = set(state2 for state1 in state_trans_probs
                     for state2 in state_trans_probs[state1])

    # Iterate over the sentence (by index: pos_subseq is indexed in step).
    for emission_idx in range(1, len(test_subseq)):
        emission = test_subseq[emission_idx]
        new_path_dict = {}
        curr_state_probs = {}
        for curr_state in all_states:
            temp_state_probs = {}
            for prev_state in path_dict:
                # Transition probability is the same in every branch; hoist it.
                trans_prob = state_trans_probs[prev_state][curr_state]
                if emission not in emission_probs[curr_state]:
                    if smooth == 'Laplacian' or smooth == 'Good-Turing':
                        # Smoothed value for emissions we had not seen before.
                        emission_probability = emission_probs[curr_state]['<UNK>']
                    else:
                        # Feature class comes from local context (similarity)
                        # or from the supplied per-position tags.
                        if similarity_based:
                            feature_class = findFeatureClass(emission)
                        else:
                            feature_class = pos_subseq[emission_idx]
                        # Strip a 'B-'/'I-' style prefix from the state name.
                        current_state = (curr_state.split('-')[1]
                                         if '-' in curr_state else curr_state)
                        emission_probability = \
                            low_frequency_probabilities[feature_class][current_state]
                else:
                    emission_probability = emission_probs[curr_state][emission]
                temp_state_probs[prev_state] = (prev_probs[prev_state]
                                                * trans_prob
                                                * emission_probability)
            # BUG FIX: dict views are not indexable in Python 3, so the old
            # np.argmax(values())/keys()[idx] pattern crashed; max() over
            # items preserves the same first-maximum tie-breaking.
            max_state, max_prob = max(temp_state_probs.items(),
                                      key=lambda kv: kv[1])
            curr_state_probs[curr_state] = max_prob
            new_path_dict[curr_state] = path_dict[max_state] + [curr_state]
        prev_probs = curr_state_probs.copy()
        path_dict = new_path_dict.copy()

    # Identify overall most probable path.
    overall_max_state, _ = max(prev_probs.items(), key=lambda kv: kv[1])
    return path_dict[overall_max_state]
def Viterbi(emission_probs, state_init_probs, state_trans_probs, test_subseq,
            low_frequency_probabilities, smooth, similarity_based, pos_subseq):
    """Viterbi decoding over unigram states with several unseen-emission modes.

    For now we're ignoring the <UNK> tokens that were inserted. If we look
    up an emission that doesn't exist, it would have probability 0 (Counter
    semantics), so unseen emissions are handled by one of three fallbacks.

    Parameters:
        emission_probs: state -> {emission: probability}; may include an
            '<UNK>' entry per state when smoothing was applied
        state_init_probs: state -> initial probability
        state_trans_probs: state -> {state: probability}
        test_subseq: sequence of observed emissions to decode
        low_frequency_probabilities: feature class -> {state: probability}
        smooth: 'Laplacian' or 'Good-Turing' to use the smoothed '<UNK>'
            emission probability for unseen emissions; anything else falls
            through to the feature-class fallbacks
        similarity_based: when truthy (and not smoothing), derive the feature
            class from the emission itself via findFeatureClass
        pos_subseq: per-position feature classes (e.g. POS tags), used as the
            fallback when neither smoothing nor similarity is selected

    Returns:
        The most probable state sequence (list of states) for test_subseq.
    """
    # Initialize one path (and its probability) per initial state.
    path_dict = dict((state, [state]) for state in state_init_probs.keys())
    prev_probs = {}
    for state in state_init_probs.keys():
        prev_probs[state] = (state_init_probs[state]
                             * emission_probs[state][test_subseq[0]])

    # All reachable current states.
    all_states = set(state2 for state1 in state_trans_probs
                     for state2 in state_trans_probs[state1])

    # Iterate over the sentence (by index: pos_subseq is indexed in step).
    for emission_idx in range(1, len(test_subseq)):
        emission = test_subseq[emission_idx]
        new_path_dict = {}
        curr_state_probs = {}
        for curr_state in all_states:
            temp_state_probs = {}
            for prev_state in path_dict:
                # Transition probability is the same in every branch; hoist it.
                trans_prob = state_trans_probs[prev_state][curr_state]
                if emission not in emission_probs[curr_state]:
                    if smooth == 'Laplacian' or smooth == 'Good-Turing':
                        # Smoothed value for emissions we had not seen before.
                        emission_probability = emission_probs[curr_state]['<UNK>']
                    else:
                        # Feature class comes from local context (similarity)
                        # or from the supplied per-position tags.
                        if similarity_based:
                            feature_class = findFeatureClass(emission)
                        else:
                            feature_class = pos_subseq[emission_idx]
                        # Strip a 'B-'/'I-' style prefix from the state name.
                        current_state = (curr_state.split('-')[1]
                                         if '-' in curr_state else curr_state)
                        emission_probability = \
                            low_frequency_probabilities[feature_class][current_state]
                else:
                    emission_probability = emission_probs[curr_state][emission]
                temp_state_probs[prev_state] = (prev_probs[prev_state]
                                                * trans_prob
                                                * emission_probability)
            # BUG FIX: dict views are not indexable in Python 3, so the old
            # np.argmax(values())/keys()[idx] pattern crashed; max() over
            # items preserves the same first-maximum tie-breaking.
            max_state, max_prob = max(temp_state_probs.items(),
                                      key=lambda kv: kv[1])
            curr_state_probs[curr_state] = max_prob
            new_path_dict[curr_state] = path_dict[max_state] + [curr_state]
        prev_probs = curr_state_probs.copy()
        path_dict = new_path_dict.copy()

    # Identify overall most probable path.
    overall_max_state, _ = max(prev_probs.items(), key=lambda kv: kv[1])
    return path_dict[overall_max_state]