def features(self):
    """Build the per-position local feature dicts for this sentence and
    their summed global feature Vector.

    Returns:
        (local_features_list, global_feature): list of raw local feature
        dicts (start / middle / end templates) and a Vector summing them.
    """
    local_features_list = []

    # Start feature: only emitted when the first token is a 2-element
    # (word, tag)-style item — presumably an already-tagged token; TODO confirm.
    if len(self.snt[0]) == 2:
        start_word, _start_suffix, start_tag = self.word_suffix_tag(self.snt[0])
        local_features_list.append(self.start_word_feature(start_word, start_tag))

    # Middle positions 1 .. len-3 need at least 4 tokens to exist.
    if len(self.snt) >= 4:
        for i in range(1, len(self.snt) - 2):
            word, _suffix, tag = self.word_suffix_tag(self.snt[i])
            _prev_word, _prev_suffix, tag_prev = self.word_suffix_tag(self.snt[i - 1])
            local_features_list.append(self.middle_word_feature(word, tag, tag_prev))

    # End feature over the last two real tokens (snt[-1] is a sentinel).
    if len(self.snt) >= 3:
        end_word, _end_suffix, end_tag = self.word_suffix_tag(self.snt[-2])
        _prev_word, _prev_suffix, tag_prev = self.word_suffix_tag(self.snt[-3])
        local_features_list.append(self.end_word_feature(end_word, end_tag, tag_prev))

    # Sum every local feature dict into one global Vector.
    global_feature = Vector({})
    for feature in local_features_list:
        global_feature += Vector(feature)

    return (local_features_list, global_feature)
def get_predicted_sequence(self, final_state):
    """Follow back-pointers from *final_state* to recover the predicted
    tag sequence (in sentence order) and the summed feature vector of
    the whole Viterbi path."""
    path_vector = Vector()
    reversed_tags = []
    state = final_state
    while state:
        reversed_tags.append(state.current_tag)
        path_vector.__iadd__(state.feature_vector)  # in-place accumulate
        state = state.prev_state
    return list(reversed(reversed_tags)), path_vector
def featurize(self, curr_word, curr_tag, prev_word, prev_tag):
    """Emit the three indicator features for one position: word bigram,
    tag bigram, and word/tag emission."""
    feats = Vector({})
    # Each feature fires exactly once (indicator value 1).
    feats.v['w-1_{} w0_{}'.format(prev_word, curr_word)] = 1
    feats.v['t-1_{} t0_{}'.format(prev_tag, curr_tag)] = 1
    feats.v['w0_{} t0_{}'.format(curr_word, curr_tag)] = 1
    return feats
def get_gold_feature_vector(self, sent):
    """Sum the gold-tag feature vectors over every position of *sent*."""
    gold_vector = Vector()
    for idx in range(len(sent.snt)):
        position_feats = sent.features(sent, idx)
        # Previous gold tag, or the "__START__" sentinel at position 0.
        prev_tag = "__START__" if idx == 0 else sent.snt[idx - 1][1]
        local = self.create_feature_vector(position_feats, sent.snt[idx][1], prev_tag)
        gold_vector.__iadd__(local)  # in-place accumulation
    return gold_vector
def create_feature_vector(self, features, curr_tag, prev_tag):
    """Build a sparse indicator Vector for one (prev_tag -> curr_tag) step.

    Keys are (feature_string, curr_tag) tuples; a previous-tag feature is
    always included. Features whose name starts with the configured
    ablation prefix (global `ablation`) are removed.
    """
    my_vector = Vector()
    my_vector.v[("ptag=" + prev_tag, curr_tag)] = 1  # prev tag
    for feature in features:
        my_vector.v[(feature, curr_tag)] = 1
    if ablation:
        # Collect matching keys first, then delete them all. The original
        # kept only the LAST match, silently leaving earlier ablated
        # features in the vector.
        keys_to_ablate = [k for k in my_vector.v if k[0].startswith(ablation[0])]
        for k in keys_to_ablate:
            del my_vector.v[k]
    return my_vector
def __init__(self, tags):
    """Store the tag inventory and start with an empty weight vector."""
    self.weights = Vector({})
    self.tags = tags
def train(self, train_data, dev_data):
    """Mini-batch perceptron training with periodic dev evaluation.

    Each (single) outer iteration samples a 10000-sentence minibatch from
    train_data and a 1000-sentence sample from dev_data, accumulates the
    summed (gold - predicted) feature difference over the minibatch,
    applies the averaged update to self.weights, then tags the dev sample,
    prints accuracy, and logs it to a results file.
    """
    # `with` guarantees the results file is closed; the original opened
    # the handle and never closed it.
    with open('10000train_1000dev_averaged.txt', 'w') as results_file:
        for i in range(1):
            print('--------------------------------')
            print('minibatch_iteration ', i)
            x = 0
            # Sample (with replacement) a training minibatch and a dev sample.
            minibatch = [random.choice(train_data) for _ in range(10000)]
            mini_dev = [random.choice(dev_data) for _ in range(1000)]
            plain_mini_dev = [[tup[0] for tup in sent] for sent in mini_dev]
            minibatch_update = Vector({})
            for sent in minibatch:
                plain_sent = [tup[0] for tup in sent]
                predicted = self.tag(plain_sent)
                # Featurize gold and predicted to get representations for
                # the full sequence (start sentinel for position 0).
                predicted_feats = self.featurize(predicted[0][0], predicted[0][1],
                                                 '$START', '<S>')
                gold_feats = self.featurize(sent[0][0], sent[0][1], '$START', '<S>')
                for j in range(1, len(predicted)):
                    predicted_feats += self.featurize(predicted[j][0], predicted[j][1],
                                                      predicted[j - 1][0], predicted[j - 1][1])
                    gold_feats += self.featurize(sent[j][0], sent[j][1],
                                                 sent[j - 1][0], sent[j - 1][1])
                # Accumulate the difference between correct and predicted sequence.
                if predicted_feats != gold_feats:
                    minibatch_update += gold_feats - predicted_feats
                else:
                    print('correct prediction')
                if x % 100 == 0:
                    print('mini training iteration', i)
                    print('sentence', x)
                    print('p:', predicted)
                    print('g:', sent)
                    print('******')
                x += 1
            print('minibatch_update', minibatch_update)
            print('updating weights')
            # Average the accumulated update over the minibatch size.
            self.weights += (1 / len(minibatch)) * minibatch_update
            tagged_dev = []
            dev_count = 0
            for dev_sent in plain_mini_dev:
                dev_tagged = self.tag(dev_sent)
                tagged_dev.append(dev_tagged)
                if dev_count % 50 == 0:
                    print('~~tagging dev after mini iteration ', i)
                    print('~~len(plain_mini_dev):{}, len(tagged_dev):{}'.format(
                        len(plain_mini_dev), len(tagged_dev)))
                    print('~~dev sentence', dev_count)
                    print(dev_tagged)
                    print('~~########################')
                dev_count += 1
            print()
            acc = self.compute_accuracy(mini_dev, tagged_dev)
            print(acc)
            results_file.write(str(i) + '\t' + str(acc) + '\n')
def train(self, train_data, dev_data, average=False, to_be_ablated=None):
    """Train the structured perceptron for a fixed number of iterations.

    Args:
        train_data / dev_data: sequences of Sentence objects.
        average: if True, apply weight updates in mini-batches of 100
            instead of after every sentence.
        to_be_ablated: optional feature-name prefix to ablate globally.
    """
    if to_be_ablated:
        global ablation
        ablation = [to_be_ablated[0]]
    print("Training...")
    iterations = 4
    self.tagset = self.get_tagset(train_data)
    if average:
        batch = 100
        counter = 0
        big_predicted_feature_vector = Vector()
        big_gold_feature_vector = Vector()
        for iteration in range(iterations):
            for i in tqdm(range(len(train_data))):
                sent = train_data[i]
                tag_sequence_predicted, predicted_feature_vector = self.viterbi(
                    sent)
                gold_feature_vector = self.get_gold_feature_vector(sent)
                counter = counter + 1
                # Always accumulate first: the original's else-branch
                # dropped every batch-th sentence's update when flushing.
                big_gold_feature_vector.__iadd__(gold_feature_vector)
                big_predicted_feature_vector.__iadd__(predicted_feature_vector)
                if counter % batch == 0:
                    self.weights.__iadd__(big_gold_feature_vector)
                    self.weights.__isub__(big_predicted_feature_vector)
                    big_predicted_feature_vector = Vector()
                    big_gold_feature_vector = Vector()
        # Flush any remainder smaller than a full batch (the original
        # silently discarded it).
        if counter % batch != 0:
            self.weights.__iadd__(big_gold_feature_vector)
            self.weights.__isub__(big_predicted_feature_vector)
    else:
        for iteration in range(iterations):
            for i in tqdm(range(len(train_data))):
                sent = train_data[i]
                tag_sequence_predicted, predicted_feature_vector = self.viterbi(
                    sent)
                gold_feature_vector = self.get_gold_feature_vector(sent)
                # Per-sentence update: reward gold, penalize prediction.
                self.weights.__iadd__(gold_feature_vector)
                self.weights.__isub__(predicted_feature_vector)
    print("Getting accuracy on dev set...")
    acc = self.get_accuracy(dev_data)
    print(acc)
def __init__(self):
    """Start untrained: no tagset yet, empty weight vector."""
    self.weights = Vector()
    self.tagset = None
class Perceptron_POS_Tagger(object):
    """Structured perceptron part-of-speech tagger with Viterbi decoding.

    Weights live in a sparse Vector keyed by (feature_string, tag) tuples;
    decoding searches a Trellis of State objects, one column per word.
    """

    def __init__(self):
        self.tagset = None       # dict: tag index -> tag string (built in train)
        self.weights = Vector()  # sparse weight vector

    def tag(self, test_data):
        """Tag every plain sentence in *test_data*; returns new Sentence objects."""
        results = []
        for sent in tqdm(test_data):
            # Pair each word with an empty tag slot so features() can run
            # on an untagged sentence.
            add_slots = Sentence([[sent.snt[i], []] for i in range(len(sent.snt))])
            tag_sequence_predicted, _predicted_feature_vector = self.viterbi(add_slots)
            new_sent = Sentence([[sent.snt[i], tag_sequence_predicted[i]]
                                 for i in range(len(sent.snt))])
            results.append(new_sent)
        return results

    def train(self, train_data, dev_data, average=False, to_be_ablated=None):
        """Train for a fixed number of iterations.

        Args:
            train_data / dev_data: sequences of Sentence objects.
            average: if True, apply weight updates in mini-batches of 100
                instead of after every sentence.
            to_be_ablated: optional feature-name prefix to ablate globally.
        """
        if to_be_ablated:
            global ablation
            ablation = [to_be_ablated[0]]
        print("Training...")
        iterations = 4
        self.tagset = self.get_tagset(train_data)
        if average:
            batch = 100
            counter = 0
            big_predicted_feature_vector = Vector()
            big_gold_feature_vector = Vector()
            for iteration in range(iterations):
                for i in tqdm(range(len(train_data))):
                    sent = train_data[i]
                    tag_sequence_predicted, predicted_feature_vector = self.viterbi(
                        sent)
                    gold_feature_vector = self.get_gold_feature_vector(sent)
                    counter = counter + 1
                    # Always accumulate first: the original's else-branch
                    # dropped every batch-th sentence's update when flushing.
                    big_gold_feature_vector.__iadd__(gold_feature_vector)
                    big_predicted_feature_vector.__iadd__(predicted_feature_vector)
                    if counter % batch == 0:
                        self.weights.__iadd__(big_gold_feature_vector)
                        self.weights.__isub__(big_predicted_feature_vector)
                        big_predicted_feature_vector = Vector()
                        big_gold_feature_vector = Vector()
            # Flush any remainder smaller than a full batch (originally lost).
            if counter % batch != 0:
                self.weights.__iadd__(big_gold_feature_vector)
                self.weights.__isub__(big_predicted_feature_vector)
        else:
            for iteration in range(iterations):
                for i in tqdm(range(len(train_data))):
                    sent = train_data[i]
                    tag_sequence_predicted, predicted_feature_vector = self.viterbi(
                        sent)
                    gold_feature_vector = self.get_gold_feature_vector(sent)
                    # Per-sentence update: reward gold, penalize prediction.
                    self.weights.__iadd__(gold_feature_vector)
                    self.weights.__isub__(predicted_feature_vector)
        print("Getting accuracy on dev set...")
        acc = self.get_accuracy(dev_data)
        print(acc)

    def get_accuracy(self, dev_data):
        """Token-level accuracy of Viterbi predictions over *dev_data*."""
        gold = list()
        predicted = list()
        for sent in tqdm(dev_data):
            tag_sequence_gold = [pair[1] for pair in sent.snt]
            tag_sequence_predicted, _predicted_feature_vector = self.viterbi(
                sent)
            gold.extend(tag_sequence_gold)
            predicted.extend(tag_sequence_predicted)
        return sum(1 for x, y in zip(gold, predicted) if x == y) / len(gold)

    def get_tagset(self, train_data):
        """Collect every tag seen in training; returns {index: tag}."""
        tag_set = set()
        for sent in train_data:
            tag_set.update(pair[1] for pair in sent.snt)
        return dict(enumerate(list(tag_set)))

    def create_feature_vector(self, features, curr_tag, prev_tag):
        """Build a sparse indicator Vector for one (prev_tag -> curr_tag) step.

        Keys are (feature_string, curr_tag) tuples; a previous-tag feature
        is always included. Features whose name starts with the configured
        ablation prefix are removed.
        """
        my_vector = Vector()
        my_vector.v[("ptag=" + prev_tag, curr_tag)] = 1  # prev tag
        for feature in features:
            my_vector.v[(feature, curr_tag)] = 1
        if ablation:
            # Collect first, then delete them all: the original kept only
            # the LAST matching key.
            keys_to_ablate = [k for k in my_vector.v if k[0].startswith(ablation[0])]
            for k in keys_to_ablate:
                del my_vector.v[k]
        return my_vector

    def viterbi(self, sentence):
        """Exact Viterbi decoding over a Trellis of States.

        Returns:
            (tag_sequence, feature_vector) for the highest-scoring path.
        """
        trellis = Trellis(len(self.tagset), len(sentence.snt))
        for i in range(len(trellis.columns)):  # i indexes the current word
            column = trellis.columns[i]
            features = sentence.features(sentence, i)
            for j in range(len(column.states)):  # j indexes the current tag
                if i == 0:
                    # First column: no predecessor, prev tag is the sentinel.
                    my_feature_vector = self.create_feature_vector(
                        features, self.tagset[j], "__START__")
                    score = self.weights.dot(my_feature_vector)
                    column.states[j] = State(my_feature_vector, score,
                                             self.tagset[j], None)
                else:
                    possible_states = []
                    prev_column = trellis.columns[i - 1]
                    for z in range(len(prev_column.states)):  # z indexes prev tag
                        prev_state = prev_column.states[z]
                        my_feature_vector = self.create_feature_vector(
                            features, self.tagset[j], self.tagset[z])
                        score = prev_state.score + self.weights.dot(
                            my_feature_vector)
                        possible_states.append(
                            State(my_feature_vector, score, self.tagset[j],
                                  prev_state))
                    # Keep only the best-scoring predecessor for this tag.
                    possible_scores = np.array(
                        [state.score for state in possible_states])
                    column.states[j] = possible_states[possible_scores.argmax()]
        # Best state in the final column starts the back-pointer walk.
        last_column = trellis.columns[len(sentence.snt) - 1]
        last_column_scores = np.array(
            [state.score for state in last_column.states])
        best_final_state = last_column.states[last_column_scores.argmax()]
        return self.get_predicted_sequence(best_final_state)

    def get_predicted_sequence(self, final_state):
        """Follow back-pointers from *final_state* to recover the tag
        sequence (in sentence order) and the path's summed feature vector."""
        final_feature_vector = Vector()
        reversed_tags = []
        my_state = final_state
        while my_state:
            reversed_tags.append(my_state.current_tag)
            final_feature_vector.__iadd__(my_state.feature_vector)
            my_state = my_state.prev_state
        return list(reversed(reversed_tags)), final_feature_vector

    def get_gold_feature_vector(self, sent):
        """Sum the gold-tag feature vectors over every position of *sent*."""
        feature_vector_gold = Vector()
        for i in range(len(sent.snt)):
            features = sent.features(sent, i)
            # Previous gold tag, or the "__START__" sentinel at position 0.
            prev_tag = "__START__" if i == 0 else sent.snt[i - 1][1]
            local_feature = self.create_feature_vector(
                features, sent.snt[i][1], prev_tag)
            feature_vector_gold.__iadd__(local_feature)
        return feature_vector_gold
def train(self, train_data, gold_dev_data, plain_dev_data, iterations):
    """Averaged-perceptron training loop with periodic dev-set reports.

    Reports dev accuracy (with raw and averaged weights) after 500, 1000,
    10000, and 25000 sentences of each iteration and at the end of each
    iteration.

    Returns:
        (alpha, final_avg_alpha): final raw weights and averaged weights.
    """
    def _report(i, t, alpha, alpha_sum):
        # One dev-accuracy report: averaged weights computed once, then
        # raw-weight accuracy and averaged-weight accuracy printed.
        avg = self.average_alpha(alpha_sum, t, train_data, i)
        print("training size:", i + 1,
              "acc on dev with regular alpha:",
              self.acc_dev(gold_dev_data, plain_dev_data, alpha))
        print("acc with avg alpha:",
              self.acc_dev(gold_dev_data, plain_dev_data, avg))
        return avg

    checkpoints = {499, 999, 9999, 24999}
    alpha = Vector({})
    alpha_sum = Vector({})
    # Safe default so the return below is defined even for empty train_data
    # (the original raised NameError in that case).
    final_avg_alpha = alpha
    print("length of training data:", len(train_data), "\nstart training...")
    for t in range(iterations):
        print("\niter:", t, "\n")
        for i in range(len(train_data)):
            z = self.tag(Sentence(self.raw_sentence(train_data[i].snt)), alpha)
            curr_tags = Sentence(
                self.comb_word_tag(self.raw_sentence(train_data[i].snt), z))
            local_features_list, global_feature_z = curr_tags.features()
            true_local_features, true_global = train_data[i].features()
            if z != self.true_tag(train_data[i].snt):
                # Standard perceptron update: reward gold, penalize prediction.
                alpha += true_global - global_feature_z
            alpha_sum += alpha
            if i in checkpoints:
                _report(i, t, alpha, alpha_sum)
            if i == len(train_data) - 1:
                final_avg_alpha = _report(i, t, alpha, alpha_sum)
        print("\n")
    return (alpha, final_avg_alpha)
def average_alpha(self, alpha_sum, iter, train_data, i):
    """Scale the running weight sum by the number of updates seen so far
    (full passes over train_data plus i+1 sentences of the current pass)."""
    updates_seen = len(train_data) * iter + i + 1
    return (1 / updates_seen) * alpha_sum
def tag(self, sentence, alpha):
    """Viterbi decoding over the start/middle/end feature templates.

    Args:
        sentence: Sentence whose .snt is the token list; the final element
            is appended verbatim to the output (presumably an end sentinel
            — TODO confirm against the caller).
        alpha: weight Vector.
    Returns:
        Predicted tag list in sentence order, ending with sentence.snt[-1].
    """
    tag_list = Perceptron_POS_Tagger.tag_list  # hoist class attribute lookup
    b_matrix = []      # back-pointers: best previous tag index per (pos, tag)
    score_matrix = []  # Viterbi scores per (pos, tag)

    # Position 0: start-word features, no predecessor.
    start_word = sentence.snt[0]
    score_start = []
    for tag in tag_list:
        start_feature = Vector(sentence.start_word_feature(start_word, tag))
        score_start.append(alpha.dot(start_feature))
    score_matrix.append(score_start)
    b_matrix.append([0] * len(tag_list))

    # Middle positions 1 .. len-3 (exist only for sentences of >= 4 tokens).
    if len(sentence.snt) >= 4:
        for i in range(1, len(sentence.snt) - 2):
            word = sentence.snt[i]
            score = []
            b = []
            for j in range(len(tag_list)):
                # -inf sentinel: the original used -10000, which silently
                # mis-decodes when every candidate score falls below it.
                score_max = float('-inf')
                score_max_index = 0
                for k in range(len(tag_list)):
                    feature = Vector(
                        sentence.middle_word_feature(word, tag_list[j], tag_list[k]))
                    # Compute the candidate once (original dotted twice).
                    candidate = score_matrix[i - 1][k] + alpha.dot(feature)
                    if candidate > score_max:
                        score_max = candidate
                        score_max_index = k
                score.append(score_max)
                b.append(score_max_index)
            b_matrix.append(b)
            score_matrix.append(score)

    # Last real position (index len-2): end-word features.
    if len(sentence.snt) >= 3:
        end_word = sentence.snt[-2]
        score = []
        b = []
        for j in range(len(tag_list)):
            score_max = float('-inf')
            score_max_index = 0
            for k in range(len(tag_list)):
                feature = Vector(
                    sentence.end_word_feature(end_word, tag_list[j], tag_list[k]))
                candidate = score_matrix[len(sentence.snt) - 3][k] + alpha.dot(feature)
                if candidate > score_max:
                    score_max = candidate
                    score_max_index = k
            score.append(score_max)
            b.append(score_max_index)
        score_matrix.append(score)
        b_matrix.append(b)

    # Back-trace from the best final tag.
    z = []
    end_tag_index = score_matrix[-1].index(max(score_matrix[-1]))
    z_index = [end_tag_index]
    z.append(tag_list[z_index[-1]])
    for i in range(len(sentence.snt) - 2):
        z_index.append(b_matrix[len(sentence.snt) - i - 2][z_index[-1]])
        z.append(tag_list[z_index[-1]])
    z = list(reversed(z))
    z.append(sentence.snt[-1])  # keep trailing sentinel token as-is
    return z