Example 1
 def features(self):
     ''' Implement your features here.
     '''
     local_features_list = []
     global_feature = Vector({})
     if len(self.snt[0]) == 2:
         start_word, start_suffix, start_tag = self.word_suffix_tag(self.snt[0])
         start_feature = self.start_word_feature(start_word, start_tag)
         local_features_list.append(start_feature)
         if len(self.snt) >= 4:
             for i in range(1, len(self.snt)-2):
                 item = self.snt[i]
                 item_prev = self.snt[i-1]
                 word, suffix, tag = self.word_suffix_tag(item)
                 word_prev, suffix_prev, tag_prev = self.word_suffix_tag(item_prev)
                 local_feature = self.middle_word_feature(word, tag, tag_prev)
                 local_features_list.append(local_feature)
         if len(self.snt) >= 3:
             end_word, end_suffix, end_tag = self.word_suffix_tag(self.snt[-2])
             word_prev, suffix_prev, tag_prev = self.word_suffix_tag(self.snt[-3])
             end_feature = self.end_word_feature(end_word, end_tag, tag_prev)
             local_features_list.append(end_feature)
         for feature in local_features_list:
             global_feature += Vector(feature)
     return (local_features_list, global_feature)
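
The snippets in this listing all rely on a small sparse Vector class that is not itself included: they construct it from a dict, read and write its v attribute, add and subtract vectors in place, take dot products, compare vectors, and scale them by a float. The class below is only a hypothetical reconstruction of that interface, inferred from how the examples use it; the project's real implementation may differ.

class Vector(object):
    def __init__(self, v=None):
        # Sparse mapping from feature key to count/weight.
        self.v = dict(v) if v else {}

    def __iadd__(self, other):
        for key, value in other.v.items():
            self.v[key] = self.v.get(key, 0) + value
        return self

    def __isub__(self, other):
        for key, value in other.v.items():
            self.v[key] = self.v.get(key, 0) - value
        return self

    def __sub__(self, other):
        result = Vector(self.v)
        result -= other
        return result

    def __rmul__(self, scalar):
        # Supports expressions such as (1 / len(minibatch)) * minibatch_update.
        return Vector({key: scalar * value for key, value in self.v.items()})

    def dot(self, other):
        # Iterate over the smaller of the two sparse maps.
        small, big = ((self.v, other.v) if len(self.v) <= len(other.v)
                      else (other.v, self.v))
        return sum(value * big.get(key, 0) for key, value in small.items())

    def __eq__(self, other):
        return isinstance(other, Vector) and self.v == other.v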
Example 2
 def get_predicted_sequence(self, final_state):
     tag_seq = list()
     final_feature_vector = Vector()
     my_state = final_state
     while my_state:
         tag_seq.append(my_state.current_tag)
         final_feature_vector += my_state.feature_vector
         my_state = my_state.prev_state
     tag_seq.reverse()
     return tag_seq, final_feature_vector
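
get_predicted_sequence walks a backward chain of State objects built during Viterbi decoding (see the viterbi method in Example 10). The State class is not shown in this listing; a minimal sketch matching the fields read here and the constructor calls in Example 10 would look like the following (hypothetical, for illustration only).

class State(object):
    def __init__(self, feature_vector, score, current_tag, prev_state):
        self.feature_vector = feature_vector  # local features scored at this cell
        self.score = score                    # best cumulative score reaching this cell
        self.current_tag = current_tag        # tag hypothesis for the current word
        self.prev_state = prev_state          # backpointer; None at the first word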
Example 3
    def featurize(self, curr_word, curr_tag, prev_word, prev_tag):
        """Build a sparse feature vector with one word-bigram feature, one
        tag-bigram feature, and one emission (word/tag) feature."""
        vector = Vector({})
        bi_word = 'w-1_{} w0_{}'.format(prev_word, curr_word)
        vector.v[bi_word] = 1
        bi_tag = 't-1_{} t0_{}'.format(prev_tag, curr_tag)
        vector.v[bi_tag] = 1
        emission = 'w0_{} t0_{}'.format(curr_word, curr_tag)
        vector.v[emission] = 1

        return vector
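
For concreteness, a single call produces exactly three binary features: a word bigram, a tag bigram, and an emission (word/tag) feature. The words and tags below are made up for illustration; tagger stands for any object exposing the featurize method above.

vector = tagger.featurize('dog', 'NN', 'the', 'DT')
# vector.v == {'w-1_the w0_dog': 1, 't-1_DT t0_NN': 1, 'w0_dog t0_NN': 1}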
Example 4
 def get_gold_feature_vector(self, sent):
     feature_vector_gold = Vector()
     for i in range(len(sent.snt)):
         features = sent.features(sent, i)
         if i == 0:
             local_feature = self.create_feature_vector(
                 features, sent.snt[i][1], "__START__")
         else:
             local_feature = self.create_feature_vector(
                 features, sent.snt[i][1], sent.snt[i - 1][1])
         feature_vector_gold += local_feature
     return feature_vector_gold
Example 5
 def create_feature_vector(self, features, curr_tag, prev_tag):
     my_vector = Vector()
     my_vector.v[("ptag=" + prev_tag, curr_tag)] = 1  # prev tag
     for feature in features:
         my_vector.v[(feature, curr_tag)] = 1
     if ablation:
         key_to_ablated = None
         for k in my_vector.v.keys():
             if k[0].startswith(ablation[0]):
                 key_to_ablated = k
         if key_to_ablated:
             del my_vector.v[key_to_ablated]
     return my_vector
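
For illustration, with the module-level ablation flag left falsy (it is assigned in the train methods of Examples 8 and 10), every feature string is keyed on the current tag and one extra previous-tag feature is added. The feature strings and tags below are hypothetical; the real strings come from Sentence.features().

vec = tagger.create_feature_vector(['w=dog', 'suffix=og'], 'NN', 'DT')
# vec.v == {('ptag=DT', 'NN'): 1, ('w=dog', 'NN'): 1, ('suffix=og', 'NN'): 1}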
Example 6
    def __init__(self, tags):
        ''' Modify if necessary. 
        '''

        self.tags = tags
        self.weights = Vector({})
Example 7
    def train(self, train_data, dev_data):
        ''' Implement the Perceptron training algorithm here.
        '''

        results_file = open('10000train_1000dev_averaged.txt', 'w')
        # plain_dev = [[tup[0] for tup in sent] for sent in dev_data]

        for i in range(1):
            print('--------------------------------')
            print('minibatch_iteration ', i)
            x = 0
            minibatch = []
            for k in range(10000):
                minibatch.append(random.choice(train_data))
            mini_dev = []
            for m in range(1000):
                mini_dev.append(random.choice(dev_data))

            plain_mini_dev = [[tup[0] for tup in sent] for sent in mini_dev]

            minibatch_update = Vector({})
            for sent in minibatch:
                plain_sent = [tup[0] for tup in sent]
                predicted = self.tag(plain_sent)

                # featurize gold and predicted to get representations for full sequence
                predicted_feats = self.featurize(predicted[0][0],
                                                 predicted[0][1], '$START',
                                                 '<S>')
                gold_feats = self.featurize(sent[0][0], sent[0][1], '$START',
                                            '<S>')
                for j in range(1, len(predicted)):
                    predicted_feats += self.featurize(predicted[j][0],
                                                      predicted[j][1],
                                                      predicted[j - 1][0],
                                                      predicted[j - 1][1])
                    gold_feats += self.featurize(sent[j][0], sent[j][1],
                                                 sent[j - 1][0],
                                                 sent[j - 1][1])

                # adjust weights according to difference between correct and predicted sequence
                if predicted_feats != gold_feats:
                    minibatch_update += gold_feats - predicted_feats
                else:
                    print('correct prediction')

                if x % 100 == 0:
                    print('mini training iteration', i)
                    print('sentence', x)
                    print('p:', predicted)
                    print('g:', sent)
                    print('******')

                x += 1

            #self.weights += minibatch_update.element_wise_divide(len(minibatch))
            print('minibatch_update', minibatch_update)
            print('updating weights')
            self.weights += (1 / len(minibatch)) * minibatch_update

            tagged_dev = []
            dev_count = 0
            for dev_sent in plain_mini_dev:
                dev_tagged = self.tag(dev_sent)
                tagged_dev.append(dev_tagged)

                if dev_count % 50 == 0:
                    print('~~tagging dev after mini iteration ', i)
                    print(
                        '~~len(plain_mini_dev):{}, len(tagged_dev):{}'.format(
                            len(plain_mini_dev), len(tagged_dev)))
                    print('~~dev sentence', dev_count)
                    print(dev_tagged)
                    print('~~########################')
                dev_count += 1

            print()
            acc = self.compute_accuracy(mini_dev, tagged_dev)
            print(acc)
            results_file.write(str(i) + '\t' + str(acc) + '\n')
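
The compute_accuracy helper called on the last line is not included in this listing. A plausible token-level accuracy, consistent with how it is called here (gold sentences and tagged sentences as parallel lists of (word, tag) pairs), is sketched below as a free function; this is an assumption, not the project's actual helper.

def compute_accuracy(gold_sents, tagged_sents):
    # Fraction of tokens whose predicted tag matches the gold tag, computed
    # over parallel lists of (word, tag) sentences.
    correct = total = 0
    for gold_sent, tagged_sent in zip(gold_sents, tagged_sents):
        for (_, gold_tag), (_, pred_tag) in zip(gold_sent, tagged_sent):
            correct += int(gold_tag == pred_tag)
            total += 1
    return correct / total if total else 0.0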
Example 8
 def train(self, train_data, dev_data, average=False, to_be_ablated=None):
     if to_be_ablated:
         global ablation
         ablation = [to_be_ablated[0]]
     print("Training...")
     iterations = 4
     self.tagset = self.get_tagset(train_data)
     if average:
         batch = 100
         counter = 0
         big_predicted_feature_vector = Vector()
         big_gold_feature_vector = Vector()
         for iteration in range(iterations):
             for i in tqdm(range(len(train_data))):
                 sent = train_data[i]
                 tag_sequence_gold = [pair[1] for pair in sent.snt]
                 tag_sequence_predicted, predicted_feature_vector = self.viterbi(
                     sent)
                 gold_feature_vector = self.get_gold_feature_vector(sent)
                 counter = counter + 1
                 if counter % batch == 0:
                     self.weights += big_gold_feature_vector
                     self.weights -= big_predicted_feature_vector
                     big_predicted_feature_vector = Vector()
                     big_gold_feature_vector = Vector()
                 else:
                     big_gold_feature_vector += gold_feature_vector
                     big_predicted_feature_vector += predicted_feature_vector
     else:
         for iteration in range(iterations):
             for i in tqdm(range(len(train_data))):
                 sent = train_data[i]
                 tag_sequence_gold = [pair[1] for pair in sent.snt]
                 tag_sequence_predicted, predicted_feature_vector = self.viterbi(
                     sent)
                 gold_feature_vector = self.get_gold_feature_vector(sent)
                 self.weights += gold_feature_vector
                 self.weights -= predicted_feature_vector
     print("Getting accuracy on dev set...")
     acc = self.get_accuracy(dev_data)
     print(acc)
Example 9
 def __init__(self):
     self.tagset = None
     self.weights = Vector()
Example 10
class Perceptron_POS_Tagger(object):
    def __init__(self):
        self.tagset = None
        self.weights = Vector()

    def tag(self, test_data):
        results = []
        for sent in tqdm(test_data):
            add_slots = Sentence([[sent.snt[i], []]
                                  for i in range(len(sent.snt))])
            tag_sequence_predicted, predicted_feature_vector = self.viterbi(
                add_slots)
            new_sent = Sentence([[sent.snt[i], tag_sequence_predicted[i]]
                                 for i in range(len(sent.snt))])
            results.append(new_sent)
        return results

    def train(self, train_data, dev_data, average=False, to_be_ablated=None):
        if to_be_ablated:
            global ablation
            ablation = [to_be_ablated[0]]
        print("Training...")
        iterations = 4
        self.tagset = self.get_tagset(train_data)
        if average:
            batch = 100
            counter = 0
            big_predicted_feature_vector = Vector()
            big_gold_feature_vector = Vector()
            for iteration in range(iterations):
                for i in tqdm(range(len(train_data))):
                    sent = train_data[i]
                    tag_sequence_gold = [pair[1] for pair in sent.snt]
                    tag_sequence_predicted, predicted_feature_vector = self.viterbi(
                        sent)
                    gold_feature_vector = self.get_gold_feature_vector(sent)
                    counter = counter + 1
                    if counter % batch == 0:
                        self.weights += big_gold_feature_vector
                        self.weights -= big_predicted_feature_vector
                        big_predicted_feature_vector = Vector()
                        big_gold_feature_vector = Vector()
                    else:
                        big_gold_feature_vector += gold_feature_vector
                        big_predicted_feature_vector += predicted_feature_vector
        else:
            for iteration in range(iterations):
                for i in tqdm(range(len(train_data))):
                    sent = train_data[i]
                    tag_sequence_gold = [pair[1] for pair in sent.snt]
                    tag_sequence_predicted, predicted_feature_vector = self.viterbi(
                        sent)
                    gold_feature_vector = self.get_gold_feature_vector(sent)
                    self.weights += gold_feature_vector
                    self.weights -= predicted_feature_vector
        print("Getting accuracy on dev set...")
        acc = self.get_accuracy(dev_data)
        print(acc)

    def get_accuracy(self, dev_data):
        gold = list()
        predicted = list()
        for sent in tqdm(dev_data):
            sentence = [pair[0] for pair in sent.snt]
            tag_sequence_gold = [pair[1] for pair in sent.snt]
            tag_sequence_predicted, predicted_feature_vector = self.viterbi(
                sent)
            gold.extend(tag_sequence_gold)
            predicted.extend(tag_sequence_predicted)
        return sum(1 for x, y in zip(gold, predicted) if x == y) / len(gold)

    def get_tagset(self, train_data):
        tag_set = set()
        for sent in train_data:
            tags = [pair[1] for pair in sent.snt]
            tag_set.update(tags)
        tag_set = list(tag_set)
        tagset_dict = dict()
        for i in range(len(tag_set)):
            tagset_dict[i] = tag_set[i]
        return tagset_dict

    def create_feature_vector(self, features, curr_tag, prev_tag):
        my_vector = Vector()
        my_vector.v[("ptag=" + prev_tag, curr_tag)] = 1  # prev tag
        for feature in features:
            my_vector.v[(feature, curr_tag)] = 1
        if ablation:
            key_to_ablated = None
            for k in my_vector.v.keys():
                if k[0].startswith(ablation[0]):
                    key_to_ablated = k
            if key_to_ablated:
                del my_vector.v[key_to_ablated]
        return my_vector

    def viterbi(self, sentence):
        trellis = Trellis(len(self.tagset), len(sentence.snt))  # trellis
        for i in range(len(trellis.columns)):  # i is index of curr word
            column = trellis.columns[i]
            features = sentence.features(sentence, i)
            for j in range(len(column.states)):  # j is index of curr tag
                if i == 0:
                    my_feature_vector = self.create_feature_vector(
                        features, self.tagset[j], "__START__")
                    score = self.weights.dot(my_feature_vector)
                    column.states[j] = State(my_feature_vector, score,
                                             self.tagset[j], None)
                else:
                    possible_states = []
                    prev_column = trellis.columns[i - 1]
                    for z in range(len(
                            prev_column.states)):  # z is index of prev tag
                        prev_state = prev_column.states[z]
                        my_feature_vector = self.create_feature_vector(
                            features, self.tagset[j], self.tagset[z])
                        score = prev_state.score + self.weights.dot(
                            my_feature_vector)
                        possible_states.append(
                            State(my_feature_vector, score, self.tagset[j],
                                  prev_state))
                    possible_scores = np.array(
                        [state.score for state in possible_states])
                    index_best_score = possible_scores.argmax()
                    column.states[j] = possible_states[index_best_score]
        last_column = trellis.columns[len(sentence.snt) - 1]
        last_column_scores = np.array(
            [state.score for state in last_column.states])
        best_final_index = last_column_scores.argmax()
        best_final_state = last_column.states[best_final_index]
        tag_sequence_predicted, predicted_feature_vector = self.get_predicted_sequence(
            best_final_state)
        return tag_sequence_predicted, predicted_feature_vector

    def get_predicted_sequence(self, final_state):
        tag_seq = list()
        final_feature_vector = Vector()
        my_state = final_state
        while my_state:
            tag_seq.append(my_state.current_tag)
            final_feature_vector += my_state.feature_vector
            my_state = my_state.prev_state
        tag_seq.reverse()
        return tag_seq, final_feature_vector

    def get_gold_feature_vector(self, sent):
        feature_vector_gold = Vector()
        for i in range(len(sent.snt)):
            features = sent.features(sent, i)
            if i == 0:
                local_feature = self.create_feature_vector(
                    features, sent.snt[i][1], "__START__")
            else:
                local_feature = self.create_feature_vector(
                    features, sent.snt[i][1], sent.snt[i - 1][1])
            feature_vector_gold += local_feature
        return feature_vector_gold
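
The viterbi method above fills a Trellis whose columns (one per word) each hold one State per candidate tag, indexed as trellis.columns[i].states[j]. Neither container is defined in this listing; the layout below is a hypothetical sketch consistent with how viterbi constructs and indexes them (see the State sketch after Example 2).

class Column(object):
    def __init__(self, num_tags):
        # One slot per candidate tag, filled in with a State during decoding.
        self.states = [None] * num_tags


class Trellis(object):
    def __init__(self, num_tags, num_words):
        # One column per word of the sentence being decoded.
        self.columns = [Column(num_tags) for _ in range(num_words)]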
Example 11
    def train(self, train_data, gold_dev_data, plain_dev_data, iterations):
        ''' Implement the Perceptron training algorithm here.
        '''
        alpha = Vector({})
        alpha_sum = Vector({})
        print("length of training data:", len(train_data),
              "\nstart training...")
        for t in range(iterations):
            print("\niter:", t, "\n")
            for i in range(len(train_data)):
                z = self.tag(Sentence(self.raw_sentence(train_data[i].snt)),
                             alpha)
                curr_tags = Sentence(
                    self.comb_word_tag(self.raw_sentence(train_data[i].snt),
                                       z))
                local_features_list, global_feature_z = curr_tags.features()
                true_local_features, true_global = train_data[i].features()
                if z != self.true_tag(train_data[i].snt):
                    alpha += true_global - global_feature_z
                alpha_sum += alpha
                if i in (499, 999, 9999, 24999):
                    print("training size:", i + 1,
                          "acc on dev with regular alpha:",
                          self.acc_dev(gold_dev_data, plain_dev_data, alpha))
                    print(
                        "acc with avg alpha:",
                        self.acc_dev(
                            gold_dev_data, plain_dev_data,
                            self.average_alpha(alpha_sum, t, train_data, i)))
                if i == len(train_data) - 1:
                    final_avg_alpha = self.average_alpha(
                        alpha_sum, t, train_data, i)
                    print("training size:", i + 1,
                          "acc on dev with regular alpha:",
                          self.acc_dev(gold_dev_data, plain_dev_data, alpha))
                    print(
                        "acc with avg alpha:",
                        self.acc_dev(gold_dev_data, plain_dev_data,
                                     final_avg_alpha))
                    print("\n")
            # acc_dev = self.acc_dev(gold_dev_data, plain_dev_data, alpha)
            # print("iter", t, ": acc on dev with regular alpha:", acc_dev)

        # print("calculate acc with average alpha...")
        # average_alpha = Vector.__rmul__(average_alpha, 1 / (len(train_data) * iterations))
        # acc_avg = self.acc_dev(gold_dev_data, plain_dev_data, average_alpha)
        # print("acc on dev with average alpha:", acc_avg)
        return (alpha, final_avg_alpha)
Example 12
 def average_alpha(self, alpha_sum, num_iters, train_data, i):
     # Average over every per-sentence update seen so far: num_iters
     # completed passes over train_data plus i + 1 sentences of the
     # current pass.
     avg_alpha = (1 / (len(train_data) * num_iters + i + 1)) * alpha_sum
     return avg_alpha
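
The divisor is the number of per-sentence updates folded into alpha_sum so far: each completed pass contributes len(train_data) updates and the current pass contributes i + 1 more. A quick check with hypothetical sizes:

# Hypothetical sizes: 25,000 training sentences, one completed pass, currently
# at sentence index i == 4999 of the second pass.
train_size = 25000
completed_passes = 1
i = 4999
updates_so_far = train_size * completed_passes + i + 1
assert updates_so_far == 30000  # alpha_sum gets scaled by 1 / 30000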
Example 13
    def tag(self, sentence, alpha):
        ''' Implement the Viterbi decoding algorithm here.
        '''
        z = []
        b_matrix = []
        score_matrix = []
        score_start = []
        start_word = sentence.snt[0]
        for tag in Perceptron_POS_Tagger.tag_list:
            start_feature = Vector(sentence.start_word_feature(
                start_word, tag))
            score_start.append(alpha.dot(start_feature))
        score_matrix.append(score_start)
        b_matrix.append([0] * len(Perceptron_POS_Tagger.tag_list))
        if len(sentence.snt) >= 4:
            for i in range(1, len(sentence.snt) - 2):
                score = []
                word = sentence.snt[i]
                b = []
                for j in range(len(Perceptron_POS_Tagger.tag_list)):
                    score_max = float('-inf')
                    score_max_index = 0
                    for k in range(len(Perceptron_POS_Tagger.tag_list)):
                        feature = Vector(
                            sentence.middle_word_feature(
                                word, Perceptron_POS_Tagger.tag_list[j],
                                Perceptron_POS_Tagger.tag_list[k]))
                        if (score_matrix[i - 1][k] +
                                alpha.dot(feature)) > score_max:
                            score_max = score_matrix[i -
                                                     1][k] + alpha.dot(feature)
                            score_max_index = k
                    score.append(score_max)
                    b.append(score_max_index)
                b_matrix.append(b)
                score_matrix.append(score)
        if len(sentence.snt) >= 3:
            end_word = sentence.snt[-2]
            b = []
            score = []
            for j in range(len(Perceptron_POS_Tagger.tag_list)):
                score_max = float('-inf')
                score_max_index = 0
                for k in range(len(Perceptron_POS_Tagger.tag_list)):
                    feature = Vector(
                        sentence.end_word_feature(
                            end_word, Perceptron_POS_Tagger.tag_list[j],
                            Perceptron_POS_Tagger.tag_list[k]))

                    if (score_matrix[len(sentence.snt) - 3][k] +
                            alpha.dot(feature)) > score_max:
                        score_max = score_matrix[len(sentence.snt) -
                                                 3][k] + alpha.dot(feature)
                        score_max_index = k
                score.append(score_max)
                b.append(score_max_index)
            score_matrix.append(score)
            b_matrix.append(b)

        end_tag_index = score_matrix[-1].index(max(score_matrix[-1]))

        z_index = [end_tag_index]
        z.append(Perceptron_POS_Tagger.tag_list[z_index[-1]])
        for i in range(len(sentence.snt) - 2):
            z_index.append(b_matrix[len(sentence.snt) - i - 2][z_index[-1]])
            z.append(Perceptron_POS_Tagger.tag_list[z_index[-1]])
        z = list(reversed(z))
        z.append(sentence.snt[-1])
        return z
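
The final loop recovers the tag sequence right to left through the backpointer matrix: starting from the best tag in the last score row, each step reads the stored index of the best previous tag. A toy walk-through with made-up values (two tags and a four-item sentence, i.e. three tagged positions plus the trailing end marker that the method appends verbatim):

tag_list = ['N', 'V']
snt_len = 4
b_matrix = [[0, 0],   # position 0: start row, no real backpointers
            [1, 0],   # position 1: best previous-tag index for each current tag
            [0, 1]]   # position 2: the word before the end marker
end_tag_index = 1     # suppose tag 'V' wins in the last score row

z_index = [end_tag_index]
z = [tag_list[z_index[-1]]]           # ['V']
for i in range(snt_len - 2):          # walk backpointer rows 2, then 1
    z_index.append(b_matrix[snt_len - i - 2][z_index[-1]])
    z.append(tag_list[z_index[-1]])
z = list(reversed(z))                 # left-to-right order
# z == ['N', 'V', 'V']; the method then appends the raw end-marker token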