Exemplo n.º 1
0
def get_feature_ids(feat_list, gold_tags, output_tags):
    """Build per-word (feature, tag) ids for the gold and the output tagging.

    ``feat_list`` is consumed one word at a time through
    ``perc.feats_for_word``, once per entry of ``gold_tags``.  For every word
    after the first, the bare bigram feature ``'B'`` is rewritten to
    ``'B:<previous tag>'`` (previous gold tag on the gold side, previous
    output tag on the output side).  All other features, and ``'B'`` on the
    first word, are used unchanged.

    Returns:
        A pair ``(gold_ids, output_ids)``.  Each is a list holding one sublist
        per word, and each sublist holds ``(feature, tag)`` tuples.
    """
    # Cut the flat feature list into one sublist per word.
    cursor = 0
    per_word_feats = []
    for _ in gold_tags:
        cursor, word_feats = perc.feats_for_word(cursor, feat_list)
        per_word_feats.append(word_feats)

    gold_ids = []
    output_ids = []
    for pos, word_feats in enumerate(per_word_feats):
        gold_word_ids = []
        output_word_ids = []
        for feat in word_feats:
            if pos > 0 and feat == 'B':
                # The bigram feature is conditioned on the previous tag.
                gold_name = '{0}:{1}'.format(feat, gold_tags[pos - 1])
                output_name = '{0}:{1}'.format(feat, output_tags[pos - 1])
            else:
                gold_name = output_name = feat

            gold_word_ids.append((gold_name, gold_tags[pos]))
            output_word_ids.append((output_name, output_tags[pos]))

        gold_ids.append(gold_word_ids)
        output_ids.append(output_word_ids)

    return gold_ids, output_ids
Exemplo n.º 2
0
def perc_train(train_data, tagset, numepochs):
    """Train perceptron weights over ``train_data`` for ``numepochs`` epochs.

    Each item of ``train_data`` is unpacked as ``(labeled_list, feat_list)``.
    The size of the training set, the epoch index and every 100th sentence
    counter are printed to stdout as progress output.

    Returns the ``defaultdict(int)`` weight vector that is handed to
    ``perc.perc_test`` and ``updateWeights``.

    NOTE(review): ``x`` and ``updateWeights`` are defined outside this block.
    Presumably ``x(labels, j, 2)`` extracts the third field (the gold tag) of
    ``labels[j]`` and ``updateWeights`` mutates ``feat_vec`` in place --
    confirm against their definitions.
    """
    print len(train_data)
    feat_vec = defaultdict(int)
    # The first tag of the tagset is passed to perc_test as its default tag.
    defaultTag = tagset[0]
    for i in range(numepochs):
        print i
        k = 0
        # NOTE(review): feat_index is reset once per epoch, not per sentence,
        # so the index returned for one sentence is reused as the start index
        # into the next sentence's feat_list. Looks unintended -- confirm.
        feat_index = 0
        for (labeled_list, feat_list) in train_data:
            if k % 100 == 0: print "     ", k
            k += 1
            # Tag sequence predicted with the current weights.
            z = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                               defaultTag)

            # Copy the labels so the caller's list is not modified, fetch one
            # word's worth of features, and prepend a start sentinel to both
            # sequences so that labels and z line up from index 1 onward.
            labels = copy.deepcopy(labeled_list)
            (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
            labels.insert(0, '_B-1 _B-1 _B-1')
            z.insert(0, '_B-1')

            # Update weights wherever the predicted tag z[j] differs from the
            # value x() extracts from labels[j].
            # NOTE(review): range(1, N - 1) stops before the last index, so
            # the final word of the sentence is never compared -- confirm
            # whether that is intended.
            N = len(labels)
            for j in range(1, N - 1):
                if x(labels, j, 2) != z[j]:
                    updateWeights(feat_vec, labels, z, j, feats)
    return feat_vec
Exemplo n.º 3
0
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger and return its weight vector.

    Args:
        train_data: iterable of ``(label_list, feat_list)`` pairs.  The third
            whitespace-separated field of every ``label_list`` entry is read
            as the gold tag.
        tagset: sequence of tags; ``tagset[0]`` is the default tag handed to
            ``perc.perc_test``.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.

    Raises:
        ValueError: if the predicted sequence and the gold sequence differ in
            length.

    A progress line is written to stderr after each epoch.
    """
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    # A dedicated epoch counter: the word loop must not reuse this name, or
    # the progress message reports a word position instead of the epoch.
    for epoch in range(numepochs):
        for (label_list, feat_list) in train_data:
            cur = perc.perc_test(feat_vec, label_list, feat_list, tagset,
                                 default_tag)
            gold = [entry.split()[2] for entry in label_list]
            if cur == gold:
                continue
            if len(cur) != len(gold):
                raise ValueError(
                    "output length is not the same with the input sentence"
                )
            feat_index = 0
            # Perceptron update: walk the real words only (no sentinel
            # positions), fetching each word's features in order.
            for cur_tag, gold_tag in zip(cur, gold):
                (feat_index,
                 features) = perc.feats_for_word(feat_index, feat_list)
                if cur_tag == gold_tag:
                    # -1 and +1 would hit the same key and cancel out.
                    continue
                # NOTE(review): the bigram feature 'B' is updated under the
                # bare key ('B', tag); other trainers key it as
                # 'B:<previous tag>'. Confirm which form perc_test reads.
                for f in features:
                    feat_vec[(f, cur_tag)] -= 1
                    feat_vec[(f, gold_tag)] += 1
        sys.stderr.write("iteration %d done.\n" % epoch)

    return feat_vec
Exemplo n.º 4
0
def perc_train(train_data, tagset, numepochs, pos_dict):
    """Train perceptron tagging weights for ``numepochs`` passes.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.  The
            third whitespace-separated field of every ``labeled_list`` entry
            is read as the gold tag.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag
            handed to ``perc.perc_test``.
        numepochs: number of passes over ``train_data``.  The value supplied
            by the caller is honoured (it is no longer overwritten with a
            hard-coded 20).
        pos_dict: forwarded unchanged to ``perc.perc_test``.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.

    Raises:
        ValueError: if ``tagset`` is empty, or a word yields no features.

    Side effects: prints one progress line per epoch and, after each epoch,
    writes the current weights via ``perc.perc_write_to_file`` to
    ``model_<epoch>``.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    for t in range(numepochs):
        # Single-argument form prints identically on Python 2 and 3.
        print('Iteration# %s  is processing now.' % t)
        for (labeled_list, feat_list) in train_data:
            labels = copy.deepcopy(labeled_list)
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                    default_tag, pos_dict)
            feat_index = 0
            # Compare prediction and gold tag word by word.
            for i, _ in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                if len(feats) == 0:
                    sys.stderr.write(" ".join(labels) + " " +
                                     " ".join(feat_list) + " \n\n")
                    raise ValueError("features do not align with input sentence")

                label = labels[i].split()[2]
                if i > 0:
                    label_pre = labels[i - 1].split()[2]
                    # Compare tag *values*; an identity test on strings is
                    # almost always true and triggered needless updates.
                    if output[i - 1] != label_pre or output[i] != label:
                        for feat in feats:
                            if feat[0] == 'B':  # bigram feature
                                feat_out = feat + ":" + output[i - 1]  # "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # "B:<previous label>"
                                # NOTE(review): both bigram rows are updated
                                # for both tags; the textbook update touches
                                # only (feat_out, output) and (feat_lab,
                                # label). Kept as written -- confirm intent.
                                feat_vec[feat_out, output[i]] -= 1
                                feat_vec[feat_out, label] += 1
                                feat_vec[feat_lab, output[i]] -= 1
                                feat_vec[feat_lab, label] += 1
                            else:  # unigram features
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1
                else:
                    # First word of the sentence: the previous tag is the
                    # start sentinel on both sides.
                    label_pre = 'B_-1'
                    for feat in feats:
                        if feat[0] == 'B':  # bigram feature
                            feat = feat + ":" + label_pre
                        feat_vec[feat, output[i]] -= 1
                        feat_vec[feat, label] += 1

        perc.perc_write_to_file(feat_vec, 'model_' + str(t))

    return feat_vec
Exemplo n.º 5
0
def global_feature_vector(feat_list, tag_list):
    """Count every (feature, tag) pair of a tagged sentence.

    ``feat_list`` is consumed one word at a time through
    ``perc.feats_for_word``, once per entry of ``tag_list``; each feature of a
    word is paired with that word's tag.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its occurrence
        count.
    """
    counts = defaultdict(int)
    cursor = 0

    for tag in tag_list:
        cursor, word_feats = perc.feats_for_word(cursor, feat_list)
        for name in word_feats:
            counts[(name, tag)] += 1

    return counts
def perc_train(train_data, tagset, n):
    """Train a step-weighted (averaged) perceptron tagger.

    Args:
        train_data: sequence of ``(labeled_list, feat_list)`` pairs.  The
            third whitespace-separated field of every ``labeled_list`` entry
            is read as the gold tag.
        tagset: sequence of tags; ``tagset[0]`` is the default tag handed to
            ``perc.perc_test``.
        n: number of passes over ``train_data``.

    Returns:
        The averaged weight vector, a ``defaultdict`` keyed by
        ``(feature, tag)``.  Every update is scaled by the fraction of
        sentence visits still remaining when it is made.

    Side effects: rewrites a progress line on stdout for every sentence and,
    after each epoch, writes the averaged weights via
    ``perc.perc_write_to_file`` to ``models/n<epoch>avg_params.model``.
    """
    import sys  # local import: only needed for the progress line

    feat_vec = defaultdict(int)
    feat_avg_vec = defaultdict(int)
    default_tag = tagset[0]
    num_sentence = len(train_data)
    total_steps = n * num_sentence
    count = 0  # sentences visited so far, across all epochs
    for iteration in range(n):
        for sent_index, (labeled_list, feat_list) in enumerate(train_data, 1):
            # Same bytes as the former trailing-comma print statement.
            sys.stdout.write("\rIteration: %d/%d. Sentence: %d/%d\t\r" %
                             (iteration + 1, n, sent_index, num_sentence))

            # Tags predicted with the current weights vs. the gold tags.
            estimated_tags = perc.perc_test(feat_vec, labeled_list, feat_list,
                                            tagset, default_tag)
            standard_tags = [item.split()[2] for item in labeled_list]

            if estimated_tags != standard_tags:
                st_prev = es_prev = 'B_-1'
                index = 0
                step = (total_steps - count) * 1.0 / total_steps
                for (st_tag, es_tag) in zip(standard_tags, estimated_tags):
                    (index, feats) = perc.feats_for_word(index, feat_list)
                    tag_differs = st_tag != es_tag
                    bigram_differs = tag_differs or st_prev != es_prev

                    for feat in feats:
                        if feat == 'B':
                            # Bigram feature: keyed on the previous tag.
                            if bigram_differs:
                                feat_vec[('B:' + es_prev, es_tag)] -= 1
                                feat_vec[('B:' + st_prev, st_tag)] += 1
                                feat_avg_vec[('B:' + es_prev, es_tag)] -= step
                                feat_avg_vec[('B:' + st_prev, st_tag)] += step
                        elif tag_differs:
                            feat_vec[(feat, es_tag)] -= 1
                            feat_vec[(feat, st_tag)] += 1
                            feat_avg_vec[(feat, es_tag)] -= step
                            feat_avg_vec[(feat, st_tag)] += step

                    # Advance the previous tags for EVERY word. Doing this
                    # only after a mismatch left stale tags behind, so later
                    # bigram keys named the wrong previous tag.
                    st_prev, es_prev = st_tag, es_tag
            count += 1
        perc.perc_write_to_file(
            feat_avg_vec, 'models/n' + str(iteration) + 'avg_params.model')

    return feat_avg_vec
Exemplo n.º 7
0
def perc_train(train_data, tagset, n):
    """Train perceptron tagging weights for ``n`` passes over ``train_data``.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag
            handed to ``perc.perc_test``.
        n: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.

    Raises:
        ValueError: if ``tagset`` is empty, or a word yields no features.

    The gold tags come from ``get_truth(labeled_list)``, which is defined
    outside this block (presumably it returns one tag per word -- confirm).
    The number of mis-tagged sentences is written to stderr and logged after
    each epoch.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)

    for _ in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                    default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output == true_output:
                continue

            num_mistakes += 1
            # Pad both sequences so position i - 1 exists for the first word.
            output.insert(0, 'B_-1')
            output.append('B_+1')
            true_output.insert(0, 'B_-1')
            true_output.append('B_+1')
            feat_index = 0
            for i in range(1, len(output) - 1):
                (feat_index,
                 feats) = perc.feats_for_word(feat_index, feat_list)
                if len(feats) == 0:
                    sys.stderr.write(" ".join(labeled_list) + " " +
                                     " ".join(feat_list) + " \n\n")
                    raise ValueError(
                        "features do not align with input sentence")

                # Collect this word's net changes first so that pairs which
                # cancel out (same feature, same tag) never touch feat_vec.
                feat_vec_update = defaultdict(int)
                for feat in feats:
                    if feat == 'B':
                        # The bigram feature depends on the previous tag, so
                        # the predicted and the gold side use different keys.
                        output_feat = 'B:' + output[i - 1]
                        truth_feat = 'B:' + true_output[i - 1]
                    else:
                        output_feat = truth_feat = feat

                    feat_vec_update[output_feat, output[i]] -= 1
                    feat_vec_update[truth_feat, true_output[i]] += 1

                for (upd_feat, upd_tag), delta in feat_vec_update.items():
                    if delta != 0:
                        feat_vec[upd_feat, upd_tag] += delta
                        logging.info(
                            "updating feat_vec with feature_id: (%s, %s) value: %d",
                            upd_feat, upd_tag, delta)
        sys.stderr.write("number of mistakes: %d\n" % num_mistakes)
        logging.info("current number of mistakes: %d", num_mistakes)
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron tagger.

    Args:
        train_data: sequence of ``(label_list, feat_list)`` pairs.  The third
            whitespace-separated field of every ``label_list`` entry is read
            as the gold tag.
        tagset: sequence of tags; ``tagset[0]`` is the default tag handed to
            ``perc.perc_test``.
        numepochs: number of passes over ``train_data``.

    Returns:
        The averaged weight vector, a ``defaultdict`` keyed by
        ``(feature, tag)``.

    Raises:
        ValueError: if the predicted sequence and the gold sequence differ in
            length.

    Averaging: instead of summing every intermediate weight vector and
    dividing by ``numepochs * T``, each update is scaled once by
    ``step / (numepochs * T)``, where ``step`` is the number of sentence
    visits still to come (the current one included).  A progress line is
    written to stderr after each epoch.
    """
    T = float(len(train_data))
    total_steps = numepochs * T
    step = total_steps
    # Averaged weights (returned) and the plain weights used for decoding.
    feat_vec_cache = defaultdict(int)
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    # A dedicated epoch counter: the word loop must not reuse this name, or
    # the progress message reports a word position instead of the epoch.
    for epoch in range(numepochs):
        for (label_list, feat_list) in train_data:
            # Best tag sequence under the current weights vs. the gold tags.
            cur = perc.perc_test(feat_vec, label_list, feat_list, tagset,
                                 default_tag)
            gold = [entry.split()[2] for entry in label_list]
            if cur != gold:
                if len(cur) != len(gold):
                    raise ValueError(
                        "output length is not the same with the input sentence"
                    )
                # Explicit parentheses: ``step / numepochs * T`` evaluates as
                # ``(step / numepochs) * T``, which is not the average.
                weight = step / total_steps
                feat_index = 0
                # Walk the real words only (no sentinel positions).
                for cur_tag, gold_tag in zip(cur, gold):
                    (feat_index,
                     features) = perc.feats_for_word(feat_index, feat_list)
                    if cur_tag == gold_tag:
                        # -1 and +1 would hit the same key and cancel out.
                        continue
                    # NOTE(review): the bigram feature 'B' is updated under
                    # the bare key ('B', tag); other trainers key it as
                    # 'B:<previous tag>'. Confirm which form perc_test reads.
                    for f in features:
                        feat_vec[(f, cur_tag)] -= 1
                        feat_vec[(f, gold_tag)] += 1
                        feat_vec_cache[(f, cur_tag)] -= weight
                        feat_vec_cache[(f, gold_tag)] += weight
            step -= 1
        sys.stderr.write("iteration %d done.\n" % epoch)
    return feat_vec_cache
Exemplo n.º 9
0
def retrieve_feature(output, feat_list):
    """Count the (feature, tag) pairs produced by a tag sequence.

    The first and the last entry of ``output`` are skipped; for each position
    in between, one word's features are taken from ``feat_list`` via
    ``perc.feats_for_word`` and paired with the tag at that position.

    Returns:
        A ``FeatureVector`` whose entry for ``(feature, tag)`` is incremented
        once per occurrence.

    Raises:
        ValueError: if a word yields no features.
    """
    counts = FeatureVector()
    cursor = 0
    stop = len(output) - 1

    for pos in range(1, stop):
        cursor, word_feats = perc.feats_for_word(cursor, feat_list)

        if len(word_feats) == 0:
            raise ValueError("Returned empty feature")

        for name in word_feats:
            counts[name, output[pos]] += 1

    return counts
def perc_avg_train(train_data, tagset, numepochs):
    """Train an averaged perceptron tagger by summing the weights per sentence.

    After every sentence the current weight vector is added to a running
    sum; at the end the sum is divided by ``numepochs * len(train_data)``.

    Args:
        train_data: sequence of ``(labeled_list, feat_list)`` pairs.  The
            third whitespace-separated field of every ``labeled_list`` entry
            is read as the gold tag.
        tagset: sequence of tags; ``tagset[0]`` is the default tag handed to
            ``perc.perc_test``.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict(float)`` mapping ``(feature, tag)`` to its averaged
        weight.

    Start, duration and mistake count of every epoch are printed to stdout.
    """
    weights = defaultdict(int)
    summed = defaultdict(float)
    default_tag = tagset[0]

    for epoch in range(numepochs):
        count_mistake = 0
        print(f"Running on epoch {epoch+1}......")
        tic = time.time()
        for labeled_list, feat_list in train_data:
            predicted = perc.perc_test(weights, labeled_list, feat_list,
                                       tagset, default_tag)
            gold = [entry.split()[2] for entry in labeled_list]

            if predicted != gold:
                count_mistake += 1
                cursor = 0

                for pos, pred_tag in enumerate(predicted):
                    true_tag = gold[pos]
                    cursor, word_feats = perc.feats_for_word(cursor, feat_list)
                    for feat in word_feats:
                        if pos > 0 and feat == 'B':
                            # Bigram feature: keyed on the previous tag, and
                            # updated when either the previous or the current
                            # tag is wrong.
                            true_prev = gold[pos - 1]
                            pred_prev = predicted[pos - 1]
                            if true_prev != pred_prev or pred_tag != true_tag:
                                weights['B:' + true_prev, true_tag] += 1
                                weights['B:' + pred_prev, pred_tag] -= 1
                        elif pred_tag != true_tag:
                            # Every other feature, and 'B' on the first word.
                            weights[feat, true_tag] += 1
                            weights[feat, pred_tag] -= 1

            # Add the current weights to the running sum after each sentence.
            for key, value in weights.items():
                summed[key] += value

        toc = time.time()
        print(
            f'Epoch {epoch+1} finished. Time cost on this epoch: {toc-tic}. Number of mistakes: {count_mistake}.'
        )

    # Turn the running sum into an average over all sentence visits.
    for key in summed:
        summed[key] = summed[key] / (numepochs * len(train_data))
    return summed
Exemplo n.º 11
0
def perc_train(train_data, tagset, n):
    """Train perceptron tagging weights for ``n`` passes over ``train_data``.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag
            handed to ``perc.perc_test``.
        n: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.

    Raises:
        ValueError: if ``tagset`` is empty, or a word yields no features.

    The gold tags come from ``get_truth(labeled_list)``, which is defined
    outside this block (presumably it returns one tag per word -- confirm).
    The number of mis-tagged sentences is written to stderr and logged after
    each epoch.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)

    for _ in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output == true_output:
                continue

            num_mistakes += 1
            # Pad both sequences so position i - 1 exists for the first word.
            output.insert(0, 'B_-1')
            output.append('B_+1')
            true_output.insert(0, 'B_-1')
            true_output.append('B_+1')
            feat_index = 0
            for i in range(1, len(output) - 1):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                if len(feats) == 0:
                    sys.stderr.write(" ".join(labeled_list) + " " + " ".join(feat_list) + " \n\n")
                    raise ValueError("features do not align with input sentence")

                # Collect this word's net changes first so that pairs which
                # cancel out (same feature, same tag) never touch feat_vec.
                feat_vec_update = defaultdict(int)
                for feat in feats:
                    if feat == 'B':
                        # The bigram feature depends on the previous tag, so
                        # the predicted and the gold side use different keys.
                        output_feat = 'B:' + output[i - 1]
                        truth_feat = 'B:' + true_output[i - 1]
                    else:
                        output_feat = truth_feat = feat

                    feat_vec_update[output_feat, output[i]] -= 1
                    feat_vec_update[truth_feat, true_output[i]] += 1

                for (upd_feat, upd_tag), delta in feat_vec_update.items():
                    if delta != 0:
                        feat_vec[upd_feat, upd_tag] += delta
                        logging.info("updating feat_vec with feature_id: (%s, %s) value: %d", upd_feat, upd_tag, delta)
        sys.stderr.write("number of mistakes: %d\n" % num_mistakes)
        logging.info("current number of mistakes: %d", num_mistakes)
    return feat_vec
Exemplo n.º 12
0
def perc_train(train_data, tagset, n):
	feat_vec = defaultdict(int)
	# insert your code here
	# please limit the number of iterations of training to n iterations
	
	n_sentences = len(train_data)
	for i in range (0,n):
		for j in range(0,n_sentences):
			print '{0}\r'.format("\rIteration: %d/%d. Sentence: %d/%d\t" %(i+1, n, j+1, n_sentences)),
			
			labeled_list = train_data[j][0]
			feat_list = train_data[j][1]	
	
			# Extract the labels from training data
			toutput = [tags.split(' ')[2] for tags in labeled_list ]	
			
			# Output from Viterbi Algorithm	
			zoutput = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, 'B-NP')
			
			# Compare outputs
			if toutput != zoutput:
				index = 0
				for k in range(0, len(zoutput)):							
					predicted_tag = zoutput[k]
					true_tag = toutput[k]
					(index, feats) = perc.feats_for_word(index, feat_list)
					for feat in feats:
						s1 = s2 = ''
						if feat == 'B':
							if k >= 1:
								zprevtag = zoutput[k-1]
								tprevtag = toutput[k-1]
							else:
								zprevtag = tprevtag = 'B_-1'
							s1 = (feat+':'+zprevtag, predicted_tag)
							s2 = (feat+':'+tprevtag, true_tag)
						else:
							s1 = (feat, predicted_tag)
							s2 = (feat, true_tag)
							
						if s1 != s2:
							feat_vec[s1] -= 1
							feat_vec[s2] += 1
	return feat_vec
Exemplo n.º 13
0
def perc_train(train_data, tagset, numepochs):
    """Train perceptron tagging weights for ``numepochs`` passes.

    Args:
        train_data: iterable of ``(label_list, feat_list)`` pairs.  Every
            ``label_list`` entry must consist of exactly three
            space-separated fields; the third is read as the gold tag.
        tagset: sequence of tags; ``tagset[0]`` is the default tag handed to
            ``perc.perc_test``.  The caller's tagset is used as given (it is
            no longer re-read from ``opts.tagsetfile`` for every sentence).
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.

    For every word whose predicted tag is wrong, all of the word's features
    and the bigram key ``'B:<previous tag>'`` (``'B_-1'`` before the first
    word) are penalised for the predicted tag and rewarded for the gold tag.
    """
    feat_vec = defaultdict(int)
    default_tag = tagset[0]

    for _ in range(numepochs):
        for sentence_data in train_data:
            label_list = sentence_data[0]
            feat_list = sentence_data[1]

            truetags = []
            for label in label_list:
                # Strict three-way unpack keeps the format check: a malformed
                # entry raises ValueError.
                (_word, _postag, chunktag) = label.split(" ")
                truetags.append(chunktag)

            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, default_tag)

            feat_index = 0
            for i, tru in enumerate(truetags):
                # Always advance through the feature list, even for words
                # that are tagged correctly.
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                argmax = argmaxtags[i]
                if argmax == tru:
                    continue

                # NOTE(review): if the word's features include the bare 'B',
                # this also updates ('B', tag), in addition to the
                # 'B:<previous tag>' key below. Confirm which form perc_test
                # reads.
                for f in feats_for_this_word:
                    feat_vec[f, argmax] -= 1
                    feat_vec[f, tru] += 1

                # Bigram update, keyed on the previous tag of each sequence.
                # NOTE(review): a wrong previous tag followed by a correct
                # current tag triggers no bigram update -- confirm intent.
                if i == 0:
                    argmaxprev = truprev = "B:B_-1"
                else:
                    argmaxprev = "B:" + argmaxtags[i - 1]
                    truprev = "B:" + truetags[i - 1]
                feat_vec[argmaxprev, argmax] -= 1
                feat_vec[truprev, tru] += 1

    return feat_vec
Exemplo n.º 14
0
def perc_train(train_data, tagset, numepochs):
    """Train perceptron weights with unigram, bigram and trigram tag features.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.  The
            third whitespace-separated field of every ``labeled_list`` entry
            is read as the gold tag.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag
            handed to ``perc.perc_test``.
        numepochs: number of passes over ``train_data``.  The value supplied
            by the caller is honoured (it is no longer overwritten with a
            hard-coded 1).

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.
        Bigram keys have the form ``'B:<prev>'`` and trigram keys
        ``'T:<prev-prev>/<prev>'``; ``'B_-1'`` / ``'B_-2'`` stand in for
        positions before the sentence start.

    Raises:
        ValueError: if ``tagset`` is empty, or a word yields no features.

    Prints the epoch number and a running sentence counter to stdout.
    """
    feat_vec = defaultdict(int)
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    default_tag = tagset[0]
    for t in range(numepochs):

        # Single-argument form prints identically on Python 2 and 3.
        print("Iteration# %s  is processing now." % t)
        counter = 0
        for (labeled_list, feat_list) in train_data:

            counter += 1
            print(counter)

            labels = copy.deepcopy(labeled_list)
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            feat_index = 0

            # Compare prediction and gold tag word by word.
            for i, _ in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)

                if len(feats) == 0:
                    sys.stderr.write(" ".join(labels) + " " + " ".join(feat_list) + " \n\n")
                    raise ValueError("features do not align with input sentence")

                fields = labels[i].split()
                label = fields[2]
                if i > 1:
                    pre_label = labels[i - 1].split()[2]
                    pre_pre_label = labels[i - 2].split()[2]

                    # Update if anything in the three-tag window is wrong.
                    if output[i - 2] != pre_pre_label or output[i - 1] != pre_label or output[i] != label:
                        for feat in feats:
                            if feat[0] == "B":  # bigram (and trigram) features
                                feat_out = "B:" + output[i - 1]  # "B:<previous output>"
                                feat_lab = "B:" + pre_label  # "B:<previous label>"
                                if output[i - 1] != pre_label and output[i] != label:
                                    feat_vec[feat_out, output[i]] -= 1
                                    feat_vec[feat_lab, output[i]] -= 1
                                    feat_vec[feat_out, label] += 1
                                    feat_vec[feat_lab, label] += 1
                                elif output[i - 1] == pre_label and output[i] != label:
                                    feat_vec[feat_lab, output[i]] -= 2
                                    feat_vec[feat_lab, label] += 2
                                elif output[i - 1] != pre_label and output[i] == label:
                                    pass
                                elif output[i - 1] == pre_label and output[i] == label:
                                    pass

                                # NOTE(review): the trigram update is applied
                                # twice when only the current tag is wrong.
                                # Kept as written -- confirm intent.
                                feat_out = "T:" + output[i - 2] + "/" + output[i - 1]
                                feat_lab = "T:" + pre_pre_label + "/" + pre_label
                                if output[i - 2] == pre_pre_label and output[i - 1] == pre_label and output[i] != label:
                                    feat_vec[feat_out, output[i]] -= 1
                                    feat_vec[feat_lab, label] += 1
                                feat_vec[feat_out, output[i]] -= 1
                                feat_vec[feat_lab, label] += 1

                            else:  # unigram features
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1

                elif i == 1:
                    # Second word: the trigram context starts at the sentinel.
                    pre_label = labels[i - 1].split()[2]
                    if output[i - 1] != pre_label or output[i] != label:
                        for feat in feats:
                            if feat[0] == "B":  # bigram feature
                                feat_out = "B:" + output[i - 1]  # "B:<previous output>"
                                feat_lab = "B:" + pre_label  # "B:<previous label>"
                                feat_vec[feat_out, output[i]] -= 1
                                feat_vec[feat_lab, label] += 1
                            else:  # unigram features
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1
                        feat_out = "T:B_-1/" + output[i - 1]
                        feat_lab = "T:B_-1/" + pre_label
                        feat_vec[feat_out, output[i]] -= 1
                        feat_vec[feat_lab, label] += 1
                else:
                    # First word: both context tags are sentinels.
                    for feat in feats:
                        if feat[0] == "B":  # bigram feature
                            feat = "B:B_-1"
                        feat_vec[feat, output[i]] -= 1
                        feat_vec[feat, label] += 1
                    feat = "T:B_-2/B_-1"
                    feat_vec[feat, output[i]] -= 1
                    feat_vec[feat, label] += 1

    return feat_vec
Exemplo n.º 15
0
def perc_train(train_data, tagset, n):
    """Train an averaged perceptron tagger for ``n`` passes.

    Every sentence is tagged with ``perc.perc_test`` (default tag ``'B-NP'``).
    When the prediction differs from the gold tags, each feature of each word
    is rewarded for the gold tag and penalised for the predicted tag.  The
    bigram feature ``'B'`` is keyed as ``'B:<previous tag>'``, with
    ``'B_-1'`` standing in before the first word.  After every sentence the
    current weights are added to a running sum.

    Returns:
        ``defaultdict(float)`` holding the running sum divided by
        ``n * len(train_data)``; entries whose sum is zero are removed.

    Progress is reported on stderr.
    """
    weights = defaultdict(int)
    summed = defaultdict(float)

    print >> sys.stderr, "training data ..."
    total = len(train_data)
    for epoch in range(0, n):
        for sent_no in range(0, total):
            print >> sys.stderr, '\r{0}'.format("Iteration: %d/%d. Sentence: %d/%d\t" %(epoch+1, n, sent_no+1, total)),

            sentence = train_data[sent_no]
            labeled_list = sentence[0]
            feat_list = sentence[1]

            # Gold tags: third space-separated field of every entry.
            gold_tags = [tags.split(' ')[2] for tags in labeled_list]

            # Tags predicted with the current weights.
            guessed_tags = perc.perc_test(weights, labeled_list, feat_list, tagset, 'B-NP')

            if gold_tags != guessed_tags:
                cursor = 0
                for pos, guessed in enumerate(guessed_tags):
                    gold = gold_tags[pos]
                    cursor, word_feats = perc.feats_for_word(cursor, feat_list)

                    # Previous tags, needed only by the bigram feature.
                    if pos >= 1:
                        prev_guessed = guessed_tags[pos - 1]
                        prev_gold = gold_tags[pos - 1]
                    else:
                        prev_guessed = prev_gold = 'B_-1'

                    for feat in word_feats:
                        if feat == 'B':
                            wrong_key = (feat + ':' + prev_guessed, guessed)
                            right_key = (feat + ':' + prev_gold, gold)
                        else:
                            wrong_key = (feat, guessed)
                            right_key = (feat, gold)

                        # Identical keys would cancel out; skip them.
                        if wrong_key != right_key:
                            weights[wrong_key] -= 1
                            weights[right_key] += 1

            # Add the current weights to the running sum after each sentence.
            for key in weights:
                summed[key] += weights[key]

    print >> sys.stderr, "\ndone"
    # Drop entries that sum to zero, then average the rest.
    zero_keys = [key for key in summed if summed[key] == 0]
    for key in zero_keys:
        del summed[key]
    for key in summed:
        summed[key] = summed[key]/(n*total)
    return summed
Exemplo n.º 16
0
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger and return the averaged weight vector.

    Each element of ``train_data`` is unpacked as ``(labeled_list,
    feat_list)``.  For every sentence the current tagging is obtained from
    ``perc.perc_test`` and compared word by word with the gold tag (third
    whitespace-separated field of the matching ``labeled_list`` entry).

    Weight changes are delegated to ``update_unigram_vect`` /
    ``update_bigram_vect``; the averaged vector is maintained through
    ``lazy_update_vect`` and ``final_lazy_update_vect`` together with the
    ``(sentence, epoch)`` stamps kept in ``tau_feat_vec``.  Those helpers are
    defined outside this function, so their exact effect is not visible here.

    Side effect: the non-averaged weights are written with
    ``perc.perc_write_to_file(feat_vec, 'model_feat_vec')``.

    Raises ``ValueError`` when ``tagset`` is empty or when
    ``perc.feats_for_word`` returns no features for a word.

    Returns ``avg_feat_vec`` after dividing every entry by
    ``numepochs * len(train_data)``.
    """
    feat_vec = defaultdict(float)
    avg_feat_vec = defaultdict(float)
    # (feature, tag) -> (sentence index, epoch index) of the last recorded touch
    tau_feat_vec = dict()

    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    default_tag = tagset[0]
    m = len(train_data) # length of training data
    for t in range(numepochs):
        print 'Iteration#',t,' is processing now.'
        for j, (labeled_list, feat_list) in enumerate(train_data):

            # labels is only read below; the copy keeps labeled_list untouched
            labels = copy.deepcopy(labeled_list)
            # print 'sentence[',j,']'
            # tag the sentence with the current (non-averaged) weights
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # compare current output and true result

            # every sentence except the last sentence of the last epoch
            if j != m - 1 or t != numepochs - 1:
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    # retrieve the feature for a word
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")

                    label = labels[i].split()[2]
                    if i > 0:
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:

                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"

                                # update when the tag or the previous-tag context differs
                                if output[i] != label or feat_out != feat_lab:

                                    # lazily bring the averaged vector up to date for both keys
                                    lazy_update_vect(feat_out, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                    lazy_update_vect(feat_lab, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)


                                    # update original feature vector, if feat_out == feat_lab perform 2nd type updating
                                    update_bigram_vect(feat_vec, avg_feat_vec, feat_out, feat_lab, output[i], label)

                                    # record the time of this touch for both keys
                                    # (the same entry is written twice when the keys coincide)
                                    tau_feat_vec[feat_out, output[i]] = (j, t)
                                    tau_feat_vec[feat_lab, label] = (j, t)



                            elif output[i] != label:
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)

                                # for U00 to U22 feature
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)

                                # record the time of this touch for both keys
                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label] = (j, t)


                    else:  # for i==0 case, all the first word in each sentence
                        # NOTE(review): the last-sentence branch below uses '_B-1' for
                        # this same position; confirm which marker perc_test expects.
                        label_pre = '_B-2'  # previous label will be denoted by _B-2
                        for feat in feats:
                            if feat[0] == 'B' and output[i] != label:
                                # bigram feature case
                                feat = feat + ":" + label_pre

                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)

                                update_bigram_vect(feat_vec, avg_feat_vec, feat, feat, output[i], label)

                                tau_feat_vec[feat, label] = (j, t)
                                tau_feat_vec[feat, output[i]] = (j, t)


                            elif output[i] != label:
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)

                                # for U00 to U22 feature
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)

                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label] = (j, t)


            else:
                # last sentence of the last epoch: the helper is called once for
                # the whole tau table before any update (presumably to flush all
                # pending lazy contributions; verify against its definition)
                final_lazy_update_vect(tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)

                # special case for the last sentence: plain updates, no tau bookkeeping
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    # retrieve the feature for a word
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")

                    label = labels[i].split()[2]
                    if i > 0:
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:

                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"
                                # NOTE(review): unlike the branch above, a differing
                                # previous-tag context alone does not trigger an update here.
                                if output[i] != label:
                                    # update original feature vector
                                    update_bigram_vect(feat_vec, avg_feat_vec, feat_out, feat_lab, output[i], label)

                            elif output[i] != label:
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)

                    else:
                        # for i==0 case, all the first word in each sentence
                        # NOTE(review): '_B-1' here versus '_B-2' in the branch above for
                        # the same position; confirm which marker perc_test expects.
                        label_pre = '_B-1'  # previous label is denoted by _B-1 in this branch
                        for feat in feats:
                            if feat[0] == 'B' and output[i] != label:
                                # bigram feature case
                                feat = feat + ":" + label_pre
                                update_bigram_vect(feat_vec, avg_feat_vec, feat, feat, output[i], label)

                            elif output[i] != label:
                                # for U00 to U22 feature
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)

        # end of iteration

    # averaging perceptron: divide the accumulated values by the number of
    # (epoch, sentence) steps
    for key in avg_feat_vec.keys():
        avg_feat_vec[key] = avg_feat_vec[key]/float(numepochs*m)
    # please limit the number of iterations of training to n iterations
    # the non-averaged weights are also saved under the name 'model_feat_vec'
    perc.perc_write_to_file(feat_vec, 'model_feat_vec')
    return avg_feat_vec
Exemplo n.º 17
0
def avg_perc_train(train_data, tagset, n):
    """Train an averaged perceptron tagger and return the averaged weights.

    Each element of ``train_data`` is unpacked as ``(labeled_list,
    feat_list)``.  For every sentence the prediction from ``perc.perc_test``
    is compared with ``get_truth(labeled_list)``; on a mismatch the weights
    of the features of every word are moved away from the predicted tag and
    towards the true tag.  The bare feature ``'B'`` is expanded to
    ``'B:<previous tag>'`` using the predicted / true previous tag, with
    ``'B_-1'`` standing in before the first word.

    Averaging is lazy: a weight is only folded into its running sum when it
    changes, and once more at the very end.  The result is exactly the mean
    of the weight vector as it stood after each processed sentence.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag
            handed to ``perc.perc_test``.
        n: number of passes over ``train_data``.

    Returns:
        The weight dictionary keyed by ``(feature, tag)``, holding the
        averaged (float) weights.

    Raises:
        ValueError: if ``tagset`` is empty or a word has no features.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]

    feat_vec = defaultdict(int)
    # weight_sums[key] holds the sum of feat_vec[key] over every step that
    # precedes first_pending[key]; later steps are still to be folded in.
    weight_sums = defaultdict(int)
    first_pending = {}

    num_updates = 0  # 1-based index of the sentence step being processed
    for epoch in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            num_updates += 1
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s" % (" ".join(output)))
            logging.info("truth: %s" % (" ".join(true_output)))
            if output == true_output:
                continue

            num_mistakes += 1
            # Pad both sequences so position i - 1 exists for the first word.
            output.insert(0, 'B_-1')
            output.append('B_+1')
            true_output.insert(0, 'B_-1')
            true_output.append('B_+1')

            feat_index = 0
            for i in range(1, len(output) - 1):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                if len(feats) == 0:
                    print >>sys.stderr, " ".join(labeled_list), " ".join(feat_list), "\n"
                    raise ValueError("features do not align with input sentence")

                # Collect the net change per key first: when prediction and
                # truth agree for a feature the -1 and +1 cancel to zero.
                feat_vec_update = defaultdict(int)
                for feat in feats:
                    if feat == 'B':
                        output_feat = 'B:' + output[i - 1]
                        truth_feat = 'B:' + true_output[i - 1]
                    else:
                        output_feat = truth_feat = feat
                    feat_vec_update[output_feat, output[i]] -= 1
                    feat_vec_update[truth_feat, true_output[i]] += 1

                for key, delta in feat_vec_update.items():
                    if delta == 0:
                        continue
                    # Fold in the OLD weight for the steps during which it
                    # was in force, before changing it.  A key seen for the
                    # first time had weight 0, so nothing is owed.
                    idle_steps = num_updates - first_pending.get(key, num_updates)
                    weight_sums[key] += idle_steps * feat_vec[key]
                    first_pending[key] = num_updates
                    feat_vec[key] += delta
                    logging.info("updating feat_vec with feature_id: (%s, %s) value: %d" % (key[0], key[1], delta))
        print >>sys.stderr, "number of mistakes:", num_mistakes

    # Final flush: every weight stays in force from its last change up to
    # and including the last step.  float() avoids Python 2 floor division,
    # which would truncate the averages to integers.
    total_steps = float(num_updates)
    for key in list(feat_vec):
        remaining = num_updates + 1 - first_pending.get(key, num_updates + 1)
        weight_sums[key] += remaining * feat_vec[key]
        feat_vec[key] = weight_sums[key] / total_steps
    return feat_vec
Exemplo n.º 18
0
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron tagger and return the averaged weights.

    Each element of ``train_data`` is unpacked as ``(label_list,
    feat_list)``; the gold tag of a word is the third space-separated field
    of its ``label_list`` entry.  For every word whose predicted tag (from
    ``perc.perc_test``) differs from the gold tag:

    * each feature ``f`` of the word gets ``(f, predicted) -= 1`` and
      ``(f, gold) += 1``;
    * one bigram update is applied: ``('B:<prev predicted>', predicted) -= 1``
      and ``('B:<prev gold>', gold) += 1``, with ``'B_-1'`` as the previous
      tag of the first word.

    Averaging is lazy: a weight is folded into its running sum only when it
    changes, plus one final flush after the last sentence, so the result is
    the mean of the weight vector as it stood after each processed sentence.

    Args:
        train_data: sequence of ``(label_list, feat_list)`` pairs.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict(float)`` keyed by ``(feature, tag)`` with the averaged
        weights of every key that was ever updated.

    Raises:
        ValueError: if ``tagset`` is empty.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    # Use the caller's tagset instead of re-reading a file named by a global
    # for every sentence.
    default_tag = tagset[0]

    feat_vec = defaultdict(int)
    cumulative_feat_vec = defaultdict(float)
    # cumulative_feat_vec[key] covers every step before first_pending[key].
    first_pending = {}
    step = 0  # 1-based index of the sentence step being processed

    for epoch in range(numepochs):
        print(epoch)
        for sen, (label_list, feat_list) in enumerate(train_data):
            step += 1
            truetags = [label.split(" ")[2] for label in label_list]
            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, default_tag)

            feat_index = 0
            for i, tru in enumerate(truetags):
                # Always advance the feature cursor, even for correct words.
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                argmax = argmaxtags[i]
                if argmax == tru:
                    continue

                if i == 0:
                    argmaxprev = truprev = "B:B_-1"
                else:
                    argmaxprev = "B:" + argmaxtags[i - 1]
                    truprev = "B:" + truetags[i - 1]

                # NOTE(review): as in the previous version, a bare 'B' entry
                # in the feature list is also updated under its raw name;
                # confirm whether the decoder ever reads that key.
                changes = []
                for f in feats_for_this_word:
                    changes.append(((f, argmax), -1))
                    changes.append(((f, tru), 1))
                changes.append(((argmaxprev, argmax), -1))
                changes.append(((truprev, tru), 1))

                for key, delta in changes:
                    # Fold in the OLD weight for the steps it was in force;
                    # a key seen for the first time had weight 0.
                    idle_steps = step - first_pending.get(key, step)
                    cumulative_feat_vec[key] += feat_vec[key] * idle_steps
                    first_pending[key] = step
                    feat_vec[key] += delta

            if sen % 1000 == 0:
                print(str(sen) + "/" + str(len(train_data)))

    # Final flush: each weight stays in force from its last change up to and
    # including the last step.
    for key, pending_from in first_pending.items():
        cumulative_feat_vec[key] += feat_vec[key] * (step + 1 - pending_from)

    if step:
        total_steps = float(step)
        for key in list(cumulative_feat_vec):
            cumulative_feat_vec[key] = cumulative_feat_vec[key] / total_steps

    return cumulative_feat_vec
Exemplo n.º 19
0
def perc_train(train_data, tagset, T):
    """Train an averaged perceptron tagger with lazy averaging.

    ``train_data[i]`` is read as ``(labeled_list, feat_list)``; the gold tag
    of a word is the third space-separated field of its ``labeled_list``
    entry and the predicted tags come from ``perc.perc_test``.

    ``tau`` stores, per ``(feature, tag)`` key, the 1-based
    ``(sentence, epoch)`` at which the key was last brought up to date in
    the running sum.  On the last sentence of the last pass every key in
    ``tau`` is brought up to date at once and no further stamps are written.

    Returns the running sum divided by ``T * len(train_data)``, with
    zero-valued entries removed, as a ``defaultdict(float)``.
    """
    feat_vec = defaultdict(int)
    sigma_feat_vec = defaultdict(float)
    tau = {}

    print >> sys.stderr, "training data ..."
    M = len(train_data)
    for t in range(T):
        for i in range(M):
            progress = "Iteration: %d/%d. Sentence: %d/%d\t" % (t + 1, T, i + 1, M)
            # Trailing comma: keep the cursor on the same line.
            print >> sys.stderr, '\r{0}'.format(progress),

            labeled_list = train_data[i][0]
            feat_list = train_data[i][1]

            # Gold tags versus the tags chosen by the decoder.
            gold_tags = [entry.split(' ')[2] for entry in labeled_list]
            guessed_tags = perc.perc_test(feat_vec, labeled_list, feat_list,
                                          tagset, 'B-NP')

            last_step = (t == T - 1 and i == M - 1)
            if last_step:
                # Bring every stamped key up to date in one sweep.
                for key in tau:
                    sigma_feat_vec[key] = sigma_feat_vec[key] + feat_vec[key] * (
                        T * M + M - tau[key][1] * M - tau[key][0])

            if gold_tags == guessed_tags:
                continue

            stamp = (i + 1, t + 1)
            now = (t + 1) * M + (i + 1)
            cursor = 0
            for pos in range(len(guessed_tags)):
                guess = guessed_tags[pos]
                truth = gold_tags[pos]
                (cursor, feats) = perc.feats_for_word(cursor, feat_list)

                # Previous tags used to expand the bare 'B' feature; the
                # first word uses the 'B_-1' boundary marker.
                if pos >= 1:
                    guess_prev = guessed_tags[pos - 1]
                    truth_prev = gold_tags[pos - 1]
                else:
                    guess_prev = truth_prev = 'B_-1'

                for feat in feats:
                    if feat == 'B':
                        wrong_key = (feat + ':' + guess_prev, guess)
                        right_key = (feat + ':' + truth_prev, truth)
                    else:
                        wrong_key = (feat, guess)
                        right_key = (feat, truth)
                    keys_differ = wrong_key != right_key

                    if not last_step:
                        # Catch up on the steps since each key's stamp.
                        if wrong_key in tau:
                            sigma_feat_vec[wrong_key] = sigma_feat_vec[wrong_key] + feat_vec[wrong_key] * (
                                now - tau[wrong_key][1] * M - tau[wrong_key][0])
                        if keys_differ and right_key in tau:
                            sigma_feat_vec[right_key] = sigma_feat_vec[right_key] + feat_vec[right_key] * (
                                now - tau[right_key][1] * M - tau[right_key][0])

                    if keys_differ:
                        feat_vec[wrong_key] -= 1
                        feat_vec[right_key] += 1
                        sigma_feat_vec[wrong_key] -= 1
                        sigma_feat_vec[right_key] += 1
                        if not last_step:
                            tau[wrong_key] = stamp
                    if not last_step:
                        # The gold-side key is stamped even when both keys
                        # coincide and no weight changed.
                        tau[right_key] = stamp

    print >> sys.stderr, "\ndone"
    print >> sys.stderr, "computing average vector ..."
    # Split the summed entries into "drop" (exactly zero) and "average".
    dead_keys = []
    for key in sigma_feat_vec:
        if sigma_feat_vec[key] == 0:
            dead_keys.append(key)
            continue
        sigma_feat_vec[key] = sigma_feat_vec[key] / (T * M)
    for key in dead_keys:
        del sigma_feat_vec[key]
    print >> sys.stderr, "done"
    return sigma_feat_vec
Exemplo n.º 20
0
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    numepochs = int(1)
    default_tag = tagset[0]
    for t in range(numepochs):
        tmp = 0
        # Count sentence
        print 'Iteration#',t,' is processing now.'
        cnt = 0
        for (labeled_list, feat_list) in train_data:
            cnt = cnt + 1
            print 'Sentence[',cnt,'] is now processing...'
            labels = copy.deepcopy(labeled_list)
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)

            feat_index = 0
            # check word by word if the predicted tag is equal to the true tag
            for i, v in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                # retrieve the feature for a word
                if len(feats) == 0:
                    print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                    raise ValueError("features do not align with input sentence")
                
                label = labels[i].split()[2]
                if i > 1: 
                    label_i_1 = labels[i-1].split()[2]
                    label_i_2 = labels[i-2].split()[2]
                    if output[i] != label:
                        for feat in feats:
                            if feat[0] == 'T' and output[i-2] != label_i_2 and output[i-1] != label_i_1: 
                                # trigram case 
                                feat_out = feat + ":" + output[i-2] + "," + output[i-1]  
                                # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_i_2 + "," + label_i_1
                                # feat_lab is the "B:<previous label>"
                                # reward best condition
                                feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1

                                # penalize condition
                                feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1

                            elif feat[0] == 'B' and output[i-1] != label_i_1:
                                # bigram case
                                feat_out = feat + ":" + output[i-1]  
                                feat_lab = feat + ":" + label_i_1
                                feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1
                                feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1

                            else: 
                            # for U00 to U22 feature
                                feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                                feat_vec[feat, label] = feat_vec[feat, label] + 1
                elif i == 1:
                    # for i==0 case, all the first word in each sentence
                    label_i_2 = '_-1'  # previous label will be denoted by B_-1
                    label_i_1 = labels[i-1].split()[2]
                    if  output[i] != label:
                        for feat in feats:
                            if feat[0] == 'T' and output[i-1] != label_i_1:
                            # trigram case 
                                feat_out = feat + ":" + label_i_2 + "," + output[i-1]  
                                feat_lab = feat + ":" + label_i_2 + "," + label_i_1
                                # reward best condition
                                feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1

                                # penalize condition
                                feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1
                            
                            elif feat[0] == 'B':
                                feat_out = feat + ":" + output[i-1]  
                                feat_lab = feat + ":" + label_i_1
                                feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1
                                feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1

                            else: 
                            # for U00 to U22 feature
                                feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                                feat_vec[feat, label] = feat_vec[feat, label] + 1
                elif i == 0:
                    label_i_2 = '_B-2'
                    label_i_1 = '_B-1'
                    if output[i] != label:
                        for feat in feats:
                            if feat[0] == 'T':
                            # trigram case 
                                feat = feat + ":" + label_i_2 + "," + label_i_1
                            
                            elif feat[0] == 'B':
                            #bigram case
                                feat = feat + ":" + label_i_1

                            feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                            feat_vec[feat, label] = feat_vec[feat, label] + 1

        filename = 'mid_model_iter' + str(t)
        perc.perc_write_to_file(feat_vec, filename)


    for (k1, k2), v in feat_vec.items():
        if v == 0:
            del feat_vec[k1,k2]


    # please limit the number of iterations of training to n iterations
    return feat_vec
Exemplo n.º 21
0
def perc_train(train_data, tagset, numepochs, word_set):
    feat_vec = defaultdict(int)
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    # numepochs = int(50)
    default_tag = tagset[0]
    for t in range(numepochs):
        tmp = 0
        # Count sentence
        print 'Iteration#',t,' is processing now.'
        cnt = 0
        for (labeled_list, feat_list) in train_data:
            cnt = cnt + 1
            if cnt % 1000 == 0:
                print 'current status: ', str(round(100*cnt/9000.0,2)),'%'
            labels = copy.deepcopy(labeled_list)
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag, word_set)

            feat_index = 0
            # check word by word if the predicted tag is equal to the true tag
            for i, v in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                # retrieve the feature for a word
                if len(feats) == 0:
                    print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                    raise ValueError("features do not align with input sentence")
                
                fields = labels[i].split()
                label = fields[2]

                if i > 0: 
                    label_pre = labels[i-1].split()[2]
                    for feat in feats:
                        if feat[0] == 'B': # for bigram feature
                            feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                            feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"

                            if   output[i-1] != label_pre and output[i] != label:
                                feat_vec[feat_out, output[i]]   -= 1
                                feat_vec[feat_lab, output[i]]   -= 1
                                feat_vec[feat_out, label]       += 1
                                feat_vec[feat_lab, label]       += 1

                            elif output[i-1] == label_pre and output[i] != label:
                                feat_vec[feat_lab, output[i]]   -= 2
                                feat_vec[feat_lab, label]       += 2

                            elif output[i-1] != label_pre and output[i] == label:
                                pass

                            elif output[i-1] == label_pre and output[i] == label:
                                pass

                            # feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1
                            # feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1

                            # feat_vec[feat_out, label] = feat_vec[feat_out, label] + 1
                            # feat_vec[feat_lab, output[i]] = feat_vec[feat_lab, output[i]] - 1

                        else: 
                            # for U00 to U22 feature
                            # if the condition is not right, there will be no penaulty and rewarding
                            feat_vec[feat, output[i]] -= 1
                            feat_vec[feat, label]     += 1
                else:  # for i==0 case, all the first word in each sentence
                    label_pre = '_B-1'  # previous label will be denoted by _B-1
                    for feat in feats:
                        if feat[0] == 'B':  # bigram feature case
                            feat = feat + ":" + label_pre
                        feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                        feat_vec[feat, label] = feat_vec[feat, label] + 1

                # if i > 0: 
                #     label_pre = labels[i-1].split()[2]
                #     if output[i-1] != label_pre or output[i] != label:
                #         for feat in feats:
                #             if feat[0] == 'B': 
                #             # for bigram feature
                #                 feat_out = feat + ":" + output[i-1]  
                #                 # feat_out is the "B:<previous output>"
                #                 feat_lab = feat + ":" + label_pre  
                #                 # feat_lab is the "B:<previous label>"
                #                 # reward best condition

                #                 feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1

                #                 # penalize condition
                #                 feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1
                                
                #             else: 
                #             # for U00 to U22 feature
                #                 feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                #                 feat_vec[feat, label] = feat_vec[feat, label] + 1
                # else:
                #     # for i==0 case, all the first word in each sentence
                #     label_pre = '_B-1'  # previous label will be denoted by _B-1
                #     for feat in feats:
                #         if feat[0] == 'B':  
                #         # bigram feature case
                #             feat = feat + ":" + label_pre
                #         feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                #         feat_vec[feat, label] = feat_vec[feat, label] + 1

        if t % 3 == 0:
            perc.perc_write_to_file(feat_vec, 'model_' + str(t))

        perc.perc_write_to_file(feat_vec, 'model')
        os.system('python perc.py -m model | python score-chunks.py')

    # please limit the number of iterations of training to n iterations
    return feat_vec
Exemplo n.º 22
0
def avg_perc_train(train_data, tagset, n):
    """Train an averaged perceptron tagger and return the averaged weights.

    One "step" is one training sentence. The returned weight of a
    ``(feature, tag)`` pair is the mean, over all ``n * len(train_data)``
    steps, of the value that weight had after each step. The mean is
    computed lazily: a weight's contribution to the running sum is only
    brought up to date when the weight is about to change and once more at
    the end of training.

    Args:
        train_data: re-iterable collection of ``(labeled_list, feat_list)``
            pairs.
        tagset: non-empty sequence of tags; ``tagset[0]`` is passed to
            ``perc.perc_test`` as the default tag.
        n: number of epochs (passes over ``train_data``).

    Returns:
        The weight dictionary keyed by ``(feature, tag)``; every weight that
        was updated during training holds its float average.

    Raises:
        ValueError: if ``tagset`` is empty, or if ``perc.feats_for_word``
            yields no features for a word.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]

    feat_vec = defaultdict(int)   # current weights
    avg_vec = defaultdict(int)    # running sum of each weight over steps
    last_iter = {}                # key -> step at which its current value took effect

    num_updates = 0
    for _epoch in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            num_updates += 1
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                    default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output == true_output:
                continue

            num_mistakes += 1
            feat_index = 0
            for i, out_tag in enumerate(output):
                (feat_index,
                 feats) = perc.feats_for_word(feat_index, feat_list)
                if len(feats) == 0:
                    print >> sys.stderr, " ".join(labeled_list), " ".join(
                        feat_list), "\n"
                    raise ValueError(
                        "features do not align with input sentence")

                true_tag = true_output[i]
                # The tag before the first word is the start-of-sentence
                # marker for both sequences.
                if i > 0:
                    prev_out = output[i - 1]
                    prev_true = true_output[i - 1]
                else:
                    prev_out = prev_true = 'B_-1'

                # Collect this word's net change first so that a -1 and a +1
                # on the same key cancel instead of touching the weights.
                feat_vec_update = defaultdict(int)
                for feat in feats:
                    if feat == 'B':
                        output_feat = 'B:' + prev_out
                        truth_feat = 'B:' + prev_true
                    else:
                        output_feat = truth_feat = feat
                    feat_vec_update[output_feat, out_tag] += -1
                    feat_vec_update[truth_feat, true_tag] += 1

                for key, delta in feat_vec_update.items():
                    if delta == 0:
                        continue
                    # Bank the OLD value for every step it was in force
                    # (last_iter[key] .. num_updates - 1) before changing it.
                    # A second change within the same sentence banks nothing,
                    # so only the final value counts for this step.
                    if key in last_iter:
                        avg_vec[key] += (num_updates -
                                         last_iter[key]) * feat_vec[key]
                    last_iter[key] = num_updates
                    feat_vec[key] += delta
                    logging.info(
                        "updating feat_vec with feature_id: (%s, %s) value: %d",
                        key[0], key[1], delta)
        print >> sys.stderr, "number of mistakes:", num_mistakes

    # Only values of existing keys are reassigned below, so iterating
    # feat_vec while writing to it is safe.
    for key in feat_vec:
        if key not in last_iter:
            # Never updated: the value was constant, so it is its own average.
            continue
        # Current value has been in force for steps last_iter[key]..num_updates.
        avg_vec[key] += (num_updates + 1 - last_iter[key]) * feat_vec[key]
        # float() forces true division; under Python 2 int / int floors and
        # would truncate the averaged weights.
        feat_vec[key] = avg_vec[key] / float(num_updates)
    return feat_vec
Exemplo n.º 23
0
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(float)
    avg_feat_vec = defaultdict(float)
    tau_feat_vec = dict()
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    numepochs = int(2)
    default_tag = tagset[0]
    m = len(train_data) # length of training data
    for t in range(numepochs):
        print 'Iteration#',t,' is processing now.'
        for j, (labeled_list, feat_list) in enumerate(train_data):
            labels = copy.deepcopy(labeled_list)
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # compare current output and true result
            # correct_flag = True
            if j != m or t != numepochs - 1:
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    # retrieve the feature for a word
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    
                    label = labels[i].split()[2]
                    if i > 0: 
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:

                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"

                                if   output[i-1] != label_pre and output[i] != label:

                                    if feat in tau_feat_vec:
                                        (js, ts) = tau_feat_vec[feat]
                                        for (feature, tag) in avg_feat_vec.keys():
                                            if feature == feat:
                                                avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)


                                    # update original feature vector
                                    feat_vec[feat_out, output[i]]   -= 1.0
                                    feat_vec[feat_lab, output[i]]   -= 1.0
                                    feat_vec[feat_out, label]       += 1.0
                                    feat_vec[feat_lab, label]       += 1.0

                                    # update avg feature vector
                                    avg_feat_vec[feat_out, output[i]]   -= 1.0
                                    avg_feat_vec[feat_lab, output[i]]   -= 1.0
                                    avg_feat_vec[feat_out, label]       += 1.0
                                    avg_feat_vec[feat_lab, label]       += 1.0

                                    tau_feat_vec[feat_out] = (j, t)
                                    tau_feat_vec[feat_lab] = (j, t)

                                elif output[i-1] == label_pre and output[i] != label:

                                    if feat in tau_feat_vec:
                                        (js, ts) = tau_feat_vec[feat]
                                        for (feature, tag) in avg_feat_vec.keys():
                                            if feature == feat:
                                                avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)


                                    feat_vec[feat_lab, output[i]]   -= 2.0
                                    feat_vec[feat_lab, label]       += 2.0

                                    avg_feat_vec[feat_lab, output[i]]   -= 2.0
                                    avg_feat_vec[feat_lab, label]       += 2.0
                                    
                                    tau_feat_vec[feat_lab] = (j, t)
                                    tau_feat_vec[feat_lab]     = (j, t)

                                elif output[i-1] != label_pre and output[i] == label:
                                    pass

                                elif output[i-1] == label_pre and output[i] == label:
                                    pass

                            else: # for U00 to U22 feature

                                if output[i] != label and feat in tau_feat_vec:
                                    (js, ts) = tau_feat_vec[feat]
                                    for (feature, tag) in avg_feat_vec.keys():
                                        if feature == feat:
                                            avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)

                                feat_vec[feat, output[i]] -= 1.0
                                feat_vec[feat, label] += 1.0

                                avg_feat_vec[feat, output[i]] -= 1.0
                                avg_feat_vec[feat, label] += 1.0

                                # update vector
                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label]     = (j, t)


                    else:  # for i==0 case, all the first word in each sentence
                        label_pre = 'B_-1'  # previous label will be denoted by B_-1
                        for feat in feats:


                            if feat[0] == 'B':  # bigram feature case
                                feat = feat + ":" + label_pre

                            if output[i] != label and feat in tau_feat_vec:
                                (js, ts) = tau_feat_vec[feat]
                                for (feature, tag) in avg_feat_vec.keys():
                                    if feature == feat:
                                        avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)


                            feat_vec[feat, output[i]] -= 1.0
                            feat_vec[feat, label] += 1.0

                            avg_feat_vec[feat, output[i]] -= 1.0
                            avg_feat_vec[feat, label] += 1.0

                            tau_feat_vec[feat, output[i]] = (j, t)
                            tau_feat_vec[feat, label] = (j, t)


            else:
                # last sentence of each iteration
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    # retrieve the feature for a word
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    
                    fields = labels[i].split()
                    label = fields[2]
                    if i > 0: 
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:
                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"

                                if   output[i-1] != label_pre and output[i] != label:

                                    if feat in tau_feat_vec:
                                        (js, ts) = tau_feat_vec[feat]
                                        for (feature, tag) in avg_feat_vec.keys():
                                            if feature == feat:
                                                avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)

                                    # update original feature vector
                                    feat_vec[feat_out, output[i]]   -= 1.0
                                    feat_vec[feat_lab, output[i]]   -= 1.0
                                    feat_vec[feat_out, label]       += 1.0
                                    feat_vec[feat_lab, label]       += 1.0

                                    # update avg feature vector
                                    avg_feat_vec[feat_out, output[i]]   -= 1.0
                                    avg_feat_vec[feat_lab, output[i]]   -= 1.0
                                    avg_feat_vec[feat_out, label]       += 1.0
                                    avg_feat_vec[feat_lab, label]       += 1.0

                                elif output[i-1] == label_pre and output[i] != label:

                                    if feat in tau_feat_vec:
                                        (js, ts) = tau_feat_vec[feat]
                                        for (feature, tag) in avg_feat_vec.keys():
                                            if feature == feat:
                                                avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)


                                    feat_vec[feat_lab, output[i]]   -= 2.0
                                    feat_vec[feat_lab, label]       += 2.0

                                    avg_feat_vec[feat_lab, output[i]]   -= 2.0
                                    avg_feat_vec[feat_lab, label]       += 2.0
                                    

                                elif output[i-1] != label_pre and output[i] == label:
                                    pass

                                elif output[i-1] == label_pre and output[i] == label:
                                    pass

                            else: # for U00 to U22 feature
                            
                                if output[i] != label and feat in tau_feat_vec:
                                    (js, ts) = tau_feat_vec[feat]
                                    for (feature, tag) in avg_feat_vec.keys():
                                        if feature == feat:
                                            avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)


                                feat_vec[feat, output[i]] -= 1.0
                                feat_vec[feat, label] += 1.0

                                avg_feat_vec[feat, output[i]] -= 1.0
                                avg_feat_vec[feat, label] += 1.0


                    else:  # for i==0 case, all the first word in each sentence
                        label_pre = 'B_-1'  # previous label will be denoted by B_-1
                        for feat in feats:
                            if feat[0] == 'B':  # bigram feature case
                                feat = feat + ":" + label_pre
                            
                            if output[i] != label and feat in tau_feat_vec:
                                (js, ts) = tau_feat_vec[feat]
                                for (feature, tag) in avg_feat_vec.keys():
                                    if feature == feat:
                                        avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)


                            feat_vec[feat, output[i]] -= 1.0
                            feat_vec[feat, label] += 1.0

                            avg_feat_vec[feat, output[i]] -= 1.0
                            avg_feat_vec[feat, label] += 1.0


        # end of iteration

    # averaging perceptron
    for key in avg_feat_vec.keys():
        avg_feat_vec[key] = avg_feat_vec[key]/float(numepochs*m)
    # please limit the number of iterations of training to n iterations
    return avg_feat_vec