def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger and return its weight vector.

    Args:
        train_data: sequence of sentences. Each item is indexed as
            ``item[0]`` (labelled lines whose third whitespace-separated
            field is the gold tag) and ``item[1]`` (flat feature list).
        tagset: candidate tags, forwarded to ``perc.perc_test``.
        numepochs: number of full passes over ``train_data``.

    Returns:
        A ``defaultdict(int)`` keyed by ``(feature, tag)``.

    Raises:
        ValueError: if a sentence's feature list is shorter than the
            fixed per-word layout indexed below.
    """
    # Index arithmetic inherited from the original code: each word owns a
    # window of 20 consecutive features, of which the first 19 are updated.
    # NOTE(review): the 20th slot is presumably the bigram marker, which
    # would explain why it is skipped -- confirm against the extractor.
    feats_per_word = 20
    updated_per_word = 19

    # No explicit zero-seeding is needed: defaultdict(int) already yields 0
    # for unseen (feature, tag) keys.
    feat_vec = defaultdict(int)

    for _ in range(numepochs):
        for sentence in train_data:
            labeled_list, feat_list = sentence[0], sentence[1]
            default_tag = getDefault(labeled_list)
            predicted = perc.perc_test(feat_vec, labeled_list, feat_list,
                                       tagset, default_tag)
            gold = [line.split()[2] for line in labeled_list]

            for word_index, (z_tag, t_tag) in enumerate(zip(predicted, gold)):
                if z_tag == t_tag:
                    continue
                start = feats_per_word * word_index
                stop = start + updated_per_word
                if stop > len(feat_list):
                    raise ValueError(
                        "features do not align with input sentence: word %d "
                        "needs features [%d:%d] but only %d are present"
                        % (word_index, start, stop, len(feat_list)))
                # Reward the gold tag and penalise the predicted one.
                for feat in feat_list[start:stop]:
                    feat_vec[(feat, t_tag)] += 1
                    feat_vec[(feat, z_tag)] -= 1
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger and return its weight vector.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.
        tagset: candidate tags; ``tagset[0]`` is used as the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        The ``defaultdict(int)`` that ``updateWeights`` mutates.
    """
    print(len(train_data))
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    for epoch in range(numepochs):
        print(epoch)
        for sent_no, (labeled_list, feat_list) in enumerate(train_data):
            if sent_no % 100 == 0:
                # Same text the old `print " ", k` produced.
                print("  %s" % sent_no)
            z = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                               default_tag)

            # Shift both sequences by one start marker so index j refers to
            # the same word in `labels` and `z`.
            labels = list(labeled_list)
            labels.insert(0, '_B-1 _B-1 _B-1')
            z.insert(0, '_B-1')

            # The feature cursor restarts with every sentence and advances
            # one word at a time, so `feats` always belongs to word j.
            feat_index = 0
            # Only a front pad was added, so the last real word is at
            # index len(labels) - 1 and must be included.
            for j in range(1, len(labels)):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                # NOTE(review): x(labels, j, 2) presumably returns field 2
                # (the gold tag) of labels[j] -- confirm against its
                # definition.
                if x(labels, j, 2) != z[j]:
                    updateWeights(feat_vec, labels, z, j, feats)
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron tagger.

    After every sentence the current weight vector is added to a running
    sum; the sum is finally divided by ``len(train_data) * numepochs``
    (gamma = sigma / (mT), see
    http://www.cs.sfu.ca/~anoop/papers/pdf/syntax-parsing-survey-2011.pdf).

    Args:
        train_data: sequence of sentences indexed as ``item[0]`` (labelled
            lines, gold tag in the third field) and ``item[1]`` (flat
            feature list, 20 entries per word).
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict`` mapping ``(feature, tag)`` to the averaged weight.
    """
    feats_per_word = 20
    feat_vec = defaultdict(int)
    sigma = defaultdict(int)

    for epoch in range(numepochs):
        mistakes = 0
        for sentence in train_data:
            labeled_list, feat_list = sentence[0], sentence[1]
            # Viterbi argmax under the current weights.
            pred = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                  tagset[0])
            true = [word.split()[2] for word in labeled_list]

            for i, (pred_tag, true_tag) in enumerate(zip(pred, true)):
                if pred_tag != true_tag:
                    mistakes += 1
                    start = i * feats_per_word
                    # defaultdict(int) supplies the missing zero, so no
                    # membership test is needed before updating.
                    for feat in feat_list[start:start + feats_per_word]:
                        feat_vec[(feat, pred_tag)] -= 1
                        feat_vec[(feat, true_tag)] += 1
                # A bigram feature is wrong whenever the (previous, current)
                # pair differs, even if the current tag alone is right.
                if i > 0 and (pred[i - 1], pred_tag) != (true[i - 1], true_tag):
                    feat_vec[("B:" + pred[i - 1], pred_tag)] -= 1
                    feat_vec[("B:" + true[i - 1], true_tag)] += 1

            # Accumulate the post-sentence weights for averaging.
            for feat, weight in feat_vec.items():
                sigma[feat] += weight
        print('Mistakes in epoch :', epoch, ' are: ', mistakes)

    total = len(train_data) * numepochs
    if total:
        for feat in sigma:
            sigma[feat] = sigma[feat] / float(total)
    return sigma
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron and return epoch-averaged weights.

    After each epoch the current weights are added to a running sum, the
    partial average is written to ``opts.modelfile + str(epoch)``, and the
    number of mis-tagged sentences is reported on stderr.

    :param train_data: iterable of ``(labeled_list, feat_list)`` pairs
    :param tagset: candidate tags; ``tagset[0]`` is the default tag
    :param numepochs: number of passes over ``train_data``
    :return: plain dict mapping each key of the running sum to
        ``sum / numepochs`` as a float
    """
    weights = defaultdict(int)
    weight_sum = defaultdict(int)
    fallback_tag = tagset[0]

    def averaged(divisor):
        # Snapshot of the running sum scaled by `divisor`.
        return {key: float(weight_sum[key]) / divisor for key in weight_sum}

    for epoch in range(numepochs):
        wrong_sentences = 0
        for labeled_list, feat_list in train_data:
            gold_labels = get_labels(labeled_list)
            predicted = perc.perc_test(weights, labeled_list, feat_list,
                                       tagset, fallback_tag)
            if gold_labels != predicted:
                wrong_sentences += 1
                gold_vec = get_global_vector(gold_labels, feat_list)
                predicted_vec = get_global_vector(predicted, feat_list)
                add_vector(weights, gold_vec, 1)
                add_vector(weights, predicted_vec, -1)

        report = ["Epoch", str(epoch + 1),
                  "done. # of incorrect sentences: ", str(wrong_sentences)]
        sys.stderr.write(" ".join(report) + "\n")

        # Averaging once per epoch rather than once per sentence keeps the
        # number of long-vector additions manageable.
        add_vector(weight_sum, weights, 1)
        perc.perc_write_to_file(averaged(epoch + 1),
                                opts.modelfile + str(epoch))

    return averaged(numepochs)
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger and return its weight vector.

    Args:
        train_data: iterable of ``(label_list, feat_list)`` pairs; the gold
            tag is the third whitespace-separated field of each label line.
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` keyed by ``(feature, tag)``; bigram features
        are keyed as ``('B:<previous tag>', tag)``.

    Raises:
        ValueError: if the tagger output and the gold tags differ in length.
    """
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    for epoch in range(numepochs):
        for (label_list, feat_list) in train_data:
            cur = perc.perc_test(feat_vec, label_list, feat_list, tagset,
                                 default_tag)
            gold = [entry.split()[2] for entry in label_list]
            if cur == gold:
                continue
            if len(cur) != len(gold):
                raise ValueError(
                    "output length is not the same with the input sentence"
                )

            # Front pad so that index pos - 1 is valid for the first word.
            cur.insert(0, 'B_-1')
            gold.insert(0, 'B_-1')

            feat_index = 0
            for pos in range(1, len(cur)):
                (feat_index, features) = perc.feats_for_word(feat_index,
                                                             feat_list)
                for f in features:
                    if f == 'B':
                        # The bigram feature is conditioned on the previous
                        # tag of the respective sequence.
                        cur_feat = 'B:' + cur[pos - 1]
                        gold_feat = 'B:' + gold[pos - 1]
                    else:
                        cur_feat = gold_feat = f
                    if cur_feat == gold_feat and cur[pos] == gold[pos]:
                        continue  # -1 and +1 on the same key cancel out
                    feat_vec[(cur_feat, cur[pos])] -= 1
                    feat_vec[(gold_feat, gold[pos])] += 1
        # `epoch` is distinct from the word index, so this reports the
        # epoch that just finished.
        sys.stderr.write("iteration %d done.\n" % epoch)
    return feat_vec
def argmax(feat_vec, data, tagset, default_tag):
    """Tag one sentence and return the features of the padded prediction.

    ``data`` is unpacked as ``(labeled_list, feat_list)``. The tag sequence
    from ``perc.perc_test`` is wrapped in the boundary markers ``'B_-1'``
    and ``'B_+1'`` and handed to ``retrieve_feature`` with the feature list.
    """
    (sentence_labels, sentence_feats) = data
    tags = perc.perc_test(feat_vec, sentence_labels, sentence_feats, tagset,
                          default_tag)
    # Pad in place, front first, then back.
    tags[:0] = ['B_-1']
    tags.extend(['B_+1'])
    return retrieve_feature(tags, sentence_feats)
def perc_train(train_data, tagset, numepochs, pos_dict):
    """Train a perceptron tagger and return its weight vector.

    A snapshot of the weights is written to ``'model_<epoch>'`` after every
    epoch.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs; the
            gold tag is the third whitespace-separated field of each line.
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.
        pos_dict: forwarded unchanged to ``perc.perc_test``.

    Returns:
        ``defaultdict(int)`` keyed by ``(feature, tag)``.

    Raises:
        ValueError: if ``tagset`` is empty or a word has no features.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    feat_vec = defaultdict(int)
    default_tag = tagset[0]

    # The caller's `numepochs` is honoured as given.
    for t in range(numepochs):
        print('Iteration# %s  is processing now.' % t)
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag, pos_dict)
            feat_index = 0
            for i, predicted in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write("%s %s \n\n" % (" ".join(labeled_list),
                                                     " ".join(feat_list)))
                    raise ValueError(
                        "features do not align with input sentence")
                label = labeled_list[i].split()[2]

                if i > 0:
                    label_pre = labeled_list[i - 1].split()[2]
                    # Compare tag strings by value, not identity.
                    if output[i - 1] == label_pre and predicted == label:
                        continue
                    for feat in feats:
                        if feat[0] == 'B':
                            feat_out = feat + ":" + output[i - 1]
                            feat_lab = feat + ":" + label_pre
                            # NOTE(review): this rewards/penalises both
                            # contexts for both tags, which differs from
                            # the textbook update (-1 on (feat_out,
                            # predicted), +1 on (feat_lab, label)). Kept
                            # as-is; confirm it is intentional.
                            feat_vec[feat_out, predicted] -= 1
                            feat_vec[feat_out, label] += 1
                            feat_vec[feat_lab, predicted] -= 1
                            feat_vec[feat_lab, label] += 1
                        else:
                            feat_vec[feat, predicted] -= 1
                            feat_vec[feat, label] += 1
                else:
                    if predicted == label:
                        continue  # the -1/+1 pair would cancel out
                    # First word: its previous tag is the start marker.
                    label_pre = 'B_-1'
                    for feat in feats:
                        if feat[0] == 'B':
                            feat = feat + ":" + label_pre
                        feat_vec[feat, predicted] -= 1
                        feat_vec[feat, label] += 1
        perc.perc_write_to_file(feat_vec, 'model_' + str(t))
    return feat_vec
def perc_train(train_data, tagset, n): feat_vec = defaultdict(int) feat_avg_vec = defaultdict(int) # insert your code here # please limit the number of iterations of training to n iterations default_tag = tagset[0] # tag any word with 'B-NP' in the beginning num_sentence = len(train_data) num_words = 0 count = 0 for iteration in range(n): sent_index = 0 for sentence in train_data: #sentence = (labeled_list, feat_list) for each sentence sent_index += 1 print '{0}\r'.format("\rIteration: %d/%d. Sentence: %d/%d\t" % (iteration + 1, n, sent_index, num_sentence)), (labeled_list, feat_list) = sentence num_words += len(labeled_list) #compute tags based on current weights estimated_tags = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag) #the target 'right' tag list standard_tags = [item.split()[2] for item in labeled_list] if estimated_tags != standard_tags: st_prev = es_prev = 'B_-1' index = 0 #reference: http://gul.gu.se/public/pp/public_courses/course38351/published/1360057354030/resourceId/19456476/content/9adb1f1e-52e4-48b4-8001-ada93be18089/9adb1f1e-52e4-48b4-8001-ada93be18089.html step = (n * num_sentence - count) * 1.0 / (n * num_sentence) for (st_tag, es_tag) in zip(standard_tags, estimated_tags): (index, feats) = perc.feats_for_word(index, feat_list) for feat in feats: #deal with feat B: according to the given output example. if feat == 'B': if st_prev != es_prev or st_tag != es_tag: feat_vec[('B:' + es_prev, es_tag)] -= 1 feat_vec[('B:' + st_prev, st_tag)] += 1 feat_avg_vec[('B:' + es_prev, es_tag)] -= step feat_avg_vec[('B:' + st_prev, st_tag)] += step es_prev = es_tag st_prev = st_tag else: if st_tag != es_tag: feat_vec[(feat, es_tag)] -= 1 feat_vec[(feat, st_tag)] += 1 feat_avg_vec[(feat, es_tag)] -= step feat_avg_vec[(feat, st_tag)] += step count += 1 perc.perc_write_to_file( feat_avg_vec, 'models/n' + str(iteration) + 'avg_params.model') return feat_avg_vec
def perc_train(train_data, tagset, n):
    """Train a perceptron tagger for ``n`` epochs and return the weights.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        n: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` keyed by ``(feature, tag)``; the bigram feature
        ``'B'`` is stored as ``('B:<previous tag>', tag)``.

    Raises:
        ValueError: if ``tagset`` is empty or a word has no features.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)

    for _epoch in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output == true_output:
                continue

            num_mistakes += 1
            # Boundary markers make index i - 1 valid for the first word.
            output.insert(0, 'B_-1')
            output.append('B_+1')
            true_output.insert(0, 'B_-1')
            true_output.append('B_+1')

            feat_index = 0
            for i in range(1, len(output) - 1):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write("%s %s \n\n" % (" ".join(labeled_list),
                                                     " ".join(feat_list)))
                    raise ValueError(
                        "features do not align with input sentence")

                # Collect the net change per key first so that -1/+1 pairs
                # on the same key cancel before touching feat_vec.
                feat_vec_update = defaultdict(int)
                for feat in feats:
                    if feat == 'B':
                        # Bigram feature: each sequence contributes the
                        # feature built from its own previous tag.
                        output_feat = 'B:' + output[i - 1]
                        truth_feat = 'B:' + true_output[i - 1]
                    else:
                        output_feat = truth_feat = feat
                    feat_vec_update[output_feat, output[i]] -= 1
                    feat_vec_update[truth_feat, true_output[i]] += 1

                for (upd_feat, upd_tag), delta in feat_vec_update.items():
                    if delta != 0:
                        feat_vec[upd_feat, upd_tag] += delta
                        logging.info(
                            "updating feat_vec with feature_id: (%s, %s) value: %d",
                            upd_feat, upd_tag, delta)
        sys.stderr.write("number of mistakes: %d\n" % num_mistakes)
        logging.info("current number of mistakes: %d", num_mistakes)
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron tagger.

    Instead of summing every intermediate weight vector, each update is
    added to the averaged vector scaled by the fraction of training steps
    still remaining::

        w_avg = sum_k  delta_k * remaining_k / (numepochs * T)

    where ``T = len(train_data)`` and ``remaining_k`` counts down from
    ``numepochs * T`` by one per sentence.

    Args:
        train_data: sequence of ``(label_list, feat_list)`` pairs; the gold
            tag is the third whitespace-separated field of each label line.
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict`` keyed by ``(feature, tag)`` holding the averaged
        weights; bigram features are keyed as ``('B:<previous tag>', tag)``.

    Raises:
        ValueError: if the tagger output and the gold tags differ in length.
    """
    total_steps = numepochs * len(train_data)
    remaining = total_steps
    feat_vec_cache = defaultdict(int)  # averaged weights
    feat_vec = defaultdict(int)        # raw weights used for decoding
    default_tag = tagset[0]

    for epoch in range(numepochs):
        for (label_list, feat_list) in train_data:
            # Best tag sequence under the current raw weights.
            cur = perc.perc_test(feat_vec, label_list, feat_list, tagset,
                                 default_tag)
            gold = [entry.split()[2] for entry in label_list]
            if cur != gold:
                if len(cur) != len(gold):
                    raise ValueError(
                        "output length is not the same with the input sentence"
                    )
                # Front pad so that index pos - 1 is valid for the first word.
                cur.insert(0, 'B_-1')
                gold.insert(0, 'B_-1')

                # Parenthesised divisor: remaining / (numepochs * T).
                scale = remaining / float(total_steps)
                feat_index = 0
                for pos in range(1, len(cur)):
                    (feat_index, features) = perc.feats_for_word(feat_index,
                                                                 feat_list)
                    for f in features:
                        if f == 'B':
                            cur_feat = 'B:' + cur[pos - 1]
                            gold_feat = 'B:' + gold[pos - 1]
                        else:
                            cur_feat = gold_feat = f
                        if cur_feat == gold_feat and cur[pos] == gold[pos]:
                            continue  # -1 and +1 on the same key cancel out
                        # Penalise what Viterbi returned, reward the
                        # reference, in both vectors.
                        feat_vec[(cur_feat, cur[pos])] -= 1
                        feat_vec[(gold_feat, gold[pos])] += 1
                        feat_vec_cache[(cur_feat, cur[pos])] -= scale
                        feat_vec_cache[(gold_feat, gold[pos])] += scale
            remaining -= 1
        sys.stderr.write("iteration %d done.\n" % epoch)
    return feat_vec_cache
def perc_avg_train(train_data, tagset, numepochs):
    """Train an averaged perceptron tagger.

    The raw weight vector is added to a running sum after every sentence,
    and the sum is divided by ``numepochs * len(train_data)`` at the end
    (gamma = sigma / (mT)).

    Args:
        train_data: sequence of ``(labeled_list, feat_list)`` pairs; the
            gold tag is the third whitespace-separated field of each line.
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict(float)`` keyed by ``(feature, tag)``; bigram features
        are keyed as ``('B:<previous tag>', tag)``, with ``'B_-1'`` standing
        in for the tag before the first word.
    """
    feat_vec = defaultdict(int)
    avg_feat_vec = defaultdict(float)
    default_tag = tagset[0]

    for epoch in range(numepochs):
        count_mistake = 0
        print(f"Running on epoch {epoch+1}......")
        tic = time.time()
        for labeled_list, feat_list in train_data:
            pred_output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                         tagset, default_tag)
            true_output = [x.split()[2] for x in labeled_list]
            if pred_output != true_output:
                count_mistake += 1
                feat_index = 0
                # The first word's bigram context is the start marker, so
                # 'B' never falls through to the unigram branch.
                pred_prev = true_prev = 'B_-1'
                for pred_tag, true_tag in zip(pred_output, true_output):
                    (feat_index, feats) = perc.feats_for_word(feat_index,
                                                              feat_list)
                    tag_differs = pred_tag != true_tag
                    for feat in feats:
                        if feat == 'B':
                            if tag_differs or pred_prev != true_prev:
                                feat_vec['B:' + true_prev, true_tag] += 1
                                feat_vec['B:' + pred_prev, pred_tag] -= 1
                        elif tag_differs:
                            feat_vec[feat, true_tag] += 1
                            feat_vec[feat, pred_tag] -= 1
                    pred_prev, true_prev = pred_tag, true_tag
            # Accumulate the post-sentence weights for averaging.
            for key, weight in feat_vec.items():
                avg_feat_vec[key] += weight
        toc = time.time()
        print(
            f'Epoch {epoch+1} finished. Time cost on this epoch: {toc-tic}. Number of mistakes: {count_mistake}.'
        )

    total = numepochs * len(train_data)
    for key in avg_feat_vec:
        avg_feat_vec[key] /= total
    return avg_feat_vec
def perc_train(train_data, tagset, iterations=1):
    """Train a perceptron on a ``FeatureVector`` and return its export.

    For every sentence the gold and predicted tag sequences are padded
    with ``'B_-1'`` / ``'B_+1'`` and printed; when they differ, the weights
    are moved by ``gold features - predicted features``. The weights are
    dumped to ``models/jetic_Iter_<k>.model`` after each iteration.
    """
    weights = FeatureVector()
    fallback_tag = tagset[0]

    for epoch in range(iterations):
        total = len(train_data)
        for seen, (labeled_list, feat_list) in enumerate(train_data, 1):
            # Progress line
            print(" ".join(["iteration", str(epoch), "sentence", str(seen),
                            "of", str(total)]))

            # Gold tags: third field of each three-field line, padded.
            gold_output = ['B_-1']
            for entry in labeled_list:
                (_word, _pos, label) = entry.split()
                gold_output.append(label)
            gold_output.append('B_+1')

            # Predicted tags, padded the same way.
            local_output = perc.perc_test(weights, labeled_list, feat_list,
                                          tagset, fallback_tag)
            local_output[:0] = ['B_-1']
            local_output.extend(['B_+1'])

            print(gold_output)
            print(local_output)

            if local_output != gold_output:
                local_vec = retrieve_feature(local_output, feat_list)
                gold_vec = retrieve_feature(gold_output, feat_list)
                weights += gold_vec - local_vec

        # Snapshot once per iteration.
        weights.dump("models/jetic_Iter_" + str(epoch + 1) + ".model")

    return weights.export()
def perc_train(train_data, tagset, n):
    """Train a perceptron tagger for ``n`` epochs and return the weights.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        n: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` keyed by ``(feature, tag)``; the bigram feature
        ``'B'`` is stored as ``('B:<previous tag>', tag)``.

    Raises:
        ValueError: if ``tagset`` is empty or a word has no features.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)

    for _epoch in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output == true_output:
                continue

            num_mistakes += 1
            # Pad both sequences so every real word has a predecessor.
            for seq in (output, true_output):
                seq.insert(0, 'B_-1')
                seq.append('B_+1')

            feat_index = 0
            for i in range(1, len(output) - 1):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write("%s %s \n\n" % (" ".join(labeled_list),
                                                     " ".join(feat_list)))
                    raise ValueError(
                        "features do not align with input sentence")

                # Net change per key for this word; identical keys cancel.
                feat_vec_update = defaultdict(int)
                for feat in feats:
                    if feat == 'B':
                        # The predicted and gold sequences each supply their
                        # own previous tag for the bigram feature.
                        output_feat = 'B:' + output[i - 1]
                        truth_feat = 'B:' + true_output[i - 1]
                    else:
                        output_feat = truth_feat = feat
                    feat_vec_update[output_feat, output[i]] -= 1
                    feat_vec_update[truth_feat, true_output[i]] += 1

                for (upd_feat, upd_tag), delta in feat_vec_update.items():
                    if delta != 0:
                        feat_vec[upd_feat, upd_tag] += delta
                        logging.info(
                            "updating feat_vec with feature_id: (%s, %s) value: %d",
                            upd_feat, upd_tag, delta)
        sys.stderr.write("number of mistakes: %d\n" % num_mistakes)
        logging.info("current number of mistakes: %d", num_mistakes)
    return feat_vec
def perc_train(train_data, tagset, n): feat_vec = defaultdict(int) # insert your code here # please limit the number of iterations of training to n iterations n_sentences = len(train_data) for i in range (0,n): for j in range(0,n_sentences): print '{0}\r'.format("\rIteration: %d/%d. Sentence: %d/%d\t" %(i+1, n, j+1, n_sentences)), labeled_list = train_data[j][0] feat_list = train_data[j][1] # Extract the labels from training data toutput = [tags.split(' ')[2] for tags in labeled_list ] # Output from Viterbi Algorithm zoutput = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, 'B-NP') # Compare outputs if toutput != zoutput: index = 0 for k in range(0, len(zoutput)): predicted_tag = zoutput[k] true_tag = toutput[k] (index, feats) = perc.feats_for_word(index, feat_list) for feat in feats: s1 = s2 = '' if feat == 'B': if k >= 1: zprevtag = zoutput[k-1] tprevtag = toutput[k-1] else: zprevtag = tprevtag = 'B_-1' s1 = (feat+':'+zprevtag, predicted_tag) s2 = (feat+':'+tprevtag, true_tag) else: s1 = (feat, predicted_tag) s2 = (feat, true_tag) if s1 != s2: feat_vec[s1] -= 1 feat_vec[s2] += 1 return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger and return its weight vector.

    Each mis-tagged sentence subtracts the feature vector of the predicted
    tags and adds that of the expected tags; the per-epoch count of such
    sentences is printed.
    """
    weights = defaultdict(int)
    fallback_tag = tagset[0]
    for _epoch in range(numepochs):
        mistakes = 0
        for labeled_list, feat_list in train_data:
            predicted = perc.perc_test(weights, labeled_list, feat_list,
                                       tagset, fallback_tag)
            gold = [line.split()[2] for line in labeled_list]
            if predicted != gold:
                predicted_vec = global_feature_vector(feat_list, predicted)
                gold_vec = global_feature_vector(feat_list, gold)
                update_weight_vector(weights, predicted_vec, -1)
                update_weight_vector(weights, gold_vec, 1)
                mistakes += 1
        print(" ".join(["Number of mistakes: ", str(mistakes)]))
    return weights
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger and return its weight vector.

    Args:
        train_data: sequence of sentences indexed as ``item[0]`` (label
            list) and ``item[1]`` (feature list).
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        The ``defaultdict(int)`` handed to ``check_and_change`` for every
        mis-tagged word.
    """
    feat_vec = defaultdict(int)
    for _ in range(numepochs):
        for sentence in train_data:
            label_list = sentence[0]
            feat_list = sentence[1]
            z = perc.perc_test(feat_vec, label_list, feat_list, tagset,
                               tagset[0])
            # The gold tags depend only on the sentence, so extract them
            # once instead of once per word.
            gold = train_tags(label_list)
            for k in range(len(z)):
                if z[k] != gold[k]:
                    # Fresh helper results are still passed in, in case
                    # check_and_change modifies its arguments.
                    check_and_change(feat_vec, word_list(label_list),
                                     pos_list(label_list),
                                     train_tags(label_list), z, k)
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger and return its weight vector.

    For every sentence the global feature vector of the reference labels
    is added to the weights and that of the predicted labels is subtracted.
    The weights are written to ``opts.modelfile + str(epoch)`` after each
    epoch.

    :param train_data: iterable of ``(labeled_list, feat_list)`` pairs
    :param tagset: candidate tags; ``tagset[0]`` is the default tag
    :param numepochs: number of passes over ``train_data``
    :return: the ``defaultdict(int)`` of weights
    """
    weights = defaultdict(int)
    fallback_tag = tagset[0]
    for epoch in range(numepochs):
        for labeled_list, feat_list in train_data:
            gold_labels = get_labels(labeled_list)
            predicted = perc.perc_test(weights, labeled_list, feat_list,
                                       tagset, fallback_tag)
            # (vector, sign) pairs, built and applied gold first.
            signed_vectors = (
                (get_global_vector(gold_labels, feat_list), 1),
                (get_global_vector(predicted, feat_list), -1),
            )
            for vector, sign in signed_vectors:
                add_vector(weights, vector, sign)
        perc.perc_write_to_file(weights, opts.modelfile + str(epoch))
    return weights
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger and return its weight vector.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs; the
            gold tag is the third space-separated field of each line and
            ``feat_list`` holds 20 consecutive features per word.
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` keyed by ``(feature, tag)``; the bigram feature
        is keyed as ``('B:<previous tag>', tag)``, with ``'B_-1'`` standing
        in for the tag before the first word.
    """
    feats_per_word = 20
    feat_vec = defaultdict(int)
    default_tag = tagset[0]

    for epoch in range(numepochs):
        num_errors = 0
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            elements = [element.split(" ")[2] for element in labeled_list]
            for j, true_label in enumerate(elements):
                argmax_label = output[j]
                if true_label == argmax_label:
                    continue
                num_errors += 1

                # Previous tags for the bigram feature; the first word uses
                # the start marker.
                if j > 0:
                    true_prev = elements[j - 1]
                    argmax_prev = output[j - 1]
                else:
                    true_prev = argmax_prev = 'B_-1'

                start = j * feats_per_word
                for feat in feat_list[start:start + feats_per_word]:
                    # Plain equality test: every 'B' feature takes the
                    # bigram branch, whatever the word position.
                    if feat == "B":
                        feat_vec["B:" + true_prev, true_label] += 1
                        feat_vec["B:" + argmax_prev, argmax_label] -= 1
                    else:
                        feat_vec[feat, true_label] += 1
                        feat_vec[feat, argmax_label] -= 1
        print("Number of error in Epoch %d   %d" % (epoch + 1, num_errors))
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron and return an averaged weight vector.

    For every mis-tagged word ``update_featVector`` is called with the raw
    weights (``feat_vec``), the averaged accumulator (``avg_vec``) and the
    bookkeeping dict ``changed_vec``. After training, each key still listed
    in ``changed_vec`` receives one more contribution of
    ``feat_vec[k] * multiplier``, and every entry of ``avg_vec`` is divided
    by ``numepochs * len(train_data)``.

    Args:
        train_data: sequence indexed as ``item[0]`` (labelled lines, gold
            tag in the third space-separated field) and ``item[1]``
            (features).
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``avg_vec``, a ``defaultdict`` of averaged weights.
    """
    feat_vec = defaultdict(int)
    avg_vec = defaultdict(int)
    # Records when vectors were changed.
    # NOTE(review): the original comment says the value is
    # (feature/x, epoch/count, totalfeatures), while changed_vec_value below
    # is built as [count, x, len(train_data)] -- confirm what
    # update_featVector actually stores.
    changed_vec = { }
    print 'numepochs = ', numepochs
    for count in range(0, numepochs):
        print "Epoch: " + str(count)
        for x in range(0, len(train_data)):
            true_features = create_featureSchema(train_data[x][1], train_data[x][0])
            output_label = perc.perc_test(feat_vec, train_data[x][0], train_data[x][1], tagset, tagset[0])
            for y in range(0, len(output_label)):
                true_label = train_data[x][0][y].split(' ')[2]
                if output_label[y] != true_label:
                    changed_vec_value = [count, x, len(train_data)]
                    # True only for the last sentence of the last epoch.
                    apply_change = (count == (numepochs - 1) and x == (len(train_data) - 1))
                    # NOTE(review): for y == 0 the index y - 1 is -1, so both
                    # "previous" values below come from the LAST word of the
                    # sentence rather than a start marker -- confirm
                    # update_featVector expects that.
                    previous_true_label = train_data[x][0][y - 1].split(' ')[2]
                    update_featVector(output_label[y], output_label[y - 1], true_features[y], true_label, previous_true_label, feat_vec, avg_vec, changed_vec, changed_vec_value, apply_change)
    # Final catch-up pass over every key recorded in changed_vec.
    for k in changed_vec.iterkeys():
        # NOTE(review): the names look swapped relative to their values:
        # this_feat is the sentence count, this_epoch the epoch count, and
        # (if values are stored as [count, x, len(train_data)]) last_feat is
        # an epoch index and last_epoch a sentence index. The multiplier
        # then works out to len^2 + numepochs - epoch*len - sentence;
        # verify this is the intended number of remaining steps.
        this_feat = len(train_data)
        this_epoch = numepochs
        last_feat = changed_vec[k][0]
        last_epoch = changed_vec[k][1]
        num_feat = changed_vec[k][2]
        multiplier = (this_feat * num_feat + this_epoch - last_feat * num_feat - last_epoch)
        avg_vec[k] += (feat_vec[k] * multiplier)
    # Normalise by the total number of (epoch, sentence) steps; the 1.0
    # factor forces float division.
    for k in avg_vec.iterkeys():
        avg_vec[k] = 1.0 * avg_vec[k] / (numepochs * len(train_data))
    return avg_vec
def perc_train(train_data, tagset, n):
    """Train an averaged perceptron tagger for ``n`` passes.

    The raw weights are added to a running sum after every sentence. At
    the end, keys whose sum is zero are removed and the remaining sums are
    divided by ``n * len(train_data)``. Progress is reported on stderr.
    """
    weights = defaultdict(int)
    weight_sum = defaultdict(float)
    sys.stderr.write("training data ..." + "\n")
    total = len(train_data)

    for epoch in range(0, n):
        for sent_idx in range(0, total):
            progress = "Iteration: %d/%d. Sentence: %d/%d\t" %(epoch+1, n, sent_idx+1, total)
            sys.stderr.write('\r{0}'.format(progress))
            sentence = train_data[sent_idx]
            labeled_list = sentence[0]
            feat_list = sentence[1]

            # Reference tags, then the Viterbi output.
            gold = [tags.split(' ')[2] for tags in labeled_list]
            guess = perc.perc_test(weights, labeled_list, feat_list, tagset,
                                   'B-NP')

            if gold != guess:
                cursor = 0
                for pos, predicted_tag in enumerate(guess):
                    true_tag = gold[pos]
                    (cursor, feats) = perc.feats_for_word(cursor, feat_list)
                    for feat in feats:
                        if feat != 'B':
                            wrong_key = (feat, predicted_tag)
                            right_key = (feat, true_tag)
                        else:
                            if pos >= 1:
                                guess_prev = guess[pos-1]
                                gold_prev = gold[pos-1]
                            else:
                                guess_prev = gold_prev = 'B_-1'
                            wrong_key = (feat+':'+guess_prev, predicted_tag)
                            right_key = (feat+':'+gold_prev, true_tag)
                        if wrong_key != right_key:
                            weights[wrong_key] -= 1
                            weights[right_key] += 1

            # Fold the current weights into the running sum.
            for key in weights:
                weight_sum[key] += weights[key]

    sys.stderr.write("\ndone" + "\n")

    # Scale non-zero sums in place; zero sums are dropped afterwards.
    dead_keys = []
    for key in weight_sum:
        if weight_sum[key] != 0:
            weight_sum[key] = weight_sum[key]/(n*total)
        else:
            dead_keys.append(key)
    for key in dead_keys:
        del weight_sum[key]
    return weight_sum
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron using lazily updated averages.

    ``feat_vec`` holds the raw weights used for decoding, ``avg_feat_vec``
    the accumulator that is averaged at the end, and ``tau_feat_vec`` maps
    each ``(feature, tag)`` key to the ``(sentence index, epoch)`` at which
    it was last touched. The actual arithmetic lives in the helpers
    ``lazy_update_vect``, ``update_bigram_vect``, ``update_unigram_vect``
    and ``final_lazy_update_vect``, which are defined elsewhere.

    The last sentence of the last epoch is handled by a separate branch:
    it first calls ``final_lazy_update_vect`` and then applies the plain
    updates without any tau bookkeeping.

    Args:
        train_data: sequence of ``(labeled_list, feat_list)`` pairs; the
            gold tag is the third whitespace-separated field of each line.
        tagset: candidate tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``avg_feat_vec`` with every value divided by
        ``numepochs * len(train_data)``.

    Raises:
        ValueError: if ``tagset`` is empty or a word has no features.
    """
    feat_vec = defaultdict(float)
    avg_feat_vec = defaultdict(float)
    tau_feat_vec = dict()
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    m = len(train_data)  # length of training data
    for t in range(numepochs):
        print 'Iteration#',t,' is processing now.'
        for j, (labeled_list, feat_list) in enumerate(train_data):
            labels = copy.deepcopy(labeled_list)
            # Decode the sentence with the current raw weights.
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # Every sentence except the very last one of the last epoch
            # goes through the lazy-update path.
            if j != m - 1 or t != numepochs - 1:
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    # retrieve the features for this word
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    label = labels[i].split()[2]
                    if i > 0:
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:
                            if feat[0] == 'B':  # bigram feature
                                feat_out = feat + ":" + output[i-1]  # "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # "B:<previous label>"
                                if output[i] != label or feat_out != feat_lab:
                                    # lazily bring both keys up to date before changing them
                                    lazy_update_vect(feat_out, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                    lazy_update_vect(feat_lab, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                    # update the raw feature vector; per the original
                                    # author, feat_out == feat_lab triggers a second
                                    # kind of update inside the helper
                                    update_bigram_vect(feat_vec, avg_feat_vec, feat_out, feat_lab, output[i], label)
                                    # stamp both keys with the current (sentence, epoch)
                                    tau_feat_vec[feat_out, output[i]] = (j, t)
                                    tau_feat_vec[feat_lab, label] = (j, t)
                            elif output[i] != label:
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                # unigram features (U00 to U22)
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label] = (j, t)
                    else:
                        # i == 0: the first word of the sentence
                        # NOTE(review): this branch uses '_B-2' as the
                        # previous-label marker while the last-sentence
                        # branch below uses '_B-1'; confirm which marker
                        # perc.perc_test uses for the first word.
                        label_pre = '_B-2'
                        for feat in feats:
                            if feat[0] == 'B' and output[i] != label:  # bigram feature case
                                feat = feat + ":" + label_pre
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                update_bigram_vect(feat_vec, avg_feat_vec, feat, feat, output[i], label)
                                tau_feat_vec[feat, label] = (j, t)
                                tau_feat_vec[feat, output[i]] = (j, t)
                            elif output[i] != label:
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                # unigram features (U00 to U22)
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label] = (j, t)
            else:
                # special case for the last sentence of the last epoch
                final_lazy_update_vect(tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    # retrieve the features for this word
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    label = labels[i].split()[2]
                    if i > 0:
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:
                            if feat[0] == 'B':  # bigram feature
                                feat_out = feat + ":" + output[i-1]  # "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # "B:<previous label>"
                                # NOTE(review): unlike the branch above, this
                                # does not also test feat_out != feat_lab.
                                if output[i] != label:
                                    # update the raw feature vector
                                    update_bigram_vect(feat_vec, avg_feat_vec, feat_out, feat_lab, output[i], label)
                            elif output[i] != label:
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
                    else:
                        # i == 0: the first word of the sentence
                        # NOTE(review): marker is '_B-1' here but '_B-2' in
                        # the branch above (see note there).
                        label_pre = '_B-1'
                        for feat in feats:
                            if feat[0] == 'B' and output[i] != label:  # bigram feature case
                                feat = feat + ":" + label_pre
                                update_bigram_vect(feat_vec, avg_feat_vec, feat, feat, output[i], label)
                            elif output[i] != label:
                                # unigram features (U00 to U22)
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
        # end of iteration
    # averaging perceptron: divide by the total number of
    # (epoch, sentence) steps
    for key in avg_feat_vec.keys():
        avg_feat_vec[key] = avg_feat_vec[key]/float(numepochs*m)
    # The raw (un-averaged) weights are what gets written to disk; the
    # averaged ones are returned.
    perc.perc_write_to_file(feat_vec, 'model_feat_vec')
    return avg_feat_vec
def avg_perc_train(train_data, tagset, n):
    """Train an averaged perceptron tagger and return the averaged weights.

    Args:
        train_data: iterable of (labeled_list, feat_list) pairs, one pair
            per training sentence.
        tagset: non-empty sequence of tags; tagset[0] is handed to
            perc.perc_test as the default tag.
        n: number of passes (epochs) over train_data.

    Returns:
        A defaultdict mapping (feature, tag) to the weight averaged over
        every sentence processed (one step per sentence per epoch), as
        floats.

    Raises:
        ValueError: if tagset is empty, or if perc.feats_for_word returns
            no features for a word of a mispredicted sentence.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)
    # avg_vec[key] holds the sum of feat_vec[key] over the first
    # last_iter[key] steps; later steps are credited lazily, just before
    # the weight changes and once more at the very end.
    avg_vec = defaultdict(int)
    last_iter = {}
    num_updates = 0
    for _ in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            num_updates += 1
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output == true_output:
                continue
            num_mistakes += 1
            # Pad both sequences so position i-1 exists for the first word.
            output.insert(0, 'B_-1')
            output.append('B_+1')
            true_output.insert(0, 'B_-1')
            true_output.append('B_+1')
            feat_index = 0
            for i in range(1, len(output) - 1):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write("%s %s \n\n" % (" ".join(labeled_list),
                                                     " ".join(feat_list)))
                    raise ValueError(
                        "features do not align with input sentence")
                # Net change per key for this word. The -1/+1 pair cancels
                # to 0 when the predicted key equals the gold key.
                feat_vec_update = defaultdict(int)
                for feat in feats:
                    if feat == 'B':
                        output_feat = 'B:' + output[i - 1]
                        truth_feat = 'B:' + true_output[i - 1]
                    else:
                        output_feat = truth_feat = feat
                    feat_vec_update[output_feat, output[i]] -= 1
                    feat_vec_update[truth_feat, true_output[i]] += 1
                for key, delta in feat_vec_update.items():
                    if delta == 0:
                        continue
                    # Credit the OLD weight for every step it was in force
                    # (up to, not including, the current step) before
                    # changing it. A second change to the same key within
                    # this sentence then credits zero steps, as it should.
                    credited = last_iter.get(key, 0)
                    avg_vec[key] += (num_updates - 1 - credited) * feat_vec[key]
                    last_iter[key] = num_updates - 1
                    feat_vec[key] += delta
                    logging.info(
                        "updating feat_vec with feature_id: (%s, %s) value: %d",
                        key[0], key[1], delta)
        sys.stderr.write("number of mistakes: %s\n" % num_mistakes)
    # Final flush: credit each weight for the steps since its last change,
    # then turn the sums into averages. float() avoids Python 2 integer
    # division. feat_vec only has keys if at least one sentence was seen,
    # so num_updates is non-zero whenever this loop body runs.
    for key in feat_vec:
        avg_vec[key] += (num_updates - last_iter.get(key, 0)) * feat_vec[key]
        feat_vec[key] = avg_vec[key] / float(num_updates)
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron chunk tagger.

    Update rule (kept from the original implementation): a word is only
    updated when its predicted tag differs from the gold tag. A unigram
    feature gets -1 for the predicted tag and +1 for the gold tag. A
    bigram feature (name starting with 'B') gets the same -1/+1 under BOTH
    the predicted and the gold previous-tag context, so the change is
    applied twice when the two contexts agree. The first word of a
    sentence uses the single context 'B_-1'.

    Args:
        train_data: iterable of (labeled_list, feat_list) pairs; the third
            whitespace-separated field of each labeled_list entry is the
            gold tag.
        tagset: non-empty sequence of tags; tagset[0] is the default tag
            passed to perc.perc_test.
        numepochs: number of passes over train_data. This is now honoured;
            the original overwrote it with a hard-coded 2.

    Returns:
        defaultdict(float) mapping (feature, tag) to the weight averaged
        over all processed sentences. Only updated keys are present.

    Raises:
        ValueError: if tagset is empty or perc.feats_for_word returns no
            features for a word.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(float)
    # stamped[key] = sum of (step * delta) over every change to key. With
    # N steps in total, the sum of the weight over all steps equals
    # (N + 1) * weight - stamped, which gives the average without any
    # per-key timestamps.
    stamped = defaultdict(float)
    step = 0
    for t in range(numepochs):
        print("Iteration# %s  is processing now." % t)
        for (labeled_list, feat_list) in train_data:
            step += 1
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            feat_index = 0
            for i, predicted in enumerate(output):
                # Called for every word so feat_index stays aligned.
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write("%s %s \n\n" % (" ".join(labeled_list),
                                                     " ".join(feat_list)))
                    raise ValueError(
                        "features do not align with input sentence")
                label = labeled_list[i].split()[2]
                if predicted == label:
                    continue
                if i > 0:
                    contexts = (output[i - 1],
                                labeled_list[i - 1].split()[2])
                else:
                    contexts = ('B_-1',)
                for feat in feats:
                    if feat[0] == 'B':
                        names = [feat + ":" + prev for prev in contexts]
                    else:
                        names = [feat]
                    for name in names:
                        for key, delta in (((name, predicted), -1.0),
                                           ((name, label), 1.0)):
                            feat_vec[key] += delta
                            stamped[key] += step * delta
    avg_feat_vec = defaultdict(float)
    # stamped is non-empty only if at least one sentence was processed,
    # so step is non-zero whenever the division below runs.
    for key, drift in stamped.items():
        avg_feat_vec[key] = ((step + 1) * feat_vec[key] - drift) / float(step)
    return avg_feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train a (non-averaged) perceptron with unigram, bigram and trigram
    features.

    For every mispredicted word, each feature gets +1 for the gold tag and
    -1 for the predicted tag. Features whose name starts with 'B' are
    suffixed with ":<previous tag>" and those starting with 'T' with
    ":<tag two back>,<previous tag>", using the gold history for the
    rewarded key and the predicted history for the penalised key.

    Changes from the original:
      * numepochs is honoured (it was overwritten with a hard-coded 1).
      * 'B'/'T' features are always updated under their context key. The
        original fell through to the unigram branch and updated the bare
        feature name whenever the previous tags had been predicted
        correctly.
      * zero weights are pruned without deleting from the dict while
        iterating it.

    Args:
        train_data: iterable of (labeled_list, feat_list) pairs; the third
            whitespace-separated field of each labeled_list entry is the
            gold tag.
        tagset: non-empty sequence of tags; tagset[0] is the default tag.
        numepochs: number of passes over train_data.

    Returns:
        defaultdict(int) mapping (feature, tag) to its weight, with
        zero-valued entries removed.

    Raises:
        ValueError: if tagset is empty or perc.feats_for_word returns no
            features for a word.

    Side effects:
        After each epoch the current weights are written through
        perc.perc_write_to_file to 'mid_model_iter<epoch>'.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)
    for t in range(numepochs):
        print("Iteration# %s  is processing now." % t)
        for cnt, (labeled_list, feat_list) in enumerate(train_data, 1):
            print("Sentence[ %s ] is now processing..." % cnt)
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            feat_index = 0
            for i, predicted in enumerate(output):
                # Called for every word so feat_index stays aligned.
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write("%s %s \n\n" % (" ".join(labeled_list),
                                                     " ".join(feat_list)))
                    raise ValueError(
                        "features do not align with input sentence")
                label = labeled_list[i].split()[2]
                if predicted == label:
                    continue
                # (predicted history, gold history) for each context type.
                if i > 1:
                    gold_1 = labeled_list[i - 1].split()[2]
                    gold_2 = labeled_list[i - 2].split()[2]
                    bigram = (output[i - 1], gold_1)
                    trigram = (output[i - 2] + "," + output[i - 1],
                               gold_2 + "," + gold_1)
                elif i == 1:
                    gold_1 = labeled_list[0].split()[2]
                    bigram = (output[0], gold_1)
                    # NOTE(review): the boundary symbol here is '_-1' while
                    # the first word uses '_B-1'/'_B-2'. Kept as in the
                    # original; confirm against what perc.perc_test expects.
                    trigram = ('_-1' + "," + output[0],
                               '_-1' + "," + gold_1)
                else:
                    bigram = ('_B-1', '_B-1')
                    trigram = ('_B-2' + "," + '_B-1', '_B-2' + "," + '_B-1')
                for feat in feats:
                    kind = feat[:1]
                    if kind == 'T':
                        out_name = feat + ":" + trigram[0]
                        gold_name = feat + ":" + trigram[1]
                    elif kind == 'B':
                        out_name = feat + ":" + bigram[0]
                        gold_name = feat + ":" + bigram[1]
                    else:
                        out_name = gold_name = feat
                    feat_vec[gold_name, label] += 1
                    feat_vec[out_name, predicted] -= 1
        perc.perc_write_to_file(feat_vec, 'mid_model_iter' + str(t))
    # Collect the keys first: deleting while iterating raises on Python 3.
    for key in [k for k, v in feat_vec.items() if v == 0]:
        del feat_vec[key]
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron chunk tagger.

    For every mispredicted word, each of its features gets -1 for the
    predicted tag and +1 for the gold tag. In addition the bigram key
    "B:<previous predicted tag>" gets -1 for the predicted tag and
    "B:<previous gold tag>" gets +1 for the gold tag; the first word of a
    sentence uses "B_-1" as the previous tag.

    Changes from the original:
      * the tagset argument is used. The original overwrote it by
        re-reading opts.tagsetfile for every sentence.
      * the average is exact. The original never ran its final flush
        (its guard `epoch != numepochs or sen != numsen` is always true)
        and over-counted the idle interval on each update.

    Args:
        train_data: sequence of sentences; item[0] is the list of
            "word postag chunktag" strings (single-space separated) and
            item[1] the flat feature list.
        tagset: sequence of tags; tagset[0] is the default tag passed to
            perc.perc_test.
        numepochs: number of passes over train_data.

    Returns:
        defaultdict(float) mapping (feature, tag) to the weight averaged
        over all processed sentences. Only updated keys are present.
    """
    feat_vec = defaultdict(int)
    # stamped[key] = sum of (step * delta) over every change to key. With
    # N steps in total, the sum of the weight over all steps equals
    # (N + 1) * weight - stamped.
    stamped = defaultdict(int)
    numsen = len(train_data)
    step = 0
    for epoch in range(numepochs):
        print(epoch)
        for sen, sentence_data in enumerate(train_data):
            label_list = sentence_data[0]
            feat_list = sentence_data[1]
            truetags = []
            for label in label_list:
                # Three-way unpack keeps the original's failure on
                # malformed lines.
                (_word, _postag, chunktag) = label.split(" ")
                truetags.append(chunktag)
            default_tag = tagset[0]
            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, default_tag)
            step += 1
            feat_index = 0
            for i, tru in enumerate(truetags):
                # Called for every word so feat_index stays aligned.
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                argmax = argmaxtags[i]
                if argmax == tru:
                    continue
                if i == 0:
                    prev_argmax = prev_true = "B_-1"
                else:
                    prev_argmax = argmaxtags[i - 1]
                    prev_true = truetags[i - 1]
                changes = []
                for f in feats_for_this_word:
                    changes.append(((f, argmax), -1))
                    changes.append(((f, tru), 1))
                changes.append((("B:" + prev_argmax, argmax), -1))
                changes.append((("B:" + prev_true, tru), 1))
                for key, delta in changes:
                    feat_vec[key] += delta
                    stamped[key] += step * delta
            if sen % 1000 == 0:
                print(str(sen) + "/" + str(numsen))
    cumulative_feat_vec = defaultdict(float)
    # stamped is non-empty only if at least one sentence was processed,
    # so step is non-zero whenever the division below runs.
    for key, drift in stamped.items():
        cumulative_feat_vec[key] = (
            ((step + 1) * feat_vec[key] - drift) / float(step))
    return cumulative_feat_vec
def avg_perc_train(train_data, tagset, iterations=1):
    """Train an averaged perceptron on random mini-batches.

    Each iteration draws up to 128 sentences from train_data without
    replacement and runs one perceptron pass over them. The sum of all
    intermediate weight vectors is maintained lazily: a key's pending
    contribution is added only when that key changes, or when the last
    sentence of an iteration is reached.

    Args:
        train_data: sequence of (labeled_list, feat_list) pairs; each
            labeled_list entry splits into exactly three fields, the third
            being the gold tag.
        tagset: sequence of tags; tagset[0] is the default tag passed to
            perc.perc_test.
        iterations: number of mini-batches to train on.

    Returns:
        The result of FeatureVector.export() on the summed weights divided
        by the number of sentences processed.

    Side effects:
        Prints progress and both tag sequences for every sentence, and
        dumps the running average to
        "models/jetic_avg_Iter_<k>.model" after each iteration.

    NOTE(review): assumes FeatureVector yields 0 for missing keys and
    supports +=, -, ==, /, iteration over keys and keys(). Confirm against
    its definition.
    """
    feat_vec = FeatureVector()
    feat_vec_sum = FeatureVector()
    last_change_dict = FeatureVector()
    total_sentence_count = 0
    default_tag = tagset[0]
    import random
    for iteration in range(iterations):
        # Cap the batch size: random.sample raises ValueError when asked
        # for more items than the population holds.
        batch_train_data = random.sample(train_data,
                                         min(128, len(train_data)))
        sentence_total = len(batch_train_data)
        for sentence_count, (labeled_list, feat_list) in enumerate(
                batch_train_data, 1):
            # The average divides by the number of sentences actually
            # used, counting repeats across iterations.
            total_sentence_count += 1
            print("iteration %s sentence %s of %s"
                  % (iteration, sentence_count, sentence_total))

            # Gold tag sequence, padded with boundary symbols.
            gold_output = ['B_-1']
            for entry in labeled_list:
                (w, t, label) = entry.split()
                gold_output.append(label)
            gold_output.append('B_+1')

            # Predicted tag sequence, padded the same way.
            local_output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                          tagset, default_tag)
            local_output.insert(0, 'B_-1')
            local_output.append('B_+1')
            print(gold_output)
            print(local_output)

            local_vec = retrieve_feature(local_output, feat_list)
            gold_vec = retrieve_feature(gold_output, feat_list)
            delta_vec = gold_vec - local_vec
            mispredicted = not gold_vec == local_vec

            if sentence_count != sentence_total:
                if mispredicted:
                    # Credit the old weight of each changing key for the
                    # steps since it last changed.
                    for key in delta_vec:
                        feat_vec_sum[key] += feat_vec[key] * (
                            total_sentence_count - last_change_dict[key])
                        last_change_dict[key] = total_sentence_count
            else:
                # Last sentence of the iteration: flush every pending
                # contribution. The set union visits each key once and
                # works whether keys() returns a list or a view.
                pending = set(last_change_dict.keys()) | set(feat_vec.keys())
                for key in pending:
                    feat_vec_sum[key] += feat_vec[key] * (
                        total_sentence_count - last_change_dict[key])
                    last_change_dict[key] = total_sentence_count

            if mispredicted:
                # The sum is a sum of weight vectors, so it receives the
                # same delta as the weights for the current step.
                feat_vec += delta_vec
                feat_vec_sum += delta_vec

        # Dump every iteration
        tmp = feat_vec_sum / total_sentence_count
        tmp.dump("models/jetic_avg_Iter_" + str(iteration + 1) + ".model")

    # Finalisation: divide the summed weights by the sentence count.
    feat_vec = feat_vec_sum / total_sentence_count
    return feat_vec.export()
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron chunk tagger and report timings.

    Update rule: for a mispredicted word, every feature in its slice of
    feat_list gets +1 for the gold tag and -1 for the predicted tag; the
    feature "B" is rewritten to "B:<previous tag>" using the gold history
    for the rewarded key and the predicted history for the penalised key.
    A correctly tagged word whose previous tag was mispredicted still gets
    the bigram update.

    Changes from the original:
      * the average uses true division (on Python 2 the original's
        int / int division floored the averaged weights).
      * the first word uses the context "B:B_-1" for its bigram feature
        instead of updating the bare "B" key.
      * the average is accumulated per update, not by summing the whole
        weight vector; it is the mean of the weights after every
        sentence, matching the original's divisor
        numepochs * len(train_data).

    Args:
        train_data: iterable of (labeled_list, feat_list) pairs; the third
            single-space-separated field of each labeled_list entry is the
            gold tag.
        tagset: non-empty sequence of tags; tagset[0] is the default tag.
        numepochs: number of passes over train_data.

    Returns:
        defaultdict mapping (feature, tag) to its averaged weight (float).
        Only updated keys are present.
    """
    starttime = datetime.now()
    feat_vec = defaultdict(int)
    # stamped[key] = sum of (step * delta) over every change to key. With
    # N steps in total, the sum of the weight over all steps equals
    # (N + 1) * weight - stamped.
    stamped = defaultdict(int)
    default_tag = tagset[0]
    # NOTE(review): the original hard-codes 20 features per word when
    # slicing feat_list; kept as is. Confirm it matches the feature file.
    feats_per_word = 20
    step = 0
    for i in range(numepochs):
        epochstarttime = datetime.now()
        numOfError = 0
        for (labeled_list, feat_list) in train_data:
            argMaxoutput = perc.perc_test(feat_vec, labeled_list, feat_list,
                                          tagset, default_tag)
            step += 1
            expectedOutput = [
                element.split(" ")[2] for element in labeled_list
            ]
            for j, trueLabel in enumerate(expectedOutput):
                argMaxLabel = argMaxoutput[j]
                if j > 0:
                    true_bigram = "B:" + expectedOutput[j - 1]
                    argmax_bigram = "B:" + argMaxoutput[j - 1]
                else:
                    true_bigram = argmax_bigram = "B:B_-1"
                changes = []
                if trueLabel != argMaxLabel:
                    numOfError += 1
                    start = j * feats_per_word
                    for feat in feat_list[start:start + feats_per_word]:
                        if feat == "B":
                            changes.append(((true_bigram, trueLabel), 1))
                            changes.append(((argmax_bigram, argMaxLabel), -1))
                        else:
                            changes.append(((feat, trueLabel), 1))
                            changes.append(((feat, argMaxLabel), -1))
                elif true_bigram != argmax_bigram:
                    # Tag is right but was reached from a wrong previous
                    # tag: move weight between the two bigram contexts.
                    changes.append(((true_bigram, trueLabel), 1))
                    changes.append(((argmax_bigram, argMaxLabel), -1))
                for key, delta in changes:
                    feat_vec[key] += delta
                    stamped[key] += step * delta
        epochendtime = datetime.now()
        print(" ".join(["Number of error in Epoch", str(i + 1), " ",
                        str(numOfError), " Time Taken:",
                        str(epochendtime - epochstarttime)]))
    gamma = defaultdict(int)
    # stamped is non-empty only if at least one sentence was processed,
    # so step is non-zero whenever the division below runs.
    for key, drift in stamped.items():
        gamma[key] = ((step + 1) * feat_vec[key] - drift) / float(step)
    endtime = datetime.now()
    print("Total Time taken to train: %s" % (endtime - starttime))
    return gamma
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron chunk tagger.

    For each mispredicted sentence, get_feature_ids supplies the feature
    keys of the gold and of the predicted tagging for every word. At each
    word whose tag is wrong, the gold keys get +1 and the predicted keys
    get -1.

    Changes from the original:
      * the "last sentence" test `t != T - 1 & i != m - 1` parsed as a
        chained comparison on `(T - 1) & i`, so the final flush ran at
        arbitrary sentences; the flush itself mixed 0-based timestamps
        with a 1-based end time; and timestamps of correctly tagged words
        were reset without crediting their pending weight. The average is
        now accumulated per update and needs no timestamps or flush.

    Args:
        train_data: sequence of sentences; item[0] is the labelled word
            list (third whitespace-separated field is the gold tag) and
            item[1] the flat feature list.
        tagset: non-empty sequence of tags; tagset[0] is the default tag.
        numepochs: number of passes over train_data.

    Returns:
        defaultdict mapping each feature key to its weight averaged over
        all processed sentences. Only updated keys are present.

    NOTE(review): assumes get_feature_ids returns two sequences indexed by
    word position whose items are hashable weight keys. Confirm against
    its definition.
    """
    feat_vec = defaultdict(int)
    # stamped[key] = sum of (step * delta) over every change to key. With
    # N steps in total, the sum of the weight over all steps equals
    # (N + 1) * weight - stamped.
    stamped = defaultdict(int)
    step = 0

    # Main loop
    for t in range(numepochs):
        print('EPOCH:', t + 1)
        mistakes = 0
        for i, sentence in enumerate(train_data):
            if i % 1000 == 0:
                print(i, end=' ')

            labeled_list = sentence[0]
            feat_list = sentence[1]
            gold_tags = [ll.split()[2] for ll in labeled_list]
            output_tags = perc.perc_test(feat_vec, labeled_list, feat_list,
                                         tagset, default_tag=tagset[0])
            step += 1
            if output_tags == gold_tags:
                continue
            mistakes += 1

            feat_ids_gold, feat_ids_output = get_feature_ids(
                feat_list, gold_tags, output_tags)
            for k in range(len(gold_tags)):
                if output_tags[k] == gold_tags[k]:
                    continue
                for f in feat_ids_gold[k]:
                    feat_vec[f] += 1
                    stamped[f] += step
                for f in feat_ids_output[k]:
                    feat_vec[f] -= 1
                    stamped[f] -= step

        print('\nMistakes in epoch {0}: {1} out of {2} sentences'.format(
            t + 1, mistakes, len(train_data)))

    sigma_vec = defaultdict(int)
    # stamped is non-empty only if at least one sentence was processed,
    # so step is non-zero whenever the division below runs.
    for key, drift in stamped.items():
        sigma_vec[key] = ((step + 1) * feat_vec[key] - drift) / float(step)
    return sigma_vec
def perc_train(train_data, tagset, T):
    """Train an averaged perceptron chunk tagger with lazy averaging.

    For every word of a mispredicted sentence and every feature of that
    word, the key for the predicted tag gets -1 and the key for the gold
    tag gets +1 (no change when the two keys are equal). The feature 'B'
    is expanded to 'B:<previous tag>' using the predicted history for the
    penalised key and the gold history for the rewarded key; the first
    word uses 'B_-1'.

    The sum of all intermediate weight vectors is kept lazily: tau[key]
    stores the step at which the key was last brought up to date, and the
    pending `weight * idle steps` is added just before the key is touched
    again. On the last sentence of the last epoch every key in tau is
    flushed first and the updates are then applied directly.

    The arithmetic is unchanged from the original. The duplicated
    last-sentence copy of the update loop is merged into one loop, the
    unused second accumulator is removed, and progress output uses
    sys.stderr.write so it no longer depends on the print statement.

    Args:
        train_data: sequence of sentences; item[0] is the labelled word
            list (third single-space-separated field is the gold tag) and
            item[1] the flat feature list.
        tagset: sequence of tags passed to perc.perc_test.
        T: number of passes over train_data.

    Returns:
        defaultdict(float) mapping (feature, tag) to the summed weight
        divided by T * len(train_data); zero-valued entries are removed.

    NOTE(review): the default tag passed to perc.perc_test is the literal
    'B-NP', not tagset[0]. Kept as in the original.
    """
    feat_vec = defaultdict(int)
    sigma_feat_vec = defaultdict(float)
    tau = {}
    sys.stderr.write("training data ...\n")
    M = len(train_data)
    for t in range(T):
        for i in range(M):
            sys.stderr.write("\rIteration: %d/%d. Sentence: %d/%d\t"
                             % (t + 1, T, i + 1, M))
            labeled_list = train_data[i][0]
            feat_list = train_data[i][1]
            # Gold tags from the training data.
            toutput = [tags.split(' ')[2] for tags in labeled_list]
            # Predicted tags from the current weights.
            zoutput = perc.perc_test(feat_vec, labeled_list, feat_list,
                                     tagset, 'B-NP')
            # 1-based global step number of this sentence.
            now = (t + 1) * M + (i + 1)
            final_step = (t == T - 1 and i == M - 1)
            if final_step:
                # Bring every tracked key up to date before the last
                # update; from here on changes go straight into the sum.
                for s in tau:
                    sigma_feat_vec[s] += feat_vec[s] * (now - tau[s])
            if toutput == zoutput:
                continue
            index = 0
            for p in range(len(zoutput)):
                predicted_tag = zoutput[p]
                true_tag = toutput[p]
                (index, feats) = perc.feats_for_word(index, feat_list)
                for feat in feats:
                    if feat == 'B':
                        if p >= 1:
                            zprevtag = zoutput[p - 1]
                            tprevtag = toutput[p - 1]
                        else:
                            zprevtag = tprevtag = 'B_-1'
                        s1 = (feat + ':' + zprevtag, predicted_tag)
                        s2 = (feat + ':' + tprevtag, true_tag)
                    else:
                        s1 = (feat, predicted_tag)
                        s2 = (feat, true_tag)
                    if not final_step:
                        # Credit the weight held since the key was last
                        # touched.
                        if s1 in tau:
                            sigma_feat_vec[s1] += feat_vec[s1] * (now - tau[s1])
                        if s1 != s2 and s2 in tau:
                            sigma_feat_vec[s2] += feat_vec[s2] * (now - tau[s2])
                    if s1 != s2:
                        feat_vec[s1] -= 1
                        feat_vec[s2] += 1
                        sigma_feat_vec[s1] -= 1
                        sigma_feat_vec[s2] += 1
                    if not final_step:
                        tau[s1] = now
                        tau[s2] = now
    sys.stderr.write("\ndone\n")
    sys.stderr.write("computing average vector ...\n")
    total_steps = T * M
    # Snapshot the keys so zero entries can be deleted during the loop.
    for f in list(sigma_feat_vec):
        if sigma_feat_vec[f] == 0:
            del sigma_feat_vec[f]
        else:
            sigma_feat_vec[f] = sigma_feat_vec[f] / total_steps
    sys.stderr.write("done\n")
    return sigma_feat_vec
def avg_perc_train(train_data, tagset, n):
    """Train a perceptron tagger for n epochs and return averaged weights.

    Args:
        train_data: iterable of (labeled_list, feat_list) pairs, handed to
            perc.perc_test, get_truth and perc.feats_for_word.
        tagset: non-empty tag sequence; tagset[0] is used as the default tag.
        n: number of passes over train_data.

    Returns:
        The weight dictionary, keyed by (feature, tag), with every value
        replaced by its accumulated sum divided by the number of sentences
        processed.

    Raises:
        ValueError: if tagset is empty, or if perc.feats_for_word yields no
            features for a word.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]

    feat_vec = defaultdict(int)
    avg_vec = defaultdict(int)
    last_iter = {}   # key -> value of num_updates when the key last changed
    num_updates = 0  # sentences processed so far, across all epochs

    for _epoch in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            num_updates += 1
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output == true_output:
                continue

            num_mistakes += 1
            # Pad copies with boundary tags so position i - 1 always exists;
            # the lists returned by the helpers are left untouched.
            output = ['B_-1'] + list(output) + ['B_+1']
            true_output = ['B_-1'] + list(true_output) + ['B_+1']
            feat_index = 0
            for i in range(1, len(output) - 1):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write(" ".join(labeled_list) + " " +
                                     " ".join(feat_list) + " \n\n")
                    raise ValueError(
                        "features do not align with input sentence")

                # Collect this word's net change per key first: when the
                # predicted tag is right, -1 and +1 cancel to 0 and the key
                # is skipped below.
                feat_vec_update = defaultdict(int)
                for feat in feats:
                    if feat == 'B':
                        output_feat = 'B:' + output[i - 1]
                        truth_feat = 'B:' + true_output[i - 1]
                    else:
                        output_feat = truth_feat = feat
                    feat_vec_update[output_feat, output[i]] += -1
                    feat_vec_update[truth_feat, true_output[i]] += 1

                for key, delta in feat_vec_update.items():
                    if delta == 0:
                        continue
                    feat_vec[key] += delta
                    logging.info(
                        "updating feat_vec with feature_id: (%s, %s) "
                        "value: %d", key[0], key[1], delta)
                    if key in last_iter:
                        avg_vec[key] += ((num_updates - last_iter[key]) *
                                         feat_vec[key])
                    else:
                        avg_vec[key] = feat_vec[key]
                    last_iter[key] = num_updates
        sys.stderr.write("number of mistakes: %s\n" % num_mistakes)

    # Account for the steps since each key's last change, then average.
    # Only existing keys are reassigned, which is safe while iterating.
    for key in feat_vec:
        if key in last_iter:
            avg_vec[key] += (num_updates - last_iter[key]) * feat_vec[key]
        else:
            avg_vec[key] = feat_vec[key]
        # Explicit float division: both operands are ints, and Python 2
        # integer division would floor the averaged weight.
        feat_vec[key] = float(avg_vec[key]) / num_updates
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train perceptron weights with unigram, bigram and trigram updates.

    Args:
        train_data: iterable of (labeled_list, feat_list) pairs. The gold tag
            of a word is the third whitespace-separated field of its entry in
            labeled_list.
        tagset: non-empty tag sequence; tagset[0] is used as the default tag.
        numepochs: number of passes over train_data.

    Returns:
        A defaultdict(int) mapping (feature, tag) to its weight.

    Raises:
        ValueError: if tagset is empty, or if perc.feats_for_word yields no
            features for a word.
    """
    feat_vec = defaultdict(int)
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]

    # The epoch count comes from the caller; it is not overridden here.
    for t in range(numepochs):
        sys.stdout.write("Iteration# %s  is processing now.\n" % t)
        counter = 0
        for (labeled_list, feat_list) in train_data:
            counter += 1
            sys.stdout.write("%s\n" % counter)
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            feat_index = 0
            # Compare prediction and gold tag word by word.
            for i in range(len(output)):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write(" ".join(labeled_list) + " " +
                                     " ".join(feat_list) + " \n\n")
                    raise ValueError(
                        "features do not align with input sentence")
                guess = output[i]
                label = labeled_list[i].split()[2]

                if i > 1:
                    pre_label = labeled_list[i - 1].split()[2]
                    pre_pre_label = labeled_list[i - 2].split()[2]
                    window_ok = (output[i - 2] == pre_pre_label and
                                 output[i - 1] == pre_label and
                                 guess == label)
                    if window_ok:
                        continue
                    for feat in feats:
                        if feat[0] == "B":
                            # Bigram keys: "B:<previous output>" and
                            # "B:<previous label>".
                            feat_out = "B:" + output[i - 1]
                            feat_lab = "B:" + pre_label
                            if guess != label:
                                if output[i - 1] != pre_label:
                                    feat_vec[feat_out, guess] -= 1
                                    feat_vec[feat_lab, guess] -= 1
                                    feat_vec[feat_out, label] += 1
                                    feat_vec[feat_lab, label] += 1
                                else:
                                    feat_vec[feat_lab, guess] -= 2
                                    feat_vec[feat_lab, label] += 2
                            # Trigram keys; updated only when both context
                            # tags are right and the current tag is wrong.
                            tri_out = ("T:" + output[i - 2] + "/" +
                                       output[i - 1])
                            tri_lab = ("T:" + pre_pre_label + "/" +
                                       pre_label)
                            if (output[i - 2] == pre_pre_label and
                                    output[i - 1] == pre_label and
                                    guess != label):
                                feat_vec[tri_out, guess] -= 2
                                feat_vec[tri_lab, label] += 2
                        else:
                            # Unigram (U00..U22 style) features.
                            feat_vec[feat, guess] -= 1
                            feat_vec[feat, label] += 1
                elif i == 1:
                    pre_label = labeled_list[i - 1].split()[2]
                    if output[i - 1] != pre_label or guess != label:
                        for feat in feats:
                            if feat[0] == "B":
                                feat_vec["B:" + output[i - 1], guess] -= 1
                                feat_vec["B:" + pre_label, label] += 1
                            else:
                                feat_vec[feat, guess] -= 1
                                feat_vec[feat, label] += 1
                        # NOTE(review): the trigram update is applied once
                        # per word here -- confirm this weighting is the
                        # intended one.
                        feat_vec["T:B_-1/" + output[i - 1], guess] -= 1
                        feat_vec["T:B_-1/" + pre_label, label] += 1
                else:
                    # First word of the sentence: the context is the
                    # sentence-start padding.
                    for feat in feats:
                        key = "B:B_-1" if feat[0] == "B" else feat
                        feat_vec[key, guess] -= 1
                        feat_vec[key, label] += 1
                    # NOTE(review): applied once per word, as for i == 1 --
                    # confirm.
                    feat_vec["T:B_-2/B_-1", guess] -= 1
                    feat_vec["T:B_-2/B_-1", label] += 1
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train perceptron weights for a sequence tagger.

    For every word whose predicted tag differs from the gold tag, each of
    the word's features is decremented for the predicted tag and incremented
    for the gold tag, and the same is done for the bigram keys
    "B:<previous tag>" built from the predicted and the gold sequence.

    Args:
        train_data: iterable of sentences. Element [0] is the list of
            "word postag chunktag" lines (single-space separated); element
            [1] is the flat feature list consumed by perc.feats_for_word.
        tagset: tag sequence handed to perc.perc_test; tagset[0] is the
            default tag. If it is empty, the tagset is read once from
            opts.tagsetfile instead.
        numepochs: number of passes over train_data.

    Returns:
        A defaultdict(int) mapping (feature, tag) to its weight.
    """
    feat_vec = defaultdict(int)

    if not tagset:
        # Fallback for callers that pass no tagset: read it once, up front,
        # instead of once per sentence.
        tagset = perc.read_tagset(opts.tagsetfile)
    default_tag = tagset[0]

    for _epoch in range(numepochs):
        for sentence_data in train_data:
            label_list = sentence_data[0]
            feat_list = sentence_data[1]

            truetags = []
            for label in label_list:
                # Exactly three fields are required; anything else raises
                # ValueError from the unpacking.
                (_word, _postag, chunktag) = label.split(" ")
                truetags.append(chunktag)

            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, default_tag)

            feat_index = 0
            for i, tru in enumerate(truetags):
                # Always advance through the feature list, even for words
                # that were tagged correctly.
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                argmax = argmaxtags[i]
                if argmax == tru:
                    continue

                for f in feats_for_this_word:
                    feat_vec[f, argmax] -= 1
                    feat_vec[f, tru] += 1

                # Bigram keys use the previous tag of each sequence, or the
                # start marker for the first word.
                if i == 0:
                    argmaxprev = truprev = "B:B_-1"
                else:
                    argmaxprev = "B:" + argmaxtags[i - 1]
                    truprev = "B:" + truetags[i - 1]
                feat_vec[argmaxprev, argmax] -= 1
                feat_vec[truprev, tru] += 1
    return feat_vec
def perc_train(train_data, tagset, numepochs, word_set):
    """Train perceptron weights, checkpointing and scoring every third epoch.

    Args:
        train_data: sized iterable of (labeled_list, feat_list) pairs. The
            gold tag of a word is the third whitespace-separated field of
            its entry in labeled_list.
        tagset: non-empty tag sequence; tagset[0] is used as the default tag.
        numepochs: number of passes over train_data.
        word_set: passed through unchanged to perc.perc_test.

    Returns:
        A defaultdict(int) mapping (feature, tag) to its weight.

    Raises:
        ValueError: if tagset is empty, or if perc.feats_for_word yields no
            features for a word.

    Side effects:
        On epochs where t % 3 == 0 the weights are written to 'model_<t>'
        and 'model', and an external scoring pipeline is run.
    """
    feat_vec = defaultdict(int)
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]

    for t in range(numepochs):
        sys.stdout.write("Iteration# %s  is processing now.\n" % t)
        cnt = 0
        for (labeled_list, feat_list) in train_data:
            cnt += 1
            if cnt % 1000 == 0:
                # Progress relative to the actual corpus size.
                percent = round(100.0 * cnt / len(train_data), 2)
                sys.stdout.write("current status:  %s %%\n" % str(percent))

            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag, word_set)
            feat_index = 0
            # Compare prediction and gold tag word by word.
            for i in range(len(output)):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write(" ".join(labeled_list) + " " +
                                     " ".join(feat_list) + " \n\n")
                    raise ValueError(
                        "features do not align with input sentence")
                guess = output[i]
                label = labeled_list[i].split()[2]

                if i > 0:
                    label_pre = labeled_list[i - 1].split()[2]
                    for feat in feats:
                        if feat[0] == 'B':
                            # Bigram keys: "<feat>:<previous output>" and
                            # "<feat>:<previous label>".
                            feat_out = feat + ":" + output[i - 1]
                            feat_lab = feat + ":" + label_pre
                            if guess == label:
                                # Current tag is right: no bigram update.
                                continue
                            if output[i - 1] != label_pre:
                                feat_vec[feat_out, guess] -= 1
                                feat_vec[feat_lab, guess] -= 1
                                feat_vec[feat_out, label] += 1
                                feat_vec[feat_lab, label] += 1
                            else:
                                feat_vec[feat_lab, guess] -= 2
                                feat_vec[feat_lab, label] += 2
                        else:
                            # Unigram features: the two updates cancel out
                            # when the predicted tag is right.
                            feat_vec[feat, guess] -= 1
                            feat_vec[feat, label] += 1
                else:
                    # First word: the previous label is the start marker.
                    label_pre = '_B-1'
                    for feat in feats:
                        key = feat + ":" + label_pre if feat[0] == 'B' else feat
                        feat_vec[key, guess] -= 1
                        feat_vec[key, label] += 1

        # NOTE(review): snapshot, 'model' refresh and scoring are all tied
        # to every third epoch here -- confirm that grouping is intended.
        if t % 3 == 0:
            perc.perc_write_to_file(feat_vec, 'model_' + str(t))
            perc.perc_write_to_file(feat_vec, 'model')
            # Fixed command string, no caller input; needs a shell for the
            # pipe.
            os.system('python perc.py -m model | python score-chunks.py')
    return feat_vec