def perc_train(train_data, tagset, numepochs):
    """Train perceptron weights for the tagger and return them.

    On every pass over ``train_data`` each sentence is decoded with
    ``perc.perc_test`` under the current weights.  For every position whose
    predicted tag differs from the reference tag, the weights of that word's
    features (and of the previous-tag feature) are decremented for the
    predicted tag and incremented for the reference tag.

    Args:
        train_data: sequence of sentences.  Element ``[0]`` of a sentence is
            the list of ``"word postag chunktag"`` label strings, element
            ``[1]`` is the feature list handed to ``perc.feats_for_word``.
        tagset: list of output tags; its first entry is used as the default
            tag when decoding.
        numepochs: number of passes over ``train_data``.

    Returns:
        A ``defaultdict(int)`` mapping ``(feature, tag)`` pairs to weights.

    Raises:
        ValueError: if a label string does not consist of exactly three
            space-separated fields.
    """
    feat_vec = defaultdict(int)

    for _ in range(numepochs):
        for sentence_data in train_data:
            label_list = sentence_data[0]
            feat_list = sentence_data[1]

            truetags = []
            for label in label_list:
                # Only the chunk tag is needed; the three-way unpack still
                # rejects malformed labels.
                (_word, _postag, chunktag) = label.split(" ")
                truetags.append(chunktag)

            # Decode with the tagset supplied by the caller instead of
            # re-reading the tagset file (through a global) for every
            # sentence of every epoch.
            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, tagset[0])

            feat_index = 0
            for i, tru in enumerate(truetags):
                # Must run for every word, correct or not, because it
                # advances the cursor into feat_list.
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                argmax = argmaxtags[i]
                if argmax == tru:
                    continue

                for f in feats_for_this_word:
                    feat_vec[f, argmax] -= 1
                    feat_vec[f, tru] += 1

                # Previous-tag feature; "B_-1" stands in for the tag before
                # the first word.
                # NOTE(review): this weight is only adjusted when the tag at
                # this position is wrong; a position whose tag is right but
                # whose predecessor was mispredicted is left untouched --
                # confirm this is intended.
                if i == 0:
                    argmaxprev = "B:B_-1"
                    truprev = "B:B_-1"
                else:
                    argmaxprev = "B:" + argmaxtags[i - 1]
                    truprev = "B:" + truetags[i - 1]
                feat_vec[argmaxprev, argmax] -= 1
                feat_vec[truprev, tru] += 1

    return feat_vec
help= "precomputed features for the input data, i.e. the values of \phi(x,_) without y" ) optparser.add_option( "-e", "--numepochs", dest="numepochs", default=int(10), help= "number of epochs of training; in each epoch we iterate over over all the training examples" ) optparser.add_option("-m", "--modelfile", dest="modelfile", default=os.path.join("data", "default.model"), help="weights for all features stored on disk") (opts, _) = optparser.parse_args() # each element in the feat_vec dictionary is: # key=feature_id value=weight feat_vec = {} tagset = [] train_data = [] tagset = perc.read_tagset(opts.tagsetfile) print >> sys.stderr, "reading data ..." train_data = perc.read_labeled_data(opts.trainfile, opts.featfile) print >> sys.stderr, "done." feat_vec = perc_train(train_data, tagset, int(opts.numepochs)) perc.perc_write_to_file(feat_vec, opts.modelfile)
def _lazy_update(feat_vec, totals, last_step, key, delta, step):
    """Change one weight by ``delta`` while keeping its running total exact.

    Before the weight changes, ``totals[key]`` is credited with the old
    weight for every step since the key was last touched.
    """
    totals[key] += feat_vec[key] * (step - last_step.get(key, 0))
    feat_vec[key] += delta
    last_step[key] = step


def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron tagger and return the averaged weights.

    Each sentence is decoded with ``perc.perc_test`` under the current
    weights.  For every position whose predicted tag differs from the
    reference tag, the weights of that word's features (and of the
    previous-tag feature) are decremented for the predicted tag and
    incremented for the reference tag.

    The returned value for a key is the mean of that key's weight taken
    after each processed sentence.  The sum is maintained lazily: a key's
    total is only brought up to date when its weight changes, and all keys
    are flushed once more when training ends.

    Args:
        train_data: sequence of sentences.  Element ``[0]`` of a sentence is
            the list of ``"word postag chunktag"`` label strings, element
            ``[1]`` is the feature list handed to ``perc.feats_for_word``.
        tagset: list of output tags; its first entry is used as the default
            tag when decoding.
        numepochs: number of passes over ``train_data``.

    Returns:
        A ``defaultdict(float)`` mapping ``(feature, tag)`` pairs to their
        averaged weights.  Empty when no sentence was processed.

    Raises:
        ValueError: if a label string does not consist of exactly three
            space-separated fields.
    """
    feat_vec = defaultdict(int)
    cumulative_feat_vec = defaultdict(float)
    # key -> number of sentences completed when the key's total was last
    # brought up to date.
    last_step = {}
    count = 0  # sentences processed so far, across all epochs

    for epoch in range(numepochs):
        print(epoch)
        sen = 0
        for sentence_data in train_data:
            label_list = sentence_data[0]
            feat_list = sentence_data[1]

            truetags = []
            for label in label_list:
                # Only the chunk tag is needed; the three-way unpack still
                # rejects malformed labels.
                (_word, _postag, chunktag) = label.split(" ")
                truetags.append(chunktag)

            # Decode with the tagset supplied by the caller instead of
            # re-reading the tagset file (through a global) per sentence.
            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, tagset[0])

            feat_index = 0
            for i, tru in enumerate(truetags):
                # Must run for every word, correct or not, because it
                # advances the cursor into feat_list.
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                argmax = argmaxtags[i]
                if argmax == tru:
                    continue

                for f in feats_for_this_word:
                    _lazy_update(feat_vec, cumulative_feat_vec, last_step,
                                 (f, argmax), -1, count)
                    _lazy_update(feat_vec, cumulative_feat_vec, last_step,
                                 (f, tru), 1, count)

                # Previous-tag feature; "B_-1" stands in for the tag before
                # the first word.
                # NOTE(review): this weight is only adjusted when the tag at
                # this position is wrong; a position whose tag is right but
                # whose predecessor was mispredicted is left untouched --
                # confirm this is intended.
                if i == 0:
                    argmaxprev = "B:B_-1"
                    truprev = "B:B_-1"
                else:
                    argmaxprev = "B:" + argmaxtags[i - 1]
                    truprev = "B:" + truetags[i - 1]
                _lazy_update(feat_vec, cumulative_feat_vec, last_step,
                             (argmaxprev, argmax), -1, count)
                _lazy_update(feat_vec, cumulative_feat_vec, last_step,
                             (truprev, tru), 1, count)

            count += 1
            if sen % 1000 == 0:
                print(str(sen) + "/" + str(len(train_data)))
            sen += 1

    # Final flush: credit every key with the weight it has carried since its
    # last change.  Without this, features that stopped changing early would
    # be under-weighted in the average.
    for key, step in last_step.items():
        cumulative_feat_vec[key] += feat_vec[key] * (count - step)

    if count:
        for key in cumulative_feat_vec:
            cumulative_feat_vec[key] = cumulative_feat_vec[key] / float(count)

    return cumulative_feat_vec
from collections import defaultdict


def perc_train(train_data, tagset, numepochs):
    """Template training entry point; currently returns empty weights.

    Args:
        train_data: labeled training sentences (unused by this stub).
        tagset: list of output tags (unused by this stub).
        numepochs: number of training passes (unused by this stub).

    Returns:
        An empty ``defaultdict(int)``.
    """
    feat_vec = defaultdict(int)
    # insert your code here
    # please limit the number of iterations of training to n iterations
    return feat_vec


if __name__ == '__main__':
    # Raw strings keep the literal "\phi" in the help text without relying
    # on an invalid escape sequence.
    optparser = optparse.OptionParser()
    optparser.add_option(
        "-t", "--tagsetfile", dest="tagsetfile",
        default=os.path.join("data", "tagset.txt"),
        help=r"tagset that contains all the labels produced in the output, i.e. the y in \phi(x,y)")
    optparser.add_option(
        "-i", "--trainfile", dest="trainfile",
        default=os.path.join("data", "train.txt.gz"),
        help=r"input data, i.e. the x in \phi(x,y)")
    optparser.add_option(
        "-f", "--featfile", dest="featfile",
        default=os.path.join("data", "train.feats.gz"),
        help=r"precomputed features for the input data, i.e. the values of \phi(x,_) without y")
    optparser.add_option(
        "-e", "--numepochs", dest="numepochs", default=10,
        help="number of epochs of training; in each epoch we iterate over over all the training examples")
    optparser.add_option(
        "-m", "--modelfile", dest="modelfile",
        default=os.path.join("data", "default.model"),
        help="weights for all features stored on disk")
    (opts, _) = optparser.parse_args()

    tagset = perc.read_tagset(opts.tagsetfile)
    # sys.stderr.write emits the same text as the old print statement and
    # works on both Python 2 and Python 3.
    sys.stderr.write("reading data ...\n")
    train_data = perc.read_labeled_data(opts.trainfile, opts.featfile)
    sys.stderr.write("done.\n")

    # each element in the feat_vec dictionary is:
    # key=feature_id value=weight
    feat_vec = perc_train(train_data, tagset, int(opts.numepochs))
    perc.perc_write_to_file(feat_vec, opts.modelfile)
help='number of layers') argparser.add_argument('--pos-dim', type=int, default=64, help='POS tag embedding dimension') argparser.add_argument('-r', '--resume', help='resume training from saved model') argparser.add_argument('--prototype', default=False, action='store_true', help='prototyping mode') args = argparser.parse_args() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") tagset = perc.read_tagset(args.tagsetfile) print("reading data ...", file=sys.stderr) test_data = perc.read_labeled_data(args.inputfile, args.featfile, verbose=False) print("done.", file=sys.stderr) if args.prototype: test_data = test_data[0:8] print('loading model...', file=sys.stderr) model_data = load_model(args.modelfile) word_idx = model_data['word_index'] speech_tag_idx = model_data['speech_tag_index'] tag2idx = model_data['tag_index']