def regularization_using_grid_search (alphas, counts, class_counts, allkeys, tr_outfile='nb.alpha.tr.txt', dv_outfile='nb.alpha.dv.txt'):
    """Sweep Naive Bayes smoothing values, scoring each model on train and dev.

    For every alpha, trains NB weights and records training-set and dev-set
    accuracy (via the confusion matrix from evalClassifier).

    Returns a triple: ({alpha: weights}, [train accuracies], [dev accuracies]),
    with the accuracy lists in the same order as `alphas`.

    NOTE(review): a later definition in this file reuses the name
    `regularization_using_grid_search` and will shadow this one at import time.
    """
    weights_nb_alphas = dict()
    tr_accs = []
    dv_accs = []
    for smoothing in alphas:
        model = learnNBWeights(counts, class_counts, allkeys, smoothing)
        weights_nb_alphas[smoothing] = model
        # Score the same model against both splits.
        tr_accs.append(scorer.accuracy(evalClassifier(model, tr_outfile, TRAINKEY)))
        dv_accs.append(scorer.accuracy(evalClassifier(model, dv_outfile, DEVKEY)))
    return weights_nb_alphas, tr_accs, dv_accs
def regularization_using_grid_search(alphas, netas, N_its,inst_generator, outfile, devkey, learning_rate=1e-4, regularizer=1e-2, tr_outfile='logreg.alpha.tr.txt', dv_outfile='logreg.alpha.dv.txt'):
    """Grid-search logistic-regression hyperparameters (regularizer x step size).

    For every (alpha, neta) pair, trains by SGD and records training-set and
    dev-set accuracy.

    Returns a triple: ({(alpha, neta): weights}, [train accuracies],
    [dev accuracies]), accuracy lists ordered by the nested alpha/neta loops.

    Note: the `learning_rate` and `regularizer` keyword defaults are unused --
    they are overridden by the grid values on every call to trainLRbySGD.
    """
    tr_accs = []
    dv_accs = []
    weights_log_reg_alphas = dict()
    for alpha in alphas:
        for neta in netas:
            # BUG FIX: trainLRbySGD returns (weights, tr_acc, dv_acc); the
            # original stored the whole 3-tuple and then passed that tuple to
            # evalClassifier, which (as everywhere else in this file) takes a
            # weights mapping. Keep only the weights.
            weights, _, _ = trainLRbySGD(N_its,inst_generator, outfile, devkey, learning_rate=neta, regularizer=alpha)
            weights_log_reg_alphas[(alpha, neta)] = weights
            confusion = evalClassifier(weights_log_reg_alphas[(alpha, neta)],tr_outfile,TRAINKEY)
            tr_accs.append(scorer.accuracy(confusion))
            confusion = evalClassifier(weights_log_reg_alphas[(alpha, neta)],dv_outfile,DEVKEY)
            dv_accs.append(scorer.accuracy(confusion))
    return weights_log_reg_alphas, tr_accs, dv_accs
def test_wlc_dev_almost_there_accuracy ():
    """Dev-set accuracy of the WLC classifier must reach at least 0.40."""
    global weights_wlc
    global WLC_FILE
    confusion = evalClassifier (weights_wlc, WLC_FILE, DEVKEY)
    achieved = scorer.accuracy(confusion)
    floor = 0.40
    ok_ (floor <= achieved, msg="UNEQUAL Expected:%f, Actual:%f" %(floor,achieved))
def test_mcc_dev_accuracy ():
    """Dev-set accuracy of the most-common-class baseline must equal 0.3756 (4 places)."""
    global weights_mcc
    global MCC_FILE
    confusion = evalClassifier (weights_mcc, MCC_FILE, DEVKEY)
    achieved = scorer.accuracy(confusion)
    target = 0.3756
    assert_almost_equals (target, achieved, places=4, msg="UNEQUAL Expected:%f, Actual:%f" %(target,achieved))
def test_wlc_dev_exact_accuracy():
    """Dev-set accuracy of the WLC classifier must equal 0.4467 (4 places)."""
    global weights_wlc
    global WLC_FILE
    confusion = evalClassifier(weights_wlc, WLC_FILE, DEVKEY)
    achieved = scorer.accuracy(confusion)
    target = 0.4467
    assert_almost_equals(target, achieved, places=4, msg="UNEQUAL Expected:%f, Actual:%f" % (target, achieved))
def trainLRbySGD(N_its,inst_generator, outfile, devkey, learning_rate=1e-4, regularizer=1e-2): weights = defaultdict(float) dv_acc = [None]*N_its tr_acc = [None]*N_its # this block is all to take care of regularization ratereg = learning_rate * regularizer def regularize(base_feats,t): for base_feat in base_feats: for label in ALL_LABELS: weights[(label,base_feat)] *= (1 - ratereg) ** (t-last_update[base_feat]) last_update[base_feat] = t for it in xrange(N_its): tr_err = 0 last_update = defaultdict(int) # reset, since we regularize at the end of every iteration for i,(inst,true_label) in enumerate(inst_generator): # apply "just-in-time" regularization to the weights for features in this instance regularize(inst,i) # compute likelihood gradient from this instance probs = computeLabelProbs(inst,weights,ALL_LABELS) if true_label != argmax(probs): tr_err += 1 # your code for updating the weights goes here # regularize all features at the end of each iteration regularize([base_feature for label,base_feature in weights.keys()],i) dv_acc[it] = scorer.accuracy(evalClassifier(weights, outfile, devkey)) tr_acc[it] = 1. - tr_err/float(i) print it,'dev:',dv_acc[it],'train:',tr_acc[it] return weights,tr_acc,dv_acc
def test_wlc_dev_exact_accuracy ():
    """Dev-set accuracy of the WLC classifier must equal 0.4467 (4 places)."""
    global weights_wlc
    global WLC_FILE
    confusion = evalClassifier (weights_wlc, WLC_FILE, DEVKEY)
    achieved = scorer.accuracy(confusion)
    target = 0.4467
    assert_almost_equals (target, achieved, places=4, msg="UNEQUAL Expected:%f, Actual:%f" %(target,achieved))
def test_wlc_dev_exact_accuracy ():
    """Dev-set accuracy of the WLC classifier must fall in [0.4440, 0.4470]."""
    global weights_wlc
    global WLC_FILE
    confusion = evalClassifier (weights_wlc, WLC_FILE, DEVKEY)
    achieved = scorer.accuracy(confusion)
    lower_bound = 0.4440
    upper_bound = 0.4470
    within = lower_bound <= achieved <= upper_bound
    ok_(within, msg="NOT IN RANGE Expected:%f,%f, Actual:%f" %(lower_bound,upper_bound,achieved))
def trainLRbyAdaGrad(N_its,inst_generator, outfile, devkey, learning_rate=1e-4, regularizer=1e-2): weights = defaultdict(float) dv_acc = [None]*N_its tr_acc = [None]*N_its running_value = defaultdict(float) num_inst = len(inst_generator) # this block is all to take care of regularization ratereg = learning_rate * regularizer def regularize(base_feats, t): for base_feat in base_feats: for label in ALL_LABELS: weights[(label, base_feat)] *= (1 - ratereg) ** (t-last_update[base_feat]) last_update[base_feat] = t for it in xrange(N_its): tr_err = 0 last_update = defaultdict(int) # reset, since we regularize at the end of every iteration for i, (inst, true_label) in enumerate(inst_generator): # apply "just-in-time" regularization to the weights for features in this instance regularize(inst, i) # compute likelihood gradient from this instance probs = computeLabelProbs(inst, weights, ALL_LABELS) label_pred = argmax(probs) if true_label != label_pred:tr_err += 1 for word, value in inst.items(): weights[(true_label, word)] += num_inst * learning_rate * value / running_value.get((true_label, word), 1) for label in ALL_LABELS: weights[(label, word)] -= num_inst * probs[label] * learning_rate * value / running_value.get((label, word), 1) running_value[(true_label, word)] = value**2 # regularize all features at the end of each iteration regularize([base_feature for label,base_feature in weights.keys()], i) dv_acc[it] = scorer.accuracy(evalClassifier(weights, outfile, devkey)) tr_acc[it] = 1. - tr_err/float(i) print it,'dev:',dv_acc[it],'train:',tr_acc[it] return weights,tr_acc,dv_acc
def setup_nb_testing ():
    """Populate the module-level confusion matrix `mat` for the NB test cases."""
    # Shared fixture: evaluates the (module-level) NB weights on the dev split
    # once, so the individual tests can read `mat` instead of re-running
    # evaluation. Presumably wired up as a nose setup hook -- TODO confirm.
    global mat
    mat = evalClassifier (weights_nb, NB_FILE, DEVKEY)
def setup_nb_testing():
    """Populate the module-level confusion matrix `mat` for the NB test cases."""
    # Shared fixture: evaluates the (module-level) NB weights on the dev split
    # once, so the individual tests can read `mat` instead of re-running
    # evaluation. NOTE(review): duplicates an earlier definition of the same
    # name; only the later one takes effect at import time.
    global mat
    mat = evalClassifier(weights_nb, NB_FILE, DEVKEY)