예제 #1
0
def regularization_using_grid_search (alphas, counts, class_counts, allkeys, tr_outfile='nb.alpha.tr.txt', dv_outfile='nb.alpha.dv.txt'):
    """Sweep Naive Bayes smoothing values and score each on train and dev.

    For every alpha, trains NB weights via learnNBWeights and records the
    resulting accuracy on the training set and the dev set.

    :param alphas: iterable of smoothing values to try
    :param counts: feature counts used by learnNBWeights
    :param class_counts: per-class counts used by learnNBWeights
    :param allkeys: vocabulary/keys used by learnNBWeights
    :param tr_outfile: predictions file for the training-set evaluation
    :param dv_outfile: predictions file for the dev-set evaluation
    :returns: (weights-per-alpha dict, train accuracies, dev accuracies);
        the two accuracy lists are parallel to `alphas`
    """
    train_scores = []
    dev_scores = []
    weights_by_alpha = {}
    for smoothing in alphas:
        w = learnNBWeights(counts, class_counts, allkeys, smoothing)
        weights_by_alpha[smoothing] = w
        train_scores.append(scorer.accuracy(evalClassifier(w, tr_outfile, TRAINKEY)))
        dev_scores.append(scorer.accuracy(evalClassifier(w, dv_outfile, DEVKEY)))
    return weights_by_alpha, train_scores, dev_scores
예제 #2
0
def regularization_using_grid_search(alphas, netas, N_its,inst_generator, outfile, devkey, learning_rate=1e-4, regularizer=1e-2, tr_outfile='logreg.alpha.tr.txt', dv_outfile='logreg.alpha.dv.txt'):
    """Grid search over L2 strength (alpha) and learning rate (neta) for LR-SGD.

    For each (alpha, neta) pair, trains logistic-regression weights with
    trainLRbySGD and records train and dev accuracy.

    :param alphas: regularizer strengths to try
    :param netas: learning rates to try
    :param N_its: SGD iterations per setting (forwarded to trainLRbySGD)
    :param inst_generator: training instances (forwarded to trainLRbySGD)
    :param outfile: predictions file used during training-time evaluation
    :param devkey: dev key file (forwarded to trainLRbySGD)
    :param learning_rate: unused default; each neta overrides it
    :param regularizer: unused default; each alpha overrides it
    :param tr_outfile: predictions file for the training-set evaluation
    :param dv_outfile: predictions file for the dev-set evaluation
    :returns: (weights per (alpha, neta), train accuracies, dev accuracies);
        the accuracy lists are in alpha-major, neta-minor order
    """
    tr_accs = []
    dv_accs = []
    weights_log_reg_alphas = dict()
    for alpha in alphas:
        for neta in netas:
            # BUGFIX: trainLRbySGD returns (weights, tr_acc, dv_acc); the
            # original stored the whole tuple and passed it to evalClassifier
            # as if it were the weights dict.  Keep only the weights, matching
            # the Naive Bayes grid-search variant.
            weights, _, _ = trainLRbySGD(N_its, inst_generator, outfile, devkey, learning_rate=neta, regularizer=alpha)
            weights_log_reg_alphas[(alpha, neta)] = weights
            confusion = evalClassifier(weights, tr_outfile, TRAINKEY)
            tr_accs.append(scorer.accuracy(confusion))
            confusion = evalClassifier(weights, dv_outfile, DEVKEY)
            dv_accs.append(scorer.accuracy(confusion))
    return weights_log_reg_alphas, tr_accs, dv_accs
예제 #3
0
def test_wlc_dev_almost_there_accuracy ():
    """Word-list classifier dev accuracy should reach at least 0.40."""
    global weights_wlc
    global WLC_FILE
    threshold = 0.40
    observed = scorer.accuracy(evalClassifier(weights_wlc, WLC_FILE, DEVKEY))
    ok_(threshold <= observed, msg="UNEQUAL Expected:%f, Actual:%f" % (threshold, observed))
예제 #4
0
def test_mcc_dev_accuracy ():
    """MCC dev accuracy must equal 0.3756 to four decimal places."""
    global weights_mcc
    global MCC_FILE
    target = 0.3756
    conf_mat = evalClassifier(weights_mcc, MCC_FILE, DEVKEY)
    measured = scorer.accuracy(conf_mat)
    assert_almost_equals(target, measured, places=4, msg="UNEQUAL Expected:%f, Actual:%f" % (target, measured))
예제 #5
0
def test_wlc_dev_exact_accuracy():
    """WLC dev accuracy must equal 0.4467 to four decimal places."""
    global weights_wlc
    global WLC_FILE
    want = 0.4467
    got = scorer.accuracy(evalClassifier(weights_wlc, WLC_FILE, DEVKEY))
    assert_almost_equals(want, got, places=4, msg="UNEQUAL Expected:%f, Actual:%f" % (want, got))
예제 #6
0
def trainLRbySGD(N_its,inst_generator, outfile, devkey, learning_rate=1e-4, regularizer=1e-2):
    """Train logistic-regression weights by SGD with lazy L2 regularization.

    Python 2 code (xrange, print statement).  The gradient-update step is a
    placeholder ("your code ... goes here"), so as written this only decays
    weights and tracks error counts -- looks like a course skeleton.

    :param N_its: number of passes over the training data
    :param inst_generator: iterable of (instance, true_label) pairs; each
        instance is iterated as a collection of base features
    :param outfile: predictions file written by evalClassifier per iteration
    :param devkey: key file used to score dev accuracy per iteration
    :param learning_rate: SGD step size (also scales the decay rate)
    :param regularizer: L2 strength (also scales the decay rate)
    :returns: (weights, tr_acc, dv_acc) -- weight dict keyed by
        (label, base_feature), plus per-iteration train/dev accuracy lists
    """
    weights = defaultdict(float)
    dv_acc = [None]*N_its
    tr_acc = [None]*N_its

    # this block is all to take care of regularization
    # Combined per-step decay factor applied lazily below.
    ratereg = learning_rate * regularizer
    def regularize(base_feats,t):
        # Lazily apply the decay owed since each feature was last touched:
        # (1 - ratereg) ** (steps since last update).  Closes over `weights`
        # and over `last_update`, which is rebound each outer iteration.
        for base_feat in base_feats:
            for label in ALL_LABELS:
                weights[(label,base_feat)] *= (1 - ratereg) ** (t-last_update[base_feat])
            last_update[base_feat] = t

    for it in xrange(N_its):
        tr_err = 0
        last_update = defaultdict(int) # reset, since we regularize at the end of every iteration
        for i,(inst,true_label) in enumerate(inst_generator):
            # apply "just-in-time" regularization to the weights for features in this instance
            regularize(inst,i)
            # compute likelihood gradient from this instance
            probs = computeLabelProbs(inst,weights,ALL_LABELS)
            if true_label != argmax(probs): tr_err += 1
            # your code for updating the weights goes here

        # regularize all features at the end of each iteration
        # (duplicate base features in the list are harmless: after the first
        # pass last_update[feat] == i, so the extra exponent is 0)
        regularize([base_feature for label,base_feature in weights.keys()],i)
        
        dv_acc[it] = scorer.accuracy(evalClassifier(weights, outfile, devkey))
        # NOTE(review): `i` is the last enumerate index (len-1), so this
        # denominator is off by one (should presumably be i+1); it also raises
        # NameError/ZeroDivisionError if inst_generator yields <2 items --
        # confirm against the assignment spec.
        tr_acc[it] = 1. - tr_err/float(i)
        print it,'dev:',dv_acc[it],'train:',tr_acc[it]
    return weights,tr_acc,dv_acc
예제 #7
0
def test_wlc_dev_exact_accuracy ():
    """Check the WLC dev-set accuracy against 0.4467 (4 decimal places)."""
    global weights_wlc
    global WLC_FILE
    expected_acc = 0.4467
    confusion = evalClassifier(weights_wlc, WLC_FILE, DEVKEY)
    actual_acc = scorer.accuracy(confusion)
    assert_almost_equals(expected_acc, actual_acc, places=4, msg="UNEQUAL Expected:%f, Actual:%f" % (expected_acc, actual_acc))
예제 #8
0
def test_wlc_dev_exact_accuracy ():
    """WLC dev accuracy must lie within [0.4440, 0.4470]."""
    global weights_wlc
    global WLC_FILE
    lower, upper = 0.4440, 0.4470
    score = scorer.accuracy(evalClassifier(weights_wlc, WLC_FILE, DEVKEY))
    in_range = lower <= score <= upper
    ok_(in_range, msg="NOT IN RANGE Expected:%f,%f, Actual:%f" % (lower, upper, score))
예제 #9
0
def trainLRbyAdaGrad(N_its,inst_generator, outfile, devkey, learning_rate=1e-4, regularizer=1e-2):
    """Train logistic-regression weights with AdaGrad-style scaled SGD updates
    plus lazy L2 regularization.

    Python 2 code (xrange, print statement).  Per-feature step sizes are
    divided by `running_value`; see the NOTE(review) below about how that
    accumulator is maintained.

    :param N_its: number of passes over the training data
    :param inst_generator: sequence of (instance, true_label) pairs; must
        support len() and repeated iteration, so a list, not a generator
    :param outfile: predictions file written by evalClassifier per iteration
    :param devkey: key file used to score dev accuracy per iteration
    :param learning_rate: base step size (also scales the decay rate)
    :param regularizer: L2 strength (also scales the decay rate)
    :returns: (weights, tr_acc, dv_acc) -- weight dict keyed by
        (label, word), plus per-iteration train/dev accuracy lists
    """
    weights = defaultdict(float)
    dv_acc = [None]*N_its
    tr_acc = [None]*N_its

    # Per-(label, word) divisor for the adaptive step size.
    running_value = defaultdict(float)

    num_inst = len(inst_generator)
    # this block is all to take care of regularization
    ratereg = learning_rate * regularizer
    def regularize(base_feats, t):
        # Lazily apply the decay owed since each feature was last touched:
        # (1 - ratereg) ** (steps since last update).  Closes over `weights`
        # and over `last_update`, which is rebound each outer iteration.
        for base_feat in base_feats:
            for label in ALL_LABELS:
                weights[(label, base_feat)] *= (1 - ratereg) ** (t-last_update[base_feat])
            last_update[base_feat] = t

    for it in xrange(N_its):
        tr_err = 0
        last_update = defaultdict(int) # reset, since we regularize at the end of every iteration
        for i, (inst, true_label) in enumerate(inst_generator):
            # apply "just-in-time" regularization to the weights for features in this instance
            regularize(inst, i)
            # compute likelihood gradient from this instance
            probs = computeLabelProbs(inst, weights, ALL_LABELS)

            label_pred = argmax(probs)
            if true_label != label_pred:tr_err += 1

            # Likelihood-gradient step: push up the true label's weights,
            # push down every label proportionally to its probability.
            # Steps are scaled by num_inst and divided by the per-feature
            # running_value (defaulting to 1 when a feature is unseen).
            for word, value in inst.items():
                weights[(true_label, word)] += num_inst * learning_rate * value / running_value.get((true_label, word), 1)
                for label in ALL_LABELS:
                    weights[(label, word)] -= num_inst * probs[label] * learning_rate * value / running_value.get((label, word), 1)
                # NOTE(review): this OVERWRITES running_value with value**2 and
                # only for the true label's key; canonical AdaGrad accumulates
                # squared gradients (+=) per updated key and divides by the
                # square root -- confirm this deviation is intentional.
                running_value[(true_label, word)] = value**2

        # regularize all features at the end of each iteration
        # (duplicate base features in the list are harmless: after the first
        # pass last_update[feat] == i, so the extra exponent is 0)
        regularize([base_feature for label,base_feature in weights.keys()], i)

        dv_acc[it] = scorer.accuracy(evalClassifier(weights, outfile, devkey))
        # NOTE(review): `i` is the last enumerate index (len-1), so this
        # denominator is off by one (should presumably be i+1); it also raises
        # NameError/ZeroDivisionError if inst_generator has <2 items.
        tr_acc[it] = 1. - tr_err/float(i)
        print it,'dev:',dv_acc[it],'train:',tr_acc[it]
    return weights,tr_acc,dv_acc
예제 #10
0
def setup_nb_testing ():
    """Populate the module-level confusion matrix for the NB test suite."""
    global mat
    confusion = evalClassifier(weights_nb, NB_FILE, DEVKEY)
    mat = confusion
예제 #11
0
def setup_nb_testing():
    """Evaluate the NB weights on dev and cache the confusion matrix globally."""
    global mat
    mat = evalClassifier (weights_nb, NB_FILE, DEVKEY)