# Example #1
#data!
# --- Simple (baseline) model -------------------------------------------------
print("starting simple model data collection")
# Ask for the dataframe as well so the later model variants can reuse it.
data, df = data_helper.convert_data(
    "as.csv", skill_name, return_df=True)
check_data.check_data(data)
print("creating simple model")
results["Simple Model"] = crossvalidate.crossvalidate(
    data, folds=folds, seed=seed)

# --- Majority-class baseline -------------------------------------------------
print("starting majority class calculation")
obs = data["data"][0]
n_obs = len(obs)
# NOTE(review): `sum(obs) - n_obs` appears to count the positive class
# (responses look 1/2-coded) — confirm against data_helper.
positives = np.sum(obs) - n_obs
majority = 1 if positives > n_obs - positives else 0
# Predict the majority label for every observation (floats, matching the
# original zeros-then-fill construction).
pred_values = [float(majority)] * n_obs
true_values = obs.tolist()
results["Majority Class"] = (
    accuracy.compute_acc(true_values, pred_values),
    rmse.compute_rmse(true_values, pred_values),
    auc.compute_auc(true_values, pred_values),
)


# --- Item-learning-effect (multilearn) variant -------------------------------
print("starting item_learning_effect data collection")
# Reuses the dataframe loaded for the simple model above.
data_multilearn = data_helper.convert_data(
    df, skill_name, multilearn=True)
check_data.check_data(data_multilearn)
print("creating item_learning_effect model")
results["Multilearn"] = crossvalidate.crossvalidate(
    data_multilearn, folds=folds, seed=seed)

# --- KT-IDEM (multiguess) variant --------------------------------------------
print("starting kt_idem data collection")
data_multiguess = data_helper.convert_data(
    df, skill_name, multiguess=True)
check_data.check_data(data_multiguess)
print("creating kt_idem model")
results["Multiguess"] = crossvalidate.crossvalidate(
    data_multiguess, folds=folds, seed=seed)

print("starting item_order_effect data collection")
# Example #2
def crossvalidate(data, folds=5, verbose=False, seed=0, return_arrays=False):
    """Run k-fold crossvalidation of a BKT model over students.

    Students are identified by the entries of ``data["starts"]``; folds are
    drawn from a seeded random permutation so results are reproducible.

    Args:
        data: model-ready data dict; must contain "data" and "starts",
            and may contain "resource_names" and "gs_names".
        folds: number of crossvalidation folds.
        verbose: if True, print the learned parameters for every fold and
            the final pooled metrics.
        seed: seed for the fold permutation.
        return_arrays: if True, return the pooled (true, predicted) lists
            instead of the aggregate metrics.

    Returns:
        ``(accuracy, rmse, auc)`` computed over all folds' pooled
        predictions, or ``(all_true, all_pred)`` when ``return_arrays``.
    """
    # Number of learn and guess/slip classes; default to a single class
    # when the data dict carries no per-resource / per-item names.
    if "resource_names" in data:
        num_learns = len(data["resource_names"])
    else:
        num_learns = 1

    if "gs_names" in data:
        num_gs = len(data["gs_names"])
    else:
        num_gs = 1

    # Single source of truth for the number of random EM restarts per fold
    # (the original code set 20 here and then overwrote it with 5 inside
    # the loop on every iteration).
    num_fit_initializations = 5
    split_size = (len(data["starts"]) // folds)
    # Create a random permutation to act as fold indices for crossvalidation.
    shuffle = np.random.RandomState(seed=seed).permutation(len(data["starts"]))
    all_true, all_pred = [], []

    # Crossvalidation on students, which are identified by the starts array.
    for iteration in range(folds):
        # Train = everything outside this fold's slice; test = the slice.
        # Students left over by integer division always land in train.
        train = np.concatenate(
            (shuffle[0:iteration * split_size],
             shuffle[(iteration + 1) * split_size:len(data["starts"])]))
        test = shuffle[iteration * split_size:(iteration + 1) * split_size]
        training_data = fix_data(data, train)

        # Fit EM from several random initializations; keep the model with
        # the best final log-likelihood.
        best_likelihood = float("-inf")
        best_model = None
        for _ in range(num_fit_initializations):
            fitmodel = random_model_uni.random_model_uni(
                num_learns, num_gs
            )  # randomly set initial param values
            (fitmodel,
             log_likelihoods) = EM_fit.EM_fit(fitmodel, training_data)
            if log_likelihoods[-1] > best_likelihood:
                best_likelihood = log_likelihoods[-1]
                best_model = fitmodel

        if verbose:
            print(" ")
            print('Iteration %d' % (iteration))
            print('\tlearned')
            print('prior\t%.4f' % (best_model["pi_0"][1][0]))
            for r in range(num_learns):
                print('learn%d\t%.4f' %
                      (r + 1, best_model['As'][r, 1, 0].squeeze()))
            for r in range(num_learns):
                print('forget%d\t%.4f' %
                      (r + 1, best_model['As'][r, 0, 1].squeeze()))

            for s in range(num_gs):
                print('guess%d\t%.4f' % (s + 1, best_model['guesses'][s]))
            for s in range(num_gs):
                print('slip%d\t%.4f' % (s + 1, best_model['slips'][s]))

        test_data = fix_data(data, test)

        # Run model predictions from the training fit on the held-out fold.
        (correct_predictions,
         state_predictions) = predict_onestep.run(best_model, test_data)

        # Collapse the (possibly multi-row) observation matrix into one flat
        # vector of true labels: the nonzero entry of each column wins.
        flat_true_values = np.zeros((len(test_data["data"][0]), ),
                                    dtype=np.intc)
        for i in range(len(test_data["data"])):
            for j in range(len(test_data["data"][0])):
                if test_data["data"][i][j] != 0:
                    flat_true_values[j] = test_data["data"][i][j]

        # Pool across folds so metrics are computed once, over everything.
        all_true.extend(flat_true_values.tolist())
        all_pred.extend(correct_predictions)

    if return_arrays:
        return (all_true, all_pred)

    total = rmse.compute_rmse(all_true, all_pred)
    acc = accuracy.compute_acc(all_true, all_pred)
    area_under_curve = auc.compute_auc(all_true, all_pred)
    if verbose:
        print("Average RMSE: ", total)
        print("Average Accuracy: ", acc)
        print("Average AUC: ", area_under_curve)
    return (acc, total, area_under_curve)
# Example #3
        # NOTE(review): this span is a fragment — the enclosing per-skill loop
        # and function/script header are outside this excerpt, so `skill`,
        # `best_model`, `test_data`, `all_true`, `all_pred` and `skill_count`
        # are all defined above the visible region.
        #for r in range(1):
        #    print('learn%d\t%.4f' % (r+1, best_model['As'][r, 1, 0].squeeze()))
        #for r in range(1):
        #    print('forget%d\t%.4f' % (r+1, best_model['As'][r, 0, 1].squeeze()))

        #for s in range(1):
        #    print('guess%d\t%.4f' % (s+1, best_model['guesses'][s]))
        #for s in range(1):
        #    print('slip%d\t%.4f' % (s+1, best_model['slips'][s]))

        # Only score skills that actually have held-out observations.
        if len(test_data[skill]["resources"]) > 0:
            (correct_predictions,
             state_predictions) = predict_onestep.run(best_model,
                                                      test_data[skill])
            # AUC needs at least two distinct classes; report 0 otherwise.
            if len(
                    np.unique(test_data[skill]["data"])
            ) > 1:  #auc for single skill only calculated when there are 2+ classifiers
                curr_auc = auc.compute_auc(test_data[skill]["data"][0],
                                           correct_predictions)
            else:
                curr_auc = 0

            # Pool per-skill truths/predictions for the overall AUC below.
            all_true.extend(test_data[skill]["data"][0])
            all_pred.extend(correct_predictions)
            print("Skill %s of %s calculation completed with AUC of %.4f" %
                  (skill, skill_count, curr_auc))
        else:
            print("No test data for skill %s" % skill)
# Overall AUC over all skills' pooled predictions.
total_auc = auc.compute_auc(all_true, all_pred)
print("Overall AUC:", total_auc)
skill_count = 124  # hardcoded for the nips data set

# Load train and test data in a single pass.
Data = nips_data_helper.convert_data(
    "builder_train.csv", url2="builder_test.csv")

print("Data preprocessing finished")

# Sanity-check every skill's data block before fitting anything.
for skill_idx in range(skill_count):
    check_data.check_data(Data[skill_idx])

print("All data okay")

# Crossvalidate each skill separately and pool truths/predictions so the
# final AUC is computed once over everything.
all_true = []
all_pred = []
for skill in range(skill_count):
    # Guard clause: skip skills with too few observations to crossvalidate.
    if len(Data[skill]["resources"]) < 5:
        print("Not enough data for skill %s" % skill)
        continue

    fold_true, fold_pred = crossvalidate.crossvalidate(
        Data[skill], verbose=False, return_arrays=True)
    print("Skill %s of %s calculation completed" % (skill, skill_count - 1))
    all_true.extend(fold_true)
    all_pred.extend(fold_pred)

total_auc = auc.compute_auc(all_true, all_pred)
print("Overall AUC:", total_auc)