Example #1
def run_hmm(data_file_base, num_support, num_pools, num_iterations, train=True):
	# run cross-validation
	run_hmm_cross_val.do_crossval(data_file_base, num_support, num_iterations=num_iterations, num_pools=num_pools)

	# if train is True, actually build the model
	if train:
		run_train_hmm.train_model(data_file_base, num_support, num_pools=num_pools, num_iterations=num_iterations)

	header = "lead,auc"

	# build the test results file name
	test_results_file = "results/hmm_" + data_file_base + "_support_%s_test.csv" % (num_support)

	test_data = None

	pool = Pool(num_pools)
	rocs = pool.map(execute_hmm, ["___".join([data_file_base, str(num_support), str(lead)]) for lead in range(1,14)])
	for idx, roc in enumerate(rocs):
		lead = idx + 1
		if roc is not None:
			test_data = utils.add_to_data(test_data, [lead, roc])

	np.savetxt(test_results_file, np.atleast_2d(test_data), fmt="%s", delimiter=",", header=header, comments='')
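
run_hmm fans each lead out to execute_hmm through pool.map, which passes exactly one argument per task; that is why the arguments are packed into a single "___"-joined string. The helper itself is not shown in this snippet, so the following is only a minimal sketch of what it plausibly does (run_test_hmm and its test_model signature are assumptions, not the repo's actual API):

def execute_hmm(packed_args):
	# unpack the "___"-joined arguments built in run_hmm
	data_file_base, num_support, lead = packed_args.split("___")
	try:
		# assumed helper, named by analogy with run_train_hmm /
		# run_hmm_cross_val: evaluate the trained HMM for this lead
		# and return its ROC AUC
		return run_test_hmm.test_model(data_file_base, int(num_support), int(lead))
	except Exception:
		return None  # run_hmm skips leads that produced no result
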
def run_log_reg_hmm(data_file_base,
                    num_support,
                    num_pools,
                    num_iterations,
                    lead,
                    lag,
                    train=False,
                    do_parallel=True):
    start_time = time.time()
    num_weeks = 15

    data_prefix = "data/"
    data_suffix = ".csv"
    models_prefix = "models/"
    models_suffix = "_support_%s_logreg" % (num_support)
    data_file_train_input = data_prefix + data_file_base + "_train" + data_suffix
    data_file_train_hmm = data_prefix + data_file_base + "_train_logreg" + data_suffix
    data_file_test = data_prefix + data_file_base + "_test" + data_suffix
    models_dir = models_prefix + data_file_base + models_suffix

    train_data = np.genfromtxt(data_file_train_input,
                               delimiter=';',
                               skip_header=0)
    test_data = np.genfromtxt(data_file_test, delimiter=";", skip_header=0)

    # split the training data: the first half trains the HMM, the second the logreg
    num_students = len(train_data) // num_weeks
    num_students_train_hmm = num_students // 2
    train_hmm_data = train_data[:num_students_train_hmm * num_weeks]
    train_logreg_data = train_data[num_students_train_hmm * num_weeks:]

    # write the HMM training half to disk for the trainer
    np.savetxt(data_file_train_hmm, train_hmm_data, fmt="%d", delimiter=";")

    if not os.path.exists(models_dir) or train:
        run_train_hmm.train_model(data_file_base,
                                  num_support,
                                  num_pools=num_pools,
                                  num_iterations=num_iterations,
                                  logreg=True,
                                  do_parallel=do_parallel)

    assert os.path.exists(
        models_dir), "There is no trained model in directory %s" % (models_dir)

    def get_log_reg_features(data):
        dropout_value = 0  # bin value indicating the student has dropped out
        command_base = ["./HMM_EM", "PredictStateDistribution", models_dir]

        logreg_X = None
        logreg_Y = []
        for student in range(len(data) // num_weeks):
            stud_data = data[student * num_weeks:(student + 1) * num_weeks]

            end_week = lag - 1
            label_week = lead + end_week
            X = stud_data[0:end_week + 1, :].flatten()
            truth_val = stud_data[label_week, 0]

            if stud_data[end_week, 0] == dropout_value:
                continue  #student has already dropped out

            features = np.array([])
            for prediction_week in range(end_week + 1):
                # get hidden state distribution for each prediction_week
                # the binary takes the week to predict followed by the
                # observed sequence
                command = (command_base + [str(prediction_week)] +
                           X.astype(str).tolist())
                results = subprocess.check_output(command)
                state_dist = np.fromstring(results, sep=";")[1:-1]
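                # drop the last state's probability before using the
                # distribution as features: it is determined by the others,
                # since the distribution sums to 1 (assumed rationale)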
                prediction_week_features = state_dist[:-1]
                features = np.concatenate(
                    [features,
                     np.atleast_1d(prediction_week_features)])
            logreg_X = utils.add_to_data(logreg_X, features)
            logreg_Y += [truth_val]
        return logreg_X, logreg_Y

    # do inference on hmm to get features for logreg
    X_train, Y_train = get_log_reg_features(train_logreg_data)
    print "got train log_reg features for lead %s lag %s cohort %s support %s" % (
        lead, lag, data_file_base,
        num_support), time.time() - start_time, "seconds"

    # do inference on the test set to get logreg features
    start_time = time.time()
    X_test, Y_test = get_log_reg_features(test_data)
    print "got test log_reg features for lead %s lag %s cohort %s support %s" % (
        lead, lag, data_file_base,
        num_support), time.time() - start_time, "seconds"

    return logistic_regression.run_regression(X_train, Y_train, X_test, Y_test,
                                              lead, lag)
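
Both run_hmm and run_log_reg_hmm grow their result matrices with utils.add_to_data, starting from None and appending one row per call. A plausible sketch of that helper, assuming it is a thin wrapper over numpy row stacking (the real implementation in utils may differ):

import numpy as np

def add_to_data(data, row):
    # start a new 2-D array on the first row, otherwise stack the row on
    row = np.atleast_2d(row)
    if data is None:
        return row
    return np.vstack([data, row])
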
Example #3
def run_experiments(data_file_base, num_support, num_pools, num_iterations):
    header = "lead,lag,support,auc"
    features_base = "features_"
    cohort = data_file_base[len(features_base):-len("_bin_5")]

    start_time = time.time()
    results_base = ("results/logistic_reg_hmm_" + features_base + cohort +
                    "_bin_5_support_%s" % num_support)
    train_results_file = results_base + "_train.csv"
    test_results_file = results_base + "_test.csv"
    crossval_results_file = results_base + "_crossval.csv"

    train_data = None
    test_data = None
    crossval_data = None
    data_file_base = features_base + cohort + "_bin_5"
    run_train_hmm.train_model(data_file_base,
                              num_support,
                              num_pools=num_pools,
                              num_iterations=num_iterations,
                              logreg=True,
                              do_parallel=True)
    pool = Pool(num_pools)
    args_list = []
    # one task per (lead, lag) pair, packed into a single string for pool.map
    for lead in range(1, 14):
        for lag in range(1, 15 - lead):
            args_list += [
                "___".join([
                    features_base, cohort,
                    str(num_support),
                    str(lead),
                    str(lag),
                    str(num_pools),
                    str(num_iterations)
                ])
            ]
    lead_lag_train_test_crossvals = pool.map(execute_log_reg_hmm, args_list)
    for lead_lag_train_test_crossval in lead_lag_train_test_crossvals:
        if lead_lag_train_test_crossval:
            lead, lag, train_auc, test_auc, crossval_auc = lead_lag_train_test_crossval.split(
                "___")
            if train_auc:
                train_data = utils.add_to_data(
                    train_data,
                    [int(lead),
                     int(lag), num_support,
                     float(train_auc)])
            if test_auc:
                test_data = utils.add_to_data(
                    test_data,
                    [int(lead),
                     int(lag), num_support,
                     float(test_auc)])
            if crossval_auc:
                crossval_data = utils.add_to_data(
                    crossval_data,
                    [int(lead),
                     int(lag), num_support,
                     float(crossval_auc)])

    print "Ran logistic regression for %s support %s in %s seconds" % (
        cohort, num_support, time.time() - start_time)
    start_time = time.time()
    np.savetxt(train_results_file,
               np.atleast_2d(train_data),
               fmt="%s",
               delimiter=",",
               header=header,
               comments='')
    np.savetxt(test_results_file,
               np.atleast_2d(test_data),
               fmt="%s",
               delimiter=",",
               header=header,
               comments='')
    np.savetxt(crossval_results_file,
               np.atleast_2d(crossval_data),
               fmt="%s",
               delimiter=",",
               header=header,
               comments='')
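
As with execute_hmm, the execute_log_reg_hmm worker has to unpack the "___"-joined task string, call run_log_reg_hmm, and pack the AUCs back into a single string, since pool.map moves one string in and one out per task. A hedged sketch under those assumptions (the real wrapper, and the exact return shape of logistic_regression.run_regression, are not shown here):

def execute_log_reg_hmm(packed_args):
    # unpack the arguments packed in run_experiments
    (features_base, cohort, num_support, lead, lag, num_pools,
     num_iterations) = packed_args.split("___")
    data_file_base = features_base + cohort + "_bin_5"
    try:
        # assumed: run_log_reg_hmm returns (train_auc, test_auc, crossval_auc)
        train_auc, test_auc, crossval_auc = run_log_reg_hmm(
            data_file_base, int(num_support), int(num_pools),
            int(num_iterations), int(lead), int(lag))
        return "___".join(
            [lead, lag, str(train_auc), str(test_auc), str(crossval_auc)])
    except Exception:
        return None  # run_experiments skips failed lead/lag pairs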