def run_hmm(data_file_base, num_support, num_pools, num_iterations, train=True):
    """Cross-validate (and optionally train) an HMM, then evaluate one model per
    lead time (1-13) in parallel and write the per-lead AUCs to a results CSV.

    data_file_base -- base name of the data files (no path/extension)
    num_support    -- number of HMM support points (used in file/model names)
    num_pools      -- size of the worker Pool for the per-lead evaluation
    num_iterations -- EM iterations forwarded to crossval/training
    train          -- when True, (re)build the model before evaluating
    """
    # run crossval
    run_hmm_cross_val.do_crossval(data_file_base, num_support,
                                  num_iterations=num_iterations,
                                  num_pools=num_pools)
    # If train is true - actually build the model
    if train:
        run_train_hmm.train_model(data_file_base, num_support,
                                  num_pools=num_pools,
                                  num_iterations=num_iterations)
    header = "lead,auc"
    # create results_file name
    test_results_file = "results/hmm_" + data_file_base + \
        "_support_%s_test.csv" % (num_support)
    test_data = None
    # execute_hmm takes a single "___"-joined argument string, so pack
    # (base, support, lead) into one string per lead in 1..13.
    pool = Pool(num_pools)
    try:
        rocs = pool.map(
            execute_hmm,
            ["___".join([data_file_base, str(num_support), str(lead)])
             for lead in range(1, 14)])
    finally:
        # fix: the pool was previously never closed, leaking worker processes
        pool.close()
        pool.join()
    # keep only the leads whose evaluation produced an AUC
    for idx, roc in enumerate(rocs):
        lead = idx + 1
        if roc is not None:
            test_data = utils.add_to_data(test_data, [lead, roc])
    np.savetxt(test_results_file, np.atleast_2d(test_data), fmt="%s",
               delimiter=",", header=header, comments='')
def run_log_reg_hmm(data_file_base, num_support, num_pools, num_iterations,
                    lead, lag, train=False, do_parallel=True):
    """Train/evaluate a logistic regression on HMM hidden-state features.

    The training csv is split in half by student: the first half trains the
    HMM; the second half (and the test csv) is pushed through the trained
    model's ./HMM_EM binary to extract per-week hidden-state distributions,
    which become the logistic-regression features.

    data_file_base -- base name of the ";"-delimited csvs under data/
    num_support    -- number of HMM support points (selects the model dir)
    num_pools / num_iterations -- forwarded to run_train_hmm.train_model
    lead  -- how many weeks past the last input week the label week lies
    lag   -- number of input weeks used as features (weeks 0 .. lag-1)
    train -- force retraining even when a model directory already exists
    Returns whatever logistic_regression.run_regression returns.

    NOTE(review): this function is defined twice in this file with identical
    logic; the later definition shadows this one at import time.
    """
    start_time = time.time()
    # NOTE(review): assumes every student contributes exactly 15 consecutive
    # rows (one per week) in the csvs — confirm against the data producer.
    num_weeks = 15
    data_prefix = "data/"
    data_suffix = ".csv"
    models_prefix = "models/"
    models_suffix = "_support_%s_logreg" % (num_support)
    data_file_train_input = data_prefix + data_file_base + "_train" + data_suffix
    data_file_train_hmm = data_prefix + data_file_base + "_train_logreg" + data_suffix
    data_file_test = data_prefix + data_file_base + "_test" + data_suffix
    models_dir = models_prefix + data_file_base + models_suffix
    train_data = np.genfromtxt(data_file_train_input, delimiter=';',
                               skip_header=0)
    test_data = np.genfromtxt(data_file_test, delimiter=";", skip_header=0)
    # split into train 1 and train 2 (Python 2 integer division keeps the
    # slice boundaries on whole-student multiples of num_weeks)
    num_students = len(train_data) / num_weeks
    num_students_train_hmm = num_students / 2
    train_hmm_data = train_data[:num_students_train_hmm * num_weeks]
    train_logreg_data = train_data[num_students_train_hmm * num_weeks:]
    # train hmm on train_hmm_data: persist the HMM half so the trainer can read it
    np.savetxt(data_file_train_hmm, train_hmm_data, fmt="%d", delimiter=";")
    # (re)train only when no model exists yet or retraining was requested
    if not os.path.exists(models_dir) or train:
        run_train_hmm.train_model(data_file_base, num_support,
                                  num_pools=num_pools,
                                  num_iterations=num_iterations, logreg=True,
                                  do_parallel=do_parallel)
    assert os.path.exists(
        models_dir), "There is no trained model in directory %s" % (models_dir)

    def get_log_reg_features(data):
        # Build (X, y) for the logistic regression: one row of concatenated
        # per-week hidden-state distributions per still-enrolled student.
        dropout_value = 0  # bin value for a student dropped out
        command_base = ["./HMM_EM", "PredictStateDistribution", models_dir]
        logreg_X = None
        logreg_Y = []
        for student in range(len(data) / num_weeks):
            stud_data = data[student * num_weeks:(student + 1) * num_weeks]
            end_week = lag - 1  # last observed input week (0-based)
            label_week = lead + end_week  # week whose first column is the label
            X = stud_data[0:end_week + 1, :].flatten()
            truth_val = stud_data[label_week, 0]
            if stud_data[end_week, 0] == dropout_value:
                continue  # student has already dropped out
            features = np.array([])
            for prediction_week in range(end_week + 1):
                # get hidden state distribution for each prediction_week
                command = command_base + [
                    str(prediction_week)
                ] + X.astype(str).tolist(
                )  # need to pass lead+end_week in - API asks for week to predict
                results = subprocess.check_output(command)
                # NOTE(review): first and last parsed fields are dropped —
                # presumably framing tokens in HMM_EM's output; confirm.
                state_dist = np.fromstring(results, sep=";")[1:-1]
                prediction_week_features = state_dist[:-1]
                features = np.concatenate(
                    [features, np.atleast_1d(prediction_week_features)])
            logreg_X = utils.add_to_data(logreg_X, features)
            logreg_Y += [truth_val]
        return logreg_X, logreg_Y

    # do inference on hmm to get features for logreg
    X_train, Y_train = get_log_reg_features(train_logreg_data)
    print "got train log_reg features for lead %s lag %s cohort %s support %s" % (
        lead, lag, data_file_base, num_support), time.time() - start_time, "seconds"
    # do inference on test set to get logreg features
    start_time = time.time()
    X_test, Y_test = get_log_reg_features(test_data)
    print "got test log_reg features for lead %s lag %s cohort %s support %s" % (
        lead, lag, data_file_base, num_support), time.time() - start_time, "seconds"
    return logistic_regression.run_regression(X_train, Y_train, X_test, Y_test,
                                              lead, lag)
def run_experiments(data_file_base, num_support, num_pools, num_iterations):
    """Train the logreg-HMM for one cohort/support level, fan out a logistic
    regression over every valid (lead, lag) pair in parallel, and write the
    train/test/crossval AUCs to three CSV files under results/.

    data_file_base -- expected shape "features_<cohort>_bin_5"
    num_support    -- number of HMM support points
    num_pools      -- size of the worker Pool
    num_iterations -- EM iterations forwarded to training
    """
    header = "lead,lag,support,auc"
    features_base = "features_"
    # strip the "features_" prefix and "_bin_5" suffix to recover the cohort
    # (fix: -len("_bin_5") replaces the opaque len("_bin_5") * -1)
    cohort = data_file_base[len(features_base):-len("_bin_5")]
    start_time = time.time()
    train_results_file = ("results/logistic_reg_hmm_" + features_base + cohort +
                          "_bin_5_support_%s_train" % num_support + ".csv")
    test_results_file = ("results/logistic_reg_hmm_" + features_base + cohort +
                         "_bin_5_support_%s_test" % num_support + ".csv")
    crossval_results_file = ("results/logistic_reg_hmm_" + features_base +
                             cohort + "_bin_5_support_%s_crossval" % num_support
                             + ".csv")
    train_data = None
    test_data = None
    crossval_data = None
    # rebuild the canonical base name from its parts
    data_file_base = features_base + cohort + "_bin_5"
    run_train_hmm.train_model(data_file_base, num_support, num_pools=num_pools,
                              num_iterations=num_iterations, logreg=True,
                              do_parallel=True)
    # execute_log_reg_hmm takes a single "___"-joined argument string;
    # enumerate every (lead, lag) pair that fits inside the 14-week horizon.
    args_list = ["___".join([features_base, cohort, str(num_support),
                             str(lead), str(lag), str(num_pools),
                             str(num_iterations)])
                 for lead in range(1, 14)
                 for lag in range(1, 15 - lead)]
    pool = Pool(num_pools)
    try:
        lead_lag_train_test_crossvals = pool.map(execute_log_reg_hmm, args_list)
    finally:
        # fix: the pool was previously never closed, leaking worker processes
        pool.close()
        pool.join()
    # each worker returns a "___"-joined "lead___lag___train___test___crossval"
    # string (or a falsy value on failure); collect the non-empty AUC fields
    for lead_lag_train_test_crossval in lead_lag_train_test_crossvals:
        if lead_lag_train_test_crossval:
            lead, lag, train_auc, test_auc, crossval_auc = \
                lead_lag_train_test_crossval.split("___")
            if train_auc:
                train_data = utils.add_to_data(
                    train_data,
                    [int(lead), int(lag), num_support, float(train_auc)])
            if test_auc:
                test_data = utils.add_to_data(
                    test_data,
                    [int(lead), int(lag), num_support, float(test_auc)])
            if crossval_auc:
                crossval_data = utils.add_to_data(
                    crossval_data,
                    [int(lead), int(lag), num_support, float(crossval_auc)])
    # parenthesized single-expression print: identical output under Python 2,
    # also valid under Python 3
    print("Ran logistic regression for %s support %s in %s seconds" %
          (cohort, num_support, time.time() - start_time))
    start_time = time.time()
    np.savetxt(train_results_file, np.atleast_2d(train_data), fmt="%s",
               delimiter=",", header=header, comments='')
    np.savetxt(test_results_file, np.atleast_2d(test_data), fmt="%s",
               delimiter=",", header=header, comments='')
    np.savetxt(crossval_results_file, np.atleast_2d(crossval_data), fmt="%s",
               delimiter=",", header=header, comments='')
def run_log_reg_hmm(data_file_base, num_support, num_pools, num_iterations,
                    lead, lag, train=False, do_parallel=True):
    """Train/evaluate a logistic regression on HMM hidden-state features.

    Half of the training csv (by student) trains the HMM; the other half and
    the test csv are run through the trained model's ./HMM_EM binary to get
    per-week hidden-state distributions, used as logistic-regression features.

    data_file_base -- base name of the ";"-delimited csvs under data/
    num_support    -- number of HMM support points (selects the model dir)
    num_pools / num_iterations -- forwarded to run_train_hmm.train_model
    lead  -- weeks between the last input week and the label week
    lag   -- number of input weeks used as features (weeks 0 .. lag-1)
    train -- force retraining even when a model directory already exists
    Returns whatever logistic_regression.run_regression returns.

    NOTE(review): this is a duplicate of an identical earlier definition in
    this file; being later, this copy is the one that takes effect — the two
    should be consolidated.
    """
    start_time = time.time()
    # NOTE(review): assumes each student occupies exactly 15 consecutive rows
    # (one per week) in the csvs — confirm against the data producer.
    num_weeks = 15
    data_prefix = "data/"
    data_suffix = ".csv"
    models_prefix = "models/"
    models_suffix = "_support_%s_logreg" % (num_support)
    data_file_train_input = data_prefix + data_file_base + "_train" + data_suffix
    data_file_train_hmm = data_prefix + data_file_base + "_train_logreg" + data_suffix
    data_file_test = data_prefix + data_file_base + "_test" + data_suffix
    models_dir = models_prefix + data_file_base + models_suffix
    train_data = np.genfromtxt(data_file_train_input, delimiter=';',
                               skip_header=0)
    test_data = np.genfromtxt(data_file_test, delimiter=";", skip_header=0)
    # split into train 1 and train 2 (Python 2 integer division keeps the
    # slice boundaries on whole-student multiples of num_weeks)
    num_students = len(train_data) / num_weeks
    num_students_train_hmm = num_students / 2
    train_hmm_data = train_data[:num_students_train_hmm * num_weeks]
    train_logreg_data = train_data[num_students_train_hmm * num_weeks:]
    # train hmm on train_hmm_data: persist the HMM half so the trainer can read it
    np.savetxt(data_file_train_hmm, train_hmm_data, fmt="%d", delimiter=";")
    # (re)train only when no model exists yet or retraining was requested
    if not os.path.exists(models_dir) or train:
        run_train_hmm.train_model(data_file_base, num_support,
                                  num_pools=num_pools,
                                  num_iterations=num_iterations, logreg=True,
                                  do_parallel=do_parallel)
    assert os.path.exists(
        models_dir), "There is no trained model in directory %s" % (models_dir)

    def get_log_reg_features(data):
        # Build (X, y) for the logistic regression: one row of concatenated
        # per-week hidden-state distributions per still-enrolled student.
        dropout_value = 0  # bin value for a student dropped out
        command_base = ["./HMM_EM", "PredictStateDistribution", models_dir]
        logreg_X = None
        logreg_Y = []
        for student in range(len(data) / num_weeks):
            stud_data = data[student * num_weeks:(student + 1) * num_weeks]
            end_week = lag - 1  # last observed input week (0-based)
            label_week = lead + end_week  # week whose first column is the label
            X = stud_data[0:end_week + 1, :].flatten()
            truth_val = stud_data[label_week, 0]
            if stud_data[end_week, 0] == dropout_value:
                continue  # student has already dropped out
            features = np.array([])
            for prediction_week in range(end_week + 1):
                # get hidden state distribution for each prediction_week
                command = command_base + [
                    str(prediction_week)
                ] + X.astype(str).tolist(
                )  # need to pass lead+end_week in - API asks for week to predict
                results = subprocess.check_output(command)
                # NOTE(review): first and last parsed fields are dropped —
                # presumably framing tokens in HMM_EM's output; confirm.
                state_dist = np.fromstring(results, sep=";")[1:-1]
                prediction_week_features = state_dist[:-1]
                features = np.concatenate(
                    [features, np.atleast_1d(prediction_week_features)])
            logreg_X = utils.add_to_data(logreg_X, features)
            logreg_Y += [truth_val]
        return logreg_X, logreg_Y

    # do inference on hmm to get features for logreg
    X_train, Y_train = get_log_reg_features(train_logreg_data)
    print "got train log_reg features for lead %s lag %s cohort %s support %s" % (
        lead, lag, data_file_base, num_support), time.time() - start_time, "seconds"
    # do inference on test set to get logreg features
    start_time = time.time()
    X_test, Y_test = get_log_reg_features(test_data)
    print "got test log_reg features for lead %s lag %s cohort %s support %s" % (
        lead, lag, data_file_base, num_support), time.time() - start_time, "seconds"
    return logistic_regression.run_regression(X_train, Y_train, X_test, Y_test,
                                              lead, lag)