def split_y(data, col):
    """Binarise column ``col`` around its median and append rows to a CSV.

    For every row of ``data`` a label is derived: 1 when the value in column
    ``col`` is strictly greater than the column median, otherwise 0. Each row
    is then appended to
    ``data/matrix_data/logit/wifi_features_<label>.csv`` as the values of
    columns 3, 7 and 8 followed by the 0/1 label.

    Args:
        data: 2-D numpy array; must have at least 9 columns and a column
            ``col``.
        col: index of the column to binarise. ``LABELS[col - 16]`` supplies
            the file-name suffix (the offset 16 is taken as-is from the
            original code -- NOTE(review): confirm it matches the caller's
            column layout).

    Side effects:
        Calls ``replace_nan(data, col)`` first (helper defined elsewhere;
        presumably fills NaNs in place -- confirm), prints diagnostic output
        to stdout, and opens the output file in append mode, so repeated
        calls accumulate rows.
    """
    feature_cols = [3, 7, 8]

    replace_nan(data, col)
    target = data[:, col]
    print(target)
    median = np.median(target)
    print(median)

    # Compute the labels once so the diagnostic output and the file contents
    # can never disagree.
    labels = [1 if value > median else 0 for value in target]

    for i, y in enumerate(labels):
        print('=========')
        print(data[i, feature_cols])
        print(data[i, col])
        print(y)

    label = LABELS[col - 16]
    output_fp = os.path.join('data', 'matrix_data', 'logit', "wifi_features_" + label + '.csv')
    # The context manager closes the file even if a write fails part-way.
    with open(output_fp, 'a') as fw:
        for i, y in enumerate(labels):
            fields = [str(x) for x in data[i, feature_cols].tolist()]
            fields.append(str(y))
            fw.write(','.join(fields) + '\n')
def kfold_partition(data, fold, j):
    """Split ``data`` into the train/test parts of fold ``j``.

    Row ``i`` goes to the test part when ``i % fold == j`` and to the train
    part otherwise; row order is preserved in both parts.

    Args:
        data: 2-D numpy array of samples (one row per sample).
        fold: total number of folds.
        j: index of the fold used for testing, ``0 <= j < fold``.

    Returns:
        ``(train_data, test_data)`` as new arrays (copies, not views).
    """
    is_test = np.arange(data.shape[0]) % fold == j
    return data[~is_test], data[is_test]


if __name__ == '__main__':
    # Cross-validated MSE for each target column of the wifi feature matrix.
    fp = os.path.join('data', 'matrix_data', 'all_wifi_features.csv')
    data = np.genfromtxt(fp, delimiter=",", dtype=float, skip_header=1)
    # Not referenced in this block; kept as a module-level name in case the
    # model helpers defined elsewhere read it -- NOTE(review): confirm.
    x_cols = [1, 2, 3]
    fold = 5
    y_cols = range(4, 19)
    for y_col in y_cols:
        print('================================================================')
        print(LABELS[y_col - 4])
        replace_nan(data, y_col)
        avg_mse = 0.0
        for j in range(fold):
            # Mask-based split: correct for every fold index and for row
            # counts that are not a multiple of ``fold``.
            train_data, test_data = kfold_partition(data, fold, j)
            beta = get_beta(train_data, y_col)
            mse = get_mse(test_data, beta, y_col)
            avg_mse += mse
        avg_mse /= fold
        print("average mse: " + str(avg_mse))