def firstProblem():
    """Experiment 1: score a 3-NN classifier on purely random labels.

    For each of 100 repetitions and each sample size in the module-level
    ``ssize`` (assumed defined elsewhere in this module — TODO confirm,
    along with ``mu`` and ``sigma``), draws ``size`` points from
    N(mu, sigma), assigns exactly half of them label 1 at random, runs
    leave-one-out CV with a 3-nearest-neighbors classifier, and records
    the concordance index and AUC. Results are written to 'result1.csv'
    as rows of [c_index, auc, size, repetition].
    """
    result = []
    for m in range(100):
        for size in ssize:
            score = Score()
            cv = LeaveNOut()
            classifier = KNeighborsClassifier(n_neighbors=3, algorithm="kd_tree")
            # Single feature column drawn from a normal distribution.
            X = np.random.normal(mu, sigma, size)
            X = np.expand_dims(X, axis=1)
            # Labels start all-zero, one column wide.
            Y = np.zeros([X.shape[0]])
            Y = np.expand_dims(Y, axis=1)
            # BUGFIX: floor division — `size / 2` is a float under Python 3,
            # which random.sample() rejects; `//` is identical under Python 2.
            one_indices = random.sample(range(X.shape[0]), size // 2)
            Y[one_indices, 0] = 1
            # leave one out
            pred, true = cv.run(data=X, labels=Y, model=classifier, n_out=1)
            c = score.c_score(pred, true)
            auc_score = auc(pred, true, reorder=True)
            result.append([c, auc_score, size, m])
    data = pd.DataFrame(result)
    data.to_csv('result1.csv', header=False, index=False)
def right_feature_selection(X, Y):
    """Evaluate 3-NN with feature selection done INSIDE each CV fold.

    This is the methodologically correct protocol: the cross-validator is
    asked to perform embedded feature selection per fold, so test folds
    never influence which features are chosen.

    Returns a (c_index, auc_score) tuple.
    """
    scorer = Score()
    cross_validator = LeaveNOut()
    knn = KNeighborsClassifier(n_neighbors=3, algorithm="kd_tree")
    predictions, actuals = cross_validator.run(
        data=X,
        labels=Y,
        model=knn,
        n_out=1,
        embedded_feature_selection=True,
    )
    c_index = scorer.c_score(predictions, actuals)
    area = auc(predictions, actuals, reorder=True)
    return c_index, area
def wrong_feature_selection(X, Y):
    """Evaluate 3-NN with feature selection done BEFORE cross-validation.

    Deliberately flawed protocol for comparison: the 10 features are chosen
    once on the full data set, leaking test-fold information into the
    selection, and only then is leave-one-out CV run.

    Returns a (c_index, auc_score) tuple.
    """
    scorer = Score()
    cross_validator = LeaveNOut()
    knn = KNeighborsClassifier(n_neighbors=3, algorithm="kd_tree")
    # Selection on the whole data set — the source of the leakage.
    X, selected = cross_validator.select(X, Y, select_count=10)
    predictions, actuals = cross_validator.run(
        data=X,
        labels=Y,
        model=knn,
        n_out=1,
    )
    c_index = scorer.c_score(predictions, actuals)
    area = auc(predictions, actuals, reorder=True)
    return c_index, area
from kNN import NearestNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
from Score import *
import matplotlib.pyplot as plt
import seaborn as sns
from legacy_script import *
from LeaveNOut import *
from sklearn.svm import SVR
import pdb

# Soil-water-permeability experiment: sweep the spatial "dead zone" radius
# used during leave-one-out CV and plot concordance index against it.
# BUGFIX/idiom: renamed `input` -> `input_data` (shadowed the builtin) and
# parenthesized `print` (the statement form is Python-2-only; the call form
# behaves identically under Python 2 for a single argument).
input_data = pd.read_csv('./Soil_water_permeability_data/INPUT.csv', header=None)
output = pd.read_csv('./Soil_water_permeability_data/OUTPUT.csv', header=None)
coordinates = pd.read_csv('./Soil_water_permeability_data/COORDINATES.csv', header=None)

# Normalization
std_input = standardize_dataset(input_data)

plot_data = []
for n in range(0, 201, 10):
    model = NearestNeighborsRegressor(n_neighbors=5)
    # zone_radius excludes training points spatially close to the held-out
    # sample — presumably to counter spatial autocorrelation; see LeaveNOut.
    runner = LeaveNOut(zone_radius=n)
    predict, true = runner.run(data=std_input, model=model,
                               labels=output.as_matrix(), n_out=1,
                               coordinates=coordinates.as_matrix())
    score = Score()
    plot_data.append([score.c_score(np.array(predict)[:, 0],
                                    np.array(true)[:, 0]),
                      n, 'Concordance Index'])
    print("epoch %d " % n)

line_plot(np.array(plot_data),
          title="Concordance index by different Dead zone radius - Leave 1 out CV",
          x_title="Dead zone radius",
          y_title="C-Index")
import matplotlib.pyplot as plt
import seaborn as sns
from legacy_script import *
from LeaveNOut import *
from sklearn.svm import SVR
import pdb

# Water-data experiment: for leave-1-out and leave-3-out CV, sweep the
# number of neighbors k of a k-NN regressor and plot the concordance index
# for each of the three target columns.
data = pd.read_csv('Water_data.csv')

# Split data into labels and features
train_labels, train_data = np.hsplit(data, [3])

# Normalization
train_data = standardize_dataset(train_data)
train_labels = train_labels.as_matrix()

# Try with different neighbors and different leave N out cross validation:
for n in [1, 3]:
    plot_data = []
    for k in range(1, 30):
        regressor = NearestNeighborsRegressor(n_neighbors=k)
        cv_runner = LeaveNOut()
        predict, true = cv_runner.run(data=train_data, model=regressor,
                                      labels=train_labels, n_out=n)
        score = Score()
        predict_arr = np.array(predict)
        true_arr = np.array(true)
        # One C-index per target column.
        for col, label in enumerate(['c_total', 'Cd', 'Pb']):
            plot_data.append([score.c_score(predict_arr[:, col],
                                            true_arr[:, col]),
                              k, label])
    line_plot(np.array(plot_data),
              title="C_index by different K Neighbors - Leave %s out CV" % n,
              x_title="K Neighbors",
              y_title="C-Index")