import cPickle

import matplotlib.pyplot as plt
import numpy as np
from pandas import DataFrame
from sklearn.cross_validation import StratifiedKFold
from sklearn.lda import LDA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.preprocessing import StandardScaler

# Project-local helpers assumed importable from the surrounding package:
# load_train_data, load_test_data, load_grouped_train_data, reshape_data,
# print_cm, and the per-clip averaging helper predict(model, x, n, t).


def cross_validate(subject, data_path, reg_C, random_cv=False):
    if random_cv:
        # Plain stratified 10-fold CV over individual clips. Each clip is
        # wrapped as a one-element group so the fold loop below handles both
        # CV schemes uniformly.
        d = load_train_data(data_path, subject)
        x, y = d['x'], d['y']
        hours_data = [[clip] for clip in x]
        hours_labels = y
        skf = StratifiedKFold(y, n_folds=10)
    else:
        # Leave-one-hour-out CV: clips recorded within the same hour stay in
        # the same fold, so no hour leaks between training and validation.
        filenames_grouped_by_hour = cPickle.load(open('filenames.pickle'))
        data_grouped_by_hour = load_grouped_train_data(
            data_path, subject, filenames_grouped_by_hour)
        n_preictal = len(data_grouped_by_hour['preictal'])
        n_interictal = len(data_grouped_by_hour['interictal'])
        hours_data = data_grouped_by_hour['preictal'] + \
            data_grouped_by_hour['interictal']
        hours_labels = np.concatenate(
            (np.ones(n_preictal), np.zeros(n_interictal)))
        n_folds = n_preictal
        skf = StratifiedKFold(hours_labels, n_folds=n_folds)

    preictal_probs, labels = [], []
    for train_indexes, valid_indexes in skf:
        x_train, x_valid = [], []
        y_train, y_valid = [], []
        # Expand every hour into its clips; all clips inherit the hour label.
        for i in train_indexes:
            x_train.extend(hours_data[i])
            y_train.extend(hours_labels[i] * np.ones(len(hours_data[i])))
        for i in valid_indexes:
            x_valid.extend(hours_data[i])
            y_valid.extend(hours_labels[i] * np.ones(len(hours_data[i])))

        # Stack 3-d clips into a 4-d array with examples on the first axis.
        x_train = [xi[..., np.newaxis] for xi in x_train]
        x_train = np.concatenate(x_train, axis=3)
        x_train = np.rollaxis(x_train, axis=3)
        y_train = np.array(y_train)

        x_valid = [xi[..., np.newaxis] for xi in x_valid]
        x_valid = np.concatenate(x_valid, axis=3)
        x_valid = np.rollaxis(x_valid, axis=3)
        y_valid = np.array(y_valid)

        n_valid_examples = x_valid.shape[0]
        n_timesteps = x_valid.shape[-1]

        # Standardize on the training fold only, then fit the classifier.
        x_train, y_train = reshape_data(x_train, y_train)
        data_scaler = StandardScaler()
        x_train = data_scaler.fit_transform(x_train)
        logreg = LogisticRegression(C=reg_C)
        logreg.fit(x_train, y_train)

        x_valid = reshape_data(x_valid)
        x_valid = data_scaler.transform(x_valid)
        # predict() here is the helper that averages per-minute probabilities
        # back into one score per clip (not the submission-writing predict()
        # defined later in this file).
        p_valid = predict(logreg, x_valid, n_valid_examples, n_timesteps)
        preictal_probs.extend(p_valid)
        labels.extend(y_valid)
    return preictal_probs, labels
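# Usage sketch (not part of the original pipeline): pool the per-clip CV
# predictions and score them with ROC AUC. The subject name, data path, and
# reg_C value below are hypothetical, purely for illustration.
def _example_cv_auc():
    from sklearn.metrics import roc_auc_score
    probs, true_labels = cross_validate('Dog_1', '/path/to/features',
                                        reg_C=1.0)
    print 'CV AUC:', roc_auc_score(true_labels, probs)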
def curve_per_subject(subject, data_path, test_labels):
    d = load_train_data(data_path, subject)
    x, y_10m = d['x'], d['y']
    n_train_examples = x.shape[0]
    n_timesteps = x.shape[-1]
    print 'n_preictal', np.sum(y_10m)
    print 'n_interictal', np.sum(1 - y_10m)

    x, y = reshape_data(x, y_10m)
    data_scaler = StandardScaler()
    x = data_scaler.fit_transform(x)
    lda = LDA()
    lda.fit(x, y)

    # Per-minute probabilities, averaged back into one score per clip.
    pred_1m = lda.predict_proba(x)[:, 1]
    pred_10m = np.reshape(pred_1m, (n_train_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)

    # Training threshold closest to the ideal (fpr=0, tpr=1) corner of the
    # ROC curve; ties are broken toward the last (lowest) threshold.
    fpr, tpr, threshold = roc_curve(y_10m, pred_10m)
    c = np.sqrt((1 - tpr) ** 2 + fpr ** 2)
    opt_threshold = threshold[np.where(c == np.min(c))[0]][-1]
    print opt_threshold

    # ------- TEST ---------------
    d = load_test_data(data_path, subject)
    x_test, id = d['x'], d['id']
    n_test_examples = x_test.shape[0]
    n_timesteps = x_test.shape[3]

    x_test = reshape_data(x_test)
    x_test = data_scaler.transform(x_test)
    pred_1m = lda.predict_proba(x_test)[:, 1]
    pred_10m = np.reshape(pred_1m, (n_test_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)

    # Sensitivity/specificity at the threshold chosen on the training data.
    y_pred = np.zeros_like(test_labels)
    y_pred[np.where(pred_10m >= opt_threshold)] = 1
    cm = confusion_matrix(test_labels, y_pred)
    print
    print_cm(cm, labels=['interictal', 'preictal'])
    sn = 1.0 * cm[1, 1] / (cm[1, 1] + cm[1, 0])
    sp = 1.0 * cm[0, 0] / (cm[0, 0] + cm[0, 1])
    print sn, sp

    # Sensitivity/specificity swept across the whole range of thresholds.
    sn, sp = [], []
    t_list = np.arange(0.0, 1.0, 0.01)
    for t in t_list:
        y_pred = np.zeros_like(test_labels)
        y_pred[np.where(pred_10m >= t)] = 1
        cm = confusion_matrix(test_labels, y_pred)
        sn_t = 1.0 * cm[1, 1] / (cm[1, 1] + cm[1, 0])
        sp_t = 1.0 * cm[0, 0] / (cm[0, 0] + cm[0, 1])
        sn.append(sn_t)
        sp.append(sp_t)
    return t_list, sn, sp
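# Small illustrative helper (not part of the original pipeline) to plot the
# sensitivity/specificity trade-off returned by curve_per_subject().
def _plot_sn_sp(t_list, sn, sp, subject):
    plt.plot(t_list, sn, label='sensitivity')
    plt.plot(t_list, sp, label='specificity')
    plt.xlabel('decision threshold')
    plt.ylabel('rate')
    plt.title(subject)
    plt.legend(loc='best')
    plt.show()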
def train(subject, data_path, reg_C=None):
    # Logistic-regression variant of the training pipeline. reg_C must be
    # supplied by the caller: LogisticRegression does not accept C=None.
    d = load_train_data(data_path, subject)
    x, y = d['x'], d['y']
    x, y = reshape_data(x, y)
    data_scaler = StandardScaler()
    x = data_scaler.fit_transform(x)
    logreg = LogisticRegression(C=reg_C)
    logreg.fit(x, y)
    return logreg, data_scaler
def predict(subject, model, data_scaler, data_path, submission_path,
            test_labels, opt_threshold_train):
    # NOTE: test_labels and opt_threshold_train are accepted for interface
    # compatibility but are not used here.
    d = load_test_data(data_path, subject)
    x_test, id = d['x'], d['id']
    n_test_examples = x_test.shape[0]
    n_timesteps = x_test.shape[3]

    x_test = reshape_data(x_test)
    x_test = data_scaler.transform(x_test)
    pred_1m = model.predict_proba(x_test)[:, 1]
    # Average per-minute probabilities into one score per clip.
    pred_10m = np.reshape(pred_1m, (n_test_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)

    ans = zip(id, pred_10m)
    df = DataFrame(data=ans, columns=['clip', 'preictal'])
    df.to_csv(submission_path + '/' + subject + '.csv',
              index=False, header=True)
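# End-to-end sketch for the logistic-regression variant: train per subject,
# then write one submission CSV per subject. The subject names follow the
# Kaggle seizure-prediction naming scheme; the paths and reg_C value are
# hypothetical placeholders.
def _run_logreg_pipeline(data_path='/path/to/features',
                         submission_path='/path/to/submissions',
                         reg_C=1.0):
    subjects = ['Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5',
                'Patient_1', 'Patient_2']
    for subject in subjects:
        model, data_scaler = train(subject, data_path, reg_C=reg_C)
        # The last two arguments are unused by this predict() variant.
        predict(subject, model, data_scaler, data_path, submission_path,
                test_labels=None, opt_threshold_train=None)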
def train(subject, data_path, plot=False):
    # LDA variant of the training pipeline; also extracts the discriminant
    # weight of each (channel, frequency-band) feature.
    d = load_train_data(data_path, subject)
    x, y = d['x'], d['y']
    print 'n_preictal', np.sum(y)
    print 'n_interictal', np.sum(1 - y)

    n_channels = x.shape[1]
    n_fbins = x.shape[2]

    x, y = reshape_data(x, y)
    data_scaler = StandardScaler()
    x = data_scaler.fit_transform(x)
    lda = LDA()
    lda.fit(x, y)

    coef = lda.scalings_ * lda.coef_[:1].T

    channels = []
    fbins = []
    for c in range(n_channels):
        fbins.extend(range(n_fbins))  # 0 - delta, 1 - theta, ...
        channels.extend([c] * n_fbins)

    if plot:
        # One bar chart of |weight| per frequency band, one panel per channel.
        fig = plt.figure()
        for i in range(n_channels):
            # Subplot indices are 1-based.
            if n_channels == 24:
                fig.add_subplot(4, 6, i + 1)
            else:
                fig.add_subplot(4, 4, i + 1)
            ax = plt.gca()
            ax.set_xlim([0, n_fbins])
            ax.set_xticks(np.arange(0.5, n_fbins + 0.5, 1))
            ax.set_xticklabels(np.arange(0, n_fbins))
            max_y = np.max(np.abs(coef)) + 0.01
            ax.set_ylim([0, max_y])
            ax.set_yticks(
                np.around(np.arange(0, max_y, max_y / 4.0), decimals=1))
            for label in (ax.get_xticklabels() + ax.get_yticklabels()):
                label.set_fontsize(15)
            plt.bar(range(0, n_fbins),
                    abs(coef[i * n_fbins:i * n_fbins + n_fbins]))
        fig.suptitle(subject, fontsize=20)
        plt.show()

    coefs = np.reshape(coef, (n_channels, n_fbins))
    return lda, data_scaler, coefs
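# Illustrative follow-up (not part of the original pipeline): show the
# absolute LDA weights returned by train() as a channels x frequency-bands
# heatmap, as a compact alternative to the per-channel bar charts above.
def _plot_coefs(coefs, subject):
    plt.imshow(np.abs(coefs), interpolation='nearest', aspect='auto')
    plt.xlabel('frequency band')  # 0 - delta, 1 - theta, ...
    plt.ylabel('channel')
    plt.title(subject)
    plt.colorbar()
    plt.show()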
def predict(subject, model, data_scaler, data_path, submission_path):
    d = load_test_data(data_path, subject)
    x_test, id = d['x'], d['id']
    n_test_examples = x_test.shape[0]
    n_timesteps = x_test.shape[3]

    x_test = reshape_data(x_test)
    x_test = data_scaler.transform(x_test)
    pred_1m = model.predict_proba(x_test)[:, 1]
    pred_10m = np.reshape(pred_1m, (n_test_examples, n_timesteps))
    pred_10m = np.mean(pred_10m, axis=1)

    ans = zip(id, pred_10m)
    df = DataFrame(data=ans, columns=['clip', 'preictal'])
    df.to_csv(submission_path + '/' + subject + '.csv',
              index=False, header=True)
    return pred_10m