def plot_features(subject, data_path, model_path, test_labels, dataset='test'):
    """Plot a 2-D t-SNE embedding of the network's extracted features.

    The pickled model state ``<model_path>/<subject>.pickle`` is loaded, the
    selected data set is scaled with the scalers stored in that state, the
    output of ``cnn.feature_extractor`` is evaluated for every sample, and the
    result is embedded with ``TSNE(n_components=2, random_state=0)`` and shown
    as a scatter plot coloured by label.

    Args:
        subject: subject name; selects both the model file and the data.
        data_path: forwarded to ``load_test_data`` / ``load_train_data``.
        model_path: directory that contains ``<subject>.pickle``.
        test_labels: mapping whose ``'preictal'`` entry supplies the point
            colours when ``dataset == 'test'``; it is not read otherwise.
        dataset: ``'test'`` (default) or ``'train'``.

    Raises:
        ValueError: if ``dataset`` is neither ``'test'`` nor ``'train'``.
    """
    # Reject a bad selector before unpickling the model and building the
    # network, so that a typo fails immediately and says what was wrong.
    if dataset not in ('test', 'train'):
        raise ValueError(
            "dataset must be 'test' or 'train', got %r" % (dataset,))

    # NOTE: unpickling can execute arbitrary code; only load trusted model files.
    with open(model_path + '/' + subject + '.pickle', 'rb') as f:
        state_dict = cPickle.load(f)
    cnn = ConvNet(state_dict['params'])
    cnn.set_weights(state_dict['weights'])
    scalers = state_dict['scalers']

    if dataset == 'test':
        d = load_test_data(data_path, subject)
        x = d['x']
        y = test_labels['preictal']
    else:
        d = load_train_data(data_path, subject)
        x, y = d['x'], d['y']

    # Re-use the scalers saved with the model instead of fitting new ones.
    if state_dict['params']['scale_time']:
        x, _ = scale_across_time(x, x_test=None, scalers=scalers)
    else:
        x, _ = scale_across_features(x, x_test=None, scalers=scalers)

    # Batch size is set to the number of samples; presumably so the whole data
    # set is processed as one batch -- TODO confirm against ConvNet.
    cnn.batch_size.set_value(x.shape[0])
    get_features = theano.function(
        [cnn.x, Param(cnn.training_mode, default=0)],
        cnn.feature_extractor.output,
        allow_input_downcast=True)
    features = get_features(x)

    tsne = TSNE(n_components=2, random_state=0)
    z = tsne.fit_transform(np.float64(features))
    plt.scatter(z[:, 0], z[:, 1], s=60, c=y)
    plt.show()
def plot_train_probs(subject, data_path, model_path):
    """Show the model's predicted probabilities on one subject's training data.

    Loads ``<model_path>/<subject>.pickle``, rebuilds the network from it,
    scales the training data with the scalers saved in the model state and
    plots one point per sample: the predicted probability on the vertical
    axis, a small seeded random jitter on the horizontal axis, coloured by
    the true label. The threshold(s) whose ROC point ``(fpr, tpr)`` lies
    closest to ``(0, 1)`` are printed.

    Args:
        subject: subject name; selects the model file and the training data,
            and is used as the plot title.
        data_path: forwarded to ``load_train_data``.
        model_path: directory that contains ``<subject>.pickle``.
    """
    model_file = model_path + '/' + subject + '.pickle'
    with open(model_file, 'rb') as f:
        state_dict = pickle.load(f)

    cnn = ConvNet(state_dict['params'])
    cnn.set_weights(state_dict['weights'])
    scalers = state_dict['scalers']

    train_data = load_train_data(data_path, subject)
    x, y = train_data['x'], train_data['y']

    # Apply the scalers stored with the model rather than fitting new ones.
    if state_dict['params']['scale_time']:
        x, _ = scale_across_time(x, x_test=None, scalers=scalers)
    else:
        x, _ = scale_across_features(x, x_test=None, scalers=scalers)

    cnn.batch_size.set_value(x.shape[0])
    probs = cnn.get_test_proba(x)

    fpr, tpr, threshold = roc_curve(y, probs)
    # Euclidean distance of every ROC point to the corner (fpr=0, tpr=1).
    corner_dist = np.sqrt((1 - tpr)**2 + fpr**2)
    closest = np.where(corner_dist == np.min(corner_dist))[0]
    opt_threshold = threshold[closest]  # an array: ties are all kept
    print(opt_threshold)

    # Horizontal jitter only spreads the points out; the seed is fixed so the
    # picture is reproducible.
    rng = np.random.RandomState(42)
    x_coords = np.zeros(len(y), dtype='float64')
    x_coords += rng.normal(0.0, 0.08, size=len(x_coords))

    plt.scatter(x_coords, probs, c=y, s=60)
    plt.title(subject)
    plt.show()
def predict(subject, data_path, model_path, submission_path):
    """Write one submission CSV per pickled model found for ``subject``.

    Every file in ``model_path`` whose name contains ``subject`` and ends with
    ``.pickle`` is treated as a saved model state. For each of them the
    subject's test data is loaded and scaled with the scalers stored in that
    state, the network is rebuilt, and its predicted probabilities are written
    to ``<submission_path>/<model file name without extension>.csv`` with the
    columns ``clip`` and ``preictal``.

    Args:
        subject: subject name; matched as a substring of the model file names.
        data_path: forwarded to ``load_test_data``.
        model_path: directory that is scanned for model files.
        submission_path: directory the CSV files are written to.
    """
    model_files = []
    for name in os.listdir(model_path):
        if subject in name and name.endswith('.pickle'):
            model_files.append(name)

    for filename in model_files:
        print(filename)

        test_data = load_test_data(data_path, subject)
        x = test_data['x']
        clip_ids = test_data['id']

        with open(model_path + '/' + filename, 'rb') as f:
            state_dict = pickle.load(f)
        scalers = state_dict['scalers']

        # Scale with the scalers saved alongside the weights.
        if state_dict['params']['scale_time']:
            x, _ = scale_across_time(x, x_test=None, scalers=scalers)
        else:
            x, _ = scale_across_features(x, x_test=None, scalers=scalers)

        cnn = ConvNet(state_dict['params'])
        cnn.set_weights(state_dict['weights'])
        test_proba = cnn.get_test_proba(x)

        rows = list(zip(clip_ids, test_proba))
        df = DataFrame(data=rows, columns=['clip', 'preictal'])

        # Drop the last dot-separated component (the extension) of the name.
        csv_name = filename.rsplit('.', 1)[0]
        csv_file = submission_path + '/' + csv_name + '.csv'
        df.to_csv(csv_file, index=False, header=True)
def predict(subject, data_path, model_path, submission_path):
    """Write one submission CSV per pickled model found for ``subject``.

    Every file in ``model_path`` whose name contains ``subject`` and ends with
    ``.pickle`` is loaded as a model state. The subject's test data is scaled
    with the scalers stored in that state, the network is rebuilt from the
    stored parameters and weights, and the predicted probabilities are saved to
    ``<submission_path>/<model file name without extension>.csv`` with the
    columns ``clip`` and ``preictal``.

    Args:
        subject: subject name; matched as a substring of the file names, so
            e.g. a name that is a prefix of another subject's name matches
            that subject's model files as well.
        data_path: forwarded to ``load_test_data``.
        model_path: directory that is scanned for model files.
        submission_path: directory the CSV files are written to.
    """
    patient_filenames = [filename for filename in os.listdir(model_path)
                         if subject in filename and filename.endswith('.pickle')]

    for filename in patient_filenames:
        # print() with a single argument behaves the same on Python 2 and 3.
        print(filename)

        # NOTE(review): the test data is re-loaded for every model file;
        # presumably because scaling may modify the array -- confirm before
        # hoisting this out of the loop.
        d = load_test_data(data_path, subject)
        x, clip_ids = d['x'], d['id']

        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(model_path + '/' + filename, 'rb') as f:
            state_dict = cPickle.load(f)

        scalers = state_dict['scalers']
        if state_dict['params']['scale_time']:
            x, _ = scale_across_time(x, x_test=None, scalers=scalers)
        else:
            x, _ = scale_across_features(x, x_test=None, scalers=scalers)

        cnn = ConvNet(state_dict['params'])
        cnn.set_weights(state_dict['weights'])
        test_proba = cnn.get_test_proba(x)

        # Materialise the pairs: on Python 3 zip() is a one-shot iterator.
        ans = list(zip(clip_ids, test_proba))
        df = DataFrame(data=ans, columns=['clip', 'preictal'])

        # Strip the final extension; every name here ends with '.pickle'.
        csv_name = filename.rsplit('.', 1)[0]
        df.to_csv(submission_path + '/' + csv_name + '.csv', index=False, header=True)
def plot_train_probs(subject, data_path, model_path):
    """Show the model's predicted probabilities on one subject's training data.

    Loads ``<model_path>/<subject>.pickle``, rebuilds the network from the
    stored parameters and weights, scales the training data with the stored
    scalers, and draws one point per sample: predicted probability on the
    vertical axis, a small seeded random jitter on the horizontal axis,
    coloured by the true label. The threshold(s) whose ROC point
    ``(fpr, tpr)`` is closest to ``(0, 1)`` are printed.

    Args:
        subject: subject name; selects the model file and the training data,
            and is used as the plot title.
        data_path: forwarded to ``load_train_data``.
        model_path: directory that contains ``<subject>.pickle``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(model_path + "/" + subject + ".pickle", "rb") as f:
        state_dict = cPickle.load(f)
    cnn = ConvNet(state_dict["params"])
    cnn.set_weights(state_dict["weights"])
    scalers = state_dict["scalers"]

    d = load_train_data(data_path, subject)
    x, y = d["x"], d["y"]

    # Apply the scalers stored with the model rather than fitting new ones.
    if state_dict["params"]["scale_time"]:
        x, _ = scale_across_time(x, x_test=None, scalers=scalers)
    else:
        x, _ = scale_across_features(x, x_test=None, scalers=scalers)

    cnn.batch_size.set_value(x.shape[0])
    probs = cnn.get_test_proba(x)

    fpr, tpr, threshold = roc_curve(y, probs)
    # Euclidean distance of every ROC point to the corner (fpr=0, tpr=1).
    c = np.sqrt((1 - tpr) ** 2 + fpr ** 2)
    opt_threshold = threshold[np.where(c == np.min(c))[0]]  # ties are all kept
    # print() with a single argument behaves the same on Python 2 and 3.
    print(opt_threshold)

    # Horizontal jitter only spreads the points out; the seed is fixed so the
    # picture is reproducible.
    x_coords = np.zeros(len(y), dtype="float64")
    rng = np.random.RandomState(42)
    x_coords += rng.normal(0.0, 0.08, size=len(x_coords))

    plt.scatter(x_coords, probs, c=y, s=60)
    plt.title(subject)
    plt.show()
def train(subject, data_path, model_path, model_params, validation_params):
    """Train a ConvNet for one subject and pickle its state.

    The result is written to ``<model_path>/<subject>.pickle`` and consists of
    ``cnn.get_state()`` plus the fitted scalers under the key ``'scalers'``.

    Two modes, selected by ``model_params['overlap']``:

    * truthy -- the training set is rebuilt with ``generate_overlapped_data``
      (``overlap_size=model_params['overlap']``), scaled without test data,
      and the network is trained for a fixed 175000 iterations. No validation
      is performed.
    * falsy -- the data is split into training and validation parts, either
      with ``StratifiedShuffleSplit(..., test_size=0.25, random_state=0)``
      when ``validation_params['random_split']`` is truthy or with the file
      lists returned by ``split_train_valid_filenames``. ``cnn.validate``
      returns the number of iterations to use; a fresh network is then
      trained for that many iterations on the re-loaded, re-scaled full
      training set.

    Args:
        subject: subject name; selects the data and names the output files.
        data_path: forwarded to the data loading helpers.
        model_path: directory for ``<subject>.pickle`` and, in validation
            mode, the ``<subject>.txt`` path handed to ``cnn.validate``.
        model_params: dict of network settings. The keys ``'use_test'``,
            ``'overlap'`` and ``'scale_time'`` are read here. The dict is
            modified in place: ``'n_channels'``, ``'n_fbins'`` and
            ``'n_timesteps'`` are set from axes 1, 2 and 3 of the data.
        validation_params: dict; ``'random_split'``, ``'valid_freq'`` and
            ``'max_iter'`` are read in validation mode.

    Side effects:
        Prints the parameters and data set sizes, and reads
        ``filenames.pickle`` from the current working directory whenever the
        grouped file names are needed.
    """
    d = load_train_data(data_path, subject)
    x, y, filename_to_idx = d['x'], d['y'], d['filename_to_idx']
    # NOTE(review): loaded up front as before, although the overlap branch
    # below never uses it.
    x_test = load_test_data(data_path, subject)['x'] if model_params['use_test'] else None

    # --------- add params
    model_params['n_channels'] = x.shape[1]
    model_params['n_fbins'] = x.shape[2]
    model_params['n_timesteps'] = x.shape[3]

    # Single-argument print() calls give the same output on Python 2 and 3.
    print('============ parameters')
    for key, value in model_params.items():
        print('{} : {}'.format(key, value))
    print('========================')

    model_file = model_path + '/' + subject + '.pickle'

    if model_params['overlap']:
        # no validation if overlap
        # Pickles must be read in binary mode, and the handle must be closed.
        with open('filenames.pickle', 'rb') as f:
            filenames_grouped_by_hour = cPickle.load(f)
        data_grouped_by_hour = load_grouped_train_data(data_path, subject, filenames_grouped_by_hour)
        x, y = generate_overlapped_data(data_grouped_by_hour,
                                        overlap_size=model_params['overlap'],
                                        window_size=x.shape[-1],
                                        overlap_interictal=True,
                                        overlap_preictal=True)
        print(x.shape)

        if model_params['scale_time']:
            x, scalers = scale_across_time(x, x_test=None)
        else:
            x, scalers = scale_across_features(x, x_test=None)

        cnn = ConvNet(model_params)
        cnn.train(train_set=(x, y), max_iter=175000)
        state_dict = cnn.get_state()
        state_dict['scalers'] = scalers
        with open(model_file, 'wb') as f:
            cPickle.dump(state_dict, f, protocol=cPickle.HIGHEST_PROTOCOL)
        return

    # -------------- split into train / validation
    if validation_params['random_split']:
        skf = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0)
        # n_iter=1: exactly one split is produced.
        train_idx, valid_idx = next(iter(skf))
    else:
        with open('filenames.pickle', 'rb') as f:
            filenames_grouped_by_hour = cPickle.load(f)
        d = split_train_valid_filenames(subject, filenames_grouped_by_hour)
        # NOTE(review): 'valid_filnames' looks misspelled, but it must match the
        # key produced by split_train_valid_filenames -- rename both or neither.
        train_filenames, valid_filenames = d['train_filenames'], d['valid_filnames']
        train_idx = [filename_to_idx[i] for i in train_filenames]
        valid_idx = [filename_to_idx[i] for i in valid_filenames]

    x_train, y_train = x[train_idx], y[train_idx]
    x_valid, y_valid = x[valid_idx], y[valid_idx]

    # Scalers are fitted on the training part and re-used for the validation part.
    if model_params['scale_time']:
        x_train, scalers_train = scale_across_time(x=x_train, x_test=x_test)
        x_valid, _ = scale_across_time(x=x_valid, x_test=x_test, scalers=scalers_train)
    else:
        x_train, scalers_train = scale_across_features(x=x_train, x_test=x_test)
        x_valid, _ = scale_across_features(x=x_valid, x_test=x_test, scalers=scalers_train)

    # Drop the references to the unsplit arrays before validating.
    del x, x_test

    print('============ dataset')
    print('train: {}'.format(x_train.shape))
    print('n_pos: {} n_neg: {}'.format(np.sum(y_train), len(y_train) - np.sum(y_train)))
    print('valid: {}'.format(x_valid.shape))
    print('n_pos: {} n_neg: {}'.format(np.sum(y_valid), len(y_valid) - np.sum(y_valid)))

    # -------------- validate
    cnn = ConvNet(model_params)
    best_iter = cnn.validate(train_set=(x_train, y_train),
                             valid_set=(x_valid, y_valid),
                             valid_freq=validation_params['valid_freq'],
                             max_iter=validation_params['max_iter'],
                             fname_out=model_path + '/' + subject + '.txt')

    # ---------------- scale
    # Reload everything and fit the final scalers on the full training set.
    d = load_train_data(data_path, subject)
    x, y = d['x'], d['y']
    x_test = load_test_data(data_path, subject)['x'] if model_params['use_test'] else None
    if model_params['scale_time']:
        x, scalers = scale_across_time(x=x, x_test=x_test)
    else:
        x, scalers = scale_across_features(x=x, x_test=x_test)
    del x_test

    cnn = ConvNet(model_params)
    cnn.train(train_set=(x, y), max_iter=best_iter)
    state_dict = cnn.get_state()
    state_dict['scalers'] = scalers
    with open(model_file, 'wb') as f:
        cPickle.dump(state_dict, f, protocol=cPickle.HIGHEST_PROTOCOL)
def train(subject, data_path, model_path, model_params, validation_params):
    """Train a ConvNet for one subject and pickle its state.

    The result is written to ``<model_path>/<subject>.pickle`` and consists of
    ``cnn.get_state()`` plus the fitted scalers under the key ``'scalers'``.

    Two modes, selected by ``model_params['overlap']``:

    * truthy -- the training set is rebuilt with ``generate_overlapped_data``
      (``overlap_size=model_params['overlap']``), scaled without test data,
      and the network is trained for a fixed 175000 iterations. No validation
      is performed.
    * falsy -- the data is split into training and validation parts, either
      with ``StratifiedShuffleSplit(..., test_size=0.25, random_state=0)``
      when ``validation_params['random_split']`` is truthy or with the file
      lists returned by ``split_train_valid_filenames``. ``cnn.validate``
      returns the number of iterations to use; a fresh network is then
      trained for that many iterations on the re-loaded, re-scaled full
      training set.

    Args:
        subject: subject name; selects the data and names the output files.
        data_path: forwarded to the data loading helpers.
        model_path: directory for ``<subject>.pickle`` and, in validation
            mode, the ``<subject>.txt`` path handed to ``cnn.validate``.
        model_params: dict of network settings. The keys ``'use_test'``,
            ``'overlap'`` and ``'scale_time'`` are read here. The dict is
            modified in place: ``'n_channels'``, ``'n_fbins'`` and
            ``'n_timesteps'`` are set from axes 1, 2 and 3 of the data.
        validation_params: dict; ``'random_split'``, ``'valid_freq'`` and
            ``'max_iter'`` are read in validation mode.

    Side effects:
        Prints the parameters and data set sizes, and reads
        ``filenames.pickle`` from the current working directory whenever the
        grouped file names are needed.
    """
    d = load_train_data(data_path, subject)
    x, y, filename_to_idx = d['x'], d['y'], d['filename_to_idx']
    # NOTE(review): loaded up front as before, although the overlap branch
    # below never uses it.
    x_test = load_test_data(data_path, subject)['x'] if model_params['use_test'] else None

    # --------- add params
    model_params['n_channels'] = x.shape[1]
    model_params['n_fbins'] = x.shape[2]
    model_params['n_timesteps'] = x.shape[3]

    # Single-argument print() calls give the same output on Python 2 and 3.
    print('============ parameters')
    for key, value in model_params.items():
        print('{} : {}'.format(key, value))
    print('========================')

    model_file = model_path + '/' + subject + '.pickle'

    if model_params['overlap']:
        # no validation if overlap
        # Pickles must be read in binary mode, and the handle must be closed.
        with open('filenames.pickle', 'rb') as f:
            filenames_grouped_by_hour = cPickle.load(f)
        data_grouped_by_hour = load_grouped_train_data(data_path, subject, filenames_grouped_by_hour)
        x, y = generate_overlapped_data(data_grouped_by_hour,
                                        overlap_size=model_params['overlap'],
                                        window_size=x.shape[-1],
                                        overlap_interictal=True,
                                        overlap_preictal=True)
        print(x.shape)

        if model_params['scale_time']:
            x, scalers = scale_across_time(x, x_test=None)
        else:
            x, scalers = scale_across_features(x, x_test=None)

        cnn = ConvNet(model_params)
        cnn.train(train_set=(x, y), max_iter=175000)
        state_dict = cnn.get_state()
        state_dict['scalers'] = scalers
        with open(model_file, 'wb') as f:
            cPickle.dump(state_dict, f, protocol=cPickle.HIGHEST_PROTOCOL)
        return

    # -------------- split into train / validation
    if validation_params['random_split']:
        skf = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0)
        # n_iter=1: exactly one split is produced.
        train_idx, valid_idx = next(iter(skf))
    else:
        with open('filenames.pickle', 'rb') as f:
            filenames_grouped_by_hour = cPickle.load(f)
        d = split_train_valid_filenames(subject, filenames_grouped_by_hour)
        # NOTE(review): 'valid_filnames' looks misspelled, but it must match the
        # key produced by split_train_valid_filenames -- rename both or neither.
        train_filenames, valid_filenames = d['train_filenames'], d['valid_filnames']
        train_idx = [filename_to_idx[i] for i in train_filenames]
        valid_idx = [filename_to_idx[i] for i in valid_filenames]

    x_train, y_train = x[train_idx], y[train_idx]
    x_valid, y_valid = x[valid_idx], y[valid_idx]

    # Scalers are fitted on the training part and re-used for the validation part.
    if model_params['scale_time']:
        x_train, scalers_train = scale_across_time(x=x_train, x_test=x_test)
        x_valid, _ = scale_across_time(x=x_valid, x_test=x_test, scalers=scalers_train)
    else:
        x_train, scalers_train = scale_across_features(x=x_train, x_test=x_test)
        x_valid, _ = scale_across_features(x=x_valid, x_test=x_test, scalers=scalers_train)

    # Drop the references to the unsplit arrays before validating.
    del x, x_test

    print('============ dataset')
    print('train: {}'.format(x_train.shape))
    print('n_pos: {} n_neg: {}'.format(np.sum(y_train), len(y_train) - np.sum(y_train)))
    print('valid: {}'.format(x_valid.shape))
    print('n_pos: {} n_neg: {}'.format(np.sum(y_valid), len(y_valid) - np.sum(y_valid)))

    # -------------- validate
    cnn = ConvNet(model_params)
    best_iter = cnn.validate(train_set=(x_train, y_train),
                             valid_set=(x_valid, y_valid),
                             valid_freq=validation_params['valid_freq'],
                             max_iter=validation_params['max_iter'],
                             fname_out=model_path + '/' + subject + '.txt')

    # ---------------- scale
    # Reload everything and fit the final scalers on the full training set.
    d = load_train_data(data_path, subject)
    x, y = d['x'], d['y']
    x_test = load_test_data(data_path, subject)['x'] if model_params['use_test'] else None
    if model_params['scale_time']:
        x, scalers = scale_across_time(x=x, x_test=x_test)
    else:
        x, scalers = scale_across_features(x=x, x_test=x_test)
    del x_test

    cnn = ConvNet(model_params)
    cnn.train(train_set=(x, y), max_iter=best_iter)
    state_dict = cnn.get_state()
    state_dict['scalers'] = scalers
    with open(model_file, 'wb') as f:
        cPickle.dump(state_dict, f, protocol=cPickle.HIGHEST_PROTOCOL)