def get_y_for_subject(pred_fn, test_set_0, test_set_1, iterator, final_layer):
    """Predict on both test series of one subject.

    Assumes there was no resampling!!

    Runs ``pred_fn`` over all batches of each test set (in deterministic
    order) and reshapes the windowed outputs back to one continuous
    per-sample prediction array per series.

    Returns a list ``[preds_series_0, preds_series_1]``.
    """
    # network geometry needed to stitch window predictions back together
    input_time_length = lasagne.layers.get_all_layers(final_layer)[0].shape[2]
    n_sample_preds = get_n_sample_preds(final_layer)
    series_preds = []
    for test_set in (test_set_0, test_set_1):
        # forward pass over all batches, unshuffled so order is preserved
        batch_gen = iterator.get_batches(test_set, shuffle=False)
        raw_preds = [pred_fn(batch[0]) for batch in batch_gen]
        n_samples = test_set.get_topological_view().shape[0]
        series_preds.append(
            get_reshaped_cnt_preds(raw_preds, n_samples, input_time_length,
                                   n_sample_preds))
    return series_preds
def create_submission_csv_for_one_subject(folder_name, kaggle_set, iterator,
                                          preprocessor, final_layer,
                                          submission_id):
    """Create a Kaggle submission csv for one subject's two test series.

    Loads and preprocesses the subject's data, predicts on test series 9
    and 10, pads/reshapes the predictions back to the original
    (pre-resampling) series lengths and writes them to
    ``<folder_name>/<submission_id:02d>.csv``.

    Parameters
    ----------
    folder_name : str
        Directory the csv is written into.
    kaggle_set :
        Dataset object providing ``load``/``load_test_data``/
        ``resample_test_data`` plus ``train_X_series``/``test_X_series``/
        ``i_subject``.
    iterator :
        Batch iterator with ``get_batches(dataset, shuffle=False)``.
    preprocessor :
        Fitted on the train set (``can_fit=True``), applied frozen to test.
    final_layer :
        Final lasagne layer of the trained network.
    submission_id : int
        Zero-padded to form the csv file name.

    Raises
    ------
    NotImplementedError
        Always, before the csv is written: the duplication step below is
        only correct when the dataset was resampled to half rate, which is
        not checked yet (see message).
    """
    ### Load and preprocess data
    kaggle_set.load()
    # remember test series lengths before and after resampling to more
    # accurately pad predictions later (padding due to the lost samples)
    kaggle_set.load_test_data()
    test_series_lengths = [len(series) for series in kaggle_set.test_X_series]
    kaggle_set.resample_test_data()
    test_series_lengths_resampled = [len(series)
        for series in kaggle_set.test_X_series]
    # add singleton trailing axes -> ('b','c',0,1) layout used below
    X_train = deepcopy(
        np.concatenate(kaggle_set.train_X_series)[:,:,np.newaxis,np.newaxis])
    X_test_0 = deepcopy(kaggle_set.test_X_series[0][:,:,np.newaxis,np.newaxis])
    X_test_1 = deepcopy(kaggle_set.test_X_series[1][:,:,np.newaxis,np.newaxis])
    # create dense design matrix sets; test targets are dummies (6 classes)
    # only needed to satisfy the wrapper API
    train_set = DenseDesignMatrixWrapper(
        topo_view=X_train,
        y=None, axes=('b','c',0,1))
    fake_test_y = np.ones((len(X_test_0), 6))
    test_set_0 = DenseDesignMatrixWrapper(
        topo_view=X_test_0,
        y=fake_test_y)
    fake_test_y = np.ones((len(X_test_1), 6))
    test_set_1 = DenseDesignMatrixWrapper(
        topo_view=X_test_1,
        y=fake_test_y)
    log.info("Preprocessing data...")
    # fit preprocessor statistics on train only; apply frozen to test sets
    preprocessor.apply(train_set, can_fit=True)
    preprocessor.apply(test_set_0, can_fit=False)
    preprocessor.apply(test_set_1, can_fit=False)

    ### Create prediction function and create predictions
    log.info("Create prediction functions...")
    input_var = lasagne.layers.get_all_layers(final_layer)[0].input_var
    # deterministic=True disables stochastic layers (e.g. dropout)
    predictions = lasagne.layers.get_output(final_layer, deterministic=True)
    pred_fn = theano.function([input_var], predictions)
    log.info("Make predictions...")
    batch_gen_0 = iterator.get_batches(test_set_0, shuffle=False)
    all_preds_0 = [pred_fn(batch[0]) for batch in batch_gen_0]
    batch_gen_1 = iterator.get_batches(test_set_1, shuffle=False)
    all_preds_1 = [pred_fn(batch[0]) for batch in batch_gen_1]

    ### Pad and reshape predictions
    n_sample_preds = get_n_sample_preds(final_layer)
    input_time_length = lasagne.layers.get_all_layers(final_layer)[0].shape[2]
    n_samples_0 = test_set_0.get_topological_view().shape[0]
    preds_arr_0 = get_reshaped_cnt_preds(all_preds_0, n_samples_0,
        input_time_length, n_sample_preds)
    n_samples_1 = test_set_1.get_topological_view().shape[0]
    preds_arr_1 = get_reshaped_cnt_preds(all_preds_1, n_samples_1,
        input_time_length, n_sample_preds)
    series_preds = [preds_arr_0, preds_arr_1]
    assert len(series_preds[0]) == test_series_lengths_resampled[0]
    assert len(series_preds[1]) == test_series_lengths_resampled[1]
    # Deliberate stop for unfinished code. Was `assert False, ...`, which is
    # stripped under `python -O` and would silently run the (known-wrong)
    # duplication below; raise explicitly instead so it always stops.
    raise NotImplementedError(
        "TODO: here only duplicate if resample half is true for the dataset.. "
        "also take care how to create submission cv if trained on all subjects")
    # presumably undoes half-rate resampling by repeating each prediction
    # twice -- only valid for half-rate datasets, see TODO above
    series_preds_duplicated = [np.repeat(preds, 2, axis=0)
        for preds in series_preds]
    n_classes = preds_arr_0.shape[1]
    # pad missing ones with zeros (samples lost at the series start)
    missing_0 = test_series_lengths[0] - len(series_preds_duplicated[0])
    full_preds_0 = np.append(np.zeros((missing_0, n_classes), dtype=np.float32),
        series_preds_duplicated[0], axis=0)
    missing_1 = test_series_lengths[1] - len(series_preds_duplicated[1])
    full_preds_1 = np.append(np.zeros((missing_1, n_classes), dtype=np.float32),
        series_preds_duplicated[1], axis=0)
    assert len(full_preds_0) == test_series_lengths[0]
    assert len(full_preds_1) == test_series_lengths[1]
    full_series_preds = [full_preds_0, full_preds_1]
    assert sum([len(a) for a in full_series_preds]) == np.sum(
        test_series_lengths)

    ### Create csv
    log.info("Create csv...")
    csv_filename = "{:02d}".format(submission_id) + '.csv'
    csv_filename = os.path.join(folder_name, csv_filename)
    cols = ['HandStart','FirstDigitTouch',
        'BothStartLoadPhase','LiftOff',
        'Replace','BothReleased']
    # collect ids; test series are numbered 9 and 10 in the submission format
    all_ids = []
    all_preds = []
    for i_series in (9,10):
        id_prefix = "subj{:d}_series{:d}_".format(kaggle_set.i_subject,
            i_series)
        this_preds = full_series_preds[i_series-9]
        # respect offsets
        all_preds.extend(this_preds)
        this_ids = [id_prefix + str(i_sample)
            for i_sample in range(this_preds.shape[0])]
        all_ids.extend(this_ids)
    all_ids = np.array(all_ids)
    all_preds = np.array(all_preds)
    submission = pd.DataFrame(index=all_ids, columns=cols, data=all_preds)
    submission.to_csv(csv_filename, index_label='id', float_format='%.3f')
    log.info("Done")
def create_submission_csv_for_one_subject(folder_name, kaggle_set, iterator,
                                          preprocessor, final_layer,
                                          submission_id):
    """Create a Kaggle submission csv for one subject's two test series.

    NOTE(review): this is a token-identical (reformatted) duplicate of the
    function of the same name defined earlier in this file; being defined
    later, this copy is the one in effect after import.

    WARNING: contains an unconditional ``assert False`` (TODO guard), so
    everything after it -- including the csv writing -- is currently
    unreachable.
    """
    ### Load and preprocess data
    kaggle_set.load()
    # remember test series lengths before and after resampling to more accurately pad predictions
    # later (padding due to the lost samples)
    kaggle_set.load_test_data()
    test_series_lengths = [len(series) for series in kaggle_set.test_X_series]
    kaggle_set.resample_test_data()
    test_series_lengths_resampled = [
        len(series) for series in kaggle_set.test_X_series
    ]
    # add singleton trailing axes -> ('b', 'c', 0, 1) layout used below
    X_train = deepcopy(
        np.concatenate(kaggle_set.train_X_series)[:, :, np.newaxis,
                                                  np.newaxis])
    X_test_0 = deepcopy(kaggle_set.test_X_series[0][:, :, np.newaxis,
                                                    np.newaxis])
    X_test_1 = deepcopy(kaggle_set.test_X_series[1][:, :, np.newaxis,
                                                    np.newaxis])
    # create dense design matrix sets; test targets are dummies (6 classes)
    # only needed to satisfy the wrapper API
    train_set = DenseDesignMatrixWrapper(topo_view=X_train,
                                         y=None,
                                         axes=('b', 'c', 0, 1))
    fake_test_y = np.ones((len(X_test_0), 6))
    test_set_0 = DenseDesignMatrixWrapper(topo_view=X_test_0, y=fake_test_y)
    fake_test_y = np.ones((len(X_test_1), 6))
    test_set_1 = DenseDesignMatrixWrapper(topo_view=X_test_1, y=fake_test_y)
    log.info("Preprocessing data...")
    # fit preprocessor statistics on train only; apply frozen to test sets
    preprocessor.apply(train_set, can_fit=True)
    preprocessor.apply(test_set_0, can_fit=False)
    preprocessor.apply(test_set_1, can_fit=False)

    ### Create prediction function and create predictions
    log.info("Create prediction functions...")
    input_var = lasagne.layers.get_all_layers(final_layer)[0].input_var
    # deterministic=True disables stochastic layers (e.g. dropout)
    predictions = lasagne.layers.get_output(final_layer, deterministic=True)
    pred_fn = theano.function([input_var], predictions)
    log.info("Make predictions...")
    batch_gen_0 = iterator.get_batches(test_set_0, shuffle=False)
    all_preds_0 = [pred_fn(batch[0]) for batch in batch_gen_0]
    batch_gen_1 = iterator.get_batches(test_set_1, shuffle=False)
    all_preds_1 = [pred_fn(batch[0]) for batch in batch_gen_1]

    ### Pad and reshape predictions
    n_sample_preds = get_n_sample_preds(final_layer)
    input_time_length = lasagne.layers.get_all_layers(final_layer)[0].shape[2]
    n_samples_0 = test_set_0.get_topological_view().shape[0]
    preds_arr_0 = get_reshaped_cnt_preds(all_preds_0, n_samples_0,
                                         input_time_length, n_sample_preds)
    n_samples_1 = test_set_1.get_topological_view().shape[0]
    preds_arr_1 = get_reshaped_cnt_preds(all_preds_1, n_samples_1,
                                         input_time_length, n_sample_preds)
    series_preds = [preds_arr_0, preds_arr_1]
    assert len(series_preds[0]) == test_series_lengths_resampled[0]
    assert len(series_preds[1]) == test_series_lengths_resampled[1]
    # deliberate stop: the duplication below is only valid when the dataset
    # was resampled to half rate, which is not checked yet
    assert False, (
        "TODO: here only duplicate if resample half is true for the dataset.. "
        "also take care how to create submission cv if trained on all subjects"
    )
    # presumably undoes half-rate resampling by repeating each prediction
    # twice -- see TODO above
    series_preds_duplicated = [
        np.repeat(preds, 2, axis=0) for preds in series_preds
    ]
    n_classes = preds_arr_0.shape[1]
    # pad missing ones with zeros
    missing_0 = test_series_lengths[0] - len(series_preds_duplicated[0])
    full_preds_0 = np.append(np.zeros((missing_0, n_classes),
                                      dtype=np.float32),
                             series_preds_duplicated[0],
                             axis=0)
    missing_1 = test_series_lengths[1] - len(series_preds_duplicated[1])
    full_preds_1 = np.append(np.zeros((missing_1, n_classes),
                                      dtype=np.float32),
                             series_preds_duplicated[1],
                             axis=0)
    assert len(full_preds_0) == test_series_lengths[0]
    assert len(full_preds_1) == test_series_lengths[1]
    full_series_preds = [full_preds_0, full_preds_1]
    assert sum([len(a)
                for a in full_series_preds]) == np.sum(test_series_lengths)

    ### Create csv
    log.info("Create csv...")
    csv_filename = "{:02d}".format(submission_id) + '.csv'
    csv_filename = os.path.join(folder_name, csv_filename)
    cols = [
        'HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff',
        'Replace', 'BothReleased'
    ]
    # collect ids; test series are numbered 9 and 10 in the submission format
    all_ids = []
    all_preds = []
    for i_series in (9, 10):
        id_prefix = "subj{:d}_series{:d}_".format(kaggle_set.i_subject,
                                                  i_series)
        this_preds = full_series_preds[i_series - 9]
        # respect offsets
        all_preds.extend(this_preds)
        this_ids = [
            id_prefix + str(i_sample)
            for i_sample in range(this_preds.shape[0])
        ]
        all_ids.extend(this_ids)
    all_ids = np.array(all_ids)
    all_preds = np.array(all_preds)
    submission = pd.DataFrame(index=all_ids, columns=cols, data=all_preds)
    submission.to_csv(csv_filename, index_label='id', float_format='%.3f')
    log.info("Done")