header = ret["header"] X = common_utils.extract_features_from_rawdata(chunk, header, args.period, args.features) Xs.append(X) ys += y Xs = np.concatenate(Xs, axis=0) return (Xs, ys) print "==> reading data and extracting features" #chunk_size = 8000 # TODO: bigger chunk_size (train_X, train_y) = read_and_extract_features(train_reader, train_reader.get_number_of_examples()) del train_reader (val_X, val_y) = read_and_extract_features(val_reader, val_reader.get_number_of_examples()) del val_reader (test_X, test_y) = read_and_extract_features(test_reader, test_reader.get_number_of_examples()) del test_reader print "==> imputing missing values" # imput missing values imputer = Imputer(missing_values=np.nan, strategy='mean',
process_one_chunk("train", chunk_index) cnt_trained = chunk_index - n_trained_chunks + 1 if (cnt_trained % 5 == 0): val_loss = process_one_chunk("test", chunk_index) if ((cnt_trained / 5) % args.save_every == 0): state_name = 'states/%s.chunk%d.test%.8f.state' % ( network_name, chunk_index, val_loss) print "==> saving ... %s" % state_name network.save_params(state_name, chunk_index) print "chunk %d took %.3fs" % (chunk_index, float(time.time()) - start_time) chunks_per_epoch = train_reader.get_number_of_examples() // chunk_size if (cnt_trained % chunks_per_epoch == 0): train_reader.random_shuffle() val_reader.random_shuffle() elif args.mode == 'test': # ensure that the code uses test_reader del train_reader del val_reader test_reader = DecompensationReader( dataset_dir='../../data/decompensation/test/', listfile='../../data/decompensation/test_listfile.csv') n_batches = test_reader.get_number_of_examples() // args.batch_size y_true = []
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.001]

    train_reader = DecompensationReader(dataset_dir='../../../data/decompensation/train/',
                                        listfile='../../../data/decompensation/train_listfile.csv')
    val_reader = DecompensationReader(dataset_dir='../../../data/decompensation/train/',
                                      listfile='../../../data/decompensation/val_listfile.csv')
    test_reader = DecompensationReader(dataset_dir='../../../data/decompensation/test/',
                                       listfile='../../../data/decompensation/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)
    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)
    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    common_utils.create_directory('results')

    for (penalty, C) in zip(penalties, Cs):
        file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
        logreg.fit(train_X, train_y)

        with open(os.path.join('results', 'train_{}.json'.format(file_name)), "w") as res_file:
            ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
            ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        prediction = logreg.predict_proba(test_X)[:, 1]

        with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
            ret = print_metrics_binary(test_y, prediction)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        save_results(test_names, test_ts, prediction, test_y,
                     os.path.join('predictions', file_name + '.csv'))
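# NOTE (sketch, not from the original code): the read_and_extract_features helper
# called above is not included in this excerpt. A minimal version consistent with
# its four-value call sites and with the chunked-reading tail shown earlier might
# look like the following; the chunk keys 'name' and 't' and the read_chunk_size
# value are assumptions.
def read_and_extract_features(reader, count, period, features):
    read_chunk_size = 1000  # assumed number of episodes read per chunk
    Xs, ys, names, ts = [], [], [], []
    for i in range(0, count, read_chunk_size):
        # read a chunk of raw episodes and turn each one into a fixed-size feature vector
        ret = common_utils.read_chunk(reader, min(read_chunk_size, count - i))
        X = common_utils.extract_features_from_rawdata(ret["X"], ret["header"], period, features)
        Xs.append(X)
        ys += ret["y"]
        names += ret["name"]
        ts += ret["t"]
    Xs = np.concatenate(Xs, axis=0)
    return (Xs, ys, names, ts)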
    # ensure that the code uses test_reader
    del train_data_gen
    del val_data_gen
    if args.deep_supervision:
        del train_data_loader
        del val_data_loader
    else:
        del train_reader
        del val_reader

    test_reader = DecompensationReader(
        dataset_dir='../../data/decompensation/test/',
        listfile='../../data/decompensation/test_listfile.csv')

    test_nbatches = test_reader.get_number_of_examples() // args.batch_size
    test_nbatches = 10000  # NOTE: overrides the value computed above; the test pass runs a fixed 10000 batches
    test_data_gen = utils.BatchGen(test_reader, discretizer, normalizer,
                                   args.batch_size, test_nbatches)

    labels = []
    predictions = []
    for i in range(test_nbatches):
        print "\rpredicting {} / {}".format(i, test_nbatches),
        x, y = next(test_data_gen)
        x = np.array(x)
        pred = model.predict_on_batch(x)[:, 0]
        predictions += list(pred)
        labels += list(y)

    metrics.print_metrics_binary(labels, predictions)

    with open("activations.txt", "w") as fout:
    Xs = np.concatenate(Xs, axis=0)
    return (Xs, ys)

print "==> reading data and extracting features"
chunk_size = 100000  # TODO: bigger chunk_size
(train_X, train_y) = read_and_extract_features(train_reader, chunk_size)
del train_reader
(val_X, val_y) = read_and_extract_features(val_reader, chunk_size)
del val_reader
(test_X, test_y) = read_and_extract_features(test_reader, test_reader.get_number_of_examples())
del test_reader

print "==> imputing missing values"
# impute missing values
imputer = Imputer(missing_values=np.nan, strategy='mean',
                  axis=0, verbose=0, copy=True)
imputer.fit(train_X)
train_X = np.array(imputer.transform(train_X), dtype=np.float32)
val_X = np.array(imputer.transform(val_X), dtype=np.float32)
test_X = np.array(imputer.transform(test_X), dtype=np.float32)

print "==> normalizing data"
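# NOTE: sklearn.preprocessing.Imputer, used above, was deprecated in scikit-learn
# 0.20 and removed in 0.22. On newer scikit-learn versions the same per-column mean
# imputation can be written with sklearn.impute.SimpleImputer (which has no axis
# argument; it always imputes column-wise, i.e. the axis=0 behaviour used here):
#
#     from sklearn.impute import SimpleImputer
#     imputer = SimpleImputer(missing_values=np.nan, strategy='mean', copy=True)
#     imputer.fit(train_X)
#     train_X = np.array(imputer.transform(train_X), dtype=np.float32)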