# Shared imports for the snippets in this section. The mimic3-benchmarks-style
# module paths (mimic3benchmark.readers, mimic3models.*) and the helpers they
# provide (read_and_extract_features, load_data_logistic_regression,
# create_frame) are assumed from context and not defined here.
# sklearn.preprocessing.Imputer requires scikit-learn < 0.22.
import argparse
import csv
import json
import os
import pickle

import joblib
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.svm import SVC

import lightgbm as lgb

from mimic3benchmark.readers import (DecompensationReader,
                                     InHospitalMortalityReader,
                                     LengthOfStayReader, PhenotypingReader)
from mimic3models import common_utils, metrics
from mimic3models.metrics import (print_metrics_binary,
                                  print_metrics_regression)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative to which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    print('Reading data and extracting features ...')
    # Helper assumed from context: returns all three splits already featurized.
    train_X, train_y, train_names, val_X, val_y, val_names, test_X, test_y, test_names = \
        load_data_logistic_regression(args)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
def save_embeddings(names, embeddings, path):
    common_utils.create_directory(os.path.dirname(path))
    with open(path, 'w') as f:
        header = ["stay"]
        header += ["emb_{}".format(x) for x in range(len(embeddings[0]))]
        header = ",".join(header)
        f.write(header + '\n')
        for name, emb in zip(names, embeddings):
            line = [name]
            # emb_str = ["%.10f" % x for x in emb]
            emb_str = [str(x) for x in emb]
            line += emb_str
            line = ",".join(line)
            f.write(line + '\n')
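# Minimal usage sketch for save_embeddings. The stay names and the 2x3
# embedding matrix below are toy values; real callers pass per-stay embedding
# vectors produced elsewhere in the project.
#
#   names = ['stay_a_timeseries.csv', 'stay_b_timeseries.csv']
#   embeddings = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]])
#   save_embeddings(names, embeddings, 'embeddings/val_embeddings.csv')
#
# This writes a CSV with a "stay" column followed by emb_0 .. emb_2.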
def save_results(names, ts, predictions, labels, path):
    n_tasks = 25
    common_utils.create_directory(os.path.dirname(path))
    with open(path, 'w') as f:
        header = ["stay", "period_length"]
        header += ["pred_{}".format(x) for x in range(1, n_tasks + 1)]
        header += ["label_{}".format(x) for x in range(1, n_tasks + 1)]
        header = ",".join(header)
        f.write(header + '\n')
        for name, t, pred, y in zip(names, ts, predictions, labels):
            line = [name]
            line += ["{:.6f}".format(t)]
            line += ["{:.6f}".format(a) for a in pred]
            line += [str(a) for a in y]
            line = ",".join(line)
            f.write(line + '\n')
def save_results(names, pred, y_true, path, stochastic=False):
    common_utils.create_directory(os.path.dirname(path))
    with open(path, 'w') as f:
        if stochastic:
            aleatoric = np.mean(pred * (1. - pred), axis=1)
            epistemic = np.var(pred, axis=1)
            pred = np.mean(pred, axis=1)
            uncertainty = aleatoric + epistemic
            f.write("stay,prediction,y_true,aleatoric,epistemic,uncertainty\n")
            for (name, x, y, a, e, u) in zip(names, pred, y_true,
                                             aleatoric, epistemic, uncertainty):
                f.write("{},{:.6f},{},{:.6f},{:.6f},{:.6f}\n".format(
                    name, x, y, a, e, u))
        else:
            f.write("stay,prediction,y_true\n")
            for (name, x, y) in zip(names, pred, y_true):
                f.write("{},{:.6f},{}\n".format(name, x, y))
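# The stochastic branch above decomposes predictive uncertainty over a set of
# Monte-Carlo forward passes: aleatoric = E[p(1-p)] (noise inherent to the
# data) and epistemic = Var[p] (disagreement between passes), summed into a
# total. A self-contained sketch of that arithmetic on toy predictions:

def _demo_uncertainty_decomposition():
    # pred: (n_samples, n_mc_passes) probabilities from repeated stochastic passes.
    pred = np.array([[0.90, 0.80, 0.85],   # consistent -> low epistemic
                     [0.50, 0.20, 0.80]])  # inconsistent -> high epistemic
    aleatoric = np.mean(pred * (1. - pred), axis=1)  # mean Bernoulli variance
    epistemic = np.var(pred, axis=1)                 # spread across passes
    total = aleatoric + epistemic
    print(aleatoric, epistemic, total)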
def save_results(names, ts, pred, y_true, path, aleatoric=None, epistemic=None):
    common_utils.create_directory(os.path.dirname(path))
    with open(path, 'w') as f:
        if aleatoric is not None and epistemic is not None:
            f.write("stay,period_length,prediction,y_true,epistemic,aleatoric,uncertainty\n")
            for (name, t, x, y, e, a) in zip(names, ts, pred, y_true,
                                             epistemic, aleatoric):
                f.write("{},{:.6f},{:.6f},{:.6f},{:.6f},{:.6f},{:.6f}\n".format(
                    name, t, x, y, e, a, e + a))
        else:
            f.write("stay,period_length,prediction,y_true\n")
            for (name, t, x, y) in zip(names, ts, pred, y_true):
                f.write("{},{:.6f},{:.6f},{:.6f}\n".format(name, t, x, y))
def save_results(names, ts, predictions, labels, path, stochastic=False):
    n_tasks = 25
    common_utils.create_directory(os.path.dirname(path))
    with open(path, 'w') as f:
        header = ["stay", "period_length"]
        header += ["pred_{}".format(x) for x in range(1, n_tasks + 1)]
        header += ["label_{}".format(x) for x in range(1, n_tasks + 1)]
        if stochastic:
            header += ["epistemic_{}".format(x) for x in range(1, n_tasks + 1)]
            header += ["aleatoric_{}".format(x) for x in range(1, n_tasks + 1)]
        header = ",".join(header)
        f.write(header + '\n')
        if stochastic:
            epistemic = np.var(predictions, axis=1)
            aleatoric = np.mean(predictions * (1. - predictions), axis=1)
            predictions = np.mean(predictions, axis=1)
            for name, t, pred, y, epis, alea in zip(names, ts, predictions,
                                                    labels, epistemic, aleatoric):
                line = [name]
                line += ["{:.6f}".format(t)]
                line += ["{:.6f}".format(a) for a in pred]
                line += [str(a) for a in y]
                line += ["{:.6f}".format(e) for e in epis]
                line += ["{:.6f}".format(a) for a in alea]
                line = ",".join(line)
                f.write(line + '\n')
        else:
            for name, t, pred, y in zip(names, ts, predictions, labels):
                line = [name]
                line += ["{:.6f}".format(t)]
                line += ["{:.6f}".format(a) for a in pred]
                line += [str(a) for a in y]
                line = ",".join(line)
                f.write(line + '\n')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len', 'mean_and_sd'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative to which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    # read_and_extract_features removes some highly implausible values
    # according to plausible_values.json
    print('Remove implausible values ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    # print('Imputing missing values ...')
    # imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
    #                   verbose=0, copy=True)
    # imputer.fit(train_X)
    # train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    # val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    # test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Imputing missing values with -1.')
    # Verified that all values are greater or equal than zero via np.nanmin()
    train_X[np.isnan(train_X)] = -1.
    val_X[np.isnan(val_X)] = -1.
    test_X[np.isnan(test_X)] = -1.

    train_X = np.array(train_X, dtype=np.float32)
    val_X = np.array(val_X, dtype=np.float32)
    test_X = np.array(test_X, dtype=np.float32)

    # print('Normalizing the data to have zero mean and unit variance ...')
    # scaler = StandardScaler()
    # scaler.fit(train_X)
    # train_X = scaler.transform(train_X)
    # val_X = scaler.transform(val_X)
    # test_X = scaler.transform(test_X)

    print('Export features along with target as csv files ...')
    train_file = os.path.join(args.output_dir, 'in-hospital-mortality-train.csv')
    val_file = os.path.join(args.output_dir, 'in-hospital-mortality-val.csv')
    test_file = os.path.join(args.output_dir, 'in-hospital-mortality-test.csv')
    # Note: the exported files are tab-delimited despite the .csv extension.
    np.savetxt(train_file,
               np.concatenate((train_X, np.array([train_y]).T), axis=1),
               delimiter='\t')
    np.savetxt(val_file,
               np.concatenate((val_X, np.array([val_y]).T), axis=1),
               delimiter='\t')
    np.savetxt(test_file,
               np.concatenate((test_X, np.array([test_y]).T), axis=1),
               delimiter='\t')

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
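# read_and_extract_features is called throughout these snippets but never
# shown. A minimal sketch of what it is assumed to do, mirroring the
# mimic3-benchmarks helpers (common_utils.read_chunk and
# common_utils.extract_features_from_rawdata are assumed names):

def _read_and_extract_features_sketch(reader, period, features):
    # Read every example from the reader into memory ...
    ret = common_utils.read_chunk(reader, reader.get_number_of_examples())
    # ... then collapse each variable's time series into summary statistics
    # (e.g. min/max/mean/std/skew/count over several sub-periods).
    X = common_utils.extract_features_from_rawdata(ret['X'], ret['header'],
                                                   period, features)
    return (X, ret['y'], ret['name'])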
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.00001]

    train_reader = LengthOfStayReader(
        dataset_dir='../../../data/length-of-stay/train/',
        listfile='../../../data/length-of-stay/train_listfile.csv')
    val_reader = LengthOfStayReader(
        dataset_dir='../../../data/length-of-stay/train/',
        listfile='../../../data/length-of-stay/val_listfile.csv')
    test_reader = LengthOfStayReader(
        dataset_dir='../../../data/length-of-stay/test/',
        listfile='../../../data/length-of-stay/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_actual, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)
    (val_X, val_y, val_actual, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)
    (test_X, test_y, test_actual, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print("train set shape: {}".format(train_X.shape))
    print("validation set shape: {}".format(val_X.shape))
    print("test set shape: {}".format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    common_utils.create_directory('cf_results')

    for (penalty, C) in zip(penalties, Cs):
        model_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        train_activations = np.zeros(shape=train_y.shape, dtype=float)
        val_activations = np.zeros(shape=val_y.shape, dtype=float)
        test_activations = np.zeros(shape=test_y.shape, dtype=float)

        # n_bins is assumed to be a module-level constant: the number of
        # custom length-of-stay bins (10 in the benchmark setup). One binary
        # classifier is trained per bin.
        for task_id in range(n_bins):
            logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
            logreg.fit(train_X, train_y[:, task_id])

            train_preds = logreg.predict_proba(train_X)
            train_activations[:, task_id] = train_preds[:, 1]

            val_preds = logreg.predict_proba(val_X)
            val_activations[:, task_id] = val_preds[:, 1]

            test_preds = logreg.predict_proba(test_X)
            test_activations[:, task_id] = test_preds[:, 1]

        train_predictions = np.array(
            [metrics.get_estimate_custom(x, n_bins) for x in train_activations])
        val_predictions = np.array(
            [metrics.get_estimate_custom(x, n_bins) for x in val_activations])
        test_predictions = np.array(
            [metrics.get_estimate_custom(x, n_bins) for x in test_activations])

        with open(os.path.join('cf_results', 'train_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(train_actual, train_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        with open(os.path.join('cf_results', 'val_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(val_actual, val_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        with open(os.path.join('cf_results', 'test_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_custom_bins(test_actual, test_predictions)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, f)

        save_results(test_names, test_ts, test_predictions, test_actual,
                     os.path.join('cf_predictions', model_name + '.csv'))
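# metrics.get_estimate_custom is assumed to map the per-bin activations back
# to a scalar length-of-stay estimate. A plausible sketch only, assuming each
# bin has a precomputed mean LOS (bin_means below is a hypothetical argument;
# the real function presumably looks these values up internally):

def _get_estimate_custom_sketch(prediction, n_bins, bin_means):
    # Pick the most probable bin and return that bin's mean length of stay.
    bin_id = int(np.argmax(prediction))
    assert 0 <= bin_id < n_bins
    return bin_means[bin_id]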
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative to which all output files are stored',
                        default='.')
    parser.add_argument('--generate-data-only', dest='generate_data_only',
                        action='store_true')
    parser.set_defaults(generate_data_only=False)
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    if args.generate_data_only:
        data_path = os.path.join(args.output_dir, "mimic3_benchmark_data_logistic.csv")
        # DataFrame.append is deprecated in modern pandas; pd.concat is the
        # drop-in replacement.
        dataset = create_frame(train_X, train_y).append(
            create_frame(test_X, test_y)).append(create_frame(val_X, val_y))
        dataset.to_csv(data_path)
        print("Generated and saved the data at: %s" % data_path)
        return

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
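# create_frame is not shown in these snippets. A minimal sketch of what it is
# assumed to do: wrap a feature matrix and its labels in one pandas DataFrame
# so the train/val/test splits can be appended and dumped to a single CSV.
# The 'y_true' column name is a hypothetical placeholder.

import pandas as pd

def _create_frame_sketch(X, y):
    df = pd.DataFrame(X)
    df['y_true'] = y
    return df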
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.001]

    train_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/train_listfile.csv')
    val_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/val_listfile.csv')
    test_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/test/',
        listfile='../../../data/decompensation/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)
    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)
    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    common_utils.create_directory('results')

    for (penalty, C) in zip(penalties, Cs):
        file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
        logreg.fit(train_X, train_y)

        with open(os.path.join('results', 'train_{}.json'.format(file_name)), 'w') as res_file:
            ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
            ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        prediction = logreg.predict_proba(test_X)[:, 1]

        with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
            ret = print_metrics_binary(test_y, prediction)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        save_results(test_names, test_ts, prediction, test_y,
                     os.path.join('predictions', file_name + '.csv'))
def save_results(names, pred, y_true, path):
    common_utils.create_directory(os.path.dirname(path))
    with open(path, 'w') as f:
        f.write("stay,prediction,y_true\n")
        for (name, x, y) in zip(names, pred, y_true):
            f.write("{},{:.6f},{}\n".format(name, x, y))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l2']
    Cs = [0.001]

    train_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/train_listfile.csv')
    val_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/train/',
        listfile='../../../data/decompensation/val_listfile.csv')
    test_reader = DecompensationReader(
        dataset_dir='../../../data/decompensation/test/',
        listfile='../../../data/decompensation/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)
    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)
    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    common_utils.create_directory('results')

    for (penalty, C) in zip(penalties, Cs):
        file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
        logreg.fit(train_X, train_y)

        with open(os.path.join('results', 'train_{}.json'.format(file_name)), 'w') as res_file:
            ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
            ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        prediction = logreg.predict_proba(test_X)[:, 1]

        with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
            ret = print_metrics_binary(test_y, prediction)
            ret = {k: float(v) for k, v in ret.items()}
            json.dump(ret, res_file)

        save_results(test_names, test_ts, prediction, test_y,
                     os.path.join('predictions', file_name + '.csv'))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of length-of-stay task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/length-of-stay/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative to which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'))
    val_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'))
    test_reader = LengthOfStayReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'))

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)
    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)
    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)
    print(train_X.shape)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    file_name = "{}.{}".format(args.period, args.features)

    linreg = LinearRegression()
    linreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(train_y, linreg.predict(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(val_y, linreg.predict(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = linreg.predict(test_X)

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, test_ts, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
test_X = scaler.transform(features_test)
test_X = [np.hstack([X, d]) for (X, d) in zip(test_X, diseases_embedding_test)]
test_X = [np.hstack([X, d]) for (X, d) in zip(test_X, idx_features_test)]
test_X = [np.hstack([X, d]) for (X, d) in zip(test_X, mask_test)]

# ========= SVM ====================
penalty = 'l2'
# file_name = '{}.{}.{}.C{}'.format(penalty, 0.001)
logreg = SVC(probability=True)  # despite the name, this is an SVM classifier
logreg.fit(train_X, train_y)

# -----------------
common_utils.create_directory('svm_results')
common_utils.create_directory('svm_predictions')

with open(os.path.join('svm_results', 'train.json'), 'w') as res_file:
    ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
    ret = {k: float(v) for k, v in ret.items()}
    json.dump(ret, res_file)

with open(os.path.join('svm_results', 'val.json'), 'w') as res_file:
    ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
    ret = {k: float(v) for k, v in ret.items()}
    json.dump(ret, res_file)

prediction = logreg.predict_proba(test_X)[:, 1]

with open(os.path.join('svm_results', 'test.json'), 'w') as res_file:
    # The original snippet is truncated here; the body follows the pattern of
    # the train/val blocks above.
    ret = print_metrics_binary(test_y, prediction)
    ret = {k: float(v) for k, v in ret.items()}
    json.dump(ret, res_file)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--grid-search', dest='grid_search', action='store_true')
    parser.add_argument('--no-grid-search', dest='grid_search', action='store_false')
    parser.set_defaults(grid_search=False)
    parser.add_argument('--data', type=str,
                        help='Path to the data of phenotyping task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/phenotyping/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative to which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    if args.grid_search:
        penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2',
                     'l1', 'l1', 'l1', 'l1', 'l1']
        coefs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001,
                 1.0, 0.1, 0.01, 0.001, 0.0001]
    else:
        penalties = ['l1']
        coefs = [0.1]

    train_reader = PhenotypingReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'))
    val_reader = PhenotypingReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'))
    test_reader = PhenotypingReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'))

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, args.period, args.features)
    train_y = np.array(train_y)
    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, args.period, args.features)
    val_y = np.array(val_y)
    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, args.period, args.features)
    test_y = np.array(test_y)

    print("train set shape: {}".format(train_X.shape))
    print("validation set shape: {}".format(val_X.shape))
    print("test set shape: {}".format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    n_tasks = 25

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    for (penalty, C) in zip(penalties, coefs):
        model_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        train_activations = np.zeros(shape=train_y.shape, dtype=float)
        val_activations = np.zeros(shape=val_y.shape, dtype=float)
        test_activations = np.zeros(shape=test_y.shape, dtype=float)

        # Train one binary classifier per phenotype (one-vs-rest).
        for task_id in range(n_tasks):
            print('Starting task {}'.format(task_id))

            logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
            logreg.fit(train_X, train_y[:, task_id])

            train_preds = logreg.predict_proba(train_X)
            train_activations[:, task_id] = train_preds[:, 1]

            val_preds = logreg.predict_proba(val_X)
            val_activations[:, task_id] = val_preds[:, 1]

            test_preds = logreg.predict_proba(test_X)
            test_activations[:, task_id] = test_preds[:, 1]

        with open(os.path.join(result_dir, 'train_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_multilabel(train_y, train_activations)
            ret = {k: float(v) for k, v in ret.items() if k != 'auc_scores'}
            json.dump(ret, f)

        with open(os.path.join(result_dir, 'val_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_multilabel(val_y, val_activations)
            ret = {k: float(v) for k, v in ret.items() if k != 'auc_scores'}
            json.dump(ret, f)

        with open(os.path.join(result_dir, 'test_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_multilabel(test_y, test_activations)
            ret = {k: float(v) for k, v in ret.items() if k != 'auc_scores'}
            json.dump(ret, f)

        save_results(test_names, test_ts, test_activations, test_y,
                     os.path.join(args.output_dir, 'predictions', model_name + '.csv'))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--method', type=str, default='logistic',
                        choices=['gridsearch', 'lgbm', 'logistic'])
    args = parser.parse_args()
    print(args)

    import os, pickle

    # Cache the extracted features so repeated runs skip the expensive
    # read/impute/normalize pipeline.
    data_cache = '../../../data/in-hospital-mortality/lr_cache.pickle'
    if os.path.exists(data_cache):
        print('Loading data cache ...')
        with open(data_cache, 'rb') as f:
            (train_X, train_y, train_names), \
                (val_X, val_y, val_names), \
                (test_X, test_y, test_names) = pickle.load(f)
    else:
        train_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/train_listfile.csv',
            period_length=48.0)
        val_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/train/',
            listfile='../../../data/in-hospital-mortality/val_listfile.csv',
            period_length=48.0)
        test_reader = InHospitalMortalityReader(
            dataset_dir='../../../data/in-hospital-mortality/test/',
            listfile='../../../data/in-hospital-mortality/test_listfile.csv',
            period_length=48.0)

        print('Reading data and extracting features ...')
        (train_X, train_y, train_names) = read_and_extract_features(
            train_reader, args.period, args.features)
        (val_X, val_y, val_names) = read_and_extract_features(
            val_reader, args.period, args.features)
        (test_X, test_y, test_names) = read_and_extract_features(
            test_reader, args.period, args.features)
        print('  train data shape = {}'.format(train_X.shape))
        print('  validation data shape = {}'.format(val_X.shape))
        print('  test data shape = {}'.format(test_X.shape))

        print('Imputing missing values ...')
        imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                          verbose=0, copy=True)
        imputer.fit(train_X)
        train_X = np.array(imputer.transform(train_X), dtype=np.float32)
        val_X = np.array(imputer.transform(val_X), dtype=np.float32)
        test_X = np.array(imputer.transform(test_X), dtype=np.float32)

        print('Normalizing the data to have zero mean and unit variance ...')
        scaler = StandardScaler()
        scaler.fit(train_X)
        train_X = scaler.transform(train_X)
        val_X = scaler.transform(val_X)
        test_X = scaler.transform(test_X)

        with open(data_cache, 'wb') as f:
            pickle.dump([(train_X, train_y, train_names),
                         (val_X, val_y, val_names),
                         (test_X, test_y, test_names)],
                        f, pickle.HIGHEST_PROTOCOL)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    print("use {} to fit".format(args.method))
    if args.method == "gridsearch":
        param_test1 = {'n_estimators': range(10, 200, 20)}
        gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(),
                                param_grid=param_test1)
        gsearch1.fit(train_X, train_y)
        print("gridsearch best result: ",
              gsearch1.best_params_, gsearch1.best_score_)
        logreg = GradientBoostingClassifier(
            n_estimators=gsearch1.best_params_['n_estimators'])
    elif args.method == "lgbm":
        logreg = lgb.LGBMClassifier(objective='binary', num_leaves=31,
                                    learning_rate=0.05, n_estimators=20)
    elif args.method == "logistic":
        logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)

    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data', type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(os.path.dirname(__file__),
                                             '../../../data/in-hospital-mortality/'))
    parser.add_argument('--output_dir', type=str,
                        help='Directory relative to which all output files are stored',
                        default='.')
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'train_listfile.csv'),
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'train'),
        listfile=os.path.join(args.data, 'val_listfile.csv'),
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir=os.path.join(args.data, 'test'),
        listfile=os.path.join(args.data, 'test_listfile.csv'),
        period_length=48.0)

    # Extract feature names
    if args.features == "all" and args.period == "all":
        reader = InHospitalMortalityReader(
            dataset_dir=os.path.join(args.data, 'train'),
            listfile=os.path.join(args.data, 'train_listfile.csv'),
            period_length=48.0)
        feature_names = []
        header = reader.read_next()["header"]
        for item in header[1:]:  # First item is 'hours'
            for sub_period in ["full-series", "first-10%", "first-25%",
                               "first-50%", "last-10%", "last-25%",
                               "last-50%"]:
                for function in ["min", "max", "mean", "std", "skew", "count"]:
                    feature_names.append(f"{item}->{sub_period}->{function}")
        with open(os.path.join(args.output_dir, "feature_names.pkl"),
                  "wb") as feature_names_file:
            pickle.dump(feature_names, feature_names_file)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    print('Writing data ...')
    data_dir = os.path.join(args.output_dir, 'data')
    common_utils.create_directory(data_dir)
    common_utils.write_data(data_dir, train_X, val_X, test_X,
                            train_y, val_y, test_y)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join(args.output_dir, 'predictions', file_name + '.csv'))

    joblib.dump(logreg, os.path.join(args.output_dir, "lr.joblib"))  # Save model

    # Generate ranked list of features
    if args.features == "all" and args.period == "all":
        # 714 = 17 variables x 7 sub-periods x 6 statistics
        coefs = logreg.coef_.reshape((714,))
        features = list(zip(feature_names, coefs))
        ranked = sorted(features, key=lambda pair: abs(pair[1]), reverse=True)
        with open(os.path.join(args.output_dir, "ranked_features.csv"),
                  "w") as ranked_features_file:
            writer = csv.writer(ranked_features_file)
            _ = writer.writerow(("Feature Name", "Coefficient Magnitude"))
            for pair in ranked:
                _ = writer.writerow(pair)
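# Quick way to inspect the ranked-features output written above (pandas usage
# sketch; the file name and columns come from the snippet, nothing else is
# assumed):
#
#   import pandas as pd
#   ranked = pd.read_csv("ranked_features.csv")
#   print(ranked.head(10))  # ten features with the largest |coefficient|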
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/test/',
        listfile='../../../data/in-hospital-mortality/test_listfile.csv',
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = LengthOfStayReader(
        dataset_dir='../../../data/length-of-stay/train/',
        listfile='../../../data/length-of-stay/train_listfile.csv')
    val_reader = LengthOfStayReader(
        dataset_dir='../../../data/length-of-stay/train/',
        listfile='../../../data/length-of-stay/val_listfile.csv')
    test_reader = LengthOfStayReader(
        dataset_dir='../../../data/length-of-stay/test/',
        listfile='../../../data/length-of-stay/test_listfile.csv')

    print('Reading data and extracting features ...')
    n_train = min(100000, train_reader.get_number_of_examples())
    n_val = min(100000, val_reader.get_number_of_examples())

    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, n_train, args.period, args.features)
    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, n_val, args.period, args.features)
    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, test_reader.get_number_of_examples(), args.period, args.features)

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    file_name = "{}.{}".format(args.period, args.features)

    linreg = LinearRegression()
    linreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(train_y, linreg.predict(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(val_y, linreg.predict(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = linreg.predict(test_X)

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_regression(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, test_ts, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
def save_results(names, ts, pred, y_true, path):
    # Create the parent directory of the output file, not a directory at the
    # file path itself (the original passed `path`, which would make the
    # subsequent open() fail).
    common_utils.create_directory(os.path.dirname(path))
    with open(path, 'w') as f:
        f.write("stay,period_length,prediction,y_true\n")
        for (name, t, x, y) in zip(names, ts, pred, y_true):
            f.write("{},{:.6f},{:.6f},{:.6f}\n".format(name, t, x, y))
    # Excerpt from a phenotyping main(); args, penalties, coefs, and the data
    # splits are defined earlier in the enclosing function.
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    n_tasks = 25

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    for (penalty, C) in zip(penalties, coefs):
        model_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        train_activations = np.zeros(shape=train_y.shape, dtype=float)
        val_activations = np.zeros(shape=val_y.shape, dtype=float)
        test_activations = np.zeros(shape=test_y.shape, dtype=float)

        for task_id in range(n_tasks):
            print('Starting task {}'.format(task_id))

            logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
            logreg.fit(train_X, train_y[:, task_id])

            train_preds = logreg.predict_proba(train_X)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    train_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/train_listfile.csv',
        period_length=48.0)
    val_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/train/',
        listfile='../../../data/in-hospital-mortality/val_listfile.csv',
        period_length=48.0)
    test_reader = InHospitalMortalityReader(
        dataset_dir='../../../data/in-hospital-mortality/test/',
        listfile='../../../data/in-hospital-mortality/test_listfile.csv',
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names) = read_and_extract_features(
        train_reader, args.period, args.features)
    (val_X, val_y, val_names) = read_and_extract_features(
        val_reader, args.period, args.features)
    (test_X, test_y, test_names) = read_and_extract_features(
        test_reader, args.period, args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = 'l2' if args.l2 else 'l1'
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C)

    logreg = LogisticRegression(penalty=penalty, C=args.C, random_state=42)
    logreg.fit(train_X, train_y)

    common_utils.create_directory('results')

    with open(os.path.join('results', 'train_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(train_y, logreg.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join('results', 'val_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(val_y, logreg.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = logreg.predict_proba(test_X)[:, 1]

    with open(os.path.join('results', 'test_{}.json'.format(file_name)), 'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(test_names, prediction, test_y,
                 os.path.join('predictions', file_name + '.csv'))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--period', type=str, default='all',
                        help='specifies which period to extract features from',
                        choices=['first4days', 'first8days', 'last12hours',
                                 'first25percent', 'first50percent', 'all'])
    parser.add_argument('--features', type=str, default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    args = parser.parse_args()
    print(args)

    # penalties = ['l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1', 'l1', 'l1', 'l1']
    # Cs = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 1.0, 0.1, 0.01, 0.001, 0.0001]
    penalties = ['l1']
    Cs = [0.1]

    train_reader = PhenotypingReader(
        dataset_dir='../../../data/phenotyping/train/',
        listfile='../../../data/phenotyping/train_listfile.csv')
    val_reader = PhenotypingReader(
        dataset_dir='../../../data/phenotyping/train/',
        listfile='../../../data/phenotyping/val_listfile.csv')
    test_reader = PhenotypingReader(
        dataset_dir='../../../data/phenotyping/test/',
        listfile='../../../data/phenotyping/test_listfile.csv')

    print('Reading data and extracting features ...')
    (train_X, train_y, train_names, train_ts) = read_and_extract_features(
        train_reader, args.period, args.features)
    train_y = np.array(train_y)
    (val_X, val_y, val_names, val_ts) = read_and_extract_features(
        val_reader, args.period, args.features)
    val_y = np.array(val_y)
    (test_X, test_y, test_names, test_ts) = read_and_extract_features(
        test_reader, args.period, args.features)
    test_y = np.array(test_y)

    print("train set shape: {}".format(train_X.shape))
    print("validation set shape: {}".format(val_X.shape))
    print("test set shape: {}".format(test_X.shape))

    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0,
                      verbose=0, copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    n_tasks = 25

    common_utils.create_directory('results')

    for (penalty, C) in zip(penalties, Cs):
        model_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, C)

        train_activations = np.zeros(shape=train_y.shape, dtype=float)
        val_activations = np.zeros(shape=val_y.shape, dtype=float)
        test_activations = np.zeros(shape=test_y.shape, dtype=float)

        for task_id in range(n_tasks):
            print('Starting task {}'.format(task_id))

            logreg = LogisticRegression(penalty=penalty, C=C, random_state=42)
            logreg.fit(train_X, train_y[:, task_id])

            train_preds = logreg.predict_proba(train_X)
            train_activations[:, task_id] = train_preds[:, 1]

            val_preds = logreg.predict_proba(val_X)
            val_activations[:, task_id] = val_preds[:, 1]

            test_preds = logreg.predict_proba(test_X)
            test_activations[:, task_id] = test_preds[:, 1]

        with open(os.path.join('results', 'train_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_multilabel(train_y, train_activations)
            ret = {k: float(v) for k, v in ret.items() if k != 'auc_scores'}
            json.dump(ret, f)

        with open(os.path.join('results', 'val_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_multilabel(val_y, val_activations)
            ret = {k: float(v) for k, v in ret.items() if k != 'auc_scores'}
            json.dump(ret, f)

        with open(os.path.join('results', 'test_{}.json'.format(model_name)), 'w') as f:
            ret = metrics.print_metrics_multilabel(test_y, test_activations)
            ret = {k: float(v) for k, v in ret.items() if k != 'auc_scores'}
            json.dump(ret, f)

        save_results(test_names, test_ts, test_activations, test_y,
                     os.path.join('predictions', model_name + '.csv'))