def main(args):
    np.random.seed(args.seed)

    # CPU-only instead of GPU
    if args.cpu_only:
        logging.info('Setting env for CPU-only mode...')
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"    # see issue #152
        os.environ["CUDA_VISIBLE_DEVICES"] = '-1'

    # Load and process data
    ## metric objects needed to deserialize the model
    recall_0 = Utils.class_recall(0)
    recall_1 = Utils.class_recall(1)
    custom_obj = {'metr': recall_0}

    logging.info('Loading model...')
    ## pkl (training-set mean/std)
    logging.info('  Loading mstd...')
    F = os.path.join(args.model_path, args.mstd_name)
    if not os.path.exists(F):
        msg = 'Model file not available at data-path: {}'
        raise IOError(msg.format(F))
    with open(F, 'rb') as mstd:
        mean_tr, std_tr = pickle.load(mstd)
    ## h5 (trained network)
    logging.info('  Loading h5...')
    F = os.path.join(args.model_path, args.model_name)
    if not os.path.exists(F):
        msg = 'Model file not available at data-path: {}'
        raise IOError(msg.format(F))
    model = load_model(F, custom_objects=custom_obj)

    # outdir
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    logging.info('Loading features...')
    x, y, i2n = Utils.load_features_nogt(args.feature_file_table,
                                         force_overwrite=args.force_overwrite,
                                         pickle_only=args.pickle_only,
                                         n_procs=args.n_procs)
    logging.info('Loaded {} contigs'.format(len(set(i2n.values()))))
    n2i = Utils.reverse_dict(i2n)
    x = [xi for xmeta in x for xi in xmeta]
    y = np.concatenate(y)

    logging.info('Running model generator...')
    dataGen = Models.Generator(x, y, batch_size=64, shuffle=False,
                               norm_raw=0, mean_tr=mean_tr, std_tr=std_tr)

    logging.info('Computing predictions...')
    scores = Utils.compute_predictions(n2i, dataGen, model,
                                       args.save_path, args.save_name)
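# `Utils.class_recall` is not shown in this listing. A minimal sketch of what a
# per-class recall metric of this kind looks like in Keras (an assumption, not
# the actual Utils code; the inner name `metr` matches the `custom_obj` key above):
from keras import backend as K

def class_recall(label):
    def metr(y_true, y_pred):
        # 1 where the true label equals the class of interest, else 0
        is_label = K.cast(K.equal(K.cast(y_true, 'int32'), label), 'int32')
        # 1 where the rounded prediction matches the true label, masked to this class
        hits = K.cast(K.equal(K.cast(K.round(y_pred), 'int32'),
                              K.cast(y_true, 'int32')), 'int32') * is_label
        # recall = correct predictions for this class / instances of this class
        return K.sum(hits) / K.maximum(K.sum(is_label), 1)
    return metr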
def __init__(self, config):
    self.max_len = config.max_len
    self.filters = config.filters
    self.n_conv = config.n_conv
    self.n_features = config.n_features
    self.pool_window = config.pool_window
    self.dropout = config.dropout
    self.lr_init = config.lr_init
    self.n_fc = config.n_fc
    self.n_hid = config.n_hid

    self.net = Sequential()
    # first conv layer spans the full feature axis
    self.net.add(Conv2D(self.filters, kernel_size=(2, self.n_features),
                        input_shape=(self.max_len, self.n_features, 1),
                        activation='relu', padding='valid'))
    self.net.add(BatchNormalization(axis=-1))
    # deeper conv layers double the filter count at each step
    for i in range(1, self.n_conv):
        self.net.add(Conv2D(2**i * self.filters, kernel_size=(2, 1), strides=2,
                            input_shape=(self.max_len, 1, 2**(i - 1) * self.filters),
                            activation='relu'))
        self.net.add(BatchNormalization(axis=-1))
    self.net.add(AveragePooling2D((self.pool_window, 1)))
    self.net.add(Flatten())

    optimizer = keras.optimizers.Adam(lr=self.lr_init)

    # binary classification head
    for _ in range(self.n_fc - 1):
        self.net.add(Dense(self.n_hid, activation='relu'))
        self.net.add(Dropout(rate=self.dropout))
    self.net.add(Dense(1, activation='sigmoid'))
    self.net.add(Dropout(rate=self.dropout))

    recall_0 = Utils.class_recall(0)
    recall_1 = Utils.class_recall(1)
    self.net.compile(loss='binary_crossentropy',
                     optimizer=optimizer,
                     metrics=[recall_0, recall_1])

    self.reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=5,
        min_lr=0.01 * self.lr_init)
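# A minimal usage sketch of the class above, assuming this __init__ belongs to
# `Models.deepmased` as used elsewhere in this listing (the field values are
# hypothetical; the real `Config` is built from the command-line args):
from types import SimpleNamespace

cfg = SimpleNamespace(max_len=10000, filters=8, n_conv=2, n_features=28,
                      pool_window=40, dropout=0.5, lr_init=0.001,
                      n_fc=2, n_hid=50)
model = deepmased(cfg)      # builds and compiles self.net as shown above
model.net.summary()         # input tensor: (batch, 10000, 28, 1)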
def __init__(self, x, y, max_len=10000, batch_size=32, shuffle=True,
             norm_raw=True, mean_tr=None, std_tr=None):
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.max_len = max_len
    self.x = x
    self.y = y
    self.n_feat = x[0].shape[1]

    if mean_tr is None:
        # compute normalization stats from this data
        mean, std = Utils.compute_mean_std(self.x)
        self.mean = mean
        self.std = std
        if not norm_raw:
            # leave the first four ('raw') features unnormalized
            self.mean[0:4] = 0
            self.std[0:4] = 1
    else:
        # reuse stats from the training set
        self.mean = mean_tr
        self.std = std_tr

    # Shuffle data
    self.indices = np.arange(len(x))
    if self.shuffle:
        np.random.shuffle(self.indices)
    self.on_epoch_end()
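# `Generator` is consumed by `fit_generator` elsewhere in this listing, so it is
# presumably a `keras.utils.Sequence`. A sketch of the companion hooks such a
# Sequence defines (an assumption -- the real bodies are not shown here):
#     def __len__(self):
#         # number of batches per epoch
#         return int(np.ceil(len(self.indices) / self.batch_size))
#
#     def on_epoch_end(self):
#         # reshuffle the sample order between epochs
#         if self.shuffle:
#             np.random.shuffle(self.indices)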
def main(args):
    np.random.seed(12)

    # Build model
    config = Config(args)
    deepmased = Models.deepmased(config)
    deepmased.print_summary()

    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    save_path = args.save_path

    # Load and process data
    logging.info('Loading data...')
    x, y = Utils.load_features_tr(args.data_path,
                                  max_len=args.max_len,
                                  standard=args.standard,
                                  mode=config.mode,
                                  pickle_only=args.pickle_only)

    if args.n_folds == -1:
        # Pool all elements of x and y
        x = [item for sl in x for item in sl]
        y = np.concatenate(y)

    if args.n_folds > -1:
        # kfold cross-validation
        if os.path.exists(os.path.join(save_path, str(args.n_folds - 1) + '_model.h5')):
            exit()
        ap_scores = []
        for val_idx in range(args.n_folds):
            x_tr, x_val, y_tr, y_val = Utils.kfold(x, y, val_idx, k=args.n_folds)
            deepmased = Models.deepmased(config)
            # Construct training generator
            dataGen = Models.Generator(x_tr, y_tr, args.max_len, batch_size=64,
                                       norm_raw=bool(args.norm_raw))
            # Init validation generator (reusing the training mean/std)
            dataGen_val = Models.Generator(x_val, y_val, args.max_len,
                                           batch_size=64, shuffle=False,
                                           norm_raw=bool(args.norm_raw),
                                           mean_tr=dataGen.mean, std_tr=dataGen.std)
            # Train model
            tb_logs = keras.callbacks.TensorBoard(
                log_dir=os.path.join(save_path, 'logs'),
                histogram_freq=0, write_graph=True, write_images=True)
            logging.info('Training network...')
            if config.mode in ['chimera', 'extensive']:
                # up-weight the (rare) misassembled class
                w_one = int(len(np.where(y_tr == 0)[0]) / len(np.where(y_tr == 1)[0]))
                class_weight = {0: 1, 1: w_one}
                deepmased.net.fit_generator(generator=dataGen,
                                            validation_data=dataGen_val,
                                            epochs=args.n_epochs,
                                            use_multiprocessing=True,
                                            verbose=2,
                                            class_weight=class_weight,
                                            callbacks=[tb_logs, deepmased.reduce_lr])
            elif config.mode == 'edit':
                # regression on standardized edit distances
                st = StandardScaler()
                y_tr = st.fit_transform(y_tr)
                y_val = st.transform(y_val)
                deepmased.net.fit(x_tr, y_tr,
                                  validation_data=(x_val, y_val),
                                  epochs=args.n_epochs,
                                  callbacks=[tb_logs, deepmased.reduce_lr])

            logging.info('Computing AUC scores...')
            scores_val = deepmased.predict_generator(dataGen_val)
            ap_scores.append(average_precision_score(y_val[0:scores_val.size], scores_val))

            deepmased.save(os.path.join(save_path, str(val_idx) + '_model.h5'))

        with open(os.path.join(save_path, 'scores.pkl'), 'wb') as f:
            pickle.dump(ap_scores, f)
    else:
        # train on all pooled data
        dataGen = Models.Generator(x, y, args.max_len, batch_size=64,
                                   norm_raw=bool(args.norm_raw))
        deepmased = Models.deepmased(config)
        tb_logs = keras.callbacks.TensorBoard(
            log_dir=os.path.join(save_path, 'logs_final'),
            histogram_freq=0, write_graph=True, write_images=True)
        logging.info('Training network...')
        if config.mode in ['chimera', 'extensive']:
            w_one = int(len(np.where(y == 0)[0]) / len(np.where(y == 1)[0]))
            class_weight = {0: 1, 1: w_one}
            deepmased.net.fit_generator(generator=dataGen,
                                        epochs=args.n_epochs,
                                        use_multiprocessing=True,
                                        verbose=2,
                                        class_weight=class_weight,
                                        callbacks=[tb_logs, deepmased.reduce_lr])

        logging.info('Saving trained model...')
        outfile = os.path.join(save_path, 'final_model.h5')
        deepmased.save(outfile)
        logging.info('  File written: {}'.format(outfile))
        outfile = os.path.join(save_path, 'mean_std_final_model.pkl')
        with open(outfile, 'wb') as f:
            pickle.dump([dataGen.mean, dataGen.std], f)
        logging.info('  File written: {}'.format(outfile))
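# Worked example of the class weighting above (hypothetical counts): with
# 9000 correct contigs (y == 0) and 500 misassembled ones (y == 1),
# w_one = int(9000 / 500) = 18, so class_weight = {0: 1, 1: 18} makes each
# misassembled example count 18x in the loss.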
def main(args): """Main interface """ np.random.seed(12) save_plot = args.save_plot if save_plot is None: save_plot = args.save_path # Load and process data # Provide objective to load recall_0 = Utils.class_recall(0) recall_1 = Utils.class_recall(1) custom_obj = {'metr': recall_0} path_to_models = os.listdir(args.save_path) auc = [] for model_path in path_to_models: if not os.path.exists( (os.path.join(args.save_path, model_path, 'final_model.h5'))): continue if not os.path.exists( os.path.join(args.save_path, model_path, 'predictions')): os.makedirs(os.path.join(args.save_path, model_path, 'predictions')) if not os.path.exists( os.path.join(args.save_path, model_path, 'predictions', args.data_path.split('/')[-1])): os.makedirs( os.path.join(args.save_path, model_path, 'predictions', args.data_path.split('/')[-1])) F = os.path.join(args.save_path, model_path, 'mean_std_final_model.pkl') with open(F, 'rb') as mstd: mean_tr, std_tr = pickle.load(mstd) model = load_model(os.path.join(args.save_path, model_path, 'final_model.h5'), custom_objects=custom_obj) tech = args.technology logging.info('Loading data...') if args.is_synthetic == 1: x, y, i2n = Utils.load_features(args.data_path, max_len=args.max_len, mode=args.mode, technology=tech) else: x, y, i2n = Utils.load_features_nogt(args.data_path, max_len=args.max_len, mode=args.mode) logging.info('Loaded {} contigs...'.format(len(set(i2n.values())))) n2i = Utils.reverse_dict(i2n) x = [xi for xmeta in x for xi in xmeta] y = np.concatenate(y) dataGen = Models.Generator(x, y, args.max_len, batch_size=64, shuffle=False, norm_raw=bool(args.norm_raw), mean_tr=mean_tr, std_tr=std_tr) loggin.info('Computing predictions for {}...'.format(tech)) scores = compute_predictions(y, n2i) outfile = os.path.join(args.save_path, model_path, 'predictions', args.data_path.split('/')[-1], tech + '.pkl') with open(outfile, 'wb') as spred: pickle.dump(scores, spred) logging.info('File written: {}'.format(outfile))
def main(args): """Main interface """ # init np.random.seed(args.seed) ## where to save the plot save_plot = args.save_plot if save_plot is None: save_plot = args.save_path # Load and process data # Provide objective to load logging.info('Loading data...') recall_0 = Utils.class_recall(0) recall_1 = Utils.class_recall(1) custom_obj = {'metr': recall_0} h5_file = os.path.join(args.model_path, args.model_name) if not os.path.exists(h5_file): msg = 'Cannot find {} file in {}' raise IOError(msg.format(args.model_name, args.model_path)) logging.info('Loading model: {}'.format(h5_file)) model = load_model(h5_file, custom_objects=custom_obj) # model pkl pkl_file = os.path.join(args.model_path, args.mstd_name) logging.info('Loading file: {}'.format(pkl_file)) with open(pkl_file, 'rb') as mstd: mean_tr, std_tr = pickle.load(mstd) # loading features if args.is_synthetic == 1: logging.info('Loading synthetic features') x, y, i2n = Utils.load_features(args.feature_file_table, max_len=args.max_len, technology=args.technology, force_overwrite=args.force_overwrite, n_procs=args.n_procs) else: logging.info('Loading non-synthetic features') x, y, i2n = Utils.load_features_nogt( args.feature_file_table, max_len=args.max_len, force_overwrite=args.force_overwrite, n_procs=args.n_procs) logging.info('Loaded {} contigs'.format(len(set(i2n.values())))) n2i = Utils.reverse_dict(i2n) x = [xi for xmeta in x for xi in xmeta] y = np.concatenate(y) logging.info('Running model generator...') dataGen = Models.Generator(x, y, args.max_len, batch_size=64, shuffle=False, norm_raw=bool(args.norm_raw), mean_tr=mean_tr, std_tr=std_tr) logging.info('Computing predictions for {}...'.format(args.technology)) scores = Utils.compute_predictions_y_known(y, n2i, model, dataGen) outfile = os.path.join( args.save_path, '_'.join([args.save_name, args.technology + '.pkl'])) with open(outfile, 'wb') as spred: pickle.dump(scores, spred) logging.info('File written: {}'.format(outfile))
def main(args):
    # init
    np.random.seed(args.seed)
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    save_path = args.save_path

    # Build model
    config = Config(args)
    if not args.pickle_only:
        logging.info('Building model')
        deepmased = Models.deepmased(config)
        deepmased.print_summary()

    # Load and process data
    x, y = Utils.load_features_tr(args.feature_file_table,
                                  max_len=args.max_len,
                                  technology=args.technology,
                                  pickle_only=args.pickle_only,
                                  force_overwrite=args.force_overwrite,
                                  n_procs=args.n_procs)

    # kfold cross-validation
    if args.n_folds >= 0:
        logging.info('Running kfold cross validation. n-folds: {}'.format(args.n_folds))
        outfile_h5 = os.path.join(save_path, str(args.n_folds - 1) + '_model.h5')
        if os.path.exists(outfile_h5) and args.force_overwrite is False:
            msg = 'Output already exists ({}). Use --force-overwrite to overwrite the file'
            raise IOError(msg.format(outfile_h5))

        # iter over folds
        ap_scores = []
        for val_idx in range(args.n_folds):
            logging.info('Fold {}: Constructing model...'.format(val_idx))
            x_tr, x_val, y_tr, y_val = Utils.kfold(x, y, val_idx, k=args.n_folds)
            deepmased = Models.deepmased(config)
            # Construct training generator
            dataGen = Models.Generator(x_tr, y_tr, args.max_len, batch_size=64,
                                       norm_raw=bool(args.norm_raw))
            # Init validation generator (reusing the training mean/std)
            dataGen_val = Models.Generator(x_val, y_val, args.max_len,
                                           batch_size=64, shuffle=False,
                                           norm_raw=bool(args.norm_raw),
                                           mean_tr=dataGen.mean,
                                           std_tr=dataGen.std)
            # Train model
            tb_logs = keras.callbacks.TensorBoard(
                log_dir=os.path.join(save_path, 'logs'),
                histogram_freq=0, write_graph=True, write_images=True)
            logging.info('Fold {}: Training network...'.format(val_idx))
            ## binary classification (extensive misassembly)
            try:
                w_one = int(len(np.where(y_tr == 0)[0]) / len(np.where(y_tr == 1)[0]))
            except ZeroDivisionError:
                logging.warning('  No misassemblies present!')
                w_one = 0
            class_weight = {0: 1, 1: w_one}
            deepmased.net.fit_generator(generator=dataGen,
                                        validation_data=dataGen_val,
                                        epochs=args.n_epochs,
                                        use_multiprocessing=args.n_procs > 1,
                                        workers=args.n_procs,
                                        verbose=2,
                                        class_weight=class_weight,
                                        callbacks=[tb_logs, deepmased.reduce_lr])

            # AUC scores
            logging.info('Fold {}: Computing AUC scores...'.format(val_idx))
            scores_val = deepmased.predict_generator(dataGen_val)
            ap_scores.append(average_precision_score(y_val[0:scores_val.size], scores_val))

            # Saving data
            outfile_h5_fold = os.path.join(save_path, str(val_idx) + '_model.h5')
            deepmased.save(outfile_h5_fold)
            logging.info('Fold {}: File written: {}'.format(val_idx, outfile_h5_fold))
            outfile_pkl_fold = os.path.join(save_path, 'scores.pkl')
            with open(outfile_pkl_fold, 'wb') as f:
                pickle.dump(ap_scores, f)
            logging.info('Fold {}: File written: {}'.format(val_idx, outfile_pkl_fold))
    else:
        # Skip kfold and simply pool all the data for training
        ## all elements in x and y are combined
        logging.info('NOTE: Training on all pooled data!')
        x = [item for sl in x for item in sl]
        y = np.concatenate(y)

        # #downsample to half
        # import random
        # dwnsample = np.array(random.sample(range(len(y)), int(len(y)/2)))
        # x = np.array(x)[dwnsample]
        # y = np.array(y)[dwnsample]

        logging.info('Constructing model...')
        dataGen = Models.Generator(x, y, args.max_len, batch_size=64,
                                   norm_raw=bool(args.norm_raw))
        deepmased = Models.deepmased(config)
        tb_logs = keras.callbacks.TensorBoard(
            log_dir=os.path.join(save_path, 'logs_final'),
            histogram_freq=0, write_graph=True, write_images=True)
        logging.info('Training network...')
        deepmased.net.fit_generator(generator=dataGen,
                                    epochs=args.n_epochs,
                                    use_multiprocessing=args.n_procs > 1,
                                    workers=args.n_procs,
                                    verbose=2,
                                    callbacks=[tb_logs, deepmased.reduce_lr])

        logging.info('Saving trained model...')
        x = [args.save_name, args.technology, 'model.h5']
        outfile = os.path.join(save_path, '_'.join(x))
        deepmased.save(outfile)
        logging.info('  File written: {}'.format(outfile))
        x = [args.save_name, args.technology, 'mean_std.pkl']
        outfile = os.path.join(save_path, '_'.join(x))
        with open(outfile, 'wb') as f:
            pickle.dump([dataGen.mean, dataGen.std], f)
        logging.info('  File written: {}'.format(outfile))
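# How the two artifacts written above are consumed at prediction time; a sketch
# that mirrors the prediction entry points earlier in this listing:
with open(os.path.join(save_path, '_'.join([args.save_name, args.technology,
                                            'mean_std.pkl'])), 'rb') as f:
    mean_tr, std_tr = pickle.load(f)
model = load_model(os.path.join(save_path, '_'.join([args.save_name, args.technology,
                                                     'model.h5'])),
                   custom_objects={'metr': Utils.class_recall(0)})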
def __init__(self, config):
    self.max_len = config.max_len
    self.filters = config.filters
    self.n_conv = config.n_conv
    self.n_features = config.n_features
    self.pool_window = config.pool_window
    self.dropout = config.dropout
    self.lr_init = config.lr_init
    self.mode = config.mode
    self.n_fc = config.n_fc
    self.n_hid = config.n_hid

    self.net = Sequential()
    # first conv layer spans the full feature axis
    self.net.add(Conv2D(self.filters, kernel_size=(2, self.n_features),
                        input_shape=(self.max_len, self.n_features, 1),
                        activation='relu', padding='valid'))
    self.net.add(BatchNormalization(axis=-1))
    # deeper conv layers double the filter count at each step
    for i in range(1, self.n_conv):
        self.net.add(Conv2D(2**i * self.filters, kernel_size=(2, 1), strides=2,
                            input_shape=(self.max_len, 1, 2**(i - 1) * self.filters),
                            activation='relu'))
        self.net.add(BatchNormalization(axis=-1))
    self.net.add(AveragePooling2D((self.pool_window, 1)))
    self.net.add(Flatten())

    optimizer = keras.optimizers.Adam(lr=self.lr_init)

    if self.mode in ['chimera', 'extensive']:
        # binary classification head
        for _ in range(self.n_fc - 1):
            self.net.add(Dense(self.n_hid, activation='relu'))
            self.net.add(Dropout(rate=self.dropout))
        self.net.add(Dense(1, activation='sigmoid'))
        self.net.add(Dropout(rate=self.dropout))
        recall_0 = Utils.class_recall(0)
        recall_1 = Utils.class_recall(1)
        self.net.compile(loss='binary_crossentropy',
                         optimizer=optimizer,
                         metrics=[recall_0, recall_1])
    elif self.mode == 'edit':
        # regression head for edit distances
        self.net.add(Dense(20, activation='relu'))
        self.net.add(Dropout(rate=self.dropout))
        self.net.add(Dense(20, activation='relu'))
        self.net.add(Dropout(rate=self.dropout))
        self.net.add(Dense(1, activation='linear'))
        self.net.compile(loss='mean_absolute_error',
                         optimizer=optimizer,
                         metrics=[Utils.explained_var])
    else:
        raise ValueError('Training mode "{}" not supported.'.format(self.mode))

    self.reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=5,
        min_lr=0.01 * self.lr_init)
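# Learning-rate schedule example: with lr_init = 0.001, ReduceLROnPlateau halves
# the rate after every 5-epoch plateau in val_loss; after six halvings
# lr ~= 1.6e-5, and the next reduction is clipped to min_lr = 0.01 * 0.001 = 1e-5.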