def main(sim_path, out):
    # Define wildcard path to contact matrix data
    cm_filepath = os.path.join(sim_path, 'output-cm-*.h5')

    # Collect contact matrix file names sorted by sim_id
    cm_files = sorted(glob(cm_filepath))
    if not cm_files:
        raise FileNotFoundError(f'No h5 files found, recheck your input path {sim_path}')

    with ExitStack() as stack:
        # Open all h5 files and add them to the exit stack
        open_cm_files = map(lambda file: stack.enter_context(open_h5(file)), cm_files)

        # Iterate through open h5 files and get contact_maps datasets
        cm_data = list(map(lambda file: file['contact_maps'], open_cm_files))

        # Compress all .h5 files into one in cvae format
        cvae_input = cm_to_cvae(cm_data)

        # Path of the .h5 file used as cvae input
        cvae_input_path = os.path.join(out, 'cvae-input.h5')

        # Create and open contact map aggregation output file
        cvae_input_file = stack.enter_context(h5py.File(cvae_input_path, 'w'))

        # Write aggregated contact map dataset to file
        cvae_input_file.create_dataset('contact_maps', data=cvae_input)
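
# A minimal CLI sketch for the aggregation entry point above; the flag
# names are illustrative, not necessarily the project's actual interface.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Aggregate per-simulation contact maps into one CVAE input file.')
    parser.add_argument('--sim_path', required=True,
                        help='Directory containing output-cm-*.h5 files')
    parser.add_argument('--out', required=True,
                        help='Directory to write cvae-input.h5')
    args = parser.parse_args()
    main(args.sim_path, args.out)
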
def main(input_path, out_path):
    # Load the dense contact matrix dataset into memory
    with open_h5(input_path) as input_file:
        contact_maps = np.array(input_file['contact_maps'][:])

    # Convert the dense matrices to sparse format and write to out_path
    sparse_contact_maps_from_matrices(contact_maps, out_path)
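
# A sketch of the idea behind the dense-to-sparse conversion, assuming
# binary contact matrices (this is not the project's
# sparse_contact_maps_from_matrices implementation): since every stored
# value is 1, only the row/col indices of the contacts need to be kept.
import numpy as np

def _to_coo_indices(contact_map):
    """Return (row, col) index arrays of the nonzero entries."""
    row, col = np.nonzero(contact_map)
    return row.astype(np.int32), col.astype(np.int32)
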
def __init__(self, path, split_ptc=0.8, split='train', squeeze=False):
    with open_h5(path) as input_file:
        # Access contact matrix data from h5 file
        data = np.array(input_file['contact_maps'])

    # Train validation split index (80-20 by default)
    split_ind = int(split_ptc * len(data))

    if split == 'train':
        self.data = data[:split_ind]
    elif split == 'valid':
        self.data = data[split_ind:]
    else:
        raise ValueError(f'Parameter split={split} is invalid.')

    # TODO: in a future contact map Dataset, pass in device to precompute
    #       the operation
    # TODO: this reshape code may not be the best solution. Revisit.
    num_residues = self.data.shape[2]
    assert num_residues == 22

    if squeeze:
        shape = (-1, num_residues, num_residues)
    else:
        shape = (-1, 1, num_residues, num_residues)

    self.data = torch.from_numpy(self.data.reshape(shape)).to(torch.float32)
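
# Hypothetical usage with a PyTorch DataLoader; ContactMapDataset is a
# stand-in name for the class whose __init__ is shown above, and this
# assumes the class also implements __len__ and __getitem__.
from torch.utils.data import DataLoader

train_set = ContactMapDataset('cvae-input.h5', split='train')
valid_set = ContactMapDataset('cvae-input.h5', split='valid')
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
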
def __init__(self, path, out_dir, squeeze, sample_interval=20,
             batch_size=128, writer=None):
    """
    Parameters
    ----------
    path : str
        Path to h5 file containing contact matrices.
    out_dir : str
        Directory to store output plots.
    squeeze : bool
        If True, data is reshaped to (H, W); otherwise it is
        reshaped to (1, H, W).
    sample_interval : int
        Plots every sample_interval'th point in the data set.
    batch_size : int
        Batch size used to load raw contact matrices into memory.
        Batches are loaded into memory, encoded to the latent
        dimension, and collected in a np.ndarray, which is then
        passed to the t-SNE algorithm.
        NOTE: Not a learning hyperparameter; it simply needs to be
        small enough for a batch to fit in memory.
    writer : torch.utils.tensorboard.SummaryWriter
    """
    os.makedirs(out_dir, exist_ok=True)

    # Open h5 file. Python's garbage collector closes the
    # file when the class is destructed.
    h5_file = open_h5(path)
    self.dset = h5_file['contact_maps']

    self.out_dir = out_dir
    self.sample_interval = sample_interval
    self.batch_size = batch_size
    self.writer = writer
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if squeeze:
        self.shape = (self.dset.shape[1], self.dset.shape[2])
    else:
        self.shape = (1, self.dset.shape[1], self.dset.shape[2])

    self._init_plot(h5_file)
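
# A sketch of the batch-encode-then-t-SNE pattern the docstring above
# describes. encoder.encode() is a stand-in name; the real encoder API
# is not shown here.
import numpy as np
from sklearn.manifold import TSNE

def _embed_in_batches(dset, encoder, batch_size):
    # Encode memory-sized slices of the h5 dataset and stack the results
    embeddings = []
    for start in range(0, len(dset), batch_size):
        embeddings.append(encoder.encode(dset[start:start + batch_size]))
    return np.concatenate(embeddings)

# tsne_proj = TSNE(n_components=2).fit_transform(_embed_in_batches(dset, encoder, 128))
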
def generate_embeddings(encoder_hparams_path, encoder_weight_path, cm_path):
    encoder_hparams = EncoderHyperparams.load(encoder_hparams_path)

    with open_h5(cm_path) as file:
        # Access contact matrix data from h5 file
        data = file['contact_maps']

        # Get shape of an individual contact matrix
        # (ignore total number of matrices)
        input_shape = data.shape[1:]

        encoder = EncoderConvolution2D(input_shape=input_shape,
                                       hyperparameters=encoder_hparams)

        # Load best model weights
        encoder.load_weights(encoder_weight_path)

        # Create contact matrix embeddings
        cm_embeddings, *_ = encoder.embed(data)

    return cm_embeddings
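
# Hypothetical call, with file names following the output convention of
# the training step below (model_id = 0):
embeddings = generate_embeddings('encoder-hparams-0.pkl',
                                 'encoder-weight-0.h5',
                                 'cvae-input.h5')
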
def main(input_path, out_path, model_id, gpu, epochs, batch_size, latent_dim):
    # Set CUDA environment variables
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)

    with open_h5(input_path) as input_file:
        # Access contact matrix data from h5 file
        data = np.array(input_file['contact_maps'])

    # Shuffle data before train validation split
    np.random.shuffle(data)

    # 80-20 train validation split index
    split = int(0.8 * len(data))

    # Partition input data into 80-20 train valid split
    train, valid = data[:split], data[split:]

    # Get shape of an individual contact matrix
    # (ignore total number of matrices)
    input_shape = train.shape[1:]

    # Set model hyperparameters for encoder and decoder
    shared_hparams = {'num_conv_layers': 4,
                      'filters': [64, 64, 64, 64],
                      'kernels': [3, 3, 3, 3],
                      'strides': [1, 2, 1, 1],
                      'num_affine_layers': 1,
                      'affine_widths': [128],
                      'latent_dim': latent_dim}

    affine_dropouts = [0]

    encoder_hparams = EncoderHyperparams(affine_dropouts=affine_dropouts,
                                         **shared_hparams)
    decoder_hparams = DecoderHyperparams(**shared_hparams)

    encoder = EncoderConvolution2D(input_shape=input_shape,
                                   hyperparameters=encoder_hparams)

    # Get shape attributes of the last encoder layer to define the decoder
    encode_conv_shape, num_conv_params = encoder.get_final_conv_params()

    decoder = DecoderConvolution2D(output_shape=input_shape,
                                   enc_conv_params=num_conv_params,
                                   enc_conv_shape=encode_conv_shape,
                                   hyperparameters=decoder_hparams)

    optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

    cvae = VAE(input_shape=input_shape,
               encoder=encoder,
               decoder=decoder,
               optimizer=optimizer)

    # Define callbacks to report model performance for analysis
    embed_callback = EmbeddingCallback(train, cvae)
    loss_callback = LossHistory()

    cvae.train(data=train, validation_data=valid, batch_size=batch_size,
               epochs=epochs, callbacks=[embed_callback, loss_callback])

    # Define file paths to store model performance and weights
    ae_weight_path = os.path.join(out_path, f'ae-weight-{model_id}.h5')
    encoder_weight_path = os.path.join(out_path, f'encoder-weight-{model_id}.h5')
    encoder_hparams_path = os.path.join(out_path, f'encoder-hparams-{model_id}.pkl')
    decoder_hparams_path = os.path.join(out_path, f'decoder-hparams-{model_id}.pkl')
    embed_path = os.path.join(out_path, f'embed-{model_id}.npy')
    idx_path = os.path.join(out_path, f'embed-idx-{model_id}.npy')
    loss_path = os.path.join(out_path, f'loss-{model_id}.npy')
    val_loss_path = os.path.join(out_path, f'val-loss-{model_id}.npy')

    # Save weights, hyperparameters, and model performance.
    # Save encoder weights separately so the full model doesn't need to be
    # loaded during the outlier detection stage.
    cvae.save_weights(ae_weight_path)
    encoder.save_weights(encoder_weight_path)
    encoder_hparams.save(encoder_hparams_path)
    decoder_hparams.save(decoder_hparams_path)
    embed_callback.save(embed_path=embed_path, idx_path=idx_path)
    loss_callback.save(loss_path=loss_path, val_loss_path=val_loss_path)
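
# Hypothetical invocation of the training entry point above; all
# argument values are illustrative only.
main(input_path='cvae-input.h5', out_path='./results', model_id=0,
     gpu=0, epochs=100, batch_size=128, latent_dim=3)
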
def __init__(self, path, input_shape, split_ptc=0.8, split='train',
             sparse=False, gpu=None):
    """
    Parameters
    ----------
    path : str
        Path to h5 file containing contact matrices.
    input_shape : tuple
        Shape of contact matrices (H, W); may be (1, H, W).
    split_ptc : float
        Percentage of total data to be used as the training set.
    split : str
        Either 'train' or 'valid'; specifies whether this dataset
        returns train or validation data.
    sparse : bool
        If True, process data in sparse row/col COO format. The data
        should not contain any values, since they are all 1's and are
        generated on the fly. If False, input data is a dense tensor.
    gpu : int, None
        If None, data is put onto the default GPU if CUDA is available,
        and onto the CPU otherwise. If gpu is an int, data is put onto
        the specified GPU.
    """
    if split not in ('train', 'valid'):
        raise ValueError("Parameter split must be 'train' or 'valid'.")
    if split_ptc < 0 or split_ptc > 1:
        raise ValueError('Parameter split_ptc must satisfy 0 <= split_ptc <= 1.')

    # Open h5 file. Python's garbage collector closes the
    # file when the class is destructed.
    h5_file = open_h5(path)

    if sparse:
        group = h5_file['contact_maps']
        self.row_dset = group.get('row')
        self.col_dset = group.get('col')
        self.len = len(self.row_dset)
    else:
        # contact_maps dset has shape (N, W, H, 1)
        self.dset = h5_file['contact_maps']
        self.len = len(self.dset)

    # Train validation split index
    self.split_ind = int(split_ptc * self.len)
    self.split = split
    self.sparse = sparse
    self.shape = input_shape

    if gpu is None:
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    else:
        self.device = torch.device(f'cuda:{gpu}')
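
# Hypothetical instantiation; SparseContactMapDataset is a stand-in name
# for the class whose __init__ is shown above. The sparse flag must match
# the file layout: a 'contact_maps' group with 'row'/'col' datasets when
# sparse=True, or a dense (N, W, H, 1) 'contact_maps' dataset otherwise.
dataset = SparseContactMapDataset('sparse-maps.h5', input_shape=(1, 22, 22),
                                  split='train', sparse=True, gpu=0)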