def test_pileup_visualization(snp_variant):
    output_folder = tempfile.mkdtemp(prefix='vw_test_output_')
    encoder = PileupEncoder(
        layers=[PileupEncoder.Layer.READ,
                PileupEncoder.Layer.ALLELE,
                PileupEncoder.Layer.REFERENCE,
                PileupEncoder.Layer.BASE_QUALITY,
                PileupEncoder.Layer.MAPPING_QUALITY],
        base_encoder=BaseUnicodeEncoder())
    fig_title, fig = encoder.visualize(snp_variant, save_to_path=output_folder, max_subplots_per_line=2)
    assert len([name for name in os.listdir(output_folder)
                if os.path.isfile(os.path.join(output_folder, name))]) == 1
    writer = SummaryWriter(log_dir=output_folder)
    writer.add_figure(fig_title, fig)
    writer.flush()
    writer.close()
    shutil.rmtree(output_folder)
def test_snp_allele_encoding(snp_variant):
    max_reads = 1
    window_size = 5
    layers = [PileupEncoder.Layer.ALLELE]
    encoder = PileupEncoder(window_size=window_size, max_reads=max_reads, layers=layers)
    variant = snp_variant
    encoding = encoder(variant)
    assert(encoding[0, 0, window_size] == BaseEnumEncoder()(variant.allele))
def test_snp_ref_encoding(snp_variant):
    max_reads = 1
    window_size = 5
    layers = [PileupEncoder.Layer.REFERENCE]
    encoder = PileupEncoder(window_size=window_size, max_reads=max_reads, layers=layers)
    variant = snp_variant
    encoding = encoder(variant)
    assert(encoding[0, 0, window_size] == BaseEnumEncoder()(variant.ref))
def generate_hdf5(args):
    """Serialize encodings to HDF5.

    Generate encodings in a multiprocess loop and save tensors to HDF5.
    """
    # Get the list of files from arguments
    # and generate the variant entries using VCF readers.
    bam = args.bam
    vcf_readers = []
    for tp_file in args.tp_files:
        vcf_readers.append(VCFReader(vcf=tp_file, bams=[bam], is_fp=False))
    for fp_file in args.fp_files:
        vcf_readers.append(VCFReader(vcf=fp_file, bams=[bam], is_fp=True))
    total_labels = sum([len(reader) for reader in vcf_readers])

    # Setup encoder for samples and labels.
    sample_encoder = PileupEncoder(
        window_size=100, max_reads=100,
        layers=[PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY])
    label_encoder = ZygosityLabelEncoder()

    encode_func = partial(encode, sample_encoder, label_encoder)

    # Create HDF5 datasets.
    h5_file = h5py.File(args.output_file, "w")
    encoded_data = h5_file.create_dataset(
        "encodings",
        shape=(total_labels, sample_encoder.depth, sample_encoder.height, sample_encoder.width),
        dtype=np.float32, fillvalue=0)
    label_data = h5_file.create_dataset(
        "labels", shape=(total_labels,), dtype=np.int64, fillvalue=0)

    pool = mp.Pool(args.threads)
    print("Serializing {} entries...".format(total_labels))
    # Keep a single running index across all readers so entries from one
    # reader do not overwrite those from another.
    label_idx = 0
    for vcf_reader in vcf_readers:
        for out in pool.imap(encode_func, vcf_reader):
            if label_idx % 1000 == 0:
                print("Saved {} entries".format(label_idx))
            encoding, label = out
            encoded_data[label_idx] = encoding
            label_data[label_idx] = label
            label_idx += 1
    print("Saved {} entries".format(total_labels))

    h5_file.close()
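# A minimal sketch of how generate_hdf5 might be invoked. The paths and thread
# count below are hypothetical placeholders; only the attribute names
# (bam, tp_files, fp_files, output_file, threads) come from the function above,
# which reads exactly those fields from its args object.
if __name__ == "__main__":
    from argparse import Namespace

    example_args = Namespace(
        bam="example.bam",                 # hypothetical alignment file
        tp_files=["true_positives.vcf"],   # hypothetical true-positive VCFs
        fp_files=["false_positives.vcf"],  # hypothetical false-positive VCFs
        output_file="encodings.hdf5",      # hypothetical output HDF5 path
        threads=4)
    generate_hdf5(example_args)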
def test_snp_encoder_basic(snp_variant):
    max_reads = 100
    window_size = 10
    width = 2 * window_size + 1
    height = max_reads
    layers = [PileupEncoder.Layer.READ]
    encoder = PileupEncoder(window_size=window_size, max_reads=max_reads, layers=layers)
    variant = snp_variant
    encoding = encoder(variant)
    assert(encoding.size() == torch.Size([len(layers), height, width]))
def test_deletion_read_encoding(deletion_variant):
    max_reads = 100
    window_size = 10
    width = 2 * window_size + 1
    height = max_reads
    layers = [PileupEncoder.Layer.READ,
              PileupEncoder.Layer.REFERENCE,
              PileupEncoder.Layer.ALLELE]
    encoder = PileupEncoder(window_size=window_size, max_reads=max_reads, layers=layers)
    variant = deletion_variant
    encoding = encoder(variant)
    assert(encoding.size() == torch.Size([len(layers), height, width]))
def test_snp_encoder_mapping_quality(snp_variant):
    max_reads = 100
    window_size = 5
    width = 2 * window_size + 1
    height = max_reads
    layers = [PileupEncoder.Layer.MAPPING_QUALITY]
    encoder = PileupEncoder(window_size=window_size, max_reads=max_reads, layers=layers)
    variant = snp_variant
    encoding = encoder(variant)
    assert(encoding.size() == torch.Size([len(layers), height, width]))
    # Verify that all elements are <= 1 by first outputting a bool tensor,
    # then converting it to a long tensor and summing all elements to match
    # against the total size.
    all_lt_1 = (encoding <= 1.0).long()
    assert(torch.sum(all_lt_1) == (height * width))
def test_pileup_unknown_layer():
    max_reads = 100
    window_size = 5
    with pytest.raises(AttributeError):
        layers = [PileupEncoder.Layer.BLAH]
        PileupEncoder(window_size=window_size, max_reads=max_reads, layers=layers)
def __init__(self, data_loader_type, variant_loaders, batch_size=32,
             shuffle=True, num_workers=4,
             sample_encoder=PileupEncoder(window_size=100, max_reads=100,
                                          layers=[PileupEncoder.Layer.READ]),
             label_encoder=ZygosityLabelEncoder()):
    """Construct a data loader.

    Args:
        data_loader_type : Type of data loader (ReadPileupDataLoader.Type.TRAIN/EVAL/TEST)
        variant_loaders : A list of loader classes for variants
        batch_size : Batch size for data loader [32]
        shuffle : Shuffle dataset [True]
        num_workers : Number of parallel data loader threads [4]
        sample_encoder : Custom pileup encoder for variant [READ pileup encoding, window size 100]
        label_encoder : Custom label encoder for variant [ZygosityLabelEncoder]
                        (Only applicable when type=TRAIN/EVAL)

    Returns:
        Instance of class.
    """
    super().__init__()
    self.data_loader_type = data_loader_type
    self.variant_loaders = variant_loaders
    self.sample_encoder = sample_encoder
    self.label_encoder = label_encoder

    class DatasetWrapper(TorchDataset):
        """A wrapper around Torch dataset class to generate individual samples."""

        def __init__(self, data_loader_type, sample_encoder, variant_loaders, label_encoder):
            """Construct a dataset wrapper.

            Args:
                data_loader_type : Type of data loader
                sample_encoder : Custom pileup encoder for variant
                variant_loaders : A list of loader classes for variants
                label_encoder : Custom label encoder for variant

            Returns:
                Instance of class.
            """
            super().__init__()
            self.variant_loaders = variant_loaders
            self.label_encoder = label_encoder
            self.sample_encoder = sample_encoder
            self.data_loader_type = data_loader_type
            self._len = sum([len(loader) for loader in self.variant_loaders])

        def _map_idx_to_sample(self, sample_idx):
            # Walk through the loaders, subtracting each loader's length from the
            # index until it falls inside one of them.
            file_idx = 0
            while file_idx < len(self.variant_loaders):
                if sample_idx < len(self.variant_loaders[file_idx]):
                    return self.variant_loaders[file_idx][sample_idx]
                else:
                    sample_idx -= len(self.variant_loaders[file_idx])
                    file_idx += 1
            raise RuntimeError("Could not map sample index to file. This is a bug.")

        def __len__(self):
            return self._len

        def __getitem__(self, idx):
            sample = self._map_idx_to_sample(idx)
            if self.data_loader_type == ReadPileupDataLoader.Type.TEST:
                sample = self.sample_encoder(sample)
                return sample
            else:
                encoding = self.sample_encoder(sample)
                label = self.label_encoder(sample)
                return label, encoding

    dataset = DatasetWrapper(data_loader_type, self.sample_encoder,
                             self.variant_loaders, self.label_encoder)

    sampler = None
    if self._placement == DeviceType.AllGpu:
        sampler = torch.utils.data.distributed.DistributedSampler(dataset)

    self.dataloader = TorchDataLoader(dataset,
                                      batch_size=batch_size,
                                      shuffle=shuffle if sampler is None else False,
                                      num_workers=num_workers,
                                      pin_memory=True,
                                      sampler=sampler)
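# A minimal usage sketch for the constructor above, assuming the VCFReader and
# ReadPileupDataLoader.Type.TRAIN interfaces shown elsewhere in this section;
# the file paths are hypothetical placeholders.
#
#   train_vcf_loader = VCFReader(vcf="train_candidates.vcf.gz", bams=["train.bam"], is_fp=False)
#   train_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN,
#                                        [train_vcf_loader],
#                                        batch_size=32,
#                                        shuffle=True)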
from variantworks.networks import AlexNet
from variantworks.encoders import PileupEncoder, ZygosityLabelDecoder

# Get VariantWorks root directory
repo_root_dir = pathlib.Path(__file__).parent.parent.parent.parent.absolute()

# Create neural factory. In this case, the checkpoint_dir has to be set for NeMo to pick
# up a pre-trained model.
nf = nemo.core.NeuralModuleFactory(
    placement=nemo.core.neural_factory.DeviceType.GPU, checkpoint_dir="./")

# Dataset generation is done in a similar manner. It's important to note that the encoder used
# for inference must match the one used for training.
encoding_layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY]
pileup_encoder = PileupEncoder(window_size=100, max_reads=100, layers=encoding_layers)

# Neural Network
model = AlexNet(num_input_channels=len(encoding_layers), num_output_logits=3)

# Similar to training, a data loader needs to be set up for the relevant datasets. In the case of
# inference, it doesn't matter whether the files are tagged as false positive or not; each example
# will be evaluated by the network. For simplicity, this example uses the same dataset as training.
# Note: No label encoder is required for inference.
data_folder = os.path.join(repo_root_dir, "tests", "data")
bam = os.path.join(data_folder, "small_bam.bam")
labels = os.path.join(data_folder, "candidates.vcf.gz")
vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)
test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST, [vcf_loader],
from variantworks.io.vcfio import VCFReader

# Get VariantWorks root directory
repo_root_dir = pathlib.Path(__file__).parent.parent.parent.parent.absolute()

# Get BAM and VCF files for the raw sample data.
data_folder = os.path.join(repo_root_dir, "tests", "data")
bam = os.path.join(data_folder, "small_bam.bam")
samples = os.path.join(data_folder, "candidates.vcf.gz")

# Generate the variant entries using VCF reader.
vcf_reader = VCFReader(vcf=samples, bams=[bam], is_fp=False)
print("Serializing {} entries...".format(len(vcf_reader)))

# Setup encoder for samples and labels.
sample_encoder = PileupEncoder(window_size=100, max_reads=100,
                               layers=[PileupEncoder.Layer.READ])
label_encoder = ZygosityLabelEncoder()

# Create HDF5 datasets.
_, output_file = tempfile.mkstemp(prefix='hdf5_generation_snippet_', suffix=".hdf5")
h5_file = h5py.File(output_file, "w")
encoded_data = h5_file.create_dataset("encodings",
                                      shape=(len(vcf_reader), sample_encoder.depth,
                                             sample_encoder.height, sample_encoder.width),
                                      dtype=np.float32, fillvalue=0)
label_data = h5_file.create_dataset("labels",
                                    shape=(len(vcf_reader),),
                                    dtype=np.int64, fillvalue=0)

# Loop through all entries, encode them and save them in HDF5.
for i, variant in enumerate(vcf_reader):
    encoding = sample_encoder(variant)