Example #1
def test_pileup_visualization(snp_variant):
    """Render a SNP pileup, check that exactly one figure is saved, and log it to TensorBoard."""
    output_folder = tempfile.mkdtemp(prefix='vw_test_output_')
    encoder = PileupEncoder(
        layers=[PileupEncoder.Layer.READ, PileupEncoder.Layer.ALLELE, PileupEncoder.Layer.REFERENCE,
                PileupEncoder.Layer.BASE_QUALITY, PileupEncoder.Layer.MAPPING_QUALITY],
        base_encoder=BaseUnicodeEncoder()
    )
    fig_title, fig = encoder.visualize(snp_variant, save_to_path=output_folder, max_subplots_per_line=2)
    assert len([name for name in os.listdir(output_folder) if os.path.isfile(os.path.join(output_folder, name))]) == 1

    writer = SummaryWriter(log_dir=output_folder)
    writer.add_figure(fig_title, fig)
    writer.flush()
    writer.close()

    shutil.rmtree(output_folder)
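Most of the test snippets here take a snp_variant pytest fixture (and Example #6 a deletion_variant) that the listing does not show. A minimal sketch of what such a fixture could look like, assuming the Variant record in variantworks.types accepts the fields shown; every value is illustrative, and the real fixture points at the repository's test BAM/VCF data:

import pytest

from variantworks.types import Variant, VariantType, VariantZygosity


@pytest.fixture
def snp_variant():
    # Illustrative values only; swap in real coordinates and test-data paths.
    return Variant(chrom="1", pos=240000, id=".", ref="T", allele="A",
                   quality=60, filter=None, info={}, format=["GT"], samples=[["1/1"]],
                   zygosity=VariantZygosity.HOMOZYGOUS, type=VariantType.SNP,
                   vcf="tests/data/candidates.vcf.gz",
                   bam="tests/data/small_bam.bam")

A deletion_variant fixture would be built the same way with VariantType.DELETION and, for example, ref="TA", allele="T".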
Example #2
def test_snp_allele_encoding(snp_variant):
    """Check that the ALLELE layer places the variant's allele base at the center column."""
    max_reads = 1
    window_size = 5
    layers = [PileupEncoder.Layer.ALLELE]

    encoder = PileupEncoder(window_size=window_size,
                            max_reads=max_reads, layers=layers)

    variant = snp_variant
    encoding = encoder(variant)
    assert encoding[0, 0, window_size] == BaseEnumEncoder()(variant.allele)
Example #3
def test_snp_ref_encoding(snp_variant):
    """Check that the REFERENCE layer places the reference base at the center column."""
    max_reads = 1
    window_size = 5
    layers = [PileupEncoder.Layer.REFERENCE]

    encoder = PileupEncoder(window_size=window_size,
                            max_reads=max_reads, layers=layers)

    variant = snp_variant
    encoding = encoder(variant)
    assert encoding[0, 0, window_size] == BaseEnumEncoder()(variant.ref)
Example #4
def generate_hdf5(args):
    """Serialize encodings to HDF5.

    Generate encodings in multiprocess loop and save tensors to HDF5.
    """
    # Get list of files from arguments
    # and generate the variant entries using VCF reader.
    bam = args.bam
    vcf_readers = []
    for tp_file in args.tp_files:
        vcf_readers.append(VCFReader(vcf=tp_file, bams=[bam], is_fp=False))
    for fp_file in args.fp_files:
        vcf_readers.append(VCFReader(vcf=fp_file, bams=[bam], is_fp=True))
    total_labels = sum([len(reader) for reader in vcf_readers])

    # Setup encoder for samples and labels.
    sample_encoder = PileupEncoder(
        window_size=100,
        max_reads=100,
        layers=[PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY])
    label_encoder = ZygosityLabelEncoder()

    encode_func = partial(encode, sample_encoder, label_encoder)

    # Create HDF5 datasets.
    h5_file = h5py.File(args.output_file, "w")
    encoded_data = h5_file.create_dataset(
        "encodings",
        shape=(total_labels, sample_encoder.depth, sample_encoder.height,
               sample_encoder.width),
        dtype=np.float32,
        fillvalue=0)
    label_data = h5_file.create_dataset("labels",
                                        shape=(total_labels, ),
                                        dtype=np.int64,
                                        fillvalue=0)

    pool = mp.Pool(args.threads)
    print("Serializing {} entries...".format(total_labels))
    # Keep a single running index across all readers so earlier entries are not overwritten.
    label_idx = 0
    for vcf_reader in vcf_readers:
        for out in pool.imap(encode_func, vcf_reader):
            if label_idx % 1000 == 0:
                print("Saved {} entries".format(label_idx))
            encoding, label = out
            encoded_data[label_idx] = encoding
            label_data[label_idx] = label
            label_idx += 1
    print("Saved {} entries".format(total_labels))

    h5_file.close()
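For context, generate_hdf5 only reads a handful of attributes from args. A minimal sketch of compatible argparse wiring, with argument names taken from the function body and purely illustrative defaults:

import argparse


def build_parser():
    parser = argparse.ArgumentParser(description="Serialize pileup encodings to HDF5.")
    parser.add_argument("--bam", required=True, help="BAM file with aligned reads.")
    parser.add_argument("--tp-files", nargs="+", default=[],
                        help="VCF files with true-positive variants.")
    parser.add_argument("--fp-files", nargs="+", default=[],
                        help="VCF files with false-positive variants.")
    parser.add_argument("--output-file", required=True, help="Destination HDF5 file.")
    parser.add_argument("--threads", type=int, default=4,
                        help="Number of encoder worker processes.")
    return parser


if __name__ == "__main__":
    generate_hdf5(build_parser().parse_args())

argparse maps --tp-files to args.tp_files automatically, so the attribute names line up with what generate_hdf5 expects.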
Example #5
def test_snp_encoder_basic(snp_variant):
    """Check the shape of a basic single-layer READ encoding for a SNP."""
    max_reads = 100
    window_size = 10
    width = 2 * window_size + 1
    height = max_reads
    layers = [PileupEncoder.Layer.READ]

    encoder = PileupEncoder(window_size=window_size,
                            max_reads=max_reads, layers=layers)

    variant = snp_variant

    encoding = encoder(variant)
    assert encoding.size() == torch.Size([len(layers), height, width])
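The resulting encoding is a [layers, height, width] tensor: one channel per requested layer, one row per read up to max_reads, and 2 * window_size + 1 = 21 columns centered on the variant position.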
Example #6
def test_deletion_read_encoding(deletion_variant):
    """Check the encoding shape for a deletion variant across READ, REFERENCE and ALLELE layers."""
    max_reads = 100
    window_size = 10
    width = 2 * window_size + 1
    height = max_reads
    layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.REFERENCE, PileupEncoder.Layer.ALLELE]

    encoder = PileupEncoder(window_size=window_size,
                            max_reads=max_reads, layers=layers)

    variant = deletion_variant

    encoding = encoder(variant)
    assert encoding.size() == torch.Size([len(layers), height, width])
Example #7
def test_snp_encoder_mapping_quality(snp_variant):
    """Check the shape of the MAPPING_QUALITY layer and that its values are normalized."""
    max_reads = 100
    window_size = 5
    width = 2 * window_size + 1
    height = max_reads
    layers = [PileupEncoder.Layer.MAPPING_QUALITY]

    encoder = PileupEncoder(window_size=window_size,
                            max_reads=max_reads, layers=layers)

    variant = snp_variant

    encoding = encoder(variant)
    assert(encoding.size() == torch.Size([len(layers), height, width]))

    # Verify that every element is <= 1: build a boolean mask, convert it to a
    # long tensor, and check that its sum equals the total element count.
    all_le_1 = (encoding <= 1.0).long()
    assert torch.sum(all_le_1) == (height * width)
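The bound holds because quality layers are stored normalized into [0, 1], and since only one layer is requested here, height * width is exactly the encoding's element count.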
Example #8
def test_pileup_unknown_layer():
    """Requesting an undefined pileup layer raises AttributeError."""
    max_reads = 100
    window_size = 5
    with pytest.raises(AttributeError):
        layers = [PileupEncoder.Layer.BLAH]
        PileupEncoder(window_size=window_size, max_reads=max_reads, layers=layers)
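Note that the AttributeError comes from the enum member lookup PileupEncoder.Layer.BLAH itself, so it is raised before the PileupEncoder constructor ever runs.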
Example #9
    def __init__(self,
                 data_loader_type,
                 variant_loaders,
                 batch_size=32,
                 shuffle=True,
                 num_workers=4,
                 sample_encoder=PileupEncoder(
                     window_size=100,
                     max_reads=100,
                     layers=[PileupEncoder.Layer.READ]),
                 label_encoder=ZygosityLabelEncoder()):
        """Construct a data loader.

        Args:
            data_loader_type : Type of data loader (ReadPileupDataLoader.Type.TRAIN/EVAL/TEST)
            variant_loaders : A list of loader classes for variants
            batch_size : batch size for data loader [32]
            shuffle : shuffle dataset [True]
            num_workers : numbers of parallel data loader threads [4]
            sample_encoder : Custom pileup encoder for variant [READ pileup encoding, window size 100]
            label_encoder : Custom label encoder for variant [ZygosityLabelEncoder]
                (only applicable when type=TRAIN/EVAL)

        Returns:
            Instance of class.
        """
        super().__init__()
        self.data_loader_type = data_loader_type
        self.variant_loaders = variant_loaders
        self.sample_encoder = sample_encoder
        self.label_encoder = label_encoder

        class DatasetWrapper(TorchDataset):
            """A wrapper around Torch dataset class to generate individual samples."""
            def __init__(self, data_loader_type, sample_encoder,
                         variant_loaders, label_encoder):
                """Construct a dataset wrapper.

                Args:
                    data_loader_type : Type of data loader
                    sample_encoder : Custom pileup encoder for variant
                    variant_loaders : A list of loader classes for variants
                    label_encoder : Custom label encoder for variant

                Returns:
                    Instance of class.
                """
                super().__init__()
                self.variant_loaders = variant_loaders
                self.label_encoder = label_encoder
                self.sample_encoder = sample_encoder
                self.data_loader_type = data_loader_type

                self._len = sum(
                    [len(loader) for loader in self.variant_loaders])

            def _map_idx_to_sample(self, sample_idx):
                file_idx = 0
                while file_idx < len(self.variant_loaders):
                    if sample_idx < len(self.variant_loaders[file_idx]):
                        return self.variant_loaders[file_idx][sample_idx]
                    else:
                        sample_idx -= len(self.variant_loaders[file_idx])
                        file_idx += 1
                raise RuntimeError(
                    "Could not map sample index to file. This is a bug.")

            def __len__(self):
                return self._len

            def __getitem__(self, idx):
                sample = self._map_idx_to_sample(idx)

                if self.data_loader_type == ReadPileupDataLoader.Type.TEST:
                    encoding = self.sample_encoder(sample)
                    return encoding
                else:
                    encoding = self.sample_encoder(sample)
                    label = self.label_encoder(sample)
                    return label, encoding

        dataset = DatasetWrapper(data_loader_type, self.sample_encoder,
                                 self.variant_loaders, self.label_encoder)

        sampler = None
        if self._placement == DeviceType.AllGpu:
            sampler = torch.utils.data.distributed.DistributedSampler(
                dataset)

        self.dataloader = TorchDataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle if sampler is None else False,
            num_workers=num_workers,
            pin_memory=True,
            sampler=sampler)
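A minimal usage sketch for this constructor, with hypothetical VCF/BAM paths; a TRAIN loader pairs each pileup encoding with a zygosity label, while a TEST loader yields encodings only:

tp_reader = VCFReader(vcf="true_positives.vcf.gz", bams=["sample.bam"], is_fp=False)
fp_reader = VCFReader(vcf="false_positives.vcf.gz", bams=["sample.bam"], is_fp=True)

train_loader = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN,
                                    [tp_reader, fp_reader],
                                    batch_size=64,
                                    shuffle=True)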
Example #10
import os
import pathlib

import nemo

from variantworks.dataloader import ReadPileupDataLoader
from variantworks.encoders import PileupEncoder, ZygosityLabelDecoder
from variantworks.io.vcfio import VCFReader
from variantworks.networks import AlexNet

# Get VariantWorks root directory
repo_root_dir = pathlib.Path(__file__).parent.parent.parent.parent.absolute()

# Create neural factory. In this case, the checkpoint_dir has to be set for NeMo to pick
# up a pre-trained model.
nf = nemo.core.NeuralModuleFactory(
    placement=nemo.core.neural_factory.DeviceType.GPU, checkpoint_dir="./")

# Dataset generation is done in a similar manner. It's important to note that the encoder used
# for inference must match the one used for training.
encoding_layers = [PileupEncoder.Layer.READ, PileupEncoder.Layer.BASE_QUALITY]
pileup_encoder = PileupEncoder(window_size=100,
                               max_reads=100,
                               layers=encoding_layers)

# Neural Network
model = AlexNet(num_input_channels=len(encoding_layers), num_output_logits=3)

# Similar to training, a data loader needs to be set up for the relevant datasets. For
# inference it doesn't matter whether the files are tagged as false positive or not; each
# example is evaluated by the network. For simplicity, this example reuses the training dataset.
# Note: no label encoder is required for inference.
data_folder = os.path.join(repo_root_dir, "tests", "data")
bam = os.path.join(data_folder, "small_bam.bam")
labels = os.path.join(data_folder, "candidates.vcf.gz")
vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)
test_dataset = ReadPileupDataLoader(ReadPileupDataLoader.Type.TEST,
                                    [vcf_loader],
                                    shuffle=False,
                                    sample_encoder=pileup_encoder)
Example #11
import os
import pathlib
import tempfile

import h5py
import numpy as np

from variantworks.encoders import PileupEncoder, ZygosityLabelEncoder
from variantworks.io.vcfio import VCFReader

# Get VariantWorks root directory
repo_root_dir = pathlib.Path(__file__).parent.parent.parent.parent.absolute()

# Get BAM and VCF files for the raw sample data.
data_folder = os.path.join(repo_root_dir, "tests", "data")
bam = os.path.join(data_folder, "small_bam.bam")
samples = os.path.join(data_folder, "candidates.vcf.gz")

# Generate the variant entries using VCF reader.
vcf_reader = VCFReader(vcf=samples, bams=[bam], is_fp=False)
print("Serializing {} entries...".format(len(vcf_reader)))

# Setup encoder for samples and labels.
sample_encoder = PileupEncoder(window_size=100, max_reads=100, layers=[
    PileupEncoder.Layer.READ])
label_encoder = ZygosityLabelEncoder()

# Create HDF5 datasets.
_, output_file = tempfile.mkstemp(prefix='hdf5_generation_snippet_', suffix=".hdf5")
h5_file = h5py.File(output_file, "w")
encoded_data = h5_file.create_dataset("encodings",
                                      shape=(len(vcf_reader), sample_encoder.depth,
                                             sample_encoder.height, sample_encoder.width),
                                      dtype=np.float32, fillvalue=0)
label_data = h5_file.create_dataset("labels",
                                    shape=(len(vcf_reader),), dtype=np.int64, fillvalue=0)

# Loop through all entries, encode them and save them in HDF5.
for i, variant in enumerate(vcf_reader):
    encoding = sample_encoder(variant)