Пример #1
0
    def __getitem__(self, idx):
        """Assemble the model inputs and metadata for the interval at ``idx``.

        The interval is read from ``self.bt``. When its width differs from
        ``self.SEQ_WIDTH`` it is re-centred to exactly that width. The
        sequence array is concatenated along the last axis with the
        mappability and DNase tracks; a copy reversed along the position axis
        is built the same way.

        Args:
          idx: index into ``self.bt`` and into every bed of
            ``self.overlap_beds``.

        Returns:
          dict with
            - "inputs": ``[forward array, reversed array, meta features]``,
              where the meta features are ``self.meta_feat`` with the boolean
              gencode overlap flags appended,
            - "targets": an empty dict (no targets),
            - "metadata": the forward ("ranges") and minus-strand
              ("ranges_rc") GenomicRanges of the interval.
        """
        if self.fasta_extractor is None:
            # Extractors are created lazily, on the first item access.
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            self.dnase_extractor = BigwigExtractor(self.dnase_file)
            self.mappability_extractor = BigwigExtractor(self.mappability_file)

        # Fetch the interval and force it to SEQ_WIDTH around its centre.
        interval = self.bt[idx]
        if interval.stop - interval.start != self.SEQ_WIDTH:
            center = (interval.start + interval.stop) // 2
            interval.start = center - self.SEQ_WIDTH // 2
            # The `% 2` term keeps odd widths exact.
            interval.end = center + self.SEQ_WIDTH // 2 + self.SEQ_WIDTH % 2

        # One boolean flag per overlap bed.
        gencode_counts = np.array(
            [bed[idx].count for _, bed in self.overlap_beds], dtype=bool)

        # Sequence: drop the batch axis, then reverse both axes
        # (the "RC version" according to the original comment).
        seq = np.squeeze(self.fasta_extractor([interval]), axis=0)
        seq_rc = seq[::-1, ::-1]

        # DNase track. `axis=0` belongs to np.squeeze: previously it was
        # passed to the extractor call by mistake, so every singleton axis
        # was squeezed instead of only the batch axis.
        dnase = np.squeeze(self.dnase_extractor([interval]),
                           axis=0)[:, np.newaxis]
        dnase[np.isnan(dnase)] = 0  # NA fill
        dnase_rc = dnase[::-1]

        # Mappability track, handled exactly like the DNase track.
        mappability = np.squeeze(self.mappability_extractor([interval]),
                                 axis=0)[:, np.newaxis]
        mappability[np.isnan(mappability)] = 0  # NA fill
        mappability_rc = mappability[::-1]

        bigwig_list = [seq, mappability, dnase]
        bigwig_rc_list = [seq_rc, mappability_rc, dnase_rc]

        ranges = GenomicRanges.from_interval(interval)
        ranges_rc = GenomicRanges.from_interval(interval)
        ranges_rc.strand = "-"

        return {
            "inputs": [
                np.concatenate(bigwig_list, axis=-1),  # stack along last axis
                np.concatenate(bigwig_rc_list, axis=-1),  # RC version
                np.append(self.meta_feat, gencode_counts)
            ],
            "targets": {},  # No Targets
            "metadata": {
                "ranges": ranges,
                "ranges_rc": ranges_rc
            }
        }
Пример #2
0
def test__overlap_vcf_region():
    """Check ``sp._overlap_vcf_region`` against records read from the VCF.

    Two scenarios are exercised, each with the regions given both as a plain
    dict and as a ``GenomicRanges`` object:

    1. a single region: every returned variant must match the records read
       directly from the file, and every region index must be 0;
    2. three regions, with and without indel exclusion: the non-indel
       variants and the region indices must match the hand-built references.
    """
    vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(
        "examples/rbp/example_files/variants.vcf")

    # First pass over the file: collect every record as the reference.
    reader = cyvcf2.VCF(vcf_path)
    all_records = [rec for rec in reader]
    reader.close()

    # Fresh handle used for all region queries below.
    vcf_obj = cyvcf2.VCF(vcf_path)

    def as_genomic_ranges(spec):
        # Same region specification, wrapped in the GenomicRanges class.
        return GenomicRanges(spec["chr"], spec["start"], spec["end"],
                             spec["id"])

    # --- Scenario 1: one region ---
    single_region = {
        "chr": ["chr22"],
        "start": [21541589],
        "end": [36702137],
        "id": [0]
    }
    for regions in (single_region, as_genomic_ranges(single_region)):
        found_vars, overlapping_region = sp._overlap_vcf_region(
            vcf_obj, regions, exclude_indels=False)
        assert all([
            str(expected) == str(found)
            for expected, found in zip(all_records, found_vars)
        ])
        assert len(overlapping_region) == len(found_vars)
        assert all([region_id == 0 for region_id in overlapping_region])

    # --- Scenario 2: three regions ---
    multi_region = {
        "chr": ["chr22", "chr22", "chr22"],
        "start": [21541589, 21541589, 30630220],
        "end": [36702137, 21541590, 30630222],
        "id": [0, 1, 2]
    }

    # Expected variants and region ids when indels are kept ...
    plus_indel_results = all_records + all_records[:1] + all_records[3:4]
    ref_lines_indel = [0] * len(all_records) + [1] + [2]
    # ... and when they are excluded.
    snv_results = [rec for rec in plus_indel_results if not rec.is_indel]
    snv_ref_lines = [
        region_id
        for region_id, rec in zip(ref_lines_indel, plus_indel_results)
        if not rec.is_indel
    ]

    cases = [
        (False, plus_indel_results, ref_lines_indel),
        (True, snv_results, snv_ref_lines),
    ]
    for regions in (multi_region, as_genomic_ranges(multi_region)):
        for exclude_indels, ref_res, ref_lines in cases:
            found_vars, overlapping_region = sp._overlap_vcf_region(
                vcf_obj, regions, exclude_indels)
            # Only non-indel reference records are compared textually.
            assert all([
                str(expected) == str(found)
                for expected, found in zip(ref_res, found_vars)
                if not expected.is_indel
            ])
            assert overlapping_region == ref_lines
Пример #3
0
    def compatible_with_batch(self, batch, verbose=True):
        """Check whether ``batch`` is compatible with this metadata spec.

        Args:
          batch: the batch to check. For the genomic-ranges type this is a
            ``GenomicRanges`` object or anything ``GenomicRanges.from_dict``
            accepts; for every other type it must be a numpy array whose
            first axis is the batch axis.
          verbose: when true, print the reason for a failed check.

        Returns:
          True if the batch is compatible, False otherwise.
        """

        def report(reason):
            # Print the failure reason, preceded by a fixed header line.
            if not verbose:
                return
            print("MetadataStruct mismatch")
            print(reason)

        # Custom classes
        if self.type == MetadataType.GENOMIC_RANGES:
            if isinstance(batch, GenomicRanges):
                return True
            # TODO - do we strictly require the GenomicRanges class?
            #          - relates to metadata.py TODO about numpy_collate
            #        For now it is enough that the batch can be converted
            #        to the GenomicRanges class without any errors.
            try:
                GenomicRanges.from_dict(batch)
            except Exception as err:
                report("expecting a GenomicRanges object or a GenomicRanges-like dict")
                report("convertion error: {0}".format(err))
                return False
            return True

        # Everything else must be a numpy array ...
        if not isinstance(batch, np.ndarray):
            report("Expecting a np.ndarray. Got type(batch) = {0}".format(type(batch)))
            return False

        # ... with at least the batch dimension.
        if batch.ndim < 1:
            report("The array is a scalar (expecting at least the batch dimension)")
            return False

        # Shape of a single element (batch axis removed).
        bshape = batch.shape[1:]

        # Scalar types: each element must be a scalar or a length-1 vector.
        scalar_types = {MetadataType.INT, MetadataType.STR, MetadataType.FLOAT}
        if self.type in scalar_types and bshape not in ((), (1,)):
            report("expecting a scalar, got an array with shape (without the batch axis): {0}".format(bshape))
            return False

        # Array types: no further checks.
        return True
Пример #4
0
    def __getitem__(self, idx):
        """Return the sequence array, targets and ranges for item ``idx``.

        The interval is read from ``self.bt``. When its width differs from
        ``self.SEQ_WIDTH`` it is re-centred to exactly that width.

        Args:
          idx: index into ``self.bt`` (and row position in ``self.targets``
            when targets are present).

        Returns:
          dict with "inputs" (the extracted sequence array with its two axes
          swapped and a singleton axis inserted in the middle), "targets"
          (row ``idx`` of ``self.targets`` as an array, or an empty dict when
          no targets are set) and "metadata" holding the interval's
          GenomicRanges under "ranges".
        """
        if self.fasta_extractor is None:
            # Created lazily, on the first item access.
            self.fasta_extractor = FastaExtractor(self.fasta_file)
        interval = self.bt[idx]

        if interval.stop - interval.start != self.SEQ_WIDTH:
            center = (interval.start + interval.stop) // 2
            interval.start = center - self.SEQ_WIDTH // 2
            # The `% 2` term keeps odd widths exact.
            interval.end = center + self.SEQ_WIDTH // 2 + self.SEQ_WIDTH % 2

        if self.targets is not None:
            y = self.targets.iloc[idx].values
        else:
            y = {}

        # Run the fasta extractor and drop the batch axis.
        seq = np.squeeze(self.fasta_extractor([interval]), axis=0)
        # Reformat so that it matches the DeepSEA shape
        # (per the original author's comment).
        seq = np.swapaxes(seq, 1, 0)[:, None, :]
        return {
            "inputs": seq,
            "targets": y,
            "metadata": {
                "ranges": GenomicRanges.from_interval(interval)
            }
        }
Пример #5
0
    def __getitem__(self, idx):
        """Return the input sequence and metadata of branch ``idx``.

        Returns:
          dict with
            - "inputs": ``{"bidirectional_1_input": branch.seq}``
            - "metadata": gene/transcript ids, chromosome, strand, biotype,
              the 0-based start and the stop coordinate (usable for writing
              bed files), and the same coordinates as a GenomicRanges object
              under "ranges".
        """
        branch = self.branches[idx]

        inputs = {'bidirectional_1_input': branch.seq}

        metadata = {
            'geneID': branch.geneID,
            'transcriptID': branch.transcriptID,
            'chrom': branch.chrom,
            'strand': branch.strand,
            'start': branch.grange[0] - 1,  # use 0-base indexing
            'stop': branch.grange[1],
            'biotype': branch.biotype,
            'ranges': GenomicRanges(
                branch.chrom,
                branch.grange[0] - 1,  # use 0-base indexing
                branch.grange[1],
                branch.geneID + "_" + branch.transcriptID,
                branch.strand),
        }

        return {'inputs': inputs, 'metadata': metadata}
Пример #6
0
    def __getitem__(self, idx):
        """Return the sequence array, targets and ranges for item ``idx``.

        Returns:
          dict with "inputs" (the extracted sequence with its two axes
          swapped and a singleton axis inserted at position 1), "targets"
          (row ``idx`` of ``self.targets``, or an empty dict when no targets
          are set) and "metadata" holding the interval's GenomicRanges.

        Raises:
          ValueError: if the interval is not exactly ``self.SEQ_WIDTH`` wide.
        """
        # The extractor is created lazily, on the first item access.
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)

        interval = self.bt[idx]
        width = interval.stop - interval.start
        if width != self.SEQ_WIDTH:
            raise ValueError(
                "Expected the interval to be {0} wide. Recieved stop - start = {1}"
                .format(self.SEQ_WIDTH, width))

        y = {}
        if self.targets is not None:
            y = self.targets.iloc[idx].values

        # Extract, drop the batch axis, then rearrange the axes.
        batch = self.fasta_extractor([interval])
        swapped = np.swapaxes(np.squeeze(batch, axis=0), 1, 0)
        seq = np.expand_dims(swapped, axis=1)

        metadata = {"ranges": GenomicRanges.from_interval(interval)}
        return {"inputs": seq, "targets": y, "metadata": metadata}
Пример #7
0
    def __getitem__(self, idx):
        """Return inputs and metadata for gene or splice site ``idx``.

        When ``self.MISO_AS`` is set the item is a gene from ``self.genes``;
        otherwise it is a splice site from ``self.spliceSites``.

        Returns:
          dict with "inputs" (the extracted sequence) and "metadata". Gene
          metadata holds name, chromosome, strand, start and stop. Splice
          site metadata holds gene/transcript ids, biotype, order and a
          GenomicRanges object with a 0-based start.
        """
        # The fasta file is opened lazily, on the first item access.
        if self.fasta is None:
            self.fasta = FastaFile(self.fasta_file)

        if self.MISO_AS:
            gene = self.genes[idx]
            return {
                'inputs': self.get_seq(gene),
                'metadata': {
                    'geneName': gene.geneName,
                    'chrom': gene.chrom,
                    'strand': gene.strand,
                    'start': gene.start,
                    'stop': gene.stop,
                },
            }

        site = self.spliceSites[idx]
        return {
            'inputs': site.get_seq(self.fasta),
            'metadata': {
                'geneID': site.geneID,
                'transcriptID': site.transcriptID,
                'biotype': site.biotype,
                'order': site.order,
                'ranges': GenomicRanges(
                    site.chrom,
                    site.grange[0] - 1,  # use 0-base indexing
                    site.grange[1],
                    site.geneID,
                    site.strand),
            },
        }
Пример #8
0
    def __getitem__(self, idx):
        """Return the sequence, labels and ranges for bed entry ``idx``.

        When ``self.auto_resize_len`` is truthy the interval is resized to
        that length around its centre before extraction.

        Returns:
          dict with "inputs" (the extracted sequence wrapped in
          ``np.array``), "targets" (the labels from the bed entry) and
          "metadata" holding a GenomicRanges whose id is ``str(idx)``.
        """
        # The extractor is created lazily; strand handling is switched off.
        if self.fasta_extractors is None:
            self.fasta_extractors = FastaStringExtractor(
                self.fasta_file,
                use_strand=False,  # self.use_strand,
                force_upper=self.force_upper)

        interval, labels = self.bed[idx]

        resize_to = self.auto_resize_len
        if resize_to:
            interval = resize_interval(interval, resize_to, anchor='center')

        # NOTE: an upper bound check on the interval length (max_seq_len)
        # was left disabled by the original authors.

        seq = self.fasta_extractors.extract(interval)

        inputs = np.array(seq)
        ranges = GenomicRanges(interval.chrom, interval.start, interval.stop,
                               str(idx))
        return {
            "inputs": inputs,
            "targets": labels,
            "metadata": {"ranges": ranges},
        }
Пример #9
0
    def __getitem__(self, idx):
        """Return the sequence array, target and ranges for item ``idx``.

        The target is parsed from the interval's name field as a float; it
        is an empty dict when the name is ``None``.

        Raises:
          ValueError: if the interval is not exactly ``self.SEQ_WIDTH`` wide.
        """
        # The extractor is created lazily, on the first item access.
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)

        interval = self.bt[idx]
        width = interval.stop - interval.start
        if width != self.SEQ_WIDTH:
            raise ValueError("Expected the interval to be {0} wide. Recieved stop - start = {1}".
                             format(self.SEQ_WIDTH, width))

        y = {}
        if interval.name is not None:
            y = np.array([float(interval.name)])

        # Extract the sequence and drop every singleton axis.
        batch = self.fasta_extractor([interval])
        seq = np.squeeze(batch)

        metadata = {"ranges": GenomicRanges.from_interval(interval)}
        return {
            "inputs": {"data/genome_data_dir": seq},
            "targets": y,
            "metadata": metadata,
        }
Пример #10
0
    def __getitem__(self, idx):
        """Return the upper-cased sequence string and ranges for item ``idx``.

        The sequence is fetched from ``self.fasta`` and reverse complemented
        (via ``rc_str``) when the interval lies on the minus strand.

        Args:
          idx: index into ``self.bt``.

        Returns:
          dict with "inputs" (the sequence string) and "metadata" holding
          the interval's GenomicRanges under "ranges".

        Raises:
          ValueError: if the interval is wider than 1000bp. ValueError is a
            subclass of the generic ``Exception`` raised previously, so
            existing ``except Exception`` handlers still catch it.
        """
        interval = self.bt[idx]

        # Intervals can't be bigger than 1000bp
        if (interval.stop - interval.start) > 1000:
            raise ValueError("Input sequences should be at maximum 1000bp.")

        # Fetch the fasta line
        seq = self.fasta.fetch(str(interval.chrom), interval.start,
                               interval.stop).upper()

        # Reverse complement the input string when requested by the strand
        if interval.strand == "-":
            seq = rc_str(seq)

        return {
            "inputs": seq,
            "metadata": {
                "ranges": GenomicRanges.from_interval(interval)
            }
        }
Пример #11
0
 def __iter__(self):
     """Yield one sample per (interval, variant) pair from ``self.matcher``.

     Each sample is a dict with
       - "inputs": the encoded reference sequence ("ref_seq") and the
         encoded variant-carrying sequence ("alt_seq") of the interval,
       - "metadata": a description of the variant, the interval's
         GenomicRanges, and one entry per name in ``self.interval_attrs``
         (read from ``interval.attrs``, '' when missing).
     """
     for interval, variant in self.matcher:
         ref_seq = self.one_hot(self.reference_sequence.extract(interval))

         # The anchor position depends on the strand of the interval.
         anchor = 135 if interval.neg_strand else 70
         alt_seq = self.one_hot(
             self.variant_seq_extractor.extract(
                 interval,
                 [variant],
                 anchor=anchor,
             ))

         variant_info = {
             "chrom": variant.chrom,
             "start": variant.start,
             "end": variant.end,
             "ref": variant.ref,
             "alt": variant.alt,
             "id": variant.id,
             "str": str(variant),
         }

         metadata = {
             "variant": variant_info,
             "ranges": GenomicRanges.from_interval(interval),
         }
         # Extra interval attributes are merged in last, as before.
         metadata.update(
             {k: interval.attrs.get(k, '') for k in self.interval_attrs})

         yield {
             "inputs": {"ref_seq": ref_seq, "alt_seq": alt_seq},
             "metadata": metadata,
         }
Пример #12
0
    def __getitem__(self, idx):
        """Return sequence and distance inputs, target and ranges for ``idx``.

        Returns:
          dict with "inputs" (the sequence under "seq" plus every entry of
          the transformed landmark-distance dict, each with the batch axis
          removed), "targets" (only when ``self.target_dataset`` is set) and
          "metadata" holding the interval's GenomicRanges.

        Raises:
          ValueError: if the interval is not exactly ``self.SEQ_WIDTH`` wide.
        """
        # Both extractors are created lazily, on the first item access.
        if self.seq_extractor is None:
            self.seq_extractor = FastaExtractor(self.fasta_file)
            self.dist_extractor = DistToClosestLandmarkExtractor(
                gtf_file=self.gtf, landmarks=ALL_LANDMARKS)

        interval = self.bt[idx]
        width = interval.stop - interval.start
        if width != self.SEQ_WIDTH:
            raise ValueError("Expected the interval to be {0} wide. Recieved stop - start = {1}".
                             format(self.SEQ_WIDTH, width))

        # Input: sequence (batch axis dropped).
        inputs = {'seq': np.squeeze(self.seq_extractor([interval]), axis=0)}

        # Input: distances to landmarks (batch axis dropped per feature).
        distances = self.dist_transformer.transform(
            self.dist_extractor([interval]))
        for name, values in distances.items():
            inputs[name] = np.squeeze(values, axis=0)

        out = {'inputs': inputs}

        # Targets are optional.
        if self.target_dataset is not None:
            out["targets"] = np.array([self.target_dataset[idx]])

        out['metadata'] = {'ranges': GenomicRanges.from_interval(interval)}
        return out
Пример #13
0
    def __getitem__(self, idx):
        """Return the sequence array, target and ranges for item ``idx``.

        The interval must be exactly 1000bp wide (checked with an assert).
        The target is parsed from the interval's name field as a float; it
        is an empty dict when the name is ``None``.
        """
        interval = self.bt[idx]

        # Intervals need to be 1000bp wide
        assert interval.stop - interval.start == 1000

        y = {}
        if interval.name is not None:
            y = np.array([float(interval.name)])

        # Extract the sequence and drop every singleton axis.
        batch = self.fasta_extractor([interval])
        seq = np.squeeze(batch)

        inputs = {"data/genome_data_dir": seq}
        metadata = {"ranges": GenomicRanges.from_interval(interval)}
        return {"inputs": inputs, "targets": y, "metadata": metadata}
Пример #14
0
    def __getitem__(self, idx):
        """Return inputs, target and metadata for gene or splice site ``idx``.

        When ``self.MISO_AS`` is set the item is a gene from ``self.genes``:
        its target is looked up in ``self.Y`` by gene name (NaN when no
        target table is set) and the extracted regions are stored in the
        metadata. Otherwise the item is a splice site from
        ``self.spliceSites`` and no target is returned.
        """
        if self.MISO_AS:
            gene = self.genes[idx]
            inputs, ranges = self.get_seq(gene)

            if self.Y is None:
                target = np.nan
            else:
                target = self.Y.get_target(gene.geneName)

            return {
                'inputs': inputs,
                'targets': target,
                'metadata': {
                    'geneName': gene.geneName,
                    'chrom': gene.chrom,
                    'strand': gene.strand,
                    'start': gene.start,
                    'stop': gene.stop,
                    'extracted_regions': ranges,
                },
            }

        site = self.spliceSites[idx]
        return {
            'inputs': site.get_seq(self.fasta),
            'metadata': {
                'geneID': site.geneID,
                'transcriptID': site.transcriptID,
                'biotype': site.biotype,
                'order': site.order,
                'ranges': GenomicRanges(
                    site.chrom,
                    site.grange[0] - 1,  # use 0-base indexing
                    site.grange[1],
                    site.geneID,
                    site.strand),
            },
        }
Пример #15
0
    def __getitem__(self, idx):
        """Return the sequence array, labels and ranges for tsv row ``idx``.

        When ``self.auto_resize_len`` is truthy the interval is resized to
        that length before extraction. A missing strand is reported as "*".
        """
        # The extractor is created lazily, on the first item access.
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)

        interval, labels = self.tsv[idx]

        resize_to = self.auto_resize_len
        if resize_to:
            interval = resize_interval(interval, resize_to)

        # Extract the sequence and drop every singleton axis.
        seq = np.squeeze(self.fasta_extractor([interval]))

        strand = interval.strand
        if strand is None:
            strand = "*"

        ranges = GenomicRanges(chr=interval.chrom,
                               start=interval.start,
                               end=interval.stop,
                               id=str(idx),
                               strand=strand)

        return {
            "inputs": {"seq": seq},
            "targets": labels,
            "metadata": {
                "ranges": ranges,
                "interval_from_task": ''
            }
        }
Пример #16
0
    def __getitem__(self, idx):
        """Assemble the model inputs and metadata for the interval at ``idx``.

        The interval is read from ``self.bt``. When its width differs from
        ``self.SEQ_WIDTH`` it is re-centred to exactly that width. The
        sequence array is concatenated along the last axis with the
        mappability and DNase tracks; a copy reversed along the position axis
        is built the same way.

        Args:
          idx: index into ``self.bt``.

        Returns:
          dict with
            - "inputs": ``[forward array, reversed array, self.meta_feat]``,
            - "targets": an empty dict (no targets),
            - "metadata": the forward ("ranges") and minus-strand
              ("ranges_rc") GenomicRanges of the interval.
        """
        # Fetch the interval and force it to SEQ_WIDTH around its centre.
        interval = self.bt[idx]
        if interval.stop - interval.start != self.SEQ_WIDTH:
            center = (interval.start + interval.stop) // 2
            interval.start = center - self.SEQ_WIDTH // 2
            # The `% 2` term keeps odd widths exact.
            interval.end = center + self.SEQ_WIDTH // 2 + self.SEQ_WIDTH % 2

        # Sequence: drop the batch axis, then reverse both axes
        # (the "RC version" according to the original comment).
        seq = np.squeeze(self.fasta_extractor([interval]), axis=0)
        seq_rc = seq[::-1, ::-1]

        # DNase track. `axis=0` belongs to np.squeeze: previously it was
        # passed to the extractor call by mistake, so every singleton axis
        # was squeezed instead of only the batch axis.
        dnase = np.squeeze(self.dnase_extractor([interval]),
                           axis=0)[:, np.newaxis]
        dnase[np.isnan(dnase)] = 0  # NA fill
        dnase_rc = dnase[::-1]

        # Mappability track, handled exactly like the DNase track.
        mappability = np.squeeze(self.mappability_extractor([interval]),
                                 axis=0)[:, np.newaxis]
        mappability[np.isnan(mappability)] = 0  # NA fill
        mappability_rc = mappability[::-1]

        bigwig_list = [seq, mappability, dnase]
        bigwig_rc_list = [seq_rc, mappability_rc, dnase_rc]

        ranges = GenomicRanges.from_interval(interval)
        ranges_rc = GenomicRanges.from_interval(interval)
        ranges_rc.strand = "-"

        return {
            "inputs": [
                np.concatenate(bigwig_list, axis=-1),  # stack along last axis
                np.concatenate(bigwig_rc_list, axis=-1),  # RC version
                self.meta_feat
            ],
            "targets": {},  # No Targets
            "metadata": {
                "ranges": ranges,
                "ranges_rc": ranges_rc
            }
        }
Пример #17
0
def dl_batch():
    """Build a small three-sample batch with genomic-range metadata.

    Returns:
      dict with "inputs" (``np.arange(3)``) and "metadata" holding a
      GenomicRanges object for three unstranded "chr1" ranges plus a
      "gene_id" array of the string ids "0", "1" and "2".
    """
    n = 3
    ranges = GenomicRanges(
        chr=np.array(["chr1"] * n),
        start=np.arange(n) + 1,
        end=np.arange(n) + 5,
        id=np.arange(n).astype(str),
        strand=np.array(["*"] * n),
    )
    metadata = {
        "ranges": ranges,
        "gene_id": np.arange(n).astype(str),
    }
    return {"inputs": np.arange(n), "metadata": metadata}
Пример #18
0
 def __getitem__(self, idx):
     """Return the transformed sequence and metadata for anchor row ``idx``.

     The interval is built around the anchor row using the configured
     numbers of upstream and downstream positions.

     Returns:
       dict with "inputs" (the transformed sequence wrapped in ``np.array``)
       and "metadata" holding one entry per name in ``self._interval_attrs``
       ('' when the row lacks it) plus a GenomicRanges whose id is
       ``str(idx)``.
     """
     anchor_row = self._gtf_anchor.iloc[idx]
     interval = self._create_anchored_interval(
         anchor_row,
         num_upstream=self._num_upstream,
         num_downstream=self._num_downstream)

     encoded = self._transform(self._fa.extract(interval))

     metadata = {}
     for attr in self._interval_attrs:
         metadata[attr] = anchor_row.get(attr, '')
     metadata["ranges"] = GenomicRanges(interval.chrom, interval.start,
                                        interval.stop, str(idx))

     return {"inputs": np.array(encoded), "metadata": metadata}
Пример #19
0
    def __getitem__(self, idx):
        """Return the sequence, per-key bigwig totals and ranges for ``idx``.

        The interval is resized to 1000bp for sequence extraction. A copy
        resized to ``self.track_width`` is used to read the bigwig tracks.

        Returns:
          dict with "inputs" (the sequence under "seq"), "targets" (for each
          key of ``self.bigwigs``, the total signal over the wide interval
          summed across that key's bigwig files) and "metadata" holding both
          ranges and the interval name.
        """
        # All extractors are created lazily, on the first item access.
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            self.bigwig_extractors = {
                key: [BigwigExtractor(path) for path in self.bigwigs[key]]
                for key in self.bigwigs
            }

        interval, labels = self.tsv[idx]
        interval = resize_interval(interval, 1000)
        # Intervals need to be 1000bp wide
        assert interval.stop - interval.start == 1000

        # Extract the sequence and drop every singleton axis.
        seq = np.squeeze(self.fasta_extractor([interval]))

        # Work on a copy so the narrow interval is left untouched.
        interval_wide = resize_interval(deepcopy(interval), self.track_width)

        targets = {}
        for key in self.bigwig_extractors:
            per_file = [
                extract([interval_wide])[0]
                for extract in self.bigwig_extractors[key]
            ]
            # Add the tracks element-wise, then sum over all positions.
            targets[key] = sum(per_file).sum()

        metadata = {
            "ranges": GenomicRanges(interval.chrom, interval.start,
                                    interval.stop, str(idx)),
            "ranges_wide": GenomicRanges.from_interval(interval_wide),
            "name": interval.name,
        }

        return {
            "inputs": {"seq": seq},
            "targets": targets,
            "metadata": metadata,
        }
Пример #20
0
    def __getitem__(self, idx):
        """Return the encoded sequence, its reversed copy and both ranges.

        The interval is resized to 500bp around its centre, the sequence is
        extracted and passed through ``self.transform`` (one-hot encoding,
        per the original comment), and a copy reversed along both axes is
        added.

        Args:
          idx: index into ``self.bt``, which yields ``(interval, targets)``.

        Returns:
          dict with "inputs" (``[encoded, reversed encoded]``) and
          "metadata" (a list of two GenomicRanges built from the interval).
        """
        # Create the extractor once and reuse it. It used to be rebuilt on
        # every item access. `getattr` is used because the attribute may not
        # be initialised elsewhere in the class.
        if getattr(self, "fasta_extractor", None) is None:
            self.fasta_extractor = FastaStringExtractor(self.fasta_file)

        # get the intervals (the targets are not part of the output)
        interval, targets = self.bt[idx]

        # resize to 500bp
        interval = resize_interval(interval, 500, anchor='center')

        # extract the sequence
        seq = self.fasta_extractor.extract(interval)

        # encode the sequence and build the copy reversed along both axes
        seq_onehot = self.transform(seq)
        seq_onehot_rc = seq_onehot[::-1, ::-1]

        ranges = GenomicRanges.from_interval(interval)
        ranges_rc = GenomicRanges.from_interval(interval)

        return {
            "inputs": [seq_onehot, seq_onehot_rc],
            "metadata": [ranges, ranges_rc]
        }
Пример #21
0
    def __next__(self):
        """Return the next splice-site sample from ``self.exonGenerator``.

        The sequence is upper-cased. When ``self.split_seq`` is set, only the
        first "donor" piece returned by ``self.split`` is kept.

        Returns:
          dict with "inputs" (``{"ss": sequence}``) and "metadata" holding
          the exon's GenomicRanges with the transcript id as its id.
        """
        site = next(self.exonGenerator)

        seq = site.get_seq(self.fasta).upper()
        if self.split_seq:
            pieces = self.split(seq, site.overhang)
            seq = pieces['donor'][0]

        ranges = GenomicRanges(site.chrom, site.Exon_Start, site.Exon_End,
                               site.transcript_id, site.strand)

        return {
            'inputs': {'ss': seq},
            'metadata': {'ranges': ranges},
        }
Пример #22
0
    def __getitem__(self, idx):
        """Return the sequence array and ranges for item ``idx``.

        Returns:
          dict with "inputs" (the extracted sequence with the batch axis
          removed), empty "targets" and "metadata" holding the interval's
          GenomicRanges.

        Raises:
          ValueError: if the interval is not exactly ``self.SEQ_WIDTH`` wide.
        """
        interval = self.bt[idx]

        width = interval.stop - interval.start
        if width != self.SEQ_WIDTH:
            raise ValueError("Expected the interval to be {0} wide. Recieved stop - start = {1}".
                             format(self.SEQ_WIDTH, width))

        # Extract the sequence and drop the batch axis.
        batch = self.fasta_extractor([interval])
        seq = np.squeeze(batch, axis=0)

        metadata = {"ranges": GenomicRanges.from_interval(interval)}
        return {
            "inputs": seq,
            "targets": {},  # No Targets
            "metadata": metadata,
        }
Пример #23
0
    def __getitem__(self, idx):
        """Return the sequence array, targets and ranges for item ``idx``.

        Returns:
          dict with "inputs" (the extracted sequence with the batch axis
          removed), "targets" (row ``idx`` of ``self.targets``, or an empty
          dict when no targets are set) and "metadata" holding the
          interval's GenomicRanges.
        """
        interval = self.bt[idx]

        y = {}
        if self.targets is not None:
            y = self.targets.iloc[idx].values

        # Extract the sequence and drop the batch axis.
        batch = self.fasta_extractor([interval])
        seq = np.squeeze(batch, axis=0)

        metadata = {"ranges": GenomicRanges.from_interval(interval)}
        return {"inputs": seq, "targets": y, "metadata": metadata}
Пример #24
0
 def __getitem__(self, idx):
     """Return the encoded upstream sequence of start codon ``idx``.

     Returns:
       dict with "inputs" (the upstream sequence passed through
       ``self.input_transform``) and "metadata" holding the interval's
       GenomicRanges plus the feature's gene id, transcript id and gene
       biotype ("" when the attribute is missing).
     """
     # The extractor is created lazily: strand-aware, upper-cased output.
     if self.fasta_extractor is None:
         self.fasta_extractor = FastaStringExtractor(self.fasta_file,
                                                     use_strand=True,
                                                     force_upper=True)

     feature = self.start_codons[idx]
     interval = get_upstream(feature, self.n_upstream)
     encoded = self.input_transform(self.fasta_extractor.extract(interval))

     def first_value(attribute):
         # Attribute values are sequences; take the first, default "".
         return feature.attributes.get(attribute, [""])[0]

     metadata = {
         "ranges": GenomicRanges.from_interval(interval),
         "gene_id": first_value('gene_id'),
         "transcript_id": first_value('transcript_id'),
         "gene_biotype": first_value('gene_biotype'),
     }
     return {"inputs": encoded, "metadata": metadata}
Пример #25
0
    def __getitem__(self, idx):
        """Return the sequence array, labels and ranges for tsv row ``idx``.

        The interval must be exactly 1000bp wide (checked with an assert).
        """
        # The extractor is created lazily, on the first item access.
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)

        interval, labels = self.tsv[idx]

        # Intervals need to be 1000bp wide
        assert interval.stop - interval.start == 1000

        # Extract the sequence and drop every singleton axis.
        batch = self.fasta_extractor([interval])
        seq = np.squeeze(batch)

        ranges = GenomicRanges(interval.chrom, interval.start, interval.stop,
                               str(idx))
        return {
            "inputs": {"data/genome_data_dir": seq},
            "targets": labels,
            "metadata": {"ranges": ranges},
        }
Пример #26
0
    def __iter__(self):
        """Yield one sample per row of ``self.regions_of_interest``.

        Each sample is a dict with "inputs" (the encoded reference sequence
        of the row's interval) and "metadata" holding the interval's
        GenomicRanges plus one entry per name in ``self.interval_attrs``,
        read from the row.
        """
        regions = self.regions_of_interest.as_df()
        for _, row in regions.iterrows():
            interval = Interval(
                chrom=row["Chromosome"],
                start=row["Start"],
                end=row["End"],
                strand=row["Strand"],
            )
            encoded = self.one_hot(self.reference_sequence.extract(interval))

            metadata = {"ranges": GenomicRanges.from_interval(interval)}
            # Extra row attributes are merged in last, as before.
            metadata.update({k: row[k] for k in self.interval_attrs})

            yield {"inputs": encoded, "metadata": metadata}
Пример #27
0
    def __getitem__(self, idx):
        """Return the sequence array, targets and ranges for item ``idx``.

        The interval must be exactly 101bp wide (checked with an assert).
        Targets are row ``idx`` of ``self.targets``, or an empty dict when no
        targets are set.
        """
        interval = self.bt[idx]

        # Intervals need to be 101bp wide
        assert interval.stop - interval.start == 101

        y = {}
        if self.targets is not None:
            y = self.targets.iloc[idx].values

        # Extract the sequence and drop every singleton axis.
        batch = self.fasta_extractor([interval])
        seq = batch.squeeze()

        metadata = {"ranges": GenomicRanges.from_interval(interval)}
        return {"inputs": seq, "targets": y, "metadata": metadata}
Пример #28
0
    def __getitem__(self, idx):
        """Return the encoded sequence, targets and ranges for item ``idx``.

        The fasta record is looked up by the id derived from the interval,
        and the slice given by the interval's relative coordinates is
        encoded with ``encodeDNA``.
        """
        # The fasta file is opened lazily, on the first item access.
        if self.fasta_extractor is None:
            self.fasta_extractor = Fasta(self.fasta_file)

        interval = self.bt[idx]
        record_id = self._interval_to_fasta_id(interval)

        y = {}
        if self.targets is not None:
            y = self.targets.iloc[idx].values

        # Slice the sequence out of the matching fasta record.
        start, end = self._compute_relative_coords(interval)
        record = self.fasta_extractor[record_id]
        seq = record[start:end].seq

        # Encode as a one-element batch, then drop the singleton axes.
        encoded = encodeDNA([seq]).squeeze()

        metadata = {"ranges": GenomicRanges.from_interval(interval)}
        return {"inputs": encoded, "targets": y, "metadata": metadata}
Пример #29
0
    def __getitem__(self, idx):
        """Return the encoded sequence, targets and ranges for item ``idx``.

        The interval must be exactly 1000bp wide (checked with an assert).
        Targets are row ``idx`` of ``self.targets``, or an empty dict when no
        targets are set.
        """
        # The extractor is created lazily, on the first item access.
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaStringExtractor(self.fasta_file)

        interval = self.bt[idx]

        # Intervals need to be 1000bp wide
        assert interval.stop - interval.start == 1000

        y = {}
        if self.targets is not None:
            y = self.targets.iloc[idx].values

        # Extract the sequence string and encode it as float32.
        # TODO: Remove additional dtype after kipoiseq gets a new release
        sequence = self.fasta_extractor.extract(interval)
        seq = one_hot_dna(sequence, dtype=np.float32)

        metadata = {"ranges": GenomicRanges.from_interval(interval)}
        return {"inputs": seq, "targets": y, "metadata": metadata}
Пример #30
0
    def __getitem__(self, idx):
        """Return the sequence array, targets and ranges for item ``idx``.

        Returns:
          dict with "inputs" (the extracted sequence with its two axes
          swapped and a trailing singleton axis added), "targets" (row
          ``idx`` of ``self.targets``, or an empty dict when no targets are
          set) and "metadata" holding the interval's GenomicRanges.

        Raises:
          ValueError: if the interval is not exactly ``self.SEQ_WIDTH`` wide.
        """
        interval = self.bt[idx]

        width = interval.stop - interval.start
        if width != self.SEQ_WIDTH:
            raise ValueError(
                "Expected the interval to be {0} wide. Recieved stop - start = {1}"
                .format(self.SEQ_WIDTH, width))

        y = {}
        if self.targets is not None:
            y = self.targets.iloc[idx].values

        # Extract the sequence and drop the batch axis.
        batch = self.fasta_extractor([interval])
        seq = np.squeeze(batch, axis=0)
        # Reformat so that it matches the Basset shape
        # (per the original author's comment).
        swapped = np.swapaxes(seq, 1, 0)
        seq = swapped[:, :, None]

        metadata = {"ranges": GenomicRanges.from_interval(interval)}
        return {"inputs": seq, "targets": y, "metadata": metadata}