예제 #1
0
def test_ambiguous_mask2(tmpdir):
    """Check ``ambiguous_mask=-1`` on a file whose last row is labelled -1/-1.

    The test expects two of the three rows to remain and every remaining row
    to have a non-negative maximum label.
    """
    rows = (
        'chr1\t1\t2\t1\t0\n'
        'chr2\t1\t3\t0\t1\n'
        'chr3\t1\t3\t-1\t-1'
    )
    dataset = BedDataset(write_tmp(rows, tmpdir), ambiguous_mask=-1)

    assert len(dataset) == 2
    per_row_max = dataset.get_targets().max(axis=1)
    assert np.all(per_row_max >= 0)
예제 #2
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 bigwigs,
                 track_width=2000,
                 incl_chromosomes=None,
                 excl_chromosomes=None,
                 num_chr_fasta=False):
        """Store the configuration and load the intervals through BedDataset.

        Args:
            intervals_file: intervals file; read as a 4-column BED file.
            fasta_file: stored on the instance, not opened here.
            bigwigs: stored on the instance, not opened here.
            track_width: stored on the instance.
            incl_chromosomes: forwarded to BedDataset.
            excl_chromosomes: forwarded to BedDataset.
            num_chr_fasta: forwarded to BedDataset as ``num_chr``.
        """
        # Plain configuration, kept verbatim on the instance.
        self.num_chr_fasta = num_chr_fasta
        self.intervals_file = intervals_file
        self.fasta_file = fasta_file
        self.bigwigs = bigwigs
        self.incl_chromosomes = incl_chromosomes
        self.excl_chromosomes = excl_chromosomes
        self.track_width = track_width

        # Targets in the intervals file are ignored (ignore_targets=True).
        bed_options = dict(
            num_chr=self.num_chr_fasta,
            bed_columns=4,
            ignore_targets=True,
            incl_chromosomes=incl_chromosomes,
            excl_chromosomes=excl_chromosomes,
        )
        self.tsv = BedDataset(self.intervals_file, **bed_options)

        # No extractor is created in the constructor.
        self.fasta_extractor = None
        self.bigwig_extractors = None
예제 #3
0
def test_more_columns(tmpdir):
    """Check a file that has a name column between the coordinates and labels.

    Construction is expected to raise unless ``bed_columns=4`` is given; with
    it, the fourth column is exposed as the interval name.
    """
    content = (
        'chr1\t1\t2\tinterval1\t1\t0\n'
        'chr2\t1\t3\tinterval2\t0\t1'
    )
    bed_file = write_tmp(content, tmpdir)

    # Without bed_columns=4 the construction is expected to fail.
    with pytest.raises(Exception):
        BedDataset(bed_file, label_dtype=bool)

    bt = BedDataset(bed_file, bed_columns=4, label_dtype=bool)
    for idx, expected_name in enumerate(('interval1', 'interval2')):
        assert bt[idx][0].name == expected_name

    # Same expectation when no label dtype is given either.
    with pytest.raises(Exception):
        BedDataset(bed_file)
예제 #4
0
def test_ambiguous_mask(tmpdir):
    """Check that a row labelled ``0, -1`` is kept with and without the mask.

    The third row holds one regular label (0) and one -1 label; the test
    expects it to be present in both configurations.
    """
    bed_file = write_tmp(
        'chr1\t1\t2\t1\t0\n'
        'chr2\t1\t3\t0\t1\n'
        'chr3\t1\t3\t0\t-1',
        tmpdir,
    )

    # No mask configured.
    plain = BedDataset(bed_file)
    assert len(plain) == 3
    assert np.all(plain[2][1] == np.array([0, -1]))

    # Mask configured: same length and same labels for the third row.
    masked = BedDataset(bed_file, ambiguous_mask=-1)
    assert len(masked) == 3
    assert np.all(masked[2][1] == np.array([0, -1]))
    row_max = masked.get_targets().max(axis=1)
    assert np.all(row_max >= 0)
예제 #5
0
def test_incl_excl_chromosomes(tmpdir):
    """Check the ``incl_chromosomes`` and ``excl_chromosomes`` filters.

    The file holds one row each for chr1, chr2 and chr3.
    """
    bed_file = write_tmp(
        'chr1\t1\t2\t1\t0\n'
        'chr2\t1\t3\t0\t1\n'
        'chr3\t1\t3\t0\t1',
        tmpdir,
    )

    # No filter: all three rows.
    unfiltered = BedDataset(bed_file)
    assert len(unfiltered) == 3

    # Include list: only the chr1 row is expected.
    only_chr1 = BedDataset(bed_file, incl_chromosomes=['chr1'])
    assert len(only_chr1) == 1
    assert only_chr1[0][0] == Interval("chr1", 1, 2)

    # Exclude list: the chr1 row is expected to be gone, chr2 comes first.
    without_chr1 = BedDataset(bed_file, excl_chromosomes=['chr1'])
    assert len(without_chr1) == 2
    assert without_chr1[0][0] == Interval("chr2", 1, 3)
예제 #6
0
def test_bed3_labels(tmpdir):
    """Check a file with three coordinate columns followed by two labels."""
    bt = BedDataset(write_tmp('chr1\t1\t2\t1\t0\nchr1\t1\t3\t0\t1', tmpdir))

    # Whole-dataset properties.
    assert np.all(bt.get_targets() == np.array([[1, 0], [0, 1]]))
    assert len(bt) == 2
    assert bt.n_tasks == 2
    assert np.all(bt.df[0] == 'chr1')

    # Per-row access: item 0 is the interval, item 1 the label vector.
    expected_rows = (
        (Interval("chr1", 1, 2), np.array([1, 0])),
        (Interval("chr1", 1, 3), np.array([0, 1])),
    )
    for idx, (want_interval, want_labels) in enumerate(expected_rows):
        assert bt[idx][0] == want_interval
        assert np.all(bt[idx][1] == want_labels)

    # Re-check the length after the rows have been indexed.
    assert len(bt) == 2
예제 #7
0
 def __init__(self, intervals_file, fasta_file, ignore_targets=True):
     """Load the intervals as a 3-column BED file and keep the fasta path.

     Args:
         intervals_file: intervals file handed to BedDataset.
         fasta_file: stored on the instance, not opened here.
         ignore_targets: forwarded to BedDataset.
     """
     bed_options = {"bed_columns": 3, "ignore_targets": ignore_targets}
     self.bt = BedDataset(intervals_file, **bed_options)

     self.fasta_file = fasta_file
     # No extractor is created in the constructor.
     self.fasta_extractor = None
     # Transform used to one-hot encode the DNA sequence.
     self.transform = OneHot()
예제 #8
0
def test_bed3(tmpdir):
    """Check a file with coordinates only: no tasks, empty labels per row."""
    bt = BedDataset(write_tmp('chr1\t1\t2\nchr1\t1\t3', tmpdir))

    assert bt.n_tasks == 0
    assert len(bt) == 2
    assert np.all(bt.df[0] == 'chr1')

    # Each item is expected to be (interval, {}) when there are no labels.
    for idx, end in enumerate((2, 3)):
        assert bt[idx] == (Interval("chr1", 1, end), {})
예제 #9
0
def test_tsvreader(tsv_file, num_chr, label_dtype):
    """Check the first item of a dataset built from ``tsv_file``.

    ``tsv_file``, ``num_chr`` and ``label_dtype`` are supplied from outside
    this function (presumably pytest fixtures or parametrization).
    """
    dataset = BedDataset(tsv_file, label_dtype=label_dtype, num_chr=num_chr)
    first_interval, first_labels = dataset[0]

    assert isinstance(first_interval, Interval)
    # The "chr" prefix is only checked when num_chr is falsy.
    assert num_chr or first_interval.chrom.startswith("chr")
    assert isinstance(first_labels[0], label_dtype)
    assert first_interval.start == 2
    assert first_interval.end == 4
예제 #10
0
def test_label_dtype(tmpdir):
    """Check that ``label_dtype=bool`` sets the dtype of the labels."""
    content = 'chr1\t1\t2\t1\t0\nchr2\t1\t3\t0\t1'
    dataset = BedDataset(write_tmp(content, tmpdir), label_dtype=bool)

    assert len(dataset) == 2
    # Both the per-row labels and the full target matrix are checked.
    first_labels = dataset[0][1]
    assert first_labels.dtype == bool
    assert dataset.get_targets().dtype == bool
예제 #11
0
def test_num_chr(tmpdir):
    """Check that ``num_chr=True`` yields chromosome names without "chr"."""
    content = (
        'chr1\t1\t2\t1\t0\n'
        'chr2\t1\t3\t0\t1\n'
        'chr3\t1\t3\t0\t-1'
    )
    dataset = BedDataset(write_tmp(content, tmpdir), num_chr=True)

    assert len(dataset) == 3
    first_interval = dataset[0][0]
    assert first_interval.chrom == '1'
예제 #12
0
class ActivityDataset(Dataset):
    """Dataset pairing a fixed-width sequence with summed bigwig signal.

    Args:
        intervals_file: bed4 file containing chrom  start  end  name
        fasta_file: file path; Genome sequence
        bigwigs: mapping from a target name to an iterable of bigwig files;
          one extractor is created per file
        track_width: width the interval is resized to before the bigwig
          tracks are read
        incl_chromosomes: forwarded to the tsv-loader
        excl_chromosomes: forwarded to the tsv-loader
        num_chr_fasta: if True, the tsv-loader will make sure that the chromosomes
          don't start with chr
    """

    # Width every interval is resized to before the sequence is extracted.
    SEQ_WIDTH = 1000

    def __init__(self,
                 intervals_file,
                 fasta_file,
                 bigwigs,
                 track_width=2000,
                 incl_chromosomes=None,
                 excl_chromosomes=None,
                 num_chr_fasta=False):
        self.num_chr_fasta = num_chr_fasta
        self.intervals_file = intervals_file
        self.fasta_file = fasta_file
        self.bigwigs = bigwigs
        self.incl_chromosomes = incl_chromosomes
        self.excl_chromosomes = excl_chromosomes
        self.track_width = track_width

        # Targets stored in the intervals file are ignored; the targets
        # returned by __getitem__ come from the bigwig tracks.
        self.tsv = BedDataset(
            self.intervals_file,
            num_chr=self.num_chr_fasta,
            bed_columns=4,
            ignore_targets=True,
            incl_chromosomes=incl_chromosomes,
            excl_chromosomes=excl_chromosomes,
        )
        # Extractors are created on the first __getitem__ call.
        self.fasta_extractor = None
        self.bigwig_extractors = None

    def __len__(self):
        """Return the number of intervals loaded from the intervals file."""
        return len(self.tsv)

    def __getitem__(self, idx):
        """Return inputs, targets and metadata for the interval at ``idx``.

        Returns:
            dict with
              "inputs": {"seq": squeezed output of the fasta extractor},
              "targets": one value per key of ``bigwigs``: the extractor
                outputs for the wide interval added together, then summed,
              "metadata": the resized range, the wide range and the
                interval name.
        """
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaExtractor(self.fasta_file)
            self.bigwig_extractors = {
                a: [BigwigExtractor(f) for f in self.bigwigs[a]]
                for a in self.bigwigs
            }

        # The second item returned by the tsv-loader is not used here.
        interval, _ = self.tsv[idx]
        interval = resize_interval(interval, self.SEQ_WIDTH)
        # Intervals need to be SEQ_WIDTH (1000bp) wide
        assert interval.stop - interval.start == self.SEQ_WIDTH

        # Run the fasta extractor
        seq = np.squeeze(self.fasta_extractor([interval]))

        # Resize a copy so that `interval` itself keeps its current width.
        interval_wide = resize_interval(deepcopy(interval), self.track_width)

        return {
            "inputs": {
                "seq": seq
            },
            "targets": {
                a:
                sum(e([interval_wide])[0]
                    for e in self.bigwig_extractors[a]).sum()
                for a in self.bigwig_extractors
            },
            "metadata": {
                "ranges":
                GenomicRanges(interval.chrom, interval.start, interval.stop,
                              str(idx)),
                "ranges_wide":
                GenomicRanges.from_interval(interval_wide),
                "name":
                interval.name
            }
        }

    def get_targets(self):
        """Return whatever the underlying tsv-loader reports as targets."""
        return self.tsv.get_targets()
예제 #13
0
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 vcf_file,
                 chr_order_file,
                 vcf_file_tbi=None,
                 strand_column=6,
                 id_column=4,
                 num_chr=True):
        """Load intervals, keep transcripts hit by a variant, group positions.

        After construction ``self.bed`` is a DataFrame with the columns
        ``id``, ``chr``, ``pos`` and ``strand``, one row per
        (id, chromosome, strand) combination; ``pos`` holds the list of
        (start, end) tuples of that combination.

        Args:
            intervals_file: BED file; read both through BedDataset and
                through pybedtools.
            fasta_file: stored on the instance, not opened here.
            vcf_file: file intersected with the intervals via bedtools.
            chr_order_file: passed to the bedtools intersect call as ``g``.
            vcf_file_tbi: optional path; when it ends with "vcf_file_tbi"
                the file is renamed so that this suffix reads "vcf_file.tbi".
            strand_column: 1-based number of the strand column.
            id_column: 1-based number of the transcript id column.
            num_chr: forwarded to BedDataset as ``num_chr``.
        """

        # workaround for test
        if vcf_file_tbi is not None and vcf_file_tbi.endswith("vcf_file_tbi"):
            os.rename(vcf_file_tbi,
                      vcf_file_tbi.replace("vcf_file_tbi", "vcf_file.tbi"))

        self.num_chr_fasta = num_chr
        self.intervals_file = intervals_file
        self.fasta_file = fasta_file
        self.vcf_file = vcf_file
        self.chr_order_file = chr_order_file

        # Column numbers are given 1-based; store them 0-based for iloc.
        self.strand_column = strand_column - 1
        self.id_column = id_column - 1

        self.force_upper = True

        # "Parse" bed file
        self.bed = BedDataset(self.intervals_file,
                              num_chr=self.num_chr_fasta,
                              bed_columns=3,
                              label_dtype=str,
                              ignore_targets=False)

        # Intersect bed and vcf using bedtools
        # bedtools c flag: for each bed interval, counts number of vcf entries it overlaps
        bed_tool = pybedtools.BedTool(self.intervals_file)
        intersect_counts = list(
            bed_tool.intersect(self.vcf_file,
                               c=True,
                               sorted=True,
                               g=self.chr_order_file))
        intersect_counts = np.array(
            [isect.count for isect in intersect_counts])

        # Retain only those transcripts that intersect a variant.
        # A transcript is kept with all of its rows as soon as one of its
        # rows has a non-zero overlap count.
        utr5_bed = self.bed.df
        retain_transcripts = utr5_bed[
            intersect_counts > 0].iloc[:, self.id_column]
        utr5_bed = utr5_bed[utr5_bed.iloc[:, self.id_column].isin(
            retain_transcripts)]

        # Aggregate 5utr positions per transcript
        tuples = list(zip(utr5_bed.iloc[:, 1], utr5_bed.iloc[:, 2]))
        # Each row gets a one-element list so that 'sum' below joins the
        # lists of a group into one list of (start, end) tuples.
        pos = [[x] for x in tuples]
        id_chr_strand = list(
            zip(utr5_bed.iloc[:, self.id_column], utr5_bed.iloc[:, 0],
                utr5_bed.iloc[:, self.strand_column]))
        utr5_bed_posaggreg = pd.DataFrame({
            "pos": pos,
            "id_chr_strand": id_chr_strand
        })
        utr5_bed_posaggreg = utr5_bed_posaggreg.groupby("id_chr_strand").agg(
            {'pos': 'sum'})

        # Rebuild "bed"
        # The group key is an (id, chr, strand) tuple; unpack it into columns.
        utr5_bed_posaggreg["id"] = [x[0] for x in utr5_bed_posaggreg.index]
        utr5_bed_posaggreg["chr"] = [x[1] for x in utr5_bed_posaggreg.index]
        utr5_bed_posaggreg["strand"] = [x[2] for x in utr5_bed_posaggreg.index]
        # From here on self.bed is a DataFrame, no longer a BedDataset.
        self.bed = utr5_bed_posaggreg.reset_index()[[
            "id", "chr", "pos", "strand"
        ]]

        # Nothing is opened in the constructor.
        self.fasta_extractor = None
        self.vcf = None
        self.vcf_extractor = None