示例#1
0
class SplicingMaxEntDataset(Dataset):
    """
    args:
      MISO_AS:
        doc: Whether the given annotation file is MISO alternative splicing annotation. default False.
      fasta_file:
        doc: Reference Genome sequence in fasta format
        example:
          md5: 936544855b253835442a0f253dd4b083
          url: https://zenodo.org/record/1466099/files/3prime-example_files-hg19.chr22.fa?download=1
        type: str
      gtf_file:
        doc: file path; Genome annotation GTF file
        example:
          md5: 174fd11303ae2c2369094bfcbe303c07
          url: https://zenodo.org/record/1466099/files/3prime-example_files-hg19.chr22.gtf?download=1
      label_col:
        doc: response label column name
      target_file:
        doc: path to the targets (txt) file
        optional: true
      side:
        doc: 5 or 3prime splice-site
    dependencies:
      conda:
        - pysam=0.15.2
        - python=3.5.6
    info:
      authors:
        - github: s6juncheng
          name: Jun Cheng
      doc: MaxEnt Splicing Model
      name: MaxEnt
      version: 0.1
    output_schema:
      inputs:
        associated_metadata: ranges
        doc: a junction (donor or acceptor) sequence
        name: seq
        shape: ()
        special_type: DNAStringSeq
      metadata:
        biotype:
          doc: gene biotype, can be used to filter protein coding genes for instance
          type: str
        geneID:
          doc: gene ID
          type: str
        order:
          doc: order of the donor site in the transcript, counted from 5' to 3'.
          type: int
        ranges:
          doc: ranges that the sequences were extracted
          type: GenomicRanges
        transcriptID:
          doc: transcript id
          type: str
      targets:
        doc: Predicted psi
        name: psi
        shape: (1,)
    """
    # NOTE(review): the docstring above is structured (YAML-style) metadata and
    # is presumably consumed by external tooling, so it is kept verbatim.
    # The code below stays Python 3.5 compatible (no f-strings), matching the
    # `python=3.5.6` pin declared in that metadata.

    def __init__(
            self,
            gtf_file,
            fasta_file,
            side='5prime',  # 5prime/3prime
            target_file=None,
            MISO_AS=False,
            label_col='event_name'):
        """Load the annotation and, unless MISO_AS is set, index all splice sites.

        Raises:
            ValueError: if ``side`` is neither "5prime" nor "3prime".
        """
        # Local import: loadgene is only needed while constructing the dataset.
        from gtf_utils import loadgene

        # Validate before loading the annotation so bad input fails immediately.
        # ValueError is a subclass of Exception, so existing handlers still match.
        if side not in ("5prime", "3prime"):
            raise ValueError("side should be 5prime or 3prime")
        self.side = side
        # Window size around the splice junction: 3 + 6 positions for the
        # 5prime side, 3 + 20 positions for the 3prime side.
        self.overhang_l = 3
        self.overhang_r = 6 if side == "5prime" else 20

        self.genes = loadgene(gtf_file)
        self.fasta_file = fasta_file
        self.fasta = None  # opened lazily by _ensure_fasta()

        if target_file is not None:
            self.Y = Target(target_file, label_col)
        else:
            self.Y = None

        self.MISO_AS = MISO_AS
        if not MISO_AS:
            # Regular GTF: one dataset item per splice site.
            self.spliceSites = self.get_spliceSites()

        self._name = None
        self._species = None

    def __len__(self):
        """Number of items: genes in MISO mode, splice sites otherwise."""
        return len(self.genes) if self.MISO_AS else len(self.spliceSites)

    def __getitem__(self, idx):
        """Return the ``inputs``/``metadata`` (and, in MISO mode, ``targets``) dict for item ``idx``."""
        fasta = self._ensure_fasta()

        if self.MISO_AS:
            gene = self.genes[idx]
            inputs, ranges = self.get_seq(gene)
            if self.Y is not None:
                targets = self.Y.get_target(gene.geneName)
            else:
                targets = np.nan  # no target file was supplied
            return {
                'inputs': inputs,
                'targets': targets,
                'metadata': {
                    'geneName': gene.geneName,
                    'chrom': gene.chrom,
                    'strand': gene.strand,
                    'start': gene.start,
                    'stop': gene.stop,
                    'extracted_regions': ranges,
                },
            }

        site = self.spliceSites[idx]
        return {
            'inputs': site.get_seq(fasta),
            'metadata': {
                'geneID': site.geneID,
                'transcriptID': site.transcriptID,
                'biotype': site.biotype,
                'order': site.order,
                'ranges': GenomicRanges(
                    site.chrom,
                    site.grange[0] - 1,  # use 0-base indexing
                    site.grange[1],
                    site.geneID,
                    site.strand),
            },
        }

    def _ensure_fasta(self):
        """Open the FASTA reader on first use and cache it on ``self.fasta``."""
        if self.fasta is None:
            from fasta_utils import FastaFile
            self.fasta = FastaFile(self.fasta_file)
        return self.fasta

    def _site_ranges(self, exons, strand):
        """Compute the [left, right] window of every splice site of ``exons``.

        Args:
            exons: 2-D array with one row per exon, already in the order the
                caller wants the sites numbered.
                NOTE(review): columns 0/1 look like exon start/end — confirm.
            strand: "+" selects the plus-strand arithmetic; any other value is
                handled as the opposite strand.

        Returns:
            Array with one ``[left, right]`` row per splice site.
        """
        is_5prime = self.side == "5prime"
        # 5prime sites are taken from every exon but the last one,
        # 3prime sites from every exon but the first one.
        rows = slice(None, -1) if is_5prime else slice(1, None)
        if is_5prime == (strand == "+"):
            # (5prime, "+") and (3prime, other): anchor on column 1.
            anchors = exons[rows, 1]
            offsets = np.array([-self.overhang_l + 1, self.overhang_r])
        else:
            # (5prime, other) and (3prime, "+"): anchor on column 0.
            anchors = exons[rows, 0]
            offsets = np.array([-self.overhang_r, self.overhang_l - 1])
        return anchors.reshape(-1, 1) + offsets

    def get_seq(self, gene):
        """Extract the splice-site window sequences of ``gene`` (MISO mode).

        Returns:
            Tuple ``(sequences, seq_ranges)``: the sequences fetched from the
            FASTA file as a numpy array, and the ranges they were read from.
        """
        # Opening here as well makes the method usable without a prior
        # __getitem__ call.
        fasta = self._ensure_fasta()
        seq_ranges = self._site_ranges(gene.get_all_exons(), gene.strand)
        seq = [
            fasta.get_seq(gene.chrom, seq_range, gene.strand)
            for seq_range in seq_ranges
        ]
        return np.array(seq), seq_ranges

    def _get_spliceSites(self, gene):
        ''' Collect the splice sites of every multi-exon transcript of one gene.
        Applied for normal gtf annotation.
        '''
        spliceSites = []
        for transcript in gene.trans:
            exons = transcript.exons
            if len(exons) <= 1:
                continue  # a single exon has no splice junction
            if gene.strand != "+":
                # Reverse-sorted exons so that sites are numbered along the
                # transcript; "+" transcripts are used in the order given.
                descending = np.lexsort((exons[:, 1], exons[:, 0]))[::-1]
                exons = exons[descending]
            seq_ranges = self._site_ranges(exons, gene.strand)
            for order, (left, right) in enumerate(seq_ranges):
                # Sequences are fetched later in __getitem__ to save memory.
                spliceSites.append(
                    SpliceSite(gene.chrom, left, right, gene.strand,
                               transcript.tranID, gene.geneID,
                               gene.biotype, order))
        return spliceSites

    def get_spliceSites(self):
        ''' Get the splice sites of all genes as one flat list.
        '''
        per_gene = map(self._get_spliceSites, self.genes)
        return list(itertools.chain.from_iterable(per_gene))

    @property
    def name(self):
        """Free-form dataset name (``None`` until assigned)."""
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    @property
    def species(self):
        """Free-form species label (``None`` until assigned)."""
        return self._species

    @species.setter
    def species(self, value):
        self._species = value
示例#2
0
class SplicingMaxEntDataset(Dataset):
    """Dataset of splice-site sequences for the MaxEnt splicing model.

    Args:
        gtf_file: gtf file. Can be downloaded from MISO or ensembl.
        fasta_file: file path; Genome sequence
        side: 5prime or 3prime
        target_file: file path; path to the targets in MISO summary format.
        MISO_AS: whether the used annotation file is from MISO alternative
            splicing annotation. When true, items are genes; otherwise items
            are individual splice sites.
        label_col: column name in target file which has PSI.

    Raises:
        ValueError: if ``side`` is neither "5prime" nor "3prime".
    """

    def __init__(
            self,
            gtf_file,
            fasta_file,
            side='5prime',  # 5prime/3prime
            target_file=None,
            MISO_AS=False,
            label_col='event_name'):
        # Validate before loading the annotation so bad input fails immediately.
        # ValueError is a subclass of Exception, so existing handlers still match.
        if side not in ("5prime", "3prime"):
            raise ValueError("side should be 5prime or 3prime")
        self.side = side
        # Window size around the splice junction: 3 + 6 positions for the
        # 5prime side, 3 + 20 positions for the 3prime side.
        self.overhang_l = 3
        self.overhang_r = 6 if side == "5prime" else 20

        self.genes = loadgene(gtf_file)
        self.fasta_file = fasta_file
        self.fasta = None  # opened lazily by _ensure_fasta()

        if target_file is not None:
            self.Y = Target(target_file, label_col)
        else:
            self.Y = None

        self.MISO_AS = MISO_AS
        if not MISO_AS:
            # Regular GTF: one dataset item per splice site.
            self.spliceSites = self.get_spliceSites()

        self._name = None
        self._species = None

    def __len__(self):
        """Number of items: genes in MISO mode, splice sites otherwise."""
        return len(self.genes) if self.MISO_AS else len(self.spliceSites)

    def __getitem__(self, idx):
        """Return the ``inputs``/``metadata`` (and, in MISO mode, ``targets``) dict for item ``idx``."""
        fasta = self._ensure_fasta()

        if self.MISO_AS:
            gene = self.genes[idx]
            inputs, ranges = self.get_seq(gene)
            if self.Y is not None:
                targets = self.Y.get_target(gene.geneName)
            else:
                targets = np.nan  # no target file was supplied
            return {
                'inputs': inputs,
                'targets': targets,
                'metadata': {
                    'geneName': gene.geneName,
                    'chrom': gene.chrom,
                    'strand': gene.strand,
                    'start': gene.start,
                    'stop': gene.stop,
                    'extracted_regions': ranges,
                },
            }

        site = self.spliceSites[idx]
        return {
            'inputs': site.get_seq(fasta),
            'metadata': {
                'geneID': site.geneID,
                'transcriptID': site.transcriptID,
                'biotype': site.biotype,
                'order': site.order,
                'ranges': GenomicRanges(
                    site.chrom,
                    site.grange[0] - 1,  # use 0-base indexing
                    site.grange[1],
                    site.geneID,
                    site.strand),
            },
        }

    def _ensure_fasta(self):
        """Open the FASTA reader on first use and cache it on ``self.fasta``."""
        if self.fasta is None:
            self.fasta = FastaFile(self.fasta_file)
        return self.fasta

    def _site_ranges(self, exons, strand):
        """Compute the [left, right] window of every splice site of ``exons``.

        Args:
            exons: 2-D array with one row per exon, already in the order the
                caller wants the sites numbered.
                NOTE(review): columns 0/1 look like exon start/end — confirm.
            strand: "+" selects the plus-strand arithmetic; any other value is
                handled as the opposite strand.

        Returns:
            Array with one ``[left, right]`` row per splice site.
        """
        is_5prime = self.side == "5prime"
        # 5prime sites are taken from every exon but the last one,
        # 3prime sites from every exon but the first one.
        rows = slice(None, -1) if is_5prime else slice(1, None)
        if is_5prime == (strand == "+"):
            # (5prime, "+") and (3prime, other): anchor on column 1.
            anchors = exons[rows, 1]
            offsets = np.array([-self.overhang_l + 1, self.overhang_r])
        else:
            # (5prime, other) and (3prime, "+"): anchor on column 0.
            anchors = exons[rows, 0]
            offsets = np.array([-self.overhang_r, self.overhang_l - 1])
        return anchors.reshape(-1, 1) + offsets

    def get_seq(self, gene):
        """Extract the splice-site window sequences of ``gene`` (MISO mode).

        Returns:
            Tuple ``(sequences, seq_ranges)``: the sequences fetched from the
            FASTA file as a numpy array, and the ranges they were read from.
        """
        # Opening here as well makes the method usable without a prior
        # __getitem__ call.
        fasta = self._ensure_fasta()
        seq_ranges = self._site_ranges(gene.get_all_exons(), gene.strand)
        seq = [
            fasta.get_seq(gene.chrom, seq_range, gene.strand)
            for seq_range in seq_ranges
        ]
        return np.array(seq), seq_ranges

    def _get_spliceSites(self, gene):
        ''' Collect the splice sites of every multi-exon transcript of one gene.
        Applied for normal gtf annotation.
        '''
        spliceSites = []
        for transcript in gene.trans:
            exons = transcript.exons
            if len(exons) <= 1:
                continue  # a single exon has no splice junction
            if gene.strand != "+":
                # Reverse-sorted exons so that sites are numbered along the
                # transcript; "+" transcripts are used in the order given.
                descending = np.lexsort((exons[:, 1], exons[:, 0]))[::-1]
                exons = exons[descending]
            seq_ranges = self._site_ranges(exons, gene.strand)
            for order, (left, right) in enumerate(seq_ranges):
                # Sequences are fetched later in __getitem__ to save memory.
                spliceSites.append(
                    SpliceSite(gene.chrom, left, right, gene.strand,
                               transcript.tranID, gene.geneID,
                               gene.biotype, order))
        return spliceSites

    def get_spliceSites(self):
        ''' Get the splice sites of all genes as one flat list.
        '''
        per_gene = map(self._get_spliceSites, self.genes)
        return list(itertools.chain.from_iterable(per_gene))

    @property
    def name(self):
        """Free-form dataset name (``None`` until assigned)."""
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    @property
    def species(self):
        """Free-form species label (``None`` until assigned)."""
        return self._species

    @species.setter
    def species(self, value):
        self._species = value
示例#3
0
class SplicingKmerDataset(Dataset):
    """Dataset of splice-site sequences with symmetric flanking context.

    Args:
        gtf_file: gtf file. Can be downloaded from MISO or ensembl.
        fasta_file: file path; Genome sequence
        overhang: length of overhang taken on each side of the splice site.
        MISO_AS: whether the used annotation file is from MISO alternative
            splicing annotation. When true, items are genes; otherwise items
            are individual splice sites.
    """

    def __init__(self,
                 gtf_file,
                 fasta_file,
                 overhang=80,
                 MISO_AS=False):  # intron + ~ bp exon from both side
        self.genes = loadgene(gtf_file)
        self.fasta_file = fasta_file
        self.fasta = None  # opened lazily by _ensure_fasta()
        self.overhang = overhang
        self.MISO_AS = MISO_AS
        if not MISO_AS:
            # Regular GTF: one dataset item per splice site.
            self.spliceSites = self.get_spliceSites()
        self._name = None
        self._species = None

    def __len__(self):
        """Number of items: genes in MISO mode, splice sites otherwise."""
        return len(self.genes) if self.MISO_AS else len(self.spliceSites)

    def __getitem__(self, idx):
        """Return the ``inputs``/``metadata`` dict for item ``idx``."""
        fasta = self._ensure_fasta()

        if self.MISO_AS:
            gene = self.genes[idx]
            return {
                'inputs': self.get_seq(gene),
                'metadata': {
                    'geneName': gene.geneName,
                    'chrom': gene.chrom,
                    'strand': gene.strand,
                    'start': gene.start,
                    'stop': gene.stop,
                },
            }

        site = self.spliceSites[idx]
        return {
            'inputs': site.get_seq(fasta),
            'metadata': {
                'geneID': site.geneID,
                'transcriptID': site.transcriptID,
                'biotype': site.biotype,
                'order': site.order,
                'ranges': GenomicRanges(
                    site.chrom,
                    site.grange[0] - 1,  # use 0-base indexing
                    site.grange[1],
                    site.geneID,
                    site.strand),
            },
        }

    def _ensure_fasta(self):
        """Open the FASTA reader on first use and cache it on ``self.fasta``."""
        if self.fasta is None:
            self.fasta = FastaFile(self.fasta_file)
        return self.fasta

    def _site_ranges(self, exons, strand):
        """Compute the [left, right] window around each splice site of ``exons``.

        Sites are taken from every exon but the last one.

        Args:
            exons: 2-D array with one row per exon, already in the order the
                caller wants the sites numbered.
                NOTE(review): columns 0/1 look like exon start/end — confirm.
            strand: "+" selects the plus-strand arithmetic; any other value is
                handled as the opposite strand.

        Returns:
            Array with one ``[left, right]`` row per splice site.
        """
        if strand == "+":
            anchors = exons[:-1, 1]
            offsets = np.array([-self.overhang + 1, self.overhang])
        else:
            anchors = exons[:-1, 0]
            offsets = np.array([-self.overhang, self.overhang - 1])
        return anchors.reshape(-1, 1) + offsets

    def get_seq(self, gene):
        """ Get splice site sequence with flanking exon and intron sequences.
        This function is applied for MISO annotation with 3 exon alternative splicing model.

        Returns:
            numpy array of the sequences fetched from the FASTA file.
        """
        # Opening here as well makes the method usable without a prior
        # __getitem__ call.
        fasta = self._ensure_fasta()
        seq_ranges = self._site_ranges(gene.get_all_exons(), gene.strand)
        seq = [
            fasta.get_seq(gene.chrom, seq_range, gene.strand)
            for seq_range in seq_ranges
        ]
        return np.array(seq)

    def _get_spliceSites(self, gene):
        ''' Collect the splice sites of every multi-exon transcript of one gene.
        Applied for normal gtf annotation.
        '''
        spliceSites = []
        for transcript in gene.trans:
            exons = transcript.exons
            if len(exons) <= 1:
                continue  # a single exon has no splice junction
            if gene.strand != "+":
                # Reverse-sorted exons so that sites are numbered along the
                # transcript; "+" transcripts are used in the order given.
                descending = np.lexsort((exons[:, 1], exons[:, 0]))[::-1]
                exons = exons[descending]
            seq_ranges = self._site_ranges(exons, gene.strand)
            for order, (left, right) in enumerate(seq_ranges):
                # Sequences are fetched later in __getitem__ to save memory.
                spliceSites.append(
                    SpliceSite(gene.chrom, left, right, gene.strand,
                               transcript.tranID, gene.geneID,
                               gene.biotype, order))
        return spliceSites

    def get_spliceSites(self):
        ''' Get the splice sites of all genes as one flat list.
        '''
        per_gene = map(self._get_spliceSites, self.genes)
        return list(itertools.chain.from_iterable(per_gene))

    @property
    def name(self):
        """Free-form dataset name (``None`` until assigned)."""
        # Fixed: the getter used to read the misspelled attribute `_namem`,
        # which raised AttributeError on every access.
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    @property
    def species(self):
        """Free-form species label (``None`` until assigned)."""
        return self._species

    @species.setter
    def species(self, value):
        self._species = value