def __getitem__(self, idx):
    """Assemble the sample dictionary for position ``idx``.

    When ``self.MISO_AS`` is set, ``idx`` selects an entry of ``self.genes``
    and the sequences come from ``self.get_seq``; otherwise ``idx`` selects
    an entry of ``self.spliceSites`` and the sequence is read from the FASTA
    handle.

    Returns:
        dict with the keys ``'inputs'`` and ``'metadata'``.
    """
    # The FASTA handle is created on first access instead of at construction.
    if self.fasta is None:
        self.fasta = FastaFile(self.fasta_file)

    if self.MISO_AS:
        record = self.genes[idx]
        return {
            'inputs': self.get_seq(record),
            'metadata': {
                'geneName': record.geneName,
                'chrom': record.chrom,
                'strand': record.strand,
                'start': record.start,
                'stop': record.stop,
            },
        }

    site = self.spliceSites[idx]
    return {
        'inputs': site.get_seq(self.fasta),
        'metadata': {
            'geneID': site.geneID,
            'transcriptID': site.transcriptID,
            'biotype': site.biotype,
            'order': site.order,
            'ranges': GenomicRanges(
                site.chrom,
                site.grange[0] - 1,  # use 0-base indexing
                site.grange[1],
                site.geneID,
                site.strand),
        },
    }
def __init__(self,
             gtf_file,
             fasta_file,
             side='5prime',  # 5prime/3prime
             target_file=None,
             MISO_AS=False,
             label_col='event_name'):
    """Load the annotation, open the genome and prepare splice-site windows.

    Args:
        gtf_file: annotation file handed to ``loadgene``.
        fasta_file: genome FASTA path, opened immediately with ``FastaFile``.
        side: ``"5prime"`` or ``"3prime"``; any other value raises.
        target_file: optional targets file; when given it is wrapped in
            ``Target`` together with ``label_col``.
        MISO_AS: when false, the splice sites are enumerated right away via
            ``self.get_spliceSites()``.
        label_col: column name forwarded to ``Target``.

    Raises:
        Exception: if ``side`` is neither ``"5prime"`` nor ``"3prime"``.
    """
    self.genes = loadgene(gtf_file)
    self.fasta = FastaFile(fasta_file)

    if side not in ("5prime", "3prime"):
        raise Exception("side should be 5prime or 3prime")
    self.side = side

    # Window size around the junction: 3 on the left for both sides,
    # 6 on the right for 5prime and 20 on the right for 3prime.
    self.overhang_l = 3
    self.overhang_r = 6 if self.side == "5prime" else 20

    self.Y = None if target_file is None else Target(target_file, label_col)

    self.MISO_AS = MISO_AS
    if not MISO_AS:
        self.spliceSites = self.get_spliceSites()

    self._name = None
    self._species = None
def __init__(self, gtf_file, fasta_file, overhang=80, MISO_AS=False):
    """Load the annotation, open the genome and prepare splice sites.

    Args:
        gtf_file: annotation file handed to ``loadgene``.
        fasta_file: genome FASTA path, opened immediately with ``FastaFile``.
        overhang: flank length stored on the instance as ``self.overhang``.
        MISO_AS: when false, the splice sites are enumerated right away via
            ``self.get_spliceSites()``.
    """
    # intron + ~ bp exon from both side
    self.genes = loadgene(gtf_file)
    self.fasta = FastaFile(fasta_file)
    self.overhang = overhang
    self.MISO_AS = MISO_AS

    needs_splice_sites = not MISO_AS
    if needs_splice_sites:
        self.spliceSites = self.get_spliceSites()

    # Both are filled in later through attribute assignment.
    self._name = None
    self._species = None
def __getitem__(self, idx):
    """Assemble the sample dictionary for position ``idx``.

    When ``self.MISO_AS`` is set, ``idx`` selects an entry of ``self.genes``;
    the result then carries ``'inputs'``, ``'targets'`` and ``'metadata'``
    (``'targets'`` is ``np.nan`` when no target table was loaded).
    Otherwise ``idx`` selects an entry of ``self.spliceSites`` and the result
    carries ``'inputs'`` and ``'metadata'`` only.
    """
    # The FASTA reader is imported and opened on first access.
    if self.fasta is None:
        from fasta_utils import FastaFile
        self.fasta = FastaFile(self.fasta_file)

    if self.MISO_AS:
        record = self.genes[idx]
        sequences, regions = self.get_seq(record)
        return {
            'inputs': sequences,
            'targets': (self.Y.get_target(record.geneName)
                        if self.Y is not None else np.nan),
            'metadata': {
                'geneName': record.geneName,
                'chrom': record.chrom,
                'strand': record.strand,
                'start': record.start,
                'stop': record.stop,
                'extracted_regions': regions,
            },
        }

    site = self.spliceSites[idx]
    return {
        'inputs': site.get_seq(self.fasta),
        'metadata': {
            'geneID': site.geneID,
            'transcriptID': site.transcriptID,
            'biotype': site.biotype,
            'order': site.order,
            'ranges': GenomicRanges(
                site.chrom,
                site.grange[0] - 1,  # use 0-base indexing
                site.grange[1],
                site.geneID,
                site.strand),
        },
    }
class SplicingMaxEntDataset(Dataset):
    """Dataset yielding short sequence windows around splice sites.

    Args:
        gtf_file: annotation in gtf format (from MISO or ensembl).
        fasta_file: path to the genome sequence.
        side: which splice site to extract, "5prime" or "3prime".
        target_file: path to the targets in MISO summary format (optional).
        MISO_AS: whether ``gtf_file`` is a MISO alternative splicing
            annotation.
        label_col: name of the PSI column in the target file.
    """

    def __init__(self,
                 gtf_file,
                 fasta_file,
                 side='5prime',  # 5prime/3prime
                 target_file=None,
                 MISO_AS=False,
                 label_col='event_name'):
        self.genes = loadgene(gtf_file)
        self.fasta_file = fasta_file
        self.fasta = None  # open the file later

        if side not in ("5prime", "3prime"):
            raise Exception("side should be 5prime or 3prime")
        self.side = side

        # 3 positions to the left for both sides; 6 to the right for
        # 5prime, 20 to the right for 3prime.
        self.overhang_l = 3
        self.overhang_r = 6 if self.side == "5prime" else 20

        self.Y = None if target_file is None else Target(target_file,
                                                         label_col)

        self.MISO_AS = MISO_AS
        if not MISO_AS:
            self.spliceSites = self.get_spliceSites()

        self._name = None
        self._species = None

    def __len__(self):
        """Number of genes (MISO mode) or of splice sites (otherwise)."""
        return len(self.genes if self.MISO_AS else self.spliceSites)

    def __getitem__(self, idx):
        """Assemble the sample dictionary for position ``idx``."""
        if self.fasta is None:
            self.fasta = FastaFile(self.fasta_file)

        if self.MISO_AS:
            record = self.genes[idx]
            sequences, regions = self.get_seq(record)
            return {
                'inputs': sequences,
                'targets': (self.Y.get_target(record.geneName)
                            if self.Y is not None else np.nan),
                'metadata': {
                    'geneName': record.geneName,
                    'chrom': record.chrom,
                    'strand': record.strand,
                    'start': record.start,
                    'stop': record.stop,
                    'extracted_regions': regions,
                },
            }

        site = self.spliceSites[idx]
        return {
            'inputs': site.get_seq(self.fasta),
            'metadata': {
                'geneID': site.geneID,
                'transcriptID': site.transcriptID,
                'biotype': site.biotype,
                'order': site.order,
                'ranges': GenomicRanges(
                    site.chrom,
                    site.grange[0] - 1,  # use 0-base indexing
                    site.grange[1],
                    site.geneID,
                    site.strand),
            },
        }

    def get_seq(self, gene):
        """Extract the splice-site windows of ``gene`` (MISO mode).

        Returns:
            tuple ``(sequences, seq_ranges)``: the sequences as a numpy
            array and the coordinate pairs they were read from.
        """
        exons = gene.get_all_exons()
        # The intron table is requested but its result is not used here.
        gene.get_all_introns()

        five_prime = self.side == "5prime"
        forward = gene.strand == "+"
        # 5prime drops the last exon row, 3prime drops the first one.
        rows = exons[:-1] if five_prime else exons[1:]
        # 5prime/"+" and 3prime/non-"+" anchor on the exon end column,
        # the two remaining combinations anchor on the exon start column.
        if five_prime == forward:
            anchors = rows[:, 1]
            offsets = [-self.overhang_l + 1, self.overhang_r]
        else:
            anchors = rows[:, 0]
            offsets = [-self.overhang_r, self.overhang_l - 1]
        seq_ranges = anchors.reshape(-1, 1) + np.array(offsets)

        sequences = [
            self.fasta.get_seq(gene.chrom, window, gene.strand)
            for window in seq_ranges
        ]
        return np.array(sequences), seq_ranges

    def _get_spliceSites(self, gene):
        """Collect ``SpliceSite`` objects for every transcript of ``gene``.

        Used with a normal gtf annotation. Transcripts with a single exon
        contribute nothing.
        """
        collected = []
        for transcript in gene.trans:
            exons = transcript.exons
            order = np.lexsort((exons[:, 1], exons[:, 0]))
            if len(exons) <= 1:
                continue

            five_prime = self.side == "5prime"
            forward = gene.strand == "+"
            if not forward:
                # Non-"+" strand: walk the exons in descending order.
                exons = exons[order[::-1]]

            rows = exons[:-1] if five_prime else exons[1:]
            if five_prime == forward:
                anchors = rows[:, 1]
                offsets = [-self.overhang_l + 1, self.overhang_r]
            else:
                anchors = rows[:, 0]
                offsets = [-self.overhang_r, self.overhang_l - 1]
            seq_ranges = anchors.reshape(-1, 1) + np.array(offsets)

            for position, (left, right) in enumerate(seq_ranges):
                # The sequence itself is fetched lazily in __getitem__ to
                # save memory.
                collected.append(
                    SpliceSite(gene.chrom, left, right, gene.strand,
                               transcript.tranID, gene.geneID, gene.biotype,
                               position))
        return collected

    def get_spliceSites(self):
        """Return the splice sites of all genes as one flat list."""
        return [
            site
            for gene in self.genes
            for site in self._get_spliceSites(gene)
        ]

    @property
    def name(self):
        """Name assigned to this dataset (``None`` until set)."""
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    @property
    def species(self):
        """Species assigned to this dataset (``None`` until set)."""
        return self._species

    @species.setter
    def species(self, value):
        self._species = value
class SplicingMaxEntDataset(Dataset):
    """
    args:
        MISO_AS:
            doc: Whether the given annotation file is MISO alternative splicing annotation. default False.
        fasta_file:
            doc: Reference Genome sequence in fasta format
            example:
                md5: 936544855b253835442a0f253dd4b083
                url: https://zenodo.org/record/1466099/files/3prime-example_files-hg19.chr22.fa?download=1
            type: str
        gtf_file:
            doc: file path; Genome annotation GTF file
            example:
                md5: 174fd11303ae2c2369094bfcbe303c07
                url: https://zenodo.org/record/1466099/files/3prime-example_files-hg19.chr22.gtf?download=1
        label_col:
            doc: response label column name
        target_file:
            doc: path to the targets (txt) file
            optional: true
        side:
            doc: 5 or 3prime splice-site
    dependencies:
        conda:
          - pysam=0.15.2
          - python=3.5.6
    info:
        authors:
          - github: s6juncheng
            name: Jun Cheng
        doc: MaxEnt Splicing Model
        name: MaxEnt
        version: 0.1
    output_schema:
        inputs:
            associated_metadata: ranges
            doc: a junction (donor or acceptor) sequence
            name: seq
            shape: ()
            special_type: DNAStringSeq
        metadata:
            biotype:
                doc: gene biotype, can be used to filter protein coding genes for instance
                type: str
            geneID:
                doc: gene ID
                type: str
            order:
                doc: order of the donor site in the transcript, counted from 5' to 3'.
                type: int
            ranges:
                doc: ranges that the sequences were extracted
                type: GenomicRanges
            transcriptID:
                doc: transcript id
                type: str
        targets:
            doc: Predicted psi
            name: psi
            shape: (1,)
    """
    # NOTE(review): the class docstring above looks like a machine-readable
    # YAML dataloader description and is presumably parsed at runtime --
    # confirm before editing its keys or nesting.

    def __init__(
            self,
            gtf_file,
            fasta_file,
            side='5prime',  # 5prime/3prime
            target_file=None,
            MISO_AS=False,
            label_col='event_name'):
        """Load the annotation and configure the splice-site window.

        The genome file is only remembered by path here; it is opened on the
        first ``__getitem__`` call.

        Raises:
            Exception: if ``side`` is neither "5prime" nor "3prime".
        """
        # Imported inside the method rather than at module level.
        from gtf_utils import loadgene
        self.genes = loadgene(gtf_file)
        self.fasta_file = fasta_file
        self.fasta = None  # open the file later
        if side in ["5prime", "3prime"]:
            self.side = side
        else:
            raise Exception("side should be 5prime or 3prime")
        # Window offsets around the junction: 3 left / 6 right for 5prime,
        # 3 left / 20 right for 3prime.
        if self.side == "5prime":
            self.overhang_l = 3
            self.overhang_r = 6
        else:
            self.overhang_l = 3
            self.overhang_r = 20
        # Targets are optional; without a target file __getitem__ reports
        # np.nan as the target in MISO mode.
        if target_file is not None:
            self.Y = Target(target_file, label_col)
        else:
            self.Y = None
        self.MISO_AS = MISO_AS
        # With a normal annotation every splice site is enumerated up front;
        # in MISO mode samples are indexed per gene instead.
        if not MISO_AS:
            self.spliceSites = self.get_spliceSites()
        self._name = None
        self._species = None

    def __len__(self):
        """Return the number of genes (MISO mode) or of splice sites."""
        if self.MISO_AS:
            return len(self.genes)
        else:
            return len(self.spliceSites)

    def __getitem__(self, idx):
        """Return the sample dictionary for position ``idx``.

        In MISO mode the dictionary has 'inputs', 'targets' and 'metadata';
        otherwise it has 'inputs' and 'metadata' only.
        """
        # Open the genome lazily on first access.
        if self.fasta is None:
            from fasta_utils import FastaFile
            self.fasta = FastaFile(self.fasta_file)
        out = {}
        if self.MISO_AS:
            gene = self.genes[idx]
            inputs, ranges = self.get_seq(gene)
            out['inputs'] = inputs
            if self.Y is not None:
                out['targets'] = self.Y.get_target(gene.geneName)
            else:
                out['targets'] = np.nan
            out['metadata'] = {}
            out['metadata']['geneName'] = gene.geneName
            out['metadata']['chrom'] = gene.chrom
            out['metadata']['strand'] = gene.strand
            out['metadata']['start'] = gene.start
            out['metadata']['stop'] = gene.stop
            out['metadata']['extracted_regions'] = ranges
        else:
            spliceSite = self.spliceSites[idx]
            out['inputs'] = spliceSite.get_seq(self.fasta)
            out['metadata'] = {}
            out['metadata']['geneID'] = spliceSite.geneID
            out['metadata']['transcriptID'] = spliceSite.transcriptID
            out['metadata']['biotype'] = spliceSite.biotype
            out['metadata']['order'] = spliceSite.order
            out['metadata']['ranges'] = GenomicRanges(
                spliceSite.chrom,
                spliceSite.grange[0] - 1,  # use 0-base indexing
                spliceSite.grange[1],
                spliceSite.geneID,
                spliceSite.strand)
        return out

    def get_seq(self, gene):
        """Extract the splice-site windows of ``gene`` (used in MISO mode).

        Returns:
            tuple ``(sequences, seq_ranges)``: the sequences as a numpy
            array and the coordinate pairs they were read from.
        """
        exons = gene.get_all_exons()
        # N_exon = exons.shape[0]
        # NOTE(review): `introns` is assigned but never read below.
        introns = gene.get_all_introns()
        # Take intron coordinate
        # Try both normal gtf and AS_gtf
        # 5prime uses every exon row except the last, 3prime every row except
        # the first; the anchor column and offsets depend on the strand.
        if self.side == "5prime":
            if gene.strand == "+":
                seq_ranges = exons[:-1, 1].reshape(-1, 1) + np.array(
                    [-self.overhang_l + 1, self.overhang_r])
            else:
                seq_ranges = exons[:-1, 0].reshape(-1, 1) + np.array(
                    [-self.overhang_r, self.overhang_l - 1])
        else:
            if gene.strand == "+":
                seq_ranges = exons[1:, 0].reshape(-1, 1) + np.array(
                    [-self.overhang_r, self.overhang_l - 1])
            else:
                seq_ranges = exons[1:, 1].reshape(-1, 1) + np.array(
                    [-self.overhang_l + 1, self.overhang_r])
        seq = [
            self.fasta.get_seq(gene.chrom, seq_range, gene.strand)
            for seq_range in seq_ranges
        ]
        return np.array(seq), seq_ranges

    def _get_spliceSites(self, gene):
        '''
        Get splice sites for all transcripts of a single gene.
        Applied for normal gtf annotation. Transcripts with a single exon
        contribute no splice site.
        '''
        spliceSites = []
        for transcript in gene.trans:
            exons = transcript.exons
            # Sort order by exon start, ties broken by exon end.
            ind = np.lexsort((exons[:, 1], exons[:, 0]))
            if len(exons) > 1:
                # NOTE(review): exons are re-ordered (descending) only for
                # the non-"+" strand; "+" strand exons are used in the order
                # given -- presumably already ascending, confirm upstream.
                if self.side == "5prime":
                    if gene.strand == "+":
                        seq_ranges = exons[:-1, 1].reshape(-1, 1) + np.array(
                            [-self.overhang_l + 1, self.overhang_r])
                    else:
                        ind = ind[::-1]
                        exons = exons[ind]
                        seq_ranges = exons[:-1, 0].reshape(-1, 1) + np.array(
                            [-self.overhang_r, self.overhang_l - 1])
                else:
                    if gene.strand == "+":
                        seq_ranges = exons[1:, 0].reshape(-1, 1) + np.array(
                            [-self.overhang_r, self.overhang_l - 1])
                    else:
                        ind = ind[::-1]
                        exons = exons[ind]
                        seq_ranges = exons[1:, 1].reshape(-1, 1) + np.array(
                            [-self.overhang_l + 1, self.overhang_r])
                # The last argument `i` is the site's position within the
                # transcript, in the order the ranges were built.
                for i in range(seq_ranges.shape[0]):
                    spliceSite = SpliceSite(gene.chrom, seq_ranges[i, 0],
                                            seq_ranges[i, 1], gene.strand,
                                            transcript.tranID, gene.geneID,
                                            gene.biotype, i)
                    # can call get_seq later in iterator to save memory
                    # spliceSite.seq = spliceSite.get_seq(self.fasta)
                    spliceSites.append(spliceSite)
        return spliceSites

    def get_spliceSites(self):
        '''
        Get the splice sites of all genes as one flat list.
        '''
        spliceSites = list(map(self._get_spliceSites, self.genes))
        # Flatten the per-gene lists.
        spliceSites = list(itertools.chain.from_iterable(spliceSites))
        return spliceSites

    @property
    def name(self):
        """Name assigned to this dataset (``None`` until set)."""
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    @property
    def species(self):
        """Species assigned to this dataset (``None`` until set)."""
        return self._species

    @species.setter
    def species(self, value):
        self._species = value
class SplicingKmerDataset(Dataset):
    """Dataset of splice-site sequences with symmetric flanking regions.

    Args:
        gtf_file: gtf file. Can be downloaded from MISO or ensembl.
        fasta_file: file path; Genome sequence
        overhang: length of overhang.
        MISO_AS: whether the used annotation file is from MISO alternative
            splicing annotation.
    """

    def __init__(self, gtf_file, fasta_file, overhang=80, MISO_AS=False):
        """Load the annotation; the genome file is opened on first access."""
        # intron + ~ bp exon from both side
        self.genes = loadgene(gtf_file)
        self.fasta_file = fasta_file
        self.fasta = None
        self.overhang = overhang
        self.MISO_AS = MISO_AS
        # With a normal annotation every splice site is enumerated up front;
        # in MISO mode samples are indexed per gene instead.
        if not MISO_AS:
            self.spliceSites = self.get_spliceSites()
        self._name = None
        self._species = None

    def __len__(self):
        """Return the number of genes (MISO mode) or of splice sites."""
        if self.MISO_AS:
            return len(self.genes)
        return len(self.spliceSites)

    def __getitem__(self, idx):
        """Return the sample dictionary ('inputs', 'metadata') for ``idx``."""
        # Open the genome lazily on first access.
        if self.fasta is None:
            self.fasta = FastaFile(self.fasta_file)
        out = {}
        if self.MISO_AS:
            gene = self.genes[idx]
            out['inputs'] = self.get_seq(gene)
            out['metadata'] = {}
            out['metadata']['geneName'] = gene.geneName
            out['metadata']['chrom'] = gene.chrom
            out['metadata']['strand'] = gene.strand
            out['metadata']['start'] = gene.start
            out['metadata']['stop'] = gene.stop
        else:
            spliceSite = self.spliceSites[idx]
            out['inputs'] = spliceSite.get_seq(self.fasta)
            out['metadata'] = {}
            out['metadata']['geneID'] = spliceSite.geneID
            out['metadata']['transcriptID'] = spliceSite.transcriptID
            out['metadata']['biotype'] = spliceSite.biotype
            out['metadata']['order'] = spliceSite.order
            out['metadata']['ranges'] = GenomicRanges(
                spliceSite.chrom,
                spliceSite.grange[0] - 1,  # use 0-base indexing
                spliceSite.grange[1],
                spliceSite.geneID,
                spliceSite.strand)
        return out

    def get_seq(self, gene):
        """Get splice site sequences with flanking exon and intron sequence.

        This function is applied for MISO annotation with 3 exon alternative
        splicing model.

        Returns:
            numpy array with one sequence per exon row except the last.
        """
        exons = gene.get_all_exons()
        # N_exon = exons.shape[0]
        # "+" strand anchors on the exon end column, any other strand on the
        # exon start column; the window spans `overhang` on both sides.
        if gene.strand == "+":
            seq_ranges = exons[:-1, 1].reshape(-1, 1) + np.array(
                [-self.overhang + 1, self.overhang])
        else:
            seq_ranges = exons[:-1, 0].reshape(-1, 1) + np.array(
                [-self.overhang, self.overhang - 1])
        seq = [
            self.fasta.get_seq(gene.chrom, seq_range, gene.strand)
            for seq_range in seq_ranges
        ]
        return np.array(seq)

    def _get_spliceSites(self, gene):
        """Get splice sites for all transcripts of a single gene.

        Applied for normal gtf annotation. Transcripts with a single exon
        contribute no splice site.
        """
        spliceSites = []
        for transcript in gene.trans:
            exons = transcript.exons
            # Sort order by exon start, ties broken by exon end.
            ind = np.lexsort((exons[:, 1], exons[:, 0]))
            if len(exons) > 1:
                if gene.strand == "+":
                    seq_ranges = exons[:-1, 1].reshape(-1, 1) + np.array(
                        [-self.overhang + 1, self.overhang])
                else:
                    # Non-"+" strand: walk the exons in descending order.
                    ind = ind[::-1]
                    exons = exons[ind]
                    seq_ranges = exons[:-1, 0].reshape(-1, 1) + np.array(
                        [-self.overhang, self.overhang - 1])
                for i in range(seq_ranges.shape[0]):
                    spliceSite = SpliceSite(gene.chrom, seq_ranges[i, 0],
                                            seq_ranges[i, 1], gene.strand,
                                            transcript.tranID, gene.geneID,
                                            gene.biotype, i)
                    # The sequence is fetched later in __getitem__ to save
                    # memory.
                    spliceSites.append(spliceSite)
        return spliceSites

    def get_spliceSites(self):
        """Get the splice sites of all genes as one flat list."""
        spliceSites = list(map(self._get_spliceSites, self.genes))
        spliceSites = list(itertools.chain.from_iterable(spliceSites))
        return spliceSites

    @property
    def name(self):
        """Name assigned to this dataset (``None`` until set)."""
        # Fixed: previously read the non-existent attribute `_namem`, which
        # raised AttributeError on every access.
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    @property
    def species(self):
        """Species assigned to this dataset (``None`` until set)."""
        return self._species

    @species.setter
    def species(self, value):
        self._species = value
def __init__(self, gtf_file, fasta_file, length=70):
    """Load the annotation, open the genome and collect the branches.

    Args:
        gtf_file: annotation file handed to ``loadgene``.
        fasta_file: genome FASTA path, opened immediately with ``FastaFile``.
        length: stored on the instance as ``self.length``.
    """
    genes = loadgene(gtf_file)
    self.genes = genes

    genome = FastaFile(fasta_file)
    self.fasta = genome

    self.length = length

    # Must run last: the attributes assigned above are already in place
    # when ``get_branches`` executes.
    self.branches = self.get_branches()