def to_gtf(self):
    """Yield GTF features describing this object.

    The first feature yielded has type 'transcript' and spans
    ``self.start`` to ``self.end``.  It is followed by one 'exon'
    feature for each element of ``self.exons``, in iteration order.
    Every feature uses ``self.chrom`` as seqid, 'taco' as source, a
    score of 0.0 and a phase of '.'.
    """
    strand_str = Strand.to_gtf(self.strand)

    def build(feature_type, start, end, attrs):
        # fields common to the transcript record and the exon records
        feat = GTF.Feature()
        feat.seqid = self.chrom
        feat.source = 'taco'
        feat.feature = feature_type
        feat.start = start
        feat.end = end
        feat.score = 0.0
        feat.strand = strand_str
        feat.phase = '.'
        feat.attrs = attrs
        return feat

    # the transcript record carries the full set of attributes
    transcript_attrs = {
        GTF.Attr.TRANSCRIPT_ID: self._id,
        GTF.Attr.SAMPLE_ID: self.sample_id,
        GTF.Attr.EXPR: str(self.expr),
        GTF.Attr.REF: str(int(self.is_ref))
    }
    yield build('transcript', self.start, self.end, transcript_attrs)

    # exon records only carry the transcript id
    for exon in self.exons:
        yield build('exon', exon.start, exon.end,
                    {GTF.Attr.TRANSCRIPT_ID: self._id})
def get_change_point_gtf(self, cp):
    """Return a list of two GTF features describing change point ``cp``.

    The first feature ('changept') covers ``cp.pos`` to ``cp.pos + 1``;
    the second ('changeinterval') covers ``cp.start`` to ``cp.end``.
    Both carry the graph id plus the stringified ``sign``, ``pvalue``
    and ``foldchange`` of ``cp`` as attributes.
    """
    strand_str = Strand.to_gtf(self.strand)
    graph_id = 'G_%s_%d_%d_%s' % (self.chrom, self.start, self.end,
                                  strand_str)
    # (feature type, start, end) for each record, in output order
    spans = (
        ('changept', cp.pos, cp.pos + 1),
        ('changeinterval', cp.start, cp.end),
    )
    features = []
    for feature_type, start, end in spans:
        feat = GTF.Feature()
        feat.seqid = self.chrom
        feat.source = 'taco'
        feat.feature = feature_type
        feat.start = start
        feat.end = end
        feat.score = 0
        feat.strand = strand_str
        feat.phase = '.'
        # a separate dict per feature so the records do not share state
        feat.attrs = {
            'graph_id': graph_id,
            'sign': str(cp.sign),
            'pvalue': str(cp.pvalue),
            'foldchange': str(cp.foldchange)
        }
        features.append(feat)
    return features
def get_node_gtf(self):
    """Yield one 'node' GTF feature for every node id in ``self.G``.

    Each feature spans the first two values of the interval returned by
    ``self.get_node_interval`` and records, as string attributes, the
    graph id, the min/max/mean of the node's expression data, and the
    comma-joined values that ``_array_subset`` returns for
    ``self.ref_start_sites`` and ``self.ref_stop_sites`` over that
    interval.
    """
    strand_str = Strand.to_gtf(self.strand)
    graph_id = 'G_%s_%d_%d_%s' % (self.chrom, self.start, self.end,
                                  strand_str)
    for node_id in self.G:
        interval = self.get_node_interval(node_id)
        node_expr = self.get_expr_data(*interval)
        starts_in_node = _array_subset(self.ref_start_sites, *interval)
        stops_in_node = _array_subset(self.ref_stop_sites, *interval)
        # assemble the gtf record for this node
        feat = GTF.Feature()
        feat.seqid = self.chrom
        feat.source = 'taco'
        feat.feature = 'node'
        feat.start = interval[0]
        feat.end = interval[1]
        feat.score = 0
        feat.strand = strand_str
        feat.phase = '.'
        feat.attrs = {
            'graph_id': graph_id,
            'expr_min': str(node_expr.min()),
            'expr_max': str(node_expr.max()),
            'expr_mean': str(node_expr.mean()),
            'ref_starts': ','.join(str(x) for x in starts_in_node),
            'ref_stops': ','.join(str(x) for x in stops_in_node)
        }
        yield feat
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    """Yield a 'transcript' GTF feature followed by its 'exon' features.

    The transcript spans ``exons[0].start`` to ``exons[-1].end``.  All
    features share the locus/gene/tss/transcript id attributes and a
    score of ``int(round(1000.0 * rel_frac))``; only the transcript
    feature additionally carries the formatted ``expr``, ``rel_frac``
    and ``abs_frac`` values.
    """
    tx_start = exons[0].start
    tx_end = exons[-1].end
    strand_str = Strand.to_gtf(strand)
    score = int(round(1000.0 * rel_frac))
    # identifiers attached to every record
    id_attrs = {
        'locus_id': locus_id,
        'gene_id': gene_id,
        'tss_id': tss_id,
        'transcript_id': transcript_id
    }

    def build(feature_type, start, end, attrs):
        feat = GTF.Feature()
        feat.seqid = chrom
        feat.source = 'taco'
        feat.feature = feature_type
        feat.start = start
        feat.end = end
        feat.score = score
        feat.strand = strand_str
        feat.phase = '.'
        feat.attrs = attrs
        return feat

    # transcript record: expression attributes first, then the ids
    transcript_attrs = {
        'expr': '%.3f' % expr,
        'rel_frac': '%.5f' % rel_frac,
        'abs_frac': '%.5f' % abs_frac
    }
    transcript_attrs.update(id_attrs)
    yield build('transcript', tx_start, tx_end, transcript_attrs)

    # exon records: each gets its own copy of the id attributes
    for exon in exons:
        yield build('exon', exon.start, exon.end, dict(id_attrs))
def _make_transcript_feature(exon_features):
    """Build a 'transcript' feature from a sequence of exon features.

    The seqid, source, score, strand and start are taken from the first
    exon feature and the end from the last one.  The attributes are a
    copy of the first exon's attributes with any 'exon_number' entry
    removed.
    """
    feat = GTF.Feature()
    first_exon = exon_features[0]
    last_exon = exon_features[-1]
    feat.seqid = first_exon.seqid
    feat.source = first_exon.source
    feat.feature = 'transcript'
    feat.start = first_exon.start
    feat.end = last_exon.end
    feat.score = first_exon.score
    feat.strand = first_exon.strand
    feat.phase = '.'
    # copy so that dropping 'exon_number' does not touch the exon itself
    feat.attrs = first_exon.attrs.copy()
    feat.attrs.pop('exon_number', None)
    return feat
def assemble_isoforms(sgraph, config):
    """Assemble isoforms from a splice graph.

    Builds a path graph from ``sgraph``, optionally writes a 'lost_node'
    GTF record for every node reported by ``get_lost_nodes``, smooths
    the path graph, enumerates paths with ``find_paths`` and groups
    them into clusters with ``Cluster.build``.

    Args:
        sgraph: splice graph; its ``chrom``, ``start``, ``end`` and
            ``strand`` are read, along with its node interval and node
            expression accessors.
        config: settings object; reads ``path_graph_kmax``,
            ``path_graph_loss_threshold``, ``path_graph_stats_fh``,
            ``assembly_loss_gtf_fh``, ``path_frac``, ``max_paths``,
            ``isoform_frac`` and ``max_isoforms``.

    Returns:
        A list with one entry per cluster, each entry being a list of
        ``Isoform`` objects.  An empty list is returned when no path
        graph could be created or the path graph has no nodes.
    """
    # create a path graph from the splice graph
    K, k = create_optimal_path_graph(
        sgraph,
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if K is None or len(K) == 0:
        return []

    # loop-invariant: used for the lost-node records and the debug message
    strand_str = Strand.to_gtf(sgraph.strand)

    # report lost nodes
    loss_fh = config.assembly_loss_gtf_fh
    if loss_fh is not None:
        graph_id = 'L_%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start,
                                       sgraph.end, strand_str)
        for n_id in get_lost_nodes(sgraph, K):
            n = sgraph.get_node_interval(n_id)
            expr_data = sgraph.get_node_expr_data(n_id)
            # one gtf feature per lost node
            f = GTF.Feature()
            f.seqid = sgraph.chrom
            f.source = 'taco'
            f.feature = 'lost_node'
            f.start = n[0]
            f.end = n[1]
            f.score = 0.0
            f.strand = strand_str
            f.phase = '.'
            f.attrs = {'graph_id': graph_id,
                       'expr': str(expr_data.mean())}
            # write() instead of the Python-2-only ``print >> fh``
            # statement, which raises TypeError under Python 3
            loss_fh.write(str(f) + '\n')

    # smooth kmer graph
    smooth_graph(K)

    source_node = K.graph['source']
    source_expr = K.node[source_node][KMER_EXPR]
    logging.debug('%s:%d-%d[%s] finding paths in k=%d graph '
                  '(%d nodes) source_expr=%f',
                  sgraph.chrom, sgraph.start, sgraph.end, strand_str,
                  k, len(K), source_expr)

    # enumerate kmer paths and map them back onto the splice graph
    id_kmer_map = K.graph['id_kmer_map']
    paths = []
    for kmer_path, expr in find_paths(K, KMER_EXPR, config.path_frac,
                                      config.max_paths):
        path = reconstruct_path(kmer_path, id_kmer_map, sgraph)
        logging.debug("\texpr=%f length=%d", expr, len(path))
        paths.append((path, expr))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('\tclusters: %d filtered: %d',
                  len(clusters), len(filtered))

    gene_isoforms = []
    for cluster in clusters:
        isoforms = [
            Isoform(path=path, expr=expr,
                    rel_frac=rel_frac, abs_frac=abs_frac)
            for path, expr, rel_frac, abs_frac in cluster.iterpaths()
        ]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms