def create_optimal_path_graph(sgraph, kmax=0, loss_threshold=0.10,
                              stats_fh=None):
    '''
    Create a path graph from the original splice graph using paths of
    length 'k' for assembly.

    'k' is selected by handing a per-k score to maximize_bisect() with the
    bounds 1 and kmax.  The score of a given 'k' is the number of k-mers
    (len() of the path graph built with that 'k'), or -k when the path
    graph is flagged as not valid.

    sgraph: splice graph to build the path graph from
    kmax: optional cap on 'k'; only applied when > 0.  The value returned
          by find_longest_path(sgraph) is always used as an upper bound.
    loss_threshold: kept for interface compatibility; it is currently
                    NOT applied because the expression-loss cutoff in the
                    scoring function is commented out.
    stats_fh: optional writable file object.  When given, one
              tab-delimited line of statistics is written for every 'k'
              that is evaluated.

    Returns the tuple (K, k): the path graph built with the selected 'k'
    (after rescue_short_transfrags_saindex() was applied to it) and the
    selected 'k'.
    '''
    # find upper bound to k
    user_kmax = kmax
    kmax = find_longest_path(sgraph)
    if user_kmax > 0:
        # user can force a specific kmax (for debugging/testing purposes)
        kmax = min(user_kmax, kmax)
    sgraph_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start,
                                      sgraph.end,
                                      Strand.to_gtf(sgraph.strand))
    tot_expr = sum(sgraph.get_node_expr_data(n).mean() for n in sgraph.G)

    def compute_kmers(k):
        # score a candidate 'k' (see docstring of the enclosing function)
        K = create_path_graph(sgraph, k)
        valid = K.graph['valid']
        short_transfrags = K.graph['short_transfrags']
        num_lost_kmers = K.graph['num_lost_kmers']
        lost_nodes = get_lost_nodes(sgraph, K)
        lost_expr = sum(sgraph.get_node_expr_data(n).mean()
                        for n in lost_nodes)
        lost_expr_frac = 0.0 if tot_expr == 0 else lost_expr / tot_expr
        # the same values feed both the debug message and the stats line
        fields = [sgraph_id_str, k, kmax, len(sgraph.transfrags), len(K),
                  len(short_transfrags), num_lost_kmers, tot_expr,
                  lost_expr, lost_expr_frac, int(valid)]
        # pass the values as logging arguments so the message is only
        # formatted when debug output is actually emitted
        logging.debug('%s k=%d kmax=%d t=%d kmers=%d short_transfrags=%d '
                      'lost_kmers=%d tot_expr=%.3f lost_expr=%.3f '
                      'lost_expr_frac=%.3f valid=%d', *fields)
        if stats_fh:
            # write() instead of the Python 2 only "print >>" statement;
            # the bytes written are the same
            stats_fh.write('\t'.join(map(str, fields)) + '\n')
        if not valid:
            return -k
        # NOTE: the expression-loss cutoff is disabled, therefore
        # 'loss_threshold' has no effect
        #if lost_expr_frac > loss_threshold:
        #    return -k
        return len(K)

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    logging.debug('Creating path graph k=%d num_kmers=%d', k, num_kmers)
    # rebuild the path graph for the selected k
    K = create_path_graph(sgraph, k)
    logging.debug('Rescuing short transfrags')
    rescue_short_transfrags_saindex(K)
    return K, k
def create_optimal(self, kmax=0, loss_threshold=0.10, stats_fh=None):
    '''
    Create a graph where nodes are paths of length 'k'.

    'k' is selected by handing a per-k score to maximize_bisect() with the
    bounds 1 and kmax.  The score of a given 'k' is the number of k-mers
    (len() of the path graph built with that 'k'), or -k when the path
    graph is flagged as not valid.

    kmax: optional cap on 'k'; only applied when > 0.  The value returned
          by self.longest_path_length() is always used as an upper bound.
    loss_threshold: kept for interface compatibility; it is currently
                    NOT applied because the expression-loss cutoff in the
                    scoring function is commented out.
    stats_fh: optional writable file object.  When given, one
              tab-delimited line of statistics is written for every 'k'
              that is evaluated.

    Returns the tuple (K, k): the path graph built with the selected 'k'
    (after self.rescue_short_transfrags() was applied to it) and the
    selected 'k'.  Returns (None, 0) when self.paths is empty.
    '''
    if len(self.paths) == 0:
        return None, 0
    # find upper bound to k
    user_kmax = kmax
    kmax = self.longest_path_length()
    if user_kmax > 0:
        # user can force a specific kmax (for debugging/testing purposes)
        kmax = min(user_kmax, kmax)
    id_str = '%s:%d-%d[%s]' % (self.chrom, self.start, self.end,
                               Strand.to_gtf(self.strand))

    def compute_kmers(k):
        # score a candidate 'k' (see docstring of the enclosing function)
        K = self.create(k)
        tot_expr = sum(K.exprs[i] for i in K.node_ids_iter())
        lost_expr = K.lost_kmer_expr
        lost_expr_frac = 0.0 if tot_expr == 0 else lost_expr / tot_expr
        if stats_fh:
            fields = [self.chrom, self.start, self.end,
                      Strand.to_gtf(self.strand), k, kmax,
                      len(self.paths), K.n, len(K.short_transfrags),
                      K.num_lost_kmers, tot_expr, lost_expr,
                      lost_expr_frac, int(K.valid)]
            # write() instead of the Python 2 only "print >>" statement;
            # the bytes written are the same
            stats_fh.write('\t'.join(map(str, fields)) + '\n')
        if not K.valid:
            return -k
        # NOTE: the expression-loss cutoff is disabled, therefore
        # 'loss_threshold' has no effect
        #if lost_expr_frac > loss_threshold:
        #    return -k
        return len(K)

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    # values are passed as logging arguments so messages are only
    # formatted when debug output is actually emitted
    logging.debug('%s creating path graph k=%d num_kmers=%d',
                  id_str, k, num_kmers)
    # rebuild the path graph for the selected k
    K = self.create(k)
    logging.debug('%s rescuing short transfrags kmers=%d', id_str, len(K))
    num_lost = self.rescue_short_transfrags(K, K.short_transfrags)
    logging.debug('%s lost %d of %d short transfrags',
                  id_str, num_lost, len(K.short_transfrags))
    return K, k
def create_optimal_path_graph(sgraph, kmax=0, loss_threshold=0.10,
                              stats_fh=None):
    '''
    Create a path graph from the original splice graph using paths of
    length 'k' for assembly.

    'k' is selected by handing a per-k score to maximize_bisect() with the
    bounds 1 and kmax.  The score of a given 'k' is the number of k-mers
    (len() of the path graph built with that 'k'), or -k when the path
    graph is flagged as not valid.

    sgraph: splice graph to build the path graph from
    kmax: optional cap on 'k'; only applied when > 0.  The value returned
          by find_longest_path(sgraph) is always used as an upper bound.
    loss_threshold: kept for interface compatibility; it is currently
                    NOT applied because the expression-loss cutoff in the
                    scoring function is commented out.
    stats_fh: optional writable file object.  When given, one
              tab-delimited line of statistics is written for every 'k'
              that is evaluated.

    Returns the tuple (K, k): the path graph built with the selected 'k'
    (after rescue_short_transfrags_saindex() was applied to it) and the
    selected 'k'.
    '''
    # find upper bound to k
    user_kmax = kmax
    kmax = find_longest_path(sgraph)
    if user_kmax > 0:
        # user can force a specific kmax (for debugging/testing purposes)
        kmax = min(user_kmax, kmax)
    sgraph_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start,
                                      sgraph.end,
                                      Strand.to_gtf(sgraph.strand))
    tot_expr = sum(sgraph.get_node_expr_data(n).mean() for n in sgraph.G)

    def compute_kmers(k):
        # score a candidate 'k' (see docstring of the enclosing function)
        K = create_path_graph(sgraph, k)
        valid = K.graph['valid']
        short_transfrags = K.graph['short_transfrags']
        num_lost_kmers = K.graph['num_lost_kmers']
        lost_nodes = get_lost_nodes(sgraph, K)
        lost_expr = sum(sgraph.get_node_expr_data(n).mean()
                        for n in lost_nodes)
        lost_expr_frac = 0.0 if tot_expr == 0 else lost_expr / tot_expr
        # the same values feed both the debug message and the stats line
        fields = [sgraph_id_str, k, kmax, len(sgraph.transfrags), len(K),
                  len(short_transfrags), num_lost_kmers, tot_expr,
                  lost_expr, lost_expr_frac, int(valid)]
        # pass the values as logging arguments so the message is only
        # formatted when debug output is actually emitted
        logging.debug('%s k=%d kmax=%d t=%d kmers=%d short_transfrags=%d '
                      'lost_kmers=%d tot_expr=%.3f lost_expr=%.3f '
                      'lost_expr_frac=%.3f valid=%d', *fields)
        if stats_fh:
            # write() instead of the Python 2 only "print >>" statement;
            # the bytes written are the same
            stats_fh.write('\t'.join(map(str, fields)) + '\n')
        if not valid:
            return -k
        # NOTE: the expression-loss cutoff is disabled, therefore
        # 'loss_threshold' has no effect
        #if lost_expr_frac > loss_threshold:
        #    return -k
        return len(K)

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    logging.debug('Creating path graph k=%d num_kmers=%d', k, num_kmers)
    # rebuild the path graph for the selected k
    K = create_path_graph(sgraph, k)
    logging.debug('Rescuing short transfrags')
    rescue_short_transfrags_saindex(K)
    return K, k
def create_optimal(self, kmax=0, stats_fh=None):
    '''
    Create a graph where nodes are paths of length 'k'.

    'k' is selected by handing a per-k score to maximize_bisect() with the
    bounds 1 and kmax.  The score of a given 'k' is the number of k-mers
    (len() of the path graph) multiplied by the fraction
    K.graph_expr / self.total_expr and rounded to an integer, or -k when
    the path graph is flagged as not valid.

    kmax: optional cap on 'k'; only applied when > 0.  The value of
          self.longest_path_length is always used as an upper bound.
    stats_fh: optional writable file object.  When given, one
              tab-delimited line from self.get_stats() is written for
              every 'k' that is evaluated, plus one final line (is_opt=1)
              for the selected 'k' after short transfrag rescue.

    Returns the tuple (K, k): the path graph built with the selected 'k'
    (after self.rescue_short_transfrags() was applied to it) and the
    selected 'k'.  Returns (None, 0) when self.paths is empty.
    '''
    if len(self.paths) == 0:
        return None, 0
    # find upper bound to k
    user_kmax = kmax
    kmax = self.longest_path_length
    if user_kmax > 0:
        # user can force a specific kmax
        kmax = min(user_kmax, kmax)

    def write_stats(fields):
        # write() instead of the Python 2 only "print >>" statement;
        # the bytes written are the same
        stats_fh.write('\t'.join(map(str, fields)) + '\n')

    def compute_kmers(k):
        # score a candidate 'k' (see docstring of the enclosing function)
        K = self.create(k)
        # NOTE(review): assumes the expression values are floats; with
        # Python 2 integer operands this division would truncate - confirm
        if self.total_expr == 0:
            expr_frac = 0.0
        else:
            expr_frac = K.graph_expr / self.total_expr
        if stats_fh:
            write_stats(self.get_stats(K, kmax=kmax))
        if not K.valid:
            return -k
        # optimize based on kmers weighted by fraction of total
        # transcript expression retained in the path graph
        return int(round(expr_frac * len(K)))

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    # '%s' already applies str(); values are passed as logging arguments
    # so messages are only formatted when debug output is emitted
    logging.debug('%s creating path graph k=%d num_kmers=%d',
                  self, k, num_kmers)
    # rebuild the path graph for the selected k
    K = self.create(k)
    logging.debug('%s rescuing short transfrags kmers=%d', self, len(K))
    lost_short, lost_short_expr = self.rescue_short_transfrags(
        K, K.short_transfrags)
    logging.debug('%s lost %d of %d short transfrags',
                  self, lost_short, len(K.short_transfrags))
    if stats_fh:
        write_stats(self.get_stats(K, kmax=kmax, lost_short=lost_short,
                                   lost_short_expr=lost_short_expr,
                                   is_opt=1))
    return K, k
def create_optimal(self, kmax=0, stats_fh=None):
    '''
    Create a graph where nodes are paths of length 'k'.

    'k' is selected by handing a per-k score to maximize_bisect() with the
    bounds 1 and kmax.  The score of a given 'k' is the number of k-mers
    (len() of the path graph) multiplied by the fraction
    K.graph_expr / self.total_expr and rounded to an integer, or -k when
    the path graph is flagged as not valid.

    kmax: optional cap on 'k'; only applied when > 0.  The value of
          self.longest_path_length is always used as an upper bound.
    stats_fh: optional writable file object.  When given, one
              tab-delimited line from self.get_stats() is written for
              every 'k' that is evaluated, plus one final line (is_opt=1)
              for the selected 'k' after short transfrag rescue.

    Returns the tuple (K, k): the path graph built with the selected 'k'
    (after self.rescue_short_transfrags() was applied to it) and the
    selected 'k'.  Returns (None, 0) when self.paths is empty.
    '''
    if len(self.paths) == 0:
        return None, 0
    # find upper bound to k
    user_kmax = kmax
    kmax = self.longest_path_length
    if user_kmax > 0:
        # user can force a specific kmax
        kmax = min(user_kmax, kmax)

    def write_stats(fields):
        # write() instead of the Python 2 only "print >>" statement;
        # the bytes written are the same
        stats_fh.write('\t'.join(map(str, fields)) + '\n')

    def compute_kmers(k):
        # score a candidate 'k' (see docstring of the enclosing function)
        K = self.create(k)
        # NOTE(review): assumes the expression values are floats; with
        # Python 2 integer operands this division would truncate - confirm
        if self.total_expr == 0:
            expr_frac = 0.0
        else:
            expr_frac = K.graph_expr / self.total_expr
        if stats_fh:
            write_stats(self.get_stats(K, kmax=kmax))
        if not K.valid:
            return -k
        # optimize based on kmers weighted by fraction of total
        # transcript expression retained in the path graph
        return int(round(expr_frac * len(K)))

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    # '%s' already applies str(); values are passed as logging arguments
    # so messages are only formatted when debug output is emitted
    logging.debug('%s creating path graph k=%d num_kmers=%d',
                  self, k, num_kmers)
    # rebuild the path graph for the selected k
    K = self.create(k)
    logging.debug('%s rescuing short transfrags kmers=%d', self, len(K))
    lost_short, lost_short_expr = self.rescue_short_transfrags(
        K, K.short_transfrags)
    logging.debug('%s lost %d of %d short transfrags',
                  self, lost_short, len(K.short_transfrags))
    if stats_fh:
        write_stats(self.get_stats(K, kmax=kmax, lost_short=lost_short,
                                   lost_short_expr=lost_short_expr,
                                   is_opt=1))
    return K, k
def create_optimal(self, kmax=0, loss_threshold=0.10, stats_fh=None):
    '''
    Create a graph where nodes are paths of length 'k'.

    'k' is selected by handing a per-k score to maximize_bisect() with the
    bounds 1 and kmax.  The score of a given 'k' is the number of k-mers
    (len() of the path graph built with that 'k'), or -k when the path
    graph is flagged as not valid.

    kmax: optional cap on 'k'; only applied when > 0.  The value returned
          by self.longest_path_length() is always used as an upper bound.
    loss_threshold: kept for interface compatibility; it is currently
                    NOT applied because the expression-loss cutoff in the
                    scoring function is commented out.
    stats_fh: optional writable file object.  When given, one
              tab-delimited line of statistics is written for every 'k'
              that is evaluated.

    Returns the tuple (K, k): the path graph built with the selected 'k'
    (after self.rescue_short_transfrags() was applied to it) and the
    selected 'k'.  Returns (None, 0) when self.paths is empty.
    '''
    if len(self.paths) == 0:
        return None, 0
    # find upper bound to k
    user_kmax = kmax
    kmax = self.longest_path_length()
    if user_kmax > 0:
        # user can force a specific kmax (for debugging/testing purposes)
        kmax = min(user_kmax, kmax)
    id_str = '%s:%d-%d[%s]' % (self.chrom, self.start, self.end,
                               Strand.to_gtf(self.strand))

    def compute_kmers(k):
        # score a candidate 'k' (see docstring of the enclosing function)
        K = self.create(k)
        tot_expr = sum(K.exprs[i] for i in K.node_ids_iter())
        lost_expr = K.lost_kmer_expr
        lost_expr_frac = 0.0 if tot_expr == 0 else lost_expr / tot_expr
        if stats_fh:
            fields = [self.chrom, self.start, self.end,
                      Strand.to_gtf(self.strand), k, kmax,
                      len(self.paths), K.n, len(K.short_transfrags),
                      K.num_lost_kmers, tot_expr, lost_expr,
                      lost_expr_frac, int(K.valid)]
            # write() instead of the Python 2 only "print >>" statement;
            # the bytes written are the same
            stats_fh.write('\t'.join(map(str, fields)) + '\n')
        if not K.valid:
            return -k
        # NOTE: the expression-loss cutoff is disabled, therefore
        # 'loss_threshold' has no effect
        #if lost_expr_frac > loss_threshold:
        #    return -k
        return len(K)

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    # values are passed as logging arguments so messages are only
    # formatted when debug output is actually emitted
    logging.debug('%s creating path graph k=%d num_kmers=%d',
                  id_str, k, num_kmers)
    # rebuild the path graph for the selected k
    K = self.create(k)
    logging.debug('%s rescuing short transfrags kmers=%d', id_str, len(K))
    num_lost = self.rescue_short_transfrags(K, K.short_transfrags)
    logging.debug('%s lost %d of %d short transfrags',
                  id_str, num_lost, len(K.short_transfrags))
    return K, k