예제 #1
0
def create_optimal_path_graph(sgraph,
                              kmax=0,
                              loss_threshold=0.10,
                              stats_fh=None):
    '''
    create a path graph from the original splice graph using paths of length
    'k' for assembly. The parameter 'k' is chosen by maximize_bisect over
    the range [1, kmax], scoring each candidate by the size (len) of the
    path graph it produces.

    sgraph: splice graph to build the path graph from
    kmax: optional cap on k. When > 0 the search bound is
        min(kmax, find_longest_path(sgraph)); otherwise the longest path
        alone bounds the search.
    loss_threshold: kept for backward compatibility but currently UNUSED.
        The expression-loss cutoff is disabled, so the lost expression
        fraction is only reported (log / stats), never enforced.
    stats_fh: optional writable file-like object. One tab-delimited line
        of statistics is written for every candidate k that is evaluated.

    returns a tuple (K, k): the path graph built with the selected k,
    after rescue_short_transfrags_saindex(K) has been applied, and k.
    '''
    # find upper bound to k
    longest = find_longest_path(sgraph)
    # user can force a specific kmax (for debugging/testing purposes)
    kmax = min(kmax, longest) if kmax > 0 else longest
    sgraph_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start, sgraph.end,
                                      Strand.to_gtf(sgraph.strand))
    tot_expr = sum(sgraph.get_node_expr_data(n).mean() for n in sgraph.G)

    def compute_kmers(k):
        # score function handed to maximize_bisect for one candidate k
        K = create_path_graph(sgraph, k)
        valid = K.graph['valid']
        short_transfrags = K.graph['short_transfrags']
        num_lost_kmers = K.graph['num_lost_kmers']
        lost_nodes = get_lost_nodes(sgraph, K)
        lost_expr = sum(
            sgraph.get_node_expr_data(n).mean() for n in lost_nodes)
        # float() keeps this a true division even if both values are
        # integers under Python 2
        if tot_expr == 0:
            lost_expr_frac = 0.0
        else:
            lost_expr_frac = lost_expr / float(tot_expr)
        # the debug message and the stats line report the same values in
        # the same order, so build them once
        fields = [
            sgraph_id_str, k, kmax,
            len(sgraph.transfrags),
            len(K),
            len(short_transfrags), num_lost_kmers, tot_expr, lost_expr,
            lost_expr_frac,
            int(valid)
        ]
        # pass the values as arguments so the message is only formatted
        # when debug logging is enabled
        logging.debug('%s k=%d kmax=%d t=%d kmers=%d short_transfrags=%d '
                      'lost_kmers=%d tot_expr=%.3f lost_expr=%.3f '
                      'lost_expr_frac=%.3f valid=%d', *fields)
        if stats_fh:
            # write() works on both Python 2 and 3, unlike 'print >>fh'
            stats_fh.write('\t'.join(map(str, fields)) + '\n')
        if not valid:
            # invalid graphs get a negative score
            # NOTE(review): presumably this steers the search toward
            # smaller k -- confirm against maximize_bisect
            return -k
        return len(K)

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    logging.debug('Creating path graph k=%d num_kmers=%d', k, num_kmers)
    K = create_path_graph(sgraph, k)
    logging.debug('Rescuing short transfrags')
    rescue_short_transfrags_saindex(K)
    return K, k
예제 #2
0
    def create_optimal(self, kmax=0, loss_threshold=0.10, stats_fh=None):
        '''
        create a graph where nodes are paths of length 'k'. the parameter
        'k' is chosen by maximize_bisect over the range [1, kmax], scoring
        each candidate by the size (len) of the graph it produces.

        kmax: optional cap on k. When > 0 the search bound is
            min(kmax, self.longest_path_length()); otherwise the longest
            path length alone bounds the search.
        loss_threshold: kept for backward compatibility but currently
            UNUSED. The expression-loss cutoff is disabled, so the lost
            expression fraction is only written to the stats output.
        stats_fh: optional writable file-like object. One tab-delimited
            line of statistics is written for every candidate k evaluated.

        returns a tuple (K, k): the graph built with the selected k, after
        self.rescue_short_transfrags has been applied, and k. When there
        are no paths, returns (None, 0).
        '''
        if len(self.paths) == 0:
            return None, 0

        # find upper bound to k
        longest = self.longest_path_length()
        # user can force a specific kmax (for debugging/testing purposes)
        kmax = min(kmax, longest) if kmax > 0 else longest
        id_str = (
            '%s:%d-%d[%s]' %
            (self.chrom, self.start, self.end, Strand.to_gtf(self.strand)))

        def compute_kmers(k):
            # score function handed to maximize_bisect for one candidate k
            K = self.create(k)
            if stats_fh:
                # the expression totals are only reported, so they are
                # only computed when a stats file was supplied
                tot_expr = sum(K.exprs[i] for i in K.node_ids_iter())
                lost_expr = K.lost_kmer_expr
                # float() keeps this a true division even if both values
                # are integers under Python 2
                if tot_expr == 0:
                    lost_expr_frac = 0.0
                else:
                    lost_expr_frac = lost_expr / float(tot_expr)
                fields = [
                    self.chrom, self.start, self.end,
                    Strand.to_gtf(self.strand), k, kmax,
                    len(self.paths), K.n,
                    len(K.short_transfrags), K.num_lost_kmers, tot_expr,
                    lost_expr, lost_expr_frac,
                    int(K.valid)
                ]
                # write() works on both Python 2 and 3, unlike 'print >>fh'
                stats_fh.write('\t'.join(map(str, fields)) + '\n')
            if not K.valid:
                # invalid graphs get a negative score
                # NOTE(review): presumably this steers the search toward
                # smaller k -- confirm against maximize_bisect
                return -k
            return len(K)

        k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
        # values are passed as arguments so the messages are only
        # formatted when debug logging is enabled
        logging.debug('%s creating path graph k=%d num_kmers=%d',
                      id_str, k, num_kmers)
        K = self.create(k)
        logging.debug('%s rescuing short transfrags kmers=%d',
                      id_str, len(K))
        num_lost = self.rescue_short_transfrags(K, K.short_transfrags)
        logging.debug('%s lost %d of %d short transfrags',
                      id_str, num_lost, len(K.short_transfrags))
        return K, k
예제 #3
0
파일: path_graph.py 프로젝트: yniknafs/taco
def create_optimal_path_graph(sgraph, kmax=0, loss_threshold=0.10,
                              stats_fh=None):
    '''
    create a path graph from the original splice graph using paths of length
    'k' for assembly. 'k' is selected by maximize_bisect over [1, kmax],
    where each candidate is scored by len() of its path graph.

    sgraph: splice graph to build the path graph from
    kmax: optional cap on k; values > 0 limit the search bound to
        min(kmax, find_longest_path(sgraph)), otherwise the longest path
        is the bound
    loss_threshold: currently UNUSED (retained so existing callers keep
        working); the expression-loss cutoff is disabled and the lost
        expression fraction is reported but not enforced
    stats_fh: optional writable file-like object that receives one
        tab-delimited statistics line per candidate k evaluated

    returns (K, k) where K is the path graph built with the selected k,
    after rescue_short_transfrags_saindex(K) has been applied
    '''
    # find upper bound to k
    longest = find_longest_path(sgraph)
    # user can force a specific kmax (for debugging/testing purposes)
    kmax = min(kmax, longest) if kmax > 0 else longest
    sgraph_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start, sgraph.end,
                                      Strand.to_gtf(sgraph.strand))
    tot_expr = sum(sgraph.get_node_expr_data(n).mean() for n in sgraph.G)

    def compute_kmers(k):
        # score function evaluated by maximize_bisect for a candidate k
        K = create_path_graph(sgraph, k)
        valid = K.graph['valid']
        short_transfrags = K.graph['short_transfrags']
        num_lost_kmers = K.graph['num_lost_kmers']
        lost_nodes = get_lost_nodes(sgraph, K)
        lost_expr = sum(sgraph.get_node_expr_data(n).mean() for n in lost_nodes)
        # float() guarantees true division even for integer operands
        # under Python 2
        if tot_expr == 0:
            lost_expr_frac = 0.0
        else:
            lost_expr_frac = lost_expr / float(tot_expr)
        # one list feeds both the debug message and the stats line; they
        # report identical values in identical order
        fields = [sgraph_id_str, k, kmax, len(sgraph.transfrags), len(K),
                  len(short_transfrags), num_lost_kmers, tot_expr,
                  lost_expr, lost_expr_frac, int(valid)]
        # arguments are passed separately so formatting is deferred until
        # the record is actually emitted
        logging.debug('%s k=%d kmax=%d t=%d kmers=%d short_transfrags=%d '
                      'lost_kmers=%d tot_expr=%.3f lost_expr=%.3f '
                      'lost_expr_frac=%.3f valid=%d', *fields)
        if stats_fh:
            # portable replacement for the Python-2-only 'print >>fh'
            stats_fh.write('\t'.join(map(str, fields)) + '\n')
        if not valid:
            # invalid graphs get a negative score
            # NOTE(review): presumably pushes the search toward smaller
            # k -- confirm against maximize_bisect
            return -k
        return len(K)

    k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
    logging.debug('Creating path graph k=%d num_kmers=%d', k, num_kmers)
    K = create_path_graph(sgraph, k)
    logging.debug('Rescuing short transfrags')
    rescue_short_transfrags_saindex(K)
    return K, k
예제 #4
0
파일: path_graph.py 프로젝트: tacorna/taco
    def create_optimal(self, kmax=0, stats_fh=None):
        '''
        create a graph where nodes are paths of length 'k'. the parameter
        'k' is chosen by maximize_bisect over the range [1, kmax]. each
        candidate is scored by the size (len) of its graph weighted by the
        fraction K.graph_expr / self.total_expr, rounded to an integer.

        kmax: optional cap on k. When > 0 the search bound is
            min(kmax, self.longest_path_length); otherwise the longest
            path length alone bounds the search.
        stats_fh: optional writable file-like object. One tab-delimited
            line from self.get_stats is written for every candidate k
            evaluated, plus a final line (is_opt=1) for the selected graph
            after short transfrags have been rescued.

        returns a tuple (K, k): the graph built with the selected k, after
        self.rescue_short_transfrags has been applied, and k. When there
        are no paths, returns (None, 0).
        '''
        if len(self.paths) == 0:
            return None, 0

        # find upper bound to k
        longest = self.longest_path_length
        # user can force a specific kmax
        kmax = min(kmax, longest) if kmax > 0 else longest

        def write_stats(fields):
            # write() works on both Python 2 and 3, unlike 'print >>fh'
            stats_fh.write('\t'.join(map(str, fields)) + '\n')

        def compute_kmers(k):
            # score function handed to maximize_bisect for one candidate k
            K = self.create(k)
            # float() keeps this a true division even if both values are
            # integers under Python 2 (where it would floor to 0)
            if self.total_expr == 0:
                expr_frac = 0.0
            else:
                expr_frac = K.graph_expr / float(self.total_expr)
            if stats_fh:
                write_stats(self.get_stats(K, kmax=kmax))
            if not K.valid:
                # invalid graphs get a negative score
                # NOTE(review): presumably this steers the search toward
                # smaller k -- confirm against maximize_bisect
                return -k
            # optimize based on kmers weighted by fraction of total
            # transcript expression retained in the path graph
            return int(round(expr_frac * len(K)))

        k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
        # values are passed as arguments so the messages are only
        # formatted when debug logging is enabled
        logging.debug('%s creating path graph k=%d num_kmers=%d',
                      str(self), k, num_kmers)
        K = self.create(k)
        logging.debug('%s rescuing short transfrags kmers=%d',
                      str(self), len(K))
        lost_short, lost_short_expr = \
            self.rescue_short_transfrags(K, K.short_transfrags)
        logging.debug('%s lost %d of %d short transfrags',
                      str(self), lost_short, len(K.short_transfrags))
        if stats_fh:
            write_stats(self.get_stats(K,
                                       kmax=kmax,
                                       lost_short=lost_short,
                                       lost_short_expr=lost_short_expr,
                                       is_opt=1))
        return K, k
예제 #5
0
파일: path_graph.py 프로젝트: tacorna/taco
    def create_optimal(self, kmax=0, stats_fh=None):
        '''
        create a graph where nodes are paths of length 'k'. 'k' is selected
        by maximize_bisect over [1, kmax]; the score of a candidate is
        len() of its graph multiplied by K.graph_expr / self.total_expr
        and rounded to an integer.

        kmax: optional cap on k; values > 0 limit the search bound to
            min(kmax, self.longest_path_length), otherwise the longest
            path length is the bound
        stats_fh: optional writable file-like object that receives one
            tab-delimited self.get_stats line per candidate k evaluated
            and a final line (is_opt=1) for the selected graph once short
            transfrags have been rescued

        returns (K, k) where K is the graph built with the selected k
        after self.rescue_short_transfrags has been applied; returns
        (None, 0) when there are no paths
        '''
        if len(self.paths) == 0:
            return None, 0

        # find upper bound to k
        longest = self.longest_path_length
        # user can force a specific kmax
        kmax = min(kmax, longest) if kmax > 0 else longest

        def emit_stats(fields):
            # portable replacement for the Python-2-only 'print >>fh'
            stats_fh.write('\t'.join(map(str, fields)) + '\n')

        def compute_kmers(k):
            # score function evaluated by maximize_bisect for a candidate k
            K = self.create(k)
            # float() guarantees true division; with two integers Python 2
            # would floor the fraction to 0
            if self.total_expr == 0:
                expr_frac = 0.0
            else:
                expr_frac = K.graph_expr / float(self.total_expr)
            if stats_fh:
                emit_stats(self.get_stats(K, kmax=kmax))
            if not K.valid:
                # invalid graphs get a negative score
                # NOTE(review): presumably pushes the search toward
                # smaller k -- confirm against maximize_bisect
                return -k
            # optimize based on kmers weighted by fraction of total
            # transcript expression retained in the path graph
            return int(round(expr_frac * len(K)))

        k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
        # arguments are passed separately so formatting is deferred until
        # the record is actually emitted
        logging.debug('%s creating path graph k=%d num_kmers=%d',
                      str(self), k, num_kmers)
        K = self.create(k)
        logging.debug('%s rescuing short transfrags kmers=%d',
                      str(self), len(K))
        lost_short, lost_short_expr = \
            self.rescue_short_transfrags(K, K.short_transfrags)
        logging.debug('%s lost %d of %d short transfrags',
                      str(self), lost_short, len(K.short_transfrags))
        if stats_fh:
            emit_stats(self.get_stats(K, kmax=kmax,
                                      lost_short=lost_short,
                                      lost_short_expr=lost_short_expr,
                                      is_opt=1))
        return K, k
예제 #6
0
파일: path_graph.py 프로젝트: yniknafs/taco
    def create_optimal(self, kmax=0, loss_threshold=0.10, stats_fh=None):
        '''
        create a graph where nodes are paths of length 'k'. 'k' is selected
        by maximize_bisect over [1, kmax], where each candidate is scored
        by len() of the graph it produces.

        kmax: optional cap on k; values > 0 limit the search bound to
            min(kmax, self.longest_path_length()), otherwise the longest
            path length is the bound
        loss_threshold: currently UNUSED (retained so existing callers
            keep working); the expression-loss cutoff is disabled and the
            lost expression fraction only appears in the stats output
        stats_fh: optional writable file-like object that receives one
            tab-delimited statistics line per candidate k evaluated

        returns (K, k) where K is the graph built with the selected k
        after self.rescue_short_transfrags has been applied; returns
        (None, 0) when there are no paths
        '''
        if len(self.paths) == 0:
            return None, 0

        # find upper bound to k
        longest = self.longest_path_length()
        # user can force a specific kmax (for debugging/testing purposes)
        kmax = min(kmax, longest) if kmax > 0 else longest
        id_str = ('%s:%d-%d[%s]' % (self.chrom, self.start, self.end,
                                    Strand.to_gtf(self.strand)))

        def compute_kmers(k):
            # score function evaluated by maximize_bisect for a candidate k
            K = self.create(k)
            if stats_fh:
                # these totals are reporting-only, so skip the work when
                # no stats file was supplied
                tot_expr = sum(K.exprs[i] for i in K.node_ids_iter())
                lost_expr = K.lost_kmer_expr
                # float() guarantees true division even for integer
                # operands under Python 2
                if tot_expr == 0:
                    lost_expr_frac = 0.0
                else:
                    lost_expr_frac = lost_expr / float(tot_expr)
                fields = [self.chrom, self.start, self.end,
                          Strand.to_gtf(self.strand), k, kmax,
                          len(self.paths), K.n, len(K.short_transfrags),
                          K.num_lost_kmers, tot_expr, lost_expr,
                          lost_expr_frac, int(K.valid)]
                # portable replacement for the Python-2-only 'print >>fh'
                stats_fh.write('\t'.join(map(str, fields)) + '\n')
            if not K.valid:
                # invalid graphs get a negative score
                # NOTE(review): presumably pushes the search toward
                # smaller k -- confirm against maximize_bisect
                return -k
            return len(K)

        k, num_kmers = maximize_bisect(compute_kmers, 1, kmax, 0)
        # arguments are passed separately so formatting is deferred until
        # the record is actually emitted
        logging.debug('%s creating path graph k=%d num_kmers=%d',
                      id_str, k, num_kmers)
        K = self.create(k)
        logging.debug('%s rescuing short transfrags kmers=%d',
                      id_str, len(K))
        num_lost = self.rescue_short_transfrags(K, K.short_transfrags)
        logging.debug('%s lost %d of %d short transfrags',
                      id_str, num_lost, len(K.short_transfrags))
        return K, k