Example #1
def count_graph_coverage_wrapper(fname_in, fname_out, CFG):

    (genes, inserted) = cPickle.load(open(fname_in, 'r'))

    if genes[0].segmentgraph is None or genes[0].segmentgraph.is_empty():
        for g in genes:
            g.segmentgraph = Segmentgraph(g)
        cPickle.dump((genes, inserted), open(fname_in, 'w'), -1)

    counts = dict()
    counts['segments'] = []
    counts['seg_pos'] = []
    counts['gene_ids_segs'] = []
    counts['edges'] = []
    counts['gene_ids_edges'] = []
    counts['seg_len'] = sp.hstack([
        x.segmentgraph.segments[1, :] - x.segmentgraph.segments[0, :]
        for x in genes
    ]).T
    counts['gene_names'] = sp.array([x.name for x in genes], dtype='str')

    if not CFG['rproc']:
        for s_idx in range(CFG['strains'].shape[0]):
            print '\n%i/%i' % (s_idx + 1, CFG['strains'].shape[0])
            if s_idx == 0:
                counts_tmp = count_graph_coverage(genes,
                                                  CFG['bam_fnames'][s_idx],
                                                  CFG)
            else:
                counts_tmp = sp.r_[
                    sp.atleast_2d(counts_tmp),
                    count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)]

        for c in range(counts_tmp.shape[1]):
            counts['segments'].append(
                sp.hstack(
                    [sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
            counts['seg_pos'].append(
                sp.hstack(
                    [sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
            counts['gene_ids_segs'].append(
                sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1),
                        dtype='int') * c)
            tmp = [
                sp.atleast_2d(x.edges) for x in counts_tmp[:, c]
                if x.edges.shape[0] > 0
            ]
            if len(tmp) == 0:
                continue
            tmp = sp.hstack(tmp)
            if tmp.shape[0] > 0:
                counts['edges'].append(sp.c_[tmp[:, 0], tmp[:, range(1, tmp.shape[1], 2)]])
                counts['gene_ids_edges'].append(sp.ones((tmp.shape[0], 1), dtype='int') * c)

        ### write result data to hdf5
        for key in counts:
            counts[key] = sp.vstack(counts[key]) if len(counts[key]) > 0 else counts[key]
        counts['edge_idx'] = counts['edges'][:, 0] if len(counts['edges']) > 0 else sp.array([])
        counts['edges'] = counts['edges'][:, 1:] if len(counts['edges']) > 0 else sp.array([])
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='strains', data=CFG['strains'])
        for key in counts:
            h5fid.create_dataset(name=key, data=counts[key])
        h5fid.close()
    else:
        ### use an adaptive chunk size that takes the number of strains into account (take as many genes per chunk as needed to reach ~10K gene/strain counting tasks)
        chunksize = int(max(1, math.floor(10000 / len(CFG['strains']))))
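        # worked example (illustrative): with 500 strains this gives
        # chunksize = max(1, floor(10000 / 500)) = 20 genes per chunk,
        # i.e. roughly 20 * 500 = 10K gene/strain counting tasks per job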

        jobinfo = []

        PAR = dict()
        PAR['CFG'] = CFG

        for c_idx in range(0, genes.shape[0], chunksize):
            cc_idx = min(genes.shape[0], c_idx + chunksize)
            fn = fname_out.replace('.pickle',
                                   '.chunk_%i_%i.pickle' % (c_idx, cc_idx))
            if os.path.exists(fn):
                continue
            else:
                print 'submitting chunk %i to %i' % (c_idx, cc_idx)
                PAR['genes'] = genes[c_idx:cc_idx]
                PAR['fn_bam'] = CFG['bam_fnames']
                PAR['fn_out'] = fn
                PAR['CFG'] = CFG
                jobinfo.append(
                    rp.rproc('count_graph_coverage', PAR, 6000,
                             CFG['options_rproc'], 60 * 48))

        rp.rproc_wait(jobinfo, 30, 1.0, -1)

        ### merge results from count chunks
        if 'verbose' in CFG and CFG['verbose']:
            print '\nCollecting count data from chunks ...\n'
            print 'writing data to %s' % fname_out

        ### write data to hdf5 continuously
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='gene_names', data=counts['gene_names'])
        h5fid.create_dataset(name='seg_len', data=counts['seg_len'])
        h5fid.create_dataset(name='strains', data=CFG['strains'])
        for c_idx in range(0, genes.shape[0], chunksize):
            cc_idx = min(genes.shape[0], c_idx + chunksize)
            if 'verbose' in CFG and CFG['verbose']:
                print 'collecting chunk %i-%i (%i)' % (c_idx, cc_idx,
                                                       genes.shape[0])
            fn = fname_out.replace('.pickle',
                                   '.chunk_%i_%i.pickle' % (c_idx, cc_idx))
            if not os.path.exists(fn):
                print >> sys.stderr, 'ERROR: Not all chunks in counting graph coverage completed!'
                sys.exit(1)
            else:
                counts_tmp = cPickle.load(open(fn, 'r'))
                for c in range(counts_tmp.shape[1]):
                    if 'segments' in h5fid:
                        appendToHDF5(
                            h5fid,
                            sp.hstack([
                                sp.atleast_2d(x.segments).T
                                for x in counts_tmp[:, c]
                            ]), 'segments')
                        appendToHDF5(
                            h5fid,
                            sp.hstack([
                                sp.atleast_2d(x.seg_pos).T
                                for x in counts_tmp[:, c]
                            ]), 'seg_pos')
                        appendToHDF5(
                            h5fid,
                            sp.ones((sp.atleast_2d(
                                counts_tmp[0, c].seg_pos).shape[1], 1),
                                    dtype='int') * (c_idx + c),
                            'gene_ids_segs')
                    else:
                        h5fid.create_dataset(name='segments',
                                             data=sp.hstack([
                                                 sp.atleast_2d(x.segments).T
                                                 for x in counts_tmp[:, c]
                                             ]),
                                             chunks=True,
                                             compression='gzip',
                                             maxshape=(None,
                                                       len(CFG['strains'])))
                        h5fid.create_dataset(name='seg_pos',
                                             data=sp.hstack([
                                                 sp.atleast_2d(x.seg_pos).T
                                                 for x in counts_tmp[:, c]
                                             ]),
                                             chunks=True,
                                             compression='gzip',
                                             maxshape=(None,
                                                       len(CFG['strains'])))
                        h5fid.create_dataset(
                            name='gene_ids_segs',
                            data=sp.ones((sp.atleast_2d(
                                counts_tmp[0, c].seg_pos).shape[1], 1),
                                         dtype='int') * (c_idx + c),
                            chunks=True,
                            compression='gzip',
                            maxshape=(None, 1))

                        #counts['segments'].append(sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
                        #counts['seg_pos'].append(sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
                        #counts['gene_ids_segs'].append(sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * (c_idx + c))

                    tmp = [
                        sp.atleast_2d(x.edges) for x in counts_tmp[:, c]
                        if x.edges.shape[0] > 0
                    ]
                    if len(tmp) == 0:
                        continue
                    tmp = sp.hstack(tmp)
                    if tmp.shape[0] > 0:
                        if 'edges' in h5fid:
                            appendToHDF5(h5fid, tmp[:, range(1, tmp.shape[1], 2)], 'edges')
                            appendToHDF5(h5fid, tmp[:, 0], 'edge_idx')
                            appendToHDF5(
                                h5fid,
                                sp.ones((tmp.shape[0], 1), dtype='int') *
                                (c_idx + c), 'gene_ids_edges')
                        else:
                            h5fid.create_dataset(
                                name='edges',
                                data=tmp[:, range(1, tmp.shape[1], 2)],
                                chunks=True,
                                compression='gzip',
                                maxshape=(None, tmp.shape[1] / 2))
                            h5fid.create_dataset(name='edge_idx',
                                                 data=tmp[:, 0],
                                                 chunks=True,
                                                 compression='gzip',
                                                 maxshape=(None, ))
                            h5fid.create_dataset(
                                name='gene_ids_edges',
                                data=sp.ones((tmp.shape[0], 1), dtype='int') *
                                (c_idx + c),
                                chunks=True,
                                compression='gzip',
                                maxshape=(None, 1))
                        #counts['edges'].append(sp.c_[tmp[:, 0], tmp[:, range(1, tmp.shape[1], 2)]])
                        #counts['gene_ids_edges'].append(sp.ones((tmp.shape[0], 1), dtype='int') * (c_idx + c))
                del tmp, counts_tmp
        h5fid.close()
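
The chunk-collection code above relies on a helper appendToHDF5 that is not part of this listing. A minimal sketch of what such a helper could look like, assuming the resizable datasets (maxshape=(None, ...)) created above; the name and argument order follow the calls in the example, the body is an assumption:

def appendToHDF5(fid, data, name):
    # sketch (assumption): grow the resizable dataset along axis 0
    # and write the new block of rows at its end
    dset = fid[name]
    old_len = dset.shape[0]
    dset.resize(old_len + data.shape[0], axis=0)
    dset[old_len:, ...] = data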
Example #2
def count_graph_coverage_wrapper(fname_in, fname_out, CFG, sample_idx=None):

    (genes, inserted) = cPickle.load(open(fname_in, 'r'))
    
    if genes[0].segmentgraph is None or genes[0].segmentgraph.is_empty():
        for g in genes:
            g.segmentgraph = Segmentgraph(g)
        cPickle.dump((genes, inserted), open(fname_in, 'w'), -1)

    counts = dict()
    counts['segments'] = []
    counts['seg_pos'] = []
    counts['gene_ids_segs'] = []
    counts['edges'] = []
    counts['gene_ids_edges'] = []
    counts['seg_len'] = sp.hstack([x.segmentgraph.segments[1, :] - x.segmentgraph.segments[0, :] for x in genes]).T
    counts['gene_names'] = sp.array([x.name for x in genes], dtype='str')

    if not CFG['rproc']:
        if CFG['merge_strategy'] == 'single':
            print '\nprocessing %s' % (CFG['samples'][sample_idx])
            counts_tmp = count_graph_coverage(genes, CFG['bam_fnames'][sample_idx], CFG)
        else:
            for s_idx in range(CFG['strains'].shape[0]):
                print '\n%i/%i' % (s_idx + 1, CFG['strains'].shape[0])
                if s_idx == 0:
                    counts_tmp = count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)
                else:
                    counts_tmp = sp.r_[sp.atleast_2d(counts_tmp), count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)]

        for c in range(counts_tmp.shape[1]):
            counts['segments'].append(sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
            counts['seg_pos'].append(sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
            counts['gene_ids_segs'].append(sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * c)
            tmp = [sp.atleast_2d(x.edges) for x in counts_tmp[:, c] if x.edges.shape[0] > 0]
            if len(tmp) == 0:
                continue
            tmp = sp.hstack(tmp)
            if tmp.shape[0] > 0:
                counts['edges'].append(sp.c_[tmp[:, 0], tmp[:, range(1, tmp.shape[1], 2)]])
                counts['gene_ids_edges'].append(sp.ones((tmp.shape[0], 1), dtype='int') * c)

        ### write result data to hdf5
        for key in counts:
            counts[key] = sp.vstack(counts[key]) if len(counts[key]) > 0 else counts[key]
        counts['edge_idx'] = counts['edges'][:, 0] if len(counts['edges']) > 0 else sp.array([])
        counts['edges'] = counts['edges'][:, 1:] if len(counts['edges']) > 0 else sp.array([])
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='strains', data=CFG['strains'])
        for key in counts:
            h5fid.create_dataset(name=key, data=counts[key])
        h5fid.close()
    else:
        ### use an adaptive chunk size that takes the number of strains into account (take as many genes per chunk as needed to reach ~10K gene/strain counting tasks)
        chunksize = int(max(1, math.floor(10000 / len(CFG['strains']))))

        jobinfo = []

        PAR = dict()
        PAR['CFG'] = CFG.copy()
        if CFG['merge_strategy'] == 'single':
            PAR['CFG']['bam_fnames'] = PAR['CFG']['bam_fnames'][sample_idx]
            PAR['CFG']['samples'] = PAR['CFG']['samples'][sample_idx]
            PAR['CFG']['strains'] = PAR['CFG']['strains'][sample_idx]

        #s_idx = sp.argsort([x.chr for x in genes]) # TODO
        s_idx = sp.arange(genes.shape[0])
        for c_idx in range(0, s_idx.shape[0], chunksize):
            cc_idx = min(s_idx.shape[0], c_idx + chunksize)
            fn = re.sub(r'.hdf5$', '', fname_out) + '.chunk_%i_%i.pickle' % (c_idx, cc_idx)
            if os.path.exists(fn):
                continue
            else:
                print 'submitting chunk %i to %i (%i)' % (c_idx, cc_idx, s_idx.shape[0])
                PAR['genes'] = genes[s_idx][c_idx:cc_idx]
                PAR['fn_bam'] = CFG['bam_fnames']
                PAR['fn_out'] = fn
                # PAR['CFG'] was already set above (and sliced down to a single
                # sample for the 'single' strategy); resetting it to the full
                # CFG here would undo that slicing
                jobinfo.append(rp.rproc('count_graph_coverage', PAR, 15000, CFG['options_rproc'], 60*12))

        rp.rproc_wait(jobinfo, 30, 1.0, -1)
        del genes

        ### merge results from count chunks
        if 'verbose' in CFG and CFG['verbose']:
            print '\nCollecting count data from chunks ...\n'
            print 'writing data to %s' % fname_out

        ### write data to hdf5 continuously
        h5fid = h5py.File(fname_out, 'w')
        h5fid.create_dataset(name='gene_names', data=counts['gene_names'])
        h5fid.create_dataset(name='seg_len', data=counts['seg_len'])
        h5fid.create_dataset(name='strains', data=CFG['strains'])
        for c_idx in range(0, s_idx.shape[0], chunksize):
            cc_idx = min(s_idx.shape[0], c_idx + chunksize)
            if 'verbose' in CFG and CFG['verbose']:
                print 'collecting chunk %i-%i (%i)' % (c_idx, cc_idx, s_idx.shape[0])
            fn = re.sub(r'.hdf5$', '', fname_out) + '.chunk_%i_%i.pickle' % (c_idx, cc_idx)
            if not os.path.exists(fn):
                print >> sys.stderr, 'ERROR: Not all chunks in counting graph coverage completed!'
                sys.exit(1)
            else:
                counts_tmp = cPickle.load(open(fn, 'r'))
                for c in range(counts_tmp.shape[1]):
                    if 'segments' in h5fid:
                        appendToHDF5(h5fid, sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]), 'segments')
                        appendToHDF5(h5fid, sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]), 'seg_pos') 
                        appendToHDF5(h5fid, sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * (s_idx[c_idx + c]), 'gene_ids_segs')
                    else:
                        h5fid.create_dataset(name='segments', data=sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]), chunks=True, compression='gzip', maxshape=(None, len(CFG['strains'])))
                        h5fid.create_dataset(name='seg_pos', data=sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]), chunks=True, compression='gzip', maxshape=(None, len(CFG['strains'])))
                        h5fid.create_dataset(name='gene_ids_segs', data=sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * (s_idx[c_idx + c]), chunks=True, compression='gzip', maxshape=(None, 1))

                    tmp = [sp.atleast_2d(x.edges) for x in counts_tmp[:, c] if x.edges.shape[0] > 0]
                    if len(tmp) == 0:
                        continue
                    tmp = sp.hstack(tmp)
                    if tmp.shape[0] > 0:
                        if 'edges' in h5fid:
                            appendToHDF5(h5fid, tmp[:, range(1, tmp.shape[1], 2)], 'edges')
                            appendToHDF5(h5fid, tmp[:, 0], 'edge_idx')
                            appendToHDF5(h5fid, sp.ones((tmp.shape[0], 1), dtype='int') * (s_idx[c_idx + c]), 'gene_ids_edges')
                        else:
                            h5fid.create_dataset(name='edges', data=tmp[:, range(1, tmp.shape[1], 2)], chunks=True, compression='gzip', maxshape=(None, tmp.shape[1] / 2))
                            h5fid.create_dataset(name='edge_idx', data=tmp[:, 0], chunks=True, compression='gzip', maxshape=(None,))
                            h5fid.create_dataset(name='gene_ids_edges', data=sp.ones((tmp.shape[0], 1), dtype='int') * (s_idx[c_idx + c]), chunks=True, compression='gzip', maxshape=(None, 1))
                del tmp, counts_tmp
        h5fid.close()
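
With the 'single' merge strategy, sample_idx selects one sample per call; a hypothetical driver loop might look like the following (the output file-name scheme is an assumption, the CFG keys mirror those accessed above):

for s_idx in range(len(CFG['samples'])):
    # one count file per sample (naming is illustrative)
    fn_count = re.sub(r'.pickle$', '', fname_in) + '.%s.count.hdf5' % CFG['samples'][s_idx]
    count_graph_coverage_wrapper(fname_in, fn_count, CFG, sample_idx=s_idx)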
Example #3
def run_merge(CFG):

    merge_all = (CFG['merge_strategy'] == 'merge_all')
    merge_all_tag = ''
    if merge_all:
        merge_all_tag = '_merged_bams'

    prune_tag = ''
    if CFG['do_prune']:
        prune_tag = '_pruned'

    chunksize = 50

    fn_out = '%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'] , CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    fn_out_val = '%s/spladder/genes_graph_conf%i.%s%s.validated.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    if CFG['validate_splicegraphs']:
        fn_out_count = '%s/spladder/genes_graph_conf%i.%s%s.validated.count.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'] , prune_tag)
    else:
        fn_out_count = '%s/spladder/genes_graph_conf%i.%s%s.count.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'] , prune_tag)

    if not os.path.exists(fn_out):
        if not CFG['rproc']:
            merge_genes_by_splicegraph(CFG)
        else:
            jobinfo = []
            PAR = dict()
            PAR['CFG'] = CFG
            if chunksize > 0:
                merge_list_len = len(CFG['samples'])
                if merge_all:
                    merge_list_len += 1
                for c_idx in range(0, merge_list_len, chunksize):
                    fn = '%s/spladder/genes_graph_conf%i.%s%s_chunk%i_%i.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag, c_idx, min(merge_list_len, c_idx + chunksize))
                    if os.path.exists(fn):
                        continue
                    else:
                        print 'submitting chunk %i to %i' % (c_idx, min(merge_list_len, c_idx + chunksize))
                        PAR['chunk_idx'] = range(c_idx, min(merge_list_len, c_idx + chunksize))
                        jobinfo.append(rp.rproc('merge_genes_by_splicegraph', PAR, 50000, CFG['options_rproc'], 40*60))
            else:
                jobinfo.append(rp.rproc('merge_genes_by_splicegraph', PAR, 10000, CFG['options_rproc'], 40*60))

            rp.rproc_wait(jobinfo, 30, 1.0, -1)
            ### merge chunks
            if chunksize > 0:
                PAR['chunksize'] = chunksize
                merge_chunks_by_splicegraph(PAR)
    else:
        print 'File %s already exists!' % fn_out

    ### generate validated version of splice graph
    if CFG['validate_splicegraphs'] and not os.path.exists(fn_out_val):
        (genes, inserted) = cPickle.load(open(fn_out, 'r'))
        genes = filter_by_edgecount(genes, CFG)
        cPickle.dump((genes, inserted), open(fn_out_val, 'w'), -1)
        del genes

    ### count segment graph
    if CFG['validate_splicegraphs']:
        count_graph_coverage_wrapper(fn_out_val, fn_out_count, CFG)
    else:
        count_graph_coverage_wrapper(fn_out, fn_out_count, CFG)

    if CFG['do_gen_isoforms']:
        fn_out = '%s/spladder/genes_graph_conf%i.%s%s_isoforms.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
        if not os.path.exists(fn_out):
            if not CFG['rproc']:
                # NOTE: 'experiment' is not defined anywhere in this scope in the original code
                merge_genes_by_isoform(CFG['out_dirname'], CFG['confidence_level'], merge_all, experiment)
            else:
                jobinfo = [rp.rproc('merge_genes_by_isoform', PAR, 10000, CFG['options_rproc'], 40*60)]
                rp.rproc_wait(jobinfo, 30, 1.0, 1)
        else:
            print 'File %s already exists!' % fn_out
Example #4
def run_merge(CFG):

    merge_all = (CFG['merge_strategy'] == 'merge_all')
    merge_all_tag = ''
    if merge_all:
        merge_all_tag = '_merged_bams'

    prune_tag = ''
    if CFG['do_prune']:
        prune_tag = '_pruned'

    chunksize = 10

    fn_out = '%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'] , CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    #fn_out_val = '%s/spladder/genes_graph_conf%i.%s%s.validated.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    if CFG['validate_splicegraphs']:
        fn_out_count = '%s/spladder/genes_graph_conf%i.%s%s.validated.count.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'] , prune_tag)
    else:
        fn_out_count = '%s/spladder/genes_graph_conf%i.%s%s.count.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'] , prune_tag)

    if not os.path.exists(fn_out):
        if not CFG['rproc']:
            merge_list = sp.array(['%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], x, prune_tag) for x in CFG['samples']])
            merge_genes_by_splicegraph(CFG, merge_list=merge_list, fn_out=fn_out)
        else:
            jobinfo = []
            PAR = dict()
            PAR['CFG'] = CFG
            if chunksize > 0:
                levels = int(math.ceil(math.log(len(CFG['samples']), chunksize)))
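                # worked example (illustrative): 500 samples with chunksize 10
                # require levels = ceil(log_10(500)) = 3 merge rounds,
                # collapsing 500 -> 50 -> 5 -> 1 file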
                level_files = dict()
                for level in range(1, levels + 1):
                    print 'merging files on level %i' % level
                    if level == 1:
                        merge_list = sp.array(['%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], x, prune_tag) for x in CFG['samples']])
                    else:
                        merge_list = sp.array(level_files[level - 1])
                    level_files[level] = []
                    for c_idx in range(0, len(merge_list), chunksize):
                        if level == levels:
                            assert(len(merge_list) <= chunksize)
                            fn = fn_out
                        else:
                            fn = '%s/spladder/genes_graph_conf%i.%s%s_level%i_chunk%i_%i.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag, level, c_idx, min(len(merge_list), c_idx + chunksize))
                        level_files[level].append(fn)
                        if os.path.exists(fn):
                            continue
                        else:
                            print 'submitting level %i chunk %i to %i' % (level, c_idx, min(len(merge_list), c_idx + chunksize))
                            chunk_idx = range(c_idx, min(len(merge_list), c_idx + chunksize))
                            PAR['merge_list'] = merge_list[chunk_idx]
                            PAR['fn_out'] = fn
                            jobinfo.append(rp.rproc('merge_genes_by_splicegraph', PAR, 20000*level, CFG['options_rproc'], 40*60))
                    rp.rproc_wait(jobinfo, 30, 1.0, -1)
            else:
                PAR['merge_list'] = CFG['samples']
                PAR['fn_out'] = fn_out
                jobinfo.append(rp.rproc('merge_genes_by_splicegraph', PAR, 10000, CFG['options_rproc'], 40*60))
                rp.rproc_wait(jobinfo, 30, 1.0, -1)
    else:
        print 'File %s already exists!' % fn_out

    ### generate validated version of splice graph
    #if CFG['validate_splicegraphs'] and not os.path.exists(fn_out_val):
    #    (genes, inserted) = cPickle.load(open(fn_out, 'r'))
    #    genes = filter_by_edgecount(genes, CFG)
    #    cPickle.dump((genes, inserted), open(fn_out_val, 'w'), -1)
    #    del genes

    ### count segment graph
    #if CFG['validate_splicegraphs']:
    #   count_graph_coverage_wrapper(fn_out_val, fn_out_count, CFG)
    #else:
    #   count_graph_coverage_wrapper(fn_out, fn_out_count, CFG)

    if CFG['do_gen_isoforms']:
        fn_out = '%s/spladder/genes_graph_conf%i.%s%s_isoforms.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
        if not os.path.exists(fn_out):
            if not CFG['rproc']:
                # NOTE: 'experiment' is not defined anywhere in this scope in the original code
                merge_genes_by_isoform(CFG['out_dirname'], CFG['confidence_level'], merge_all, experiment)
            else:
                jobinfo = [rp.rproc('merge_genes_by_isoform', PAR, 10000, CFG['options_rproc'], 40*60)]
                rp.rproc_wait(jobinfo, 30, 1.0, 1)
        else:
            print 'File %s already exists!' % fn_out
Example #5
def count_graph_coverage_wrapper(fname_in, fname_out, CFG):

    (genes, inserted) = cPickle.load(open(fname_in, 'r'))
    
    if genes[0].segmentgraph is None:
        for g in genes:
            g.segmentgraph = Segmentgraph(g)
        cPickle.dump((genes, inserted), open(fname_in, 'w'), -1)

    counts = dict()
    counts['segments'] = []
    counts['seg_pos'] = []
    counts['gene_ids_segs'] = []
    counts['edges'] = []
    counts['gene_ids_edges'] = []
    if not CFG['rproc']:
        for s_idx in range(CFG['strains'].shape[0]):
            print '\n%i/%i' % (s_idx + 1, CFG['strains'].shape[0])
            if s_idx == 0:
                counts_tmp = count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)
            else:
                counts_tmp = sp.r_[sp.atleast_2d(counts_tmp), count_graph_coverage(genes, CFG['bam_fnames'][s_idx], CFG)]

        for c in range(counts_tmp.shape[1]):
            counts['segments'].append(sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
            counts['seg_pos'].append(sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
            counts['gene_ids_segs'].append(sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * c)
            tmp = sp.hstack([sp.atleast_2d(x.edges) for x in counts_tmp[:, c]])
            if tmp.shape[0] > 0:
                counts['edges'].append(sp.c_[tmp[:, 0], tmp[:, range(1, tmp.shape[1], 2)]])
                counts['gene_ids_edges'].append(sp.ones((tmp.shape[0], 1), dtype='int') * c)
    else:
        ### use an adaptive chunk size that takes the number of strains into account (take as many genes per chunk as needed to reach ~10K gene/strain counting tasks)
        chunksize = int(max(1, math.floor(10000 / len(CFG['strains']))))

        jobinfo = []

        PAR = dict()
        PAR['CFG'] = CFG
        for c_idx in range(0, genes.shape[0], chunksize):
            cc_idx = min(genes.shape[0], c_idx + chunksize)
            fn = fname_out.replace('.pickle', '.chunk_%i_%i.pickle' % (c_idx, cc_idx))
            if os.path.exists(fn):
                continue
            else:
                print 'submitting chunk %i to %i' % (c_idx, cc_idx)
                PAR['genes'] = genes[c_idx:cc_idx]
                PAR['fn_bam'] = CFG['bam_fnames']
                PAR['fn_out'] = fn
                PAR['CFG'] = CFG
                jobinfo.append(rp.rproc('count_graph_coverage', PAR, 30000, CFG['options_rproc'], 60))

        rp.rproc_wait(jobinfo, 30, 1.0, -1)

        ### merge results
        for c_idx in range(0, genes.shape[0], chunksize):
            cc_idx = min(genes.shape[0], c_idx + chunksize)
            fn = fname_out.replace('.pickle', '.chunk_%i_%i.pickle' % (c_idx, cc_idx))
            if not os.path.exists(fn):
                print >> sys.stderr, 'ERROR: Not all chunks in counting graph coverage completed!'
                sys.exit(1)
            else:
                counts_tmp = cPickle.load(open(fn, 'r'))
                for c in range(counts_tmp.shape[1]):
                    counts['segments'].append(sp.hstack([sp.atleast_2d(x.segments).T for x in counts_tmp[:, c]]))
                    counts['seg_pos'].append(sp.hstack([sp.atleast_2d(x.seg_pos).T for x in counts_tmp[:, c]]))
                    counts['gene_ids_segs'].append(sp.ones((sp.atleast_2d(counts_tmp[0, c].seg_pos).shape[1], 1), dtype='int') * (c_idx + c))
                    tmp = sp.hstack([sp.atleast_2d(x.edges) for x in counts_tmp[:, c]])
                    if tmp.shape[0] > 0:
                        counts['edges'].append(sp.c_[tmp[:, 0], tmp[:, range(1, tmp.shape[1], 2)]])
                        counts['gene_ids_edges'].append(sp.ones((tmp.shape[0], 1), dtype='int') * (c_idx + c))

    for key in counts:
        if len(counts[key]) > 0:
            counts[key] = sp.vstack(counts[key])
    if len(counts['edges']) > 0:
        counts['edge_idx'] = counts['edges'][:, 0]
        counts['edges'] = counts['edges'][:, 1:]
    else:
        counts['edge_idx'] = sp.array([])
        counts['edges'] = sp.array([])
    counts['seg_len'] = sp.hstack([x.segmentgraph.segments[1, :] - x.segmentgraph.segments[0, :] for x in genes]).T

    ### write result data to hdf5
    h5fid = h5py.File(fname_out, 'w')
    h5fid.create_dataset(name='gene_names', data=sp.array([x.name for x in genes], dtype='str'))
    h5fid.create_dataset(name='strains', data=CFG['strains'])
    for key in counts:
        h5fid.create_dataset(name=key, data=counts[key])
    h5fid.close()
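
All variants of count_graph_coverage_wrapper above emit the same HDF5 layout. A minimal sketch of reading the counts back, assuming only the dataset names visible in the create_dataset calls above:

import h5py

h5f = h5py.File(fname_out, 'r')          # file produced by count_graph_coverage_wrapper
segments = h5f['segments'][:]            # segment coverage, one column per strain
gene_ids_segs = h5f['gene_ids_segs'][:]  # gene index for each segment row
edges = h5f['edges'][:]                  # edge (junction) coverage, one column per strain
edge_idx = h5f['edge_idx'][:]            # edge identifiers (first column of the raw edge counts)
strains = h5f['strains'][:]              # strain / sample names
h5f.close()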