def main():
    ### get command line options
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options, identity='test')
    CFG['use_exon_counts'] = False

    ### generate output directory
    outdir = os.path.join(options.outdir, 'testing')
    if options.timestamp == 'y':
        outdir = '%s_%s' % (outdir, str(datetime.datetime.now()).replace(' ', '_'))
    if CFG['diagnose_plots']:
        CFG['plot_dir'] = os.path.join(options.outdir, 'plots')
        if not os.path.exists(CFG['plot_dir']):
            os.makedirs(CFG['plot_dir'])

    if options.labelA != 'condA' and options.labelB != 'condB':
        outdir = '%s_%s_vs_%s' % (outdir, options.labelA, options.labelB)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if CFG['debug']:
        print "Generating simulated dataset"

        npr.seed(23)
        CFG['is_matlab'] = False
        #cov = npr.permutation(20000-20).astype('float').reshape(999, 20)
        #cov = sp.r_[cov, sp.c_[sp.ones((1, 10)) * 10, sp.ones((1, 10)) * 500000] + npr.normal(10, 1, 20)]
        #sf = sp.ones((cov.shape[1], ), dtype='float')

        setsize = 50

        ### diff event counts
        cov = sp.zeros((500, 2 * setsize), dtype='int')
        for i in range(10):
            cov[i, :setsize] = nbinom.rvs(30, 0.8, size=setsize)
            cov[i, setsize:] = nbinom.rvs(10, 0.8, size=setsize)
        for i in range(10, cov.shape[0]):
            cov[i, :] = nbinom.rvs(30, 0.8, size=2 * setsize)

        ### diff gene expression
        cov2 = sp.zeros((500, 2 * setsize), dtype='int')
        for i in range(20):
            cov2[i, :setsize] = nbinom.rvs(2000, 0.2, size=setsize)
            cov2[i, setsize:] = nbinom.rvs(2000, 0.3, size=setsize)
        for i in range(20, cov2.shape[0]):
            cov2[i, :] = nbinom.rvs(2000, 0.3, size=2 * setsize)

        cov = sp.c_[cov, cov2] * 10000

        tidx = sp.arange(setsize)

        sf = npr.uniform(0, 5, 2 * setsize)
        sf = sp.r_[sf, sf]

        #dmatrix0 = sp.ones((cov.shape[1], 3), dtype='bool')
        dmatrix1 = sp.zeros((cov.shape[1], 4), dtype='float')
        dmatrix1[:, 0] = 1
        dmatrix1[tidx, 1] = 1
        #dmatrix1[tidx, 2] = 1
        dmatrix1[tidx + (2 * setsize), 2] = 1
        dmatrix1[(2 * setsize):, 3] = 1
        #dmatrix1[:, 4] = sp.log(sf)
        dmatrix0 = dmatrix1[:, [0, 2, 3]]

        cov = cov * sf
        #sf = sp.ones((cov.shape[1], ), dtype='float')

        pvals = run_testing(cov, dmatrix0, dmatrix1, sf, CFG)
        pvals_adj = adj_pval(pvals, CFG)
        pdb.set_trace()
    else:
        val_tag = ''
        if CFG['validate_splicegraphs']:
            val_tag = '.validated'

        if CFG['is_matlab']:
            CFG['fname_genes'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.mat' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag))
            CFG['fname_count_in'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.count.mat' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag))
        else:
            CFG['fname_genes'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.pickle' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag))
            CFG['fname_count_in'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.count.hdf5' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag))

        condition_strains = None
        CFG['fname_exp_hdf5'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.gene_exp.hdf5' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag))
        if os.path.exists(CFG['fname_exp_hdf5']):
            if CFG['verbose']:
                print 'Loading expression counts from %s' % CFG['fname_exp_hdf5']
            IN = h5py.File(CFG['fname_exp_hdf5'], 'r')
            gene_counts = IN['raw_count'][:]
            gene_strains = IN['strains'][:]
            gene_ids = IN['genes'][:]
            IN.close()
        else:
            if options.subset_samples == 'y':
                condition_strains = sp.unique(sp.r_[sp.array(CFG['conditionA']), sp.array(CFG['conditionB'])])
                CFG['fname_exp_hdf5'] = os.path.join(CFG['out_dirname'], 'spladder', 'genes_graph_conf%i.%s%s.gene_exp.%i.hdf5' % (CFG['confidence_level'], CFG['merge_strategy'], val_tag, hash(tuple(sp.unique(condition_strains))) * -1))
            if os.path.exists(CFG['fname_exp_hdf5']):
                if CFG['verbose']:
                    print 'Loading expression counts from %s' % CFG['fname_exp_hdf5']
                IN = h5py.File(CFG['fname_exp_hdf5'], 'r')
                gene_counts = IN['raw_count'][:]
                gene_strains = IN['strains'][:]
                gene_ids = IN['genes'][:]
                IN.close()
            else:
                gene_counts, gene_strains, gene_ids = get_gene_expression(CFG, fn_out=CFG['fname_exp_hdf5'], strain_subset=condition_strains)

        gene_strains = sp.array([x.split(':')[1] if ':' in x else x for x in gene_strains])

        ### estimate size factors for library size normalization
        sf_ge = get_size_factors(gene_counts, CFG)

        ### get index of samples for difftest
        idx1 = sp.where(sp.in1d(gene_strains, CFG['conditionA']))[0]
        idx2 = sp.where(sp.in1d(gene_strains, CFG['conditionB']))[0]

        ### for TESTING
        #setsize = 100
        #idx1 = sp.arange(0, setsize / 2)
        #idx2 = sp.arange(setsize / 2, setsize)

        ### subset expression counts to tested samples
        gene_counts = gene_counts[:, sp.r_[idx1, idx2]]
        sf_ge = sf_ge[sp.r_[idx1, idx2]]
        #sf = sp.r_[sf, sf]

        ### test each event type individually
        for event_type in CFG['event_types']:
            if CFG['verbose']:
                print 'Testing %s events' % event_type

            CFG['fname_events'] = os.path.join(CFG['out_dirname'], 'merge_graphs_%s_C%i.counts.hdf5' % (event_type, CFG['confidence_level']))

            ### quantify events
            (cov, gene_idx, event_idx, event_ids, event_strains) = quantify.quantify_from_counted_events(CFG['fname_events'], sp.r_[idx1, idx2], event_type, CFG)

            ### estimate size factors
            sf_ev = get_size_factors(sp.vstack(cov), CFG)
            sf = sp.r_[sf_ev, sf_ge]

            assert sp.all(gene_strains == event_strains)

            ### map gene expression to event order
            curr_gene_counts = gene_counts[gene_idx, :]

            ### filter for min expression
            if event_type == 'intron_retention':
                k_idx = sp.where((sp.mean(cov[0] == 0, axis=1) < CFG['max_0_frac']) | \
                                 (sp.mean(cov[1] == 0, axis=1) < CFG['max_0_frac']))[0]
            else:
                k_idx = sp.where(((sp.mean(cov[0] == 0, axis=1) < CFG['max_0_frac']) | \
                                  (sp.mean(cov[1] == 0, axis=1) < CFG['max_0_frac'])) & \
                                 (sp.mean(sp.c_[cov[0][:, :idx1.shape[0]], cov[1][:, :idx1.shape[0]]] == 0, axis=1) < CFG['max_0_frac']) & \
                                 (sp.mean(sp.c_[cov[0][:, idx2.shape[0]:], cov[1][:, idx2.shape[0]:]] == 0, axis=1) < CFG['max_0_frac']))[0]
            if CFG['verbose']:
                print 'Exclude %i of %i %s events (%.2f percent) from testing due to low coverage' % (cov[0].shape[0] - k_idx.shape[0], cov[0].shape[0], event_type, (1 - float(k_idx.shape[0]) / cov[0].shape[0]) * 100)
            if k_idx.shape[0] == 0:
                print 'All events of type %s were filtered out due to low coverage. Please try re-running with less stringent filter criteria' % event_type
                continue
            # k_idx = sp.where((sp.mean(sp.c_[cov[0], cov[1]], axis=1) > 2))[0]
            # k_idx = sp.where((sp.mean(cov[0], axis=1) > 2) & (sp.mean(cov[1], axis=1) > 2))[0]
            cov[0] = cov[0][k_idx, :]
            cov[1] = cov[1][k_idx, :]
            curr_gene_counts = curr_gene_counts[k_idx, :]
            event_idx = event_idx[k_idx]
            gene_idx = gene_idx[k_idx]
            event_ids = [x[k_idx] for x in event_ids]

            cov[0] = sp.around(sp.hstack([cov[0], curr_gene_counts]))
            cov[1] = sp.around(sp.hstack([cov[1], curr_gene_counts]))
            cov = sp.vstack(cov)
            event_ids = sp.hstack(event_ids)

            tidx = sp.arange(idx1.shape[0])

            #if CFG['debug']:
            #    for i in range(cov.shape[0]):
            #        fig = plt.figure(figsize=(8, 6), dpi=100)
            #        ax = fig.add_subplot(111)
            #        ax.hist(cov[i, :] * sf, 50, histtype='bar', rwidth=0.8)
            #        #ax.plot(sp.arange(cov.shape[1]), sorted(cov[i, :]), 'bo')
            #        ax.set_title('Count Distribution - Sample %i' % i)
            #        plt.savefig('count_dist.%i.pdf' % i, format='pdf', bbox_inches='tight')
            #        plt.close(fig)

            ### build design matrix for testing
            dmatrix1 = sp.zeros((cov.shape[1], 4), dtype='bool')
            dmatrix1[:, 0] = 1                                        # intercept
            dmatrix1[tidx, 1] = 1                                     # delta a
            dmatrix1[tidx, 2] = 1                                     # delta g
            dmatrix1[tidx + (idx1.shape[0] + idx2.shape[0]), 2] = 1   # delta g
            dmatrix1[(idx1.shape[0] + idx2.shape[0]):, 3] = 1         # is g
            dmatrix0 = dmatrix1[:, [0, 2, 3]]

            ### make event splice forms unique to prevent unnecessary tests
            event_ids, u_idx, r_idx = sp.unique(event_ids, return_index=True, return_inverse=True)
            if CFG['verbose']:
                print 'Consider %i unique event splice forms for testing' % u_idx.shape[0]

            ### run testing
            #pvals = run_testing(cov[u_idx, :], dmatrix0, dmatrix1, sf, CFG, r_idx)
            pvals = run_testing(cov, dmatrix0, dmatrix1, sf, CFG)
            pvals_adj = adj_pval(pvals, CFG)

            ### write output
            out_fname = os.path.join(outdir, 'test_results_C%i_%s.tsv' % (options.confidence, event_type))
            if CFG['verbose']:
                print 'Writing test results to %s' % out_fname
            s_idx = sp.argsort(pvals_adj)
            header = sp.array(['event_id', 'gene', 'p_val', 'p_val_adj'])
            event_ids = sp.array(['%s_%i' % (event_type, i + 1) for i in event_idx], dtype='str')
            if CFG['is_matlab']:
                data_out = sp.c_[event_ids[s_idx], gene_ids[gene_idx[s_idx], 0], pvals[s_idx].astype('str'), pvals_adj[s_idx].astype('str')]
            else:
                data_out = sp.c_[event_ids[s_idx], gene_ids[gene_idx[s_idx]], pvals[s_idx].astype('str'), pvals_adj[s_idx].astype('str')]
            data_out = sp.r_[header[sp.newaxis, :], data_out]
            sp.savetxt(out_fname, data_out, delimiter='\t', fmt='%s')
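
### ----------------------------------------------------------------------
### Illustrative sketch, not part of the pipeline: run_testing and adj_pval
### are imported from elsewhere and are not shown in this file. The two
### helpers below only sketch the underlying idea -- a likelihood-ratio test
### between the nested design matrices dmatrix0 (null) and dmatrix1
### (alternative) under a negative binomial GLM, followed by
### Benjamini-Hochberg adjustment. The helper names, the use of statsmodels,
### and the fixed dispersion alpha are assumptions made for illustration.
### ----------------------------------------------------------------------
def _lr_test_sketch(counts, dmatrix0, dmatrix1, sf, alpha=0.1):
    """Return one chi2-based LR-test p-value per row of counts (events x samples)."""
    import statsmodels.api as sm
    from scipy.stats import chi2

    offset = sp.log(sf)  # library size enters the GLM as an offset
    fam = sm.families.NegativeBinomial(alpha=alpha)
    df = dmatrix1.shape[1] - dmatrix0.shape[1]
    pvals = sp.ones((counts.shape[0], ), dtype='float')
    for i in range(counts.shape[0]):
        fit0 = sm.GLM(counts[i, :], dmatrix0, family=fam, offset=offset).fit()
        fit1 = sm.GLM(counts[i, :], dmatrix1, family=fam, offset=offset).fit()
        lr = 2 * (fit1.llf - fit0.llf)  # likelihood-ratio statistic
        pvals[i] = chi2.sf(lr, df)
    return pvals


def _bh_adjust_sketch(pvals):
    """Benjamini-Hochberg adjustment, comparable in spirit to adj_pval."""
    m = pvals.shape[0]
    order = sp.argsort(pvals)
    adj = pvals[order] * m / (sp.arange(m) + 1.0)
    ### enforce monotonicity from the largest p-value downwards
    for i in range(m - 2, -1, -1):
        adj[i] = min(adj[i], adj[i + 1])
    out = sp.zeros((m, ), dtype='float')
    out[order] = sp.minimum(adj, 1.0)
    return out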

def spladder():
    ### get command line options
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options)

    ### add dependencies provided in config section
    #if 'paths' in CFG:
    #    for i in CFG['paths']:
    #        eval('import %s' % CFG['paths'][i])

    ### load confidence level settings
    if not CFG['no_reset_conf']:
        CFG = settings.set_confidence_level(CFG)

    ### do not compute components of merged set, if result file already exists
    fn_out_merge = get_filename('fn_out_merge', CFG)
    fn_out_merge_val = get_filename('fn_out_merge_val', CFG)

    if not 'spladder_infile' in CFG and not os.path.exists(fn_out_merge):
        ### iterate over files, if merge strategy is single
        if CFG['merge_strategy'] in ['single', 'merge_graphs']:
            idxs = range(len(CFG['samples']))
        else:
            idxs = [0]

        ### set parallelization
        if CFG['rproc']:
            jobinfo = []

        ### create out-directory
        if not os.path.exists(CFG['out_dirname']):
            os.makedirs(CFG['out_dirname'])

        ### create spladder sub-directory
        if not os.path.exists(os.path.join(CFG['out_dirname'], 'spladder')):
            os.makedirs(os.path.join(CFG['out_dirname'], 'spladder'))

        ### pre-process annotation, if necessary
        if CFG['anno_fname'].split('.')[-1] != 'pickle':
            if not os.path.exists(CFG['anno_fname'] + '.pickle'):
                if CFG['anno_fname'].split('.')[-1].lower() in ['gff', 'gff3']:
                    (genes, CFG) = init.init_genes_gff3(CFG['anno_fname'], CFG, CFG['anno_fname'] + '.pickle')
                elif CFG['anno_fname'].split('.')[-1].lower() in ['gtf']:
                    (genes, CFG) = init.init_genes_gtf(CFG['anno_fname'], CFG, CFG['anno_fname'] + '.pickle')
                else:
                    print >> sys.stderr, 'ERROR: Unknown annotation format. File needs to end in gtf or gff/gff3\nCurrent file: %s' % CFG['anno_fname']
                    sys.exit(1)
            CFG['anno_fname'] += '.pickle'

        ### add annotation contigs into lookup table
        if not 'genes' in CFG:
            genes = cPickle.load(open(CFG['anno_fname'], 'r'))
        else:
            genes = CFG['genes']
        CFG = init.append_chrms(sp.unique(sp.array([x.chr for x in genes], dtype='str')), CFG)
        del genes

        ### convert input BAMs to sparse arrays - filtered case
        if CFG['bam_to_sparse']:
            for bfn in CFG['bam_fnames']:
                if bfn.endswith('bam') and not os.path.exists(re.sub(r'.bam$', '', bfn) + '.filt.hdf5'):
                    #cnts = dict()

                    if not 'chrm_lookup' in CFG:
                        IN = pysam.Samfile(bfn, 'rb')
                        CFG = append_chrms([x['SN'] for x in parse_header(IN.text)['SQ']], CFG)
                        IN.close()

                    OUT = h5py.File(re.sub(r'.bam$', '', bfn) + '.filt.hdf5', 'w')
                    if CFG['parallel'] > 1:
                        import multiprocessing as mp
                        pool = mp.Pool(processes=CFG['parallel'])
                        result = [pool.apply_async(summarize_chr, args=(bfn, str(chrm), CFG,), kwds={'filter': CFG['read_filter']}) for chrm in sorted(CFG['chrm_lookup'])]
                        while result:
                            tmp = result.pop(0).get()
                            OUT.create_dataset(name=(tmp[0] + '_reads_row'), data=tmp[1].row.astype('uint8'), compression='gzip')
                            OUT.create_dataset(name=(tmp[0] + '_reads_col'), data=tmp[1].col, compression='gzip')
                            OUT.create_dataset(name=(tmp[0] + '_reads_dat'), data=tmp[1].data, compression='gzip')
                            OUT.create_dataset(name=(tmp[0] + '_reads_shp'), data=tmp[1].shape)
                            OUT.create_dataset(name=(tmp[0] + '_introns_m'), data=tmp[2], compression='gzip')
                            OUT.create_dataset(name=(tmp[0] + '_introns_p'), data=tmp[3], compression='gzip')
                            del tmp
                    else:
                        for chrm in CFG['chrm_lookup']:
                            tmp = summarize_chr(bfn, str(chrm), CFG, filter=CFG['read_filter'])
                            OUT.create_dataset(name=(chrm + '_reads_row'), data=tmp[1].row.astype('uint8'), compression='gzip')
                            OUT.create_dataset(name=(chrm + '_reads_col'), data=tmp[1].col, compression='gzip')
                            OUT.create_dataset(name=(chrm + '_reads_dat'), data=tmp[1].data, compression='gzip')
                            OUT.create_dataset(name=(chrm + '_reads_shp'), data=tmp[1].shape)
                            OUT.create_dataset(name=(chrm + '_introns_m'), data=tmp[2], compression='gzip')
                            OUT.create_dataset(name=(chrm + '_introns_p'), data=tmp[3], compression='gzip')
                    OUT.close()
                elif CFG['verbose']:
                    print >> sys.stdout, 'Filtered sparse BAM representation for %s already exists.' % bfn

        ### build individual graphs
        for idx in idxs:
            CFG_ = dict()
            if CFG['merge_strategy'] != 'merge_bams':
                CFG_['bam_fnames'] = CFG['bam_fnames']
                CFG_['samples'] = CFG['samples']
                CFG['bam_fnames'] = CFG['bam_fnames'][idx]
                CFG['samples'] = CFG['samples'][idx]
                CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['samples'])
            else:
                CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'])

            ### assemble out filename to check if we are already done
            fn_out = CFG['out_fname']
            if CFG['do_prune']:
                fn_out = re.sub('.pickle$', '_pruned.pickle', fn_out)
            if CFG['do_gen_isoforms']:
                fn_out = re.sub('.pickle$', '_with_isoforms.pickle', fn_out)

            if os.path.exists(fn_out):
                print >> sys.stdout, '%s - All result files already exist.' % fn_out
            else:
                if CFG['rproc']:
                    jobinfo.append(rp.rproc('spladder_core', CFG, 15000, CFG['options_rproc'], 60 * 60))
                else:
                    spladder_core(CFG)

            for key in CFG_:
                try:
                    CFG[key] = CFG_[key].copy()
                except AttributeError:
                    CFG[key] = CFG_[key]

        ### collect results after parallelization
        if CFG['rproc']:
            rp.rproc_wait(jobinfo, 30, 1.0, -1)

        ### merge parts if necessary
        if CFG['merge_strategy'] == 'merge_graphs':
            run_merge(CFG)

    if not 'spladder_infile' in CFG and CFG['merge_strategy'] == 'merge_graphs' and CFG['validate_splicegraphs'] and not os.path.exists(fn_out_merge_val):
        (genes, inserted) = cPickle.load(open(fn_out_merge, 'r'))
        genes = filter_by_edgecount(genes, CFG)
        cPickle.dump((genes, inserted), open(fn_out_merge_val, 'w'), -1)
        del genes

    ### convert input BAMs to sparse arrays - unfiltered case
    if CFG['bam_to_sparse']:
        for bfn in CFG['bam_fnames']:
            if bfn.endswith('bam') and not os.path.exists(re.sub(r'.bam$', '', bfn) + '.hdf5'):
                #cnts = dict()

                if not 'chrm_lookup' in CFG:
                    IN = pysam.Samfile(bfn, 'rb')
                    CFG = append_chrms([x['SN'] for x in parse_header(IN.text)['SQ']], CFG)
                    IN.close()

                OUT = h5py.File(re.sub(r'.bam$', '', bfn) + '.hdf5', 'w')
                if CFG['parallel'] > 1:
                    import multiprocessing as mp
                    pool = mp.Pool(processes=CFG['parallel'])
                    result = [pool.apply_async(summarize_chr, args=(bfn, str(chrm), CFG,)) for chrm in sorted(CFG['chrm_lookup'])]
                    while result:
                        tmp = result.pop(0).get()
                        OUT.create_dataset(name=(tmp[0] + '_reads_row'), data=tmp[1].row.astype('uint8'), compression='gzip')
                        OUT.create_dataset(name=(tmp[0] + '_reads_col'), data=tmp[1].col, compression='gzip')
                        OUT.create_dataset(name=(tmp[0] + '_reads_dat'), data=tmp[1].data, compression='gzip')
                        OUT.create_dataset(name=(tmp[0] + '_reads_shp'), data=tmp[1].shape)
                        OUT.create_dataset(name=(tmp[0] + '_introns_m'), data=tmp[2], compression='gzip')
                        OUT.create_dataset(name=(tmp[0] + '_introns_p'), data=tmp[3], compression='gzip')
                else:
                    for chrm in CFG['chrm_lookup']:
                        tmp = summarize_chr(bfn, str(chrm), CFG)
                        OUT.create_dataset(name=(chrm + '_reads_row'), data=tmp[1].row.astype('uint8'), compression='gzip')
                        OUT.create_dataset(name=(chrm + '_reads_col'), data=tmp[1].col, compression='gzip')
                        OUT.create_dataset(name=(chrm + '_reads_dat'), data=tmp[1].data, compression='gzip')
                        OUT.create_dataset(name=(chrm + '_reads_shp'), data=tmp[1].shape)
                        OUT.create_dataset(name=(chrm + '_introns_m'), data=tmp[2], compression='gzip')
                        OUT.create_dataset(name=(chrm + '_introns_p'), data=tmp[3], compression='gzip')
                OUT.close()
            elif CFG['verbose']:
                print >> sys.stdout, 'Sparse BAM representation for %s already exists.' % bfn

    if CFG['merge_strategy'] == 'single':
        idxs = range(len(CFG['samples']))
    else:
        idxs = [0]

    for idx in idxs:
        ### get count output file
        if CFG['merge_strategy'] == 'single':
            fn_in_count = get_filename('fn_count_in', CFG, sample_idx=idx)
            fn_out_count = get_filename('fn_count_out', CFG, sample_idx=idx)
        else:
            fn_in_count = get_filename('fn_count_in', CFG)
            fn_out_count = get_filename('fn_count_out', CFG)

        ### count segment graph
        if CFG['run_as_analysis'] or CFG['count_segment_graph']:
            if not os.path.exists(fn_out_count):
                if CFG['merge_strategy'] == 'single':
                    count_graph_coverage_wrapper(fn_in_count, fn_out_count, CFG, sample_idx=idx)
                else:
                    count_graph_coverage_wrapper(fn_in_count, fn_out_count, CFG)

        ### count intron coverage phenotype
        if CFG['count_intron_cov']:
            fn_out_intron_count = fn_out_count.replace('pickle', 'introns.pickle')
            count_intron_coverage_wrapper(fn_in_count, fn_out_intron_count, CFG)

    ### handle alternative splicing part
    if CFG['run_as_analysis']:
        collect_events(CFG)

        for e_idx in range(len(CFG['event_types'])):
            analyze_events(CFG, CFG['event_types'][e_idx])
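
### ----------------------------------------------------------------------
### Illustrative sketch, not part of the pipeline: the loops above store the
### sparse BAM representation as per-chromosome datasets named
### <chrm>_reads_row / _reads_col / _reads_dat / _reads_shp plus two intron
### lists. The helper below (hypothetical name) shows how one chromosome can
### be read back into a scipy.sparse.coo_matrix.
### ----------------------------------------------------------------------
def _load_sparse_bam_sketch(hdf5_fname, chrm):
    """Return (coverage matrix, minus-strand introns, plus-strand introns)."""
    import scipy.sparse
    IN = h5py.File(hdf5_fname, 'r')
    cov = scipy.sparse.coo_matrix((IN[chrm + '_reads_dat'][:],
                                   (IN[chrm + '_reads_row'][:],
                                    IN[chrm + '_reads_col'][:])),
                                  shape=tuple(IN[chrm + '_reads_shp'][:]))
    introns_m = IN[chrm + '_introns_m'][:]
    introns_p = IN[chrm + '_introns_p'][:]
    IN.close()
    return (cov, introns_m, introns_p)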

def spladder():
    ### get command line options
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options)

    ### add dependencies provided in config section
    #if 'paths' in CFG:
    #    for i in CFG['paths']:
    #        eval('import %s' % CFG['paths'][i])

    ### load confidence level settings
    if not CFG['no_reset_conf']:
        CFG = settings.set_confidence_level(CFG)

    ### do not compute components of merged set, if result file already exists
    fn_out_merge = get_filename('fn_out_merge', CFG)
    fn_out_merge_val = get_filename('fn_out_merge_val', CFG)

    if not 'spladder_infile' in CFG and not os.path.exists(fn_out_merge):
        ### iterate over files, if merge strategy is single
        if CFG['merge_strategy'] in ['single', 'merge_graphs']:
            idxs = range(len(CFG['samples']))
        else:
            idxs = [0]

        ### set parallelization
        if CFG['rproc']:
            jobinfo = []

        ### create out-directory
        if not os.path.exists(CFG['out_dirname']):
            os.makedirs(CFG['out_dirname'])

        ### create spladder sub-directory
        if not os.path.exists(os.path.join(CFG['out_dirname'], 'spladder')):
            os.makedirs(os.path.join(CFG['out_dirname'], 'spladder'))

        ### pre-process annotation, if necessary
        if CFG['anno_fname'].split('.')[-1] != 'pickle':
            if not os.path.exists(CFG['anno_fname'] + '.pickle'):
                if CFG['anno_fname'].split('.')[-1] in ['gff', 'gff3']:
                    (genes, CFG) = init.init_genes_gff3(CFG['anno_fname'], CFG, CFG['anno_fname'] + '.pickle')
                elif CFG['anno_fname'].split('.')[-1] in ['gtf']:
                    (genes, CFG) = init.init_genes_gtf(CFG['anno_fname'], CFG, CFG['anno_fname'] + '.pickle')
                else:
                    print >> sys.stderr, 'ERROR: Unknown annotation format. File needs to end in gtf or gff/gff3\nCurrent file: %s' % CFG['anno_fname']
                    sys.exit(1)
            CFG['anno_fname'] += '.pickle'

        ### add annotation contigs into lookup table
        if not 'genes' in CFG:
            genes = cPickle.load(open(CFG['anno_fname'], 'r'))
        else:
            genes = CFG['genes']
        CFG = init.append_chrms(sp.unique(sp.array([x.chr for x in genes], dtype='str')), CFG)
        del genes

        for idx in idxs:
            CFG_ = dict()
            if CFG['merge_strategy'] != 'merge_bams':
                CFG_['bam_fnames'] = CFG['bam_fnames']
                CFG_['samples'] = CFG['samples']
                CFG['bam_fnames'] = CFG['bam_fnames'][idx]
                CFG['samples'] = CFG['samples'][idx]
                CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['samples'])
            else:
                CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'])

            ### assemble out filename to check if we are already done
            fn_out = CFG['out_fname']
            if CFG['do_prune']:
                fn_out = re.sub('.pickle$', '_pruned.pickle', fn_out)
            if CFG['do_gen_isoforms']:
                fn_out = re.sub('.pickle$', '_with_isoforms.pickle', fn_out)

            if os.path.exists(fn_out):
                print >> sys.stdout, '%s - All result files already exist.' % fn_out
            else:
                if CFG['rproc']:
                    jobinfo.append(rp.rproc('spladder_core', CFG, 15000, CFG['options_rproc'], 60 * 60))
                else:
                    spladder_core(CFG)

            for key in CFG_:
                try:
                    CFG[key] = CFG_[key].copy()
                except AttributeError:
                    CFG[key] = CFG_[key]

        ### collect results after parallelization
        if CFG['rproc']:
            rp.rproc_wait(jobinfo, 30, 1.0, -1)

        ### merge parts if necessary
        if CFG['merge_strategy'] == 'merge_graphs':
            run_merge(CFG)

    if not 'spladder_infile' in CFG and CFG['validate_splicegraphs'] and not os.path.exists(fn_out_merge_val):
        (genes, inserted) = cPickle.load(open(fn_out_merge, 'r'))
        genes = filter_by_edgecount(genes, CFG)
        cPickle.dump((genes, inserted), open(fn_out_merge_val, 'w'), -1)
        del genes

    ### get count output file
    fn_in_count = get_filename('fn_count_in', CFG)
    fn_out_count = get_filename('fn_count_out', CFG)

    ### convert input BAMs to sparse arrays
    if CFG['bam_to_sparse']:
        for bfn in CFG['bam_fnames']:
            if bfn.endswith('bam') and not os.path.exists(re.sub(r'.bam$', '', bfn) + '.npz'):
                cnts = dict()

                if not 'chrm_lookup' in CFG:
                    IN = pysam.Samfile(bfn, 'rb')
                    CFG = append_chrms([x['SN'] for x in parse_header(IN.text)['SQ']], CFG)
                    IN.close()

                if CFG['parallel'] > 1:
                    import multiprocessing as mp
                    pool = mp.Pool(processes=CFG['parallel'])
                    result = [pool.apply_async(summarize_chr, args=(bfn, str(chrm), CFG,)) for chrm in sorted(CFG['chrm_lookup'])]
                    while result:
                        tmp = result.pop(0).get()
                        cnts[tmp[0] + '_reads_row'] = tmp[1].row.astype('uint8')
                        cnts[tmp[0] + '_reads_col'] = tmp[1].col
                        cnts[tmp[0] + '_reads_dat'] = tmp[1].data
                        cnts[tmp[0] + '_reads_shp'] = tmp[1].shape
                        cnts[tmp[0] + '_introns_m'] = tmp[2]
                        cnts[tmp[0] + '_introns_p'] = tmp[3]
                else:
                    for chrm in CFG['chrm_lookup']:
                        tmp = summarize_chr(bfn, str(chrm), CFG)
                        cnts[chrm + '_reads_row'] = tmp[1].row.astype('uint8')
                        cnts[chrm + '_reads_col'] = tmp[1].col
                        cnts[chrm + '_reads_dat'] = tmp[1].data
                        cnts[chrm + '_reads_shp'] = tmp[1].shape
                        cnts[chrm + '_introns_m'] = tmp[2]
                        cnts[chrm + '_introns_p'] = tmp[3]
                sp.savez_compressed(re.sub(r'.bam$', '', bfn), **cnts)
            elif CFG['verbose']:
                print >> sys.stdout, 'Sparse BAM representation for %s already exists.' % bfn

    ### count segment graph
    if CFG['run_as_analysis'] or CFG['count_segment_graph']:
        if not os.path.exists(fn_out_count):
            count_graph_coverage_wrapper(fn_in_count, fn_out_count, CFG)

    ### count intron coverage phenotype
    if CFG['count_intron_cov']:
        fn_out_intron_count = fn_out_count.replace('mat', 'introns.pickle')
        count_intron_coverage_wrapper(fn_in_count, fn_out_intron_count, CFG)

    ### handle alternative splicing part
    if CFG['run_as_analysis']:
        collect_events(CFG)

        for idx in range(len(CFG['event_types'])):
            analyze_events(CFG, CFG['event_types'][idx])
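
### ----------------------------------------------------------------------
### Illustrative sketch, not part of the pipeline: this older variant stores
### the sparse BAM representation with sp.savez_compressed instead of HDF5.
### The helper below (hypothetical name) shows how the same per-chromosome
### keys can be read back from the resulting .npz archive.
### ----------------------------------------------------------------------
def _load_sparse_bam_npz_sketch(npz_fname, chrm):
    """Return (coverage matrix, minus-strand introns, plus-strand introns)."""
    import numpy
    import scipy.sparse
    cnts = numpy.load(npz_fname)
    cov = scipy.sparse.coo_matrix((cnts[chrm + '_reads_dat'],
                                   (cnts[chrm + '_reads_row'],
                                    cnts[chrm + '_reads_col'])),
                                  shape=tuple(cnts[chrm + '_reads_shp']))
    return (cov, cnts[chrm + '_introns_m'], cnts[chrm + '_introns_p'])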

def spladder_viz():
    """Main visualization code"""

    ### parse command line parameters
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options, identity='viz')

    ### create plot directory if it does not exist yet
    if options.testdir != '-':
        dirname = options.testdir
    else:
        dirname = CFG['out_dirname']
    if not os.path.exists(os.path.join(dirname, 'plots')):
        os.mkdir(os.path.join(dirname, 'plots'))

    if options.format == 'd3':
        try:
            import mpld3
            from mpld3 import plugins
        except ImportError:
            sys.stderr.write("ERROR: missing package for output format d3. Package mpld3 required")
            sys.exit(1)

    ### load gene information
    gene_names = get_gene_names(CFG)

    rows = get_plot_len(CFG)
    gs = gridspec.GridSpec(rows, 1)

    ### set color maps
    cmap_cov = plt.get_cmap('jet')
    cmap_edg = plt.get_cmap('jet')

    ### plot log scale?
    log_tag = ''
    if options.log:
        log_tag = '.log'
    event_tag = ''

    ### did we get any labels?
    if CFG['plot_labels']:
        CFG['plot_labels'] = CFG['plot_labels'].strip(',').split(',')
        assert len(CFG['plot_labels']) == len(CFG['bam_fnames']), "The number of given labels (%i) needs to match the number of given bam file groups (%i)" % (len(CFG['plot_labels']), len(CFG['bam_fnames']))

    ### the user chose a specific gene for plotting
    ### create pairs of gene ids and an event_id (the latter is None by default)
    if options.gene_name is not None:
        #gid = sp.where(sp.array([x.split('.')[0] for x in gene_names]) == options.gene_name.split('.')[0])[0]
        gid = sp.where(sp.array(gene_names) == options.gene_name)[0]
        if gid.shape[0] == 0:
            sys.stderr.write('ERROR: provided gene ID %s could not be found, please check for correctness\n' % options.gene_name)
            sys.exit(1)
        gids = [[gid[0], options.event_id]]
    ### the plotting happens on the results of spladder test
    ### the user chooses to plot the top k significant events
    ### this requires the event type to be specified
    elif options.test_result > 0:
        gene_names = []
        for event_type in CFG['event_types']:
            ### the testing script should generate a setup file for the test
            ### SETUP is structured as follows:
            ### [gene_strains, event_strains, dmatrix0, dmatrix1, event_type, options, CFG]
            labels = options.test_labels.split(':')
            options.labels = labels
            if options.testdir != '-':
                testdir = dirname
            else:
                testdir = os.path.join(dirname, 'testing_%s_vs_%s' % (labels[0], labels[1]))
            SETUP = cPickle.load(open(os.path.join(testdir, 'test_setup_C%i_%s.pickle' % (CFG['confidence_level'], event_type)), 'r'))

            ### get strains to plot
            idx1 = sp.where(sp.in1d(SETUP[0], SETUP[6]['conditionA']))[0]
            idx2 = sp.where(sp.in1d(SETUP[0], SETUP[6]['conditionB']))[0]

            ### load test results
            for l, line in enumerate(open(os.path.join(testdir, 'test_results_C%i_%s.tsv' % (CFG['confidence_level'], event_type)), 'r')):
                if l == 0:
                    continue
                if l > options.test_result:
                    break
                sl = line.strip().split('\t')
                gene_names.append([sl[1], sl[0]])
        gids = get_gene_ids(CFG, gene_names)
    ### no gene specified but result provided - plot all genes with confirmed events
    ### if an event_id is provided, only the associated gene will be plotted
    else:
        gids = get_gene_ids(CFG)

    ### iterate over genes to plot
    for gid in gids:
        ### gather information about the gene we plot
        gene = load_genes(CFG, idx=[gid[0]])[0]
        if CFG['verbose']:
            print 'plotting information for gene %s' % gene.name
        gene.from_sparse()

        ### event to plot is specified with the gene id list
        if gid[1] is not None:
            event_info = [x[::-1] for x in re.split(r'[._]', gid[1][::-1], maxsplit=1)[::-1]]
            event_info[1] = int(event_info[1]) - 1
            event_info = sp.array(event_info, dtype='str')[sp.newaxis, :]
            event_tag = '.%s' % gid[1]
        ### get all significant events of the current gene
        else:
            event_info = get_conf_events(CFG, gid[0])

        ### go over different plotting options
        axes = []
        ### plot result of testing
        if options.test_result > 0:
            fig = plt.figure(figsize=(9, 5), dpi=200)
            gs = gridspec.GridSpec(2, 1, height_ratios=[4, 1])
            _add_ax(fig, axes, gs)
            _add_ax(fig, axes, gs)
            _plot_event(CFG, event_info, fig, axes[1], gs, None, padding=100)
            start, stop = axes[1].get_xlim()
            plot_bam(options, gene, CFG['bam_fnames'], fig, axes[0], gs, None, cmap_cov, cmap_edg, single=False, sharex=axes[1], start=int(start), stop=int(stop))

        ### plot custom layout
        elif options.user == 'y':
            if options.format == 'd3':
                fig = plt.figure(figsize=(12, 2 * rows), dpi=100)
            else:
                fig = plt.figure(figsize=(18, 3 * rows), dpi=200)

            ### plot splicing graph
            if options.splicegraph == 'y':
                _plot_splicegraph(gene, fig, axes, gs)
                xlim = axes[-1].get_xlim()

            ### plot annotated transcripts
            if CFG['plot_transcripts']:
                sharex = None if len(axes) == 0 else axes[0]
                axes.append(fig.add_subplot(gs[len(axes), 0], sharex=sharex))
                multiple(gene.exons, ax=axes[-1], x_range=xlim)
                axes[-1].set_title('Annotated Transcripts')

            ### plot coverage information for a set of given samples
            if len(CFG['bam_fnames']) > 0:
                plot_bam(options, gene, CFG['bam_fnames'], fig, axes, gs, xlim, cmap_cov, cmap_edg)

                ### plot all the samples in a single plot
                if len(CFG['bam_fnames']) > 1:
                    plot_bam(options, gene, CFG['bam_fnames'], fig, axes, gs, xlim, cmap_cov, cmap_edg, single=False)

            ### plot segment counts
            if len(CFG['bam_fnames']) == 0 or False:  # add option for segment plots
                if options.test_result > 0:
                    _plot_segments(CFG, gid, fig, axes, gs, options, [idx1, idx2])
                else:
                    _plot_segments(CFG, gid, fig, axes, gs, options)

            ### plot structure of a single given event
            _plot_event(CFG, event_info, fig, axes, gs, xlim)

        ### we only need to adapt the zoom for one axis object - as we share the x axis
        zoom_x = [float(x) for x in options.zoom_x.split(',')]
        xlim = axes[0].get_xlim()
        xdiff = xlim[1] - xlim[0]
        axes[0].set_xlim([xlim[0] + (zoom_x[0] * xdiff), xlim[0] + (zoom_x[1] * xdiff)])

        for ax in axes:
            vax.clean_axis(ax)

        plt.tight_layout()
        ### save plot into file
        if options.format == 'd3':
            out_fname = os.path.join(dirname, 'plots', 'gene_overview_C%i_%s%s%s.html' % (options.confidence, gene.name, event_tag, log_tag))
            plugins.clear(fig)
            plugins.connect(fig, plugins.Zoom(enabled=True))
            mpld3.save_html(fig, open(out_fname, 'w'))
        else:
            if options.test_result > 0:
                out_fname = os.path.join(dirname, 'plots', 'gene_overview_C%i_%s%s%s.%s' % (options.confidence, gene.name, event_tag, log_tag, options.format))
            else:
                out_fname = os.path.join(dirname, 'plots', 'gene_overview_C%i_%s%s%s.%s' % (options.confidence, gene.name, event_tag, log_tag, options.format))
            plt.savefig(out_fname, format=options.format, bbox_inches='tight')
        plt.close(fig)
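
### ----------------------------------------------------------------------
### Illustrative sketch, not part of the pipeline: minimal standalone use of
### the mpld3 export path employed above (plugins.clear / plugins.Zoom /
### mpld3.save_html), e.g. to verify that the optional d3 dependency is
### installed and working before running the full visualization. The helper
### name and the toy figure are assumptions made for illustration only.
### ----------------------------------------------------------------------
def _check_mpld3_sketch(out_fname='mpld3_check.html'):
    import matplotlib.pyplot as plt
    import mpld3
    from mpld3 import plugins
    fig = plt.figure(figsize=(4, 3), dpi=100)
    ax = fig.add_subplot(111)
    ax.plot(range(10), range(10), 'bo-')
    plugins.clear(fig)
    plugins.connect(fig, plugins.Zoom(enabled=True))
    mpld3.save_html(fig, open(out_fname, 'w'))
    plt.close(fig)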

def spladder():
    ### get command line options
    options = parse_options(sys.argv)

    ### parse parameters from options object
    CFG = settings.parse_args(options)

    ### add dependencies provided in config section
    #if 'paths' in CFG:
    #    for i in CFG['paths']:
    #        eval('import %s' % CFG['paths'][i])

    ### load confidence level settings
    if not CFG['no_reset_conf']:
        CFG = settings.set_confidence_level(CFG)

    ### do not compute components of merged set, if result file already exists
    fn_out_merge = ''
    prune_tag = ''
    if CFG['do_prune']:
        prune_tag = '_pruned'
    if CFG['merge_strategy'] == 'merge_graphs':
        fn_out_merge = '%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)

    if not 'spladder_infile' in CFG and not os.path.exists(fn_out_merge):
        ### iterate over files, if merge strategy is single
        if CFG['merge_strategy'] in ['single', 'merge_graphs']:
            idxs = range(len(CFG['samples']))
        else:
            idxs = [0]

        ### set parallelization
        if CFG['rproc']:
            jobinfo = []

        ### create out-directory
        if not os.path.exists(CFG['out_dirname']):
            os.makedirs(CFG['out_dirname'])

        ### create spladder sub-directory
        if not os.path.exists(os.path.join(CFG['out_dirname'], 'spladder')):
            os.makedirs(os.path.join(CFG['out_dirname'], 'spladder'))

        ### pre-process annotation, if necessary
        if CFG['anno_fname'].split('.')[-1] != 'pickle':
            if not os.path.exists(CFG['anno_fname'] + '.pickle'):
                if CFG['anno_fname'].split('.')[-1] in ['gff', 'gff3']:
                    (genes, CFG) = init.init_genes_gff3(CFG['anno_fname'], CFG, CFG['anno_fname'] + '.pickle')
                elif CFG['anno_fname'].split('.')[-1] in ['gtf']:
                    (genes, CFG) = init.init_genes_gtf(CFG['anno_fname'], CFG, CFG['anno_fname'] + '.pickle')
                else:
                    print >> sys.stderr, 'ERROR: Unknown annotation format. File needs to end in gtf or gff/gff3\nCurrent file: %s' % CFG['anno_fname']
                    sys.exit(1)
            CFG['anno_fname'] += '.pickle'

        ### add annotation contigs into lookup table
        if not 'genes' in CFG:
            genes = cPickle.load(open(CFG['anno_fname'], 'r'))
        else:
            genes = CFG['genes']
        CFG = init.append_chrms(sp.unique(sp.array([x.chr for x in genes], dtype='str')), CFG)
        del genes

        for idx in idxs:
            CFG_ = dict()
            if CFG['merge_strategy'] != 'merge_bams':
                CFG_['bam_fnames'] = CFG['bam_fnames']
                CFG_['samples'] = CFG['samples']
                CFG['bam_fnames'] = CFG['bam_fnames'][idx]
                CFG['samples'] = CFG['samples'][idx]
                CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['samples'])
            else:
                CFG['out_fname'] = '%s/spladder/genes_graph_conf%i.%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'])

            ### assemble out filename to check if we are already done
            fn_out = CFG['out_fname']
            if CFG['do_prune']:
                fn_out = re.sub('.pickle$', '_pruned.pickle', fn_out)
            if CFG['do_gen_isoforms']:
                fn_out = re.sub('.pickle$', '_with_isoforms.pickle', fn_out)

            if os.path.exists(fn_out):
                print >> sys.stdout, 'All result files already exist.'
            else:
                if CFG['rproc']:
                    jobinfo.append(rp.rproc('spladder_core', CFG, 15000, CFG['options_rproc'], 40 * 60))
                else:
                    spladder_core(CFG)

            for key in CFG_:
                try:
                    CFG[key] = CFG_[key].copy()
                except AttributeError:
                    CFG[key] = CFG_[key]

        ### collect results after parallelization
        if CFG['rproc']:
            rp.rproc_wait(jobinfo, 30, 1.0, -1)

        ### merge parts if necessary
        if CFG['merge_strategy'] == 'merge_graphs':
            run_merge(CFG)

    ### determine count output file
    if not 'spladder_infile' in CFG:
        if CFG['validate_splicegraphs']:
            fn_in_count = '%s/spladder/genes_graph_conf%i.%s%s.validated.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
        else:
            fn_in_count = '%s/spladder/genes_graph_conf%i.%s%s.pickle' % (CFG['out_dirname'], CFG['confidence_level'], CFG['merge_strategy'], prune_tag)
    else:
        fn_in_count = CFG['spladder_infile']
    fn_out_count = fn_in_count.replace('.pickle', '') + '.count.pickle'

    ### count segment graph
    if not os.path.exists(fn_out_count):
        count_graph_coverage_wrapper(fn_in_count, fn_out_count, CFG)

    ### count intron coverage phenotype
    if CFG['count_intron_cov']:
        fn_out_intron_count = fn_out_count.replace('mat', 'introns.pickle')
        count_intron_coverage_wrapper(fn_in_count, fn_out_intron_count, CFG)

    ### handle alternative splicing part
    if CFG['run_as_analysis']:
        collect_events(CFG)

        for idx in range(len(CFG['event_types'])):
            analyze_events(CFG, CFG['event_types'][idx])
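
### ----------------------------------------------------------------------
### Illustrative sketch, not part of the pipeline: in this older variant the
### count input file is derived directly from the confidence level, merge
### strategy, prune tag and validation flag. The helper below (hypothetical
### name) expresses that naming scheme in one place; the format strings are
### taken from the code above.
### ----------------------------------------------------------------------
def _count_infile_sketch(CFG, prune_tag=''):
    """Return the pickle used as count input, mirroring the logic above."""
    if 'spladder_infile' in CFG:
        return CFG['spladder_infile']
    val_tag = '.validated' if CFG['validate_splicegraphs'] else ''
    return '%s/spladder/genes_graph_conf%i.%s%s%s.pickle' % (
        CFG['out_dirname'], CFG['confidence_level'],
        CFG['merge_strategy'], prune_tag, val_tag)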