def _get_vector(self, bp): bed = read_bed(bp) divide_bed(bed, self.bin_size) merge_bed(bed) try: vector = bed2vector(bed, self.chromosome_len) except: print bp raise return vector
def init_dhs_extraction(self): with open(self.dhs_path) as dp: dhs = read_bed(dp) divide_bed(dhs, self.bin_size) merge_bed(dhs) self.dhs_vector = bed2vector(dhs, self.chromosome_len) coverage = len(nonzero(self.dhs_vector)[0]) print coverage,"coverage" ret = empty((coverage, len(self.beds_path)), dtype="int8") current_col = 0 for bed_path in self.beds_path: with open(bed_path) as bp: ret[:,current_col] = self._bed2extracted_vector(bp) current_col += 1 self.sparse = csr_matrix(ret,dtype="int16") print "init DHS extraction finished"
def aggregate_parallel(samples, args, results):
    '''Aggregate GTF input files using a pool of worker processes.

    samples: list of Sample objects
    args: parsed command-line arguments configuring the assembly process
    results: Results object containing input and output filenames
    '''
    logging.info('Aggregating in parallel using %d processes' %
                 (args.num_processes))
    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        # verify the reference fasta opens (indexes it if necessary)
        logging.info('Indexing reference genome fasta file (if necessary)')
        fa = FastaFile(args.ref_genome_fasta_file)
        fa.close()
    # bounded queue feeding the worker pool
    task_queue = JoinableQueue(maxsize=args.num_processes * 2)
    workers = []
    worker_results = []
    for idx in xrange(args.num_processes):
        wdir = os.path.join(results.tmp_dir, 'aggregate_worker%03d' % idx)
        if not os.path.exists(wdir):
            os.makedirs(wdir)
        worker_results.append(Results(wdir))
        proc = Process(target=aggregate_worker,
                       args=(task_queue, args, wdir))
        proc.start()
        workers.append(proc)
    # the reference gtf, if any, is queued first
    if args.ref_gtf_file is not None:
        logging.debug('Reference: %s' % args.ref_gtf_file)
        task_queue.put(Sample(args.ref_gtf_file, Sample.REF_ID))
    # queue every sample, then one sentinel per worker to signal shutdown
    for sample in samples:
        task_queue.put(sample)
    for _ in workers:
        task_queue.put(None)
    task_queue.join()
    task_queue.close()
    for proc in workers:
        proc.join()
    # merge the per-worker outputs into the final result files
    logging.info('Merging aggregated files')
    logging.debug('\tmerging bed files')
    retcode = merge_bed(
        input_files=[w.transfrags_bed_file for w in worker_results],
        output_file=results.transfrags_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')
    logging.debug('\tmerging filtered bed files')
    retcode = merge_bed(
        input_files=[w.transfrags_filtered_bed_file for w in worker_results],
        output_file=results.transfrags_filtered_bed_file,
        num_processes=args.num_processes,
        tmp_dir=results.tmp_dir)
    if retcode != 0:
        raise TacoError('Error running linux merge')
    logging.debug('\tmerging sample stats')

    def sort_key_field0(line):
        # order merged stats rows by their first tab-separated field
        return line.split('\t', 1)[0]

    stats_header = '\t'.join(['sample_id', 'num_transfrags',
                              'filtered_length', 'filtered_expr',
                              'filtered_splice\n'])
    merge_files(input_files=[w.sample_stats_file for w in worker_results],
                output_file=results.sample_stats_file,
                key=sort_key_field0,
                header=stats_header)
    # remove the per-worker scratch directories
    logging.info('Removing temporary files')

    def shutil_error_callback(func, path, excinfo):
        logging.error('Error removing tmp files path=%s message=%s' %
                      (path, excinfo))

    for w in worker_results:
        shutil.rmtree(w.output_dir, onerror=shutil_error_callback)
    logging.info('Aggregate done')
    return 0
def aggregate_parallel(samples, args, results):
    '''Process and aggregate GTF input files in parallel.

    samples: list of Sample objects
    args: from Argparse module; command-line arguments configuring the
        assembly process
    results: Results object containing input and output filenames
    '''
    logging.info('Aggregating in parallel using %d processes' %
                 (args.num_processes))
    if args.filter_splice_juncs and args.ref_genome_fasta_file:
        # test opening FastaFile (builds the index when missing)
        logging.info('Indexing reference genome fasta file (if necessary)')
        fasta_fh = FastaFile(args.ref_genome_fasta_file)
        fasta_fh.close()
    # work queue, bounded to twice the worker count
    input_queue = JoinableQueue(maxsize=args.num_processes * 2)
    procs, worker_results = [], []
    for i in xrange(args.num_processes):
        worker_id = 'aggregate_worker%03d' % i
        worker_dir = os.path.join(results.tmp_dir, worker_id)
        if not os.path.exists(worker_dir):
            os.makedirs(worker_dir)
        worker_results.append(Results(worker_dir))
        p = Process(target=aggregate_worker,
                    args=(input_queue, args, worker_dir))
        p.start()
        procs.append(p)
    # reference gtf goes in ahead of the samples
    if args.ref_gtf_file is not None:
        logging.debug('Reference: %s' % args.ref_gtf_file)
        input_queue.put(Sample(args.ref_gtf_file, Sample.REF_ID))
    for sample in samples:
        input_queue.put(sample)
    # one None sentinel per worker terminates the pool
    for _ in xrange(len(procs)):
        input_queue.put(None)
    input_queue.join()
    input_queue.close()
    for p in procs:
        p.join()
    # stitch the per-worker files back together
    logging.info('Merging aggregated files')
    logging.debug('\tmerging bed files')
    if merge_bed(
            input_files=[r.transfrags_bed_file for r in worker_results],
            output_file=results.transfrags_bed_file,
            num_processes=args.num_processes,
            tmp_dir=results.tmp_dir) != 0:
        raise TacoError('Error running linux merge')
    logging.debug('\tmerging filtered bed files')
    if merge_bed(
            input_files=[r.transfrags_filtered_bed_file
                         for r in worker_results],
            output_file=results.transfrags_filtered_bed_file,
            num_processes=args.num_processes,
            tmp_dir=results.tmp_dir) != 0:
        raise TacoError('Error running linux merge')
    logging.debug('\tmerging sample stats')

    def sort_key_field0(line):
        # key on the first tab-separated field of each stats row
        fields = line.split('\t', 1)
        return fields[0]

    stats_columns = ['sample_id', 'num_transfrags', 'filtered_length',
                     'filtered_expr', 'filtered_splice\n']
    merge_files(input_files=[r.sample_stats_file for r in worker_results],
                output_file=results.sample_stats_file,
                key=sort_key_field0,
                header='\t'.join(stats_columns))
    # cleanup worker data
    logging.info('Removing temporary files')

    def shutil_error_callback(func, path, excinfo):
        logging.error('Error removing tmp files path=%s message=%s' %
                      (path, excinfo))

    for r in worker_results:
        shutil.rmtree(r.output_dir, onerror=shutil_error_callback)
    logging.info('Aggregate done')
    return 0
def testMerge2(self):
    # dividing into 100-bp bins and re-merging must match the
    # pre-computed reference file
    divide_bed(self.bed_obj, 100)
    merge_bed(self.bed_obj)
    with open("./utest/test1_divided_merged.bed") as fh:
        self.bed_dmer = read_bed(fh)
    self.assertEqual(self.bed_obj, self.bed_dmer)
def testMerge(self):
    # merging overlapping intervals must match the reference file
    merge_bed(self.bed_obj)
    with open("./utest/test1_merged.bed") as fh:
        self.bed_mer = read_bed(fh)
    self.assertEqual(self.bed_obj, self.bed_mer)