Пример #1
0
def RunSnv(args):
	"""Run the SNV analysis for one job and merge per-cpu results into indexed bgzip files.

	Steps, in order:
	  1. Build the configuration from ``args`` via ``Parse.generate_snv_cfg`` and print it.
	  2. Read the region file (gzip-compressed when its name ends in ``gz``) and keep
	     only the rows whose ``job`` column equals ``cfg['job']``.
	  3. Open one bgzip writer per entry of ``cfg['model_order']`` and ``cfg['meta_order']``.
	  4. Call ``process_regions`` once per cpu: cpus 1..n-1 through a multiprocessing
	     pool, the last cpu in the main process.
	  5. Print and delete each per-cpu log file.
	  6. Append each per-cpu pickled result to the matching out file, delete the
	     pickle, close the writer and tabix-index the ``.gz`` file.

	Args:
		args: passed unchanged to ``Parse.generate_snv_cfg``.

	Returns:
		0 on success; 1 if an out file could not be opened, ``process_regions``
		reported 1, a log file could not be opened, or indexing failed.
	"""
	cfg = Parse.generate_snv_cfg(args)
	Parse.print_snv_options(cfg)

	if not cfg['debug']:
		logging.disable(logging.CRITICAL)

	compression = 'gzip' if cfg['region_file'].split('.')[-1] == 'gz' else None
	regions_df = pd.read_table(cfg['region_file'], compression=compression)
	regions_df = regions_df[regions_df['job'] == int(cfg['job'])].reset_index(drop=True)
	return_values = {}
	models_out = {}
	bgzfiles = {}
	print('')
	for m in cfg['model_order']:
		print("initializing out file for model " + m if m != '___no_tag___' else "initializing out file")
		# the untagged model writes straight to cfg['out']; tagged models get a '.<tag>' suffix
		models_out[m] = cfg['out'] if m == '___no_tag___' else cfg['out'] + '.' + m
		bgzfiles[m] = _open_bgzf_writer(models_out[m] + '.gz')
		if bgzfiles[m] is None:
			return 1

	for m in cfg['meta_order']:
		print("initializing out file for meta " + m)
		models_out[m] = cfg['out'] + '.' + m
		bgzfiles[m] = _open_bgzf_writer(models_out[m] + '.gz')
		if bgzfiles[m] is None:
			return 1

	if cfg['cpus'] > 1:
		# cpus 1..n-1 run in pool workers; the main process handles cpu n itself
		pool = mp.Pool(cfg['cpus']-1)
		for i in xrange(1,cfg['cpus']):
			return_values[i] = pool.apply_async(process_regions, args=(regions_df,cfg,i,True,))
			print("submitting job on cpu " + str(i) + " of " + str(cfg['cpus']))
		pool.close()
		print("executing job for cpu " + str(cfg['cpus']) + " of " + str(cfg['cpus']) + " via main process")
		main_return = process_regions(regions_df,cfg,cfg['cpus'],True)
		pool.join()
		# collect every worker result (get() re-raises an exception raised in a worker)
		worker_returns = [return_values[i].get() for i in return_values]
		failed = 1 in worker_returns or main_return == 1
	else:
		failed = process_regions(regions_df,cfg,1,True) == 1

	if failed:
		print(Process.Error("error detected, see log files").out)
		return 1

	for i in xrange(1,cfg['cpus']+1):
		log_path = cfg['out'] + '.cpu' + str(i) + '.log'
		try:
			logfile = open(log_path, 'r')
		except (IOError, OSError):
			print(Process.Error("failed to initialize log file " + log_path).out)
			return 1
		with logfile:
			print(logfile.read())
		os.remove(log_path)

	# NOTE(review): when cfg['out'] contains no '/', out_dir is '' and the pickle
	# paths below start with '/' - confirm this matches where process_regions writes.
	out_dir = '/'.join(cfg['out'].split('/')[0:-1])

	for m in cfg['model_order']:
		pkl_paths = [out_dir + '/' + (cfg['out'] + '.cpu' + str(i) + '.' + m).split('/')[-1] + '.pkl' for i in xrange(1,cfg['cpus']+1)]
		# model files get a bare column header line
		tbx_start, tbx_end = _append_cpu_results(bgzfiles[m], pkl_paths, '')
		bgzfiles[m].close()
		print("indexing out file for model " + m if m != '___no_tag___' else "indexing out file")
		if not _tabix_index_out_file(models_out[m] + '.gz', tbx_start, tbx_end):
			return 1

	for m in cfg['meta_order']:
		pkl_paths = [out_dir + '/' + cfg['out'].split('/')[-1] + '.cpu' + str(i) + '.' + m + '.pkl' for i in xrange(1,cfg['cpus']+1)]
		# meta files get their column header line prefixed with '#'
		tbx_start, tbx_end = _append_cpu_results(bgzfiles[m], pkl_paths, '#')
		bgzfiles[m].close()
		print("indexing out file for meta " + m)
		if not _tabix_index_out_file(models_out[m] + '.gz', tbx_start, tbx_end):
			return 1

	print("process complete")
	return 0


def _open_bgzf_writer(gz_path):
	"""Open gz_path for bgzip writing; return the writer, or None after printing the error."""
	try:
		return bgzf.BgzfWriter(gz_path, 'wb')
	except Exception:
		print(Process.Error("failed to initialize bgzip format out file " + gz_path).out)
		return None


def _append_cpu_results(bgz, pkl_paths, header_prefix):
	"""Append each pickled per-cpu result to bgz, deleting every pickle once written.

	The metadata and the header line (header_prefix + tab-joined column names) are
	taken from the first pickle only. Returns the (tbx_start, tbx_end) pair read
	from the last pickle, or (None, None) when pkl_paths is empty.
	"""
	tbx_start = tbx_end = None
	header_written = False
	for pkl_path in pkl_paths:
		# NOTE(review): pickle.load is only safe on trusted data; these files are
		# presumably produced by process_regions in this same run - confirm.
		with open(pkl_path, "rb") as pkl:
			results, metadata, results_header, tbx_start, tbx_end = pickle.load(pkl)
		if not header_written:
			bgz.write(metadata)
			bgz.write(header_prefix + "\t".join(results_header) + '\n')
			header_written = True
		if results.shape[0] > 0:
			results.replace({'None': 'NA'}).to_csv(bgz, index=False, sep='\t', header=False, na_rep='NA', float_format='%.5g', columns = results_header, append=True)
		os.remove(pkl_path)
	return tbx_start, tbx_end


def _tabix_index_out_file(gz_path, tbx_start, tbx_end):
	"""Tabix-index gz_path; return True on success, False after printing the error."""
	try:
		pysam.tabix_index(gz_path,seq_col=0,start_col=tbx_start,end_col=tbx_end,force=True)
	except Exception:
		print(Process.Error('failed to generate index for file ' + gz_path).out)
		return False
	return True