def metalfox_pipe(config_file, sample_pairs, ref_mnt):
    """Queue MetalFox FoxoG-scoring jobs for a list of sample pairs.

    config_file -- config parsed by parse_config(); yields the metalfox tool
        path, swift container, object prefix, mapability reference, max
        concurrent jobs and ram.
    sample_pairs -- tab-separated file; column 0 is the pair/output prefix,
        column 1 is the sample whose rmdup bam is scored.
    ref_mnt -- mount point prepended to the mapability reference path.

    Each queued job sources credentials, unsets proxies, downloads the bam
    (+index) and the mutect .out.keep call file, then runs metalfox.
    """
    (metalfox_tool, cont, obj, map_ref, max_t, ram) = parse_config(config_file)
    map_ref = ref_mnt + '/' + map_ref
    src_cmd = '. ~/.novarc;'
    deproxy = 'unset http_proxy; unset https_proxy;'
    job_list = []
    # FIX: context manager guarantees the pair list is closed even if a
    # swift lookup below raises (the old explicit close() was skipped then).
    with open(sample_pairs, 'r') as pairs:
        for sn in pairs:
            sn = sn.rstrip('\n')
            info = sn.split('\t')
            sys.stderr.write('Getting bam file name for ' + info[1] + '\n')
            get_bam_name = 'swift list ' + cont + ' --prefix ' + obj + '/' + info[1] + '/BAM/' + info[1] \
                           + ' | grep .rmdup.srt.ba* '
            # swift list returns the .bai and .bam object names, one per line
            bam = subprocess.check_output(get_bam_name, shell=True).split('\n')
            dl_bam = 'swift download --skip-identical ' + cont + ' ' + bam[1] + ';swift download --skip-identical ' \
                     + cont + ' ' + bam[0] + ';'
            mut_out = 'ANALYSIS/' + info[0] + '/OUTPUT/' + info[0] + '.out.keep'
            dl_out = 'swift download ' + cont + ' ' + mut_out + ';'
            # .bai/.bam listing order is not always clear; FIX: endswith() is
            # clearer than slicing the last three characters (same behavior)
            if bam[1].endswith('bam'):
                run_metal = metalfox_tool + ' -f1 ' + mut_out + ' -f3 ' + bam[1] + ' -m ' + map_ref + ' > ' \
                            + info[0] + '.foxog_scored_added.out;'
            else:
                run_metal = metalfox_tool + ' -f1 ' + mut_out + ' -f3 ' + bam[0] + ' -m ' + map_ref + ' > ' \
                            + info[0] + '.foxog_scored_added.out;'
            # cleanup is deliberately not appended below; kept so it can be
            # re-enabled easily
            cleanup = 'rm ' + ' '.join((bam[0], bam[1], mut_out)) + ';'
            job_list.append(src_cmd + deproxy + dl_bam + dl_out + run_metal)  # + cleanup)
    sys.stderr.write(date_time() + 'Queueing jobs\n')
    job_manager(job_list, max_t)
def run_fastqc(config=None, run_config=None):
    """Queue one FastQC job per gzipped fastq in the run's TEMP directory.

    config -- dict with 'root_dir' and a 'qc' sub-dict holding 'dir',
        'fastqc', 'java', as read below.
    run_config -- dict with 'run_name'.

    Side effects: chdir into the run directory, create the QC output
    directory if missing, then chdir into TEMP and queue jobs.
    """
    os.chdir(os.path.join(config['root_dir'], run_config['run_name']))
    try:
        os.mkdir(config['qc']['dir'])
    except OSError:
        # directory already existing is an expected, benign condition
        print 'Warning: {} directory already exists, which is ok.'.format(
            config['qc']['dir'])
    os.chdir(os.path.join(config['root_dir'], run_config['run_name'], 'TEMP'))
    jobs = []
    # NOTE(review): assumes fastqs follow the *_sequence.txt.gz naming
    # convention in TEMP -- confirm against the demultiplexing step
    for fastq in glob.glob('*_sequence.txt.gz'):
        cmd = '{fastqc} -j {java} -o {QC} {fastq}'.format(
            fastqc=config['qc']['fastqc'],
            java=config['qc']['java'],
            QC=os.path.join(config['root_dir'], run_config['run_name'],
                            config['qc']['dir']),
            fastq=fastq)
        print 'appending cmd: {}'.format(cmd)
        jobs.append(cmd)
    # one worker per cpu, polling every 20s
    job_manager.job_manager(cmd_list=jobs,
                            threads=multiprocessing.cpu_count(),
                            interval=20)
def oxog_check(config_file, lane_list, ref_mnt):
    """Queue Picard CollectOxoGMetrics jobs for every lane of every sample.

    config_file -- config parsed by parse_config(); yields java/picard
        paths, ordered fasta, intervals file, swift container/object prefix,
        max concurrent jobs and total ram.
    lane_list -- tab-separated file: column 0 is the sample/bnid, column 2
        is a ', '-separated lane list.
    ref_mnt -- mount point prepended to reference file paths.

    Each job downloads the lane bam (+index via the .ba prefix), runs
    CollectOxoGMetrics, then deletes the downloaded bam.
    """
    (java, picard, fa_ordered, intervals, cont, obj, max_t, ram) = parse_config(config_file)
    src_cmd = ". /home/ubuntu/.novarc;"
    # FIX: floor division keeps the per-job heap an integer under Python 3
    # as well (java rejects '-Xmx2.5g'-style values); identical result to
    # the old '/' under Python 2 integer division.
    ram = str(int(ram) // int(max_t))
    job_list = []
    # FIX: context manager guarantees the lane list is closed on error.
    with open(lane_list, 'r') as lane_fh:
        for sample in lane_fh:
            sample = sample.rstrip('\n')
            info = sample.split('\t')
            # lanes are comma-space separated in column 2
            lanes = info[2].split(', ')
            bid = info[0]
            for lane in lanes:
                # the .ba prefix pulls both .bam and .bai objects
                dl_bam = src_cmd + 'swift download ' + cont + ' --prefix ' + obj + '/' + bid + '/BAM/' + bid \
                         + '_' + lane + '.rmdup.srt.ba;'
                bam = obj + '/' + bid + '/BAM/' + bid + '_' + lane + '.rmdup.srt.bam'
                oxoG = java + ' -Xmx' + ram + 'g -jar ' + picard + ' CollectOxoGMetrics I=' + bam + ' O=' + bid \
                       + '_' + lane + '.oxo_summary.txt R=' + ref_mnt + '/' + fa_ordered + ' INTERVALS=' \
                       + ref_mnt + '/' + intervals + ' 2> ' + bid + '_' + lane + '.log;'
                del_bam = 'rm ' + obj + '/' + bid + '/BAM/' + bid + '_' + lane + '.rmdup.srt.bam;'
                job_list.append(dl_bam + oxoG + del_bam)
    job_manager(job_list, max_t)
# NOTE(review): fragment from a larger script -- 'root', 'meta', 'r1', 'r2',
# 'p_mem', 'args', 'bam_list' and 'cmd_list' are bound outside this view;
# presumably this first section runs once per fastq pair -- confirm.
READ_GROUP_NAME = root
sname = meta[0]
LIBRARY_NAME = meta[0]
PLATFORM_UNIT = meta[4]
PLATFORM = 'illumina'
bam = os.path.basename(root) + '_unaligned.bam'
bam_list.append(bam)
log_file = READ_GROUP_NAME + '.convert.log'
# Picard FastqToSam: convert the paired fastqs to an unaligned bam, capture
# stderr/stdout in the log, then delete the input fastqs.
picard_cmd = args['JAVA'] + ' -Djava.io.tmpdir=tmp -Xmx' + str(p_mem) + 'G -jar ' + args['PICARD'] \
    + ' FastqToSam FASTQ=' + r1 + ' FASTQ2=' + r2 + ' OUTPUT=' + bam + ' READ_GROUP_NAME=' \
    + READ_GROUP_NAME + ' SAMPLE_NAME=' + sname + ' LIBRARY_NAME=' + LIBRARY_NAME + ' PLATFORM_UNIT=' \
    + PLATFORM_UNIT + ' PLATFORM=' + PLATFORM
picard_cmd += ' 2> ' + log_file + ' >> ' + log_file + '; rm ' + r1 + ' ' + r2
cmd_list.append(picard_cmd)
sys.stderr.write(date_time() + 'Queueing jobs for conversion\n')
job_manager(cmd_list, args['THREADS'])
# Merge all unaligned bams, name-sorted (-n), with novosort.
# NOTE(review): '_ualigned_merged.bam' looks like a typo for
# '_unaligned_merged.bam' -- confirm downstream expectations before renaming,
# since other steps may glob for the current spelling.
novo_cmd = 'mkdir tmp; ' + args['NOVOSORT'] + ' -c ' + args['THREADS'] + ' -m ' + args['MEMORY'] + 'G -n -t tmp ' \
    + ' '.join(bam_list) + ' > ' + sname + '_ualigned_merged.bam'
sys.stderr.write(date_time() + ' Merging unaligned bams with command ' + novo_cmd + '\n')
check = subprocess.call(novo_cmd, shell=True)
if check != 0:
    sys.stderr.write(date_time() + 'Novosort merge failed!\n')
    exit(1)
else:
    sys.stderr.write(date_time() + 'Merge complete, deleting individual bams\n')
    # per-bam cleanup deliberately disabled
    #rm_bam = 'rm ' + ' '.join(bam_list)
    #subprocess.call(rm_bam, shell=True)
def send_fastqc_to_server(config=None, run_config=None):
    """Rsync the FastQC html reports for this run to the QC web server.

    config -- dict with 'root_dir' and a 'qc' sub-dict holding 'dir',
        'user' and 'server'.
    run_config -- dict with 'run_name'.
    """
    qc_conf = config['qc']
    report_dir = os.path.join(config['root_dir'], run_config['run_name'],
                              qc_conf['dir'])
    os.chdir(report_dir)
    destination = '{}@{}/{}/'.format(qc_conf['user'], qc_conf['server'],
                                     run_config['run_name'])
    rsync_cmd = 'rsync -av --progress --stats *html ' + destination
    # single transfer job, polled every 10s
    job_manager.job_manager([rsync_cmd], threads=1, interval=10)
Options:
-h

Arguments:
<list> fastq list
<th> num threads
"""
import subprocess
import sys
sys.path.append('/home/ubuntu/TOOLS/Scripts/alignment')
sys.path.append('/home/ubuntu/TOOLS/Scripts/utility')
from docopt import docopt
# docopt parses the module docstring above for arguments
args = docopt(__doc__)
fh = open(args['<list>'])
th = args['<th>']
cmd_list = []
# one-time setup deliberately disabled
#dir_mk = 'mkdir converted'
#subprocess.call(dir_mk, shell=True)
# queue one phred-64 -> phred-33 conversion job per fastq listed
for line in fh:
    line = line.rstrip('\n')
    cmd = '/home/ubuntu/TOOLS/Scripts/utility/fastq_64_to_33.py ' + line
    cmd_list.append(cmd)
from job_manager import job_manager
# NOTE(review): th is passed as the string docopt produced -- confirm
# job_manager accepts a string thread count
job_manager(cmd_list, th)
# create sub_files to process: split the gene table into ~th equal chunks
# and queue one 2_calc_mean_variance_bin.py job per chunk.
# 'fn' (input file name) and 'th' (thread/chunk count) come from outer scope.
lc_info = subprocess.check_output('wc -l ' + fn, shell=True)
fh = open(fn, 'r')
lc = lc_info.split()
# FIX: math.ciel does not exist (AttributeError at runtime); the correct
# name is math.ceil. Ceiling keeps the chunk count at most th.
line_split = math.ceil(float(lc[0]) / float(th))
cur = 1
fct = 1
cur_file = fn + str(fct) + 'split'
out_pre = 'Gene_metrics' + str(fct)
out = open(cur_file, 'w')
job_list = []
cmd = '/home/ubuntu/TOOLS/dropseq/2_calc_mean_variance_bin.py '
# first chunk gets an extra trailing '0' flag; later chunks do not
job_list.append(cmd + cur_file + ' ' + out_pre + ' 0')
# header line is consumed but (deliberately, per the commented writes) not
# copied into the chunks
head = next(fh)
# out.write(head)
for line in fh:
    if cur > line_split:
        out.close()
        fct += 1
        cur_file = fn + str(fct) + 'split'
        out = open(cur_file, 'w')
        out_pre = 'Gene_metrics' + str(fct)
        job_list.append(cmd + cur_file + ' ' + out_pre)
        cur = 1
        # out.write(head)
    out.write(line)
    cur += 1
out.close()
# FIX: close the input handle (was leaked)
fh.close()
job_manager(job_list, th)
# create sub_files to process: split the gene table into ~th equal chunks
# and queue one 2_calc_mean_variance_bin.py job per chunk.
# 'fn' (input file name) and 'th' (thread/chunk count) come from outer scope.
lc_info = subprocess.check_output('wc -l ' + fn, shell=True)
fh = open(fn, 'r')
lc = lc_info.split()
# FIX: math.ciel does not exist (AttributeError at runtime); the correct
# name is math.ceil. Ceiling keeps the chunk count at most th.
line_split = math.ceil(float(lc[0]) / float(th))
cur = 1
fct = 1
cur_file = fn + str(fct) + 'split'
out_pre = 'Gene_metrics' + str(fct)
out = open(cur_file, 'w')
job_list = []
cmd = '/home/ubuntu/TOOLS/dropseq/2_calc_mean_variance_bin.py '
# first chunk gets an extra trailing '0' flag; later chunks do not
job_list.append(cmd + cur_file + ' ' + out_pre + ' 0')
# header line is consumed but (deliberately, per the commented writes) not
# copied into the chunks
head = next(fh)
# out.write(head)
for line in fh:
    if cur > line_split:
        out.close()
        fct += 1
        cur_file = fn + str(fct) + 'split'
        out = open(cur_file, 'w')
        out_pre = 'Gene_metrics' + str(fct)
        job_list.append(cmd + cur_file + ' ' + out_pre)
        cur = 1
        # out.write(head)
    out.write(line)
    cur += 1
out.close()
# FIX: close the input handle (was leaked)
fh.close()
job_manager(job_list, th)