def output(self):
    """Return the ``.sam`` target produced for ``self.sampleID``.

    The path is ``output_fmt`` filled with the base output path plus the
    project and sample names that ``pfn`` extracts from the sample ID.
    """
    sn = pfn(self.sampleID, 'sample_name')
    pn = pfn(self.sampleID, 'project_name')
    sam_prefix = output_fmt.format(path=base_outpath, PN=pn, SN=sn)
    return luigi.LocalTarget(sam_prefix + '.sam')
def run(self):
    """Trim the reads of ``self.PE1`` (and ``self.PE2`` if set) with Trimmomatic.

    Paired-end mode is used when ``self.PE2`` is truthy, single-end mode
    otherwise. The result directory is created when missing, the command is
    executed through the shell and then passed to ``record_cmdline``.
    """
    sn = pfn(self.PE1, 'sample_name')
    pn = pfn(self.PE1, 'project_name')

    log_name = '{base}/{PN}_result/trim_result/{SN}_trimed.log'.format(
        base=base_outpath, PN=pn, SN=sn)
    trim_dir = '{base}/{PN}_result/trim_result'.format(
        base=base_outpath, PN=pn)
    if not os.path.isdir(trim_dir):
        os.makedirs(trim_dir)

    # Values shared by both command templates.
    fields = dict(
        input1=self.PE1,
        base_in=base_inpath,
        base_out=os.path.dirname(log_name),
        output=log_name)

    if self.PE2:
        fields['input2'] = self.PE2
        template = (
            "java -jar ~/tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 20 "
            "{base_in}/{input1}.fastq.gz {base_in}/{input2}.fastq.gz "
            "-trimlog {output} "
            "{base_out}/{input1}.clean.fq.gz {base_out}/{input1}.unpaired.fq.gz "
            "{base_out}/{input2}.clean.fq.gz {base_out}/{input2}.unpaired.fq.gz "
            "ILLUMINACLIP:/home/liaoth/tools/Trimmomatic-0.36/adapters/TruSeq3-PE.fa:2:30:10 "
            "LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50")
    else:
        template = (
            "java -jar ~/tools/Trimmomatic-0.36/trimmomatic-0.36.jar SE -threads 20 "
            "{base_in}/{input1}.fastq.gz "
            "-trimlog {output} "
            "{base_out}/{input1}.clean.fq.gz "
            "ILLUMINACLIP:/home/liaoth/tools/Trimmomatic-0.36/adapters/TruSeq3-SE.fa:2:30:10 "
            "LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")

    cmdline = template.format(**fields)
    os.system(cmdline)
    record_cmdline(cmdline)
def run(self):
    """Run GATK 3.6 MuTect2 on a normal/tumor pair.

    ``self.sample_IDs`` is a comma-separated pair of sample IDs whose order
    matches ``self.input()``. The ``mt2_for`` field of the first ID decides
    which input is the normal bam and which the tumor bam; if it is neither
    ``NORMAL_SIG`` nor ``TUMOR_SIG`` both inputs are left as empty strings.
    """
    sampleIDs = self.sample_IDs.split(',')

    bam_target = self.output().path
    out_dir = bam_target.rpartition('/')[0]
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    first_role = pfn(sampleIDs[0], 'mt2_for')
    if first_role == NORMAL_SIG:
        input_normal, input_tumor = (self.input()[i].path for i in (0, 1))
    elif first_role == TUMOR_SIG:
        input_tumor, input_normal = (self.input()[i].path for i in (0, 1))
    else:
        input_normal = input_tumor = ''

    # Shared stem for the .vcf, .bam and .log files.
    prefix = bam_target.rpartition('.bam')[0]
    template = (
        '''java -Xmx10g -jar ~/tools/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar '''
        '''-T MuTect2 --allSitePLs -R {REF} --cosmic {cosmic} --dbsnp {db_snp} '''
        '''--input_file:normal {input_normal} --input_file:tumor {input_tumor} '''
        '''--out {prefix}.vcf --bamOutput {prefix}.bam --log_to_file {prefix}.log''')
    cmdline = template.format(
        REF=REF_file_path,
        cosmic=cos_snp,
        db_snp=db_snp,
        input_tumor=input_tumor,
        input_normal=input_normal,
        prefix=prefix)
    os.system(cmdline)
    record_cmdline(cmdline)
def output(self):
    """Return the ``.mt2.bam`` target of the single-sample MuTect2 run.

    The path comes from ``somatic_single_output_fmt`` filled with the
    project and sample names parsed from ``self.sample_NT``.
    """
    fields = {
        'path': base_outpath,
        'PN': pfn(self.sample_NT, 'project_name'),
        'SN': pfn(self.sample_NT, 'sample_name'),
    }
    bam_path = somatic_single_output_fmt.format(**fields) + '.mt2.bam'
    return luigi.LocalTarget(bam_path)
def output(self):
    """Return the ``.mt2.bam`` target of the paired MuTect2 run.

    Project and pair names are both parsed from the first ID in the
    comma-separated ``self.sample_IDs``.
    """
    first_id = self.sample_IDs.split(',')[0]
    project = pfn(first_id, 'project_name')
    pair = pfn(first_id, 'pair_name')
    stem = somatic_pair_output_fmt.format(
        path=base_outpath, PN=project, PairN=pair)
    return luigi.LocalTarget(stem + '.mt2.bam')
def run(self):
    """Run GATK 3.6 MuTect2 in artifact-detection mode on a single bam.

    The first input bam is passed as the tumor input. When the ``mt2_for``
    field of ``self.sample_NT`` equals ``NORMAL_SIG`` the plain command is
    used; for any other value ``--tumor_lod 4`` is appended.
    """
    tumor_bam = self.input()[0].path
    mt2_id = pfn(self.sample_NT, 'mt2_for')

    bam_target = self.output().path
    prefix = bam_target.rpartition('.bam')[0]
    out_dir = bam_target.rpartition('/')[0]
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    if mt2_id == NORMAL_SIG:
        # Normal-only sample.
        template = (
            '''java -Xmx10g -jar ~/tools/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar '''
            '''-T MuTect2 --allSitePLs --artifact_detection_mode -R {REF} '''
            '''--cosmic {cosmic} --dbsnp {db_snp} --input_file:tumor {input_tumor} '''
            '''--out {prefix}.vcf --bamOutput {prefix}.bam --log_to_file {prefix}.log ''')
    else:
        template = (
            '''java -Xmx10g -jar ~/tools/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar '''
            '''-T MuTect2 --allSitePLs --artifact_detection_mode -R {REF} '''
            '''--cosmic {cosmic} --dbsnp {db_snp} --input_file:tumor {input_tumor} '''
            '''--out {prefix}.vcf --bamOutput {prefix}.bam --log_to_file {prefix}.log '''
            '''--tumor_lod 4 ''')

    cmdline = template.format(
        REF=REF_file_path,
        cosmic=cos_snp,
        db_snp=db_snp,
        input_tumor=tumor_bam,
        prefix=prefix)
    os.system(cmdline)
    record_cmdline(cmdline)
def requires(self):
    """Yield one ``Annovar2`` task per sample ID and per pair name.

    ``self.x`` is a comma-separated list of sample IDs. They are grouped by
    the ``pair_name`` that ``pfn`` reports and the grouping is published in
    the module-level ``pair_bucket``, e.g.
    ``{'XK-2': ['XK-2T_S20', 'XK-2W_S17'], 'XK-8': ['XK-8T_S21', 'XK-8W_S18']}``.

    A group with more than two members is treated as one normal shared by
    several tumors: every tumor gets its own ``[normal, tumor]`` entry keyed
    by its sample name with ``TUMOR_SIG`` removed.

    Raises:
        IndexError: if a group with more than two members holds no sample
            whose ``mt2_for`` equals ``NORMAL_SIG``.
    """
    # Must precede the first assignment: a later ``global`` is a
    # SyntaxWarning on Python 2 and a SyntaxError on Python 3.
    global pair_bucket

    samples_IDs = str(self.x).split(',')
    pair_bucket = defaultdict(list)
    for sample_id in samples_IDs:
        pair_bucket[pfn(sample_id, 'pair_name')].append(sample_id)

    adjust_multiple = []
    # Iterate over a snapshot of the keys so the mapping can be updated.
    for pair_name in list(pair_bucket):
        members = pair_bucket[pair_name]
        if len(members) <= 2:
            continue
        only_normal = [
            member for member in members
            if pfn(member, 'mt2_for') == NORMAL_SIG
        ][0]
        for member in members:
            if pfn(member, 'mt2_for') != TUMOR_SIG:
                continue
            # The original if/elif pair used the stripped sample name as the
            # key in both cases (in the second it equals ``pair_name``).
            tumor_key = pfn(member, 'sample_name').replace(TUMOR_SIG, '')
            adjust_multiple.append((tumor_key, [only_normal, member]))
    pair_bucket.update(dict(adjust_multiple))

    # Pair names are requested as well as the individual samples.
    samples_IDs += list(pair_bucket)

    if debug_:
        import pdb
        pdb.set_trace()

    for i in samples_IDs:
        yield Annovar2(sample_ID=i)
def output(self):
    """Return the ``.mt2.bam`` target of the paired run.

    The pair name is the first key of the module-level ``pair_bucket`` whose
    members, compared as a set, equal the IDs in ``self.sample_IDs``.
    An ``IndexError`` is raised when no entry matches.
    """
    sampleIDs = self.sample_IDs.split(',')
    Project_ID = pfn(sampleIDs[0], 'project_name')

    wanted = set(sampleIDs)
    candidates = [
        name for name, members in pair_bucket.items()
        if set(members) == wanted
    ]
    pair_name = candidates[0]

    stem = somatic_pair_output_fmt.format(
        path=base_outpath, PN=Project_ID, PairN=pair_name)
    return luigi.LocalTarget(stem + '.mt2.bam')
def formatter_output(args):
    """Build a summary of the current pipeline settings for the user to check.

    :param args: one sample name, or several joined by commas.
    :return: a formatted string showing the input/output paths, the
        normal/tumor signatures, the pair file format, an example fastq name
        and the result of parsing the name(s) with ``pfn(..., 'all')``.

    With several names the example fastq is derived from the last one. When
    ``self_adjust_fn`` is set the example is the first ``glob`` match under
    ``base_inpath`` (an ``IndexError`` is raised if nothing matches).
    """
    if ',' in args:
        arg_List = args.split(',')
        parsed_name = [
            pfn(PE1_fmt.format(input=name), 'all') for name in arg_List
        ]
        # The example file below is built from the last name given.
        val = arg_List[-1]
    else:
        val = args
        parsed_name = pfn(PE1_fmt.format(input=val), 'all')

    if self_adjust_fn:
        input_list = glob.glob(base_inpath + '/*' + val + '*')
        if filter_str:
            input_list = [
                found.replace(fq_suffix, '') for found in input_list
                if filter_str not in found
            ]
        fq_file = input_list[0]
    else:
        fq_file = PE1_fmt.format(input=val)

    if debug_:
        import pdb
        pdb.set_trace()

    template = (
        "Current Variants: Please make sure your variants is right.\n\n "
        "Input path: {b_i} output path: {b_o} "
        "Sig represent NORMAL: {sig_n} Sig represent TUMOR: {sig_T} "
        "Pair file format: {pe_fmt} input_fastq_file example:{fq}\n "
        "One of args filename parsed: {parsed_result}")
    return template.format(
        b_i=base_inpath,
        b_o=base_outpath,
        sig_n=NORMAL_SIG,
        sig_T=TUMOR_SIG,
        pe_fmt=PE1_fmt,
        fq=fq_file,
        parsed_result=str(parsed_name))
def requires(self):
    """Yield one ``Annovar2`` task per sample ID and per pair name.

    ``self.x`` is a comma-separated list of sample IDs. They are grouped by
    the ``pair_name`` that ``pfn`` reports and the grouping is published in
    the module-level ``pair_bucket``, e.g.
    ``{'XK-2': ['XK-2T_S20', 'XK-2W_S17'], 'XK-8': ['XK-8T_S21', 'XK-8W_S18']}``.
    """
    # Must precede the first assignment: a later ``global`` is a
    # SyntaxWarning on Python 2 and a SyntaxError on Python 3.
    global pair_bucket

    samples_IDs = str(self.x).split(',')
    pair_bucket = defaultdict(list)
    for sample_id in samples_IDs:
        pair_bucket[pfn(sample_id, 'pair_name')].append(sample_id)

    # Pair names are requested as well as the individual samples.
    samples_IDs += list(pair_bucket)
    for i in samples_IDs:
        yield Annovar2(sample_ID=i)
def run(self):
    """Trim the reads of ``self.PE1`` (and ``self.PE2`` if set) with Trimmomatic.

    Paired-end mode is used when ``self.PE2`` is truthy, single-end mode
    otherwise. Cleaned reads are written into the project's trim directory
    (``trim_fmt``) under names built from ``PE1_fmt`` / ``PE2_fmt`` and the
    parsed sample name. The command is executed through the shell and then
    passed to ``record_cmdline``.

    Changes from the previous version:

    * the PE2 output name is only derived in paired-end mode, so single-end
      runs no longer call ``pfn`` on an empty ``self.PE2``;
    * the single-end command writes ``{output1}.clean.fq.gz``, the name the
      paired branch already uses for read 1, instead of
      ``{input1}.clean.fq.gz``.
    """
    sample_name = pfn(self.PE1, 'sample_name')
    project_name = pfn(self.PE1, 'project_name')

    trim_r_path = trim_fmt.format(base=base_outpath, PN=project_name)
    log_name = os.path.join(trim_r_path, '%s_trimed.log' % sample_name)
    if not os.path.isdir(trim_r_path):
        os.makedirs(trim_r_path)

    # Values shared by both command templates.
    fields = dict(
        trimmomatic_jar=trimmomatic_jar,
        trimmomatic_jar_dir=os.path.dirname(trimmomatic_jar),
        input1=self.PE1,
        base_in=base_inpath,
        base_out=os.path.dirname(log_name),
        output1=PE1_fmt.format(input=sample_name),
        fq_suffix=fq_suffix,
        output=log_name)

    if self.PE2:
        fields['input2'] = self.PE2
        fields['output2'] = PE2_fmt.format(
            input=pfn(self.PE2, 'sample_name'))
        template = (
            "java -jar {trimmomatic_jar} PE -threads 20 "
            "{base_in}/{input1}{fq_suffix} {base_in}/{input2}{fq_suffix} "
            "-trimlog {output} "
            "{base_out}/{output1}.clean.fq.gz {base_out}/{output1}.unpaired.fq.gz "
            "{base_out}/{output2}.clean.fq.gz {base_out}/{output2}.unpaired.fq.gz "
            "ILLUMINACLIP:{trimmomatic_jar_dir}/adapters/TruSeq3-PE.fa:2:30:10 "
            "LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50")
    else:
        template = (
            "java -jar {trimmomatic_jar} SE -threads 20 "
            "{base_in}/{input1}{fq_suffix} "
            "-trimlog {output} "
            "{base_out}/{output1}.clean.fq.gz "
            "ILLUMINACLIP:{trimmomatic_jar_dir}/adapters/TruSeq3-SE.fa:2:30:10 "
            "LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")

    cmdline = template.format(**fields)
    os.system(cmdline)
    record_cmdline(cmdline)
def run(self):
    """Align the trimmed reads of ``self.sampleID`` with ``bwa mem``.

    When ``Pair_data`` is truthy the second read file is derived from the
    input path by replacing ``R1_INDICATOR`` with ``R2_INDICATOR`` and both
    files are aligned; otherwise only the input file is aligned. The shell
    redirects bwa's stdout to ``self.output().path``; the command is then
    passed to ``record_cmdline``.
    """
    sample_name = pfn(self.sampleID, 'sample_name')
    project_name = pfn(self.sampleID, 'project_name')

    reads_1 = self.input().path
    fields = dict(SN=sample_name, REF=REF_file_path, i1=reads_1)
    if Pair_data:
        fields['i2'] = self.input().path.replace(R1_INDICATOR, R2_INDICATOR)
        template = (
            "bwa mem -M -t 20 -k 19 "
            "-R '@RG\\tID:{SN}\\tSM:{SN}\\tPL:illumina\\tLB:lib1\\tPU:L001' "
            "{REF} {i1} {i2} > {o}")
    else:
        template = (
            "bwa mem -M -t 20 -k 19 "
            "-R '@RG\\tID:{SN}\\tSM:{SN}\\tPL:illumina\\tLB:lib1\\tPU:L001' "
            "{REF} {i1} > {o}")

    # ``output_dir`` is the file-level path template, not a local value.
    target_dir = output_dir.format(
        path=base_outpath, PN=project_name, SN=sample_name)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    fields['o'] = self.output().path
    cmdline = template.format(**fields)
    os.system(cmdline)
    record_cmdline(cmdline)
def run(self):
    """Run GATK4 Mutect2 on a single bam.

    The first input bam is passed as the tumor sample. When the ``mt2_for``
    field of ``self.sample_NT`` equals ``NORMAL_SIG`` the plain command is
    used; for any other value ``--tumor-lod-to-emit 4`` is appended. If
    ``bed_file_path`` is set, `` --intervals <bed>`` is added at the end.
    """
    tumor_bam = self.input()[0].path
    mt2_id = pfn(self.sample_NT, 'mt2_for')

    bam_target = self.output().path
    prefix = bam_target.rpartition('.bam')[0]
    out_dir = bam_target.rpartition('/')[0]
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    suffix_str = (" --intervals %s" % bed_file_path) if bed_file_path else ''

    # NOTE(review): both templates pass --all-site-pls twice (once bare,
    # once with "true") - confirm the GATK version in use accepts that.
    if mt2_id == NORMAL_SIG:
        # Normal-only sample.
        template = (
            "{gatk} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 "
            "--reference {REF} -I {input_tumor} -tumor {T_name} --dbsnp {db_snp} "
            "--seconds-between-progress-updates 60 --all-site-pls -stand-call-conf 10 "
            "-A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality "
            "-A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest "
            "-A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true "
            "--output {prefix}.vcf -bamout {prefix}.bam")
    else:
        template = (
            "{gatk} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 "
            "--reference {REF} -I {input_tumor} -tumor {T_name} --dbsnp {db_snp} "
            "--seconds-between-progress-updates 60 --all-site-pls -stand-call-conf 10 "
            "-A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality "
            "-A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest "
            "-A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true "
            "--output {prefix}.vcf -bamout {prefix}.bam --tumor-lod-to-emit 4")

    cmdline = template.format(
        gatk=gatk_pro,
        REF=REF_file_path,
        db_snp=db_snp,
        input_tumor=tumor_bam,
        prefix=prefix,
        T_name=pfn(self.sample_NT, 'sample_name')) + suffix_str
    os.system(cmdline)
    record_cmdline(cmdline)
def run(self):
    """Run GATK4 Mutect2 on a normal/tumor pair.

    ``self.sample_IDs`` is a comma-separated pair of sample IDs whose order
    matches ``self.input()``. The ``mt2_for`` field of the first ID decides
    which position is the normal and which the tumor; if it is neither
    ``NORMAL_SIG`` nor ``TUMOR_SIG`` the bam paths and sample names are all
    left as empty strings. If ``bed_file_path`` is set, `` --intervals <bed>``
    is added at the end of the command.
    """
    sampleIDs = self.sample_IDs.split(',')

    bam_target = self.output().path
    out_dir = bam_target.rpartition('/')[0]
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Positions of the normal and tumor sample within the pair.
    first_role = pfn(sampleIDs[0], 'mt2_for')
    if first_role == NORMAL_SIG:
        normal_at, tumor_at = 0, 1
    elif first_role == TUMOR_SIG:
        normal_at, tumor_at = 1, 0
    else:
        normal_at = tumor_at = None

    if normal_at is None:
        input_normal = input_tumor = ''
        normal_name = tumor_name = ''
    else:
        input_normal = self.input()[normal_at].path
        input_tumor = self.input()[tumor_at].path
        normal_name = pfn(sampleIDs[normal_at], 'sample_name')
        tumor_name = pfn(sampleIDs[tumor_at], 'sample_name')

    prefix = bam_target.rpartition('.bam')[0]
    suffix_str = (" --intervals %s" % bed_file_path) if bed_file_path else ''

    template = (
        "{gatk} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 "
        "--reference {REF} -I {input_normal} -normal {N_name} "
        "-I {input_tumor} -tumor {T_name} --dbsnp {db_snp} "
        "--seconds-between-progress-updates 60 --all-site-pls -stand-call-conf 10 "
        "-A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality "
        "-A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest "
        "-A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true "
        "--output {prefix}.vcf -bamout {prefix}.bam")
    # ``cosmic`` is not used by the template; it is still passed, as before.
    cmdline = template.format(
        REF=REF_file_path,
        cosmic=cos_snp,
        db_snp=db_snp,
        input_tumor=input_tumor,
        input_normal=input_normal,
        gatk=gatk_pro,
        N_name=normal_name,
        T_name=tumor_name,
        prefix=prefix) + suffix_str
    os.system(cmdline)
    record_cmdline(cmdline)
def output(self):
    """Return the target for the cleaned read-1 fastq of ``self.PE1``.

    The file is ``<output1>.clean.fq.gz`` inside the project's trim
    directory, where the directory comes from ``trim_fmt`` and ``output1``
    is ``PE1_fmt`` filled with the parsed sample name.
    """
    project_name = pfn(self.PE1, 'project_name')
    output1 = PE1_fmt.format(input=pfn(self.PE1, 'sample_name'))
    trim_dir = trim_fmt.format(base=base_outpath, PN=project_name)
    # The file name must be relative: os.path.join() drops every earlier
    # component when a later one starts with '/', which made the previous
    # target resolve to '/<output1>.clean.fq.gz' at the filesystem root.
    return luigi.LocalTarget(
        os.path.join(trim_dir, '%s.clean.fq.gz' % output1))
def Add_in_vcf_PA(bam_list,
                  vcf_path,
                  output_vcf,
                  fasta_file='/home/liaoth/data/hg19/ucsc.hg19.fasta',
                  N_sig=NORMAL_SIG,
                  T_sig=TUMOR_SIG):
    """Write a copy of a paired-analysis VCF with bam-derived counts in INFO.

    For every SNP record the reference/alternate base counts returned by
    ``special_cal_cov`` are stored in INFO as ``SAD`` (counts) and ``SAF``
    (alt fraction, rounded to 4 digits), ``PoS`` is set to 2, and the
    per-sample FORMAT values ``AF`` and ``AD`` are copied into INFO.

    :param bam_list: bam paths; expected order is [normal one, tumor one].
    :param vcf_path: vcf path (str) or an open vcf file object.
    :param output_vcf: path of the vcf file to write.
    :param fasta_file: reference fasta handed to ``special_cal_cov``.
    :param N_sig: ``mt2_for`` value that marks the normal sample.
    :param T_sig: not used in this function body.
    :return: None; the result is written to ``output_vcf``.
    """
    ori_format2info = ['AF', 'AD']
    field1 = "SAD"
    field2 = "SAF"
    field3 = "PoS"
    # Role signature and sample name of each bam, in bam_list order.
    NT_SIG = [pfn(_bam, 'mt2_for') for _bam in bam_list]
    NT_name = [pfn(_bam, 'sample_name') for _bam in bam_list]

    if type(vcf_path) == str:
        vcf_readed = vcf.Reader(open(vcf_path, 'r'))
    else:
        try:
            vcf_readed = vcf.Reader(fsock=vcf_path)
        except:
            raise IOError
            # NOTE(review): this print follows the raise, so it never runs.
            print 'Wrong vcf, it is a %s' % str(type(vcf_path))

    pos_list = parsed_vcf2pos_list(vcf_path)

    is_single = False
    right_infos = vcf_readed.infos
    # Type of an existing INFO header entry; new entries are built with its
    # _make() from 6-item lists (presumably PyVCF's Info namedtuple - confirm).
    machine = right_infos.values()[0]
    # Modify the info part of the header.
    if not is_single:
        field1_info = [
            field1, '4', 'Integer',
            "(REF base count, alt base count). Self cal allele depths from bam file. If there are two pair, it is normal-tumore order."
        ]
        field2_info = [
            field2, 'R', 'Float',
            "Alt base count divide the total reads number in this pos. Self cal frequency from bam file. If there are two pair, it is normal-tumore order."
        ]
        field3_info = [
            field3, '1', 'Integer',
            "A field which describe this file is single only analysis or pair analysis. 1 for single analysis, 2 for pair analysis."
        ]
    right_infos[field1] = machine._make(field1_info + [None, None])
    right_infos[field2] = machine._make(field2_info + [None, None])
    right_infos[field3] = machine._make(field3_info + [None, None])
    for ori_format in ori_format2info:
        # Take the original FORMAT header values and pad them to length 6.
        _ori_format_info = vcf_readed.formats[ori_format]._asdict().values(
        ) + [None, None]
        _ori_format_info[
            3] += ". If there are two pair, it is normal-tumore order."
        # Override the declared number of values for the INFO copy.
        if ori_format == 'AD':
            _ori_format_info[1] = '4'
        elif ori_format == 'AF':
            _ori_format_info[1] = 'R'
        right_infos[ori_format] = machine._make(_ori_format_info)
    vcf_readed.infos = right_infos

    # Fetch the coverage info from the bam files and prepare the output file.
    all_cov_info = special_cal_cov(bam_list, pos_list, fasta_file)
    vcf_writer = vcf.Writer(open(output_vcf, 'w'), vcf_readed)

    for record in vcf_readed:
        if record.is_snp:
            # Coverage is looked up by (chromosome, POS - 1).
            query_for = (record.CHROM, record.POS - 1)
            buckec_SAD = []
            bucket_SAF = []
            # Creates the local lists bucket_AF and bucket_AD.
            for ori_format in ori_format2info:
                exec 'bucket_%s = []' % ori_format
            for sample_call in record.samples:
                # Map the vcf sample onto the matching coverage entry.
                sample = str(sample_call.sample)
                idx = [
                    NT_SIG.index(s) for s, n in zip(NT_SIG, NT_name)
                    if sample == n
                ][0]
                cov_info = all_cov_info[idx][query_for]
                # First entry is unpacked as (reference base, its count).
                ref_base, ref_cov = cov_info[0]
                if len(cov_info) > 2:
                    # NOTE(review): the range stops before the last entry,
                    # and when no entry equals ALT[0] alt_base/alt_cov keep
                    # the value of an earlier iteration (or are unbound).
                    for n_i in range(1, len(cov_info) - 1):
                        if cov_info[n_i][0] == record.ALT[0]:
                            alt_base, alt_cov = cov_info[n_i]
                elif len(cov_info) == 1:
                    alt_base = record.ALT[0]
                    alt_cov = 0
                else:
                    alt_base, alt_cov = cov_info[1]
                # Fix the bucket order to normal-tumor order.
                # NOTE(review): ``sample`` is a str and the right-hand side
                # is a list, so this test is always False and the else
                # branch always runs. The AD inserts below would also leave
                # the two values in reversed order.
                if sample == [
                        n for s, n in zip(NT_SIG, NT_name) if s == N_sig
                ]:
                    buckec_SAD.insert(0, int(alt_cov))
                    buckec_SAD.insert(0, int(ref_cov))
                    if sum((int(ref_cov), int(alt_cov))) != 0:
                        bucket_SAF.insert(
                            0,
                            round(
                                float(alt_cov) / sum(
                                    (int(ref_cov), int(alt_cov))), 4))
                    else:
                        bucket_SAF.insert(0, 0)
                    # FORMAT values of this sample, read by the exec'd code.
                    data = dict(sample_call.data._asdict())
                    for ori_format in ori_format2info:
                        if ori_format == 'AD':
                            exec "bucket_{i}.insert(0,tuple(data['{i}'])[0])".format(
                                i=ori_format)
                            exec "bucket_{i}.insert(0,tuple(data['{i}'])[1])".format(
                                i=ori_format)
                        else:
                            exec "bucket_{i}.insert(0,data['{i}'])".format(
                                i=ori_format)
                else:
                    buckec_SAD += [int(ref_cov), int(alt_cov)]
                    if sum((int(ref_cov), int(alt_cov))) != 0:
                        bucket_SAF.append(
                            round(
                                float(alt_cov) / sum(
                                    (int(ref_cov), int(alt_cov))), 4))
                    else:
                        bucket_SAF.append(0)
                    # FORMAT values of this sample, read by the exec'd code.
                    data = dict(sample_call.data._asdict())
                    for ori_format in ori_format2info:
                        if ori_format == 'AD':
                            exec "bucket_{i} += list(data['{i}'])".format(
                                i=ori_format)
                        else:
                            exec "bucket_{i}.append(data['{i}'])".format(
                                i=ori_format)
            record.INFO[field1] = buckec_SAD
            record.INFO[field2] = bucket_SAF
            # 2 marks a pair analysis (see the PoS header description).
            record.INFO[field3] = 2
            for ori_format in ori_format2info:
                exec "record.INFO['{i}'] = bucket_{i}".format(i=ori_format)
        vcf_writer.write_record(record)
    vcf_writer.close()
def output(self):
    """Return the target for the cleaned fastq of ``self.PE1``.

    The file is ``<PE1>.clean.fq.gz`` inside ``<project>_result/trim_result``
    under ``base_outpath``.
    """
    clean_fq = '{base}/{PN}_result/trim_result/{input1}.clean.fq.gz'.format(
        base=base_outpath,
        PN=pfn(self.PE1, 'project_name'),
        input1=self.PE1)
    return luigi.LocalTarget(clean_fq)