Пример #1
0
def filter_ann_txt_files_just_maf(infile, cols_to_filter, freq_req, outfile_prefix, protein_changing_definitions, exonic_definitions, genename):
    """Filter ``infile`` by allele frequency, then by quality and coverage.

    For each threshold in ``freq_req`` two calls are made to
    ``filtering_annotated.filter`` (defined outside this file) in
    ``working_dir``:

    1. ``infile`` -> ``1.temp`` with the test ``cols_to_filter[0] <= freq``;
    2. ``1.temp`` -> ``<outfile_prefix>.<freq>.<genename>.xls`` with the two
       columns listed in ``cols_to_filter[1]`` tested ``>= 30`` and ``>= 5``.

    ``protein_changing_definitions`` and ``exonic_definitions`` are accepted
    but not read by this function.
    """
    for max_freq in freq_req:
        # Frequency cut-off (the original comment calls these 'rare' variants).
        filtering_annotated.filter(
            working_dir, "and", infile, "1.temp",
            [cols_to_filter[0]], ['<='], [max_freq])
        # Quality >= 30 and coverage >= 5, as labelled in the original source.
        result_name = '.'.join([outfile_prefix, str(max_freq), genename]) + ".xls"
        filtering_annotated.filter(
            working_dir, "and", "1.temp", result_name,
            cols_to_filter[1], ['>=', '>='], [30, 5])
def filter_rpt_c3h(file_prefix):
    """Two-pass filter of ``<file_prefix>.annotated.txt`` into ``<file_prefix>.c3h.xls``.

    Pass 1 tests columns 11 and 12 for equality with ``'.'`` (original note:
    remove variants in rmsk / segdup) and writes ``<file_prefix>11.temp``.
    Pass 2 tests column 25 for inequality with ``'.'`` (original note:
    present in the C3H mouse).
    """
    annotated = file_prefix + '.annotated.txt'
    no_repeats = file_prefix + '11.temp'
    filtering_annotated.filter(
        working_dir, "and", annotated, no_repeats,
        [11, 12], ['=='] * 2, ['.'] * 2)
    filtering_annotated.filter(
        working_dir, "and", no_repeats, file_prefix + ".c3h.xls",
        [25], ['!='], ['.'])
def filter_exonic_variants(infile, outfile):
    """Select exonic/splicing rows of ``infile`` and drop synonymous SNVs.

    Column 6 is compared (combined with "or") against ``'exonic'`` and
    ``'splicing'``; the intermediate result is written to ``temp1.txt``.
    Column 9 is then required to differ from ``'synonymous SNV'`` and the
    result is written to ``outfile``.
    """
    region_col, region_terms = 6, ['exonic', 'splicing']
    effect_col, silent_term = 9, 'synonymous SNV'
    scratch = 'temp1.txt'
    # exonic OR splicing
    filtering_annotated.filter(
        working_dir, "or", infile, scratch,
        [region_col] * len(region_terms),
        ['=='] * len(region_terms),
        region_terms)
    # not synonymous
    filtering_annotated.filter(
        working_dir, "and", scratch, outfile,
        [effect_col], ['!='], [silent_term])
def filter_ann_file_3(file_prefix):
    """Write ``<file_prefix>.non_rpt.in_any.xls`` from ``<file_prefix>.annotated.txt``.

    Step 1 ("and"): columns 11 and 12 are both tested ``== '.'`` (original
    note: not in rmsk / segdup); output is ``<file_prefix>21.temp``.
    Step 2 ("or"): columns 16, 17 and 18 are each tested ``!= '.'``.

    NOTE: the original inline comment for step 2 read "homozygous in all
    three mice", which does not match the call ("or" with ``!= '.'``, output
    named ``in_any``); this docstring describes the call as written.
    """
    mouse_cols = [16, 17, 18]
    repeat_free = file_prefix + "21.temp"
    filtering_annotated.filter(
        working_dir, "and", file_prefix + '.annotated.txt', repeat_free,
        [11, 12], ['=='] * 2, ['.'] * 2)
    filtering_annotated.filter(
        working_dir, "or", repeat_free, file_prefix + '.non_rpt.in_any.xls',
        mouse_cols, ['!='] * len(mouse_cols), ['.'] * len(mouse_cols))
def filter_ann_file_2(file_prefix):
    """Write ``<file_prefix>.non_rpt_b6.hom_in_all.xls`` from ``<file_prefix>.annotated.txt``.

    Step 1 ("and"): columns 11, 12 and 15 must all equal ``'.'`` (original
    note: not in rmsk, segdup or b6); output is ``<file_prefix>11.temp``.
    Step 2 ("and"): columns 16, 17 and 18 must all equal ``'hom'``
    (original note: homozygous in all three mice).
    """
    exclusion_cols = [11, 12, 15]
    genotype_cols = [16, 17, 18]
    scratch = file_prefix + "11.temp"
    filtering_annotated.filter(
        working_dir, "and", file_prefix + '.annotated.txt', scratch,
        exclusion_cols, ['=='] * len(exclusion_cols),
        ['.'] * len(exclusion_cols))
    filtering_annotated.filter(
        working_dir, "and", scratch,
        file_prefix + '.non_rpt_b6.hom_in_all.xls',
        genotype_cols, ['=='] * len(genotype_cols),
        ['hom'] * len(genotype_cols))
def filter_for_enu(file_prefix):
    """Produce two filtered tables from ``<file_prefix>.annotated.txt``.

    Both passes read the annotated file directly and require every listed
    column to equal ``'.'`` ("and"):

    * ``<file_prefix>.enu_rpts.xls`` - columns 11, 12, 13, 14, 25
      (original note: not in rmsk, segdup, dbsnp, mgp, c3h);
    * ``<file_prefix>.enu.xls`` - columns 13, 14, 25
      (original note: not in dbsnp, mgp, c3h).
    """
    source = file_prefix + '.annotated.txt'
    passes = (
        (".enu_rpts.xls", [11, 12, 13, 14, 25]),
        (".enu.xls", [13, 14, 25]),
    )
    for suffix, columns in passes:
        filtering_annotated.filter(
            working_dir, "and", source, file_prefix + suffix,
            columns, ['=='] * len(columns), ['.'] * len(columns))
Пример #7
0
def filter_ann_txt_files(samples, cols_to_filter, freq_req, outfile_prefix):
    """Run a four-stage filter for every (sample, frequency) pair.

    Input is ``<outfile_prefix>.<sample>.annotated.txt``.  Stages, each a
    call to ``filtering_annotated.filter`` in ``working_dir``:

    1. ``cols_to_filter[1] <= freq``                    -> ``1.temp``
    2. the two columns in ``cols_to_filter[4]`` ``>= 30`` / ``>= 5``
                                                        -> ``2.temp``
    3. ``cols_to_filter[0]`` equals ``exonic_definitions[0]`` OR
       ``exonic_definitions[1]``        -> ``<outfile_prefix>.<freq>3.temp``
    4. ``cols_to_filter[2]`` differs from every entry of
       ``non_protein_changing_definitions``
          -> ``<outfile_prefix>.<sample>.<freq>.protein_changing.xls``

    ``exonic_definitions``, ``non_protein_changing_definitions`` and
    ``make_list`` are module-level names defined outside this block.
    """
    for sample in samples:
        for freq in freq_req:
            annotated = outfile_prefix + '.' + sample + '.annotated.txt'
            # stage 1: frequency cut-off ('rare' in the original comment)
            filtering_annotated.filter(
                working_dir, "and", annotated, "1.temp",
                [cols_to_filter[1]], ['<='], [freq])
            # stage 2: quality >= 30 and coverage >= 5
            filtering_annotated.filter(
                working_dir, "and", "1.temp", "2.temp",
                cols_to_filter[4], ['>=', '>='], [30, 5])
            # stage 3: exonic variants in refGene
            exonic_temp = outfile_prefix + '.' + str(freq) + "3.temp"
            filtering_annotated.filter(
                working_dir, "or", "2.temp", exonic_temp,
                [cols_to_filter[0]] * 2, ['=='] * 2,
                [exonic_definitions[0], exonic_definitions[1]])
            # stage 4: everything that is not on the non-protein-changing list
            protein_out = (outfile_prefix + '.' + sample + '.' + str(freq)
                           + ".protein_changing.xls")
            function_cols = make_list(cols_to_filter[2],
                                      non_protein_changing_definitions)
            not_equal_ops = make_list('!=', non_protein_changing_definitions)
            filtering_annotated.filter(
                working_dir, "and", exonic_temp, protein_out,
                function_cols, not_equal_ops,
                non_protein_changing_definitions)
Пример #8
0
def filter_ann_file_mt2(file_prefix):
    """Filter ``<file_prefix>.mt2.annotated.txt`` into PASS and exonic-rare tables.

    Outputs:

    * ``<file_prefix>.mt2.passed.xls`` - rows where column 137 equals
      ``'PASS'``;
    * ``<file_prefix>.mt2.exonic.rare.xls`` - column 6 equals
      ``exon_definition[0]`` or ``exon_definition[1]``, column 9 is not
      ``'synonymous SNV'``, and columns 83, 100 and 117 are all
      ``<= freq_req``.

    NOTE(review): the exonic/rare chain starts again from the annotated
    file, not from the PASS subset - confirm this is intended.
    ``exon_definition`` and ``freq_req`` are module-level names defined
    outside this block.
    """
    annotated = file_prefix + '.mt2.annotated.txt'
    exonic_temp = file_prefix + "_1.temp"
    nonsyn_temp = file_prefix + "_2.temp"
    population_cols = [83, 100, 117]

    # keep calls flagged PASS
    filtering_annotated.filter(
        working_dir, "and", annotated, file_prefix + '.mt2.passed.xls',
        [137], ['=='], ['PASS'])

    # exonic variants
    filtering_annotated.filter(
        working_dir, "or", annotated, exonic_temp,
        [6] * 2, ['=='] * 2, [exon_definition[0], exon_definition[1]])
    # remove synonymous
    filtering_annotated.filter(
        working_dir, "and", exonic_temp, nonsyn_temp,
        [9], ['!='], ['synonymous SNV'])
    # frequency at or below freq_req in every listed column
    filtering_annotated.filter(
        working_dir, "and", nonsyn_temp, file_prefix + ".mt2.exonic.rare.xls",
        population_cols, ['<='] * 3, [freq_req] * 3)
Пример #9
0
def filter_ann_txt_files(infile, cols_to_filter, freq_req, outfile_prefix, protein_changing_definitions, exonic_definitions, genename):
    """Run a four-stage filter on ``infile`` for every frequency threshold.

    Stages, each a call to ``filtering_annotated.filter`` in ``working_dir``:

    1. ``cols_to_filter[0] <= freq``                         -> ``1.temp``
    2. the two columns in ``cols_to_filter[1]`` ``>= 30`` / ``>= 5``
                                                             -> ``2.temp``
    3. ``cols_to_filter[2]`` equals ``exonic_definitions[0]`` OR
       ``exonic_definitions[1]`` -> ``<outfile_prefix>.<freq>.exonic_temp.xls``
    4. ``cols_to_filter[3]`` equals any entry of
       ``protein_changing_definitions``
       -> ``<outfile_prefix>.<freq>.protein_changing.<genename>.xls``

    ``make_list`` is a module-level helper defined outside this block.
    """
    for freq in freq_req:
        stem = outfile_prefix + '.' + str(freq)
        # stage 1: frequency cut-off ('rare' in the original comment)
        filtering_annotated.filter(
            working_dir, "and", infile, "1.temp",
            [cols_to_filter[0]], ['<='], [freq])
        # stage 2: quality >= 30 and coverage >= 5
        filtering_annotated.filter(
            working_dir, "and", "1.temp", "2.temp",
            cols_to_filter[1], ['>=', '>='], [30, 5])
        # stage 3: exonic variants in refGene
        exonic_file = stem + ".exonic_temp.xls"
        filtering_annotated.filter(
            working_dir, "or", "2.temp", exonic_file,
            [cols_to_filter[2]] * 2, ['=='] * 2,
            [exonic_definitions[0], exonic_definitions[1]])
        # stage 4: all protein-changing classes
        filtering_annotated.filter(
            working_dir, "or", exonic_file,
            stem + ".protein_changing." + genename + ".xls",
            make_list(cols_to_filter[3], protein_changing_definitions),
            make_list('==', protein_changing_definitions),
            protein_changing_definitions)
Пример #10
0
def filter_ann_file(file_prefix):
    """Filter ``<file_prefix>.chr17.ko_only.xls`` into ``<file_prefix>.chr17.ko_only.129.xls``.

    Columns 17, 18 and 19 are each tested ``!= '.'`` and the tests are
    combined with "or" (original note: "in 129").

    The stages that would create the input file are commented out in the
    original source, so ``<file_prefix>.chr17.ko_only.xls`` must already
    exist.  Those disabled stages were:

    * repeat removal: columns 11 and 12 ``== '.'`` -> ``<file_prefix>11.temp``;
    * region selection on column 1 ``== 'chr17'``, optionally with column 3
      between 20000000 and 30000000 / 40000000 / 60000000;
    * "unique to ko": column 15 ``== '.'`` for each region file.
    """
    strain_cols = [17, 18, 19]
    ko_only = file_prefix + '.chr17.ko_only.xls'
    filtering_annotated.filter(
        working_dir, "or", ko_only, file_prefix + '.chr17.ko_only.129.xls',
        strain_cols, ['!='] * len(strain_cols), ['.'] * len(strain_cols))
Пример #11
0
def filter_ann_file_somatic_only(file_prefix, sample):
    """Keep variants of ``sample`` whose matched-control column equals ``'.'``.

    Reads ``<file_prefix>.<sample>.annotated.txt`` and writes
    ``<file_prefix>.<sample>.not_in_<control>.xls``.  The control and the
    column tested depend on the sample name:

    * TZ001, TZ002        -> TZ008, column 153
    * TZ003               -> TZ009, column 154
    * TZ004, TZ005, TZ006 -> TZ007, column 152

    Any other sample name prints a message and nothing is filtered.
    """
    file_prefix = file_prefix + '.' + sample
    # sample name -> (output suffix, column holding the control's call)
    tz008 = (".not_in_TZ008.xls", 153)
    tz009 = (".not_in_TZ009.xls", 154)
    tz007 = (".not_in_TZ007.xls", 152)
    control_for = {
        'TZ001': tz008, 'TZ002': tz008,
        'TZ003': tz009,
        'TZ004': tz007, 'TZ005': tz007, 'TZ006': tz007,
    }
    match = control_for.get(sample)
    if match is None:
        print(sample, 'sample name not recognized')
        return
    out_suffix, control_col = match
    filtering_annotated.filter(
        working_dir, "and", file_prefix + '.annotated.txt',
        file_prefix + out_suffix, [control_col], ['=='], ['.'])
def filter_rpt(file_prefix):
    """Write ``<file_prefix>.c15_vars.xls`` from ``<file_prefix>.annotated.txt``.

    Columns 11 and 12 must both equal ``'.'`` (original note: remove
    variants in rmsk / segdup).
    """
    repeat_cols = [11, 12]
    filtering_annotated.filter(
        working_dir, "and",
        file_prefix + '.annotated.txt', file_prefix + ".c15_vars.xls",
        repeat_cols, ['=='] * len(repeat_cols), ['.'] * len(repeat_cols))
		genome_and_window = homozygosity_mapping_ub26.make_windows(working_dir, genome_fai, ws, step_size).split('.')[0]

		##make bed file from variants
		homozygosity_mapping_ub26.make_bed_from_ann(working_dir, 'samtools', sample +  '.hom_temp.txt', zygosity_col, info_col)
		##hom and het count and hom percentage
		homozygosity_mapping_ub26.count_and_percentage(working_dir, genome_and_window, sample + '.bed')
		##naf
		homozygosity_mapping_ub26.naf_in_window(working_dir, genome_and_window, sample + '.bed')
		##total snp number
		homozygosity_mapping_ub26.total_snp_in_window(working_dir, genome_and_window, sample + '.bed')

		##combine bedgraphs for graphing in r
		homozygosity_mapping_ub26.combine_bedgraphs_for_r(working_dir, sample, genome_and_window)
'''

# '''
##filter variants for candidates snps
# Candidate SNPs for the two affected samples: restrict to the region labelled
# "auts2" in the original source, then require 'het' in columns 14-17.
affected_samples = ['M75', 'M77']
for sample in affected_samples:
    # column 1 == 'chr5' and 131437306 <= column 2, column 3 <= 132542649
    filtering_annotated.filter(
        working_dir, "and",
        sample + '.annotated.txt', sample + "_1.temp",
        [1, 2, 3], ['==', '>=', '<='], ['chr5', 131437306, 132542649])
    # 'het' in every one of columns 14, 15, 16 and 17
    filtering_annotated.filter(
        working_dir, "and",
        sample + "_1.temp", sample + ".auts2_het_in_all.xls",
        [14, 15, 16, 17], ['=='] * 4, ['het'] * 4)
# '''
# Top-level pipeline driver: these calls run in order when the script executes.
# The helpers are defined outside this excerpt; judging by their names they
# call variants with samtools, convert to ANNOVAR input, run table_annovar and
# reformat its multianno output - TODO confirm against the definitions.
# The two steps below (fastq combining and bwa alignment) are disabled.
# combine_fq_file(r1_files_to_combine, r2_files_to_combine, K541_combined_r1, K541_combined_r2)
# align_with_bwa(fq_dict)
variant_calling_samtools(fq_dict, mkdup_bam, st_vcf_suffix)
# The suffix passed here is the samtools VCF suffix plus '.gz'.
convert_to_annovar(fq_dict, st_vcf_suffix + '.gz')
run_table_annovar(fq_dict)
multianno_to_annotated(fq_dict)




##filter variants for variants,  homozygsity mapping then counts
# '''
##filter variants for candidates snps
for sample in samples:
    # Five chained filters per sample; each reads the previous stage's output.
    # 1) exonic variants: col_exon matches either entry of exon_definition
    filtering_annotated.filter(
        working_dir, "or", sample + '.annotated.txt', sample + "_1.temp",
        [col_exon] * 2, ['=='] * 2, [exon_definition[0], exon_definition[1]])
    # 2) remove synonymous
    filtering_annotated.filter(
        working_dir, "and", sample + "_1.temp", sample + "_2.temp",
        [col_function], ['!='], [syn_definition])
    # 3) columns 13-23 must all be empty strings
    #    (original note: not in dbsnp, sanger, or another mouse line)
    filtering_annotated.filter(
        working_dir, "and", sample + "_2.temp", sample + "_3.temp",
        list(range(13, 24)), ['=='] * 11, [''] * 11)
    # 4) keep if hom
    filtering_annotated.filter(
        working_dir, "and", sample + "_3.temp",
        sample + '.hom_exonic_rare.xls',
        [zygosity_col], ['=='], ['hom'])
    # 5) coverage and quality thresholds
    filtering_annotated.filter(
        working_dir, "and", sample + '.hom_exonic_rare.xls',
        sample + '.hom_exonic_rare_qual_filtered.xls',
        [cov_col, qual_col], ['>='] * 2, [cov_definition, qual_definition])
# '''

	
# '''
##homozygosity mapping
# window_size = [100000,500000,1000000,2000000]
window_size = [10000000]
		##naf
		homozygosity_mapping_cybertron.naf_in_window(working_dir, genome_and_window, sample + '.bed')
		##total snp number
		homozygosity_mapping_cybertron.total_snp_in_window(working_dir, genome_and_window, sample + '.bed')

		##combine bedgraphs for graphing in r
		homozygosity_mapping_cybertron.combine_bedgraphs_for_r(working_dir, sample, genome_and_window)
'''
## add not in affected mouse to c3Hh analysis
# '''
samples = ['mut_combined']
for ws in window_size:
    for sample in samples:
        # Chain of five filters; every stage reads the previous stage's file.
        # a) columns 11 and 12 empty (original note: not in rmsk / segdup)
        filtering_annotated.filter(
            working_dir, "and",
            sample + '.annotated.txt', sample + "21.temp",
            [11, 12], ['=='] * 2, [''] * 2)
        # b) column 22 is not 'hom' (original note: not hom in unaffected mouse)
        filtering_annotated.filter(
            working_dir, "and",
            sample + '21.temp', sample + "31.temp",
            [22], ['!='], ['hom'])
        # c) column 26 is 'hom' (original note: hom in the c3h mouse)
        filtering_annotated.filter(
            working_dir, "and",
            sample + '31.temp', sample + "41.temp",
            [26], ['=='], ['hom'])
        # d) coverage and quality thresholds
        filtering_annotated.filter(
            working_dir, "and",
            sample + "41.temp", sample + '.hom_temp.txt',
            [cov_col, qual_col], ['>='] * 2,
            [cov_definition, qual_definition])
        # e) homozygous calls only
        filtering_annotated.filter(
            working_dir, "and",
            sample + ".hom_temp.txt",
            sample + '.no_rpts_c3h_notunaff_hom.xls',
            [zygosity_col], ['=='], ['hom'])
Пример #16
0
		##hom and het count and hom percentage
		homozygosity_mapping_cybertron.count_and_percentage(working_dir, genome_and_window, sample + '.bed')
		##naf
		homozygosity_mapping_cybertron.naf_in_window(working_dir, genome_and_window, sample + '.bed')
		##total snp number
		homozygosity_mapping_cybertron.total_snp_in_window(working_dir, genome_and_window, sample + '.bed')

		##combine bedgraphs for graphing in r
		homozygosity_mapping_cybertron.combine_bedgraphs_for_r(working_dir, sample, genome_and_window)
# '''

## Candidate SNPs: three chained filters per sample.
for sample in samples:
    # columns 11-16 must all be empty strings
    # (original note: not in dbsnp, sanger, or another mouse line)
    filtering_annotated.filter(
        working_dir, "and",
        sample + '.annotated.txt', sample + "_23.temp",
        list(range(11, 17)), ['=='] * 6, [''] * 6)
    # homozygous calls only
    filtering_annotated.filter(
        working_dir, "and",
        sample + "_23.temp", sample + "_24.temp",
        [zygosity_col], ['=='], ['hom'])
    # coverage and quality thresholds
    filtering_annotated.filter(
        working_dir, "and",
        sample + "_24.temp", sample + '.rare_qual_filtered.xls',
        [cov_col, qual_col], ['>='] * 2,
        [cov_definition, qual_definition])

## File suffixes for the C3H window search (filtered variants for counts).
c3h_snp_suffix = '.c3h_filtered.xls'
c3h_snp_bed_suffix = '.c3h_filtered.bed'
	filtering_annotated.filter(working_dir, "and", sample + '.hom_exonic_rare.xls', sample + '.hom_exonic_rare_qual_filtered.xls', [cov_col,qual_col], ['>=','>='], [cov_definition,qual_definition])
'''

# '''
##homozygosity mapping
##for 0917 analysis
# Window sizes for the 0917 analysis; the original also carried a disabled
# single-size alternative, [10000000].
window_size = [100000, 500000, 1000000, 2000000]
step_size = 100000
fq_dict = ['timon_comb']
for ws in window_size:
    for sample in fq_dict:
        # Columns 11-23 must all be empty strings (original note: not in
        # dbsnp, sanger, other ped or rmsk).  A disabled variant in the
        # original tested only columns 11, 12 and 23.
        filtering_annotated.filter(
            working_dir, "and",
            sample + '.annotated.txt', sample + "11.temp",
            list(range(11, 24)), ['=='] * 13, [''] * 13)

        # coverage and quality thresholds
        filtering_annotated.filter(
            working_dir, "and",
            sample + "11.temp", sample + '.hom_temp.txt',
            [cov_col, qual_col], ['>='] * 2,
            [cov_definition, qual_definition])

        # make_windows returns a file name; keep the part before the first
        # '.' as the genome-and-window label.
        genome_and_window = homozygosity_mapping_ub26.make_windows(
            working_dir, genome_fai, ws, step_size
        ).split('.')[0]

        ##make bed file from variants
Пример #18
0
def filter_ann_file_causal(file_prefix, sample):
    """Filter one sample down to rare, exonic, non-synonymous, control-absent variants.

    All files are named from ``<file_prefix>.<sample>``.  Stages:

    1. column 9 ``== 'PASS'``: ``.annotated.txt`` -> ``_0.temp``.  This runs
       for every sample name, including unrecognized ones.
    2. matched-control column ``== '.'`` -> ``.not_in_<control>.xls``
    3. ``col_exon`` equals ``exon_definition[0]`` or ``exon_definition[1]``
       -> ``_1.temp``
    4. ``col_function != syn_definition`` -> ``_2.temp``
    5. every column in ``af_cols`` ``<= freq_req``
       -> ``.not_in_<control>.exonic.rare.xls``

    Control and column per sample:

    * TZ001, TZ002        -> TZ008, column 153
    * TZ003               -> TZ009, column 154
    * TZ004, TZ005, TZ006 -> TZ007, column 152

    An unrecognized sample prints a message after stage 1 and stops.

    The original repeated stages 2-5 verbatim in three if/elif branches that
    differed only in the control name and column; they are expressed once
    here with the same calls and the same file names.  ``col_exon``,
    ``exon_definition``, ``col_function``, ``syn_definition``, ``af_cols``
    and ``freq_req`` are module-level names defined outside this block.
    """
    file_prefix = file_prefix + '.' + sample
    # keep calls flagged PASS
    filtering_annotated.filter(
        working_dir, "and", file_prefix + '.annotated.txt',
        file_prefix + "_0.temp", [9], ['=='], ['PASS'])

    # sample name -> (matched control, column holding the control's call)
    control_for = {
        'TZ001': ('TZ008', 153), 'TZ002': ('TZ008', 153),
        'TZ003': ('TZ009', 154),
        'TZ004': ('TZ007', 152), 'TZ005': ('TZ007', 152),
        'TZ006': ('TZ007', 152),
    }
    match = control_for.get(sample)
    if match is None:
        print(sample, 'sample name not recognized')
        return
    control, control_col = match
    not_in_control = file_prefix + ".not_in_" + control + ".xls"

    # absent from the matched control
    filtering_annotated.filter(
        working_dir, "and", file_prefix + "_0.temp", not_in_control,
        [control_col], ['=='], ['.'])
    # exonic variants
    filtering_annotated.filter(
        working_dir, "or", not_in_control, file_prefix + "_1.temp",
        [col_exon, col_exon], ['==', '=='],
        [exon_definition[0], exon_definition[1]])
    # remove synonymous
    filtering_annotated.filter(
        working_dir, "and", file_prefix + "_1.temp", file_prefix + "_2.temp",
        [col_function], ['!='], [syn_definition])
    # at or below freq_req in every af_cols column
    filtering_annotated.filter(
        working_dir, "and", file_prefix + "_2.temp",
        file_prefix + ".not_in_" + control + ".exonic.rare.xls",
        af_cols, ['<=', '<=', '<='], [freq_req, freq_req, freq_req])
Пример #19
0
        window_bed = genome_and_window + '.bed'
        ##all shared snps
        out_bed = bed_to_graph.rsplit('.',
                                      1)[0] + '.' + genome_and_window + '.bed'
        ##bedtools intersect
        with open('temp.bed', "w") as naf_fh:
            hom_bt_intersect = subprocess.Popen([
                bedtools, 'intersect', '-a', window_bed, '-b', bed_to_graph,
                '-c'
            ],
                                                stdout=naf_fh)
            hom_bt_intersect.wait()
        ##filter for region of interest
        if '30mb' in bed_to_graph:
            filtering_annotated.filter(working_dir, "and", 'temp.bed',
                                       'temp2.bed', [1, 3, 3],
                                       ['==', '>', '<='],
                                       ['chr17', 20000000, 30000000])
        elif '40mb' in bed_to_graph:
            filtering_annotated.filter(working_dir, "and", 'temp.bed',
                                       'temp2.bed', [1, 3, 3],
                                       ['==', '>', '<='],
                                       ['chr17', 20000000, 40000000])
        elif '60mb' in bed_to_graph:
            filtering_annotated.filter(working_dir, "and", 'temp.bed',
                                       'temp2.bed', [1, 3, 3],
                                       ['==', '>', '<='],
                                       ['chr17', 20000000, 60000000])

        else:
            filtering_annotated.filter(working_dir, "and", 'temp.bed',
                                       'temp2.bed', [1], ['=='], ['chr17'])
Пример #20
0
def filter_ann_txt_files(exac_txt, cases_txt, control_txt, esp6500_cols, case_controls_col, freq_req):
    """Build rare / exonic / disruptive / protein-changing / damaging tables.

    ``cases_txt`` and ``control_txt`` are first de-duplicated into
    ``cases.nodup_temp.txt`` and ``controls.nodup_temp.txt`` via
    ``remove_dup_vars_and_add_count``.  Then, for every ``freq`` in
    ``freq_req``:

    * for ``cases`` and ``controls``: a frequency filter on the columns in
      ``case_controls_col[1]`` (``<= freq``), a quality/coverage filter on
      ``case_ctl_qual_cov`` (``>= 30`` / ``>= 5``), then the shared
      classification pipeline using ``case_controls_col``;
    * for ExAC (group name ``exac3_1115``): a frequency filter on
      ``exac_txt`` using ``esp6500_rvis_col[1]``, then the same shared
      pipeline using ``esp6500_rvis_col``.

    All outputs are named ``<group>.<freq>.<suffix>``.

    NOTE(review): the ``esp6500_cols`` parameter is never read; the ExAC
    branch uses the module-level ``esp6500_rvis_col`` instead.  This is kept
    unchanged because callers are not visible here.

    The original spelled out the nine classification steps twice (once per
    branch); they now live in ``_classify_exonic_variants`` with the same
    calls, in the same order, on the same file names.
    """
    # remove duplicate vars from cases and controls and add count in first col
    remove_dup_vars_and_add_count(cases_txt, 'cases.nodup_temp.txt')
    remove_dup_vars_and_add_count(control_txt, 'controls.nodup_temp.txt')
    for freq in freq_req:
        for group in ['cases', 'controls']:
            prefix = group + '.' + str(freq)
            # frequency cut-off ('rare' in the original comment)
            filtering_annotated.filter(
                working_dir, "and", group + '.nodup_temp.txt',
                prefix + ".1.temp", case_controls_col[1],
                make_list('<=', case_controls_col[1]),
                make_list(freq, case_controls_col[1]))
            # quality >= 30 and coverage >= 5
            filtering_annotated.filter(
                working_dir, "and", prefix + ".1.temp", prefix + ".2.temp",
                case_ctl_qual_cov, ['>=', '>='], [30, 5])
            _classify_exonic_variants(prefix, prefix + ".2.temp",
                                      case_controls_col)
        # ExAC: no quality/coverage stage, so classification reads .1.temp
        prefix = 'exac3_1115' + '.' + str(freq)
        filtering_annotated.filter(
            working_dir, "and", exac_txt, prefix + ".1.temp",
            esp6500_rvis_col[1],
            make_list('<=', esp6500_rvis_col[1]),
            make_list(freq, esp6500_rvis_col[1]))
        _classify_exonic_variants(prefix, prefix + ".1.temp",
                                  esp6500_rvis_col)


def _classify_exonic_variants(prefix, source, cols):
    """Run the exonic -> disruptive / protein-changing / damaging steps for one group.

    ``source`` is the input table, ``prefix`` the ``<group>.<freq>`` stem of
    every output, and ``cols`` the column spec: ``cols[0]`` is the exonic
    column, ``cols[2]`` the function column and ``cols[3]`` the list of score
    columns.  The first two score columns are tested against ``pp2_score``,
    the remaining ones against ``cadd_gerp_score``.
    """
    exonic = prefix + ".exonic.xls"
    nonsyn = prefix + ".3.temp"
    # exonic variants in refGene
    filtering_annotated.filter(
        working_dir, "or", source, exonic,
        [cols[0], cols[0]], ['==', '=='],
        [exonic_definitions[0], exonic_definitions[1]])
    # disruptive
    filtering_annotated.filter(
        working_dir, "or", exonic, prefix + ".disruptive.xls",
        make_list(cols[2], disruptive_definitions),
        make_list('==', disruptive_definitions),
        disruptive_definitions)
    # all protein changing
    filtering_annotated.filter(
        working_dir, "or", exonic, prefix + ".protein_changing.xls",
        make_list(cols[2], protein_changing_definitions),
        make_list('==', protein_changing_definitions),
        protein_changing_definitions)
    # non-synonymous SNPs, the input for both "damaging" tables
    filtering_annotated.filter(
        working_dir, "or", exonic, nonsyn,
        [cols[2], cols[2]], ['==', '=='], nosyn_definitions)
    # damaging in all: either of the first two scores, AND the remaining
    # scores.  (A single-pass "and" over all scores with damaging_definitions
    # was disabled in the original.)
    filtering_annotated.filter(
        working_dir, "or", nonsyn, prefix + ".4a.temp",
        cols[3][:2], make_list('>=', cols[3][:2]), pp2_score)
    filtering_annotated.filter(
        working_dir, "and", prefix + ".4a.temp", prefix + ".4b.temp",
        cols[3][2:], make_list('>=', cols[3][2:]), cadd_gerp_score)
    remove_rows_with_no_data(prefix + ".4b.temp",
                             prefix + ".damaging_all.xls", cols[3])
    # damaging in any: at least one score passes
    filtering_annotated.filter(
        working_dir, "or", nonsyn, prefix + ".5.temp",
        cols[3], make_list('>=', cols[3]), damaging_definitions)
    remove_rows_with_no_data(prefix + ".5.temp",
                             prefix + ".damaging_any.xls", cols[3])
Пример #21
0
def filter_ann_txt_files_just_exac(exac_txt, esp6500_cols, freq_req):
    """Build rare / exonic / disruptive / protein-changing / damaging ExAC tables.

    For every ``freq`` in ``freq_req`` the outputs are named
    ``<exac_unfiltered_prefix>.<freq>.<suffix>``.  Columns come from the
    module-level ``esp6500_rvis_col``: ``[1]`` frequency columns, ``[0]``
    exonic column, ``[2]`` function column, ``[3]`` score columns.

    NOTE(review): the ``esp6500_cols`` parameter is never read; the
    module-level ``esp6500_rvis_col`` is used instead.
    """
    for freq in freq_req:
        stem = exac_unfiltered_prefix + '.' + str(freq)
        rare = stem + ".1.temp"
        exonic = stem + ".exonic.xls"
        nonsyn = stem + ".3.temp"
        scores = esp6500_rvis_col[3]

        # frequency cut-off ('rare' in the original comment)
        filtering_annotated.filter(
            working_dir, "and", exac_txt, rare,
            esp6500_rvis_col[1],
            make_list('<=', esp6500_rvis_col[1]),
            make_list(freq, esp6500_rvis_col[1]))
        # exonic variants in refGene
        filtering_annotated.filter(
            working_dir, "or", rare, exonic,
            [esp6500_rvis_col[0]] * 2, ['=='] * 2,
            [exonic_definitions[0], exonic_definitions[1]])
        # disruptive
        filtering_annotated.filter(
            working_dir, "or", exonic, stem + ".disruptive.xls",
            make_list(esp6500_rvis_col[2], disruptive_definitions),
            make_list('==', disruptive_definitions),
            disruptive_definitions)
        # all protein changing
        filtering_annotated.filter(
            working_dir, "or", exonic, stem + ".protein_changing.xls",
            make_list(esp6500_rvis_col[2], protein_changing_definitions),
            make_list('==', protein_changing_definitions),
            protein_changing_definitions)
        # non-synonymous SNPs, the input for both "damaging" tables
        filtering_annotated.filter(
            working_dir, "or", exonic, nonsyn,
            [esp6500_rvis_col[2]] * 2, ['=='] * 2, nosyn_definitions)
        # damaging in all: either of the first two scores, AND the rest
        filtering_annotated.filter(
            working_dir, "or", nonsyn, stem + ".4a.temp",
            scores[:2], make_list('>=', scores[:2]), pp2_score)
        filtering_annotated.filter(
            working_dir, "and", stem + ".4a.temp", stem + ".4b.temp",
            scores[2:], make_list('>=', scores[2:]), cadd_gerp_score)
        remove_rows_with_no_data(stem + ".4b.temp",
                                 stem + ".damaging_all.xls", scores)
        # damaging in any: at least one score passes
        filtering_annotated.filter(
            working_dir, "or", nonsyn, stem + ".5.temp",
            scores, make_list('>=', scores), damaging_definitions)
        remove_rows_with_no_data(stem + ".5.temp",
                                 stem + ".damaging_any.xls", scores)
# Settings for the rescue analysis.  The original also carried a disabled
# alternative: window_size = [1000000, 500000, 100000] with step_size = 100000.
hom_bed_suffix = '.resc_hom.bed'
window_size = [100000]
step_size = 10000
zygosity_col = 27
cov_col = 29
cov_definition = 20
qual_col = 28
qual_definition = 30
working_dir = work_dir
genome_fai = fasta_fai

# NOTE(review): hom_snp_suffix (used for the final output name) is not
# assigned in this excerpt - presumably defined earlier in the file; confirm.
for sample in samples_to_annotate:
    # column 16 empty (original note: not in a repeat region)
    filtering_annotated.filter(
        working_dir, "and",
        sample + '.annotated.xls', sample + "21.temp",
        [16], ['=='], [''])
    # columns 4 and 5 are not '-' (original note: not an indel)
    filtering_annotated.filter(
        working_dir, "and",
        sample + "21.temp", sample + "22.temp",
        [4, 5], ['!='] * 2, ['-'] * 2)
    # coverage and quality thresholds
    filtering_annotated.filter(
        working_dir, "and",
        sample + "22.temp", sample + "23.temp",
        [cov_col, qual_col], ['>='] * 2, [cov_definition, qual_definition])
    # homozygous calls only
    filtering_annotated.filter(
        working_dir, "and",
        sample + "23.temp", sample + "24.temp",
        [zygosity_col], ['=='], ['hom'])
    # columns 20, 21 and 22 are not 'hom' (original note: not hom in unrescued)
    filtering_annotated.filter(
        working_dir, "and",
        sample + "24.temp", sample + hom_snp_suffix,
        [20, 21, 22], ['!='] * 3, ['hom'] * 3)