def find_leastP_SNPs(gene_region_list, table_loc, out_base, freq_loc, keep_loc, table_type, annot_dict):
    '''
    Write one summary row per region and collect the conditioning SNP lists.

    For every entry of gene_region_list, read_table() supplies a gene_info
    dict, a list of "zero" rows and an optional lw_snp. The function then:
      * appends gene_info['lz'] to the keep file,
      * dumps the zero rows (if any) to '<out_base>_yank_<ID>.txt',
      * adds 'maf', 'control_a1' and 'control_a2' to gene_info, looked up by
        gene_info['snp'] first and gene_info['lz'] second ('NA' when both
        lookups return None),
      * appends the ORDER_LIST columns of gene_info to '<out_base>_yank.tbl',
      * threads z_list through create_condition_list().

    Args:
        gene_region_list -- iterable of regions, each handed to read_table()
        table_loc -- passed through to read_table() and create_condition_list()
        out_base -- prefix shared by every output path
        freq_loc -- passed to pc_toolbox.retrieve_freq()
        keep_loc -- path that receives one gene_info['lz'] value per line
        table_type -- passed through to read_table()
        annot_dict -- passed through to read_table()

    Returns:
        (z_list, lw_list) -- z_list as last returned by
        create_condition_list(); lw_list holds every lw_snp that was not None.
    '''
    title_line = '\t'.join(TITLE_LIST)
    zero_line = '\t'.join(['snp_name', 'locuszoom_snp', 'p-value', '|z-score|', 'weight'])
    z_list = []
    lw_list = []
    out_file = out_base + '_yank.tbl'

    # Context managers guarantee that the keep file and the table are closed
    # even when read_table() or a frequency lookup raises part-way through.
    with open(keep_loc, mode="w") as keep:
        # The header is written and the file closed before rows are appended,
        # so the header is on disk by the time create_condition_list() is
        # handed out_file.
        with open(out_file, mode="w") as out_text:
            out_text.write(title_line + '\n')

        with open(out_file, mode="a") as out_text:
            for gene in gene_region_list:
                gene_info, zero_list, lw_snp = read_table(table_loc, gene, table_type, annot_dict)
                keep.write(gene_info['lz'] + '\n')

                if not zero_list == []:
                    zero_path = out_base + '_yank_' + gene_info['ID'] + '.txt'
                    with open(zero_path, mode='w') as zeros:
                        zeros.write(zero_line + '\n')
                        for zero in zero_list:
                            zeros.write('\t'.join(zero) + '\n')

                # Look the SNP up by its primary name, then by its LocusZoom name.
                freq_info = pc_toolbox.retrieve_freq(freq_loc, gene_info['snp'])
                if freq_info is None:
                    freq_info = pc_toolbox.retrieve_freq(freq_loc, gene_info['lz'])
                if freq_info is None:
                    gene_info.update({'maf': 'NA', 'control_a1': 'NA', 'control_a2': 'NA'})
                else:
                    gene_info.update({'maf': freq_info[0],
                                      'control_a1': freq_info[1],
                                      'control_a2': freq_info[2]})

                # Debug dump, emitted once per region (it used to be repeated
                # for every output column).
                print(gene_info)
                # Every column, including the last, is followed by a tab; the
                # trailing tab is part of the existing file format.
                for item in ORDER_LIST:
                    out_text.write(str(gene_info[item]) + '\t')
                out_text.write('\n')

                if lw_snp is not None:
                    lw_list.append(lw_snp)
                z_list = create_condition_list(gene_info, table_loc, out_file, z_list)

    print(z_list)
    print(lw_list)
    return z_list, lw_list
def update_with_freq(freq_loc, snp_info, pop, aa_freq_loc=None):
    '''
    Add control allele-frequency fields to snp_info and return it.

    When pop is 'UK' and no aa_freq_loc is given, the keys 'maf',
    'control_a1' and 'control_a2' are filled from
    pc_toolbox.retrieve_freq(), queried with snp_info['snp'] first and
    snp_info['lz'] second. They are set to 'NA' when snp_info['snp'] is 'NA'
    (no lookup is attempted) or when both lookups return None.

    In every other case the work is delegated to update_with_aa_freq().

    Args:
        freq_loc -- passed to pc_toolbox.retrieve_freq()
        snp_info -- dict with at least the keys 'snp' and 'lz'; updated in place
        pop -- population label; only 'UK' selects the single-population path
        aa_freq_loc -- optional second frequency source; when given, the
                       update_with_aa_freq() path is always taken

    Returns:
        The (mutated) snp_info dict.
    '''
    # Debug output kept from the original implementation.
    print(snp_info)
    print("3")

    if pop != 'UK' or aa_freq_loc is not None:
        return update_with_aa_freq(aa_freq_loc, freq_loc, snp_info)

    freq_info = None
    if snp_info['snp'] != 'NA':
        freq_info = pc_toolbox.retrieve_freq(freq_loc, snp_info['snp'])
        if freq_info is None:
            # Fall back to the LocusZoom name of the SNP.
            freq_info = pc_toolbox.retrieve_freq(freq_loc, snp_info['lz'])

    if freq_info is None:
        snp_info.update({'maf': 'NA', 'control_a1': 'NA', 'control_a2': 'NA'})
    else:
        snp_info.update({'maf': freq_info[0],
                         'control_a1': freq_info[1],
                         'control_a2': freq_info[2]})
    return snp_info
def _prefixed_freq_fields(prefix, freq_info):
    '''Build the '<prefix>_maf' / '<prefix>_control_a1' / '<prefix>_control_a2'
    entries from a frequency record, using 'NA' when the record is None.'''
    if freq_info is None:
        maf, a1, a2 = 'NA', 'NA', 'NA'
    else:
        maf, a1, a2 = freq_info[0], freq_info[1], freq_info[2]
    return {prefix + '_maf': maf,
            prefix + '_control_a1': a1,
            prefix + '_control_a2': a2}


def update_with_aa_freq(aa_freq_loc,uk_freq_loc, snp_info):
    '''
    Add AA_* and UK_* control allele-frequency fields to snp_info.

    Each frequency source is queried through pc_toolbox.retrieve_freq() with
    snp_info['snp'] first and snp_info['lz'] second; when both lookups return
    None the three fields of that source are set to 'NA'.

    When snp_info['snp'] is 'NA' no lookup is attempted and all six fields
    are set to 'NA'. (Previously only the AA_* fields were set on this path,
    leaving the dict without the UK_* keys that every other path provides.)

    Args:
        aa_freq_loc -- frequency source for the AA_* fields
        uk_freq_loc -- frequency source for the UK_* fields
        snp_info -- dict with at least the keys 'snp' and 'lz'; updated in place

    Returns:
        The (mutated) snp_info dict.
    '''
    # Debug output kept from the original implementation.
    print(snp_info)
    print("3")

    sources = (('AA', aa_freq_loc), ('UK', uk_freq_loc))

    if snp_info['snp'] == 'NA':
        for prefix, _ in sources:
            snp_info.update(_prefixed_freq_fields(prefix, None))
        return snp_info

    for prefix, loc in sources:
        freq_info = pc_toolbox.retrieve_freq(loc, snp_info['snp'])
        if freq_info is None:
            # Fall back to the LocusZoom name of the SNP.
            freq_info = pc_toolbox.retrieve_freq(loc, snp_info['lz'])
        snp_info.update(_prefixed_freq_fields(prefix, freq_info))

    print(snp_info)
    return snp_info
def read_log(log_summary,log_file,freq_loc,map_loc, repair_loc,annot_dict):
    '''
    Extract relevant information from log file and append it to a summary file.

    The log is scanned line by line and each line is recognised by its
    prefix:
        '$$$'  range information (parsed by mb_range)
        '%%%'  start of one conditioning loop (parsed by snp_p_loop)
        '&&&'  one SNP of the current condition list
        ':::'  final loop count
        'Found:'  LocusZoom's reference SNP; this triggers writing one row
        'Regression Coefficient' / 'Odds Ratio', 'Coefficient T-Statistic',
        'A1:', 'Confidence Interval'  statistics of the current loop

    Args:
        log_summary -- path to which summary rows are appended
        log_file -- path to log file which has been formatted as described above
        freq_loc -- passed to pc_toolbox.retrieve_freq()
        map_loc -- not used (the map-file lookup it served is disabled)
        repair_loc -- not used (see map_loc)
        annot_dict -- mapping from SNP name to an object whose .name attribute
                      is written to the 'im' column

    Returns:
        Nothing.
    '''
    # placeholder for every value that has not been read from the log (yet)
    BLANK = '--'
    # column order of a summary row
    order_list = ['chr','band','ref','start','end','flag','clist','im','sig','pval','OR','lo',
                  'hi','t','aa1','maf','ma1','ma2','lz','ca','total']
    # columns wiped after each row; the range columns persist between rows
    reset_list = ['clist','im','sig','pval','OR','lo',
                  'hi','t','aa1','maf','ma1','ma2','lz','ca']
    output_dict = {'chr':'chr','band':'band','ref':'refgene','start':'start','end':'end','flag':'--',
                   'clist':BLANK,'im':BLANK,'sig':BLANK,'pval':BLANK,
                   'OR':BLANK,'lo':BLANK, 'hi':BLANK,'t':BLANK,'aa1':BLANK,'maf':BLANK,'ma1':BLANK,'ma2':BLANK,
                   'lz':BLANK,'ca':BLANK,'total':'--'}
    condition_list = []

    def write_row(summary):
        # Echo the current row and append it to the summary file.
        dict_list = [output_dict[key] for key in order_list]
        print(dict_list)
        summary.write('\t'.join(dict_list)+'\n')

    with open(log_file, mode='r') as log:
        with open(log_summary, mode='a') as summary:
            for logline in log:
                #if logline begins '$$$', it contains the range info
                if logline.startswith('$$$'):
                    range_info = mb_range(logline)
                    output_dict['band'] = range_info.band
                    output_dict['chr'] = range_info.chro
                    output_dict['ref'] = range_info.gene
                    output_dict['start'] = range_info.start
                    output_dict['end'] = range_info.end
                    output_dict['flag'] = range_info.flag

                #if logline begins '%%%', info pertains to loop through plink_association.py
                if logline.startswith('%%%'):
                    #if there is unprinted (and thus unwiped) loop information
                    #in output_dict, LocusZoom did not spit out
                    #a 'Found: ' phrase with reference SNP name. This is bad news!
                    if not output_dict['sig'] == BLANK:
                        output_dict['lz'] = 'ERROR'
                        print('ERROR: LZ did not announce a reference SNP!')
                        print(output_dict)
                        write_row(summary)
                    #gather info pertaining to this loop through plink
                    snp_pv_loop = snp_p_loop(logline)
                    output_dict['sig'] = snp_pv_loop[0]
                    output_dict['pval'] = snp_pv_loop[1]
                    output_dict['ca'] = snp_pv_loop[2]
                    #look up the allele frequency of the snp in the freq file;
                    #a SNP missing from the freq file (lookup returns None)
                    #keeps the blank placeholder instead of aborting the run
                    freq_info = pc_toolbox.retrieve_freq(freq_loc, snp_pv_loop[0])
                    if freq_info is None:
                        maf = ma1 = ma2 = BLANK
                    else:
                        maf, ma1, ma2 = freq_info
                    output_dict['maf'] = maf
                    output_dict['ma1'] = ma1
                    output_dict['ma2'] = ma2
                    #look up the original Immunochip SNP name in the annotation
                    #(was misspelled 'anp_pv_loop', which raised NameError)
                    output_dict['im'] = annot_dict[snp_pv_loop[0]].name
                #if logline begins with Odds Ratio etc.
                elif logline.startswith(('Regression Coefficient', 'Odds Ratio')):
                    output_dict['OR'] = logline.strip().partition(':')[2]
                elif logline.startswith('Coefficient T-Statistic'):
                    output_dict['t'] = logline.strip().partition(':')[2]
                elif logline.startswith('A1:'):
                    output_dict['aa1'] = logline.strip().partition(':')[2]
                #if logline begins 'Found: ', obtain name of LZ's reference SNP
                elif logline.startswith('Found:'):
                    output_dict['lz'] = one_value(logline)
                    if len(condition_list) == 0:
                        output_dict['clist'] = BLANK
                    else:
                        output_dict['clist'] = '['+','.join(condition_list)+']'
                    write_row(summary)
                    condition_list = []
                    #if output_dict['total'] is '--',
                    #then this is NOT last loop through plink_association.py.
                    #(if it were, p_a wouldn't give a ref SNP, and LZ would,
                    #leading to an expected mismatch.)
                    if output_dict['total'] == '--':
                        if not output_dict['lz'] == output_dict['sig']:
                            print('ERROR: plink_association and Locus Zoom did not identify the same reference SNP!')
                            summary.write('ERROR: PLINK_ASSOCIATION AND LOCUS ZOOM IDENTIFIED DIFFERENT REFERENCE SNPs! \n')
                    output_dict['total'] = '--'
                    for key in reset_list:
                        output_dict[key] = BLANK
                #if logline begins ':::', this is final loop through p_a
                elif logline.startswith(':::'):
                    final_loopcount = one_value(logline)
                    output_dict['total'] = final_loopcount
                    output_dict['ca'] = final_loopcount
                #if logline begins '&&&', it contains a snp in condition list
                elif logline.startswith('&&&'):
                    condition_list.append(one_value(logline))
                    print(condition_list)
                elif logline.startswith('Confidence Interval'):
                    want = logline.partition(':')[2].strip()
                    # NOTE(review): splitting on the first '-' mis-parses an
                    # interval whose lower bound is negative -- confirm the
                    # log format never produces one.
                    lo, _, hi = want.partition('-')
                    output_dict['lo'] = lo.strip()
                    output_dict['hi'] = hi.strip()
def _conditioning_banner(snp, index, total):
    '''Return the banner announcing which SNP the data is conditioned on.

    The '%%% <snp>' marker sits at the start of its own line because log
    parsers such as read_log() recognise it with startswith('%%%').
    '''
    lines = ['',
             '**********************************************************************',
             '************************************************************',
             'The data will be conditioned on the following SNP:',
             '%%% {0}',
             'This is snp #{1} in a list of {2}.',
             '******************************************************',
             '**********************************************************************',
             '']
    return '\n'.join(lines).format(snp, index, total)


def main(argv):
    '''
    Condition the association on every SNP of the region and tabulate how
    the reference SNP (snpstar) behaves under each condition.

    Command-line handling is done by cl_arguments(), which is expected to
    populate the module-level globals declared below. For each SNP returned
    by make_snp_pos_list() a plink script is written and run (unless this is
    a multi run that is not flagged hit1), the snpstar row is pulled from
    the resulting '.assoc.logistic' file, and one row is appended to
    '<outfolder>/chr<N>/<region_id><placeholder>.tbl'. The finished table is
    copied to ResultTables and summarised into SummaryTables.

    Args:
        argv -- argument list handed to cl_arguments()

    Returns:
        Nothing.
    '''
    global outfolder, assoc, chromosome,snpstar
    global out_flag, user_script_loc
    global range_start_bp, range_end_bp, hit_index, hitstring
    global region_id, build, single, ldfolder, freq_loc, multi, hit1
    cl_arguments(argv)

    annot_dict_loc = fix_it.locate_annot_dict(build)
    annot_dict = fix_it.build_annot_dict('LOG',annot_dict_loc)
    snpstar_im = annot_dict[snpstar].name

    # zero-pad single-digit hit indices so file names sort naturally
    str_hit_index = str(hit_index)
    if hit_index < 10:
        str_hit_index = '0'+str_hit_index
    placeholder = ''
    if multi:
        placeholder = '_'+snpstar+'_'+str_hit_index

    table_folder = os.path.join(outfolder, 'ResultTables')
    summary_folder = os.path.join(outfolder, 'SummaryTables')
    chr_folder = os.path.join(outfolder, 'chr{0}'.format(chromosome))
    reg_folder = os.path.join(chr_folder, region_id)
    for folder in (table_folder, summary_folder, chr_folder, reg_folder):
        if not os.path.exists(folder):
            os.makedirs(folder)

    super_outbase = os.path.join(reg_folder, region_id)
    list_loc = super_outbase + '_snps.list'
    table_name = region_id+placeholder+'.tbl'
    table_loc = os.path.join(chr_folder, table_name)
    new_table_loc = os.path.join(table_folder, table_name)
    summary_table_loc = os.path.join(summary_folder, table_name)
    print(table_loc)

    snplist = make_snp_pos_list(assoc,list_loc)
    ld_loc = os.path.join(ldfolder,'chr{0}'.format(chromosome),
                          '{0}_r2_0.ld'.format(region_id))

    # start a fresh table holding only the header line
    with open(table_loc, mode="w") as table:
        table.write('\t'.join(['SNP*','SNP*_pos','SNP*_im','conditional_snp',
                               'csnp_im','csnp_pos','SNP*_pvalue','OR','ci_lo','ci_hi',
                               'a1',"r2","csnp_freq","csnp_freq_a1"])+'\n')

    total = len(snplist)
    for index, snp_tuple in enumerate(snplist, 1):
        snp = snp_tuple[0]
        snp_pos = snp_tuple[1]
        snp_im = annot_dict[snp].name
        # ':' is replaced so the SNP name can be used inside a file name
        outbase = super_outbase+'_'+snp.replace(':','_')
        script_loc = outbase + '.script'
        assoc_out = outbase +'.assoc.logistic'

        # hit1 runs pass hitstring to write_script, plain single runs pass
        # snpstar; multi runs without hit1 skip plink altogether.
        if hit1 or not multi:
            last_arg = hitstring if hit1 else snpstar
            write_script(outbase,script_loc, user_script_loc, snp, single, last_arg)
            plink(script_loc)
            print(_conditioning_banner(snp, index, total))

        info = filter_result(assoc_out,snpstar)
        snp_freq = pc_toolbox.retrieve_freq(freq_loc, snp)
        if snp_freq is None:
            # SNP absent from the frequency file: write placeholders rather
            # than failing on the subscript below.
            snp_freq = ('NA', 'NA')
        r2 = pc_toolbox.retrieve_r2(snpstar,snp,ld_loc)
        if r2 is None:
            r2 = "???"

        with open(table_loc, mode='a') as table:
            table.write('\t'.join([snpstar,info.pos, snpstar_im,
                                   snp,snp_im,snp_pos,str(info.p),
                                   info.OR,info.lo,info.hi,
                                   info.a1,r2,snp_freq[0],snp_freq[1]])+'\n')

    shutil.copy(table_loc, new_table_loc)
    summarize_table(new_table_loc, summary_table_loc, snpstar)