def _step4b_draw_snps(self, draw, neff, ndraws, draw_selection, plink, emp, out):
    """
    Draw random sets of SNPs (based on the calculated effective SNPs).

    `emp` is first handed to common.getfile_handle (labelled
    "(--neff_calc)").  A drawrandom.Snp object is then built from `draw`,
    its `inregion` attribute is set to `draw_selection`, and its snp()
    method is called with (neff, ndraws, emp, plink, out).

    This method does not return: it ends the process with sys.exit(0).
    """
    common.getfile_handle(emp, "(--neff_calc)", False)
    import jag.drawrandom as random_draws

    log.info("Drawing random SNP sets (based on nEff)...")
    snp_drawer = random_draws.Snp(draw)
    snp_drawer.inregion = draw_selection
    snp_drawer.snp(neff, ndraws, emp, plink, out)

    log.info(_get_endtime())
    sys.exit(0)
def get_aa_results(self, plinkin, geneset, gene_based=False):
    """
    Return the association-analysis results as a list of cluster
    results objects.

    Sumlog files reported by self.in_and_out.get_sumlog_filenames() are
    reused when they exist on disk and their geneset_path equals
    `geneset`.  Otherwise the association analysis is (re)run through
    self.run_step1 and its "clusterresults" entries are used instead.
    """
    def run_association():
        # run_step1 returns a mapping whose values each hold a
        # "clusterresults" entry; only those entries are of interest here.
        per_pheno = self.run_step1(plinkin, geneset, gene_based)
        return [pheno_dict["clusterresults"] for pheno_dict in per_pheno.values()]

    sumlog_paths = self.in_and_out.get_sumlog_filenames()
    if not len(sumlog_paths) > 0:
        return run_association()

    collected = []
    for sumlog_path in sumlog_paths:
        if not os.path.exists(sumlog_path):
            # NOTE(review): the loop continues after this rerun, so several
            # missing files trigger several reruns -- confirm this is intended.
            collected.extend(run_association())
            continue

        sumlog_handle = common.getfile_handle(sumlog_path)
        stored = CR.Clusterresults()
        stored.read_formated_results(sumlog_handle)
        if stored.geneset_path == geneset:
            collected.append(stored)
            continue

        # The stored results belong to another geneset file: rerun once
        # and stop looking at the remaining sumlog files.
        log.info("sumlog file does not have identical path to geneset file\n" +
                 "Will run Association analysis again")
        collected.extend(run_association())
        break
    return collected
def _read_gene_boundaries(gene_boundaries_file, genes_in_groups):
    """
    Read the gene boundaries for the genes listed in `genes_in_groups`.

    Each line of the tab-separated file is matched on its first column.
    A matching gene is removed from `genes_in_groups` (the argument is
    modified in place) and stored as a dict with the keys geneid,
    chr_number, chrstart, chrend, chr_strand, symbol and snp_list.

    Returns a tuple (genes_with_boundaries, genes_without_boundaries),
    where the second item is the same `genes_in_groups` object, holding
    whatever was not found in the file.
    """
    boundaries_handle = common.getfile_handle(gene_boundaries_file)
    located_genes = []
    for record in boundaries_handle:
        fields = record.strip().split("\t")
        gene_id = fields[0]
        if gene_id not in genes_in_groups:
            continue
        genes_in_groups.remove(gene_id)
        located_genes.append({
            "geneid": gene_id,
            "chr_number": fields[1],
            "chrstart": int(fields[2]),
            "chrend": int(fields[3]),
            "chr_strand": fields[4],
            # NOTE(review): symbol is read from the chromosome column (1);
            # confirm whether a dedicated symbol column was intended.
            "symbol": fields[1],
            "snp_list": [],
        })
    return (located_genes, genes_in_groups)
def extract_permutated_scores(snp_to_group_map, resultfiles, seeds):
    """
    Extract permuted scores from assoc files.

    snp_to_group_map: mapping from SNP id to the group(s) it belongs to.
    resultfiles: mapping from phenotype to an iterable of assoc file names.
    seeds: passed to add_seeds() of every per-phenotype result object.

    For every phenotype a permutationresults.PermutatedResults object is
    filled: column 2 of each line is the SNP id, column 9 the P-value
    ("NA" is replaced by 1).  process_permutation() is called once per
    result file.

    Returns a dict mapping each phenotype to its PermutatedResults.
    """
    permuted_results = {}
    # .items() instead of the Python-2-only .iteritems(): identical
    # iteration on Python 2, and still available on Python 3.
    for pheno, files in resultfiles.items():
        clusterresults = permutationresults.PermutatedResults()
        for resultfile in files:
            filehandle = common.getfile_handle(resultfile)
            for line in filehandle:
                splitted = line.split()
                snp_id = splitted[1]
                p_value = splitted[8]
                if p_value == "NA":
                    # a missing P-value counts as "no association"
                    p_value = 1
                if snp_id in snp_to_group_map:
                    clusterresults.add_snp_to_cluster(p_value, snp_to_group_map[snp_id])
            clusterresults.process_permutation()
        clusterresults.add_seeds(seeds)
        permuted_results[pheno] = clusterresults
    return permuted_results
def _read_group_gene_mapping(infile):
    """
    Read the file with genes for gene mapping.

    Returns a list with one entry per line of `infile`; every entry is
    the list of tab-separated fields of that (stripped) line.
    """
    mapping_handle = common.getfile_handle(infile)
    return [row.strip().split("\t") for row in mapping_handle]
def _create_indep_snp_file(self, plink):
    """
    Create a list of independent SNPs.

    plink: a plink object (only needs a bfile).

    plink is run with "--indep-pairwise 200 5 0.25"; every SNP in the
    resulting .prune.in file is written out as "<snp>\\t<genes>", where
    <genes> is the comma-joined gene column from
    self.drawrandom.genemapping, or a single space when the SNP has no
    entry there.  Returns the name of the saved file.
    """
    log.info("\nPerforming LD based pruning...")
    plink.set_plink_arguments("--bfile " + plink.bfile + " --indep-pairwise 200 5 0.25")
    prune_file = plink.run_plink() + ".prune.in"

    # SNP id -> comma separated genes, taken from the gene mapping file
    genes_by_snp = {}
    for mapping_line in common.getfile_handle(self.drawrandom.genemapping):
        columns = mapping_line.strip().split()
        if not columns:
            continue  # empty line
        snp_id, gene_id = columns[0], columns[1]
        if snp_id in genes_by_snp:
            genes_by_snp[snp_id] += "," + gene_id
        else:
            genes_by_snp[snp_id] = gene_id

    # annotate every pruned SNP with its gene(s)
    annotated_lines = []
    for pruned_line in common.getfile_handle(prune_file):
        rs_number = pruned_line.strip()
        if rs_number in genes_by_snp:
            annotated_lines.append(rs_number + '\t' + str(genes_by_snp[rs_number]) + "\n")
        else:
            annotated_lines.append(str(rs_number) + "\t" + " " + "\n")

    saved_as = self.drawrandom.inoutput.save_text_to_filename("prune.in", "".join(annotated_lines))
    log.info("Saved pruned SNP file as " + saved_as)
    return saved_as
def _read_snp_file(snp_data_file, all_chromosomes):
    """
    Read the snp_data file, skipping chromosomes that are not present in
    `all_chromosomes`.

    Columns (tab separated) are read as: SNP id, chromosome, position.
    Returns a dict with one key per entry of `all_chromosomes`; each
    value is a list of (position, snp_id) tuples in file order.  Lines
    with too few columns or a non-integer position are logged and
    skipped.
    """
    snps_per_chr = dict((chr_name, []) for chr_name in all_chromosomes)
    snp_lines = common.getfile_handle(snp_data_file)
    announced_chr = ""  # last chromosome a message was logged for
    for raw_line in snp_lines:
        try:
            fields = raw_line.strip().split("\t")
            chromosome = fields[1]
            is_new_chr = not announced_chr == chromosome
            if chromosome in all_chromosomes:
                if is_new_chr:
                    log.info("Mapping SNPs to genes on chromosome " + str(chromosome))
                    announced_chr = chromosome
                snp_id = fields[0]
                try:
                    position = int(fields[2])
                    snps_per_chr[chromosome].append((position, snp_id))
                except ValueError:
                    log.info("Ignoring following line in SNP file: " + "\t".join(fields))
            elif is_new_chr:
                log.info("There are no genes on chromosome " + chromosome)
                announced_chr = chromosome
        except IndexError:
            log.info("indexerror" + str(raw_line))
        except ValueError:
            log.info(raw_line)
    log.info("")
    return snps_per_chr
def _gene2snpmapping(self):
    """
    Create a gene to SNP mapping from the file self.drawrandom.genemapping.

    The key is a gene (second tab-separated column) and the value is the
    list of SNPs (first column) found for that gene, in file order.
    Empty lines are ignored.
    """
    snps_by_gene = {}
    for record in common.getfile_handle(self.drawrandom.genemapping):
        columns = record.strip().split("\t")
        if len(record.strip()) == 0:
            continue  # nothing on this line
        snps_by_gene.setdefault(columns[1], []).append(columns[0])
    return snps_by_gene
def mergeresults(self):
    """
    Merge multiple permutation files into one file per phenotype.

    For every phenotype (in common.alpha_sort order) the concatenated
    permutations are saved as "merged.P<pheno>.perm".  When a sumlog file
    is known for the phenotype, the empirical P-values are saved as
    "merged.P<pheno>.empp", both file names are recorded in
    self.files[pheno], and a distribution plot is drawn if
    self.inout.run_rproject is set.  Otherwise a warning is logged.

    This method does not return: it ends the process with sys.exit().
    """
    # load sumlog files to create sumlog files
    aa_result = self._load_sumlog_files()
    ordered_results = self._get_ordered_results()
    perm_files = self._get_ordered_files()

    # sorted() instead of keys().sort(): dict.keys() is not a list on
    # Python 3, while sorted() gives the same order on both versions.
    for key in sorted(ordered_results.keys(), key=common.alpha_sort):
        log.info("\nMerging " + str(len(perm_files[key])) +
                 " permutation files for phenotype " + key + "...")
        perm_out_string = ordered_results[key].format_permout()  # concatenated results
        perm_filename = "merged.P" + key + ".perm"
        perm_out_filename = self.inout.save_text_to_filename(perm_filename, perm_out_string)
        log.info("Saved merged permutations as " + perm_out_filename)

        # save empirical P file if sumlog file is found
        # (`in` replaces the Python-2-only dict.has_key)
        if key in aa_result:
            aa_object = Clusterresults()
            aa_object.read_formated_results(common.getfile_handle(aa_result[key]))
            empp_out_as_text = ordered_results[key].format_permutated_results(aa_object)
            emp_filename = "merged.P" + key + ".empp"
            empp_filename = self.inout.save_text_to_filename(emp_filename, empp_out_as_text)
            log.info("Saved empirical pvalues as " + empp_filename)

            # call R for distribution plot
            self.files[key] = {"perm": perm_out_filename, "empp": empp_filename}
            if self.inout.run_rproject:
                import jag.plot_with_r as plot_with_r
                plotter = plot_with_r.call_r(self.inout)
                plotter.draw_dist_plot(self.files, key)
        else:
            log.info("\nWarning: Could not find sumlog file " +
                     self.inout.out + ".P" + key + ".sumlog")

    log.info(common.get_terminated_time())
    sys.exit()
def map_p_values_from_assoc_file(self, snptogroup, assoc_file, adjusted=False):
    """
    Extract P-values from an assoc file and add them, based on the
    `snptogroup` mapping, to the right gene-set.

    The SNP id is read from the second whitespace-separated column; the
    P-value from the fourth column when `adjusted` is true, otherwise
    from the ninth.  SNPs missing from `snptogroup` are skipped.
    """
    p_value_column = 3 if adjusted else 8
    for record in common.getfile_handle(assoc_file):
        columns = record.split()
        snp_id = columns[1]
        p_value = columns[p_value_column]
        if snp_id not in snptogroup:
            continue
        self.add_snp_to_cluster_all_info(p_value, snp_id, snptogroup[snp_id])
def read_permutated_results(permfile):
    """
    Read the permutated results of a permutation file.

    Returns (header, results): `header` is the list of tab-separated
    names on the first line, `results` holds one list per header column.
    The first column is kept as text, all other columns are converted
    to float.
    """
    perm_handle = common.getfile_handle(permfile)
    header = perm_handle.readline().strip().split("\t")
    columns = [[] for _ in header]
    for row in perm_handle:
        cells = row.strip().split("\t")
        columns[0].append(cells[0])
        for position, cell in enumerate(cells[1:], 1):
            columns[position].append(float(cell))
    return (header, columns)
def _read_empp_results(file_name):
    """
    Read an empp file and return nEff as a dict with the geneset name
    (first tab-separated column) as key and the integer nEff as value.

    Exits through sys.exit with a message when the header line has no
    "nEff" column.
    """
    empp_handle = common.getfile_handle(file_name)
    column_names = empp_handle.readline().strip().split("\t")
    if "nEff" not in column_names:
        sys.exit("nEff column not found in header of " + file_name)
    neff_position = column_names.index("nEff")

    neff_by_geneset = {}
    for row in empp_handle:
        cells = row.strip().split("\t")
        neff_by_geneset[str(cells[0])] = int(cells[neff_position])
    return neff_by_geneset
def read_permout(self, files):
    """
    Read multiple permout files into this permutation results object.

    Every file needs a tab-separated header whose last column is named
    "seed" (otherwise the process exits through sys.exit).  All values
    are converted to float; each non-seed column is handed to
    self.add_results under its header name and the seed column to
    self.add_seeds.
    """
    for permfile in files:
        all_lines = common.getfile_handle(permfile).readlines()
        column_names = all_lines[0].strip().split("\t")

        # the seed has to be the last column
        if column_names[-1] != "seed":
            sys.exit("last column of " + permfile + " does not have the \"seed\"")

        # one list per column
        columns = [[] for _ in column_names]
        for data_line in all_lines[1:]:
            for position, cell in enumerate(data_line.strip().split("\t")):
                columns[position].append(float(cell))

        # store every column but the last as results, the last as seeds
        for name, values in zip(column_names[:-1], columns):
            self.add_results(name, values)
        self.add_seeds(columns[len(column_names) - 1])
def __init__(self, args):
    """
    Parse the command line options in `args` and run the requested step.

    The options are walked three times:
      1. to derive the output prefix (needed before the log file opens),
      2. to echo them to the log (-h/--help prints the usage and exits
         with status 2),
      3. to validate them and store their values.

    Afterwards one of the following is run, depending on the options:
    merging of results, the empirical-P-of-empirical-P calculation,
    gene annotation, random draws of genes/SNPs, or the association
    analysis with optional permutations.

    Single-name options are compared with `==`.  The former
    `identifier in ("--name")` form tested against a plain string, i.e.
    a substring test, which made "-p" match "--pool" so that
    -p/--pheno was never reached.
    """
    opts = self.extract_variables_from_command_line(args)
    from jag.plink import Plink
    plink = Plink()

    perm = None
    seed = None
    # group starts as None so that a missing --set is reported by the
    # checks below instead of raising UnboundLocalError.
    group = None
    annotate_file = False
    up = float(0)
    down = float(0)
    gene_loc_file = None
    snp_loc = None
    no_emp = False
    create_plots = True
    draw_ngenes = False
    empirical_p_filename = False
    ndraws = 0
    draw_selection = None
    exclude_group = True
    complete_gene_snp_mapping = None
    verbose = False
    gene_based = False
    merge = False
    adjusted = False
    orig = False
    sims = False
    set_name = False
    out_prefix = "jag"

    # First pass: only work out the prefix of the log/output files.
    for identifier, assigned_value in opts:
        if identifier in ("-o", "--out"):
            out_prefix = assigned_value
        elif identifier in ("-s", "--set"):
            match_draw = re.match(r"\w+(\.draws_n\w*)\.set.annot$", assigned_value)
            if match_draw is not None:
                out_prefix = out_prefix + match_draw.group(1)
        elif identifier == "--gene_based":
            gene_based = True
            out_prefix = out_prefix + ".gene_based"

    # open log file
    from jag.file_fetch_and_write import InAndOut
    inoutput = InAndOut()
    inoutput.set_outfile(out_prefix)
    self.enable_logging_with_prefix(inoutput)
    log.info(_get_header_of_program())
    log.info("Save logfile as [" + inoutput.out + "log]\n")
    log.info(_get_starttime())

    # Second pass: echo the options; help short-circuits everything else.
    log.info("\nUsed options:")
    for o, a in opts:
        log.info("\t" + o + " " + a)
        if o in ("-h", "--help"):
            log.info("\nPrinting help documentation...")
            log.info(_usage())
            print(_get_terminated_time())
            sys.exit(2)

    # Third pass: validate and store every option.
    if len(opts) == 0:
        _usage()
    else:
        for identifier, assigned_value in opts:
            if identifier in ("-o", "--out"):
                # this stuff is printed to catch the out for the
                # location of the log file
                out_prefix = assigned_value
            elif identifier in ("-s", "--set"):
                assert common.getfile_handle(assigned_value, "(--set)", verbose)
                group = assigned_value
                plink.group = assigned_value
            elif identifier in ("-m", "--perm"):
                perm = int(assigned_value)
            elif identifier in ("-v", "--verbose"):
                plink.verbose = True
                verbose = True
            elif identifier == "--snp_loc":
                snp_loc = assigned_value
                assert common.getfile_handle(assigned_value,
                                             "(--snp_loc)", verbose)
            elif identifier == "--control_empp":
                assert common.getfile_handle(assigned_value,
                                             "(--control_empp)", verbose)
                sims = assigned_value
            elif identifier == "--orig_empp":
                assert common.getfile_handle(assigned_value,
                                             "(--orig_empp)", verbose)
                orig = assigned_value
            elif identifier == "--gene_set":
                set_name = assigned_value
            elif identifier == "--no_emp":
                no_emp = True
                create_plots = False
            elif identifier == "--no_plots":
                create_plots = False
            elif identifier == "--ndraw":
                ndraws = int(assigned_value)
            elif identifier == "--draw_ngenes":
                draw_ngenes = assigned_value
            elif identifier == "--snp2gene":
                assert common.getfile_handle(assigned_value,
                                             "(--snp2gene)", verbose)
                annotate_file = assigned_value
            elif identifier == "--up":
                try:
                    up = float(assigned_value)
                except ValueError:
                    log.info("Value after --up parameter should be a number")
                    sys.exit(1)
            elif identifier == "--down":
                try:
                    down = float(assigned_value)
                except ValueError:
                    log.info("Value after --down parameter should be a number")
                    sys.exit(1)
            elif identifier == "--gene_loc":
                assert common.getfile_handle(
                    assigned_value,
                    "Cannot find file with gene boundaries (set by --gene_loc)",
                    verbose)
                gene_loc_file = assigned_value
            elif identifier == "--pool":
                complete_gene_snp_mapping = assigned_value
            elif identifier == "--draw_neff_genic":
                draw_selection = "genic"
                set_name = assigned_value
            elif identifier == "--draw_neff_intergenic":
                draw_selection = "intergenic"
                set_name = assigned_value
            elif identifier == "--draw_neff_all":
                draw_selection = "all"
                set_name = assigned_value
            elif identifier == "--covar":
                assert common.getfile_handle(assigned_value,
                                             "(--covar;", verbose)
                plink.covar_file = assigned_value
            elif identifier == "--linear":
                plink.switches += "--linear "
            elif identifier == "--logistic":
                plink.switches += "--logistic "
            elif identifier == "--adjust":
                plink.switches += "--adjust "
                adjusted = True
            elif identifier == "--exclude":
                exclude_group = True
            elif identifier == "--include":
                exclude_group = False
            elif identifier == "--neff_calc":
                assert common.getfile_handle(assigned_value, "(--neff_calc)", verbose)
                empirical_p_filename = assigned_value
            elif identifier in ("-b", "--bfile"):
                plink.bfile = common.check_bim_bed_fam(assigned_value)
            elif identifier in ("-p", "--pheno"):
                assert common.getfile_handle(assigned_value,
                                             "(-p or --pheno)", verbose)
                plink.pheno_file = assigned_value
            elif identifier == "--seed":
                seed = assigned_value
            elif identifier == "--gene_based":
                gene_based = True
            elif identifier == "--merge":
                # merging writes to its own prefix
                inoutput = InAndOut()
                inoutput.set_outfile(assigned_value)
                merge = True
            else:
                assert False, "unhandled option"
                # only reached when assertions are disabled (python -O)
                print(_get_terminated_time())
                sys.exit(2)

    log.info("")
    inoutput.run_rproject = create_plots

    if merge:
        self._step3_merge_results(inoutput)
    if orig or sims:
        self._calc_emp_p_of_emp_p(orig, sims, set_name)
        sys.exit(0)

    if annotate_file and gene_loc_file and snp_loc:
        self._step0_annotate_genes(inoutput, annotate_file, up, down,
                                   gene_loc_file, snp_loc)
    elif draw_ngenes or draw_selection:
        common.getfile_handle(complete_gene_snp_mapping,
                              "--pool is not set correctly.", verbose)
        common.getfile_handle(group, "--set is not set.", verbose)
        import jag.drawrandom as drawrandom
        draw = drawrandom.DrawRandom(complete_gene_snp_mapping, group, inoutput)
        draw.exclude = exclude_group
        if seed:
            draw.setseed(seed)
        if draw_ngenes:
            self._step4a_draw_genes(draw, draw_ngenes, ndraws)
        if draw_selection:
            self._step4b_draw_snps(draw, set_name, ndraws, draw_selection,
                                   plink, empirical_p_filename, out_prefix)
    elif perm is not None and perm > 0 and plink.bfile and group:
        # Run Step 1 + Step 2 (perm stays None when -m/--perm is absent;
        # the explicit None test keeps the comparison valid on Python 3)
        self._step1_run_association_analysis(group, plink, inoutput,
                                             gene_based, adjusted)
        self._step2_run_permutations(group, perm, seed, no_emp, plink,
                                     inoutput, gene_based)
    elif perm == 0:
        # Run only association analysis
        self._step1_run_association_analysis(group, plink, inoutput,
                                             gene_based, adjusted)
        log.info("\nNo permutations will be proceeded since number of permutations is zero.")
    else:
        log.info("You are using an invalid combination of parameters.\nCheck the help file for the correct combination.")
    log.info(_get_endtime())
def snp(self, geneset, amount, empp_file, plink, out):
    """
    Draw random SNPs, on the number of nEff SNPs in the .empp file, from
    an independent SNP file (.prune.in).  If that file is not present in
    the current directory it is created by _create_indep_snp_file.

    geneset:   name of the geneset whose nEff determines the draw size.
    amount:    number of random sets ("Draw_1" .. "Draw_<amount>").
    empp_file: .empp file read through _read_empp_results.
    plink:     plink object, only used when the pruned file is created.
    out:       output prefix, used to locate "<out>.prune.in".

    The draws are saved as "draws_neff_<region>.set.annot"; that bare
    file name is returned.  The process exits through sys.exit() when
    the geneset is unknown, when it is missing from the empp file, or
    when the SNP pool of the selected region is too small.
    """
    geneset_to_snp_mapping = common.map_geneset_to_snp(self.drawrandom.snpmapping)
    if geneset not in geneset_to_snp_mapping:
        log.info("\n" + geneset + " is not known as a geneset name. Please check your data for correct geneset name.")
        log.info(_get_terminated_time())
        sys.exit()
    self.exclude_snps = set(x["s"] for x in geneset_to_snp_mapping[geneset])

    # A geneset that is absent from the empp mapping raises KeyError;
    # ValueError is still caught as before.
    try:
        n_snp = _read_empp_results(empp_file)[geneset]
    except (KeyError, ValueError):
        log.info("geneset to select is not found in empp file")
        log.info(_get_terminated_time())
        sys.exit()

    # get the pruned.in file, creating it if it does not exist
    prunedin_file = str(os.path.abspath(os.path.curdir)) + "/" + out + ".prune.in"
    if os.path.exists(prunedin_file):
        log.info("\nUsing the pruned SNP set from " + prunedin_file)
        prunedin = prunedin_file
    else:
        prunedin = self._create_indep_snp_file(plink)
    allsnp_and_genes = common.getfile_handle(prunedin).readlines()  # pylint: disable=E1103

    # A SNP counts as genic when its second (gene) column is not blank.
    length_genic = 0
    length_nongenic = 0
    for line in allsnp_and_genes:
        gene = line.split('\t')[1].split()
        if len(gene) != 0:
            length_genic += 1
        else:
            length_nongenic += 1
    amount_allsnp_and_genes = len(allsnp_and_genes) - 1

    # Validate the pool size BEFORE drawing, so an undersized pool stops
    # the run without first performing draws that would be discarded.
    pools = {
        "genic": (length_genic, "within genes"),
        "intergenic": (length_nongenic, "outside genes"),
        "all": (len(allsnp_and_genes), "in- and outside genes"),
    }
    inregion_text = "unknown_in_region"
    if self.inregion in pools:
        inregion_text = self.inregion
        pool_size, location = pools[self.inregion]
        draw_size = str(amount) + " x " + str(n_snp)
        if amount * n_snp > pool_size:
            log.info("\nWarning: pool of " + str(pool_size) + " SNPs located " + location +
                     " is to small to draw " + draw_size + " independent nEff SNPs!")
            log.info(_get_terminated_time())
            sys.exit()
        log.info("\nDrawing " + draw_size + " nEff SNPs from a pool of " +
                 str(pool_size) + " SNPs located " + location)

    # Draw `amount` sets of n_snp SNPs; the set of already accessed SNPs
    # is reset for every new set.
    drawn_lines = []
    for new_group_number in range(1, amount + 1):
        self.drawrandom.accessed = set()
        setname = "Draw_" + str(new_group_number)
        for _ in range(n_snp):
            drawn = self._get_random_snp(allsnp_and_genes, amount_allsnp_and_genes)
            drawn_lines.append("\t".join(drawn) + "\t" + setname + "\n")
    random_snp_text = "".join(drawn_lines)

    filename = "draws_neff_" + inregion_text + ".set.annot"
    # save random snps file
    out = self.drawrandom.inoutput.save_text_to_filename(filename, random_snp_text)
    log.info("\nSaved random draws on number of effective number of SNPS as " + out)
    return filename