示例#1
0
文件: jag_main.py 项目: dposthuma/jag
    def _step4b_draw_snps(self, draw, neff, ndraws, draw_selection, plink, emp, out):
        """
        Step 4b: draw random SNP sets based on the effective number of SNPs.

        common.getfile_handle is called on `emp` (the --neff_calc file) before
        anything else; the returned handle itself is not used here. A
        drawrandom.Snp object is then built around `draw`, restricted to the
        region selection `draw_selection`, and asked to draw `ndraws` sets for
        `neff`. The program always exits with status 0 afterwards.
        """
        common.getfile_handle(emp, "(--neff_calc)", False)

        import jag.drawrandom as drawrandom
        log.info ("Drawing random SNP sets (based on nEff)...")

        snp_drawer = drawrandom.Snp(draw)
        snp_drawer.inregion = draw_selection
        snp_drawer.snp(neff, ndraws, emp, plink, out)

        # this step ends the program; nothing runs after the draw
        log.info(_get_endtime())
        sys.exit(0)
示例#2
0
    def get_aa_results(self, plinkin, geneset, gene_based=False):
        """
        Collect the association analysis results for `geneset`.

        The sumlog files listed by self.in_and_out are read and reused as long
        as their geneset path equals `geneset`. When no sumlog files are
        listed, a listed file does not exist, or a file belongs to another
        geneset, step 1 is run to produce the results instead.

        Returns list of cluster results objects
        """
        def run_association_analysis():
            # run_step1 returns one dict per phenotype; keep the results objects only
            step1_output = self.run_step1(plinkin, geneset, gene_based)
            return [pheno_dict["clusterresults"] for pheno_dict in step1_output.values()]

        sumlog_files = self.in_and_out.get_sumlog_filenames()
        if not len(sumlog_files) > 0:
            return run_association_analysis()

        aa_results = []
        for sumlog_file in sumlog_files:
            if not os.path.exists(sumlog_file):
                # NOTE(review): unlike the geneset mismatch case below there is
                # no break here, so step 1 runs once per missing file - confirm
                # this is intended
                aa_results.extend(run_association_analysis())
                continue

            file_handle = common.getfile_handle(sumlog_file)   #read file
            stored_result = CR.Clusterresults()
            stored_result.read_formated_results(file_handle)

            if stored_result.geneset_path == geneset:
                aa_results.append(stored_result)
                continue

            # the sumlog was made for another geneset file: redo the analysis
            # and stop looking at the remaining sumlog files
            log.info("sumlog file does not have identical path to geneset file\n"
                + "Will run Association analysis again")
            aa_results.extend(run_association_analysis())
            break

        return aa_results
示例#3
0
def _read_gene_boundaries(gene_boundaries_file, genes_in_groups):
    """
    Read the tab separated gene boundaries file and keep the genes that are
    listed in genes_in_groups.

    Every gene that is found is removed from genes_in_groups (the argument is
    modified in place), so what is left over are the genes without boundaries.

    Returns a tuple (genes_with_boundaries, genes_without_boundaries); the
    first item is a list with one dict per gene.
    """
    genes_with_boundaries = []

    for line in common.getfile_handle(gene_boundaries_file):
        columns = line.strip().split("\t")
        gene_id = columns[0]

        if gene_id not in genes_in_groups:
            continue

        genes_in_groups.remove(gene_id)
        genes_with_boundaries.append({
            "geneid": gene_id,
            "chr_number": columns[1],
            "chrstart": int(columns[2]),
            "chrend": int(columns[3]),
            "chr_strand": columns[4],
            # NOTE(review): symbol is read from the same column as the
            # chromosome number - confirm whether another column was meant
            "symbol": columns[1],
            "snp_list": [],
        })

    return (genes_with_boundaries, genes_in_groups)
示例#4
0
文件: plink.py 项目: dposthuma/jag
def extract_permutated_scores(snp_to_group_map, resultfiles, seeds):
    """
    Extract the permuted scores from assoc files.

    resultfiles maps a phenotype to its list of result files. For every
    phenotype one PermutatedResults object is filled: the P-value (column 9)
    of each SNP (column 2) that occurs in snp_to_group_map is added to the
    mapped group, and every file is closed off with process_permutation().
    A P-value of "NA" is replaced by 1.

    Returns a dict mapping phenotype to its PermutatedResults object.
    """
    permuted_results = {}

    for pheno, files in resultfiles.iteritems():
        pheno_results = permutationresults.PermutatedResults()

        for resultfile in files:
            for line in common.getfile_handle(resultfile):
                columns = line.split()
                snp_id = columns[1]
                p_value = 1 if columns[8] == "NA" else columns[8]

                if snp_id in snp_to_group_map:
                    pheno_results.add_snp_to_cluster(p_value, snp_to_group_map[snp_id])

            # each result file holds one permutation
            pheno_results.process_permutation()

        pheno_results.add_seeds(seeds)
        permuted_results[pheno] = pheno_results

    return permuted_results
示例#5
0
def _read_group_gene_mapping(infile):
    """
    Read the file with genes for the gene mapping.

    Returns a list with one entry per line; each entry is the list of tab
    separated fields of that line (surrounding whitespace stripped).
    """
    return [line.strip().split("\t") for line in common.getfile_handle(infile)]
示例#6
0
    def _create_indep_snp_file(self, plink):
        """
        Create a list of independent snps.
        plink: a plink object.(only needs a bfile)

        Plink is run with "--indep-pairwise 200 5 0.25" and every SNP in the
        resulting .prune.in file is written out together with the comma
        separated genes it maps to according to self.drawrandom.genemapping
        (a single space when the SNP has no gene).

        Returns the filename the annotated prune file was saved as.
        """
        log.info("\nPerforming LD based pruning...")
        plink.set_plink_arguments("--bfile " + plink.bfile + " --indep-pairwise 200 5 0.25")
        prune_file = plink.run_plink() + ".prune.in"

        # collect all genes per snp as one comma separated string
        snp2gene_mapping = {}
        for text in common.getfile_handle(self.drawrandom.genemapping):
            columns = text.strip().split()
            if len(text.strip()) == 0:
                continue

            snp = columns[0]
            if snp in snp2gene_mapping:
                snp2gene_mapping[snp] += "," + columns[1]
            else:
                snp2gene_mapping[snp] = columns[1]

        # use snp2gene_mapping to map the pruned snp's to genes
        annotated_lines = []
        for text in common.getfile_handle(prune_file):
            rs_number = text.strip()
            genes = snp2gene_mapping.get(rs_number, " ")
            annotated_lines.append(rs_number + "\t" + genes + "\n")

        #save mapped prune file to prunefile
        outfile = self.drawrandom.inoutput.save_text_to_filename("prune.in", "".join(annotated_lines))
        log.info("Saved pruned SNP file as " + outfile)

        return (outfile)
示例#7
0
def _read_snp_file(snp_data_file, all_chromosomes):
    """
    Read snp_data file and skip the chromosomes that are not
    present in the list all_chromosomes

    Each line is split on tabs into rs number, chromosome and position. A
    message is logged every time the chromosome differs from the one on the
    previous line. Lines with too few columns or a position that is not an
    integer are logged and ignored.

    Returns a dict mapping every chromosome of all_chromosomes to a list of
    (position, rsnumber) tuples.
    """
    chrsnpmapping = dict((chr_name, []) for chr_name in all_chromosomes)
    previous_chr = ""

    for line in common.getfile_handle(snp_data_file):
        try:
            fields = line.strip().split("\t")
            current_chr = fields[1]
            chromosome_changed = previous_chr != current_chr

            if current_chr not in all_chromosomes:
                if chromosome_changed:
                    log.info("There are no genes on chromosome " + current_chr)
                previous_chr = current_chr
                continue

            if chromosome_changed:
                log.info("Mapping SNPs to genes on chromosome " + str(current_chr))
                previous_chr = current_chr

            #save snp
            rsnumber = fields[0]
            try:
                position = int(fields[2])
                chrsnpmapping[current_chr].append((position, rsnumber))
            except ValueError:
                log.info("Ignoring following line in SNP file: " + "\t".join(fields))

        except IndexError:
            # the line has fewer columns than needed
            log.info("indexerror" + str(line))

        except ValueError:
            log.info (line)

    log.info("")

    return(chrsnpmapping)
示例#8
0
    def _gene2snpmapping(self):
        """
        Create a gene to SNP mapping from the file self.drawrandom.genemapping.

        Every non-empty line is split on tabs: the first field is the SNP,
        the second field the gene. The key of the returned dict is a gene and
        the value is the list of its SNP's (in file order).
        """
        gene2snp_mapping = {}

        for text in common.getfile_handle(self.drawrandom.genemapping):
            stripped = text.strip()
            if len(stripped) == 0:  # skip empty lines
                continue

            columns = stripped.split("\t")
            gene2snp_mapping.setdefault(columns[1], []).append(columns[0])

        return gene2snp_mapping
示例#9
0
    def mergeresults(self):
        """
        Merge multiple permutation files into one file per phenotype.

        For every phenotype (in common.alpha_sort order) the concatenated
        permutations are saved as "merged.P<pheno>.perm". When a sumlog file
        is known for the phenotype the empirical P-values are saved as
        "merged.P<pheno>.empp", both filenames are stored in self.files and,
        if self.inout.run_rproject is set, a distribution plot is drawn.

        The program exits as soon as a phenotype without sumlog file is met.
        """
        #load sumlog files to create sumlog files
        aa_result = self._load_sumlog_files()
        ordered_results = self._get_ordered_results()
        perm_files = self._get_ordered_files()

        # sorted() instead of keys().sort(): keys() is only a list on python 2
        keys = sorted(ordered_results.keys(), key=common.alpha_sort)

        #merge each phenotype
        for key in keys:
            log.info("\nMerging " + (str(len(perm_files[key]))) + " permutation files for phenotype " + key + "...")
            perm_out_string = ordered_results[key].format_permout() # concatenated results
            perm_filename = "merged.P" + key + ".perm"
            perm_out_filename = self.inout.save_text_to_filename(perm_filename, perm_out_string)
            log.info("Saved merged permutations as " + perm_out_filename)

            # "in" replaces the deprecated dict.has_key()
            if key not in aa_result:
                log.info("\nWarning: Could not find sumlog file " + self.inout.out + ".P" + key + ".sumlog")
                log.info(common.get_terminated_time())
                sys.exit()

            #save empirical P file since the sumlog file is found
            aa_object = Clusterresults()
            aa_object.read_formated_results(common.getfile_handle(aa_result[key]))

            empp_out_as_text = ordered_results[key].format_permutated_results(aa_object)
            emp_filename = "merged.P" + key + ".empp"
            empp_filename = self.inout.save_text_to_filename(emp_filename, empp_out_as_text)
            log.info("Saved empirical pvalues as " + empp_filename)
            self.files[key] = {"perm":perm_out_filename, "empp":empp_filename}

            #call R for distribution plot
            if self.inout.run_rproject:
                import jag.plot_with_r as plot_with_r
                plotter = plot_with_r.call_r(self.inout)
                plotter.draw_dist_plot(self.files, key)
示例#10
0
 def map_p_values_from_assoc_file(self, snptogroup, assoc_file, adjusted=False):
     """
     Extract the P-values from an assoc file and add them, based on the
     snptogroup mapping, to the right gene-set.

     The SNP id is read from the 2nd whitespace separated column; the
     P-value from the 4th column when adjusted is set and from the 9th
     column otherwise. SNPs that are not in snptogroup are skipped.
     """
     p_value_column = 3 if adjusted else 8

     for line in common.getfile_handle(assoc_file):
         columns = line.split()
         snp_id = columns[1]
         p_value = columns[p_value_column]

         if snp_id in snptogroup:
             self.add_snp_to_cluster_all_info(p_value, snp_id, snptogroup[snp_id])
示例#11
0
def read_permutated_results(permfile):
    """
    Read the permutated results of a permutation file.

    The first line is the tab separated header. The remaining lines are
    stored column wise: the first column is kept as text, all other columns
    are converted to float.

    Returns a tuple (header, results) where results is a list of lists with
    one list per header column.
    """
    file_handle = common.getfile_handle(permfile)
    header = file_handle.readline().strip().split("\t")
    results = [[] for _ in header]

    for line in file_handle:
        values = line.strip().split("\t")
        results[0].append(values[0])
        for column, value in enumerate(values[1:], 1):
            results[column].append(float(value))

    return (header, results)



    
示例#12
0
def _read_empp_results(file_name):
    """
    Read an empp file and return nEff as a dict with the genesetname
    (first column) as key and the nEff value as int.

    The program exits with a message when the tab separated header has no
    "nEff" column.
    """
    file_handle = common.getfile_handle(file_name)
    header = file_handle.readline().strip().split("\t")

    if "nEff" not in header:
        sys.exit("nEff column not found in header of " + file_name)
    neff_column = header.index("nEff")

    geneset_neff_mapping = {}
    for line in file_handle:
        columns = line.strip().split("\t")
        geneset_neff_mapping[str(columns[0])] = int(columns[neff_column])

    return (geneset_neff_mapping)
示例#13
0
    def read_permout(self, files):
        """
        Read multiple permout files into this permutationresults object.

        Every file starts with a tab separated header whose last column must
        be "seed" (the program exits otherwise). All values are converted to
        float and stored per column: the seed column goes to add_seeds, every
        other column to add_results under its header name.
        """
        for permfile in files:
            text_as_list = common.getfile_handle(permfile).readlines()#pylint: disable=E1103
            header = text_as_list[0].strip().split("\t")

            #check header last column is seed
            if header[-1] != "seed":
                sys.exit("last column of " + permfile + " does not have the \"seed\"")

            #one list per column of the file
            columns = [[] for _ in header]
            for row in text_as_list[1:]:
                for index, value in enumerate(row.strip().split("\t")):
                    columns[index].append(float(value))

            #put the columns in the permutation results
            for name, values in zip(header[:-1], columns[:-1]):
                self.add_results(name, values)
            self.add_seeds(columns[-1])
示例#14
0
文件: jag_main.py 项目: dposthuma/jag
    def __init__(self, args):
        """
        Parse the command line options in `args` and run the requested step.

        The options are first scanned for everything that influences the
        output prefix, so logging can be set up before anything else happens.
        After logging the used options they are parsed in full and one of the
        steps is started: merging (--merge), empirical P of empirical P
        (--orig_empp/--control_empp), annotation (step 0), random draws
        (step 4a/4b), or the association analysis with or without
        permutations (step 1 and 2).

        Single options are compared with "==". Writing them as
        `identifier in ("--pool")` is a substring test on a string, not a
        tuple membership test, which made e.g. "-p" match "--pool".
        """
        opts = self.extract_variables_from_command_line(args)

        from jag.plink import Plink
        plink = Plink()

        # path of the -s/--set file; needs a default because the dispatch
        # below reads it even when the option was not given
        group = None
        perm = None
        seed = None

        annotate_file = False
        up = float(0)
        down = float(0)
        gene_loc_file = None
        snp_loc = None

        no_emp = False
        create_plots = True
        draw_ngenes = False
        empirical_p_filename = False

        ndraws = 0
        draw_selection = None
        exclude_group = True
        complete_gene_snp_mapping = None
        verbose = False
        gene_based = False

        merge = False
        adjusted = False
        prefix = None

        orig = False
        sims = False
        set_name = False

        out_prefix = "jag"

        # first pass: only the options that determine the name of the log file
        for identifier, assigned_value in opts:
            if identifier in ("-o", "--out"):
                out_prefix = assigned_value

            elif identifier in ("-s", "--set"):
                match_draw = re.match(r"\w+(\.draws_n\w*)\.set.annot$", assigned_value)
                if match_draw is not None:
                    out_prefix = out_prefix + match_draw.group(1)

            elif identifier == "--gene_based":
                gene_based = True
                out_prefix = out_prefix + ".gene_based"

        from jag.file_fetch_and_write import InAndOut
        inoutput = InAndOut()
        inoutput.set_outfile(out_prefix)

        self.enable_logging_with_prefix(inoutput)
        log.info(_get_header_of_program())
        log.info("Save logfile as [" + inoutput.out + "log]\n")
        log.info(_get_starttime())

        log.info("\nUsed options:")
        for o, a in opts:
            log.info("\t" + o + " " + a)
            if o in ("-h", "--help"):
                log.info("\nPrinting help documentation...")
                log.info(_usage())
                print(_get_terminated_time())
                sys.exit(2)

        if len(opts) == 0:
            # NOTE(review): the return value is dropped here while the help
            # option above logs it - confirm _usage() prints by itself
            _usage()

        else:
            # NOTE(review): the asserts around common.getfile_handle below are
            # skipped when python runs with -O
            for identifier, assigned_value in opts:

                if identifier in ("-o", "--out"):
                    #this stuff is printed to catch the out for the location of the log file
                    out_prefix = assigned_value

                elif identifier in ("-s", "--set"):
                    assert common.getfile_handle(assigned_value,"(--set)", verbose)
                    group = assigned_value
                    plink.group = assigned_value

                elif identifier in ("-m", "--perm"):
                    perm = int(assigned_value)

                elif identifier in ("-v", "--verbose"):
                    plink.verbose = True
                    verbose = True

                elif identifier == "--snp_loc":
                    snp_loc = assigned_value

                    assert common.getfile_handle(assigned_value, \
                                                 "(--snp_loc)", verbose)

                elif identifier == "--control_empp":
                    assert common.getfile_handle(assigned_value, \
                                                  "(--control_empp)", verbose)
                    sims = assigned_value

                elif identifier == "--orig_empp":
                    assert common.getfile_handle(assigned_value, \
                                                  "(--orig_empp)", verbose)
                    orig = assigned_value

                elif identifier == "--gene_set":
                    set_name = assigned_value

                elif identifier == "--no_emp":
                    no_emp = True
                    create_plots = False

                elif identifier == "--no_plots":
                    create_plots = False

                elif identifier == "--ndraw":
                    ndraws = int(assigned_value)

                elif identifier == "--draw_ngenes":
                    draw_ngenes = assigned_value

                elif identifier == "--snp2gene":
                    assert common.getfile_handle(assigned_value, \
                                                 "(--snp2gene)", verbose)
                    annotate_file = assigned_value

                elif identifier == "--up":
                    try:
                        up = float(assigned_value)

                    except ValueError:
                        log.info("Value after --up parameter should be a number")
                        sys.exit(1)

                elif identifier == "--down":
                    try:
                        down = float(assigned_value)

                    except ValueError:
                        log.info("Value after --down parameter should be a number")
                        sys.exit(1)

                elif identifier == "--gene_loc":
                    assert common.getfile_handle(assigned_value, "Cannot find file with gene boundaries (set by --gene_loc)", verbose)
                    gene_loc_file = assigned_value

                elif identifier == "--pool":
                    complete_gene_snp_mapping = assigned_value

                elif identifier == "--draw_neff_genic":
                    draw_selection = "genic"
                    set_name = assigned_value

                elif identifier == "--draw_neff_intergenic":
                    draw_selection = "intergenic"
                    set_name = assigned_value

                elif identifier == "--draw_neff_all":
                    draw_selection = "all"
                    set_name = assigned_value

                elif identifier == "--covar":
                    assert common.getfile_handle(assigned_value, \
                         "(--covar;", verbose)
                    plink.covar_file = assigned_value

                elif identifier == "--linear":
                    plink.switches += "--linear "

                elif identifier == "--logistic":
                    plink.switches += "--logistic "

                elif identifier == "--adjust":
                    plink.switches += "--adjust "
                    adjusted = True

                elif identifier == "--exclude":
                    exclude_group = True

                elif identifier == "--include":
                    exclude_group = False

                elif identifier == "--neff_calc":
                    assert common.getfile_handle(assigned_value, "(--neff_calc)", verbose)
                    empirical_p_filename = assigned_value

                elif identifier in ("-b", "--bfile"):
                    plink.bfile = common.check_bim_bed_fam(assigned_value)

                elif identifier in ("-p", "--pheno"):
                    assert common.getfile_handle(assigned_value, \
                     "(-p or --pheno)", verbose)
                    plink.pheno_file = assigned_value

                elif identifier == "--seed":
                    seed = assigned_value

                elif identifier == "--gene_based":
                    gene_based = True

                elif identifier == "--merge":
                    # merging writes under its own prefix
                    prefix = assigned_value
                    inoutput = InAndOut()
                    inoutput.set_outfile(prefix)
                    merge = True

                else:
                    assert False, "unhandled option"
                    # only reached when asserts are disabled (python -O)
                    print(_get_terminated_time())
                    sys.exit(2)

            log.info("")

            inoutput.run_rproject = create_plots

            if merge:
                self._step3_merge_results(inoutput)

            if orig or sims:
                self._calc_emp_p_of_emp_p(orig , sims , set_name)
                sys.exit(0)

            if annotate_file and gene_loc_file and snp_loc:
                self._step0_annotate_genes(inoutput, annotate_file, up, down, \
                                            gene_loc_file, snp_loc)

            elif draw_ngenes or draw_selection:
                common.getfile_handle(complete_gene_snp_mapping, "--pool is not set correctly.", verbose)
                common.getfile_handle(group, "--set is not set.", verbose)

                import jag.drawrandom as drawrandom
                draw = drawrandom.DrawRandom(complete_gene_snp_mapping, group, inoutput)

                draw.exclude = exclude_group

                if seed:
                    draw.setseed(seed)

                if draw_ngenes:
                    self._step4a_draw_genes(draw, draw_ngenes, ndraws)

                if draw_selection:
                    self._step4b_draw_snps(draw, set_name, ndraws, draw_selection, \
                                           plink, empirical_p_filename, out_prefix)

            elif(perm is not None and perm > 0 and plink.bfile and group):
                #Run Step 1 + Step 2
                self._step1_run_association_analysis(group, plink, inoutput, gene_based, adjusted)
                self._step2_run_permutations(group, perm, seed, no_emp, plink, inoutput, gene_based)

            elif(perm == 0):
                # Run only association analysis
                self._step1_run_association_analysis(group, plink, inoutput, gene_based, adjusted)
                log.info("\nNo permutations will be proceeded since number of permutations is zero.")

            else:
                log.info("You are using an invalid combination of parameters.\nCheck the help file for the correct combination.")

        log.info(_get_endtime())
示例#15
0
    def snp(self, geneset, amount, empp_file, plink, out):
        """
        Draw `amount` random sets of SNPs for `geneset`.

        The size of every set is the nEff of `geneset` as read from the .empp
        file. The SNPs are drawn from an independent snp file
        (<out>.prune.in in the current directory). If this file is not
        present, it will be created within _create_indep_snp_file.

        The draws are saved as "draws_neff_<region>.set.annot" through
        self.drawrandom.inoutput. The program exits when the geneset is
        unknown, when nEff cannot be read for it, or when the pool of pruned
        SNPs for the region in self.inregion is smaller than
        amount x nEff; that check is done before any SNP is drawn.

        Returns the filename without output prefix.
        """
        geneset_to_snp_mapping = common.map_geneset_to_snp(self.drawrandom.snpmapping)

        if geneset not in geneset_to_snp_mapping:
            log.info("\n" + geneset + " is not known as a geneset name. Please check your data for correct geneset name.")
            log.info(_get_terminated_time())
            sys.exit()

        self.exclude_snps = set([x["s"] for x in geneset_to_snp_mapping[geneset]]) #drawsnps

        try:
            n_snp = _read_empp_results(empp_file)[geneset]

        except (KeyError, ValueError):
            # KeyError: geneset is not a row of the empp file,
            # ValueError: a nEff value in the file is not an integer
            log.info("geneset to select is not found in empp file")
            log.info(_get_terminated_time())
            sys.exit()

        prunedin_file = str(os.path.abspath(os.path.curdir)) + "/" + out + ".prune.in" #get pruned.in file

        if not os.path.exists(prunedin_file): #create prune.in from file, if it does not exist
            prunedin = self._create_indep_snp_file(plink)

        else:
            log.info ("\nUsing the pruned SNP set from " + prunedin_file)
            prunedin = prunedin_file

        allsnp_and_genes = common.getfile_handle(prunedin).readlines() #pylint: disable=E1103

        # a snp is genic when the second (tab separated) column holds a gene
        length_genic = 0
        length_nongenic = 0

        for line in allsnp_and_genes:
            columns = line.rsplit('\t')
            if columns[1].rsplit():
                length_genic += 1
            else:
                length_nongenic += 1

        # pool size and description per region the snps can be drawn from
        pools = {
            "genic": (length_genic, "within genes"),
            "intergenic": (length_nongenic, "outside genes"),
            "all": (len(allsnp_and_genes), "in- and outside genes"),
        }

        inregion_text = "unknown_in_region"

        if self.inregion in pools:
            inregion_text = self.inregion
            pool_size, location = pools[self.inregion]

            # check the pool before drawing instead of after all the work is done
            if amount * n_snp > pool_size:
                log.info("\nWarning: pool of " + str(pool_size) + " SNPs located " + location + " is to small to draw " + \
                 str(amount) + " x " + str(n_snp) + " independent nEff SNPs!")
                log.info(_get_terminated_time())
                sys.exit()

            log.info("\nDrawing " + str(amount) + " x " + str(n_snp) + " nEff SNPs from a pool of " + \
             str(pool_size) + " SNPs located " + location)

        amount_allsnp_and_genes = len(allsnp_and_genes) - 1

        drawn_lines = []

        for new_group_number in range(1, amount + 1):
            # every draw starts without any accessed snps
            self.drawrandom.accessed = set()
            setname = "Draw_" + str(new_group_number)

            for _ in range(n_snp):
                random_snp = self._get_random_snp(allsnp_and_genes, amount_allsnp_and_genes)
                drawn_lines.append("\t".join(random_snp) + "\t" + setname + "\n")

        random_snp_text = "".join(drawn_lines)

        filename = "draws_neff_" + inregion_text + ".set.annot"
        saved_as = self.drawrandom.inoutput.save_text_to_filename(filename, random_snp_text)     #save random snps file
        log.info("\nSaved random draws on number of effective number of SNPS as " + saved_as)
        return(filename)