예제 #1
0
파일: drawrandom.py 프로젝트: dposthuma/jag
    def genes(self, geneset, amount):
        """
        Draw random genes, where set is the geneset name selected
         and amount the number of new sets to be formed
        """
        geneset_to_snp_mapping = common.map_geneset_to_snp(self.drawrandom.snpmapping)
        if (geneset not in geneset_to_snp_mapping):
            log.info("\n" + geneset + " is not known as a geneset name. Please check your data for correct geneset name.")
            log.info(_get_terminated_time())
            sys.exit()
        #load number of  genes in geneset by the mapping file
        self.exclude_genes = set([ x["g"] for x in geneset_to_snp_mapping[geneset]])

        ngenes = len(self.exclude_genes)
        gene2snp = self._gene2snpmapping()  #create gene to all SNP's mapping
        all_genes = set(gene2snp.keys())
        
        if(len(all_genes) <= (ngenes*amount)):
            max = len(all_genes)/ngenes
            log.info("\nYour sample of " + str(len(all_genes)) + " genes is too small to draw " + str(amount) + " random genesets of size " + str(ngenes))
            log.info("\nMaximum number of independent gene-sets to draw (" + str(len(all_genes))+"/"+ str(ngenes) + "): " + str(max))
            log.info(_get_terminated_time())
            sys.exit()
        
        
        log.info("\nSize of gene pool (--pool): " + str(len(all_genes)) + " unique genes")
        log.info("\nThe geneset " + geneset + " contains " + str(ngenes) + " genes" )
        log.info("\nMinimum size of gene pool needed to draw " + str(amount) + " gene-sets (" + str(ngenes) + "*" +  str(amount) + "): " + str(ngenes*amount))
   
        #exclude genes that are in the selected geneset
        if (self.drawrandom.exclude):
            all_genes = all_genes.difference(set(self.exclude_genes))

        output_text = ""
        
        for new_set_number in xrange(1, amount + 1):

            setname = "Draw_" + str(new_set_number)
            sampled_keys = random.sample(all_genes, ngenes)

            for gene_as_key in sampled_keys:
                output_text += '\n'.join([str(snp) + "\t" + str(gene_as_key) + \
                        "\t" + setname for snp in gene2snp[gene_as_key]])
                output_text += '\n'

        incl_or_excl = "excl" if self.drawrandom.exclude else "incl"      #create correct filename
        filename = "draws_ngenes.set.annot"
        out = self.drawrandom.inoutput.save_text_to_filename(filename, output_text)     #save file
        log.info("\nSaved random draws on number of genes as " + out)
        
        return(output_text)
예제 #2
0
파일: drawrandom.py 프로젝트: dposthuma/jag
    def snp(self, geneset, amount, empp_file, plink, out):
        """
        draw random snp, on number of neff snps in the .empp file, from an independent snp file (.prune.in). If this file is not 
        present, this file will be created within _create_indep_snp_file function.
        
        """
        
        geneset_to_snp_mapping = common.map_geneset_to_snp(self.drawrandom.snpmapping)
        
        if (geneset not in geneset_to_snp_mapping):
            log.info("\n" + geneset + " is not known as a geneset name. Please check your data for correct geneset name.")
            log.info(_get_terminated_time())

            sys.exit()

        self.exclude_snps = set([ x["s"] for x in geneset_to_snp_mapping[geneset]]) #drawsnps
        
        try:
            n_snp = _read_empp_results(empp_file)[geneset]

        except ValueError:
            log.info("geneset to select is not found in empp file")
            log.info(_get_terminated_time())
            
            sys.exit()
        
        prunedin_file = str(os.path.abspath(os.path.curdir)) + "/" + out + ".prune.in" #get pruned.in file
        prunedin = os.path.exists(prunedin_file)
        
        if (prunedin == False): #create prune.in from file, if it does not excist
            prunedin = self._create_indep_snp_file(plink)
            
        else:
            log.info ("\nUsing the pruned SNP set from " + prunedin_file)
            prunedin = prunedin_file

        allsnp_and_genes = common.getfile_handle(prunedin).readlines() #pylint: disable=E1103
        
        length_genic = 0
        length_nongenic = 0
        
        for line in allsnp_and_genes:
            line = line.rsplit('\t')
            gene = line[1].rsplit()
                     
            if len(gene) is not 0:
                length_genic += 1
            else:
                length_nongenic += 1
                                     
        amount_allsnp_and_genes = len(allsnp_and_genes) - 1
              
        #random_snp_text = "RS#\tGeneID\tDraw_#\n"
        random_snp_text = ""
        
        for new_group_number in xrange(1, amount + 1):
            self.drawrandom.accessed = set()
            setname = "Draw_" + str(new_group_number)

            count = 0
            while(count < n_snp):
                random_snp_text += ("\t".join(self._get_random_snp(allsnp_and_genes, amount_allsnp_and_genes)))
                random_snp_text += ("\t" + setname + "\n")
                count = count + 1
                
        inregion_text = "unknown_in_region"
        
        if(self.inregion == "genic"):
            inregion_text = "genic"
            
            if amount*n_snp > length_genic:
                log.info("\nWarning: pool of " + str(length_genic) + " SNPs located within genes is to small to draw " + \
                 str(amount) + " x " + str(n_snp) + " independent nEff SNPs!")
                log.info(_get_terminated_time())
                sys.exit()
            else:               
                log.info("\nDrawing " + str(amount) + " x " + str(n_snp) + " nEff SNPs from a pool of " + \
                 str(length_genic) + " SNPs located within genes")
            
        elif(self.inregion == "intergenic"):
            inregion_text = "intergenic"
            
            if amount*n_snp > length_nongenic:
                log.info("\nWarning: pool of " + str(length_nongenic) + " SNPs located outside genes is to small to draw " + \
                 str(amount) + " x " + str(n_snp) + " independent nEff SNPs!")
                log.info(_get_terminated_time())
                sys.exit()
            else:
                log.info("\nDrawing " + str(amount) + " x " + str(n_snp) + " nEff SNPs from a pool of " + \
                 str(length_nongenic) + " SNPs located outside genes")
            
        elif(self.inregion == "all"):
            inregion_text = "all"
            
            if amount*n_snp > len(allsnp_and_genes):
                log.info("\nWarning: pool of " + str(len(allsnp_and_genes)) + " SNPs located in- and outside genes is to small to draw " + \
                 str(amount) + " x " + str(n_snp) + " independent nEff SNPs!")
                log.info(_get_terminated_time())
                sys.exit()
            else:
                log.info("\nDrawing " + str(amount) + " x " + str(n_snp) + " nEff SNPs from a pool of " + \
                 str(len(allsnp_and_genes)) + " SNPs located in- and outside genes")
            
        incl_or_excl = "unknown"
        
        if (self.drawrandom.exclude is False):
            incl_or_excl = "incl"
            
        elif (self.drawrandom.exclude is True):
            incl_or_excl = "excl"
        
        filename = "draws_neff_" + inregion_text + ".set.annot"
        out = self.drawrandom.inoutput.save_text_to_filename(filename, random_snp_text)     #save random snps file
        log.info("\nSaved random draws on number of effective number of SNPS as " + out)
        return(filename)