def run(self):
    """
    Compute and report an enrichment p-value for each result in self.result_id_ls.

    For every result id this method:
        1. fetches the candidate gene set for self.list_type_id,
        2. fetches the result's peaks through self.getResultPeak(),
        3. translates the peaks into cumulative positions and back into
            chromosome positions,
        4. gathers the rank-test data through
            self.prepareDataForPermutationRankTest(),
        5. calls self.get_enrichment_pvalue_by_gw_looping() and writes the
            resulting p-value to stderr.

    Nothing is returned. If self.commit is set, the 250k DB session is flushed.

    NOTE(review): the previous docstring (dated 2010-8-15) described building
    an RBDict of gene-forms keyed by [chr, start, stop]; this body does not do
    that itself -- presumably it happens inside
    genome_db.dealWithGenomeRBDict(). Confirm.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # Both databases share the connection settings and differ only in name.
    genome_db = GenomeDB.GenomeDatabase(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.genome_dbname,
    )
    genome_db.setup(create_tables=False)

    db_250k = Stock_250kDB.Stock_250kDB(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.dbname,
    )
    db_250k.setup(create_tables=False)

    oneGenomeData = genome_db.getOneGenomeData(tax_id=self.tax_id, chr_gap=0)
    cumuSpan2ChrRBDict = oneGenomeData.cumuSpan2ChrRBDict
    genomeRBDict = genome_db.dealWithGenomeRBDict(
        self.genomeRBDictPickleFname,
        tax_id=self.tax_id,
        max_distance=self.max_distance,
        debug=self.debug,
    )

    # Parameter bundle handed to the helper methods below.
    param_data = PassingData(
        min_MAF=self.min_MAF,
        starting_rank=0,
        need_chr_pos_ls=0,
        need_candidate_association=False,
        min_big_overlap=self.min_big_overlap,
        no_of_permutations=self.no_of_permutations,
        no_of_min_breaks=self.no_of_min_breaks,
    )

    for result_id in self.result_id_ls:
        # Kept inside the loop as in the original; the original author's
        # note says dealWithCandidateGeneList() caches internally.
        candidate_gene_set = db_250k.dealWithCandidateGeneList(
            self.list_type_id, return_set=True)
        param_data.candidate_gene_set = candidate_gene_set

        top_loci = self.getResultPeak(
            result_id, self.result_peak_type_id, param_data)
        top_loci_in_cumu_pos = self.translateChrPosDataObjectIntoCumuPos(
            top_loci, oneGenomeData.chr_id2cumu_start)
        top_loci_in_chr_pos = self.translateCumuPosIntoChrPos(
            top_loci_in_cumu_pos, cumuSpan2ChrRBDict)
        permData = self.prepareDataForPermutationRankTest(
            top_loci_in_chr_pos, genomeRBDict, param_data, report=True)

        candidate_sample_size = len(permData.captured_candidate_gene_set)
        return_data = self.get_enrichment_pvalue_by_gw_looping(
            candidate_sample_size,
            top_loci_in_cumu_pos,
            candidate_gene_set,
            genomeRBDict,
            cumuSpan2ChrRBDict=cumuSpan2ChrRBDict,
            no_of_permutations=param_data.no_of_permutations,
            no_of_min_breaks=param_data.no_of_min_breaks,
            param_data=param_data,
        )
        # Only the p-value is reported; return_data also carries
        # no_of_tests and no_of_tests_passed, which this method never used.
        sys.stderr.write("%s pvalue: %s.\n" % (result_id, return_data.pvalue))

    # NOTE(review): the source's indentation was lost, so it is unclear
    # whether this flush ran per result or once; nothing visible here adds to
    # the session, so it is issued once after the loop. Confirm.
    if self.commit:
        db_250k.session.flush()
def run(self):
    """
    Compute and report an enrichment p-value for each CNV result in
    self.result_id_ls.

    For every result id this method:
        1. loads the ResultsMethod row and skips it (with a message on
            stderr) when it has no cnv_method_id,
        2. makes sure the CNV id -> chr_pos map and the translation data
            structure for background loci are loaded,
        3. fetches the candidate gene set for self.list_type_id,
        4. reads the result content and takes its top loci
            (self.no_of_top_loci, self.min_score),
        5. translates the loci into cumulative positions and back into
            chromosome spans,
        6. gathers the rank-test data through
            self.prepareDataForPermutationRankTest(),
        7. calls self.get_enrichment_pvalue_by_gw_looping() and writes the
            resulting p-value to stderr.

    Nothing is returned. The process exits with status 3 when the translation
    data structure has an empty chrSpan2cumuStartRBDict. If self.commit is
    set, the 250k DB session is flushed.

    NOTE(review): the previous docstring (dated 2010-8-15) described building
    an RBDict of gene-forms keyed by [chr, start, stop]; this body does not do
    that itself -- presumably it happens inside
    genome_db.dealWithGenomeRBDict(). Confirm.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # Both databases share the connection settings and differ only in name.
    genome_db = GenomeDB.GenomeDatabase(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.genome_dbname,
    )
    genome_db.setup(create_tables=False)

    db_250k = Stock_250kDB.Stock_250kDB(
        drivername=self.drivername,
        username=self.db_user,
        password=self.db_passwd,
        hostname=self.hostname,
        database=self.dbname,
    )
    db_250k.setup(create_tables=False)

    # NOTE(review): the returned object is never read in this method; the
    # call is kept in case it has side effects on genome_db. Confirm whether
    # it can be dropped.
    oneGenomeData = genome_db.getOneGenomeData(tax_id=self.tax_id, chr_gap=0)
    genomeRBDict = genome_db.dealWithGenomeRBDict(
        self.genomeRBDictPickleFname,
        tax_id=self.tax_id,
        max_distance=self.max_distance,
        debug=self.debug,
    )

    # Parameter bundle handed to the DB layer and the helper methods below.
    param_data = PassingData(
        min_MAF=self.min_MAF,
        min_score=self.min_score,
        results_directory=self.results_directory,
        no_of_top_loci=self.no_of_top_loci,
        starting_rank=0,
        need_chr_pos_ls=0,
        need_candidate_association=False,
        min_big_overlap=self.min_big_overlap,
        no_of_permutations=self.no_of_permutations,
        no_of_min_breaks=self.no_of_min_breaks,
    )

    # NOTE(review): the original comment here read "any overlap is an
    # overlap", yet min_overlap_len is 100 -- confirm which is intended.
    compareIns = CNVCompareByOverlapLen(min_overlap_len=100)

    translationData = None
    for result_id in self.result_id_ls:
        rm = Stock_250kDB.ResultsMethod.get(result_id)
        if not rm.cnv_method_id:
            sys.stderr.write(
                "ResultsMethod %s doesn't have cnv_method_id. Skip.\n" %
                (result_id))
            continue

        # Establish the map from cnv.id to chr_pos, once.
        # NOTE(review): both this map and translationData are built from the
        # first usable result's cnv_method_id and then reused for every later
        # result, even if a later result uses a different CNV method. Confirm
        # that all results in one run share a CNV method.
        if not db_250k._cnv_id2chr_pos:
            db_250k.cnv_id2chr_pos = rm.cnv_method_id

        # Guarded separately from the id map: if the map happened to be
        # populated already, translationData would otherwise stay None and
        # the attribute accesses below would fail.
        if translationData is None:
            translationData = self.getTranslationDataStructureForBackgroundLoci(
                db_250k, cnv_method_id=rm.cnv_method_id, min_MAF=self.min_MAF)
            if not translationData.chrSpan2cumuStartRBDict:
                sys.stderr.write(
                    "Error: translationData.chrSpan2cumuStartRBDict is empty for cnv method %s. exit.\n" %
                    (rm.cnv_method_id))
                sys.exit(3)
        param_data.db_id2chr_pos = db_250k.cnv_id2chr_pos

        # The original author's note says dealWithCandidateGeneList() caches
        # internally, so calling it per result is kept as is.
        candidate_gene_set = db_250k.dealWithCandidateGeneList(
            self.list_type_id, return_set=True)
        param_data.candidate_gene_set = candidate_gene_set

        gwr = db_250k.getResultMethodContent(
            result_id, pdata=param_data, min_value_cutoff=self.min_score)
        top_loci = gwr.getTopLoci(
            no_of_top_loci=self.no_of_top_loci, min_score=self.min_score)
        top_loci_in_cumu_pos = self.translateChrPosDataObjectIntoCumuPos(
            top_loci, translationData.chrSpan2cumuStartRBDict)
        top_loci_in_chr_pos = self.translateCumuPosIntoChrPos(
            top_loci_in_cumu_pos,
            translationData.cumuSpan2ChrSpanRBDict,
            compareIns=compareIns)
        permData = self.prepareDataForPermutationRankTest(
            top_loci_in_chr_pos, genomeRBDict, param_data, report=True)

        candidate_sample_size = len(permData.captured_candidate_gene_set)
        return_data = self.get_enrichment_pvalue_by_gw_looping(
            candidate_sample_size,
            top_loci_in_cumu_pos,
            candidate_gene_set,
            genomeRBDict,
            cumuSpan2ChrSpanRBDict=translationData.cumuSpan2ChrSpanRBDict,
            no_of_permutations=param_data.no_of_permutations,
            no_of_min_breaks=param_data.no_of_min_breaks,
            param_data=param_data,
            compareIns=compareIns,
        )
        # Only the p-value is reported; return_data also carries
        # no_of_tests and no_of_tests_passed, which this method never used.
        sys.stderr.write("%s pvalue: %s.\n" % (result_id, return_data.pvalue))

    # NOTE(review): the source's indentation was lost, so it is unclear
    # whether this flush ran per result or once; nothing visible here adds to
    # the session, so it is issued once after the loop. Confirm.
    if self.commit:
        db_250k.session.flush()