def test_Fixed_Region_Full(self):
    # Manual inspection of the overlap ratio of one fixed region (ID 42469)
    # for the "H3K4me3" dataset: the region is read from the `regions`
    # table, overlapping intervals are looked up in the dataset's interval
    # array, reduced, and the covered fraction of the region is printed.
    # Nothing is asserted; the unconditional self.fail() runs first.
    # NOTE(review): presumably the test was disabled on purpose via fail() -
    # confirm before removing that call.
    # (Plain comments instead of a docstring so that unittest's verbose
    # description of this test stays unchanged.)
    self.fail()

    regionID = 42469
    markName = "H3K4me3"
    self.chmodName = markName
    datasetSpec = [
        "chrommod_" + markName,
        "hg18_Barski_chrommod_" + markName + ".ini",
        "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/",
    ]
    dataset = readDatasetDescriptions.readDataset(datasetSpec)
    self.chrommod = dataset
    dataset.datasetCollectionName = "main"
    dataset.openDBConnections()

    # Fetch the coordinates of the region under inspection.
    regionQuery = "SELECT regionID,chrom,start,stop FROM regions WHERE regionID=" + str(regionID)
    dataset.cR.execute(regionQuery)
    row = list(dataset.cR.fetchone())
    chrom = row[1]
    start = row[2]
    stop = row[3]
    print("%s %s %s" % (chrom, start, stop))

    # Look up the overlapping intervals and print every intermediate result.
    dataset.initializeIntervalArray()
    overlaps = dataset.intervalArray.findTwoDimentionalWithAdditionalInfo(
        chrom, start, stop, 0, 1)
    print(overlaps)
    mergedOverlaps = utilities.gr_reduceRegionSet(list(overlaps))
    print(mergedOverlaps)
    coveredLength = utilities.gr_Coverage(mergedOverlaps, start, stop)
    print(coveredLength / float(stop - start))
def oneRegionTest(self, regionID, chrommod):
    """Compare the stored overlap values of one region with the Oracle data.

    The dataset description for ``chrommod`` is loaded, the region's
    coordinates are read from the ``regions`` table and its stored
    ``overlap_ratio`` / ``overlap_count`` from the ``<datasetSimpleName>_data``
    table. The Oracle table is then queried for all intervals of the same
    chromosome and modification with ``chromstart <= stop`` and
    ``chromend > start``; their number must equal the stored count and the
    covered fraction of the region must equal the stored ratio.

    Args:
        regionID: ID of the region to check (used in both local queries).
        chrommod: modification name; also stored as ``self.chmodName``.

    The Oracle cursor and connection are closed before the assertions run,
    also when the query raises.
    """
    self.chmodName = chrommod
    self.chrommod = readDatasetDescriptions.readDataset([
        "chrommod_" + self.chmodName,
        "hg18_Barski_chrommod_" + self.chmodName + ".ini",
        "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/",
    ])
    self.chrommod.datasetCollectionName = "main"
    self.chrommod.openDBConnections()

    # Region coordinates from the local regions database.
    self.chrommod.cR.execute(
        "SELECT regionID,chrom,start,stop FROM regions WHERE regionID=" + str(regionID))
    regionData = list(self.chrommod.cR.fetchone())
    start = regionData[2]
    stop = regionData[3]
    print("%s %s" % (regionID, regionData))

    # Values previously computed and stored for this region.
    sqlQuery = ("SELECT overlap_ratio, overlap_count FROM "
                + self.chrommod.datasetSimpleName
                + "_data WHERE regionID = " + str(regionID))
    self.chrommod.cD.execute(sqlQuery)
    regionResult = list(self.chrommod.cD.fetchone())
    print("%s %s" % (regionID, regionResult))

    chromName = str(utilities.convertIntToChrom(self.genome, regionData[1]))
    oracleSQLQuery = ("SELECT chromstart, chromend FROM hg18_EPIGRAPH_#romatin WHERE chrom = '"
                      + chromName
                      + "' AND chromstart <= " + str(stop)
                      + " AND chromend > " + str(start)
                      + " AND chrommod = '" + self.chmodName
                      + "' ORDER BY chromstart")

    # NOTE(review): credentials are hard-coded in the test source.
    dbcon = cx_Oracle.connect("epigraph_admin", "epigraph123", "bioinfo")
    try:
        dbcur = dbcon.cursor()
        try:
            dbcur.execute(oracleSQLQuery)
            oracleD = [list(row) for row in dbcur.fetchall()]
        finally:
            dbcur.close()
    finally:
        # Previously the connection was left open after the test.
        dbcon.close()
    print("%s %s" % (regionID, oracleD))

    # A copy is handed to the reducer so len(oracleD) below still counts
    # the raw (unreduced) intervals.
    reducedGR = utilities.gr_reduceRegionSet(list(oracleD))
    oracle_overlap_ratio = utilities.gr_Coverage(reducedGR, start, stop) / float(stop - start)
    self.assertEqual(len(oracleD), regionResult[1])
    self.assertEqual(regionResult[0], oracle_overlap_ratio)
def preprocessDownloadedDataset(self):
    """Generate the tiling regions of this dataset and store them locally.

    Returns right after logging when ``hasPreprocessedFile()`` is true.
    Otherwise tiling regions are produced for every size/step pair listed
    in ``tilingRegionSizes`` / ``tilingRegionSteps`` and the combined list
    is handed to ``__init_regions_dataset_for_local_db__``. The regions
    being tiled depend on ``coverage``:

    * "genome": one ``[c, 0, settings.genomeData[genome][c]]`` entry per
      key of ``settings.genomeDataStr[genome]``;
    * "around dataset regions": the regions of the dataset named by
      ``aroundDataset``, widened by ``datasetShore`` on both sides, passed
      through ``getCorrectedCoordinates`` and merged with
      ``dataset_methods.mergeOverlappingMax``.

    When ``maxNNRatio`` is greater than 0 every tiling set is filtered with
    ``dataset_methods.filterMinNNContent`` before it is collected.

    Side effects: ``maxNNRatio``, ``tilingRegionSizes``,
    ``tilingRegionSteps`` and (second coverage only) ``datasetShore`` are
    replaced by their parsed values; the second coverage also writes the
    raw region array to ``settings.baseFolder + "temp.txt"``.

    Raises:
        GDMException: ``coverage`` is not one of the two supported values.
    """
    if self.hasPreprocessedFile():
        # what to do if the data was already preprocessed
        extext = self.datasetSimpleName + ": the dataset was already preprocessed in " + self.binaryFile
        log(extext)
        return

    self.maxNNRatio = float(self.maxNNRatio)
    if self.coverage not in ("genome", "around dataset regions"):
        raise GDMException("Invalid coverage " + self.coverage)
    self.tilingRegionSizes = self.tilingRegionSizes.split(",")
    self.tilingRegionSteps = self.tilingRegionSteps.split(",")

    # Step 1: determine the regions that will be tiled.
    if self.coverage == "genome":
        baseRegions = [[c, 0, settings.genomeData[self.genome][c]]
                       for c in settings.genomeDataStr[self.genome].keys()]
    else:
        datasetParts = [part.strip() for part in self.aroundDataset.split(":")]
        # get the Main index file for the whole genome, Tiling regions are
        # never precomputed otherwise
        datasetParts.append(getMainDatasetIndexFileName(self.genome))
        aroundDataset = readDatasetDescriptions.readDataset(datasetParts)
        aroundDataset.init(False)
        datasetRegions = aroundDataset.getRegions()

        # Dump of the raw regions; the handle is closed even if write fails.
        f = open(settings.baseFolder + "temp.txt", "w")
        try:
            f.write(str(datasetRegions))
        finally:
            f.close()

        self.datasetShore = int(self.datasetShore)
        datasetRegionsExtended = []
        for i in xrange(datasetRegions.shape[0]):
            chromName = convertIntToChrom(self.genome, datasetRegions[i, 0])
            start = datasetRegions[i, 1] - self.datasetShore
            end = datasetRegions[i, 2] + self.datasetShore
            start, end = getCorrectedCoordinates(self.genome, chromName, start, end)
            datasetRegionsExtended.append([chromName, start, end])
        baseRegions = dataset_methods.mergeOverlappingMax(datasetRegionsExtended)

    # Step 2: tile the base regions once per size/step pair. Both coverages
    # share this loop; the second one used to pass the undefined name
    # `maxNNRatio` to the filter and failed with a NameError.
    dsFull = []
    for i in range(len(self.tilingRegionSizes)):
        tilingRegionSize = int(self.tilingRegionSizes[i].strip())
        tilingRegionStep = int(self.tilingRegionSteps[i].strip())
        ds = dataset_methods.getTilingRegions(
            baseRegions, self.genome, tilingRegionSize, tilingRegionStep)
        if self.maxNNRatio > 0:
            ds = dataset_methods.filterMinNNContent(ds, self.genome, self.maxNNRatio)
        dsFull.extend(ds)

    self.__init_regions_dataset_for_local_db__(dsFull)
def mainTest(self, chrommod, n):
    """Cross-check stored overlap values of random regions against Oracle.

    ``n`` region IDs are drawn at random (duplicates collapse, so fewer
    than ``n`` distinct regions may be checked). For each one the stored
    ``overlap_ratio`` / ``overlap_count`` are read from the
    ``<datasetSimpleName>_data`` table (both taken as 0 when the region has
    no row) and compared with the values recomputed from the Oracle
    intervals that satisfy ``chromstart <= stop`` and ``chromend > start``.

    Args:
        chrommod: modification name; also stored as ``self.chmodName``.
        n: number of random draws.

    Raises:
        Exception: a drawn ID does not match the regionID of the row at
            position ``ID - 1`` of the ordered ``regions`` table.
    """
    self.chmodName = chrommod
    self.chrommod = readDatasetDescriptions.readDataset([
        "chrommod_" + self.chmodName,
        "hg18_Barski_chrommod_" + self.chmodName + ".ini",
        "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/",
    ])
    self.chrommod.datasetCollectionName = "main"

    # load n random regions
    self.chrommod.openDBConnections()
    self.chrommod.cR.execute("SELECT COUNT(*) FROM regions")
    nRows = self.chrommod.cR.fetchone()[0]
    print(nRows)
    self.chrommod.cR.execute(
        "SELECT regionID,chrom,start,stop,datasetID FROM regions ORDER BY regionID")
    dR = self.chrommod.cR.fetchall()

    selectedRegionsData = {}
    for _ in range(n):
        # The lookup dR[regionID - 1] treats IDs as 1..nRows, so draw from
        # exactly that range. The former randint(0, nRows - 1) could yield
        # 0 (which always tripped the consistency check below via dR[-1])
        # and could never yield the last region.
        regionID = random.randint(1, nRows)
        row = dR[regionID - 1]
        if regionID != row[0]:
            print("%s %s" % (regionID, row))
            raise Exception(
                "regionID %s does not match row %s" % (regionID, row))
        # (chrom, start, stop, datasetID)
        selectedRegionsData[regionID] = row[1:]
    selectedRegionIDs = sorted(selectedRegionsData)
    print(selectedRegionIDs)

    # NOTE(review): credentials are hard-coded in the test source.
    dbcon = cx_Oracle.connect("epigraph_admin", "epigraph123", "bioinfo")
    try:
        dbcur = dbcon.cursor()
        try:
            for regionID in selectedRegionIDs:
                sqlQuery = ("SELECT overlap_ratio, overlap_count FROM "
                            + self.chrommod.datasetSimpleName
                            + "_data WHERE regionID = " + str(regionID))
                self.chrommod.cD.execute(sqlQuery)
                stored = self.chrommod.cD.fetchone()
                if stored is None:
                    # No stored row for this region: treat as no overlap.
                    overlap_ratio = 0
                    overlap_count = 0
                else:
                    overlap_ratio = stored[0]
                    overlap_count = stored[1]

                chromID = selectedRegionsData[regionID][0]
                start = selectedRegionsData[regionID][1]
                stop = selectedRegionsData[regionID][2]
                print("%s %s %s" % (regionID, overlap_ratio, overlap_count))

                oracleSQLQuery = ("SELECT chromstart, chromend FROM hg18_EPIGRAPH_#romatin WHERE chrom = '"
                                  + str(utilities.convertIntToChrom(self.genome, chromID))
                                  + "' AND chromstart <= " + str(stop)
                                  + " AND chromend > " + str(start)
                                  + " AND chrommod = '" + self.chmodName
                                  + "' ORDER BY chromstart")
                print(oracleSQLQuery)
                dbcur.execute(oracleSQLQuery)
                oracleD = [list(interval) for interval in dbcur.fetchall()]

                self.assertEqual(overlap_count, len(oracleD))
                reducedGR = utilities.gr_reduceRegionSet(list(oracleD))
                oracle_overlap_ratio = (utilities.gr_Coverage(reducedGR, start, stop)
                                        / float(stop - start))
                self.assertEqual(overlap_ratio, oracle_overlap_ratio)
        finally:
            dbcur.close()
    finally:
        # Release the Oracle connection even when an assertion fails.
        dbcon.close()
def filterMinNNContent(ds, genome, maxNNContent):
    """Keep the regions whose fraction of "N" characters is small enough.

    The DNA sequence dataset of ``genome`` is opened and, for every region
    ``[chrom, start, end, ...]`` of ``ds``, the share of "N" in the loaded
    chromosome sequence slice ``[start:end]`` is computed. Regions with a
    share of at most ``maxNNContent`` are returned, in input order.

    A chromosome is (re)loaded only when a region refers to a chromosome
    other than the currently loaded one.
    NOTE(review): a region with start == end divides by zero.
    """
    import readDatasetDescriptions
    import CGSAnnotationsSettings

    iniFile = (settings.baseFolder + "/Datasets/Datasets_descriptions/"
               + genome + "_unix_dnasequence.ini")
    seqDataset = readDatasetDescriptions.readDataset(["dnasequence", iniFile, "./"])
    seqDataset.init()

    cgsAS = CGSAnnotationsSettings.CGSAnnotationsSettings("dnasequence", genome, {}, {})
    defaultSettings = seqDataset.getDefaultAnnotationSettings()
    cgsAS.addFeatureDataset({seqDataset.datasetSimpleName: defaultSettings})
    print(seqDataset.chromFiles)

    datasetName = seqDataset.datasetSimpleName
    keptRegions = []
    for region in ds:
        chromID = utilities.convertChromToInt(genome, region[0])
        # The per-dataset state is looked up afresh each time because
        # loadChromosome() updates it.
        if cgsAS.featuresDatasets[datasetName]["currentLoadedChromosome"] != chromID:
            print("New chromosome %s" % (region[0],))
            seqDataset.loadChromosome(chromID, cgsAS)
        chromSeq = cgsAS.featuresDatasets[datasetName]["currentLoadedChromosomeSeq"]
        regionSeq = chromSeq[region[1]:region[2]]
        nnShare = regionSeq.count("N") / float(region[2] - region[1])
        if nnShare <= maxNNContent:
            keptRegions.append(region)
    return keptRegions
def test_Fixed_Region_Full(self):
    # Manual inspection of the overlap ratio of one fixed region (ID 42469)
    # for the "H3K4me3" dataset: the region is read from the `regions`
    # table, overlapping intervals are looked up in the dataset's interval
    # array, reduced, and the covered fraction of the region is printed.
    # Nothing is asserted; the unconditional self.fail() runs first.
    # NOTE(review): presumably the test was disabled on purpose via fail() -
    # confirm before removing that call.
    # (Plain comments instead of a docstring so that unittest's verbose
    # description of this test stays unchanged.)
    self.fail()

    regionID = 42469
    markName = "H3K4me3"
    self.chmodName = markName
    datasetSpec = [
        "chrommod_" + markName,
        "hg18_Barski_chrommod_" + markName + ".ini",
        "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/",
    ]
    dataset = readDatasetDescriptions.readDataset(datasetSpec)
    self.chrommod = dataset
    dataset.datasetCollectionName = "main"
    dataset.openDBConnections()

    # Fetch the coordinates of the region under inspection.
    regionQuery = "SELECT regionID,chrom,start,stop FROM regions WHERE regionID=" + str(regionID)
    dataset.cR.execute(regionQuery)
    row = list(dataset.cR.fetchone())
    chrom = row[1]
    start = row[2]
    stop = row[3]
    print("%s %s %s" % (chrom, start, stop))

    # Look up the overlapping intervals and print every intermediate result.
    dataset.initializeIntervalArray()
    overlaps = dataset.intervalArray.findTwoDimentionalWithAdditionalInfo(
        chrom, start, stop, 0, 1)
    print(overlaps)
    mergedOverlaps = utilities.gr_reduceRegionSet(list(overlaps))
    print(mergedOverlaps)
    coveredLength = utilities.gr_Coverage(mergedOverlaps, start, stop)
    print(coveredLength / float(stop - start))
def oneRegionTest(self, regionID, chrommod):
    """Compare the stored overlap values of one region with the Oracle data.

    The dataset description for ``chrommod`` is loaded, the region's
    coordinates are read from the ``regions`` table and its stored
    ``overlap_ratio`` / ``overlap_count`` from the ``<datasetSimpleName>_data``
    table. The Oracle table is then queried for all intervals of the same
    chromosome and modification with ``chromstart <= stop`` and
    ``chromend > start``; their number must equal the stored count and the
    covered fraction of the region must equal the stored ratio.

    Args:
        regionID: ID of the region to check (used in both local queries).
        chrommod: modification name; also stored as ``self.chmodName``.

    The Oracle cursor and connection are closed before the assertions run,
    also when the query raises.
    """
    self.chmodName = chrommod
    self.chrommod = readDatasetDescriptions.readDataset([
        "chrommod_" + self.chmodName,
        "hg18_Barski_chrommod_" + self.chmodName + ".ini",
        "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/",
    ])
    self.chrommod.datasetCollectionName = "main"
    self.chrommod.openDBConnections()

    # Region coordinates from the local regions database.
    self.chrommod.cR.execute(
        "SELECT regionID,chrom,start,stop FROM regions WHERE regionID=" + str(regionID))
    regionData = list(self.chrommod.cR.fetchone())
    start = regionData[2]
    stop = regionData[3]
    print("%s %s" % (regionID, regionData))

    # Values previously computed and stored for this region.
    sqlQuery = ("SELECT overlap_ratio, overlap_count FROM "
                + self.chrommod.datasetSimpleName
                + "_data WHERE regionID = " + str(regionID))
    self.chrommod.cD.execute(sqlQuery)
    regionResult = list(self.chrommod.cD.fetchone())
    print("%s %s" % (regionID, regionResult))

    chromName = str(utilities.convertIntToChrom(self.genome, regionData[1]))
    oracleSQLQuery = ("SELECT chromstart, chromend FROM hg18_EPIGRAPH_#romatin WHERE chrom = '"
                      + chromName
                      + "' AND chromstart <= " + str(stop)
                      + " AND chromend > " + str(start)
                      + " AND chrommod = '" + self.chmodName
                      + "' ORDER BY chromstart")

    # NOTE(review): credentials are hard-coded in the test source.
    dbcon = cx_Oracle.connect("epigraph_admin", "epigraph123", "bioinfo")
    try:
        dbcur = dbcon.cursor()
        try:
            dbcur.execute(oracleSQLQuery)
            oracleD = [list(row) for row in dbcur.fetchall()]
        finally:
            dbcur.close()
    finally:
        # Previously the connection was left open after the test.
        dbcon.close()
    print("%s %s" % (regionID, oracleD))

    # A copy is handed to the reducer so len(oracleD) below still counts
    # the raw (unreduced) intervals.
    reducedGR = utilities.gr_reduceRegionSet(list(oracleD))
    oracle_overlap_ratio = utilities.gr_Coverage(reducedGR, start, stop) / float(stop - start)
    self.assertEqual(len(oracleD), regionResult[1])
    self.assertEqual(regionResult[0], oracle_overlap_ratio)
def mainTest(self, chrommod, n):
    """Cross-check stored overlap values of random regions against Oracle.

    ``n`` region IDs are drawn at random (duplicates collapse, so fewer
    than ``n`` distinct regions may be checked). For each one the stored
    ``overlap_ratio`` / ``overlap_count`` are read from the
    ``<datasetSimpleName>_data`` table (both taken as 0 when the region has
    no row) and compared with the values recomputed from the Oracle
    intervals that satisfy ``chromstart <= stop`` and ``chromend > start``.

    Args:
        chrommod: modification name; also stored as ``self.chmodName``.
        n: number of random draws.

    Raises:
        Exception: a drawn ID does not match the regionID of the row at
            position ``ID - 1`` of the ordered ``regions`` table.
    """
    self.chmodName = chrommod
    self.chrommod = readDatasetDescriptions.readDataset([
        "chrommod_" + self.chmodName,
        "hg18_Barski_chrommod_" + self.chmodName + ".ini",
        "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/",
    ])
    self.chrommod.datasetCollectionName = "main"

    # load n random regions
    self.chrommod.openDBConnections()
    self.chrommod.cR.execute("SELECT COUNT(*) FROM regions")
    nRows = self.chrommod.cR.fetchone()[0]
    print(nRows)
    self.chrommod.cR.execute(
        "SELECT regionID,chrom,start,stop,datasetID FROM regions ORDER BY regionID")
    dR = self.chrommod.cR.fetchall()

    selectedRegionsData = {}
    for _ in range(n):
        # The lookup dR[regionID - 1] treats IDs as 1..nRows, so draw from
        # exactly that range. The former randint(0, nRows - 1) could yield
        # 0 (which always tripped the consistency check below via dR[-1])
        # and could never yield the last region.
        regionID = random.randint(1, nRows)
        row = dR[regionID - 1]
        if regionID != row[0]:
            print("%s %s" % (regionID, row))
            raise Exception(
                "regionID %s does not match row %s" % (regionID, row))
        # (chrom, start, stop, datasetID)
        selectedRegionsData[regionID] = row[1:]
    selectedRegionIDs = sorted(selectedRegionsData)
    print(selectedRegionIDs)

    # NOTE(review): credentials are hard-coded in the test source.
    dbcon = cx_Oracle.connect("epigraph_admin", "epigraph123", "bioinfo")
    try:
        dbcur = dbcon.cursor()
        try:
            for regionID in selectedRegionIDs:
                sqlQuery = ("SELECT overlap_ratio, overlap_count FROM "
                            + self.chrommod.datasetSimpleName
                            + "_data WHERE regionID = " + str(regionID))
                self.chrommod.cD.execute(sqlQuery)
                stored = self.chrommod.cD.fetchone()
                if stored is None:
                    # No stored row for this region: treat as no overlap.
                    overlap_ratio = 0
                    overlap_count = 0
                else:
                    overlap_ratio = stored[0]
                    overlap_count = stored[1]

                chromID = selectedRegionsData[regionID][0]
                start = selectedRegionsData[regionID][1]
                stop = selectedRegionsData[regionID][2]
                print("%s %s %s" % (regionID, overlap_ratio, overlap_count))

                oracleSQLQuery = ("SELECT chromstart, chromend FROM hg18_EPIGRAPH_#romatin WHERE chrom = '"
                                  + str(utilities.convertIntToChrom(self.genome, chromID))
                                  + "' AND chromstart <= " + str(stop)
                                  + " AND chromend > " + str(start)
                                  + " AND chrommod = '" + self.chmodName
                                  + "' ORDER BY chromstart")
                print(oracleSQLQuery)
                dbcur.execute(oracleSQLQuery)
                oracleD = [list(interval) for interval in dbcur.fetchall()]

                self.assertEqual(overlap_count, len(oracleD))
                reducedGR = utilities.gr_reduceRegionSet(list(oracleD))
                oracle_overlap_ratio = (utilities.gr_Coverage(reducedGR, start, stop)
                                        / float(stop - start))
                self.assertEqual(overlap_ratio, oracle_overlap_ratio)
        finally:
            dbcur.close()
    finally:
        # Release the Oracle connection even when an assertion fails.
        dbcon.close()
def preprocessDownloadedDataset(self):
    """Generate the tiling regions of this dataset and store them locally.

    Returns right after logging when ``hasPreprocessedFile()`` is true.
    Otherwise tiling regions are produced for every size/step pair listed
    in ``tilingRegionSizes`` / ``tilingRegionSteps`` and the combined list
    is handed to ``__init_regions_dataset_for_local_db__``. The regions
    being tiled depend on ``coverage``:

    * "genome": one ``[c, 0, settings.genomeData[genome][c]]`` entry per
      key of ``settings.genomeDataStr[genome]``;
    * "around dataset regions": the regions of the dataset named by
      ``aroundDataset``, widened by ``datasetShore`` on both sides, passed
      through ``getCorrectedCoordinates`` and merged with
      ``dataset_methods.mergeOverlappingMax``.

    When ``maxNNRatio`` is greater than 0 every tiling set is filtered with
    ``dataset_methods.filterMinNNContent`` before it is collected.

    Side effects: ``maxNNRatio``, ``tilingRegionSizes``,
    ``tilingRegionSteps`` and (second coverage only) ``datasetShore`` are
    replaced by their parsed values; the second coverage also writes the
    raw region array to ``settings.baseFolder + "temp.txt"``.

    Raises:
        GDMException: ``coverage`` is not one of the two supported values.
    """
    if self.hasPreprocessedFile():
        # what to do if the data was already preprocessed
        extext = self.datasetSimpleName + ": the dataset was already preprocessed in " + self.binaryFile
        log(extext)
        return

    self.maxNNRatio = float(self.maxNNRatio)
    if self.coverage not in ("genome", "around dataset regions"):
        raise GDMException("Invalid coverage " + self.coverage)
    self.tilingRegionSizes = self.tilingRegionSizes.split(",")
    self.tilingRegionSteps = self.tilingRegionSteps.split(",")

    # Step 1: determine the regions that will be tiled.
    if self.coverage == "genome":
        baseRegions = [[c, 0, settings.genomeData[self.genome][c]]
                       for c in settings.genomeDataStr[self.genome].keys()]
    else:
        datasetParts = [part.strip() for part in self.aroundDataset.split(":")]
        # get the Main index file for the whole genome, Tiling regions are
        # never precomputed otherwise
        datasetParts.append(getMainDatasetIndexFileName(self.genome))
        aroundDataset = readDatasetDescriptions.readDataset(datasetParts)
        aroundDataset.init(False)
        datasetRegions = aroundDataset.getRegions()

        # Dump of the raw regions; the handle is closed even if write fails.
        f = open(settings.baseFolder + "temp.txt", "w")
        try:
            f.write(str(datasetRegions))
        finally:
            f.close()

        self.datasetShore = int(self.datasetShore)
        datasetRegionsExtended = []
        for i in xrange(datasetRegions.shape[0]):
            chromName = convertIntToChrom(self.genome, datasetRegions[i, 0])
            start = datasetRegions[i, 1] - self.datasetShore
            end = datasetRegions[i, 2] + self.datasetShore
            start, end = getCorrectedCoordinates(self.genome, chromName, start, end)
            datasetRegionsExtended.append([chromName, start, end])
        baseRegions = dataset_methods.mergeOverlappingMax(datasetRegionsExtended)

    # Step 2: tile the base regions once per size/step pair. Both coverages
    # share this loop; the second one used to pass the undefined name
    # `maxNNRatio` to the filter and failed with a NameError.
    dsFull = []
    for i in range(len(self.tilingRegionSizes)):
        tilingRegionSize = int(self.tilingRegionSizes[i].strip())
        tilingRegionStep = int(self.tilingRegionSteps[i].strip())
        ds = dataset_methods.getTilingRegions(
            baseRegions, self.genome, tilingRegionSize, tilingRegionStep)
        if self.maxNNRatio > 0:
            ds = dataset_methods.filterMinNNContent(ds, self.genome, self.maxNNRatio)
        dsFull.extend(ds)

    self.__init_regions_dataset_for_local_db__(dsFull)