def test_Fixed_Region_Full(self):
        self.fail()
        regionID = 42469
        self.chmodName = "H3K4me3"
        self.chrommod = readDatasetDescriptions.readDataset([
            "chrommod_" + self.chmodName,
            "hg18_Barski_chrommod_" + self.chmodName + ".ini",
            "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/"
        ])
        self.chrommod.datasetCollectionName = "main"

        self.chrommod.openDBConnections()
        self.chrommod.cR.execute(
            "SELECT regionID,chrom,start,stop FROM regions WHERE regionID=" +
            str(regionID))
        regionData = list(self.chrommod.cR.fetchone())
        chrom = regionData[1]
        start = regionData[2]
        stop = regionData[3]
        print chrom, start, stop
        self.chrommod.initializeIntervalArray()
        overlapingRegionsInterval = self.chrommod.intervalArray.findTwoDimentionalWithAdditionalInfo(
            chrom, start, stop, 0, 1)
        print overlapingRegionsInterval
        reducedGR = utilities.gr_reduceRegionSet(
            list(overlapingRegionsInterval))
        print reducedGR
        overlap_ratio = utilities.gr_Coverage(reducedGR, start,
                                              stop) / float(stop - start)
        print overlap_ratio
示例#2
0
 def oneRegionTest(self,regionID,chrommod):
     self.chmodName = chrommod
     self.chrommod = readDatasetDescriptions.readDataset(["chrommod_"+self.chmodName,
                                                     "hg18_Barski_chrommod_"+ self.chmodName +".ini",
                                                     "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/"])
     self.chrommod.datasetCollectionName = "main"        
     
     self.chrommod.openDBConnections()        
     self.chrommod.cR.execute("SELECT regionID,chrom,start,stop FROM regions WHERE regionID="+str(regionID))
     regionData = list(self.chrommod.cR.fetchone())
     start = regionData[2]
     stop = regionData[3]
     print regionID,regionData
     sqlQuery = "SELECT overlap_ratio, overlap_count FROM "+ self.chrommod.datasetSimpleName + "_data WHERE regionID = "+str(regionID)
     self.chrommod.cD.execute(sqlQuery)
     regionResult = list(self.chrommod.cD.fetchone())
     print regionID,regionResult
     dbcon = cx_Oracle.connect("epigraph_admin", "epigraph123", "bioinfo")
     dbcur = dbcon.cursor()
     oracleSQLQuery = "SELECT chromstart, chromend FROM hg18_EPIGRAPH_#romatin WHERE chrom = '"+str(utilities.convertIntToChrom(self.genome,regionData[1]))+"' AND chromstart <= "+str(stop)+" AND chromend > "+str(start)+" AND chrommod = '"+self.chmodName+"' ORDER BY chromstart"
     dbcur.execute(oracleSQLQuery)
     oracleD = map(list,list(dbcur.fetchall()))
     print regionID,oracleD
     reducedGR = utilities.gr_reduceRegionSet(list(oracleD))
     oracle_overlap_ratio = utilities.gr_Coverage(reducedGR, start, stop) / float(stop - start)
     self.assertEqual(len(oracleD),regionResult[1])
     self.assertEqual(regionResult[0],oracle_overlap_ratio)
示例#3
0
    def preprocessDownloadedDataset(self):
        """Compute the tiling regions of this dataset and pass them to the local DB initialiser.

        When hasPreprocessedFile() is true the method only logs a message and
        returns. Otherwise the comma-separated tiling sizes and steps are
        paired by position and, for every pair, tiling regions are generated
        with dataset_methods.getTilingRegions over either
          * one region per chromosome key of self.genome (coverage "genome"), or
          * the regions of the dataset named by self.aroundDataset, each
            extended by self.datasetShore on both sides, corrected with
            getCorrectedCoordinates and combined by
            dataset_methods.mergeOverlappingMax
            (coverage "around dataset regions").
        If self.maxNNRatio is positive, each batch of tiles is filtered with
        dataset_methods.filterMinNNContent before being collected.

        Side effects: self.maxNNRatio becomes a float, self.tilingRegionSizes
        and self.tilingRegionSteps become lists of strings, self.datasetShore
        becomes an int and, for the "around dataset regions" coverage,
        str(datasetRegions) is written to settings.baseFolder + "temp.txt".

        Raises GDMException if self.coverage is not one of the two supported
        values.
        """
        if self.hasPreprocessedFile():
            #what to do if the data was already preprocessed
            extext = self.datasetSimpleName + ": the dataset was already preprocessed in "+self.binaryFile
            log(extext)
            return
        self.maxNNRatio = float(self.maxNNRatio)
        if self.coverage not in ("genome","around dataset regions"):
            raise GDMException("Invalid coverage "+self.coverage)
        self.tilingRegionSizes = self.tilingRegionSizes.split(",")
        self.tilingRegionSteps = self.tilingRegionSteps.split(",")

        if self.coverage == "genome":
            # One [chrom, 0, value] entry per chromosome key; the value is
            # presumably the chromosome length -- confirm against settings.
            baseRegions = [[c,0,settings.genomeData[self.genome][c]] for c in settings.genomeDataStr[self.genome].keys()]
        else:
            datasetParts = [part.strip() for part in self.aroundDataset.split(":")]
            # get the Main index file for the whole genome, Tiling regions are never precomputed otherwise
            datasetParts.append(getMainDatasetIndexFileName(self.genome))
            aroundDataset = readDatasetDescriptions.readDataset(datasetParts)
            aroundDataset.init(False)
            datasetRegions = aroundDataset.getRegions()
            # Dump the regions to a temp file; close the handle even if the
            # write fails.
            f = open(settings.baseFolder+"temp.txt","w")
            try:
                f.write(str(datasetRegions))
            finally:
                f.close()
            self.datasetShore = int(self.datasetShore)
            datasetRegionsExtended = []
            for i in xrange(datasetRegions.shape[0]):
                chromName = convertIntToChrom(self.genome,datasetRegions[i,0])
                start = datasetRegions[i,1]-self.datasetShore
                end = datasetRegions[i,2]+self.datasetShore
                start,end = getCorrectedCoordinates(self.genome,chromName,start,end)
                datasetRegionsExtended.append([chromName,start,end])
            baseRegions = dataset_methods.mergeOverlappingMax(datasetRegionsExtended)

        dsFull = []
        # Sizes and steps are matched by index on purpose: a missing step
        # still raises IndexError instead of being silently skipped.
        for i, sizeText in enumerate(self.tilingRegionSizes):
            tilingRegionSize = int(sizeText.strip())
            tilingRegionStep = int(self.tilingRegionSteps[i].strip())
            ds = dataset_methods.getTilingRegions(baseRegions,self.genome,tilingRegionSize,tilingRegionStep)
            if self.maxNNRatio > 0:
                # Both coverages must use the instance attribute; a bare
                # maxNNRatio name is not defined in this scope.
                ds = dataset_methods.filterMinNNContent(ds,self.genome,self.maxNNRatio)
            dsFull.extend(ds)

        self.__init_regions_dataset_for_local_db__(dsFull)
示例#4
0
 def mainTest(self,chrommod,n):
     self.chmodName = chrommod        
     self.chrommod = readDatasetDescriptions.readDataset(["chrommod_"+self.chmodName,
                                                     "hg18_Barski_chrommod_"+ self.chmodName +".ini",
                                                     "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/"])
     self.chrommod.datasetCollectionName = "main"
     
     # load n random regions
     self.chrommod.openDBConnections()
     self.chrommod.cR.execute("SELECT COUNT(*) FROM regions")
     nRows = self.chrommod.cR.fetchone()[0]
     print nRows
     self.chrommod.cR.execute("SELECT regionID,chrom,start,stop,datasetID FROM regions ORDER BY regionID")
     dR = self.chrommod.cR.fetchall()
     selectedRegionIDs = []
     selectedRegionsData = {}
     for i in range(n):
         id  = random.randint(0,nRows-1)
         selectedRegionIDs.append(id)
         if id != dR[id-1][0]:
             print id, dR[id-1]
             raise Exception                
         selectedRegionsData[id] = dR[id-1][1:]
         
     selectedRegionIDs = list(set(selectedRegionIDs))
     selectedRegionIDs.sort()
     print selectedRegionIDs
     dbcon = cx_Oracle.connect("epigraph_admin", "epigraph123", "bioinfo")
     dbcur = dbcon.cursor()
     count = 0
     for regionID in selectedRegionIDs:
         sqlQuery = "SELECT overlap_ratio, overlap_count FROM "+ self.chrommod.datasetSimpleName + "_data WHERE regionID = "+str(regionID)
         self.chrommod.cD.execute(sqlQuery)
         try:
             regionData = list(self.chrommod.cD.fetchone())
             overlap_ratio = regionData[0]  
             overlap_count = regionData[1]
         except TypeError,ex:
             overlap_ratio = 0
             overlap_count  = 0                
         start = selectedRegionsData[regionID][1]
         stop = selectedRegionsData[regionID][2]
         print regionID,overlap_ratio,overlap_count
         oracleSQLQuery = "SELECT chromstart, chromend FROM hg18_EPIGRAPH_#romatin WHERE chrom = '"+str(utilities.convertIntToChrom(self.genome,selectedRegionsData[regionID][0]))+"' AND chromstart <= "+str(stop)+" AND chromend > "+str(start)+" AND chrommod = '"+self.chmodName+"' ORDER BY chromstart"
         print oracleSQLQuery
         dbcur.execute(oracleSQLQuery)
         oracleD = map(list,list(dbcur.fetchall()))
         #print oracleD
         self.assertEqual(overlap_count,len(oracleD))
         reducedGR = utilities.gr_reduceRegionSet(list(oracleD))
         oracle_overlap_ratio = utilities.gr_Coverage(reducedGR, start, stop) / float(stop - start)
         self.assertEqual(overlap_ratio,oracle_overlap_ratio)        
def filterMinNNContent(ds,genome,maxNNContent):
    import readDatasetDescriptions
    import CGSAnnotationsSettings
    seqDataset = readDatasetDescriptions.readDataset(["dnasequence",settings.baseFolder+"/Datasets/Datasets_descriptions/"+genome+"_unix_dnasequence.ini","./"])
    seqDataset.init()    
    cgsAS = CGSAnnotationsSettings.CGSAnnotationsSettings("dnasequence",genome,{},{})
    cgsAS.addFeatureDataset({seqDataset.datasetSimpleName:seqDataset.getDefaultAnnotationSettings()}) 
    print seqDataset.chromFiles    
    newDS = []
    for region in ds:
        if cgsAS.featuresDatasets[seqDataset.datasetSimpleName]["currentLoadedChromosome"] != utilities.convertChromToInt(genome,region[0]):
            print "New chromosome",region[0]
            seqDataset.loadChromosome(utilities.convertChromToInt(genome,region[0]),cgsAS)
        o = cgsAS.featuresDatasets[seqDataset.datasetSimpleName]["currentLoadedChromosomeSeq"][region[1]:region[2]].count("N")/float(region[2]-region[1])
        if o <= maxNNContent:
            newDS.append(region)
    return newDS
示例#6
0
 def test_Fixed_Region_Full(self):
     self.fail()
     regionID = 42469         
     self.chmodName = "H3K4me3"
     self.chrommod = readDatasetDescriptions.readDataset(["chrommod_"+self.chmodName,
                                                     "hg18_Barski_chrommod_"+ self.chmodName +".ini",
                                                     "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/"])
     self.chrommod.datasetCollectionName = "main"   
     
     self.chrommod.openDBConnections()        
     self.chrommod.cR.execute("SELECT regionID,chrom,start,stop FROM regions WHERE regionID="+str(regionID))
     regionData = list(self.chrommod.cR.fetchone())
     chrom = regionData[1]
     start = regionData[2]
     stop = regionData[3]
     print chrom,start,stop        
     self.chrommod.initializeIntervalArray()
     overlapingRegionsInterval = self.chrommod.intervalArray.findTwoDimentionalWithAdditionalInfo(chrom, start, stop,0,1)
     print overlapingRegionsInterval
     reducedGR = utilities.gr_reduceRegionSet(list(overlapingRegionsInterval))
     print reducedGR
     overlap_ratio = utilities.gr_Coverage(reducedGR, start, stop) / float(stop - start)
     print overlap_ratio
    def oneRegionTest(self, regionID, chrommod):
        self.chmodName = chrommod
        self.chrommod = readDatasetDescriptions.readDataset([
            "chrommod_" + self.chmodName,
            "hg18_Barski_chrommod_" + self.chmodName + ".ini",
            "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/"
        ])
        self.chrommod.datasetCollectionName = "main"

        self.chrommod.openDBConnections()
        self.chrommod.cR.execute(
            "SELECT regionID,chrom,start,stop FROM regions WHERE regionID=" +
            str(regionID))
        regionData = list(self.chrommod.cR.fetchone())
        start = regionData[2]
        stop = regionData[3]
        print regionID, regionData
        sqlQuery = "SELECT overlap_ratio, overlap_count FROM " + self.chrommod.datasetSimpleName + "_data WHERE regionID = " + str(
            regionID)
        self.chrommod.cD.execute(sqlQuery)
        regionResult = list(self.chrommod.cD.fetchone())
        print regionID, regionResult
        dbcon = cx_Oracle.connect("epigraph_admin", "epigraph123", "bioinfo")
        dbcur = dbcon.cursor()
        oracleSQLQuery = "SELECT chromstart, chromend FROM hg18_EPIGRAPH_#romatin WHERE chrom = '" + str(
            utilities.convertIntToChrom(self.genome, regionData[1])
        ) + "' AND chromstart <= " + str(stop) + " AND chromend > " + str(
            start
        ) + " AND chrommod = '" + self.chmodName + "' ORDER BY chromstart"
        dbcur.execute(oracleSQLQuery)
        oracleD = map(list, list(dbcur.fetchall()))
        print regionID, oracleD
        reducedGR = utilities.gr_reduceRegionSet(list(oracleD))
        oracle_overlap_ratio = utilities.gr_Coverage(
            reducedGR, start, stop) / float(stop - start)
        self.assertEqual(len(oracleD), regionResult[1])
        self.assertEqual(regionResult[0], oracle_overlap_ratio)
    def mainTest(self, chrommod, n):
        self.chmodName = chrommod
        self.chrommod = readDatasetDescriptions.readDataset([
            "chrommod_" + self.chmodName,
            "hg18_Barski_chrommod_" + self.chmodName + ".ini",
            "D:/Projects/Integrated_Genome_Profile_Search_Engine/Cosgen/Datasets/"
        ])
        self.chrommod.datasetCollectionName = "main"

        # load n random regions
        self.chrommod.openDBConnections()
        self.chrommod.cR.execute("SELECT COUNT(*) FROM regions")
        nRows = self.chrommod.cR.fetchone()[0]
        print nRows
        self.chrommod.cR.execute(
            "SELECT regionID,chrom,start,stop,datasetID FROM regions ORDER BY regionID"
        )
        dR = self.chrommod.cR.fetchall()
        selectedRegionIDs = []
        selectedRegionsData = {}
        for i in range(n):
            id = random.randint(0, nRows - 1)
            selectedRegionIDs.append(id)
            if id != dR[id - 1][0]:
                print id, dR[id - 1]
                raise Exception
            selectedRegionsData[id] = dR[id - 1][1:]

        selectedRegionIDs = list(set(selectedRegionIDs))
        selectedRegionIDs.sort()
        print selectedRegionIDs
        dbcon = cx_Oracle.connect("epigraph_admin", "epigraph123", "bioinfo")
        dbcur = dbcon.cursor()
        count = 0
        for regionID in selectedRegionIDs:
            sqlQuery = "SELECT overlap_ratio, overlap_count FROM " + self.chrommod.datasetSimpleName + "_data WHERE regionID = " + str(
                regionID)
            self.chrommod.cD.execute(sqlQuery)
            try:
                regionData = list(self.chrommod.cD.fetchone())
                overlap_ratio = regionData[0]
                overlap_count = regionData[1]
            except TypeError, ex:
                overlap_ratio = 0
                overlap_count = 0
            start = selectedRegionsData[regionID][1]
            stop = selectedRegionsData[regionID][2]
            print regionID, overlap_ratio, overlap_count
            oracleSQLQuery = "SELECT chromstart, chromend FROM hg18_EPIGRAPH_#romatin WHERE chrom = '" + str(
                utilities.convertIntToChrom(self.genome,
                                            selectedRegionsData[regionID][0])
            ) + "' AND chromstart <= " + str(stop) + " AND chromend > " + str(
                start
            ) + " AND chrommod = '" + self.chmodName + "' ORDER BY chromstart"
            print oracleSQLQuery
            dbcur.execute(oracleSQLQuery)
            oracleD = map(list, list(dbcur.fetchall()))
            #print oracleD
            self.assertEqual(overlap_count, len(oracleD))
            reducedGR = utilities.gr_reduceRegionSet(list(oracleD))
            oracle_overlap_ratio = utilities.gr_Coverage(
                reducedGR, start, stop) / float(stop - start)
            self.assertEqual(overlap_ratio, oracle_overlap_ratio)
    def preprocessDownloadedDataset(self):
        """Compute the tiling regions of this dataset and pass them to the local DB initialiser.

        When hasPreprocessedFile() is true the method only logs a message and
        returns. Otherwise the comma-separated tiling sizes and steps are
        paired by position and, for every pair, tiling regions are generated
        with dataset_methods.getTilingRegions over either
          * one region per chromosome key of self.genome (coverage "genome"), or
          * the regions of the dataset named by self.aroundDataset, each
            extended by self.datasetShore on both sides, corrected with
            getCorrectedCoordinates and combined by
            dataset_methods.mergeOverlappingMax
            (coverage "around dataset regions").
        If self.maxNNRatio is positive, each batch of tiles is filtered with
        dataset_methods.filterMinNNContent before being collected.

        Side effects: self.maxNNRatio becomes a float, self.tilingRegionSizes
        and self.tilingRegionSteps become lists of strings, self.datasetShore
        becomes an int and, for the "around dataset regions" coverage,
        str(datasetRegions) is written to settings.baseFolder + "temp.txt".

        Raises GDMException if self.coverage is not one of the two supported
        values.
        """
        if self.hasPreprocessedFile():
            #what to do if the data was already preprocessed
            extext = self.datasetSimpleName + ": the dataset was already preprocessed in " + self.binaryFile
            log(extext)
            return
        self.maxNNRatio = float(self.maxNNRatio)
        if self.coverage not in ("genome", "around dataset regions"):
            raise GDMException("Invalid coverage " + self.coverage)
        self.tilingRegionSizes = self.tilingRegionSizes.split(",")
        self.tilingRegionSteps = self.tilingRegionSteps.split(",")

        if self.coverage == "genome":
            # One [chrom, 0, value] entry per chromosome key; the value is
            # presumably the chromosome length -- confirm against settings.
            baseRegions = [[
                c, 0, settings.genomeData[self.genome][c]
            ] for c in settings.genomeDataStr[self.genome].keys()]
        else:
            datasetParts = [
                part.strip() for part in self.aroundDataset.split(":")
            ]
            # get the Main index file for the whole genome, Tiling regions are never precomputed otherwise
            datasetParts.append(getMainDatasetIndexFileName(self.genome))
            aroundDataset = readDatasetDescriptions.readDataset(datasetParts)
            aroundDataset.init(False)
            datasetRegions = aroundDataset.getRegions()
            # Dump the regions to a temp file; close the handle even if the
            # write fails.
            f = open(settings.baseFolder + "temp.txt", "w")
            try:
                f.write(str(datasetRegions))
            finally:
                f.close()
            self.datasetShore = int(self.datasetShore)
            datasetRegionsExtended = []
            for i in xrange(datasetRegions.shape[0]):
                chromName = convertIntToChrom(self.genome,
                                              datasetRegions[i, 0])
                start = datasetRegions[i, 1] - self.datasetShore
                end = datasetRegions[i, 2] + self.datasetShore
                start, end = getCorrectedCoordinates(self.genome, chromName,
                                                     start, end)
                datasetRegionsExtended.append([chromName, start, end])
            baseRegions = dataset_methods.mergeOverlappingMax(
                datasetRegionsExtended)

        dsFull = []
        # Sizes and steps are matched by index on purpose: a missing step
        # still raises IndexError instead of being silently skipped.
        for i, sizeText in enumerate(self.tilingRegionSizes):
            tilingRegionSize = int(sizeText.strip())
            tilingRegionStep = int(self.tilingRegionSteps[i].strip())
            ds = dataset_methods.getTilingRegions(baseRegions, self.genome,
                                                  tilingRegionSize,
                                                  tilingRegionStep)
            if self.maxNNRatio > 0:
                # Both coverages must use the instance attribute; a bare
                # maxNNRatio name is not defined in this scope.
                ds = dataset_methods.filterMinNNContent(
                    ds, self.genome, self.maxNNRatio)
            dsFull.extend(ds)

        self.__init_regions_dataset_for_local_db__(dsFull)