)  # a prefix preceding .fastq.gz, which will be used to distinguish side 1 and side 2

genomeName = "hg19"
#genomeName = "/home/ubuntu/data3/HiCdata/DM_DpnII_Oct18/dm3"
threads = 8
bowtieIndex = "/home/ubuntu/rDNA_data/index/hg19/hg19mannual"
#bowtiePath = "/usr/bin/bowtie2"
bowtiePath = "/home/ubuntu/tools/miniconda2/envs/main/bin/bowtie2"
bowtieFlags = "--very-sensitive"
#bowtieFlags = "-D 20 -R 3 -N 0 -L 7 -i S,1,0.10"
bowtieFlags = "--very-sensitive --n-ceil L,0,0.1"  # overrides the assignment above
seqSkipStart = 0  # skip the first N bp of each read, if you want (0 = keep full reads)
#minMapLen = 25  # start mapping at this length

genome_db = getGenome(genomeName)

if not os.path.exists(bowtiePath):
    raise IOError("bowtie2 binary not found: {0}".format(bowtiePath))

if mode == "sra":
    iterList = 2 * GEOids
elif mode == "fastq":
    iterList = sorted(os.listdir(inFastqDir))
print(iterList)

for i in iterList:
    if mode == "sra":
        sraNum = i
        expName = "SRR{0}".format(i)
        i = expName
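# --- Hedged sketch (assumption, not part of the original script) -------------
# Shows how the settings above would typically feed hiclib's iterative mapping
# for one read side. The fastq/bam paths are hypothetical placeholders, and the
# keyword set assumes hiclib's mapping.iterative_mapping API. Defined but not
# called, so it does not change what this script does.
def _example_map_one_side(fastqPath="fastq/sample_side1.fastq.gz",
                          outBamPath="sams/sample_side1.bam"):
    from hiclib import mapping
    mapping.iterative_mapping(
        bowtie_path=bowtiePath,
        bowtie_index_path=bowtieIndex,
        fastq_path=fastqPath,        # hypothetical input
        out_sam_path=outBamPath,     # hypothetical output
        min_seq_len=25,              # mirrors the commented-out minMapLen above
        len_step=5,                  # extend truncated reads by 5 bp per round
        seq_start=seqSkipStart,
        nthreads=threads,
        bowtie_flags=bowtieFlags,
        temp_dir="tmp",
    )
# ------------------------------------------------------------------------------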
def refineDataset(filenames, create=True, delete=False, parseInMemory=True):
    """
    Parameters
    ----------
    filenames[0] is a list of filenames of incoming files
    filenames[1] is a folder for outgoing file
    filenames[2] is a working genome, that is output directory
    filenames[3] is an enzyme for a given experiment

    create : bool, optional
        If True, parse each file.
        If False, assume that files were already parsed
        (e.g. if you are just playing around with filtering parameters)
    delete : bool, optional
        If True, delete parsed files after merging.
        Man, these files may be huge... if you don't have a 10TB RAID, this may be useful.
    parseInMemory : bool, optional
        Parse input files in memory.
    """
    in_files = filenames[0]
    out_file = filenames[1]
    statFolder = os.path.join("statistics", out_file)
    workingGenome = filenames[2]
    enzyme = filenames[3]

    if create == True:  # if we need to parse the input files (.hdf5 from mapping)

        def parse_onename(onename):
            np.random.seed()
            # Parsing individual files
            if parseInMemory == True:
                finalname = onename + "_parsed.frag"
                #if not os.path.exists(finalname):
                if True:
                    # create dataset in memory, parse and then save to destination
                    TR = HiCdataset("bla" + str(np.random.randint(100000000000)),
                                    genome=getGenome(workingGenome),
                                    maximumMoleculeLength=500,
                                    enzymeName=enzyme,
                                    tmpFolder="tmp",
                                    inMemory=True)  # remove inMemory if you don't have enough RAM

                    TR.parseInputData(dictLike=onename)
                    folder = os.path.split(onename)[0]
                    print(onename)
                    TR.save(ensure(finalname))
                    folder, fname = os.path.split(onename)
                    statSubFolder = os.path.join(statFolder, folder)

                    TR.printMetadata(saveTo=ensure(
                        os.path.join(statSubFolder, fname + ".stat")))
                else:
                    print("skipping parsed: ", onename)
            else:
                # Create dataset at destination, parse on HDD, then no need to save.
                TR = HiCdataset(ensure(onename + "_parsed.frag"),
                                genome=getGenome(workingGenome),
                                enzymeName=enzyme,
                                tmpFolder="tmp",
                                maximumMoleculeLength=500,
                                mode='w')
                TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
                TR.printMetadata(saveTo=ensure(
                    os.path.join(statFolder, onename + ".stat")))

        list(map(parse_onename, in_files))

        "Merging files all together, applying filters"
        TR = HiCdataset(ensure(out_file + "_merged.frag"),
                        genome=getGenome(workingGenome),
                        enzymeName=enzyme,
                        tmpFolder="tmp",
                        dictToStoreIDs="h5dict",
                        mode="w")
        TR.merge([i + "_parsed.frag" for i in in_files])  # merge in all parsed files from one experiment

        if delete == True:  # cleaning up parsed files
            for delFile in [i + "_parsed.frag" for i in in_files]:
                os.remove(delFile)

        "Now opening new dataset for refined data, and performing all the filtering"
        TR = HiCdataset(out_file + "_refined.frag",
                        enzymeName=enzyme,
                        genome=getGenome(workingGenome),
                        tmpFolder="tmp",
                        dictToStoreIDs="h5dict",
                        mode='w')
        TR.load(out_file + "_merged.frag")

        # ----------------------------Set of filters applied ----------------
        TR.filterDuplicates()
        #TR.save(out_file+".dat")
        #TR.filterExtreme(cutH=0.0001, cutL=0)
        #TR.filterRsiteStart()
        #TR.filterLarge()
        TR.writeFilteringStats()
        TR.printMetadata(saveTo=statFolder + ".stat")
        # ------------------------End set of filters applied -----------------

    else:
        # If merging & filters have already been done, just load files
        TR = HiCdataset(out_file + "_working.frag",
                        enzymeName=enzyme,
                        mode='w',
                        genome=getGenome(workingGenome))
        TR.load(out_file + "_refined.frag")
        TR.printMetadata(saveTo=statFolder + ".stat")

    print("----->Building Raw heatmap at different resolutions")
    TR.printStats()
    for res in coolerResolutions:
        TR.saveCooler(out_file + ".{0}.cool".format(res), res)
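# --- Hedged usage sketch (all names below are hypothetical placeholders) -----
# refineDataset takes a 4-tuple: (list of mapped .hdf5 chunks, output prefix,
# working genome / output directory, enzyme name). Defined but not called here.
def _example_refine():
    refineDataset(
        (["hg19/sampleA-R1/chunk0001.hdf5",   # hypothetical mapped chunks
          "hg19/sampleA-R1/chunk0002.hdf5"],
         "hg19/sampleA-R1",   # prefix for *_merged.frag, *_refined.frag, *.cool
         "hg19",              # working genome, also the output directory
         "DpnII"),            # restriction enzyme of the experiment
        create=True,          # parse the chunks first
        delete=False,         # keep the per-chunk *_parsed.frag files
        parseInMemory=True,   # parse in RAM, then save to disk
    )
# ------------------------------------------------------------------------------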
pass

# Now merging different experiments all together.
# Note that the per-replica name column (i[1]) is not in the grouping key,
# so replicas of the same experiment are combined.
experiments = set([(i[0], i[2], i[3]) for i in combinedExperimentNames])
print(experiments)

for experiment in experiments:
    workingGenome = experiment[1]
    myExperimentNames = [i[1] + "_refined.frag" for i in combinedExperimentNames
                         if (i[0], i[2], i[3]) == (experiment[0], experiment[1], experiment[2])]
    assert len(myExperimentNames) > 0
    if len(myExperimentNames) > 0:
        # If we have more than one experiment (replica) for the same data, we can combine.
        TR = HiCdataset(os.path.join(workingGenome, "%s-all-%s_refined.frag" %
                                     (experiment[0], experiment[2])),
                        genome=getGenome(workingGenome),
                        enzymeName=experiment[2],
                        tmpFolder="tmp",
                        dictToStoreIDs="h5dict")
        statSaveName = os.path.join("statistics", workingGenome,
                                    "%s-all-%s_refined.stat" % (experiment[0], experiment[2]))

        TR.merge(myExperimentNames)
        TR.printMetadata(saveTo=statSaveName)
        for res in wholeGenomeResolutionsKb:
            TR.saveHeatmap(os.path.join(workingGenome, "%s-all-%s-{0}k.hm" %
                                        (experiment[0], experiment[2])).format(res), res * 1000)
        for res in byChromosomeResolutionsKb:
            TR.saveByChromosomeHeatmap(os.path.join(workingGenome, "%s-all-%s-{0}k.byChr" %
                                                    (experiment[0], experiment[2])).format(res), res * 1000)
        for res in HiResWithOverlapResolutionsKb:
            TR.saveHiResHeatmapWithOverlaps(os.path.join(workingGenome, "%s-all-%s-{0}k_HighRes.byChr" %
                                                         (experiment[0], experiment[2])).format(res), res * 1000)
        for res in SuperHiResWithOverlapResolutionsKb:
            TR.saveSuperHighResMapWithOverlaps(os.path.join(workingGenome, "%s-all-%s-{0}k_SuperHighRes.byChr" %
                                                            (experiment[0], experiment[2])).format(res), res * 1000)
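# --- Hedged illustration (assumed shape, not taken from the original script) --
# combinedExperimentNames is expected to hold 4-tuples of
# (experiment name, per-replica output prefix, working genome, enzyme).
# Because the grouping key above is (i[0], i[2], i[3]), the two replicas below
# would merge into a single "sampleA-all-DpnII" dataset.
_exampleCombinedExperimentNames = [
    ("sampleA", "hg19/sampleA-R1", "hg19", "DpnII"),
    ("sampleA", "hg19/sampleA-R2", "hg19", "DpnII"),
]
# ------------------------------------------------------------------------------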
    except:
        raise ValueError("Cannot create directory")
    return f


path = "/home/ubuntu/RNAdata/Golov/hic/mapped-hg19/35_AGAGGCCT_reads_merged_len_adj/"

#f1 = h5py.File(path + "chunk0001.hdf5", 'r+')  # superseded by the h5dict handle below
f1 = mirnylib.h5dict.h5dict(path + "chunk0001.hdf5", 'r+')

print(f1["misc"]["genome"]["idx2label"])
chrm_conversion_table = f1["misc"]["genome"]["idx2label"]

genome_db = getGenome("hg19")
workingGenome = "hg19"

for key in genome_db.idx2label:
    print(key, genome_db.idx2label[key])

print("Keys: %s" % f1.keys())
chrms1_key = list(f1.keys())[0]
chrms2_key = list(f1.keys())[1]
cuts1_key = list(f1.keys())[2]
cuts2_key = list(f1.keys())[3]
misc_key = list(f1.keys())[4]
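# --- Hedged reconstruction (assumption) ---------------------------------------
# The "except ... return f" fragment above is the tail of a directory-creating
# helper whose beginning is missing here. A minimal sketch of its conventional
# form, assuming the usual "make sure the parent directory of f exists" semantics:
def _ensure_sketch(f):
    d = os.path.dirname(f)
    if d and not os.path.isdir(d):
        try:
            os.makedirs(d)
        except OSError:
            raise ValueError("Cannot create directory")
    return f
# ------------------------------------------------------------------------------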