def generatecouplemat(pathTEcouple,repository,speciesname,endofthefile,chrsizepath,resolution):
    """Build the TE-couple matrix of a species and save it in MATLAB format.

    Each line of ``pathTEcouple`` is split on whitespace and its first two
    fields are used as TE names.  For every line a zero row vector of width
    ``sizedict["HicChrBeginchrX"]`` is created and passed twice through
    ``completetheline`` (once per TE of the couple).  All rows are stacked,
    in file order, and written with ``scipy.io.savemat`` under the key
    ``'mat'`` to ``repository + speciesname + ".TEcouplematrix"``.

    Args:
        pathTEcouple: path of the text file listing the TE couples.
        repository: path prefix, concatenated as-is with ``speciesname``.
        speciesname: species identifier used to build the output name.
        endofthefile: forwarded to ``generateTElistedposdict``.
        chrsizepath: path forwarded to ``utils.loadchrsizedict``.
        resolution: bin size; converted with ``int()`` because it may be
            given as a string (e.g. from the command line).

    Raises:
        ValueError: if ``pathTEcouple`` contains no line at all.
        IndexError: if a line has fewer than two fields.
    """
    resolution = int(resolution)  # cast of argv
    Matpath = repository + speciesname + ".TEcouplematrix"
    TEposdict = generateTElistedposdict(repository, speciesname, endofthefile, resolution)
    print(TEposdict.keys())
    sizedict = utils.loadchrsizedict(chrsizepath, resolution)
    # Number of columns of the matrix.
    # TODO (from the original author): maybe use "HicUsedTotalSize" here.
    matcolsize = sizedict["HicChrBeginchrX"]

    # Rows are collected in a list and stacked once at the end; calling
    # np.concatenate inside the loop copies the whole matrix for every row.
    rows = []
    with open(pathTEcouple, "r") as filecouple:
        for rownumber, line in enumerate(filecouple):
            ls = line.split()
            if rownumber > 0:
                # Progress counter starts at 0 on the second line of the file.
                print("===>line", rownumber - 1)
            # NOTE(review): the first couple of the file weights its second
            # TE with -1 while every following couple uses +1.  This is kept
            # exactly as found; confirm which of the two signs is intended.
            secondweight = -1 if rownumber == 0 else 1
            vi = np.zeros((1, matcolsize))
            vi = completetheline(ls[0], TEposdict[ls[0]], sizedict, vi, 1)
            vi = completetheline(ls[1], TEposdict[ls[1]], sizedict, vi, secondweight)
            rows.append(vi)

    if not rows:
        raise ValueError("no TE couple found in " + str(pathTEcouple))

    mat = np.concatenate(rows, axis=0)
    # savemat needs a dict: the key becomes the MATLAB variable name.
    scpio.savemat(Matpath, {'mat': mat})
def gccontentforonechr(fastarep,chrsizepath,species,GCoutrep,resolution):
    """Compute a binned GC vector over the chromosomes of a species.

    For every chromosome name in ``utils.dictchr[species]`` the file
    ``fastarep + name + ".fa"`` is read, its first line is skipped and the
    remaining lines are joined into one sequence.  The sequence is cut into
    consecutive windows of ``resolution`` characters and ``gcpercent`` is
    stored for each window in one shared vector, chromosome after
    chromosome.  The vector is saved both as a MATLAB file (key ``'mat'``)
    and through ``utils.savematrixasfilelist3``.

    Args:
        fastarep: path prefix of the FASTA files.
        chrsizepath: path forwarded to ``utils.loadchrsizedict``.
        species: key into ``utils.dictchr``.
        GCoutrep: path prefix of the two output files.
        resolution: window length; converted with ``int()``.
    """
    resolution = int(resolution)
    sizedict = utils.loadchrsizedict(chrsizepath, resolution)
    matcolsize = sizedict["HicUsedTotalSize"]
    mat = np.zeros(matcolsize)

    k = 0  # next free position in mat, shared by all chromosomes
    for chrname in utils.dictchr[species]:
        print(chrname)
        with open(fastarep + chrname + ".fa", "r") as fasta:
            fasta.readline()  # first line is skipped (FASTA header)
            s = "".join(line.replace("\n", "") for line in fasta)

        seqlen = len(s)
        if resolution > seqlen:
            # Sequence shorter than one window: score it as a single bin.
            mat[k] = gcpercent(s)
            k += 1
        # NOTE(review): a window is scored only when it ends strictly before
        # the end of the sequence, so the last window (partial, or ending
        # exactly on the last base) is not scored.  Kept as found because the
        # bin count presumably has to match sizedict -- confirm.
        for start in range(0, seqlen - resolution, resolution):
            mat[k] = gcpercent(s[start:start + resolution])
            k += 1

    outprefix = GCoutrep + str(resolution) + "pbGCvec"
    # savemat needs a dict: the key becomes the MATLAB variable name.
    scpio.savemat(outprefix + ".mat", {'mat': mat})
    utils.savematrixasfilelist3(mat, outprefix + ".csv")
def generateTErecurencematrixHicComparableForOneChr(repository,endofthefile,achr,resolution,chrsizepath,species):
    """Return the TE recurrence matrix of one chromosome, computing it if needed.

    The matrix is cached as
    ``repository + achr + endofthefile + "dumpmatHiCbin.dump.npy"``.  When
    that file exists it is loaded; otherwise the matrix is built, saved to
    that file, and four density outputs are produced on the way through
    ``transformHiClistofpatterninDensity``.  In both cases the matrix is
    then handed to ``utils.showamatwithcolorcode`` with the output name
    ``repository + achr + endofthefile + "recurenceplot.png"``.

    Building the matrix: entry ``[i, j]`` is
    ``distancebetweenunequallist3(setlist[i], setlist[j])``.  Only pairs
    with ``j >= i`` are evaluated and each value is mirrored to ``[j, i]``.

    Args:
        repository: path prefix of input and output files.
        endofthefile: suffix appended after the chromosome name.
        achr: chromosome name.
        resolution: forwarded to ``utils.loadchrsizedict`` and
            ``generateHicListofpattern`` (not converted here).
        chrsizepath: path forwarded to ``utils.loadchrsizedict``.
        species: key into ``utils.dictchr``.

    Returns:
        The square matrix, either loaded from the cache or freshly built.
    """
    dumpmatname = repository + achr + endofthefile + "dumpmatHiCbin.dump"
    densityname = repository + achr + endofthefile

    if op.exists(dumpmatname + ".npy"):
        print("===> Matrice deja existante")
        # Give np.load the path so that it opens and closes the file itself.
        thematrix = np.load(dumpmatname + ".npy")
    else:
        print("===>Constitution de la matrice")
        filename = repository + achr + endofthefile
        print("resolution:", resolution)
        sizedict = utils.loadchrsizedict(chrsizepath, resolution)
        # Find the size of the list of lists.
        sizeList = findsizeforHicListofList(sizedict, utils.dictchr[species], achr)
        print("taille de la liste:", sizeList)
        setlist = generateHicListofpattern(filename, sizeList, resolution)

        transformHiClistofpatterninDensity(setlist, "LTR/Gypsy", densityname + "Gypsy")
        transformHiClistofpatterninDensity(setlist, "LINE/CR1", densityname + "CR1")
        transformHiClistofpatterninDensity(setlist, "DNA/P", densityname + "DNAP")
        transformHiClistofpatterninDensity(setlist, "RC/Helitron", densityname + "helitron")
        print("taille de la setlist", len(setlist))

        L = sizeList  # in theory len(setlist) == sizeList
        print("==>Taille de la matrice:", L)
        thematrix = np.zeros((L, L))
        for i in range(L):
            print(i)
            # Upper triangle only; the value is mirrored below the diagonal.
            for j in range(i, L):
                val = distancebetweenunequallist3(setlist[i], setlist[j])
                thematrix[i, j] = val
                thematrix[j, i] = val

        print("====>Dumping de la matrice")
        # np.save appends ".npy", which is the name tested by the cache check.
        np.save(dumpmatname, thematrix)

    print("===> Afichage de la matrice")
    manipname = repository + achr + endofthefile + "recurenceplot.png"
    print(thematrix.shape)
    utils.showamatwithcolorcode(thematrix, manipname, "bwr")
    return thematrix
def generateTEmatrixforaspecies(repository,speciesname,endofthefile,chrsizepath,resolution):
    """Build the TE-by-bin presence matrix of a species and save it.

    The matrix has one row per entry of the TE list and
    ``sizedict["HicUsedTotalSize"]`` columns.  For every chromosome in
    ``utils.dictchr[speciesname]`` the file
    ``repository + chromosome + endofthefile`` is read; its first line is
    skipped.  On each following line, field 1 is the TE name, fields 4 and 5
    are the start and end positions, and field 0 of the first data line
    selects the chromosome offset ``sizedict["HicChrBegin" + field0]``.
    The bins covered by the interval are set to 1 in the row of the TE.
    The result is saved as a sparse CSR matrix under the key ``'mat'`` in
    ``repository + speciesname + ".TEmatrix"``.

    Args:
        repository: path prefix of input and output files.
        speciesname: species identifier, key into ``utils.dictchr``.
        endofthefile: suffix of the per-chromosome input files.
        chrsizepath: path forwarded to ``utils.loadchrsizedict``.
        resolution: bin size; converted with ``int()`` because it may be
            given as a string (e.g. from the command line).

    Raises:
        ValueError: if a TE name of a file is missing from the TE list.
        IndexError: if a data line has fewer than six fields, or if a bin
            falls outside of the matrix.
    """
    resolution = int(resolution)  # cast of argv
    TEpath = repository + speciesname + ".TElist"
    Matpath = repository + speciesname + ".TEmatrix"

    if op.exists(TEpath):
        TElist = utils.loadfilelist(TEpath)
    else:
        # The second returned value (family dict) is not needed here.
        TElist, _ = utils.generatelistofTEforaspecies(repository, speciesname, endofthefile)

    sizedict = utils.loadchrsizedict(chrsizepath, resolution)
    # Number of columns of the matrix.
    # TODO (from the original author): or sizedict["HicChrBeginchrX"].
    matcolsize = sizedict["HicUsedTotalSize"]
    mat = np.zeros((len(TElist), matcolsize))
    chrlist = utils.dictchr[speciesname]  # loop only on the wanted chromosomes
    print(matcolsize)
    print(chrlist)

    for chrname in chrlist:
        with open(repository + chrname + endofthefile, "r") as filein:
            filein.readline()  # first line is skipped
            chrbegin = None
            for line in filein:
                ls = line.split()
                if chrbegin is None:
                    # Offset of the chromosome, taken from the first data
                    # line.  A file without any data line is simply skipped.
                    chrbegin = float(sizedict["HicChrBegin" + ls[0]])
                # Builtin float replaces np.float, which NumPy has removed.
                begin = float(ls[4]) / resolution + chrbegin
                end = float(ls[5]) / resolution + chrbegin
                nbins = int(np.ceil(end - begin))
                if nbins <= 0:
                    continue
                row = TElist.index(ls[1])  # looked up once per line
                # NumPy no longer accepts float indices; truncating keeps the
                # column that legacy NumPy selected for a float index.
                firstbin = int(begin)
                for offset in range(nbins):
                    mat[row, firstbin + offset] = 1

    # savemat needs a dict: the key becomes the MATLAB variable name.
    scpio.savemat(Matpath, {'mat': sparse.csr_matrix(mat)})
def makeTEPieChart(repository, speciesname, endofthefile, chrsizepath):
    """Write a table of the cumulated length of every TE family of a species.

    For every chromosome in ``utils.dictchr[speciesname]`` the file
    ``repository + chromosome + endofthefile`` is read; its first line is
    skipped.  On each following line the length ``field5 - field4`` is added
    to the entry of ``ProportionDict`` selected by
    ``TEfamilydict[field1]``.  An extra entry ``"Autre"`` receives the sum
    of the chromosome sizes minus the sum of all TE lengths.  The table is
    written, tab separated with the header ``TEtype`` / ``Quantity``, to
    ``repository + speciesname + "TEproportion"``.

    Args:
        repository: path prefix of input and output files.
        speciesname: species identifier, key into ``utils.dictchr``.
        endofthefile: suffix of the per-chromosome input files.
        chrsizepath: path forwarded to ``utils.loadchrsizedict``.
    """
    outname = repository + speciesname + "TEproportion"
    # No binning wanted here, hence a resolution of 1.
    sizedict = utils.loadchrsizedict(chrsizepath, 1)

    # Reuse the TE family dictionary when the species was already processed.
    TEpath = repository + speciesname + ".TElist"
    DEfamilypath = repository + speciesname + ".TEfamilydict"
    if op.exists(TEpath):
        TEfamilydict = utils.loadstrfiledict(DEfamilypath)
    else:
        _, TEfamilydict = utils.generatelistofTEforaspecies(repository, speciesname, endofthefile)
    ProportionDict, _ = reservefamilydict(TEfamilydict)

    sumchrtot = 0
    sumTE = 0  # needed to compute the part not covered by any TE
    # In theory the chromosomes here are the same as those of chrsizepath.
    for z in utils.dictchr[speciesname]:
        filename = repository + z + endofthefile
        print("=====>nom du fichier de chr: ", filename)
        with open(filename, "r") as filein:
            filein.readline()  # first line: annotation
            for line in filein:
                ls = line.split()
                val = float(ls[5]) - float(ls[4])
                if val < 0:
                    # Only reported; the negative length is still added.
                    print("Danger")
                ProportionDict[TEfamilydict[ls[1]]] += val
                sumTE += val
        sumchrtot += sizedict[z]

    ProportionDict["Autre"] = sumchrtot - sumTE

    # Save the table.
    with open(outname, "w") as fout:
        print("nom du repertoire de sortie", outname)
        fout.write("TEtype\tQuantity\n")
        for tetype in ProportionDict:
            fout.write(tetype + "\t" + str(ProportionDict[tetype]) + "\n")