def DistanceStruct(StructFile, SVMlFile, numberofsruct, MFESnbrstruct, constrainte):
    """Compute the pairwise distance matrix between all sampled structures,
    detect redundant (zero-distance) structures, write the matrix in
    SVM-light format and pickle the bookkeeping dictionaries.

    :param StructFile: file containing all sampled structures (parsed by
        GetBasePairsFromStructFile into base-pair lists).
    :param SVMlFile: name of the SVM-light-format output file (written
        under the "output" folder).
    :param numberofsruct: number of structures sampled per probing condition.
    :param MFESnbrstruct: number of structures in the last (MFES) condition.
    :param constrainte: list of condition names; the last entry is the MFES
        condition.
    :return: 0 on completion (results are pickled as a side effect).
    """
    Redondantestructure = defaultdict(aa)
    MatDist = defaultdict(aa)
    Redondantestructure1 = []
    Dicnumberofsruct = {}
    # Each regular condition starts with the full sample size; the last
    # condition (MFES) has its own count.
    for i in range(len(constrainte) - 1):
        Dicnumberofsruct[constrainte[i]] = numberofsruct
    Dicnumberofsruct[constrainte[-1]] = MFESnbrstruct
    nb, DicStruct = GetBasePairsFromStructFile(StructFile)
    for i in range(nb):
        for j in range(i + 1, nb):
            MatDist[i][j] = DistanceTwoBPlist(DicStruct[i], DicStruct[j])
            if MatDist[i][j] == 0:
                # Distance 0 means structure j duplicates structure i:
                # decrement the (non-redundant) count of j's condition.
                if j not in Redondantestructure1:
                    if j > numberofsruct * (len(constrainte) - 1):
                        Dicnumberofsruct[constrainte[-1]] -= 1
                    else:
                        Dicnumberofsruct[constrainte[int(j / numberofsruct)]] -= 1
                    Redondantestructure1.append(j)
            MatDist[j][i] = MatDist[i][j]  # keep the matrix symmetric
    # Map each redundant global index back to (condition, local index).
    for elem in Redondantestructure1:
        if elem < numberofsruct * (len(constrainte) - 1):
            ConditionNumber = int(elem / numberofsruct)
        else:
            ConditionNumber = len(constrainte) - 1
        StructureNumber = elem - ConditionNumber * numberofsruct
        Redondantestructure[constrainte[ConditionNumber]][StructureNumber] = 1
    # Store the distance matrix in the file SVMlFile (SVM-light format).
    with open(os.path.join("output", SVMlFile), "w") as o:
        for i in range(len(MatDist)):
            o.write("%i\t" % (i + 1))
            for j in range(len(MatDist)):
                if i != j:
                    o.write("%i:%.4f\t" % (j + 1, MatDist[i][j]))
            o.write("\n")
    # BUG FIX: the original tested `Redondantestructure != 0`, which compares
    # a defaultdict to an int and is always true; the intent is to warn only
    # when redundant structures were actually found.
    if Redondantestructure1:
        print("Warning! redundant structures")
    FF.PickleVariable(MatDist, os.path.join(conf.PickledData, "dissmatrix.pkl"))
    FF.PickleVariable(Redondantestructure1,
                      os.path.join(conf.PickledData, "Redondantestructures.pkl"))
    FF.PickleVariable(Redondantestructure,
                      os.path.join(conf.PickledData, "Redondantestructures_Id.pkl"))
    FF.PickleVariable(Dicnumberofsruct,
                      os.path.join(conf.PickledData, "Dicnumberofsruct.pkl"))
    return 0
def Boltzmann_Calc(constraintes, StructfileRepos, numberofsruct, MFESnbrstruct, rna, Redondantestructure):
    """Compute Boltzmann factors and conditional Boltzmann probabilities for
    the sampled structures of every probing condition.

    :param constraintes: list of condition names; the last entry is "MFES".
    :param StructfileRepos: directory holding one structure file per condition.
    :param numberofsruct: number of sampled structures per regular condition.
    :param MFESnbrstruct: number of structures in the MFES condition.
    :param rna: RNA sequence used for energy evaluation.
    :param Redondantestructure: per-condition map marking redundant
        structures with 1 (non-redundant ones read as 0).
    :return: dict of per-condition Boltzmann factors; the conditional
        probabilities and partition functions are pickled as side effects.
    """
    Energy = defaultdict(aa)
    Boltzman = defaultdict(aa)
    ConditionalBoltzman = defaultdict(aa)
    ZBolzman = defaultdict(aa)
    # Energy values for the structures sampled under each condition.
    for Condition in constraintes:
        FileStructure = StructfileRepos + '/' + Condition
        Energy[Condition] = ENERGY_VALUES_STRUCTURES(FileStructure, rna)
    for Condition in constraintes:
        if Condition == "MFES":
            # BUG FIX: the original hard-coded range(2) here, ignoring the
            # MFESnbrstruct parameter (the adjacent commented-out debug loop
            # iterated range(MFESnbrstruct), which is the intended count).
            Boltzman[Condition] = [
                BoltzmannEnergy(Energy[Condition][i]) for i in range(MFESnbrstruct)
            ]
        else:
            listawithoutRedonddnace = []
            for i in range(numberofsruct):
                Boltzman[Condition][i] = BoltzmannEnergy(Energy[Condition][i])
                if Redondantestructure[Condition][i] == 0:
                    # Only non-redundant structures contribute to Z.
                    listawithoutRedonddnace.append(Boltzman[Condition][i])
            ZBolzman[Condition] = sum(listawithoutRedonddnace)  # Partition function
    listall = []
    for Condition in constraintes[:-1]:  # skip the MFES condition
        lista = []
        for i in range(numberofsruct):
            if Redondantestructure[Condition][i] == 0:
                lista.append(Boltzman[Condition][i] / ZBolzman[Condition])
            else:
                # Redundant structures get probability 0 so every condition
                # keeps the same list length.
                lista.append(0)
        listall += lista
        ConditionalBoltzman[Condition] = lista
    FF.PickleVariable(ConditionalBoltzman,
                      os.path.join(conf.PickledData, "ConditionalBoltzman.pkl"))
    FF.PickleVariable(ZBolzman, os.path.join(conf.PickledData, "ZBolzman.pkl"))
    return Boltzman
def DistanceStruct(StructFile, SVMlFile, numberofsruct, constrainte):
    """Compute the pairwise dissimilarity matrix between sampled structures,
    flag redundant structures (within the same probing condition only),
    export the matrix in SVM-light format and pickle the results.

    :param StructFile: file containing all sampled structures (parsed by
        GetBasePairsFromStructFile).
    :param SVMlFile: name of the SVM-light output file (written under
        <OutputFolder>/tmp).
    :param numberofsruct: number of sampled structures per condition.
    :param constrainte: list of probing-condition names.
    :return: 0 on completion (results are pickled as a side effect).
    """
    conf = loadConfig()
    Redondantestructure = defaultdict(aa)
    MatDist = defaultdict(aa)
    Redondantestructure1 = {}  # redundant structure index -> condition index
    Dicnumberofsruct = {condition: numberofsruct for condition in constrainte}
    nb, DicStruct = GetBasePairsFromStructFile(StructFile)
    progress.StartTask("Dissimilarity Loop")
    for i in range(nb):
        for j in range(i + 1, nb):
            MatDist[i][j] = DistanceTwoStructs(DicStruct[i], DicStruct[j])
            MatDist[j][i] = MatDist[i][j]
            # Check for redundancy: distance 0 means j duplicates i.
            if MatDist[i][j] == 0:
                jconstraint = int(j / numberofsruct)
                # Only count j as redundant when both structures belong to
                # the same probing condition.
                if j not in Redondantestructure1 and int(i / numberofsruct) == jconstraint:
                    Dicnumberofsruct[constrainte[jconstraint]] -= 1
                    Redondantestructure1[j] = jconstraint
    progress.EndTask()
    progress.StartTask("Export dissimilarity matrix")
    for elem, jconstraint in Redondantestructure1.items():
        StructureNumber = elem - jconstraint * numberofsruct
        # We mark redundant structures with the value 1.
        Redondantestructure[constrainte[jconstraint]][StructureNumber] = 1
    # Store the distance matrix in the SVMLFile. Mode "w" truncates any
    # previous version, so no explicit remove is needed beforehand.
    SVMLFullPath = os.path.join(conf.OutputFolder, "tmp", SVMlFile)
    with open(SVMLFullPath, "w") as o:
        for i in range(len(MatDist)):
            o.write("%i\t" % (i + 1))
            for j in range(len(MatDist)):
                if i != j:
                    o.write("%i:%.4f\t" % (j + 1, MatDist[i][j]))
            o.write("\n")
    progress.EndTask()
    progress.StartTask("Pickle all data")
    FF.PickleVariable(MatDist, "dissmatrix.pkl")
    FF.PickleVariable(list(Redondantestructure1.keys()), "Redondantestructures.pkl")
    FF.PickleVariable(Redondantestructure, "Redondantestructures_Id.pkl")
    FF.PickleVariable(Dicnumberofsruct, "Dicnumberofsruct.pkl")
    progress.EndTask()
    return 0
def Boltzmann_Calc(constraintes, StructfileRepository, NumStructures, rna, Redondantestructure):
    """Evaluate structure energies per probing condition, derive Boltzmann
    factors and conditional Boltzmann probabilities, and pickle the results.

    :param constraintes: list of probing-condition names.
    :param StructfileRepository: directory with one structure file per condition.
    :param NumStructures: number of sampled structures per condition.
    :param rna: RNA sequence used for energy evaluation.
    :param Redondantestructure: per-condition map where redundant structures
        are marked 1 (non-redundant ones read as 0).
    :return: dict mapping each condition to its list of conditional
        Boltzmann probabilities.
    """
    Energy = defaultdict(aa)
    Boltzman = defaultdict(aa)
    ConditionalBoltzman = defaultdict(aa)
    ZBolzman = defaultdict(aa)
    # Energies of every sampled structure, per condition.
    for Condition in constraintes:
        sample_file = os.path.join(StructfileRepository, Condition)
        Energy[Condition] = EvalStructuresEnergies(sample_file, rna)
    # Boltzmann factors, plus the per-condition normalization term computed
    # over a single copy of each distinct (non-redundant) structure.
    for Condition in constraintes:
        unique_factors = []
        for i in range(NumStructures):
            factor = BoltzmannFactor(Energy[Condition][i])
            Boltzman[Condition][i] = factor
            if Redondantestructure[Condition][i] == 0:
                unique_factors.append(factor)
        ZBolzman[Condition] = sum(unique_factors)  # Partition function
    # Conditional Boltzmann probabilities; redundant structures contribute 0
    # so every condition keeps a list of the same length.
    all_probabilities = []
    for Condition in constraintes:
        probabilities = [
            Boltzman[Condition][i] / ZBolzman[Condition]
            if Redondantestructure[Condition][i] == 0 else 0.
            for i in range(NumStructures)
        ]
        all_probabilities += probabilities
        ConditionalBoltzman[Condition] = probabilities
    FF.PickleVariable(Boltzman, "Boltzman.pkl")
    FF.PickleVariable(ConditionalBoltzman, "ConditionalBoltzman.pkl")
    FF.PickleVariable(ZBolzman, "ZBolzman.pkl")
    return ConditionalBoltzman
def AffinityPropagation(SVMLMatrix, Redundant):
    """Cluster the structures described by an SVM-light matrix, filter out
    redundant members from each cluster and pickle the result.

    NOTE: to allow pickling, clustering helpers must be defined at module
    level — not nested, not instance methods, not lambdas.

    :param SVMLMatrix: path to the dissimilarity matrix in SVM-light format.
    :param Redundant: redundancy information passed through to FilterCluster.
    :return: 0 on completion (clusters are pickled as a side effect).
    """
    clusters = defaultdict(a)
    X, y = np.array(load_svmlight_file(SVMLMatrix))
    algorithm = cluster.MiniBatchKMeans(n_clusters=6)
    # algorithm = cluster.AffinityPropagation(damping=.9, preference=None)
    algorithm.fit(X)
    if hasattr(algorithm, 'labels_'):
        # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the documented replacement.
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)
    # Group 1-based structure indices by their predicted cluster label.
    for i in range(len(y_pred)):
        clusters[y_pred[i]].append(i + 1)
    # Eliminate redundancy inside each cluster.
    for elem in clusters:
        clusters[elem] = FilterCluster(clusters[elem], Redundant)
    FF.PickleVariable(clusters, os.path.join(conf.PickledData, "Clusters_Aff_Prop.pkl"))
    return 0
def DefineNumberCluster(SVMLMatrix, Redundant, method, DM, BoltzmanFactor, Probingconditions, rna):
    """Search nb = 2..20 for an optimal number of clusters.

    For each nb, runs MiniBatchKMeans, computes per-cluster centroids and
    cumulated Boltzmann weights, and stops when either (a) some cluster's
    intra-distance falls below epsilon, or (b) every "probable" centroid of
    the (nb-1)-clustering lies within epsilon (base-pair distance) of its
    closest centroid in the nb-clustering.

    :param SVMLMatrix: path to the dissimilarity matrix in SVM-light format.
    :param Redundant: redundancy information (currently unused here).
    :param method: label appended to the pickled clusters file name.
    :param DM: distance-matrix handle (currently unused here).
    :param BoltzmanFactor: per-condition Boltzmann data for centroid weighting.
    :param Probingconditions: list of probing-condition names.
    :param rna: RNA sequence.
    :return: tuple (clusters, centroids) for the chosen nb.
    """
    conf = loadConfig()
    epsilon = 1  # Centroid base-pair distance threshold
    Cluster = defaultdict(lambda: defaultdict(CL.a))
    Clust = defaultdict(lambda: defaultdict(CL.a))
    CumulBE = defaultdict(lambda: defaultdict(CL.a))
    Centroids = defaultdict(lambda: defaultdict(CL.a))
    # The samples file path is loop-invariant; build it once.
    SamplesPath = os.path.join(conf.OutputFolder, "tmp",
                               'OutputSamples' + str(conf.SampleSize), 'Samples.txt')
    progress.StartTask("Initialization step")
    # Initialization step: a single cluster covering the whole sample.
    Cluster[1] = CL.MiniBatchKMeans(SVMLMatrix, 1)
    Centroids[1], BZ, X, Y, Z, IntradistanceStop = CT.CentroidBycluster(
        Cluster[1], SamplesPath, BoltzmanFactor, int(conf.SampleSize),
        Probingconditions, rna)
    CumulBE[1] = CumulatedBoltzmannsbyCluster(Cluster[1], BZ,
                                              int(conf.SampleSize), Probingconditions)
    progress.EndTask()
    for nb in range(2, 21):
        progress.StartTask("Clustering with %s clusters" % nb)
        progress.StartTask("Run MBKM")
        Clust[nb] = CL.MiniBatchKMeans(SVMLMatrix, nb)
        progress.EndTask()
        Centroids[nb], BZ, X, Y, Z, Intradistance = CT.CentroidBycluster(
            Clust[nb], SamplesPath, BoltzmanFactor, int(conf.SampleSize),
            Probingconditions, rna)
        CumulBE[nb] = CumulatedBoltzmannsbyCluster(Clust[nb], BZ,
                                                   int(conf.SampleSize), Probingconditions)
        lista = []
        # First criterion (disabled): stop when any intra-distance drops
        # below epsilon_intradist:
        #   if len([e for e in IntradistanceStop if e <= epsilon_intradist]) != 0:
        #       break
        # Second criterion: match each centroid of the (nb-1)-clustering to
        # its closest centroid (base-pair distance) in the nb-clustering and
        # record the cumulated-Boltzmann difference.
        for elem1 in Centroids[nb - 1].keys():
            rep = []
            for elem2 in Centroids[nb].keys():
                rep.append((elem2, SF.DistanceTwoStructs(
                    SF.BasePairsFromStruct(Centroids[nb - 1][elem1]),
                    SF.BasePairsFromStruct(Centroids[nb][elem2]))))
            minima = np.min([item[1] for item in rep])
            pos = [elem[0] for elem in rep if elem[1] == minima][0]
            l1 = CumulBE[nb - 1][elem1]
            l2 = CumulBE[nb][pos]
            Dist = l1 - l2
            lista.append((minima, (l1, l2, Dist)))
        # A centroid is "probable" when its cumulated Boltzmann weight is at
        # least 30% of the single-cluster reference weight.
        Bzmepsilon = 0.3 * CumulBE[1][0]
        BP_All_probable_centroids = [
            BPdist for BPdist, Bzmandist in lista if Bzmandist[0] >= Bzmepsilon
        ]
        progress.EndTask()
        if (len([elem for elem in Intradistance if elem <= epsilon]) != 0
                or len([distance for distance in BP_All_probable_centroids
                        if distance <= epsilon]) == len(BP_All_probable_centroids)):
            # BUG FIX: the original pickled Cluster[nb], but Cluster only ever
            # holds the initialization entry (key 1), so for nb >= 2 that
            # pickled a freshly-created empty defaultdict. The nb-cluster
            # partition actually lives in Clust[nb].
            FF.PickleVariable(Clust[nb], "Clusters" + method + ".pkl")
            progress.Print("Choosing %s as the optimal number of clusters" % nb)
            break
    # Return the clustering chosen by the break (or nb == 20 if none fired).
    return Clust[nb], Centroids[nb]