def DistanceStruct(StructFile, SVMlFile, numberofsruct, constrainte):
    """Compute the pairwise dissimilarity matrix over all sampled structures.

    Reads the base pairs of every structure in `StructFile`, computes the
    base-pair distance between every pair of structures, detects structures
    that are redundant (distance 0) *within the same probing condition*, and
    exports the matrix in SVM-light format plus several pickle files.

    Args:
        StructFile: path to the merged sample file (one structure per line).
        SVMlFile: name of the dissimilarity-matrix file to create under
            <OutputFolder>/tmp.
        numberofsruct: number of structures sampled per probing condition.
        constrainte: list of probing-condition names; structure index i
            belongs to condition constrainte[i // numberofsruct].

    Returns:
        0 on completion (side effects: matrix file + pickles written).
    """
    conf = loadConfig()
    Redondantestructure = defaultdict(aa)
    MatDist = defaultdict(aa)
    Redondantestructure1 = {}
    # Each condition starts with the full sample count; decremented per redundancy.
    Dicnumberofsruct = {cond: numberofsruct for cond in constrainte}
    nb, DicStruct = GetBasePairsFromStructFile(StructFile)
    progress.StartTask("Dissimilarity Loop")
    for i in range(nb):
        for j in range(i + 1, nb):
            # Symmetric matrix: compute once, store both orientations.
            dist = DistanceTwoStructs(DicStruct[i], DicStruct[j])
            MatDist[i][j] = dist
            MatDist[j][i] = dist
            # Check for redundancy (identical base-pair sets)
            if dist == 0:
                jconstraint = j // numberofsruct
                # Only count it when both structures belong to the same
                # probing condition.
                if j not in Redondantestructure1 and i // numberofsruct == jconstraint:
                    Dicnumberofsruct[constrainte[jconstraint]] -= 1
                    Redondantestructure1[j] = jconstraint
    progress.EndTask()
    progress.StartTask("Export dissimilarity matrix")
    for elem in Redondantestructure1:
        jconstraint = Redondantestructure1[elem]
        # Index of the structure inside its own condition's sample.
        StructureNumber = elem - jconstraint * numberofsruct
        # We mark redundant structures by value 1.
        Redondantestructure[constrainte[jconstraint]][StructureNumber] = 1
    # Store the distance matrix in the SVM-light file.
    SVMLFullPath = os.path.join(conf.OutputFolder, "tmp", SVMlFile)
    if os.path.isfile(SVMLFullPath):
        os.remove(SVMLFullPath)  # To clean the previous version
    # fix: context manager — the original open()/close() leaked the handle
    # if any write raised.
    with open(SVMLFullPath, "w") as o:
        for i in range(len(MatDist)):
            o.write("%i\t" % (i + 1))
            for j in range(len(MatDist)):
                if i != j:
                    o.write("%i:%.4f\t" % (j + 1, MatDist[i][j]))
            o.write("\n")
    progress.EndTask()
    progress.StartTask("Pickle all data")
    FF.PickleVariable(MatDist, "dissmatrix.pkl")
    FF.PickleVariable(list(Redondantestructure1.keys()), "Redondantestructures.pkl")
    FF.PickleVariable(Redondantestructure, "Redondantestructures_Id.pkl")
    FF.PickleVariable(Dicnumberofsruct, "Dicnumberofsruct.pkl")
    progress.EndTask()
    return 0
def CentroidBycluster(clusters, StructFile, Boltzmann, numberofsruct, constrainte, rna):
    """Compute an MEA centroid for each cluster and inter-cluster distances.

    Gathers base pairs per cluster, drops singleton clusters, computes the
    MEA centroid of each remaining cluster, then fills two symmetric
    matrices: base-pair distance between centroids and a Euclidean-style
    distance derived from the per-cluster mean distances E.

    Args:
        clusters: dict mapping cluster id -> member structure ids
            (mutated in place: singleton clusters are deleted).
        StructFile: path to the merged sample file.
        Boltzmann: per-structure Boltzmann weights.
        numberofsruct: sample size per probing condition.
        constrainte: list of probing-condition names.
        rna: RNA sequence used by MEA.

    Returns:
        (mycentroid, Boltzmancluster, E, MatriceDistanceCentroids,
         ListDiameters, Intradistance)
    """
    progress.StartTask("Computing centroids")
    # Matrix dimension is fixed BEFORE singleton clusters are deleted below,
    # so the matrices keep one row/column per original cluster id.
    dim_clustering = len(clusters)
    mycentroid = defaultdict()
    Intradistance = []
    centroids = defaultdict(lambda: defaultdict())
    ListStructures = [SF.BasePairsFromStruct(Struct) for Struct in FF.Parsefile(StructFile)]
    progress.StartTask("Gathering base pairs")
    ListBPbystructure, ListBP, Myproba, Boltzmancluster = BasePairsbyCluster(
        clusters, ListStructures, Boltzmann, numberofsruct, constrainte)
    # Eliminate clusters reporting a single structure.
    ListDiameters, Listeliminated_clusers = ClustersDiameter(clusters, ListBPbystructure)
    for elem in Listeliminated_clusers:
        del clusters[elem]
    progress.EndTask()
    progress.StartTask("Computing cluster distance distribution")
    E = ClustersDistances(clusters, Boltzmann, ListBPbystructure, numberofsruct, constrainte)
    progress.EndTask()
    progress.StartTask("Computing MEA centroids")
    for ClusterNumber in clusters:
        mycentroid[ClusterNumber], centroids[ClusterNumber] = MEA(Myproba[ClusterNumber], rna)
    progress.EndTask()
    # fix: scipy.zeros was a deprecated alias removed from SciPy; np.zeros is
    # the drop-in replacement (np is already used in this function).
    MatriceDistanceCentroids = np.zeros([dim_clustering, dim_clustering])
    MatriceDistanceClustersEucld = np.zeros([dim_clustering, dim_clustering])
    for ClusterNumber in clusters.keys():
        for ClusterNumber2 in clusters.keys():
            if ClusterNumber2 > ClusterNumber:
                # Base-pair distance between the two MEA centroids.
                l = SF.DistanceTwoStructs(centroids[ClusterNumber], centroids[ClusterNumber2])
                Intradistance.append(l)
                MatriceDistanceCentroids[ClusterNumber][ClusterNumber2] = l
                MatriceDistanceCentroids[ClusterNumber2][ClusterNumber] = l
                # Euclidean-style distance built from the per-cluster mean
                # distances: sqrt(|E_i^2 - E_j^2|).
                l = np.sqrt(abs(pow(E[ClusterNumber], 2) - pow(E[ClusterNumber2], 2)))
                MatriceDistanceClustersEucld[ClusterNumber][ClusterNumber2] = l
                MatriceDistanceClustersEucld[ClusterNumber2][ClusterNumber] = l
    # VT.plotDistanceClusters(MatriceDistanceCentroids, clusters, "blue", " Base pair distance between centroids")
    # VT.plotDistanceClusters(MatriceDistanceClustersEucld, clusters, "red", "Eucledian distance between structures")
    progress.EndTask()
    return mycentroid, Boltzmancluster, E, MatriceDistanceCentroids, ListDiameters, Intradistance
def ClustersDiameter(clusters, BPStructs):
    """Return per-cluster diameters and the ids of singleton clusters.

    The diameter of a cluster is the maximum pairwise base-pair distance
    between its member structures. Clusters holding a single structure get
    a diameter of 0 and are listed in the second return value so the caller
    can eliminate them.
    """
    progress.StartTask("Computing cluster diameters")
    diameters = []
    singletons = []
    for cluster_id in clusters:
        members = clusters[cluster_id]
        if len(members) > 1:
            bp = BPStructs[cluster_id]
            pair_distances = []
            for first in members:
                for second in members:
                    pair_distances.append(SF.DistanceTwoStructs(bp[first], bp[second]))
            diameters.append(max(pair_distances))
        else:
            # Unique structure: zero diameter, mark the cluster for removal.
            singletons.append(cluster_id)
            diameters.append(0)
    progress.EndTask()
    return diameters, singletons
def StructSampling(Pathconstraints, Conditions, numberStructures, T, m, b, defaultFasta):
    """Sample secondary structures with RNAsubopt for every probing condition.

    For each condition, repeatedly runs RNAsubopt (with hard and/or soft
    SHAPE constraints when the matching files exist) until enough structures
    are accumulated, then writes header + the first `numberStructures`
    structures back to the per-condition output file.

    Args:
        Pathconstraints: folders searched for an alternative fasta per condition.
        Conditions: probing-condition file names.
        numberStructures: number of structures to sample per condition.
        T: sampling temperature passed to RNAsubopt.
        m, b: slope/intercept for the SHAPE reactivity conversion.
        defaultFasta: fallback input sequence file.

    Returns:
        The output folder containing one sample file per condition.
    """
    conf = loadConfig()
    # fix: renamed from 'dir', which shadowed the builtin.
    outDir = os.path.join(conf.OutputFolder, "tmp", 'OutputSamples' + str(numberStructures))
    FF.CreateFold(outDir)
    thermoMsgShown = False
    for filename in Conditions:
        lines = []
        header = []
        progress.StartTask("Processing %s" % (filename))
        # NOTE(review): 'lines' never contains header lines, so the
        # "- NUM_HEADER_LINES" offset oversamples by that amount — confirm
        # whether that is intended. Also: if RNAsubopt keeps failing and
        # produces no structures, this loop never terminates.
        while len(lines) - NUM_HEADER_LINES < numberStructures:
            # If an alternative sequence file is found in a constraints
            # folder, use it rather than the default fasta.
            Input = defaultFasta
            for p in Pathconstraints:
                tmpInput = os.path.join(p, filename + '.' + IPANEMAP.FASTA_EXTENSION)
                if os.path.isfile(tmpInput):
                    Input = tmpInput
            output = os.path.join(outDir, filename)
            # NOTE(review): shell=True with a string-built command; the paths
            # come from local config, but a list + shell=False would be safer.
            Command = 'RNAsubopt -p ' + str(numberStructures) + ' -s -T ' + str(T)
            (hasHardConstraints, hasSoftConstraints) = (False, False)
            hardConstraintFile = os.path.join(conf.PathConstraintsFile, filename + '.txt')
            if os.path.isfile(hardConstraintFile):
                Command += ' -C --enforceConstraint '
                hasHardConstraints = True
                Input = hardConstraintFile
            ShapeFile = os.path.join(conf.PathConstraintsFileShape, filename + '.txt')
            if os.path.isfile(ShapeFile):
                Command += ' --shape ' + ShapeFile + ' --shapeMethod="Dm' + str(m) + 'b' + str(b) + '"'
                hasSoftConstraints = True
            if not (hasHardConstraints or hasSoftConstraints or thermoMsgShown):
                progress.Print("Warning: Did not find suitable constraint file for this condition, using purely thermodynamic sampling")
                thermoMsgShown = True
            # fix: close the handles handed to the subprocess — the original
            # opened them inline and leaked one triple per iteration.
            with open(Input, 'r') as fIn, open(output, 'wb') as fOut, \
                    open(os.devnull, 'w') as fErr:
                subprocess.call(Command, stdin=fIn, stdout=fOut, stderr=fErr, shell=True)
            with open(output, 'r') as f:
                nlines = f.readlines()
            header = nlines[:NUM_HEADER_LINES]
            lines += nlines[NUM_HEADER_LINES:]
        # Rewrite the file with the header and exactly numberStructures lines.
        with open(output, 'w') as f:
            f.writelines(header + lines[:numberStructures])
        progress.EndTask()
    return outDir
# Get probing conditions for the treated RNA ProbingConditions = [RNAName + state for state in conf.Conditions] # Specify whether to generate new sample or use a previously generated one OutputSamples = os.path.join(conf.OutputFolder, "tmp", 'OutputSamples') + conf.SampleSize if str.lower( conf.Sampling) == "true" or not os.path.isdir(OutputSamples): progress.StartTask("Sampling %s structures for each condition" % (conf.SampleSize)) OutputSamples = SP.StructSampling( [conf.PathConstraintsFile, conf.PathConstraintsFileShape], ProbingConditions, int(conf.SampleSize), conf.Temperature, conf.m, conf.b, conf.RNA) progress.EndTask() else: progress.Print("Using existing sample") progress.Print("Probing conditions: %s" % (ProbingConditions)) # Create a global file that contains structures sampled from the list of Probing conditions FF.MergeFiles(OutputSamples, os.path.join(OutputSamples, 'Samples.txt'), ProbingConditions, SP.NUM_HEADER_LINES) # Create a distance matrix file progress.StartTask("Computing dissimilarity matrix") SVMlFile = "DissimilarityMatrix" + conf.SampleSize # Calculate distance and identify redundant structures within the same condition SF.DistanceStruct(os.path.join(OutputSamples, 'Samples.txt'), SVMlFile, int(conf.SampleSize), ProbingConditions)
def DefineNumberCluster(SVMLMatrix, Redundant, method, DM, BoltzmanFactor, Probingconditions, rna):
    # Determine a suitable number of clusters by re-running MiniBatchKMeans
    # for nb = 2..20 and stopping as soon as one of the criteria below holds.
    # Returns (clustering, centroids) for the chosen nb.
    #
    # NOTE(review): parameters 'Redundant' and 'DM' are never used in this
    # body — confirm whether they can be dropped at the call sites.
    conf = loadConfig()
    epsilon = 1  # Centroid base pair distance threshold
    Cluster = defaultdict(lambda: defaultdict(CL.a))
    Clust = defaultdict(lambda: defaultdict(CL.a))
    CumulBE = defaultdict(lambda: defaultdict(CL.a))  # cumulated Boltzmann weight per cluster
    Centroids = defaultdict(lambda: defaultdict(CL.a))
    progress.StartTask("Initialization step")
    # Initialization step: a single-cluster run serves as the reference point.
    Cluster[1] = CL.MiniBatchKMeans(SVMLMatrix, 1)
    Centroids[1], BZ, X, Y, Z, IntradistanceStop = CT.CentroidBycluster(
        Cluster[1],
        os.path.join(conf.OutputFolder, "tmp",
                     'OutputSamples' + str(conf.SampleSize), 'Samples.txt'),
        BoltzmanFactor, int(conf.SampleSize), Probingconditions, rna)
    CumulBE[1] = CumulatedBoltzmannsbyCluster(Cluster[1], BZ, int(conf.SampleSize), Probingconditions)
    progress.EndTask()
    for nb in range(2, 21):
        progress.StartTask("Clustering with %s clusters" % nb)
        progress.StartTask("Run MBKM")
        Clust[nb] = CL.MiniBatchKMeans(SVMLMatrix, nb)
        progress.EndTask()
        Centroids[nb], BZ, X, Y, Z, Intradistance = CT.CentroidBycluster(
            Clust[nb],
            os.path.join(conf.OutputFolder, "tmp",
                         'OutputSamples' + str(conf.SampleSize), 'Samples.txt'),
            BoltzmanFactor, int(conf.SampleSize), Probingconditions, rna)
        CumulBE[nb] = CumulatedBoltzmannsbyCluster(Clust[nb], BZ, int(conf.SampleSize), Probingconditions)
        lista = []
        '''
        ####***************************************************First crierion:
        if len([ elem for elem in IntradistanceStop if elem <= epsilon_intradist ] )!=0:
            print "************************************* Clustering done with ", nb ," as the optimal number of clusters using the first criterion intradistance*********************************************************************"
            break
        # ************************************* second criterion
        '''
        # Match every centroid of the previous clustering to its closest
        # centroid in the current one, recording the base-pair distance and
        # the cumulated-Boltzmann difference.
        for elem1 in Centroids[nb - 1].keys():
            rep = []
            '''
            print "distance to all elements"
            print "Ref \t i \t i+1 \t BPdist \t CumulatedBz i \t CumulatedBz i+1 \t CumulatedBzdist"
            '''
            for elem2 in Centroids[nb].keys():
                rep.append(
                    (elem2,
                     SF.DistanceTwoStructs(
                         SF.BasePairsFromStruct(Centroids[nb - 1][elem1]),
                         SF.BasePairsFromStruct(Centroids[nb][elem2]))))
            minima = np.min([item[1] for item in rep])
            # Closest current centroid (first one attaining the minimum).
            pos = [elem[0] for elem in rep if elem[1] == minima][0]
            l1 = CumulBE[nb - 1][elem1]
            l2 = CumulBE[nb][pos]
            Dist = l1 - l2
            lista.append((minima, (l1, l2, Dist)))
        ########## The new criterion is about the existence of a probable cluster
        Bzmepsilon = 0.3 * CumulBE[1][0]  # 30% of the single-cluster Boltzmann mass
        BP_All_probable_centroids = [
            BPdist for BPdist, Bzmandist in lista if Bzmandist[0] >= Bzmepsilon
        ]
        progress.EndTask()
        # Stop when some intra-cluster distance falls under epsilon, or when
        # every "probable" centroid moved by at most epsilon base pairs.
        if (len([elem for elem in Intradistance if elem <= epsilon]) != 0 or len([
                distance for distance in BP_All_probable_centroids
                if distance <= epsilon
        ]) == len(BP_All_probable_centroids)):
            # NOTE(review): Cluster[nb] was never assigned (only Cluster[1]
            # exists), so this pickles an empty defaultdict; Clust[nb] looks
            # intended — confirm against the consumer of "Clusters<method>.pkl".
            FF.PickleVariable(Cluster[nb], "Clusters" + method + ".pkl")
            progress.Print("Choosing %s as the optimal number of clusters" % nb)
            break
    # for the entire clusters while keeping redundancy
    return Clust[nb], Centroids[nb]