def userNodeSelectionAct(dataArray): print dataArray[1] metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadata,dataArray[1]) #@dataArray[2] = idSequences is a dictionary of (key=identifier,value=(name,rank of node)) listofNodes = dataArray[2].values() nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listofNodes[-3]) + ";" + sanitizeNode(listofNodes[1]) + ";" + sanitizeNode(listofNodes[-1]) + " ]\n")) isInDatabase(nodesList,listofNodes) numberofSamples = len(dataArray[0]) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + "sample(s), how many samples do you want to create the training set? \n")) x = integer.match(numberStartingSamples) if not x or (x and (int(numberStartingSamples) > numberofSamples)): print "\n/!\ ERROR: You should write down an integer less or equal to",numberofSamples,"." raise ValueError numberStartingSamples = int(numberStartingSamples) #@shape for @assignedClasses is the same than the one for @classes assignedClasses,classes,valueSets = classifyIt(dataArray,metadata,nodesList,numberStartingSamples) numberClass = classes.lenMDL() youdenJ = countYouden(assignedClasses,classes,numberofSamples) interpretIt(youdenJ) answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N\n") if answer == "Y": labels = [ "Metadata: " + str(metadata) + ", Values for each metadatum: " + str([ valueSet for valueSet in valueSets]) ] percentagesAs = assignedClasses.mapMDL(len) percentages = classes.mapMDL(len) plotPie(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata)) plotPie(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadata " + str(metadata)) elif not (answer == "N"): print "\n Answer by Y or N!" answer = raw_input("Do you want to save the results? Y/N \n") if (answer == "Y"): writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n","Assignments depending on " + listNodes(nodesList) + " to classes for metadata " + str(metadata)) elif not (answer == "N"): print "\n Answer by Y or N!" return assignedClasses,youdenJ
def userNodeSelectionAct(dataArray): print dataArray[1] metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum],dataArray[1]) listnodes = dataArray[3].values() nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) + ";" + sanitizeNode(listnodes[-1]) + " ]\n")) isInDatabase(nodesList,listnodes) numberSamples = len(dataArray[0]) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + " sample(s), how many samples do you want to create the training set?\n")) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples) : print "\n/!\ ERROR: You should write down an integer inferior or equal to ",numberSamples,"." raise ValueError numberStartingSamples = int(numberStartingSamples) assignedClasses,classes,valueSet = classifyIt(dataArray,metadatum,nodesList,numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses,classes,numberStartingSamples) interpretIt(youdenJ) answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N") if answer == "Y": labels = [ metadatum + " = " + str(value) for value in valueSet ] percentagesAs = [ len(class1) for class1 in assignedClasses ] percentages = [ len(class1) for class1 in classes ] plotPieChart(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadatum " + metadatum) plotPieChart(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" answer = raw_input("Do you want to save the results? Y/N") if (answer == "Y"): writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n","Assignments depending on " + listNodes(nodesList) + " to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return assignedClasses,youdenJ
def distanceAct(dataArray): answer = raw_input("Import matrix? Y/N\n") if answer == "Y": filename = raw_input("Write down the file name where the matrix is stored [ without the extension .taxotree ].\n") matrix = importMatrix(filename) else: if not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!" print "/!\ Computing similarity matrix..." print "[ You may have to wait for a few minutes... ]" matrix = computeSimilarity(dataArray) print "[Preview.]" print matrix answer = raw_input("Save the results? Y/N\n") if (answer == "Y"): writeFile(m,"Similarity coefficients between patients using previous calculi on total ratio, pattern ratio and diversity coefficient\n\nNota Bene: 1e+14 stands for +inf\n","array") elif not (answer == "N"): print "\n/!\ You should answer 'Y' or 'N'!" answer = raw_input("Compute the most different groups of samples? Y/N\n") if (answer == "Y"): answer = raw_input("Do you want to select samples by metadata or to select all samples? metadata/all") if (answer == "metadata"): print dataArray[1] metadatum = parseList(raw_input("Choose the metadatum among those printed above [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadatum,dataArray[1]) _,valueSampleMetadatum = partitionSampleByMetadatumValue(metadatum[0],dataArray[1],dataArray[0]) valueSampleMetadatumNameOnly = [] for sampleGroup in valueSampleMetadatum: sampleGroupNameOnly = [] for sample in sampleGroup: sampleGroupNameOnly.append(sample[0]) valueSampleMetadatumNameOnly.append(sampleGroupNameOnly) pairsList = mostDifferentSamplesGroups(matrix,dataArray[8],valueSampleMetadatumNameOnly) if (answer == "all"): pairsList = mostDifferentSamplesGroups(matrix,dataArray[8],[[sample] for sample in dataArray[8]]) else: print "\n/!\ ERROR: You should answer 'metadata' or 'all'." raise ValueError print "[ Preview. ]" print "List of the pairs of most different sample groups according to the similarity coefficients computed:" for pair in pairsList: print pair answer2 = raw_input("\nSave the results? Y/N\n") if (answer2 == "Y"): stringPairs = "" for pair in pairsList: stringPairs += "*" + str(pair) + "\n" if (answer == "metadata"): stringSamples = "" for group in valueSampleMetadatumNameOnly: stringSamples += "*" + str(group) + "\n" data = "Most different groups of samples ****\nsorted by values of metadatum: " + metadatum[0] + "\nGroups were:\n\n" + stringSamples + "\n\nAnd the most different ones are:\n\n" + stringPairs + "\n\nEND OF FILE ****" else: data = "Most different groups of samples ****\n\nThe most different ones are:\n\n" + stringPairs + "\n\nEND OF FILE ****" writeFile(data,"","text") elif not (answer2 == "N"): print "/!\ You should answer 'Y' or 'N'!"
def userNodeSelectionAct(dataArray): print dataArray[1] metadatum = sanitize( raw_input( "Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum], dataArray[1]) listnodes = dataArray[3].values() nodesList = parseListNode( raw_input( "Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) + ";" + sanitizeNode(listnodes[-1]) + " ]\n")) isInDatabase(nodesList, listnodes) numberSamples = len(dataArray[0]) numberStartingSamples = sanitize( raw_input( "Knowing there is/are " + str(numberSamples) + " sample(s), how many samples do you want to create the training set?\n" )) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples): print "\n/!\ ERROR: You should write down an integer inferior or equal to ", numberSamples, "." raise ValueError numberStartingSamples = int(numberStartingSamples) assignedClasses, classes, valueSet = classifyIt(dataArray, metadatum, nodesList, numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses, classes, numberStartingSamples) interpretIt(youdenJ) answer = raw_input( "Do you want to plot the classes obtained as a pie chart? Y/N") if answer == "Y": labels = [metadatum + " = " + str(value) for value in valueSet] percentagesAs = [len(class1) for class1 in assignedClasses] percentages = [len(class1) for class1 in classes] plotPieChart( labels, percentagesAs, "Assignments depending on " + str(nodesList) + " to class for metadatum " + metadatum) plotPieChart( labels, percentages, "Real classes depending on " + str(nodesList) + " for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" answer = raw_input("Do you want to save the results? Y/N") if (answer == "Y"): writeFile( "Youden's J statistic for this classification is: " + str(youdenJ) + "\n", "Assignments depending on " + listNodes(nodesList) + " to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return assignedClasses, youdenJ
def randomSubSamplingAct(dataArray): print dataArray[1] metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum],dataArray[1]) s = raw_input("Input the number s of random samplings.\n") n = raw_input("Input the number n of nodes to select at each try.\n") if not integer.match(s) or not integer.match(n): print "\n/!\ ERROR: s and n must both be integers." raise ValueError numberSamples = len(dataArray[0]) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + "sample(s), how many samples do you want to create the training set? \n")) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples): print "\n/!\ ERROR: You should write down an integer." raise ValueError numberStartingSamples = int(numberStartingSamples) listnodes = dataArray[3].values() s,n = int(s),int(n) #Here the set of classes is a list of two lists containing the samples in C and not C bestClassification = [] bestClassesList = [] #Worse value for this coefficient currBestYouden = inf nodesNumber = len(dataArray[3]) while s: #Randomly draw n distinct nodes among the nodes in the taxonomic tree nodesList = randomChoice(listnodes,n) assignedClasses,classes,valueSet = classifyIt(dataArray,metadatum,nodesList,numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses,classes,numberSamples) res = numberClass - youdenJ if min(res,currBestYouden) == res: bestClassification = [] for i in nodesList: bestClassification.append(i) currBestYouden = res bestClassesList = [] for i in assignedClasses: bestClassesList.append(i) s -= 1 interpretIt(numberClass - currBestYouden) if answer == "Y": percentagesAs = [ len(class1) for class1 in assignedClasses ] labels = [ metadatum + " = " + str(value) for value in valueSet ] percentages = [ len(class1) for class1 in classes ] plotPieChart(labels,percentagesAs,"Assignments depending on " + listNodes(nodesList) + " to class for metadatum " + metadatum) plotPieChart(labels,percentages,"Real classes depending on " + listNodes(nodesList) + " for metadatum " + metadatum) answer = raw_input("Do you want to save the results? Y/N") if (answer == "Y"): writeFile("Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this metadatum is:" + str(bestClassification),"Assignments to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return bestClassification,(numberClass - currBestYouden),bestClassesList
def similarityAct(dataArray,iMatrix): print dataArray[1] metadataList = parseList(raw_input("Input the list of metadata you want to consider among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadataList,dataArray[1]) print "/!\ Computing similarity matrix..." m = similarity(dataArray[0],dataArray[1],metadataList) print "[Preview.]" print m answer = raw_input("Save the results? Y/N\n") if (answer == "Y"): writeFile(m,"Similarity coefficients between patients for file meta/" + iMatrix + ".csv:\n" + listNodes(dataArray[8]),"array") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!" return m
def runAct(dataArray): print "Choosing the list of samples." #or use partition by metadatum values sampleNameList,metadataList,interval1List,interval2List = createSampleNameList(dataArray) n = len(sampleNameList) print "\nAVAILABLE COMPARISON FUNCTION(S):" fctF = printComparison() f = raw_input("\nChoose your comparison function above those printed above.\n") isInDatabase([f],fctF) completeGraph = Graph(n).constructComplete(sampleNameList,dataArray[7],f) superTree,w = kruskal(completeGraph) #Constructing distance matrix matrix = np.zeros((n,n)) print "\nAVAILABLE DISTANCE FUNCTION(S):" fctD = printDistance() d = raw_input("\nChoose your distance function above those printed above.\n") isInDatabase([d],fctD) valueArray = [] print "\nSUPERTREE of weight:",w print superTree.vertices print superTree.edges for i in range(n): for j in range(i,n): #matrix is symmetric (distance) s = applyFctD(d,superTree,i,j) matrix[i][j] = s matrix[j][i] = s valueArray.append(s) valueArray = sorted(valueArray) valueNumber = n*n/2 quartile3 = valueNumber*3/4 valueQuartile = valueArray[quartile3] mostDifferent = [] #Distance is symmetric for i in range(n): for j in range(i+1,n): if matrix[i][j] >= valueQuartile: mostDifferent.append((sampleNameList[i],sampleNameList[j])) print "\nRESULTING MATRIX:" print matrix print "\n---\nMost different samples groups from:\n" for sampleGroup in sampleNameList: print sampleGroup print "\nare:\n" print mostDifferent print "\n--- END OF DISPLAY\n"
def createSampleNameList(dataArray): metadataList = [] interval1List = [] interval2List = [] sampleIDList = dataArray[8] i = raw_input("/!\ How many different lists of samples do you want?\n") if not integer.match(i): print "\n/!\ ERROR: You need to enter a integer here!" raise ValueError numberList = int(i) sampleNameList = [] if (numberList < 1): print "\n/!\ ERROR: Empty set of lists of samples!" raise ValueError while numberList: answer = raw_input("Do you want to select samples one by one, or to select samples matching requirements on metadata? one/matching \n") if (answer == "one"): if (len(sampleIDList) < 2): print "\n/!\ ERROR: List of samples is empty or only of length one!..." raise ValueError print sampleIDList sampleNameList11 = parseList(raw_input("Input the list of samples using the ID printed above. [e.g. " + sampleIDList[0] + ";"+ sampleIDList[1] + " ]\n")) elif (answer == "matching"): print dataArray[1] metadataList = parseList(raw_input("Input the list of metadata you want to consider among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadataList,dataArray[1]) interval1List = parseIntList(raw_input("Input the list of lower interval bounds corresponding to metadatum/metadata above. [ Please refer to README for more details. e.g. 1;2 ]\n")) if not (len(interval1List) == len(metadataList)): print "\n/!\ ERROR: You need to enter the same number of lower bounds than of metadata!" raise ValueError interval2List = parseIntList(raw_input("Input the list of upper interval bounds corresponding to metadatum/metadata above. [ Please refer to README for more details. e.g. 3;2 ]\n")) if not (len(interval2List) == len(metadataList)): print "\n/!\ ERROR: You need to enter the same number of upper bounds than of metadata!" raise ValueError sampleNameList11 = computeSamplesInGroup(dataArray[0],dataArray[1],metadataList,interval1List,interval2List)[0] else: print "\n/!\ ERROR: You need to answer either 'one' or 'matching' and not: \"",answer,"\"." raise ValueError isInDatabase(sampleNameList11,sampleIDList) sampleNameList.append(sampleNameList11) numberList -= 1 return sampleNameList,metadataList,interval1List,interval2List
def percentageAct(dataArray): uTree = raw_input("Do you to get percentage of assignments to subtrees or to bacterias themselves? subtree/bacteria \n") usingTree = (uTree == "subtree") if not (uTree == "subtree" or uTree == "bacteria"): print "\n/!\ ERROR: You need to answer 'bacteria' or 'subtree'." raise ValueError nodesGroup = parseListNode(raw_input("Input the list of nodes/roots of subtrees you want to consider. [ Please look at the taxonomic tree file to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + ". ]\n")) isInDatabase(nodesGroup,dataArray[6]) sampleNameList,metadataList,interval1List,interval2List = createSampleNameList(dataArray,True) result = percentageAssign(dataArray[0],dataArray[1],sampleNameList,dataArray[7],nodesGroup,dataArray[2],dataArray[3],usingTree) print "\n[Preview.]" print result l = len(result) data = np.zeros(l) for i in range(l): data[i] = result[i] print "" answer = raw_input("Save the results? Y/N\n") if (answer == "Y"): writeFile(data,"Percentage of assignments ****\nin the group of nodes: " + listNodes(nodesGroup) + listSampleInvolved(metadataList,interval1List,interval2List,sampleNameList),"array") elif not (answer == "N"): print "/!\ You should answer 'Y' or 'N'!" return result,nodesGroup,sampleNameList,metadataList
def clusteringAct(dataArray): print dataArray[1] metadatum = sanitize( raw_input( "Select the metadatum among those above to cluster the set of samples. [e.g. " + dataArray[1][0] + "]\n")).split(";")[0] isInDatabase([metadatum], dataArray[1]) valueSet, clusters1 = partitionSampleByMetadatumValue( metadatum, dataArray[1], dataArray[0]) clusters = [[sample[0] for sample in cluster] for cluster in clusters1] #that is, k in K-means Algorithm numberClass = len(valueSet) print "/!\ Number of classes:", numberClass, "." startSet = [cluster[0] for cluster in clusters] #Selects the starting samples of each cluster kClusters = [[start] for start in startSet] if not (len(clusters) == numberClass): print "\n/!\ ERROR: Different lengths: numberClass", numberClass, "clusters:", len( clusters), "." raise ValueError trimmedList = trimList(dataArray[3], startSet) print "/!\ Clustering with the first distance..." #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster) #@dataArray[8] = distMatchedDict kClusters, meanSamples, distanceDict, distanceInClusters = kMeans( trimmedList, numberClass, kClusters, startSet, dataArray[8], dataArray) print "-- End of first clustering --" number = 0 for cluster in kClusters: for _ in cluster: number += 1 if not (number == len(dataArray[3])): print "\n/!\ ERROR: A bug occurred during the clustering:", number, "=/=", len( dataArray[3]), "." raise ValueError #Deletes samples in cluster that are too far from the others kClusters, untaken = cleanClusters(kClusters, distanceInClusters) startSet = [cluster[0] for cluster in clusters] #Remove from untaken the starting samples untaken2 = [] for x in untaken: if not (x in startSet): untaken2.append(x) untaken = untaken2 #Remove the samples in untaken from the total set of samples sampleSet = [] for cluster in kClusters: for x in cluster: if not (x in sampleSet): sampleSet.append(x) for x in startSet: if not (x in sampleSet): sampleSet.append(x) trimmedList = trimList(sampleSet, startSet) print "/!\ Clustering with the second distance..." #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2) #@dataArray[9] = distConsensusDict kClusters, meanSamples, distanceDict, _ = kMeans(trimmedList, numberClass, kClusters, startSet, dataArray[9], dataArray) #,meanSamples) print "-- End of second clustering --" number = 0 for cluster in kClusters: for _ in cluster: number += 1 if not (number <= len(dataArray[3])): print "\n/!\ ERROR: An error occurred during the clustering:", number, ">", len( dataArray[3]), "." raise ValueError print "Printing the", numberClass, "clusters:" i = 1 #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs for cluster in kClusters: print "\n-- Cluster #", i, "associated to", metadatum, "=", valueSet[ i - 1], ":" print "Size:", len(cluster) print sorted(cluster) i += 1 print "\nScore of the clustering (comprised between 0 and 1):" print "The more it is close to 1, the more the clustering is relevant." #The clustering obtained with the K-Means method kClustersCopy = [cluster for cluster in kClusters] #The clustering obtained by comparing the values of the metadatum clustersCopy = [cluster for cluster in clusters] #Score by using first method of comparison compareClusterScore = 0 if not (len(kClustersCopy) == numberClass == len(clustersCopy)): print "\n/!\ ERROR: Length error in clustering:", numberClass, len( kClustersCopy), len(clustersCopy), "." raise ValueError while kClustersCopy and clustersCopy: cl1 = kClustersCopy.pop() cl2 = clustersCopy.pop() #clusters are non-empty x = compareCluster(cl1, cl2, untaken) if x: compareClusterScore += x else: compareClusterScore = None break if compareClusterScore: compareClusterScore = compareClusterScore / numberClass printClusterScore = compareClusterScore else: printClusterScore = "None" #Score by using second method of comparison #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass) print "Compare clusters score is:", printClusterScore, "." #print "Compare centers score is:",compareCentersScore,"." answer = raw_input("Do you want to save the results? Y/N\n") if (answer == "Y"): answer2 = raw_input( "Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n" ) if (answer2 == "Y"): commonList = extractCommonNodes(kClusters, dataArray) elif not (answer2 == "N"): print "\n/!\ You should answer Y or N, not:", answer2, "." data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str( valueSet) i = 0 for cluster in kClusters: data += "\n\n-- Cluster #" + str( i + 1) + " associated to " + metadatum + " = " + str( valueSet[i]) data += "\nSize: " + str(len(cluster)) if (answer2 == "Y"): data += "\nSet of common nodes: " + str(commonList[i]) data += "\n" + str(cluster) i += 1 data += "\n\nCompare clusters score is: " + str(compareClusterScore) #data += "\n\nCompare centers score is: " + str(compareCentersScore) data += "\n\nEND OF FILE ****" print "\n/!\ Saving clusters..." writeFile(data) answer2 = raw_input( "Do you want to compute the graph of the clusters? Y/N\n") if (answer2 == "Y"): print "\n/!\ Constructing the graph of the clusters..." #@dataArray[3] = filenames graph = convertClustersIntoGraph(kClusters, distanceDict, len(dataArray[3])) graphNO(graph) print "\n/!\ Done. The graph is in DOT format in \"files\" folder." elif not (answer2 == "N"): print "\n/!\ You should answer Y or N, not:", answer2, "." elif not (answer == "N"): print "/!\ You should answer by Y or N."
def creatingArray(dataArray,pearson=False): #Available cases in Pearson function if pearson: typeInput = raw_input("Do you want to compute bacteria/bacteria or bacteria/metadatum? BB/BM [ Please read README for details. ]\n") if (typeInput == "BB"): valueInput1 = parseListNode(raw_input("Choose the first group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n")) isInDatabase(valueInput1,dataArray[6]) valueInput2 = parseListNode(raw_input("Choose the second group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n")) isInDatabase(valueInput2,dataArray[6]) xArray,yArray = getValueBacteriaBacteria(dataArray[2],dataArray[3],dataArray[8],valueInput1,valueInput2) return xArray,yArray,typeInput,valueInput1,valueInput2 elif (typeInput == "BM"): valueInput1 = parseListNode(raw_input("Choose the group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n")) isInDatabase(valueInput1,dataArray[6]) print dataArray[1] valueInput2 = [parseList(raw_input("Choose the metadatum among those printed above [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))[0]] isInDatabase(valueInput2,dataArray[1]) xArray,yArray = getValueBacteriaMetadata(dataArray[0],dataArray[1],valueInput1,dataArray[8],dataArray[2],dataArray[3],valueInput2) return xArray,yArray,typeInput,valueInput1,valueInput2 else: print "\nERROR: You need to answer 'BB' or 'BM', and not ",typeInput raise ValueError #Available cases for only plotting graphs else: graphTypeInput = raw_input("Do you want to plot a graph or a pie chart? graph/pie [Read README for details. Histograms will be available in later versions]\n") if graphTypeInput == "graph": typeInput = raw_input("Do you want to plot bacteria/bacteria or bacteria/metadatum? BB/BM [ Please read README for details. ]\n") if (typeInput == "BB"): valueInput1 = parseListNode(raw_input("Choose the first group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n")) isInDatabase(valueInput1,dataArray[6]) valueInput2 = parseListNode(raw_input("Choose the second group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n")) isInDatabase(valueInput2,dataArray[6]) return graphTypeInput,getValueBacteriaBacteria(dataArray[2],dataArray[3],dataArray[8],valueInput1,valueInput2),typeInput,valueInput1,valueInput2 elif (typeInput == "BM"): valueInput1 = parseListNode(raw_input("Choose the group of bacterias [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(dataArray[6][-3]) + ";" + sanitizeNode(dataArray[6][1]) + ";" + sanitizeNode(dataArray[6][-1]) + " ]\n")) isInDatabase(valueInput1,dataArray[6]) print dataArray[1] valueInput2 = [parseList(raw_input("Choose the metadatum among those printed above [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n"))[0]] isInDatabase(valueInput2,dataArray[1]) return graphTypeInput,getValueBacteriaMetadata(dataArray[0],dataArray[1],valueInput1,dataArray[8],dataArray[2],dataArray[3],valueInput2),typeInput,valueInput1,valueInput2 else: print "\nERROR: You need to answer 'BB' or 'BM', and not ",typeInput raise ValueError elif graphTypeInput == "pie": result,nodesGroup,sampleNameList,metadataList = percentageAct(dataArray) return graphTypeInput,result,nodesGroup,sampleNameList,metadataList else: print "\nERROR: You need to answer 'graph' or 'pie', and not ",graphTypeInput raise ValueError
def clusteringAct(dataArray): print dataArray[1] metadatum = sanitize(raw_input("Select the metadatum among those above to cluster the set of samples. [e.g. " + dataArray[1][0] + "]\n")).split(";")[0] isInDatabase([metadatum],dataArray[1]) valueSet,clusters1 = partitionSampleByMetadatumValue(metadatum,dataArray[1],dataArray[0]) clusters = [[sample[0] for sample in cluster] for cluster in clusters1] #that is, k in K-means Algorithm numberClass = len(valueSet) print "/!\ Number of classes:",numberClass,"." startSet = [cluster[0] for cluster in clusters] #Selects the starting samples of each cluster kClusters = [[start] for start in startSet] if not (len(clusters) == numberClass): print "\n/!\ ERROR: Different lengths: numberClass",numberClass,"clusters:",len(clusters),"." raise ValueError trimmedList = trimList(dataArray[3],startSet) print "/!\ Clustering with the first distance..." #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster) #@dataArray[8] = distMatchedDict kClusters,meanSamples,distanceDict,distanceInClusters = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[8],dataArray) print "-- End of first clustering --" number = 0 for cluster in kClusters: for _ in cluster: number += 1 if not (number == len(dataArray[3])): print "\n/!\ ERROR: A bug occurred during the clustering:",number,"=/=",len(dataArray[3]),"." raise ValueError #Deletes samples in cluster that are too far from the others kClusters,untaken = cleanClusters(kClusters,distanceInClusters) startSet = [cluster[0] for cluster in clusters] #Remove from untaken the starting samples untaken2 = [] for x in untaken: if not (x in startSet): untaken2.append(x) untaken = untaken2 #Remove the samples in untaken from the total set of samples sampleSet = [] for cluster in kClusters: for x in cluster: if not (x in sampleSet): sampleSet.append(x) for x in startSet: if not (x in sampleSet): sampleSet.append(x) trimmedList = trimList(sampleSet,startSet) print "/!\ Clustering with the second distance..." #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2) #@dataArray[9] = distConsensusDict kClusters,meanSamples,distanceDict,_ = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[9],dataArray)#,meanSamples) print "-- End of second clustering --" number = 0 for cluster in kClusters: for _ in cluster: number += 1 if not (number <= len(dataArray[3])): print "\n/!\ ERROR: An error occurred during the clustering:",number,">",len(dataArray[3]),"." raise ValueError print "Printing the",numberClass,"clusters:" i = 1 #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs for cluster in kClusters: print "\n-- Cluster #",i,"associated to",metadatum,"=",valueSet[i-1],":" print "Size:",len(cluster) print sorted(cluster) i += 1 print "\nScore of the clustering (comprised between 0 and 1):" print "The more it is close to 1, the more the clustering is relevant." #The clustering obtained with the K-Means method kClustersCopy = [cluster for cluster in kClusters] #The clustering obtained by comparing the values of the metadatum clustersCopy = [cluster for cluster in clusters] #Score by using first method of comparison compareClusterScore = 0 if not (len(kClustersCopy) == numberClass == len(clustersCopy)): print "\n/!\ ERROR: Length error in clustering:",numberClass,len(kClustersCopy),len(clustersCopy),"." raise ValueError while kClustersCopy and clustersCopy: cl1 = kClustersCopy.pop() cl2 = clustersCopy.pop() #clusters are non-empty x = compareCluster(cl1,cl2,untaken) if x: compareClusterScore += x else: compareClusterScore = None break if compareClusterScore: compareClusterScore = compareClusterScore/numberClass printClusterScore = compareClusterScore else: printClusterScore = "None" #Score by using second method of comparison #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass) print "Compare clusters score is:",printClusterScore,"." #print "Compare centers score is:",compareCentersScore,"." answer = raw_input("Do you want to save the results? Y/N\n") if (answer == "Y"): answer2 = raw_input("Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n") if (answer2 == "Y"): commonList = extractCommonNodes(kClusters,dataArray) elif not (answer2 == "N"): print "\n/!\ You should answer Y or N, not:",answer2,"." data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(valueSet) i = 0 for cluster in kClusters: data += "\n\n-- Cluster #" + str(i+1) + " associated to " + metadatum + " = " + str(valueSet[i]) data += "\nSize: " + str(len(cluster)) if (answer2 == "Y"): data += "\nSet of common nodes: " + str(commonList[i]) data += "\n" + str(cluster) i += 1 data += "\n\nCompare clusters score is: " + str(compareClusterScore) #data += "\n\nCompare centers score is: " + str(compareCentersScore) data += "\n\nEND OF FILE ****" print "\n/!\ Saving clusters..." writeFile(data) answer2 = raw_input("Do you want to compute the graph of the clusters? Y/N\n") if (answer2 == "Y"): print "\n/!\ Constructing the graph of the clusters..." #@dataArray[3] = filenames graph = convertClustersIntoGraph(kClusters,distanceDict,len(dataArray[3])) graphNO(graph) print "\n/!\ Done. The graph is in DOT format in \"files\" folder." elif not (answer2 == "N"): print "\n/!\ You should answer Y or N, not:",answer2,"." elif not (answer == "N"): print "/!\ You should answer by Y or N."
def randomSubSamplingAct(dataArray): print dataArray[1] metadatum = sanitize( raw_input( "Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum], dataArray[1]) s = raw_input("Input the number s of random samplings.\n") n = raw_input("Input the number n of nodes to select at each try.\n") if not integer.match(s) or not integer.match(n): print "\n/!\ ERROR: s and n must both be integers." raise ValueError numberSamples = len(dataArray[0]) numberStartingSamples = sanitize( raw_input( "Knowing there is/are " + str(numberSamples) + "sample(s), how many samples do you want to create the training set? \n" )) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples): print "\n/!\ ERROR: You should write down an integer." raise ValueError numberStartingSamples = int(numberStartingSamples) listnodes = dataArray[3].values() s, n = int(s), int(n) #Here the set of classes is a list of two lists containing the samples in C and not C bestClassification = [] bestClassesList = [] #Worse value for this coefficient currBestYouden = inf nodesNumber = len(dataArray[3]) while s: #Randomly draw n distinct nodes among the nodes in the taxonomic tree nodesList = randomChoice(listnodes, n) assignedClasses, classes, valueSet = classifyIt( dataArray, metadatum, nodesList, numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses, classes, numberSamples) res = numberClass - youdenJ if min(res, currBestYouden) == res: bestClassification = [] for i in nodesList: bestClassification.append(i) currBestYouden = res bestClassesList = [] for i in assignedClasses: bestClassesList.append(i) s -= 1 interpretIt(numberClass - currBestYouden) if answer == "Y": percentagesAs = [len(class1) for class1 in assignedClasses] labels = [metadatum + " = " + str(value) for value in valueSet] percentages = [len(class1) for class1 in classes] plotPieChart( labels, percentagesAs, "Assignments depending on " + listNodes(nodesList) + " to class for metadatum " + metadatum) plotPieChart( labels, percentages, "Real classes depending on " + listNodes(nodesList) + " for metadatum " + metadatum) answer = raw_input("Do you want to save the results? Y/N") if (answer == "Y"): writeFile( "Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this metadatum is:" + str(bestClassification), "Assignments to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return bestClassification, (numberClass - currBestYouden), bestClassesList
def randomSubSamplingAct(dataArray): print dataArray[1] metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadata,dataArray[1]) s = raw_input("Input the number s of random samplings.\n") n = raw_input("Input the number n of nodes to select at each try.\n") numberofSamples = len(dataArray[0]) if not integer.match(s) or not integer.match(n): print "\n/!\ ERROR: s and n must both be integers." raise ValueError s,n = int(s),int(n) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + " sample(s), how many samples do you want to create the training set?\n")) x = integer.match(numberStartingSamples) if not x or (x and (int(numberStartingSamples) > numberofSamples)): print "\n/!\ ERROR: You should write down an integer less or equal to",numberofSamples,"." raise ValueError numberStartingSamples = int(numberStartingSamples) #Here the set of classes is a list of two lists containing the samples in C and not C bestClassification = [] bestClassesList = [] bestShape = [] bestValuesList = [] #Worse value for this coefficient currBestYouden = inf nodesNumber = len(dataArray[3]) #@dataArray[2] = idSequences, which is a dictionary of (key=identifier,values=(name,rank of node)) listofNodes = dataArray[2].values() while s: #Randomly draw n distinct nodes among the nodes in the taxonomic tree nodesList = randomChoice(listofNodes,n) assignedClasses,classes,valueSets = classifyIt(dataArray,metadata,nodesList,numberStartingSamples) numberClass = classes.lenMDL(shape) #len(dataArray[0])? youdenJ = countYouden(assignedClasses,classes,numberofSamples) res = numberClass - youdenJ if min(res,currBestYouden) == res: bestValuesList = [] for i in valueSets: bestValuesList.append(i) bestClassification = [] for i in nodesList: bestClassification.append(i) bestShape = [] for i in shape: bestShape.append(i) currBestYouden = res bestClassesList = [] for i in assignedClasses: bestClassesList.append(i) s -= 1 interpretIt(numberClass - currBestYouden) if answer == "Y": labels = [ "Metadata: " + str(metadata) + ", Values for each metadatum: " + str([ valueSet for valueSet in valueSets]) ] percentagesAs = assignedClasses.mapMDL(len) percentages = classes.mapMDL(len) plotPie(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata)) plotPie(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadata " + str(metadata)) answer = raw_input("Do you want to save the results? Y/N \n") if (answer == "Y"): writeFile("Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this set of metadata is:" + str(bestClassification),"Assignments to classes for metadata " + str(metadata)) elif not (answer == "N"): print "\n Answer by Y or N!" return bestClassification,(numberClass - currBestYouden),bestClassesList