def userNodeSelectionAct(dataArray): print dataArray[1] metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum],dataArray[1]) listnodes = dataArray[3].values() nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) + ";" + sanitizeNode(listnodes[-1]) + " ]\n")) isInDatabase(nodesList,listnodes) numberSamples = len(dataArray[0]) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + " sample(s), how many samples do you want to create the training set?\n")) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples) : print "\n/!\ ERROR: You should write down an integer inferior or equal to ",numberSamples,"." raise ValueError numberStartingSamples = int(numberStartingSamples) assignedClasses,classes,valueSet = classifyIt(dataArray,metadatum,nodesList,numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses,classes,numberStartingSamples) interpretIt(youdenJ) answer = raw_input("Do you want to plot the classes obtained as a pie chart? Y/N") if answer == "Y": labels = [ metadatum + " = " + str(value) for value in valueSet ] percentagesAs = [ len(class1) for class1 in assignedClasses ] percentages = [ len(class1) for class1 in classes ] plotPieChart(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadatum " + metadatum) plotPieChart(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" answer = raw_input("Do you want to save the results? 
Y/N") if (answer == "Y"): writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n","Assignments depending on " + listNodes(nodesList) + " to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return assignedClasses,youdenJ
def parseInfo(filename): samplesList = [] file_matrix = open("meta/" + filename + ".csv", "r") lines = file_matrix.readlines() file_matrix.close() #Data need to be sanitized infoListDirty = lines[0].split(",") infoList = [] for info in infoListDirty: infoList.append(sanitize(info.split("\n")[0])) for line in lines[1:]: #Construction of the list associated to one sample thisSampleList = [] lsDirty = line.split(",") #Checks if lsDirty is not empty if not lsDirty: print "\n /!\ ERROR: [BUG] [parsingInfo/parseInfo] Parsing error. (1)" raise ValueError ls = [] for data in lsDirty: if not (data == ""): ls.append(data) else: #unknown values are remplaced by "N" ls.append("N") for data in ls: thisSampleList.append(sanitize(data).split("\n")[0]) #samplesList is the list of every sample's list if not (len(thisSampleList) == len(infoList)): print "\n /!\ ERROR: [BUG] [parsingInfo/parseInfo] Parsing error. (2)" raise ValueError samplesList.append(thisSampleList) return samplesList, infoList
def parseInfo(filename): samplesList = [] file_matrix = open("meta/" + filename + ".csv","r") lines = file_matrix.readlines() file_matrix.close() #Data need to be sanitized infoListDirty = lines[0].split(",") infoList = [] for info in infoListDirty: infoList.append(sanitize(info.split("\n")[0])) for line in lines[1:]: #Construction of the list associated to one sample thisSampleList = [] lsDirty = line.split(",") #Checks if lsDirty is not empty if not lsDirty: print "\n /!\ ERROR: [BUG] [parsingInfo/parseInfo] Parsing error. (1)" raise ValueError ls = [] for data in lsDirty: if not (data == ""): ls.append(data) else: #unknown values are remplaced by "N" ls.append("N") for data in ls: thisSampleList.append(sanitize(data).split("\n")[0]) #samplesList is the list of every sample's list if not (len(thisSampleList) == len(infoList)): print "\n /!\ ERROR: [BUG] [parsingInfo/parseInfo] Parsing error. (2)" raise ValueError samplesList.append(thisSampleList) return samplesList,infoList
def parseMatrix(filename): speciesList = [] samplesList = [] file_matrix = open("meta/" + filename + ".csv", "r") lines = file_matrix.readlines() file_matrix.close() boolean = True for line in lines: ls = line.split(",") # First line gives the name and rank of species in the samples if boolean: # ls is then a list of strings of type "rank:name" # Turns "rank:name" into (name,rank) for string in ls: ls1 = string.split(":") rank = sanitize(ls1[0]) # Deletes the white space after name # Otherwise equality on strings does not work name = sanitize(ls1[-1]) speciesList.append((name, rank)) boolean = False n = len(speciesList) else: thisSampleList = [] for number in ls: number = sanitize(number) if integer.match(number): thisSampleList.append(int(number)) else: thisSampleList.append(number) if not (len(thisSampleList) == n): print "\n /!\ ERROR: [BUG] [parsingMatrix/parseMatrix] Parsing error." raise ValueError samplesList.append(thisSampleList) return samplesList, speciesList
def userNodeSelectionAct(dataArray): print dataArray[1] metadatum = sanitize( raw_input( "Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum], dataArray[1]) listnodes = dataArray[3].values() nodesList = parseListNode( raw_input( "Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listnodes[-3]) + ";" + sanitizeNode(listnodes[1]) + ";" + sanitizeNode(listnodes[-1]) + " ]\n")) isInDatabase(nodesList, listnodes) numberSamples = len(dataArray[0]) numberStartingSamples = sanitize( raw_input( "Knowing there is/are " + str(numberSamples) + " sample(s), how many samples do you want to create the training set?\n" )) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples): print "\n/!\ ERROR: You should write down an integer inferior or equal to ", numberSamples, "." raise ValueError numberStartingSamples = int(numberStartingSamples) assignedClasses, classes, valueSet = classifyIt(dataArray, metadatum, nodesList, numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses, classes, numberStartingSamples) interpretIt(youdenJ) answer = raw_input( "Do you want to plot the classes obtained as a pie chart? Y/N") if answer == "Y": labels = [metadatum + " = " + str(value) for value in valueSet] percentagesAs = [len(class1) for class1 in assignedClasses] percentages = [len(class1) for class1 in classes] plotPieChart( labels, percentagesAs, "Assignments depending on " + str(nodesList) + " to class for metadatum " + metadatum) plotPieChart( labels, percentages, "Real classes depending on " + str(nodesList) + " for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" answer = raw_input("Do you want to save the results? 
Y/N") if (answer == "Y"): writeFile( "Youden's J statistic for this classification is: " + str(youdenJ) + "\n", "Assignments depending on " + listNodes(nodesList) + " to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return assignedClasses, youdenJ
def randomSubSamplingAct(dataArray): print dataArray[1] metadatum = sanitize(raw_input("Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum],dataArray[1]) s = raw_input("Input the number s of random samplings.\n") n = raw_input("Input the number n of nodes to select at each try.\n") if not integer.match(s) or not integer.match(n): print "\n/!\ ERROR: s and n must both be integers." raise ValueError numberSamples = len(dataArray[0]) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberSamples) + "sample(s), how many samples do you want to create the training set? \n")) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples): print "\n/!\ ERROR: You should write down an integer." raise ValueError numberStartingSamples = int(numberStartingSamples) listnodes = dataArray[3].values() s,n = int(s),int(n) #Here the set of classes is a list of two lists containing the samples in C and not C bestClassification = [] bestClassesList = [] #Worse value for this coefficient currBestYouden = inf nodesNumber = len(dataArray[3]) while s: #Randomly draw n distinct nodes among the nodes in the taxonomic tree nodesList = randomChoice(listnodes,n) assignedClasses,classes,valueSet = classifyIt(dataArray,metadatum,nodesList,numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses,classes,numberSamples) res = numberClass - youdenJ if min(res,currBestYouden) == res: bestClassification = [] for i in nodesList: bestClassification.append(i) currBestYouden = res bestClassesList = [] for i in assignedClasses: bestClassesList.append(i) s -= 1 interpretIt(numberClass - currBestYouden) if answer == "Y": percentagesAs = [ len(class1) for class1 in assignedClasses ] labels = [ metadatum + " = " + str(value) for value in valueSet ] percentages = [ len(class1) for class1 in classes ] 
plotPieChart(labels,percentagesAs,"Assignments depending on " + listNodes(nodesList) + " to class for metadatum " + metadatum) plotPieChart(labels,percentages,"Real classes depending on " + listNodes(nodesList) + " for metadatum " + metadatum) answer = raw_input("Do you want to save the results? Y/N") if (answer == "Y"): writeFile("Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this metadatum is:" + str(bestClassification),"Assignments to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return bestClassification,(numberClass - currBestYouden),bestClassesList
def userNodeSelectionAct(dataArray): print dataArray[1] metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadata,dataArray[1]) #@dataArray[2] = idSequences is a dictionary of (key=identifier,value=(name,rank of node)) listofNodes = dataArray[2].values() nodesList = parseListNode(raw_input("Choose the group of nodes you want to consider exclusively. [ Read the taxonomic tree to help you: e.g. " + sanitizeNode(listofNodes[-3]) + ";" + sanitizeNode(listofNodes[1]) + ";" + sanitizeNode(listofNodes[-1]) + " ]\n")) isInDatabase(nodesList,listofNodes) numberofSamples = len(dataArray[0]) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + "sample(s), how many samples do you want to create the training set? \n")) x = integer.match(numberStartingSamples) if not x or (x and (int(numberStartingSamples) > numberofSamples)): print "\n/!\ ERROR: You should write down an integer less or equal to",numberofSamples,"." raise ValueError numberStartingSamples = int(numberStartingSamples) #@shape for @assignedClasses is the same than the one for @classes assignedClasses,classes,valueSets = classifyIt(dataArray,metadata,nodesList,numberStartingSamples) numberClass = classes.lenMDL() youdenJ = countYouden(assignedClasses,classes,numberofSamples) interpretIt(youdenJ) answer = raw_input("Do you want to plot the classes obtained as a pie chart? 
Y/N\n") if answer == "Y": labels = [ "Metadata: " + str(metadata) + ", Values for each metadatum: " + str([ valueSet for valueSet in valueSets]) ] percentagesAs = assignedClasses.mapMDL(len) percentages = classes.mapMDL(len) plotPie(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata)) plotPie(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadata " + str(metadata)) elif not (answer == "N"): print "\n Answer by Y or N!" answer = raw_input("Do you want to save the results? Y/N \n") if (answer == "Y"): writeFile("Youden's J statistic for this classification is: " + str(youdenJ) + "\n","Assignments depending on " + listNodes(nodesList) + " to classes for metadata " + str(metadata)) elif not (answer == "N"): print "\n Answer by Y or N!" return assignedClasses,youdenJ
def change_name(discord_id, new_name):
    """Updates the stored player name for the given Discord user.

    BUG FIXES: the previous version quoted the placeholders ('%s'), which
    defeats DB-API parameter binding (values would be taken literally and the
    query becomes injection-prone if binding is bypassed), and it passed the
    parameters as two positional arguments to cur.execute instead of one
    sequence, which raises a TypeError under DB-API 2.0.
    """
    cur = db.cursor()
    new_name = sanitize(new_name)
    uuid = get_uuid(discord_id)
    # Unquoted placeholders: the driver performs all quoting/escaping
    sql = """
        UPDATE players
        SET name=%s
        WHERE uuid=%s
    """
    cur.execute(sql, (new_name, uuid))
def clusteringAct(dataArray): print dataArray[1] metadatum = sanitize( raw_input( "Select the metadatum among those above to cluster the set of samples. [e.g. " + dataArray[1][0] + "]\n")).split(";")[0] isInDatabase([metadatum], dataArray[1]) valueSet, clusters1 = partitionSampleByMetadatumValue( metadatum, dataArray[1], dataArray[0]) clusters = [[sample[0] for sample in cluster] for cluster in clusters1] #that is, k in K-means Algorithm numberClass = len(valueSet) print "/!\ Number of classes:", numberClass, "." startSet = [cluster[0] for cluster in clusters] #Selects the starting samples of each cluster kClusters = [[start] for start in startSet] if not (len(clusters) == numberClass): print "\n/!\ ERROR: Different lengths: numberClass", numberClass, "clusters:", len( clusters), "." raise ValueError trimmedList = trimList(dataArray[3], startSet) print "/!\ Clustering with the first distance..." #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster) #@dataArray[8] = distMatchedDict kClusters, meanSamples, distanceDict, distanceInClusters = kMeans( trimmedList, numberClass, kClusters, startSet, dataArray[8], dataArray) print "-- End of first clustering --" number = 0 for cluster in kClusters: for _ in cluster: number += 1 if not (number == len(dataArray[3])): print "\n/!\ ERROR: A bug occurred during the clustering:", number, "=/=", len( dataArray[3]), "." 
raise ValueError #Deletes samples in cluster that are too far from the others kClusters, untaken = cleanClusters(kClusters, distanceInClusters) startSet = [cluster[0] for cluster in clusters] #Remove from untaken the starting samples untaken2 = [] for x in untaken: if not (x in startSet): untaken2.append(x) untaken = untaken2 #Remove the samples in untaken from the total set of samples sampleSet = [] for cluster in kClusters: for x in cluster: if not (x in sampleSet): sampleSet.append(x) for x in startSet: if not (x in sampleSet): sampleSet.append(x) trimmedList = trimList(sampleSet, startSet) print "/!\ Clustering with the second distance..." #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2) #@dataArray[9] = distConsensusDict kClusters, meanSamples, distanceDict, _ = kMeans(trimmedList, numberClass, kClusters, startSet, dataArray[9], dataArray) #,meanSamples) print "-- End of second clustering --" number = 0 for cluster in kClusters: for _ in cluster: number += 1 if not (number <= len(dataArray[3])): print "\n/!\ ERROR: An error occurred during the clustering:", number, ">", len( dataArray[3]), "." raise ValueError print "Printing the", numberClass, "clusters:" i = 1 #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs for cluster in kClusters: print "\n-- Cluster #", i, "associated to", metadatum, "=", valueSet[ i - 1], ":" print "Size:", len(cluster) print sorted(cluster) i += 1 print "\nScore of the clustering (comprised between 0 and 1):" print "The more it is close to 1, the more the clustering is relevant." 
#The clustering obtained with the K-Means method kClustersCopy = [cluster for cluster in kClusters] #The clustering obtained by comparing the values of the metadatum clustersCopy = [cluster for cluster in clusters] #Score by using first method of comparison compareClusterScore = 0 if not (len(kClustersCopy) == numberClass == len(clustersCopy)): print "\n/!\ ERROR: Length error in clustering:", numberClass, len( kClustersCopy), len(clustersCopy), "." raise ValueError while kClustersCopy and clustersCopy: cl1 = kClustersCopy.pop() cl2 = clustersCopy.pop() #clusters are non-empty x = compareCluster(cl1, cl2, untaken) if x: compareClusterScore += x else: compareClusterScore = None break if compareClusterScore: compareClusterScore = compareClusterScore / numberClass printClusterScore = compareClusterScore else: printClusterScore = "None" #Score by using second method of comparison #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass) print "Compare clusters score is:", printClusterScore, "." #print "Compare centers score is:",compareCentersScore,"." answer = raw_input("Do you want to save the results? Y/N\n") if (answer == "Y"): answer2 = raw_input( "Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n" ) if (answer2 == "Y"): commonList = extractCommonNodes(kClusters, dataArray) elif not (answer2 == "N"): print "\n/!\ You should answer Y or N, not:", answer2, "." 
data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str( valueSet) i = 0 for cluster in kClusters: data += "\n\n-- Cluster #" + str( i + 1) + " associated to " + metadatum + " = " + str( valueSet[i]) data += "\nSize: " + str(len(cluster)) if (answer2 == "Y"): data += "\nSet of common nodes: " + str(commonList[i]) data += "\n" + str(cluster) i += 1 data += "\n\nCompare clusters score is: " + str(compareClusterScore) #data += "\n\nCompare centers score is: " + str(compareCentersScore) data += "\n\nEND OF FILE ****" print "\n/!\ Saving clusters..." writeFile(data) answer2 = raw_input( "Do you want to compute the graph of the clusters? Y/N\n") if (answer2 == "Y"): print "\n/!\ Constructing the graph of the clusters..." #@dataArray[3] = filenames graph = convertClustersIntoGraph(kClusters, distanceDict, len(dataArray[3])) graphNO(graph) print "\n/!\ Done. The graph is in DOT format in \"files\" folder." elif not (answer2 == "N"): print "\n/!\ You should answer Y or N, not:", answer2, "." elif not (answer == "N"): print "/!\ You should answer by Y or N."
def clusteringAct(dataArray): print dataArray[1] metadatum = sanitize(raw_input("Select the metadatum among those above to cluster the set of samples. [e.g. " + dataArray[1][0] + "]\n")).split(";")[0] isInDatabase([metadatum],dataArray[1]) valueSet,clusters1 = partitionSampleByMetadatumValue(metadatum,dataArray[1],dataArray[0]) clusters = [[sample[0] for sample in cluster] for cluster in clusters1] #that is, k in K-means Algorithm numberClass = len(valueSet) print "/!\ Number of classes:",numberClass,"." startSet = [cluster[0] for cluster in clusters] #Selects the starting samples of each cluster kClusters = [[start] for start in startSet] if not (len(clusters) == numberClass): print "\n/!\ ERROR: Different lengths: numberClass",numberClass,"clusters:",len(clusters),"." raise ValueError trimmedList = trimList(dataArray[3],startSet) print "/!\ Clustering with the first distance..." #@distanceInClusters is a list of lists of (sample,sum of all distances from this sample to others samples in the same cluster) #@dataArray[8] = distMatchedDict kClusters,meanSamples,distanceDict,distanceInClusters = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[8],dataArray) print "-- End of first clustering --" number = 0 for cluster in kClusters: for _ in cluster: number += 1 if not (number == len(dataArray[3])): print "\n/!\ ERROR: A bug occurred during the clustering:",number,"=/=",len(dataArray[3]),"." 
raise ValueError #Deletes samples in cluster that are too far from the others kClusters,untaken = cleanClusters(kClusters,distanceInClusters) startSet = [cluster[0] for cluster in clusters] #Remove from untaken the starting samples untaken2 = [] for x in untaken: if not (x in startSet): untaken2.append(x) untaken = untaken2 #Remove the samples in untaken from the total set of samples sampleSet = [] for cluster in kClusters: for x in cluster: if not (x in sampleSet): sampleSet.append(x) for x in startSet: if not (x in sampleSet): sampleSet.append(x) trimmedList = trimList(sampleSet,startSet) print "/!\ Clustering with the second distance..." #@distanceDict is the distance dictionary (key=(sample1,sample2),value=distance between sample1 and sample2) #@dataArray[9] = distConsensusDict kClusters,meanSamples,distanceDict,_ = kMeans(trimmedList,numberClass,kClusters,startSet,dataArray[9],dataArray)#,meanSamples) print "-- End of second clustering --" number = 0 for cluster in kClusters: for _ in cluster: number += 1 if not (number <= len(dataArray[3])): print "\n/!\ ERROR: An error occurred during the clustering:",number,">",len(dataArray[3]),"." raise ValueError print "Printing the",numberClass,"clusters:" i = 1 #@kClusters contains the list of the k clusters. Each cluster is a list of sample IDs for cluster in kClusters: print "\n-- Cluster #",i,"associated to",metadatum,"=",valueSet[i-1],":" print "Size:",len(cluster) print sorted(cluster) i += 1 print "\nScore of the clustering (comprised between 0 and 1):" print "The more it is close to 1, the more the clustering is relevant." 
#The clustering obtained with the K-Means method kClustersCopy = [cluster for cluster in kClusters] #The clustering obtained by comparing the values of the metadatum clustersCopy = [cluster for cluster in clusters] #Score by using first method of comparison compareClusterScore = 0 if not (len(kClustersCopy) == numberClass == len(clustersCopy)): print "\n/!\ ERROR: Length error in clustering:",numberClass,len(kClustersCopy),len(clustersCopy),"." raise ValueError while kClustersCopy and clustersCopy: cl1 = kClustersCopy.pop() cl2 = clustersCopy.pop() #clusters are non-empty x = compareCluster(cl1,cl2,untaken) if x: compareClusterScore += x else: compareClusterScore = None break if compareClusterScore: compareClusterScore = compareClusterScore/numberClass printClusterScore = compareClusterScore else: printClusterScore = "None" #Score by using second method of comparison #compareCentersScore = compareCenters(meanSamples,distanceDict,numberClass) print "Compare clusters score is:",printClusterScore,"." #print "Compare centers score is:",compareCentersScore,"." answer = raw_input("Do you want to save the results? Y/N\n") if (answer == "Y"): answer2 = raw_input("Do you want to compute the sets of common nodes for each cluster? [It can be considered relevant when the score of comparing clusters is at least over 0.5] Y/N\n") if (answer2 == "Y"): commonList = extractCommonNodes(kClusters,dataArray) elif not (answer2 == "N"): print "\n/!\ You should answer Y or N, not:",answer2,"." 
data = "**** CLUSTERS FOR METADATUM " + metadatum + " WITH VALUES: " + str(valueSet) i = 0 for cluster in kClusters: data += "\n\n-- Cluster #" + str(i+1) + " associated to " + metadatum + " = " + str(valueSet[i]) data += "\nSize: " + str(len(cluster)) if (answer2 == "Y"): data += "\nSet of common nodes: " + str(commonList[i]) data += "\n" + str(cluster) i += 1 data += "\n\nCompare clusters score is: " + str(compareClusterScore) #data += "\n\nCompare centers score is: " + str(compareCentersScore) data += "\n\nEND OF FILE ****" print "\n/!\ Saving clusters..." writeFile(data) answer2 = raw_input("Do you want to compute the graph of the clusters? Y/N\n") if (answer2 == "Y"): print "\n/!\ Constructing the graph of the clusters..." #@dataArray[3] = filenames graph = convertClustersIntoGraph(kClusters,distanceDict,len(dataArray[3])) graphNO(graph) print "\n/!\ Done. The graph is in DOT format in \"files\" folder." elif not (answer2 == "N"): print "\n/!\ You should answer Y or N, not:",answer2,"." elif not (answer == "N"): print "/!\ You should answer by Y or N."
def getNameRankList(string):
    """Turns a "rank:name" string into the singleton list [(name, rank)],
    with both parts sanitized and truncated at the first newline."""
    pieces = string.split(":")
    # Name comes after the last ":", rank before the first one
    name = sanitize(pieces[-1]).split("\n")[0]
    rank = sanitize(pieces[0]).split("\n")[0]
    return [(name, rank)]
def randomSubSamplingAct(dataArray): print dataArray[1] metadatum = sanitize( raw_input( "Input the metadatum that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + " ]\n")).split(";")[0] isInDatabase([metadatum], dataArray[1]) s = raw_input("Input the number s of random samplings.\n") n = raw_input("Input the number n of nodes to select at each try.\n") if not integer.match(s) or not integer.match(n): print "\n/!\ ERROR: s and n must both be integers." raise ValueError numberSamples = len(dataArray[0]) numberStartingSamples = sanitize( raw_input( "Knowing there is/are " + str(numberSamples) + "sample(s), how many samples do you want to create the training set? \n" )) x = integer.match(numberStartingSamples) if not x or (x and int(numberStartingSamples) > numberSamples): print "\n/!\ ERROR: You should write down an integer." raise ValueError numberStartingSamples = int(numberStartingSamples) listnodes = dataArray[3].values() s, n = int(s), int(n) #Here the set of classes is a list of two lists containing the samples in C and not C bestClassification = [] bestClassesList = [] #Worse value for this coefficient currBestYouden = inf nodesNumber = len(dataArray[3]) while s: #Randomly draw n distinct nodes among the nodes in the taxonomic tree nodesList = randomChoice(listnodes, n) assignedClasses, classes, valueSet = classifyIt( dataArray, metadatum, nodesList, numberStartingSamples) numberClass = len(classes) youdenJ = countYouden(assignedClasses, classes, numberSamples) res = numberClass - youdenJ if min(res, currBestYouden) == res: bestClassification = [] for i in nodesList: bestClassification.append(i) currBestYouden = res bestClassesList = [] for i in assignedClasses: bestClassesList.append(i) s -= 1 interpretIt(numberClass - currBestYouden) if answer == "Y": percentagesAs = [len(class1) for class1 in assignedClasses] labels = [metadatum + " = " + str(value) for value in valueSet] percentages = [len(class1) for class1 in 
classes] plotPieChart( labels, percentagesAs, "Assignments depending on " + listNodes(nodesList) + " to class for metadatum " + metadatum) plotPieChart( labels, percentages, "Real classes depending on " + listNodes(nodesList) + " for metadatum " + metadatum) answer = raw_input("Do you want to save the results? Y/N") if (answer == "Y"): writeFile( "Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this metadatum is:" + str(bestClassification), "Assignments to classes for metadatum " + metadatum) elif not (answer == "N"): print "\n Answer by Y or N!" return bestClassification, (numberClass - currBestYouden), bestClassesList
def randomSubSamplingAct(dataArray): print dataArray[1] metadata = parseList(raw_input("Input the metadata that will cluster the set of samples among those written above. [ e.g. " + dataArray[1][0] + ";" + dataArray[1][-1] + " ]\n")) isInDatabase(metadata,dataArray[1]) s = raw_input("Input the number s of random samplings.\n") n = raw_input("Input the number n of nodes to select at each try.\n") numberofSamples = len(dataArray[0]) if not integer.match(s) or not integer.match(n): print "\n/!\ ERROR: s and n must both be integers." raise ValueError s,n = int(s),int(n) numberStartingSamples = sanitize(raw_input("Knowing there is/are " + str(numberofSamples) + " sample(s), how many samples do you want to create the training set?\n")) x = integer.match(numberStartingSamples) if not x or (x and (int(numberStartingSamples) > numberofSamples)): print "\n/!\ ERROR: You should write down an integer less or equal to",numberofSamples,"." raise ValueError numberStartingSamples = int(numberStartingSamples) #Here the set of classes is a list of two lists containing the samples in C and not C bestClassification = [] bestClassesList = [] bestShape = [] bestValuesList = [] #Worse value for this coefficient currBestYouden = inf nodesNumber = len(dataArray[3]) #@dataArray[2] = idSequences, which is a dictionary of (key=identifier,values=(name,rank of node)) listofNodes = dataArray[2].values() while s: #Randomly draw n distinct nodes among the nodes in the taxonomic tree nodesList = randomChoice(listofNodes,n) assignedClasses,classes,valueSets = classifyIt(dataArray,metadata,nodesList,numberStartingSamples) numberClass = classes.lenMDL(shape) #len(dataArray[0])? 
youdenJ = countYouden(assignedClasses,classes,numberofSamples) res = numberClass - youdenJ if min(res,currBestYouden) == res: bestValuesList = [] for i in valueSets: bestValuesList.append(i) bestClassification = [] for i in nodesList: bestClassification.append(i) bestShape = [] for i in shape: bestShape.append(i) currBestYouden = res bestClassesList = [] for i in assignedClasses: bestClassesList.append(i) s -= 1 interpretIt(numberClass - currBestYouden) if answer == "Y": labels = [ "Metadata: " + str(metadata) + ", Values for each metadatum: " + str([ valueSet for valueSet in valueSets]) ] percentagesAs = assignedClasses.mapMDL(len) percentages = classes.mapMDL(len) plotPie(labels,percentagesAs,"Assignments depending on " + str(nodesList) + " to class for metadata " + str(metadata)) plotPie(labels,percentages,"Real classes depending on " + str(nodesList) + " for metadata " + str(metadata)) answer = raw_input("Do you want to save the results? Y/N \n") if (answer == "Y"): writeFile("Best Youden's J statistic for this classification is: " + str(numberClass - currBestYouden) + "\nand most relevant list of nodes for this set of metadata is:" + str(bestClassification),"Assignments to classes for metadata " + str(metadata)) elif not (answer == "N"): print "\n Answer by Y or N!" return bestClassification,(numberClass - currBestYouden),bestClassesList