def emlimitateUnusedFeature(self, trainData, testData = None):
    """Drop every feature that is zero for all training instances.

    Scans each attribute (the last/class attribute excluded) and removes
    any attribute whose value is never > 0 in any training instance.  When
    testData is supplied the same column is removed from it as well, so the
    two sets stay aligned.

    :param trainData: weka Instances used to decide which features are unused
    :param testData: optional weka Instances filtered in lockstep
    :return: [filteredTrainData, filteredTestData] (second entry may be None)
    """
    trainData.set_class_index(trainData.num_attributes() - 1)  # class is the last attribute
    filteredTrainData = trainData
    filteredTestData = testData
    attribute_index = 0
    while attribute_index < filteredTrainData.num_attributes() - 1:
        # Is this feature used (value > 0) by at least one instance?
        used = False
        for instance_index in range(0, filteredTrainData.num_instances()):
            instance = filteredTrainData.get_instance(instance_index)
            if instance.get_value(attribute_index) > 0:
                used = True
                break  # one hit is enough; no need to scan the remaining instances
        if not used:
            # The Remove filter's -R option is 1-based, hence the +1.
            remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                            options=["-R", str(attribute_index + 1)])
            remove.set_inputformat(filteredTrainData)
            filteredTrainData = remove.filter(filteredTrainData)
            if filteredTestData:
                remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                                options=["-R", str(attribute_index + 1)])
                remove.set_inputformat(filteredTestData)
                filteredTestData = remove.filter(filteredTestData)
            # Do not advance: the next attribute has shifted into this index.
        else:
            attribute_index += 1
    return [filteredTrainData, filteredTestData]
def attributeSelector(self, data, selectNum):
    """Keep the selectNum attributes ranked highest by information gain.

    Applies weka's supervised AttributeSelection filter with an
    InfoGainAttributeEval evaluator and a Ranker search capped at
    selectNum attributes.
    """
    search = ("weka.attributeSelection.Ranker -T -1.7976931348623157E308 -N "
              + str(selectNum))
    selector = Filter(
        classname="weka.filters.supervised.attribute.AttributeSelection",
        options=["-S", search,
                 "-E", "weka.attributeSelection.InfoGainAttributeEval"])
    selector.set_inputformat(data)
    return selector.filter(data)
def filterUnusedFeatureFromList(self, data, unusedFuncitonList):
    """Remove every attribute whose name matches a prefix in unusedFuncitonList."""
    filteredData = data
    for name in unusedFuncitonList:
        # RemoveByName drops attributes matching the -E regular expression.
        byName = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
                        options=["-E", "^" + name + ".*$"])
        byName.set_inputformat(filteredData)
        filteredData = byName.filter(filteredData)
    return filteredData
def filterUnusedFeatureFromList(self, data, unusedFuncitonList):
    """Drop attributes whose names start with any entry of unusedFuncitonList."""
    result = data
    for prefix in unusedFuncitonList:
        pattern = "^" + prefix + ".*$"
        remover = Filter(
            classname="weka.filters.unsupervised.attribute.RemoveByName",
            options=["-E", pattern])
        remover.set_inputformat(result)
        result = remover.filter(result)
    return result
def getSetDataBySetIndex(self, data, index):
    """Cut one feature set (plus the class column) out of data.

    Set boundaries come from FeatureTable.getEachSetStartIndex(); the
    Remove filter is inverted (-V) so only columns start..end and the
    last (class) column are kept.
    """
    bounds = FeatureTable().getEachSetStartIndex()
    first = bounds[index]
    last = bounds[index + 1] - 1
    keep = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                  options=["-V", "-R", str(first) + "-" + str(last) + ",last"])
    keep.set_inputformat(data)
    return keep.filter(data)
def filterOutUnnecessaryAPIAndEvaluateOurApproach(self, ourApproahFile, apiFile, indexInTable, methodName, databaseTable, csvFilePath):
    """Remove unselected API features from an ARFF file, then evaluate the
    chosen classifier at growing feature-subset sizes (10, 20, ...), writing
    one CSV row of accuracies.

    :param ourApproahFile: ARFF path holding the full feature set of our approach
    :param apiFile: ARFF path used for the API attribute-selection step
    :param indexInTable: index into self.algorithmTable (picks the classifier)
    :param methodName: label written as the first CSV column
    :param databaseTable: DB table with the API feature ranking
    :param csvFilePath: output CSV path ("" skips the title row)
    """
    outputStr = methodName + ","
    resultList = []
    # Get whole feature set of our approach
    filteredData = self.load_Arff(ourApproahFile)
    # Use this function to get selected API feature and save the unselected api in a list
    # (index [1] of the returned list is assumed to be the filtered-out names — TODO confirm)
    filterOutList = self.attribueSelectionBasedOnRankingInDatabase(
        apiFile, indexInTable, databaseTable, "")[1]
    # Remove unselected API
    for functionName in filterOutList:
        # Escape "(", ")" and "$" so the name is usable inside the -E regex.
        functionName = functionName.split("(")[0] + "\(\)"
        functionName = functionName.replace('$', '\$')
        remove = Filter(
            classname="weka.filters.unsupervised.attribute.RemoveByName",
            options=["-E", "^" + functionName + ".*$"])
        remove.set_inputformat(filteredData)
        filteredData = remove.filter(filteredData)
    featureNum = filteredData.num_attributes() - 1  # class attribute excluded
    print "featureNum: " + str(featureNum)
    if csvFilePath != "":
        self.writeTenScaledTitleManual(featureNum, csvFilePath)
    #print "i:" + str(i)
    #print "functionName:" + functionName
    #print "featureNum: " + str(filteredData.num_attributes() - 1)
    for attributeStr in filteredData.attributes():
        print(attributeStr)
    # Run ten scaled generation and evaluation
    step = 10
    while step < featureNum:
        # Evaluate on the top `step` info-gain-ranked features.
        roundData = self.attributeSelector(filteredData, step)
        classifier = self.algorithmPicker(roundData, indexInTable)
        evaluation = self.evaluation(classifier, roundData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(roundData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        #csvFile.write("{:.2f}".format(evaluation.percent_correct()) +",")
        step += 10
    # Final round: evaluate on the complete filtered feature set.
    classifier = self.algorithmPicker(filteredData, indexInTable)
    evaluation = self.evaluation(classifier, filteredData)
    #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredData.num_attributes() - 1) + "/" + str(featureNum))
    resultList.append("{:.2f}".format(evaluation.percent_correct()))
    # Write out to CSV file
    for item in resultList:
        outputStr += item + ","
    outputStr = outputStr[0:-1] + "\n"  # drop trailing comma, terminate the row
    self.writeToPath(csvFilePath, outputStr)
def getSetDataBySetIndex(self, data, index):
    """Return the index-th feature set together with the class column."""
    table = FeatureTable()
    starts = table.getEachSetStartIndex()
    rangeSpec = str(starts[index]) + "-" + str(starts[index + 1] - 1) + ",last"
    # -V inverts the selection: keep the listed columns, drop everything else.
    keepOnly = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                      options=["-V", "-R", rangeSpec])
    keepOnly.set_inputformat(data)
    filteredData = keepOnly.filter(data)
    return filteredData
def _pre_process_to_classification(self, dataset):
    """Turn the last attribute into a nominal 0/1 class.

    First maps the last attribute to 1 where its value is > 0 and to 0
    otherwise (MathExpression, class unset temporarily), then converts
    that numeric column to nominal so classifiers can use it as a class.
    """
    binarize = Filter(
        classname='weka.filters.unsupervised.attribute.MathExpression',
        options=['-unset-class-temporarily', '-E', "ifelse ( A>0, 1, 0 )",
                 '-V', '-R', 'last'])
    binarize.set_inputformat(dataset)
    binarized = binarize.filter(dataset)
    toNominal = Filter(
        classname='weka.filters.unsupervised.attribute.NumericToNominal',
        options=['-R', 'last'])
    toNominal.set_inputformat(binarized)
    return toNominal.filter(binarized)
def filterOutUnnecessaryAPIAndEvaluateOurApproach(self, ourApproahFile, apiFile, indexInTable, methodName, databaseTable, csvFilePath):
    """Filter unselected API features out of an ARFF file and evaluate the
    picked classifier at 10-feature increments, appending one CSV row of
    accuracy percentages.

    :param ourApproahFile: ARFF path with the full feature set
    :param apiFile: ARFF path used for the API attribute-selection step
    :param indexInTable: index into self.algorithmTable (classifier choice)
    :param methodName: label written as the first CSV column
    :param databaseTable: DB table holding the API feature ranking
    :param csvFilePath: output CSV path ("" skips writing the title row)
    """
    outputStr = methodName+","
    resultList = []
    # Get whole feature set of our approach
    filteredData = self.load_Arff(ourApproahFile)
    # Use this function to get selected API feature and save the unselected api in a list
    filterOutList = self.attribueSelectionBasedOnRankingInDatabase(apiFile, indexInTable, databaseTable, "")[1]
    # Remove unselected API
    for functionName in filterOutList:
        # Escape regex metacharacters "(", ")" and "$" in the attribute name.
        functionName = functionName.split("(")[0] + "\(\)"
        functionName = functionName.replace('$','\$')
        remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
                        options=["-E", "^" + functionName + ".*$"])
        remove.set_inputformat(filteredData)
        filteredData = remove.filter(filteredData)
    featureNum = filteredData.num_attributes() - 1  # class attribute excluded
    print "featureNum: " + str(featureNum)
    if csvFilePath != "":
        self.writeTenScaledTitleManual(featureNum, csvFilePath)
    #print "i:" + str(i)
    #print "functionName:" + functionName
    #print "featureNum: " + str(filteredData.num_attributes() - 1)
    for attributeStr in filteredData.attributes():
        print(attributeStr)
    # Run ten scaled generation and evaluation
    step = 10
    while step < featureNum:
        # Evaluate using only the top `step` ranked features.
        roundData = self.attributeSelector(filteredData, step)
        classifier = self.algorithmPicker(roundData, indexInTable)
        evaluation = self.evaluation(classifier, roundData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(roundData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        #csvFile.write("{:.2f}".format(evaluation.percent_correct()) +",")
        step += 10
    # Final round on the complete filtered feature set.
    classifier = self.algorithmPicker(filteredData, indexInTable)
    evaluation = self.evaluation(classifier, filteredData)
    #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredData.num_attributes() - 1) + "/" + str(featureNum))
    resultList.append("{:.2f}".format(evaluation.percent_correct()))
    # Write out to CSV file
    for item in resultList:
        outputStr += item +","
    outputStr = outputStr[0:-1] + "\n"  # strip trailing comma, end the row
    self.writeToPath(csvFilePath, outputStr)
def createTwoDatasets(self, wholeDataPath, trainingDataPercentage, trainingPath, testingPath, shuffleSeed=43): wholeData = self.load_Arff(wholeDataPath) randomize = Filter( classname="weka.filters.unsupervised.instance.Randomize", options=["-S", str(shuffleSeed)]) randomize.set_inputformat(wholeData) wholeData = randomize.filter(wholeData) removePercentage = Filter( classname="weka.filters.unsupervised.instance.RemovePercentage", options=["-P", str(trainingDataPercentage), "-V"]) removePercentage.set_inputformat(wholeData) trainingData = removePercentage.filter(wholeData) print "instances:" + str(trainingData.num_instances()) removePercentage = Filter( classname="weka.filters.unsupervised.instance.RemovePercentage", options=["-P", str(trainingDataPercentage)]) removePercentage.set_inputformat(wholeData) testingData = removePercentage.filter(wholeData) print "instances:" + str(testingData.num_instances()) self.save_Arff(trainingData, trainingPath) self.save_Arff(testingData, testingPath)
def emlimitateUnusedFeature(self, trainData, testData=None):
    """Remove features that no training instance ever uses (value never > 0).

    The class attribute (last column) is excluded from the scan.  When
    testData is given, the same columns are removed from it so both sets
    keep identical attribute layouts.

    :param trainData: weka Instances driving the unused-feature decision
    :param testData: optional weka Instances filtered in lockstep
    :return: [filteredTrainData, filteredTestData] (second entry may be None)
    """
    trainData.set_class_index(trainData.num_attributes() - 1)  # set class attribute
    filteredTrainData = trainData
    filteredTestData = testData
    attribute_index = 0
    while attribute_index < filteredTrainData.num_attributes() - 1:
        # Check whether any instance has a positive value for this feature.
        covered = False
        for instance_index in range(0, filteredTrainData.num_instances()):
            instance = filteredTrainData.get_instance(instance_index)
            if instance.get_value(attribute_index) > 0:
                covered = True
                break  # a single positive value is enough to keep the feature
        if not covered:
            # Remove uses 1-based column indices, hence the +1.
            remove = Filter(
                classname="weka.filters.unsupervised.attribute.Remove",
                options=["-R", str(attribute_index + 1)])
            remove.set_inputformat(filteredTrainData)
            filteredTrainData = remove.filter(filteredTrainData)
            if filteredTestData:
                remove = Filter(
                    classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", str(attribute_index + 1)])
                remove.set_inputformat(filteredTestData)
                filteredTestData = remove.filter(filteredTestData)
            # Index intentionally not advanced: columns shifted left by one.
        else:
            attribute_index += 1
    return [filteredTrainData, filteredTestData]
def createTwoDatasets(self, wholeDataPath, trainingDataPercentage, trainingPath, testingPath, shuffleSeed = 43): wholeData = self.load_Arff(wholeDataPath) randomize = Filter(classname="weka.filters.unsupervised.instance.Randomize", options=["-S", str(shuffleSeed)]) randomize.set_inputformat(wholeData) wholeData = randomize.filter(wholeData) removePercentage = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage", options=["-P", str(trainingDataPercentage), "-V"]) removePercentage.set_inputformat(wholeData) trainingData = removePercentage.filter(wholeData) print "instances:" + str(trainingData.num_instances()) removePercentage = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage", options=["-P", str(trainingDataPercentage)]) removePercentage.set_inputformat(wholeData) testingData = removePercentage.filter(wholeData) print "instances:" + str(testingData.num_instances()) self.save_Arff(trainingData, trainingPath) self.save_Arff(testingData, testingPath)
# Script fragment: cluster the iris dataset with SimpleKMeans and visualize
# the assignments.  Relies on `jvm`, `data_dir` and `os` being set up earlier
# in the file (not visible in this chunk), and is truncated mid-statement at
# the end.
from weka.core.converters import Loader
from weka.clusterers import Clusterer, ClusterEvaluation
from weka.filters import Filter
import weka.plot.clusterers as plc

jvm.start()

# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute (clustering is unsupervised)
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.set_inputformat(data)
filtered = flt.filter(data)

# build KMeans
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.get_cluster_results())
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
# NOTE(review): the Filter(...) call below continues past the end of this chunk.
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
writer.writerow(row) # close csvfile csvfile.close() # start JVM jvm.start() # load CSV file loader = Loader(classname="weka.core.converters.CSVLoader", options=["-E", '"', "-F", ","]) data = loader.load_file(csvfilename) #print(data) # convert class to nominal wfilter = Filter(classname="weka.filters.unsupervised.attribute.StringToNominal", options=["-R", "last"]) wfilter.set_inputformat(data) data = wfilter.filter(data) # convert content to string wfilter = Filter(classname="weka.filters.unsupervised.attribute.NominalToString", options=["-C", "first"]) wfilter.set_inputformat(data) data = wfilter.filter(data) # set class attribute data.set_class_index(data.num_attributes() - 1) # generate baseline zeror = Classifier(classname="weka.classifiers.rules.ZeroR") evaluation = Evaluation(data) evaluation.crossvalidate_model(zeror, data, 10, Random(1)) print("\nBaseline:\n" + evaluation.to_summary())
csvfile.close() # start JVM jvm.start() # load CSV file loader = Loader(classname="weka.core.converters.CSVLoader", options=["-E", '"', "-F", ","]) data = loader.load_file(csvfilename) #print(data) # convert class to nominal wfilter = Filter( classname="weka.filters.unsupervised.attribute.StringToNominal", options=["-R", "last"]) wfilter.set_inputformat(data) data = wfilter.filter(data) # convert content to string wfilter = Filter( classname="weka.filters.unsupervised.attribute.NominalToString", options=["-C", "first"]) wfilter.set_inputformat(data) data = wfilter.filter(data) # set class attribute data.set_class_index(data.num_attributes() - 1) # generate baseline zeror = Classifier(classname="weka.classifiers.rules.ZeroR") evaluation = Evaluation(data)
# load diabetes loader = Loader(classname="weka.core.converters.ArffLoader") fname = data_dir + os.sep + "diabetes.arff" print("\nLoading dataset: " + fname + "\n") data = loader.load_file(fname) data.set_class_index(data.num_attributes() - 1) # simulate the 10 train/test pairs of cross-validation evl = Evaluation(data) for i in xrange(1, 11): # create train set remove = Filter( classname="weka.filters.supervised.instance.StratifiedRemoveFolds", options=["-N", "10", "-F", str(i), "-S", "1", "-V"]) remove.set_inputformat(data) train = remove.filter(data) # create test set remove = Filter( classname="weka.filters.supervised.instance.StratifiedRemoveFolds", options=["-N", "10", "-F", str(i), "-S", "1"]) remove.set_inputformat(data) test = remove.filter(data) cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(train) evl.test_model(cls, test) print("Simulated CV accuracy: %0.1f%%" % (evl.percent_correct()))
def attribueSelectionBasedOnRankingInDatabase(self, trainingData, indexInTable, databaseTable, csvFilePath, testingData = None):
    """Iteratively strip the lowest-ranked features (per a DB ranking table)
    in chunks of up to 10, evaluating after each chunk and remembering the
    best-scoring train/test data.

    :param trainingData: weka Instances to filter and evaluate (mutated locally)
    :param indexInTable: index into self.algorithmTable (classifier choice)
    :param databaseTable: DB table whose first column ranks the features
    :param csvFilePath: CSV path the accuracy row is written to
    :param testingData: optional weka Instances filtered in lockstep
    :return: [bestEvaluation, bestTrainingData, bestTestingData, resultList]
    """
    featureNum = trainingData.num_attributes() - 1
    outputStr = ""
    outputStr += databaseTable+","
    # select from database vector difference
    featureList3 = []
    wholefeatureList = []
    dbmgr = permissionMappingManager(databasePath)
    for row in dbmgr.query("select * from " + databaseTable):
        featureList3.append(row[0])
        wholefeatureList.append(row[0])
    #featureList3.reverse()
    bestRemainFilterList = []
    resultList = []
    # Removing 10 at a time: handle the remainder (len % 10) first.
    digit = len(featureList3) % 10
    bestAccuracy = 0
    bestTrainingData = None
    bestTestingData = None
    bestEvaluation = None
    # Baseline evaluation on the full feature set.
    classifier = self.algorithmPicker(trainingData, indexInTable)
    evaluation = self.evaluation(classifier, trainingData, testingData)
    if evaluation.percent_correct() >= bestAccuracy:
        bestAccuracy = evaluation.percent_correct()
        bestTrainingData = trainingData
        bestTestingData = testingData
        bestRemainFilterList = list(featureList3)
        bestEvaluation = evaluation
    print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
    resultList.append("{:.2f}".format(evaluation.percent_correct()))
    if digit > 0:
        for i in range(0, digit):
            # pop() takes the lowest-ranked remaining feature; escape regex chars.
            functionName = featureList3.pop().split("(")[0] + "\(\)"
            functionName = functionName.replace('$','\$')
            #print "functionName:" + functionName
            remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + functionName + ".*$"])
            remove.set_inputformat(trainingData)
            trainingData = remove.filter(trainingData)
            if testingData:
                remove.set_inputformat(testingData)
                testingData = remove.filter(testingData)
        #print "i:" + str(i)
        #print "functionName:" + functionName
        #print "featureNum: " + str(filteredData.num_attributes() - 1)
        #for attributeStr in trainingData.attributes():
        #    print(attributeStr)
        #self.printFunctionInfo(trainingData, trainingData.num_instances())
        # Re-evaluate after stripping the remainder chunk.
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        if evaluation.percent_correct() >= bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainingData = trainingData
            bestTestingData = testingData
            bestRemainFilterList = list(featureList3)
            bestEvaluation = evaluation
        print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
    # Keep removing 10 features at a time until at most 10 remain.
    while trainingData.num_attributes() - 1 > 10:
        for i in range(0,10):
            functionName = featureList3.pop().split("(")[0] + "\(\)"
            functionName = functionName.replace('$','\$')
            #print "functionName:" + functionName
            remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + functionName + ".*$"])
            remove.set_inputformat(trainingData)
            trainingData = remove.filter(trainingData)
            if testingData:
                remove.set_inputformat(testingData)
                testingData = remove.filter(testingData)
            #print functionName
            #print "featureNum: " + str(filteredData.num_attributes() - 1)
            #for attributeStr in trainingData.attributes():
            #    print(attributeStr)
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        if evaluation.percent_correct() >= bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainingData = trainingData
            bestTestingData = testingData
            bestRemainFilterList = list(featureList3)
            bestEvaluation = evaluation
            #print "update feature number:" + str(len(bestRemainFilterList))
        print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
    # Results were collected largest-set-first; reverse for the CSV row.
    resultList.reverse()
    fileteredfeatureList = []
    #print "bestRemainFilterList number:" + str(len(bestRemainFilterList))
    #print "wholefeatureList number:" + str(len(wholefeatureList))
    for item in wholefeatureList:
        if item not in bestRemainFilterList:
            fileteredfeatureList.append(item)
    #print "update fileteredfeatureList number:" + str(len(fileteredfeatureList))
    for item in resultList:
        outputStr += item +","
    outputStr = outputStr[0:-1] + "\n"
    print outputStr
    self.writeToPath(csvFilePath, outputStr)
    accuracyStr = "{:.2f}".format(bestAccuracy)  # NOTE(review): unused local
    #print fileteredfeatureList
    return [bestEvaluation, bestTrainingData, bestTestingData, resultList]
def getTenScaledResultsRankedByInfo(self, trainingData, indexInTable, csvFilePath, testingData=None):
    """Evaluate the chosen classifier on info-gain-ranked feature subsets of
    size 10, 20, ... and finally the full set; append one CSV row of the
    accuracies and return the best-scoring data.

    :param trainingData: weka Instances ranked/filtered per round
    :param indexInTable: index into self.algorithmTable (classifier choice)
    :param csvFilePath: CSV path (opened in append mode)
    :param testingData: optional weka Instances filtered in lockstep
    :return: [bestAccuracy, bestTrainData, bestTestData, resultList]
    """
    dbmgr = permissionMappingManager(databasePath)  # NOTE(review): unused local
    featureNum = trainingData.num_attributes() - 1
    # Collect every attribute name, regex-escaped, for later set difference.
    attributeIn = trainingData.attributes()
    attributeList = []
    for item in attributeIn:
        functionName = str(item).split(" ")[1]
        functionName = functionName.split("(")[0] + "\(\)"
        functionName = functionName.replace('$', '\$')
        #print functionName
        attributeList.append(functionName)
    outputStr = ""
    outputStr += "InfomationGain" + ","
    resultList = []
    bestAccuracy = 0
    bestTrainData = 0
    bestTestData = 0
    #for index in range(0, len(attributeList)-1):
    #    attributeList[index] = attributeList[index].split(" ")[1]
    #    print attributeList[index]
    csvFile = open(csvFilePath, "a")  # NOTE(review): never closed in this method
    csvFile.write(self.algorithmTable[indexInTable] + ",")
    step = 10
    while step < featureNum:
        # pick top features
        filteredTrainData = self.attributeSelector(trainingData, step)
        # check top feature informations
        APIList = []
        for item in filteredTrainData.attributes():
            #print str(item)
            functionName = str(item).split(" ")[1]
            #functionName = functionName.split("_")[0][1:]
            APIList.append(functionName)
        numberOfInstance = self.getNumOfInstance(trainingData)  # NOTE(review): unused local
        # Get those features that it doesn't pick
        filteredList = []
        attributeIn = filteredTrainData.attributes()
        for item in attributeIn:
            functionName = str(item).split(" ")[1]
            functionName = functionName.split("(")[0] + "\(\)"
            functionName = functionName.replace('$', '\$')
            filteredList.append(functionName)
        items = self.getItemsNotInTheList(attributeList, filteredList)
        #print len(items)
        #for item in items:
        #    print item
        # Re-process training data and make testing Data synchronized
        filteredTrainData = trainingData
        filterTestingData = testingData
        for attribute in items:
            remove = Filter(
                classname="weka.filters.unsupervised.attribute.RemoveByName",
                options=["-E", "^" + attribute + ".*$"])
            remove.set_inputformat(filteredTrainData)
            filteredTrainData = remove.filter(filteredTrainData)
            if filterTestingData:
                remove.set_inputformat(filterTestingData)
                filterTestingData = remove.filter(filterTestingData)
            #print attribute
        #print str(filteredTrainData.num_attributes() - 1)
        # Build classifier and evaluate it
        classifier = self.algorithmPicker(filteredTrainData, indexInTable)
        evaluation = self.evaluation(classifier, filteredTrainData, filterTestingData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredTrainData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        #Save best data and accuracy
        if evaluation.percent_correct() > bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainData = filteredTrainData
            if testingData:
                bestTestData = filterTestingData
            #bestEvaluation = evaluation
        step += 10
    # Final round: evaluate on the full, unfiltered training data.
    classifier = self.algorithmPicker(trainingData, indexInTable)
    evaluation = self.evaluation(classifier, trainingData, testingData)
    #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
    resultList.append("{:.2f}".format(evaluation.percent_correct()))
    #Save best data and accuracy
    if evaluation.percent_correct() > bestAccuracy:
        bestAccuracy = evaluation.percent_correct()
        # NOTE(review): this stores the last loop round's filteredTrainData even
        # though the winning evaluation above used trainingData — looks like it
        # should be bestTrainData = trainingData; confirm intent.
        bestTrainData = filteredTrainData
        if testingData:
            bestTestData = filterTestingData
        #bestEvaluation = evaluation
    for item in resultList:
        outputStr += item + ","
    outputStr = outputStr[0:-1] + "\n"
    self.writeToPath(csvFilePath, outputStr)
    return [bestAccuracy, bestTrainData, bestTestData, resultList]
def getTenScaledResultsRankedByInfo(self, trainingData, indexInTable, csvFilePath, testingData = None):
    """Evaluate the picked classifier on top-10, top-20, ... info-gain-ranked
    feature subsets and finally the full feature set; append one CSV row of
    accuracies and return the best-scoring data.

    :param trainingData: weka Instances ranked/filtered per round
    :param indexInTable: index into self.algorithmTable (classifier choice)
    :param csvFilePath: CSV path (opened in append mode)
    :param testingData: optional weka Instances filtered in lockstep
    :return: [bestAccuracy, bestTrainData, bestTestData, resultList]
    """
    dbmgr = permissionMappingManager(databasePath)  # NOTE(review): unused local
    featureNum = trainingData.num_attributes() - 1
    # Build the regex-escaped name list of all attributes for set difference.
    attributeIn = trainingData.attributes()
    attributeList = []
    for item in attributeIn:
        functionName = str(item).split(" ")[1]
        functionName = functionName.split("(")[0] + "\(\)"
        functionName = functionName.replace('$','\$')
        #print functionName
        attributeList.append(functionName)
    outputStr = ""
    outputStr += "InfomationGain" + ","
    resultList = []
    bestAccuracy = 0
    bestTrainData = 0
    bestTestData = 0
    #for index in range(0, len(attributeList)-1):
    #    attributeList[index] = attributeList[index].split(" ")[1]
    #    print attributeList[index]
    csvFile = open(csvFilePath, "a")  # NOTE(review): never closed in this method
    csvFile.write(self.algorithmTable[indexInTable]+",")
    step = 10
    while step < featureNum:
        # pick top features
        filteredTrainData = self.attributeSelector(trainingData, step)
        # check top feature informations
        APIList = []
        for item in filteredTrainData.attributes():
            #print str(item)
            functionName = str(item).split(" ")[1]
            #functionName = functionName.split("_")[0][1:]
            APIList.append(functionName)
        numberOfInstance = self.getNumOfInstance(trainingData)  # NOTE(review): unused local
        # Get those features that it doesn't pick
        filteredList = []
        attributeIn = filteredTrainData.attributes()
        for item in attributeIn:
            functionName = str(item).split(" ")[1]
            functionName = functionName.split("(")[0] + "\(\)"
            functionName = functionName.replace('$','\$')
            filteredList.append(functionName)
        items = self.getItemsNotInTheList(attributeList, filteredList)
        #print len(items)
        #for item in items:
        #    print item
        # Re-process training data and make testing Data synchronized
        filteredTrainData = trainingData
        filterTestingData = testingData
        for attribute in items:
            remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + attribute + ".*$"])
            remove.set_inputformat(filteredTrainData)
            filteredTrainData = remove.filter(filteredTrainData)
            if filterTestingData:
                remove.set_inputformat(filterTestingData)
                filterTestingData = remove.filter(filterTestingData)
            #print attribute
        #print str(filteredTrainData.num_attributes() - 1)
        # Build classifier and evaluate it
        classifier = self.algorithmPicker(filteredTrainData, indexInTable)
        evaluation = self.evaluation(classifier, filteredTrainData, filterTestingData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredTrainData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        #Save best data and accuracy
        if evaluation.percent_correct() > bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainData = filteredTrainData
            if testingData:
                bestTestData = filterTestingData
            #bestEvaluation = evaluation
        step += 10
    # Final round on the complete training data.
    classifier = self.algorithmPicker(trainingData, indexInTable)
    evaluation = self.evaluation(classifier, trainingData, testingData)
    #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
    resultList.append("{:.2f}".format(evaluation.percent_correct()))
    #Save best data and accuracy
    if evaluation.percent_correct() > bestAccuracy:
        bestAccuracy = evaluation.percent_correct()
        # NOTE(review): stores the last round's filteredTrainData although the
        # winning evaluation used trainingData — probably intended to be
        # bestTrainData = trainingData; confirm.
        bestTrainData = filteredTrainData
        if testingData:
            bestTestData = filterTestingData
        #bestEvaluation = evaluation
    for item in resultList:
        outputStr += item +","
    outputStr = outputStr[0:-1] + "\n"
    self.writeToPath(csvFilePath, outputStr)
    return [bestAccuracy, bestTrainData, bestTestData, resultList]
def attribueSelectionBasedOnRankingInDatabase(self, trainingData, indexInTable, databaseTable, csvFilePath, testingData=None):
    """Strip the lowest-ranked features (ranking read from a DB table) in
    chunks of up to 10, evaluating after every chunk and keeping the best
    train/test data seen.

    :param trainingData: weka Instances filtered and evaluated per round
    :param indexInTable: index into self.algorithmTable (classifier choice)
    :param databaseTable: DB table whose first column lists ranked features
    :param csvFilePath: CSV path the accuracy row is written to
    :param testingData: optional weka Instances filtered in lockstep
    :return: [bestEvaluation, bestTrainingData, bestTestingData, resultList]
    """
    featureNum = trainingData.num_attributes() - 1
    outputStr = ""
    outputStr += databaseTable + ","
    # select from database vector difference
    featureList3 = []
    wholefeatureList = []
    dbmgr = permissionMappingManager(databasePath)
    for row in dbmgr.query("select * from " + databaseTable):
        featureList3.append(row[0])
        wholefeatureList.append(row[0])
    #featureList3.reverse()
    bestRemainFilterList = []
    resultList = []
    # Remainder chunk size so the rest divides evenly into tens.
    digit = len(featureList3) % 10
    bestAccuracy = 0
    bestTrainingData = None
    bestTestingData = None
    bestEvaluation = None
    # Baseline: evaluate before removing anything.
    classifier = self.algorithmPicker(trainingData, indexInTable)
    evaluation = self.evaluation(classifier, trainingData, testingData)
    if evaluation.percent_correct() >= bestAccuracy:
        bestAccuracy = evaluation.percent_correct()
        bestTrainingData = trainingData
        bestTestingData = testingData
        bestRemainFilterList = list(featureList3)
        bestEvaluation = evaluation
    print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
    resultList.append("{:.2f}".format(evaluation.percent_correct()))
    if digit > 0:
        for i in range(0, digit):
            # pop() yields the lowest-ranked remaining feature; escape regex chars.
            functionName = featureList3.pop().split("(")[0] + "\(\)"
            functionName = functionName.replace('$', '\$')
            #print "functionName:" + functionName
            remove = Filter(
                classname="weka.filters.unsupervised.attribute.RemoveByName",
                options=["-E", "^" + functionName + ".*$"])
            remove.set_inputformat(trainingData)
            trainingData = remove.filter(trainingData)
            if testingData:
                remove.set_inputformat(testingData)
                testingData = remove.filter(testingData)
        #print "i:" + str(i)
        #print "functionName:" + functionName
        #print "featureNum: " + str(filteredData.num_attributes() - 1)
        #for attributeStr in trainingData.attributes():
        #    print(attributeStr)
        #self.printFunctionInfo(trainingData, trainingData.num_instances())
        # Re-evaluate after the remainder chunk was removed.
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        if evaluation.percent_correct() >= bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainingData = trainingData
            bestTestingData = testingData
            bestRemainFilterList = list(featureList3)
            bestEvaluation = evaluation
        print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
    # Remove ten features per round until at most 10 remain.
    while trainingData.num_attributes() - 1 > 10:
        for i in range(0, 10):
            functionName = featureList3.pop().split("(")[0] + "\(\)"
            functionName = functionName.replace('$', '\$')
            #print "functionName:" + functionName
            remove = Filter(
                classname="weka.filters.unsupervised.attribute.RemoveByName",
                options=["-E", "^" + functionName + ".*$"])
            remove.set_inputformat(trainingData)
            trainingData = remove.filter(trainingData)
            if testingData:
                remove.set_inputformat(testingData)
                testingData = remove.filter(testingData)
            #print functionName
            #print "featureNum: " + str(filteredData.num_attributes() - 1)
            #for attributeStr in trainingData.attributes():
            #    print(attributeStr)
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        if evaluation.percent_correct() >= bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainingData = trainingData
            bestTestingData = testingData
            bestRemainFilterList = list(featureList3)
            bestEvaluation = evaluation
            #print "update feature number:" + str(len(bestRemainFilterList))
        print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
    # Accuracies were collected largest-set-first; reverse before writing.
    resultList.reverse()
    fileteredfeatureList = []
    #print "bestRemainFilterList number:" + str(len(bestRemainFilterList))
    #print "wholefeatureList number:" + str(len(wholefeatureList))
    for item in wholefeatureList:
        if item not in bestRemainFilterList:
            fileteredfeatureList.append(item)
    #print "update fileteredfeatureList number:" + str(len(fileteredfeatureList))
    for item in resultList:
        outputStr += item + ","
    outputStr = outputStr[0:-1] + "\n"
    print outputStr
    self.writeToPath(csvFilePath, outputStr)
    accuracyStr = "{:.2f}".format(bestAccuracy)  # NOTE(review): unused local
    #print fileteredfeatureList
    return [bestEvaluation, bestTrainingData, bestTestingData, resultList]
# Script: load weather.nominal, drop attribute 3, and save the result as a
# new ARFF file.  `jvm`, `data_dir` and `os` are set up earlier in the file
# (not visible in this chunk).
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# output header
print(Instances.template_instances(data))

# remove attribute no 3 (Remove's -R option is 1-based)
print("\nRemove attribute no 3")
fltr = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "3"])
fltr.set_inputformat(data)
filtered = fltr.filter(data)

# output header
print(Instances.template_instances(filtered))

# save modified dataset
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.arff")
jvm.stop()
# Script fragment: load a dataset (fname set earlier, not visible here),
# scatter-plot two attributes, attach J48 classifications/errors via the
# AddClassification filter, then train/evaluate J48 on the training data
# and plot its errors.  `pld`, `plc`, `jvm` and the weka imports come from
# earlier in the file.
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# plot petalwidth vs petallength
pld.scatter_plot(
    data, data.get_attribute_by_name("petalwidth").get_index(),
    data.get_attribute_by_name("petallength").get_index(),
    wait=False)

# add classifier errors to dataset
addcls = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.trees.J48", "-classification", "-error"])
addcls.set_inputformat(data)
filtered = addcls.filter(data)
print(filtered)

# build J48 (evaluated on the training data itself — optimistic estimate)
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(data)
evl = Evaluation(data)
evl.test_model(cls, data)

# plot classifier errors
plc.plot_classifier_errors(evl.predictions(), wait=True)
jvm.stop()
def _normalize_dataset(self, dataset):
    """Run weka's Normalize filter (default options) over dataset and return the result."""
    scaler = Filter(
        classname='weka.filters.unsupervised.attribute.Normalize',
        options=[])
    scaler.set_inputformat(dataset)
    return scaler.filter(dataset)