def emlimitateUnusedFeature(self, trainData, testData = None): trainData.set_class_index(trainData.num_attributes() - 1) # set class attribute featureIndex = -1 filteredTrainData = trainData filteredTestData = testData attribute_index = 0 while attribute_index < filteredTrainData.num_attributes() - 1: sampleCoverage = 0 #print attribute_index # check value for current feature in each instance for instance_index in range(0, filteredTrainData.num_instances()): instance = filteredTrainData.get_instance(instance_index) value = instance.get_value(attribute_index) if value > 0: sampleCoverage += 1 if sampleCoverage == 0: #print "found" remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", str(attribute_index+1)]) #The index in this function start from 1 remove.set_inputformat(filteredTrainData) filteredTrainData = remove.filter(filteredTrainData) if filteredTestData: remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", str(attribute_index+1)]) #The index in this function start from 1 remove.set_inputformat(filteredTestData) filteredTestData = remove.filter(filteredTestData) else: attribute_index += 1 return [filteredTrainData, filteredTestData]
def run(dataset_path):
    """Load a dataset, make its (numeric) class nominal, and report the wall
    time of a 10-fold cross-validated multinomial Naive Bayes evaluation.

    :param dataset_path: path understood by model.load_dataset_weka
    """
    start = time.time()
    ### load a dataset ###
    train_data = model.load_dataset_weka(dataset_path)
    #
    to_nomial_class_filter = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "last"])
    to_nomial_class_filter.inputformat(train_data)
    # Filter once and reuse — the original filtered the same data twice.
    filtered_data = to_nomial_class_filter.filter(train_data)
    ### Naive Bayes ### Choose what you want
    classifier = Classifier("weka.classifiers.bayes.NaiveBayesMultinomial")
    # classifier = Classifier("weka.classifiers.bayes.NaiveBayes")
    evaluation = Evaluation(filtered_data)
    evaluation.crossvalidate_model(classifier, filtered_data, 10, Random(42))
    # print(evaluation.summary())
    # print(evaluation.class_details())
    # print(evaluation.matrix())
    print(time.time() - start)
def createTwoDatasets(self, wholeDataPath, trainingDataPercentage, trainingPath, testingPath, shuffleSeed=43):
    """Shuffle an ARFF file and split it into train/test ARFF files.

    RemovePercentage with -V keeps trainingDataPercentage% (training set);
    the same filter without -V keeps the complement (testing set).
    """
    wholeData = self.load_Arff(wholeDataPath)
    # Deterministic shuffle so splits are reproducible for a given seed.
    randomize = Filter(
        classname="weka.filters.unsupervised.instance.Randomize",
        options=["-S", str(shuffleSeed)])
    randomize.set_inputformat(wholeData)
    wholeData = randomize.filter(wholeData)
    # Training split: -V inverts the removal, keeping the percentage.
    removePercentage = Filter(
        classname="weka.filters.unsupervised.instance.RemovePercentage",
        options=["-P", str(trainingDataPercentage), "-V"])
    removePercentage.set_inputformat(wholeData)
    trainingData = removePercentage.filter(wholeData)
    print "instances:" + str(trainingData.num_instances())
    # Testing split: remove the training percentage, keep the rest.
    removePercentage = Filter(
        classname="weka.filters.unsupervised.instance.RemovePercentage",
        options=["-P", str(trainingDataPercentage)])
    removePercentage.set_inputformat(wholeData)
    testingData = removePercentage.filter(wholeData)
    print "instances:" + str(testingData.num_instances())
    self.save_Arff(trainingData, trainingPath)
    self.save_Arff(testingData, testingPath)
def select_missclassified(self):
    """Keep only the instances the base classifier misclassifies.

    Steps: (1) append the classifier's prediction and an error flag via
    AddClassification, (2) keep only rows whose error flag is set,
    (3) remove the two helper attributes added in step 1.
    Mutates self.data in place.
    """
    # (1) Append classification + error attributes.
    add = Filter(classname="weka.filters.supervised.attribute.AddClassification",
                 options=['-classification', '-error', '-W',
                          self.base_classifier.to_commandline()])
    add.inputformat(self.data)
    self.data = add.filter(self.data)

    # (2) Keep only misclassified rows.
    keep_errors = Filter(classname="weka.filters.unsupervised.instance.RemoveWithValues",
                         options=['-S', '0.0', '-C', 'last', '-L', 'last', '-V'])
    keep_errors.inputformat(self.data)
    # BUG FIX: the original configured this filter but never applied it,
    # so the selection step was silently skipped.
    self.data = keep_errors.filter(self.data)

    # (3) Drop the helper attributes (prediction is second-to-last, error last).
    cleanup = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                     options=['-R', str(self.data.num_attributes-2) + ',last'])
    cleanup.inputformat(self.data)
    self.data = cleanup.filter(self.data)
def _get_training_dataset(self, X, y):
    """Build a Weka Instances object from array-like X / y.

    Accepts pandas, list, or numpy inputs; converts the label column to a
    sorted nominal attribute and marks it as the class. Records the inferred
    label type in self._label_type (np.float64 or str).
    """
    # convert to numpy array
    if isinstance(X, pd.DataFrame):
        X = X.values
    elif isinstance(X, list):
        X = np.array(X)
    elif not isinstance(X, np.ndarray):
        raise Exception("Incompatible data type: {}".format(type(X)))
    if isinstance(y, pd.Series):
        y = y.values
    elif isinstance(y, list):
        y = np.array(y)
    elif not isinstance(y, np.ndarray):
        raise Exception("Incompatible data type: {}".format(type(y)))
    if y.dtype == "O":
        # Object-dtype labels: encode strings to bytes for the Java bridge.
        # NOTE(review): the bare except silently skips non-string labels —
        # presumably intentional best-effort; confirm before tightening.
        for i in range(0, len(y)):
            try:
                y[i] = y[i].encode()
            except:
                pass
    dataset = create_instances_from_matrices(
        X, y, name="generated from matrices")  # generate dataset
    # convert label to nominal: numeric labels go through NumericToNominal,
    # anything that fails float conversion is treated as string labels.
    try:
        y.astype(float)
        self._label_type = np.float64
        nominal = Filter(
            classname=
            "weka.filters.unsupervised.attribute.NumericToNominal",
            options=["-R", "last"])
    except ValueError:
        self._label_type = str
        nominal = Filter(
            classname="weka.filters.unsupervised.attribute.StringToNominal",
            options=["-R", "last"])
    nominal.inputformat(dataset)
    dataset = nominal.filter(dataset)
    # sort labels so the nominal value order is deterministic
    sorter = Filter(
        classname="weka.filters.unsupervised.attribute.SortLabels")
    sorter.inputformat(dataset)
    dataset = sorter.filter(dataset)
    dataset.class_is_last()  # indicate class label
    return dataset
def _pre_process_to_classification(self, dataset):
    """Turn the numeric target (last attribute) into a binary nominal class.

    Values > 0 map to 1, everything else to 0, then the column is converted
    from numeric to nominal.
    """
    binarize = Filter(
        classname='weka.filters.unsupervised.attribute.MathExpression',
        options=['-unset-class-temporarily', '-E', "ifelse ( A>0, 1, 0 )",
                 '-V', '-R', 'last'])
    binarize.set_inputformat(dataset)
    binarized = binarize.filter(dataset)

    to_nominal = Filter(
        classname='weka.filters.unsupervised.attribute.NumericToNominal',
        options=['-R', 'last'])
    to_nominal.set_inputformat(binarized)
    return to_nominal.filter(binarized)
def discretize_data(input_data):
    """Bin every numeric attribute of input_data into 10 equal-width intervals."""
    binner = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10"])
    binner.inputformat(input_data)
    return binner.filter(input_data)
def supFilters(data, fType, ops):
    """Apply a weka.filters.supervised.* filter to data.

    :param fType: class name suffix after "weka.filters.supervised."
    :param ops: option list passed to the filter
    :return: the filtered dataset
    """
    sup_filter = Filter(classname="weka.filters.supervised." + fType, options=ops)
    # Let the filter learn the input format before filtering.
    sup_filter.inputformat(data)
    return sup_filter.filter(data)
def affective_vectorizer(tweets, filename):
    '''
    Vectorizes the tweets and saves the vectors as csv.

    :param tweets: list of tweets
    :param filename: name of the saved file
    '''
    jvm.start(packages=True)
    try:
        install_package('AffectiveTweets')
        data = dataset.create_instances_from_lists([[t] for t in tweets])
        # Renamed from `filter` to avoid shadowing the builtin.
        lexicon_filter = Filter(
            classname=
            'weka.filters.unsupervised.attribute.TweetToLexiconFeatureVector',
            options=[
                '-F', '-D', '-R', '-A', '-T', '-L', '-N', '-P', '-J', '-H',
                '-Q', '-stemmer', 'weka.core.stemmers.NullStemmer',
                '-stopwords-handler', 'weka.core.tokenizers.TweetNLPTokenizer',
                '-I', '1', '-U', '-tokenizer',
                'weka.core.tokenizers.TweetNLPTokenizer'
            ])
        lexicon_filter.inputformat(data)
        filtered_data = lexicon_filter.filter(data)
        converters.save_any_file(filtered_data, 'data/affect-vectors/' + filename)
    finally:
        # Ensure the JVM is shut down even if vectorization fails.
        jvm.stop()
def runSMO(file, bound):
    """Cross-validate an SMO classifier (poly kernel) on a CSV file.

    :param file: path to a CSV file whose first column is the class
    :param bound: attribute range (1-based) to remove before training
    :return: the per-class details string from the evaluation
    """
    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    dataset = csv_loader.load_file(file)
    dataset.class_is_first()

    # Drop the requested attribute range.
    dropper = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                     options=["-R", bound])
    dropper.inputformat(dataset)
    filtered = dropper.filter(dataset)

    # SMO with an explicit polynomial kernel.
    smo = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    smo.kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])

    plain_out = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    evl = Evaluation(filtered)
    evl.crossvalidate_model(smo, filtered, 10, Random(1), plain_out)
    #print(pout.buffer_content())
    print(evl.percent_correct)
    #print(evl.summary())
    result = evl.class_details()
    print(result)
    return result
def create_model(input_file, output_file):
    """Train a RandomForest on a discretized dataset, save model + filter,
    and print 10-fold cross-validation statistics.

    :param input_file: dataset path (any format load_any_file accepts);
        class attribute must be last
    :param output_file: path for the serialized [classifier, filter] pair
    """
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute
    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(
        data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")
    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    # Persist both the model and the filter so the same discretization
    # can be replayed at prediction time.
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)
    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10,
                                   Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
def Feature_Selection(infile):
    """Run CFS subset evaluation with BestFirst search on a CSV file and
    print the attribute-selection summary.

    Starts and stops its own JVM; the first column of the CSV is removed
    (presumably an ID column — confirm against the data format) and the
    last column is used as the class.
    """
    directory = os.getcwd() + '/'
    csvpath = directory + infile
    jvm.start(packages=True, max_heap_size="4g")
    print "\n\n"
    print "Loaded file: ", infile
    csvloader = Loader(classname="weka.core.converters.CSVLoader")
    csvdata = csvloader.load_file(csvpath)
    # Drop the first attribute before selection.
    # NOTE(review): the range " 1" contains a leading space — works only if
    # Weka trims it; verify.
    remover = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                     options=["-R", " 1"])
    remover.inputformat(csvdata)
    filtered_data = remover.filter(csvdata)
    filtered_data.class_is_last()
    search = ASSearch(classname="weka.attributeSelection.BestFirst",
                      options=["-D", "1", "-N", "5"])
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attribs = AttributeSelection()
    attribs.search(search)
    attribs.evaluator(evaluator)
    attribs.select_attributes(filtered_data)
    print "Summary of Attribute Selection: "
    print attribs.results_string
    jvm.stop()
    return
def build_and_classify(classifier, classifier_name, approach_name, infile, percentage='10'): """ Creates model and classifies against input data. Returns accuracy statistics """ # set seed so results are consistent random.seed('iot') # load data loader = Loader(classname='weka.core.converters.CSVLoader') data = loader.load_file(infile) data.class_is_last() # convert all numeric attributes to nominal to_nominal = Filter(classname='weka.filters.unsupervised.attribute.NumericToNominal', options=['-R', 'first-last']) to_nominal.inputformat(data) data = to_nominal.filter(data) # randomize data with constant seed randomize = Filter(classname='weka.filters.unsupervised.instance.Randomize', options=['-S', '42']) randomize.inputformat(data) data = randomize.filter(data) # create training set and testing set train_percent_filter = Filter(classname='weka.filters.unsupervised.instance.RemovePercentage', options=['-P', percentage, '-V']) train_percent_filter.inputformat(data) train = train_percent_filter.filter(data) test = data # build and test classifier classifier.build_classifier(train) evaluation = Evaluation(train) evaluation.test_model(classifier, test) # return results as array results = [ approach_name, classifier_name, percentage, evaluation.percent_correct, evaluation.weighted_f_measure ] return results
def main(): """ Just runs some example code. """ # load a dataset iris = helper.get_data_dir() + os.sep + "iris.arff" helper.print_info("Loading dataset: " + iris) loader = Loader("weka.core.converters.ArffLoader") data = loader.load_file(iris) # remove class attribute helper.print_info("Removing class attribute") remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"]) remove.inputformat(data) filtered = remove.filter(data) # use MultiFilter helper.print_info("Use MultiFilter") remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"]) std = Filter(classname="weka.filters.unsupervised.attribute.Standardize") multi = MultiFilter() multi.filters = [remove, std] multi.inputformat(data) filtered_multi = multi.filter(data) # output datasets helper.print_title("Input") print(data) helper.print_title("Output") print(filtered) helper.print_title("Output (MultiFilter)") print(filtered_multi) # load text dataset text = helper.get_data_dir( ) + os.sep + "reutersTop10Randomized_1perc_shortened.arff" helper.print_info("Loading dataset: " + text) loader = Loader("weka.core.converters.ArffLoader") data = loader.load_file(text) data.class_is_last() # apply StringToWordVector stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer") stopwords = Stopwords(classname="weka.core.stopwords.Rainbow") tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer") s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"]) s2wv.stemmer = stemmer s2wv.stopwords = stopwords s2wv.tokenizer = tokenizer s2wv.inputformat(data) filtered = s2wv.filter(data) helper.print_title("Input (StringToWordVector)") print(data) helper.print_title("Output (StringToWordVector)") print(filtered)
def discretize(data, index, file):
    """Supervised-discretize attribute `index`, persisting the trained filter.

    :param data: dataset to filter
    :param index: 1-based attribute index (or range string) to discretize
    :param file: path to serialize the fitted filter to
    :return: the discretized dataset
    """
    disc = Filter(
        classname='weka.filters.supervised.attribute.Discretize',
        options=["-R", str(index), "-precision", "6"])
    disc.inputformat(data)
    result = disc.filter(data)
    # Save the fitted filter so the same cut points can be reused later.
    disc.serialize(file)
    return result
def remove(data, indecies, file):
    """Remove the given attribute indices from data and serialize the filter.

    :param indecies: iterable of 1-based attribute index strings
    :param file: path to serialize the configured filter to
    :return: the dataset without the removed attributes
    """
    index_range = ','.join(indecies)
    remover = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                     options=["-R", index_range])
    remover.inputformat(data)
    result = remover.filter(data)
    remover.serialize(file)
    return result
def smote(data, percentage):
    """Oversample the minority class with SMOTE (k=5 neighbors, seed 1).

    :param percentage: amount of synthetic instances to create, in percent
    :return: the resampled dataset
    """
    oversampler = Filter(
        classname='weka.filters.supervised.instance.SMOTE',
        options=["-C", "0", "-K", "5", "-P", str(percentage), "-S", "1"])
    oversampler.inputformat(data)
    return oversampler.filter(data)
def undersample(data, percentage):
    """Subsample data to `percentage`% with a bias toward uniform classes.

    Returns None when percentage >= 100 (nothing to undersample).
    """
    if percentage >= 100:
        return None
    subsampler = Filter(
        classname='weka.filters.supervised.instance.Resample',
        options=["-B", "1.0", "-S", "1", "-Z", str(percentage)])
    subsampler.inputformat(data)
    return subsampler.filter(data)
def stringToNominal(data, indecies, file):
    """Convert the given string attributes to nominal and serialize the filter.

    :param indecies: iterable of 1-based attribute index strings
    :param file: path to serialize the fitted filter to
    :return: the converted dataset
    """
    index_range = ','.join(indecies)
    converter = Filter(
        classname="weka.filters.unsupervised.attribute.StringToNominal",
        options=["-R", index_range])
    converter.inputformat(data)
    result = converter.filter(data)
    converter.serialize(file)
    return result
def attributeSelector(self, data, selectNum):
    """Keep the selectNum attributes ranked highest by information gain."""
    selector = Filter(
        classname="weka.filters.supervised.attribute.AttributeSelection",
        options=["-S",
                 "weka.attributeSelection.Ranker -T -1.7976931348623157E308 -N " + str(selectNum),
                 "-E",
                 "weka.attributeSelection.InfoGainAttributeEval"])
    selector.set_inputformat(data)
    return selector.filter(data)
def filterUnusedFeatureFromList(self, data, unusedFuncitonList):
    """Remove every attribute whose name starts with one of the given prefixes."""
    result = data
    for prefix in unusedFuncitonList:
        by_name = Filter(
            classname="weka.filters.unsupervised.attribute.RemoveByName",
            options=["-E", "^" + prefix + ".*$"])
        by_name.set_inputformat(result)
        result = by_name.filter(result)
    return result
def createTwoDatasets(self, wholeDataPath, trainingDataPercentage, trainingPath, testingPath, shuffleSeed = 43):
    """Shuffle an ARFF file and split it into train/test ARFF files.

    RemovePercentage with -V keeps trainingDataPercentage% (training set);
    the same filter without -V keeps the complement (testing set).
    """
    wholeData = self.load_Arff(wholeDataPath)
    # Deterministic shuffle for reproducible splits.
    randomize = Filter(classname="weka.filters.unsupervised.instance.Randomize", options=["-S", str(shuffleSeed)])
    randomize.set_inputformat(wholeData)
    wholeData = randomize.filter(wholeData)
    # Training split: -V inverts removal, keeping the percentage.
    removePercentage = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage", options=["-P", str(trainingDataPercentage), "-V"])
    removePercentage.set_inputformat(wholeData)
    trainingData = removePercentage.filter(wholeData)
    print "instances:" + str(trainingData.num_instances())
    # Testing split: the complementary percentage.
    removePercentage = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage", options=["-P", str(trainingDataPercentage)])
    removePercentage.set_inputformat(wholeData)
    testingData = removePercentage.filter(wholeData)
    print "instances:" + str(testingData.num_instances())
    self.save_Arff(trainingData, trainingPath)
    self.save_Arff(testingData, testingPath)
def load(path, db):
    """Load a scenario CSV into a Dataset, binarizing device attributes.

    Forces the listed columns to nominal, converts all string attributes
    (except the scenario ID in column 1) to nominal and then to binary
    indicator attributes. Class index is the last attribute.

    :param path: CSV file path
    :param db: database connection stored on the dataset
    """
    nominals = [
        49,  # dev_global_mem_cache_type
        52,  # dev_host_unified_memory
        54,  # dev_local_mem_type
        56,  # dev_type
        57,  # dev_vendor
    ]
    nominal_indices = ",".join([str(index) for index in nominals])
    force_nominal = ["-N", nominal_indices]
    # Load data from CSV.
    dataset = Dataset.load_csv(path, options=force_nominal)
    dataset.__class__ = Dataset
    # Set class index and database connection.
    dataset.class_index = -1
    dataset.db = db
    # Create string->nominal type attribute filter, ignoring the first
    # attribute (scenario ID), since we're not classifying with it.
    string_to_nominal = WekaFilter(
        classname=("weka.filters.unsupervised."
                   "attribute.StringToNominal"),
        options=["-R", "2-last"],
    )
    string_to_nominal.inputformat(dataset.instances)
    # Create filtered dataset, and swap data around.
    filtered = string_to_nominal.filter(dataset.instances)
    # Create nominal->binary type attribute filter, ignoring the
    # first attribute (scenario ID), since we're not classifying with it.
    n2b = WekaFilter(
        classname="weka.filters.unsupervised.attribute.NominalToBinary",
        options=["-R", "2-last"],
    )
    n2b.inputformat(filtered)
    dataset.instances = n2b.filter(filtered)
    return dataset
def getSetDataBySetIndex(self, data, index):
    """Keep only the attribute columns of feature set `index`, plus the class.

    Set boundaries come from FeatureTable.getEachSetStartIndex().
    """
    # cut feature set out
    bounds = FeatureTable().getEachSetStartIndex()
    first = bounds[index]
    last = bounds[index + 1] - 1
    # -V inverts Remove: keep the range (and the class attribute) instead.
    keep = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                  options=["-V", "-R", str(first) + "-" + str(last) + ",last"])
    keep.set_inputformat(data)
    return keep.filter(data)
def exposed_evaluate(self, X, d, task, i_model, i_evl):
    """Evaluate a serialized feature matrix with a chosen model and metric.

    :param X: string holding a Python literal for the flattened matrix
        (NOTE(review): eval() on X is unsafe on untrusted input)
    :param d: number of rows to reshape X into (last column = target)
    :param task: 'regression' or 'classification'
    :param i_model: 'LR' / 'RF' / 'SVM'
    :param i_evl: metric key ('mae', 'mse', '1-rae', 'f_score')
    """
    data = np.reshape(eval(X), [d, -1], order='C')
    if task == 'regression':
        if i_model == 'LR':
            data = converters.ndarray_to_instances(data, relation='tmp')
            data.class_is_last()
            model = Classifier(
                classname='weka.classifiers.functions.LinearRegression')
            evl = Evaluation(data)
            evl.crossvalidate_model(model, data, 5, Random(0))
        elif i_model == 'RF':
            data = converters.ndarray_to_instances(data, relation='tmp')
            data.class_is_last()
            model = Classifier(
                classname='weka.classifiers.trees.RandomForest')
            evl = Evaluation(data)
            evl.crossvalidate_model(model, data, 5, Random(0))
        if i_evl == 'mae':
            r_mae = evl.mean_absolute_error
            return r_mae
        elif i_evl == 'mse':
            # BUG FIX: the original assigned to r_mae but returned the
            # undefined name r_mse (NameError).
            r_mse = evl.mean_square_error
            return r_mse
        elif i_evl == '1-rae':
            r_one_minus_rae = 1 - evl.relative_absolute_error / 100
            del evl, model, data
            return r_one_minus_rae
    elif task == 'classification':
        le = LabelEncoder()
        data[:, -1] = le.fit_transform(data[:, -1])
        if i_model == 'RF':
            dataRaw = converters.ndarray_to_instances(data, relation='tmp')
            weka_filter = Filter(
                classname=
                "weka.filters.unsupervised.attribute.NumericToNominal",
                options=["-R", "last"])
            weka_filter.inputformat(dataRaw)
            data = weka_filter.filter(dataRaw)
            data.class_is_last()
            model = Classifier(
                classname='weka.classifiers.trees.RandomForest')
            evl = Evaluation(data)
            evl.crossvalidate_model(model, data, 5, Random(0))
        elif i_model == 'LR':
            # NOTE(review): sklearn models are constructed but never fitted
            # or evaluated — reaching 'f_score' with LR/SVM would raise a
            # NameError on evl. Left unchanged pending clarification.
            model = LogisticRegression(multi_class='ovr')
        elif i_model == 'SVM':
            model = svm.SVC()
        if i_evl == 'f_score':
            fscore = evl.weighted_f_measure
            del evl, model, data, dataRaw
            # Clamp NaN/out-of-range scores to a small positive value.
            if not (fscore >= 0.01 and fscore < 1.01):
                fscore = 0.01
            return fscore
def filterUnusedFeatureFromList(self, data, unusedFuncitonList):
    """Strip every attribute whose name begins with one of the listed prefixes."""
    current = data
    for name_prefix in unusedFuncitonList:
        dropper = Filter(
            classname="weka.filters.unsupervised.attribute.RemoveByName",
            options=["-E", "^" + name_prefix + ".*$"])
        dropper.set_inputformat(current)
        current = dropper.filter(current)
    return current
def make_partition(data, attributes, part='normal'):
    """Extract the 'normal' or 'anomalous' subset of data and drop the label.

    The last attribute is assumed to be a binary class whose last label marks
    normal instances and whose first label marks anomalous ones.

    :param attributes: unused here; kept for interface compatibility
    :return: (filtered dataset without the class column, its instance count)
    :raises ValueError: if part is neither 'normal' nor 'anomalous'
    """
    if part == 'normal':
        value = 'last'
    elif part == 'anomalous':
        value = 'first'
    else:
        # BUG FIX: the original fell through with `value` undefined,
        # producing a confusing NameError for any other argument.
        raise ValueError("part must be 'normal' or 'anomalous', got %r" % (part,))
    keep_normal = Filter(
        classname='weka.filters.unsupervised.instance.RemoveWithValues',
        options=['-C', 'last', '-L', value])
    keep_normal.inputformat(data)
    data_normal = keep_normal.filter(data)
    # Drop the class column. inputformat is primed on `data`, which has the
    # same attribute structure as data_normal (only instances were removed).
    remove = Filter(classname='weka.filters.unsupervised.attribute.Remove',
                    options=['-R', 'last'])
    remove.inputformat(data)
    data_normal = remove.filter(data_normal)
    N = data_normal.num_instances
    return data_normal, N
def merge_classes(data, idx_to_merge):
    """
    :param data: The data file to filter
    :param idx_to_merge: String representation of class indices to merge
    :return: filtered data
    """
    merger = Filter(
        classname="weka.filters.unsupervised.attribute.MergeManyValues",
        options=["-C", "last", "-R", idx_to_merge, "-unset-class-temporarily"])
    merger.inputformat(data)
    return merger.filter(data)
def filterOutUnnecessaryAPIAndEvaluateOurApproach(self, ourApproahFile, apiFile, indexInTable, methodName, databaseTable, csvFilePath):
    """Remove unselected API features, then evaluate the chosen algorithm on
    growing feature subsets (10, 20, ... and finally all features), writing
    one CSV row of accuracy percentages.

    :param ourApproahFile: ARFF file with the full feature set
    :param apiFile: source for the API ranking used to pick features
    :param indexInTable: algorithm index passed to algorithmPicker
    :param methodName: label written as the first CSV column
    :param databaseTable: ranking table consulted for selection
    :param csvFilePath: output CSV path ("" skips the title row)
    """
    outputStr = methodName + ","
    resultList = []
    # Get whole feature set of our approach
    filteredData = self.load_Arff(ourApproahFile)
    # Use this function to get selected API feature and save the unselected api in a list
    filterOutList = self.attribueSelectionBasedOnRankingInDatabase(
        apiFile, indexInTable, databaseTable, "")[1]
    # Remove unselected API
    for functionName in filterOutList:
        # Escape "()" and "$" so the name is a literal regex for RemoveByName.
        functionName = functionName.split("(")[0] + "\(\)"
        functionName = functionName.replace('$', '\$')
        remove = Filter(
            classname="weka.filters.unsupervised.attribute.RemoveByName",
            options=["-E", "^" + functionName + ".*$"])
        remove.set_inputformat(filteredData)
        filteredData = remove.filter(filteredData)
    featureNum = filteredData.num_attributes() - 1
    print "featureNum: " + str(featureNum)
    if csvFilePath != "":
        self.writeTenScaledTitleManual(featureNum, csvFilePath)
    #print "i:" + str(i)
    #print "functionName:" + functionName
    #print "featureNum: " + str(filteredData.num_attributes() - 1)
    for attributeStr in filteredData.attributes():
        print(attributeStr)
    # Run ten scaled generation and evaluation
    step = 10
    while step < featureNum:
        roundData = self.attributeSelector(filteredData, step)
        classifier = self.algorithmPicker(roundData, indexInTable)
        evaluation = self.evaluation(classifier, roundData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(roundData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        #csvFile.write("{:.2f}".format(evaluation.percent_correct()) +",")
        step += 10
    # Final round: evaluate with the full (filtered) feature set.
    classifier = self.algorithmPicker(filteredData, indexInTable)
    evaluation = self.evaluation(classifier, filteredData)
    #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredData.num_attributes() - 1) + "/" + str(featureNum))
    resultList.append("{:.2f}".format(evaluation.percent_correct()))
    # Write out to CSV file
    for item in resultList:
        outputStr += item + ","
    outputStr = outputStr[0:-1] + "\n"
    self.writeToPath(csvFilePath, outputStr)
def get_weka_breast_cancer(self):
    """Load the UCI breast-cancer CSV and split it 80/20 into train/test.

    :return: (train_set, test_set, labels) where labels are the class values
    """
    split_ratio = 0.2
    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    csv_loader.options = ['-F', ',']
    dataset = csv_loader.load_file(
        os.path.join(DATASET_DIR, 'uci-20070111-breast-cancer.csv'))
    dataset.class_is_last()

    pct = str(split_ratio * 100)
    # Remove the first 20% -> remaining 80% is the training set.
    drop_head = Filter(
        classname="weka.filters.unsupervised.instance.RemovePercentage",
        options=["-P", pct])
    drop_head.inputformat(dataset)
    train_set = drop_head.filter(dataset)
    # Inverted (-V): keep that same 20% as the test set.
    keep_head = Filter(
        classname="weka.filters.unsupervised.instance.RemovePercentage",
        options=["-P", pct, "-V"])
    keep_head.inputformat(dataset)
    test_set = keep_head.filter(dataset)

    labels = dataset.class_attribute.values
    return train_set, test_set, labels
def remove_correct_classified(self, invert = False):
    """Remove correctly classified instances from self.data (in place).

    Uses Weka's RemoveMisclassified filter with -V (invert), which keeps
    only MISclassified instances; passing invert=True drops the -V flag and
    keeps only correctly classified ones.
    """
    options=[
        '-W', self.classifier.to_commandline(),
        '-C', str(self.class_index), #classindex
        # '-F','0', # folds
        # '-T','0.1', #threshold by numeric classes
        '-I','0', # max iterations
        # NOTE(review): when invert=True this inserts an empty-string
        # option; Weka presumably ignores it, but building the list
        # conditionally would be cleaner — confirm before changing.
        '-V' if not invert else ''
    ] # invert
    classname = "weka.filters.unsupervised.instance.RemoveMisclassified"
    remove = Filter(classname=classname, options=options)
    remove.inputformat(self.data)
    self.data = remove.filter(self.data)
def load(path, db):
    """Load a scenario CSV into a Dataset, binarizing device attributes.

    Forces the listed columns to nominal, converts all string attributes
    (except the scenario ID in column 1) to nominal and then to binary
    indicator attributes. Class index is the last attribute.

    :param path: CSV file path
    :param db: database connection stored on the dataset
    """
    nominals = [
        49,  # dev_global_mem_cache_type
        52,  # dev_host_unified_memory
        54,  # dev_local_mem_type
        56,  # dev_type
        57,  # dev_vendor
    ]
    nominal_indices = ",".join([str(index) for index in nominals])
    force_nominal = ["-N", nominal_indices]
    # Load data from CSV.
    dataset = Dataset.load_csv(path, options=force_nominal)
    dataset.__class__ = Dataset
    # Set class index and database connection.
    dataset.class_index = -1
    dataset.db = db
    # Create string->nominal type attribute filter, ignoring the first
    # attribute (scenario ID), since we're not classifying with it.
    string_to_nominal = WekaFilter(classname=("weka.filters.unsupervised."
                                              "attribute.StringToNominal"),
                                   options=["-R", "2-last"])
    string_to_nominal.inputformat(dataset.instances)
    # Create filtered dataset, and swap data around.
    filtered = string_to_nominal.filter(dataset.instances)
    # Create nominal->binary type attribute filter, ignoring the
    # first attribute (scenario ID), since we're not classifying with it.
    n2b = WekaFilter(classname="weka.filters.unsupervised.attribute.NominalToBinary",
                     options=["-R", "2-last"])
    n2b.inputformat(filtered)
    dataset.instances = n2b.filter(filtered)
    return dataset
def emlimitateUnusedFeature(self, trainData, testData=None): trainData.set_class_index(trainData.num_attributes() - 1) # set class attribute featureIndex = -1 filteredTrainData = trainData filteredTestData = testData attribute_index = 0 while attribute_index < filteredTrainData.num_attributes() - 1: sampleCoverage = 0 #print attribute_index # check value for current feature in each instance for instance_index in range(0, filteredTrainData.num_instances()): instance = filteredTrainData.get_instance(instance_index) value = instance.get_value(attribute_index) if value > 0: sampleCoverage += 1 if sampleCoverage == 0: #print "found" remove = Filter( classname="weka.filters.unsupervised.attribute.Remove", options=["-R", str(attribute_index + 1) ]) #The index in this function start from 1 remove.set_inputformat(filteredTrainData) filteredTrainData = remove.filter(filteredTrainData) if filteredTestData: remove = Filter( classname="weka.filters.unsupervised.attribute.Remove", options=["-R", str(attribute_index + 1) ]) #The index in this function start from 1 remove.set_inputformat(filteredTestData) filteredTestData = remove.filter(filteredTestData) else: attribute_index += 1 return [filteredTrainData, filteredTestData]
def getSetDataBySetIndex(self, data, index):
    """Return only the attribute columns of feature set `index` plus the class."""
    # cut feature set out
    table = FeatureTable()
    set_starts = table.getEachSetStartIndex()
    lo = set_starts[index]
    hi = set_starts[index + 1] - 1
    # -V inverts Remove, keeping the range and the class attribute.
    keep_range = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-V", "-R", str(lo) + "-" + str(hi) + ",last"])
    keep_range.set_inputformat(data)
    return keep_range.filter(data)
def unsupervised_discretize(data):
    """
    Function for discretization of data. Function uses weka implementation
    weka.filters.unsupervised.attribute.Discretize.

    :param data: weka arff data
    :return: weka arff data
    """
    args, _sufix = unsupervised_discretize_parser()
    discretizer = Filter(
        classname='weka.filters.unsupervised.attribute.Discretize',
        options=args_to_weka_options(args, _sufix))
    discretizer.inputformat(data)
    return discretizer.filter(data)
def filter_data(self, data):
    """Select attributes with CFS subset evaluation + BestFirst search.

    :param data: dataset to reduce
    :return: dataset containing only the selected attributes
    """
    print("Filtering Data..\n")
    attsel = Filter(
        classname="weka.filters.supervised.attribute.AttributeSelection")
    subset_eval = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval",
        options=["-P", "1", "-E", "1"])
    best_first = ASSearch(classname="weka.attributeSelection.BestFirst",
                          options=["-D", "1", "-N", "5"])
    # Wire evaluator and search into the filter via their Java objects.
    attsel.set_property("evaluator", subset_eval.jobject)
    attsel.set_property("search", best_first.jobject)
    attsel.inputformat(data)
    return attsel.filter(data)
def main():
    """Split train_sorted.arff into one ARFF file per store.

    For each store ID (first attribute, values 1..1115) keeps only the rows
    with that ID by applying two RemoveWithValues splits, drops the store ID
    column, and saves the result to stores/storeN.arff.
    """
    jvm.start()
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("train_sorted.arff")
    numofStores = 1115
    for storeNum in range(0, numofStores):
        tempData = data
        # Keep rows with first attribute < storeNum + 2 (-V inverts removal).
        removeUpper = Filter(
            classname="weka.filters.unsupervised.instance.RemoveWithValues",
            options=[
                "-S", str(storeNum + 2) + ".0", "-C", "first", "-L",
                "first-last", "-V"
            ])
        removeUpper.inputformat(data)
        tempData = removeUpper.filter(data)
        # Then remove rows with first attribute < storeNum + 1,
        # leaving exactly the rows for this store.
        removeLower = Filter(
            classname="weka.filters.unsupervised.instance.RemoveWithValues",
            options=[
                "-S", str(storeNum + 1) + ".0", "-C", "first", "-L",
                "first-last"
            ])
        removeLower.inputformat(tempData)
        tempData = removeLower.filter(tempData)
        #removing the storeID attribute
        tempData.delete_first_attribute()
        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(tempData, "stores/store" + str(storeNum + 1) + ".arff")
        print 'Saved Store' + str(storeNum + 1)
    jvm.stop()
def use_filter(data):
    """
    Uses the AttributeSelection filter for attribute selection.

    :param data: the dataset to use
    :type data: Instances
    """
    print("\n2. Filter")
    attsel_filter = Filter(
        classname="weka.filters.supervised.attribute.AttributeSelection")
    cfs_eval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    greedy = ASSearch(classname="weka.attributeSelection.GreedyStepwise",
                      options=["-B"])
    # Attach evaluator and search strategy through their Java objects.
    attsel_filter.set_property("evaluator", cfs_eval.jobject)
    attsel_filter.set_property("search", greedy.jobject)
    attsel_filter.inputformat(data)
    reduced = attsel_filter.filter(data)
    print(str(reduced))
def obtainSVM(file):
    """Cross-validate LibSVM on an ARFF file and return one ROC-area figure.

    Removes attributes 1-2, treats the last attribute as the class, and runs
    kFold cross-validation with a fixed seed.
    """
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.functions.LibSVM")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))
    info = evaluation.class_details()
    # HACK: extracts the ROC area by a hard-coded character slice of the
    # class-details report — this silently breaks if the report layout,
    # attribute names, or class count change. Prefer a dedicated API such
    # as evaluation.area_under_roc(index) — TODO confirm and replace.
    roc_area = float(info[406:411])
    return roc_area
def filterOutUnnecessaryAPIAndEvaluateOurApproach(self, ourApproahFile, apiFile, indexInTable, methodName, databaseTable, csvFilePath):
    """Remove unselected API features, then evaluate the chosen algorithm on
    growing feature subsets (10, 20, ... and finally all features), writing
    one CSV row of accuracy percentages.

    :param ourApproahFile: ARFF file with the full feature set
    :param apiFile: source for the API ranking used to pick features
    :param indexInTable: algorithm index passed to algorithmPicker
    :param methodName: label written as the first CSV column
    :param databaseTable: ranking table consulted for selection
    :param csvFilePath: output CSV path ("" skips the title row)
    """
    outputStr = methodName+","
    resultList = []
    # Get whole feature set of our approach
    filteredData = self.load_Arff(ourApproahFile)
    # Use this function to get selected API feature and save the unselected api in a list
    filterOutList = self.attribueSelectionBasedOnRankingInDatabase(apiFile, indexInTable, databaseTable, "")[1]
    # Remove unselected API
    for functionName in filterOutList:
        # Escape "()" and "$" so the name is a literal regex for RemoveByName.
        functionName = functionName.split("(")[0] + "\(\)"
        functionName = functionName.replace('$','\$')
        remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + functionName + ".*$"])
        remove.set_inputformat(filteredData)
        filteredData = remove.filter(filteredData)
    featureNum = filteredData.num_attributes() - 1
    print "featureNum: " + str(featureNum)
    if csvFilePath != "":
        self.writeTenScaledTitleManual(featureNum, csvFilePath)
    #print "i:" + str(i)
    #print "functionName:" + functionName
    #print "featureNum: " + str(filteredData.num_attributes() - 1)
    for attributeStr in filteredData.attributes():
        print(attributeStr)
    # Run ten scaled generation and evaluation
    step = 10
    while step < featureNum:
        roundData = self.attributeSelector(filteredData, step)
        classifier = self.algorithmPicker(roundData, indexInTable)
        evaluation = self.evaluation(classifier, roundData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(roundData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        #csvFile.write("{:.2f}".format(evaluation.percent_correct()) +",")
        step += 10
    # Final round: evaluate with the full (filtered) feature set.
    classifier = self.algorithmPicker(filteredData, indexInTable)
    evaluation = self.evaluation(classifier, filteredData)
    #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredData.num_attributes() - 1) + "/" + str(featureNum))
    resultList.append("{:.2f}".format(evaluation.percent_correct()))
    # Write out to CSV file
    for item in resultList:
        outputStr += item +","
    outputStr = outputStr[0:-1] + "\n"
    self.writeToPath(csvFilePath, outputStr)
def load(path, db):
    """Load a scenario CSV into a Dataset with device attributes as nominal.

    Forces the listed columns to nominal and converts all string attributes
    (except the scenario ID in column 1) to nominal. Unlike the other
    variants of this loader, no nominal->binary step is applied here.
    Class index is the last attribute.

    :param path: CSV file path
    :param db: database connection stored on the dataset
    """
    nominals = [
        49,  # dev_double_fp_config
        50,  # dev_endian_little
        51,  # dev_execution_capabilities
        52,  # dev_extensions
        54,  # dev_global_mem_cache_type
        57,  # dev_host_unified_memory
        63,  # dev_image_support
        65,  # dev_local_mem_type
        96,  # dev_queue_properties
        97,  # dev_single_fp_config
        98,  # dev_type
        100,  # dev_vendor_id
    ]
    nominal_indices = ",".join([str(index) for index in nominals])
    force_nominal = ["-N", nominal_indices]
    # Load data from CSV.
    dataset = Dataset.load_csv(path, options=force_nominal)
    dataset.__class__ = Dataset
    # Set class index and database connection.
    dataset.class_index = -1
    dataset.db = db
    # Create string->nominal type attribute filter, ignoring the first
    # attribute (scenario ID), since we're not classifying with it.
    string_to_nominal = WekaFilter(classname=("weka.filters.unsupervised."
                                              "attribute.StringToNominal"),
                                   options=["-R", "2-last"])
    string_to_nominal.inputformat(dataset.instances)
    # Create filtered dataset, and swap data around.
    filtered = string_to_nominal.filter(dataset.instances)
    dataset.instances = filtered
    return dataset
from weka.clusterers import Clusterer, ClusterEvaluation
from weka.filters import Filter
import weka.plot.clusterers as plc

# Start the JVM that backs the python-weka-wrapper classes.
jvm.start()

# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute (clustering is unsupervised)
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.set_inputformat(data)
filtered = flt.filter(data)

# build KMeans with 3 clusters on the class-less data
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.get_cluster_results())
# plot assignments against the original data (still has the class attribute)
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
             options=["-W", "weka.clusterers.SimpleKMeans -N 3"])
def _normalize_dataset(self, dataset):
    """Return *dataset* passed through Weka's Normalize filter."""
    scaler = Filter(
        classname='weka.filters.unsupervised.attribute.Normalize',
        options=[])
    scaler.set_inputformat(dataset)
    return scaler.filter(dataset)
def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
    """
    Run the whole pipeline: parse the input to ARFF, split it into
    stratified training/testing folds, perform the configured feature
    selections, then build, evaluate and plot each configured model,
    finally writing a gist JSON next to the input file.

    cdat      - the dataset object handed to the pipeline
    heap_size - JVM heap size in megabytes
    seed      - RNG seed for the fold split (random 0..1000 if None)
    verbose   - verbosity flag (currently only used by commented-out code)
    """
    self.set_status(Pipeline.RUNNING)
    self.logs.append('Initializing Pipeline')
    para = self.config
    self.logs.append('Reading Pipeline Configuration')
    head = ''
    name = get_rand_uuid_str()
    self.logs.append('Reading Input File')
    # Mark all preprocessing stages as running; take the input file's
    # directory (head) and base name (name) from the 'dat.fle' stage.
    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.RUNNING
        if stage.code == 'dat.fle':
            head = os.path.abspath(stage.value.path)
            name, _ = os.path.splitext(stage.value.name)
    self.logs.append('Parsing to ARFF')
    path = os.path.join(head, '{name}.arff'.format(name = name))
    # This bug, I don't know why, using Config.schema instead.
    # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)
    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.COMPLETE
    self.logs.append('Saved ARFF at {path}'.format(path = path))
    self.logs.append('Splitting to Training and Testing Sets')
    JVM.start(max_heap_size = '{size}m'.format(size = heap_size))
    load = Loader(classname = 'weka.core.converters.ArffLoader')
    # data = load.load_file(path)
    # save = Saver(classname = 'weka.core.converters.ArffSaver')
    data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
    data.class_is_last() # For Debugging Purposes Only
    # data.class_index = cdat.iclss
    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.RUNNING
    self.logs.append('Splitting Training Set')
    # TODO - Check if this seed is worth it.
    seed = assign_if_none(seed, random.randint(0, 1000))
    opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
    # StratifiedRemoveFolds with -V inverts the selection (training set);
    # the same filter without -V yields the complementary testing set.
    wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
    wobj.inputformat(data)
    tran = wobj.filter(data)
    self.logs.append('Splitting Testing Set')
    wobj.options = opts
    test = wobj.filter(data)
    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.COMPLETE
    self.logs.append('Performing Feature Selection')
    feat = [ ]
    for comb in para.FEATURE_SELECTION:
        if comb.USE:
            # Flag the matching attribute-selection stage as running.
            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.RUNNING
            srch = ASSearch(classname = 'weka.attributeSelection.{classname}'.format(
                classname = comb.Search.NAME,
                options = assign_if_none(comb.Search.OPTIONS, [ ])
            ))
            ewal = ASEvaluation(classname = 'weka.attributeSelection.{classname}'.format(
                classname = comb.Evaluator.NAME,
                options = assign_if_none(comb.Evaluator.OPTIONS, [ ])
            ))
            attr = AttributeSelection()
            attr.search(srch)
            attr.evaluator(ewal)
            attr.select_attributes(tran)
            # Record which attributes this search/evaluator pair selected.
            meta = addict.Dict()
            meta.search = comb.Search.NAME
            meta.evaluator = comb.Evaluator.NAME
            meta.features = [tran.attribute(index).name for index in attr.selected_attributes]
            feat.append(meta)
            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.COMPLETE
    models = [ ]
    for model in para.MODEL:
        if model.USE:
            summary = addict.Dict()
            self.logs.append('Modelling {model}'.format(model = model.LABEL))
            summary.label = model.LABEL
            summary.name = model.NAME
            summary.options = assign_if_none(model.OPTIONS, [ ])
            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.RUNNING
            # NOTE(review): iclass keeps only the value from the *last*
            # instance - presumably every instance reports the same
            # num_classes; confirm before relying on it.
            for i, instance in enumerate(data):
                iclass = list(range(instance.num_classes))
            options = assign_if_none(model.OPTIONS, [ ])
            classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
            classifier.build_classifier(tran)
            # Persist the trained model next to the input data.
            serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                name = name,
                classname = model.NAME
            )), classifier)
            self.logs.append('Testing model {model}'.format(model = model.LABEL))
            evaluation = Evaluation(tran)
            evaluation.test_model(classifier, test)
            summary.summary = evaluation.summary()
            # Confusion matrix both as raw values and as a base64 heatmap.
            frame = pd.DataFrame(data = evaluation.confusion_matrix)
            axes = sns.heatmap(frame, cbar = False, annot = True)
            b64str = get_b64_plot(axes)
            summary.confusion_matrix = addict.Dict({
                'value': evaluation.confusion_matrix.tolist(),
                'plot': b64str
            })
            self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))
            buffer = io.BytesIO()
            plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
            b64str = buffer_to_b64(buffer)
            summary.learning_curve = b64str
            buffer = io.BytesIO()
            plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
            b64str = buffer_to_b64(buffer)
            summary.roc_curve = b64str
            buffer = io.BytesIO()
            plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
            b64str = buffer_to_b64(buffer)
            summary.prc_curve = b64str
            if classifier.graph:
                summary.graph = classifier.graph
            # NOTE(review): predictions are computed but discarded here.
            for i, instance in enumerate(test):
                prediction = classifier.classify_instance(instance)
            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.COMPLETE
            models.append(summary)
    self.gist.models = models
    JVM.stop()
    JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)
    self.logs.append('Pipeline Complete')
    self.set_status(Pipeline.COMPLETE)
def main(): """ Just runs some example code. """ # load a dataset data_file = helper.get_data_dir() + os.sep + "vote.arff" helper.print_info("Loading dataset: " + data_file) loader = Loader("weka.core.converters.ArffLoader") data = loader.load_file(data_file) data.class_is_last() # classifier classifier = Classifier(classname="weka.classifiers.trees.J48") # randomize data folds = 10 seed = 1 rnd = Random(seed) rand_data = Instances.copy_instances(data) rand_data.randomize(rnd) if rand_data.class_attribute.is_nominal: rand_data.stratify(folds) # perform cross-validation and add predictions predicted_data = None evaluation = Evaluation(rand_data) for i in xrange(folds): train = rand_data.train_cv(folds, i) # the above code is used by the StratifiedRemoveFolds filter, # the following code is used by the Explorer/Experimenter # train = rand_data.train_cv(folds, i, rnd) test = rand_data.test_cv(folds, i) # build and evaluate classifier cls = Classifier.make_copy(classifier) cls.build_classifier(train) evaluation.test_model(cls, test) # add predictions addcls = Filter( classname="weka.filters.supervised.attribute.AddClassification", options=["-classification", "-distribution", "-error"]) # setting the java object directory avoids issues with correct quoting in option array addcls.set_property("classifier", Classifier.make_copy(classifier)) addcls.inputformat(train) addcls.filter(train) # trains the classifier pred = addcls.filter(test) if predicted_data is None: predicted_data = Instances.template_instances(pred, 0) for n in xrange(pred.num_instances): predicted_data.add_instance(pred.get_instance(n)) print("") print("=== Setup ===") print("Classifier: " + classifier.to_commandline()) print("Dataset: " + data.relationname) print("Folds: " + str(folds)) print("Seed: " + str(seed)) print("") print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ===")) print("") print(predicted_data)
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

# Start the JVM backing the Weka wrapper classes.
jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# output header (dataset structure without the rows)
print(Instances.template_instances(data))

# remove attribute no 3
print("\nRemove attribute no 3")
fltr = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "3"])
fltr.set_inputformat(data)
filtered = fltr.filter(data)

# output header
print(Instances.template_instances(filtered))

# save modified dataset
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.arff")

jvm.stop()
def merge_nominal_attributes(self, significance=0.01):
    """Merge insignificantly different nominal values across all attributes.

    Applies Weka's supervised MergeNominalValues filter to ``self.data``
    in place, using *significance* as the merging threshold.
    """
    merger = Filter(
        classname="weka.filters.supervised.attribute.MergeNominalValues",
        options=["-L", str(significance), "-R", "first-last"])
    merger.inputformat(self.data)
    self.data = merger.filter(self.data)
def getTenScaledResultsRankedByInfo(self, trainingData, indexInTable, csvFilePath, testingData = None):
    # Evaluate the chosen algorithm while keeping only the top 10, 20, ...
    # information-gain-ranked features (plus a final all-features round),
    # appending one CSV row of accuracies and returning the best run:
    # [bestAccuracy, bestTrainData, bestTestData, resultList].
    dbmgr = permissionMappingManager(databasePath)
    featureNum = trainingData.num_attributes() - 1  # last attribute is the class
    attributeIn = trainingData.attributes()
    # Build regex-escaped names for every attribute in the training data.
    attributeList = []
    for item in attributeIn:
        functionName = str(item).split(" ")[1]
        functionName = functionName.split("(")[0] + "\(\)"
        functionName = functionName.replace('$','\$')
        #print functionName
        attributeList.append(functionName)
    outputStr = ""
    outputStr += "InfomationGain" + ","
    resultList = []
    bestAccuracy = 0
    bestTrainData = 0
    bestTestData = 0
    #for index in range(0, len(attributeList)-1):
    #    attributeList[index] = attributeList[index].split(" ")[1]
    #    print attributeList[index]
    csvFile = open(csvFilePath, "a")
    csvFile.write(self.algorithmTable[indexInTable]+",")
    step = 10
    while step < featureNum:
        # pick top features
        filteredTrainData = self.attributeSelector(trainingData, step)
        # check top feature informations
        APIList = []
        for item in filteredTrainData.attributes():
            #print str(item)
            functionName = str(item).split(" ")[1]
            #functionName = functionName.split("_")[0][1:]
            APIList.append(functionName)
        numberOfInstance = self.getNumOfInstance(trainingData)
        # Get those features that it doesn't pick
        filteredList = []
        attributeIn = filteredTrainData.attributes()
        for item in attributeIn:
            functionName = str(item).split(" ")[1]
            functionName = functionName.split("(")[0] + "\(\)"
            functionName = functionName.replace('$','\$')
            filteredList.append(functionName)
        items = self.getItemsNotInTheList(attributeList, filteredList)
        #print len(items)
        #for item in items:
        #    print item
        # Re-process training data and make testing Data synchronized
        filteredTrainData = trainingData
        filterTestingData = testingData
        for attribute in items:
            # Drop each unselected attribute by (escaped) name from the
            # training set and, when present, from the testing set too.
            remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + attribute + ".*$"])
            remove.set_inputformat(filteredTrainData)
            filteredTrainData = remove.filter(filteredTrainData)
            if filterTestingData:
                remove.set_inputformat(filterTestingData)
                filterTestingData = remove.filter(filterTestingData)
            #print attribute
            #print str(filteredTrainData.num_attributes() - 1)
        # Build classifier and evaluate it
        classifier = self.algorithmPicker(filteredTrainData, indexInTable)
        evaluation = self.evaluation(classifier, filteredTrainData, filterTestingData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredTrainData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        #Save best data and accuracy
        if evaluation.percent_correct() > bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainData = filteredTrainData
            if testingData:
                bestTestData = filterTestingData
            #bestEvaluation = evaluation
        step += 10
    # Final round: evaluate on the complete, unfiltered training data.
    classifier = self.algorithmPicker(trainingData, indexInTable)
    evaluation = self.evaluation(classifier, trainingData, testingData)
    #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
    resultList.append("{:.2f}".format(evaluation.percent_correct()))
    #Save best data and accuracy
    if evaluation.percent_correct() > bestAccuracy:
        bestAccuracy = evaluation.percent_correct()
        bestTrainData = filteredTrainData
        if testingData:
            bestTestData = filterTestingData
        #bestEvaluation = evaluation
    # Write the accumulated accuracies as one CSV row.
    for item in resultList:
        outputStr += item +","
    outputStr = outputStr[0:-1] + "\n"
    self.writeToPath(csvFilePath, outputStr)
    return [bestAccuracy, bestTrainData, bestTestData, resultList]
def testing():
    # Run the full J48 experiment grid: {unpruned, pruned} x {40,50,60,70}%
    # training split x {remove feature 15..1, no removal}; each cell is
    # repeated 100 times on a freshly randomized copy of hasil.arff and the
    # per-run plus aggregate metrics are written under hasilTest/.
    logging.disable("weka")
    print "PROSES KLASIFIKASI\n------------------"
    jvm.start()
    pruning = 0
    while pruning < 2:
        persen_train = 0
        while persen_train < 4:
            fitur_hapus = 15
            while fitur_hapus >= 0:
                list_akurasi = []
                list_recall = []
                list_presisi = []
                list_fmeasure = []
                list_roc = []
                count = 0
                # Build the output file name from the current grid cell.
                nama = "hasilTest/"
                if(pruning == 0):
                    nama += "unpruning"
                    if(persen_train == 0):
                        nama += "40"
                    elif(persen_train == 1):
                        nama += "50"
                    elif(persen_train == 2):
                        nama += "60"
                    else:
                        nama += "70"
                else:
                    nama += "pruning"
                    if(persen_train == 0):
                        nama += "40"
                    elif(persen_train == 1):
                        nama += "50"
                    elif(persen_train == 2):
                        nama += "60"
                    else:
                        nama += "70"
                if(fitur_hapus > 0):
                    nama += "removeF" + str(fitur_hapus) + ".txt"
                else:
                    nama += "normal.txt"
                f = open(nama, "w")
                # Write the header describing this configuration.
                if(pruning == 0):
                    nama = "unpruning"
                    print "Tanpa Pruning"
                    f.write("Hasil Decision Tree C4.5 tanpa Pruning (unpruning)\n")
                    if(persen_train == 0):
                        nama += "40"
                        f.write("Dengan Training Set sebesar 40%\n")
                    elif(persen_train == 1):
                        nama += "50"
                        f.write("Dengan Training Set sebesar 50%\n")
                    elif(persen_train == 2):
                        nama += "60"
                        f.write("Dengan Training Set sebesar 60%\n")
                    else:
                        nama += "70"
                        f.write("Dengan Training Set sebesar 70%\n")
                else:
                    nama = "pruning"
                    print "Dengan Pruning"
                    f.write("Hasil Decision Tree C4.5 Pruning\n")
                    if(persen_train == 0):
                        nama += "40"
                        f.write("Dengan Training Set sebesar 40%\n")
                    elif(persen_train == 1):
                        nama += "50"
                        f.write("Dengan Training Set sebesar 50%\n")
                    elif(persen_train == 2):
                        nama += "60"
                        f.write("Dengan Training Set sebesar 60%\n")
                    else:
                        nama += "70"
                        f.write("Dengan Training Set sebesar 70%\n")
                if(fitur_hapus > 0):
                    f.write("Menggunakan remove pada fitur " + str(fitur_hapus) + "\n\n")
                else:
                    f.write("\n")
                f.write("No. Akurasi Recall Presisi F-Measure ROC\n")
                if persen_train == 0:
                    print "40% Data Training"
                elif persen_train == 1:
                    print "50% Data Training"
                elif persen_train == 2:
                    print "60% Data Training"
                else:
                    print "70% Data Training"
                print "Fitur yang dihapus:", fitur_hapus
                print "\nNo.\tAkurasi\tRecall\tPresisi\tF-Measure\tROC"
                # 100 randomized repetitions for this configuration.
                while count < 100:
                    loader = Loader(classname = "weka.core.converters.ArffLoader")
                    data = loader.load_file("hasil.arff")
                    data.class_is_last()
                    if(fitur_hapus > 0):
                        remove = Filter(classname = "weka.filters.unsupervised.attribute.Remove", options = ["-R", str(fitur_hapus)])
                        remove.inputformat(data)
                        data_baru = remove.filter(data)
                        data_baru.class_is_last()
                    else:
                        data_baru = loader.load_file("hasil.arff")
                        data_baru.class_is_last()
                    # Shuffle the instances with a time-based seed.
                    filter = Filter(classname = "weka.filters.unsupervised.instance.Randomize", options = ["-S", str(int(time.time()))])
                    filter.inputformat(data_baru)
                    data_random = filter.filter(data_baru)
                    data_random.class_is_last()
                    # -U = unpruned tree; -C 0.25 = default pruning confidence.
                    if(pruning == 0):
                        classifier = Classifier(classname = "weka.classifiers.trees.J48", options = ["-U"])
                    else:
                        classifier = Classifier(classname = "weka.classifiers.trees.J48", options = ["-C", "0.25"])
                    evaluation = Evaluation(data_random)
                    if(persen_train == 0):
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 40)
                    elif(persen_train == 1):
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 50)
                    elif(persen_train == 2):
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 60)
                    else:
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 70)
                    f.write(str(count + 1) + str( ". " ) + str(evaluation.weighted_true_positive_rate) + str( " " ) + str(evaluation.weighted_recall) + str( " " ) + str(evaluation.weighted_precision) + str( " " ) + str(evaluation.weighted_f_measure) + str( " " ) + str(evaluation.weighted_area_under_roc) + "\n")
                    print count + 1, evaluation.weighted_true_positive_rate, evaluation.weighted_recall, evaluation.weighted_precision, evaluation.weighted_f_measure, evaluation.weighted_area_under_roc
                    list_akurasi.append(evaluation.weighted_true_positive_rate)
                    list_recall.append(evaluation.weighted_recall)
                    list_presisi.append(evaluation.weighted_precision)
                    list_fmeasure.append(evaluation.weighted_f_measure)
                    list_roc.append(evaluation.weighted_area_under_roc)
                    count += 1
                    time.sleep(1)
                # Aggregate: mean, max and min of each metric over the runs.
                list_akurasi.sort()
                list_recall.sort()
                list_presisi.sort()
                list_fmeasure.sort()
                list_roc.sort()
                f.write( "" + "\n")
                f.write( "Rata-Rata" + "\n")
                f.write( "Akurasi:" + str(sum(list_akurasi) / 100.0) + "\n")
                f.write( "Recall:" + str(sum(list_recall) / 100.0) + "\n")
                f.write( "Presisi:" + str(sum(list_presisi) / 100.0) + "\n")
                f.write( "F-Measure:" + str(sum(list_fmeasure) / 100.0) + "\n")
                f.write( "ROC:" + str(sum(list_roc) / 100.0) + "\n")
                f.write( "" + "\n")
                f.write( "Max" + "\n")
                f.write( "Akurasi:" + str(list_akurasi[-1] ) + "\n")
                f.write( "Recall:" + str(list_recall[-1] ) + "\n")
                f.write( "Presisi:" + str(list_presisi[-1] ) + "\n")
                f.write( "F-Measure:" + str(list_fmeasure[-1] ) + "\n")
                f.write( "ROC:" + str(list_roc[-1] ) + "\n")
                f.write( "" + "\n")
                f.write( "Min" + "\n")
                f.write( "Akurasi:" + str(list_akurasi[0] ) + "\n")
                f.write( "Recall:" + str(list_recall[0] ) + "\n")
                f.write( "Presisi:" + str(list_presisi[0] ) + "\n")
                f.write( "F-Measure:" + str(list_fmeasure[0] ) + "\n")
                f.write( "ROC:" + str(list_roc[0] ) + "\n")
                f.write( "" + "\n")
                print ""
                print "Rata-Rata"
                print "Akurasi:", sum(list_akurasi) / 100.0
                print "Recall:", sum(list_recall) / 100.0
                print "Presisi:", sum(list_presisi) / 100.0
                print "F-Measure:", sum(list_fmeasure) / 100.0
                print "ROC:", sum(list_roc) / 100.0
                print ""
                print "Max"
                print "Akurasi:", list_akurasi[-1]
                print "Recall:", list_recall[-1]
                print "Presisi:", list_presisi[-1]
                print "F-Measure:", list_fmeasure[-1]
                print "ROC:", list_roc[-1]
                print ""
                print "Min"
                print "Akurasi:", list_akurasi[0]
                print "Recall:", list_recall[0]
                print "Presisi:", list_presisi[0]
                print "F-Measure:", list_fmeasure[0]
                print "ROC:", list_roc[0]
                print ""
                f.close()
                fitur_hapus -= 1
            persen_train += 1
        pruning += 1
    jvm.stop()
def run_classifier(path, prot, sel, cols, prot_vals, beta):
    # For the full dataset and then for each single attribute, train
    # NaiveBayes with the protected attribute as the class and derive a
    # disparate-impact-style score (DI) from the balanced error rate (BER).
    #
    # path      - CSV dataset path
    # prot      - protected attribute descriptor; prot[1] is the positive
    #             value, prot[2] its 0-based column index -- TODO confirm
    # sel       - selected/target attribute descriptor; sel[2] is its index
    # cols      - column names of the dataset
    # prot_vals - per-row values of the protected attribute
    # beta      - smoothing parameter in the DI formula
    #
    # Returns {"all": DI, <col>: DI, ...}.
    DIs = dict()
    jvm.start()
    for i in range(len(cols)-1):
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file(path)
        # remove selected attribute from the data
        # NOTE: options are ONE indexed, not ZERO indexed
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                        options=["-R", str(sel[2]+1)])
        remove.inputformat(data)
        data = remove.filter(data)
        # if running for only one attribue, remove all others (except protected)
        # NOTE(review): the "1"/"2" index choice relies on attributes shifting
        # left after each removal -- verify before modifying.
        if i > 0:
            for j in range(1, prot[2]+1):
                if i != j:
                    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                                    options=["-R", ("1" if i>j else "2")])
                    remove.inputformat(data)
                    data = remove.filter(data)
        # set prot attribute as Class attribute
        data.class_is_last()
        # run classifier
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(data)
        # count the number of each combination
        pos_and_pred = float(0.0)
        pos_and_not_pred = float(0.0)
        neg_and_pred = float(0.0)
        neg_and_not_pred = float(0.0)
        for ind, inst in enumerate(data):
            # classify_instance yields the predicted class index; a nonzero
            # index is treated as the "predicted" outcome here.
            if cls.classify_instance(inst):
                if prot_vals[ind] == prot[1]:
                    pos_and_pred += 1
                else:
                    neg_and_pred += 1
            else:
                if prot_vals[ind] == prot[1]:
                    pos_and_not_pred += 1
                else:
                    neg_and_not_pred += 1
        # calculate DI
        BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) + \
               (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5
        if BER > 0.5:
            BER = 1 - BER
        DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))
        if i == 0:
            # consider changing this to a 'code word' instead of 'all'
            DIs["all"] = DI
        else:
            DIs[cols[i-1]] = DI
    jvm.stop()
    return DIs
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# plot petalwidth vs petallength
pld.scatter_plot(
    data, data.get_attribute_by_name("petalwidth").get_index(),
    data.get_attribute_by_name("petallength").get_index(),
    wait=False)

# add classifier errors to dataset (J48 predictions + error flag columns)
addcls = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.trees.J48", "-classification", "-error"])
addcls.set_inputformat(data)
filtered = addcls.filter(data)
print(filtered)

# build J48 and evaluate on the training data itself
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(data)
evl = Evaluation(data)
evl.test_model(cls, data)

# plot classifier errors
plc.plot_classifier_errors(evl.predictions(), wait=True)

jvm.stop()
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate J48
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("All attributes: %0.0f%%" % evl.percent_correct)

# remove attributes (1) and cross-validate J48
# -V inverts the match, so only the named attributes are kept.
atts = "RI|Mg|Type"
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "(" + atts + ")", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
print(atts + ": %0.0f%%" % evl.percent_correct)

# remove attributes (2) and cross-validate J48
atts = "RI|Na|Mg|Ca|Ba|Type"
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "(" + atts + ")", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
print(atts + ": %0.0f%%" % evl.percent_correct)
def main():
    """
    Just runs some example code.

    Demonstrates several filter features: Remove, MultiFilter,
    StringToWordVector with custom stemmer/stopwords/tokenizer,
    partial classnames, and filter source-code generation.
    """
    # load a dataset
    iris = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris)

    # remove class attribute
    helper.print_info("Removing class attribute")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(data)
    filtered = remove.filter(data)

    # use MultiFilter (Remove first attribute, then Standardize)
    helper.print_info("Use MultiFilter")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    std = Filter(classname="weka.filters.unsupervised.attribute.Standardize")
    multi = MultiFilter()
    multi.filters = [remove, std]
    multi.inputformat(data)
    filtered_multi = multi.filter(data)

    # output datasets
    helper.print_title("Input")
    print(data)
    helper.print_title("Output")
    print(filtered)
    helper.print_title("Output (MultiFilter)")
    print(filtered_multi)

    # load text dataset
    text = helper.get_data_dir() + os.sep + "reutersTop10Randomized_1perc_shortened.arff"
    helper.print_info("Loading dataset: " + text)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(text)
    data.class_is_last()

    # apply StringToWordVector
    stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    stopwords = Stopwords(classname="weka.core.stopwords.Rainbow")
    tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = stemmer
    s2wv.stopwords = stopwords
    s2wv.tokenizer = tokenizer
    s2wv.inputformat(data)
    filtered = s2wv.filter(data)
    helper.print_title("Input (StringToWordVector)")
    print(data)
    helper.print_title("Output (StringToWordVector)")
    print(filtered)

    # partial classname
    helper.print_title("Creating filter from partial classname")
    clsname = ".Standardize"
    f = Filter(classname=clsname)
    print(clsname + " --> " + f.classname)

    # source code
    helper.print_info("Generate source code")
    bolts = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + bolts)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(bolts)
    replace = Filter(classname="weka.filters.unsupervised.attribute.ReplaceMissingValues")
    replace.inputformat(data)
    replace.filter(data)
    print(replace.to_source("MyReplaceMissingValues", data))
# close csvfile csvfile.close() # start JVM jvm.start() # load CSV file loader = Loader(classname="weka.core.converters.CSVLoader", options=["-E", '"', "-F", ","]) data = loader.load_file(csvfilename) #print(data) # convert class to nominal wfilter = Filter(classname="weka.filters.unsupervised.attribute.StringToNominal", options=["-R", "last"]) wfilter.set_inputformat(data) data = wfilter.filter(data) # convert content to string wfilter = Filter(classname="weka.filters.unsupervised.attribute.NominalToString", options=["-C", "first"]) wfilter.set_inputformat(data) data = wfilter.filter(data) # set class attribute data.set_class_index(data.num_attributes() - 1) # generate baseline zeror = Classifier(classname="weka.classifiers.rules.ZeroR") evaluation = Evaluation(data) evaluation.crossvalidate_model(zeror, data, 10, Random(1)) print("\nBaseline:\n" + evaluation.to_summary())
# load diabetes loader = Loader(classname="weka.core.converters.ArffLoader") fname = data_dir + os.sep + "diabetes.arff" print("\nLoading dataset: " + fname + "\n") data = loader.load_file(fname) data.class_is_last() # simulate the 10 train/test pairs of cross-validation evl = Evaluation(data) for i in xrange(1, 11): # create train set remove = Filter( classname="weka.filters.supervised.instance.StratifiedRemoveFolds", options=["-N", "10", "-F", str(i), "-S", "1", "-V"]) remove.inputformat(data) train = remove.filter(data) # create test set remove = Filter( classname="weka.filters.supervised.instance.StratifiedRemoveFolds", options=["-N", "10", "-F", str(i), "-S", "1"]) remove.inputformat(data) test = remove.filter(data) cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(train) evl.test_model(cls, test) print("Simulated CV accuracy: %0.1f%%" % evl.percent_correct) # perform actual cross-validation
def remove_attributes(self, *attributes):
    """Remove the named attributes from ``self.instances`` in place."""
    positions = [self.attribute_index(name) for name in attributes]
    # Weka's Remove filter uses 1-based attribute indices.
    range_spec = ','.join(str(pos + 1) for pos in positions)
    filt = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                  options=["-R", range_spec])
    filt.inputformat(self.instances)
    self.instances = filt.filter(self.instances)
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.filters import Filter

# Start the JVM backing the Weka wrapper classes.
jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# we'll set the class attribute after filtering

# apply NominalToBinary filter and set class attribute
fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary")
fltr.inputformat(data)
filtered = fltr.filter(data)
filtered.class_is_last()

# cross-validate LinearRegression on filtered data, display model
cls = Classifier(classname="weka.classifiers.functions.LinearRegression")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
print("10-fold cross-validation:\n" + evl.summary())
print("Predictions:\n\n" + str(pout))
cls.build_classifier(filtered)
print("Model:\n\n" + str(cls))

# use AddClassification filter with LinearRegression on filtered data
print("Applying AddClassification to filtered data:\n")
# NOTE(review): the Filter(...) call below is truncated in this chunk;
# its argument list continues outside the visible source.
fltr = Filter(
def attribueSelectionBasedOnRankingInDatabase(self, trainingData, indexInTable, databaseTable, csvFilePath, testingData=None):
    """Greedy backward feature elimination driven by a ranking stored in the DB.

    Reads the ranked feature names (first column) from ``databaseTable``,
    evaluates the classifier selected by ``indexInTable`` on the full
    feature set, then repeatedly strips features from the end of the
    ranking — first ``len % 10`` of them, then batches of 10 — via Weka's
    RemoveByName filter, re-evaluating after each batch and remembering
    the best-scoring round.  One CSV row with all round accuracies is
    appended to ``csvFilePath``.

    :param trainingData: Weka Instances used for training/evaluation
    :param indexInTable: index into ``self.algorithmTable`` choosing the classifier
    :param databaseTable: DB table whose first column holds ranked feature names
    :param csvFilePath: file the accuracy CSV row is appended to
    :param testingData: optional Weka Instances filtered in lockstep with the
        training data (evaluated on when present)
    :return: ``[bestEvaluation, bestTrainingData, bestTestingData, resultList]``
    """
    featureNum = trainingData.num_attributes() - 1
    outputStr = databaseTable + ","

    # Ranked feature names from the database.  featureList3 is consumed via
    # pop() (i.e. from the end of the table's ordering); wholefeatureList
    # keeps an untouched copy.  NOTE(review): elimination order therefore
    # depends on the table's row order — confirm against the DB schema.
    featureList3 = []
    wholefeatureList = []
    dbmgr = permissionMappingManager(databasePath)
    for row in dbmgr.query("select * from " + databaseTable):
        featureList3.append(row[0])
        wholefeatureList.append(row[0])

    resultList = []
    # Leftover count so every later elimination round removes exactly 10.
    digit = len(featureList3) % 10
    # Best round so far; dict so the local helper below can mutate it
    # without `nonlocal` (file is Python-2 compatible).
    best = {"accuracy": 0, "train": None, "test": None,
            "remaining": [], "evaluation": None}

    def _removeNextFeature(train, test):
        # Strip all attributes generated from the next (last-ranked) function
        # name, matching them by an escaped-name regex.  Raw strings fix the
        # invalid escape sequences of the original ("\(\)", '\$') while
        # producing byte-identical patterns.
        functionName = featureList3.pop().split("(")[0] + r"\(\)"
        functionName = functionName.replace('$', r'\$')
        remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
                        options=["-E", "^" + functionName + ".*$"])
        remove.set_inputformat(train)
        train = remove.filter(train)
        if test:
            remove.set_inputformat(test)
            test = remove.filter(test)
        return train, test

    def _evaluateRound(train, test):
        # Evaluate the configured classifier, log the round, and remember it
        # if it ties or beats the best accuracy seen so far (>= keeps the
        # smallest feature set among equal scores, as in the original).
        classifier = self.algorithmPicker(train, indexInTable)
        evaluation = self.evaluation(classifier, train, test)
        pct = evaluation.percent_correct()
        if pct >= best["accuracy"]:
            best["accuracy"] = pct
            best["train"] = train
            best["test"] = test
            best["remaining"] = list(featureList3)
            best["evaluation"] = evaluation
        print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(pct) +
              ", Feature select number:" + str(train.num_attributes() - 1) +
              "/" + str(featureNum))
        resultList.append("{:.2f}".format(pct))

    # Baseline: full feature set.
    _evaluateRound(trainingData, testingData)

    # Remove the leftover (len % 10) features, then re-evaluate.
    # NOTE(review): re-evaluation happens only when something was removed.
    if digit > 0:
        for _ in range(digit):
            trainingData, testingData = _removeNextFeature(trainingData, testingData)
        _evaluateRound(trainingData, testingData)

    # Keep removing 10 features per round while more than 10 remain.
    while trainingData.num_attributes() - 1 > 10:
        for _ in range(10):
            trainingData, testingData = _removeNextFeature(trainingData, testingData)
        _evaluateRound(trainingData, testingData)

    resultList.reverse()
    # Features eliminated from the best round (kept for debugging/inspection).
    fileteredfeatureList = [item for item in wholefeatureList
                            if item not in best["remaining"]]

    for item in resultList:
        outputStr += item + ","
    outputStr = outputStr[0:-1] + "\n"
    # Fixed: was the Python 2 statement `print outputStr`, inconsistent with
    # the print() calls used elsewhere in this method.
    print(outputStr)
    self.writeToPath(csvFilePath, outputStr)
    return [best["evaluation"], best["train"], best["test"], resultList]