def main():
    """
    Just runs some example code.
    """
    # load ARFF file
    helper.print_title("Loading ARFF file")
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(helper.get_data_dir() + os.sep + "iris.arff")
    print(str(data))

    # load CSV file
    helper.print_title("Loading CSV file")
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(helper.get_data_dir() + os.sep + "iris.csv")
    print(str(data))

    # load directory
    # change this to something sensible
    text_dir = "/some/where"
    if os.path.exists(text_dir) and os.path.isdir(text_dir):
        helper.print_title("Loading directory: " + text_dir)
        loader = TextDirectoryLoader(options=["-dir", text_dir, "-F", "-charset", "UTF-8"])
        data = loader.load()
        print(str(data))
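# How the example functions in this collection are typically invoked: a minimal
# sketch, assuming python-weka-wrapper's weka.core.jvm module. The JVM must be
# running before any Loader/Classifier call and stopped again afterwards.
import traceback
import weka.core.jvm as jvm

if __name__ == "__main__":
    try:
        jvm.start()  # start the JVM (jvm.start(packages=True) for examples needing Weka packages)
        main()
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()  # shut the JVM down again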
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with the
    training set and evaluates the built model on the test set.

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
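# As an alternative to the single train/test split above, the same Evaluation class
# can cross-validate the (untrained) classifier directly. A minimal sketch, assuming
# the Classifier/Evaluation/Random imports used by main():
def crossvalidate(data, folds=10):
    """Cross-validates J48 on the full dataset and prints the summary."""
    cls = Classifier(classname="weka.classifiers.trees.J48")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, folds, Random(1))
    print(evl.summary())
    print(evl.matrix())  # confusion matrix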
def main():
    """
    Just runs some example code.
    """
    classifier = Classifier("weka.classifiers.trees.J48")
    helper.print_title("Capabilities")
    capabilities = classifier.capabilities
    print(capabilities)

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    data_capabilities = Capabilities.for_instances(iris_data)
    print(data_capabilities)
    print("classifier handles dataset: " + str(capabilities.supports(data_capabilities)))

    # disable/enable
    helper.print_title("Disable/Enable")
    capability = Capability(member="UNARY_ATTRIBUTES")
    capabilities.disable(capability)
    capabilities.min_instances = 10
    print("Removing: " + str(capability))
    print(capabilities)
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    full = loader.load_file(iris_file)
    full.class_is_last()

    # remove class attribute
    data = Instances.copy_instances(full)
    data.no_class()
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print("done")

    # classes to clusters
    evl = ClusterEvaluation()
    evl.set_model(clusterer)
    evl.test_model(full)
    helper.print_title("Cluster results")
    print(evl.cluster_results)
    helper.print_title("Classes to clusters")
    print(evl.classes_to_clusters)
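# Density-based clusterers can be cross-validated as well. A minimal sketch, assuming
# "data" is the class-less dataset prepared above and Random is imported:
def crossvalidate_clusterer(data, folds=10):
    """Cross-validates EM and prints the log-likelihood."""
    clusterer = Clusterer(classname="weka.clusterers.EM")
    llh = ClusterEvaluation.crossvalidate_model(clusterer, data, folds, Random(1))
    print("Cross-validated log-likelihood: " + str(llh))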
def main(args):
    """
    Trains a NaiveBayesUpdateable classifier incrementally on a dataset.
    The dataset can be supplied as parameter.

    :param args: the commandline arguments
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file, incremental=True)
    data.class_is_last()

    # classifier
    nb = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    nb.build_classifier(data)

    # train incrementally
    for inst in loader:
        nb.update_classifier(inst)
    print(nb)
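# Once trained, the updateable classifier is used like any batch-trained one.
# A minimal sketch (hypothetical helper, assuming a loaded dataset with the class set):
def predict(nb, data):
    """Prints the predicted label index and class distribution per instance."""
    for index, inst in enumerate(data):
        pred = nb.classify_instance(inst)
        dist = nb.distribution_for_instance(inst)
        print(str(index + 1) + ": label index=" + str(pred) + ", distribution=" + str(dist))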
def gridsearch():
    """
    Applies GridSearch to a dataset. The GridSearch package must NOT be installed,
    as the monolithic weka.jar already contains this package.
    """
    helper.print_title("GridSearch")

    # load a dataset
    fname = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading train: " + fname)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(fname)
    train.class_is_last()

    # classifier
    grid = GridSearch(options=["-sample-size", "100.0", "-traversal", "ROW-WISE",
                               "-num-slots", "1", "-S", "1"])
    grid.evaluation = "CC"
    grid.y = {"property": "kernel.gamma", "min": -3.0, "max": 3.0, "step": 1.0,
              "base": 10.0, "expression": "pow(BASE,I)"}
    grid.x = {"property": "C", "min": -3.0, "max": 3.0, "step": 1.0,
              "base": 10.0, "expression": "pow(BASE,I)"}
    cls = Classifier(
        classname="weka.classifiers.functions.SMOreg",
        options=["-K", "weka.classifiers.functions.supportVector.RBFKernel"])
    grid.classifier = cls
    grid.build_classifier(train)
    print("Model:\n" + str(grid))
    print("\nBest setup:\n" + grid.best.to_commandline())
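# The docstring above requires that GridSearch is NOT installed as a separate package.
# If package support is enabled (jvm.start(packages=True)), a guard like this sketch
# can verify that; it assumes weka.core.packages and its is_installed function:
import weka.core.packages as packages

def gridsearch_available():
    """Returns whether it is safe to run the gridsearch() example."""
    return not packages.is_installed("gridSearch")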
def load_incremental():
    """
    Loads a dataset incrementally.
    """
    # setup the flow
    helper.print_title("Load dataset (incremental)")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="load dataset")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    flow.actors.append(loaddataset)

    console = Console()
    flow.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def load_custom_loader():
    """
    Loads a dataset using a custom loader.
    """
    # setup the flow
    helper.print_title("Load dataset (custom loader)")
    iris = helper.get_data_dir() + os.sep + "iris.csv"

    flow = Flow(name="load dataset")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = False
    loaddataset.config["use_custom_loader"] = True
    loaddataset.config["custom_loader"] = Loader(classname="weka.core.converters.CSVLoader")
    flow.actors.append(loaddataset)

    console = Console()
    flow.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code: plots a dataset.
    """
    # setup the flow
    helper.print_title("Plot dataset")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="plot dataset")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    flow.actors.append(loaddataset)

    branch = Branch()
    flow.actors.append(branch)

    seq = Sequence(name="matrix plot")
    branch.actors.append(seq)

    mplot = MatrixPlot()
    mplot.config["percent"] = 50.0
    mplot.config["wait"] = False
    seq.actors.append(mplot)

    seq = Sequence(name="line plot")
    branch.actors.append(seq)

    copy = Copy()
    seq.actors.append(copy)

    flter = Filter()
    flter.config["setup"] = filters.Filter(
        classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    flter.config["keep_relationname"] = True
    seq.actors.append(flter)

    lplot = LinePlot()
    lplot.config["percent"] = 50.0
    lplot.config["wait"] = True
    seq.actors.append(lplot)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)

    # cluster data
    helper.print_info("Clustering data")
    for index, inst in enumerate(data):
        cl = clusterer.cluster_instance(inst)
        dist = clusterer.distribution_for_instance(inst)
        print(str(index+1) + ": cluster=" + str(cl) + ", distribution=" + str(dist))
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    base = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    classifier.classifier = base

    folds = 10
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(1))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("")
    print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
def main(args):
    """
    Trains Apriori on the specified dataset (uses vote UCI dataset if no dataset specified).

    :param args: the commandline arguments
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # build Apriori, using last attribute as class attribute
    apriori = Associator(classname="weka.associations.Apriori", options=["-c", "-1"])
    apriori.build_associations(data)
    print(str(apriori))

    # iterate association rules (low-level)
    helper.print_info("Rules (low-level)")
    # make the underlying rules list object iterable in Python
    rules = javabridge.iterate_collection(apriori.jwrapper.getAssociationRules().getRules().o)
    for i, r in enumerate(rules):
        # wrap the Java object to make its methods accessible
        rule = JWrapper(r)
        print(str(i+1) + ". " + str(rule))
        # output some details on rule
        print(" - consequence support: " + str(rule.getConsequenceSupport()))
        print(" - premise support: " + str(rule.getPremiseSupport()))
        print(" - total support: " + str(rule.getTotalSupport()))
        print(" - total transactions: " + str(rule.getTotalTransactions()))

    # iterate association rules (high-level)
    helper.print_info("Rules (high-level)")
    print("can produce rules? " + str(apriori.can_produce_rules()))
    print("rule metric names: " + str(apriori.rule_metric_names))
    rules = apriori.association_rules()
    if rules is not None:
        print("producer: " + rules.producer)
        print("# rules: " + str(len(rules)))
        for i, rule in enumerate(rules):
            print(str(i+1) + ". " + str(rule))
            # output some details on rule
            print(" - consequence support: " + str(rule.consequence_support))
            print(" - consequence: " + str(rule.consequence))
            print(" - premise support: " + str(rule.premise_support))
            print(" - premise: " + str(rule.premise))
            print(" - total support: " + str(rule.total_support))
            print(" - total transactions: " + str(rule.total_transactions))
            print(" - metric names: " + str(rule.metric_names))
            print(" - metric values: " + str(rule.metric_values))
            print(" - metric value 'Confidence': " + str(rule.metric_value('Confidence')))
            print(" - primary metric name: " + str(rule.primary_metric_name))
            print(" - primary metric value: " + str(rule.primary_metric_value))
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)

    helper.print_info("Evaluating on data")
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data)
    print("# clusters: " + str(evaluation.num_clusters))
    print("log likelihood: " + str(evaluation.log_likelihood))
    print("cluster assignments:\n" + str(evaluation.cluster_assignments))
    plc.plot_cluster_assignments(evaluation, data, inst_no=True)

    # using a filtered clusterer
    helper.print_title("Filtered clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    fclusterer = FilteredClusterer()
    fclusterer.clusterer = clusterer
    fclusterer.filter = remove
    fclusterer.build_clusterer(data)
    print(fclusterer)

    # load a dataset incrementally and build clusterer incrementally
    helper.print_title("Incremental clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    clusterer = Clusterer("weka.clusterers.Cobweb")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(iris_inc)
    iris_filtered = remove.outputformat()
    clusterer.build_clusterer(iris_filtered)
    for inst in loader:
        remove.input(inst)
        inst_filtered = remove.output()
        clusterer.update_clusterer(inst_filtered)
    clusterer.update_finished()
    print(clusterer.to_commandline())
    print(clusterer)
    print(clusterer.graph)
    plg.plot_dot_graph(clusterer.graph)
def incremental():
    """
    Loads/filters a dataset incrementally.
    """
    # setup the flow
    helper.print_title("Filter datasets (incrementally)")
    iris = helper.get_data_dir() + os.sep + "iris.arff"
    anneal = helper.get_data_dir() + os.sep + "anneal.arff"

    flow = Flow(name="filter datasets (incrementally)")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris, anneal]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    flow.actors.append(loaddataset)

    flter = Filter()
    flter.config["setup"] = filters.Filter(
        classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "1"])
    flter.config["keep_relationname"] = True
    flow.actors.append(flter)

    console = Console()
    flow.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: model (using serialization module)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using serialization module)")
    serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i + 1) + ":")
        if javabridge.get_env().is_instance_of(
                obj, javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(
                obj, javabridge.get_env().find_class("weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)

    # save and read object
    helper.print_title("I/O: just model (using Classifier class)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    classifier.serialize(outfile)
    model, _ = Classifier.deserialize(outfile)
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using Classifier class)")
    classifier.serialize(outfile, header=iris_data)
    model, header = Classifier.deserialize(outfile)
    print(model)
    if header is not None:
        print(header)
def main():
    """
    Just runs some example code.
    """
    # setup the flow
    flow = Flow(name="list files")

    listfiles = ListFiles()
    listfiles.config["dir"] = str(helper.get_data_dir())
    listfiles.config["list_files"] = True
    listfiles.config["list_dirs"] = False
    listfiles.config["recursive"] = False
    listfiles.config["regexp"] = ".*.arff"
    flow.actors.append(listfiles)

    tee = Tee()
    flow.actors.append(tee)

    convert = Convert()
    convert.config["setup"] = conversion.PassThrough()
    tee.actors.append(convert)

    console = Console()
    console.config["prefix"] = "Match: "
    tee.actors.append(console)

    load = LoadDataset()
    load.config["use_custom_loader"] = True
    flow.actors.append(load)

    cross = CrossValidate()
    cross.config["setup"] = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    flow.actors.append(cross)

    summary = EvaluationSummary()
    summary.config["matrix"] = True
    flow.actors.append(summary)

    # print flow
    flow.setup()
    print("\n" + flow.tree + "\n")

    # save the flow
    fname = tempfile.gettempdir() + os.sep + "simpleflow.json"
    Flow.save(flow, fname)

    # load flow
    fl2 = Flow.load(fname)

    # output flow
    fl2.setup()
    print("\n" + fl2.tree + "\n")
def arffInput(self):
    # load a dataset (absolute path; do not prepend the data directory)
    iris_file = "/Users/rezakhoshkangini/Documents/Drexel_Documents/Work/Mat-Code/NewCSV/BindedData/Section0.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with the
    training set and evaluates the built model on the test set.
    The predictions get recorded in two different ways:
    1. in-memory via the test_model method
    2. directly to file (more memory efficient), but a separate run of making predictions

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate and record predictions in memory
    helper.print_title("recording predictions in-memory")
    output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-distribution"])
    evl = Evaluation(train)
    evl.test_model(cls, test, output=output)
    print(evl.summary())
    helper.print_info("Predictions:")
    print(output.buffer_content())

    # record/output predictions separately
    helper.print_title("recording/outputting predictions separately")
    outputfile = helper.get_tmp_dir() + "/j48_vote.csv"
    output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution", "-suppress", "-file", outputfile])
    output.header = test
    output.print_all(cls, test)
    helper.print_info("Predictions stored in: " + outputfile)
    # by using "-suppress" we don't store the output in memory, the following statement won't output anything
    print(output.buffer_content())
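# Besides the CSV buffer shown above, the in-memory Evaluation also exposes the individual
# predictions as objects. A minimal sketch, assuming the "evl" instance from main():
def print_predictions(evl):
    """Prints actual vs predicted class index per test instance."""
    for index, pred in enumerate(evl.predictions):
        print(str(index + 1) + ": actual=" + str(pred.actual) + ", predicted=" + str(pred.predicted))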
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    vote_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + vote_file)
    loader = Loader("weka.core.converters.ArffLoader")
    vote_data = loader.load_file(vote_file)
    vote_data.class_is_last()

    # train and output associator
    associator = Associator(classname="weka.associations.Apriori", options=["-N", "9", "-I"])
    associator.build_associations(vote_data)
    print(associator)
def main():
    """
    Loads/filters a dataset incrementally and saves it to a new file.
    """
    # setup the flow
    helper.print_title("Load/filter/save dataset (incrementally)")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="Load/filter/save dataset (incrementally)")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    flow.actors.append(loaddataset)

    flter = Filter()
    flter.config["setup"] = filters.Filter(
        classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    flow.actors.append(flter)

    rename = RenameRelation()
    rename.config["name"] = "iris-reduced"
    flow.actors.append(rename)

    dumper = InstanceDumper()
    dumper.config["output"] = tempfile.gettempdir() + os.sep + "out.arff"
    flow.actors.append(dumper)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    bodyfat_file = helper.get_data_dir() + os.sep + "bodyfat.arff"
    helper.print_info("Loading dataset: " + bodyfat_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bodyfat_data = loader.load_file(bodyfat_file)
    bodyfat_data.class_is_last()

    # build and output regression model
    helper.print_title("Building M5P regression model")
    classifier = Classifier(classname="weka.classifiers.trees.M5P")
    classifier.build_classifier(bodyfat_data)
    print(classifier)
def main():
    """
    Just runs some example code.
    """
    # setup the flow
    helper.print_title("build and save clusterer")
    iris = helper.get_data_dir() + os.sep + "iris_no_class.arff"

    flow = Flow(name="build and save clusterer")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    flow.actors.append(loaddataset)

    train = Train()
    train.config["setup"] = Clusterer(classname="weka.clusterers.SimpleKMeans")
    flow.actors.append(train)

    pick = ContainerValuePicker()
    pick.config["value"] = "Model"
    flow.actors.append(pick)

    console = Console()
    pick.actors.append(console)

    writer = ModelWriter()
    writer.config["output"] = str(tempfile.gettempdir()) + os.sep + "simplekmeans.model"
    flow.actors.append(writer)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code.
    """
    # setup the flow
    helper.print_title("Cross-validate clusterer")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="cross-validate clusterer")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    flow.actors.append(loaddataset)

    flter = Filter()
    flter.name = "Remove class"
    # the Filter actor expects its setup under the "setup" key (cf. the other flow examples)
    flter.config["setup"] = filters.Filter(
        classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    flow.actors.append(flter)

    cv = CrossValidate()
    cv.config["setup"] = Clusterer(classname="weka.clusterers.EM")
    flow.actors.append(cv)

    console = Console()
    console.config["prefix"] = "Loglikelihood: "
    flow.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    anneal_file = helper.get_data_dir() + os.sep + "anneal.arff"
    helper.print_info("Loading dataset: " + anneal_file)
    loader = Loader("weka.core.converters.ArffLoader")
    anneal_data = loader.load_file(anneal_file)
    anneal_data.class_is_last()

    # perform attribute selection
    helper.print_title("Attribute selection")
    search = ASSearch(classname="weka.attributeSelection.BestFirst", options=["-D", "1", "-N", "5"])
    evaluation = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval", options=["-P", "1", "-E", "1"])
    attsel = AttributeSelection()
    attsel.search(search)
    attsel.evaluator(evaluation)
    attsel.select_attributes(anneal_data)
    print("# attributes: " + str(attsel.number_attributes_selected))
    print("attributes (as numpy array): " + str(attsel.selected_attributes))
    print("attributes (as list): " + str(list(attsel.selected_attributes)))
    print("result string:\n" + attsel.results_string)

    # perform ranking
    helper.print_title("Attribute ranking (2-fold CV)")
    search = ASSearch(classname="weka.attributeSelection.Ranker", options=["-N", "-1"])
    evaluation = ASEvaluation("weka.attributeSelection.InfoGainAttributeEval")
    attsel = AttributeSelection()
    attsel.ranking(True)
    attsel.folds(2)
    attsel.crossvalidation(True)
    attsel.seed(42)
    attsel.search(search)
    attsel.evaluator(evaluation)
    attsel.select_attributes(anneal_data)
    print("ranked attributes:\n" + str(attsel.ranked_attributes))
    print("result string:\n" + attsel.results_string)
def main(args):
    """
    Performs attribute selection on the specified dataset (uses vote UCI dataset if no
    dataset specified). Last attribute is assumed to be the class attribute.
    Used: CfsSubsetEval, GreedyStepwise, J48

    :param args: the commandline arguments
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    use_classifier(data)
    use_filter(data)
    use_low_level(data)
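# The helpers called above (use_classifier, use_filter, use_low_level) are not shown in
# this section. A minimal sketch of use_filter(), assuming it mirrors the attribute
# selection *filter* approach (weka.filters.supervised.attribute.AttributeSelection)
# and that Filter/ASEvaluation/ASSearch are imported as in the other examples:
def use_filter(data):
    """Applies attribute selection as a filter and prints the reduced dataset."""
    flter = Filter(classname="weka.filters.supervised.attribute.AttributeSelection")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    flter.set_property("evaluator", aseval.jobject)
    flter.set_property("search", assearch.jobject)
    flter.inputformat(data)
    filtered = flter.filter(data)
    print(str(filtered))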
def main(args):
    """
    Trains Apriori on the specified dataset (uses vote UCI dataset if no dataset specified).

    :param args: the commandline arguments
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # build Apriori, using last attribute as class attribute
    apriori = Associator(classname="weka.associations.Apriori", options=["-c", "-1"])
    apriori.build_associations(data)
    print(str(apriori))
def multisearch():
    """
    Applies MultiSearch to a dataset. The "multisearch-weka-package" package must be installed.
    """
    helper.print_title("MultiSearch")

    # load a dataset
    fname = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading train: " + fname)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(fname)
    train.class_is_last()

    # classifier
    multi = MultiSearch(
        options=["-sample-size", "100.0", "-initial-folds", "2", "-subsequent-folds", "2",
                 "-num-slots", "1", "-S", "1"])
    multi.evaluation = "CC"
    mparam = MathParameter()
    mparam.prop = "classifier.kernel.gamma"
    mparam.minimum = -3.0
    mparam.maximum = 3.0
    mparam.step = 1.0
    mparam.base = 10.0
    mparam.expression = "pow(BASE,I)"
    lparam = ListParameter()
    lparam.prop = "classifier.C"
    lparam.values = ["-2.0", "-1.0", "0.0", "1.0", "2.0"]
    multi.parameters = [mparam, lparam]
    cls = Classifier(
        classname="weka.classifiers.functions.SMOreg",
        options=["-K", "weka.classifiers.functions.supportVector.RBFKernel"])
    multi.classifier = cls
    multi.build_classifier(train)
    print("Model:\n" + str(multi))
    print("\nBest setup:\n" + multi.best.to_commandline())
def main():
    """
    Displays a dataset as a matrix plot.
    """
    # setup the flow
    helper.print_title("Matrix plot")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="matrix plot")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    flow.actors.append(loaddataset)

    plot = MatrixPlot()
    plot.config["percent"] = 50.0
    flow.actors.append(plot)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code.
    """
    print(helper.get_data_dir())

    # cross-validation + classification
    helper.print_title("Experiment: Cross-validation + classification")
    datasets = [
        helper.get_data_dir() + os.sep + "iris.arff",
        helper.get_data_dir() + os.sep + "anneal.arff"
    ]
    classifiers = [
        Classifier("weka.classifiers.rules.ZeroR"),
        Classifier("weka.classifiers.trees.J48")
    ]
    outfile = tempfile.gettempdir() + os.sep + "results-cv.arff"
    exp = SimpleCrossValidationExperiment(
        classification=True,
        runs=10,
        folds=10,
        datasets=datasets,
        classifiers=classifiers,
        result=outfile)
    exp.setup()
    exp.run()

    # evaluate
    loader = converters.loader_for_file(outfile)
    data = loader.load_file(outfile)
    matrix = ResultMatrix("weka.experiment.ResultMatrixPlainText")
    tester = Tester("weka.experiment.PairedCorrectedTTester")
    tester.resultmatrix = matrix
    comparison_col = data.attribute_by_name("Percent_correct").index
    tester.instances = data
    print(tester.header(comparison_col))
    print(tester.multi_resultset_full(0, comparison_col))

    # random split + regression
    helper.print_title("Experiment: Random split + regression")
    datasets = [
        helper.get_data_dir() + os.sep + "bolts.arff",
        helper.get_data_dir() + os.sep + "bodyfat.arff"
    ]
    classifiers = [
        Classifier("weka.classifiers.rules.ZeroR"),
        Classifier("weka.classifiers.functions.LinearRegression")
    ]
    outfile = tempfile.gettempdir() + os.sep + "results-rs.arff"
    exp = SimpleRandomSplitExperiment(
        classification=False,
        runs=10,
        percentage=66.6,
        preserve_order=False,
        datasets=datasets,
        classifiers=classifiers,
        result=outfile)
    exp.setup()
    exp.run()

    # evaluate
    loader = converters.loader_for_file(outfile)
    data = loader.load_file(outfile)
    matrix = ResultMatrix("weka.experiment.ResultMatrixPlainText")
    tester = Tester("weka.experiment.PairedCorrectedTTester")
    tester.resultmatrix = matrix
    comparison_col = data.attribute_by_name("Correlation_coefficient").index
    tester.instances = data
    print(tester.header(comparison_col))
    print(tester.multi_resultset_full(0, comparison_col))

    # plot
    plot_exp.plot_experiment(matrix, title="Random split", measure="Correlation coefficient", wait=True)
def main():
    """
    Just runs some example code.
    """
    # setup the flow
    helper.print_title("cluster data")
    iris = helper.get_data_dir() + os.sep + "iris_no_class.arff"
    clsfile = str(tempfile.gettempdir()) + os.sep + "simplekmeans.model"

    flow = Flow(name="cluster data")

    start = Start()
    flow.actors.append(start)

    build_save = Trigger()
    build_save.name = "build and save clusterer"
    flow.actors.append(build_save)

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    build_save.actors.append(filesupplier)

    loaddataset = LoadDataset()
    build_save.actors.append(loaddataset)

    ssv = SetStorageValue()
    ssv.config["storage_name"] = "data"
    build_save.actors.append(ssv)

    train = Train()
    train.config["setup"] = Clusterer(classname="weka.clusterers.SimpleKMeans")
    build_save.actors.append(train)

    ssv = SetStorageValue()
    ssv.config["storage_name"] = "model"
    build_save.actors.append(ssv)

    pick = ContainerValuePicker()
    pick.config["value"] = "Model"
    build_save.actors.append(pick)

    console = Console()
    console.config["prefix"] = "built: "
    pick.actors.append(console)

    writer = ModelWriter()
    writer.config["output"] = clsfile
    build_save.actors.append(writer)

    pred_serialized = Trigger()
    pred_serialized.name = "make predictions (serialized model)"
    flow.actors.append(pred_serialized)

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    pred_serialized.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    pred_serialized.actors.append(loaddataset)

    predict = Predict()
    predict.config["model"] = clsfile
    pred_serialized.actors.append(predict)

    console = Console()
    console.config["prefix"] = "serialized: "
    pred_serialized.actors.append(console)

    pred_storage = Trigger()
    pred_storage.name = "make predictions (model from storage)"
    flow.actors.append(pred_storage)

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    pred_storage.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    pred_storage.actors.append(loaddataset)

    predict = Predict()
    predict.config["storage_name"] = "model"
    pred_storage.actors.append(predict)

    console = Console()
    console.config["prefix"] = "storage: "
    pred_storage.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code.
    """
    # setup the flow
    helper.print_title("build and evaluate classifier")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="build and evaluate classifier")

    start = Start()
    flow.actors.append(start)

    build_save = Trigger()
    build_save.name = "build and store classifier"
    flow.actors.append(build_save)

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    build_save.actors.append(filesupplier)

    loaddataset = LoadDataset()
    build_save.actors.append(loaddataset)

    select = ClassSelector()
    select.config["index"] = "last"
    build_save.actors.append(select)

    ssv = SetStorageValue()
    ssv.config["storage_name"] = "data"
    build_save.actors.append(ssv)

    train = Train()
    train.config["setup"] = Classifier(classname="weka.classifiers.trees.J48")
    build_save.actors.append(train)

    pick = ContainerValuePicker()
    pick.config["value"] = "Model"
    build_save.actors.append(pick)

    ssv = SetStorageValue()
    ssv.config["storage_name"] = "model"
    pick.actors.append(ssv)

    evaluate = Trigger()
    evaluate.name = "evaluate classifier"
    flow.actors.append(evaluate)

    gsv = GetStorageValue()
    gsv.config["storage_name"] = "data"
    evaluate.actors.append(gsv)

    evl = Evaluate()
    evl.config["storage_name"] = "model"
    evaluate.actors.append(evl)

    summary = EvaluationSummary()
    summary.config["matrix"] = True
    evaluate.actors.append(summary)

    console = Console()
    evaluate.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i+1) / 10.0)], sparse_data.num_attributes,
            classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50, wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1), percent=50,
                  title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris)

    # remove class attribute
    helper.print_info("Removing class attribute")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(data)
    filtered = remove.filter(data)

    # use MultiFilter
    helper.print_info("Use MultiFilter")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    std = Filter(classname="weka.filters.unsupervised.attribute.Standardize")
    multi = MultiFilter()
    multi.filters = [remove, std]
    multi.inputformat(data)
    filtered_multi = multi.filter(data)

    # output datasets
    helper.print_title("Input")
    print(data)
    helper.print_title("Output")
    print(filtered)
    helper.print_title("Output (MultiFilter)")
    print(filtered_multi)

    # load text dataset
    text = helper.get_data_dir() + os.sep + "reutersTop10Randomized_1perc_shortened.arff"
    helper.print_info("Loading dataset: " + text)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(text)
    data.class_is_last()

    # apply StringToWordVector
    stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    stopwords = Stopwords(classname="weka.core.stopwords.Rainbow")
    tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = stemmer
    s2wv.stopwords = stopwords
    s2wv.tokenizer = tokenizer
    s2wv.inputformat(data)
    filtered = s2wv.filter(data)
    helper.print_title("Input (StringToWordVector)")
    print(data)
    helper.print_title("Output (StringToWordVector)")
    print(filtered)

    # partial classname
    helper.print_title("Creating filter from partial classname")
    clsname = ".Standardize"
    f = Filter(classname=clsname)
    print(clsname + " --> " + f.classname)

    # source code
    helper.print_info("Generate source code")
    labor = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(labor)
    replace = Filter(classname="weka.filters.unsupervised.attribute.ReplaceMissingValues")
    replace.inputformat(data)
    replace.filter(data)
    print(replace.to_source("MyReplaceMissingValues", data))
def main():
    """
    Just runs some example code.
    """
    # setup the flow
    helper.print_title("Attribute selection")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="attribute selection")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = False
    flow.actors.append(loaddataset)

    attsel = AttributeSelection()
    attsel.config["search"] = ASSearch(classname="weka.attributeSelection.BestFirst")
    attsel.config["eval"] = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    flow.actors.append(attsel)

    results = Tee()
    results.name = "output results"
    flow.actors.append(results)

    picker = ContainerValuePicker()
    picker.config["value"] = "Results"
    picker.config["switch"] = True
    results.actors.append(picker)

    console = Console()
    console.config["prefix"] = "Attribute selection results:"
    results.actors.append(console)

    reduced = Tee()
    reduced.name = "reduced dataset"
    flow.actors.append(reduced)

    picker = ContainerValuePicker()
    picker.config["value"] = "Reduced"
    picker.config["switch"] = True
    reduced.actors.append(picker)

    console = Console()
    console.config["prefix"] = "Reduced dataset:\n\n"
    reduced.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()