def filter(self, data):
    """
    Filters the dataset(s). When providing a list, this can be used to create compatible
    train/test sets, since the filter only gets initialized with the first dataset and
    all subsequent datasets get transformed using the same setup.

    NB: inputformat(Instances) must have been called beforehand.

    :param data: the Instances to filter
    :type data: Instances or list of Instances
    :return: the filtered Instances object(s)
    :rtype: Instances or list of Instances
    """
    if isinstance(data, list):
        result = []
        for d in data:
            result.append(Instances(javabridge.static_call(
                "Lweka/filters/Filter;", "useFilter",
                "(Lweka/core/Instances;Lweka/filters/Filter;)Lweka/core/Instances;",
                d.jobject, self.jobject)))
        return result
    else:
        return Instances(javabridge.static_call(
            "Lweka/filters/Filter;", "useFilter",
            "(Lweka/core/Instances;Lweka/filters/Filter;)Lweka/core/Instances;",
            data.jobject, self.jobject))
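# Usage sketch for the list variant of filter() above (assumes a running JVM via
# weka.core.jvm and hypothetical ARFF paths "train.arff"/"test.arff"): the filter
# gets initialized on the training set only, and the identical setup is applied to
# the test set, keeping both filtered datasets compatible.
from weka.core.converters import Loader
from weka.filters import Filter

loader = Loader(classname="weka.core.converters.ArffLoader")
train = loader.load_file("train.arff")  # hypothetical path
test = loader.load_file("test.arff")    # hypothetical path
remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "1"])
remove.inputformat(train)  # initialize with the training data only
train_filtered, test_filtered = remove.filter([train, test])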
def split_data(data, test_size):
    # split the data
    # create placeholder for train split
    data_train = Instances.copy_instances(data)
    # remove all instances from the placeholder
    for i in reversed(range(len(data_train))):
        data_train.delete(i)
    # create placeholder for test split
    data_test = Instances.copy_instances(data)
    # remove all instances from the placeholder
    for i in reversed(range(len(data_test))):
        data_test.delete(i)
    # create list of indices
    indices = list(range(len(data)))
    # shuffle indices
    random.shuffle(indices)
    # calculate number of indices in the test split
    num_test = int(round(len(indices) * test_size, 0))
    # get indices for the test split
    test_ids = indices[:num_test]
    # fill test split with instances
    for idx in test_ids:
        data_test.add_instance(data.get_instance(idx))
    # get indices for the train split
    train_ids = indices[num_test:]
    # fill train split with instances
    for idx in train_ids:
        data_train.add_instance(data.get_instance(idx))
    return data_train, data_test
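# Minimal usage sketch for split_data() above, assuming a started JVM and a local
# "iris.arff" (hypothetical path); split_data itself relies on a module-level
# `import random`. Holds out 30% of the instances as the test split.
import random
from weka.core.converters import Loader

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("iris.arff")  # hypothetical path
data.class_is_last()
train, test = split_data(data, 0.3)
print(train.num_instances, test.num_instances)  # roughly 105 / 45 for iris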
def load_file(self, dfile, incremental=False):
    """
    Loads the specified file and returns the Instances object.
    In case of incremental loading, only the structure.

    :param dfile: the file to load
    :type dfile: str
    :param incremental: whether to load the dataset incrementally
    :type incremental: bool
    :return: the full dataset or the header (if incremental)
    :rtype: Instances
    """
    self.enforce_type(self.jobject, "weka.core.converters.FileSourcedConverter")
    self.incremental = incremental
    if not javabridge.is_instance_of(dfile, "Ljava/io/File;"):
        dfile = javabridge.make_instance(
            "Ljava/io/File;", "(Ljava/lang/String;)V",
            javabridge.get_env().new_string_utf(str(dfile)))
    javabridge.call(self.jobject, "reset", "()V")
    javabridge.call(self.jobject, "setFile", "(Ljava/io/File;)V", dfile)
    if incremental:
        self.structure = Instances(
            javabridge.call(self.jobject, "getStructure", "()Lweka/core/Instances;"))
        return self.structure
    else:
        return Instances(
            javabridge.call(self.jobject, "getDataSet", "()Lweka/core/Instances;"))
def splitTrainSet(data, m_numLabledData=10):
    total = data.num_instances
    labeled_amount = int(m_numLabledData * total / 100)
    unlabeled_amount = total - labeled_amount
    rand = Random(1)
    data.randomize(rand)
    labledDataSet = Instances.create_instances(data.relationname, data.attributes(), labeled_amount)
    UnlabledDataSet = Instances.create_instances(data.relationname, data.attributes(), unlabeled_amount)
    for i in range(labeled_amount):
        labledDataSet.add_instance(data.get_instance(i))
    labledDataSet.randomize(rand)
    for i in range(unlabeled_amount):
        UnlabledDataSet.add_instance(data.get_instance(labeled_amount + i))
    # labledDataSet.randomize(rand)
    labledDataSet.class_is_last()
    # UnlabledDataSet.randomize(rand)
    UnlabledDataSet.class_is_last()
    return labledDataSet, UnlabledDataSet
def train_internal(self):
    best_weights_arr = []
    # create an empty F with source as template
    F = Instances.template_instances(self.source[0])
    withF = False
    print("Find weight for each source data set")
    for source in self.source:
        bestWeight, bestError = self.process_source(source, F, withF)
        best_weights_arr.append(bestWeight)
    # sort the data based on the weights
    self.source = [
        source for _, source in sorted(
            zip(best_weights_arr, self.source), reverse=True, key=operator.itemgetter(0))
    ]
    print("Train for final stage")
    withF = True
    while len(self.source) > 0:  # self.max_source_dataset
        weight, _ = self.process_source(self.source[0], F, withF)
        for inst in self.source[0]:
            inst.weight = weight
        F = Instances.append_instances(F, self.source[0])
        F.class_is_last()
        self.source.pop(0)
    return F
def load_file(self, dfile, incremental=False):
    """
    Loads the specified file and returns the Instances object.
    In case of incremental loading, only the structure.

    :param dfile: the file to load
    :type dfile: str
    :param incremental: whether to load the dataset incrementally
    :type incremental: bool
    :return: the full dataset or the header (if incremental)
    :rtype: Instances
    :raises Exception: if the file does not exist
    """
    self.enforce_type(self.jobject, "weka.core.converters.FileSourcedConverter")
    self.incremental = incremental
    if not javabridge.is_instance_of(dfile, "Ljava/io/File;"):
        dfile = javabridge.make_instance(
            "Ljava/io/File;", "(Ljava/lang/String;)V",
            javabridge.get_env().new_string_utf(str(dfile)))
    javabridge.call(self.jobject, "reset", "()V")
    # check whether file exists, otherwise previously set file gets loaded again
    sfile = javabridge.to_string(dfile)
    if not os.path.exists(sfile):
        raise Exception("Dataset file does not exist: " + str(sfile))
    javabridge.call(self.jobject, "setFile", "(Ljava/io/File;)V", dfile)
    if incremental:
        self.structure = Instances(
            javabridge.call(self.jobject, "getStructure", "()Lweka/core/Instances;"))
        return self.structure
    else:
        return Instances(
            javabridge.call(self.jobject, "getDataSet", "()Lweka/core/Instances;"))
def load_url(self, url, incremental=False):
    """
    Loads the specified URL and returns the Instances object.
    In case of incremental loading, only the structure.

    :param url: the URL to load the data from
    :type url: str
    :param incremental: whether to load the dataset incrementally
    :type incremental: bool
    :return: the full dataset or the header (if incremental)
    :rtype: Instances
    """
    self.enforce_type(self.jobject, "weka.core.converters.URLSourcedLoader")
    self.incremental = incremental
    javabridge.call(self.jobject, "reset", "()V")
    javabridge.call(self.jobject, "setURL", "(Ljava/lang/String;)V", str(url))
    if incremental:
        self.structure = Instances(
            javabridge.call(self.jobject, "getStructure", "()Lweka/core/Instances;"))
        return self.structure
    else:
        return Instances(
            javabridge.call(self.jobject, "getDataSet", "()Lweka/core/Instances;"))
def testNB(training_data, testing_data):
    train_data = Instances.copy_instances(training_data)
    test_data = Instances.copy_instances(testing_data)
    evaluation = Evaluation(train_data)
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    classifier.build_classifier(train_data)  # build classifier on the training data
    evaluation.test_model(classifier, test_data)  # test and evaluate model on the test set
    print("")
    print("")
    print(evaluation.summary("--------------Naive Bayes Evaluation--------------"))
    print("Accuracy: " + str(evaluation.percent_correct))
    print("")
    print("Label\tPrecision\t\tRecall\t\t\tF-Measure")
    print("<=50K\t" + str(evaluation.precision(0)) + "\t" + str(evaluation.recall(0))
          + "\t" + str(evaluation.f_measure(0)))
    print(">50K\t" + str(evaluation.precision(1)) + "\t" + str(evaluation.recall(1))
          + "\t" + str(evaluation.f_measure(1)))
    print("Mean\t" + str((evaluation.precision(1) + evaluation.precision(0)) / 2)
          + "\t" + str((evaluation.recall(1) + evaluation.recall(0)) / 2)
          + "\t" + str((evaluation.f_measure(1) + evaluation.f_measure(0)) / 2))
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: model (using serialization module)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using serialization module)")
    serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i + 1) + ":")
        if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)

    # save and read object
    helper.print_title("I/O: just model (using Classifier class)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    classifier.serialize(outfile)
    model, _ = Classifier.deserialize(outfile)
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using Classifier class)")
    classifier.serialize(outfile, header=iris_data)
    model, header = Classifier.deserialize(outfile)
    print(model)
    if header is not None:
        print(header)
def load_file(self, dfile, incremental=False, class_index=None):
    """
    Loads the specified file and returns the Instances object.
    In case of incremental loading, only the structure.

    :param dfile: the file to load
    :type dfile: str
    :param incremental: whether to load the dataset incrementally
    :type incremental: bool
    :param class_index: the class index string to use
                        ('first', 'second', 'third', 'last-2', 'last-1', 'last' or 1-based index)
    :type class_index: str
    :return: the full dataset or the header (if incremental)
    :rtype: Instances
    :raises Exception: if the file does not exist
    """
    self.enforce_type(self.jobject, "weka.core.converters.FileSourcedConverter")
    self.incremental = incremental
    if not javabridge.is_instance_of(dfile, "Ljava/io/File;"):
        dfile = javabridge.make_instance(
            "Ljava/io/File;", "(Ljava/lang/String;)V",
            javabridge.get_env().new_string_utf(str(dfile)))
    javabridge.call(self.jobject, "reset", "()V")
    # check whether file exists, otherwise previously set file gets loaded again
    sfile = javabridge.to_string(dfile)
    if not os.path.exists(sfile):
        raise Exception("Dataset file does not exist: " + str(sfile))
    javabridge.call(self.jobject, "setFile", "(Ljava/io/File;)V", dfile)
    if incremental:
        self.structure = Instances(
            javabridge.call(self.jobject, "getStructure", "()Lweka/core/Instances;"))
        result = self.structure
    else:
        result = Instances(
            javabridge.call(self.jobject, "getDataSet", "()Lweka/core/Instances;"))
    if class_index is not None:
        if class_index == 'first':
            result.class_index = 0
        elif class_index == 'second':
            result.class_index = 1
        elif class_index == 'third':
            result.class_index = 2
        elif class_index == 'last-2':
            result.class_index = result.num_attributes - 3
        elif class_index == 'last-1':
            result.class_index = result.num_attributes - 2
        elif class_index == 'last':
            result.class_index = result.num_attributes - 1
        else:
            result.class_index = int(class_index)
    return result
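# Usage sketch for the class_index parameter above (hypothetical file path); note
# that the numeric fallback is assigned to class_index as-is.
data = loader.load_file("iris.arff", class_index="last")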
def LabeledUnlabeldata(data, unlabeled, tree, y, cal_method=None):
    # self-training: move unlabeled instances whose predicted class probability
    # reaches the threshold y into the labeled set, retraining the tree whenever
    # new instances were labeled
    data1 = Instances.copy_instances(data)
    labeling = Instances.copy_instances(unlabeled)
    tree.build_classifier(data1)
    j = i = s = l = 0
    while i < labeling.num_instances:
        clsLabel = tree.classify_instance(labeling.get_instance(i))
        ##### probability calculation #####
        # dist = tree.distribution_for_instance(labeling.get_instance(i))
        dist = calculate_probability_distribution(tree, labeling, i, cal_method)
        for k, dk in enumerate(dist):
            if dk >= y:
                j = i
                while j < labeling.num_instances:
                    clsLabel = tree.classify_instance(labeling.get_instance(j))
                    ##### probability calculation #####
                    # dist = tree.distribution_for_instance(labeling.get_instance(j))
                    dist = calculate_probability_distribution(tree, labeling, j, cal_method)
                    for dp in dist:
                        if dp >= y:
                            inst = labeling.get_instance(i)
                            inst.set_value(inst.class_index, clsLabel)
                            data1.add_instance(inst)
                            labeling.delete(i)
                            l += 1
                            j -= 1
                    j += 1
            if k == (len(dist) - 1) and (l != 0):
                # at least one instance was labeled in this pass: retrain and restart scan
                tree.build_classifier(data1)
                i = -1
                s += l
                l = 0
        i += 1
    data1.compactify()
    return data1
def folds(self, nfolds=10, seed=None):
    """
    Get (training, testing) datasets for cross-validation.

    Arguments:
        nfolds (int, optional): Number of folds. Default value is 10.
        seed (int, optional): Seed value for shuffling dataset. Default
            value is a random int 0 <= x <= 10000.

    Returns:
        list of (Instances, Instances) tuples: Each list element is a pair
        of (training, testing) datasets, respectively.
    """
    seed = seed or randint(0, 10000)
    rnd = WekaRandom(seed)
    fold_size = labmath.ceil(self.instances.num_instances / nfolds)

    # Shuffle the dataset.
    instances = WekaInstances.copy_instances(self.instances)
    instances.randomize(rnd)

    folds = []
    for i in range(nfolds):
        offset = i * fold_size
        testing_end = min(offset + fold_size, instances.num_instances - 1)

        # Calculate dataset indices for testing and training data.
        testing_range = (offset, testing_end - offset)
        left_range = (0, offset)
        right_range = (testing_end, instances.num_instances - testing_end)

        # If there's nothing to test, move on.
        if testing_range[1] < 1:
            continue

        # Create testing and training folds.
        testing = WekaInstances.copy_instances(instances, *testing_range)
        left = WekaInstances.copy_instances(instances, *left_range)
        right = WekaInstances.copy_instances(instances, *right_range)
        training = WekaInstances.append_instances(left, right)

        # Add fold to collection.
        folds.append((training, testing))

    return folds
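# Hypothetical usage of folds() above, assuming `dataset` is an instance of the
# surrounding wrapper class: evaluate a J48 classifier on each (train, test) pair.
from weka.classifiers import Classifier, Evaluation

for train, test in dataset.folds(nfolds=10, seed=42):
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.percent_correct)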
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: single object")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: multiple objects")
    serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i + 1) + ":")
        if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)
def load_model(filename):
    """
    Load the model from cache.

    Args:
        filename (str): The target file name (without extension) to load. Example: LMT

    Returns:
        The classifier and data object if the target cache exists, otherwise None.
    """
    # Path to the cached model (example: caches/model/LMT.cache)
    path = os.path.join(os.path.join('caches', 'model'), filename + '.cache')
    print("Path to the cached model to load:", path)
    if os.path.isfile(path):
        cached_model, cached_data_used_for_training = serialization.read_all(path)
        print("Loading cached classifier")
        trained_classifier = Classifier(jobject=cached_model)
        print("Loading cached data")
        training_data = Instances(jobject=cached_data_used_for_training)
        localizer_log.msg("Loaded model: {filename}".format(filename=filename))
        return [trained_classifier, training_data]
    localizer_log.msg("Failed to load cache of 'model'.")
    return None
def df_to_instances(self):
    """
    Transforms the pandas DataFrame (self.df) into a Weka Instances object:
    every column becomes a numeric attribute, except the label attribute
    (self.attr_label), which becomes a nominal attribute with values '0' and '1'.

    :return: the generated Instances object (relation name taken from self.relation)
    :rtype: Instances
    """
    atts = []
    for col in self.df.columns:
        if col != self.attr_label:
            att = Attribute.create_numeric(col)
        else:
            att = Attribute.create_nominal(col, ['0', '1'])
        atts.append(att)
    nrow = len(self.df)
    result = Instances.create_instances(self.relation, atts, nrow)
    # data
    for i in range(nrow):
        inst = Instance.create_instance(
            self.df.iloc[i].astype('float64').to_numpy().copy(order='C'))
        result.add_instance(inst)
    return result
def create_subsample(data, percent, seed=1):
    """
    Generates a subsample of the dataset.

    :param data: the data to create the subsample from
    :type data: Instances
    :param percent: the percentage (0-100)
    :type percent: float
    :param seed: the seed value to use
    :type seed: int
    :return: the subsample
    :rtype: Instances
    """
    if percent <= 0 or percent >= 100:
        return data
    data = Instances.copy_instances(data)
    data.randomize(Random(seed))
    # num_instances is a property, not a method
    data = Instances.copy_instances(data, 0, int(round(data.num_instances * percent / 100.0)))
    return data
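# Usage sketch for create_subsample() above, assuming `data` is a loaded Instances
# object: draws a reproducible 25% subsample.
sample = create_subsample(data, 25, seed=42)
print("subsample size:", sample.num_instances)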
def main():
    try:
        jvm.start()
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file("./data/adult.csv")
        data.class_is_last()  # set class attribute

        # randomize data
        folds = k  # k: number of folds, assumed to be defined at module level
        seed = 1
        rnd = Random(seed)
        rand_data = Instances.copy_instances(data)
        rand_data.randomize(rnd)
        if rand_data.class_attribute.is_nominal:
            rand_data.stratify(folds)

        NaiveBayes(rand_data, folds, seed, data)
        DecisionTree(rand_data, folds, seed, data)
    except Exception as e:
        raise e
    finally:
        jvm.stop()
def predBtn_clicked(self):
    gender = self.gender_entry.get()
    age = int(self.age_entry.get())
    height = int(self.height_entry.get())
    weight = int(self.weight_entry.get())
    sociability = self.sociability_entry.get()
    stability = self.stability_entry.get()

    # Create the model
    objects = serialization.read_all("J48.model")
    cls = Classifier(jobject=objects[0])
    data = Instances(jobject=objects[1])

    # Create the test set to be classified
    gender_values = ["Man", "Woman"]
    sociability_values = ["Introvert", "Extrovert"]
    stability_values = ["Stable", "Unstable"]
    values = [
        gender_values.index(gender),
        age,
        height,
        weight,
        self.BMI(weight, height),
        stability_values.index(stability),
        sociability_values.index(sociability),
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    inst.dataset = data

    # Classification
    prediction = int(cls.classify_instance(inst))
    self.controller.show_frame("Result").show(prediction)
    self.clear()
def save_model(model, data, filename):
    """
    Save the model to the target caching file. The caches should be defined
    in the config file. See README and config.sample for reference.

    Args:
        model (obj): The model to be saved. Should be a weka.classifiers.Classifier object.
        data (obj): The training set to be cached.
        filename (str): The target file to save.

    Returns:
        True if the target caching is saved, otherwise False.
    """
    folder = os.path.join('caches', 'model')
    path = os.path.join(folder, filename + '.cache')
    build_if_not_exist(folder)
    serialization.write_all(path, [model, Instances.template_instances(data)])
    localizer_log.msg("Saved cache of {target_name}.".format(target_name='model'))
    return True
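# Round-trip sketch combining save_model() above with the load_model() helper shown
# earlier, assuming a trained J48 classifier `cls` and its training set `data`:
save_model(cls, data, "J48")
loaded = load_model("J48")
if loaded is not None:
    trained_classifier, training_header = loaded
    print(trained_classifier)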
def create_dataset_header():
    """
    Creates the dataset header.

    :return: the header
    :rtype: Instances
    """
    att_msg = Attribute.create_string("Message")
    att_cls = Attribute.create_nominal("Class", ["miss", "hit"])
    result = Instances.create_instances("MessageClassificationProblem", [att_msg, att_cls], 0)
    return result
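# Usage sketch for create_dataset_header() above: string attributes store an index
# into the attribute's string pool, so the text is registered via add_string_value
# before creating the instance (mirroring the tweet-dataset snippet further below).
from weka.core.dataset import Instance

header = create_dataset_header()
values = [header.attribute(0).add_string_value("hello world"),
          header.attribute(1).values.index("hit")]
inst = Instance.create_instance(values)
header.add_instance(inst)
print(header)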
def get_outputformat(self):
    """
    Returns the output format.

    :return: the output format
    :rtype: Instances
    """
    inst = javabridge.call(self.jobject, "getOutputFormat", "()Lweka/core/Instances;")
    if inst is None:
        return None
    else:
        return Instances(inst)
def dataset_format(self):
    """
    Returns the dataset format.

    :return: the format
    :rtype: Instances
    """
    data = javabridge.call(self.jobject, "getDatasetFormat", "()Lweka/core/Instances;")
    if data is None:
        return None
    else:
        return Instances(data)
def generate_examples(self):
    """
    Returns the complete dataset.

    :return: the generated dataset
    :rtype: Instances
    """
    data = javabridge.call(self.jobject, "generateExamples", "()Lweka/core/Instances;")
    if data is None:
        return None
    else:
        return Instances(data)
def outputformat(self):
    """
    Returns the output format.

    :return: the output format
    :rtype: Instances
    """
    inst = self.__outputformat()
    if inst is None:
        return None
    else:
        return Instances(inst)
def load(self):
    """
    Loads the text files from the specified directory and returns the Instances object.
    In case of incremental loading, only the structure.

    :return: the full dataset or the header (if incremental)
    :rtype: Instances
    """
    javabridge.call(self.jobject, "reset", "()V")
    return Instances(
        javabridge.call(self.jobject, "getDataSet", "()Lweka/core/Instances;"))
def instances(self):
    """
    Returns the data used in the analysis.

    :return: the data in use
    :rtype: Instances
    """
    inst = javabridge.call(self.jobject, "getInstances", "()Lweka/core/Instances;")
    if inst is None:
        return None
    else:
        return Instances(inst)
def filter(self, data):
    """
    Filters the dataset.

    :param data: the Instances to filter
    :type data: Instances
    :return: the filtered Instances object
    :rtype: Instances
    """
    return Instances(javabridge.static_call(
        "Lweka/filters/Filter;", "useFilter",
        "(Lweka/core/Instances;Lweka/filters/Filter;)Lweka/core/Instances;",
        data.jobject, self.jobject))
def DecisionTree(rnd_data, folds, seed, data):
    data_size = rnd_data.num_instances
    fold_size = math.floor(data_size / folds)

    # cross-validation
    evaluation = Evaluation(rnd_data)
    for i in range(folds):
        this_fold = fold_size
        test_start = i * fold_size
        test_end = test_start + fold_size
        if (data_size - test_end) / fold_size < 1:
            this_fold = data_size - test_start
        test = Instances.copy_instances(rnd_data, test_start, this_fold)  # generate validation fold
        if i == 0:
            train = Instances.copy_instances(rnd_data, test_end, data_size - test_end)
        else:
            train_1 = Instances.copy_instances(rnd_data, 0, test_start)
            train_2 = Instances.copy_instances(rnd_data, test_end, data_size - test_end)
            train = Instances.append_instances(train_1, train_2)  # generate training fold

        # build and evaluate classifier
        cls = Classifier(classname="weka.classifiers.trees.J48")
        cls.build_classifier(train)  # build classifier on training set
        evaluation.test_model(cls, test)  # test classifier on validation/test set

    print("")
    print("=== Decision Tree ===")
    print("Classifier: " + cls.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
def LabeledUnlabeldata(data, unlabeled, tree, y, cal_method=None):
    data1 = Instances.copy_instances(data)
    labeling = Instances.copy_instances(unlabeled)
    tree.build_classifier(data1)
    update = False
    it = 0
    labeling_num_instances = labeling.num_instances
    while labeling.num_instances > 3 and it < labeling_num_instances:
        it += 1
        update = False
        removed_index = set()
        print("labeling.num_instances ===>> ", labeling.num_instances)
        for i, xi in enumerate(labeling):
            clsLabel = tree.classify_instance(xi)
            dist = calculate_probability_distribution(tree, labeling, i, cal_method)
            for dp in dist:
                if dp >= y:
                    update = True
                    xi.set_value(xi.class_index, clsLabel)
                    data1.add_instance(xi)
                    removed_index.add(i)
        print("labeling ==================>>", labeling.num_instances)
        print("removed_index ==================>>", len(removed_index))
        removed_index_list = sorted(removed_index)
        for i, ii in enumerate(removed_index_list):
            # shift the index, since earlier deletions shrink the dataset
            labeling.delete(ii - i)
        print("labeling ==================>>", labeling.num_instances)
        if update:
            tree.build_classifier(data1)
    data1.compactify()
    return data1
def __init__(self, model=None, header=None):
    """
    Initializes the container.

    :param model: the model to store (eg Classifier or Clusterer)
    :type model: object
    :param header: the header instances
    :type header: Instances
    """
    super(ModelContainer, self).__init__()
    self.set("Model", model)
    if header is not None:
        header = Instances.template_instances(header)
    self.set("Header", header)
    self._allowed = ["Model", "Header"]
def generate_thresholdcurve_data(evaluation, class_index):
    """
    Generates the threshold curve data from the evaluation object's predictions.

    :param evaluation: the evaluation to obtain the predictions from
    :type evaluation: Evaluation
    :param class_index: the 0-based index of the class-label to create the plot for
    :type class_index: int
    :return: the generated threshold curve data
    :rtype: Instances
    """
    jtc = JavaObject.new_instance("weka.classifiers.evaluation.ThresholdCurve")
    pred = javabridge.call(evaluation.jobject, "predictions", "()Ljava/util/ArrayList;")
    result = Instances(
        javabridge.call(jtc, "getCurve",
                        "(Ljava/util/ArrayList;I)Lweka/core/Instances;", pred, class_index))
    return result
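# Hypothetical usage of generate_thresholdcurve_data() above: cross-validate a
# classifier on a loaded dataset `data` and inspect the curve data for class 0.
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
curve = generate_thresholdcurve_data(evl, 0)
print(curve)  # Instances with columns such as 'False Positive Rate' and 'True Positive Rate'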
def create_dataset(tweets):
    text_att = Attribute.create_string('TEXT')
    nom_att = Attribute.create_nominal('CLASS', class_values)
    dataset = Instances.create_instances("tweets", [text_att, nom_att], len(tweets))
    for tweet in tweets:
        values = []
        values.append(dataset.attribute(0).add_string_value(tweet))
        values.append(Instance.missing_value())
        inst = Instance.create_instance(values)
        dataset.add_instance(inst)
    dataset.class_is_last()
    return dataset
def addNominals(self, dataset):
    # Add the nominal values for all columns, in case a column has none
    ignoreAttributes = ['readmitted']
    atts = []
    for a in dataset.attributes():
        if (not a.is_nominal) or (a.name in ignoreAttributes):
            atts.append(a)
        else:
            newValues = list(a.values)
            pvalue = 'DefaultNominal'
            if pvalue not in newValues:
                newValues.append(pvalue)
            atts.append(Attribute.create_nominal(a.name, newValues))
    newDataset = Instances.create_instances("Dataset", atts, 0)
    newDataset.class_is_last()
    return newDataset
def main():
    """
    Creates a dataset from scratch using generated data and outputs it.
    """
    atts = []
    for i in xrange(5):
        atts.append(Attribute.create_numeric("x" + str(i)))
    data = Instances.create_instances("data", atts, 10)
    for n in xrange(10):
        values = []
        for i in xrange(5):
            values.append(n * 100 + i)
        inst = Instance.create_instance(values)
        data.add_instance(inst)
    print(data)
def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
    """
    Converts the numpy matrix into an Instances object and returns it.

    :param array: the numpy ndarray to convert
    :type array: numpy.ndarray
    :param att_template: the prefix to use for the attribute names, "#" is the 1-based index,
                         "!" is the 0-based index, "@" the relation name
    :type att_template: str
    :param relation: the name of the dataset
    :type relation: str
    :param att_list: the list of attribute names to use
    :type att_list: list
    :return: the generated instances object
    :rtype: Instances
    """
    if len(numpy.shape(array)) != 2:
        raise Exception("Number of array dimensions must be 2!")
    rows, cols = numpy.shape(array)

    # header
    atts = []
    if att_list is not None:
        if len(att_list) != cols:
            raise Exception(
                "Number of columns and provided attribute names differ: "
                + str(cols) + " != " + str(len(att_list)))
        for name in att_list:
            att = Attribute.create_numeric(name)
            atts.append(att)
    else:
        for i in xrange(cols):
            name = att_template.replace("#", str(i + 1)).replace("!", str(i)).replace("@", relation)
            att = Attribute.create_numeric(name)
            atts.append(att)
    result = Instances.create_instances(relation, atts, rows)

    # data
    for i in xrange(rows):
        inst = Instance.create_instance(array[i])
        result.add_instance(inst)
    return result
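# Usage sketch for ndarray_to_instances() above (assumes a started JVM): converts
# a random 4x3 numpy matrix into a dataset with custom attribute names.
import numpy

array = numpy.random.rand(4, 3)
data = ndarray_to_instances(array, "random", att_list=["a", "b", "c"])
print(data)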
def do_execute(self):
    """
    The actual execution of the actor.

    :return: None if successful, otherwise error message
    :rtype: str
    """
    result = None
    data = self.input.payload
    if isinstance(self._input.payload, Instance):
        inst = self.input.payload
        data = inst.dataset
    elif isinstance(self.input.payload, Instances):
        data = self.input.payload
        inst = None
    append = True
    if self._header is None or (self._header.equal_headers(data) is not None):
        self._header = Instances.template_instances(data, 0)
        outstr = str(data)
        append = False
    elif inst is not None:
        outstr = str(inst)
    else:
        outstr = str(data)
    f = None
    try:
        if append:
            f = open(str(self.resolve_option("output")), "a")
        else:
            f = open(str(self.resolve_option("output")), "w")
        f.write(outstr)
        f.write("\n")
    except Exception as e:
        result = self.full_name + "\n" + traceback.format_exc()
    finally:
        if f is not None:
            f.close()
    return result
def addPatientNominals(self, patient, dataset):
    # Add the nominal values for the patient to the master header, in case they
    # aren't already there. Loop and add the patient's nominal values in case they
    # aren't in masterDataset; newDataset will be the new master header.
    # Waiting on prediction patient to be defined; should be like {sex_cd: "m", ...}
    ignoreAttributes = ['readmitted']
    atts = []
    for a in dataset.attributes():
        if (not a.is_nominal) or (a.name in ignoreAttributes):
            atts.append(a)
        else:
            newValues = list(a.values)
            # print a.name
            pvalue = patient[a.name]
            if pvalue not in newValues:
                newValues.append(pvalue)
            atts.append(Attribute.create_nominal(a.name, newValues))
    newDataset = Instances.create_instances("Dataset", atts, 0)
    newDataset.class_is_last()
    return newDataset
def copy(self, from_row=None, num_rows=None):
    return WekaInstances.copy_instances(self.instances, from_row=from_row, num_rows=num_rows)
print("\nLoading dataset: " + fname + "\n") data = loader.load_file(fname) data.class_is_last() # define classifiers classifiers = ["weka.classifiers.rules.OneR", "weka.classifiers.trees.J48"] # cross-validate original dataset for classifier in classifiers: cls = Classifier(classname=classifier) evl = Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(1)) print("%s (original): %0.0f%%" % (classifier, evl.percent_correct)) # replace 'outlook' in first 4 'no' instances with 'missing' modified = Instances.copy_instances(data) count = 0 for i in xrange(modified.num_instances): if modified.get_instance(i).get_string_value(modified.class_index) == "no": count += 1 modified.get_instance(i).set_missing(0) if count == 4: break # cross-validate modified dataset for classifier in classifiers: cls = Classifier(classname=classifier) evl = Evaluation(modified) evl.crossvalidate_model(cls, modified, 10, Random(1)) print("%s (modified): %0.0f%%" % (classifier, evl.percent_correct))
import os
import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# output header
print(Instances.template_instances(data))

# remove attribute no 3
print("\nRemove attribute no 3")
fltr = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "3"])
fltr.set_inputformat(data)
filtered = fltr.filter(data)

# output header
print(Instances.template_instances(filtered))

# save modified dataset
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.arff")

jvm.stop()
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i + 1) / 10.0)], sparse_data.num_attributes,
            classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50, wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1),
                  percent=50, title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
def saveClassifier(self, filename, path='/home/sbiastoch/Schreibtisch/classifiers/'):
    serialization.write_all(path + filename,
                            [self.classifier, Instances.template_instances(self.data)])
# Discretize
print("Discretize numeric attributes (supervised)")
fltr = Filter(classname="weka.filters.supervised.attribute.Discretize")
fltr.inputformat(data)
filtered = fltr.filter(data)
print(filtered)

# PCA
print("Principal components analysis")
fltr = Filter(classname="weka.filters.unsupervised.attribute.PrincipalComponents")
fltr.inputformat(data)
filtered = fltr.filter(data)
print(filtered)

# load anneal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "anneal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# RemoveUseless
print("RemoveUseless")
fltr = Filter(classname="weka.filters.unsupervised.attribute.RemoveUseless")
fltr.inputformat(data)
filtered = fltr.filter(data)
print("Original header (#att=" + str(data.num_attributes) + "):\n"
      + str(Instances.template_instances(data)))
print("Filtered header (#att=" + str(filtered.num_attributes) + "):\n"
      + str(Instances.template_instances(filtered)))

jvm.stop()
with open(data_dir + os.sep + infile, "rb") as csvfile:
    print(infile)
    outfile = os.path.splitext(infile)[0] + ".arff"
    reader = csv.reader(csvfile)
    data = None
    ref_present = True
    for index, row in enumerate(reader):
        if index == 0:
            atts = []
            ref_present = ("Reference value" in row) or ("Reference Value" in row)
            for idx, col in enumerate(row):
                col = col.lower()
                atts.append(Attribute.create_numeric(col))
                if not ref_present and (idx == 0):
                    atts.append(Attribute.create_numeric("reference value"))
            data = Instances.create_instances("irdc", atts, 0)
        else:
            values = []
            for idx, col in enumerate(row):
                values.append(float(col))
                if not ref_present and (idx == 0):
                    values.append(float('NaN'))
            inst = Instance.create_instance(values)
            data.add_instance(inst)
    saver = Saver(classname="weka.core.converters.ArffSaver")
    saver.save_file(data, data_dir + os.sep + outfile)

# train/test/predict
print("Train/test/predict...")
def plot_learning_curve(classifiers, train, test=None, increments=100, metric="percent_correct",
                        title="Learning curve", label_template="[#] @ $", key_loc="lower right",
                        outfile=None, wait=True):
    """
    Plots a learning curve for the given classifiers on the provided dataset(s).

    :param classifiers: list of Classifier template objects
    :type classifiers: list of Classifier
    :param train: dataset to use for building the classifiers, also used for
                  evaluating them if no test set is provided
    :type train: Instances
    :param test: optional dataset to use for testing the built classifiers
    :type test: Instances
    :param increments: the increments (>= 1: # of instances, < 1: percentage of dataset)
    :type increments: float
    :param metric: the name of the numeric metric to plot (Evaluation.<metric>)
    :type metric: str
    :param title: the title for the plot
    :type title: str
    :param label_template: the template for the label in the plot
                           (#: 1-based index, @: full classname, !: simple classname, $: options)
    :type label_template: str
    :param key_loc: the location string for the key
    :type key_loc: str
    :param outfile: the output file, ignored if None
    :type outfile: str
    :param wait: whether to wait for the user to close the plot
    :type wait: bool
    """
    if not plot.matplotlib_available:
        logger.error("Matplotlib is not installed, plotting unavailable!")
        return
    if not train.has_class():
        logger.error("Training set has no class attribute set!")
        return
    if (test is not None) and (train.equal_headers(test) is not None):
        logger.error("Training and test set are not compatible: " + train.equal_headers(test))
        return

    if increments >= 1:
        inc = increments
    else:
        inc = round(train.num_instances * increments)

    steps = []
    cls = []
    evls = {}
    for classifier in classifiers:
        cl = Classifier.make_copy(classifier)
        cls.append(cl)
        evls[cl] = []

    if test is None:
        tst = train
    else:
        tst = test

    for i in xrange(train.num_instances):
        if (i > 0) and (i % inc == 0):
            steps.append(i + 1)
        for cl in cls:
            # train
            if cl.is_updateable:
                if i == 0:
                    tr = Instances.copy_instances(train, 0, 1)
                    cl.build_classifier(tr)
                else:
                    cl.update_classifier(train.get_instance(i))
            else:
                if (i > 0) and (i % inc == 0):
                    tr = Instances.copy_instances(train, 0, i + 1)
                    cl.build_classifier(tr)
            # evaluate
            if (i > 0) and (i % inc == 0):
                evl = Evaluation(tst)
                evl.test_model(cl, tst)
                evls[cl].append(getattr(evl, metric))

    fig, ax = plt.subplots()
    ax.set_xlabel("# of instances")
    ax.set_ylabel(metric)
    ax.set_title(title)
    fig.canvas.set_window_title(title)
    ax.grid(True)
    i = 0
    for cl in cls:
        evl = evls[cl]
        i += 1
        plot_label = label_template.\
            replace("#", str(i)).\
            replace("@", cl.classname).\
            replace("!", cl.classname[cl.classname.rfind(".") + 1:]).\
            replace("$", join_options(cl.config))
        ax.plot(steps, evl, label=plot_label)
    plt.draw()
    plt.legend(loc=key_loc, shadow=True)
    if outfile is not None:
        plt.savefig(outfile)
    if wait:
        plt.show()
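# Hypothetical call of plot_learning_curve() above, assuming `train` and `test` are
# compatible loaded datasets: plots accuracy for two classifiers in 5% increments.
from weka.classifiers import Classifier

classifiers = [
    Classifier(classname="weka.classifiers.trees.J48"),
    Classifier(classname="weka.classifiers.bayes.NaiveBayes"),
]
plot_learning_curve(classifiers, train, test=test, increments=0.05,
                    metric="percent_correct", title="Learning curve", wait=True)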
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in xrange(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directly avoids issues with correct quoting in option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in xrange(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
    print("")
    print(predicted_data)