Exemplo n.º 1
0
    def train_internal(self):
        """Build and return the final training set F.

        First scores every source dataset individually, then sorts the
        sources by descending weight and folds them one at a time into F,
        tagging each instance with the weight found for its dataset.
        """
        # Empty dataset F that shares the attribute layout of the sources.
        F = Instances.template_instances(self.source[0])

        print("Find weight for each source data set")
        dataset_weights = []
        for src in self.source:
            best_weight, _ = self.process_source(src, F, False)
            dataset_weights.append(best_weight)

        # Reorder the source datasets by descending weight.
        ranked = sorted(zip(dataset_weights, self.source),
                        key=operator.itemgetter(0),
                        reverse=True)
        self.source = [src for _, src in ranked]

        print("Train for final stage")
        while self.source:  # self.max_source_dataset)
            head = self.source[0]
            weight, _ = self.process_source(head, F, True)
            for inst in head:
                inst.weight = weight
            F = Instances.append_instances(F, head)
            F.class_is_last()
            self.source.pop(0)

        return F
Exemplo n.º 2
0
    def folds(self, nfolds=10, seed=None):
        """
        Get (training,testing) datasets for cross-validation.

        Arguments:

            nfolds (int, optional): Number of folds. Default value is
              10.
            seed (int, optional): Seed value for shuffling
              dataset. Default value is random int 0 <= x <= 10000.

        Returns:

            list of (Instances,Instances) tuples: Each list element is
              a pair of (training,testing) datasets, respectively.
        """
        seed = seed or randint(0, 10000)
        rnd = WekaRandom(seed)

        fold_size = labmath.ceil(self.instances.num_instances / nfolds)

        # Shuffle the dataset.
        instances = WekaInstances.copy_instances(self.instances)
        instances.randomize(rnd)

        folds = []
        for i in range(nfolds):
            offset = i * fold_size
            # Clip the final fold to the dataset end. NOTE: this previously
            # clipped to num_instances - 1, which silently excluded the last
            # instance from every testing fold.
            testing_end = min(offset + fold_size, instances.num_instances)

            # Calculate dataset indices for testing and training data.
            testing_range = (offset, testing_end - offset)
            left_range = (0, offset)
            right_range = (testing_end, instances.num_instances - testing_end)

            # If there's nothing to test, move on.
            if testing_range[1] < 1: continue

            # Create testing and training folds. Training data is everything
            # to the left plus everything to the right of the testing slice.
            testing = WekaInstances.copy_instances(instances, *testing_range)
            left = WekaInstances.copy_instances(instances, *left_range)
            right = WekaInstances.copy_instances(instances, *right_range)
            training = WekaInstances.append_instances(left, right)

            # Add fold to collection.
            folds.append((training, testing))

        return folds
Exemplo n.º 3
0
    def train(self):
        """Train the final model.

        Loads a cached pretraining ARFF file when one exists; otherwise
        runs the internal training stage and caches its result. The final
        classifier is then built on the cached data appended to the target
        dataset.
        """
        # makedirs with exist_ok covers missing parent directories too
        # (os.mkdir would raise if the parent did not exist).
        os.makedirs(self.savepath, exist_ok=True)

        filename = os.path.join(self.savepath,
                                self.target_name + "_final.arff")
        if os.path.exists(filename):
            print("loading pretrain data for final training")
            loader = Loader(classname="weka.core.converters.ArffLoader")
            F = loader.load_file(filename)
        else:
            print("Start internal training")
            F = self.train_internal()
            self.saveFinal(filename, F)

        F.class_is_last()
        final_train_set = Instances.append_instances(F, self.target)
        final_train_set.class_is_last()

        print("Building final model")
        self.model.build_classifier(final_train_set)
def DecisionTree(rnd_data, folds, seed, data):
    """Evaluate a J48 decision tree with manual k-fold cross-validation.

    Arguments:
        rnd_data: pre-shuffled Weka Instances used to build the folds.
        folds (int): number of cross-validation folds.
        seed (int): shuffle seed, printed for reproducibility only.
        data: original dataset, used only for its relation name.
    """
    data_size = rnd_data.num_instances
    fold_size = math.floor(data_size / folds)

    # cross-validation
    evaluation = Evaluation(rnd_data)
    for i in range(folds):
        this_fold = fold_size
        test_start = i * fold_size
        test_end = (test_start + fold_size)
        if ((data_size - test_end) / fold_size < 1):
            # Last fold absorbs the remainder instances.
            this_fold = data_size - test_start
            # FIX: advance test_end as well; previously the remainder
            # instances ended up in both the test fold and train_2,
            # leaking test data into the training set.
            test_end = test_start + this_fold
        test = Instances.copy_instances(rnd_data, test_start,
                                        this_fold)  # generate validation fold
        if i == 0:
            train = Instances.copy_instances(rnd_data, test_end,
                                             data_size - test_end)
        else:
            train_1 = Instances.copy_instances(rnd_data, 0, test_start)
            train_2 = Instances.copy_instances(rnd_data, test_end,
                                               data_size - test_end)
            train = Instances.append_instances(
                train_1, train_2)  # generate training fold

        # build and evaluate classifier
        cls = Classifier(classname="weka.classifiers.trees.J48")
        cls.build_classifier(train)  # build classifier on training set
        evaluation.test_model(cls,
                              test)  # test classifier on validation/test set

    print("")
    print("=== Decision Tree ===")
    print("Classifier: " + cls.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(
        evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    # FIX: parenthesize the conditional expression; without the parentheses
    # the concatenation bound tighter than "if", producing
    # ("..." + "yes") if msg is None else msg and dropping the prefix.
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i+1) / 10.0)], sparse_data.num_attributes, classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50,
        wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1), percent=50, title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
def learning_curve(folds, data):
    """Plot training vs cross-validation error curves for NB and J48.

    For growing training-set sizes (10 to 32560 in steps of 750), runs
    manual k-fold cross-validation for Naive Bayes and a J48 decision
    tree, then plots error rate against training-set size.

    Arguments:
        folds (int): number of cross-validation folds.
        data: full Weka Instances dataset to slice training sets from.
    """
    training_set_size = []
    train_error_nb = []
    train_error_dtree = []
    cv_error_nb = []
    cv_error_dtree = []

    print("")
    print("This may take some time, please wait..")
    for training_size in range(10, 32561, 750):
        print(".")
        training_set_size.append(training_size)
        train_data = Instances.copy_instances(data, 0, training_size)

        data_size = train_data.num_instances
        fold_size = math.floor(data_size / folds)

        # calculating training and cross-validation error
        evaluation_nb_train = Evaluation(train_data)
        evaluation_nb_cv = Evaluation(train_data)
        evaluation_dtree_train = Evaluation(train_data)
        evaluation_dtree_cv = Evaluation(train_data)
        for i in range(folds):
            this_fold = fold_size
            test_start = i * fold_size
            test_end = (test_start + fold_size)
            if ((data_size - test_end) / fold_size < 1):
                # Last fold absorbs the remainder instances.
                this_fold = data_size - test_start
                # FIX: advance test_end as well; previously the remainder
                # instances appeared in both the test fold and train_2,
                # leaking validation data into the training set.
                test_end = test_start + this_fold
            test = Instances.copy_instances(
                train_data, test_start, this_fold)  # generate validation fold
            if i == 0:
                train = Instances.copy_instances(train_data, test_end,
                                                 data_size - test_end)
            else:
                train_1 = Instances.copy_instances(train_data, 0, test_start)
                train_2 = Instances.copy_instances(train_data, test_end,
                                                   data_size - test_end)
                train = Instances.append_instances(
                    train_1, train_2)  # generate training fold

            # Naive Bayes
            nb = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
            nb.build_classifier(train)
            evaluation_nb_train.test_model(nb, train)
            evaluation_nb_cv.test_model(nb, test)

            # Decision Tree
            dtree = Classifier(classname="weka.classifiers.trees.J48")
            dtree.build_classifier(train)
            evaluation_dtree_train.test_model(dtree, train)
            evaluation_dtree_cv.test_model(dtree, test)

        train_error_nb.append(
            evaluation_nb_train.error_rate)  # training error - NB
        cv_error_nb.append(
            evaluation_nb_cv.error_rate)  # cross-validation error - NB
        train_error_dtree.append(
            evaluation_dtree_train.error_rate)  # training error - DTree
        cv_error_dtree.append(
            evaluation_dtree_cv.error_rate)  # cross-validation error - DTree

    # Plotting of Learning Curve
    x = training_set_size
    y1 = train_error_nb
    z1 = cv_error_nb
    y2 = train_error_dtree
    z2 = cv_error_dtree

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(13, 8))

    axes[0].plot(x, y1, label='Training Error')
    axes[0].plot(x, z1, label='Cross-Validation Error')
    axes[0].set_xlabel('Training Set Size')
    axes[0].set_ylabel('Error Rate')
    axes[0].set_title('Naive Bayes')
    axes[0].legend()

    axes[1].plot(x, y2, label='Training Error')
    axes[1].plot(x, z2, label='Cross-Validation Error')
    axes[1].set_xlabel('Training Set Size')
    axes[1].set_ylabel('Error Rate')
    axes[1].set_title('Decision Tree')
    axes[1].legend()

    plt.show(block=True)
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" +
          str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" +
          str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n" + str(
        iris_data.attribute_stats(iris_data.num_attributes -
                                  1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    # FIX: parenthesize the conditional expression; without the parentheses
    # the concatenation bound tighter than "if", producing
    # ("..." + "yes") if msg is None else msg and dropping the prefix.
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld",
                                         [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [
        2.71828,
        date_att.parse_date("2014-08-09"),
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(
        values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(
        x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y,
                                                 "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(
        x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    # FIX: xrange is Python 2 only (NameError under Python 3); the rest of
    # this file already targets Python 3 (print() calls), so use range.
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i + 1) / 10.0)],
            sparse_data.num_attributes,
            classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(iris_data,
                     iris_data.attribute_by_name("petalwidth").index,
                     iris_data.attribute_by_name("petallength").index,
                     percent=50,
                     wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data,
                  atts=range(iris_data.num_attributes - 1),  # FIX: was xrange
                  percent=50,
                  title="Line plot iris",
                  wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)