Example #1
    def transform_multilabel_to_N_classes(self, context, classifier_name):
        """
        Transform the multilabel files into a n-classes problem.
        Convert a multilabel pattern file into multiple one-class pattern files
        """
        for pattern_kind in context["patterns_texts"]:
            for class_text in context["classifiers"][classifier_name]["classes_names"]:
                dir_name = context["general_path"] + "patterns/" + context["preprocess"][
                    "transform_multilabel_to_N_classes"]["new_set_name"] + '/'
                path_exists(dir_name)
                file_name = os.path.basename(context["classifiers"][classifier_name]["patterns"][pattern_kind])
                file_name = dir_name + file_name[:file_name.find(".pat")] + "_" + class_text + ".pat"
                try:
                    f = open(file_name, "w+")
                except IOError:
                    raise IOError("Error occurred trying to open file %s" % file_name)

                for i in range(len(context["patterns"].patterns[classifier_name][pattern_kind])):
                    total = context["patterns"].patterns[classifier_name][pattern_kind][i]
                    # Write the feature values
                    for value in total[0]:
                        f.write(str(value) + " ")
                    # Write the class: 1 if the pattern belongs to class_text, 0 otherwise
                    classes = total[1]
                    if classes[context["classifiers"][classifier_name]["classes_names"].index(class_text)] == 1:
                        f.write("1\n")
                    else:
                        f.write("0\n")
                f.close()
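
For reference, a minimal standalone sketch of the one-vs-rest split this method performs. The pattern layout ([feature values], [label flags]) mirrors the code above; the file names and class names are illustrative, not the library's actual configuration:

    # Hypothetical multilabel patterns: ([feature values], [multilabel flags])
    patterns = [
        ([0.1, 0.2], [1, 0]),  # belongs to class "a" only
        ([0.3, 0.4], [1, 1]),  # belongs to both classes
    ]
    classes_names = ["a", "b"]
    for class_index, class_text in enumerate(classes_names):
        # One output file per class
        with open("demo_%s.pat" % class_text, "w") as f:
            for features, labels in patterns:
                f.write(" ".join(str(v) for v in features))
                # 1 if the row carries this class, 0 otherwise (one-vs-rest)
                f.write(" 1\n" if labels[class_index] == 1 else " 0\n")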
Example #2
    def points2series(context):
        import pandas as pd
        import numpy as np
        from mullpy.auxiliar import csv2pat
        import sys
        import os

        serie_points_amount = context["preprocess"]["points2series"]["serie_size"]
        input_file = context["preprocess"]["points2series"]["input_file"]
        output_file = context["preprocess"]["points2series"]["output_file"]
        class_variable = context["preprocess"]["points2series"]["class_variable"]
        series_limit = context["preprocess"]["points2series"]["series_limit"]
        # TODO: Add support for multiple class variables. Now classes_len = 1
        classes_len = 1
        defined_features_list = context["preprocess"]["points2series"]["columns"]

        if defined_features_list == "all":
            input_df = pd.read_csv(input_file)
            defined_features_list = input_df.columns
        else:
            defined_features_list.append(class_variable)
            input_df = pd.read_csv(input_file, usecols=defined_features_list)

        # Keep only the last (series_limit + serie_points_amount) points of input_df
        input_df_last = input_df.iloc[len(input_df) - (series_limit + serie_points_amount):].reset_index(drop=True)

        # Build the output column list from defined_features_list
        features_list = []
        for i in range(serie_points_amount):
            for j in range(len(defined_features_list)):
                features_list.append("%s_%d" % (defined_features_list[j].upper(), i))
        # Add the last column, which is the class variable.
        if "deployment" not in context["execution_kind"]:
            features_list.append("%s_%s" % (class_variable.upper(), "CLASS"))

        output_df = pd.DataFrame(columns=features_list, dtype=np.float32)
        if "deployment" not in context["execution_kind"]:
            iteration = range(len(input_df_last) - serie_points_amount)
        else:
            iteration = range(1, len(input_df_last) - serie_points_amount + 1)
        for i in iteration:
            # Print the percentage completed
            if "deployment" not in context["execution_kind"]:
                sys.stdout.write("\r{0}".format("Loaded: %.1f%%" % (i * 100 / (len(input_df_last) - serie_points_amount))))
                sys.stdout.flush()
            # Fill a numpy row first to avoid per-cell DataFrame writes
            row = np.zeros((1, len(features_list)), dtype=np.float32)
            z = 0
            for j in range(serie_points_amount):
                for column in defined_features_list:
                    row[0, z] = input_df_last.iloc[i + j][column]
                    z += 1
            if "deployment" not in context["execution_kind"]:
                # Check that the exchange value was correctly given (between 1 and 2 in those dates)
                row[0, z] = PreProcess.check_eurusd_values(input_df_last[class_variable][i + serie_points_amount])
            output_df.loc[i] = row
            # Stop once series_limit rows have been built
            if series_limit is not None and i + 1 >= series_limit:
                break

        # Build the csv and pat files
        file_name = output_file + ".csv"
        path_exists(os.path.dirname(file_name))
        output_df.to_csv(file_name, index=False)
        if context["preprocess"]["points2series"]["to_pat"]:
            csv2pat(file_name, classes_len)
        if not context["preprocess"]["points2series"]["to_csv"]:
            os.remove(file_name)
        # Displaying info
        serie_name = output_file[output_file.rfind("/") + 1:]
        serie_path = output_file[:output_file.rfind("/")]
        if "deployment" not in context["execution_kind"]:
            print("\n%s pattern files built at %s" % (serie_name, serie_path))
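
The core of points2series is a sliding-window reshape: each output row concatenates serie_size consecutive input points, and the next value of class_variable becomes the target. A self-contained sketch of that windowing, with illustrative column names and sizes (not the library's API):

    import pandas as pd
    import numpy as np

    def sliding_windows(df, window, target_column):
        # One output row per window: the flattened window, then the next value as class
        cols = ["%s_%d" % (c.upper(), i) for i in range(window) for c in df.columns]
        cols.append("%s_CLASS" % target_column.upper())
        rows = []
        for i in range(len(df) - window):
            values = df.iloc[i:i + window].values.ravel().tolist()
            values.append(df[target_column].iloc[i + window])
            rows.append(values)
        return pd.DataFrame(rows, columns=cols, dtype=np.float32)

    # Example: two features, windows of 3 points
    demo = pd.DataFrame({"open": np.arange(6.0), "close": np.arange(6.0) + 0.5})
    print(sliding_windows(demo, 3, "close"))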
Example #3
    def random_distribution(self, context):
        """
        Bagging methods come in many flavours but mostly differ from each other by the way they draw random subsets
         of the training set:

        -When random subsets of the dataset are drawn as random subsets of the samples, then this algorithm is known
        as Pasting Rvotes.
        -When samples are drawn with replacement, then the method is known as Bagging.
        -When random subsets of the dataset are drawn as random subsets of the features, then the method is known as
        Random Subspaces.
        -When base estimators are built on subsets of both samples and features, then the method is known as Random
        Patches.

        group_successive variable groups each X instances. Each of these successive instances has to be together in
        the sampling process
        """
        total_length = 0
        lengths = AutoVivification()
        for pattern_kind in context["patterns"].patterns[context["classifier_list"][0]]:
            lengths[pattern_kind] = len(context["patterns"].patterns[context["classifier_list"][0]][pattern_kind])
            total_length += lengths[pattern_kind]

        # Check that every pattern has the same length across classifiers
        for classifier_name in context["classifier_list"]:
            for pattern_kind in context["patterns"].patterns[classifier_name]:
                if len(context["patterns"].patterns[classifier_name][pattern_kind]) != lengths[pattern_kind]:
                    raise ValueError(
                        'The length of the %s pattern of classifier %s differs from the others' %
                        (pattern_kind, classifier_name))

        if context["preprocess"]["random_distribution"]["group_successive"]:
            total_length = int(total_length / context["preprocess"]["random_distribution"]["group_successive"])
            for pattern_kind in lengths:
                lengths[pattern_kind] = int(
                    lengths[pattern_kind] / context["preprocess"]["random_distribution"]["group_successive"])

        dir_name = context["general_path"] + "patterns/" + context["classifiers"][context["classifier_list"][0]]["set"]
        filters = AutoVivification()
        ###Specific kind of sampling###
        #############
        ######BAGGING
        #############
        if "bagging" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["bagging"]["activate"]:
            for pattern_kind in context["patterns_texts"]:
                filters[pattern_kind] = []
            self.bagging(context, filters, lengths, total_length)
            dir_name += "_bagging/"
        #############
        ######PASTING
        #############
        elif "pasting_Rvotes" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["pasting_Rvotes"]["activate"]:
            for pattern_kind in context["patterns_texts"]:
                filters[pattern_kind] = []
            self.pasting_rvotes(context, filters, lengths, total_length)
            dir_name += "_pasting_Rvotes/"
        #################
        #RANDOM SUBSPACES
        #################
        elif "random_subspaces" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["random_subspaces"]["activate"]:
            features_amount = self.check_features_amount(context)
            for pattern_kind in context["patterns_texts"]:
                filters[pattern_kind] = []
            self.random_subspaces(context, filters, features_amount)
            dir_name += "_random_subspaces/"
        #############
        #COMBINATIONS
        #############
        elif "all_features_combination" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["all_features_combination"]["activate"]:
            features_amount = self.check_features_amount(context)
            for pattern_kind in context["patterns_texts"]:
                filters[pattern_kind] = []
            self.all_features_combination(context, filters, features_amount)
            dir_name += "_features_combination/"
            context["preprocess"]["random_distribution"]["number_base_classifiers"] = len(filters["learning"])
        ###############
        #RANDOM PATCHES
        ###############
        elif "random_patches" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["random_patches"]["activate"]:
            dir_name += "_random_patches/"
        ###############
        #K-FOLD
        ###############
        elif "k_fold" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["k_fold"]["activate"]:
            for pattern_kind in context["preprocess"]["random_distribution"]["k_fold"]["percents"]:
                filters[pattern_kind] = []
            self.k_fold(context, filters)
            dir_name += "_k_fold/"
        ###############
        #Forecasting distribution
        ###############
        elif "forecasting_distribution" in context["preprocess"]["random_distribution"] and \
                context["preprocess"]["random_distribution"]["forecasting_distribution"]["activate"]:
            self.forecasting_distribution(context, filters)
            dir_name += "_walking_forward/"

        ###Common functions###
        # The group_successive expansion applies to both bagging and pasting_Rvotes,
        # so it runs after the sampling branch rather than as another elif arm.
        if ("bagging" in context["preprocess"]["random_distribution"] and
                context["preprocess"]["random_distribution"]["bagging"]["activate"]) or \
                ("pasting_Rvotes" in context["preprocess"]["random_distribution"] and
                 context["preprocess"]["random_distribution"]["pasting_Rvotes"]["activate"]):
            if context["preprocess"]["random_distribution"]["group_successive"]:
                group = context["preprocess"]["random_distribution"]["group_successive"]
                for kind_of in filters:
                    for sample_filter in filters[kind_of]:
                        for i in range(len(sample_filter)):
                            # Expand each grouped index back into its X successive instances
                            sample_filter[i] = sample_filter[i] * group
                            for j in range(1, group):
                                sample_filter.append(sample_filter[i] + j)

        path_exists(dir_name)

        self._generate_new_patterns_random_distribution(context, filters, dir_name)
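
The four schemes named in the docstring differ only in whether they subsample rows, columns, or both, and whether rows are drawn with replacement. A hedged, self-contained illustration with numpy index draws (the real self.bagging, self.pasting_rvotes, etc. build such index filters over the pattern files; the sizes here are illustrative):

    import numpy as np

    rng = np.random.default_rng(0)
    n_samples, n_features = 100, 12

    # Bagging: rows drawn with replacement
    bagging_rows = rng.choice(n_samples, size=n_samples, replace=True)
    # Pasting Rvotes: rows drawn without replacement
    pasting_rows = rng.choice(n_samples, size=60, replace=False)
    # Random Subspaces: feature columns drawn without replacement
    subspace_cols = rng.choice(n_features, size=6, replace=False)
    # Random Patches: both rows and columns are subsampled
    patch_rows = rng.choice(n_samples, size=60, replace=False)
    patch_cols = rng.choice(n_features, size=6, replace=False)

    # group_successive=2: indices are drawn over blocks, then expanded so that
    # each block keeps its 2 successive instances together
    blocks = [3, 7]
    expanded = [b * 2 + j for b in blocks for j in range(2)]  # -> [6, 7, 14, 15]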