def transform_multilabel_to_N_classes(self, context, classifier_name):
    """
    Transform the multilabel files into a n-classes problem.

    Convert a multilabel pattern file into multiple one-class pattern
    files: for every class name in the classifier's "classes_names", one
    ".pat" file is written whose last column is 1 when the pattern
    belongs to that class and 0 otherwise.
    """
    for pattern_kind in context["patterns_texts"]:
        for class_text in context["classifiers"][classifier_name]["classes_names"]:
            # Destination directory is taken from the preprocess configuration.
            dir_name = context["general_path"] + "patterns/" + \
                context["preprocess"]["transform_multilabel_to_N_classes"]["new_set_name"] + '/'
            path_exists(dir_name)
            file_name = os.path.basename(
                context["classifiers"][classifier_name]["patterns"][pattern_kind])
            file_name = dir_name + file_name[:file_name.find(".pat")] + "_" + class_text + ".pat"
            # Hoist the class index out of the pattern loop: it is constant here.
            class_idx = context["classifiers"][classifier_name]["classes_names"].index(class_text)
            try:
                f = open(file_name, "w+")
            except IOError:
                raise NameError("Error occurred trying to open file %s" % file_name)
            # "with" guarantees the handle is closed even if a write fails
            # (the original leaked the handle on any exception before close()).
            with f:
                for total in context["patterns"].patterns[classifier_name][pattern_kind]:
                    # Write the feature values (total[0] is the feature vector).
                    for value in total[0]:
                        f.write(str(value) + " ")
                    # Write the binary class membership (total[1] is the
                    # multilabel class vector).
                    if total[1][class_idx] == 1:
                        f.write("1\n")
                    else:
                        f.write("0\n")
def transform_multilabel_to_N_classes(self, context, classifier_name):
    """
    Transform the multilabel files into a n-classes problem.

    Convert a multilabel pattern file into multiple one-class pattern
    files, one per class name: each output row repeats the feature
    values and ends with 1 if the pattern has that class, else 0.
    """
    for pattern_kind in context["patterns_texts"]:
        classes_names = context["classifiers"][classifier_name]["classes_names"]
        for class_text in classes_names:
            dir_name = context["general_path"] + "patterns/" + \
                context["preprocess"]["transform_multilabel_to_N_classes"]["new_set_name"] + '/'
            path_exists(dir_name)
            base = os.path.basename(
                context["classifiers"][classifier_name]["patterns"][pattern_kind])
            file_name = dir_name + base[:base.find(".pat")] + "_" + class_text + ".pat"
            # Index of the current class in the multilabel vector; constant
            # for the whole file, so compute it once.
            class_idx = classes_names.index(class_text)
            try:
                f = open(file_name, "w+")
            except IOError:
                raise NameError("Error occurred trying to open file %s" % file_name)
            # Context manager closes the file even when a write raises;
            # the original only closed it on the happy path.
            with f:
                for total in context["patterns"].patterns[classifier_name][pattern_kind]:
                    # total[0]: feature values; total[1]: multilabel class vector.
                    for value in total[0]:
                        f.write(str(value) + " ")
                    f.write("1\n" if total[1][class_idx] == 1 else "0\n")
def points2series(context):
    """
    Convert a point-by-point CSV time series into a sliding-window
    "series" pattern file.

    Each output row contains ``serie_size`` consecutive points of every
    selected column (columns suffixed ``_0 .. _serie_size-1``) and,
    outside deployment mode, a final CLASS column taken ``serie_size``
    points ahead.  Writes a .csv and/or a .pat file depending on the
    "to_csv"/"to_pat" configuration flags.
    """
    import pandas as pd
    from mullpy.auxiliar import csv2pat
    import sys
    import os

    serie_points_amount = context["preprocess"]["points2series"]["serie_size"]
    input_file = context["preprocess"]["points2series"]["input_file"]
    output_file = context["preprocess"]["points2series"]["output_file"]
    class_variable = context["preprocess"]["points2series"]["class_variable"]
    series_limit = context["preprocess"]["points2series"]["series_limit"]
    # TODO: Add support for multiple class variables. Now classes_len = 1
    classes_len = 1

    defined_features_list = context["preprocess"]["points2series"]["columns"]
    if defined_features_list == "all":
        input_df = pd.read_csv(input_file)
        defined_features_list = input_df.columns
    else:
        # BUG FIX: copy before appending — the original appended to the list
        # stored in context, mutating shared configuration on every call.
        defined_features_list = list(defined_features_list) + [class_variable]
        input_df = pd.read_csv(input_file, usecols=defined_features_list)

    # Keep only the (series_limit + serie_size) last points of input_df.
    # BUG FIX: series_limit may be None (the loop below guards for it);
    # in that case use the whole file instead of raising TypeError here.
    if series_limit is not None:
        input_df_last = input_df.iloc[
            len(input_df) - (series_limit + serie_points_amount):].reset_index(drop=True)
    else:
        input_df_last = input_df.reset_index(drop=True)

    # Build the output columns list: one column per (feature, offset) pair.
    features_list = []
    for i in range(serie_points_amount):
        for feature in defined_features_list:
            features_list.append("%s_%d" % (feature.upper(), i))
    # Adding last column, that is the class variable.
    if "deployment" not in context["execution_kind"]:
        features_list.append("%s_%s" % (class_variable.upper(), "CLASS"))

    output_df = pd.DataFrame(columns=features_list, dtype=np.float32)

    if "deployment" not in context["execution_kind"]:
        iteration = range(len(input_df_last) - serie_points_amount)
    else:
        # Deployment mode shifts the window by one: no class column exists.
        iteration = range(1, len(input_df_last) - serie_points_amount + 1)

    for i in iteration:
        # Report percentage completed.
        if "deployment" not in context["execution_kind"]:
            sys.stdout.write("\r{0}".format(
                "Loaded:%f%%" % (i * 100 / (len(input_df_last) - serie_points_amount))))
            sys.stdout.flush()
        # Fill a flat numpy row first to avoid per-cell DataFrame writes.
        row = np.zeros((1, len(features_list)), dtype=np.float32)
        z = 0
        for j in range(serie_points_amount):
            for column in defined_features_list:
                # We have to test if the exchange value was correctly given
                # (between 1 and 2 in those dates)
                row[0, z] = input_df_last.iloc[i + j][column]
                z += 1
        if "deployment" not in context["execution_kind"]:
            row[0, z] = PreProcess.check_eurusd_values(
                input_df_last[class_variable][i + serie_points_amount])
        output_df.loc[i] = row
        # Check the variable series_limit and stop once the requested
        # amount of rows was reached.
        if series_limit is not None and i + 1 >= series_limit:
            break

    # Building csv and pat files.
    file_name = output_file + ".csv"
    path_exists(os.path.dirname(file_name))
    output_df.to_csv(file_name, index=False)
    if context["preprocess"]["points2series"]["to_pat"]:
        csv2pat(file_name, classes_len)
    if not context["preprocess"]["points2series"]["to_csv"]:
        os.remove(file_name)

    # Displaying info.
    serie_name = output_file[output_file.rfind("/") + 1:]
    serie_path = output_file[:output_file.rfind("/")]
    if "deployment" not in context["execution_kind"]:
        print("\n%s pattern files built at %s" % (serie_name, serie_path))
def random_distribution(self, context):
    """
    Draw random subsets of the training set and emit them as new pattern
    files, according to the configured sampling flavour.

    Bagging methods come in many flavours but mostly differ from each other
    by the way they draw random subsets of the training set:
     -When random subsets of the dataset are drawn as random subsets of the
      samples, then this algorithm is known as Pasting Rvotes.
     -When samples are drawn with replacement, then the method is known as
      Bagging.
     -When random subsets of the dataset are drawn as random subsets of the
      features, then the method is known as Random Subspaces.
     -When base estimators are built on subsets of both samples and features,
      then the method is known as Random Patches.

    group_successive groups each X instances; each group of successive
    instances has to stay together through the sampling process.
    """
    total_length = 0
    lengths = AutoVivification()
    first_classifier = context["classifier_list"][0]
    for pattern_kind in context["patterns"].patterns[first_classifier]:
        lengths[pattern_kind] = len(context["patterns"].patterns[first_classifier][pattern_kind])
        total_length += lengths[pattern_kind]

    # Check that every classifier's patterns have the same size.
    for classifier_name in context["classifier_list"]:
        for pattern_kind in context["patterns"].patterns[classifier_name]:
            if len(context["patterns"].patterns[classifier_name][pattern_kind]) != lengths[pattern_kind]:
                # BUG FIX: the format arguments must be a tuple; the original
                # applied "%" to pattern_kind alone (precedence), which raised
                # TypeError instead of this ValueError.
                raise ValueError(
                    'The length of the %s pattern of classifier %s has different size from others'
                    % (pattern_kind, classifier_name))

    group_successive = context["preprocess"]["random_distribution"]["group_successive"]
    if group_successive:
        # Sampling operates on groups, so scale every length down.
        total_length = int(total_length / group_successive)
        for pattern_kind in lengths:
            lengths[pattern_kind] = int(lengths[pattern_kind] / group_successive)

    dir_name = context["general_path"] + "patterns/" + \
        context["classifiers"][first_classifier]["set"]
    filters = AutoVivification()

    ###Specific kind of sampling###
    #############
    ######BAGGING
    #############
    if "bagging" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["bagging"]["activate"]:
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.bagging(context, filters, lengths, total_length)
        dir_name += "_bagging/"
    #############
    ######PASTING
    #############
    elif "pasting_Rvotes" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["pasting_Rvotes"]["activate"]:
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.pasting_rvotes(context, filters, lengths, total_length)
        dir_name += "_pasting_Rvotes/"
    #################
    #RANDOM SUBSPACES
    #################
    elif "random_subspaces" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["random_subspaces"]["activate"]:
        features_amount = self.check_features_amount(context)
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.random_subspaces(context, filters, features_amount)
        dir_name += "_random_subspaces/"
    #############
    #COMBINATIONS
    #############
    elif "all_features_combination" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["all_features_combination"]["activate"]:
        features_amount = self.check_features_amount(context)
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.all_features_combination(context, filters, features_amount)
        dir_name += "_features_combination/"
        context["preprocess"]["random_distribution"]["number_base_classifiers"] = \
            len(filters["learning"])
    ###############
    #RANDOM PATCHES
    ###############
    elif "random_patches" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["random_patches"]["activate"]:
        dir_name += "_random_patches/"
    ###############
    #K-FOLD
    ###############
    elif "k_fold" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["k_fold"]["activate"]:
        for pattern_kind in context["preprocess"]["random_distribution"]["k_fold"]["percents"]:
            filters[pattern_kind] = []
        self.k_fold(context, filters)
        dir_name += "_k_fold/"
    ###############
    #Forecasting distribution
    ###############
    elif "forecasting_distribution" in context["preprocess"]["random_distribution"] and \
            context["preprocess"]["random_distribution"]["forecasting_distribution"]["activate"]:
        self.forecasting_distribution(context, filters)
        dir_name += "_walking_forward/"

    ###Common functions###
    # BUG FIX: this block was an "elif", which could never execute because
    # the bagging / pasting_Rvotes cases are already captured by the first
    # branches above (dead code). It is now a follow-up "if" so the
    # group_successive expansion is applied to the bagging/pasting filters,
    # as the "Common functions" comment implies.
    if ("bagging" in context["preprocess"]["random_distribution"] and
            context["preprocess"]["random_distribution"]["bagging"]["activate"]) or \
            ("pasting_Rvotes" in context["preprocess"]["random_distribution"] and
             context["preprocess"]["random_distribution"]["pasting_Rvotes"]["activate"]):
        if group_successive:
            for kind_of in filters:
                for index_filter in filters[kind_of]:
                    # Expand every group index into its group_successive
                    # consecutive original indices.
                    for i in range(len(index_filter)):
                        index_filter[i] = index_filter[i] * group_successive
                        for j in range(1, group_successive):
                            index_filter.append(index_filter[i] + j)

    path_exists(dir_name)
    self._generate_new_patterns_random_distribution(context, filters, dir_name)
def points2series(context):
    """
    Convert a point-by-point CSV time series into a sliding-window
    "series" pattern file.

    Output rows hold ``serie_size`` consecutive points of every selected
    column (suffixed with the offset) plus, outside deployment mode, a
    trailing CLASS column taken ``serie_size`` points ahead.  Emits a
    .csv and/or a .pat file according to "to_csv"/"to_pat".
    """
    import pandas as pd
    from mullpy.auxiliar import csv2pat
    import sys
    import os

    cfg = context["preprocess"]["points2series"]
    serie_points_amount = cfg["serie_size"]
    input_file = cfg["input_file"]
    output_file = cfg["output_file"]
    class_variable = cfg["class_variable"]
    series_limit = cfg["series_limit"]
    # TODO: Add support for multiple class variables. Now classes_len = 1
    classes_len = 1

    defined_features_list = cfg["columns"]
    if defined_features_list == "all":
        input_df = pd.read_csv(input_file)
        defined_features_list = input_df.columns
    else:
        # BUG FIX: work on a copy — the original mutated the configuration
        # list inside context, so each call appended class_variable again.
        defined_features_list = list(defined_features_list) + [class_variable]
        input_df = pd.read_csv(input_file, usecols=defined_features_list)

    # Take only the (series_limit + serie_size) last points of input_df.
    # BUG FIX: series_limit may legitimately be None (the break below
    # guards for it); treat None as "no limit" instead of crashing.
    if series_limit is not None:
        start = len(input_df) - (series_limit + serie_points_amount)
        input_df_last = input_df.iloc[start:].reset_index(drop=True)
    else:
        input_df_last = input_df.reset_index(drop=True)

    # Build the output column names: feature name + offset within the window.
    features_list = []
    for i in range(serie_points_amount):
        for feature in defined_features_list:
            features_list.append("%s_%d" % (feature.upper(), i))
    # Adding the last column, that is the class variable.
    if "deployment" not in context["execution_kind"]:
        features_list.append("%s_%s" % (class_variable.upper(), "CLASS"))

    output_df = pd.DataFrame(columns=features_list, dtype=np.float32)

    if "deployment" not in context["execution_kind"]:
        iteration = range(len(input_df_last) - serie_points_amount)
    else:
        # Deployment windows are shifted by one: there is no class column.
        iteration = range(1, len(input_df_last) - serie_points_amount + 1)

    for i in iteration:
        # Percentage completed.
        if "deployment" not in context["execution_kind"]:
            sys.stdout.write("\r{0}".format(
                "Loaded:%f%%" % (i * 100 / (len(input_df_last) - serie_points_amount))))
            sys.stdout.flush()
        # Build each row in a numpy buffer to optimize the performance.
        row = np.zeros((1, len(features_list)), dtype=np.float32)
        z = 0
        for j in range(serie_points_amount):
            for column in defined_features_list:
                # We have to test if the exchange value was correctly given
                # (between 1 and 2 in those dates)
                row[0, z] = input_df_last.iloc[i + j][column]
                z += 1
        if "deployment" not in context["execution_kind"]:
            row[0, z] = PreProcess.check_eurusd_values(
                input_df_last[class_variable][i + serie_points_amount])
        output_df.loc[i] = row
        # Check the variable series_limit and break once the amount of
        # rows was reached.
        if series_limit is not None and i + 1 >= series_limit:
            break

    # Building csv and pat files.
    file_name = output_file + ".csv"
    path_exists(os.path.dirname(file_name))
    output_df.to_csv(file_name, index=False)
    if cfg["to_pat"]:
        csv2pat(file_name, classes_len)
    if not cfg["to_csv"]:
        os.remove(file_name)

    # Displaying info.
    serie_name = output_file[output_file.rfind("/") + 1:]
    serie_path = output_file[:output_file.rfind("/")]
    if "deployment" not in context["execution_kind"]:
        print("\n%s pattern files built at %s" % (serie_name, serie_path))
def random_distribution(self, context):
    """
    Draw random subsets of the training set and write them out as new
    pattern files, dispatching on the configured sampling flavour.

    Bagging methods come in many flavours but mostly differ from each other
    by the way they draw random subsets of the training set:
     -When random subsets of the dataset are drawn as random subsets of the
      samples, then this algorithm is known as Pasting Rvotes.
     -When samples are drawn with replacement, then the method is known as
      Bagging.
     -When random subsets of the dataset are drawn as random subsets of the
      features, then the method is known as Random Subspaces.
     -When base estimators are built on subsets of both samples and features,
      then the method is known as Random Patches.

    group_successive groups each X instances; grouped instances must stay
    together through the sampling process.
    """
    rd_cfg = context["preprocess"]["random_distribution"]
    first_classifier = context["classifier_list"][0]

    total_length = 0
    lengths = AutoVivification()
    for pattern_kind in context["patterns"].patterns[first_classifier]:
        lengths[pattern_kind] = len(context["patterns"].patterns[first_classifier][pattern_kind])
        total_length += lengths[pattern_kind]

    # Check that the patterns of every classifier have the same size.
    for classifier_name in context["classifier_list"]:
        for pattern_kind in context["patterns"].patterns[classifier_name]:
            if len(context["patterns"].patterns[classifier_name][pattern_kind]) != lengths[pattern_kind]:
                # BUG FIX: format args must be a tuple; the original passed
                # only pattern_kind to "%" (two %s placeholders, one value),
                # which raised TypeError instead of this ValueError.
                raise ValueError(
                    'The length of the %s pattern of classifier %s has different size from others'
                    % (pattern_kind, classifier_name))

    group_successive = rd_cfg["group_successive"]
    if group_successive:
        # Sampling works on groups of successive instances, so shrink lengths.
        total_length = int(total_length / group_successive)
        for pattern_kind in lengths:
            lengths[pattern_kind] = int(lengths[pattern_kind] / group_successive)

    dir_name = context["general_path"] + "patterns/" + \
        context["classifiers"][first_classifier]["set"]
    filters = AutoVivification()

    ###Specific kind of sampling###
    #############
    ######BAGGING
    #############
    if "bagging" in rd_cfg and rd_cfg["bagging"]["activate"]:
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.bagging(context, filters, lengths, total_length)
        dir_name += "_bagging/"
    #############
    ######PASTING
    #############
    elif "pasting_Rvotes" in rd_cfg and rd_cfg["pasting_Rvotes"]["activate"]:
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.pasting_rvotes(context, filters, lengths, total_length)
        dir_name += "_pasting_Rvotes/"
    #################
    #RANDOM SUBSPACES
    #################
    elif "random_subspaces" in rd_cfg and rd_cfg["random_subspaces"]["activate"]:
        features_amount = self.check_features_amount(context)
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.random_subspaces(context, filters, features_amount)
        dir_name += "_random_subspaces/"
    #############
    #COMBINATIONS
    #############
    elif "all_features_combination" in rd_cfg and rd_cfg["all_features_combination"]["activate"]:
        features_amount = self.check_features_amount(context)
        for pattern_kind in context["patterns_texts"]:
            filters[pattern_kind] = []
        self.all_features_combination(context, filters, features_amount)
        dir_name += "_features_combination/"
        rd_cfg["number_base_classifiers"] = len(filters["learning"])
    ###############
    #RANDOM PATCHES
    ###############
    elif "random_patches" in rd_cfg and rd_cfg["random_patches"]["activate"]:
        dir_name += "_random_patches/"
    ###############
    #K-FOLD
    ###############
    elif "k_fold" in rd_cfg and rd_cfg["k_fold"]["activate"]:
        for pattern_kind in rd_cfg["k_fold"]["percents"]:
            filters[pattern_kind] = []
        self.k_fold(context, filters)
        dir_name += "_k_fold/"
    ###############
    #Forecasting distribution
    ###############
    elif "forecasting_distribution" in rd_cfg and rd_cfg["forecasting_distribution"]["activate"]:
        self.forecasting_distribution(context, filters)
        dir_name += "_walking_forward/"

    ###Common functions###
    # BUG FIX: this was an unreachable "elif" — bagging / pasting_Rvotes are
    # consumed by the first branches of the chain, so this "common" expansion
    # never ran. It is now a follow-up "if" so group_successive expansion is
    # applied to the bagging/pasting filters, as the comment implies.
    if ("bagging" in rd_cfg and rd_cfg["bagging"]["activate"]) or \
            ("pasting_Rvotes" in rd_cfg and rd_cfg["pasting_Rvotes"]["activate"]):
        if group_successive:
            for kind_of in filters:
                for index_filter in filters[kind_of]:
                    # Expand each group index to its consecutive member indices.
                    for i in range(len(index_filter)):
                        index_filter[i] = index_filter[i] * group_successive
                        for j in range(1, group_successive):
                            index_filter.append(index_filter[i] + j)

    path_exists(dir_name)
    self._generate_new_patterns_random_distribution(context, filters, dir_name)