def reconstructRules(source):

    #print("Reconstructing ", source)

    file_name = source.split(".json")[0]
    file_name = file_name + ".py"

    functions.createFile(file_name, "#This rule was reconstructed from " + source + "\n")

    with open(source, 'r') as f:
        rules = json.load(f)
    #print(rules)

    def padleft(rule, level):
        for i in range(0, level):
            rule = "\t" + rule
        return rule

    #print("def findDecision(obj):")

    max_level = 0

    rule_set = []
    #json file might not store rules respectively
    for instance in rules:
        if len(instance) > 0:
            rule = []
            rule.append(instance["current_level"])
            rule.append(instance["leaf_id"])
            rule.append(instance["parents"])
            rule.append(instance["rule"])
            rule_set.append(rule)
            #print(padleft(instance["rule"], instance["current_level"]))

    df = np.array(rule_set)

    def extractRules(df, parent='root', level=1):
        level_raw = level * 1
        parent_raw = copy.copy(parent)

        for i in range(0, df.shape[0]):
            leaf_id = df[i][1]
            parent_id = df[i][2]
            rule = df[i][3]

            if parent_id == parent:
                functions.storeRule(file_name, padleft(rule, level))

                level = level + 1
                parent = copy.copy(leaf_id)
                extractRules(df, parent, level)
                level = level_raw * 1
                parent = copy.copy(parent_raw) #restore

    functions.storeRule(file_name, "def findDecision(obj):")
    extractRules(df)
def apply(df, config, header, dataset_features, validation_df = None):

    models = []

    num_of_trees = config['num_of_trees']

    parallelism_on = config["enableParallelism"]

    #TODO: is this logical for 48x2 cores?
    #config["enableParallelism"] = False #run each tree in parallel but each branch in serial

    #TODO: reconstruct for parallel run is problematic. you should reconstruct based on tree id.

    input_params = []

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for i in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (i+1))
        subset = df.sample(frac=1/num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_"+str(i)
        file = moduleName+".py"

        functions.createFile(file, header)

        if parallelism_on: #parallel run
            input_params.append((subset, root, file, config, dataset_features, 0, 0, 'root', i))
        else: #serial run
            Training.buildDecisionTree(subset, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', tree_id = i)

    #-------------------------------

    if parallelism_on:
        num_cores = config["num_cores"]
        pool = Training.MyPool(num_cores)
        results = pool.starmap(buildDecisionTree, input_params)
        pool.close()
        pool.join()

    #-------------------------------
    #collect models for both serial and parallel here

    for i in range(0, num_of_trees):
        moduleName = "outputs/rules/rule_"+str(i)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

    #-------------------------------

    return models
def apply(df, config, header, dataset_features, validation_df=None):

    models = []

    num_of_trees = config['num_of_trees']

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for i in pbar:
        #for i in range(0, num_of_trees):
        pbar.set_description("Sub decision tree %d is processing" % (i + 1))
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_" + str(i)
        file = moduleName + ".py"
        json_file = moduleName + ".json"

        functions.createFile(file, header)
        functions.createFile(json_file, "[\n")

        Training.buildDecisionTree(subset, root, file, config, dataset_features, parent_level=0, leaf_id=0, parents='root')

        functions.storeRule(json_file, "{}]")

        #--------------------------------

        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

    #-------------------------------

    return models
def reconstructRules(source, feature_names):

    #print("Reconstructing ", source)

    file_name = source.split(".json")[0]
    file_name = file_name+".py"

    #-----------------------------------

    constructor = "def findDecision(obj): #"
    idx = 0
    for feature in feature_names:
        constructor = constructor + "obj["+str(idx)+"]: "+feature
        if idx < len(feature_names) - 1:
            constructor = constructor+", "
        idx = idx + 1

    functions.createFile(file_name, constructor+"\n")

    #-----------------------------------

    with open(source, 'r') as f:
        rules = json.load(f)
    #print(rules)

    def padleft(rule, level):
        for i in range(0, level):
            rule = "\t"+rule
        return rule

    #print("def findDecision(obj):")

    max_level = 0

    rule_set = []
    #json file might not store rules respectively
    for instance in rules:
        if len(instance) > 0:
            rule = []
            rule.append(instance["current_level"])
            rule.append(instance["leaf_id"])
            rule.append(instance["parents"])
            rule.append(instance["rule"])
            rule.append(instance["feature_name"])
            rule.append(instance["instances"])
            rule.append(instance["metric"])
            rule.append(instance["return_statement"])
            rule_set.append(rule)
            #print(padleft(instance["rule"], instance["current_level"]))

    df = np.array(rule_set)

    def extractRules(df, parent = 'root', level=1):
        level_raw = level * 1
        parent_raw = copy.copy(parent)

        else_rule = ""

        leaf_idx = 0
        for i in range(0, df.shape[0]):
            current_level = int(df[i][0])
            leaf_id = df[i][1]
            parent_id = df[i][2]
            rule = df[i][3]
            feature_name = df[i][4]
            instances = int(df[i][5])
            metric = float(df[i][6])
            return_statement = int(df[i][7])

            if parent_id == parent:

                if_statement = False
                if rule[0:2] == "if":
                    if_statement = True

                else_statement = False
                if rule[0:5] == "else:":
                    else_statement = True
                    else_rule = rule

                #------------------------

                if else_statement != True:

                    if if_statement == True and leaf_idx > 0:
                        rule = "el"+rule

                    #print(padleft(rule, level), "(", leaf_idx, ")")

                    if leaf_idx == 0 and return_statement == 0:
                        explainer = {}
                        explainer["feature"] = feature_name
                        explainer["instances"] = instances
                        explainer["metric_value"] = round(metric, 4)
                        explainer["depth"] = current_level
                        explainer = "# "+json.dumps(explainer)
                        functions.storeRule(file_name, padleft(explainer, level))

                    functions.storeRule(file_name, padleft(rule, level))

                    level = level + 1
                    parent = copy.copy(leaf_id)
                    extractRules(df, parent, level)
                    level = level_raw * 1
                    parent = copy.copy(parent_raw) #restore

                    leaf_idx = leaf_idx + 1

        #add else statement

        if else_rule != "":
            #print(padleft(else_rule, level))
            functions.storeRule(file_name, padleft(else_rule, level))

    #------------------------------------

    extractRules(df)
def buildDecisionTree(df, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', validation_df = None):

    models = []

    feature_names = df.columns[0:-1]

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0]+".json"

    if root == 1:
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            raw_df = df.copy()

    #--------------------------------------

    df_copy = df.copy()

    winner_name, num_of_instances, metric, metric_name = findDecision(df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns-1):
        column_name = df.columns[i]
        column_type = df[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    #-----------------------------------------------------

    num_cores = config["num_cores"]

    input_params = []

    #serial approach
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:

            if i == 0:
                #descriptor = "# Feature: "+winner_name+", Instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4))
                descriptor = {
                    "feature": winner_name,
                    "instances": num_of_instances,
                    #"metric_name": metric_name,
                    "metric_value": round(metric, 4),
                    "depth": parent_level + 1
                }
                descriptor = "# "+json.dumps(descriptor)

                functions.storeRule(file, (functions.formatRule(root), "", descriptor))

            createBranch(config, current_class, subdataset, numericColumn, branch_index, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric)
        else:
            input_params.append((config, current_class, subdataset, numericColumn, branch_index, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric))

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object': #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns = {"Decision": "Instances", "index": "Decision"})
        pivot = pivot.sort_values(by = ["Instances"], ascending = False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(file, (functions.formatRule(root+1), else_decision))
        else: #parallelism
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"

            check_rule = "else: "+else_decision

            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["feature_idx"] = -1
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = df.shape[0]
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 0

            #json to string
            sample_rule = json.dumps(sample_rule)

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    else: #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(file, (functions.formatRule(root+1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"

            check_rule = "else: "+else_decision

            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": "+str(root)+",\n"
            sample_rule += "      \"leaf_id\": \""+str(leaf_id)+"\",\n"
            sample_rule += "      \"parents\": \""+parents+"\",\n"
            sample_rule += "      \"rule\": \""+check_rule+"\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    #---------------------------

    #create branches in parallel
    if enableParallelism == True:
        """
        #this usage causes trouble for recursive functions
        with Pool(number_of_cpus) as pool:
            pool.starmap(createBranch, input_params)
        """

        pool = MyPool(num_cores)
        results = pool.starmap(createBranch, input_params)
        pool.close()
        pool.join()

    #---------------------------------------------

    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in .txt files. merge them all in a json file

            functions.createFile(json_file, "[\n")

            custom_rules = []

            file_index = 0
            for file in os.listdir(os.getcwd()+"/outputs/rules"):
                if file.endswith(".txt"):
                    custom_rules.append(os.getcwd()+"/outputs/rules/"+file)
                    #print(file) #this file stores a custom rule
                    f = open(os.getcwd()+"/outputs/rules/"+file, "r")
                    custom_rule = f.read()

                    if file_index > 0:
                        custom_rule = ", "+custom_rule

                    functions.storeRule(json_file, custom_rule)
                    f.close()
                    file_index = file_index + 1

            functions.storeRule(json_file, "]")

            #-----------------------------------

            #custom rules are already merged in a json file. clear messy custom rules
            #TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule.

            for file in custom_rules:
                os.remove(file)

            #-----------------------------------

            reconstructRules(json_file, feature_names)

            #feature importance should be calculated by demand?
            feature_importance(json_file, dataset_features)

            #-----------------------------------

        #is regular decision tree
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            #this is regular decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
            models.append(myrules)

    return models
def createBranch(config, current_class, subdataset, numericColumn, branch_index, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric):

    algorithm = config['algorithm']
    enableAdaboost = config['enableAdaboost']
    enableGBM = config['enableGBM']
    max_depth = config['max_depth']
    enableParallelism = config['enableParallelism']

    charForResp = "'"
    if algorithm == 'Regression':
        charForResp = ""

    #---------------------------

    json_file = file.split(".")[0]+".json"

    tmp_root = root * 1
    parents_raw = copy.copy(parents)

    #---------------------------

    if numericColumn == True:
        compareTo = current_class #current class might be <=x or >x in this case
    else:
        compareTo = " == '"+str(current_class)+"'"

    #print(subdataset)

    terminateBuilding = False

    #-----------------------------------------------
    #can decision be made?

    if enableGBM == True and root >= max_depth: #max depth
        final_decision = subdataset['Decision'].mean()
        terminateBuilding = True
    elif enableAdaboost == True:
        #final_decision = subdataset['Decision'].value_counts().idxmax()
        final_decision = functions.sign(subdataset['Decision'].mean()) #get average
        terminateBuilding = True
        enableParallelism = False
    elif len(subdataset['Decision'].value_counts().tolist()) == 1:
        final_decision = subdataset['Decision'].value_counts().keys().tolist()[0] #all items are equal in this case
        terminateBuilding = True
    elif subdataset.shape[1] == 1: #if decision cannot be made even though all columns dropped
        final_decision = subdataset['Decision'].value_counts().idxmax() #get the most frequent one
        terminateBuilding = True
    elif algorithm == 'Regression' and subdataset.shape[0] < 5: #pruning condition
    #elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition
        final_decision = subdataset['Decision'].mean() #get average
        terminateBuilding = True

    #-----------------------------------------------

    if enableParallelism == True:
        check_condition = "if" #TODO: elif checks might come before if statements in parallel
    else:
        if branch_index == 0:
            check_condition = "if"
        else:
            check_condition = "elif"

    check_rule = check_condition+" obj["+str(winner_index)+"]"+compareTo+":"

    leaf_id = str(uuid.uuid1())
    custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"

    if enableParallelism != True:
        #check_rule += " # feature: "+winner_name+", instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4))
        functions.storeRule(file, (functions.formatRule(root), "", check_rule))
    else:
        sample_rule = {}
        sample_rule["current_level"] = root
        sample_rule["leaf_id"] = leaf_id
        sample_rule["parents"] = parents
        sample_rule["rule"] = check_rule
        sample_rule["feature_idx"] = winner_index
        sample_rule["feature_name"] = winner_name
        sample_rule["instances"] = num_of_instances
        sample_rule["metric"] = metric
        sample_rule["return_statement"] = 0

        #json to string
        sample_rule = json.dumps(sample_rule)

        functions.createFile(custom_rule_file, "")
        functions.storeRule(custom_rule_file, sample_rule)

    #-----------------------------------------------

    if terminateBuilding == True: #check decision is made

        parents = copy.copy(leaf_id)
        leaf_id = str(uuid.uuid1())

        decision_rule = "return "+charForResp+str(final_decision)+charForResp

        if enableParallelism != True: #serial
            functions.storeRule(file, (functions.formatRule(root+1), decision_rule))
        else: #parallel
            sample_rule = {}
            sample_rule["current_level"] = root+1
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = decision_rule
            sample_rule["feature_idx"] = winner_index
            sample_rule["feature_name"] = winner_name
            sample_rule["instances"] = num_of_instances
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 1

            #json to string
            sample_rule = ", "+json.dumps(sample_rule)

            functions.storeRule(custom_rule_file, sample_rule)

    else: #decision is not made, continue to create branch and leafs
        root = root + 1 #the following rule will be included by this rule. increase root
        parents = copy.copy(leaf_id)

        buildDecisionTree(subdataset, root, file, config, dataset_features, root-1, leaf_id, parents)

        root = tmp_root * 1
        parents = copy.copy(parents_raw)
def reconstructRules(source):

    #print("Reconstructing ", source)

    file_name = source.split(".json")[0]
    file_name = file_name + ".py"

    functions.createFile(file_name, "#This rule was reconstructed from " + source + "\n")

    with open(source, 'r') as f:
        rules = json.load(f)
    #print(rules)

    def padleft(rule, level):
        for i in range(0, level):
            rule = "\t" + rule
        return rule

    #print("def findDecision(obj):")

    max_level = 0

    rule_set = []
    #json file might not store rules respectively
    for instance in rules:
        if len(instance) > 0:
            rule = []
            rule.append(instance["current_level"])
            rule.append(instance["leaf_id"])
            rule.append(instance["parents"])
            rule.append(instance["rule"])
            rule_set.append(rule)
            #print(padleft(instance["rule"], instance["current_level"]))

    df = np.array(rule_set)

    def extractRules(df, parent='root', level=1):
        level_raw = level * 1
        parent_raw = copy.copy(parent)

        else_rule = ""

        leaf_idx = 0
        for i in range(0, df.shape[0]):
            leaf_id = df[i][1]
            parent_id = df[i][2]
            rule = df[i][3]

            if parent_id == parent:

                if_statement = False
                if rule[0:2] == "if":
                    if_statement = True

                else_statement = False
                if rule[0:5] == "else:":
                    else_statement = True
                    else_rule = rule

                #------------------------

                if else_statement != True:

                    if if_statement == True and leaf_idx > 0:
                        rule = "el" + rule

                    #print(padleft(rule, level), "(", leaf_idx, ")")

                    functions.storeRule(file_name, padleft(rule, level))

                    level = level + 1
                    parent = copy.copy(leaf_id)
                    extractRules(df, parent, level)
                    level = level_raw * 1
                    parent = copy.copy(parent_raw) #restore

                    leaf_idx = leaf_idx + 1

        #add else statement

        if else_rule != "":
            #print(padleft(else_rule, level))
            functions.storeRule(file_name, padleft(else_rule, level))

    #------------------------------------

    #print("def findDecision(obj):")
    functions.storeRule(file_name, "def findDecision(obj):")
    extractRules(df)
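#--------------------------------------------------------------------------
#Illustrative note (not part of the library): reconstructRules consumes a flat
#JSON list of rule objects shaped like the sample_rule dictionaries built in
#createBranch/buildDecisionTree. The leaf ids below are hypothetical, e.g.
#  [{"current_level": 1, "leaf_id": "uuid-1", "parents": "root", "rule": "if obj[0] == 'Sunny':"},
#   {"current_level": 2, "leaf_id": "uuid-2", "parents": "uuid-1", "rule": "return 'Yes'"},
#   {"current_level": 1, "leaf_id": "uuid-3", "parents": "root", "rule": "else: return 'No'"}]
#extractRules walks that list recursively by parent id and would emit:
#  def findDecision(obj):
#      if obj[0] == 'Sunny':
#          return 'Yes'
#      else: return 'No'
#--------------------------------------------------------------------------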
def buildDecisionTree(df, root, file, config, dataset_features, parent_level=0, leaf_id=0, parents='root'):

    models = []

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0] + ".json"

    if root == 1:
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            raw_df = df.copy()

    #--------------------------------------

    df_copy = df.copy()

    winner_name = findDecision(df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        column_name = df.columns[i]
        column_type = df[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    #-----------------------------------------------------

    #TO-DO: you should specify the number of cores in config
    num_cores = int(multiprocessing.cpu_count() / 2) #allocate half of your total cores

    input_params = []

    #serial approach
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:
            createBranch(config, current_class, subdataset, numericColumn, branch_index, winner_index, root, parents, file, dataset_features)
        else:
            input_params.append((config, current_class, subdataset, numericColumn, branch_index, winner_index, root, parents, file, dataset_features))

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object': #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={"Decision": "Instances", "index": "Decision"})
        pivot = pivot.sort_values(by=["Instances"], ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(file, (functions.formatRule(root + 1), else_decision))
        else: #parallelism
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"
            check_rule = "else: " + else_decision

            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": " + str(root) + ",\n"
            sample_rule += "      \"leaf_id\": \"" + str(leaf_id) + "\",\n"
            sample_rule += "      \"parents\": \"" + parents + "\",\n"
            sample_rule += "      \"rule\": \"" + check_rule + "\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    else: #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"
            check_rule = "else: " + else_decision

            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": " + str(root) + ",\n"
            sample_rule += "      \"leaf_id\": \"" + str(leaf_id) + "\",\n"
            sample_rule += "      \"parents\": \"" + parents + "\",\n"
            sample_rule += "      \"rule\": \"" + check_rule + "\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    #---------------------------

    #create branches in parallel
    if enableParallelism == True:
        """
        #this usage causes trouble for recursive functions
        with Pool(number_of_cpus) as pool:
            pool.starmap(createBranch, input_params)
        """

        pool = MyPool(num_cores)
        results = pool.starmap(createBranch, input_params)
        pool.close()
        pool.join()

    #---------------------------------------------

    #calculate accuracy metrics
    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in .txt files. merge them all in a json file

            functions.createFile(json_file, "[\n")

            custom_rules = []

            file_index = 0
            for file in os.listdir(os.getcwd() + "/outputs/rules"):
                if file.endswith(".txt"):
                    custom_rules.append(os.getcwd() + "/outputs/rules/" + file)
                    #print(file) #this file stores a custom rule
                    f = open(os.getcwd() + "/outputs/rules/" + file, "r")
                    custom_rule = f.read()

                    if file_index > 0:
                        custom_rule = ", " + custom_rule

                    functions.storeRule(json_file, custom_rule)
                    f.close()
                    file_index = file_index + 1

            functions.storeRule(json_file, "]")

            #-----------------------------------

            #custom rules are already merged in a json file. clear messy custom rules
            #TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule.

            for file in custom_rules:
                os.remove(file)

            #-----------------------------------

            reconstructRules(json_file)

            #-----------------------------------

        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            #this is regular decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
            models.append(myrules)

            num_of_features = df.shape[1] - 1
            instances = df.shape[0]
            classified = 0
            mae = 0
            mse = 0

            #instead of for loops, pandas functions perform well
            raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)

            if algorithm != 'Regression':
                idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index

                #raw_df['Classified'] = 0
                #raw_df.loc[idx, 'Classified'] = 1
                #print(raw_df)

                accuracy = 100 * len(idx) / instances
                print("Accuracy: ", accuracy, "% on ", instances, " instances")
            else:
                raw_df['Absolute_Error'] = abs(raw_df['Prediction'] - raw_df['Decision'])
                raw_df['Absolute_Error_Squared'] = raw_df['Absolute_Error'] * raw_df['Absolute_Error']

                #print(raw_df)

                mae = raw_df['Absolute_Error'].sum() / instances
                print("MAE: ", mae)

                mse = raw_df['Absolute_Error_Squared'].sum() / instances
                rmse = math.sqrt(mse)
                print("RMSE: ", rmse)

                mean = raw_df['Decision'].mean()
                print("Mean: ", mean)

                if mean > 0:
                    print("MAE / Mean: ", 100 * mae / mean, "%")
                    print("RMSE / Mean: ", 100 * rmse / mean, "%")

    return models
def fit(df, config):

    target_label = df.columns[len(df.columns)-1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError('Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame')

    #------------------------
    #handle NaN values

    nan_values = []

    for column in df.columns:
        if df[column].dtypes != 'object':
            min_value = df[column].min()
            idx = df[df[column].isna()].index

            nan_value = []
            nan_value.append(column)

            if idx.shape[0] > 0:
                df.loc[idx, column] = min_value - 1
                nan_value.append(min_value - 1)
                #print("NaN values are replaced to ", min_value - 1, " in column ", column)
            else:
                nan_value.append(None)

            nan_values.append(nan_value)

    #------------------------
    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()
    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        raise ValueError('Invalid algorithm passed. You passed ', algorithm, " but valid algorithms are ", valid_algorithms)

    #------------------------

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking'] #no longer used. check to remove this variable.

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    enableParallelism = config['enableParallelism']

    #this will handle basic decision stumps. parallelism is not required.
    if enableRandomForest == True:
        config['enableParallelism'] = False
        enableParallelism = False

    #------------------------

    raw_df = df.copy()
    num_of_rows = df.shape[0]
    num_of_columns = df.shape[1]

    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError('Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.')

    if df['Decision'].dtypes != 'object': #this must be regression tree even if it is not mentioned in algorithm
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)

    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        #enableParallelism = False
        for j in range(0, num_of_columns):
            column_name = df.columns[j]
            if df[column_name].dtypes == 'object':
                raise ValueError('Adaboost must be run on numeric data set for both features and target')

    #-------------------------

    print(algorithm, " tree is going to be built...")

    dataset_features = dict() #initialize a dictionary. this is going to be used to check features numeric or nominal. numeric features should be transformed to nominal values based on scales.

    header = "def findDecision(obj): #"

    num_of_columns = df.shape[1]-1
    for i in range(0, num_of_columns):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        header = header + "obj[" + str(i) + "]: " + column_name

        if i != num_of_columns - 1:
            header = header + ", "

    header = header + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df, config, header, dataset_features)
    elif enableGBM == True:
        if df['Decision'].dtypes == 'object': #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header, dataset_features)
            classification = True
        else: #regression
            trees = gbm.regressor(df, config, header, dataset_features)
            classification = False
    elif enableRandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features)
    else: #regular decision tree building
        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)

        if enableParallelism == True:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df, root, file, config, dataset_features, 0, 0, 'root')

    print("finished in ", time.time() - begin, " seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    return obj
def regressor(df, config, header, dataset_features, validation_df = None, process_id = None):
    models = []

    #we will update decisions in every epoch, this will be used to restore
    base_actuals = df.Decision.values

    algorithm = config['algorithm']

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    #------------------------------

    boosted_from = 0
    boosted_to = 0

    #------------------------------

    base_df = df.copy() #gbm will manipulate actuals. store its raw version.

    target_values = base_df['Decision'].values
    num_of_instances = target_values.shape[0]

    root = 1
    file = "outputs/rules/rules0.py"
    json_file = "outputs/rules/rules0.json"
    functions.createFile(file, header)
    functions.createFile(json_file, "[\n")

    Training.buildDecisionTree(df, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root') #generate rules0

    #functions.storeRule(json_file, " {}]")

    df = base_df.copy()

    base_df['Boosted_Prediction'] = 0

    #------------------------------

    best_epoch_idx = 0
    best_epoch_loss = 1000000

    pbar = tqdm(range(1, epochs+1), desc='Boosting')

    #for index in range(1, epochs+1):
    #for index in tqdm(range(1, epochs+1), desc='Boosting'):
    for index in pbar:
        #print("epoch ", index, " - ", end='')
        loss = 0

        #run data(i-1) and rules(i-1), save data1

        #dynamic import
        moduleName = "outputs/rules/rules%s" % (index-1)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description) #rules0

        models.append(myrules)

        new_data_set = "outputs/data/data%s.csv" % (index)
        f = open(new_data_set, "w")

        #put header in the following file
        columns = df.shape[1]

        mae = 0

        #----------------------------------------

        df['Epoch'] = index
        df['Prediction'] = df.apply(findPrediction, axis=1)

        base_df['Boosted_Prediction'] += df['Prediction']

        loss = (base_df['Boosted_Prediction'] - base_df['Decision']).pow(2).sum()
        current_loss = loss / num_of_instances #mse

        if index == 1:
            boosted_from = current_loss * 1
        elif index == epochs:
            boosted_to = current_loss * 1

        if current_loss < best_epoch_loss:
            best_epoch_loss = current_loss * 1
            best_epoch_idx = index * 1

        df['Decision'] = int(learning_rate)*(df['Decision'] - df['Prediction'])
        df = df.drop(columns = ['Epoch', 'Prediction'])

        #---------------------------------

        df.to_csv(new_data_set, index=False)
        #data(i) created

        #---------------------------------

        file = "outputs/rules/rules"+str(index)+".py"
        json_file = "outputs/rules/rules"+str(index)+".json"

        functions.createFile(file, header)
        functions.createFile(json_file, "[\n")

        current_df = df.copy()
        Training.buildDecisionTree(df, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', main_process_id = process_id)
        #functions.storeRule(json_file, " {}]")
        df = current_df.copy() #numeric features require this restoration to apply findDecision function

        #rules(i) created

        loss = loss / num_of_instances
        #print("epoch ", index, " - loss: ", loss)
        #print("loss: ", loss)
        pbar.set_description("Epoch %d. Loss: %d. Process: " % (index, loss))
        gc.collect()

    #---------------------------------

    print("The best epoch is ", best_epoch_idx, " with ", best_epoch_loss, " loss value")
    models = models[0:best_epoch_idx]
    config["epochs"] = best_epoch_idx

    print("MSE of ", num_of_instances, " instances are boosted from ", boosted_from, " to ", best_epoch_loss, " in ", epochs, " epochs")

    return models
def regressor(df, config, header, dataset_features):

    models = []

    algorithm = config['algorithm']

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    #------------------------------

    boosted_from = 0
    boosted_to = 0

    #------------------------------

    base_df = df.copy() #gbm will manipulate actuals. store its raw version.

    target_values = base_df['Decision'].values
    num_of_instances = target_values.shape[0]

    root = 1
    file = "outputs/rules/rules0.py"
    functions.createFile(file, header)

    Training.buildDecisionTree(df, root, file, config, dataset_features) #generate rules0

    df = base_df.copy()

    base_df['Boosted_Prediction'] = 0

    #------------------------------

    pbar = tqdm(range(1, epochs + 1), desc='Boosting')

    #for index in range(1, epochs+1):
    #for index in tqdm(range(1, epochs+1), desc='Boosting'):
    for index in pbar:
        #print("epoch ", index, " - ", end='')
        loss = 0

        #run data(i-1) and rules(i-1), save data1

        #dynamic import
        moduleName = "outputs/rules/rules%s" % (index - 1)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description) #rules0

        models.append(myrules)

        new_data_set = "outputs/data/data%s.csv" % (index)
        f = open(new_data_set, "w")

        #put header in the following file
        columns = df.shape[1]

        mae = 0

        #----------------------------------------

        df['Epoch'] = index
        df['Prediction'] = df.apply(findPrediction, axis=1)

        base_df['Boosted_Prediction'] += df['Prediction']

        loss = (base_df['Boosted_Prediction'] - base_df['Decision']).pow(2).sum()

        if index == 1:
            boosted_from = loss / num_of_instances
        elif index == epochs:
            boosted_to = loss / num_of_instances

        df['Decision'] = int(learning_rate) * (df['Decision'] - df['Prediction'])
        df = df.drop(columns=['Epoch', 'Prediction'])

        #---------------------------------

        df.to_csv(new_data_set, index=False)
        #data(i) created

        #---------------------------------

        file = "outputs/rules/rules" + str(index) + ".py"

        functions.createFile(file, header)

        current_df = df.copy()
        Training.buildDecisionTree(df, root, file, config, dataset_features)
        df = current_df.copy() #numeric features require this restoration to apply findDecision function

        #rules(i) created

        loss = loss / num_of_instances
        #print("epoch ", index, " - loss: ", loss)
        #print("loss: ", loss)
        pbar.set_description("Epoch %d. Loss: %d. Process: " % (index, loss))

    #---------------------------------

    print(num_of_instances, " instances are boosted from ", boosted_from, " to ", boosted_to, " in ", epochs, " epochs")

    return models
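#--------------------------------------------------------------------------
#Minimal standalone sketch (not chefboost code, illustration only) of the
#residual-fitting loop the regressor above implements with learning_rate = 1:
#a fitted tree is stood in for by the mean of the current residuals, the
#Boosted_Prediction column becomes `boosted`, and the rewritten Decision
#column becomes `residual`.
def _gbm_regression_sketch():
    import numpy as np

    y = np.array([10.0, 12.0, 8.0]) #actual Decision values
    boosted = np.zeros_like(y)      #accumulated Boosted_Prediction
    residual = y.copy()             #Decision column, rewritten every epoch

    for epoch in range(3):
        tree_output = np.full_like(y, residual.mean()) #stand-in for rules(i)
        boosted += tree_output                         #accumulate predictions
        residual = residual - tree_output              #next epoch fits this
        print(epoch, ((boosted - y) ** 2).mean())      #mse per epoch, as above
#--------------------------------------------------------------------------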
def classifier(df, config, header, dataset_features):

    models = []

    print("gradient boosting for classification")

    epochs = config['epochs']

    temp_df = df.copy()
    original_dataset = df.copy()
    worksheet = df.copy()

    classes = df['Decision'].unique()

    boosted_predictions = np.zeros([df.shape[0], len(classes)])

    pbar = tqdm(range(0, epochs), desc='Boosting')

    #store actual set, we will use this to calculate loss
    actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]), columns=classes)
    for i in range(0, len(classes)):
        current_class = classes[i]
        actual_set[current_class] = np.where(df['Decision'] == current_class, 1, 0)
    actual_set = actual_set.values #transform it to numpy array

    #for epoch in range(0, epochs):
    for epoch in pbar:
        for i in range(0, len(classes)):
            current_class = classes[i]

            if epoch == 0:
                temp_df['Decision'] = np.where(df['Decision'] == current_class, 1, 0)
                worksheet['Y_' + str(i)] = temp_df['Decision']
            else:
                temp_df['Decision'] = worksheet['Y-P_' + str(i)]

            predictions = []

            #change data type for decision column
            temp_df[['Decision']].astype('int64')

            root = 1
            file = "outputs/rules/rules-for-" + current_class + "-round-" + str(epoch) + ".py"

            functions.createFile(file, header)

            Training.buildDecisionTree(temp_df, root, file, config, dataset_features)
            #decision rules created
            #----------------------------

            #dynamic import
            moduleName = "outputs/rules/rules-for-" + current_class + "-round-" + str(epoch)
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description) #rules0

            models.append(myrules)

            num_of_columns = df.shape[1]

            for row, instance in df.iterrows():
                features = []
                for j in range(0, num_of_columns - 1): #iterate on features
                    features.append(instance[j])

                actual = temp_df.loc[row]['Decision']
                prediction = myrules.findDecision(features)

                predictions.append(prediction)

            #----------------------------
            if epoch == 0:
                worksheet['F_' + str(i)] = 0
            else:
                worksheet['F_' + str(i)] = pd.Series(predictions).values

            boosted_predictions[:, i] = boosted_predictions[:, i] + worksheet['F_' + str(i)].values.astype(np.float32)
            #print(boosted_predictions[0:5, :])

            worksheet['P_' + str(i)] = 0

        #----------------------------
        temp_df = df.copy() #restoration

        for row, instance in worksheet.iterrows():
            f_scores = []
            for i in range(0, len(classes)):
                f_scores.append(instance['F_' + str(i)])

            probabilities = functions.softmax(f_scores)

            for j in range(0, len(probabilities)):
                instance['P_' + str(j)] = probabilities[j]

            worksheet.loc[row] = instance

        for i in range(0, len(classes)):
            worksheet['Y-P_' + str(i)] = worksheet['Y_' + str(i)] - worksheet['P_' + str(i)]

        prediction_set = np.zeros([df.shape[0], len(classes)])
        for i in range(0, boosted_predictions.shape[0]):
            predicted_index = np.argmax(boosted_predictions[i])
            prediction_set[i][predicted_index] = 1

        #----------------------------
        #find loss for this epoch: prediction_set vs actual_set
        classified = 0
        for i in range(0, actual_set.shape[0]):
            actual = np.argmax(actual_set[i])
            prediction = np.argmax(prediction_set[i])
            #print("actual: ", actual, " - prediction: ", prediction)

            if actual == prediction:
                classified = classified + 1

        accuracy = str(100 * classified / actual_set.shape[0]) + "%"

        #----------------------------
        #print(worksheet.head())
        #print("round ", epoch+1)
        pbar.set_description("Epoch %d. Accuracy: %s. Process: " % (epoch + 1, accuracy))

    return models, classes
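#--------------------------------------------------------------------------
#Standalone sketch (not chefboost code, illustration only) of the one-vs-all
#softmax residual computed above for a single instance: the Y_i columns are
#one-hot actuals, the F_i columns are accumulated tree scores, and Y-P_i is
#the target the next round's trees fit.
def _gbm_classification_sketch():
    import numpy as np

    def softmax(x):
        e = np.exp(x - np.max(x))
        return e / e.sum()

    y = np.array([1.0, 0.0, 0.0])  #one-hot actuals (Y columns)
    f = np.array([0.3, 0.1, -0.2]) #accumulated F scores per class
    p = softmax(f)                 #P columns
    residual = y - p               #Y-P columns: next round's regression target
    print(residual)
#--------------------------------------------------------------------------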
def apply(df, config, header, dataset_features, validation_df = None, process_id = None):

    models = []

    num_of_trees = config['num_of_trees']

    parallelism_on = config["enableParallelism"]

    #TODO: is this logical for 48x2 cores?
    #config["enableParallelism"] = False #run each tree in parallel but each branch in serial

    #TODO: reconstruct for parallel run is problematic. you should reconstruct based on tree id.

    input_params = []

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for i in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (i+1))
        subset = df.sample(frac=1/num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_"+str(i)
        file = moduleName+".py"

        functions.createFile(file, header)

        if parallelism_on: #parallel run
            input_params.append((subset, root, file, config, dataset_features, 0, 0, 'root', i, None, process_id))
        else: #serial run
            Training.buildDecisionTree(subset, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', tree_id = i, main_process_id = process_id)

    #-------------------------------

    if parallelism_on:
        num_cores = config["num_cores"]

        #---------------------------------

        if num_of_trees <= num_cores:
            POOL_SIZE = num_of_trees
        else:
            POOL_SIZE = num_cores

        with closing(multiprocessing.Pool(POOL_SIZE)) as pool:
            funclist = []
            for input_param in input_params:
                f = pool.apply_async(buildDecisionTree, [*input_param])
                funclist.append(f)

            #all functions registered here
            #results = []
            for f in tqdm(funclist):
                branch_results = f.get(timeout = 100000)
                #results.append(branch_results)

            pool.close()
            pool.terminate()

    #-------------------------------
    #collect models for both serial and parallel here

    for i in range(0, num_of_trees):
        moduleName = "outputs/rules/rule_"+str(i)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

    #-------------------------------

    return models
def buildDecisionTree(df, root, file, config, dataset_features, parent_level=0, leaf_id=0, parents='root', tree_id=0, validation_df=None, main_process_id=None):

    models = []

    decision_rules = []

    feature_names = df.columns[0:-1]

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0] + ".json"

    random_forest_enabled = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']

    if root == 1:
        if random_forest_enabled != True and enableGBM != True and enableAdaboost != True:
            raw_df = df.copy()

    #--------------------------------------

    df_copy = df.copy()

    winner_name, num_of_instances, metric, metric_name = findDecision(df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        #column_name = df.columns[i]; column_type = df[column_name].dtypes
        #numeric field already transformed to object. you cannot check it with df itself, you should check df_copy
        column_name = df_copy.columns[i]
        column_type = df_copy[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()
    #print("classes: ", classes, " in ", winner_name)
    #-----------------------------------------------------

    num_cores = config["num_cores"]

    input_params = []

    #serial approach
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:

            if i == 0:
                descriptor = {
                    "feature": winner_name,
                    "instances": num_of_instances,
                    #"metric_name": metric_name,
                    "metric_value": round(metric, 4),
                    "depth": parent_level + 1
                }
                descriptor = "# " + json.dumps(descriptor)

                functions.storeRule(file, (functions.formatRule(root), "", descriptor))

            results = createBranch(config, current_class, subdataset, numericColumn, branch_index, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric, tree_id=tree_id, main_process_id=main_process_id)

            decision_rules = decision_rules + results
        else:
            input_params.append((config, current_class, subdataset, numericColumn, branch_index, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric, tree_id, main_process_id))

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object': #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={"Decision": "Instances", "index": "Decision"})
        pivot = pivot.sort_values(by=["Instances"], ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(file, (functions.formatRule(root + 1), else_decision))
        else: #parallelism
            leaf_id = str(uuid.uuid1())

            check_rule = "else: " + else_decision

            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["feature_idx"] = -1
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = df.shape[0]
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 0
            sample_rule["tree_id"] = tree_id

            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)

    else: #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())

            check_rule = "else: " + else_decision

            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["tree_id"] = tree_id
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = 0
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 1

            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)

    #---------------------------

    try:
        main_process = psutil.Process(main_process_id)
        children = main_process.children(recursive=True)
        active_processes = len(children) + 1 #plus parent
        #active_processes = len(children)
    except:
        active_processes = 100 #set a large initial value

    results = []
    #create branches in parallel
    if enableParallelism == True:

        required_threads = active_processes + len(classes)

        #if parent_level == 0 and random_forest_enabled != True:
        if main_process_id != None and num_cores >= required_threads: #len(classes) branches will be run in parallel

            #POOL_SIZE = num_cores
            POOL_SIZE = len(classes)

            #with closing(multiprocessing.Pool(POOL_SIZE)) as pool:
            with closing(MyPool(POOL_SIZE)) as pool:
                funclist = []
                for input_param in input_params:
                    f = pool.apply_async(createBranchWrapper, [createBranch, input_param])
                    funclist.append(f)

                #all functions registered here
                for f in funclist:
                    branch_results = f.get(timeout=100000)

                    for branch_result in branch_results:
                        results.append(branch_result)

                pool.close()
                pool.terminate()

            #--------------------------------

        else: #serial
            for input_param in input_params:
                sub_results = createBranchWrapper(createBranch, input_param)
                for sub_result in sub_results:
                    results.append(sub_result)

    #--------------------------------

    decision_rules = decision_rules + results

    #--------------------------------

    if root != 1: #return children results until the root node
        return decision_rules

    #---------------------------------------------

    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in decision_rules. merge them all in a json file first

            json_rules = "[\n" #initialize

            file_index = 0
            for custom_rule in decision_rules:
                json_rules += custom_rule

                if file_index < len(decision_rules) - 1:
                    json_rules += ", "

                json_rules += "\n"

                file_index = file_index + 1

            #-----------------------------------

            json_rules += "]"
            functions.createFile(json_file, json_rules)

            #-----------------------------------
            #reconstruct rules from json to py

            reconstructRules(json_file, feature_names)

            #-----------------------------------

        #is regular decision tree
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            #this is regular decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
            models.append(myrules)

    return models
def initializeAlphaFile():
    file = "outputs/rules/alphas.py"
    header = "def findAlpha(epoch):\n"
    functions.createFile(file, header)
def fit(df, config={}, validation_df=None):
    """
    Parameters:
        df (pandas data frame): Training data frame. The target column must be named as 'Decision' and it has to be in the last column

        config (dictionary):

            config = {
                'algorithm' (string): ID3, C4.5, CART, CHAID or Regression
                'enableParallelism' (boolean): False
                'enableGBM' (boolean): True,
                'epochs' (int): 7,
                'learning_rate' (int): 1,
                'enableRandomForest' (boolean): True,
                'num_of_trees' (int): 5,
                'enableAdaboost' (boolean): True,
                'num_of_weak_classifier' (int): 4
            }

        validation_df (pandas data frame): if nothing is passed to validation data frame, then the function validates built trees for training data frame

    Returns:
        chefboost model
    """

    process_id = os.getpid()

    base_df = df.copy()

    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError('Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame')

    #------------------------
    #handle NaN values

    nan_values = []

    for column in df.columns:
        if df[column].dtypes != 'object':
            min_value = df[column].min()
            idx = df[df[column].isna()].index

            nan_value = []
            nan_value.append(column)

            if idx.shape[0] > 0:
                df.loc[idx, column] = min_value - 1
                nan_value.append(min_value - 1)
                #print("NaN values are replaced to ", min_value - 1, " in column ", column)
            else:
                nan_value.append(None)

            nan_values.append(nan_value)

    #------------------------
    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()
    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression']

    if algorithm not in valid_algorithms:
        raise ValueError('Invalid algorithm passed. You passed ', algorithm, " but valid algorithms are ", valid_algorithms)

    #------------------------

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking'] #no longer used. check to remove this variable.

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    enableParallelism = config['enableParallelism']

    #------------------------

    if enableParallelism == True:
        print("[INFO]: ", config["num_cores"], "CPU cores will be allocated in parallel running")

    #------------------------

    raw_df = df.copy()
    num_of_rows = df.shape[0]
    num_of_columns = df.shape[1]

    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError('Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.')

    if df['Decision'].dtypes != 'object': #this must be regression tree even if it is not mentioned in algorithm

        if algorithm != 'Regression':
            print("WARNING: You set the algorithm to ", algorithm, " but the Decision column of your data set has non-object type.")
            print("That's why, the algorithm is set to Regression to handle the data set.")

        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)

    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        #enableParallelism = False
        for j in range(0, num_of_columns):
            column_name = df.columns[j]
            if df[column_name].dtypes == 'object':
                raise ValueError('Adaboost must be run on numeric data set for both features and target')

    #-------------------------

    print(algorithm, " tree is going to be built...")

    dataset_features = dict() #initialize a dictionary. this is going to be used to check features numeric or nominal. numeric features should be transformed to nominal values based on scales.

    header = "def findDecision(obj): #"

    num_of_columns = df.shape[1] - 1
    for i in range(0, num_of_columns):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        header = header + "obj[" + str(i) + "]: " + column_name

        if i != num_of_columns - 1:
            header = header + ", "

    header = header + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df, config, header, dataset_features, validation_df=validation_df)
    elif enableGBM == True:
        if df['Decision'].dtypes == 'object': #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header, dataset_features, validation_df=validation_df)
            classification = True
        else: #regression
            trees = gbm.regressor(df, config, header, dataset_features, validation_df=validation_df)
            classification = False
    elif enableRandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features, validation_df=validation_df, process_id=process_id)
    else: #regular decision tree building
        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)

        if enableParallelism == True:
            json_file = "outputs/rules/rules.json"
            functions.createFile(json_file, "[\n")

        trees = Training.buildDecisionTree(df, root=root, file=file, config=config, dataset_features=dataset_features, parent_level=0, leaf_id=0, parents='root', validation_df=validation_df, main_process_id=process_id)

    print("-------------------------")
    print("finished in ", time.time() - begin, " seconds")

    obj = {
        "trees": trees,
        "alphas": alphas,
        "config": config,
        "nan_values": nan_values
    }

    #-----------------------------------------
    #train set accuracy
    df = base_df.copy()
    evaluate(obj, df, task='train')

    #validation set accuracy
    if isinstance(validation_df, pd.DataFrame):
        evaluate(obj, validation_df, task='validation')

    #-----------------------------------------

    return obj
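#--------------------------------------------------------------------------
#Hypothetical usage of this fit signature with a validation frame (the data
#frames and config values are illustrative assumptions):
#
#	config = {'algorithm': 'CART', 'enableParallelism': True}
#	model = fit(train_df, config=config, validation_df=val_df)
#
#fit reports training accuracy via evaluate(obj, df, task='train') and, when a
#validation frame is passed, validation accuracy with task='validation'.
#--------------------------------------------------------------------------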
def apply(df, config, header, dataset_features):

    models = []
    alphas = []

    initializeAlphaFile()

    num_of_weak_classifier = config['num_of_weak_classifier']

    #------------------------

    rows = df.shape[0]
    columns = df.shape[1]
    final_predictions = pd.DataFrame(np.zeros([rows, 1]), columns=['prediction'])

    worksheet = df.copy()
    worksheet['Weight'] = 1 / rows #uniform distribution initially

    final_predictions = pd.DataFrame(np.zeros((df.shape[0], 2)), columns=['Prediction', 'Actual'])
    final_predictions['Actual'] = df['Decision']

    #for i in range(0, num_of_weak_classifier):
    pbar = tqdm(range(0, num_of_weak_classifier), desc='Adaboosting')
    for i in pbar:
        worksheet['Decision'] = worksheet['Weight'] * worksheet['Decision']

        root = 1
        file = "outputs/rules/rules_" + str(i) + ".py"

        functions.createFile(file, header)

        #print(worksheet)
        Training.buildDecisionTree(worksheet.drop(columns=['Weight']), root, file, config, dataset_features, parent_level=0, leaf_id=0, parents='root')

        #---------------------------------------

        moduleName = "outputs/rules/rules_" + str(i)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

        #---------------------------------------

        df['Epoch'] = i
        worksheet['Prediction'] = df.apply(findPrediction, axis=1)
        df = df.drop(columns=['Epoch'])

        #---------------------------------------

        worksheet['Actual'] = df['Decision']
        worksheet['Loss'] = abs(worksheet['Actual'] - worksheet['Prediction']) / 2
        worksheet['Weight_Times_Loss'] = worksheet['Loss'] * worksheet['Weight']

        epsilon = worksheet['Weight_Times_Loss'].sum()

        alpha = math.log((1 - epsilon) / epsilon) / 2 #use alpha to update weights in the next round
        alphas.append(alpha)

        #-----------------------------
        #store alpha
        addEpochAlpha(i, alpha)

        #-----------------------------

        worksheet['Alpha'] = alpha
        worksheet['New_Weights'] = worksheet['Weight'] * (-alpha * worksheet['Actual'] * worksheet['Prediction']).apply(math.exp)

        #normalize
        worksheet['New_Weights'] = worksheet['New_Weights'] / worksheet['New_Weights'].sum()
        worksheet['Weight'] = worksheet['New_Weights']
        worksheet['Decision'] = df['Decision']

        final_predictions['Prediction'] = final_predictions['Prediction'] + worksheet['Alpha'] * worksheet['Prediction']
        #print(final_predictions)

        worksheet = worksheet.drop(columns=['New_Weights', 'Prediction', 'Actual', 'Loss', 'Weight_Times_Loss', 'Alpha'])

        mae = (np.abs(final_predictions['Prediction'].apply(functions.sign) - final_predictions['Actual']) / 2).sum() / final_predictions.shape[0]
        #print(mae)
        pbar.set_description("Epoch %d. Loss: %d. Process: " % (i + 1, mae))

    #------------------------------

    final_predictions['Prediction'] = final_predictions['Prediction'].apply(functions.sign)
    final_predictions['Absolute_Error'] = np.abs(final_predictions['Actual'] - final_predictions['Prediction']) / 2
    #print(final_predictions)

    mae = final_predictions['Absolute_Error'].sum() / final_predictions.shape[0]
    print("Loss (MAE) found ", mae, " with ", num_of_weak_classifier, ' weak classifiers')

    return models, alphas
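#--------------------------------------------------------------------------
#Standalone numeric sketch (not chefboost code, illustration only) of the
#update rule used above: epsilon is the weighted loss, alpha = log((1-eps)/eps)/2,
#and the weights of misclassified instances grow before normalization.
def _adaboost_update_sketch():
    import numpy as np

    y = np.array([1, 1, -1, -1])  #actual labels in {-1, +1}
    h = np.array([1, -1, -1, -1]) #weak classifier predictions
    w = np.ones(4) / 4            #uniform initial weights

    epsilon = (np.abs(y - h) / 2 * w).sum()     #0.25: one instance misclassified
    alpha = np.log((1 - epsilon) / epsilon) / 2 #classifier weight ~0.549
    w = w * np.exp(-alpha * y * h)              #misclassified weights grow
    w = w / w.sum()                             #misclassified instance now carries weight 0.5
    print(alpha, w)
#--------------------------------------------------------------------------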
def apply(df, config, header, dataset_features):

    models = []

    num_of_trees = config['num_of_trees']

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for i in pbar:
        #for i in range(0, num_of_trees):
        pbar.set_description("Sub decision tree %d is processing" % (i + 1))
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_" + str(i)
        file = moduleName + ".py"
        json_file = moduleName + ".json"

        functions.createFile(file, header)
        functions.createFile(json_file, "[\n")

        Training.buildDecisionTree(subset, root, file, config, dataset_features, parent_level=0, leaf_id=0, parents='root')

        functions.storeRule(json_file, "{}]")

        #--------------------------------

        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

    #-------------------------------
    #check regression or classification

    if df['Decision'].dtypes == 'object':
        problem_type = 'classification'
    else:
        problem_type = 'regression'

    actual_values = df['Decision'].values
    num_of_features = df.shape[1] - 1 #discard Decision
    number_of_instances = df.shape[0]

    global_predictions = []

    #if classification get the max number of prediction
    if problem_type == 'classification':
        for i in range(0, num_of_trees):

            moduleName = "outputs/rules/rule_" + str(i)
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description)

            predictions = []

            for index, instance in df.iterrows():
                params = []
                for j in range(0, num_of_features):
                    params.append(instance[j]) #index row, j th column

                prediction = myrules.findDecision(params)
                predictions.append(prediction)
                #print(i, "th tree prediction: ", prediction)

            #print(predictions)
            global_predictions.append(predictions)

        #-------------------------------

        classified = 0
        for index in range(0, len(actual_values)):

            actual = actual_values[index]
            predictions = []
            for i in range(0, num_of_trees):
                prediction = global_predictions[i][index]
                if prediction != None: #why None exists in some cases?
                    predictions.append(prediction)

            predictions = np.array(predictions)
            unique_values = np.unique(predictions)

            if unique_values.shape[0] == 1:
                prediction = unique_values[0]
            else:
                counts = []
                for unique in unique_values:
                    count = 0
                    for j in predictions:
                        if unique == j:
                            count = count + 1
                    counts.append(count)

                #print("unique: ", unique_values)
                #print("counts: ", counts)

                prediction = None

                if len(counts) > 0:
                    max_index = np.argmax(np.array(counts))
                    prediction = unique_values[max_index]

            #print(index, ". actual: ", actual, " - prediction: ", prediction)
            if actual == prediction:
                classified = classified + 1

        print("Accuracy: ", 100 * classified / number_of_instances, "% on ", number_of_instances, " instances")

    return models
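#--------------------------------------------------------------------------
#Standalone sketch (not chefboost code, illustration only) of the plurality
#vote the loop above performs for one instance across the bagged trees:
def _plurality_vote_sketch():
    import numpy as np

    tree_predictions = np.array(['Yes', 'No', 'Yes']) #one prediction per sub-tree
    unique_values, counts = np.unique(tree_predictions, return_counts=True)
    final = unique_values[np.argmax(counts)] #'Yes': the most frequent prediction wins
    print(final)
#--------------------------------------------------------------------------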