def buildDecisionTree(df, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', validation_df = None):
    """Recursively build a decision tree for the current (sub) data set.

    Finds the winner feature with findDecision, then spawns one branch per
    distinct value of that feature — serially (rules are appended directly to
    `file`) or in parallel (each rule is written to its own
    outputs/rules/<uuid>.txt and merged into a json file at the root call).

    Args:
        df: training frame; last column must be named 'Decision'.
        root: recursion depth, 1 for the top-level call.
        file: target .py rules file; its ".json" sibling stores raw rules.
        config: dict with 'enableParallelism', 'algorithm', 'num_cores',
            'enableRandomForest', 'enableGBM', 'enableAdaboost', ...
        dataset_features: ordered mapping of feature name -> dtype; used to
            recover the winner's positional index after columns were dropped.
        parent_level, leaf_id, parents: bookkeeping for nested rules.
        validation_df: unused here; kept for interface compatibility.

    Returns:
        list of loaded rule modules (only populated at root for a plain tree).
    """
    models = []

    feature_names = df.columns[0:-1]

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0]+".json"

    if root == 1:
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            raw_df = df.copy()

    #--------------------------------------

    df_copy = df.copy()

    winner_name, num_of_instances, metric, metric_name = findDecision(df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns-1):
        #BUG FIX: numeric fields are already transformed to object in df by
        #findDecision, so dtypes must be read from the pristine copy df_copy
        #(same fix as the later revision of this function in this file).
        column_name = df_copy.columns[i]
        column_type = df_copy[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    #-----------------------------------------------------

    num_cores = config["num_cores"]

    input_params = []

    #serial approach
    for i in range(0,len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:

            if i == 0:
                #descriptor = "# Feature: "+winner_name+", Instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4))

                descriptor = {
                    "feature": winner_name,
                    "instances": num_of_instances,
                    #"metric_name": metric_name,
                    "metric_value": round(metric, 4),
                    "depth": parent_level + 1
                }
                descriptor = "# "+json.dumps(descriptor)

                functions.storeRule(file, (functions.formatRule(root), "", descriptor))

            createBranch(config, current_class, subdataset, numericColumn, branch_index
                , winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric)
        else:
            input_params.append((config, current_class, subdataset, numericColumn, branch_index
                , winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric))

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object': #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns = {"Decision": "Instances","index": "Decision"})
        pivot = pivot.sort_values(by = ["Instances"], ascending = False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file,(functions.formatRule(root), "else:"))
            functions.storeRule(file,(functions.formatRule(root+1), else_decision))
        else: #parallelism
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"

            check_rule = "else: "+else_decision

            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["feature_idx"] = -1
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = df.shape[0]
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 0

            #json to string
            sample_rule = json.dumps(sample_rule)

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    else: #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file,(functions.formatRule(root), "else:"))
            functions.storeRule(file,(functions.formatRule(root+1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"

            check_rule = "else: "+else_decision

            #NOTE(review): this regression branch still uses the legacy
            #hand-built JSON string with fewer fields than the classification
            #branch above — confirm reconstructRules tolerates missing keys.
            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": "+str(root)+",\n"
            sample_rule += "      \"leaf_id\": \""+str(leaf_id)+"\",\n"
            sample_rule += "      \"parents\": \""+parents+"\",\n"
            sample_rule += "      \"rule\": \""+check_rule+"\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    #---------------------------

    #create branches in parallel
    if enableParallelism == True:
        """
        #this usage causes trouble for recursive functions
        with Pool(number_of_cpus) as pool:
            pool.starmap(createBranch, input_params)
        """

        pool = MyPool(num_cores)
        results = pool.starmap(createBranch, input_params)
        pool.close()
        pool.join()

    #---------------------------------------------

    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in .txt files. merge them all in a json file

            functions.createFile(json_file, "[\n")

            custom_rules = []

            file_index = 0
            for file in os.listdir(os.getcwd()+"/outputs/rules"):
                if file.endswith(".txt"):
                    custom_rules.append(os.getcwd()+"/outputs/rules/"+file)
                    #print(file) #this file stores a custom rule

                    f = open(os.getcwd()+"/outputs/rules/"+file, "r")
                    custom_rule = f.read()

                    if file_index > 0:
                        custom_rule = ", "+custom_rule

                    functions.storeRule(json_file, custom_rule)
                    f.close()
                    file_index = file_index + 1

            functions.storeRule(json_file, "]")

            #-----------------------------------

            #custom rules are already merged in a json file. clear messy custom rules
            #TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule.

            for file in custom_rules:
                os.remove(file)

            #-----------------------------------

            reconstructRules(json_file, feature_names)

            #feature importance should be calculated by demand?
            feature_importance(json_file, dataset_features)

            #-----------------------------------

        #is regular decision tree
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            #this is reguler decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
            models.append(myrules)

    return models
def createBranch(config, current_class, subdataset, numericColumn, branch_index
    , winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric):
    """Emit the rule for one branch of the winner feature and recurse if needed.

    Writes an `if/elif obj[winner_index] ...` condition (to `file` in serial
    mode, or as a json line in its own outputs/rules/<uuid>.txt in parallel
    mode). If a final decision can be made the matching `return ...` leaf is
    written; otherwise buildDecisionTree is called on the sub data set with an
    incremented depth.

    Args:
        config: dict read for 'algorithm', 'enableAdaboost', 'enableGBM',
            'max_depth', 'enableParallelism'.
        current_class: branch value; for numeric winners this is a ready
            comparison string such as '<=x' / '>x'.
        subdataset: rows of the parent frame matching current_class, with the
            winner column already dropped.
        numericColumn: True when the winner feature was numeric.
        branch_index: 0-based branch position — decides "if" vs "elif".
        winner_name / winner_index: winner feature's name and column index.
        root: current rule nesting depth.
        parents: id of the parent rule (parallel-mode bookkeeping).
        file: target rules file.
        dataset_features, num_of_instances, metric: forwarded metadata.
    """
    algorithm = config['algorithm']
    enableAdaboost = config['enableAdaboost']
    enableGBM = config['enableGBM']
    max_depth = config['max_depth']
    enableParallelism = config['enableParallelism']

    # regression rules return bare numbers; classification returns quoted labels
    charForResp = "'"
    if algorithm == 'Regression':
        charForResp = ""

    #---------------------------

    # NOTE(review): json_file is computed but not used in this function
    json_file = file.split(".")[0]+".json"

    # remember depth/parents so they can be restored after the recursion below
    tmp_root = root * 1
    parents_raw = copy.copy(parents)

    #---------------------------

    if numericColumn == True:
        compareTo = current_class #current class might be <=x or >x in this case
    else:
        compareTo = " == '"+str(current_class)+"'"

    #print(subdataset)

    terminateBuilding = False

    #-----------------------------------------------
    #can decision be made?

    if enableGBM == True and root >= max_depth: #max depth
        final_decision = subdataset['Decision'].mean()
        terminateBuilding = True
    elif enableAdaboost == True:
        #final_decision = subdataset['Decision'].value_counts().idxmax()
        final_decision = functions.sign(subdataset['Decision'].mean()) #get average
        terminateBuilding = True
        enableParallelism = False
    elif len(subdataset['Decision'].value_counts().tolist()) == 1:
        final_decision = subdataset['Decision'].value_counts().keys().tolist()[0] #all items are equal in this case
        terminateBuilding = True
    elif subdataset.shape[1] == 1: #if decision cannot be made even though all columns dropped
        final_decision = subdataset['Decision'].value_counts().idxmax() #get the most frequent one
        terminateBuilding = True
    elif algorithm == 'Regression' and subdataset.shape[0] < 5: #pruning condition
    #elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition
        final_decision = subdataset['Decision'].mean() #get average
        terminateBuilding = True

    #-----------------------------------------------

    if enableParallelism == True:
        check_condition = "if" #TODO: elif checks might be above than if statements in parallel
    else:
        if branch_index == 0:
            check_condition = "if"
        else:
            check_condition = "elif"

    check_rule = check_condition+" obj["+str(winner_index)+"]"+compareTo+":"

    leaf_id = str(uuid.uuid1())
    custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"

    if enableParallelism != True:
        #check_rule += " # feature: "+winner_name+", instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4))
        functions.storeRule(file,(functions.formatRule(root),"",check_rule))
    else:
        # parallel mode: persist the condition as one json rule in its own file
        sample_rule = {}
        sample_rule["current_level"] = root
        sample_rule["leaf_id"] = leaf_id
        sample_rule["parents"] = parents
        sample_rule["rule"] = check_rule
        sample_rule["feature_idx"] = winner_index
        sample_rule["feature_name"] = winner_name
        sample_rule["instances"] = num_of_instances
        sample_rule["metric"] = metric
        sample_rule["return_statement"] = 0

        #json to string
        sample_rule = json.dumps(sample_rule)

        functions.createFile(custom_rule_file, "")
        functions.storeRule(custom_rule_file, sample_rule)

    #-----------------------------------------------

    if terminateBuilding == True: #check decision is made

        # the leaf hangs under the condition just written, so the condition's
        # leaf_id becomes the parent and a fresh id is minted for the leaf
        parents = copy.copy(leaf_id)
        leaf_id = str(uuid.uuid1())

        decision_rule = "return "+charForResp+str(final_decision)+charForResp

        if enableParallelism != True:
            #serial
            functions.storeRule(file,(functions.formatRule(root+1),decision_rule))
        else:
            #parallel
            sample_rule = {}
            sample_rule["current_level"] = root+1
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = decision_rule
            sample_rule["feature_idx"] = winner_index
            sample_rule["feature_name"] = winner_name
            sample_rule["instances"] = num_of_instances
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 1

            #json to string
            sample_rule = ", "+json.dumps(sample_rule)

            functions.storeRule(custom_rule_file, sample_rule)

    else: #decision is not made, continue to create branch and leafs
        root = root + 1 #the following rule will be included by this rule. increase root
        parents = copy.copy(leaf_id)

        buildDecisionTree(subdataset, root, file, config, dataset_features
            , root-1, leaf_id, parents)

    # restore depth/parents for the caller's next branch iteration
    root = tmp_root * 1
    parents = copy.copy(parents_raw)
def buildDecisionTree(df, root, file, config, dataset_features, parent_level=0, leaf_id=0, parents='root', tree_id=0, validation_df=None, main_process_id=None):
    """Recursively build a decision tree, collecting rules in memory.

    Newer revision: instead of per-leaf .txt files, every rule is a json
    string accumulated in `decision_rules` and returned up the recursion;
    the root call merges them into `json_file` and reconstructs the .py
    rules module. Parallel branch creation is gated on available cores
    (counted via psutil against main_process_id).

    Args:
        df: training frame; last column must be named 'Decision'.
        root: recursion depth, 1 for the top-level call.
        file: target .py rules file; its ".json" sibling stores raw rules.
        config: dict read for 'enableParallelism', 'algorithm', 'num_cores'
            and the ensemble flags.
        dataset_features: ordered mapping feature name -> dtype.
        parent_level, leaf_id, parents: nesting bookkeeping.
        tree_id: id stored in each rule (random forest support).
        validation_df: unused here; kept for interface compatibility.
        main_process_id: pid used to count live worker processes.

    Returns:
        decision_rules list for non-root calls; list of loaded rule modules
        at the root (only populated for a plain decision tree).
    """
    models = []

    decision_rules = []

    feature_names = df.columns[0:-1]

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0] + ".json"

    random_forest_enabled = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']

    if root == 1:
        if random_forest_enabled != True and enableGBM != True and enableAdaboost != True:
            raw_df = df.copy()

    #--------------------------------------

    df_copy = df.copy()

    winner_name, num_of_instances, metric, metric_name = findDecision(
        df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        #column_name = df.columns[i]; column_type = df[column_name].dtypes #numeric field already transformed to object. you cannot check it with df itself, you should check df_copy
        column_name = df_copy.columns[i]
        column_type = df_copy[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()
    #print("classes: ",classes," in ", winner_name)

    #-----------------------------------------------------

    num_cores = config["num_cores"]

    input_params = []

    #serial approach
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:

            if i == 0:
                # depth/metric descriptor comment emitted once per level
                descriptor = {
                    "feature": winner_name,
                    "instances": num_of_instances,
                    #"metric_name": metric_name,
                    "metric_value": round(metric, 4),
                    "depth": parent_level + 1
                }
                descriptor = "# " + json.dumps(descriptor)

                functions.storeRule(
                    file, (functions.formatRule(root), "", descriptor))

            results = createBranch(config, current_class, subdataset,
                                   numericColumn, branch_index, winner_name,
                                   winner_index, root, parents, file,
                                   dataset_features, num_of_instances, metric,
                                   tree_id=tree_id,
                                   main_process_id=main_process_id)

            decision_rules = decision_rules + results

        else:
            input_params.append(
                (config, current_class, subdataset, numericColumn,
                 branch_index, winner_name, winner_index, root, parents, file,
                 dataset_features, num_of_instances, metric, tree_id,
                 main_process_id))

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object':  #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={
            "Decision": "Instances",
            "index": "Decision"
        })
        pivot = pivot.sort_values(by=["Instances"],
                                  ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:  #parallelism
            leaf_id = str(uuid.uuid1())

            check_rule = "else: " + else_decision

            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["feature_idx"] = -1
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = df.shape[0]
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 0
            sample_rule["tree_id"] = tree_id

            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)

    else:  #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())

            check_rule = "else: " + else_decision

            #NOTE(review): unlike the classification branch above, this dict
            #has no "feature_idx" key, instances is 0 and return_statement is
            #1 — confirm reconstructRules expects this asymmetry.
            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["tree_id"] = tree_id
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = 0
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 1

            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)

    #---------------------------

    # count currently active child processes to decide if we can afford a pool
    try:
        main_process = psutil.Process(main_process_id)
        children = main_process.children(recursive=True)
        active_processes = len(children) + 1  #plus parent
        #active_processes = len(children)
    except:
        active_processes = 100  #set a large initial value

    results = []
    #create branches in parallel
    if enableParallelism == True:

        required_threads = active_processes + len(classes)

        #if parent_level == 0 and random_forest_enabled != True:
        if main_process_id != None and num_cores >= required_threads:  #len(classes) branches will be run in parallel

            #POOL_SIZE = num_cores
            POOL_SIZE = len(classes)

            #with closing(multiprocessing.Pool(POOL_SIZE)) as pool:
            with closing(MyPool(POOL_SIZE)) as pool:
                funclist = []

                for input_param in input_params:
                    f = pool.apply_async(createBranchWrapper,
                                         [createBranch, input_param])
                    funclist.append(f)

                #all functions registered here

                for f in funclist:
                    branch_results = f.get(timeout=100000)

                    for branch_result in branch_results:
                        results.append(branch_result)

                pool.close()
                pool.terminate()

        #--------------------------------

        else:  #serial
            for input_param in input_params:
                sub_results = createBranchWrapper(createBranch, input_param)
                for sub_result in sub_results:
                    results.append(sub_result)

        #--------------------------------

    decision_rules = decision_rules + results

    #--------------------------------

    if root != 1:  #return children results until the root node
        return decision_rules

    #---------------------------------------------

    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in decision_rules. merge them all in a json file first

            json_rules = "[\n"  #initialize

            file_index = 0
            for custom_rule in decision_rules:

                json_rules += custom_rule

                if file_index < len(decision_rules) - 1:
                    json_rules += ", "

                json_rules += "\n"

                file_index = file_index + 1

            #-----------------------------------

            json_rules += "]"
            functions.createFile(json_file, json_rules)

            #-----------------------------------
            #reconstruct rules from json to py

            reconstructRules(json_file, feature_names)

            #-----------------------------------

        #is regular decision tree
        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            #this is reguler decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

    return models
def buildDecisionTree(df, root, file, config, dataset_features, parent_level=0, leaf_id=0, parents='root'):
    """Recursively build a decision tree (revision with accuracy reporting).

    In this revision findDecision returns only the winner name, per-rule
    metadata files are hand-built JSON strings, and the root call computes
    train-set accuracy (classification) or MAE/MSE/RMSE (regression) on
    raw_df after the rules module is loaded.

    Args:
        df: training frame; last column must be named 'Decision'.
        root: recursion depth, 1 for the top-level call.
        file: target .py rules file; its ".json" sibling stores raw rules.
        config: dict read for 'enableParallelism', 'algorithm' and the
            ensemble flags.
        dataset_features: ordered mapping feature name -> dtype.
        parent_level, leaf_id, parents: nesting bookkeeping.

    Returns:
        list of loaded rule modules (only populated at root for a plain tree).
    """
    models = []

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0] + ".json"

    if root == 1:
        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            raw_df = df.copy()

    #--------------------------------------

    df_copy = df.copy()

    winner_name = findDecision(df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        column_name = df.columns[i]
        column_type = df[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    #-----------------------------------------------------

    #TO-DO: you should specify the number of cores in config
    num_cores = int(multiprocessing.cpu_count() /
                    2)  #allocate half of your total cores

    input_params = []

    #serial approach
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:
            createBranch(config, current_class, subdataset, numericColumn,
                         branch_index, winner_index, root, parents, file,
                         dataset_features)
        else:
            input_params.append((config, current_class, subdataset,
                                 numericColumn, branch_index, winner_index,
                                 root, parents, file, dataset_features))

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object':  #classification
        # fall back to the most frequent class of the last branch's subset
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={
            "Decision": "Instances",
            "index": "Decision"
        })
        pivot = pivot.sort_values(by=["Instances"],
                                  ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:  #parallelism
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

            check_rule = "else: " + else_decision

            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": " + str(root) + ",\n"
            sample_rule += "      \"leaf_id\": \"" + str(leaf_id) + "\",\n"
            sample_rule += "      \"parents\": \"" + parents + "\",\n"
            sample_rule += "      \"rule\": \"" + check_rule + "\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    else:  #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

            check_rule = "else: " + else_decision

            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": " + str(root) + ",\n"
            sample_rule += "      \"leaf_id\": \"" + str(leaf_id) + "\",\n"
            sample_rule += "      \"parents\": \"" + parents + "\",\n"
            sample_rule += "      \"rule\": \"" + check_rule + "\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    #---------------------------

    #create branches in parallel
    if enableParallelism == True:
        """
        #this usage causes trouble for recursive functions
        with Pool(number_of_cpus) as pool:
            pool.starmap(createBranch, input_params)
        """

        pool = MyPool(num_cores)
        results = pool.starmap(createBranch, input_params)
        pool.close()
        pool.join()

    #---------------------------------------------
    #calculate accuracy metrics

    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in .txt files. merge them all in a json file

            functions.createFile(json_file, "[\n")

            custom_rules = []

            file_index = 0
            for file in os.listdir(os.getcwd() + "/outputs/rules"):
                if file.endswith(".txt"):
                    custom_rules.append(os.getcwd() + "/outputs/rules/" + file)
                    #print(file)  #this file stores a custom rule

                    f = open(os.getcwd() + "/outputs/rules/" + file, "r")
                    custom_rule = f.read()

                    if file_index > 0:
                        custom_rule = ", " + custom_rule

                    functions.storeRule(json_file, custom_rule)
                    f.close()
                    file_index = file_index + 1

            functions.storeRule(json_file, "]")

            #-----------------------------------

            #custom rules are already merged in a json file. clear messy custom rules
            #TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule.

            for file in custom_rules:
                os.remove(file)

            #-----------------------------------

            reconstructRules(json_file)

            #-----------------------------------

        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            #this is reguler decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

            num_of_features = df.shape[1] - 1
            instances = df.shape[0]
            classified = 0
            mae = 0
            mse = 0

            #instead of for loops, pandas functions perform well
            raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)
            if algorithm != 'Regression':
                idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index

                #raw_df['Classified'] = 0
                #raw_df.loc[idx, 'Classified'] = 1
                #print(raw_df)

                accuracy = 100 * len(idx) / instances
                print("Accuracy: ", accuracy, "% on ", instances,
                      " instances")
            else:
                raw_df['Absolute_Error'] = abs(raw_df['Prediction'] -
                                               raw_df['Decision'])
                raw_df['Absolute_Error_Squared'] = raw_df[
                    'Absolute_Error'] * raw_df['Absolute_Error']

                #print(raw_df)

                mae = raw_df['Absolute_Error'].sum() / instances
                print("MAE: ", mae)

                mse = raw_df['Absolute_Error_Squared'].sum() / instances
                rmse = math.sqrt(mse)
                print("RMSE: ", rmse)

                mean = raw_df['Decision'].mean()
                print("Mean: ", mean)

                if mean > 0:
                    print("MAE / Mean: ", 100 * mae / mean, "%")
                    print("RMSE / Mean: ", 100 * rmse / mean, "%")

    return models
def buildDecisionTree(df, root, file, config, dataset_features):
    """Recursively build a decision tree (earliest, serial-only revision).

    Branch conditions and leaf returns are written straight to `file` with
    functions.storeRule; there is no parallelism and no json rule store.
    At the root call the generated rules module is imported and train-set
    accuracy (classification) or MAE/MSE/RMSE (regression) is printed.

    Args:
        df: training frame; last column must be named 'Decision'.
        root: recursion depth, 1 for the top-level call.
        file: target .py rules file.
        config: dict read for 'algorithm' and the ensemble flags.
        dataset_features: ordered mapping feature name -> dtype.

    Returns:
        list of loaded rule modules (only populated at root for a plain tree).
    """
    models = []

    if root == 1:
        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            raw_df = df.copy()

    algorithm = config['algorithm']
    enableAdaboost = config['enableAdaboost']

    #--------------------------------------

    #print(df.shape)

    # regression rules return bare numbers; classification returns quoted labels
    charForResp = "'"
    if algorithm == 'Regression':
        charForResp = ""

    # remember depth so it can be restored after each recursive branch
    tmp_root = root * 1

    df_copy = df.copy()

    winner_name = findDecision(df, config)

    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        column_name = df.columns[i]
        column_type = df[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])

        if numericColumn == True:
            compareTo = current_class  #current class might be <=x or >x in this case
        else:
            compareTo = " == '" + str(current_class) + "'"

        #print(subdataset)

        terminateBuilding = False

        #-----------------------------------------------
        #can decision be made?

        if enableAdaboost == True:
            #final_decision = subdataset['Decision'].value_counts().idxmax()
            final_decision = functions.sign(
                subdataset['Decision'].mean())  #get average
            terminateBuilding = True
        elif len(subdataset['Decision'].value_counts().tolist()) == 1:
            final_decision = subdataset['Decision'].value_counts().keys(
            ).tolist()[0]  #all items are equal in this case
            terminateBuilding = True
        elif subdataset.shape[
                1] == 1:  #if decision cannot be made even though all columns dropped
            final_decision = subdataset['Decision'].value_counts().idxmax(
            )  #get the most frequent one
            terminateBuilding = True
        elif algorithm == 'Regression' and subdataset.shape[
                0] < 5:  #pruning condition
            #elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition
            final_decision = subdataset['Decision'].mean()  #get average
            terminateBuilding = True

        #-----------------------------------------------

        if i == 0:
            check_condition = "if"
        else:
            check_condition = "elif"

        functions.storeRule(file,
                            (functions.formatRule(root), "", check_condition,
                             " obj[", str(winner_index), "]", compareTo, ":"))

        #-----------------------------------------------

        if terminateBuilding == True:  #check decision is made
            functions.storeRule(
                file, (functions.formatRule(root + 1), "return ",
                       charForResp + str(final_decision) + charForResp))
        else:  #decision is not made, continue to create branch and leafs
            root = root + 1  #the following rule will be included by this rule. increase root
            buildDecisionTree(subdataset, root, file, config,
                              dataset_features)

        root = tmp_root * 1

    #---------------------------------------------
    #calculate accuracy metrics

    if root == 1:
        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            #this is reguler decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

            num_of_features = df.shape[1] - 1
            instances = df.shape[0]
            classified = 0
            mae = 0
            mse = 0

            #instead of for loops, pandas functions perform well
            raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)
            if algorithm != 'Regression':
                idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index

                #raw_df['Classified'] = 0
                #raw_df.loc[idx, 'Classified'] = 1
                #print(raw_df)

                accuracy = 100 * len(idx) / instances
                print("Accuracy: ", accuracy, "% on ", instances,
                      " instances")
            else:
                raw_df['Absolute_Error'] = abs(raw_df['Prediction'] -
                                               raw_df['Decision'])
                raw_df['Absolute_Error_Squared'] = raw_df[
                    'Absolute_Error'] * raw_df['Absolute_Error']

                #print(raw_df)

                mae = raw_df['Absolute_Error'].sum() / instances
                print("MAE: ", mae)

                mse = raw_df['Absolute_Error_Squared'].sum() / instances
                rmse = math.sqrt(mse)
                print("RMSE: ", rmse)

                mean = raw_df['Decision'].mean()
                print("Mean: ", mean)

                if mean > 0:
                    print("MAE / Mean: ", 100 * mae / mean, "%")
                    print("RMSE / Mean: ", 100 * rmse / mean, "%")

    return models