def findPrediction(row):
    """Predict the decision for one row with the rules module of the row's epoch.

    Parameters:
        row: a pandas Series-like record that contains an 'Epoch' entry.
            After 'Epoch' is dropped, every value except the last one is
            passed to the rules as a feature (presumably the last entry is
            the 'Decision' target -- confirm against the caller).

    Returns:
        functions.sign(...) applied to the value returned by the
        findDecision function of the module "outputs/rules/rules_<epoch>".
    """
    epoch = row['Epoch']
    row = row.drop(labels=['Epoch'])

    # Positional access must be explicit: integer keys on a label-indexed
    # Series are resolved as labels first, and the positional fallback is
    # deprecated in recent pandas releases.
    feature_count = row.shape[0] - 1
    params = [row.iloc[position] for position in range(feature_count)]

    moduleName = "outputs/rules/rules_%d" % (epoch)

    # NOTE(review): `imp` is deprecated and no longer available on
    # Python 3.12+; a migration to importlib has to be done file-wide because
    # it changes how the rules file is located.
    fp, pathname, description = imp.find_module(moduleName)
    try:
        myrules = imp.load_module(moduleName, fp, pathname, description)
    finally:
        # find_module hands back an open file; the caller has to close it.
        if fp is not None:
            fp.close()

    return functions.sign(myrules.findDecision(params))
def _serializeBranchRule(level, leaf_id, parents, rule, winner_index,
                         winner_name, num_of_instances, metric, is_return):
    """Serialize one rule line plus its tree metadata to a JSON string."""
    return json.dumps({
        "current_level": level,
        "leaf_id": leaf_id,
        "parents": parents,
        "rule": rule,
        "feature_idx": winner_index,
        "feature_name": winner_name,
        "instances": num_of_instances,
        "metric": metric,
        "return_statement": is_return,
    })


def createBranch(config, current_class, subdataset, numericColumn, branch_index,
                 winner_name, winner_index, root, parents, file,
                 dataset_features, num_of_instances, metric):
    """Emit the rule for one branch of the winner feature and grow below it.

    The branch condition is always emitted. Afterwards either a
    ``return <decision>`` rule is emitted (the branch ends in a leaf) or
    buildDecisionTree is called for ``subdataset`` one level deeper.

    Parameters:
        config (dict): reads 'algorithm', 'enableAdaboost', 'enableGBM',
            'max_depth' and 'enableParallelism'.
        current_class: value of the winner feature handled by this branch.
            When ``numericColumn`` is set it is used verbatim as the
            comparison text (e.g. "<=x" or ">x").
        subdataset: rows of this branch; must contain a 'Decision' column.
        numericColumn (bool): whether the winner feature is numeric.
        branch_index (int): position of the branch among its siblings; in
            serial mode index 0 opens with "if", the others with "elif".
        winner_name: name of the winner feature (stored in the rule metadata).
        winner_index: index used in the generated ``obj[<index>]`` lookup.
        root (int): indentation level of the branch condition.
        parents: identifier stored as parent of the branch condition.
        file: rules file that receives the rules in serial mode.
        dataset_features: forwarded unchanged to buildDecisionTree.
        num_of_instances: stored in the rule metadata.
        metric: stored in the metadata of the branch condition.

    Returns:
        None.
    """
    algorithm = config['algorithm']
    enableAdaboost = config['enableAdaboost']
    enableGBM = config['enableGBM']
    max_depth = config['max_depth']
    enableParallelism = config['enableParallelism']

    # Regression leaves return bare numbers, everything else a quoted label.
    charForResp = "" if algorithm == 'Regression' else "'"

    if numericColumn:
        compareTo = current_class  # current class might be <=x or >x in this case
    else:
        compareTo = " == '" + str(current_class) + "'"

    # -----------------------------------------------
    # can decision be made?
    terminateBuilding = True
    if enableGBM and root >= max_depth:  # max depth
        final_decision = subdataset['Decision'].mean()
    elif enableAdaboost:
        final_decision = functions.sign(subdataset['Decision'].mean())  # get average
        # adaboost branches are always written serially
        enableParallelism = False
    elif len(subdataset['Decision'].value_counts()) == 1:
        # all items are equal in this case
        final_decision = subdataset['Decision'].value_counts().keys().tolist()[0]
    elif subdataset.shape[1] == 1:
        # decision cannot be made even though all columns dropped:
        # get the most frequent one
        final_decision = subdataset['Decision'].value_counts().idxmax()
    elif algorithm == 'Regression' and subdataset.shape[0] < 5:  # pruning condition
        final_decision = subdataset['Decision'].mean()  # get average
    else:
        terminateBuilding = False

    # -----------------------------------------------
    if enableParallelism:
        # TODO: elif checks might be above than if statements in parallel
        check_condition = "if"
    else:
        check_condition = "if" if branch_index == 0 else "elif"

    check_rule = check_condition + " obj[" + str(winner_index) + "]" + compareTo + ":"

    leaf_id = str(uuid.uuid1())
    custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

    if not enableParallelism:
        functions.storeRule(file, (functions.formatRule(root), "", check_rule))
    else:
        branch_record = _serializeBranchRule(
            root, leaf_id, parents, check_rule, winner_index, winner_name,
            num_of_instances, metric, 0)
        functions.createFile(custom_rule_file, "")
        functions.storeRule(custom_rule_file, branch_record)

    # -----------------------------------------------
    if terminateBuilding:  # decision is made
        decision_rule = "return " + charForResp + str(final_decision) + charForResp

        if not enableParallelism:  # serial
            functions.storeRule(file, (functions.formatRule(root + 1), decision_rule))
        else:  # parallel
            # The return statement gets its own id and hangs below the branch.
            return_record = _serializeBranchRule(
                root + 1, str(uuid.uuid1()), leaf_id, decision_rule,
                winner_index, winner_name, num_of_instances, 0, 1)
            functions.storeRule(custom_rule_file, ", " + return_record)
    else:
        # Decision is not made: the following rules are nested inside this
        # one, so they are built one level deeper with this branch as parent.
        buildDecisionTree(subdataset, root + 1, file, config, dataset_features,
                          root, leaf_id, leaf_id)
def predict(model, param):
    """Predict the decision for a single instance with a built model.

    Parameters:
        model (dict): needs "trees" (objects exposing findDecision(param))
            and "config" (reads 'enableGBM', 'enableAdaboost' and, for GBM,
            'epochs'). "alphas" and "nan_values" are optional and default to
            empty lists.
        param (list): feature values of the instance. Missing values are
            replaced IN PLACE with the substitutes listed in "nan_values".

    Returns:
        The raw decision of the single tree, the summed (GBM regression) or
        signed weighted (adaboost) prediction, the class in "alphas" with the
        highest score (GBM classification), or the most frequent label among
        the trees.
    """
    trees = model["trees"]
    config = model["config"]
    alphas = model.get("alphas", [])
    nan_values = model.get("nan_values", [])

    # -----------------------
    # handle missing values: each entry is (column name, substitute value)
    for column_index, column in enumerate(nan_values):
        missing_value = column[1]
        if not pd.isna(missing_value) and pd.isna(param[column_index]):
            param[column_index] = missing_value

    # -----------------------
    enableGBM = config['enableGBM']
    adaboost = config['enableAdaboost']

    # GBM with exactly one tree per epoch is regression; otherwise the trees
    # are scored per class, one accumulator for every entry of alphas.
    classification = bool(enableGBM) and len(trees) != config['epochs']
    prediction_classes = [0] * len(alphas) if classification else []
    prediction = 0

    # -----------------------
    if len(trees) > 1:  # boosting
        for index, tree in enumerate(trees):
            if adaboost:
                prediction += alphas[index] * tree.findDecision(param)
                continue

            custom_prediction = tree.findDecision(param)
            if custom_prediction is None:
                continue

            # isinstance also recognises str subclasses such as numpy.str_,
            # which an exact type comparison would treat as a number.
            if isinstance(custom_prediction, str):
                classification = True
                prediction_classes.append(custom_prediction)
            elif enableGBM and classification:
                prediction_classes[index % len(alphas)] += custom_prediction
            else:  # regression
                prediction += custom_prediction

        if adaboost:
            prediction = functions.sign(prediction)
    else:  # regular decision tree
        prediction = trees[0].findDecision(param)

    if not classification:
        return prediction

    if enableGBM:
        return alphas[np.argmax(prediction_classes)]

    # majority vote; ties go to the label that sorts first
    unique_labels, prediction_counts = np.unique(prediction_classes, return_counts=True)
    return unique_labels[np.argmax(prediction_counts)]
def predict(model, param):
    """Predict the decision for a single instance with a built model.

    Parameters:
        model (dict): needs "trees" (objects exposing findDecision(param))
            and "config" (reads 'enableGBM', 'enableAdaboost' and, for GBM,
            'epochs'). "alphas" is optional and defaults to an empty list.
        param (list): feature values of the instance; passed unchanged to
            every tree.

    Returns:
        The raw decision of the single tree, the summed (GBM regression) or
        signed weighted (adaboost) prediction, the class in "alphas" with the
        highest score (GBM classification), or the most frequent label among
        the trees.
    """
    trees = model["trees"]
    config = model["config"]
    alphas = model.get("alphas", [])

    # -----------------------
    enableGBM = config['enableGBM']
    adaboost = config['enableAdaboost']

    # GBM with exactly one tree per epoch is regression; otherwise the trees
    # are scored per class, one accumulator for every entry of alphas.
    classification = bool(enableGBM) and len(trees) != config['epochs']
    prediction_classes = [0] * len(alphas) if classification else []
    prediction = 0

    # -----------------------
    if len(trees) > 1:  # boosting
        for index, tree in enumerate(trees):
            if adaboost:
                prediction += alphas[index] * tree.findDecision(param)
                continue

            custom_prediction = tree.findDecision(param)
            if custom_prediction is None:
                continue

            # isinstance also recognises str subclasses such as numpy.str_,
            # which an exact type comparison would treat as a number.
            if isinstance(custom_prediction, str):
                classification = True
                prediction_classes.append(custom_prediction)
            elif enableGBM and classification:
                prediction_classes[index % len(alphas)] += custom_prediction
            else:  # regression
                prediction += custom_prediction

        if adaboost:
            prediction = functions.sign(prediction)
    else:  # regular decision tree
        prediction = trees[0].findDecision(param)

    if not classification:
        return prediction

    if enableGBM:
        return alphas[np.argmax(prediction_classes)]

    # majority vote; ties go to the label that sorts first
    unique_labels, prediction_counts = np.unique(prediction_classes, return_counts=True)
    return unique_labels[np.argmax(prediction_counts)]
def predict(model, param):
    """Predict the decision for a single instance with a built model.

    Parameters:
        model (built chefboost model): pass the return value of the fit
            function. Needs "trees" (objects exposing findDecision(param))
            and "config" (reads 'enableGBM', 'enableAdaboost',
            'enableRandomForest' and, for GBM, 'epochs'); "alphas" and
            "nan_values" are optional.
        param (list): input features as a python list, e.g.
            chef.predict(model, param = ['Sunny', 'Hot', 'High', 'Weak']).
            Missing values are replaced IN PLACE with the substitutes listed
            in "nan_values".

    Returns:
        prediction: the raw decision of the single tree, the summed (GBM
        regression), averaged (random forest regression) or signed weighted
        (adaboost) prediction, the class in "alphas" with the highest score
        (GBM classification), or the most frequent label among the trees.
    """
    trees = model["trees"]
    config = model["config"]
    alphas = model.get("alphas", [])
    nan_values = model.get("nan_values", [])

    # -----------------------
    # handle missing values: each entry is (column name, substitute value)
    for column_index, column in enumerate(nan_values):
        missing_value = column[1]
        if not pd.isna(missing_value) and pd.isna(param[column_index]):
            param[column_index] = missing_value

    # -----------------------
    enableGBM = config['enableGBM']
    adaboost = config['enableAdaboost']
    enableRandomForest = config['enableRandomForest']

    # GBM with exactly one tree per epoch is regression; otherwise the trees
    # are scored per class, one accumulator for every entry of alphas.
    classification = bool(enableGBM) and len(trees) != config['epochs']
    prediction_classes = [0] * len(alphas) if classification else []
    prediction = 0

    # -----------------------
    if len(trees) > 1:  # bagging or boosting
        for index, tree in enumerate(trees):
            if adaboost:
                prediction += alphas[index] * tree.findDecision(param)
                continue

            custom_prediction = tree.findDecision(param)
            if custom_prediction is None:
                continue

            # isinstance also recognises str subclasses such as numpy.str_,
            # which an exact type comparison would treat as a number.
            if isinstance(custom_prediction, str):
                classification = True
                prediction_classes.append(custom_prediction)
            elif enableGBM and classification:
                prediction_classes[index % len(alphas)] += custom_prediction
            else:  # regression
                prediction += custom_prediction

        if enableRandomForest:
            # notice that gbm requires cumulative sum but random forest
            # requires mean of each tree
            prediction = prediction / len(trees)

        if adaboost:
            prediction = functions.sign(prediction)
    else:  # regular decision tree
        prediction = trees[0].findDecision(param)

    if not classification:
        return prediction

    if enableGBM:
        return alphas[np.argmax(prediction_classes)]

    # classification, e.g. random forest: return the most frequent prediction
    # made by the different trees (ties go to the label that sorts first)
    values, counts = np.unique(np.array(prediction_classes), return_counts=True)
    return values[np.argmax(counts)]
def _findLeafDecision(subdataset, algorithm, enableAdaboost):
    """Return (True, decision) if the branch ends in a leaf, else (False, None)."""
    decisions = subdataset['Decision']

    if enableAdaboost:
        return True, functions.sign(decisions.mean())  # get average

    counts = decisions.value_counts()
    if len(counts) == 1:  # all items are equal in this case
        return True, counts.keys().tolist()[0]
    if subdataset.shape[1] == 1:
        # decision cannot be made even though all columns dropped:
        # get the most frequent one
        return True, counts.idxmax()
    if algorithm == 'Regression' and subdataset.shape[0] < 5:  # pruning condition
        return True, decisions.mean()  # get average

    return False, None


def _printTrainingMetrics(raw_df, algorithm, instances):
    """Print accuracy (classification) or error metrics (regression) for raw_df."""
    if algorithm != 'Regression':
        idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index
        accuracy = 100 * len(idx) / instances
        print("Accuracy: ", accuracy, "% on ", instances, " instances")
        return

    raw_df['Absolute_Error'] = abs(raw_df['Prediction'] - raw_df['Decision'])
    raw_df['Absolute_Error_Squared'] = raw_df['Absolute_Error'] * raw_df['Absolute_Error']

    mae = raw_df['Absolute_Error'].sum() / instances
    print("MAE: ", mae)

    mse = raw_df['Absolute_Error_Squared'].sum() / instances
    rmse = math.sqrt(mse)
    print("RMSE: ", rmse)

    mean = raw_df['Decision'].mean()
    print("Mean: ", mean)

    if mean > 0:
        print("MAE / Mean: ", 100 * mae / mean, "%")
        print("RMSE / Mean: ", 100 * rmse / mean, "%")


def buildDecisionTree(df, root, file, config, dataset_features):
    """Recursively emit the decision rules for ``df`` and report accuracy.

    For every value of the winner feature a branch condition is handed to
    functions.storeRule, followed either by a ``return`` rule or by the rules
    of a recursive call on the rows of that branch.

    Parameters:
        df: data frame whose last column is skipped by the restoration step
            and which contains a 'Decision' column.
        root (int): indentation level of the rules of this call; the top
            level call passes 1.
        file: rules file handed to functions.storeRule.
        config (dict): reads 'algorithm', 'enableAdaboost', 'enableGBM' and
            'enableRandomForest'.
        dataset_features: maps feature names to their types ('object' marks a
            nominal feature); its iteration order defines the feature index
            used in the generated ``obj[<index>]`` lookups.

    Returns:
        list: the loaded rules module for a top level call of a regular
        decision tree, otherwise an empty list.
    """
    models = []

    algorithm = config['algorithm']
    enableAdaboost = config['enableAdaboost']

    # Accuracy is reported once, by the top level call of a regular tree.
    reports_accuracy = (
        root == 1
        and not config['enableRandomForest']
        and not config['enableGBM']
        and not enableAdaboost
    )
    if reports_accuracy:
        raw_df = df.copy()

    # Regression leaves return bare numbers, everything else a quoted label.
    charForResp = "" if algorithm == 'Regression' else "'"

    df_copy = df.copy()

    winner_name = findDecision(df, config)

    numericColumn = dataset_features[winner_name] != 'object'

    # find winner index, this cannot be returned by find decision because
    # columns dropped in previous steps
    for position, feature in enumerate(dataset_features):
        if feature == winner_name:
            winner_index = position

    # restoration: bring back the original values of every non-object column
    # except the winner (the last column is left untouched)
    for column_name in df.columns[:-1]:
        if df[column_name].dtypes != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    for branch_index, current_class in enumerate(classes):
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])

        if numericColumn:
            compareTo = current_class  # current class might be <=x or >x in this case
        else:
            compareTo = " == '" + str(current_class) + "'"

        # can decision be made?
        terminateBuilding, final_decision = _findLeafDecision(
            subdataset, algorithm, enableAdaboost)

        check_condition = "if" if branch_index == 0 else "elif"

        functions.storeRule(file, (functions.formatRule(root), "", check_condition,
                                   " obj[", str(winner_index), "]", compareTo, ":"))

        if terminateBuilding:  # decision is made
            functions.storeRule(file, (functions.formatRule(root + 1), "return ",
                                       charForResp + str(final_decision) + charForResp))
        else:
            # decision is not made: the following rules are included by this
            # rule, so they are created one level deeper
            buildDecisionTree(subdataset, root + 1, file, config, dataset_features)

    # ---------------------------------------------
    # calculate accuracy metrics
    if reports_accuracy:
        # this is regular decision tree. find accuracy here.
        moduleName = "outputs/rules/rules"

        # NOTE(review): `imp` is deprecated and no longer available on
        # Python 3.12+; a migration to importlib has to be done file-wide.
        fp, pathname, description = imp.find_module(moduleName)
        try:
            myrules = imp.load_module(moduleName, fp, pathname, description)  # rules0
        finally:
            # find_module hands back an open file; the caller has to close it.
            if fp is not None:
                fp.close()
        models.append(myrules)

        instances = df.shape[0]

        # instead of for loops, pandas functions perform well
        raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)

        _printTrainingMetrics(raw_df, algorithm, instances)

    return models