def apply(df, config, header, dataset_features):
    """Build a single weighted decision tree (early adaboost-style draft).

    Copies *df*, attaches a unit weight column, scales the target by that
    weight, and hands the frame to Training.buildDecisionTree, which writes
    generated rules to outputs/rules/rules_0.py.

    Args:
        df: pandas DataFrame whose last column 'Decision' is the target.
        config: run options dict; only 'debug' is read here.
        header: source text written at the top of each generated rules file.
        dataset_features: column-name -> dtype mapping used by the trainer.
    """
    debug = config['debug']

    #------------------------
    rows = df.shape[0]
    columns = df.shape[1]

    final_predictions = pd.DataFrame(np.zeros([rows, 1]), columns=['prediction'])

    worksheet = df.copy()
    worksheet['weight'] = 1  #/ rows

    tmp_df = df.copy()
    # weight is 1 everywhere, so this currently leaves Decision unchanged
    tmp_df['Decision'] = worksheet['weight'] * tmp_df['Decision']  #normal distribution

    # single round only in this draft
    for i in range(0, 1):
        root = 1
        file = "outputs/rules/rules_" + str(i) + ".py"

        if debug == False:
            functions.createFile(file, header)

        #print(tmp_df)
        Training.buildDecisionTree(tmp_df, root, file, config, dataset_features)

    #print(final_predictions)
    # NOTE(review): the original ended with an unterminated triple-quoted
    # string ('"""for row, instance in final_predictions.iterrows():') —
    # truncated dead code that made the module a syntax error; removed.
def apply(df, config, header, dataset_features):
    """Grow a bag of decision trees, one generated rules file per tree.

    Each tree is trained on a random sample of *df* (fraction
    1/num_of_trees) and its rules are written to
    outputs/rules/rule_<i>.py via Training.buildDecisionTree.
    """
    debug = config['debug']
    num_of_trees = config['num_of_trees']

    # loop invariants hoisted: the tree root id and the sample fraction
    root = 1
    sample_fraction = 1 / num_of_trees

    for tree_index in range(num_of_trees):
        subset = df.sample(frac=sample_fraction)

        file = "outputs/rules/rule_" + str(tree_index) + ".py"
        if debug == False:
            functions.createFile(file, header)

        Training.buildDecisionTree(subset, root, file, config, dataset_features)
def fit(df, config):
    """Train a decision-tree model (plain tree, random forest, GBM or adaboost).

    Validates the input frame (last column must be named 'Decision'),
    normalizes config, builds the findDecision() header string for the
    generated rules files, and dispatches to the configured algorithm.

    Args:
        df: pandas DataFrame; the rightmost column must be 'Decision'.
        config: dict of options (algorithm, enable* flags, epochs, ...).

    Returns:
        dict with keys 'trees', 'alphas' and 'config'.

    Raises:
        ValueError: on a misnamed target column, an unknown algorithm,
            a nominal target with Regression, or non-numeric adaboost data.
    """
    # the target column must be the rightmost one and named 'Decision'
    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError('Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame')
    #------------------------
    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()
    #------------------------
    algorithm = config['algorithm']
    valid_algorithms = ['ID3', 'C4.5', 'CART', 'Regression']
    if algorithm not in valid_algorithms:
        raise ValueError('Invalid algorithm passed. You passed ', algorithm, " but valid algorithms are ", valid_algorithms)
    #------------------------
    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']
    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']
    enableAdaboost = config['enableAdaboost']
    #------------------------
    raw_df = df.copy()
    num_of_rows = df.shape[0]
    num_of_columns = df.shape[1]
    # Regression demands a numeric target
    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError('Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.')
    if df['Decision'].dtypes != 'object':
        #this must be regression tree even if it is not mentioned in algorithm
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)
    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
    # adaboost requires every feature and the target to be numeric
    if enableAdaboost == True:
        for j in range(0, num_of_columns):
            column_name = df.columns[j]
            if df[column_name].dtypes == 'object':
                raise ValueError('Adaboost must be run on numeric data set for both features and target')
    #-------------------------
    print(algorithm, " tree is going to be built...")
    dataset_features = dict()  #initialize a dictionary. this is going to be used to check features numeric or nominal. numeric features should be transformed to nominal values based on scales.
    # build the "def findDecision(obj): #obj[0]: col, ..." header shared by
    # every generated rules file; also record each feature's dtype
    header = "def findDecision("
    header = header + "obj"
    header = header + "): #"
    num_of_columns = df.shape[1] - 1
    for i in range(0, num_of_columns):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        header = header + "obj[" + str(i) + "]: " + column_name
        if i != num_of_columns - 1:
            header = header + ", "
    header = header + "\n"
    #------------------------
    begin = time.time()
    trees = []
    alphas = []
    # dispatch to the configured ensemble method (or a single tree)
    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df, config, header, dataset_features)
    elif enableGBM == True:
        if df['Decision'].dtypes == 'object':
            #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header, dataset_features)
            classification = True
        else:
            #regression
            trees = gbm.regressor(df, config, header, dataset_features)
            classification = False
    elif enableRandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features)
    else:
        #regular decision tree building
        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)
        trees = Training.buildDecisionTree(df, root, file, config, dataset_features)
    print("finished in ", time.time() - begin, " seconds")
    obj = {"trees": trees, "alphas": alphas, "config": config}
    return obj
def apply(df, config, header, dataset_features):
    """Adaboost: train num_of_weak_classifier weighted trees and collect alphas.

    Maintains a per-instance 'Weight' column (uniform at first), trains a
    tree each round on the weight-scaled target, imports the generated
    rules module, computes epsilon/alpha, and re-weights instances for the
    next round. Assumes the target is numeric in {-1, +1} — the loss and
    sign() bookkeeping rely on that; TODO confirm against the caller.

    Returns:
        (models, alphas): the imported rules modules and their alpha weights.
    """
    models = []
    alphas = []
    initializeAlphaFile()
    num_of_weak_classifier = config['num_of_weak_classifier']
    #------------------------
    rows = df.shape[0]
    columns = df.shape[1]
    final_predictions = pd.DataFrame(np.zeros([rows, 1]), columns=['prediction'])
    worksheet = df.copy()
    worksheet['Weight'] = 1 / rows  #uniform distribution initially
    # re-created with the final shape; accumulates the boosted ensemble output
    final_predictions = pd.DataFrame(np.zeros((df.shape[0], 2)), columns=['Prediction', 'Actual'])
    final_predictions['Actual'] = df['Decision']
    #for i in range(0, num_of_weak_classifier):
    pbar = tqdm(range(0, num_of_weak_classifier), desc='Adaboosting')
    for i in pbar:
        # scale the target by the current instance weights before training
        worksheet['Decision'] = worksheet['Weight'] * worksheet['Decision']
        root = 1
        file = "outputs/rules/rules_" + str(i) + ".py"
        functions.createFile(file, header)
        #print(worksheet)
        Training.buildDecisionTree(worksheet.drop(columns=['Weight']), root, file, config, dataset_features)
        #---------------------------------------
        # dynamically import the rules module generated for this round
        # (imp is deprecated since Python 3.4 — NOTE(review))
        moduleName = "outputs/rules/rules_" + str(i)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)
        #---------------------------------------
        # Epoch column tells findPrediction which round's rules to apply
        df['Epoch'] = i
        worksheet['Prediction'] = df.apply(findPrediction, axis=1)
        df = df.drop(columns=['Epoch'])
        #---------------------------------------
        worksheet['Actual'] = df['Decision']
        # 0 when correct, 1 when wrong (targets/predictions in {-1,+1})
        worksheet['Loss'] = abs(worksheet['Actual'] - worksheet['Prediction']) / 2
        worksheet['Weight_Times_Loss'] = worksheet['Loss'] * worksheet['Weight']
        epsilon = worksheet['Weight_Times_Loss'].sum()
        alpha = math.log((1 - epsilon) / epsilon) / 2  #use alpha to update weights in the next round
        alphas.append(alpha)
        #-----------------------------
        #store alpha
        addEpochAlpha(i, alpha)
        #-----------------------------
        worksheet['Alpha'] = alpha
        # classic adaboost update: w * exp(-alpha * y * h(x))
        worksheet['New_Weights'] = worksheet['Weight'] * (-alpha * worksheet['Actual'] * worksheet['Prediction']).apply(math.exp)
        #normalize
        worksheet['New_Weights'] = worksheet['New_Weights'] / worksheet['New_Weights'].sum()
        worksheet['Weight'] = worksheet['New_Weights']
        # restore the raw target for the next round's weight scaling
        worksheet['Decision'] = df['Decision']
        final_predictions['Prediction'] = final_predictions['Prediction'] + worksheet['Alpha'] * worksheet['Prediction']
        #print(final_predictions)
        worksheet = worksheet.drop(columns=['New_Weights', 'Prediction', 'Actual', 'Loss', 'Weight_Times_Loss', 'Alpha'])
        # running MAE of the sign of the boosted score vs the actual label
        mae = (np.abs(final_predictions['Prediction'].apply(functions.sign) - final_predictions['Actual']) / 2).sum() / final_predictions.shape[0]
        #print(mae)
        pbar.set_description("Epoch %d. Loss: %d. Process: " % (i + 1, mae))
    #------------------------------
    final_predictions['Prediction'] = final_predictions['Prediction'].apply(functions.sign)
    final_predictions['Absolute_Error'] = np.abs(final_predictions['Actual'] - final_predictions['Prediction']) / 2
    #print(final_predictions)
    mae = final_predictions['Absolute_Error'].sum() / final_predictions.shape[0]
    print("Loss (MAE) found ", mae, " with ", num_of_weak_classifier, ' weak classifiers')
    return models, alphas
def apply(df, config, header, dataset_features):
    """Random-forest bagging: train num_of_trees trees on random subsets.

    Each tree is trained on a 1/num_of_trees fraction of *df*, written to
    outputs/rules/rule_<i>.py, and dynamically imported. For classification
    targets, a majority vote over the trees is evaluated against the
    training data and the accuracy is printed.

    Returns:
        list of imported rules modules, one per tree.
    """
    models = []

    num_of_trees = config['num_of_trees']

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')
    for i in pbar:
        pbar.set_description("Sub decision tree %d is processing" % (i + 1))
        # each tree sees a random 1/num_of_trees fraction of the data
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_" + str(i)
        file = moduleName + ".py"

        functions.createFile(file, header)
        Training.buildDecisionTree(subset, root, file, config, dataset_features)

        #--------------------------------
        # dynamic import of the freshly generated rules file
        # NOTE(review): imp is deprecated since Python 3.4 — consider importlib
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

    #-------------------------------
    #check regression or classification
    if df['Decision'].dtypes == 'object':
        problem_type = 'classification'
    else:
        problem_type = 'regression'

    actual_values = df['Decision'].values
    num_of_features = df.shape[1] - 1  #discard Decision
    number_of_instances = df.shape[0]

    global_predictions = []

    #if classification get the max number of prediction
    if problem_type == 'classification':
        for i in range(0, num_of_trees):
            # reuse the module imported during training instead of
            # re-importing it a second time (was duplicated work)
            myrules = models[i]

            predictions = []
            for index, instance in df.iterrows():
                params = [instance[j] for j in range(0, num_of_features)]
                prediction = myrules.findDecision(params)
                predictions.append(prediction)

            global_predictions.append(predictions)

        #-------------------------------
        classified = 0
        for index, instance in df.iterrows():
            actual = actual_values[index]

            # collect every tree's vote for this instance, skipping trees
            # that returned no decision (why None exists in some cases is
            # unclear — presumably unseen branches; TODO confirm)
            predictions = []
            for i in range(0, num_of_trees):
                prediction = global_predictions[i][index]
                if prediction is not None:
                    predictions.append(prediction)

            predictions = np.array(predictions)
            unique_values = np.unique(predictions)

            if unique_values.shape[0] == 1:
                prediction = unique_values[0]
            else:
                # majority vote; ties resolve to the first (sorted) label,
                # matching the original argmax behavior
                counts = [(predictions == unique).sum() for unique in unique_values]
                prediction = None
                if len(counts) > 0:
                    max_index = np.argmax(np.array(counts))
                    prediction = unique_values[max_index]

            if actual == prediction:
                classified = classified + 1

        print("Accuracy: ", 100 * classified / number_of_instances, "% on ", number_of_instances, " instances")

    return models
def fit(df, config):
    """Train a decision tree or a random forest (localized variant).

    Validates that the rightmost column is named 'Decision', normalizes
    config, builds the findDecision() header for generated rules files,
    and trains either a random forest or a single tree.

    Args:
        df: pandas DataFrame; the rightmost column must be 'Decision'.
        config: options dict ('algorithm', 'RandomForest', 'num_of_trees', ...).

    Returns:
        dict with keys 'trees', 'alphas' (always empty here) and 'config'.

    Raises:
        ValueError: on a misnamed target column, or a nominal target
            with the Regression algorithm (messages are in Vietnamese).
    """
    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError('Lỗi dữ liệu, hãy chuyển dữ liệu về đúng định dạng!')
    #------------------------
    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()
    algorithm = config['algorithm']
    RandomForest = config['RandomForest']
    num_of_trees = config['num_of_trees']
    #------------------------
    raw_df = df.copy()
    num_of_rows = df.shape[0]
    num_of_columns = df.shape[1]
    # Regression demands a numeric target
    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError('Lỗi dữ liệu khi chạy kết quả dạng Regression Tree')
    # a numeric target forces regression even if another algorithm was set
    if df['Decision'].dtypes != 'object':
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)
    #-------------------------
    print(algorithm, ": Đang tiến hành tạo cây quyết định...")
    dataset_features = dict()  # column-name -> dtype, consumed by the trainer
    # build the "def findDecision(obj): #obj[0]: col, ..." header shared by
    # every generated rules file
    header = "def findDecision("
    header = header + "obj"
    header = header + "): #"
    num_of_columns = df.shape[1] - 1
    for i in range(0, num_of_columns):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        header = header + "obj[" + str(i) + "]: " + column_name
        if i != num_of_columns - 1:
            header = header + ", "
    header = header + "\n"
    #------------------------
    begin = time.time()
    trees = []
    alphas = []
    if RandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features)
    else:
        # single regular decision tree
        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)
        trees = Training.buildDecisionTree(df, root, file, config, dataset_features)
    print("Thuật toán chạy hoàn thành trong: ", time.time() - begin, " giây")
    obj = {"trees": trees, "alphas": alphas, "config": config}
    return obj
def fit(df, config):
    """Train a tree/ensemble (debug-aware variant, no return value).

    Reads config flags, forces Regression for numeric targets and for GBM,
    builds the findDecision() header (named arguments in debug mode, a
    single 'obj' parameter otherwise), then dispatches to adaboost, gbm,
    randomforest, or a single tree build.

    Args:
        df: pandas DataFrame whose last column 'Decision' is the target.
        config: options dict (debug, algorithm, enable* flags, epochs, ...).

    Raises:
        ValueError: if Regression is requested for a nominal target.
    """
    #config parameters
    debug = config['debug']
    algorithm = config['algorithm']
    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']
    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']
    enableAdaboost = config['enableAdaboost']
    #------------------------
    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError('Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.')
    if df['Decision'].dtypes != 'object':
        #this must be regression tree even if it is not mentioned in algorithm
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)
    if enableGBM == True:
        debug = False  #gbm needs rules files to iterate
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
    #-------------------------
    print(algorithm, " tree is going to be built...")
    dataset_features = dict()  #initialize a dictionary. this is going to be used to check features numeric or nominal. numeric features should be transformed to nominal values based on scales.
    #header of rules files
    # (the original wrapped this section in a no-op `if (True):` — removed)
    header = "def findDecision("
    num_of_columns = df.shape[1] - 1
    for i in range(0, num_of_columns):
        # debug mode spells out each column name as a parameter;
        # otherwise the generated function takes a single 'obj' list
        if debug == True:
            if i > 0:
                header = header + ","
            header = header + df.columns[i]
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
    if debug == False:
        header = header + "obj"
    header = header + "):\n"
    if debug == True:
        print(header, end='')
    #------------------------
    begin = time.time()
    if enableAdaboost == True:
        adaboost.apply(df, config, header, dataset_features)
    elif enableGBM == True:
        if df['Decision'].dtypes == 'object':
            #transform classification problem to regression
            gbm.classifier(df, config, header, dataset_features)
        else:
            #regression
            gbm.regressor(df, config, header, dataset_features)
    elif enableRandomForest == True:
        randomforest.apply(df, config, header, dataset_features)
    else:
        #regular decision tree building
        root = 1
        file = "outputs/rules/rules.py"
        if debug == False:
            functions.createFile(file, header)
        Training.buildDecisionTree(df, root, file, config, dataset_features)
    print("finished in ", time.time() - begin, " seconds")
def regressor(df, config, header, dataset_features):
    """Gradient boosting for regression: train `epochs` residual trees.

    Each round imports the previous round's rules module, predicts,
    accumulates the boosted prediction, and retrains on the learning-rate
    scaled residuals, writing rules<i>.py and data<i>.csv along the way.

    Returns:
        list of imported rules modules, one per boosting round.
    """
    models = []

    algorithm = config['algorithm']
    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']
    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']
    enableAdaboost = config['enableAdaboost']

    #------------------------------
    boosted_from = 0
    boosted_to = 0
    #------------------------------

    base_df = df.copy()  #gbm will manipulate actuals. store its raw version.

    target_values = base_df['Decision'].values
    num_of_instances = target_values.shape[0]

    root = 1
    file = "outputs/rules/rules0.py"
    functions.createFile(file, header)

    Training.buildDecisionTree(df, root, file, config, dataset_features)  #generate rules0

    df = base_df.copy()
    base_df['Boosted_Prediction'] = 0

    #------------------------------
    pbar = tqdm(range(1, epochs + 1), desc='Boosting')
    for index in pbar:
        loss = 0

        #run data(i-1) and rules(i-1), save data(i)
        #dynamic import
        moduleName = "outputs/rules/rules%s" % (index - 1)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)  #rules0

        models.append(myrules)

        # BUG FIX: the original opened this file with open(new_data_set, "w")
        # and never wrote or closed it (a leaked handle); df.to_csv below
        # creates/overwrites it anyway.
        new_data_set = "outputs/data/data%s.csv" % (index)

        #----------------------------------------
        # Epoch column tells findPrediction which round's rules to apply
        df['Epoch'] = index
        df['Prediction'] = df.apply(findPrediction, axis=1)
        base_df['Boosted_Prediction'] += df['Prediction']

        # sum of squared errors of the accumulated ensemble
        loss = (base_df['Boosted_Prediction'] - base_df['Decision']).pow(2).sum()
        if index == 1:
            boosted_from = loss / num_of_instances
        elif index == epochs:
            boosted_to = loss / num_of_instances

        # next round trains on learning-rate scaled residuals.
        # BUG FIX: int(learning_rate) truncated any fractional rate
        # (e.g. 0.1) to 0, zeroing every residual.
        df['Decision'] = learning_rate * (df['Decision'] - df['Prediction'])
        df = df.drop(columns=['Epoch', 'Prediction'])

        #---------------------------------
        df.to_csv(new_data_set, index=False)  #data(i) created
        #---------------------------------

        file = "outputs/rules/rules" + str(index) + ".py"
        functions.createFile(file, header)

        current_df = df.copy()
        Training.buildDecisionTree(df, root, file, config, dataset_features)
        df = current_df.copy()  #numeric features require this restoration to apply findDecision function
        #rules(i) created

        loss = loss / num_of_instances
        # %d truncates the float loss in the progress bar — display only
        pbar.set_description("Epoch %d. Loss: %d. Process: " % (index, loss))

    #---------------------------------
    print(num_of_instances, " instances are boosted from ", boosted_from, " to ", boosted_to, " in ", epochs, " epochs")

    return models
def classifier(df, config, header, dataset_features):
    """Gradient boosting for classification via one-vs-all regression trees.

    For each class a regression tree is fit per epoch: round 0 on a 0/1
    indicator of the class, later rounds on the softmax residual Y - P.
    Boosted per-class scores accumulate in `boosted_predictions`; argmax
    over them gives the predicted class, and per-epoch training accuracy
    is reported through the progress bar.

    Returns:
        (models, classes): imported rules modules (epoch-major, class-minor
        order) and the array of class labels.
    """
    models = []

    print("gradient boosting for classification")

    epochs = config['epochs']

    temp_df = df.copy()
    original_dataset = df.copy()
    worksheet = df.copy()

    classes = df['Decision'].unique()

    # running sum of per-class regression scores, one column per class
    boosted_predictions = np.zeros([df.shape[0], len(classes)])

    pbar = tqdm(range(0, epochs), desc='Boosting')

    #store actual set, we will use this to calculate loss
    # one-hot encoding of the target
    actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]), columns=classes)
    for i in range(0, len(classes)):
        current_class = classes[i]
        actual_set[current_class] = np.where(df['Decision'] == current_class, 1, 0)
    actual_set = actual_set.values  #transform it to numpy array

    #for epoch in range(0, epochs):
    for epoch in pbar:
        for i in range(0, len(classes)):
            current_class = classes[i]

            if epoch == 0:
                # round 0: binary indicator target for this class
                temp_df['Decision'] = np.where(df['Decision'] == current_class, 1, 0)
                worksheet['Y_' + str(i)] = temp_df['Decision']
            else:
                # later rounds: fit the residual Y - P of the previous round
                temp_df['Decision'] = worksheet['Y-P_' + str(i)]

            predictions = []

            #change data type for decision column
            # NOTE(review): astype result is discarded — this line is a
            # no-op as written; verify whether the cast was intended
            temp_df[['Decision']].astype('int64')

            root = 1
            file = "outputs/rules/rules-for-" + current_class + "-round-" + str(epoch) + ".py"

            functions.createFile(file, header)
            Training.buildDecisionTree(temp_df, root, file, config, dataset_features)
            #decision rules created
            #----------------------------
            #dynamic import
            moduleName = "outputs/rules/rules-for-" + current_class + "-round-" + str(epoch)
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description)  #rules0

            models.append(myrules)

            num_of_columns = df.shape[1]

            # apply the freshly built rules to every instance
            for row, instance in df.iterrows():
                features = []
                for j in range(0, num_of_columns - 1):  #iterate on features
                    features.append(instance[j])

                actual = temp_df.loc[row]['Decision']
                prediction = myrules.findDecision(features)
                predictions.append(prediction)

            #----------------------------
            if epoch == 0:
                worksheet['F_' + str(i)] = 0
            else:
                worksheet['F_' + str(i)] = pd.Series(predictions).values

            boosted_predictions[:, i] = boosted_predictions[:, i] + worksheet['F_' + str(i)].values.astype(np.float32)
            #print(boosted_predictions[0:5,:])

            worksheet['P_' + str(i)] = 0

        #----------------------------
        temp_df = df.copy()  #restoration

        # softmax over the per-class F scores gives probabilities P_i
        for row, instance in worksheet.iterrows():
            f_scores = []
            for i in range(0, len(classes)):
                f_scores.append(instance['F_' + str(i)])

            probabilities = functions.softmax(f_scores)

            for j in range(0, len(probabilities)):
                instance['P_' + str(j)] = probabilities[j]

            worksheet.loc[row] = instance

        # residuals for the next round
        for i in range(0, len(classes)):
            worksheet['Y-P_' + str(i)] = worksheet['Y_' + str(i)] - worksheet['P_' + str(i)]

        # one-hot prediction from the argmax of boosted scores
        prediction_set = np.zeros([df.shape[0], len(classes)])
        for i in range(0, boosted_predictions.shape[0]):
            predicted_index = np.argmax(boosted_predictions[i])
            prediction_set[i][predicted_index] = 1

        #----------------------------
        #find loss for this epoch: prediction_set vs actual_set
        classified = 0
        for i in range(0, actual_set.shape[0]):
            actual = np.argmax(actual_set[i])
            prediction = np.argmax(prediction_set[i])
            #print("actual: ",actual," - prediction: ",prediction)
            if actual == prediction:
                classified = classified + 1

        accuracy = str(100 * classified / actual_set.shape[0]) + "%"

        #----------------------------
        #print(worksheet.head())
        #print("round ",epoch+1)
        pbar.set_description("Epoch %d. Accuracy: %s. Process: " % (epoch + 1, accuracy))

    return models, classes
def classifier(df, config, header, dataset_features):
    """Gradient boosting for classification (early variant, no return value).

    One-vs-all scheme: per class, round 0 fits a 0/1 indicator target and
    later rounds fit the softmax residual Y - P. Unlike the newer variant,
    rules files are keyed by class only (each epoch overwrites the previous
    one) and the imported modules are not collected or returned.
    """
    print("gradient boosting for classification")

    debug = config['debug']
    epochs = config['epochs']

    temp_df = df.copy()
    original_dataset = df.copy()
    worksheet = df.copy()

    classes = df['Decision'].unique()

    # running sum of per-class regression scores, one column per class
    boosted_predictions = np.zeros([df.shape[0], len(classes)])

    for epoch in range(0, epochs):
        for i in range(0, len(classes)):
            current_class = classes[i]

            if epoch == 0:
                # round 0: binary indicator target for this class
                temp_df['Decision'] = np.where(df['Decision'] == current_class, 1, 0)
                worksheet['Y_' + str(i)] = temp_df['Decision']
            else:
                # later rounds: fit the residual Y - P of the previous round
                temp_df['Decision'] = worksheet['Y-P_' + str(i)]

            predictions = []

            #change data type for decision column
            # NOTE(review): astype result is discarded — this line is a
            # no-op as written; verify whether the cast was intended
            temp_df[['Decision']].astype('int64')

            root = 1
            file = "outputs/rules/rules-for-" + current_class + ".py"
            if debug == False:
                functions.createFile(file, header)
            Training.buildDecisionTree(temp_df, root, file, config, dataset_features)
            #decision rules created
            #----------------------------
            #dynamic import
            moduleName = "outputs/rules/rules-for-" + current_class
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description)  #rules0

            num_of_columns = df.shape[1]

            # apply the freshly built rules to every instance
            for row, instance in df.iterrows():
                features = []
                for j in range(0, num_of_columns - 1):  #iterate on features
                    features.append(instance[j])

                actual = temp_df.loc[row]['Decision']
                prediction = myrules.findDecision(features)
                predictions.append(prediction)

            #----------------------------
            if epoch == 0:
                worksheet['F_' + str(i)] = 0
            else:
                worksheet['F_' + str(i)] = pd.Series(predictions).values

            boosted_predictions[:, i] = boosted_predictions[:, i] + worksheet['F_' + str(i)].values.astype(np.float32)

            worksheet['P_' + str(i)] = 0

        #----------------------------
        temp_df = df.copy()  #restoration

        # softmax over the per-class F scores gives probabilities P_i
        for row, instance in worksheet.iterrows():
            f_scores = []
            for i in range(0, len(classes)):
                f_scores.append(instance['F_' + str(i)])

            probabilities = functions.softmax(f_scores)

            for j in range(0, len(probabilities)):
                instance['P_' + str(j)] = probabilities[j]

            worksheet.loc[row] = instance

        # residuals for the next round
        for i in range(0, len(classes)):
            worksheet['Y-P_' + str(i)] = worksheet['Y_' + str(i)] - worksheet['P_' + str(i)]

        print("round ", epoch + 1)
def regressor(df, config, header, dataset_features):
    """Gradient boosting for regression (early variant, no return value).

    Builds rules0 on the raw data, then for each epoch imports the previous
    round's rules, replaces the target with learning-rate scaled residuals
    row by row, dumps data<i>.csv, and trains rules<i>.py on it.
    """
    debug = config['debug']
    algorithm = config['algorithm']
    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']
    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']
    enableAdaboost = config['enableAdaboost']

    #------------------------------

    base_df = df.copy()

    root = 1
    file = "outputs/rules/rules0.py"
    if debug == False:
        functions.createFile(file, header)

    Training.buildDecisionTree(df, root, file, config, dataset_features)  #generate rules0

    df = base_df.copy()

    #------------------------------

    for index in range(1, epochs):
        #run data(i-1) and rules(i-1), save data(i)
        #dynamic import
        moduleName = "outputs/rules/rules%s" % (index - 1)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)  #rules0

        # BUG FIX: the original opened new_data_set with open(..., "w") and
        # never wrote or closed it (leaked handle), while also building an
        # unused CSV `line` string per row; both removed — df.to_csv below
        # writes the file.
        new_data_set = "outputs/data/data%s.csv" % (index)

        columns = df.shape[1]

        for i, instance in df.iterrows():
            params = []
            for j in range(0, columns - 1):
                params.append(instance[j])

            # NOTE(review): int() truncates the tree's prediction to an
            # integer — kept as-is to preserve behavior, but verify it is
            # intended for continuous targets.
            prediction = int(myrules.findDecision(params))  #apply rules(i-1) for data(i-1)
            actual = instance[columns - 1]

            #loss was ((actual - prediction)^2) / 2
            #partial derivative of loss function with respect to the prediction is prediction - actual
            #y' = y' - alpha * gradient = y' - alpha * (prediction - actual) = y' + alpha * (actual - prediction)
            #whereas y' is prediction and alpha is learning rate
            # BUG FIX: int(learning_rate) truncated any fractional rate
            # (e.g. 0.1) to 0, zeroing every gradient.
            gradient = learning_rate * (actual - prediction)

            instance[columns - 1] = gradient
            df.loc[i] = instance

        df.to_csv(new_data_set, index=False)  #data(i) created

        #---------------------------------

        file = "outputs/rules/rules" + str(index) + ".py"
        if debug == False:
            functions.createFile(file, header)

        current_df = df.copy()
        Training.buildDecisionTree(df, root, file, config, dataset_features)
        df = current_df.copy()  #numeric features require this restoration to apply findDecision function