Пример #1
0
def apply(df, config, header, dataset_features):

    debug = config['debug']

    #------------------------

    rows = df.shape[0]
    columns = df.shape[1]
    final_predictions = pd.DataFrame(np.zeros([rows, 1]),
                                     columns=['prediction'])

    worksheet = df.copy()
    worksheet['weight'] = 1  #/ rows

    tmp_df = df.copy()
    tmp_df['Decision'] = worksheet['weight'] * tmp_df[
        'Decision']  #normal distribution

    for i in range(0, 1):
        root = 1
        file = "outputs/rules/rules_" + str(i) + ".py"

        if debug == False: functions.createFile(file, header)

        #print(tmp_df)
        Training.buildDecisionTree(tmp_df, root, file, config,
                                   dataset_features)

    #print(final_predictions)
    """for row, instance in final_predictions.iterrows():
Пример #2
0
def apply(df, config, header, dataset_features):

    debug = config['debug']
    num_of_trees = config['num_of_trees']

    for i in range(0, num_of_trees):
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        file = "outputs/rules/rule_" + str(i) + ".py"

        if debug == False:
            functions.createFile(file, header)

        Training.buildDecisionTree(subset, root, file, config,
                                   dataset_features)
Пример #3
0
def fit(df, config):

    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError(
            'Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame'
        )

    #------------------------

    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'Regression']

    if algorithm not in valid_algorithms:
        raise ValueError('Invalid algorithm passed. You passed ', algorithm,
                         " but valid algorithms are ", valid_algorithms)

    #------------------------

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    #------------------------
    raw_df = df.copy()
    num_of_rows = df.shape[0]
    num_of_columns = df.shape[1]

    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError(
                'Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.'
            )

    if df['Decision'].dtypes != 'object':  #this must be regression tree even if it is not mentioned in algorithm
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)

    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        for j in range(0, num_of_columns):
            column_name = df.columns[j]
            if df[column_name].dtypes == 'object':
                raise ValueError(
                    'Adaboost must be run on numeric data set for both features and target'
                )

    #-------------------------

    print(algorithm, " tree is going to be built...")

    dataset_features = dict(
    )  #initialize a dictionary. this is going to be used to check features numeric or nominal. numeric features should be transformed to nominal values based on scales.

    header = "def findDecision("
    header = header + "obj"
    header = header + "): #"

    num_of_columns = df.shape[1] - 1
    for i in range(0, num_of_columns):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        header = header + "obj[" + str(i) + "]: " + column_name
        if i != num_of_columns - 1:
            header = header + ", "

    header = header + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df, config, header, dataset_features)

    elif enableGBM == True:

        if df['Decision'].dtypes == 'object':  #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header,
                                           dataset_features)
            classification = True

        else:  #regression
            trees = gbm.regressor(df, config, header, dataset_features)
            classification = False

    elif enableRandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features)
    else:  #regular decision tree building

        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)
        trees = Training.buildDecisionTree(df, root, file, config,
                                           dataset_features)

    print("finished in ", time.time() - begin, " seconds")

    obj = {"trees": trees, "alphas": alphas, "config": config}

    return obj
Пример #4
0
def apply(df, config, header, dataset_features):

    models = []
    alphas = []

    initializeAlphaFile()

    num_of_weak_classifier = config['num_of_weak_classifier']

    #------------------------

    rows = df.shape[0]
    columns = df.shape[1]
    final_predictions = pd.DataFrame(np.zeros([rows, 1]),
                                     columns=['prediction'])

    worksheet = df.copy()
    worksheet['Weight'] = 1 / rows  #uniform distribution initially

    final_predictions = pd.DataFrame(np.zeros((df.shape[0], 2)),
                                     columns=['Prediction', 'Actual'])
    final_predictions['Actual'] = df['Decision']

    #for i in range(0, num_of_weak_classifier):
    pbar = tqdm(range(0, num_of_weak_classifier), desc='Adaboosting')
    for i in pbar:
        worksheet['Decision'] = worksheet['Weight'] * worksheet['Decision']

        root = 1
        file = "outputs/rules/rules_" + str(i) + ".py"

        functions.createFile(file, header)

        #print(worksheet)
        Training.buildDecisionTree(worksheet.drop(columns=['Weight']), root,
                                   file, config, dataset_features)

        #---------------------------------------

        moduleName = "outputs/rules/rules_" + str(i)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

        #---------------------------------------

        df['Epoch'] = i
        worksheet['Prediction'] = df.apply(findPrediction, axis=1)
        df = df.drop(columns=['Epoch'])

        #---------------------------------------
        worksheet['Actual'] = df['Decision']
        worksheet['Loss'] = abs(worksheet['Actual'] -
                                worksheet['Prediction']) / 2
        worksheet[
            'Weight_Times_Loss'] = worksheet['Loss'] * worksheet['Weight']

        epsilon = worksheet['Weight_Times_Loss'].sum()
        alpha = math.log(
            (1 - epsilon) /
            epsilon) / 2  #use alpha to update weights in the next round
        alphas.append(alpha)

        #-----------------------------

        #store alpha
        addEpochAlpha(i, alpha)

        #-----------------------------

        worksheet['Alpha'] = alpha
        worksheet['New_Weights'] = worksheet['Weight'] * (
            -alpha * worksheet['Actual'] * worksheet['Prediction']).apply(
                math.exp)

        #normalize
        worksheet['New_Weights'] = worksheet['New_Weights'] / worksheet[
            'New_Weights'].sum()
        worksheet['Weight'] = worksheet['New_Weights']
        worksheet['Decision'] = df['Decision']

        final_predictions['Prediction'] = final_predictions[
            'Prediction'] + worksheet['Alpha'] * worksheet['Prediction']
        #print(final_predictions)
        worksheet = worksheet.drop(columns=[
            'New_Weights', 'Prediction', 'Actual', 'Loss', 'Weight_Times_Loss',
            'Alpha'
        ])

        mae = (np.abs(final_predictions['Prediction'].apply(functions.sign) -
                      final_predictions['Actual']) /
               2).sum() / final_predictions.shape[0]
        #print(mae)
        pbar.set_description("Epoch %d. Loss: %d. Process: " % (i + 1, mae))

    #------------------------------
    final_predictions['Prediction'] = final_predictions['Prediction'].apply(
        functions.sign)
    final_predictions['Absolute_Error'] = np.abs(
        final_predictions['Actual'] - final_predictions['Prediction']) / 2
    #print(final_predictions)
    mae = final_predictions['Absolute_Error'].sum(
    ) / final_predictions.shape[0]
    print("Loss (MAE) found ", mae, " with ", num_of_weak_classifier,
          ' weak classifiers')

    return models, alphas
Пример #5
0
def initializeAlphaFile():
    file = "outputs/rules/alphas.py"
    header = "def findAlpha(epoch):\n"
    functions.createFile(file, header)
Пример #6
0
def apply(df, config, header, dataset_features):

    models = []

    num_of_trees = config['num_of_trees']

    pbar = tqdm(range(0, num_of_trees), desc='Bagging')

    for i in pbar:
        #for i in range(0, num_of_trees):
        pbar.set_description("Sub decision tree %d is processing" % (i + 1))
        subset = df.sample(frac=1 / num_of_trees)

        root = 1

        moduleName = "outputs/rules/rule_" + str(i)
        file = moduleName + ".py"

        functions.createFile(file, header)

        Training.buildDecisionTree(subset, root, file, config,
                                   dataset_features)

        #--------------------------------

        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname, description)
        models.append(myrules)

    #-------------------------------
    #check regression or classification
    if df['Decision'].dtypes == 'object': problem_type = 'classification'
    else: problem_type = 'regression'

    actual_values = df['Decision'].values
    num_of_features = df.shape[1] - 1  #discard Decision
    number_of_instances = df.shape[0]

    global_predictions = []

    #if classification get the max number of prediction
    if problem_type == 'classification':
        for i in range(0, num_of_trees):

            moduleName = "outputs/rules/rule_" + str(i)
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname, description)

            predictions = []

            for index, instance in df.iterrows():
                params = []
                for j in range(0, num_of_features):
                    params.append(instance[j])

                #index row, i th column
                prediction = myrules.findDecision(params)
                predictions.append(prediction)
                #print(i,"th tree prediction: ",prediction)

            #print(predictions)
            global_predictions.append(predictions)

        #-------------------------------
        classified = 0
        for index, instance in df.iterrows():

            actual = actual_values[index]
            predictions = []
            for i in range(0, num_of_trees):
                prediction = global_predictions[i][index]
                if prediction != None:  #why None exists in some cases?
                    predictions.append(prediction)

            predictions = np.array(predictions)
            unique_values = np.unique(predictions)

            if unique_values.shape[0] == 1:
                prediction = unique_values[0]
            else:
                counts = []
                for unique in unique_values:
                    count = 0
                    for j in predictions:
                        if unique == j:
                            count = count + 1
                    counts.append(count)

                #print("unique: ",unique_values)
                #print("counts: ",counts)

                prediction = None

                if len(counts) > 0:
                    max_index = np.argmax(np.array(counts))
                    prediction = unique_values[max_index]

            #print(index,". actual: ",actual," - prediction: ", prediction)
            if actual == prediction:
                classified = classified + 1

        print("Accuracy: ", 100 * classified / number_of_instances, "% on ",
              number_of_instances, " instances")

    return models
Пример #7
0
def fit(df, config):

    target_label = df.columns[len(df.columns) - 1]
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError('Lỗi dữ liệu, hãy chuyển dữ liệu về đúng định dạng!')

    #------------------------

    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    algorithm = config['algorithm']

    RandomForest = config['RandomForest']
    num_of_trees = config['num_of_trees']

    #------------------------
    raw_df = df.copy()
    num_of_rows = df.shape[0]
    num_of_columns = df.shape[1]

    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError(
                'Lỗi dữ liệu khi chạy kết quả dạng Regression Tree')

    if df['Decision'].dtypes != 'object':
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)

    #-------------------------

    print(algorithm, ": Đang tiến hành tạo cây quyết định...")

    dataset_features = dict()  # dictionary

    header = "def findDecision("
    header = header + "obj"
    header = header + "): #"

    num_of_columns = df.shape[1] - 1
    for i in range(0, num_of_columns):
        column_name = df.columns[i]
        dataset_features[column_name] = df[column_name].dtypes
        header = header + "obj[" + str(i) + "]: " + column_name
        if i != num_of_columns - 1:
            header = header + ", "

    header = header + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if RandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features)
    else:
        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)
        trees = Training.buildDecisionTree(df, root, file, config,
                                           dataset_features)

    print("Thuật toán chạy hoàn thành trong:  ", time.time() - begin, " giây")

    obj = {"trees": trees, "alphas": alphas, "config": config}
    return obj
Пример #8
0
def fit(df, config):

    #config parameters

    debug = config['debug']
    algorithm = config['algorithm']

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    #------------------------
    if algorithm == 'Regression':
        if df['Decision'].dtypes == 'object':
            raise ValueError(
                'Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.'
            )

    if df['Decision'].dtypes != 'object':  #this must be regression tree even if it is not mentioned in algorithm
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'
        global_stdev = df['Decision'].std(ddof=0)

    if enableGBM == True:
        debug = False  #gbm needs rules files to iterate
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    #-------------------------

    print(algorithm, " tree is going to be built...")

    dataset_features = dict(
    )  #initialize a dictionary. this is going to be used to check features numeric or nominal. numeric features should be transformed to nominal values based on scales.

    if (True):  #header of rules files
        header = "def findDecision("
        num_of_columns = df.shape[1] - 1
        for i in range(0, num_of_columns):
            if debug == True:
                if i > 0:
                    header = header + ","
                header = header + df.columns[i]

            column_name = df.columns[i]
            dataset_features[column_name] = df[column_name].dtypes

        if debug == False:
            header = header + "obj"

        header = header + "):\n"

        if debug == True:
            print(header, end='')

    #------------------------

    begin = time.time()

    if enableAdaboost == True:
        adaboost.apply(df, config, header, dataset_features)

    elif enableGBM == True:

        if df['Decision'].dtypes == 'object':  #transform classification problem to regression
            gbm.classifier(df, config, header, dataset_features)

        else:  #regression
            gbm.regressor(df, config, header, dataset_features)

    elif enableRandomForest == True:
        randomforest.apply(df, config, header, dataset_features)
    else:  #regular decision tree building

        root = 1
        file = "outputs/rules/rules.py"
        if debug == False: functions.createFile(file, header)
        Training.buildDecisionTree(df, root, file, config, dataset_features)

    print("finished in ", time.time() - begin, " seconds")
Пример #9
0
def regressor(df, config, header, dataset_features):
    models = []

    algorithm = config['algorithm']

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    #------------------------------

    boosted_from = 0
    boosted_to = 0

    #------------------------------

    base_df = df.copy()

    #gbm will manipulate actuals. store its raw version.
    target_values = base_df['Decision'].values
    num_of_instances = target_values.shape[0]

    root = 1
    file = "outputs/rules/rules0.py"
    functions.createFile(file, header)

    Training.buildDecisionTree(df, root, file, config,
                               dataset_features)  #generate rules0

    df = base_df.copy()

    base_df['Boosted_Prediction'] = 0

    #------------------------------

    pbar = tqdm(range(1, epochs + 1), desc='Boosting')

    #for index in range(1,epochs+1):
    #for index in tqdm(range(1,epochs+1), desc='Boosting'):
    for index in pbar:
        #print("epoch ",index," - ",end='')
        loss = 0

        #run data(i-1) and rules(i-1), save data1

        #dynamic import
        moduleName = "outputs/rules/rules%s" % (index - 1)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname,
                                  description)  #rules0

        models.append(myrules)

        new_data_set = "outputs/data/data%s.csv" % (index)
        f = open(new_data_set, "w")

        #put header in the following file
        columns = df.shape[1]

        mae = 0

        #----------------------------------------

        df['Epoch'] = index
        df['Prediction'] = df.apply(findPrediction, axis=1)

        base_df['Boosted_Prediction'] += df['Prediction']

        loss = (base_df['Boosted_Prediction'] -
                base_df['Decision']).pow(2).sum()

        if index == 1:
            boosted_from = loss / num_of_instances
        elif index == epochs:
            boosted_to = loss / num_of_instances

        df['Decision'] = int(learning_rate) * (df['Decision'] -
                                               df['Prediction'])
        df = df.drop(columns=['Epoch', 'Prediction'])

        #---------------------------------

        df.to_csv(new_data_set, index=False)
        #data(i) created

        #---------------------------------

        file = "outputs/rules/rules" + str(index) + ".py"

        functions.createFile(file, header)

        current_df = df.copy()
        Training.buildDecisionTree(df, root, file, config, dataset_features)
        df = current_df.copy(
        )  #numeric features require this restoration to apply findDecision function

        #rules(i) created

        loss = loss / num_of_instances
        #print("epoch ",index," - loss: ",loss)
        #print("loss: ",loss)
        pbar.set_description("Epoch %d. Loss: %d. Process: " % (index, loss))

        #---------------------------------

    print(num_of_instances, " instances are boosted from ", boosted_from,
          " to ", boosted_to, " in ", epochs, " epochs")

    return models
Пример #10
0
def classifier(df, config, header, dataset_features):

    models = []

    print("gradient boosting for classification")

    epochs = config['epochs']

    temp_df = df.copy()
    original_dataset = df.copy()
    worksheet = df.copy()

    classes = df['Decision'].unique()

    boosted_predictions = np.zeros([df.shape[0], len(classes)])

    pbar = tqdm(range(0, epochs), desc='Boosting')

    #store actual set, we will use this to calculate loss
    actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]),
                              columns=classes)
    for i in range(0, len(classes)):
        current_class = classes[i]
        actual_set[current_class] = np.where(df['Decision'] == current_class,
                                             1, 0)
    actual_set = actual_set.values  #transform it to numpy array

    #for epoch in range(0, epochs):
    for epoch in pbar:
        for i in range(0, len(classes)):
            current_class = classes[i]

            if epoch == 0:
                temp_df['Decision'] = np.where(df['Decision'] == current_class,
                                               1, 0)
                worksheet['Y_' + str(i)] = temp_df['Decision']
            else:
                temp_df['Decision'] = worksheet['Y-P_' + str(i)]

            predictions = []

            #change data type for decision column
            temp_df[['Decision']].astype('int64')

            root = 1
            file = "outputs/rules/rules-for-" + current_class + "-round-" + str(
                epoch) + ".py"

            functions.createFile(file, header)

            Training.buildDecisionTree(temp_df, root, file, config,
                                       dataset_features)
            #decision rules created
            #----------------------------

            #dynamic import
            moduleName = "outputs/rules/rules-for-" + current_class + "-round-" + str(
                epoch)
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0

            models.append(myrules)

            num_of_columns = df.shape[1]

            for row, instance in df.iterrows():
                features = []
                for j in range(0, num_of_columns - 1):  #iterate on features
                    features.append(instance[j])

                actual = temp_df.loc[row]['Decision']
                prediction = myrules.findDecision(features)

                predictions.append(prediction)

            #----------------------------
            if epoch == 0:
                worksheet['F_' + str(i)] = 0
            else:
                worksheet['F_' + str(i)] = pd.Series(predictions).values

            boosted_predictions[:, i] = boosted_predictions[:, i] + worksheet[
                'F_' + str(i)].values.astype(np.float32)

            #print(boosted_predictions[0:5,:])

            worksheet['P_' + str(i)] = 0

            #----------------------------
            temp_df = df.copy()  #restoration

        for row, instance in worksheet.iterrows():
            f_scores = []
            for i in range(0, len(classes)):
                f_scores.append(instance['F_' + str(i)])

            probabilities = functions.softmax(f_scores)

            for j in range(0, len(probabilities)):
                instance['P_' + str(j)] = probabilities[j]

            worksheet.loc[row] = instance

        for i in range(0, len(classes)):
            worksheet['Y-P_' +
                      str(i)] = worksheet['Y_' + str(i)] - worksheet['P_' +
                                                                     str(i)]

        prediction_set = np.zeros([df.shape[0], len(classes)])
        for i in range(0, boosted_predictions.shape[0]):
            predicted_index = np.argmax(boosted_predictions[i])
            prediction_set[i][predicted_index] = 1

        #----------------------------
        #find loss for this epoch: prediction_set vs actual_set
        classified = 0
        for i in range(0, actual_set.shape[0]):
            actual = np.argmax(actual_set[i])
            prediction = np.argmax(prediction_set[i])
            #print("actual: ",actual," - prediction: ",prediction)

            if actual == prediction:
                classified = classified + 1

        accuracy = str(100 * classified / actual_set.shape[0]) + "%"

        #----------------------------

        #print(worksheet.head())
        #print("round ",epoch+1)
        pbar.set_description("Epoch %d. Accuracy: %s. Process: " %
                             (epoch + 1, accuracy))

    return models, classes
Пример #11
0
def classifier(df, config, header, dataset_features):
    print("gradient boosting for classification")

    debug = config['debug']
    epochs = config['epochs']

    temp_df = df.copy()
    original_dataset = df.copy()
    worksheet = df.copy()

    classes = df['Decision'].unique()

    boosted_predictions = np.zeros([df.shape[0], len(classes)])

    for epoch in range(0, epochs):
        for i in range(0, len(classes)):
            current_class = classes[i]

            if epoch == 0:
                temp_df['Decision'] = np.where(df['Decision'] == current_class,
                                               1, 0)
                worksheet['Y_' + str(i)] = temp_df['Decision']
            else:
                temp_df['Decision'] = worksheet['Y-P_' + str(i)]

            predictions = []

            #change data type for decision column
            temp_df[['Decision']].astype('int64')

            root = 1
            file = "outputs/rules/rules-for-" + current_class + ".py"

            if debug == False: functions.createFile(file, header)

            Training.buildDecisionTree(temp_df, root, file, config,
                                       dataset_features)
            #decision rules created
            #----------------------------

            #dynamic import
            moduleName = "outputs/rules/rules-for-" + current_class
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0

            num_of_columns = df.shape[1]

            for row, instance in df.iterrows():
                features = []
                for j in range(0, num_of_columns - 1):  #iterate on features
                    features.append(instance[j])

                actual = temp_df.loc[row]['Decision']
                prediction = myrules.findDecision(features)
                predictions.append(prediction)

            #----------------------------
            if epoch == 0:
                worksheet['F_' + str(i)] = 0
            else:
                worksheet['F_' + str(i)] = pd.Series(predictions).values

            boosted_predictions[:, i] = boosted_predictions[:, i] + worksheet[
                'F_' + str(i)].values.astype(np.float32)

            worksheet['P_' + str(i)] = 0

            #----------------------------
            temp_df = df.copy()  #restoration

        for row, instance in worksheet.iterrows():
            f_scores = []
            for i in range(0, len(classes)):
                f_scores.append(instance['F_' + str(i)])

            probabilities = functions.softmax(f_scores)

            for j in range(0, len(probabilities)):
                instance['P_' + str(j)] = probabilities[j]

            worksheet.loc[row] = instance

        for i in range(0, len(classes)):
            worksheet['Y-P_' +
                      str(i)] = worksheet['Y_' + str(i)] - worksheet['P_' +
                                                                     str(i)]

        print("round ", epoch + 1)
Пример #12
0
def regressor(df, config, header, dataset_features):

    debug = config['debug']
    algorithm = config['algorithm']

    enableRandomForest = config['enableRandomForest']
    num_of_trees = config['num_of_trees']
    enableMultitasking = config['enableMultitasking']

    enableGBM = config['enableGBM']
    epochs = config['epochs']
    learning_rate = config['learning_rate']

    enableAdaboost = config['enableAdaboost']

    #------------------------------

    base_df = df.copy()

    root = 1
    file = "outputs/rules/rules0.py"
    if debug == False:
        functions.createFile(file, header)

    Training.buildDecisionTree(df, root, file, config,
                               dataset_features)  #generate rules0

    df = base_df.copy()

    #------------------------------

    for index in range(1, epochs):
        #run data(i-1) and rules(i-1), save data1

        #dynamic import
        moduleName = "outputs/rules/rules%s" % (index - 1)
        fp, pathname, description = imp.find_module(moduleName)
        myrules = imp.load_module(moduleName, fp, pathname,
                                  description)  #rules0

        new_data_set = "outputs/data/data%s.csv" % (index)
        f = open(new_data_set, "w")

        #put header in the following file
        columns = df.shape[1]

        for i, instance in df.iterrows():
            params = []
            line = ""
            for j in range(0, columns - 1):
                params.append(instance[j])
                if j > 0:
                    line = line + ","
                line = line + str(instance[j])

            prediction = int(
                myrules.findDecision(params))  #apply rules(i-1) for data(i-1)
            actual = instance[columns - 1]

            #print(prediction)

            #loss was ((actual - prediction)^2) / 2
            #partial derivative of loss function with respect to the prediction is prediction - actual
            #y' = y' - alpha * gradient = y' - alpha * (prediction - actual) = y' = y' + alpha * (actual - prediction)
            #whereas y' is prediction and alpha is learning rate

            gradient = int(learning_rate) * (actual - prediction)

            instance[columns - 1] = gradient

            df.loc[i] = instance

        df.to_csv(new_data_set, index=False)
        #data(i) created
        #---------------------------------

        file = "outputs/rules/rules" + str(index) + ".py"

        if debug == False:
            functions.createFile(file, header)

        current_df = df.copy()
        Training.buildDecisionTree(df, root, file, config, dataset_features)
        df = current_df.copy(
        )  #numeric features require this restoration to apply findDecision function