Пример #1
0
def fit(df, config):
    """Build a decision tree (or a random forest) from ``df`` and return it.

    Args:
        df: pandas DataFrame whose last column is the target and must be
            named ``'Decision'``; all preceding columns are features.
        config: dict of settings. It is first passed through
            ``functions.initializeParams`` (presumably to fill in defaults --
            confirm against that helper); the keys ``'algorithm'`` and
            ``'RandomForest'`` are then read from the result.
            ``config['algorithm']`` is overwritten with ``'Regression'`` when
            the target column is not of ``object`` dtype.

    Returns:
        dict with the keys ``"trees"`` (whatever the selected training
        routine returned), ``"alphas"`` (always an empty list here) and
        ``"config"`` (the initialized configuration).

    Raises:
        ValueError: if ``df`` has no columns, if its last column is not
            named ``'Decision'``, or if ``'Regression'`` was requested for an
            ``object``-dtype target.
    """
    # A frame without columns has no "last column"; treat it as a wrong
    # target label so the caller gets the ValueError below, not an IndexError.
    target_label = df.columns[-1] if len(df.columns) > 0 else None
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError('Lỗi dữ liệu, hãy chuyển dữ liệu về đúng định dạng!')

    #------------------------

    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    algorithm = config['algorithm']
    RandomForest = config['RandomForest']

    #------------------------

    target_is_nominal = df['Decision'].dtypes == 'object'

    if algorithm == 'Regression' and target_is_nominal:
        raise ValueError(
            'Lỗi dữ liệu khi chạy kết quả dạng Regression Tree')

    # A non-object target forces a regression tree whatever was requested.
    if not target_is_nominal:
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    #-------------------------

    print(algorithm, ": Đang tiến hành tạo cây quyết định...")

    # Feature name -> dtype for every column except the trailing target.
    feature_names = df.columns[:-1]
    dataset_features = {name: df[name].dtypes for name in feature_names}

    # First line of the generated rules file: the function signature followed
    # by a comment mapping obj[i] to the column it holds. str.format is used
    # so that non-string column labels (e.g. integers) do not raise TypeError.
    index_legend = ", ".join(
        "obj[{}]: {}".format(position, name)
        for position, name in enumerate(feature_names))
    header = "def findDecision(obj): #" + index_legend + "\n"

    #------------------------

    begin = time.time()

    alphas = []

    # Compared with == True (not plain truthiness) so that non-bool config
    # values keep selecting the same branch as before.
    if RandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features)
    else:
        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)
        trees = Training.buildDecisionTree(df, root, file, config,
                                           dataset_features)

    print("Thuật toán chạy hoàn thành trong:  ", time.time() - begin, " giây")

    obj = {"trees": trees, "alphas": alphas, "config": config}
    return obj
Пример #2
0
def fit(df, config):
    """Train a tree-based model on ``df`` and return the built trees.

    Depending on the configuration this builds a single decision tree, a
    random forest, a gradient boosting ensemble or an Adaboost ensemble.

    Args:
        df: pandas DataFrame whose last column is the target and must be
            named ``'Decision'``; all preceding columns are features.
        config: dict of settings. It is first passed through
            ``functions.initializeParams`` (presumably to fill in defaults --
            confirm against that helper); the keys ``'algorithm'``,
            ``'enableRandomForest'``, ``'enableGBM'`` and ``'enableAdaboost'``
            are then read from the result. ``config['algorithm']`` is
            overwritten with ``'Regression'`` when the target is not of
            ``object`` dtype or when GBM is enabled.

    Returns:
        dict with the keys ``"trees"``, ``"alphas"`` (an empty list unless
        Adaboost or the GBM classifier supplied values) and ``"config"``
        (the initialized configuration).

    Raises:
        ValueError: if ``df`` has no columns, if its last column is not
            named ``'Decision'``, if the algorithm is not one of ID3, C4.5,
            CART or Regression, if ``'Regression'`` was requested for an
            ``object``-dtype target, or if Adaboost is enabled and any
            column has ``object`` dtype.
    """
    # A frame without columns has no "last column"; treat it as a wrong
    # target label so the caller gets the ValueError below, not an IndexError.
    target_label = df.columns[-1] if len(df.columns) > 0 else None
    if target_label != 'Decision':
        print("Expected: Decision, Existing: ", target_label)
        raise ValueError(
            'Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame'
        )

    #------------------------

    #initialize params and folders
    config = functions.initializeParams(config)
    functions.initializeFolders()

    #------------------------

    algorithm = config['algorithm']

    valid_algorithms = ['ID3', 'C4.5', 'CART', 'Regression']

    if algorithm not in valid_algorithms:
        # One formatted message: passing several positional arguments to
        # ValueError makes str(exc) render as a tuple.
        raise ValueError(
            'Invalid algorithm passed. You passed {} but valid algorithms are {}'
            .format(algorithm, valid_algorithms))

    #------------------------

    enableRandomForest = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']

    #------------------------

    target_is_nominal = df['Decision'].dtypes == 'object'

    if algorithm == 'Regression' and target_is_nominal:
        raise ValueError(
            'Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.'
        )

    # A non-object target must be a regression tree even if the caller asked
    # for something else.
    if not target_is_nominal:
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    # Flags are compared with == True (not plain truthiness) so that non-bool
    # config values keep selecting the same branches as before.
    if enableGBM == True:
        print("Gradient Boosting Machines...")
        algorithm = 'Regression'
        config['algorithm'] = 'Regression'

    if enableAdaboost == True:
        # Checks every column, target included.
        if any(df[column_name].dtypes == 'object'
               for column_name in df.columns):
            raise ValueError(
                'Adaboost must be run on numeric data set for both features and target'
            )

    #-------------------------

    print(algorithm, " tree is going to be built...")

    # Feature name -> dtype for every column except the trailing target;
    # handed to the training routines below.
    feature_names = df.columns[:-1]
    dataset_features = {name: df[name].dtypes for name in feature_names}

    # First line of the generated rules file: the function signature followed
    # by a comment mapping obj[i] to the column it holds. str.format is used
    # so that non-string column labels (e.g. integers) do not raise TypeError.
    index_legend = ", ".join(
        "obj[{}]: {}".format(position, name)
        for position, name in enumerate(feature_names))
    header = "def findDecision(obj): #" + index_legend + "\n"

    #------------------------

    begin = time.time()

    trees = []
    alphas = []

    if enableAdaboost == True:
        trees, alphas = adaboost.apply(df, config, header, dataset_features)

    elif enableGBM == True:

        if target_is_nominal:  #transform classification problem to regression
            trees, alphas = gbm.classifier(df, config, header,
                                           dataset_features)

        else:  #regression
            trees = gbm.regressor(df, config, header, dataset_features)

    elif enableRandomForest == True:
        trees = randomforest.apply(df, config, header, dataset_features)
    else:  #regular decision tree building

        root = 1
        file = "outputs/rules/rules.py"
        functions.createFile(file, header)
        trees = Training.buildDecisionTree(df, root, file, config,
                                           dataset_features)

    print("finished in ", time.time() - begin, " seconds")

    obj = {"trees": trees, "alphas": alphas, "config": config}

    return obj
Пример #3
0
def fit(df, config):
    """Train a tree-based model on ``df`` according to ``config``.

    Reads the training switches from ``config``, decides whether the task is
    a regression, prepares the header line of the generated rules file and
    hands over to Adaboost, GBM, random forest or the plain decision tree
    builder. Nothing is returned; the elapsed time is printed at the end.

    Args:
        df: pandas DataFrame with a ``'Decision'`` target column; every
            column except the last one is recorded as a feature.
        config: dict providing ``'debug'``, ``'algorithm'``,
            ``'enableRandomForest'``, ``'num_of_trees'``,
            ``'enableMultitasking'``, ``'enableGBM'``, ``'epochs'``,
            ``'learning_rate'`` and ``'enableAdaboost'``.
            ``config['algorithm']`` is set to ``'Regression'`` when the
            target is not of ``object`` dtype or when GBM is enabled.

    Raises:
        KeyError: if one of the configuration keys above is missing.
        ValueError: if ``'Regression'`` is requested for an ``object``-dtype
            target.
    """
    # Settings are fetched up front, in this order, so a missing key fails
    # before any work is done.
    debug = config['debug']
    algorithm = config['algorithm']

    use_random_forest = config['enableRandomForest']
    tree_count = config['num_of_trees']
    use_multitasking = config['enableMultitasking']

    use_gbm = config['enableGBM']
    epoch_count = config['epochs']
    step_size = config['learning_rate']

    use_adaboost = config['enableAdaboost']

    # ---- regression / classification decision ----
    target_is_nominal = df['Decision'].dtypes == 'object'

    if algorithm == 'Regression' and target_is_nominal:
        raise ValueError(
            'Regression trees cannot be applied for nominal target values! You can either change the algorithm or data set.'
        )

    if not target_is_nominal:
        # a non-object target means regression even when not requested
        config['algorithm'] = 'Regression'
        algorithm = 'Regression'
        # computed here but not used further inside this function
        target_stdev = df['Decision'].std(ddof=0)

    if use_gbm == True:
        # gbm needs rules files to iterate, so debug output is switched off
        debug = False
        config['algorithm'] = 'Regression'
        algorithm = 'Regression'

    print(algorithm, " tree is going to be built...")

    # ---- feature catalogue and header of the rules file ----
    feature_columns = df.columns[:-1]

    # column name -> dtype; passed on to the training routines below
    dataset_features = {
        column: df[column].dtypes
        for column in feature_columns
    }

    header_parts = ["def findDecision("]
    if debug == True:
        # debug mode lists the feature columns as individual parameters
        header_parts.append(",".join(feature_columns))
    if debug == False:
        header_parts.append("obj")
    header_parts.append("):\n")
    header = "".join(header_parts)

    if debug == True:
        print(header, end='')

    # ---- training ----
    started_at = time.time()

    if use_adaboost == True:
        adaboost.apply(df, config, header, dataset_features)
    elif use_gbm == True:
        # a nominal target is handled by the classifier variant
        if target_is_nominal:
            gbm.classifier(df, config, header, dataset_features)
        else:
            gbm.regressor(df, config, header, dataset_features)
    elif use_random_forest == True:
        randomforest.apply(df, config, header, dataset_features)
    else:
        # regular decision tree building
        root_level = 1
        rules_path = "outputs/rules/rules.py"
        if debug == False:
            functions.createFile(rules_path, header)
        Training.buildDecisionTree(df, root_level, rules_path, config,
                                   dataset_features)

    print("finished in ", time.time() - started_at, " seconds")