Example #1
def buildDecisionTree(df, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', validation_df = None):
	
	models = []
	feature_names = df.columns[0:-1]
	
	enableParallelism = config['enableParallelism']
	algorithm = config['algorithm']
	
	json_file = file.split(".")[0]+".json"
	
	if root == 1:
		if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
			raw_df = df.copy()
	
	#--------------------------------------
	
	df_copy = df.copy()
	
	winner_name, num_of_instances, metric, metric_name = findDecision(df, config)
	
	#find the winner's index; findDecision cannot return it because the columns were dropped in earlier steps
	j = 0 
	for i in dataset_features:
		if i == winner_name:
			winner_index = j
		j = j + 1
	
	numericColumn = False
	if dataset_features[winner_name] != 'object':
		numericColumn = True
	
	#restoration
	columns = df.shape[1]
	for i in range(0, columns-1):
		column_name = df_copy.columns[i]; column_type = df_copy[column_name].dtypes #numeric fields in df were already transformed to object, so check types on df_copy
		if column_type != 'object' and column_name != winner_name:
			df[column_name] = df_copy[column_name]
	
	classes = df[winner_name].value_counts().keys().tolist()
		
	#-----------------------------------------------------
	
	num_cores = config["num_cores"]
	
	input_params = []
	
	#iterate over branches: run serially or queue params for the parallel pool
	for i in range(0,len(classes)):
		current_class = classes[i]
		subdataset = df[df[winner_name] == current_class]
		subdataset = subdataset.drop(columns=[winner_name])
		branch_index = i * 1
		
		#create branches serially
		if enableParallelism != True:
			
			if i == 0:
				#descriptor = "# Feature: "+winner_name+", Instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4))
				
				descriptor = {
					"feature": winner_name,
					"instances": num_of_instances,
					#"metric_name": metric_name,
					"metric_value": round(metric, 4),
					"depth": parent_level + 1
				}
				descriptor = "# "+json.dumps(descriptor)
				
				functions.storeRule(file, (functions.formatRule(root), "", descriptor))
			
			createBranch(config, current_class, subdataset, numericColumn, branch_index
				, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric)
		else:
			input_params.append((config, current_class, subdataset, numericColumn, branch_index
				, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric))
	
	#---------------------------
	#add else condition in the decision tree
	
	if df.Decision.dtypes == 'object': #classification
		pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
		pivot = pivot.rename(columns = {"Decision": "Instances","index": "Decision"})
		pivot = pivot.sort_values(by = ["Instances"], ascending = False).reset_index()
		
		else_decision = "return '%s'" % (pivot.iloc[0].Decision)
		
		if enableParallelism != True:
			functions.storeRule(file,(functions.formatRule(root), "else:"))
			functions.storeRule(file,(functions.formatRule(root+1), else_decision))
		else: #parallelism
			leaf_id = str(uuid.uuid1())
			custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"
			
			check_rule = "else: "+else_decision
			
			sample_rule = {}
			sample_rule["current_level"] = root
			sample_rule["leaf_id"] = leaf_id
			sample_rule["parents"] = parents
			sample_rule["rule"] = check_rule
			sample_rule["feature_idx"] = -1
			sample_rule["feature_name"] = ""
			sample_rule["instances"] = df.shape[0]
			sample_rule["metric"] = 0
			sample_rule["return_statement"] = 0
			
			#json to string
			sample_rule = json.dumps(sample_rule)
			
			functions.createFile(custom_rule_file, "")
			functions.storeRule(custom_rule_file, sample_rule)
			
	else: #regression
		else_decision = "return %s" % (subdataset.Decision.mean())
				
		if enableParallelism != True:
			functions.storeRule(file,(functions.formatRule(root), "else:"))
			functions.storeRule(file,(functions.formatRule(root+1), else_decision))
		else:
			leaf_id = str(uuid.uuid1())
			custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"
			
			check_rule = "else: "+else_decision
			
			sample_rule = "   {\n"
			sample_rule += "      \"current_level\": "+str(root)+",\n"
			sample_rule += "      \"leaf_id\": \""+str(leaf_id)+"\",\n"
			sample_rule += "      \"parents\": \""+parents+"\",\n"
			sample_rule += "      \"rule\": \""+check_rule+"\"\n"
			sample_rule += "   }"
			
			functions.createFile(custom_rule_file, "")
			functions.storeRule(custom_rule_file, sample_rule)
	
	#---------------------------
	
	#create branches in parallel
	if enableParallelism == True:
		"""
		#this usage causes trouble for recursive functions
		with Pool(number_of_cpus) as pool:
			pool.starmap(createBranch, input_params)
		"""
		
		pool = MyPool(num_cores)
		results = pool.starmap(createBranch, input_params)
		pool.close()
		pool.join()
	
	#---------------------------------------------
	
	if root == 1:
		
		if enableParallelism == True:

			#custom rules are stored in .txt files. merge them all in a json file
			
			functions.createFile(json_file, "[\n")
			
			custom_rules = []
			
			file_index = 0
			for rule_file in os.listdir(os.getcwd()+"/outputs/rules"):
				if rule_file.endswith(".txt"):
					custom_rules.append(os.getcwd()+"/outputs/rules/"+rule_file)
					#print(rule_file) #this file stores a custom rule
					f = open(os.getcwd()+"/outputs/rules/"+rule_file, "r")
					custom_rule = f.read()
					
					if file_index > 0:
						custom_rule = ", "+custom_rule
					
					functions.storeRule(json_file, custom_rule)
					f.close()
					file_index = file_index + 1
					
			functions.storeRule(json_file, "]")
			
			#-----------------------------------
			
			#custom rules are already merged in a json file. clear messy custom rules
			#TO-DO: if random forest trees are built in parallel, this becomes a problem: a rule cannot be traced back to its tree. A global tree id should be stored in each rule.
			
			for rule_file in custom_rules:
				os.remove(rule_file)
			
			#-----------------------------------
			
			reconstructRules(json_file, feature_names)

			#feature importance should perhaps be calculated on demand
			feature_importance(json_file, dataset_features)
			
			#-----------------------------------
		
		#is regular decision tree
		if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
		#this is a regular decision tree. find accuracy here.
			
			moduleName = "outputs/rules/rules"
			fp, pathname, description = imp.find_module(moduleName)
			myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
			models.append(myrules)
			
	return models
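Examples #1, #3, and #4 build branches through `MyPool` rather than a plain `multiprocessing.Pool`, because daemonic pool workers cannot spawn the nested pools that the recursive `buildDecisionTree` calls need. The definition below is a minimal sketch of such a non-daemonic pool using the standard context trick; the name `MyPool` matches the calls above, but the exact implementation is an assumption, not the confirmed source.

import multiprocessing
import multiprocessing.pool

class NoDaemonProcess(multiprocessing.Process):
	#daemonic processes are not allowed to have children; always report daemon=False
	@property
	def daemon(self):
		return False

	@daemon.setter
	def daemon(self, value):
		pass

class NoDaemonContext(type(multiprocessing.get_context())):
	Process = NoDaemonProcess

class MyPool(multiprocessing.pool.Pool):
	#a pool whose workers may themselves create pools, enabling recursive parallelism (assumed definition)
	def __init__(self, *args, **kwargs):
		kwargs['context'] = NoDaemonContext()
		super(MyPool, self).__init__(*args, **kwargs)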
Example #2
def createBranch(config, current_class, subdataset, numericColumn, branch_index
	, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric):
	
	algorithm = config['algorithm']
	enableAdaboost = config['enableAdaboost']
	enableGBM = config['enableGBM']
	max_depth = config['max_depth']
	enableParallelism = config['enableParallelism']
	
	charForResp = "'"
	if algorithm == 'Regression':
		charForResp = ""
	
	#---------------------------
	
	json_file = file.split(".")[0]+".json"
	
	tmp_root = root * 1
	parents_raw = copy.copy(parents)
	
	#---------------------------
	
	if numericColumn == True:
		compareTo = current_class #current class might be <=x or >x in this case
	else:
		compareTo = " == '"+str(current_class)+"'"
	
	#print(subdataset)
	
	terminateBuilding = False
	
	#-----------------------------------------------
	#can decision be made?
	
	if enableGBM == True and root >= max_depth: #max depth
		final_decision = subdataset['Decision'].mean()
		terminateBuilding = True
	elif enableAdaboost == True:
		#final_decision = subdataset['Decision'].value_counts().idxmax()
		final_decision = functions.sign(subdataset['Decision'].mean()) #get average
		terminateBuilding = True
		enableParallelism = False
	elif len(subdataset['Decision'].value_counts().tolist()) == 1:
		final_decision = subdataset['Decision'].value_counts().keys().tolist()[0] #all items are equal in this case
		terminateBuilding = True
	elif subdataset.shape[1] == 1: #if a decision cannot be made even though all columns were dropped
		final_decision = subdataset['Decision'].value_counts().idxmax() #get the most frequent one
		terminateBuilding = True
	elif algorithm == 'Regression' and subdataset.shape[0] < 5: #pruning condition
	#elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition
		final_decision = subdataset['Decision'].mean() #get average
		terminateBuilding = True
	
	#-----------------------------------------------
	
	if enableParallelism == True:
		check_condition = "if" #TODO: elif checks might be above than if statements in parallel
	else:	
		if branch_index == 0:
			check_condition = "if"
		else:
			check_condition = "elif"
	
	check_rule = check_condition+" obj["+str(winner_index)+"]"+compareTo+":"
	
	leaf_id = str(uuid.uuid1())
	custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"
		
	if enableParallelism != True:
		
		#check_rule += " # feature: "+winner_name+", instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4))
		
		functions.storeRule(file,(functions.formatRule(root),"",check_rule))
	else:
		
		sample_rule = {}
		sample_rule["current_level"] = root
		sample_rule["leaf_id"] = leaf_id
		sample_rule["parents"] = parents
		sample_rule["rule"] = check_rule
		sample_rule["feature_idx"] = winner_index
		sample_rule["feature_name"] = winner_name
		sample_rule["instances"] = num_of_instances
		sample_rule["metric"] = metric
		sample_rule["return_statement"] = 0
		
		#json to string
		sample_rule = json.dumps(sample_rule)
	
		functions.createFile(custom_rule_file, "")
		functions.storeRule(custom_rule_file, sample_rule)
	
	#-----------------------------------------------
	
	if terminateBuilding == True: #a decision has been made; write the leaf
		
		parents = copy.copy(leaf_id)
		leaf_id = str(uuid.uuid1())
		
		decision_rule = "return "+charForResp+str(final_decision)+charForResp
		
		if enableParallelism != True:
			#serial
			functions.storeRule(file,(functions.formatRule(root+1),decision_rule))
		else:
			#parallel			
			sample_rule = {}
			sample_rule["current_level"] = root+1
			sample_rule["leaf_id"] = leaf_id
			sample_rule["parents"] = parents
			sample_rule["rule"] = decision_rule
			sample_rule["feature_idx"] = winner_index
			sample_rule["feature_name"] = winner_name
			sample_rule["instances"] = num_of_instances
			sample_rule["metric"] = 0
			sample_rule["return_statement"] = 1
			
			#json to string
			sample_rule = ", "+json.dumps(sample_rule)
			
			functions.storeRule(custom_rule_file, sample_rule)
	
	else: #decision is not made, continue to create branches and leaves
		root = root + 1 #the following rule will be included by this rule. increase root
		parents = copy.copy(leaf_id)
		
		buildDecisionTree(subdataset, root, file, config, dataset_features
			, root-1, leaf_id, parents)
					
		root = tmp_root * 1
		parents = copy.copy(parents_raw)
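Both the serial and parallel paths above lean on the helpers `functions.createFile`, `functions.storeRule`, and `functions.formatRule`. A minimal sketch of what they might look like follows, assuming formatRule indents one tab per tree level and storeRule appends one line per call; the real implementations in the source package may differ.

def createFile(name, content):
	#start a fresh file holding the given content (assumed behaviour)
	f = open(name, "w")
	f.write(content)
	f.close()

def storeRule(file, content):
	#append the pieces of a rule as a single line; content may be a string or a tuple of parts (assumed behaviour)
	f = open(file, "a+")
	f.writelines(''.join(str(item) for item in content))
	f.writelines("\n")
	f.close()

def formatRule(root):
	#indent one tab per tree depth level (assumed behaviour)
	return '\t' * root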
Example #3
def buildDecisionTree(df,
                      root,
                      file,
                      config,
                      dataset_features,
                      parent_level=0,
                      leaf_id=0,
                      parents='root',
                      tree_id=0,
                      validation_df=None,
                      main_process_id=None):

    models = []

    decision_rules = []

    feature_names = df.columns[0:-1]

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0] + ".json"

    random_forest_enabled = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']

    if root == 1:
        if random_forest_enabled != True and enableGBM != True and enableAdaboost != True:
            raw_df = df.copy()

    #--------------------------------------

    df_copy = df.copy()

    winner_name, num_of_instances, metric, metric_name = findDecision(
        df, config)

    #find the winner's index; findDecision cannot return it because the columns were dropped in earlier steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        #numeric fields were already transformed to object in df, so the type must be checked on df_copy rather than df
        column_name = df_copy.columns[i]
        column_type = df_copy[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()
    #print("classes: ",classes," in ", winner_name)
    #-----------------------------------------------------

    num_cores = config["num_cores"]

    input_params = []

    #iterate over branches: run serially or queue params for the parallel pool
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:

            if i == 0:

                descriptor = {
                    "feature": winner_name,
                    "instances": num_of_instances,
                    #"metric_name": metric_name,
                    "metric_value": round(metric, 4),
                    "depth": parent_level + 1
                }
                descriptor = "# " + json.dumps(descriptor)

                functions.storeRule(
                    file, (functions.formatRule(root), "", descriptor))

            results = createBranch(config,
                                   current_class,
                                   subdataset,
                                   numericColumn,
                                   branch_index,
                                   winner_name,
                                   winner_index,
                                   root,
                                   parents,
                                   file,
                                   dataset_features,
                                   num_of_instances,
                                   metric,
                                   tree_id=tree_id,
                                   main_process_id=main_process_id)

            decision_rules = decision_rules + results

        else:
            input_params.append(
                (config, current_class, subdataset, numericColumn,
                 branch_index, winner_name, winner_index, root, parents, file,
                 dataset_features, num_of_instances, metric, tree_id,
                 main_process_id))

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object':  #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={
            "Decision": "Instances",
            "index": "Decision"
        })
        pivot = pivot.sort_values(by=["Instances"],
                                  ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:  #parallelism
            leaf_id = str(uuid.uuid1())

            check_rule = "else: " + else_decision

            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["feature_idx"] = -1
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = df.shape[0]
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 0
            sample_rule["tree_id"] = tree_id

            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)

    else:  #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())

            check_rule = "else: " + else_decision

            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["tree_id"] = tree_id
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = 0
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 1

            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)

    #---------------------------

    try:
        main_process = psutil.Process(main_process_id)
        children = main_process.children(recursive=True)
        active_processes = len(children) + 1  #plus parent
        #active_processes = len(children)
    except:
        active_processes = 100  #set a large initial value

    results = []
    #create branches in parallel
    if enableParallelism == True:

        required_threads = active_processes + len(classes)

        #if parent_level == 0 and random_forest_enabled != True:
        if main_process_id != None and num_cores >= required_threads:  #len(classes) branches will be run in parallel

            #POOL_SIZE = num_cores
            POOL_SIZE = len(classes)

            #with closing(multiprocessing.Pool(POOL_SIZE)) as pool:
            with closing(MyPool(POOL_SIZE)) as pool:
                funclist = []

                for input_param in input_params:
                    f = pool.apply_async(createBranchWrapper,
                                         [createBranch, input_param])
                    funclist.append(f)

                #all functions registered here

                for f in funclist:
                    branch_results = f.get(timeout=100000)

                    for branch_result in branch_results:
                        results.append(branch_result)

                pool.close()
                pool.terminate()

            #--------------------------------

        else:  #serial
            for input_param in input_params:
                sub_results = createBranchWrapper(createBranch, input_param)
                for sub_result in sub_results:
                    results.append(sub_result)

        #--------------------------------

        decision_rules = decision_rules + results

        #--------------------------------

        if root != 1:  #propagate child results back up toward the root node
            return decision_rules

    #---------------------------------------------

    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in decision_rules. merge them all in a json file first

            json_rules = "[\n"  #initialize

            file_index = 0
            for custom_rule in decision_rules:

                json_rules += custom_rule

                if file_index < len(decision_rules) - 1:
                    json_rules += ", "

                json_rules += "\n"

                file_index = file_index + 1

            #-----------------------------------

            json_rules += "]"
            functions.createFile(json_file, json_rules)

            #-----------------------------------
            #reconstruct rules from json to py

            reconstructRules(json_file, feature_names)

            #-----------------------------------

        #is regular decision tree
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            #this is a regular decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

    return models
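Example #3 submits each branch through `createBranchWrapper` so that `pool.apply_async` receives a plain callable plus one packed argument tuple. A definition like the following would satisfy the call sites above; it is inferred from usage, not taken from the source.

def createBranchWrapper(func, args):
    #unpack the packed argument tuple and forward it to the branch builder (inferred contract)
    return func(*args)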
Example #4
def buildDecisionTree(df,
                      root,
                      file,
                      config,
                      dataset_features,
                      parent_level=0,
                      leaf_id=0,
                      parents='root'):

    models = []

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0] + ".json"

    if root == 1:
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            raw_df = df.copy()

    #--------------------------------------

    df_copy = df.copy()

    winner_name = findDecision(df, config)

    #find the winner's index; findDecision cannot return it because the columns were dropped in earlier steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        #numeric fields in df were already transformed to object, so check types on df_copy
        column_name = df_copy.columns[i]
        column_type = df_copy[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    #-----------------------------------------------------

    #TO-DO: you should specify the number of cores in config
    num_cores = int(multiprocessing.cpu_count() /
                    2)  #allocate half of your total cores

    input_params = []

    #iterate over branches: run serially or queue params for the parallel pool
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:
            createBranch(config, current_class, subdataset, numericColumn,
                         branch_index, winner_index, root, parents, file,
                         dataset_features)
        else:
            input_params.append((config, current_class, subdataset,
                                 numericColumn, branch_index, winner_index,
                                 root, parents, file, dataset_features))

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object':  #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={
            "Decision": "Instances",
            "index": "Decision"
        })
        pivot = pivot.sort_values(by=["Instances"],
                                  ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:  #parallelism
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

            check_rule = "else: " + else_decision

            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": " + str(root) + ",\n"
            sample_rule += "      \"leaf_id\": \"" + str(leaf_id) + "\",\n"
            sample_rule += "      \"parents\": \"" + parents + "\",\n"
            sample_rule += "      \"rule\": \"" + check_rule + "\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    else:  #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

            check_rule = "else: " + else_decision

            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": " + str(root) + ",\n"
            sample_rule += "      \"leaf_id\": \"" + str(leaf_id) + "\",\n"
            sample_rule += "      \"parents\": \"" + parents + "\",\n"
            sample_rule += "      \"rule\": \"" + check_rule + "\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    #---------------------------

    #create branches in parallel
    if enableParallelism == True:
        """
		#this usage causes trouble for recursive functions
		with Pool(number_of_cpus) as pool:
			pool.starmap(createBranch, input_params)
		"""

        pool = MyPool(num_cores)
        results = pool.starmap(createBranch, input_params)
        pool.close()
        pool.join()

    #---------------------------------------------

    #calculate accuracy metrics
    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in .txt files. merge them all in a json file

            functions.createFile(json_file, "[\n")

            custom_rules = []

            file_index = 0
            for rule_file in os.listdir(os.getcwd() + "/outputs/rules"):
                if rule_file.endswith(".txt"):
                    custom_rules.append(os.getcwd() + "/outputs/rules/" + rule_file)
                    #print(rule_file) #this file stores a custom rule
                    f = open(os.getcwd() + "/outputs/rules/" + rule_file, "r")
                    custom_rule = f.read()

                    if file_index > 0:
                        custom_rule = ", " + custom_rule

                    functions.storeRule(json_file, custom_rule)
                    f.close()
                    file_index = file_index + 1

            functions.storeRule(json_file, "]")

            #-----------------------------------

            #custom rules are already merged in a json file. clear messy custom rules
            #TO-DO: if random forest trees are built in parallel, this becomes a problem: a rule cannot be traced back to its tree. A global tree id should be stored in each rule.

            for rule_file in custom_rules:
                os.remove(rule_file)

            #-----------------------------------

            reconstructRules(json_file)

            #-----------------------------------

        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            #this is a regular decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

            num_of_features = df.shape[1] - 1
            instances = df.shape[0]
            classified = 0
            mae = 0
            mse = 0

            #pandas functions perform better here than explicit for loops
            raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)
            if algorithm != 'Regression':
                idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index

                #raw_df['Classified'] = 0
                #raw_df.loc[idx, 'Classified'] = 1
                #print(raw_df)

                accuracy = 100 * len(idx) / instances
                print("Accuracy: ", accuracy, "% on ", instances, " instances")
            else:
                raw_df['Absolute_Error'] = abs(raw_df['Prediction'] -
                                               raw_df['Decision'])
                raw_df['Absolute_Error_Squared'] = raw_df['Absolute_Error'] * raw_df['Absolute_Error']

                #print(raw_df)

                mae = raw_df['Absolute_Error'].sum() / instances
                print("MAE: ", mae)

                mse = raw_df['Absolute_Error_Squared'].sum() / instances
                rmse = math.sqrt(mse)
                print("RMSE: ", rmse)

                mean = raw_df['Decision'].mean()
                print("Mean: ", mean)

                if mean > 0:
                    print("MAE / Mean: ", 100 * mae / mean, "%")
                    print("RMSE / Mean: ", 100 * rmse / mean, "%")

    return models
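The accuracy block in Example #4 loads the generated outputs/rules/rules module and applies `findPrediction` row by row via `raw_df.apply(findPrediction, axis=1)`. The sketch below shows the assumed contract: a generated `findDecision(obj)` over positional feature values, and a `findPrediction` that strips the trailing Decision column. The rule body is purely illustrative; only the calling convention follows the code above.

#a generated outputs/rules/rules.py module might look like this (illustrative rule body):
def findDecision(obj):
    if obj[0] == 'Sunny':
        if obj[2] == 'High':
            return 'No'
        else:
            return 'Yes'
    else:
        return 'Yes'

#a findPrediction compatible with raw_df.apply(findPrediction, axis=1) could then be:
def findPrediction(row):
    #every column except the trailing Decision column feeds the generated rule set
    params = []
    for j in range(0, len(row) - 1):
        params.append(row.iloc[j])
    return findDecision(params)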
Example #5
def buildDecisionTree(df, root, file, config, dataset_features):

    models = []

    if root == 1:
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            raw_df = df.copy()

    algorithm = config['algorithm']
    enableAdaboost = config['enableAdaboost']

    #--------------------------------------

    #print(df.shape)
    charForResp = "'"
    if algorithm == 'Regression':
        charForResp = ""

    tmp_root = root * 1

    df_copy = df.copy()

    winner_name = findDecision(df, config)

    #find the winner's index; findDecision cannot return it because the columns were dropped in earlier steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        #numeric fields in df were already transformed to object, so check types on df_copy
        column_name = df_copy.columns[i]
        column_type = df_copy[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])

        if numericColumn == True:
            compareTo = current_class  #current class might be <=x or >x in this case
        else:
            compareTo = " == '" + str(current_class) + "'"

        #print(subdataset)

        terminateBuilding = False

        #-----------------------------------------------
        #can decision be made?

        if enableAdaboost == True:
            #final_decision = subdataset['Decision'].value_counts().idxmax()
            final_decision = functions.sign(subdataset['Decision'].mean())  #get average
            terminateBuilding = True
        elif len(subdataset['Decision'].value_counts().tolist()) == 1:
            final_decision = subdataset['Decision'].value_counts().keys().tolist()[0]  #all items are equal in this case
            terminateBuilding = True
        elif subdataset.shape[1] == 1:  #decision cannot be made even though all columns were dropped
            final_decision = subdataset['Decision'].value_counts().idxmax()  #get the most frequent one
            terminateBuilding = True
        elif algorithm == 'Regression' and subdataset.shape[0] < 5:  #pruning condition
            #elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition
            final_decision = subdataset['Decision'].mean()  #get average
            terminateBuilding = True
        #-----------------------------------------------

        if i == 0:
            check_condition = "if"
        else:
            check_condition = "elif"

        functions.storeRule(file,
                            (functions.formatRule(root), "", check_condition,
                             " obj[", str(winner_index), "]", compareTo, ":"))

        #-----------------------------------------------

        if terminateBuilding == True:  #check decision is made
            functions.storeRule(
                file, (functions.formatRule(root + 1), "return ",
                       charForResp + str(final_decision) + charForResp))

        else:  #decision is not made, continue to create branch and leafs
            root = root + 1  #the following rule will be included by this rule. increase root
            buildDecisionTree(subdataset, root, file, config, dataset_features)

        root = tmp_root * 1

    #---------------------------------------------

    #calculate accuracy metrics
    if root == 1:
        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
            #this is a regular decision tree. find accuracy here.
            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

            num_of_features = df.shape[1] - 1
            instances = df.shape[0]
            classified = 0
            mae = 0
            mse = 0

            #pandas functions perform better here than explicit for loops
            raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)
            if algorithm != 'Regression':
                idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index

                #raw_df['Classified'] = 0
                #raw_df.loc[idx, 'Classified'] = 1
                #print(raw_df)

                accuracy = 100 * len(idx) / instances
                print("Accuracy: ", accuracy, "% on ", instances, " instances")
            else:
                raw_df['Absolute_Error'] = abs(raw_df['Prediction'] -
                                               raw_df['Decision'])
                raw_df['Absolute_Error_Squared'] = raw_df['Absolute_Error'] * raw_df['Absolute_Error']

                #print(raw_df)

                mae = raw_df['Absolute_Error'].sum() / instances
                print("MAE: ", mae)

                mse = raw_df['Absolute_Error_Squared'].sum() / instances
                rmse = math.sqrt(mse)
                print("RMSE: ", rmse)

                mean = raw_df['Decision'].mean()
                print("Mean: ", mean)

                if mean > 0:
                    print("MAE / Mean: ", 100 * mae / mean, "%")
                    print("RMSE / Mean: ", 100 * rmse / mean, "%")

    return models
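For context, a minimal invocation of the Example #5 variant might look like the sketch below. The config keys mirror the ones read inside the function; the CSV path, the rules file location, and the generated-module header are assumptions for illustration, and the `functions` helper module is assumed importable.

import pandas as pd

#hypothetical dataset; the target column must be named Decision and come last
df = pd.read_csv("golf.csv")

#map each feature column to its dtype, as buildDecisionTree expects
dataset_features = {}
for feature in df.columns[0:-1]:
    dataset_features[feature] = df[feature].dtypes

config = {
    'algorithm': 'ID3',
    'enableRandomForest': False,
    'enableGBM': False,
    'enableAdaboost': False
}

#assumed rules target; the outputs/rules directory must already exist
file = "outputs/rules/rules.py"
functions.createFile(file, "def findDecision(obj):\n")  #assumed header for the generated module

models = buildDecisionTree(df, 1, file, config, dataset_features)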