def _build_bias_tree(self, attributes: dict, metric_with_metadata: pd.DataFrame) -> None:
    """Fit a tree on the metric data and keep it in ``self.bias_tree``.

    Parameters
    ----------
    attributes : dict
        Forwarded unchanged as the second positional argument of
        ``Tree.from_pandas_df`` (presumably a column-name -> variable-type
        mapping; confirm against the caller).
    metric_with_metadata : pd.DataFrame
        Frame forwarded as the first positional argument; ``self.metric_col``
        is passed as the dependent-variable argument.

    Returns
    -------
    None
        The fitted tree is stored on the instance, nothing is returned.
    """
    build_tree = Tree.from_pandas_df
    dependent_column = self.metric_col

    # All tuning knobs come straight from the instance configuration.
    tree_options = {
        "min_child_node_size": self.min_child_node_size,
        "dep_variable_type": self.metric_type,
        "max_depth": self.max_depth,
        "alpha_merge": self.alpha,
        "split_threshold": self.split_threshold,
    }

    self.bias_tree = build_tree(
        metric_with_metadata,
        attributes,
        dependent_column,
        **tree_options,
    )
def train(self, x: np.ndarray, y: np.array):
    """
    Fit the CHAID classifier and store the resulting tree in ``self.model``.

    Parameters
    ----------
    x: np.array
        Train data set, forwarded as ``ndarr``
    y: np.array
        Train target data, forwarded as ``arr``
    """
    fit_from_numpy = Tree.from_numpy

    # Hyper-parameters are read from the instance; the dependent variable is
    # always treated as categorical.
    hyper_parameters = {
        'alpha_merge': self.alpha_merge,
        'max_depth': self.max_depth,
        'min_parent_node_size': self.min_parent_node,
        'min_child_node_size': self.min_child_node,
        'split_threshold': self.split_threshold,
        'dep_variable_type': 'categorical',
    }

    self.model = fit_from_numpy(ndarr=x, arr=y, **hyper_parameters)
import numpy as np
from sklearn import datasets
from sklearn import tree
from sklearn.tree import _tree

# NOTE(review): `pd`, `le` and `Tree` are not defined in this snippet; `le`
# is presumably a fitted-on-demand label encoder -- confirm where it is built.
df = pd.read_csv(
    'C:\\Users\\LAC40641\\Desktop\\ejemplo arbol\\prueba_chaid.csv',
    encoding='latin-1')

# Encode the branch-office labels as integers and pull out the target.
le.fit(df['Sucursal_Oficial'].values)
X = np.array(le.transform(df['Sucursal_Oficial']))
y = np.array(df['Sueldo_Normalizado'])

# First tree: built straight from numpy, with the predictor as one column.
xx = np.array(X).reshape(len(X), 1)
# NOTE: rebinding `tree` shadows the sklearn `tree` module imported above.
tree = Tree.from_numpy(xx, y, split_titles=['a'], min_child_node_size=1)

# Second tree: same data through the pandas helper.
# `y` is already a numpy array, so it has no `.values` attribute; the
# original `y.values` raised AttributeError here.
df_p = pd.DataFrame({'Sucursal_Oficial': X, 'Sueldo_Normalizado': y})

independent_variable_columns = ['Sucursal_Oficial']
dep_variable = 'Sueldo_Normalizado'
tree = Tree.from_pandas_df(
    df_p,
    dict.fromkeys(independent_variable_columns, 'nominal'),
    dep_variable,
    dep_variable_type='continuous')
def churn_clusters(data, dependent = "churn"):
    """Fit a CHAID tree predicting ``dependent`` from every other column and print it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the dependent column and all predictor columns.
    dependent : str, optional
        Name of the dependent column (default ``"churn"``). It is excluded
        from the predictors.

    Returns
    -------
    None
    """
    from CHAID import Tree

    # `set(dependent)` would split the name into single characters and so
    # never remove the dependent column; compare whole labels instead. This
    # also keeps the predictors in the frame's column order.
    indep_cols = [col for col in data.columns if col != dependent]

    tree = Tree.from_pandas_df(
        data,
        dict.fromkeys(indep_cols, 'nominal'),
        dependent,
        min_child_node_size=5,
    )
    print(tree.print_tree())
# Discretize the age column into fixed brackets
age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 90]
df['edad'] = pd.cut(df['edad'], bins=age_bins)

# Save the dataset
df.to_csv('data.csv')

# Office names are stripped of surrounding whitespace before modelling
df['OFICINA'] = df['OFICINA'].str.strip()

independent_variable_columns = ['OFICINA']
dep_variable = 'Ventas_IT'

# Minimum parent node size: 3% of the number of rows
minsplit = len(df[dep_variable]) * 0.03

prueba = Tree.from_pandas_df(
    df,
    dict.fromkeys(independent_variable_columns, 'nominal'),
    dep_variable,
    dep_variable_type='continuous',
    max_depth=5,
    min_parent_node_size=minsplit,
)
prueba  # bare expression kept from the original (interactive echo)
prueba.print_tree()

prueba2 = prueba.tree_store
# `predict` is defined outside this snippet
probando = predict(df, prueba)
prueba2  # bare expression kept from the original (interactive echo)

# Print the stored 'mean' member of every node
for i in prueba2:
    print(i.members['mean'])

rules = prueba.classification_rules()
rules  # bare expression kept from the original (interactive echo)
lenrules = len(rules)
# Drop the rows selected by the (previously computed) light mask
mask_light = list(mask_light)
df_CHAID.drop(mask_light, inplace=True)

# Drop the rows whose damage extent is in the exclusion list
vdamage_rows = df_CHAID['Vehicle Damage Extent'].isin(vehicle_damage_drops)
mask_vdamage = list(df_CHAID[vdamage_rows].index)
df_CHAID.drop(mask_vdamage, inplace=True)

# Variables used by the CHAID calls below
indep_var = ['Light']
dep_var1 = 'Injury Severity'
dep_var2 = 'Vehicle Damage Extent'

# Build one depth-1 tree per dependent variable
tree1 = Tree.from_pandas_df(
    df_CHAID, dict.fromkeys(indep_var, 'nominal'), dep_var1, max_depth=1)
tree2 = Tree.from_pandas_df(
    df_CHAID, dict.fromkeys(indep_var, 'nominal'), dep_var2, max_depth=1)

# Convert each tree to an object for viewing (return values are not kept)
tree1.to_tree()
tree2.to_tree()

# Print the trees to see the results in raw form
tree1.print_tree()
tree2.print_tree()
# The ratios are the # of worst-case to # of other cases
'Q2_4', 'Q2_5', 'Q2_6', 'Q2_7', 'Q2_8', 'Q2_9', 'Q2_10', 'Q3_1', 'Q3_2', 'Q3_3', 'Q3_4', 'Q3_5', 'Q3_6', 'Q3_7', 'Q3_8', 'Q3_9', 'Q3_10' ] len(features) #===================================== # create tree features dict(zip(features, ['ordinal'] * 34)) tree = Tree.from_pandas_df(df, dict(zip(features, ['ordinal'] * 34)), 'clus', dep_variable_type='categorical', alpha_merge=0.05, max_depth=2, min_parent_node_size=1, min_child_node_size=0, split_threshold=0) tree tree.print_tree() dir(tree) dir(tree.get_node(3)) tree.get_node(0)._members
## build the example data: five rows of [1, 2, 3] followed by five of [2, 2, 3]
ndarr = np.array([[1, 2, 3]] * 5 + [[2, 2, 3]] * 5)
df = pd.DataFrame(ndarr, columns=['a', 'b', 'c'])

## dependent values: five 1s followed by five 2s
arr = np.repeat([1, 2], 5)
df['d'] = arr

print(df)

## CHAID input parameters
independent_variable_columns = ['a', 'b', 'c']
dep_variable = 'd'

## tree built through the pandas helper
tree = Tree.from_pandas_df(
    df,
    dict.fromkeys(independent_variable_columns, 'nominal'),
    dep_variable,
)

## the same data again, this time without the pandas helper
tree = Tree.from_numpy(
    ndarr,
    arr,
    split_titles=['a', 'b', 'c'],
    min_child_node_size=5,
)

## sketch of the same tree using the tree constructor (not executed)
"""
cols = [
    NominalColumn(ndarr[:,0], name='a')
    NominalColumn(ndarr[:,1], name='b')
    NominalColumn(ndarr[:,2], name='c')
]
tree = Tree(cols, NominalColumn(arr, name='d'), {'min_child_node_size': 5})
"""

print(tree.print_tree())
def Supervised_Merged(file, df, Predictor_type, dependent_variable_name, indep_column_num, Categorical=True):
    """Fit a depth-1 CHAID tree on one predictor and report its merged categories.

    Parameters
    ----------
    file
        Object with a ``write(str)`` method; the report is written to it.
    df : pandas.DataFrame
        Data holding the predictor and the dependent variable. When
        ``Categorical`` is not ``True`` the dependent column is converted
        in place with ``pd.to_numeric(..., errors='coerce')``.
    Predictor_type
        Variable type given to CHAID for the predictor column.
    dependent_variable_name
        Name of the dependent column in ``df``.
    indep_column_num
        Positional index into ``df.columns`` selecting the predictor.
    Categorical : bool, optional
        ``True`` fits with the default dependent-variable type; anything
        else fits with ``dep_variable_type='continuous'``.

    Returns
    -------
    dict
        Maps each original category code to the id of its merged group,
        e.g. groups ``{0: [1, 2, 3, 4, 5], 1: [6, 7, 8], 2: [0, 9]}`` give
        ``{1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1, 0: 2, 9: 2}``.
        Empty when the root split has fewer than two groups.
    """
    # Single predictor column -> its declared variable type
    predictor_types = {df.columns[indep_column_num]: Predictor_type}

    # The target type decides which flavour of CHAID tree is fitted
    if Categorical == True:
        fit_options = {'max_depth': 1}
    else:
        # Continuous target: coerce it to numeric first
        df[dependent_variable_name] = pd.to_numeric(
            df[dependent_variable_name], errors='coerce')
        fit_options = {'dep_variable_type': 'continuous', 'max_depth': 1}

    tree = Tree.from_pandas_df(
        df, predictor_types, dependent_variable_name, **fit_options)

    # Report the fitted tree
    file.write('The CHAID TREE is presented below:\n\n')
    file.write(str(tree) + '\n')

    # The root split exposes its groupings as a string; cut it into one
    # chunk per merged group
    root_split = tree.tree_store[0].split
    group_chunks = root_split.groupings.split('],')

    if len(group_chunks) < 2:
        # Nothing was merged into two or more groups
        file.write('The P-Values of this node is ' + str(root_split.p) + '\n')
        file.write('The P-values is too large.\n')
        file.write(
            'There is no categories can be merged in this variables.\n\n')
        return {}

    # Pull the integer category codes out of every chunk
    merged_categories = {
        group_id: [int(token) for token in re.findall(r'\d+', chunk)]
        for group_id, chunk in zip(np.arange(0, len(group_chunks)),
                                   group_chunks)
    }

    file.write('The P-Values of this node is ' + str(root_split.p) + '\n')
    file.write('The new categories are:\n')
    for group_id, members in merged_categories.items():
        file.write(str(group_id) + ' >>> ' + str(members) + '\n')

    # Invert group -> members into member -> group
    return {
        member: group_id
        for group_id, members in merged_categories.items()
        for member in members
    }
# Load the predictors and the response from CSV files in the working directory.
Independent_data = pd.read_csv("Independent_data.csv")
Response = pd.read_csv("Response.csv")

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(Independent_data, Response, test_size=0.25, random_state=42)

# Re-attach the response column(s) to each partition.
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)
from CHAID import Tree

# Every predictor column is declared 'nominal'; the first response column is
# the dependent variable.
# NOTE(review): the tree is fitted on `test_data`, while `train_data` is not
# used in the visible code -- confirm that fitting on the test split is
# intended.
tree = Tree.from_pandas_df(
    test_data,
    dict(
        zip(X_train.columns.tolist(),
            list(np.repeat('nominal', len(X_train.columns))))),
    y_train.columns[0],
    max_depth=10,
    min_child_node_size=10)
tree.print_tree()
tree.classification_rules()


def predict(df, tree):
    # Rules of the fitted tree, walked one by one by the loop below.
    rules = tree.classification_rules()
    lenrules = len(rules)
    j = 0
    # Reset the caller's frame to a 0..n-1 index (mutates `df` in place).
    df.index = range(0, df.shape[0])
    # One zero-initialised slot per row of `df`.
    Response = np.repeat(0, df.shape[0])
    while (j <= lenrules - 1):
        r1 = rules[j]
index, ztest_list = Z_test(data)
figdata = get_figdata(index, ztest_list, kms)
draw_radar(figdata, len(index), 'Z test', [0.5, 1, 1.5, 2, 2.5, 3, 3.5])

##### t-test
index, ttest_list = T_test(data)
figdata = get_figdata(index, ttest_list, kms)
draw_radar(figdata, len(index), 'T test', [0.5, 1, 1.5, 2, 2.5, 3, 3.5])

##### Chi-square test
index, chisq_list = Chi_Square_test(data)
figdata = get_figdata(index, chisq_list, kms)
draw_radar(figdata, len(index), 'Chi-Square test',
           [0.5, 1, 1.5, 2, 2.5, 3, 3.5])

#### CHAID tree
# Predictors are every column except the first and the last; the last
# column is the dependent variable.
independent_variable_columns = data.columns[1:len(data.columns) - 1]
d_variable = data.columns[-1]

# `independent_variable_columns` is already sliced, so iterate it as-is.
# Slicing it again from index 1 left the first predictor out of the tree.
dic = {item: 'nominal' for item in independent_variable_columns}

# The first column is dropped from the frame handed to CHAID.
tr = Tree.from_pandas_df(
    data.drop(data.columns[0], axis=1),
    dic,
    d_variable,
    max_depth=8,
    min_parent_node_size=2,
    min_child_node_size=2,
)
tr.print_tree()