def multi_sample_dataset(fiti, df, custom_classes, random_state=1, repeats=30):
    """Complete data with samples multiple times according to fiti

    Parameters
    ----------
    fiti: factor tree used to sample the missing values
    df: pandas dataframe
    custom_classes: classes of each variable
    random_state: random state for the sampling
    repeats: number of repeated samples

    Returns
    -------
    List of completed datasets
    """
    # Get data types
    datat = data_type.data(df, classes=custom_classes)
    data_complete = data_type.data(df, classes=custom_classes)
    nan_rows = get_nan_rows(df)
    # Set the random seed and draw one seed per imputation
    numpy.random.seed(random_state)
    seeds = numpy.random.randint(low=0, high=1000, size=repeats)
    dfs_complete = []
    # Impute the data multiple times
    for seed in seeds:
        fiti.sample_data(data_complete, datat, nan_rows, random_state=seed)
        df_complete = data_complete.to_DataFrame()
        dfs_complete.append(df_complete)
    return dfs_complete
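
# Usage sketch (hypothetical helper, not part of the original code): learn an
# imputation model with tsem_cache (defined further below) and draw 10
# completions of an incomplete dataframe df.
def _example_multi_sample(df, custom_classes):
    _, _, fiti, _, _ = tsem_cache(df, custom_classes=custom_classes,
                                  metric='bic', complete_prior='random')
    # Each element of the returned list is a fully observed copy of df
    return multi_sample_dataset(fiti, df, custom_classes,
                                random_state=7, repeats=10)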
def predict_proba(self, df, random_state=1, repeats_inf=30):
    # Impute the prediction data, hiding the response columns. The three
    # response columns are assumed to be the last three of the dataframe
    num_cols = len(self.custom_classes)
    df_inf = df.copy()
    df_inf[self.response] = np.nan
    df_imputed_list = multi_sample_dataset(self.imputer, df_inf,
                                           self.custom_classes,
                                           random_state=random_state,
                                           repeats=repeats_inf)
    y_hat = np.zeros([df.shape[0], len(self.response)])
    df_imputed = pd.concat(df_imputed_list, axis=0)
    df_imputed_t = data_type.data(df_imputed, classes=self.custom_classes)
    # Get the prediction of each estimator in the ensemble
    for estimator in self.estimators:
        # Predict the responses (last three columns) from the features
        y_hat_new = estimator.pred_data(df_imputed_t, [0, 0, 0],
                                        range(num_cols - 3, num_cols),
                                        range(0, num_cols - 3))
        # Average over the imputed datasets
        y_hat_new = y_hat_new.reshape([repeats_inf, df.shape[0], 3])
        y_hat_avg = np.mean(y_hat_new, axis=0)
        # Accumulate the responses over estimators
        y_hat = y_hat + y_hat_avg
    # Average over the ensemble
    y_hat = y_hat / len(self.estimators)
    return y_hat
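
# Usage sketch (hypothetical helper): prediction with a fitted MBCClassifier.
# predict_proba hides the responses, imputes them repeats_inf times, and
# averages the predictions over imputations and ensemble members.
def _example_predict(clf, df_test):
    y_hat = clf.predict_proba(df_test, random_state=3, repeats_inf=20)
    # y_hat has shape (df_test.shape[0], len(clf.response))
    return y_hat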
def filter_dataset(data_frame, cll_query, custom_classes, metric='bic',
                   cores=multiprocessing.cpu_count()):
    data = data_type.data(data_frame, classes=custom_classes)
    et = ElimTree('et', data.col_names, data.classes)
    num_nds = et.nodes.num_nds
    forbidden_parents = [[] for _ in range(data.ncols)]
    forbidden_parents = forbidden_mbc(et, cll_query, forbidden_parents)
    for i in range(len(forbidden_parents)):
        if i in cll_query:
            forbidden_parents[i] = forbidden_parents[i] + cll_query
    score_best = np.array([score_function(data, i, [], metric=metric)
                           for i in range(num_nds)], dtype=np.double)
    cache = np.full([num_nds, num_nds], 1000000, dtype=np.double)
    lop_o, op_names_o, score_difs_o, cache = best_pred_cache(
        data, et, metric, score_best, forbidden_parents, cache,
        filter_l0=True, add_only=True)
    # Keep the query variables plus the best candidate predictors
    selected = cll_query + np.unique(lop_o[:, 1]).tolist()
    data_filtered = data_frame.iloc[:, selected]
    return data_filtered, selected
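
# Usage sketch (hypothetical helper): reduce a dataframe to the query columns
# plus the best single-arc predictors found by one pass of best_pred_cache.
def _example_filter(df, custom_classes):
    query_idx = [10, 11, 12]  # illustrative column indices of the query
    df_small, selected_idx = filter_dataset(df, query_idx, custom_classes,
                                            metric='bic')
    return df_small, selected_idx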
def l_bfgs(data_frame, et, cll_query, custom_classes, alpha=1.0):
    num_vars = data_frame.shape[1]
    et_descriptor = [[et.nodes[i].parent_et for i in range(num_vars)],
                     [et.nodes[i].nFactor for i in range(num_vars)],
                     [[i] + et.nodes[i].parents.display() for i in range(num_vars)],
                     [len(c) for c in custom_classes]]
    etc_d = PyFactorTree_data(et_descriptor[0], et_descriptor[1],
                              et_descriptor[2], et_descriptor[3])
    etc = PyFactorTree(et_descriptor[0], et_descriptor[1],
                       et_descriptor[2], et_descriptor[3])
    dt = data_type.data(data_frame, custom_classes)
    etc_d.learn_parameters(dt, alpha=alpha)
    # Data is split for computing the CLL to reduce memory requirements
    num_splits = np.ceil(data_frame.shape[0] / (1000.0 * 8.0)) * 8
    # Indicator vectors for fast computation of the CLL
    indv = Indicators_vector(data_frame, custom_classes, num_splits)
    data_ev = data_frame.copy()
    data_ev.iloc[:, cll_query] = np.nan
    ind_ev = Indicators_vector(data_ev, custom_classes, num_splits)
    etc_d.l_bfgs_b(indv, ind_ev, 1.0)
    params = etc_d.get_parameters()
    nodes = [xi for xi in range(num_vars)]
    etc.set_parameters(nodes, params)
    return etc
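
# Usage sketch (hypothetical helper): discriminatively refine the parameters of
# a structure learned with learn_mbc_cll (defined below) via L-BFGS-B.
def _example_l_bfgs(df, query_idx, custom_classes):
    et_mbc = learn_mbc_cll(df, query_idx, custom_classes=custom_classes)
    # Returns a PyFactorTree with CLL-optimized parameters
    return l_bfgs(df, et_mbc, query_idx, custom_classes, alpha=1.0)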
def fit(self, df, random_state=1):
    # Get the index of the response columns
    cll_query = [np.where(ri == df.columns)[0][0] for ri in self.response]
    # Train the imputer: SEM + multiple imputation
    et_sem, _, fiti_sem, df_complete, _ = tsem_cache(
        df, custom_classes=self.custom_classes, metric=self.metric_sem,
        complete_prior="random")
    self.imputer = fiti_sem
    self.et_sem = et_sem
    df_imputed_list = multi_sample_dataset(self.imputer, df,
                                           self.custom_classes,
                                           random_state=random_state,
                                           repeats=self.repeats)
    self.df_imputed_list = df_imputed_list
    # Fit a model to each imputed dataset
    for dfi in df_imputed_list:
        et_mbc = learn_mbc_generative(
            dfi, cll_query, pruned=False, et0=None, u=5,
            forbidden_parent=deepcopy(self.forbidden_parents),
            metric=self.metric_classifier,
            custom_classes=self.custom_classes)
        # Get the factor tree of the learned MBC
        fiti_mbc = PyFactorTree(
            [et_mbc.nodes[j].parent_et for j in range(et_mbc.nodes.num_nds)],
            [et_mbc.nodes[j].nFactor for j in range(et_mbc.nodes.num_nds)],
            [[j] + et_mbc.nodes[j].parents.display()
             for j in range(et_mbc.nodes.num_nds)],
            [len(c) for c in self.custom_classes])
        df_imputed_t = data_type.data(dfi, classes=self.custom_classes)
        fiti_mbc.learn_parameters(df_imputed_t, alpha=self.alpha)
        self.estimators.append(fiti_mbc)
        self.mbc_ets.append(et_mbc)
        self.mbc_params.append(fiti_mbc.get_parameters())
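
# fit and predict_proba above are methods of MBCClassifier (used in the script
# at the end of this file). The constructor is not shown in this section; the
# following skeleton is inferred from the attributes the two methods use and
# may differ from the real implementation.
class _MBCClassifierSketch(object):
    def __init__(self, response, custom_classes, repeats=30,
                 metric_sem='bic', metric_classifier='bic',
                 alpha=1.0, forbidden_parents=None):
        self.response = response                    # names of the class columns
        self.custom_classes = custom_classes        # categories of each variable
        self.repeats = repeats                      # number of imputed datasets
        self.metric_sem = metric_sem                # score used by the SEM imputer
        self.metric_classifier = metric_classifier  # score used for each MBC
        self.alpha = alpha                          # Dirichlet prior
        self.forbidden_parents = forbidden_parents  # structural blacklist
        self.imputer = None       # factor tree returned by tsem_cache
        self.et_sem = None
        self.df_imputed_list = []
        self.estimators = []      # one PyFactorTree per imputed dataset
        self.mbc_ets = []
        self.mbc_params = []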
def hill_climbing_cache(data_frame, et0=None, u=5, metric='bic', tw_bound_type='b',
                        tw_bound=5, cores=multiprocessing.cpu_count(),
                        forbidden_parent=None, add_only=False, custom_classes=None,
                        constraint=True, verbose=False):
    """Learns a Bayesian network with bounded treewidth

    Args:
        data_frame (pandas.DataFrame): Input data
        et0 (elimination_tree.ElimTree): Initial elimination tree (optional)
        u (int): maximum number of parents allowed
        metric (str): scoring function
        tw_bound_type (str): 'b' bound, 'n' none
        tw_bound (float): tree-width bound
        cores (int): Number of cores
        forbidden_parent (list): blacklist with forbidden parents
        add_only (bool): If True, allow only arc additions
        custom_classes: If not None, the classes of each variable are set to
            custom_classes
        constraint: If True, the additions and reversals that exceed the
            treewidth bound are stored in a blacklist
        verbose (bool): If True, print details of the learning process

    Returns:
        elimination_tree.ElimTree Learned elimination tree
    """
    count_i = 0
    if custom_classes is None:
        data = data_type.data(data_frame)
    else:
        data = data_type.data(data_frame, classes=custom_classes)
    if et0 is None:
        et = ElimTree('et', data.col_names, data.classes)
    else:
        et = et0.copyInfo()
    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(data.ncols)]
    else:
        forbidden_parents = forbidden_parent
    ok_to_proceed = True
    num_nds = et.nodes.num_nds
    score_best = np.array([score_function(data, i, et.nodes[i].parents.display(),
                                          metric=metric)
                           for i in range(num_nds)], dtype=np.double)
    # Cache of score differences
    cache = np.full([num_nds, num_nds], 1000000, dtype=np.double)
    # Hill-climbing loop
    while ok_to_proceed:
        count_i += 1
        ta = time()
        # Get the candidate operations sorted by score difference
        lop_o, op_names_o, score_difs_o, cache = best_pred_cache(
            data, et, metric, score_best, forbidden_parents, cache,
            filter_l0=True, u=u, add_only=add_only)
        tc = time()
        if len(lop_o) > 0:
            if tw_bound_type == 'b':
                change_idx, et_new = search_first_tractable_structure(
                    et, lop_o, op_names_o, tw_bound, forbidden_parents, constraint)
                if change_idx == -1:
                    return et
            else:
                change_idx = 0
                xout = lop_o[change_idx][0]
                xin = lop_o[change_idx][1]
                et_new = et.copyInfo()
                if op_names_o[change_idx] == 0:
                    et_new.setArcBN_py(xout, xin)
                elif op_names_o[change_idx] == 1:
                    et_new.removeArcBN_py(xout, xin)
                else:
                    et_new.removeArcBN_py(xout, xin)
                    et_new.setArcBN_py(xin, xout)
            best_lop = lop_o[change_idx]
            xout = best_lop[0]
            xin = best_lop[1]
            best_op_names = op_names_o[change_idx]
            best_score_difs = score_difs_o[change_idx]
            et = et_new
            # Update the score and the cache
            if best_op_names == 2:
                score_best[xin] += cache[xout, xin]
                score_best[xout] += cache[xin, xout]
                cache[:, xout] = 1000000
            else:
                score_best[xin] += best_score_difs
            cache[:, xin] = 1000000
            tb = time()
            if verbose:
                if tw_bound_type == 'b':
                    print 'it: ', count_i, ', change: ', [best_op_names, xout, xin], ', tw: ', et.tw(), ', time: ', tc - ta, ', timetw: ', tb - tc, ', best_score_difs: ', best_score_difs
                else:
                    print 'it: ', count_i, ', change: ', [best_op_names, xout, xin], ', time: ', tc - ta, ', timetw: ', tb - tc, ', best_score_difs: ', best_score_difs
        else:
            ok_to_proceed = False
    return et
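
# Usage sketch (hypothetical helper): learn a structure whose elimination tree
# has treewidth at most 3, printing the progress of the search.
def _example_hc_cache(df, custom_classes):
    return hill_climbing_cache(df, metric='bic', tw_bound_type='b', tw_bound=3,
                               custom_classes=custom_classes, verbose=True)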
def hill_climbing(data_frame, et0=None, u=5, metric='bic', metric_params=None,
                  chc=False, tw_bound_type='b', tw_bound=5,
                  optimization_type='size', k_complex=0.1,
                  cores=multiprocessing.cpu_count(), forbidden_parent=None):
    """Learns a Bayesian network by hill climbing over elimination trees

    Args:
        data_frame (pandas.DataFrame): Input data
        et0 (elimination_tree.ElimTree): Initial elimination tree
        u (int): maximum number of parents
        metric (str): scoring function
        metric_params (list): Parameters for the scoring function
        chc (bool): If True, efficient learning (constrained hill-climbing) is
            performed
        tw_bound_type (str): 'b' bound, 'p' penalized, 'n' none
        tw_bound (float): tree-width bound
        k_complex (float): complexity penalization
        optimization_type (str): 'tw' tries to reduce tree-width, 'size' tries
            to reduce size. Only used if tw_bound_type != 'n'
        cores (int): Number of cores
        forbidden_parent (list): blacklist with forbidden parents

    Returns:
        elimination_tree.ElimTree Learned elimination tree
        float Learning time
        list Scores of the learned model
    """
    # Initialize variables
    k = k_complex
    count_i = 0
    data = data_type.data(data_frame)
    if et0 is None:
        et = ElimTree('et', data.col_names, data.classes)
    else:
        et = et0.copyInfo()
    len_nodes = data.ncols
    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(data.ncols)]
    else:
        forbidden_parents = forbidden_parent
    # score_best: metric value for each variable plus the complexity penalization
    score_best = []
    for i in range(0, len_nodes):
        parents = et.nodes[i].parents.display()
        score_best.append(score_function(data, i, parents, metric, metric_params))
    # Complexity of the network
    if tw_bound_type != 'n':
        score_best.append(-k * et.evaluate())
    else:
        score_best.append(0)
    ok_to_proceed = True
    learn_time = 0
    # While there is an improvement
    while ok_to_proceed:
        sys.stdout.write("\r Iteration %d" % count_i)
        sys.stdout.flush()
        count_i += 1
        ta = time()
        et_new, score_new, time_score, time_compile, time_opt, best_change, forbidden_parents = best_pred_hc(
            data, et, score_best, forbidden_parents, u, metric, metric_params,
            chc, tw_bound_type, tw_bound, optimization_type, k_complex, cores)
        # If mixed, update the elimination ordering and recompute the penalty
        if optimization_type == 'mixed' and et_new.tw() > et.tw():
            adj = get_adjacency_matrix_from_et(et_new)
            order, tw_greedy = greedy_tree_width(adj, method='fill')
            if tw_greedy < et_new.tw(False):
                et_new.compile_ordering(order.tolist())
            if tw_bound_type == 'b':
                if tw_bound >= et_new.tw(False):
                    score_new[-1] = et_new.size() * k_complex
                else:
                    score_new[-1] = float("-inf")
            elif tw_bound_type == 'p':
                score_new[-1] = et_new.size() * k_complex
        tb = time()
        time_total = time_score + time_compile + time_opt
        print 'total time: ', time_total, ', Score Time: ', time_score, ', Compile Time: ', time_compile, ', Opt time: ', time_opt, ', pct: ', float(time_score) / float(time_total)
        print 'change: ', best_change, ', tw: ', et_new.tw(False), ', tw again: ', et_new.tw()
        learn_time += tb - ta
        if sum(score_new) > sum(score_best):
            et = et_new
            score_best = score_new
        else:
            ok_to_proceed = False
            # Dump the state of the search for debugging / resuming
            f = open('learn_aux_dump', 'w')
            flag_it1 = et0 is None
            cloudpickle.dump([et, et_new, forbidden_parents, learn_time, flag_it1], f)
            f.close()
    print 'return tw: ', et.tw()
    return et, learn_time, score_best
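
# Usage sketch (hypothetical helper): this variant also returns the learning
# time and the per-node scores; tw_bound_type='n' disables the treewidth bound.
def _example_hc(df):
    et_hc, learn_time, score = hill_climbing(df, u=5, metric='bic',
                                             tw_bound_type='n')
    return et_hc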
def tsem_cache(data_frame, et0=None, u=5, metric='bic', tw_bound_type='b',
               tw_bound=5, cores=multiprocessing.cpu_count(),
               forbidden_parent=None, add_only=False, custom_classes=None,
               constraint=True, complete_prior='mode', alpha=1):
    """Learns a Bayesian network from incomplete data with tractable structural EM

    Args:
        data_frame (pandas.DataFrame): Input data
        et0 (elimination_tree.ElimTree): Initial elimination tree (optional)
        u (int): maximum number of parents allowed
        metric (str): scoring function
        tw_bound_type (str): 'b' bound, 'n' none
        tw_bound (float): tree-width bound
        cores (int): Number of cores
        forbidden_parent (list): blacklist with forbidden parents
        add_only (bool): If True, allow only arc additions
        custom_classes: If not None, the classes of each variable are set to
            custom_classes
        constraint: If True, the additions and reversals that exceed the
            treewidth bound are stored in a blacklist
        complete_prior (str): complete with the mode ('mode') or at random
        alpha: Dirichlet prior for Bayesian parameter estimation

    Returns:
        elimination_tree.ElimTree Learned elimination tree
        float Learning time
        PyFactorTree Factor tree with the learned parameters
        pandas.DataFrame Input data completed with the MPE
        list [num_changes, num_param_cal, num_improve_exp / num_param_cal]
    """
    count_i = 0
    if custom_classes is None:
        data = data_type.data(data_frame)
    else:
        data = data_type.data(data_frame, classes=custom_classes)
    nan_rows = get_nan_rows(data_frame)
    if et0 is None:
        et = ElimTree('et', data.col_names, data.classes)
    else:
        et = et0.copyInfo()
    num_nds = et.nodes.num_nds
    # Initialize the complete dataset
    if complete_prior == 'mode':
        data_frame_complete = complete_df_mode(data_frame)
    else:
        data_frame_complete = complete_df_random(data_frame, data.classes)
    data_complete = data_type.data(data_frame_complete, classes=data.classes)
    ftd = PyFactorTree_data(
        [et.nodes[i].parent_et for i in range(num_nds)],
        [et.nodes[i].nFactor for i in range(num_nds)],
        [[i] + et.nodes[i].parents.display() for i in range(num_nds)],
        [len(c) for c in data.classes])
    ftd.learn_parameters(data_complete, alpha=0.0)
    ok_to_proceed_em = True
    score_best_ll = 0
    tla = time()
    # Data is split to reduce the memory required by the indicator vectors
    num_splits = numpy.ceil(data.nrows / (1000.0 * 8.0)) * 8
    indv = Indicators_vector(data_frame, data.classes, num_splits)
    round_em = -1
    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(data.ncols)]
    else:
        forbidden_parents = forbidden_parent
    num_changes = 0
    num_param_cal = 0
    num_improve_exp = 0
    its_em = 1
    while ok_to_proceed_em:
        round_em += 1
        ok_to_proceed_em = False
        ok_to_proceed_hc = True
        # Run EM to compute the parameters and the expected score
        ftd_old = PyFactorTree_data(
            [et.nodes[i].parent_et for i in range(num_nds)],
            [et.nodes[i].nFactor for i in range(num_nds)],
            [[i] + et.nodes[i].parents.display() for i in range(num_nds)],
            [len(c) for c in data.classes])
        ftd.em_parallel(indv, its_em - 1, 0.0)
        ftd_old.copy_parameters(ftd, range(num_nds))
        score_obs = ftd.em_parallel(indv, 1, 0.0)
        num_param_cal += 1
        ftd_mpe = PyFactorTree_data(
            [et.nodes[i].parent_et for i in range(num_nds)],
            [et.nodes[i].nFactor for i in range(num_nds)],
            [[i] + et.nodes[i].parents.display() for i in range(num_nds)],
            [len(c) for c in data.classes])
        ftd_mpe.copy_parameters(ftd_old, range(num_nds))
        ftd_mpe.em_parallel(indv, 1, alpha)
        # Compute the score of the model for the observed data
        for i in range(num_nds):
            score_obs[i] += score_function(data, i, et.nodes[i].parents.display(),
                                           metric, [], ll_in=0)
        score_best_ll = score_obs
        # Impute the dataset with the MPE
        params = ftd_mpe.get_parameters()
        fiti = PyFactorTree(
            [et.nodes[i].parent_et for i in range(num_nds)],
            [et.nodes[i].nFactor for i in range(num_nds)],
            [[i] + et.nodes[i].parents.display() for i in range(num_nds)],
            [len(c) for c in data.classes])
        fiti.set_parameters([i for i in range(num_nds)], params)
        fiti.mpe_data(data_complete, data, nan_rows)
        score_best = numpy.array(
            [score_function(data_complete, i, et.nodes[i].parents.display(),
                            metric=metric) for i in range(num_nds)],
            dtype=numpy.double)
        # Cache of score differences
        cache = numpy.full([num_nds, num_nds], 1000000, dtype=numpy.double)
        # Hill-climbing loop
        while ok_to_proceed_hc:
            print "iteration ", count_i
            count_i += 1
            ftd_old = PyFactorTree_data(
                [et.nodes[i].parent_et for i in range(num_nds)],
                [et.nodes[i].nFactor for i in range(num_nds)],
                [[i] + et.nodes[i].parents.display() for i in range(num_nds)],
                [len(c) for c in data.classes])
            ftd_old.copy_parameters(ftd, range(num_nds))
            ta = time()
            score_best_ll = ftd.em_parallel(indv, 1, 0.0)
            num_param_cal += 1
            tb = time()
            # Compute the real score
            score_best_ll_real = ftd.log_likelihood_parallel(indv)
            for i in range(num_nds):
                score_best_ll[i] += score_function(
                    data, i, et.nodes[i].parents.display(), metric, [], ll_in=0)
                score_best_ll_real += score_function(
                    data, i, et.nodes[i].parents.display(), metric, [], ll_in=0)
            # Get the candidate operations sorted by score difference
            lop_o, op_names_o, score_difs_o, cache = best_pred_cache(
                data_complete, et, metric, score_best, forbidden_parents, cache,
                filter_l0=True, add_only=add_only)
            ok_to_proceed_hc = False
            while len(lop_o) > 0:
                if tw_bound_type == 'b':
                    change_idx, et_new = search_first_tractable_structure(
                        et, lop_o, op_names_o, tw_bound, forbidden_parents,
                        constraint)
                else:
                    change_idx, et_new = search_first_tractable_structure(
                        et, lop_o, op_names_o, 10000)
                if change_idx != -1:
                    best_lop = lop_o[change_idx]
                    xout = best_lop[0]
                    xin = best_lop[1]
                    best_op_names = op_names_o[change_idx]
                    best_score_difs = score_difs_o[change_idx]
                    # Compute the parameters of the new ET
                    ftd_aux = PyFactorTree_data(
                        [et_new.nodes[i].parent_et for i in range(num_nds)],
                        [et_new.nodes[i].nFactor for i in range(num_nds)],
                        [[i] + et_new.nodes[i].parents.display()
                         for i in range(num_nds)],
                        [len(c) for c in data.classes])
                    if best_op_names == 2:
                        nodes_change = best_lop.tolist()
                    else:
                        nodes_change = [best_lop[1]]
                    nodes_copy = range(num_nds)
                    for xi in nodes_change:
                        nodes_copy.remove(xi)
                    tc = time()
                    score_obs_new = list(score_best_ll)
                    ftd_aux.copy_parameters(ftd, nodes_copy)
                    for xi in nodes_change:
                        score_obs_new[xi] = ftd_aux.em_parameters_parallel(
                            xi, ftd_old, indv, 0.0)
                        num_param_cal += 1
                        score_obs_new[xi] += score_function(
                            data, xi, et_new.nodes[xi].parents.display(),
                            metric, [], ll_in=0)
                    # Compute the real score of the candidate
                    score_obs_new_real = ftd_aux.log_likelihood_parallel(indv)
                    for i in range(num_nds):
                        score_obs_new_real += score_function(
                            data, i, et_new.nodes[i].parents.display(),
                            metric, [], ll_in=0)
                    td = time()
                    if score_obs_new_real > score_best_ll_real:
                        ok_to_proceed_hc = True
                        ok_to_proceed_em = True
                        ftd_best = PyFactorTree_data(
                            [et_new.nodes[i].parent_et for i in range(num_nds)],
                            [et_new.nodes[i].nFactor for i in range(num_nds)],
                            [[i] + et_new.nodes[i].parents.display()
                             for i in range(num_nds)],
                            [len(c) for c in data.classes])
                        ftd_best.copy_parameters(ftd_aux, range(num_nds))
                        if sum(score_obs_new) > sum(score_best_ll):
                            num_improve_exp += 1
                        score_best_ll = score_obs_new
                        et_best = et_new
                        # Update the score and the cache
                        if best_op_names == 2:
                            score_best[xin] += cache[xout, xin]
                            score_best[xout] += cache[xin, xout]
                            cache[:, xout] = 1000000
                        else:
                            score_best[xin] += best_score_difs
                        cache[:, xin] = 1000000
                        te = time()
                        lop_o = []
                        num_changes += 1
                        print "iteration ", count_i, ', round: ', round_em, ', change: ', [best_op_names, xout, xin], ', tw: ', et_best.tw()
                    else:
                        # Blacklist the rejected operation and try the next one
                        if best_op_names == 0:
                            forbidden_parents[best_lop[1]].append(best_lop[0])
                        elif best_op_names == 2:
                            forbidden_parents[best_lop[0]].append(best_lop[1])
                        lop_o = lop_o[(change_idx + 1):]
                        op_names_o = op_names_o[(change_idx + 1):]
                else:
                    ok_to_proceed_hc = False
                    lop_o = []
            if ok_to_proceed_hc:
                ftd = ftd_best
                et = et_best
    # Final smoothed parameters and MPE completion of the dataset
    ftd.em_parallel(indv, 1, alpha)
    tlb = time()
    params = ftd.get_parameters()
    fiti = PyFactorTree(
        [et.nodes[i].parent_et for i in range(num_nds)],
        [et.nodes[i].nFactor for i in range(num_nds)],
        [[i] + et.nodes[i].parents.display() for i in range(num_nds)],
        [len(c) for c in data.classes])
    fiti.set_parameters([i for i in range(num_nds)], params)
    fiti.mpe_data(data_complete, data, nan_rows)
    df_complete = data_complete.to_DataFrame()
    return et, tlb - tla, fiti, df_complete, [num_changes, num_param_cal,
                                              float(num_improve_exp) / float(num_param_cal)]
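
# Usage sketch (hypothetical helper): structural EM on incomplete data. The
# five return values are the structure, the elapsed time, the factor tree used
# for imputation, the MPE-completed dataframe, and search statistics.
def _example_tsem(df, custom_classes):
    et, t_learn, fiti, df_complete, stats = tsem_cache(
        df, metric='bic', tw_bound_type='b', tw_bound=5,
        custom_classes=custom_classes, complete_prior='random', alpha=1)
    num_changes, num_param_cal, pct_improve = stats
    return et, fiti, df_complete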
def learn_mbc_cll(data_frame, cll_query, et0=None, u=5, metric='bic',
                  tw_bound_type='b', tw_bound=5, cores=multiprocessing.cpu_count(),
                  forbidden_parent=None, add_only=False, custom_classes=None,
                  constraint=True, alpha=1, verbose=False):
    """Discriminative learning of MBCs

    Args:
        data_frame (pandas.DataFrame): Input data
        cll_query: list of query variables; the rest are treated as evidence
        et0 (elimination_tree.ElimTree): Initial elimination tree (optional)
        u (int): maximum number of parents allowed
        metric (str): scoring function
        tw_bound_type (str): 'b' bound, 'n' none
        tw_bound (float): tree-width bound
        cores (int): Number of cores
        forbidden_parent (list): blacklist with forbidden parents
        add_only (bool): If True, allow only arc additions
        custom_classes: If not None, the classes of each variable are set to
            custom_classes
        constraint: If True, the additions and reversals that exceed the
            treewidth bound are stored in a blacklist
        alpha: Dirichlet prior for Bayesian parameter estimation
        verbose (bool): If True, print details of the learning process

    Returns:
        elimination_tree.ElimTree Learned MBC
    """
    # Initialization
    count_i = 0
    if custom_classes is None:
        data = data_type.data(data_frame)
    else:
        data = data_type.data(data_frame, classes=custom_classes)
    if et0 is None:
        et = ElimTree('et', data.col_names, data.classes)
    else:
        et = et0.copyInfo()
    data_cll = data_frame
    num_nds = et.nodes.num_nds
    data_complete = data_type.data(data_cll, classes=data.classes)
    # Currently selected variables. For efficiency, only the query variables
    # and their subgraph are used to compute the score
    sel_vars = cll_query + get_subgraph(et, cll_query)
    # Create the initial factor tree
    ftd = PyFactorTree_data(
        [et.nodes[i].parent_et for i in range(num_nds)],
        [et.nodes[i].nFactor for i in range(num_nds)],
        [[i] + et.nodes[i].parents.display() for i in range(num_nds)],
        [len(data.classes[i]) for i in range(num_nds)])
    ftd.learn_parameters(data_complete, alpha=0)
    score_best_cll = 0
    # Initialize the indicator vectors with all the data
    num_splits = numpy.ceil(data.nrows / (1000.0 * 8.0)) * 8
    indv = Indicators_vector(data_cll, data.classes, num_splits)
    df_ev = data_cll.copy()
    df_ev.iloc[:, cll_query] = np.nan
    ind_ev = Indicators_vector(df_ev, data.classes, num_splits)
    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(data.ncols)]
    else:
        forbidden_parents = forbidden_parent
    # Compute the score of the first model
    score_best_cll = compute_cll(indv, ind_ev, et, ftd, cll_query, data.classes)
    for i in range(num_nds):
        score_best_cll += score_function(data, i, et.nodes[i].parents.display(),
                                         metric, [], ll_in=0)
    # Generative (e.g., BIC) score, used to rank the candidate operations
    score_best = numpy.array(
        [score_function(data_complete, i, et.nodes[i].parents.display(),
                        metric=metric) for i in range(num_nds)],
        dtype=numpy.double)
    # Cache of score differences
    cache = numpy.full([num_nds, num_nds], 1000000, dtype=numpy.double)
    # Hill-climbing loop
    ok_to_proceed_hc = True
    while ok_to_proceed_hc:
        count_i += 1
        # Get the forbidden parents that preserve the MBC structure
        forbidden_parents_mbc = forbidden_mbc(et, cll_query, forbidden_parents)
        # Get the candidate operations sorted by score difference
        lop_o, op_names_o, score_difs_o, cache = best_pred_cache(
            data_complete, et, metric, score_best, forbidden_parents_mbc, cache,
            filter_l0=True, add_only=add_only)
        ok_to_proceed_hc = False
        while len(lop_o) > 0:
            if tw_bound_type == 'b':
                change_idx, et_new = search_first_tractable_structure(
                    et, lop_o, op_names_o, tw_bound, forbidden_parents, constraint)
            else:
                change_idx, et_new = search_first_tractable_structure(
                    et, lop_o, op_names_o, 10000)
            if change_idx != -1:
                best_lop = lop_o[change_idx]
                xout = best_lop[0]
                xin = best_lop[1]
                best_op_names = op_names_o[change_idx]
                best_score_difs = score_difs_o[change_idx]
                # Compute the parameters of the new ET
                ftd_aux = PyFactorTree_data(
                    [et_new.nodes[i].parent_et for i in range(num_nds)],
                    [et_new.nodes[i].nFactor for i in range(num_nds)],
                    [[i] + et_new.nodes[i].parents.display()
                     for i in range(num_nds)],
                    [len(c) for c in data.classes])
                if best_op_names == 2:
                    nodes_change = best_lop.tolist()
                else:
                    nodes_change = [best_lop[1]]
                nodes_copy = range(num_nds)
                for xi in nodes_change:
                    nodes_copy.remove(xi)
                ftd_aux.learn_parameters(data, alpha=0)
                # Compute the real (conditional log-likelihood) score
                score_obs_new_real = compute_cll(indv, ind_ev, et_new, ftd_aux,
                                                 cll_query, data.classes)
                for i in range(num_nds):
                    score_obs_new_real += score_function(
                        data, i, et_new.nodes[i].parents.display(),
                        metric, [], ll_in=0)
                if score_obs_new_real > score_best_cll:
                    ok_to_proceed_hc = True
                    ftd = ftd_aux
                    score_diff = score_obs_new_real - score_best_cll
                    score_best_cll = score_obs_new_real
                    et = et_new
                    # Update the score and the cache
                    if best_op_names == 2:
                        score_best[xin] += cache[xout, xin]
                        score_best[xout] += cache[xin, xout]
                        cache[:, xout] = 1000000
                    else:
                        score_best[xin] += best_score_difs
                    cache[:, xin] = 1000000
                    lop_o = []
                    sel_vars = cll_query + get_subgraph(et, cll_query)
                    if verbose:
                        print "iteration ", count_i, ', change: ', [best_op_names, xout, xin], ', tw: ', et.tw(), ', score_diff: ', score_diff, ', len(sel_vars): ', len(sel_vars)
                else:
                    # Blacklist the rejected operation and try the next one
                    if best_op_names == 0:
                        forbidden_parents[best_lop[1]].append(best_lop[0])
                    elif best_op_names == 2:
                        forbidden_parents[best_lop[0]].append(best_lop[1])
                    lop_o = lop_o[(change_idx + 1):]
                    op_names_o = op_names_o[(change_idx + 1):]
            else:
                ok_to_proceed_hc = False
                lop_o = []
    return et
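
# Usage sketch (hypothetical helper): discriminative search maximizing the
# conditional log-likelihood of the query variables given the evidence.
def _example_mbc_cll(df, query_idx, custom_classes):
    return learn_mbc_cll(df, query_idx, metric='bic', tw_bound_type='b',
                         tw_bound=5, custom_classes=custom_classes,
                         verbose=True)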
import numpy as np
import pandas as pd
from epilepsy_classifiers import MBCClassifier
import data_type
from export import get_adjacency_matrix_from_et
from utils import export_dsc

# ----------- Preprocess -------------#
# Load data
path = "data/"
# Read datasets (preprocess_data is defined elsewhere in the project)
df_train, df_test, q_vars_after_merge, cut_list = preprocess_data(path)
df_variables = pd.DataFrame({"id": np.arange(df_train.shape[1]),
                             "name": df_train.columns})
# Response variables
response = ["Engel.1st.yr", "Engel.2nd.yr", "Engel.5th.yr"]
# Get the categories of all variables
data_train_t = data_type.data(df_train)
custom_classes = data_train_t.classes
# Combine UCSF and MNI datasets
df_all = df_train.append(df_test).reset_index().drop("index", axis=1)
df_all.iloc[:, :-3] = df_all.drop(response, axis=1).astype(np.float32)

# ----------- Train MBC -------------#
# Forbidden parents for the MBC: features cannot be parents of the responses,
# and each response may only have the preceding responses as parents
num_nds = df_all.shape[1]
forbidden_parent = [[] for _ in range(num_nds - 3)]
forbidden_parent.append(range(num_nds - 3) + [num_nds - 1, num_nds - 2])
forbidden_parent.append(range(num_nds - 3) + [num_nds - 1])
forbidden_parent.append(range(num_nds - 3))
# Fit the classifier
estimator = MBCClassifier(response=response, custom_classes=custom_classes,
                          repeats=20, metric_sem="aic", metric_classifier="aic",
                          alpha=2.5, forbidden_parents=forbidden_parent)
estimator.fit(df_all)
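
# ----------- Predict and export -------------#
# Hypothetical continuation of the script: the original stops after fitting.
# Class probabilities for the response variables of the test rows
y_hat = estimator.predict_proba(df_test)
# Adjacency matrix of the first MBC in the ensemble (assumed use of the
# imported helper)
adj = get_adjacency_matrix_from_et(estimator.mbc_ets[0])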