Example #1
def multi_sample_dataset(fiti, df, custom_classes, random_state=1, repeats=30):
    """
	Complete the missing values of df by sampling multiple times according to fiti

	Parameters
	----------
	fiti: factor tree used to sample the missing values
	df: pandas dataframe with missing values
	custom_classes: classes of each variable
	random_state: random seed for the samples
	repeats: number of repeated samples

	Returns
	-------
	List of datasets
	"""
    # Get data types
    datat = data_type.data(df, classes=custom_classes)
    data_complete = data_type.data(df, classes=custom_classes)
    nan_rows = get_nan_rows(df)
    # Set the random seed and draw one seed per repeated sample
    numpy.random.seed(random_state)
    seeds = numpy.random.randint(low=0, high=1000, size=repeats)
    dfs_complete = []
    # Impute data multiple times
    for seed in seeds:
        fiti.sample_data(data_complete, datat, nan_rows, random_state=seed)
        df_complete = data_complete.to_DataFrame()
        dfs_complete.append(df_complete)
    return dfs_complete
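
A hedged usage sketch, assuming fiti is an imputer factor tree such as the one returned by tsem_cache in Example #8, with df and custom_classes prepared as in Example #10:

dfs_complete = multi_sample_dataset(fiti, df, custom_classes, random_state=1, repeats=30)
print(len(dfs_complete))  # 30 completed copies of df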
Example #2
    def predict_proba(self, df, random_state=1, repeats_inf=30):
        # Impute the prediction data (the response columns are hidden)
        num_cols = len(self.custom_classes)
        df_inf = df.copy()
        df_inf[self.response] = np.nan
        df_imputed_list = multi_sample_dataset(self.imputer,
                                               df_inf,
                                               self.custom_classes,
                                               random_state=random_state,
                                               repeats=repeats_inf)
        y_hat = np.zeros([df.shape[0], len(self.response)])
        df_imputed = pd.concat(df_imputed_list, axis=0)
        df_imputed_t = data_type.data(df_imputed, classes=self.custom_classes)
        # Get the prediction of each estimator
        for estimator in self.estimators:
            # Query the last three columns (the responses) given the remaining columns
            y_hat_new = estimator.pred_data(df_imputed_t, [0, 0, 0],
                                            range(num_cols - 3, num_cols),
                                            range(0, num_cols - 3))
            # Average over the repeated imputations (three response variables assumed)
            y_hat_new = y_hat_new.reshape([repeats_inf, df.shape[0], 3])
            y_hat_avg = np.mean(y_hat_new, axis=0)
            # Accumulate the predictions of the estimators
            y_hat = y_hat + y_hat_avg
        # Average over the estimators
        y_hat = y_hat / len(self.estimators)
        return y_hat
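
The pooling above is plain multiple-imputation averaging: predictions are averaged first over the repeated imputations and then over the estimators. A minimal, self-contained numpy sketch of the same shape manipulation (all names and sizes are illustrative):

import numpy as np

repeats_inf, n_rows, n_resp = 30, 8, 3
rng = np.random.default_rng(0)
# One flat prediction vector per estimator, in the layout pred_data is assumed to return
estimator_outputs = [rng.random(repeats_inf * n_rows * n_resp) for _ in range(5)]

y_hat = np.zeros([n_rows, n_resp])
for flat in estimator_outputs:
    per_imputation = flat.reshape([repeats_inf, n_rows, n_resp])
    y_hat += per_imputation.mean(axis=0)  # average over imputations
y_hat /= len(estimator_outputs)  # average over estimators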
Example #3
def filter_dataset(data_frame, cll_query, custom_classes, metric='bic', cores=multiprocessing.cpu_count()):
    data = data_type.data(data_frame, classes=custom_classes)
    et = ElimTree('et', data.col_names, data.classes)
    num_nds = et.nodes.num_nds
    # Build the blacklist: MBC-forbidden parents, plus arcs between query variables
    forbidden_parents = [[] for _ in range(data.ncols)]
    forbidden_parents = forbidden_mbc(et, cll_query, forbidden_parents)
    for i in range(len(forbidden_parents)):
        if i in cll_query:
            forbidden_parents[i] = forbidden_parents[i] + cll_query
    score_best = np.array([score_function(data, i, [], metric=metric) for i in range(num_nds)], dtype=np.double)
    cache = np.full([et.nodes.num_nds, et.nodes.num_nds], 1000000, dtype=np.double)
    lop_o, op_names_o, score_difs_o, cache = best_pred_cache(data, et, metric, score_best, forbidden_parents, cache, filter_l0=True, add_only=True)

    # Keep the query variables plus every variable that appears in a candidate arc addition
    selected = cll_query + np.unique(lop_o[:, 1]).tolist()
    data_filtered = data_frame.iloc[:, selected]
    return data_filtered, selected
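
A hedged usage sketch, assuming df_all and custom_classes as prepared in Example #10; the query indices are illustrative:

cll_query = [10, 11, 12]  # illustrative indices of the query (response) columns
df_filtered, selected = filter_dataset(df_all, cll_query, custom_classes, metric='bic')
print(df_filtered.shape, selected)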
Example #4
def l_bfgs(data_frame, et, cll_query, custom_classes, alpha=1.0):
    num_vars = data_frame.shape[1]
    et_descriptor = [[et.nodes[i].parent_et for i in range(num_vars)],
                     [et.nodes[i].nFactor for i in range(num_vars)],
                     [[i] + et.nodes[i].parents.display() for i in range(num_vars)],
                     [len(c) for c in custom_classes]]
    etc_d = PyFactorTree_data(et_descriptor[0], et_descriptor[1], et_descriptor[2], et_descriptor[3])
    etc = PyFactorTree(et_descriptor[0], et_descriptor[1], et_descriptor[2], et_descriptor[3])
    dt = data_type.data(data_frame, custom_classes)
    etc_d.learn_parameters(dt, alpha=alpha)

    num_splits = np.ceil(data_frame.shape[0] / (1000.0 * 8.0)) * 8  # Data is split when computing the CLL to reduce memory requirements
    indv = Indicators_vector(data_frame, custom_classes, num_splits)  # Indicator vectors for fast computation of the CLL
    data_ev = data_frame.copy()
    data_ev.iloc[:, cll_query] = np.nan
    ind_ev = Indicators_vector(data_ev, custom_classes, num_splits)

    # Maximize the conditional log-likelihood with L-BFGS-B, then copy the parameters
    etc_d.l_bfgs_b(indv, ind_ev, 1.0)
    params = etc_d.get_parameters()
    nodes = [xi for xi in range(num_vars)]
    etc.set_parameters(nodes, params)
    return etc
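
A hedged usage sketch, assuming an elimination tree learned beforehand (e.g., with hill_climbing_cache from Example #6) and cll_query as in the previous sketch:

et = hill_climbing_cache(df_all, metric='bic', custom_classes=custom_classes)
etc = l_bfgs(df_all, et, cll_query, custom_classes, alpha=1.0)  # factor tree with CLL-optimized parameters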
Example #5
    def fit(self, df, random_state=1):
        # Get index of the response columns
        cll_query = [np.where(ri == df.columns)[0][0] for ri in self.response]
        # Train imputer
        # SEM + multiple imputation
        et_sem, _, fiti_sem, df_complete, _ = tsem_cache(
            df,
            custom_classes=self.custom_classes,
            metric=self.metric_sem,
            complete_prior="random")
        self.imputer = fiti_sem
        self.et_sem = et_sem
        df_imputed_list = multi_sample_dataset(self.imputer,
                                               df,
                                               self.custom_classes,
                                               random_state=random_state,
                                               repeats=self.repeats)
        self.df_imputed_list = df_imputed_list

        # Fit a model to each dataset
        for dfi in df_imputed_list:
            et_mbc = learn_mbc_generative(dfi,
                                          cll_query,
                                          pruned=False,
                                          et0=None,
                                          u=5,
                                          forbidden_parent=deepcopy(
                                              self.forbidden_parents),
                                          metric=self.metric_classifier,
                                          custom_classes=self.custom_classes)

            # Get factor tree
            fiti_mbc = PyFactorTree([
                et_mbc.nodes[j].parent_et for j in range(et_mbc.nodes.num_nds)
            ], [et_mbc.nodes[j].nFactor for j in range(et_mbc.nodes.num_nds)],
                                    [[j] + et_mbc.nodes[j].parents.display()
                                     for j in range(et_mbc.nodes.num_nds)],
                                    [len(c) for c in self.custom_classes])
            df_imputed_t = data_type.data(dfi, classes=self.custom_classes)
            fiti_mbc.learn_parameters(df_imputed_t, alpha=self.alpha)
            self.estimators.append(fiti_mbc)
            self.mbc_ets.append(et_mbc)
            self.mbc_params.append(fiti_mbc.get_parameters())
Example #6
def hill_climbing_cache(data_frame, et0=None, u=5, metric='bic', tw_bound_type='b', tw_bound=5, cores=multiprocessing.cpu_count(), forbidden_parent=None, add_only=False, custom_classes=None, constraint=True, verbose=False):
    """Learns a Bayesian network with bounded treewidth

    Args:
        data_frame (pandas.DataFrame): Input data
        et0 (elimination_tree.ElimTree): Initial elimination tree (optional)
        u (int): maximum number of parents allowed
        metric (str): scoring function
        tw_bound_type (str): 'b' bound, 'n' none
        tw_bound (float): tree-width bound
        cores (int): Number of cores
        forbidden_parent (list): blacklist with forbidden parents
        add_only (bool): If true, allow only arc additions
        custom_classes: If not None, the classes of each variable are set to custom_classes
        constraint: If true, the additions and reversals that exceed the treewidth bound are stored in a blacklist
        verbose (bool): If True, print details of the learning process 

    Returns:
        elimination_tree.ElimTree Learned elimination tree
    """

    count_i = 0
    if custom_classes is None:
        data = data_type.data(data_frame)
    else:
        data = data_type.data(data_frame, classes=custom_classes)
    
    
    if et0 is None:
        et = ElimTree('et', data.col_names, data.classes)
    else:
        et = et0.copyInfo()
    
    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(data.ncols)]
    else:
        forbidden_parents = forbidden_parent
    ok_to_proceed = True
    num_nds = et.nodes.num_nds
    
    score_best = np.array([score_function(data, i, et.nodes[i].parents.display(), metric=metric) for i in range(num_nds)], dtype=np.double)
    # Cache of score differences; 1000000 marks entries not yet computed
    cache = np.full([et.nodes.num_nds, et.nodes.num_nds], 1000000, dtype=np.double)
    
    # Hill-climbing loop
    while ok_to_proceed:
        count_i += 1
        
        ta = time()
        # Find the best candidate operations (add, remove, reverse) using the cache
        lop_o, op_names_o, score_difs_o, cache = best_pred_cache(data, et, metric, score_best, forbidden_parents, cache, filter_l0=True, u=u, add_only=add_only)
        tc = time()
        if len(lop_o) > 0:
            if tw_bound_type == 'b':
                change_idx, et_new = search_first_tractable_structure(et, lop_o, op_names_o, tw_bound, forbidden_parents, constraint)
                if change_idx == -1:
                    return et
            else:
                change_idx = 0
                xout = lop_o[change_idx][0]
                xin = lop_o[change_idx][1]
                et_new = et.copyInfo()
                if op_names_o[change_idx] == 0:    # arc addition
                    et_new.setArcBN_py(xout, xin)
                elif op_names_o[change_idx] == 1:  # arc removal
                    et_new.removeArcBN_py(xout, xin)
                else:                              # arc reversal
                    et_new.removeArcBN_py(xout, xin)
                    et_new.setArcBN_py(xin, xout)
            
            best_lop = lop_o[change_idx]
            xout = best_lop[0]
            xin = best_lop[1]
            best_op_names = op_names_o[change_idx]
            best_score_difs = score_difs_o[change_idx]
            et = et_new
            # Update score and cache
            if best_op_names == 2:
                score_best[xin] += cache[xout, xin]
                score_best[xout] += cache[xin, xout]
                cache[:, xout] = 1000000
            else:
                score_best[xin] += best_score_difs
            cache[:, xin] = 1000000
            tb = time()
            if verbose:
                if tw_bound_type == 'b':
                    print('it: ', count_i, ', change: ', [best_op_names, xout, xin], ', tw: ', et.tw(), ', time: ', tc - ta, ', timetw: ', tb - tc, ', best_score_difs: ', best_score_difs)
                else:
                    print('it: ', count_i, ', change: ', [best_op_names, xout, xin], ', time: ', tc - ta, ', timetw: ', tb - tc, ', best_score_difs: ', best_score_difs)
        else:
            ok_to_proceed = False        
    return et
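
A hedged usage sketch on a complete dataframe, again borrowing df_all and custom_classes from Example #10:

et = hill_climbing_cache(df_all, u=5, metric='bic', tw_bound_type='b', tw_bound=5, custom_classes=custom_classes, verbose=True)
adj = get_adjacency_matrix_from_et(et)  # adjacency matrix of the learned network (import shown in Example #10)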
Example #7
def hill_climbing(data_frame, et0=None, u=5, metric='bic', metric_params=None, chc=False, tw_bound_type='b', tw_bound=5, optimization_type='size', k_complex=0.1, cores=multiprocessing.cpu_count(), forbidden_parent=None):
    """Gets the adjacency matrix of the Bayesian network encoded by the elimination tree et

    Args:
        data_frame (pandas.DataFrame): Input data
        et0 (elimination_tree.ElimTree): Initial limination tree
        u (int): maximum number of parents
        metric (str): scoring functions
        metric_params (list): Parameters for the scoring function
        chc (bool): If truth efficient learning (constrained hill-climbing) is performed
        tw_bound_type (str): 'b' bound, 'n' none
        tw_bound (float): tree-width bound
        k_complex (float): complexity penalization
        optimization_type (str): 'tw' tries to reduce tree-width, 'size' tries to reduce size. Only used if tw_bound_type!='n'
        cores (int): Number of cores
        forbidden_parent (list): balcklist with forbidden parents

    Returns:
        elimination_tree.ElimTree Learned elimination tree
        float Learning time
    """
        
    # Initialize variables
    k = k_complex
    count_i = 0
    data = data_type.data(data_frame)
    if et0 is None:
        et = ElimTree('et', data.col_names, data.classes)
    else:
        et = et0.copyInfo()
    len_nodes = data.ncols
    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(data.ncols)]
    else:
        forbidden_parents = forbidden_parent

    # Initialize score_best := Array with the metric value for all the variables and the complexity penalization
    score_best = []
    for i in range(0, len_nodes):
        parents = et.nodes[i].parents.display()
        score_best.append(score_function(data, i, parents, metric, metric_params))

    # Complexity of the network
    if tw_bound_type != 'n':
        score_best.append(-k * et.evaluate())
    else:
        score_best.append(0)

    ok_to_proceed = True
    learn_time = 0

    # While there is an improvement
    while ok_to_proceed:
        sys.stdout.write("\r Iteration %d" % count_i)
        sys.stdout.flush()
        count_i += 1

        # Input, Output and new
        ta = time()
        et_new, score_new, time_score, time_compile, time_opt, best_change, forbidden_parents = best_pred_hc(data, et, score_best, forbidden_parents, u, metric, metric_params, chc, tw_bound_type, tw_bound, optimization_type, k_complex, cores)
        
        # If mixed, update order
        if optimization_type == 'mixed' and et_new.tw()>et.tw():
            adj = get_adjacency_matrix_from_et(et_new)
            order, tw_greedy = greedy_tree_width(adj, method='fill')
            if tw_greedy < et_new.tw(False):
                et_new.compile_ordering(order.tolist())
                if tw_bound_type == 'b':
                    if tw_bound >= et_new.tw(False):
                        score_new[-1] = et_new.size() * k_complex
                    else:
                        score_new[-1] = float("-inf")
                elif tw_bound_type == 'p':
                    score_new[-1] = et_new.size() * k_complex
            
        tb = time()
        time_total = time_score + time_compile + time_opt
        print('total time: ', time_total, ', Score Time: ', time_score, ', Compile Time: ', time_compile, ', Opt time: ', time_opt, ', pct: ', float(time_score) / float(time_total))
        print('change: ', best_change, ', tw: ', et_new.tw(False), ', tw again: ', et_new.tw())
        learn_time += tb - ta
        if sum(score_new) > sum(score_best):
            et = et_new
            score_best = score_new
        else:
            ok_to_proceed = False
        # Dump intermediate results to disk
        flag_it1 = et0 is None
        with open('learn_aux_dump', 'wb') as f:
            cloudpickle.dump([et, et_new, forbidden_parents, learn_time, flag_it1], f)
        print('return tw: ', et.tw())

    return et, learn_time, score_best
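
A hedged usage sketch showing the three return values:

et, learn_time, score_best = hill_climbing(df_all, u=5, tw_bound_type='b', tw_bound=5)
print('tw:', et.tw(), 'time:', learn_time, 'score:', sum(score_best))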
Example #8
def tsem_cache(data_frame, et0=None, u=5, metric='bic', tw_bound_type='b', tw_bound=5, cores=multiprocessing.cpu_count(), forbidden_parent=None, add_only=False, custom_classes=None, constraint=True, complete_prior='mode', alpha=1):
    """Learns a multi-dimensional Bayesian network classifier

    Args:
        data_frame (pandas.DataFrame): Input data
        et0 (elimination_tree.ElimTree): Initial elimination tree (optional)
        u (int): maximum number of parents allowed
        metric (str): scoring function
        tw_bound_type (str): 'b' bound, 'n' none
        tw_bound (float): tree-width bound
        cores (int): Number of cores
        forbidden_parent (list): blacklist with forbidden parents
        add_only (bool): If true, allow only arc additions
        custom_classes: If not None, the classes of each variable are set to custom_classes
        constraint: If true, the additions and reversals that exceed the treewidth bound are stored in a blacklist
        complete_prior (str): complete with mode or at random
        alpha: Dirichlet prior for Bayesian parameter estimation

    Returns:
        elimination_tree.ElimTree: Learned elimination tree
        float: Learning time
        PyFactorTree: Factor tree with the learned parameters
        pandas.DataFrame: Dataset completed with the MPE
        list: [num_changes, num_param_cal, fraction of expected-score improvements]
    """
    
    count_i = 0
    if custom_classes is None:
        data = data_type.data(data_frame)
    else:
        data = data_type.data(data_frame, classes=custom_classes)
    nan_rows = get_nan_rows(data_frame)
    
    if et0 is None:
        et = ElimTree('et', data.col_names, data.classes)
    else:
        et = et0.copyInfo()
    

    num_nds = et.nodes.num_nds
    
    # Initialize the complete dataset
    if complete_prior == 'mode':
        data_frame_complete = complete_df_mode(data_frame)
    else:
        data_frame_complete = complete_df_random(data_frame, data.classes)  # Initialize missing values at random
    data_complete = data_type.data(data_frame_complete, classes=data.classes)

    ftd = PyFactorTree_data([et.nodes[i].parent_et for i in range(et.nodes.num_nds)], [et.nodes[i].nFactor for i in range(et.nodes.num_nds)], [[i] + et.nodes[i].parents.display() for i in range(et.nodes.num_nds)], [len(c) for c in data.classes])
    ftd.learn_parameters(data_complete, alpha=0.0)
    ok_to_proceed_em = True
    score_best_ll = 0
    tla = time()
    
    num_splits = numpy.ceil(data.nrows / (1000.0*8.0))*8
    indv = Indicators_vector(data_frame, data.classes, num_splits)        


    round_em = -1
    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(data.ncols)]
    else:
        forbidden_parents = forbidden_parent
        
        
    num_changes = 0
    num_param_cal = 0
    num_improve_exp = 0
    its_em = 1
    while ok_to_proceed_em:   
        round_em += 1
        ok_to_proceed_em = False
        ok_to_proceed_hc = True
        # Run EM to compute parameters and compute score

        ftd_old = PyFactorTree_data([et.nodes[i].parent_et for i in range(et.nodes.num_nds)], [et.nodes[i].nFactor for i in range(et.nodes.num_nds)], [[i]+et.nodes[i].parents.display() for i in range(et.nodes.num_nds)], [len(c) for c in data.classes])
        ftd.em_parallel(indv, its_em-1, 0.0)
        ftd_old.copy_parameters(ftd, range(num_nds))
        score_obs = ftd.em_parallel(indv, 1, 0.0)
        num_param_cal += 1

        ftd_mpe = PyFactorTree_data([et.nodes[i].parent_et for i in range(et.nodes.num_nds)], [et.nodes[i].nFactor for i in range(et.nodes.num_nds)], [[i]+et.nodes[i].parents.display() for i in range(et.nodes.num_nds)], [len(c) for c in data.classes])
        ftd_mpe.copy_parameters(ftd_old, range(num_nds))        
        ftd_mpe.em_parallel(indv, 1, alpha)
        
        # Compute score of the model for the observed data
        for i in range(num_nds):
            score_obs[i] += score_function(data, i, et.nodes[i].parents.display(), metric, [],ll_in = 0)     
        score_best_ll = score_obs
        
        # Impute dataset
        params = ftd_mpe.get_parameters()
        fiti = PyFactorTree([et.nodes[i].parent_et for i in range(et.nodes.num_nds)], [et.nodes[i].nFactor for i in range(et.nodes.num_nds)], [[i]+et.nodes[i].parents.display() for i in range(et.nodes.num_nds)], [len(c) for c in data.classes])
        fiti.set_parameters([i for i in range(num_nds)],params)
        fiti.mpe_data(data_complete, data, nan_rows)        
        
        score_best = numpy.array([score_function(data_complete, i, et.nodes[i].parents.display(), metric=metric) for i in range(num_nds)], dtype=numpy.double)
        # Cache of score differences; 1000000 marks entries not yet computed
        cache = numpy.full([et.nodes.num_nds, et.nodes.num_nds], 1000000, dtype=numpy.double)
        # Hill-climbing loop
        while ok_to_proceed_hc:
            print "iteration ", count_i
            count_i += 1
            ftd_old = PyFactorTree_data([et.nodes[i].parent_et for i in range(et.nodes.num_nds)], [et.nodes[i].nFactor for i in range(et.nodes.num_nds)], [[i]+et.nodes[i].parents.display() for i in range(et.nodes.num_nds)], [len(c) for c in data.classes])
            ftd_old.copy_parameters(ftd, range(num_nds)) 
            ta = time()
            score_best_ll = ftd.em_parallel(indv, 1, 0.0)
            num_param_cal += 1

            tb = time()
            # Compute real score
            score_best_ll_real = ftd.log_likelihood_parallel(indv)
            for i in range(num_nds):
                score_best_ll[i] += score_function(data, i, et.nodes[i].parents.display(), metric, [],ll_in = 0)   
                score_best_ll_real += score_function(data, i, et.nodes[i].parents.display(), metric, [],ll_in = 0)  
            # Find the best candidate operations using the cache
            lop_o, op_names_o, score_difs_o, cache = best_pred_cache(data_complete, et, metric, score_best, forbidden_parents, cache, filter_l0=True, add_only=add_only)
            
            ok_to_proceed_hc = False
            while len(lop_o) > 0:
                if tw_bound_type=='b':
                    change_idx, et_new = search_first_tractable_structure(et, lop_o, op_names_o, tw_bound, forbidden_parents, constraint)
                else:
                    change_idx, et_new = search_first_tractable_structure(et, lop_o, op_names_o, 10000)   
                if change_idx != -1:
                    best_lop = lop_o[change_idx]
                    xout = best_lop[0]
                    xin = best_lop[1]
                    best_op_names = op_names_o[change_idx]
                    best_score_difs = score_difs_o[change_idx]
                    
                                       
                    #Compute parameters of the new ET
                    ftd_aux = PyFactorTree_data([et_new.nodes[i].parent_et for i in range(et_new.nodes.num_nds)], [et_new.nodes[i].nFactor for i in range(et_new.nodes.num_nds)], [[i]+et_new.nodes[i].parents.display() for i in range(et_new.nodes.num_nds)], [len(c) for c in data.classes])
            
                    if best_op_names == 2:
                        nodes_change = best_lop.tolist()
                    else:
                        nodes_change = [best_lop[1]]

                    nodes_copy = list(range(num_nds))
                    for xi in nodes_change:
                        nodes_copy.remove(xi)
                    tc = time()
                    score_obs_new = list(score_best_ll)
                    ftd_aux.copy_parameters(ftd, nodes_copy)
                    for xi in nodes_change:
                        score_obs_new[xi] = ftd_aux.em_parameters_parallel(xi, ftd_old, indv, 0.0)
                        num_param_cal += 1
                        score_obs_new[xi] += score_function(data, xi, et_new.nodes[xi].parents.display(), metric, [],ll_in = 0)  
                     
                    # Compute real score
                    score_obs_new_real = ftd_aux.log_likelihood_parallel(indv)
                    for i in range(num_nds):
                        score_obs_new_real += score_function(data, i, et_new.nodes[i].parents.display(), metric, [],ll_in = 0) 
                    
                    td = time()
                    if score_obs_new_real > score_best_ll_real:
                        ok_to_proceed_hc = True
                        ok_to_proceed_em = True
                        ftd_best = PyFactorTree_data([et_new.nodes[i].parent_et for i in range(et_new.nodes.num_nds)], [et_new.nodes[i].nFactor for i in range(et_new.nodes.num_nds)], [[i] + et_new.nodes[i].parents.display() for i in range(et_new.nodes.num_nds)], [len(c) for c in data.classes])
                        ftd_best.copy_parameters(ftd_aux, range(num_nds))
                        if sum(score_obs_new) > sum(score_best_ll):
                            num_improve_exp += 1

                        score_best_ll = score_obs_new
                        et_best = et_new
                        # Update score and cache
                        if best_op_names == 2:
                            score_best[xin] += cache[xout, xin]
                            score_best[xout] += cache[xin, xout]
                            cache[:, xout] = 1000000
                        else:
                            score_best[xin] += best_score_difs
                        cache[:, xin] = 1000000
                        te = time()
                        lop_o = []
                        num_changes += 1
                        print "iteration ", count_i, ', round: ', round_em, ', change: ', [best_op_names, xout, xin], ', tw: ', et_best.tw()
                    else:
                        if best_op_names == 0:
                            forbidden_parents[best_lop[1]].append(best_lop[0])
                        elif best_op_names == 2:
                            forbidden_parents[best_lop[0]].append(best_lop[1])
                        
                    lop_o = lop_o[(change_idx+1):] 
                    op_names_o = op_names_o[(change_idx+1):] 
                else:
                    ok_to_proceed_hc = False
                    lop_o = []
            if ok_to_proceed_hc:
                ftd = ftd_best
                et = et_best
    
    ftd.em_parallel(indv, 1, alpha)
    tlb = time()            
    # Complete dataset with MPE
    params = ftd.get_parameters()
    fiti = PyFactorTree([et.nodes[i].parent_et for i in range(et.nodes.num_nds)], [et.nodes[i].nFactor for i in range(et.nodes.num_nds)], [[i]+et.nodes[i].parents.display() for i in range(et.nodes.num_nds)], [len(c) for c in data.classes])
    fiti.set_parameters([i for i in range(num_nds)],params)
    fiti.mpe_data(data_complete, data, nan_rows)
    df_complete = data_complete.to_DataFrame()
    return et, tlb-tla, fiti, df_complete, [num_changes, num_param_cal, float(num_improve_exp)/float(num_param_cal)]
Example #9
def learn_mbc_cll(data_frame, cll_query, et0=None, u=5, metric='bic', tw_bound_type='b', tw_bound=5, cores=multiprocessing.cpu_count(), forbidden_parent=None, add_only=False, custom_classes=None, constraint=True, alpha=1, verbose=False):
    """Discriminative learning of MBCs
    Args:
        data_frame (pandas.DataFrame): Input data
        cll_query: list of query variables, the rest are treated as evidence
        et0 (elimination_tree.ElimTree): Initial elimination tree (optional)
        u (int): maximum number of parents allowed
        metric (str): scoring function
        tw_bound_type (str): 'b' bound, 'n' none
        tw_bound (float): tree-width bound
        cores (int): Number of cores
        forbidden_parent (list): blacklist with forbidden parents
        add_only (bool): If true, allow only arc additions
        custom_classes: If not None, the classes of each variable are set to custom_classes
        constraint: If true, the additions and reversals that exceed the treewidth bound are stored in a blacklist
        alpha: Dirichlet prior for Bayesian parameter estimation
        verbose (bool): If True, print details of the learning process 

    Returns:
        elimination_tree.ElimTree Learned MBC
    """
    # Initialization
    count_i = 0
    if custom_classes is None:
        data = data_type.data(data_frame)
    else:
        data = data_type.data(data_frame, classes=custom_classes)
    
    if et0 is None:
        et = ElimTree('et', data.col_names, data.classes)
    else:
        et = et0.copyInfo()
    
    data_cll = data_frame
    num_nds = et.nodes.num_nds
    data_complete = data_type.data(data_cll, classes=data.classes)
    # Currently selected variables. For efficiency, variables outside the query subgraph are not used to compute the score
    sel_vars = cll_query + get_subgraph(et, cll_query)
    
    
    # Create the initial factor tree
    ftd = PyFactorTree_data([et.nodes[i].parent_et for i in range(num_nds)], [et.nodes[i].nFactor for i in range(num_nds)], [[i] + et.nodes[i].parents.display() for i in range(num_nds)], [len(data.classes[i]) for i in range(num_nds)])
    ftd.learn_parameters(data_complete, alpha=0)
    score_best_cll = 0
    
    # Initialize indicator vectors with all the data
    num_splits = numpy.ceil(data.nrows / (1000.0*8.0))*8
    indv = Indicators_vector(data_cll, data.classes, num_splits)  
    df_ev = data_cll.copy()      
    df_ev.iloc[:,cll_query] = np.nan
    ind_ev = Indicators_vector(df_ev, data.classes, num_splits)


    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(data.ncols)]
    else:
        forbidden_parents = forbidden_parent
    
    # Compute score of the first model
    score_best_cll = compute_cll(indv, ind_ev, et, ftd, cll_query, data.classes)
    
    for i in range(num_nds):
        score_best_cll += score_function(data, i, et.nodes[i].parents.display(), metric, [],ll_in = 0)      
    # BIC score of each node
    score_best = numpy.array([score_function(data_complete, i, et.nodes[i].parents.display(), metric=metric) for i in range(num_nds)], dtype=numpy.double)
    # Cache of score differences; 1000000 marks entries not yet computed
    cache = numpy.full([et.nodes.num_nds, et.nodes.num_nds], 1000000, dtype=numpy.double)
    # Hill-climbing loop
    ok_to_proceed_hc = True
    while ok_to_proceed_hc:
        count_i += 1
        # Get FP for the MBC
        forbidden_parents_mbc = forbidden_mbc(et, cll_query, forbidden_parents)

        # Find the best candidate operations using the cache
        lop_o, op_names_o, score_difs_o, cache = best_pred_cache(data_complete, et, metric, score_best, forbidden_parents_mbc, cache, filter_l0=True, add_only=add_only)
        
        ok_to_proceed_hc = False
        while len(lop_o) > 0:
            if tw_bound_type=='b':
                change_idx, et_new = search_first_tractable_structure(et, lop_o, op_names_o, tw_bound, forbidden_parents, constraint)
            else:
                change_idx, et_new = search_first_tractable_structure(et, lop_o, op_names_o, 10000)   
            if change_idx != -1:
                best_lop = lop_o[change_idx]
                xout = best_lop[0]
                xin = best_lop[1]
                best_op_names = op_names_o[change_idx]
                best_score_difs = score_difs_o[change_idx]
                                   
                #Compute parameters of the new ET
                ftd_aux = PyFactorTree_data([et_new.nodes[i].parent_et for i in range(et_new.nodes.num_nds)], [et_new.nodes[i].nFactor for i in range(et_new.nodes.num_nds)], [[i]+et_new.nodes[i].parents.display() for i in range(et_new.nodes.num_nds)], [len(c) for c in data.classes])
        
                if best_op_names == 2:
                    nodes_change = best_lop.tolist()
                else:
                    nodes_change = [best_lop[1]]

                nodes_copy = list(range(num_nds))
                for xi in nodes_change:
                    nodes_copy.remove(xi)
                ftd_aux.learn_parameters(data, alpha=0)
                 
                # Compute real score
                score_obs_new_real = compute_cll(indv, ind_ev, et_new, ftd_aux, cll_query, data.classes)
                for i in range(num_nds):
                    score_obs_new_real += score_function(data, i, et_new.nodes[i].parents.display(), metric, [],ll_in = 0) 
                if score_obs_new_real > score_best_cll:
                    ok_to_proceed_hc = True
                    ftd = ftd_aux
                    score_diff = score_obs_new_real - score_best_cll
                    score_best_cll = score_obs_new_real
                    et = et_new
                    # Update score and cache
                    if best_op_names == 2:
                        score_best[xin] += cache[xout, xin]
                        score_best[xout] += cache[xin, xout]
                        cache[:, xout] = 1000000
                    else:
                        score_best[xin] += best_score_difs
                    cache[:, xin] = 1000000
                    lop_o = []
                    sel_vars = cll_query + get_subgraph(et,cll_query)
                    if verbose:
                        print("iteration ", count_i, ', change: ', [best_op_names, xout, xin], ', tw: ', et.tw(), ', score_diff: ', score_diff, ', len(sel_vars): ', len(sel_vars))
                else:
                    if best_op_names == 0:
                        forbidden_parents[best_lop[1]].append(best_lop[0])
                    elif best_op_names == 2:
                        forbidden_parents[best_lop[0]].append(best_lop[1])
                    
                lop_o = lop_o[(change_idx+1):] 
                op_names_o = op_names_o[(change_idx+1):] 
            else:
                ok_to_proceed_hc = False
                lop_o = []
    
    return et
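
A hedged usage sketch, assuming the setup of Example #10 with the three response variables in the last columns:

num_nds = df_all.shape[1]
cll_query = [num_nds - 3, num_nds - 2, num_nds - 1]  # indices of the response columns
et_mbc = learn_mbc_cll(df_all, cll_query, u=5, tw_bound=5, custom_classes=custom_classes, alpha=1, verbose=True)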
Example #10
import numpy as np
import pandas as pd
from epilepsy_classifiers import MBCClassifier
import data_type
from export import get_adjacency_matrix_from_et
from utils import export_dsc

# ----------- Preprocess -------------#
# Load data
path = "data/"
# Read datasets; preprocess_data is assumed to come from the project's preprocessing module
df_train, df_test, q_vars_after_merge, cut_list = preprocess_data(path)
df_variables = pd.DataFrame({"id": np.arange(df_train.shape[1]), "name": df_train.columns})
# Response variables
response = ["Engel.1st.yr","Engel.2nd.yr","Engel.5th.yr"]
# Get categories for all variables
data_train_t = data_type.data(df_train)
custom_classes = data_train_t.classes
# Combine UCSF and MNI datasets
df_all = pd.concat([df_train, df_test]).reset_index(drop=True)
df_all.iloc[:, :-3] = df_all.drop(response, axis=1).astype(np.float32)

# ----------- Train MBC -------------#
# Forbidden parents for the MBC
num_nds = df_all.shape[1]
forbidden_parent = [[] for _ in range(num_nds - 3)]
# For each response, forbid the features and any later response as parents
forbidden_parent.append(list(range(num_nds - 3)) + [num_nds - 1, num_nds - 2])
forbidden_parent.append(list(range(num_nds - 3)) + [num_nds - 1])
forbidden_parent.append(list(range(num_nds - 3)))
# Fit classifier
estimator = MBCClassifier(response=response, custom_classes=custom_classes, repeats=20, metric_sem="aic", metric_classifier="aic", alpha=2.5, forbidden_parents=forbidden_parent)
estimator.fit(df_all)
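
To close the loop, a hedged sketch of predicting with the fitted classifier through the predict_proba method of Example #2; df_test must share the columns of df_all:

y_proba = estimator.predict_proba(df_test, repeats_inf=30)
print(y_proba.shape)  # (n_test_rows, 3): one posterior per response variable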