def print_eval_results(self, eval_results=None, specs=None, to_csv=False):
    '''
    Given the result of the evaluate method, this method prints the result.
    '''
    if eval_results is None:
        eval_results = self._last_eval_results
    L().log.info("\n-----------------------------------\n Results of evaluation \n-----------------------------------")

    # 1. CSV Writer
    if to_csv:
        mode = 'w'
        if self._append_csv:
            mode = 'a'
        with open(self._output_path, mode) as csvfile:
            fieldnames = ["model_name"] + list(self._settings_variables.keys()) + list(self._metrics)
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if mode == "w":
                writer.writeheader()
            result = dict(self._settings_variables)  # copy so the settings dict is not polluted
            for model_name in eval_results:
                result["model_name"] = model_name
                for metric in eval_results[model_name]:
                    result[metric] = eval_results[model_name][metric]
                writer.writerow(result)

    # 2. Console output
    for model_name in eval_results:
        L().log.info("\n-----> Model name: \t\t%s" % str(model_name))
        for metric in eval_results[model_name]:
            metric_result = eval_results[model_name][metric]
            L().log.info("%s: \t\t%s" % (self._readable_metric(metric), str(metric_result)))
def _log_cpds_emph_given(self, leaves):
    L().log.info("---------------------------------------------------------------------------------------------------------------------------------------------------")
    L().log.info(" New CPDs")
    L().log.info("---------------------------------------------------------------------------------------------------------------------------------------------------")
    for n in self.tbn.Vdata:
        if str.startswith(n, "dL_"):
            continue
        if n not in leaves:
            # ignore non-leaves, as paths are only valid if they end at a leaf
            continue
        if isinstance(self.tbn.Vdata[n]["cprob"], dict):
            L().log.info("\n\n")
            for k in self.tbn.Vdata[n]["cprob"]:
                L().log.info("\n\n\t------- Case: %s = %s \n\t\t\tVals: %s------- \n\t\t\tConditions:"
                             % (n, str(self.tbn.Vdata[n]["cprob"][k]), self.tbn.Vdata[n]["vals"]))
                con = eval(k)
                remember = [(n, str(list(np.array(self.tbn.Vdata[n]["vals"])[self.tbn.Vdata[n]["cprob"][k] != 0])))]
                tmp = dict()
                for i in range(len(self.tbn.Vdata[n]["parents"])):
                    if con[i] == "Never":
                        continue
                    tmp[self.tbn.Vdata[n]["parents"][i]] = con[i]
                    if not str.endswith(self.tbn.Vdata[n]["parents"][i], "_0"):
                        remember += [(self.tbn.Vdata[n]["parents"][i], con[i])]
                        continue
                    L().log.info("\t\t%s = %s" % (self.tbn.Vdata[n]["parents"][i], con[i]))
                remember.sort()
                L().log.info("\n\t\t\tWhat happened:")
                for r in remember:
                    prev_tv = "_".join(r[0].split("_")[:-1] + [str(int(r[0].split("_")[-1]) - 1)])
                    if prev_tv[0] == "_":
                        prev_tv = prev_tv[1:]
                    comes_from = tmp[prev_tv]
                    L().log.info("\t\t%s = %s (prev: %s)" % (r[0], r[1], comes_from))
        else:
            L().log.info("\n\n%s = %s" % (n, str(self.tbn.Vdata[n]["cprob"])))
    L().log.info("\n\n")
def _set_uniform_prior(self):
    if self._first_iteration:
        L().log.debug("Set priors: ")
        for n in self.tbn.Vdata:
            if str.startswith(n, "dL_"):
                continue
            if isinstance(self.tbn.Vdata[n]["cprob"], dict):
                for k in self.tbn.Vdata[n]["cprob"]:
                    self.tbn.Vdata[n]["cprob"][k] = np.array(
                        [1.0 / float(len(self.tbn.Vdata[n]["cprob"][k]))] * len(self.tbn.Vdata[n]["cprob"][k]))
                    L().log.debug("%s | %s = %s" % (n, k, str(self.tbn.Vdata[n]["cprob"][k])))
            else:
                self.tbn.Vdata[n]["cprob"] = np.array(
                    [1.0 / float(len(self.tbn.Vdata[n]["cprob"]))] * len(self.tbn.Vdata[n]["cprob"]))
                L().log.debug("%s = %s" % (n, str(self.tbn.Vdata[n]["cprob"])))
        self._first_iteration = False
def discover_structure_from_statistics(self, data, nodes):
    """
    Implements the PC algorithm.
    :param nodes: all signal_occurrence values that are in the data set
    :param data: ADTree or pandas DataFrame that contains the dataset counts
    :return: list of edges
    """
    skeleton, sep_set = self.estimate_skeleton(data, nodes)
    pdag = self.estimate_cpdag(skeleton, sep_set)

    # orient remaining undirected edges according to the occurrence number
    for scc in nx.strongly_connected_components(pdag):
        if len(scc) == 1:
            continue
        scc_nodes = sorted(scc, key=lambda node: int(node.rsplit('_')[-1]))
        for (parent, child) in combinations(scc_nodes, 2):
            if int(parent.rsplit('_')[-1]) <= int(child.rsplit('_')[-1]) and (child, parent) in pdag.edges:
                pdag.remove_edge(child, parent)

    edges = [list(edge) for edge in pdag.edges]
    L().log.debug('Edges: ' + str(edges))
    return edges
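# Illustrative sketch (not part of the original module): how the final orientation
# step above resolves an edge left undirected by the CPDAG phase. An undirected edge
# is represented by the two directed edges (u, v) and (v, u); removing the "backward"
# one leaves the edge pointing from the earlier to the later occurrence index encoded
# in the node name. The node names "SigA_0" / "SigA_1" are hypothetical.
def _orient_by_occurrence_sketch():
    import networkx as nx
    pdag = nx.DiGraph()
    pdag.add_edges_from([("SigA_0", "SigA_1"), ("SigA_1", "SigA_0")])  # undirected in CPDAG terms
    for u, v in list(pdag.edges):
        if int(u.rsplit('_')[-1]) <= int(v.rsplit('_')[-1]) and (v, u) in pdag.edges:
            pdag.remove_edge(v, u)
    return list(pdag.edges)  # [("SigA_0", "SigA_1")]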
def logging_setup(log_path, number_parallel):
    if number_parallel > 10:
        print("You chose to run more than 10 processes in parallel. Be aware that your machine "
              "requires the corresponding computational power for this. Otherwise choose fewer parallel processes.\n")
    print("Starting Experiments...")
    # sys.stderr = open(os.devnull, 'w')  # disable broken pipe error
    time_str = strftime("%Y_%m_%d-%H_%M_%S", localtime())
    log_path = os.path.join(log_path, "logging_bay_" + time_str + ".log")
    FORMAT = '%(asctime)-15s %(message)s'
    logging.basicConfig(format=FORMAT, datefmt="%H:%M:%S ")
    open(log_path, 'w').close()
    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(logging.Formatter(FORMAT, "%H:%M:%S "))
    L().log = logging.getLogger("tscbn_eval")
    L().log.addHandler(file_handler)
    L().log.setLevel(logging.INFO)
    L().log.parent.handlers = []
    L().log.info("Logger initialized...")
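# Illustrative usage sketch (hypothetical arguments, not part of the original module):
# the experiment scripts call logging_setup() once at startup; afterwards L().log is a
# configured logger that writes to a timestamped file below log_path.
def _logging_setup_usage_sketch():
    logging_setup(log_path="results", number_parallel=4)  # hypothetical path and process count
    L().log.info("starting experiment run ...")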
def _log_cpds(self):
    L().log.info("---------------------------------------------------------------------------------------------------------------------------------------------------")
    L().log.info(" New CPDs")
    L().log.info("---------------------------------------------------------------------------------------------------------------------------------------------------")
    for n in self.tbn.Vdata:
        if str.startswith(n, "dL_"):
            continue
        if isinstance(self.tbn.Vdata[n]["cprob"], dict):
            for k in self.tbn.Vdata[n]["cprob"]:
                L().log.info("%s | %s = %s" % (n, k, str(self.tbn.Vdata[n]["cprob"][k])))
        else:
            L().log.info("%s = %s" % (n, str(self.tbn.Vdata[n]["cprob"])))
    L().log.info("\n\n")
def new_iteration(self, first, _debug_time):
    ''' New iteration of the EM algorithm '''
    self._em_iteration += 1
    L().log.info("\n\nHistogram Updates \n\t\t\t\tTV %s \n\t\t\t\tsequence: %s" % (self._tv_name, str(self._full_sequence)))
    L().log.info("Number of nodes: " + str(self._len_nodes))

    # Reset histograms
    if len(self._symbol_histograms) == 0:
        L().log.info("No histograms - as no ambiguity\n")
    for k in range(len(self._symbol_histograms)):
        if self._em_iteration > 1:
            self._symbol_histograms += self.histogram_smoothing
            self._symbol_histograms /= np.sum(self._symbol_histograms)
        try:
            if isinstance(self._full_sequence[k][0], list):
                ll = self._full_sequence[k][0][0]
            else:
                ll = self._full_sequence[k][0]
            L().log.info("Symbol %s - distribution: %s" % (str(ll), self._symbol_histograms[k]))
        except Exception:
            L().log.error(traceback.format_exc())
    if _debug_time:
        L().log.info("Check Time out: \n%s" % str(self.delta_t_for_debug))
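# Illustrative sketch (hypothetical values, not part of the original module): the
# per-iteration update above adds a small smoothing constant to every histogram bin
# and renormalizes, so symbols that were never sampled keep non-zero probability in
# later EM iterations.
def _histogram_smoothing_sketch():
    import numpy as np
    hist = np.array([4.0, 0.0, 1.0])  # raw symbol counts
    hist += 0.1                       # histogram_smoothing constant
    hist /= np.sum(hist)              # renormalize to a distribution
    return hist                       # approx. [0.774, 0.019, 0.208]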
def run_vary_sample_number(number_TVs, parallel_processes, result_path, print_sequences, plot_model, print_true_distribution, estimators): if not number_TVs in [3,5,10]: print("No model is stored for this number_TV value. Please set number_TV to 3, 5 or 10") return # Settings state_change_prob = 0.8 pe_debug_mode = False cpd_smoothing = 0.1 parallel_processes = parallel_processes object_nr = number_TVs nodes_per_tv = 5 states_per_tv = 4 edges_per_tv = 2 percentage_inter = 0.5 per_object_gap = 0.5 # gap between two intra-nodes intra_gap_range = 0.1 # gap between two intra-nodes is drawn - kind of variance: lies within - per_object_gap and per_object_gap+intra_gap_range e.g. [0.5 to 0.5 + 0.1] t_variance_tscbn = 0.02 # Variance of resulting TSCBN (after Parameter estimation) dbn_tolerance = 0.02 # tolerance train_test_split = 0.9 # percentage of training data id = "_"+ "_".join([str(v) for v in [object_nr, nodes_per_tv, states_per_tv, edges_per_tv, percentage_inter, per_object_gap, intra_gap_range, t_variance_tscbn, dbn_tolerance, state_change_prob, train_test_split]]) append_csv = False id_time = datetime.datetime.now().strftime("%I_%M%p_%d_%B_%Y_%H_%M_%S") out_path = os.path.join(result_path, r"evaluation_%s.csv" % id_time) # Iteration options grid_sample_sequences_from_tscbn = [100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 15000] # unter 1000 macht hier gar keinen Sinn bei so vielen Daten e.g. hier 228 samples - bei 100 sequenzen sehe nichtmal bruchteil grid_em_sampling_frequency = [1000] grid_em_iterations = [5] # init sg = StructureGenerator(test_type = 1) sg.add_base_structure_models([TSCBNStructureModel, DBNStructureModel, CTBNStructureModel]) # TNBNStructureModel, DBNStructureModel]) sg.reference_model = TSCBNStructureModel # this model is used to generate sample data # Load sequences sequences_in = json.load(open('store/sequences%s.txt' % id)) in_seq_in = json.load(open('store/in_seq%s.txt' % id)) first = True if print_sequences: k = 0 for sequence in sequences_in: k += 1; print(sequence) if k % 100 == 0: r = input("To load more sequences type 'y' ") if not r == "y": break # print True distribution of model if print_true_distribution: print("Actual distribution: ") with open('store/models%s.txt' % id, 'rb') as infile: real_models = dill.load(infile) infile.close() act_tscbn = real_models[sg.reference_model.__name__] for n in act_tscbn.Vdata: try: if isinstance(act_tscbn.Vdata[n]["cprob"], dict): for k in act_tscbn.Vdata[n]["cprob"]: print("%s | %s = %s" % (n, k, str(act_tscbn.Vdata[n]["cprob"][k]))) else: print("%s = %s" % (n, str(act_tscbn.Vdata[n]["cprob"]))) except: for k in act_tscbn.Vdata[n]["hybcprob"]: print("%s | %s = mean: %s var: %s" % (n, k, str(act_tscbn.Vdata[n]["hybcprob"][k]["mean_base"]), str(act_tscbn.Vdata[n]["hybcprob"][k]["variance"]))) print("\n\n") for estimator_id in estimators: for sample_sequences_from_tscbn in grid_sample_sequences_from_tscbn: for em_sampling_frequency in grid_em_sampling_frequency: for em_iterations in grid_em_iterations: print("\n-------------------------------\nDo: "+str(object_nr) +" "+ str(nodes_per_tv) +" "+ str(states_per_tv) +" "+ str(edges_per_tv) +" "+ str(percentage_inter) +" "+ str(per_object_gap) +" "+ str(t_variance_tscbn) +" "+ str(dbn_tolerance) +" "+ str(state_change_prob) +" "+ str(sample_sequences_from_tscbn) +" "+ str(em_sampling_frequency) +" "+ str(em_iterations)) # Load reference model with open('store/models%s.txt' % id, 'rb') as infile: real_models = dill.load(infile) infile.close() with 
open('store/specifications%s.txt' % id, 'rb') as infile: specifications = dill.load(infile) infile.close() models = copy.deepcopy(real_models) models["CTBNStructureModel"] = CTBNStructureModel() # Parameter Estimation pe = create_estimator(estimator_id) ctbn_estimator = CTBNEstimator() pe.original_tbn = copy.deepcopy(models[sg.reference_model.__name__]) original_tbn = copy.deepcopy(models[sg.reference_model.__name__]) if plot_model and first: pe.original_tbn.draw("ext") first = False # Initialize Estimator and Evaluator ev = ParameterEvaluator(append_csv);append_csv = True ev.add_setting("estimator", str(estimator_id)) ev.add_setting("object_nr", object_nr) ev.add_setting("nodes_per_tv", nodes_per_tv) ev.add_setting("states_per_tv", states_per_tv) ev.add_setting("edges_per_tv", edges_per_tv) ev.add_setting("percentage_inter", percentage_inter) ev.add_setting("per_tv_gap", per_object_gap) ev.add_setting("tscbn_variance", t_variance_tscbn) ev.add_setting("dbn_tolerance", dbn_tolerance) ev.add_setting("sc_probability", state_change_prob) ev.add_setting("sample_sequences_from_tscbn", sample_sequences_from_tscbn) ev.add_setting("em_sampling_frequency", em_sampling_frequency) ev.add_setting("em_iterations", em_iterations) ev.set_output_path(out_path) ev.rmse_tscb_variance = 0.1 # variance assumed per node - does not require parameter estimation ev.rmse_mean_range = 0.2 # drift of mean will be within this range e.g. 0.1 means it will be drawn from correct +- drift*correct ev.add_metric("runtime") ev.add_metric("log-likelihood") ev.add_metric("relative-entropy") ev.add_metric("temp-log-likelihood") pe.cpd_smoothing = cpd_smoothing pe.sampling_frequency = em_sampling_frequency # sampling frequency for the MC MC Simulation pe.iteration_frequency = em_iterations # EM Iterations pe.set_parallel_processes(parallel_processes) evidence = {} # evidence when sampling sg.set_model_visualization(plot = False, console_out = False) Printos.print_settings(sg, pe, ev, 1, train_test_split, sample_sequences_from_tscbn, evidence, []) # -------------------------------------------------------------------------------------------- # Run tests # -------------------------------------------------------------------------------------------- L().log.info("------------------ Running Test ------------------" ) if not ev._append_csv: eval_result = ev.write_header(True) sequences = sequences_in[:sample_sequences_from_tscbn + 1] in_seq = in_seq_in[:sample_sequences_from_tscbn + 1] # choose random train and test data from sklearn.model_selection import train_test_split train_sequences, test_sequences, train_tscbn_sequences, test_tscbn_sequences = train_test_split(sequences, in_seq, test_size=0.1, random_state=0) # ---------------------------------------------------------------------------------------- # ESTIMATE PARAMETERS # ---------------------------------------------------------------------------------------- for m in list(set(models)): print("\nEstimating: %s ---" % str(m)) L().log.info("Parameter Estimation %s..." 
%(str(m))) # TESTING #print("_-------___TEST") #if m != 'TSCBNStructureModel':continue if m == 'CTBNStructureModel': models[m] = ctbn_estimator.estimateStructureAndParameter(train_sequences,original_tbn) continue # Clear Models pe.tbn = copy.deepcopy(models[m]) pe.original_tbn = copy.deepcopy(models[m]) if m == sg.reference_model.__name__: pe.tbn.clear_parameters() # copy model structure only # Estimate Parameters ping = time.clock() pe.estimateParameter(train_sequences, m, pe_debug_mode, ev, pe.original_tbn) # computes kl divergence per run models[m] = pe.tbn models[m].parameter_execution_time = time.clock() - ping # exeution time print("Finished: %s ---" % str(m)) # ---------------------------------------------------------------------------------------- # EVALUATION # ---------------------------------------------------------------------------------------- try: eval_result = ev.evaluate(models, reference = pe._reference, test_sequences = test_sequences, test_tscbn_sequences = test_tscbn_sequences) ev.print_eval_results(eval_results = eval_result, specs = specifications, to_csv = True) except: print("bad ")
class Constant(object):
    LOCK = threading.Lock()
    LOCK2 = threading.Lock()
    JOE = L()
    Never = 0
def get_next_symbol(self, parent_starts, parent_outcome, condition):
    '''
    Each symbol is followed by a distribution which is optimized, e.g. ABC would have
        dist[0] = [0, 1, 2] - depending on the number of outcomes
        dist[0] = "number of A occurrences"
        dist[1] = "number of B occurrences"
        ...
    For each next symbol, draw the number of next symbols and the next symbol, until all
    next symbols are available, then return them. Optimize the distributions along the way,
    i.e. forbid outcomes that are no longer possible: e.g. with 4 positions and AAB already
    drawn, C can no longer occur twice but only once. At the same time count what was drawn,
    e.g. after AABBCCC the next distribution is dist[0] = [0, 1, 0].
    '''
    self._overall_index += 1

    # ------------------------------------------------------------------------------------
    # BORDER CASES
    # ------------------------------------------------------------------------------------
    is_border = self._return_border_case()
    if is_border:
        self._cur_seen += [(condition, self._last_symbol[0])]
        return self._last_symbol

    # ------------------------------------------------------------------------------------
    # Normal Run
    # ------------------------------------------------------------------------------------
    if self._initial:
        # draw whole distribution
        good = False
        while not good:
            try:
                self._whole_sequence = self._draw_whole_distribution()
                good = True
            except Exception:
                pass
        self._initial = False

        # the first element was already passed as initial state - thus it can be removed here
        self._whole_sequence = self._whole_sequence[1:]
        self._last_symbol = self._whole_sequence[0]
        self._whole_sequence = self._whole_sequence[1:]

        if not self._satisfies_parent_conditions(parent_starts, parent_outcome):
            Constant.JOE.log.debug("INVALID SAMPLE %s" % str(self._full_sequence))
            return None
        self._cur_seen += [(condition, self._last_symbol[0])]
        return self._last_symbol  # this is how the next symbol is fetched

    # ------------------------------------------------------------------------------------
    # Draw next until none left
    # ------------------------------------------------------------------------------------
    if not self._next_symbols == 0:
        # still did not reach the last one
        self._next_symbols -= 1  # return until done
        self.number_nevers -= 1

        # check whether the returned sample is legit
        if not self._satisfies_parent_conditions(parent_starts, parent_outcome):
            L().log.debug("_______________ RETRY HARD _______________ ")
            # try again - afterwards returning is not possible
            if len(self.sequence) == 0:
                # reached the last element, but there is no next element - so not possible
                return None
            # if this does not work either, return an invalid sample
            self._last_symbol = self.sequence[0]
            self.sequence = self.sequence[1:]
            if self.number_nevers > 0:
                self._draw_next_elements()
            if not self._satisfies_parent_conditions(parent_starts, parent_outcome):
                return None
        self._cur_seen += [(condition, self._last_symbol[0])]
        return self._last_symbol
    else:
        # get next
        self._last_symbol = self.sequence[0]
        self.sequence = self.sequence[1:]

    # ------------------------------------------------------------------------------------
    # Draw next elements
    # ------------------------------------------------------------------------------------
    if self.number_nevers > 0:
        self._draw_next_elements()
    if not self._satisfies_parent_conditions(parent_starts, parent_outcome):
        L().log.debug("_______________ RETRY HARD _______________ ")
        # try again - afterwards returning is not possible
        if len(self.sequence) == 0:
            # reached the last element, but there is no next element - so not possible
            return None
        # if this does not work either, return an invalid sample
        self._last_symbol = self.sequence[0]
        self.sequence = self.sequence[1:]
        if self.number_nevers > 0:
            self._draw_next_elements()
        if not self._satisfies_parent_conditions(parent_starts, parent_outcome):
            return None
    return self._last_symbol
def run_structure_experiment(target_path, parameter_temp_nodes_experiment=False, parameter_signals_experiment=False, comparison_experiment_temp_nodes=False, comparison_experiment_signals=False, comparison_experiment_scp=False): # number of iterations per experiment iterations = 25 # number of sequences per experiment sample_size = 5000 # ---------------------------------------------------------------------------------------- # Structure Generator Setup # ---------------------------------------------------------------------------------------- sg = StructureGenerator(test_type=TestStructureEnum.SPECIFICATION) sg.add_base_structure_models([TSCBNStructureModel]) sg.reference_model = TSCBNStructureModel # TIME SETTINGS (fixed for all experiments) sg.set_temporal_range(min_per_object_gap=0.5, max_per_object_gap=1.0) sg.set_temporal_variance(0.001) sg.set_dbn_tolerance(0.1) # PROBABILITY SETTINGS (fixed for all experiments) sg.set_state_change_probability(min_probability=0.95, max_probability=0.95) # ---------------------------------------------------------------------------------------- # Experiment with different parameters of the SBTreeDiscoverer # ---------------------------------------------------------------------------------------- if parameter_temp_nodes_experiment or parameter_signals_experiment: sd = SBTreeDiscoverer(min_out_degree=0.1, k_infrequent=0.1, approach='parent_graph', parallel=False) # filtering parameters fixed at 0.1 # parent graph approach means exact score optimization (but not exhaustive) # structure optimization not iteration in parallel for edges_per_object in [1, 3]: print('edges_per_object: ' + str(edges_per_object) + '...') L().log.info('edges_per_object: ' + str(edges_per_object) + '...') # EDGE SETTINGS sg.set_connection_ranges(min_edges_per_object=edges_per_object, max_edges_per_object=edges_per_object, min_percent_inter=1.0, max_percent_inter=1.0) if parameter_temp_nodes_experiment: # 1st experiment: Increase number of temporal variables per signal # EVALUATOR SETUP ev = StructureEvaluator(True) ev.set_output_path(os.path.join(target_path, r"structure_eval_%s.csv" % strftime("%Y_%m_%d-%H_%M_%S", localtime()))) metrics = ["add-edges", "del-edges", "num-add-edges", "num-del-edges", "shd", "add-edges-skel", "del-edges-skel", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time", "psi-execution-time", "so-execution-time"] for metric in metrics:ev.add_metric(metric) eval_results = dict() discovery_algorithms = set() for number_of_signals in [2, 3, 4]: print('number_of_signals: ' + str(number_of_signals) + '...') L().log.info('number_of_signals: ' + str(number_of_signals) + '...') if edges_per_object >= number_of_signals: continue numbers_of_temp_nodes = [1, 2, 3, 4, 5, 6, 7] for number_of_temp_nodes in numbers_of_temp_nodes: print('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...') L().log.info('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...') # NODE SETTINGS sg.set_node_range(min_objects=number_of_signals, max_objects=number_of_signals, min_temp_nodes=number_of_temp_nodes, max_temp_nodes=number_of_temp_nodes, min_states=3, max_states=3) eval_results.update({number_of_temp_nodes: dict()}) for iteration in range(0, iterations): print('iteration: ' + str(iteration) + '...') L().log.info('iteration: ' + str(iteration) + '...') # SAMPLE DATA models, specifications = sg.run_next_testcase() in_seq = models[sg.reference_model.__name__].randomsample(sample_size, {}) sequences = \ sequences_to_intervals(in_seq, 
models[sg.reference_model.__name__].Vdata, False)[0] # additional information for evaluation additional_infos = dict() additional_infos[sg.reference_model.__name__] = {'execution_time': 0.0, 'data': None} for score in ['BIC', 'AIC', 'Bdeu', 'K2']: print('score: ' + str(score) + '...') L().log.info('score: ' + str(score) + '...') for temporal_threshold in np.arange(0.0, 2.5, 0.5): print('temporal_threshold: ' + str(temporal_threshold) + '...') L().log.info('temporal_threshold: ' + str(temporal_threshold) + '...') # STRUCTURE DISCOVERER SETUP sd.score = score sd.max_time_difference = temporal_threshold sd_name = 'SBTreeDiscoverer_' + score + '_TH_' + str(temporal_threshold) if sd_name not in eval_results.get(number_of_temp_nodes): # initialise metrics_dict metrics_dict = dict((metric, []) for metric in metrics) eval_results.get(number_of_temp_nodes).update({sd_name: metrics_dict}) discovery_algorithms.add(sd_name) model_name = sd_name + ' (' + str(iteration) + ')' # RUN ALGORITHM L().log.info('----------------------------------------------------------') print('Run approach ' + model_name + '.') L().log.info('Run approach ' + model_name + '.') ping = clock() nodes, edges = sd.discover_structure(sequences) L().log.info('Nodes: ' + str(nodes)) L().log.info('Edges: ' + str(edges)) execution_time = clock() - ping additional_infos[model_name] = {'execution_time': execution_time, 'data': sd.data, 'psi_execution_time': sd.parent_set_identification_time, 'so_execution_time': sd.structure_optimization_time} L().log.info('Execution time: ' + str(execution_time)) L().log.info('----------------------------------------------------------') # CREATE TSCBN skel = GraphSkeleton() skel.V = nodes skel.E = edges skel.toporder() model = TSCBN("", skel, models[sg.reference_model.__name__].Vdata, unempty=True, forbid_never=True, discrete_only=True) # EVALUATION eval_result = ev.evaluate(model_dict={model_name: model}, reference=models[sg.reference_model.__name__], additional_infos=additional_infos) ev.print_eval_results(eval_results=eval_result, specs=specifications, to_csv=True) for metric, value in eval_result[model_name].items(): eval_results[number_of_temp_nodes][sd_name][metric].append(value) pass pass pass pass experiment_name = 'ParameterTmpNodesExperiment_EPO_' + str(edges_per_object) + '_Sig_' + \ str(number_of_signals) relevant_metrics = ["num-add-edges", "num-del-edges", "shd", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time", "psi-execution-time", "so-execution-time"] write_pgfplots_data(experiment_name, eval_results, relevant_metrics, discovery_algorithms, numbers_of_temp_nodes, 'number_of_temp_nodes', target_path) pass pass if parameter_signals_experiment: # 2nd experiment: Increase number of signals if edges_per_object == 3: continue # TODO: remove this, when choosing a maximal number of signals larger than 5 # EVALUATOR SETUP ev = StructureEvaluator(True) ev.set_output_path(os.path.join(target_path, r"structure_eval_%s.csv" % strftime("%Y_%m_%d-%H_%M_%S", localtime()))) metrics = ["add-edges", "del-edges", "num-add-edges", "num-del-edges", "shd", "add-edges-skel", "del-edges-skel", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time", "psi-execution-time", "so-execution-time"] for metric in metrics: ev.add_metric(metric) eval_results = dict() discovery_algorithms = set() for number_of_temp_nodes in [3, 5]: print('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...') L().log.info('number_of_temp_nodes: ' + str(number_of_temp_nodes) + 
'...') numbers_of_signals = [2, 3, 4, 5] evaluated_numbers_of_signals = copy.deepcopy(numbers_of_signals) for number_of_signals in numbers_of_signals: print('number_of_signals: ' + str(number_of_signals) + '...') L().log.info('number_of_signals: ' + str(number_of_signals) + '...') if edges_per_object >= number_of_signals: evaluated_numbers_of_signals.remove(number_of_signals) continue # NODE SETTINGS sg.set_node_range(min_objects=number_of_signals, max_objects=number_of_signals, min_temp_nodes=number_of_temp_nodes, max_temp_nodes=number_of_temp_nodes, min_states=3, max_states=3) eval_results.update({number_of_signals: dict()}) for iteration in range(iterations): print('iteration: ' + str(iteration) + '...') L().log.info('iteration: ' + str(iteration) + '...') # SAMPLE DATA models, specifications = sg.run_next_testcase() in_seq = models[sg.reference_model.__name__].randomsample(1000, {}) sequences = \ sequences_to_intervals(in_seq, models[sg.reference_model.__name__].Vdata, False)[0] # additional information for evaluation additional_infos = dict() additional_infos[sg.reference_model.__name__] = {'execution_time': 0.0, 'data': None} for score in ['BIC', 'AIC', 'Bdeu', 'K2']: print('score: ' + str(score) + '...') L().log.info('score: ' + str(score) + '...') for temporal_threshold in np.arange(0.0, 2.5, 0.5): print('temporal_threshold: ' + str(temporal_threshold) + '...') L().log.info('temporal_threshold: ' + str(temporal_threshold) + '...') # STRUCTURE DISCOVERER SETUP sd.score = score sd.max_time_difference = temporal_threshold sd_name = 'SBTreeDiscoverer_' + score + '_TH_' + str(temporal_threshold) if sd_name not in eval_results.get(number_of_signals): # initialise metrics_dict metrics_dict = dict((metric, []) for metric in metrics) eval_results.get(number_of_signals).update({sd_name: metrics_dict}) discovery_algorithms.add(sd_name) model_name = sd_name + ' (' + str(iteration) + ')' # RUN ALGORITHM L().log.info('----------------------------------------------------------') print('Run approach ' + model_name + '.') L().log.info('Run approach ' + model_name + '.') ping = clock() nodes, edges = sd.discover_structure(sequences) L().log.info('Nodes: ' + str(nodes)) L().log.info('Edges: ' + str(edges)) execution_time = clock() - ping additional_infos[model_name] = {'execution_time': execution_time, 'data': sd.data, 'psi_execution_time': sd.parent_set_identification_time, 'so_execution_time': sd.structure_optimization_time} L().log.info('Execution time: ' + str(execution_time)) L().log.info('----------------------------------------------------------') # CREATE TSCBN skel = GraphSkeleton() skel.V = nodes skel.E = edges skel.toporder() model = TSCBN("", skel, models[sg.reference_model.__name__].Vdata, unempty=True, forbid_never=True, discrete_only=True) # EVALUATION eval_result = ev.evaluate(model_dict={model_name: model}, reference=models[sg.reference_model.__name__], additional_infos=additional_infos) ev.print_eval_results(eval_results=eval_result, specs=specifications, to_csv=True) for metric, value in eval_result[model_name].items(): eval_results[number_of_signals][sd_name][metric].append(value) pass pass pass pass experiment_name = 'ParameterSignalsExperiment_EPO_' + str(edges_per_object) + '_TmpNodes_' + \ str(number_of_temp_nodes) relevant_metrics = ["num-add-edges", "num-del-edges", "shd", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time", "psi-execution-time", "so-execution-time"] write_pgfplots_data(experiment_name, eval_results, relevant_metrics, 
discovery_algorithms, evaluated_numbers_of_signals, 'num_signals', target_path) pass pass pass pass # ---------------------------------------------------------------------------------------- # Experiments with all algorithms # ---------------------------------------------------------------------------------------- # 1st experiment: increase number of temporal nodes if comparison_experiment_temp_nodes: # EDGE SETTINGS sg.set_connection_ranges(min_edges_per_object=2, max_edges_per_object=2, min_percent_inter=1.0, max_percent_inter=1.0) # EVALUATOR SETUP ev = StructureEvaluator(True) ev.set_output_path(os.path.join(target_path, r"structure_eval_%s.csv" % strftime("%Y_%m_%d-%H_%M_%S", localtime()))) metrics = ["add-edges", "del-edges", "num-add-edges", "num-del-edges", "shd", "add-edges-skel", "del-edges-skel", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time"] for metric in metrics: ev.add_metric(metric) eval_results = dict() for number_of_signals in [3, 4]: print('number_of_signals: ' + str(number_of_signals) + '...') L().log.info('number_of_signals: ' + str(number_of_signals) + '...') discovery_algorithms = set() numbers_of_temp_nodes = [2, 3, 4, 5, 6, 7, 8] for number_of_temp_nodes in numbers_of_temp_nodes: print('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...') L().log.info('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...') # NODE SETTINGS sg.set_node_range(min_objects=number_of_signals, max_objects=number_of_signals, min_temp_nodes=number_of_temp_nodes, max_temp_nodes=number_of_temp_nodes, min_states=3, max_states=3) eval_results.update({number_of_temp_nodes: dict()}) metrics_dict = dict((metric, []) for metric in metrics) # --------------------------------------------------- # RUN Structure Discovery several times # --------------------------------------------------- for iteration in range(iterations): print('iteration: ' + str(iteration) + '...') L().log.info('iteration: ' + str(iteration) + '...') # SAMPLE DATA models, specifications = sg.run_next_testcase() in_seq = models[sg.reference_model.__name__].randomsample(sample_size, {}) sequences = sequences_to_intervals(in_seq, models[sg.reference_model.__name__].Vdata, False)[0] additional_infos = dict() additional_infos[sg.reference_model.__name__] = {'execution_time': 0.0, 'data': None} # --------------------------------------------------- # Discovery Algorithm # --------------------------------------------------- for sd_name, sd in get_structure_discovery_algorithms(): # LIMITATIONS DUE TO RUNTIME PROBLEMS # TODO: run all algorithms for all networks on a better hardware if str.startswith(sd_name, 'Astar') and number_of_signals * number_of_temp_nodes > 16: print('Network to large for A* algorithm.') continue if str.startswith(sd_name, 'PC') and number_of_signals * number_of_temp_nodes > 24: print('Network to large for PC algorithm.') continue discovery_algorithms.add(sd_name) if sd_name not in eval_results.get(number_of_temp_nodes): eval_results.get(number_of_temp_nodes).update({sd_name: copy.deepcopy(metrics_dict)}) model_name = sd_name + ' (' + str(iteration) + ')' L().log.info('----------------------------------------------------------') print('Run approach ' + model_name + '.') L().log.info('Run approach ' + model_name + '.') ping = clock() nodes, edges = sd.discover_structure(sequences) L().log.info('Nodes: ' + str(nodes)) L().log.info('Edges: ' + str(edges)) execution_time = clock() - ping additional_infos[model_name] = {'execution_time': execution_time, 'data': sd.data} 
L().log.info('Execution time: ' + str(execution_time)) L().log.info('----------------------------------------------------------') # create TSCBN skel = GraphSkeleton() skel.V = nodes skel.E = edges skel.toporder() model = TSCBN("", skel, models[sg.reference_model.__name__].Vdata, unempty=True, forbid_never=True, discrete_only=True) # ---------------------------------------------------------------------------------------- # EVALUATION # ---------------------------------------------------------------------------------------- eval_result = ev.evaluate(model_dict={model_name: model}, reference=models[sg.reference_model.__name__], additional_infos=additional_infos) ev.print_eval_results(eval_results=eval_result, specs=specifications, to_csv=True) for metric, value in eval_result[model_name].items(): eval_results[number_of_temp_nodes][sd_name][metric].append(value) pass pass pass experiment_name = 'TempNodesExperiment_Sig_' + str(number_of_signals) relevant_metrics = ["num-add-edges", "num-del-edges", "shd", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time"] write_pgfplots_data(experiment_name, eval_results, relevant_metrics, discovery_algorithms, numbers_of_temp_nodes, 'number_of_temp_nodes', target_path) # 2nd experiment: increase number of signals if comparison_experiment_signals: # EDGE SETTINGS sg.set_connection_ranges(min_edges_per_object=2, max_edges_per_object=2, min_percent_inter=1.0, max_percent_inter=1.0) # EVALUATOR SETUP ev = StructureEvaluator(True) ev.set_output_path(os.path.join(target_path, r"structure_eval_%s.csv" % strftime("%Y_%m_%d-%H_%M_%S", localtime()))) metrics = ["add-edges", "del-edges", "num-add-edges", "num-del-edges", "shd", "add-edges-skel", "del-edges-skel", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time", "psi-execution-time", "so-execution-time"] for metric in metrics: ev.add_metric(metric) eval_results = dict() for number_of_temp_nodes in [3]: # TODO: run with larger numbers on better hardware print('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...') L().log.info('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...') discovery_algorithms = set() numbers_of_signals = [3, 4, 5, 6, 7, 8] for number_of_signals in numbers_of_signals: print('number_of_signals: ' + str(number_of_signals) + '...') L().log.info('number_of_signals: ' + str(number_of_signals) + '...') # NODE SETTINGS sg.set_node_range(min_objects=number_of_signals, max_objects=number_of_signals, min_temp_nodes=number_of_temp_nodes, max_temp_nodes=number_of_temp_nodes, min_states=3, max_states=3) eval_results.update({number_of_signals: dict()}) metrics_dict = dict((metric, []) for metric in metrics) # --------------------------------------------------- # RUN Structure Discovery several times # --------------------------------------------------- for iteration in range(iterations): print('iteration: ' + str(iteration) + '...') L().log.info('iteration: ' + str(iteration) + '...') # SAMPLE DATA models, specifications = sg.run_next_testcase() in_seq = models[sg.reference_model.__name__].randomsample(sample_size, {}) sequences = sequences_to_intervals(in_seq, models[sg.reference_model.__name__].Vdata, False)[0] additional_infos = dict() additional_infos[sg.reference_model.__name__] = {'execution_time': 0.0, 'data': None, 'psi-execution-time': 0.0, 'so-execution-time': 0.0} # --------------------------------------------------- # Discovery Algorithm # --------------------------------------------------- for sd_name, sd in 
get_structure_discovery_algorithms(): # LIMITATIONS DUE TO RUNTIME PROBLEMS # TODO: run all algorithms for all networks on a better hardware if str.startswith(sd_name, 'Astar') and number_of_signals * number_of_temp_nodes > 16: print('Network to large for A* algorithm.') continue if str.startswith(sd_name, 'PC') and number_of_signals * number_of_temp_nodes > 24: print('Network to large for PC algorithm.') continue if str.startswith(sd_name, 'sbPTM') and number_of_signals * number_of_temp_nodes > 30: print('Network to large for PTM algorithm.') continue if str.startswith(sd_name, 'cbPTM') and number_of_signals * number_of_temp_nodes > 30: print('Network to large for PTM algorithm.') continue discovery_algorithms.add(sd_name) if sd_name not in eval_results.get(number_of_signals): eval_results.get(number_of_signals).update({sd_name: copy.deepcopy(metrics_dict)}) model_name = sd_name + ' (' + str(iteration) + ')' L().log.info('----------------------------------------------------------') print('Run approach ' + model_name + '.') L().log.info('Run approach ' + model_name + '.') ping = clock() nodes, edges = sd.discover_structure(sequences) L().log.info('Nodes: ' + str(nodes)) L().log.info('Edges: ' + str(edges)) execution_time = clock() - ping additional_infos[model_name] = {'execution_time': execution_time, 'data': sd.data, 'psi_execution_time': 0.0, 'so_execution_time': 0.0} if sd.parent_set_identification_time and sd.structure_optimization_time: additional_infos[model_name].update( {'psi_execution_time': sd.parent_set_identification_time, 'so_execution_time': sd.structure_optimization_time}) L().log.info('Execution time: ' + str(execution_time)) L().log.info('----------------------------------------------------------') # create TSCBN skel = GraphSkeleton() skel.V = nodes skel.E = edges skel.toporder() model = TSCBN("", skel, models[sg.reference_model.__name__].Vdata, unempty=True, forbid_never=True, discrete_only=True) # ---------------------------------------------------------------------------------------- # EVALUATION # ---------------------------------------------------------------------------------------- eval_result = ev.evaluate(model_dict={model_name: model}, reference=models[sg.reference_model.__name__], additional_infos=additional_infos) ev.print_eval_results(eval_results=eval_result, specs=specifications, to_csv=True) for metric, value in eval_result[model_name].items(): eval_results[number_of_signals][sd_name][metric].append(value) pass pass pass experiment_name = 'SignalExperiment_TmpNodes_' + str(number_of_temp_nodes) relevant_metrics = ["num-add-edges", "num-del-edges", "shd", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time", "psi-execution-time", "so-execution-time"] write_pgfplots_data(experiment_name, eval_results, relevant_metrics, discovery_algorithms, numbers_of_signals, 'number_of_signals', target_path) # 3rd experiment: different values for the state change probability if comparison_experiment_scp: # EDGE SETTINGS sg.set_connection_ranges(min_edges_per_object=2, max_edges_per_object=2, min_percent_inter=1.0, max_percent_inter=1.0) # EVALUATOR SETUP ev = StructureEvaluator(True) ev.set_output_path(os.path.join(target_path, r"structure_eval_%s.csv" % strftime("%Y_%m_%d-%H_%M_%S", localtime()))) metrics = ["add-edges", "del-edges", "num-add-edges", "num-del-edges", "shd", "add-edges-skel", "del-edges-skel", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time"] for metric in metrics: ev.add_metric(metric) 
eval_results = dict() for number_of_temp_nodes in [3, 4]: print('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...') L().log.info('number_of_temp_nodes: ' + str(number_of_temp_nodes) + '...') # NODE SETTINGS sg.set_node_range(min_objects=3, max_objects=3, min_temp_nodes=number_of_temp_nodes, max_temp_nodes=number_of_temp_nodes, min_states=2, max_states=4) sg.set_connection_ranges(min_edges_per_object=2, max_edges_per_object=3, min_percent_inter=0.5, max_percent_inter=1.0) discovery_algorithms = set() state_change_probabilities = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0] for state_change_probability in state_change_probabilities: print('state_change_probability: ' + str(state_change_probability) + '...') L().log.info('state_change_probability: ' + str(state_change_probability) + '...') sg.set_state_change_probability(min_probability=state_change_probability, max_probability=state_change_probability) eval_results.update({state_change_probability: dict()}) metrics_dict = dict((metric, []) for metric in metrics) # --------------------------------------------------- # RUN Structure Discovery several times # --------------------------------------------------- for iteration in range(iterations): print('iteration: ' + str(iteration) + '...') L().log.info('iteration: ' + str(iteration) + '...') # SAMPLE DATA models, specifications = sg.run_next_testcase() in_seq = models[sg.reference_model.__name__].randomsample(sample_size, {}) sequences = sequences_to_intervals(in_seq, models[sg.reference_model.__name__].Vdata, False)[0] additional_infos = dict() additional_infos[sg.reference_model.__name__] = {'execution_time': 0.0, 'data': None} # --------------------------------------------------- # Discovery Algorithm # --------------------------------------------------- for sd_name, sd in get_structure_discovery_algorithms(): # LIMITATIONS DUE TO RUNTIME PROBLEMS # TODO: run all algorithms for all networks on a better hardware if str.startswith(sd_name, 'Astar') and 3 * number_of_temp_nodes > 16: print('Network to large for A* algorithm.') continue discovery_algorithms.add(sd_name) if sd_name not in eval_results.get(state_change_probability): eval_results.get(state_change_probability).update({sd_name: copy.deepcopy(metrics_dict)}) model_name = sd_name + ' (' + str(iteration) + ')' L().log.info('----------------------------------------------------------') print('Run approach ' + model_name + '.') L().log.info('Run approach ' + model_name + '.') ping = clock() nodes, edges = sd.discover_structure(sequences) L().log.info('Nodes: ' + str(nodes)) L().log.info('Edges: ' + str(edges)) execution_time = clock() - ping additional_infos[model_name] = {'execution_time': execution_time, 'data': sd.data} L().log.info('Execution time: ' + str(execution_time)) L().log.info('----------------------------------------------------------') # create TSCBN skel = GraphSkeleton() skel.V = nodes skel.E = edges skel.toporder() model = TSCBN("", skel, models[sg.reference_model.__name__].Vdata, unempty=True, forbid_never=True, discrete_only=True) # ---------------------------------------------------------------------------------------- # EVALUATION # ---------------------------------------------------------------------------------------- eval_result = ev.evaluate(model_dict={model_name: model}, reference=models[sg.reference_model.__name__], additional_infos=additional_infos) ev.print_eval_results(eval_results=eval_result, specs=specifications, to_csv=True) for metric, value in eval_result[model_name].items(): 
eval_results[state_change_probability][sd_name][metric].append(value) pass pass pass experiment_name = 'SCP_Experiment_Sig_3_TmpNodes_' + str(number_of_temp_nodes) relevant_metrics = ["num-add-edges", "num-del-edges", "shd", "num-add-edges-skel", "num-del-edges-skel", "shd-skel", "kld", "execution-time"] write_pgfplots_data(experiment_name, eval_results, relevant_metrics, discovery_algorithms, state_change_probabilities, 'state_change_probability', target_path)
def _likelihood(self, tscbn, seen, tv):
    '''
    Computes the likelihood of the temporal variables that are within a range of index_range
    using the TBN. idx == indices for which this likelihood holds.
    This is the likelihood of the whole sequence!
    '''
    L().log.debug("\n\n----------------------------------------------------------------------")
    idx = []  # list of lists: l[0] == symbol_idx // l[1] == symbol number
    p_tot = 1.0
    i = -1
    symbol_idx = 0
    symbol_nr = -2  # 0 means 1, 1 means 2, ...
    prev_symbol = seen[0][1]
    first = True
    L().log.debug("\n\nSeen: %s" % (str(seen)))

    for s in seen:
        symbol_nr += 1
        i += 1
        cond = str(s[0])
        symbol = s[1]
        n = tv + "_" + str(i)
        L().log.debug("Symbol nr: %s, Symbol index: %s" % (str(symbol_nr), str(symbol_idx)))
        L().log.debug("\n\nprev_symbol: %s, i: %s, cond: %s, symbol %s n: %s"
                      % (str(prev_symbol), str(i), str(cond), str(symbol), str(n)))

        if not s[0] is None:
            p_tot *= tscbn[n]["cprob"][str(cond)][tscbn[n]["vals"].index(symbol)]
            L().log.debug("\n\np_cond: %s" % (str(tscbn[n]["cprob"][str(cond)][tscbn[n]["vals"].index(symbol)])))
        else:
            p_tot *= tscbn[n]["cprob"][tscbn[n]["vals"].index(symbol)]
            L().log.debug("\n\np_: %s" % (str(tscbn[n]["cprob"][tscbn[n]["vals"].index(symbol)])))

        if not first and prev_symbol != symbol:
            idx.append([symbol_idx, symbol_nr])
            symbol_idx += 1
            symbol_nr = -1
            L().log.debug("idx: %s" % str(idx))
        prev_symbol = symbol
        first = False

    symbol_nr += 1
    idx.append([symbol_idx, symbol_nr])
    L().log.debug("idx: %s" % str(idx))
    L().log.debug("\n\n----------------------------------------------------------------------")
    return p_tot, idx
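# Illustrative sketch (hypothetical CPD values, not part of the original module): the
# likelihood above is the product over the nodes tv_0, tv_1, ... of the CPD entry selected
# by the observed parent condition (key str(cond)) and the observed symbol.
def _likelihood_sketch():
    vals = ["a", "b"]
    tscbn = {
        "TV_0": {"vals": vals, "cprob": [0.7, 0.3]},             # node without parents
        "TV_1": {"vals": vals, "cprob": {"['a']": [0.2, 0.8]}},  # node conditioned on parent outcome
    }
    seen = [(None, "a"), (["a"], "b")]
    p_tot = 1.0
    for i, (cond, symbol) in enumerate(seen):
        node = "TV_" + str(i)
        if cond is None:
            p_tot *= tscbn[node]["cprob"][tscbn[node]["vals"].index(symbol)]
        else:
            p_tot *= tscbn[node]["cprob"][str(cond)][tscbn[node]["vals"].index(symbol)]
    return p_tot  # 0.7 * 0.8 = 0.56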
def run_vary_structure(target_path): # ---------------------------------------------- # GRID # ---------------------------------------------- object_nr = [5, 10, 20, 30, 40] # (from, to, steps) nodes_per_tv = [2, 50, 2] # (from, to, steps) states_per_tv = [2, 6, 2] # (from, to, steps) edges_per_tv = [3, 3, 2] percentage_inter = [0.8, 0.8, 0.2] per_object_gap = [0.5, 0.5, 0.01 ] # range is still within selected and selected + 0.5 t_variance_tscbn = [0.1, 0.1, 0.02] dbn_tolerance = [0.02, 0.02, 0.02] state_change_prob = [1.0, 1.0, 0.01] append_csv = False eval_models = [CTBNStructureModel, DBNStructureModel, TSCBNStructureModel] id_time = datetime.datetime.now().strftime("%I_%M%p_%d_%B_%Y") out_path = os.path.join(target_path, r"model_evaluation_%s.csv" % id_time) print("store to %s" % out_path) run = 1 expected_runs = 1 expected_runs *= len(object_nr) expected_runs *= len( list(range(nodes_per_tv[0], nodes_per_tv[1] + 1, nodes_per_tv[2]))) expected_runs *= len( list(range(states_per_tv[0], states_per_tv[1] + 1, states_per_tv[2]))) expected_runs *= len( list(range(edges_per_tv[0], edges_per_tv[1] + 1, edges_per_tv[2]))) expected_runs *= len( list( np.arange(percentage_inter[0], percentage_inter[1] + 0.000001, percentage_inter[2]))) expected_runs *= len( list( np.arange(per_object_gap[0], per_object_gap[1] + 0.00000001, per_object_gap[2]))) expected_runs *= len( list( np.arange(t_variance_tscbn[0], t_variance_tscbn[1] + 0.00000001, t_variance_tscbn[2]))) expected_runs *= len( list( np.arange(dbn_tolerance[0], dbn_tolerance[1] + 0.00000001, dbn_tolerance[2]))) expected_runs *= len( list( np.arange(state_change_prob[0], state_change_prob[1] + 0.00000001, state_change_prob[2]))) for n_p_t in range(nodes_per_tv[0], nodes_per_tv[1] + 1, nodes_per_tv[2]): for s_p_t in range(states_per_tv[0], states_per_tv[1] + 1, states_per_tv[2]): for e_p_t in range(edges_per_tv[0], edges_per_tv[1] + 1, edges_per_tv[2]): if n_p_t < e_p_t: continue for per_iter in np.arange(percentage_inter[0], percentage_inter[1] + 0.000001, percentage_inter[2]): for p_o_gap in np.arange(per_object_gap[0], per_object_gap[1] + 0.00000001, per_object_gap[2]): for tscbn_var in np.arange( t_variance_tscbn[0], t_variance_tscbn[1] + 0.00000001, t_variance_tscbn[2]): for dbn_tol in np.arange( dbn_tolerance[0], dbn_tolerance[1] + 0.00000001, dbn_tolerance[2]): for sc_prob in np.arange( state_change_prob[0], state_change_prob[1] + 0.00000001, state_change_prob[2]): for o_nr in object_nr: print( "\n----------------------------------\nobj_nr: %s\nnodes_p_t: %s\nstates_pt: %s\nedges_pt: %s\nper_iter: %s\np_o_gap: %s\ntscbn_var: %s\ndbn_tol: %s\nsc_prob: %s" % (o_nr, n_p_t, s_p_t, e_p_t, per_iter, p_o_gap, tscbn_var, dbn_tol, sc_prob)) print("Remaining: %s" % (str(expected_runs - run))) run += 1 sg = StructureGenerator( test_type=TestStructureEnum. 
SPECIFICATION) ev = StructureEvaluator(append_csv) append_csv = True # Evaluation Parameters ev.add_setting("object_nr", o_nr) ev.add_setting("nodes_per_tv", n_p_t) ev.add_setting("states_per_tv", s_p_t) ev.add_setting("edges_per_tv", e_p_t) ev.add_setting("percentage_inter", per_iter) ev.add_setting("per_tv_gap", p_o_gap) ev.add_setting("tscbn_variance", tscbn_var) ev.add_setting("dbn_tolerance", dbn_tol) ev.add_setting("sc_probability", sc_prob) ev.set_output_path(out_path) ev.add_metric("num-edges") ev.add_metric("num-nodes") ev.add_metric("num-states") ev.add_metric("num-cpds") # ---------------------------------------------- # Settings # ---------------------------------------------- # Models sg.add_base_structure_models( eval_models ) # DBNStructureModel TNBNStructureModel, TSCBNStructureModel if DBNStructureModel in sg.get_generator_models( ): [ f for f in sg._generator_models if isinstance( f, DBNStructureModel) ][0].EXPLICIT_DISABLING = True # set setting for DBN # Structure Generation Settings # NODE SETTINGS sg.set_node_range( min_objects=o_nr, max_objects= o_nr, # number of temporal variables min_temp_nodes=n_p_t, max_temp_nodes= n_p_t, # number of nodes per temporal variable min_states=s_p_t, max_states=s_p_t ) # number of states per node # EDGE SETTINGS sg.set_connection_ranges( min_edges_per_object=e_p_t, max_edges_per_object=e_p_t, # Anzahl der Temporal Variables die miteinander verbunden - haben jeweils x edges zwischen Objekten min_percent_inter=per_iter, max_percent_inter=per_iter ) # Range für Random - prozentualer Anteil an Querverbindungen pro TV im Bezug auf Knotenanzahl # TIME SETTINGS sg.set_temporal_range( min_per_object_gap=p_o_gap, max_per_object_gap=p_o_gap + 0.5) sg.set_temporal_variance(tscbn_var) sg.set_dbn_tolerance(dbn_tol) # PROBABILITY SETTINGS sg.set_state_change_probability( min_probability=sc_prob, max_probability=sc_prob ) # probability of state change - at 1.0 parameter estimation should be exact # Generator Execution settings test_size = 1 # Visualization parameters sg.set_model_visualization( plot=True, console_out=False) # ---------------------------------------------- # Run tests # ---------------------------------------------- for i in range(test_size): #print("\n\n------------------ Running Test %s ------------------" % (str(i + 1))) # Return test case try: models, specifications = sg.run_next_testcase( ) except: print("Invalid sample " + str("")) if not ev._append_csv: eval_result = ev.write_header( True) continue # evaluate result eval_result = ev.evaluate( models, specifications=specifications) # output ev.print_eval_results( eval_results=eval_result, specs=specifications, to_csv=True) L().log.info("-------------------- DONE -------------------------")
def estimate_skeleton(self, data, nodes):
    def create_max_skeleton(nodes):
        skeleton = nx.Graph()
        skeleton.add_nodes_from(nodes)  # create nodes
        edges = set()
        for node in nodes:
            for neigh in nodes:
                if node != neigh:
                    edges.add((node, neigh))
        skeleton.add_edges_from(edges)  # add edges
        return skeleton

    max_skeleton = create_max_skeleton(nodes)

    if isinstance(data, ADTree):
        cb_estimator = GSquareEstimator(adtree=data)
    else:
        cb_estimator = BaseEstimator(data=data, complete_samples_only=False)

    # procedure similar to the PC algorithm
    skeleton = max_skeleton.copy()
    condition_set_size = 0
    sep_set = {}
    L().log.debug('---------------------------------------------------')
    L().log.debug('---- Conditional Independence Tests ---------------')
    L().log.debug('---------------------------------------------------')
    while True:
        cont = False
        remove_edges = []
        for (source, target) in permutations(nodes, 2):
            neighbors = list(skeleton.neighbors(source))
            if target not in neighbors:
                continue
            else:
                neighbors.remove(target)
            if len(neighbors) >= condition_set_size:
                L().log.debug('testing ' + source + ' --> ' + target)
                L().log.debug('neighbors of ' + source + ' are ' + str(neighbors))
                for condition_set in combinations(neighbors, condition_set_size):
                    L().log.debug('independence test of ' + source + ' and ' + target + ' with subset ' + str(condition_set))
                    _, p_val, _ = cb_estimator.test_conditional_independence(source, target, list(condition_set))
                    if isnan(p_val):  # pgmpy CI test returns NaN instead of 1
                        p_val = 1
                    L().log.debug('p_val = ' + str(p_val))
                    if p_val > self.alpha:
                        if skeleton.has_edge(source, target):
                            L().log.debug('remove edge ' + str((source, target)))
                            remove_edges.append((source, target))
                        key = tuple(sorted((source, target)))
                        if key in sep_set:
                            sep_set[key] |= set(condition_set)
                        else:
                            sep_set[key] = set(condition_set)
                        break
                cont = True
        condition_set_size += 1
        skeleton.remove_edges_from(remove_edges)
        if cont is False:
            break
        if condition_set_size > self.max_reach:
            break
    return skeleton, sep_set
def _estimate_tscbn(self, sequences, debug, leaves, target='Aktiv_Funktion_Fahrerassistenzsystem_LDM'):
    cnt_s = 0
    tot_s = len(sequences)
    for sequence in sequences:
        if cnt_s % 50 == 0:
            L().log.info("Processing %s / %s" % (str(cnt_s), str(tot_s)))
        cnt_s += 1
        cur_seq = {}

        # simply count occurrences
        largest = None
        max_val = 0
        for tv in sequence:
            i = 0
            for lst in sequence[tv]:
                [state, start, end] = lst
                node_name = tv + "_" + str(i)
                if start > max_val and tv != target:
                    max_val = start
                    largest = node_name
                i += 1
                cur_seq[node_name] = state

        # the oldest element in the sequence must get an underscore prefix _...
        if largest is None:
            largest = target + "_1"
        cur_seq["_" + largest] = cur_seq[largest]
        del cur_seq[largest]

        # count all up in the tree
        for node in cur_seq:
            if not self.tbn.Vdata[node]["parents"] is None:
                o = list(set(list(self.tbn.Vdata[node]["parents"])))
                o.sort()
                self.tbn.Vdata[node]["parents"] = o
            state = cur_seq[node]
            if self.tbn.Vdata[node]["parents"] is None:
                idx = self.tbn.Vdata[node]["vals"].index(state)
                if not "cprob" in self.tbn.Vdata[node]:
                    self.tbn.Vdata[node]["vals"] += ["Never"]
                    self.tbn.Vdata[node]["cprob"] = np.zeros(len(self.tbn.Vdata[node]["vals"]))
                self.tbn.Vdata[node]["cprob"][idx] += 1.0
            else:
                # get condition
                cond = []
                for p in self.tbn.Vdata[node]["parents"]:
                    if p not in cur_seq:
                        cond += ["Never"]  # it did not occur
                    else:
                        cond += [cur_seq[p]]
                idx = self.tbn.Vdata[node]["vals"].index(state)
                if not "cprob" in self.tbn.Vdata[node]:
                    self.tbn.Vdata[node]["vals"] += ["Never"]
                    self.tbn.Vdata[node]["cprob"] = dict()
                if not str(cond) in self.tbn.Vdata[node]["cprob"]:
                    self.tbn.Vdata[node]["cprob"][str(cond)] = np.zeros(len(self.tbn.Vdata[node]["vals"]))
                self.tbn.Vdata[node]["cprob"][str(cond)][idx] += 1

    # drop non-existing CPDs
    for node in self.tbn.Vdata:
        if not self.tbn.Vdata[node]["parents"] is None and not str.startswith(node, "dL_"):
            keep = dict()
            for cond in self.tbn.Vdata[node]["cprob"]:
                if not np.all(self.tbn.Vdata[node]["cprob"][cond] == 0):
                    keep[cond] = self.tbn.Vdata[node]["cprob"][cond]
            self.tbn.Vdata[node]["cprob"] = keep

    # plot all distributions
    if self.tbn.show_plot_generated:
        self._visual.plot_histograms_from_bn(self.tbn, self.tbn)
    self._log_cpds_emph_given(leaves)
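# Illustrative sketch (hypothetical names and states, not part of the original module):
# the CPD counts above are stored per node as a dict keyed by the stringified parent
# condition, with one count per value index; "Never" is appended to the value list so
# absent parent occurrences can still be conditioned on.
def _cpd_count_sketch():
    import numpy as np
    vals = ["on", "off", "Never"]
    cprob = {}
    cond = str(["on", "Never"])          # parent states observed for one sequence
    if cond not in cprob:
        cprob[cond] = np.zeros(len(vals))
    cprob[cond][vals.index("off")] += 1  # count the observed state under this condition
    return cprob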
def estimate_cpdag(self, skel_graph, sep_set):
    dag = skel_graph.to_directed()
    nodes = skel_graph.nodes()
    for (source, target) in combinations(nodes, 2):
        source_neighbors = set(dag.successors(source))
        if target in source_neighbors:
            continue
        target_neighbors = set(dag.successors(target))
        if source in target_neighbors:
            continue
        common_neighbors = source_neighbors & target_neighbors
        key = tuple(sorted((source, target)))
        for k in common_neighbors:
            if k not in sep_set[key]:
                if dag.has_edge(k, source):
                    dag.remove_edge(k, source)
                    L().log.debug('S: remove edge (' + k + ', ' + source + ')')
                if dag.has_edge(k, target):
                    dag.remove_edge(k, target)
                    L().log.debug('S: remove edge (' + k + ', ' + target + ')')

    def _has_both_edges(dag, i, j):
        return dag.has_edge(i, j) and dag.has_edge(j, i)

    def _has_any_edge(dag, i, j):
        return dag.has_edge(i, j) or dag.has_edge(j, i)

    # For all combinations of nodes source and target, apply the following rules.
    for (source, target) in combinations(nodes, 2):
        # Rule 1: Orient source-target into source->target whenever there is an arrow k->source
        # such that k and target are nonadjacent.
        #
        # Check if source-target.
        if _has_both_edges(dag, source, target):
            # Look at all predecessors of source.
            for k in dag.predecessors(source):
                # Skip if there is an arrow source->k.
                if dag.has_edge(source, k):
                    continue
                # Skip if k and target are adjacent.
                if _has_any_edge(dag, k, target):
                    continue
                # Make source-target into source->target.
                dag.remove_edge(target, source)
                L().log.debug('R1: remove edge (' + target + ', ' + source + ')')
                break

        # Rule 2: Orient source-target into source->target whenever there is a chain
        # source->k->target.
        #
        # Check if source-target.
        if _has_both_edges(dag, source, target):
            # Find nodes k where source->k.
            succs_i = set()
            for k in dag.successors(source):
                if not dag.has_edge(k, source):
                    succs_i.add(k)
            # Find nodes k where k->target.
            preds_j = set()
            for k in dag.predecessors(target):
                if not dag.has_edge(target, k):
                    preds_j.add(k)
            # Check if there is any node k where source->k->target.
            if len(succs_i & preds_j) > 0:
                # Make source-target into source->target.
                dag.remove_edge(target, source)
                L().log.debug('R2: remove edge (' + target + ', ' + source + ')')
                break

        # Rule 3: Orient source-target into source->target whenever there are two chains
        # source-k->target and source-l->target such that k and l are nonadjacent.
        #
        # Check if source-target.
        if _has_both_edges(dag, source, target):
            # Find nodes k where source-k.
            source_neighbors = set()
            for k in dag.successors(source):
                if dag.has_edge(k, source):
                    source_neighbors.add(k)
            # For all pairs of nodes in source_neighbors,
            for (k, l) in combinations(source_neighbors, 2):
                # Skip if k and l are adjacent.
                if _has_any_edge(dag, k, l):
                    continue
                # Skip if not k->target.
                if dag.has_edge(target, k) or (not dag.has_edge(k, target)):
                    continue
                # Skip if not l->target.
                if dag.has_edge(target, l) or (not dag.has_edge(l, target)):
                    continue
                # Make source-target into source->target.
                dag.remove_edge(target, source)
                L().log.debug('R3: remove edge (' + target + ', ' + source + ')')
                break
    return dag
def _single_run(self, initial_states, trees, seq_count, len_sequences, debug, disable_out=True):
    '''
    This function is used to process multiple sequences together
    '''
    # get last state
    #L().log.debug("-----------------> SEQUENCE %s of %s" % (str(seq_count + 1), str(len_sequences)))
    results = []
    delta_t_distribution = {}  # key: node - value: dict with key: condition (incl. myself), value: list of given delta t

    # --------- SAMPLING -----------
    pars = {}
    Constant.LOCK.acquire()
    initial_set = [n for n in self.tbn.nodes.keys() if self.tbn.Vdata[n]["parents"] == None]
    Constant.LOCK.release()

    for tz in range(self.sampling_frequency):
        # Initialize
        #if debug: L().log.debug("Sequence %s - Run %s/%s" % (str(seq_count), str(tz + 1), str(self.sampling_frequency)))
        for t in trees:
            trees[t].reset(initial_states)
        #Constant.LOCK.acquire()
        node_set = copy.deepcopy(initial_set)  #[n for n in self.tbn.nodes.keys() if self.tbn.Vdata[n]["parents"] == None]
        #Constant.LOCK.release()
        parents_set, set_values, i, current_sample_initial = [], {}, 0, []
        current_sample, sample_legid, t_abs, t_abs_end = [], True, {}, {}

        # Iterate tree - starting from parent
        done = []
        while node_set:
            # 1. next node
            i, n = self._next_node(node_set, i)

            # 2. copy parent information - to avoid parallel access
            if n not in pars:
                Constant.LOCK.acquire()
                par = {}
                par["parents"] = copy.deepcopy(self.tbn.Vdata[n]["parents"])
                par["dL_parents"] = copy.deepcopy(self.tbn.Vdata["dL_" + n]["parents"])
                par["tbn_vals"] = copy.deepcopy(self.tbn.Vdata[n]["vals"])
                par["children"] = copy.deepcopy(self.tbn.Vdata[n]["children"])
                par["cprob"] = copy.deepcopy(self.tbn.Vdata[n]["cprob"])
                pars[n] = par
                Constant.LOCK.release()

            # 3. if initial states - draw it from there
            if n.split("_")[-1] == "0":
                # DRAW LEAF NODE INITIAL SAMPLE
                val = initial_states["_".join(n.split("_")[:-1])][0]
                #L().log.debug("%s - I return: %s " % (str(n), str(val)))
                current_sample_initial.append([n, pars[n]["tbn_vals"].index(val)])  # info, info
                delta_t_distribution["dL_" + n] = {}
                if self._debug_time:
                    trees["_".join(n.split("_")[:-1])].delta_t_for_debug["dL_" + n] = {}
                if self._debug_time:
                    trees["_".join(n.split("_")[:-1])].delta_t_for_debug["dL_" + n][str([val])] = 0
                t_abs[n], t_abs_end[n], delta_t_distribution["dL_" + n][str([val])], set_values[n] = 0.0, initial_states["_".join(n.split("_")[:-1])][2], [0.0], val
            else:
                # 4. if not initial states - draw conditioned on parents
                # check if all parents given - else continue
                if not set(pars[n]["parents"]).issubset(parents_set):
                    i += 1
                    continue

                # get conditions
                cond = [set_values[k] for k in pars[n]["parents"]]

                # DRAW AND STORE NEXT SYMBOL
                parent_starts = [[self._is_never(k, set_values), t_abs[k]] for k in pars[n]["parents"]]
                #parent_ends = [[self._is_never(k, set_values), t_abs_end[k]] for k in pars[n]["parents"]]
                val = trees["_".join(n.split("_")[:-1])].get_next_symbol(parent_starts, self._parent_outcome(n, set_values), cond)
                if val is None:
                    if debug:
                        L().log.debug("Sample NOT LEGIT - None - BREAK")
                    print("Sample not legit")
                    break
                set_values[n] = val[0]
                t_abs[n] = val[1]
                t_abs_end[n] = val[2]

                # IF DRAWN SAMPLE IS LEGIT, RECORD IT
                current_sample.append([n, str(cond), pars[n]["tbn_vals"].index(val[0])])
                if debug:
                    L().log.debug("NEXT: %s = %s" % (str(n), val[0]))
                if debug:
                    L().log.debug("nodes: %s" % str(node_set))

                # RECORD DELTA T DISTRIBUTION
                cond_dL = [set_values[k] for k in pars[n]["dL_parents"]]  # [set_values[k] for k in self.tbn.Vdata["dL_" + n]["parents"]]

                # DEBUG - ONLY HERE
                if self._debug_time:
                    if "dL_" + n not in trees["_".join(n.split("_")[:-1])].delta_t_for_debug:
                        trees["_".join(n.split("_")[:-1])].delta_t_for_debug["dL_" + n] = {}
                    if not str(cond_dL) in trees["_".join(n.split("_")[:-1])].delta_t_for_debug["dL_" + n]:
                        trees["_".join(n.split("_")[:-1])].delta_t_for_debug["dL_" + n][str(cond_dL)] = []  # sum, count
                    #trees[n.split("_")[0]].delta_t_for_debug["dL_" + n][str(cond_dL)] += [t_abs[n] - max([t_abs[k] for k in pars[n]["parents"]])]
                # END DEBUG

                if "dL_" + n not in delta_t_distribution:
                    delta_t_distribution["dL_" + n] = {}
                if not str(cond_dL) in delta_t_distribution["dL_" + n]:
                    delta_t_distribution["dL_" + n][str(cond_dL)] = []  # sum, count
                delta_t_distribution["dL_" + n][str(cond_dL)] += [t_abs[n] - max([t_abs[k] for k in pars[n]["parents"]])]

            # GET NEXT NODES
            parents_set.append(n)
            node_set.remove(n)
            done += [n]
            node_set += [o for o in pars[n]["children"] if not o in done and not str.startswith(o, "dL_")]
            node_set = list(set(node_set))

        results.append([current_sample_initial, current_sample])

    # do a norm fit on the collected delta t values and aggregate the Gaussians - NOTE: this has to happen per condition
    for k in delta_t_distribution:
        for j in delta_t_distribution[k]:
            mean, std = norm.fit(delta_t_distribution[k][j])
            var = std * std
            if var == 0:
                var = 0.02  # else it makes no sense - as everything other than the exact value has probability zero
            mean_scale = [1] * len(self.tbn.Vdata[k]["parents"])
            delta_t_distribution[k][j] = {'variance': var, 'mean_base': mean, 'mean_scal': mean_scale}
            if self._debug_time:
                trees["_".join(k.replace("dL_", "").split("_")[:-1])].delta_t_for_debug[k][j] = {'variance': var, 'mean_base': mean}

    return results, delta_t_distribution, trees, seq_count
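# Illustrative sketch (not part of the original module): the final loop of
# _single_run fits a Gaussian per condition to the sampled time gaps and floors
# the variance so that a condition observed with a single exact gap does not
# collapse to zero variance. The gap list below is made-up example data.
from scipy.stats import norm

sampled_gaps = [0.8, 1.1, 0.95, 1.05]          # delta t values collected for one condition
mean, std = norm.fit(sampled_gaps)             # ML estimates of mean and standard deviation
var = std * std
if var == 0:
    var = 0.02                                 # variance floor used above
fitted = {'variance': var, 'mean_base': mean, 'mean_scal': [1]}
print(fitted)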
def print_settings(sg, pe, ev, test_size, train_test_split, sample_sequences_from_tscbn, evidence, testmode_models):
    L().log.info("---------------------------------------------------------------------------------")
    L().log.info("                                    SETTINGS                                     ")
    L().log.info("---------------------------------------------------------------------------------\n")

    L().log.info("\n\t\t\t\t\t\t ---> Execution Settings <---")
    L().log.info("Test size: \t\t\t\t\t\t%s" % str(test_size))
    #L().log.info("Traintest split percentage: \t%s per cent" % str(train_test_split * 100))
    L().log.info("Number of reference Samples: \t%s" % str(sample_sequences_from_tscbn))
    L().log.info("Evidence: \t\t\t\t\t\t%s" % str(evidence))
    L().log.info("Testmode Models: \t\t\t\t%s" % str(testmode_models))

    L().log.info("\n\t\t\t\t\t\t ---> Parameter Estimation <---")
    L().log.info("E-Step Sampling Frequency: \t\t%s" % str(pe.sampling_frequency))  # sampling frequency for the MCMC simulation
    L().log.info("EM Iterations: \t\t\t\t\t%s" % str(pe.iteration_frequency))  # EM iterations
    L().log.info("Parallel Processes: \t\t\t%s" % str(pe._parallel_processes))

    L().log.info("\n\t\t\t\t\t\t ---> TSCBN Infos <---")
    '''L().log.info("Object Range: \t\t\t\t\t%s" % str(sg._object_range))
    L().log.info("Models: \t\t\t\t\t\t%s" % str([m.__class__.__name__ for m in sg._generator_models]))
    L().log.info("Number TVs: \t\t\t\t\t%s" % str(sg._temp_node_range))
    L().log.info("Number States: \t\t\t\t\t%s" % str(sg._state_range))
    L().log.info("Number Inter-TV: \t\t\t\t%s" % str(sg._edges_inter_object_range))
    L().log.info("Percentage Inter-TV: \t\t\t%s" % str(sg._percentage_inter_edges))
    L().log.info("Intra Object Range: \t\t\t%s" % str(sg._intra_object_temp_range))
    L().log.info("TSCBN Temporal Variance: \t\t%s" % str(sg._temporal_variance))
    L().log.info("State Change Probability: \t\t%s" % str(sg._sc_probability))
    L().log.info("\n\t\t\t\t\t\t ---> Evaluation Settings <---")
    L().log.info("DBN Tolerance: \t\t\t\t\t%s" % str(sg._dbn_tolerance))'''
    L().log.info("RMSE TSCBN Variance: \t\t\t%s" % str(ev.rmse_tscb_variance))  # variance assumed per node - does not require parameter estimation
    L().log.info("RMSE TSCBN Mean Drift: \t\t\t%s" % str(ev.rmse_mean_range))
    L().log.info("Evaluation Metrics")
    for m in ev._metrics:
        L().log.info("\t\t%s" % str(m))

    L().log.info("---------------------------------------------------------------------------------")
    L().log.info("                                  END SETTINGS                                   ")
    L().log.info("---------------------------------------------------------------------------------\n\n\n")
    L().log.info("---------------------------------------------------------------------------------")
    L().log.info("                                      RUN                                        ")
    L().log.info("---------------------------------------------------------------------------------\n\n")
def _estimate_tscbn(self, sequences, debug):
    # set uniform priors ------- BUT ONLY ON FIRST ITERATION
    self._set_uniform_prior()

    # FOR TEST - if given
    try:
        kl_div = self._evaluator._compute_kl_divergence(self.tbn, self._reference, print_it=False)
        if kl_div != "N.A.":
            EMAlgorithmParameterEstimator.LAST_KL_DIVERGENCE = kl_div
    except:
        pass  #print("No Evaluation for kl per em iteration set")

    # get sample trees
    per_seq_trees, per_seq_initial_states = self._extract_sample_trees(sequences)

    # set parallel processes
    if len(sequences) <= self._parallel_processes:
        self._parallel_processes = len(sequences)

    '''cnt = 0
    tot_cnt = 0
    for i in per_seq_trees:
        for j in per_seq_trees[i]:
            if per_seq_trees[i][j].number_nevers == 0:
                cnt += 1
            tot_cnt += 1
    print(str(cnt))
    print(str(tot_cnt))
    print(str(float(cnt)/float(tot_cnt)))
    import sys
    sys.exit(0)'''

    # EM: Iterations
    L().log.info("\n")
    L().log.info("Start EM Iterations")
    for opo in range(self.iteration_frequency):
        print("\n%sIteration:%s %s" % (PNT.BOLD, PNT.END, str(opo + 1)))
        L().log.info("------------------------------------------------------------> EM Iteration: %s ------------------------------------------------------------" % str(opo + 1))

        # Update to new histograms
        L().log.debug("---------------------------------------------------------------------------------------------------------------------------------------------------")
        L().log.debug("                                   Histogram Update")
        L().log.debug("---------------------------------------------------------------------------------------------------------------------------------------------------")
        self._log_cpds()
        for k in per_seq_trees:
            trees = per_seq_trees[k]
            L().log.debug("------------------------------------------------------------> Sequence " + str(k) + " <------------------------------------------------------------")
            [trees[t].new_iteration(opo == 0, self._debug_time) for t in trees]

        # per sequence create sample
        list_input = self._em_input(sequences, per_seq_trees, per_seq_initial_states, debug)
        print("Training size: %s" % str(len(sequences)))

        # split this input list - to avoid memory overload
        split_size = 2001
        list_inputs = [list_input[i:i + split_size] for i in range(0, len(list_input), split_size)]
        final = False
        for i in range(len(list_inputs)):
            if i == (len(list_inputs) - 1):
                final = True
            l_input = list_inputs[i]

            # parallel execution of simulation
            output_list = self._parallel_em(debug, l_input)

            # Update CPD and trees + normalize all + set all distribution parameters
            self._update_CPD(output_list, per_seq_trees, final)
            per_seq_trees = self._update_trees(output_list, per_seq_trees)
            del output_list

        # print evaluation
        try:
            kl_div = self._evaluator._compute_kl_divergence(self.tbn, self._reference, print_it=False)
            if kl_div != "N.A.":
                EMAlgorithmParameterEstimator.LAST_KL_DIVERGENCE = kl_div
        except:
            print("No Evaluation for kl per em iteration set")

    # Plot all distributions
    if self.tbn.show_plot_generated:
        self._visual.plot_histograms_from_bn(self.tbn, self.original_tbn)
    L().log.info("------------------------------------------------------------> EM Finished ------------------------------------------------------------")
    self._log_cpds()
    return self.tbn
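# Illustrative sketch (not part of the original module): the E-step above splits
# its input into chunks of at most split_size items so the parallel workers never
# hold the full sample list in memory at once. The same slicing pattern in
# isolation, with made-up data:
list_input = list(range(10))
split_size = 4
list_inputs = [list_input[i:i + split_size] for i in range(0, len(list_input), split_size)]
print(list_inputs)   # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
# only the last chunk is flagged as "final", after which the CPDs are normalized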
def test_conditional_independence(self, source, target, condition_set):
    adtree = self.adtree
    number_samples = adtree.count()
    source_table = adtree.table(source)
    source_values = [source_entry[0] for source_entry in source_table]
    target_table = adtree.table(target)
    target_values = [target_entry[0] for target_entry in target_table]

    # degrees of freedom
    dof = ((len(source_table) - 1) * (len(target_table) - 1)
           * np.prod(list(map(lambda x: len(adtree.table(x)), condition_set))))
    if dof == 0:
        # this is the case when source or target is constant
        L().log.warning('Zero degrees of freedom: Either source or target is constant!')
        return 0, 1, True  # p-value is 1

    row_size_required = 10 * dof  # test results are not really reliable if there are less than 10*dof samples
    sufficient_data = True
    if number_samples < row_size_required:
        L().log.warning('Not enough samples. ' + str(number_samples) + ' is too small. Need '
                        + str(row_size_required) + '. G^2-Test may not be reliable.')
        sufficient_data = False

    g2 = 0
    # first case: empty condition set
    if len(condition_set) == 0:
        nij = pd.DataFrame(0, index=[entry[0] for entry in source_table],
                           columns=[entry[0] for entry in target_table])
        kwargs = {}  # collect arguments for ADtree lookup
        for source_value in source_values:
            for target_value in target_values:
                kwargs.update({source: source_value, target: target_value})
                nij.loc[source_value, target_value] = adtree.count(**kwargs)
        n_j = np.array([nij.sum(axis=1)]).T  # fix first variable and compute frequencies
        ni_ = np.array([nij.sum(axis=0)])  # fix second variable and compute frequencies
        expected_nij = n_j.dot(ni_) / number_samples  # expectation of nij
        ln_argument = nij.divide(expected_nij)  # compute argument for ln()
        ln_results = np.log(ln_argument)  # compute ln()
        g2 = np.nansum(nij.multiply(2 * ln_results))  # compute sum of lns

    # second case: non-empty condition set
    if len(condition_set) > 0:
        # calculate the number of possible combinations of the values in the condition set
        prod_levels = np.prod(list(map(lambda x: len(adtree.table(x)), condition_set)))
        condition_set_values = [list([entry[0] for entry in adtree.table(node)]) for node in condition_set]
        cs_value_combinations = list(product(*condition_set_values))
        nij_ = [pd.DataFrame(0, index=[entry[0] for entry in source_table],
                             columns=[entry[0] for entry in target_table])
                for _ in cs_value_combinations]
        nijk = pd.concat(nij_, keys=cs_value_combinations)  # type: pd.DataFrame

        # fill in frequencies
        kwargs = {}  # collect arguments for ADtree lookup
        for source_value in source_values:
            for target_value in target_values:
                for cs_value_combination in cs_value_combinations:
                    kwargs.update({source: source_value, target: target_value})
                    kwargs.update(zip(condition_set, cs_value_combination))
                    nijk.xs(cs_value_combination).loc[source_value, target_value] = adtree.count(**kwargs)

        ni__ = np.ndarray((len(source_table), prod_levels))
        n_j_ = np.ndarray((len(target_table), prod_levels))
        for value_combination in cs_value_combinations:
            index = cs_value_combinations.index(value_combination)
            ni__[:, index] = nijk.xs(value_combination).sum(axis=1)
            n_j_[:, index] = nijk.xs(value_combination).sum(axis=0)
        n__k = n_j_.sum(axis=0)

        for value_combination in cs_value_combinations:
            index = cs_value_combinations.index(value_combination)
            ni_k = np.array([ni__[:, index]]).T  # fix condition set and compute source frequencies
            n_jk = np.array([n_j_[:, index]])  # fix condition set and compute target frequencies
            expected_nijk = ni_k.dot(n_jk) / n__k[index]  # expected frequencies for nijk
            ln_argument = nijk.xs(value_combination) / expected_nijk  # argument for ln()
            ln_results = np.log(ln_argument)  # compute ln()
            g2 += np.nansum(nijk.xs(value_combination).multiply(2 * ln_results))

    p_val = chi2.sf(g2, dof)  # compute p-value by using the chi^2 distribution
    return g2, p_val, sufficient_data
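# Illustrative sketch (not part of the original module): the empty-condition-set
# branch above computes G^2 = 2 * sum_ij n_ij * ln(n_ij / e_ij) with
# e_ij = (row_i * col_j) / N, and converts it to a p-value via the chi^2
# survival function. The contingency table below is made-up example data.
import numpy as np
from scipy.stats import chi2

nij = np.array([[30.0, 10.0],
                [10.0, 30.0]])                              # observed counts for two binary variables
n = nij.sum()
expected = np.outer(nij.sum(axis=1), nij.sum(axis=0)) / n   # e_ij under independence
g2 = np.nansum(2.0 * nij * np.log(nij / expected))          # G^2 statistic
dof = (nij.shape[0] - 1) * (nij.shape[1] - 1)               # degrees of freedom
p_val = chi2.sf(g2, dof)                                    # small p-value -> reject independence
print(g2, p_val)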
def discover_structure_from_pops(self, pops, data):
    """
    This method takes the potential parents of all nodes and the ADtree with all the data.
    An approach similar to the PC algorithm is performed to determine the parent set for each node.
    :param pops: map from nodes to their potential parents
    :param data: ADtree or pandas dataframe
    :return nodes: list of nodes
    :return edges: list of inter edges
    """
    def create_maximal_pgm(pops):
        pgm = nx.DiGraph()
        pgm.add_nodes_from(pops)  # create nodes
        for node in pops:
            edges = [(parent, node) for parent in pops.get(node)
                     if node.rsplit('_', 1)[0] != parent.rsplit('_', 1)[0]]
            pgm.add_edges_from(edges)  # add edges
        return pgm

    def markov_blanket(graph, parent_node, node):
        mb = set(pa for pa in graph.predecessors(node))  # add parent nodes
        mb |= set(ch for ch in graph.successors(node))  # add child nodes
        for child in graph.successors(node):  # add parents of children
            mb |= set(pa for pa in graph.predecessors(child))
        if node in mb:  # remove node
            mb.remove(node)
        if parent_node in mb:  # remove parent_node
            mb.remove(parent_node)
        return mb

    max_pgm = create_maximal_pgm(pops)
    if self.draw:
        plt.title('Maximal PGM (only intra-edges)')
        signal_pos_map = {}
        pos = {}
        for node in max_pgm.nodes:
            if node.rsplit('_', 1)[0] not in signal_pos_map:
                signal_pos_map.update({node.rsplit('_', 1)[0]: len(signal_pos_map)})
            x_coordinate = int(node[-1:])
            y_coordinate = signal_pos_map.get(node.rsplit('_', 1)[0])
            pos.update({node: [x_coordinate, y_coordinate]})
        nx.draw(max_pgm, pos=pos, with_labels=True)
        plt.show()

    if isinstance(data, ADTree):
        cb_estimator = GSquareEstimator(adtree=data)
    else:
        cb_estimator = BaseEstimator(data=data, complete_samples_only=False)

    # procedure similar to the PC algorithm
    pgm = max_pgm.copy()
    condition_set_size = 0
    L().log.debug('---------------------------------------------------')
    L().log.debug('---- Conditional Independence Tests ---------------')
    L().log.debug('---------------------------------------------------')

    if self.optimization_chi_square:
        import scipy.stats as scs

        def chi_square_of_df_cols(df, col1, col2):
            df_col1, df_col2 = df[col1], df[col2]
            categories_2 = list(df_col2.unique())
            categories_1 = list(df_col1.unique())
            result = [[sum((df_col1 == cat1) & (df_col2 == cat2)) for cat2 in categories_2]
                      for cat1 in categories_1]
            chi = scs.chi2_contingency(result)
            return chi

        remove_edges = []
        for (source, target) in pgm.edges():
            # check how correlated those two nodes are, independent of the Markov blanket
            dat = chi_square_of_df_cols(self.data, source, target)  # 1 = more correlated, 0 = less correlated
            chi2, p, sufficient_data = dat[0], dat[1], dat[2]
            #print("%s Chi = %s, p=%s" % (str([source, target]), str(chi2), str(p)))
            if chi2 < self.chi_square_thresh and pgm.has_edge(source, target):
                L().log.debug('remove edge ' + str((source, target)))
                remove_edges.append((source, target))
        pgm.remove_edges_from(remove_edges)
        #import sys
        #sys.exit(0)

        # additionally remove edges which are conditionally independent
        # e.g. given a->b and c->b: if c is independent of b given a, then c->b can be removed
        remove_edges = []
        for (source, target) in pgm.edges():
            condition_set = [a for a in pgm.predecessors(target) if a != source]
            if not condition_set:
                continue
            _, p_val, _ = cb_estimator.test_conditional_independence(source, target, list(condition_set))
            if p_val > self.alpha:
                if pgm.has_edge(source, target):
                    L().log.debug('remove edge ' + str((source, target)))
                    remove_edges.append((source, target))
        pgm.remove_edges_from(remove_edges)
    else:
        while True:
            cont = False
            remove_edges = []
            for (source, target) in pgm.edges():
                mb = markov_blanket(pgm, target, source)
                if len(mb) >= condition_set_size:
                    L().log.debug('testing ' + source + ' --> ' + target)
                    L().log.debug('markov blanket of ' + source + ' is ' + str(mb))
                    for condition_set in combinations(mb, condition_set_size):
                        L().log.debug('independence test of ' + source + ' and ' + target
                                      + ' with subset ' + str(condition_set))
                        _, p_val, _ = cb_estimator.test_conditional_independence(source, target, list(condition_set))
                        #if isnan(p_val):  # pgmpy CI test returns NaN instead of 1
                        #    p_val = 1
                        L().log.debug('p_val = ' + str(p_val))
                        if p_val > self.alpha:
                            if pgm.has_edge(source, target):
                                L().log.debug('remove edge ' + str((source, target)))
                                remove_edges.append((source, target))
                            break
                    cont = True
            condition_set_size += 1
            pgm.remove_edges_from(remove_edges)
            if cont is False:
                break
            if condition_set_size > self.max_reach:
                break

    if self.draw:
        plt.title('PGM after CI tests (only inter-edges)')
        signal_pos_map = {}
        pos = {}
        for node in pgm.nodes:
            if node.rsplit('_', 1)[0] not in signal_pos_map:
                signal_pos_map.update({node.rsplit('_', 1)[0]: len(signal_pos_map)})
            x_coordinate = int(node[-1:])
            y_coordinate = signal_pos_map.get(node.rsplit('_', 1)[0])
            pos.update({node: [x_coordinate, y_coordinate]})
        nx.draw(pgm, pos=pos, with_labels=True)
        plt.show()

    nodes = list(pops.keys())
    edges = [list(edge) for edge in pgm.edges]
    return nodes, edges
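# Illustrative sketch (not part of the original module): markov_blanket() above
# collects the parents, children and the children's other parents of a node, then
# drops the node itself and the candidate parent that is currently being tested.
# The toy graph below is made up for the example.
import networkx as nx

toy = nx.DiGraph([("a", "b"), ("c", "b"), ("b", "d"), ("e", "d")])
node, parent_node = "b", "a"
mb = set(toy.predecessors(node)) | set(toy.successors(node))
for child in toy.successors(node):
    mb |= set(toy.predecessors(child))
mb.discard(node)
mb.discard(parent_node)
print(sorted(mb))   # ['c', 'd', 'e'] - blanket of b without the tested parent a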