def test_mci(self):
    """Run the MCI step on its own and compare with the known parents.

    ``self.true_parents`` is handed to ``run_mci`` as the ``parents``
    argument, and the significant parents extracted from the resulting
    p-values are then compared against ``self.true_parents``.
    """
    # Strict test level for the final significance decision
    max_lag = 2
    significance = 0.01

    estimator = PCMCI(
        selected_variables=None,
        dataframe=pp.DataFrame(self.data),
        cond_ind_test=ParCorr(verbosity=verbosity),
        verbosity=verbosity,
    )
    mci_output = estimator.run_mci(
        selected_links=None,
        tau_min=1,
        tau_max=max_lag,
        parents=self.true_parents,
        max_conds_py=None,
        max_conds_px=None,
    )
    significant = estimator._return_significant_parents(
        pq_matrix=mci_output['p_matrix'],
        val_matrix=mci_output['val_matrix'],
        alpha_level=significance,
    )
    assert_graphs_equal(significant['parents'], self.true_parents)
def test_pcmci(self):
    """Run the full PCMCI procedure and compare with the known parents.

    The parents found significant at ``alpha_level=0.01`` must equal
    ``self.true_parents``.
    """
    # Strict test level
    condition_selection_alpha = 0.05
    max_lag = 2
    significance = 0.01

    estimator = PCMCI(
        dataframe=pp.DataFrame(self.data),
        cond_ind_test=ParCorr(verbosity=verbosity),
        verbosity=verbosity,
    )
    output = estimator.run_pcmci(
        tau_max=max_lag,
        pc_alpha=condition_selection_alpha,
    )
    significant = estimator._return_significant_parents(
        pq_matrix=output['p_matrix'],
        val_matrix=output['val_matrix'],
        alpha_level=significance,
    )
    assert_graphs_equal(significant['parents'], self.true_parents)
class PCMCIPlugin:
    """Plugin that simulates a VAR process from a link table and runs PCMCI."""

    def input(self, inputfile):
        """Read link coefficients from a tab-separated file.

        Every non-blank line must hold four tab-separated fields:
        ``var``, ``driver``, ``lag`` (integers) and ``coeff`` (float).
        The result is stored in ``self.links_coeffs`` as
        ``{var: [((driver, lag), coeff), ...]}``.

        Parameters
        ----------
        inputfile : str
            Path of the file to read.

        Raises
        ------
        ValueError
            If a field of a non-blank line cannot be converted.
        IndexError
            If a non-blank line has fewer than four fields.
        """
        self.links_coeffs = {}
        # The context manager closes the file even if a line fails to parse.
        with open(inputfile, 'r') as infile:
            for line in infile:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                contents = line.split('\t')
                var = int(contents[0])
                driver = int(contents[1])
                lag = int(contents[2])
                coeff = float(contents[3])
                self.links_coeffs.setdefault(var, []).append(
                    ((driver, lag), coeff))

    def run(self):
        """Simulate 1000 samples from the loaded links and run PCMCI on them.

        Stores the PCMCI object in ``self.pcmciobj`` and the output of
        ``run_pcmci`` in ``self.results``.
        """
        data, _ = pp.var_process(self.links_coeffs, T=1000)
        dataframe = pp.DataFrame(data)
        cond_ind_test = ParCorr()
        self.pcmciobj = PCMCI(dataframe=dataframe,
                              cond_ind_test=cond_ind_test)
        self.results = self.pcmciobj.run_pcmci(tau_max=2, pc_alpha=None)

    def output(self, outputfile):
        """Report the significant links at ``alpha_level=0.05``.

        The report is delegated to ``print_significant_links``;
        ``outputfile`` is accepted but not used by this method.
        """
        self.pcmciobj.print_significant_links(
            p_matrix=self.results['p_matrix'],
            val_matrix=self.results['val_matrix'],
            alpha_level=0.05)
def time_lagged_correlation():
    """Run the time-lagged correlation analysis experiment.

    Calculates the lagged dependencies between the variables for lags
    0..48 with a ParCorr test and passes them to ``tp.plot_lagfuncs``
    (lag unit label: hours), using
    ``experiments/causal_discovery/results/time_lagged_correlation.png``
    as the plot name.
    """
    var_names = [
        "dayOfYear", "minuteOfYear", "minuteOfDay", "dayOfWeek", "isWeekend",
        "humidity_sensor", "temperature", "precip_intensity", "cloud_cover",
        "p1", "p2", "dew_point", "wind_speed",
    ]
    # The second element returned by generate_dataframe is not needed here
    dataframe, _ = generate_dataframe(var_names)
    print(f"Variable names: {var_names}")

    estimator = PCMCI(
        dataframe=dataframe,
        cond_ind_test=ParCorr(significance='analytic'),
        verbosity=1,
    )
    correlations = estimator.get_lagged_dependencies(tau_min=0, tau_max=48)

    plot_settings = {
        'var_names': var_names,
        'figsize': (50, 25),
        'label_fontsize': 12,
        'label_space_top': 0.025,
        'label_space_left': 0.05,
        'lag_units': 'hours',
        'x_base': 6,
        'y_base': .5,
    }
    lag_func_matrix = tp.plot_lagfuncs(
        name="experiments/causal_discovery/results/time_lagged_correlation.png",
        val_matrix=correlations,
        setup_args=plot_settings,
    )
    print(lag_func_matrix)
def run_pc_stable_parallel(j):
    """Estimate the parents of a single variable with ``PCMCI.run_pc_stable``.

    ``dataframe``, ``cond_ind_test``, ``verbosity``, ``selected_links``,
    ``tau_max`` and ``pc_alpha`` are read from module-level names.

    Parameters
    ----------
    j : int
        Variable index.

    Returns
    -------
    j, pcmci_of_j, parents_of_j : tuple
        Variable index, PCMCI object, and parents of j
    """
    # Restrict this PCMCI instance to variable j only; further PCMCI
    # parameters could be supplied here.
    worker = PCMCI(
        dataframe=dataframe,
        cond_ind_test=cond_ind_test,
        selected_variables=[j],
        verbosity=verbosity,
    )

    # PC condition-selection step
    found_parents = worker.run_pc_stable(
        selected_links=selected_links,
        tau_max=tau_max,
        pc_alpha=pc_alpha,
    )

    # The PCMCI object is handed back as well: it may hold pre-computed
    # results (e.g. residuals or null distributions) reusable in the MCI step.
    return j, worker, found_parents
def test_pc_stable_max_conds_dim(self):
    """Run PC-stable with ``max_conds_dim=2`` and check the found parents.

    After ``run_pc_stable`` the ``all_parents`` attribute of the PCMCI
    object must equal ``self.true_parents``.
    """
    # Strict test level
    condition_selection_alpha = 0.05
    max_lag = 2

    estimator = PCMCI(
        selected_variables=None,
        dataframe=pp.DataFrame(self.data),
        cond_ind_test=ParCorr(verbosity=verbosity),
        verbosity=verbosity,
    )
    estimator.run_pc_stable(
        selected_links=None,
        tau_min=1,
        tau_max=max_lag,
        save_iterations=False,
        pc_alpha=condition_selection_alpha,
        max_conds_dim=2,
        max_combinations=1,
    )
    assert_graphs_equal(estimator.all_parents, self.true_parents)
def run_pc_stable_parallel(j, dataframe, cond_ind_test, params):
    """Estimate (or construct) the parents used for variable j.

    The strategy is selected by the module-level name ``method_arg``:

    * ``'pcmci'`` -- run ``PCMCI.run_pc_stable`` for variable j.
    * ``'gc'``    -- use every variable at every lag in
      ``params['tau_min']..params['tau_max']`` as parents of j and no
      parents for the other variables.
    * ``'corr'``  -- use no parents for any variable.

    ``verbosity`` is read from a module-level name as well.

    Parameters
    ----------
    j : int
        Variable index.
    dataframe : data object
        Its ``values`` attribute must be 2-D; the second dimension is taken
        as the number of variables.
    cond_ind_test : conditional independence test object
        Passed to ``PCMCI``.
    params : dict
        Must provide ``'selected_links'``, ``'tau_max'`` and ``'pc_alpha'``
        for ``'pcmci'``; ``'tau_min'`` and ``'tau_max'`` for ``'gc'``.

    Returns
    -------
    j, pcmci_of_j, parents_of_j : tuple
        Variable index, PCMCI object, and parents of j

    Raises
    ------
    ValueError
        If ``method_arg`` is not one of the supported strategies.
    """
    N = dataframe.values.shape[1]

    # Further parameters of PCMCI as described in the documentation can be
    # supplied here:
    pcmci_of_j = PCMCI(
        dataframe=dataframe,
        cond_ind_test=cond_ind_test,
        selected_variables=[j],
        verbosity=verbosity)

    if method_arg == 'pcmci':
        # Run PC condition-selection algorithm. Also here further parameters
        # can be specified:
        parents_of_j = pcmci_of_j.run_pc_stable(
            selected_links=params['selected_links'],
            tau_max=params['tau_max'],
            pc_alpha=params['pc_alpha'],
        )
    elif method_arg == 'gc':
        parents_of_j = {
            i: ([(var, -lag)
                 for var in range(N)
                 for lag in range(params['tau_min'], params['tau_max'] + 1)]
                if i == j else [])
            for i in range(N)
        }
    elif method_arg == 'corr':
        parents_of_j = {i: [] for i in range(N)}
    else:
        # Previously this fell through and failed at ``return`` with an
        # UnboundLocalError that did not name the offending value.
        raise ValueError(
            "Unknown method_arg %r; expected 'pcmci', 'gc' or 'corr'"
            % (method_arg,))

    # We return also the PCMCI object because it may contain pre-computed
    # results that can be re-used in the MCI step (such as residuals or null
    # distributions)
    return j, pcmci_of_j, parents_of_j
def __init__(self, dataframe, train_indices, test_indices, prediction_model, cond_ind_test=None, data_transform=None, verbosity=0):
    """Set up the training/test masks and initialise the base classes.

    Parameters
    ----------
    dataframe : data object
        Must expose ``values``, ``mask`` and ``missing_flag``.
    train_indices : container of int
        Time indices used for training; every other time index is set to
        True in the training mask.
    test_indices : container of int
        Time indices used for testing; every other time index is set to
        True in ``self.test_mask``.
    prediction_model : model object
        Forwarded to ``Models.__init__`` as ``model``.
    cond_ind_test : conditional independence test object, optional
        When given, its mask type is forced to ``'y'`` and its verbosity is
        overwritten with ``verbosity``.
    data_transform : optional
        Forwarded to ``Models.__init__``.
    verbosity : int, optional (default: 0)
        Forwarded to the base-class initialisers.
    """
    # Default value for the mask
    mask = dataframe.mask
    if mask is None:
        mask = np.zeros(dataframe.values.shape, dtype='bool')
    # Get the dataframe shape
    T = len(dataframe.values)
    # Have the default dataframe be the training data frame
    train_mask = np.copy(mask)
    train_mask[[t for t in range(T) if t not in train_indices]] = True
    self.dataframe = DataFrame(dataframe.values, mask=train_mask, missing_flag=dataframe.missing_flag)
    # Initialize the models baseclass with the training dataframe
    Models.__init__(self, dataframe=self.dataframe, model=prediction_model, data_transform=data_transform, mask_type='y', verbosity=verbosity)
    # Keep the testing mask as well (only the mask is stored here, no
    # second dataframe is built)
    self.test_mask = np.copy(mask)
    self.test_mask[[t for t in range(T) if t not in test_indices]] = True
    # Setup the PCMCI instance
    if cond_ind_test is not None:
        # Force the masking
        cond_ind_test.set_mask_type('y')
        cond_ind_test.verbosity = verbosity
        # NOTE(review): the nesting of this call under the guard above is
        # assumed -- confirm that PCMCI.__init__ is meant to be skipped when
        # no cond_ind_test is given.
        PCMCI.__init__(self, dataframe=self.dataframe, cond_ind_test=cond_ind_test, selected_variables=None, verbosity=verbosity)
    # Set the member variables
    self.cond_ind_test = cond_ind_test
    # Initialize member variables that are set outside
    self.target_predictors = None
    self.selected_targets = None
    self.fitted_model = None
    self.test_array = None
def a_run_pcmciplus(a_pcmciplus, a_pcmciplus_params):
    """Run PCMCI+ for one parameter combination and return graph and truth.

    Parameters
    ----------
    a_pcmciplus : tuple
        ``(dataframe, true_graph, links_coeffs, tau_min, tau_max)``.
    a_pcmciplus_params : tuple
        ``(pc_alpha, contemp_collider_rule, conflict_resolution,
        reset_lagged_links, cond_ind_test_class)`` where
        ``cond_ind_test_class`` is ``'oracle_ci'`` or ``'par_corr'``.

    Returns
    -------
    tuple
        ``(results['graph'], true_graph)``.
    """
    # Unpack the pcmci and the true parents, and common parameters
    dataframe, true_graph, links_coeffs, tau_min, tau_max = a_pcmciplus
    # Unpack the parameters
    (
        pc_alpha,
        contemp_collider_rule,
        conflict_resolution,
        reset_lagged_links,
        cond_ind_test_class,
    ) = a_pcmciplus_params

    if cond_ind_test_class == 'oracle_ci':
        cond_ind_test = OracleCI(links_coeffs)
    elif cond_ind_test_class == 'par_corr':
        cond_ind_test = ParCorr()

    # Run the PCMCI algorithm with the given parameters
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=cond_ind_test,
                  verbosity=2)
    results = pcmci.run_pcmciplus(
        selected_links=None,
        tau_min=tau_min,
        tau_max=tau_max,
        pc_alpha=pc_alpha,
        contemp_collider_rule=contemp_collider_rule,
        conflict_resolution=conflict_resolution,
        reset_lagged_links=reset_lagged_links,
        max_conds_dim=None,
        max_conds_py=None,
        max_conds_px=None,
    )

    # Print true links
    print("************************")
    print("\nTrue Graph")
    # NOTE(review): only lags 0..tau_max-1 are printed; if true_graph also
    # holds a slice for lag tau_max it is not shown -- confirm intended.
    for lag in range(tau_max):
        # Apply the format string; passing ``lag`` as a second print()
        # argument printed the literal "%d".
        print("Lag %d = " % lag)
        print(true_graph[:, :, lag])

    # Return the results and the expected result
    return results['graph'], true_graph
def issue38():
    """Reproduce the tigramite issue-38 scenario on the bundled CSV example.

    Loads ``tigramite_issue_38_input_example.csv`` from the directory of
    this file, selects the columns whose label contains ``'i'``, and runs
    PCMCI with a default-constructed ``CMIknn`` test (``tau_max=24``,
    ``pc_alpha=0.1``).
    """
    here = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(here, 'tigramite_issue_38_input_example.csv')
    df = pd.read_csv(csv_path, index_col=0)
    print(df)

    tdf = DataFrame(
        data=df.values,
        mask=None,
        missing_flag=None,
        var_names=df.columns,
        datatime=None,
    )
    indp_test = CMIknn()

    # Positional indices of every column whose label contains an 'i'
    selected_variables_ix = [
        df.columns.get_loc(label) for label in df.columns if 'i' in label
    ]

    print(("Init PCMCI with:"
           f"dataframe={tdf},"
           f"cond_ind_test={indp_test},"
           f"selected_variables={selected_variables_ix},"
           f"verbosity=10,"))
    estimator = PCMCI(
        dataframe=tdf,
        cond_ind_test=indp_test,
        selected_variables=selected_variables_ix,
        verbosity=10,
    )

    print("Running PCMCI...")
    estimator.run_pcmci(tau_max=24, pc_alpha=0.1)
    print("Done successfully! No errors!")
def pcmci_setup(data):
    """Build a PCMCI object with an analytic ParCorr test for ``data``.

    ``data`` must expose ``values`` and ``columns``; the column labels are
    used as variable names.
    """
    return PCMCI(
        dataframe=pp.DataFrame(data.values, var_names=list(data.columns)),
        cond_ind_test=ParCorr(significance='analytic'),
        verbosity=1,
    )
def a_pcmci(a_sample, request):
    """Build a ParCorr-based PCMCI object for a sample.

    ``a_sample`` is a ``(dataframe, true_parents)`` pair; ``request`` is
    accepted but not used. Returns ``(pcmci, true_parents)``.
    """
    sample_frame, expected_parents = a_sample
    estimator = PCMCI(
        dataframe=sample_frame,
        cond_ind_test=ParCorr(verbosity=VERBOSITY),
        verbosity=VERBOSITY,
    )
    return estimator, expected_parents
def run_pc_stable_parallel(j):
    """Estimate the parents of a single variable with ``PCMCI.run_pc_stable``.

    ``dataframe``, ``cond_ind_test``, ``verbosity``, ``selected_links``,
    ``tau_max`` and ``pc_alpha`` are read from module-level names.

    Parameters
    ----------
    j : int
        Variable index.

    Returns
    -------
    j, pcmci_of_j, parents_of_j : tuple
        Variable index, PCMCI object, and parents of j
    """
    # Restrict this PCMCI instance to variable j only; further PCMCI
    # parameters could be supplied here.
    worker = PCMCI(
        dataframe=dataframe,
        cond_ind_test=cond_ind_test,
        selected_variables=[j],
        verbosity=verbosity,
    )

    # PC condition-selection step
    found_parents = worker.run_pc_stable(
        selected_links=selected_links,
        tau_max=tau_max,
        pc_alpha=pc_alpha,
    )

    # The PCMCI object is handed back as well: it may hold pre-computed
    # results (e.g. residuals or null distributions) reusable in the MCI step.
    return j, worker, found_parents
def a_pcmci(a_sample, a_test, a_common_params, request):
    """Build a PCMCI object together with the expected result and settings.

    ``a_sample`` is ``(dataframe, true_parents)`` and ``a_common_params`` is
    ``(tau_min, tau_max, sel_link)``. When ``_select_links`` yields a link
    selection, that selection replaces the true parents as expected result.

    Returns ``(pcmci, true_parents, tau_min, tau_max, select_links)``.
    """
    sample_frame, expected_parents = a_sample
    tau_min, tau_max, sel_link = a_common_params

    # The request parameter is read but not used further in this function
    select_vars = request.param

    estimator = PCMCI(dataframe=sample_frame,
                      cond_ind_test=a_test,
                      verbosity=VERBOSITY)

    # Restrict the expectation to the selected links when they are given
    select_links = _select_links(sel_link, expected_parents)
    expected = expected_parents if select_links is None else select_links

    return estimator, expected, tau_min, tau_max, select_links
def test_alphas(dataframe, cond_ind_test, alphas, var_names, tau_min=0, tau_max=1, selected_links=None):
    """Run ``run_experiment`` once per alpha on a shared PCMCI object.

    Args:
        dataframe: The TIGRAMITE dataframe to use.
        cond_ind_test: The conditional independence test to use.
        alphas: Iterable of the individual alpha values.
        var_names: The names of the variables contained in the dataframe.
        tau_min: The minimum lag.
        tau_max: The maximum lag.
        selected_links: Dictionary specifying whether only selected links
            should be tested.
    """
    # One PCMCI instance is built up front and reused for every alpha
    shared_estimator = PCMCI(dataframe=dataframe,
                             cond_ind_test=cond_ind_test,
                             verbosity=1)
    for alpha in alphas:
        run_experiment(shared_estimator, cond_ind_test, alpha, tau_min,
                       tau_max, var_names, selected_links)
def caus_gpdc(data, var_names):
    """Run PCMCI with a GPDC test on masked data and plot the result.

    Columns 0, 1, 2, 9, 10 and 11 receive a mask that flags every sample
    index ``i`` in ``range(68904)`` with ``i % 72 < 30`` or ``i % 72 > 47``.
    GPDC null distributions for sample sizes 495..500 are generated and
    saved to ``dc_nulldists.npz`` before PCMCI is run with ``tau_min=1``,
    ``tau_max=6`` and ``pc_alpha=0.01``. P-values are corrected with
    ``fdr_bh`` and the links significant at 0.01 are printed and plotted.

    Returns
    -------
    results, link_matrix : tuple
        Output of ``run_pcmci`` and the significant-link matrix.
    """
    import numpy as np
    import matplotlib as mpl
    from matplotlib import pyplot as plt
    import sklearn
    import tigramite
    from tigramite import data_processing as pp
    from tigramite import plotting as tp
    from tigramite.pcmci import PCMCI
    from tigramite.independence_tests import ParCorr, GPDC, CMIknn, CMIsymb
    from tigramite.models import LinearMediation, Prediction

    # Per-sample flags based on the position inside each 72-step cycle
    row_flags = np.zeros(len(data))
    for step in range(68904):
        phase = step % 72
        if phase < 30 or phase > 47:
            row_flags[step] = True

    # Only these columns are masked; all others stay unmasked
    mask = np.zeros(data.shape)
    for column in (0, 1, 2, 9, 10, 11):
        mask[:, column] = row_flags

    dataframe = pp.DataFrame(data, mask=mask)

    nulldist_file = 'dc_nulldists.npz'
    gpdc = GPDC(significance='analytic',
                gp_params=None,
                use_mask=True,
                mask_type='y')
    gpdc.generate_and_save_nulldists(sample_sizes=range(495, 501),
                                     null_dist_filename=nulldist_file)
    gpdc.null_dist_filename = nulldist_file

    pcmci_gpdc = PCMCI(dataframe=dataframe,
                       cond_ind_test=gpdc,
                       var_names=var_names,
                       verbosity=1)
    results = pcmci_gpdc.run_pcmci(tau_max=6, tau_min=1, pc_alpha=0.01)
    p_values = results['p_matrix']
    test_values = results['val_matrix']

    q_values = pcmci_gpdc.get_corrected_pvalues(p_matrix=p_values,
                                                fdr_method='fdr_bh')
    pcmci_gpdc._print_significant_links(p_matrix=p_values,
                                        q_matrix=q_values,
                                        val_matrix=test_values,
                                        alpha_level=0.01)
    significant = pcmci_gpdc._return_significant_parents(
        pq_matrix=q_values,
        val_matrix=test_values,
        alpha_level=0.01)
    link_matrix = significant['link_matrix']

    tp.plot_time_series_graph(
        val_matrix=test_values,
        link_matrix=link_matrix,
        var_names=var_names,
        link_colorbar_label='MCI',
    )
    return results, link_matrix
def run_pcmci(data, data_mask, var_names, path_outsub2, s, tau_min=0, tau_max=1, pc_alpha=None, alpha_level=0.05, max_conds_dim=4, max_combinations=1, max_conds_py=None, max_conds_px=None, verbosity=4):
    """Run PCMCI (ParCorr, mask_type 'y') and optionally log its output.

    Parameters
    ----------
    data : array
        Two-dimensional data array (time, variables).
    data_mask : array
        Mask passed to the tigramite dataframe.
    var_names : list
        Variable names passed to the tigramite dataframe.
    path_outsub2 : str or False
        Unless this is ``False``, everything printed while PCMCI runs is
        captured and written to ``split_{s}_PCMCI_out.txt`` in this folder.
    s : int or str
        Split label used in the log file name.
    tau_min, tau_max, pc_alpha, max_conds_dim, max_combinations,
    max_conds_py, max_conds_px :
        Forwarded to ``PCMCI.run_pcmci``.
    alpha_level : float
        Significance level used when printing the significant links.
    verbosity : int
        Forwarded to ``ParCorr`` and ``PCMCI``.

    Returns
    -------
    pcmci, q_matrix, results : tuple
        The PCMCI object, the ``fdr_bh``-corrected p-values and the output
        of ``run_pcmci``.
    """
    capture_output = path_outsub2 is not False
    if capture_output:
        txt_fname = os.path.join(path_outsub2, f'split_{s}_PCMCI_out.txt')
        orig_stdout = sys.stdout
        # buffer print statement output
        sys.stdout = captured = io.StringIO()

    try:
        T, N = data.shape  # Time, Regions

        # Initialize dataframe object (needed for tigramite functions)
        dataframe = pp.DataFrame(data=data, mask=data_mask,
                                 var_names=var_names)

        parcorr = ParCorr(significance='analytic',
                          mask_type='y',
                          verbosity=verbosity)

        # selected_variables=None: parents are estimated for all variables
        pcmci = PCMCI(dataframe=dataframe,
                      cond_ind_test=parcorr,
                      selected_variables=None,
                      verbosity=verbosity)

        results = pcmci.run_pcmci(tau_max=tau_max,
                                  pc_alpha=pc_alpha,
                                  tau_min=tau_min,
                                  max_conds_dim=max_conds_dim,
                                  max_combinations=max_combinations,
                                  max_conds_px=max_conds_px,
                                  max_conds_py=max_conds_py)

        # Multiple-testing correction of the p-values
        q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                               fdr_method='fdr_bh')

        pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                      q_matrix=q_matrix,
                                      val_matrix=results['val_matrix'],
                                      alpha_level=alpha_level)
    finally:
        # Always hand stdout back, otherwise an exception above would leave
        # every later print of the process going into the buffer.
        if capture_output:
            sys.stdout = orig_stdout

    if capture_output:
        with open(txt_fname, mode='w+') as log_file:
            log_file.write(captured.getvalue())
        captured.close()

    return pcmci, q_matrix, results
# Specify time axis and variable names datatime = np.arange(len(data)) # ====================================================================================================================== # pc algorithm: only parents for selected_variables are calculated (here entry[0] = PoV) # ====================================================================================================================== parcorr = ParCorr(significance='analytic', use_mask=True, mask_type='y', verbosity=2) pcmci = PCMCI( dataframe=dataframe, cond_ind_test=parcorr, var_names=var_names, selected_variables= None, #[0], # only parents for the monsoon trough rainfall verbosity=2) # ====================================================================================================================== # results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha = pc_alpha, tau_min = tau_min, max_combinations=1 ) # ====================================================================================================================== if p == 0: pc_alpha = pcA_set1a pc_alpha_name = str(pcA_set1a) elif p == 1: pc_alpha = pcA_set1b pc_alpha_name = str(pcA_set1b) elif p == 2:
data[:, 3] = nc_file.variables["ENSOI"][:] a = nc_file.variables["NAOIN"][:] b = nc_file.variables["NAOIS"][:] data[:, 4] = a - b data_mask = np.zeros(data.shape) for t in range(1, T + 1): if (t % 73) >= 12 and (t % 73) <= 30: data_mask[t - 1, :] = True # Initialize dataframe object, specify time axis and variable names var_names = ['WPSH', 'IO', 'WNP', 'ENSO', 'NAO'] dataframe = pp.DataFrame(data, mask=data_mask) parcorr = ParCorr(significance='analytic', mask_type='xyz') pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr) results = pcmci.run_pcmci(tau_max=12, pc_alpha=0.03) # Correct p-values q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'], fdr_method='fdr_bh') # Plotting link_matrix = pcmci.return_significant_parents( pq_matrix=q_matrix, val_matrix=results['val_matrix'], alpha_level=0.03)['link_matrix'] tp.plot_graph(val_matrix=results['val_matrix'], link_matrix=link_matrix, var_names=var_names) """
def run(self):
    """Simulate 1000 samples from ``self.links_coeffs`` and run PCMCI.

    The PCMCI object is stored in ``self.pcmciobj`` and the output of
    ``run_pcmci`` (``tau_max=2``, ``pc_alpha=None``) in ``self.results``.
    """
    simulated, _ = pp.var_process(self.links_coeffs, T=1000)
    self.pcmciobj = PCMCI(dataframe=pp.DataFrame(simulated),
                          cond_ind_test=ParCorr())
    self.results = self.pcmciobj.run_pcmci(tau_max=2, pc_alpha=None)
datatime=np.arange(len(data)), var_names=var_names) # In[4]: data.shape # In[5]: tp.plot_timeseries(dataframe) plt.show() # In[6]: parcorr = ParCorr(significance='analytic') pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1) # In[7]: correlations = pcmci.get_lagged_dependencies(tau_max=20, val_only=True)['val_matrix'] lag_func_matrix = tp.plot_lagfuncs(val_matrix=correlations, setup_args={ 'figsize': (6, 6), 'var_names': var_names, 'x_base': 10, 'y_base': .5 }) # In[8]:
def init_pcmci(df_data, significance='analytic', mask_type='y', selected_variables=None, verbosity=5):
    '''
    Build one PCMCI object per training split.

    Having the objects up front allows plotting lagged cross-correlations,
    which helps to identify a reasonable tau_max.

    Parameters
    ----------
    df_data : pandas DataFrame
        Frame whose first index level enumerates the splits and which holds
        the columns 'TrainIsTrue' and 'RV_mask' next to the variables.
    significance : str, optional
        Passed to ParCorr. The default is 'analytic'.
    mask_type : str, optional
        Passed to ParCorr. The default is 'y'.
    selected_variables : list of integers, optional (default: None)
        Accepted, but not used inside this function.
    verbosity : int, optional
        Verbosity of ParCorr and PCMCI. The default is 5.

    Returns
    -------
    dictionary of format {split:pcmci}.
    '''
    n_splits = df_data.index.levels[0].size
    RV_mask = df_data['RV_mask']
    helper_columns = ['TrainIsTrue', 'RV_mask']

    pcmci_dict = {}
    for s in range(n_splits):
        # Boolean selector of the training rows of this split
        is_train = df_data['TrainIsTrue'].loc[s] == True
        train_part = df_data.loc[s][is_train].dropna(axis=1, how='all')

        contains_nan = train_part.isna().values.any()
        if contains_nan and verbosity > 0:
            print('Warnning: nans detected')

        var_names = [
            col for col in train_part.columns if col not in helper_columns
        ]
        data = train_part.loc[:, var_names].values

        # indices with mask == False are used (with mask_type 'y')
        row_mask = ~RV_mask.loc[s][is_train].values
        data_mask = np.repeat(row_mask, data.shape[1]).reshape(data.shape)

        # create dataframe in Tigramite format
        dataframe = pp.DataFrame(data=data,
                                 mask=data_mask,
                                 var_names=var_names)

        # Constructed silently, verbosity set afterwards, to avoid printing
        # the init text for every split
        parcorr = ParCorr(significance=significance,
                          mask_type=mask_type,
                          verbosity=0)
        parcorr.verbosity = verbosity

        pcmci_dict[s] = PCMCI(dataframe=dataframe,
                              cond_ind_test=parcorr,
                              verbosity=verbosity)
    return pcmci_dict
def test_order_independence_pcmciplus(a_pcmciplus_order_independence, a_pcmciplus_params_order_independence):
    """Check that the PCMCI+ graph does not depend on the variable order.

    PCMCI+ is run once on the original data and once for every permutation
    of the variables; each permuted result must equal the reference graph
    with its first two axes reordered by the same permutation.
    """
    # Unpack the pcmci and the true parents, and common parameters
    dataframe, true_graph, links_coeffs, tau_min, tau_max = \
        a_pcmciplus_order_independence
    data = dataframe.values
    _, N = data.shape

    # Unpack the parameters
    (
        pc_alpha,
        contemp_collider_rule,
        conflict_resolution,
        reset_lagged_links,
        cond_ind_test_class,
    ) = a_pcmciplus_params_order_independence

    if cond_ind_test_class == 'oracle_ci':
        cond_ind_test = OracleCI(links_coeffs)
    elif cond_ind_test_class == 'par_corr':
        cond_ind_test = ParCorr()

    # The same settings are used for the reference and every permuted run
    run_settings = dict(
        selected_links=None,
        tau_min=tau_min,
        tau_max=tau_max,
        pc_alpha=pc_alpha,
        contemp_collider_rule=contemp_collider_rule,
        conflict_resolution=conflict_resolution,
        reset_lagged_links=reset_lagged_links,
        max_conds_dim=None,
        max_conds_py=None,
        max_conds_px=None,
    )

    reference = PCMCI(dataframe=dataframe,
                      cond_ind_test=cond_ind_test,
                      verbosity=1)
    print("************************")
    print("\nTrue Graph")
    reference.print_significant_links(p_matrix=(true_graph == 0),
                                      val_matrix=true_graph,
                                      conf_matrix=None,
                                      q_matrix=None,
                                      graph=true_graph,
                                      ambiguous_triples=None,
                                      alpha_level=0.05)
    correct_results = reference.run_pcmciplus(**run_settings)['graph']

    for perm in itertools.permutations(range(N)):
        print(perm)
        permuted_frame = pp.DataFrame(np.copy(data[:, perm]),
                                      var_names=list(perm))
        permuted = PCMCI(dataframe=permuted_frame,
                         cond_ind_test=cond_ind_test,
                         verbosity=1)
        results = permuted.run_pcmciplus(**run_settings)

        # Reorder both variable axes of the reference graph by ``perm``
        back_converted_result = np.take(
            np.take(correct_results, perm, axis=0), perm, axis=1)

        # Print diagnostics for every lag slice that deviates
        for tau in range(tau_max + 1):
            found = results['graph'][:, :, tau]
            wanted = back_converted_result[:, :, tau]
            if np.allclose(found, wanted):
                continue
            print(tau)
            print(found)
            print(wanted)
            print(wanted - found)
            print(perm)

        np.testing.assert_equal(results['graph'], back_converted_result)
def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag):
    """Run PCMCI with an RCOT test and write the detected edges to CSV.

    A pair ``(effect, cause, index)`` is recorded whenever any p-value in
    ``p_matrix[cause, effect]`` is below 0.05 and the two names differ. The
    rows are written to ``pcmci_baseline_out.csv`` and returned.
    ``T_data`` and ``N_data`` are accepted but not used in the computation.
    """
    print("======")

    # Initialize dataframe object, specify time axis and variable names
    dataframe = pp.DataFrame(data, datatime=dt, var_names=headers)
    print(dataframe.var_names)

    # Constructed silently; verbosity is raised afterwards
    pcmci_rcot = PCMCI(dataframe=dataframe,
                       cond_ind_test=RCOT(significance='analytic'),
                       verbosity=0)
    pcmci_rcot.verbosity = 1
    results = pcmci_rcot.run_pcmci(tau_max=maxlag, pc_alpha=0.05)

    # Print results
    print("p-values")
    print(results['p_matrix'].round(3))
    print("MCI partial correlations")
    print(results['val_matrix'].round(2))

    # output edges
    result_arr = []
    for cause_pos, per_effect in enumerate(results['p_matrix']):
        cause = headers[cause_pos]
        for effect_pos, lag_pvalues in enumerate(per_effect):
            effect = headers[effect_pos]
            # One significant lag is enough to record the edge once
            if cause != effect and any(p < 0.05 for p in lag_pvalues):
                result_arr.append([effect, cause, index])
                print("{} caused by {}".format(effect, cause))

    with open("pcmci_baseline_out.csv", "w", newline='') as f:
        f.writelines("%s\n" % ','.join(str(col) for col in row)
                     for row in result_arr)

    print(result_arr)
    return result_arr
def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag):
    """Run PCMCI with an RCOT test on one data partition and save the edges.

    ``data`` is materialised into a numpy array first. A pair
    ``(effect, cause, index)`` is recorded whenever any p-value in
    ``p_matrix[cause, effect]`` is below 0.05 and the two names differ. The
    rows are written to ``pcmci_para_out{index}.csv`` and returned.
    ``T_data`` and ``N_data`` are accepted but not used in the computation.
    """
    print("======")

    # The incoming records may be a lazy iterable, so materialise them
    data = np.array(list(data))
    print("data len is ")
    print(len(data))

    # Initialize dataframe object, specify time axis and variable names
    dataframe = pp.DataFrame(data, datatime=dt, var_names=headers)
    print(dataframe.var_names)

    # Constructed silently; verbosity is raised afterwards
    pcmci_rcot = PCMCI(dataframe=dataframe,
                       cond_ind_test=RCOT(significance='analytic'),
                       verbosity=0)
    pcmci_rcot.verbosity = 1
    results = pcmci_rcot.run_pcmci(tau_max=maxlag, pc_alpha=0.05)

    # Print results
    print("p-values")
    print(results['p_matrix'].round(3))
    print("MCI partial correlations")
    print(results['val_matrix'].round(2))

    # output edges
    result_arr = []
    for cause_pos, per_effect in enumerate(results['p_matrix']):
        print("index is")
        print(index)
        print("item is")
        print(per_effect)
        print("cause is")
        cause = headers[cause_pos]
        print(headers[cause_pos])
        for effect_pos, lag_pvalues in enumerate(per_effect):
            print("effect arr is ")
            print(lag_pvalues)
            print("effect name is")
            effect = headers[effect_pos]
            print(headers[effect_pos])
            # One significant lag is enough to record the edge once
            if cause != effect and any(p < 0.05 for p in lag_pvalues):
                result_arr.append([effect, cause, index])
                print("{} caused by {}".format(effect, cause))

    with open("pcmci_para_out{}.csv".format(index), "w", newline='') as f:
        f.writelines("%s\n" % ','.join(str(col) for col in row)
                     for row in result_arr)

    return result_arr
def build_link_pcmci_noself(p_data_values, p_agent_names, p_var_sou, p_var_tar):
    """Build a table of PCMCI+ links between the columns of the data.

    PCMCI+ is run with ParCorr, ``tau_min=0``, ``tau_max=2`` and
    ``pc_alpha=0.01``. Every graph entry that is neither empty nor ``"<--"``
    becomes one row; rows whose source and target agent are the same are
    removed at the end.

    Parameters
    ----------
    p_data_values : array
        Two-dimensional array (time, agents).
    p_agent_names : sequence
        Name of each column of ``p_data_values``.
    p_var_sou, p_var_tar :
        Values copied into the ``VarSou`` / ``VarTar`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``VarSou``, ``VarTar``, ``Source``, ``Target``, ``TimeLag``,
        ``Strength`` and ``Unoriented`` (1 for ``"o-o"`` at lag 0 or
        ``"x-x"`` entries, otherwise 0).
    """
    _, agent_num = p_data_values.shape

    # set the data for PCMCI
    data_frame = pp.DataFrame(p_data_values,
                              var_names=p_agent_names,
                              missing_flag=BaseConfig.BACKGROUND_VALUE)
    pcmci = PCMCI(dataframe=data_frame, cond_ind_test=ParCorr())

    # run PCMCI
    alpha_level = 0.01
    results_pcmci = pcmci.run_pcmciplus(tau_min=0,
                                        tau_max=2,
                                        pc_alpha=alpha_level)
    graph_pcmci = results_pcmci['graph']
    q_matrix = results_pcmci['q_matrix']
    p_matrix = results_pcmci['p_matrix']
    val_matrix = results_pcmci['val_matrix']

    # Decide which entries count as links
    if graph_pcmci is not None:
        sig_links = (graph_pcmci != "") * (graph_pcmci != "<--")
    elif q_matrix is not None:
        sig_links = (q_matrix <= alpha_level)
    else:
        sig_links = (p_matrix <= alpha_level)

    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append copied the frame on every call and no longer exists
    # in pandas >= 2.0.
    records = []
    for j in range(agent_num):
        links = {(p[0], -p[1]): np.abs(val_matrix[p[0], j, abs(p[1])])
                 for p in zip(*np.where(sig_links[:, j, :]))}
        # Strongest links (by absolute value) first
        for p in sorted(links, key=links.get, reverse=True):
            lag_index = abs(p[1])
            unoriented = None
            if graph_pcmci is not None:
                if p[1] == 0 and graph_pcmci[j, p[0], 0] == "o-o":
                    unoriented = 1  # "unoriented link"
                elif graph_pcmci[p[0], j, lag_index] == "x-x":
                    unoriented = 1  # "unclear orientation due to conflict"
                else:
                    unoriented = 0
            # NOTE(review): 'Source' is the agent of index j and 'Target'
            # the agent of index p[0] -- confirm this is the intended
            # direction.
            records.append({
                'VarSou': p_var_sou,
                'VarTar': p_var_tar,
                'Source': p_agent_names[j],
                'Target': p_agent_names[p[0]],
                'TimeLag': p[1],
                'Strength': val_matrix[p[0], j, lag_index],
                'Unoriented': unoriented,
            })

    links_df = pd.DataFrame(records,
                            columns=['VarSou', 'VarTar', 'Source', 'Target',
                                     'TimeLag', 'Strength', 'Unoriented'])

    # remove the self correlation edges
    links_df = links_df.loc[links_df['Source'] != links_df['Target']]
    return links_df
dfresiduals = pd.read_pickle(filename_residuals) data = dfresiduals[parameters['var_names']].values T, N = data.shape # Initialize dataframe object dataframe = pp.DataFrame(data) #%% rcot = RCOT2(significance=parameters['cond_ind_test.significance'], num_f=parameters['cond_ind_test.num_f']) pcmci = PCMCI(dataframe, cond_ind_test=rcot, selected_variables=parameters['selected_variables'], var_names=parameters['var_names'], verbosity=10) q_matrix = pcmci.get_corrected_pvalues(p_matrix=p_matrix, fdr_method='fdr_bh') q_matrix_tsbh = pcmci.get_corrected_pvalues(p_matrix=p_matrix, fdr_method='fdr_tsbh') #%% print results pcmci._print_significant_links( p_matrix = p_matrix, q_matrix = q_matrix, val_matrix = val_matrix, alpha_level = 0.1) #%% get selected parents and fit linear model
# Initialize dataframe object, specify time axis and variable names #var_names = [r'$X^0$', r'$X^1$', r'$X^2$', r'$X^3$'] dataframe = pp.DataFrame(data, datatime=np.arange(len(data)), var_names=headers) if verbose > 0: plot = tp.plot_timeseries(dataframe)[0] if display_images: plot.show() if save_images: plot.savefig("timeseries.png") parcorr = ParCorr(significance='analytic') pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1) correlations = pcmci.get_lagged_dependencies(tau_max=3) lag_func_matrix = tp.plot_lagfuncs(val_matrix=correlations, setup_args={ 'var_names': headers, 'x_base': 5, 'y_base': .5 }) if verbose > 1: if display_images: lag_func_matrix.savefig() if save_images: lag_func_matrix.savefig("lag_func.png")
def _relabel_precursor_columns(columns, first_label=100):
    """Return copies of ``columns`` whose second '_'-field is an integer.

    Names must have the format '{}_{label}_{}'. A name whose label is not
    all digits gets a running integer (starting at ``first_label``)
    substituted for the label, and the old label appended at the end.
    """
    relabelled = []
    next_label = first_label
    for name in columns:
        label = name.split('_')[1]
        if label.isdigit():
            relabelled.append(name)
        else:
            relabelled.append(name.replace(label, str(next_label)) + label)
            next_label += 1
    return relabelled


def _run_pcmci_split(ex, outdic_actors, s, df_splits, map_proj):
    """Assemble the data of split ``s``, run PCMCI, return ``(df, df_data)``."""
    # amount of text printed by the independence test:
    verbosity = 3
    # alpha level for independence test within the pc procedure (finding
    # parents)
    pc_alpha = ex['pcA_sets'][ex['pcA_set']]
    # alpha level for multiple linear regression model while conditioning on
    # parents of parents
    alpha_level = ex['alpha_level_tig']
    print('run tigramite 4, run.pcmci')
    print('alpha level(s) for independence tests within the pc procedure'
          '(finding parents): {}'.format(pc_alpha))
    print('alpha level for multiple linear regression model while conditining on parents of '
          'parents: {}'.format(ex['alpha_level_tig']))

    # Retrieve traintest info
    traintest = df_splits
    # load Response Variable class
    RV = ex[ex['RV_name']]

    # create list with all actors, these will be merged into the fulldata
    # array
    allvar = ex['vars'][0]
    var_names_corr = []
    actorlist = []
    cols = [[RV.name]]
    for var in allvar[:]:
        print(var)
        actor = outdic_actors[var]
        if actor.ts_corr[s].size != 0:
            actorlist.append(actor.ts_corr[s].values)
            # create array which numbers the regions
            var_idx = allvar.index(var)
            n_regions = actor.ts_corr[s].shape[1]
            actor.var_info = [[i + 1, actor.ts_corr[s].columns[i], var_idx]
                              for i in range(n_regions)]
            # Array of corresponding regions (first entry will be the RV)
            var_names_corr = var_names_corr + actor.var_info
            cols.append(list(actor.ts_corr[s].columns))
            index_dates = actor.ts_corr[s].index
    var_names_corr.insert(0, RV.name)

    # stack actor time-series together:
    fulldata = np.concatenate(tuple(actorlist), axis=1)
    print('There are {} regions in total'.format(fulldata.shape[1]))
    # add the full 1D time series of interest as first entry:
    fulldata = np.column_stack((RV.RVfullts, fulldata))
    df_data = pd.DataFrame(fulldata, columns=flatten(cols), index=index_dates)

    if ex['import_prec_ts'] == True:
        var_names_full = var_names_corr.copy()
        for d in ex['precursor_ts']:
            path_data = d[1]
            if len(path_data) > 1:
                path_data = ''.join(list(path_data))
            # skip first col because it is the RV ts
            df_data_ext = func_fc.load_hdf5(
                path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_ext.dtypes == 'float64',
                                    df_data_ext.dtypes == 'float32')
            orig_cols = list(df_data_ext.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'
            cols_ext = _relabel_precursor_columns(orig_cols)
            # Select with the names the frame actually has, then rename.
            # Selecting with the relabelled names raised KeyError as soon
            # as any column had been relabelled.
            df_data_ext = df_data_ext[orig_cols]
            df_data_ext.columns = cols_ext

            to_freq = ex['tfreq']
            if to_freq != 1:
                start_end_date = (ex['sstartdate'], ex['senddate'])
                start_end_year = (ex['startyear'], ex['endyear'])
                df_data_ext = functions_pp.time_mean_bins(df_data_ext,
                                                          to_freq,
                                                          start_end_date,
                                                          start_end_year,
                                                          seldays='part')[0]

            # Expand var_names_corr
            n = var_names_full[-1][0] + 1
            add_n = n + len(cols_ext)
            n_var_idx = var_names_full[-1][-1] + 1
            for i in range(n, add_n):
                var_names_full.append([i, cols_ext[i - n], n_var_idx])
            df_data = df_data.merge(df_data_ext,
                                    left_index=True,
                                    right_index=True)
    else:
        var_names_full = var_names_corr

    split_info = traintest.loc[s]
    bool_train = split_info['TrainIsTrue']
    bool_RV_train = np.logical_and(bool_train, split_info['RV_mask'])
    dates_train = bool_train[bool_train].index
    dates_RV_train = bool_train[bool_RV_train].index

    RVfull_train = RV.RVfullts.sel(time=dates_train)
    datesfull_train = pd.to_datetime(RVfull_train.time.values)
    # `data` is extracted before the two boolean columns are added below
    data = df_data.loc[datesfull_train].values
    print(data.shape)

    # get RV datamask (same shape as data): one flag per date, repeated
    # across all columns
    # NOTE(review): True marks dates that ARE in the RV training period;
    # confirm this is the polarity pp.DataFrame expects for `mask`.
    data_mask = [d in dates_RV_train for d in datesfull_train]
    data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

    # add traintest mask to fulldata
    dates_all = pd.to_datetime(RV.RVfullts.time.values)
    dates_RV = pd.to_datetime(RV.RV_ts.time.values)
    df_data['TrainIsTrue'] = [d in datesfull_train for d in dates_all]
    df_data['RV_mask'] = [d in dates_RV for d in dates_all]

    # Initialize dataframe object (needed for tigramite functions)
    dataframe = pp.DataFrame(data=data,
                             mask=data_mask,
                             var_names=var_names_full)
    parcorr = ParCorr(significance='analytic',
                      mask_type='y',
                      verbosity=verbosity)
    # selected_variables=None: parents are estimated for all variables
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=parcorr,
                  selected_variables=None,
                  verbosity=4)

    results = pcmci.run_pcmci(tau_max=ex['tigr_tau_max'],
                              pc_alpha=pc_alpha,
                              tau_min=0,
                              max_combinations=ex['max_comb_actors'])

    # multiple testing problem: correct the p-values with 'fdr_bh'
    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                           fdr_method='fdr_bh')
    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)

    # returns all parents, not just causal precursors (of lag>0)
    sig = rgcpd.return_sign_parents(pcmci,
                                    pq_matrix=q_matrix,
                                    val_matrix=results['val_matrix'],
                                    alpha_level=alpha_level)
    all_parents = sig['parents']
    # index 0 is the response variable (it is stacked as the first column)
    links_RV = all_parents[0]

    df = rgcpd.bookkeeping_precursors(links_RV, var_names_full)
    rgcpd.print_particular_region_new(links_RV, var_names_corr, s,
                                      outdic_actors, map_proj, ex)
    return df, df_data


def run_PCMCI(ex, outdic_actors, s, df_splits, map_proj):
    """
    Run the PCMCI algorithm for one train/test split.

    Parameters
    ----------
    ex : dict
        Experiment settings. Keys read here or in the helper: 'SaveTF',
        'pcA_sets', 'pcA_set', 'alpha_level_tig', 'RV_name' (and the entry
        it names), 'vars', 'import_prec_ts', 'precursor_ts', 'tfreq',
        'sstartdate', 'senddate', 'startyear', 'endyear', 'tigr_tau_max',
        'max_comb_actors', 'fig_subpath', 'params'.
    outdic_actors : mapping
        Variable name -> actor object; ``actor.ts_corr[s]`` is the table of
        region time series for split ``s``. ``actor.var_info`` is set as a
        side effect.
    s
        Identifier of the split, used to index ``df_splits`` and
        ``ts_corr``.
    df_splits : pandas.DataFrame
        Train/test information; ``df_splits.loc[s]`` must provide the
        'TrainIsTrue' and 'RV_mask' columns.
    map_proj
        Passed through to ``rgcpd.print_particular_region_new``.

    Returns
    -------
    tuple
        ``(df, df_data)``: the output of ``rgcpd.bookkeeping_precursors``
        for the links of the response variable, and the assembled data
        table with added 'TrainIsTrue' and 'RV_mask' columns.

    Notes
    -----
    When ``ex['SaveTF']`` equals True, everything printed during the run is
    captured and written to ``s{s}_<ex['params']>.txt`` in
    ``ex['fig_subpath']``. ``sys.stdout`` is restored even when the run
    raises; the log file is only written after a successful run.
    """
    # Kept as an equality test so that exactly the values that enabled
    # capturing before (True, numpy True, 1) still do.
    save_output = ex['SaveTF'] == True
    if not save_output:
        return _run_pcmci_split(ex, outdic_actors, s, df_splits, map_proj)

    orig_stdout = sys.stdout
    # buffer print statement output
    sys.stdout = buffer = io.StringIO()
    try:
        df, df_data = _run_pcmci_split(ex, outdic_actors, s, df_splits,
                                       map_proj)
        fname = f's{s}_' + ex['params'] + '.txt'
        with open(os.path.join(ex['fig_subpath'], fname),
                  mode='w+') as logfile:
            logfile.write(buffer.getvalue())
    finally:
        # Always hand stdout back, otherwise a failure would leave every
        # later print going into the discarded buffer.
        sys.stdout = orig_stdout
        buffer.close()
    return df, df_data
# give custom NAN value for tigramite to interpret mssng = 99999 study_data = study_data.copy().fillna(mssng) dataframe = pp.DataFrame(study_data.values, var_names= study_data.columns, missing_flag = mssng) tp.plot_timeseries(dataframe) parcorr = ParCorr(significance='analytic') gpdc = GPDC(significance='analytic', gp_params=None) pcmci_gpdc = PCMCI( dataframe=dataframe, cond_ind_test=gpdc, verbosity=0) pcmci = PCMCI( dataframe=dataframe, cond_ind_test=parcorr, verbosity=1) # min_lag, max_lag = 1,6 results = pcmci.run_pcmci(tau_min = min_lag, tau_max=max_lag, pc_alpha=None) # q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'], fdr_method='fdr_bh') # pcmci.print_significant_links( p_matrix = results['p_matrix'], q_matrix = q_matrix,
def test(dataframes, max_lags=(4,), alpha=(None,), tests=('ParCorr',), limit=1):
    '''
    Run PCMCI on randomly drawn dataframes for every requested conditional
    independence test, maximum lag and alpha, and log the outcome to CSV.

    For each (test, max_lag, alpha) combination, entries are drawn at random
    until ``limit`` evaluations have succeeded for every distinct value of
    ``entry[1]``. After each successful evaluation the accumulated results
    are rewritten to 'results/performance_sample_sizes.csv'.

    Args:
        dataframes: A list of entries, each indexable as
            [complexity, sample_size, TIGRAMITE dataframe, representation].
            The list is shuffled in place.
        max_lags: Maximum numbers of lags to consider for the lagged time
            series (passed as ``tau_max``)
        alpha: Significance levels; each is passed as ``pc_alpha`` and as
            the ``alpha_level`` used to count significant parents
        tests: Names of the conditional independence tests to be performed
            ('RCOT', 'GPDC', 'ParCorr', 'CMIknn')
        limit: Number of successful evaluations per distinct sample size
            and combination

    Returns:
        None. Results are only written to the CSV file.

    Raises:
        Errors from writing the CSV file propagate. Errors raised while
        running PCMCI are printed and that draw is retried.
    '''
    if not dataframes:
        # Nothing to sample from; drawing from an empty list would fail on
        # every attempt and never terminate.
        return

    test_results = []
    # NOTE: this reorders the caller's list in place.
    random.shuffle(dataframes)

    tests_to_evaluate = []
    if 'RCOT' in tests:
        tests_to_evaluate.append(('RCOT', RCOT()))
    if 'GPDC' in tests:
        tests_to_evaluate.append(('GPDC', GPDC()))
    if 'ParCorr' in tests:
        tests_to_evaluate.append(('ParCorr', ParCorr(significance='analytic')))
    if 'CMIknn' in tests:
        tests_to_evaluate.append(('CMIknn', CMIknn()))

    # Successful evaluations are counted per distinct value of entry[1].
    sample_sizes = {entry[1] for entry in dataframes}
    columns = ['representation', 'complexity', 'sample_size', 'max_lag',
               'test', 'alpha', 'valid_links_at_alpha', 'learning_time']

    for test_name, cond_ind_test in tests_to_evaluate:
        for lag in max_lags:
            for a in alpha:
                # Fresh counters per combination. They used to be reset
                # only once per test, so every combination after the first
                # was skipped.
                counts = dict.fromkeys(sample_sizes, 0)
                while any(done < limit for done in counts.values()):
                    entry = random.choice(dataframes)
                    if counts[entry[1]] >= limit:
                        continue

                    print('evaluating: ' + str(entry[3]))
                    start = time.time()
                    try:
                        pcmci = PCMCI(dataframe=entry[2],
                                      cond_ind_test=cond_ind_test,
                                      verbosity=0)
                        pcmci.verbosity = 1
                        results = pcmci.run_pcmci(tau_max=lag, pc_alpha=a)
                        time_lapse = round(time.time() - start, 2)
                        q_matrix = pcmci.get_corrected_pvalues(
                            p_matrix=results['p_matrix'],
                            fdr_method='fdr_bh')
                        # NOTE(review): with the default alpha of None the
                        # threshold passed here is None as well - confirm
                        # the library accepts that; a failure on every
                        # draw keeps this loop retrying indefinitely.
                        parents = pcmci.return_significant_parents(
                            pq_matrix=q_matrix,
                            val_matrix=results['val_matrix'],
                            alpha_level=a)['parents']
                    except Exception as err:
                        # Best effort: report the failure and draw again.
                        # KeyboardInterrupt/SystemExit are no longer
                        # swallowed.
                        print('Hoopla!')
                        print(repr(err))
                        continue

                    valid_links = sum(len(links) for links in parents.values())
                    test_results.append([entry[3], entry[0], entry[1], lag,
                                         test_name, a, valid_links,
                                         time_lapse])
                    # Rewrite the whole file each time so partial results
                    # survive an interrupted run.
                    results_df = pd.DataFrame(test_results, columns=columns)
                    print('results ready to be saved')
                    results_df.to_csv('results/performance_sample_sizes.csv',
                                      index=False)
                    counts[entry[1]] += 1