val_matrix = results['val_matrix'] with open("val-values.csv", "w") as csv_file: writer = csv.writer(csv_file, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL) #[[[1 2 3]]] Three brackets to get through. for sector in val_matrix: print("sector: ", sector) for row in sector: print("row: ", row) writer.writerow(row) writer.writerow([]) q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'], fdr_method='fdr_bh') pcmci.print_significant_links(p_matrix=results['p_matrix'], q_matrix=q_matrix, val_matrix=results['val_matrix'], alpha_level=0.01) link_matrix = pcmci.return_significant_parents( pq_matrix=q_matrix, val_matrix=results['val_matrix'], alpha_level=0.01)['link_matrix'] graph = tp.plot_graph( val_matrix=results['val_matrix'], link_matrix=link_matrix, var_names=headers, link_colorbar_label='cross-MCI', node_colorbar_label='auto-MCI',
def run_PCMCI(ex, outdic_actors, s, df_splits, map_proj):
    """Run tigramite's PCMCI for train/test split ``s`` and book-keep the links.

    When ``ex['SaveTF']`` is true, everything printed while the algorithm runs
    is captured and written to ``<ex['fig_subpath']>/s<s>_<ex['params']>.txt``.
    ``sys.stdout`` is always restored, and the captured text is written, even
    when the computation raises.

    The former Python 2 branch (writing to ``old.txt``) was dropped: this
    function uses f-strings, so it can only be compiled by Python 3 anyway.

    Parameters
    ----------
    ex : dict
        Experiment settings. Keys read here: 'SaveTF', 'fig_subpath', 'params',
        'pcA_sets', 'pcA_set', 'alpha_level_tig', 'RV_name' (and the entry it
        names), 'vars', 'import_prec_ts', 'precursor_ts', 'tfreq',
        'sstartdate', 'senddate', 'startyear', 'endyear', 'tigr_tau_max',
        'max_comb_actors'.
    outdic_actors : mapping
        Maps each name in ``ex['vars'][0]`` to an actor object whose
        ``ts_corr[s]`` exposes ``.size``, ``.values``, ``.shape``, ``.columns``
        and ``.index``. Each used actor gets a ``var_info`` attribute assigned.
    s : hashable
        Key of the split; used to index ``ts_corr`` and ``df_splits.loc``.
    df_splits : pandas object
        ``df_splits.loc[s]`` must provide 'TrainIsTrue' and 'RV_mask'.
    map_proj : object
        Passed through unchanged to ``rgcpd.print_particular_region_new``.

    Returns
    -------
    tuple
        ``(df, df_data)``: the result of ``rgcpd.bookkeeping_precursors`` and
        the assembled data frame including 'TrainIsTrue' and 'RV_mask' columns.
    """
    # `== True` is kept on purpose: only True / 1 switch capturing on, as before.
    if not (ex['SaveTF'] == True):  # noqa: E712
        return _run_PCMCI_split(ex, outdic_actors, s, df_splits, map_proj)

    orig_stdout = sys.stdout
    # buffer print statement output
    sys.stdout = log_buffer = io.StringIO()
    try:
        return _run_PCMCI_split(ex, outdic_actors, s, df_splits, map_proj)
    finally:
        # Restore stdout first so nothing below can leave it hijacked.
        sys.stdout = orig_stdout
        fname = f's{s}_' + ex['params'] + '.txt'
        log_path = os.path.join(ex['fig_subpath'], fname)
        with io.open(log_path, mode='w+') as log_file:
            log_file.write(log_buffer.getvalue())
        log_buffer.close()


def _run_PCMCI_split(ex, outdic_actors, s, df_splits, map_proj):
    """Assemble the data matrix for split ``s`` and run PCMCI (no stdout capture)."""
    # amount of text printed by the independence test
    verbosity = 3
    # alpha level for independence test within the pc procedure (finding parents)
    pc_alpha = ex['pcA_sets'][ex['pcA_set']]
    # alpha level used when selecting significant links
    alpha_level = ex['alpha_level_tig']
    print('run tigramite 4, run.pcmci')
    print(('alpha level(s) for independence tests within the pc procedure'
           '(finding parents): {}'.format(pc_alpha)))
    print(('alpha level for multiple linear regression model while conditining on parents of '
           'parents: {}'.format(alpha_level)))

    # Retrieve traintest info
    traintest = df_splits
    # load Response Variable object
    RV = ex[ex['RV_name']]

    # create list with all actors, these will be merged into the fulldata array
    allvar = ex['vars'][0]
    var_names_corr = []
    actorlist = []
    cols = [[RV.name]]
    for var in allvar:
        print(var)
        actor = outdic_actors[var]
        ts_corr = actor.ts_corr[s]
        if ts_corr.size != 0:
            actorlist.append(ts_corr.values)
            # number the regions: [1-based region number, column label, variable index]
            var_idx = allvar.index(var)
            n_regions = ts_corr.shape[1]
            actor.var_info = [[i + 1, ts_corr.columns[i], var_idx]
                              for i in range(n_regions)]
            # regions in the same order as the data columns (RV is prepended below)
            var_names_corr = var_names_corr + actor.var_info
            cols.append(list(ts_corr.columns))
            index_dates = ts_corr.index
    var_names_corr.insert(0, RV.name)

    if not actorlist:
        # np.concatenate would raise an opaque ValueError on an empty tuple.
        raise ValueError(
            'no actor has a non-empty ts_corr for split {}'.format(s))

    # stack actor time-series together:
    fulldata = np.concatenate(tuple(actorlist), axis=1)
    print(('There are {} regions in total'.format(fulldata.shape[1])))
    # add the full 1D time series of interest as first entry:
    fulldata = np.column_stack((RV.RVfullts, fulldata))
    df_data = pd.DataFrame(fulldata, columns=flatten(cols), index=index_dates)

    if ex['import_prec_ts'] == True:  # noqa: E712 (same truth test as before)
        var_names_full = var_names_corr.copy()
        for d in ex['precursor_ts']:
            path_data = d[1]
            if len(path_data) > 1:
                path_data = ''.join(list(path_data))
            # skip first col because it is the RV ts
            df_data_ext = func_fc.load_hdf5(
                path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_ext.dtypes == 'float64',
                                    df_data_ext.dtypes == 'float32')
            cols_ext = list(df_data_ext.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'
            lab_int = 100
            for i, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if not char.isdigit():
                    cols_ext[i] = c.replace(char, str(lab_int)) + char
                    lab_int += 1
            # NOTE(review): labels rewritten above are not applied to
            # df_data_ext.columns, so this selection looks like it raises
            # KeyError whenever a label was rewritten -- confirm intent.
            df_data_ext = df_data_ext[cols_ext]
            to_freq = ex['tfreq']
            if to_freq != 1:
                start_end_date = (ex['sstartdate'], ex['senddate'])
                start_end_year = (ex['startyear'], ex['endyear'])
                df_data_ext = functions_pp.time_mean_bins(df_data_ext,
                                                          to_freq,
                                                          start_end_date,
                                                          start_end_year,
                                                          seldays='part')[0]
            # Expand var_names_corr
            n = var_names_full[-1][0] + 1
            add_n = n + len(cols_ext)
            n_var_idx = var_names_full[-1][-1] + 1
            for i in range(n, add_n):
                var_names_full.append([i, cols_ext[i - n], n_var_idx])
            df_data = df_data.merge(df_data_ext, left_index=True,
                                    right_index=True)
    else:
        var_names_full = var_names_corr

    bool_train = traintest.loc[s]['TrainIsTrue']
    bool_RV_train = np.logical_and(bool_train, traintest.loc[s]['RV_mask'])
    dates_train = traintest.loc[s]['TrainIsTrue'][bool_train].index
    dates_RV_train = traintest.loc[s]['TrainIsTrue'][bool_RV_train].index

    RVfull_train = RV.RVfullts.sel(time=dates_train)
    datesfull_train = pd.to_datetime(RVfull_train.time.values)
    data = df_data.loc[datesfull_train].values
    print(data.shape)

    # get RV datamask (same shape as data): one flag per date, repeated per column
    data_mask = [d in dates_RV_train for d in datesfull_train]
    data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

    # add traintest mask to fulldata
    dates_all = pd.to_datetime(RV.RVfullts.time.values)
    dates_RV = pd.to_datetime(RV.RV_ts.time.values)
    df_data['TrainIsTrue'] = [d in datesfull_train for d in dates_all]
    df_data['RV_mask'] = [d in dates_RV for d in dates_all]

    # Initialize dataframe object (needed for tigramite functions)
    dataframe = pp.DataFrame(data=data, mask=data_mask,
                             var_names=var_names_full)

    parcorr = ParCorr(significance='analytic',
                      mask_type='y',
                      verbosity=verbosity)

    # selected_variables=None: parents are estimated for all variables
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=parcorr,
                  selected_variables=None,
                  verbosity=4)

    results = pcmci.run_pcmci(tau_max=ex['tigr_tau_max'],
                              pc_alpha=pc_alpha,
                              tau_min=0,
                              max_combinations=ex['max_comb_actors'])

    # multiple testing problem: corrected p-values
    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                           fdr_method='fdr_bh')

    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)

    # returns all parents, not just causal precursors (of lag>0)
    sig = rgcpd.return_sign_parents(pcmci,
                                    pq_matrix=q_matrix,
                                    val_matrix=results['val_matrix'],
                                    alpha_level=alpha_level)

    all_parents = sig['parents']
    # parents of the first variable, i.e. the RV column
    links_RV = all_parents[0]

    df = rgcpd.bookkeeping_precursors(links_RV, var_names_full)
    rgcpd.print_particular_region_new(links_RV, var_names_corr, s,
                                      outdic_actors, map_proj, ex)
    return df, df_data
# Script cell: set up PCMCI with the RCOT2 independence test and correct p-values.
# NOTE(review): `data`, `parameters`, `p_matrix` and `val_matrix` are not defined
# in this fragment; presumably they come from earlier cells -- confirm.
T, N = data.shape

# Initialize dataframe object
dataframe = pp.DataFrame(data)

#%%
rcot = RCOT2(significance=parameters['cond_ind_test.significance'],
             num_f=parameters['cond_ind_test.num_f'])

pcmci = PCMCI(dataframe,
              cond_ind_test=rcot,
              selected_variables=parameters['selected_variables'],
              var_names=parameters['var_names'],
              verbosity=10)

# corrected p-values with two different fdr_method settings
q_matrix = pcmci.get_corrected_pvalues(p_matrix=p_matrix,
                                       fdr_method='fdr_bh')
q_matrix_tsbh = pcmci.get_corrected_pvalues(p_matrix=p_matrix,
                                            fdr_method='fdr_tsbh')

#%% print results
pcmci._print_significant_links(p_matrix=p_matrix,
                               q_matrix=q_matrix,
                               val_matrix=val_matrix,
                               alpha_level=0.1)

#%% get selected parents and fit linear model
q_0 = 0.05
parameters['q_0'] = q_0
parameters['q_matrix'] = q_matrix
def test(dataframes,max_lags=[4],alpha=[None],tests=['ParCorr'],limit=1): ''' This function performs the PCMCI algorithm for all the dataframes received as parameters, given the hyper-parameters of the conditional independence test Args: dataframes: A list of TIGRAMITE dataframes max_lags: Maximum number of lags to consider for the laggd time series alpha: Significance level to perform the parent test tests: A list of conditional independence test to be performed limit: A limit for the instances to be considered Returns: ''' test_results = [] random.shuffle(dataframes) total = limit*len(max_lags)*len(alpha)*len(tests) data_frame_iter = iter(dataframes) tests_to_evaluate=[] if 'RCOT' in tests: rcot = RCOT() tests_to_evaluate.append(['RCOT',rcot]) if 'GPDC' in tests: gpdc = GPDC() tests_to_evaluate.append(['GPDC', gpdc]) if 'ParCorr' in tests: parcorr = ParCorr(significance='analytic') tests_to_evaluate.append(['ParCorr',parcorr]) if 'CMIknn' in tests: cmiknn = CMIknn() tests_to_evaluate.append(['CMIknn',cmiknn]) unique_complexities = list(set(l[1] for l in dataframes)) counts = {} for i in unique_complexities: counts[i] = 0 for test in tests_to_evaluate: stop = False for l in max_lags: for a in alpha: while not stop: try: i = random.sample(dataframes,1)[0] if counts[i[1]] < limit: print('evaluating: ' + str(i[3])) start = time.time() pcmci = PCMCI( dataframe=i[2], cond_ind_test=test[1], verbosity=0) # correlations = pcmci.get_lagged_dependencies(tau_max=20) pcmci.verbosity = 1 results = pcmci.run_pcmci(tau_max=l, pc_alpha=a) time_lapse = round(time.time() - start, 2) q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'], fdr_method='fdr_bh') valid_parents = list(pcmci.return_significant_parents(pq_matrix=q_matrix, val_matrix=results['val_matrix'], alpha_level=a)['parents'].values()) flat_list = [] for sublist in valid_parents: for item in sublist: flat_list.append(item) valid_links = len(flat_list) test_results.append([i[3], i[0], i[1], 
l,test[0],a,valid_links,time_lapse]) results_df = pd.DataFrame(test_results, columns=['representation', 'complexity', 'sample_size', 'max_lag','test','alpha','valid_links_at_alpha', 'learning_time']) print('results ready to be saved') results_df.to_csv( 'results/performance_sample_sizes.csv', index=False) counts[i[1]] += 1 if all(value == limit for value in counts.values()): stop = True except: print('Hoopla!') pass for i in unique_complexities: counts[i] = 0
def run_pcmci(data, data_mask, var_names, path_outsub2, s, tau_min=0, tau_max=1,
              pc_alpha=None, alpha_level=0.05, max_conds_dim=4,
              max_combinations=1, max_conds_py=None, max_conds_px=None,
              verbosity=4):
    """Run tigramite's PCMCI with a ParCorr test on ``data``.

    Unless ``path_outsub2`` is ``False``, everything printed during the run is
    captured and written to ``<path_outsub2>/split_<s>_PCMCI_out.txt``.
    ``sys.stdout`` is always restored, and the captured text is written, even
    when the computation raises.

    Parameters
    ----------
    data : array-like with a 2-tuple ``.shape``
        Passed to ``pp.DataFrame`` as ``data``.
    data_mask : array-like
        Passed to ``pp.DataFrame`` as ``mask``.
    var_names : list
        Passed to ``pp.DataFrame`` as ``var_names``.
    path_outsub2 : str or False
        Directory for the log file; ``False`` disables capturing.
    s : object
        Split identifier, only used in the log file name.
    tau_min, tau_max, pc_alpha, max_conds_dim, max_combinations,
    max_conds_py, max_conds_px :
        Forwarded unchanged to ``PCMCI.run_pcmci``.
    alpha_level : float
        Forwarded to ``PCMCI.print_significant_links``.
    verbosity : int
        Forwarded to both ``ParCorr`` and ``PCMCI``.

    Returns
    -------
    tuple
        ``(pcmci, q_matrix, results)``: the PCMCI object, the corrected
        p-values (``fdr_method='fdr_bh'``) and the ``run_pcmci`` output.
    """
    capture = path_outsub2 is not False
    if capture:
        txt_fname = os.path.join(path_outsub2, f'split_{s}_PCMCI_out.txt')
        orig_stdout = sys.stdout
        # buffer print statement output
        sys.stdout = log_buffer = io.StringIO()
    try:
        # Unpacking raises early when data does not have exactly two dimensions.
        _n_time, _n_vars = data.shape

        # Initialize dataframe object (needed for tigramite functions)
        dataframe = pp.DataFrame(data=data, mask=data_mask,
                                 var_names=var_names)

        parcorr = ParCorr(significance='analytic',
                          mask_type='y',
                          verbosity=verbosity)

        # selected_variables=None: parents are estimated for all variables
        pcmci = PCMCI(dataframe=dataframe,
                      cond_ind_test=parcorr,
                      selected_variables=None,
                      verbosity=verbosity)

        results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha=pc_alpha,
                                  tau_min=tau_min,
                                  max_conds_dim=max_conds_dim,
                                  max_combinations=max_combinations,
                                  max_conds_px=max_conds_px,
                                  max_conds_py=max_conds_py)

        # multiple testing problem: corrected p-values
        q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                               fdr_method='fdr_bh')

        pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                      q_matrix=q_matrix,
                                      val_matrix=results['val_matrix'],
                                      alpha_level=alpha_level)
    finally:
        if capture:
            # Restore stdout first so nothing below can leave it hijacked.
            sys.stdout = orig_stdout
            with io.open(txt_fname, mode='w+') as log_file:
                log_file.write(log_buffer.getvalue())
            log_buffer.close()
    return pcmci, q_matrix, results
def caus_gpdc(data, var_names):
    """Run PCMCI with the GPDC independence test on ``data`` and plot the result.

    Rows whose position within a 72-row cycle is below 30 or above 47 are
    flagged in the mask for columns 0, 1, 2, 9, 10 and 11; all other mask
    entries stay 0. The mask row is built for ``len(data)`` rows instead of
    the previously hard-coded 68904, so shorter inputs no longer raise
    ``IndexError`` and longer inputs are masked over their full length.
    Results are unchanged for inputs of exactly 68904 rows.

    Side effects: generates GPDC null distributions for sample sizes 495-500,
    saves them to 'dc_nulldists.npz' in the working directory, prints the
    significant links and draws a time series graph.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with at least 12 columns (columns up to index 11 are
        written in the mask).
    var_names : list
        Variable names handed to ``PCMCI`` and to the plot.

    Returns
    -------
    tuple
        ``(results, link_matrix)``: the ``run_pcmci`` output and the
        'link_matrix' entry of the significant-parents result at
        ``alpha_level=0.01``.
    """
    import numpy as np
    from tigramite import data_processing as pp
    from tigramite import plotting as tp
    from tigramite.pcmci import PCMCI
    from tigramite.independence_tests import GPDC

    # 1.0 where the position inside each 72-row cycle is < 30 or > 47.
    phase = np.arange(len(data)) % 72
    data_mask_row = ((phase < 30) | (phase > 47)).astype(float)

    data_mask = np.zeros(data.shape)
    for col in (0, 1, 2, 9, 10, 11):
        data_mask[:, col] = data_mask_row

    dataframe = pp.DataFrame(data, mask=data_mask)

    gpdc = GPDC(significance='analytic',
                gp_params=None,
                use_mask=True,
                mask_type='y')
    gpdc.generate_and_save_nulldists(sample_sizes=range(495, 501),
                                     null_dist_filename='dc_nulldists.npz')
    gpdc.null_dist_filename = 'dc_nulldists.npz'

    pcmci_gpdc = PCMCI(dataframe=dataframe,
                       cond_ind_test=gpdc,
                       var_names=var_names,
                       verbosity=1)

    results = pcmci_gpdc.run_pcmci(tau_max=6, tau_min=1, pc_alpha=0.01)

    q_matrix = pcmci_gpdc.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                                fdr_method='fdr_bh')

    pcmci_gpdc._print_significant_links(p_matrix=results['p_matrix'],
                                        q_matrix=q_matrix,
                                        val_matrix=results['val_matrix'],
                                        alpha_level=0.01)

    link_matrix = pcmci_gpdc._return_significant_parents(
        pq_matrix=q_matrix,
        val_matrix=results['val_matrix'],
        alpha_level=0.01)['link_matrix']

    tp.plot_time_series_graph(
        val_matrix=results['val_matrix'],
        link_matrix=link_matrix,
        var_names=var_names,
        link_colorbar_label='MCI',
    )
    return results, link_matrix