def rcot_hyperparameter_tuning():
    """Run the hyperparameter-tuning experiment for the non-linear causal model using RCOT.

    Creates causal models with the RCOT conditional-independence test for a
    range of alphas and numbers of random Fourier transformations (num_f).
    The results are plotted as network and timeseries graphs (via
    ``test_alphas``).  The maximum lag is 1.
    """
    var_names = [
        "dayOfYear", "minuteOfYear", "minuteOfDay", "dayOfWeek", "isWeekend",
        "humidity_sensor", "temperature", "precip_intensity", "cloud_cover",
        "p1", "p2", "dew_point", "wind_speed",
    ]
    tau_min = 0
    tau_max = 1

    # Only the dataframe is used in this experiment; the accompanying
    # var_list returned by generate_dataframe is not needed.
    dataframe, _ = generate_dataframe(var_names, start_index=0, end_index=2000)
    print(f"Variable names: {var_names}")

    # Sweep num_f over powers of two: 2, 4, ..., 8192.
    num_fs = [2 ** n for n in range(1, 14)]
    for num_f in num_fs:
        ci_test = RCOT(significance='analytic', num_f=num_f)
        test_alphas(dataframe, ci_test, [0.05, 0.1, 0.2], var_names,
                    tau_min=tau_min, tau_max=tau_max)
def prior_knowledge():
    """Run the experiment incorporating prior knowledge into the non-linear causal model using RCOT.

    Creates causal models with the RCOT test for several alphas and a small
    range of random Fourier transformations (num_f).  The solution space is
    further limited through the ``selected_links`` passed to the PCMCI
    algorithm, which effectively enforces independencies in the result.  The
    results are plotted as network and timeseries graphs (via
    ``test_alphas``).  The maximum lag is 24.
    """
    var_names = [
        "dayOfYear", "minuteOfYear", "minuteOfDay", "dayOfWeek", "isWeekend",
        "humidity_sensor", "temperature", "precip_intensity", "cloud_cover",
        "p1", "p2", "dew_point", "wind_speed",
    ]
    tau_min = 0
    tau_max = 24

    # Only the dataframe is used in this experiment; the accompanying
    # var_list returned by generate_dataframe is not needed.
    dataframe, _ = generate_dataframe(var_names, start_index=0, end_index=2000)
    print(f"Variable names: {var_names}")

    # Restrict num_f to 512 and 1024 (found reasonable in the tuning run).
    num_fs = [2 ** 9, 2 ** 10]
    for num_f in num_fs:
        ci_test = RCOT(significance='analytic', num_f=num_f)
        test_alphas(dataframe, ci_test, [0.05, 0.1, 0.2], var_names,
                    tau_min=tau_min, tau_max=tau_max,
                    selected_links=generate_links_from_prior_knowledge(
                        var_names, tau_min, tau_max))
def rcot(request):
    """Return a fully parameterised RCOT conditional-independence test.

    Presumably a pytest fixture (it takes ``request``) — TODO confirm; the
    argument itself is unused.
    """
    settings = dict(
        mask_type=None,
        significance='analytic',
        fixed_thres=None,
        sig_samples=500,
        sig_blocklength=3,
        confidence='bootstrap',
        conf_lev=0.9,
        conf_samples=10000,
        conf_blocklength=1,
        num_f=25,
        approx="lpd4",
        seed=42,
    )
    return RCOT(**settings)
def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag):
    """Run PCMCI with the RCOT test and write significant links to an indexed CSV.

    Args:
        data: Iterable of records (arrives as an itertools.chain, not a
            numpy array); materialised before being handed to tigramite.
        dt: Time axis, passed to ``pp.DataFrame`` as ``datatime``.
        index: Partition index; appended to every output row and used in the
            output filename ``pcmci_para_out{index}.csv``.
        headers: Variable names aligned with the columns of ``data``.
        T_data: Unused; kept for interface compatibility with callers.
        N_data: Unused; kept for interface compatibility with callers.
        maxlag: Maximum lag (tau_max) for the PCMCI run.

    Returns:
        List of ``[effect, cause, index]`` rows for every ordered variable
        pair with p < 0.05 at some lag.
    """
    tau_max = maxlag

    print("======")
    # The input arrives as a lazy iterable, so materialise it into an array.
    data = np.array(list(data))
    print("data len is ")
    print(len(data))

    # Initialize dataframe object, specify time axis and variable names.
    dataframe = pp.DataFrame(data, datatime=dt, var_names=headers)
    print(dataframe.var_names)

    rcot = RCOT(significance='analytic')
    pcmci_rcot = PCMCI(dataframe=dataframe, cond_ind_test=rcot, verbosity=0)
    pcmci_rcot.verbosity = 1
    results = pcmci_rcot.run_pcmci(tau_max=tau_max, pc_alpha=0.05)

    # Print results.
    print("p-values")
    print(results['p_matrix'].round(3))
    print("MCI partial correlations")
    print(results['val_matrix'].round(2))

    # The code treats axis 0 of p_matrix as the cause variable, axis 1 as
    # the effect and axis 2 as the lag.  Record each pair at most once if it
    # is significant at any lag.
    result_arr = []
    for index_cause, p_row in enumerate(results['p_matrix']):
        cause = headers[index_cause]
        for index_effect, p_lags in enumerate(p_row):
            effect = headers[index_effect]
            if cause != effect and any(p < 0.05 for p in p_lags):
                result_arr.append([effect, cause, index])
                print("{} caused by {}".format(effect, cause))

    with open("pcmci_para_out{}.csv".format(index), "w", newline='') as f:
        for row in result_arr:
            f.write("%s\n" % ','.join(str(col) for col in row))

    return result_arr
def test(dataframes, max_lags=(4,), alpha=(None,), tests=('ParCorr',), limit=1):
    """Run the PCMCI algorithm over all given dataframes for each hyper-parameter combination.

    Args:
        dataframes: List of entries unpacked positionally below as
            ``[complexity-or-id, sample-bucket, tigramite dataframe,
            representation]`` — assumed from the indexing ``i[0]..i[3]``;
            TODO confirm against the caller.
        max_lags: Maximum numbers of lags to consider for the lagged series.
        alpha: Significance levels for the parent test.
        tests: Conditional-independence tests to run; any of
            'RCOT', 'GPDC', 'ParCorr', 'CMIknn'.
        limit: Number of instances per complexity bucket to evaluate.

    Returns:
        None.  Intermediate results are written to
        'results/performance_sample_sizes.csv' after every evaluation.
    """
    # NOTE: defaults were mutable lists ([4], [None], ['ParCorr']); tuples
    # preserve behaviour (they are only iterated / membership-tested) while
    # avoiding the shared-mutable-default pitfall.
    test_results = []
    random.shuffle(dataframes)

    tests_to_evaluate = []
    if 'RCOT' in tests:
        tests_to_evaluate.append(['RCOT', RCOT()])
    if 'GPDC' in tests:
        tests_to_evaluate.append(['GPDC', GPDC()])
    if 'ParCorr' in tests:
        tests_to_evaluate.append(['ParCorr', ParCorr(significance='analytic')])
    if 'CMIknn' in tests:
        tests_to_evaluate.append(['CMIknn', CMIknn()])

    unique_complexities = list(set(l[1] for l in dataframes))
    counts = {c: 0 for c in unique_complexities}

    # Loop variable renamed from 'test' (it shadowed this function's name).
    for ci_name, ci_test in tests_to_evaluate:
        for l in max_lags:
            for a in alpha:
                # BUG FIX: 'stop' was only initialised once per CI test, so
                # after the first (lag, alpha) combination completed every
                # remaining combination was skipped.  Reset it here.
                stop = False
                while not stop:
                    try:
                        i = random.sample(dataframes, 1)[0]
                        if counts[i[1]] < limit:
                            print('evaluating: ' + str(i[3]))
                            start = time.time()
                            pcmci = PCMCI(dataframe=i[2],
                                          cond_ind_test=ci_test,
                                          verbosity=0)
                            pcmci.verbosity = 1
                            results = pcmci.run_pcmci(tau_max=l, pc_alpha=a)
                            time_lapse = round(time.time() - start, 2)
                            q_matrix = pcmci.get_corrected_pvalues(
                                p_matrix=results['p_matrix'],
                                fdr_method='fdr_bh')
                            valid_parents = list(pcmci.return_significant_parents(
                                pq_matrix=q_matrix,
                                val_matrix=results['val_matrix'],
                                alpha_level=a)['parents'].values())
                            # Count all significant links across variables.
                            valid_links = sum(len(parents)
                                              for parents in valid_parents)
                            test_results.append([i[3], i[0], i[1], l,
                                                 ci_name, a, valid_links,
                                                 time_lapse])
                            results_df = pd.DataFrame(
                                test_results,
                                columns=['representation', 'complexity',
                                         'sample_size', 'max_lag', 'test',
                                         'alpha', 'valid_links_at_alpha',
                                         'learning_time'])
                            print('results ready to be saved')
                            results_df.to_csv(
                                'results/performance_sample_sizes.csv',
                                index=False)
                            counts[i[1]] += 1
                        if all(value == limit for value in counts.values()):
                            stop = True
                    except Exception:
                        # Was a bare 'except: pass', which also swallowed
                        # KeyboardInterrupt/SystemExit; narrowed to Exception.
                        print('Hoopla!')
                # Reset the per-complexity counters for the next combination.
                for i in unique_complexities:
                    counts[i] = 0
def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag):
    """Run baseline PCMCI with the RCOT test and write significant links to a CSV.

    NOTE(review): this file defines a second ``pcmci_causality`` above with
    the same name; if both live in one module the later definition shadows
    the earlier — confirm they belong to separate scripts.

    Args:
        data: Input records passed straight to ``pp.DataFrame`` (unlike the
            parallel variant, no numpy conversion is done here).
        dt: Time axis, passed to ``pp.DataFrame`` as ``datatime``.
        index: Tag appended to every output row.
        headers: Variable names aligned with the columns of ``data``.
        T_data: Unused; kept for interface compatibility with callers.
        N_data: Unused; kept for interface compatibility with callers.
        maxlag: Maximum lag (tau_max) for the PCMCI run.

    Returns:
        List of ``[effect, cause, index]`` rows for every ordered variable
        pair with p < 0.05 at some lag; also written to
        ``pcmci_baseline_out.csv``.
    """
    # There is another tau_max in lagged dependencies that might be longer.
    tau_max = maxlag

    print("======")

    # Initialize dataframe object, specify time axis and variable names.
    dataframe = pp.DataFrame(data, datatime=dt, var_names=headers)
    print(dataframe.var_names)

    rcot = RCOT(significance='analytic')
    pcmci_rcot = PCMCI(dataframe=dataframe, cond_ind_test=rcot, verbosity=0)
    pcmci_rcot.verbosity = 1
    results = pcmci_rcot.run_pcmci(tau_max=tau_max, pc_alpha=0.05)

    # Print results.
    print("p-values")
    print(results['p_matrix'].round(3))
    print("MCI partial correlations")
    print(results['val_matrix'].round(2))

    # The code treats axis 0 of p_matrix as the cause variable, axis 1 as
    # the effect and axis 2 as the lag.  Record each pair at most once if it
    # is significant at any lag.
    result_arr = []
    for index_cause, p_row in enumerate(results['p_matrix']):
        cause = headers[index_cause]
        for index_effect, p_lags in enumerate(p_row):
            effect = headers[index_effect]
            if cause != effect and any(p < 0.05 for p in p_lags):
                result_arr.append([effect, cause, index])
                print("{} caused by {}".format(effect, cause))

    with open("pcmci_baseline_out.csv", "w", newline='') as f:
        for row in result_arr:
            f.write("%s\n" % ','.join(str(col) for col in row))

    print(result_arr)
    return result_arr