Пример #1
0
def rcot_hyperparameter_tuning():
    """Sweep the RCOT hyperparameters of the non-linear causal model.

    For every number of random Fourier features ``num_f`` in
    ``2**1 ... 2**13`` an ``RCOT`` test with analytic significance is built
    and passed to ``test_alphas`` together with the alphas 0.05, 0.1 and 0.2.
    Lags range from ``tau_min = 0`` to ``tau_max = 1``.

    NOTE(review): the network / time-series plots are presumably produced
    inside ``test_alphas`` -- confirm there.
    """
    var_names = [
        "dayOfYear", "minuteOfYear", "minuteOfDay", "dayOfWeek", "isWeekend",
        "humidity_sensor", "temperature", "precip_intensity", "cloud_cover",
        "p1", "p2", "dew_point", "wind_speed"
    ]
    alphas = (0.05, 0.1, 0.2)
    min_lag, max_lag = 0, 1

    # The second return value of generate_dataframe is not needed here.
    dataframe, _ = generate_dataframe(var_names, start_index=0, end_index=2000)
    print(f"Variable names: {var_names}")

    feature_counts = [2**exponent for exponent in range(1, 14)]
    for feature_count in feature_counts:
        ci_test = RCOT(significance='analytic', num_f=feature_count)
        # A fresh alpha list per call, exactly like the original literal.
        test_alphas(dataframe,
                    ci_test,
                    list(alphas),
                    var_names,
                    tau_min=min_lag,
                    tau_max=max_lag)
def prior_knowledge():
    """Run the RCOT experiment with prior knowledge restricting the links.

    Builds an ``RCOT`` test with analytic significance for ``num_f`` in
    ``2**9`` and ``2**10`` and passes each one to ``test_alphas`` with the
    alphas 0.05, 0.1 and 0.2. The candidate links are restricted through the
    ``selected_links`` argument, whose value comes from
    ``generate_links_from_prior_knowledge``. Lags range from ``tau_min = 0``
    to ``tau_max = 24``.

    NOTE(review): the network / time-series plots are presumably produced
    inside ``test_alphas`` -- confirm there.
    """
    var_names = [
        "dayOfYear", "minuteOfYear", "minuteOfDay", "dayOfWeek", "isWeekend",
        "humidity_sensor", "temperature", "precip_intensity", "cloud_cover",
        "p1", "p2", "dew_point", "wind_speed"
    ]
    alphas = (0.05, 0.1, 0.2)
    min_lag, max_lag = 0, 24

    # The second return value of generate_dataframe is not needed here.
    dataframe, _ = generate_dataframe(var_names, start_index=0, end_index=2000)
    print(f"Variable names: {var_names}")

    for feature_count in (2**9, 2**10):
        ci_test = RCOT(significance='analytic', num_f=feature_count)
        # Regenerated on every iteration (after the test object is built),
        # so each run receives its own link structure as before.
        allowed_links = generate_links_from_prior_knowledge(
            var_names, min_lag, max_lag)
        test_alphas(dataframe,
                    ci_test,
                    list(alphas),
                    var_names,
                    tau_min=min_lag,
                    tau_max=max_lag,
                    selected_links=allowed_links)
Пример #3
0
def rcot(request):
    """Return an ``RCOT`` conditional independence test with fixed settings.

    ``request`` is accepted but never read.

    NOTE(review): the unused ``request`` argument suggests this is a pytest
    fixture whose decorator is not visible here -- confirm.
    """
    settings = {
        "mask_type": None,
        "significance": 'analytic',
        "fixed_thres": None,
        "sig_samples": 500,
        "sig_blocklength": 3,
        "confidence": 'bootstrap',
        "conf_lev": 0.9,
        "conf_samples": 10000,
        "conf_blocklength": 1,
        "num_f": 25,
        "approx": "lpd4",
        "seed": 42,
    }
    return RCOT(**settings)
Пример #4
0
def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag):
    """Run PCMCI with an RCOT test on ``data`` and export the significant links.

    Args:
        data: Iterable of records; it is materialised into a NumPy array
            before being wrapped in a tigramite dataframe.
        dt: Time axis, passed to ``pp.DataFrame`` as ``datatime``.
        index: Identifier appended to every result row and inserted into the
            output file name ``pcmci_para_out{index}.csv``.
        headers: Variable names; also used to label the first two axes of the
            p-value matrix.
        T_data: Unused; kept so existing callers keep working.
        N_data: Unused; kept so existing callers keep working.
        maxlag: Maximum time lag, passed to ``run_pcmci`` as ``tau_max``.

    Returns:
        list: ``[effect, cause, index]`` rows, one for every ordered pair of
        differently named variables whose p-value is below 0.05 at any lag.
    """
    # One significance level serves both as pc_alpha and as the reporting
    # threshold for links.
    alpha_level = 0.05

    print("======")
    # The caller may hand over a lazy iterable (e.g. itertools.chain), so it
    # has to be materialised before tigramite can use it.
    data = np.array(list(data))
    print("data len is ")
    print(len(data))

    # Initialize dataframe object, specify time axis and variable names
    dataframe = pp.DataFrame(data, datatime=dt, var_names=headers)
    print(dataframe.var_names)
    rcot = RCOT(significance='analytic')
    pcmci_rcot = PCMCI(dataframe=dataframe, cond_ind_test=rcot, verbosity=0)

    pcmci_rcot.verbosity = 1
    results = pcmci_rcot.run_pcmci(tau_max=maxlag, pc_alpha=alpha_level)

    print("p-values")
    print(results['p_matrix'].round(3))
    print("MCI partial correlations")
    print(results['val_matrix'].round(2))

    # Collect the edges: the first axis of the p-value matrix is treated as
    # the cause, the second as the effect and the third as the lags.
    result_arr = []
    for index_cause, item in enumerate(results['p_matrix']):
        print("index is")
        print(index)
        print("item is")
        print(item)
        print("cause is")
        cause = headers[index_cause]
        print(cause)
        for index_effect, arr in enumerate(item):
            print("effect arr is ")
            print(arr)
            print("effect name is")
            effect = headers[index_effect]
            print(effect)
            # One row per pair is enough, no matter how many lags qualify.
            if cause != effect and any(p < alpha_level for p in arr):
                result_arr.append([effect, cause, index])
                print("{} caused by {}".format(effect, cause))

    # Written once after all pairs were inspected; the earlier version
    # truncated and rewrote the same file after every cause variable.
    with open("pcmci_para_out{}.csv".format(index), "w", newline='') as f:
        f.writelines(
            "%s\n" % ','.join(str(col) for col in row) for row in result_arr)
    return result_arr
def test(dataframes,max_lags=[4],alpha=[None],tests=['ParCorr'],limit=1):
    """Run PCMCI on randomly drawn dataframes for every test / lag / alpha combination.

    For each requested conditional independence test, each maximum lag and
    each alpha, entries are drawn at random from ``dataframes`` until
    ``limit`` entries per distinct ``entry[1]`` value have been evaluated.
    After every evaluation the accumulated results are written to
    ``results/performance_sample_sizes.csv``.

    Args:
        dataframes: A list of indexable entries. ``entry[2]`` is the
            TIGRAMITE dataframe; ``entry[3]``, ``entry[0]`` and ``entry[1]``
            are stored in the result columns ``representation``,
            ``complexity`` and ``sample_size``. The list is shuffled in place.
        max_lags: Maximum lags (``tau_max``) to evaluate.
        alpha: Significance levels, used both as ``pc_alpha`` and as
            ``alpha_level`` for selecting significant parents.
        tests: Names of the conditional independence tests to run; the
            recognised names are 'RCOT', 'GPDC', 'ParCorr' and 'CMIknn'.
        limit: Number of evaluations per distinct ``entry[1]`` value and
            per combination.

    Returns:
        None. Results are only written to the CSV file.
    """
    # Without data or with a non-positive quota the sampling loop below could
    # never reach its stop condition, so there is nothing to do.
    if len(dataframes) == 0 or limit <= 0:
        return

    # Kept from the original behaviour; note that this reorders the caller's
    # list in place.
    random.shuffle(dataframes)

    tests_to_evaluate = []
    if 'RCOT' in tests:
        tests_to_evaluate.append(['RCOT', RCOT()])
    if 'GPDC' in tests:
        tests_to_evaluate.append(['GPDC', GPDC()])
    if 'ParCorr' in tests:
        tests_to_evaluate.append(['ParCorr', ParCorr(significance='analytic')])
    if 'CMIknn' in tests:
        tests_to_evaluate.append(['CMIknn', CMIknn()])

    # Evaluations are balanced over the distinct entry[1] values.
    group_keys = {entry[1] for entry in dataframes}

    # A persistent error (bad alpha, missing output directory, ...) must not
    # keep the retry loop spinning forever.
    max_consecutive_failures = 10

    test_results = []
    for test_name, ci_test in tests_to_evaluate:
        for lag in max_lags:
            for a in alpha:
                # Fresh quota for every combination. The earlier version kept
                # its stop flag set after the first combination, so all later
                # lags and alphas were silently skipped.
                counts = dict.fromkeys(group_keys, 0)
                failures = 0
                while any(done < limit for done in counts.values()):
                    entry = random.sample(dataframes, 1)[0]
                    if counts[entry[1]] >= limit:
                        # This group already met its quota; draw again.
                        continue
                    try:
                        print('evaluating: ' + str(entry[3]))
                        start = time.time()
                        pcmci = PCMCI(
                            dataframe=entry[2],
                            cond_ind_test=ci_test,
                            verbosity=0)
                        pcmci.verbosity = 1
                        results = pcmci.run_pcmci(tau_max=lag, pc_alpha=a)
                        time_lapse = round(time.time() - start, 2)

                        q_matrix = pcmci.get_corrected_pvalues(
                            p_matrix=results['p_matrix'], fdr_method='fdr_bh')
                        parents = pcmci.return_significant_parents(
                            pq_matrix=q_matrix,
                            val_matrix=results['val_matrix'],
                            alpha_level=a)['parents']
                        valid_links = sum(
                            len(links) for links in parents.values())

                        test_results.append(
                            [entry[3], entry[0], entry[1], lag, test_name, a,
                             valid_links, time_lapse])
                        # Count the run before saving so that a failed save
                        # does not trigger a duplicate evaluation.
                        counts[entry[1]] += 1

                        results_df = pd.DataFrame(
                            test_results,
                            columns=['representation', 'complexity',
                                     'sample_size', 'max_lag', 'test', 'alpha',
                                     'valid_links_at_alpha', 'learning_time'])
                        print('results ready to be saved')
                        results_df.to_csv(
                            'results/performance_sample_sizes.csv',
                            index=False)
                    except Exception as exc:
                        # Still best effort, but the cause is reported and
                        # the number of retries is bounded.
                        failures += 1
                        print('Hoopla! ' + repr(exc))
                        if failures >= max_consecutive_failures:
                            print('giving up on {} (max_lag={}, alpha={})'
                                  .format(test_name, lag, a))
                            break
                    else:
                        failures = 0
Пример #6
0
def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag):
    """Run PCMCI with an RCOT test and write the significant links to disk.

    Args:
        data: Observations, passed unchanged to ``pp.DataFrame``.
        dt: Time axis, passed to ``pp.DataFrame`` as ``datatime``.
        index: Identifier appended to every result row.
        headers: Variable names; also used to label the first two axes of the
            p-value matrix.
        T_data: Unused.
        N_data: Unused.
        maxlag: Maximum time lag, passed to ``run_pcmci`` as ``tau_max``.

    Returns:
        list: ``[effect, cause, index]`` rows, one for every ordered pair of
        differently named variables whose p-value is below 0.05 at any lag.
        The same rows are written to ``pcmci_baseline_out.csv``.
    """
    print("======")

    # Wrap the observations together with the time axis and variable names.
    dataframe = pp.DataFrame(data, datatime=dt, var_names=headers)
    print(dataframe.var_names)

    ci_test = RCOT(significance='analytic')
    pcmci_rcot = PCMCI(dataframe=dataframe, cond_ind_test=ci_test, verbosity=0)
    pcmci_rcot.verbosity = 1
    results = pcmci_rcot.run_pcmci(tau_max=maxlag, pc_alpha=0.05)

    p_matrix = results['p_matrix']
    print("p-values")
    print(p_matrix.round(3))
    print("MCI partial correlations")
    print(results['val_matrix'].round(2))

    # Collect the edges: first axis is treated as the cause, second as the
    # effect, third as the lags.
    result_arr = []
    for cause_pos, per_effect in enumerate(p_matrix):
        cause = headers[cause_pos]
        for effect_pos, lag_pvalues in enumerate(per_effect):
            effect = headers[effect_pos]
            if cause == effect:
                continue
            # One row per pair, regardless of how many lags qualify.
            if any(p_value < 0.05 for p_value in lag_pvalues):
                result_arr.append([effect, cause, index])
                print("{} caused by {}".format(effect, cause))

    with open("pcmci_baseline_out.csv", "w", newline='') as f:
        f.writelines(
            "%s\n" % ','.join(map(str, row)) for row in result_arr)
    print(result_arr)

    return result_arr