Example #1
import latextable
from texttable import Texttable


def sg_varEx_table(ds, varEx):
    """ Accepts results containing the variance explained by the components selected by the stochastic greedy algorithms and the name of the dataset the features were selected from.
    Tabulates the results and writes them to a Tex file which can be imported into the research article.

    Args:
        ds (String): Dataset feature selection was performed on.
        varEx (Dictionary): Key - algorithm type, Value - variance explained by each component selected by that algorithm.
    """
    # Header row: 'K' followed by the component numbers 1..Nc,
    # derived from the length of the value lists
    Nc = len(next(iter(varEx.values())))
    header = ['K'] + list(range(1, Nc + 1))
    rows = [header]
    for k, values in varEx.items():
        rows.append([k] + list(values))

    table = Texttable()
    table.set_cols_align(["c"] * len(rows[0]))
    table.set_deco(Texttable.HEADER)
    table.add_rows(rows)

    outputLatex = latextable.draw_latex(table, caption="Variance explained by variables selected by the stochastic greedy implementations for the {0} dataset and for k = 1,..,6 the kth selected variable is indicated using the default percentage for random sampling".format(ds))
    # save output to latex file in output/random/ds/glg{dataset}.tex
    with open('output/real/{0}/sgVarEx.tex'.format(ds), 'w') as file:
        file.write(outputLatex)
    
    with open('output/notLatex/real/{0}/sgVarEx.txt'.format(ds),'w') as file:
        file.write(table.draw())
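A minimal usage sketch for the function above; the algorithm names, dataset name, and variance values are invented for illustration, and the output directories are assumed to exist:

# Hypothetical input: variance explained for k = 1..6, per algorithm.
varEx = {
    'SG-FSCA':     [0.41, 0.55, 0.63, 0.70, 0.75, 0.79],
    'SG-LazyFSCA': [0.41, 0.55, 0.63, 0.70, 0.75, 0.79],
}
sg_varEx_table('X50sites', varEx)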
Example #2
def realDataInfo():
    """ Creates a table containing the dimensions of each of the real datasets used in the research study and outputs it to a Tex file.
    """
    header = ['Dataset', 'm', 'v']
    file_names = ['X50sites', 'Xpitprops', 'wdbc', 'frogs']
    data_names = ['Wave Sites', 'Pitprops', 'Breast Cancer Diagnosis', 'Anuran Frog Calls']

    rows = [header]
    for file_name, data_name in zip(file_names, data_names):
        mat = read_matrix_from_file('data/realData/{0}.txt'.format(file_name))
        m, v = mat.shape  # m observations, v variables
        rows.append([data_name, m, v])

    table = Texttable()
    table.set_cols_align(["c"] * 3)
    table.set_deco(Texttable.HEADER | Texttable.VLINES)
    table.add_rows(rows)

    outputLatex = latextable.draw_latex(table, caption="Overview of the real data used in this study")
    # save the LaTeX output to output/real/data_dimensions.tex
    with open('output/real/data_dimensions.tex','w') as file:
        file.write(outputLatex)

    with open('output/notLatex/real/data_dimensions.txt','w') as file:
        file.write(table.draw())
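read_matrix_from_file is a project helper that is not shown on this page; a plausible stand-in (an assumption, not the project's actual code) is a thin wrapper around numpy.loadtxt:

import numpy as np

def read_matrix_from_file(path):
    """Hypothetical helper: load a whitespace-delimited numeric matrix from a text file."""
    return np.loadtxt(path)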
Example #3
def print_comparison_latex_code(self):
    print('\nTexttable Latex')
    print(latextable.draw_latex(
        self._table,
        caption="Comparison of test-set accuracy before and after running "
                "the pruning strategy"))
Example #4
def sg_duration_table(ds, duration):
    """ Tabulates the computation time taken by each stochastic greedy algorithm to select features from the given dataset and writes the table to a Tex file.

    Args:
        ds (String): Dataset feature selection was performed on.
        duration (Dictionary): Key - algorithm type, Value - list whose first element is the computation time in seconds.
    """
    header = ['Algorithm', 'Duration']
    rows = [header]
    for k, times in duration.items():
        rows.append([k, str(times[0])])

    table = Texttable()
    table.set_cols_dtype(['t','t'])
    table.set_cols_align(["c"] * len(rows[0]))
    table.set_deco(Texttable.HEADER)
    table.add_rows(rows)

    outputLatex = latextable.draw_latex(table, caption="Computational time in seconds to select 6 features from the {0} dataset with the greedy and lazy greedy unsupervised stochastic selection algorithms".format(ds))
    # save the LaTeX output to output/real/{ds}/sgDurationTable.tex
    with open('output/real/{0}/sgDurationTable.tex'.format(ds), 'w') as file:
        file.write(outputLatex)

    with open('output/notLatex/real/{0}/sgDurationTable.txt'.format(ds),'w') as file:
        file.write(table.draw())
Example #5
def sg_compare_varEx_table(ds, varEx, percentages):
    """ Accepts results containing the variance explained by the selected components and the name of the dataset the stochastic greedy algorithms selected the features from.
    Tabulates the results and writes one Tex file per algorithm, each of which can be imported into the research article.

    Args:
        ds (String): Dataset feature selection was performed on.
        varEx (Dictionary): Key - algorithm type, Value - Dictionary: Key - sampling percentage, Value - list of the variance explained by each component selected at that percentage.
        percentages (List of floats): Percentage values used in random sampling.
    """
    # Table Header: '%' followed by the component numbers 1..Nc
    # (Nc, the number of selected components, is a module-level constant)
    header = [r'\%']
    for i in range(Nc):
        header.append(i + 1)
    
    # Table Rows
    for k in varEx.keys():
        rows = [header]
        for p in percentages:
            row = [int(p*100)]
            res = varEx[k][p]
            for x in res:
                row.append(x)
            rows.append(row)

        # Setup table
        table = Texttable()
        table.set_cols_align(["c"]* len(header))
        table.set_deco(Texttable.HEADER)
        table.add_rows(rows)

        # Create output
        outputLatex = latextable.draw_latex(table, caption="Variance explained by the variables selected by the {0} algorithm for the {1} dataset, using different random sampling percentages, and for k = 1,..,6 the kth selected variable is indicated".format(k, ds))
        # Write to file
        with open('output/real/{0}/sg_compareVarEx_{1}.tex'.format(ds, k), 'w') as file:
            file.write(outputLatex)

        with open('output/notLatex/real/{0}/sg_compareVarEx_{1}.txt'.format(ds, k), 'w') as file:
            file.write(table.draw())
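A usage sketch for the nested structure this function expects; the algorithm name, percentages, and values below are invented:

# Hypothetical input: algorithm -> sampling percentage -> variance explained per component.
percentages = [0.1, 0.2, 0.5]
varEx = {
    'SG-FSCA': {
        0.1: [0.40, 0.54, 0.62, 0.69, 0.74, 0.78],
        0.2: [0.41, 0.55, 0.63, 0.70, 0.75, 0.79],
        0.5: [0.41, 0.55, 0.63, 0.70, 0.75, 0.79],
    },
}
sg_compare_varEx_table('X50sites', varEx, percentages)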
Example #6
def randomDataInfo():
    """ This function creates a table containing the dimensions of each of the random datasets used in the research study and outputs to a Tex file."""
    header = ['Dataset', 'm', 'v']
    rows = []
    rows.append(header)
    for i in range(1, 11):
        mat = read_matrix_from_file('data/randomData/t{0}.txt'.format(i))
        m, v = mat.shape  # m observations, v variables
        rows.append(['t{0}'.format(i), m, v])

    table = Texttable()
    table.set_cols_align(["c"] * 3)
    table.set_deco(Texttable.HEADER | Texttable.VLINES)
    table.add_rows(rows)

    outputLatex = latextable.draw_latex(table, caption="Overview of the random data used in this study")
    # save the LaTeX output to output/random/data_dimensions.tex
    with open('output/random/data_dimensions.tex','w') as file:
        file.write(outputLatex)

    with open('output/notLatex/random/data_dimensions.txt','w') as file:
        file.write(table.draw())
Example #7
def sg_sample_rows_table():
    """ Tabulates the number of rows in each random sample when different percentages are used in random sampling.
    Outputs the table to a Tex file which can be imported into the research article.
    """
    header = ['']

    # rowsDict <key, value> = <dataset name, number of rows in the dataset>
    # (datasets is a module-level mapping from dataset name to file path)
    rowsDict = defaultdict(int)
    for k in datasets.keys():
        X = read_matrix_from_file(datasets[k])
        rowsDict[k] = X.shape[0]
        header.append(k)

    # results for each row appended to rows
    rows = [header]

    for i in range(10, 110, 10):
        # each percentage produces one row of results
        row = []
        row.append(i)
        for k in rowsDict.keys():
            row.append(int(rowsDict[k] * i * 0.01))
        rows.append(row)

    # Output tabulated results to Tex file
    table = Texttable()
    table.set_cols_align(["c"] * len(rows[0]))
    table.set_deco(Texttable.HEADER)
    table.add_rows(rows)

    outputLatex = latextable.draw_latex(table, caption="Number of rows when different percentages are used to select subsets of the datasets via random sampling with replacement")
    # save the LaTeX output to output/real/sg_Sizes.tex
    with open('output/real/sg_Sizes.tex', 'w') as file:
        file.write(outputLatex)

    with open('output/notLatex/real/sg_Sizes.txt','w') as file:
        file.write(table.draw())
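The datasets mapping used above is defined elsewhere in the module; a hypothetical shape, consistent with the file paths used in Example #2:

# Hypothetical module-level mapping from dataset name to file path.
datasets = {
    'Wave Sites': 'data/realData/X50sites.txt',
    'Pitprops':   'data/realData/Xpitprops.txt',
}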
Example #8
def sg_compare_duration_table(ds, duration, percentages):
    """ Accepts results containing computation time data and the name of the dataset the stochastic greedy algorithms selected the features from.
    Tabulates the results and writes them to a Tex file which can be imported into the research article.

    Args:
        ds (String): Dataset feature selection was performed on.
        duration (Dictionary): Key - algorithm type, Value - Dictionary: Key - sampling percentage, Value - the computation time of the algorithm at that percentage.
        percentages (List of floats): Percentage values used in random sampling.
    """
    # Table Header: 'Algorithms' followed by one column per sampling percentage
    header = ['Algorithms']
    for p in percentages:
        header.append(p)
    
    # Table Rows
    rows = [header]
    for k in duration.keys():
        row = [k]
        for p in percentages:
            row.append(str(duration[k][p]))
        rows.append(row)

    # Construct Table
    table = Texttable()
    table.set_cols_dtype(['t'] * len(header))  # text columns, one per header entry
    table.set_cols_align(["c"] * len(rows[0]))
    table.set_deco(Texttable.HEADER)
    table.add_rows(rows, header=True)

    # Create output
    outputLatex = latextable.draw_latex(table, caption="Computation time (in seconds) to perform Stochastic Greedy feature selection on the {0} dataset with different percentages used to sample the data".format(ds))
    # Write to file
    with open('output/real/{0}/sg_compareDurationTable.tex'.format(ds), 'w') as file:
        file.write(outputLatex)

    with open('output/notLatex/real/{0}/sg_compareDurationTable.txt'.format(ds),'w') as file:
        file.write(table.draw())
Example #9
def build_main_table(dataset: str, experiment: dict) -> str:
    # MODEL_NAMES, MODEL_TRAINDATA and f1f are module-level helpers
    # (a sketch of f1f follows the example)
    t = Texttable()
    t.set_deco(t.HEADER)
    cats = ["LOC", "PER", "ORG"]
    miscdata = dataset != "WikiANN"
    # WikiANN has no MISC category, so MISC columns are only added for the other datasets
    row = ["Model name", "Trained on", "F1"]
    if miscdata:
        row.append(r"F1 {\tiny\textdiscount MISC}")
        cats.append("MISC")
    row += ["Prec.", "Rec."]
    row += cats

    t.set_cols_align(["l", "l"] + ["c"] * (3 + len(cats) + int(miscdata)))
    t.set_cols_dtype(["t"] * len(row))  # Dont overwrite my formatting pls
    t.header(row)

    for m, mname in MODEL_NAMES.items():
        v = experiment[m]
        row = [mname, MODEL_TRAINDATA[m]]
        if miscdata:
            row.append(
                f1f(v["stats"]["micro avg"]["f1-score"])
                if v["stats"]["MISC"]["f1-score"] else "-")
        row.append(f1f(v["stats_nomisc"]["micro avg"]["f1-score"]))
        row.append(f1f(v["stats"]["micro avg"]["precision"]))
        row.append(f1f(v["stats"]["micro avg"]["recall"]))
        for c in cats:
            score = v["stats"][c]["f1-score"]
            row.append(f1f(score) if score else "-")
        t.add_row(row)
    # print(t.draw())
    # `v` still refers to the last model's entry; its 'N' gives the dataset's sentence count
    out = draw_latex(
        t,
        caption=f"F1-scores of Danish NER models on the {dataset} dataset, "
                f"which consists of {v['N']} sentences.",
        label=f"tab:{dataset}")
    print(out)
    return out
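f1f is not shown on this page; a plausible formatter (a hypothetical stand-in, not the project's actual helper) renders an F1 score in [0, 1] as a one-decimal percentage string:

def f1f(score: float) -> str:
    """Hypothetical formatter: 0.8312 -> '83.1'."""
    return f"{100 * score:.1f}"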
Example #10
    # drop rows with missing values, then map class labels to integers
    dataframe = dataframe.replace("?", np.nan).dropna()
    classes = set(dataframe[dataframe.columns[0]])
    classToInt = dict(zip(classes, range(len(classes))))
    dataframe['class'] = dataframe['class'].apply(lambda x: classToInt[x])
    dataset = dataframe.to_numpy()
    TARGET = dataset[:, 0].astype(int)
    DATA = dataset[:, 1:].astype(float)
    accuracy, algorithm_used, algorithm_best = accuracy_test_combine_algo_cv(DATA, TARGET, dataset_name, n_splits=5,
                                                                             stratified=True, balanced=True)
    table_accuracy.add_row(accuracy)
    print(accuracy)
    table_algo_best.add_row(algorithm_best)
    print(algorithm_best)
    table_algo_used.add_row(algorithm_used)
    print(algorithm_used)
    benchmark_result.append(accuracy[1])

mean_accuracy = ["Average", np.average(benchmark_result)]
median_accuracy = ["Median", np.median(benchmark_result)]
table_accuracy.add_row(mean_accuracy)
table_accuracy.add_row(median_accuracy)
benchmark_results = np.loadtxt("benchmark_results.csv", delimiter=", ")
benchmark_results = np.insert(benchmark_results, 0, benchmark_result, axis=1)
np.savetxt("benchmark_results_COMB.csv", benchmark_results, delimiter=", ")
print(table_accuracy.draw() + "\n")
print(draw_latex(table_accuracy) + "\n")
print(table_algo_used.draw() + "\n")
print(draw_latex(table_algo_used) + "\n")
print(table_algo_best.draw() + "\n")
print(draw_latex(table_algo_best) + "\n")
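The np.insert call above prepends the new benchmark results as the first column of the existing results matrix; a toy illustration of that axis=1 behaviour:

import numpy as np

existing = np.array([[0.90, 0.88],
                     [0.75, 0.80]])  # one row per dataset, one column per method
new_col = [0.93, 0.79]               # hypothetical results from the combined algorithm
print(np.insert(existing, 0, new_col, axis=1))
# [[0.93 0.9  0.88]
#  [0.79 0.75 0.8 ]]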
Example #11
    TARGET = dataset[:, 0].astype(int)
    DATA = dataset[:, 1:].astype(float)
    row.append(len(DATA[0]))
    row.append(len(DATA))
    # imbalance ratio: majority class size over minority class size
    class0 = [x for x in TARGET if x == 0]
    class1 = [x for x in TARGET if x == 1]
    IR = max(len(class0), len(class1)) / min(len(class0), len(class1))
    row.append(IR)

    file2 = dataset_location + datasets[i + n]
    dataset_name2 = file2[file2.rfind("/") + 1:][:-4]
    row.append(dataset_name2)
    dataframe2 = pd.read_csv(file2, skiprows=0, sep='|')
    dataframe2 = dataframe2.replace("?", np.nan).dropna()
    classes2 = set(dataframe2[dataframe2.columns[0]])
    classToInt2 = dict(zip(classes2, range(len(classes2))))
    dataframe2['class'] = dataframe2['class'].apply(lambda x: classToInt2[x])
    dataset2 = dataframe2.to_numpy()
    TARGET2 = dataset2[:, 0].astype(int)
    DATA2 = dataset2[:, 1:].astype(float)
    row.append(len(DATA2[0]))
    row.append(len(DATA2))
    class0 = [x for x in TARGET2 if x == 0]
    class1 = [x for x in TARGET2 if x == 1]
    IR = (max(len(class0), len(class1)) / min(len(class0), len(class1)))
    row.append(IR)
    table.add_row(row)

print(table.draw() + "\n")
print(draw_latex(table) + "\n")
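The imbalance ratio above can be computed more directly with numpy.bincount; an equivalent sketch, assuming the labels are 0 and 1:

import numpy as np

counts = np.bincount(TARGET)      # frequency of each class label
IR = counts.max() / counts.min()  # imbalance ratio, always >= 1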
Example #12
def area_over_curve_lp():
    # load_adult, load_health, get_optimal_front, get_pareto_front, get_dataframe_from_results,
    # os_utils and FIGURES_FOLDER are project helpers; demographic_parity_difference matches
    # the fairlearn.metrics API
    def compute_ideal_area(Y, C):
        len_c = len(numpy.unique(C))
        len_y = len(numpy.unique(Y))
        p_y_c = numpy.zeros((len_y, len_c))

        for c in range(len_c):
            for y in range(len_y):
                p_y_c[y, c] = numpy.logical_and(Y == y, C == c).mean()
        print(p_y_c)

        # compute the desired rate, i.e. p(y=1 | C=c)
        desired_rate = p_y_c[1, :].mean()
        errors = p_y_c[1, :] - desired_rate

        majority_acc = max(numpy.mean(Y == 1), 1 - numpy.mean(Y == 1))
        max_dp = demographic_parity_difference(Y, Y, sensitive_features=C)

        solution = get_optimal_front(Y, C)
        # add the no-error point (accuracy 1, dp = max_dp) to the solution
        solution.append([1, max_dp])

        solution = numpy.array(solution)

        # sort by dp
        solution = solution[solution[:, 1].argsort()]

        area = numpy.sum(
            # (accuracy above the majority baseline) * (dp_next - dp_cur)
            (solution[:-1, 0] - majority_acc) *
            (solution[1:, 1] - solution[0:-1, 1]))
        return area, majority_acc, max_dp

    # Methods
    methods = [
        "fcrl", "cvib_supervised", "lag-fairness", "maxent_arl", "laftr",
        "adv_forgetting"
    ]

    # compute AUC table
    area = {}
    for data in ["adult", "health"]:
        # compute the ideal area for this dataset
        if data == "adult":
            adult = load_adult(0.2)
            Y = adult["test"][2]
            C = adult["test"][1]
        elif data == "health":
            health = load_health(0.2)
            Y = health["test"][2]
            C = health["test"][1]

        norm_area, majority_acc, max_dp = compute_ideal_area(Y, C)

        area[data] = {}
        for key in [
                "nn_1_layer", "nn_2_layer", "random_forest", "svm",
                "logistic_regression"
        ]:
            area[data][key] = {}
            for m in methods:
                if data == "health" and m == "laftr":
                    continue
                t = numpy.load(f"result/eval/{data}/{m}.npy",
                               allow_pickle=True).item()
                df = get_dataframe_from_results(t)

                # get pareto front
                pareto = df[[f'{key}_normalized_acc',
                             f'{key}_normalized_dp']].values
                # drop nan
                pareto = pareto[~numpy.isnan(pareto).any(axis=1)]
                pareto = get_pareto_front(pareto)
                pareto = numpy.array(pareto)
                pareto = pareto[pareto[:, 1].argsort()]

                # reject points that have more dp than data
                THRESH = 1.0
                idx = pareto.shape[0]
                while idx > -1:
                    if pareto[idx - 1, 1] > THRESH * max_dp:
                        idx = idx - 1
                    else:
                        break
                pareto = pareto[:idx]
                if idx == -1:
                    area[data][key][m] = 0
                    print(f"No point found below dp_max for {m}, {data}")
                    continue

                # add the (majority_acc, 0) point as a reference to create horizontal bars,
                # and the (max acc at the data's dp, max_dp) point to close the curve
                pareto = numpy.concatenate(
                    [[[majority_acc, 0]], pareto, [[pareto[-1, 0], max_dp]]],
                    axis=0)

                # get the area via the rectangle rule
                area[data][key][m] = numpy.sum(
                    # (accuracy above the majority baseline) * (dp_next - dp_cur)
                    (pareto[:-1, 0] - pareto[0, 0]) *
                    (pareto[1:, 1] - pareto[0:-1, 1]))

                # normalize
                area[data][key][m] /= norm_area

    # dump to table
    for key in [
            "nn_1_layer", "nn_2_layer", "random_forest", "svm",
            "logistic_regression"
    ]:
        table = Texttable()
        table.set_cols_align(["l", "c", "c"])
        table.header(["Method", "UCI Adult", "Heritage Health"])
        for m in methods:
            if m == "fcrl":
                table.add_row([
                    "FCRL (Ours)", area["adult"][key][m],
                    area["health"][key][m]
                ])
            if m == "lag-fairness":
                table.add_row(
                    ["MIFR", area["adult"][key][m], area["health"][key][m]])
            if m == "maxent_arl":
                table.add_row([
                    "MaxEnt-ARL", area["adult"][key][m], area["health"][key][m]
                ])
            if m == "cvib_supervised":
                table.add_row(
                    ["CVIB", area["adult"][key][m], area["health"][key][m]])
            if m == "laftr":
                table.add_row(["LAFTR", area["adult"][key][m], "N/A"])
            if m == "adv_forgetting":
                table.add_row([
                    "Adversarial Forgetting", area["adult"][key][m],
                    area["health"][key][m]
                ])

        os_utils.safe_makedirs(os.path.join(FIGURES_FOLDER, "table"))
        with open(os.path.join(FIGURES_FOLDER, "table", f"{key}.better.tex"),
                  'w') as f:
            f.write(
                latextable.draw_latex(
                    table,
                    caption="Area Over Parity Accuracy Curve",
                    label=f"AOPAC_{key}"))