Exemplo n.º 1
0
def test1():
    """Check that a hold-out run on hand-picked product ids yields a positive F1 score.

    The ids in ``holdout_ids`` are handed to ``split_learning_verification``
    to separate the learning data into a testing set and a training set. The
    training set is mined, thresholded, and compared against the testing set.
    """
    # The learning data lives in the package assets, relative to this file.
    here = os.path.dirname(__file__)
    learning_csv = os.path.join(here,
                                '../autofunc/assets/consumer_systems.csv')
    all_data = make_df(learning_csv)

    # ID(s) from the learning file that are moved into the testing set.
    holdout_ids = [691, 169]
    holdout_df, learning_df = split_learning_verification(all_data, holdout_ids)
    holdout_list = df_to_list(holdout_df)

    # Keep only the top XX% of frequency values from the mined results.
    cutoff = 0.7
    ranked = counter_pandas(learning_df)
    top_results = get_top_results(ranked, cutoff)

    # All seven return values are unpacked so an unexpected return shape still
    # fails loudly; only the learned dictionary and the F1 score are asserted.
    (learned, _matched, _overmatched, _unmatched,
     _recall, _precision, f1_score) = precision_recall(top_results, holdout_list)

    assert len(learned) > 0
    assert 0 < f1_score
Exemplo n.º 2
0
def test_1():
    """Check that the top-ranked result for 'screw' is 'couple solid'.

    A screw couples solids almost exclusively, so that function-flow is
    expected to come first in the results mined from the consumer systems
    data set.
    """
    csv_path = os.path.join(os.path.dirname(__file__),
                            '../autofunc/assets/consumer_systems.csv')

    # Mine the raw CSV directly; no thresholding is applied in this test.
    ranked = counter_pandas(pd.read_csv(csv_path))

    # First entry for the component, then the first field of that entry.
    best_screw_entry = ranked['screw'][0]
    assert best_screw_entry[0] == 'couple solid'
Exemplo n.º 3
0
def test_1():
    """Run the automated functional-representation example end to end.

    Mines the consumer systems data set, keeps the results above a confidence
    threshold, and asks ``get_func_rep`` for the functions and flows of the
    components listed in the example input file.
    """
    here = os.path.dirname(__file__)

    # Data set used for data mining.
    mining_csv = os.path.join(here,
                              '../autofunc/assets/consumer_systems.csv')
    # Known product used for verification.
    query_csv = os.path.join(here,
                             '../autofunc/assets/InputExample.csv')

    ranked = counter_pandas(pd.read_csv(mining_csv))

    # Keep the top XX% of confidence values.
    cutoff = 0.5
    top_results = get_top_results(ranked, cutoff)

    # Dictionary of functions and flows per component, plus the components
    # that could not be matched.
    # NOTE(review): the meaning of the third positional argument (True) is not
    # visible from this file - confirm against get_func_rep.
    results, unmatched = get_func_rep(top_results, query_csv, True)

    best_screw_entry = results['screw'][0]
    assert best_screw_entry[0] == 'couple solid'
    assert 'cheese' in unmatched
Exemplo n.º 4
0
from autofunc.get_top_results import get_top_results
from autofunc.counter_pandas import counter_pandas
from autofunc.get_precision_recall import precision_recall
from autofunc.df_to_list import df_to_list
import os.path
import pandas as pd

""" Example showing how to find F1 score using separate file of input components """


# Both CSV files live in the package assets, relative to this script.
script_dir = os.path.dirname(__file__)
# Data set used for data mining
file_to_learn = os.path.join(script_dir, '../autofunc/assets/consumer_systems.csv')
# Known product used for verification
test_file = os.path.join(script_dir, '../autofunc/assets/jigsawQuery_headers.csv')

# Keep the top XX% of confidence values
threshold = 0.5

# Learning side: mine the training data, then cut it at the threshold.
train_data = pd.read_csv(file_to_learn)
combos_sorted = counter_pandas(train_data)
thresh_results = get_top_results(combos_sorted, threshold)

# Verification side: load the known product and convert it to a list.
test_data = pd.read_csv(test_file)
test_list = df_to_list(test_data)

# Compare the learned results with the known product. The names below stay at
# module level so a later step can use them.
(learned_dict, matched, overmatched, unmatched,
 recall, precision, f1) = precision_recall(thresh_results, test_list)


# Optional write to file - uncomment and rename to write file
Exemplo n.º 5
0
        verification_ids = e

        ver_df, learn_df = split_learning_verification(df, verification_ids)

        ver_list = df_to_list(ver_df)

        if not bd:
            comb_sort, counts, combos = counter_pandas_with_counts(learn_df)
            thresh_results = get_top_results(comb_sort, threshold)

            # Find the F1 score of the verification test by comparing the learned results with the known function/flows
            learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
                thresh_results, ver_list)

        if bd:
            bd_comb_sort = counter_pandas(bd_df)
            bd_thresh_results = get_top_results(bd_comb_sort, threshold)
            learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
                bd_thresh_results, ver_list)

        precisions.append(precision)
        recalls.append(recall)

        print(e)

        f1s += f1

        keep.append([e, f1])
        plots.append(f1)

        avg_f1 = f1s / len(keep)
Exemplo n.º 6
0
        ps_thresh = i / 100

        if reading:
            keep_ids = similarity_df[
                similarity_df[str(test_id)] > ps_thresh].index.tolist()
        else:
            keep_ids = similarity_df[
                similarity_df[test_id] > ps_thresh].index.tolist()

        keep_ids.remove(test_id)

        # Only keep rows from data frame that have an id that is in the keep_ids list
        keep_df = train_df[train_df['id'].isin(keep_ids)]

        comb_sort = counter_pandas(keep_df)

        # Component counting and fractions
        train_comps = list(keep_df.comp.unique())

        if train_comps:
            comp_ratio = len(train_comps) / num_all_comps
            comp_ratios.append((len(keep_ids), i, comp_ratio))

        if comp_ratio > 0.7 and len(keep_ids) < 40:
            keepers.append(keep_ids)

        scatter_keep.append((comp_ratio, len(keep_ids)))

        for t in range(10, 100, 5):
            threshold = t / 100
Exemplo n.º 7
0
        ps_thresh = i / 100

        if reading:
            keep_ids = similarity_df[
                similarity_df[str(test_id)] > ps_thresh].index.tolist()
        else:
            keep_ids = similarity_df[
                similarity_df[test_id] > ps_thresh].index.tolist()

        keep_ids.remove(test_id)

        # Only keep rows from data frame that have an id that is in the keep_ids list
        keep_df = train_df[train_df['id'].isin(keep_ids)]

        comb_sort = counter_pandas(keep_df)

        # Component counting and fractions
        train_comps = list(keep_df.comp.unique())

        if train_comps:
            comp_ratios.append(
                (len(keep_ids), i, len(train_comps) / num_all_comps))

        for t in range(10, 100, 5):
            threshold = t / 100
            print(test_id, ' ', ps_thresh, ' ', threshold)

            thresh_results = get_top_results(comb_sort, threshold)

            if not keep_ids: