def test1():
    """Verify the F1 score when product ids are manually held out for testing.

    Splits ids 691 and 169 out of the consumer-systems dataset as a
    verification set, learns component/function-flow associations from the
    remaining products, and checks that the learned results are non-empty
    and score a positive F1 against the held-out products.
    """
    asset_path = os.path.join(
        os.path.dirname(__file__), '../autofunc/assets/consumer_systems.csv')
    learning_data = make_df(asset_path)

    ## Product id(s) chosen from the learning file to form the testing set
    holdout_ids = [691, 169]
    verification_df, learning_df = split_learning_verification(
        learning_data, holdout_ids)
    verification_list = df_to_list(verification_df)

    # Keep only the top 70% of frequency values from the learned associations
    top_fraction = 0.7
    sorted_combos = counter_pandas(learning_df)
    top_results = get_top_results(sorted_combos, top_fraction)

    # Score the learned results against the known function/flows
    learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
        top_results, verification_list)

    assert len(learned_dict) != 0
    assert f1 > 0
# Dataset used for data mining script_dir = os.path.dirname(__file__) file_to_learn = os.path.join(script_dir, '../autofunc/assets/consumer_systems.csv') train_data = pd.read_csv(file_to_learn) combos_sorted = counter_pandas(train_data) # Use a threshold to get the top XX% of confidence values threshold = 0.5 thresh_results = get_top_results(combos_sorted, threshold) # Use a known product for verification test_file = os.path.join(script_dir, '../autofunc/assets/jigsawQuery_headers.csv') test_data = pd.read_csv(test_file) test_list = df_to_list(test_data) learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(thresh_results, test_list) # Optional write to file - uncomment and rename to write file # write_results_from_dict(learned_dict, 'test1.csv') print('Recall = {0:.5f}'.format(recall)) print('Precision = {0:.5f}'.format(precision)) print('F1 = {0:.5f}'.format(f1))
for i in range(iters): random.shuffle(test_ids) # Split into folds n = floor(len(test_ids) / k) # Making folds using list comprehension folds = [ test_ids[i * n:(i + 1) * n] for i in range((len(test_ids) + n - 1) // n) ] for e in folds: verification_ids = e ver_df, learn_df = split_learning_verification(df, verification_ids) ver_list = df_to_list(ver_df) if not bd: comb_sort, counts, combos = counter_pandas_with_counts(learn_df) thresh_results = get_top_results(comb_sort, threshold) # Find the F1 score of the verification test by comparing the learned results with the known function/flows learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall( thresh_results, ver_list) if bd: bd_comb_sort = counter_pandas(bd_df) bd_thresh_results = get_top_results(bd_comb_sort, threshold) learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall( bd_thresh_results, ver_list)
## End 1 ## 2. Reading in dataframe as computed above # similarity_df = pd.read_csv('consumer_similarity.csv') # reading = True ## End 2 all_comps = list(train_df_whole.comp.unique()) num_all_comps = len(all_comps) ## Main loop, comment if not reading in all_data for test_id in all_train_ids: # print(test_id) # test_df, train_df = split_learning_verification(train_df_whole, [test_id]) test_list = df_to_list(test_df) train_ids = list(map(int, train_df.id.unique())) id_comp_ratios.append( (test_id, len(list(test_df.comp.unique())) / num_all_comps)) # # Outer loop through percent similar for i in range(0, 100, 10): ps_start = time.time() f1_plot = [] thresh_plot = [] ps_plot = [] keep_ids = []