def main():
    runs = ['3', '4']
    # sources = ['raw_processed', 'clean_contradictions_batch_0.5']
    sources = [
        'data_processed',
        'clean_contradictions_total_1',
        'clean_contradictions_pair_0.5',
        'clean_contradictions_batch_0.5'
    ]
    group = 'experiment*'
    n_q = '*'
    n_lists = '*'
    batch = '*'

    for source in sources:
        # collect the processed annotations of all runs for this source
        data_dict_list = []
        name = f'run{"_".join(runs)}-group_{group}-batch{batch}'.replace('*', '-all-')
        name = f'{name}-{source}'
        print(name)
        for run in runs:
            print(run)
            data = load_processed_data(run, group, n_q, n_lists, batch, source)
            data_dict_list.extend(data)
            # data_dict_list.extend(load_experiment_data(run, group, n_q, n_lists, batch, remove_not_val=True))
        print(len(data_dict_list))

        print('checking if concepts are there:')
        check_data(data_dict_list)

        # write the CrowdTruth input file
        print('creating input')
        input_df = create_input_df(data_dict_list)
        print(input_df.columns)
        input_dir = '../analyses/crowdtruth/input/'
        input_path = f'{input_dir}{name}.csv'
        os.makedirs(input_dir, exist_ok=True)
        input_df.to_csv(input_path, index=False)

        res_dir = '../analyses/crowdtruth/results/'
        res_path = f'{res_dir}{name}'
        os.makedirs(res_dir, exist_ok=True)

        # run the CrowdTruth metrics and store unit, worker and annotation scores
        print('running crowdtruth')
        data, config = crowdtruth.load(file=input_path, config=TestConfig())
        results = crowdtruth.run(data, config)
        print('crowdtruth done')

        unit_scores = results['units']
        split_unit_annotation_score(unit_scores)
        unit_scores.to_csv(f'{res_path}-units.csv')
        worker_scores = results['workers']
        worker_scores.to_csv(f'{res_path}-workers.csv')
        annotation_scores = results['annotations']
        annotation_scores.to_csv(f'{res_path}-annotations.csv')
        print(f'results stored: {res_path}')
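# `main()` relies on a crowdtruth configuration class named `TestConfig`, which is not
# defined in this listing. The sketch below is kept as a comment so it does not shadow
# the real class; it only illustrates the expected shape, modeled on the DefaultConfig
# subclass used in create_analysis_files() further down. The column names here are
# assumptions and would have to match the columns of the generated input CSV.
#
# class TestConfig(DefaultConfig):
#     inputColumns = ['question', 'concept']   # hypothetical input columns
#     outputColumns = ['answer']               # hypothetical output column
#     open_ended_task = True
#     annotation_separator = ','
#     annotation_vector = []
#
#     def processJudgments(self, judgments):
#         return judgments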
def test_all_workers_agree(self):
    for w in range(2, 11):
        test_config = self.test_conf_const.__class__
        data, config = crowdtruth.load(
            file=TEST_FILE_PREF + str(w) + "work_agr.csv",
            config=test_config())
        results = crowdtruth.run(data, config)
        # when all workers agree, the unit quality score is 1.0
        self.assertAlmostEqual(results["units"]["uqs"].at[1], 1.0)
        # ... and every worker gets a worker quality score of 1.0
        for wid in range(w):
            self.assertAlmostEqual(
                results["workers"]["wqs"].at["W" + str(wid + 1)], 1.0)
        # for closed tasks, the agreed-upon annotation also scores 1.0
        if not config.open_ended_task:
            self.assertAlmostEqual(results["annotations"]["aqs"]["A"], 1.0)
def test_metrics_correct_interval(self):
    test_conf_const = TutorialCustomizedConfig()
    test_config = test_conf_const.__class__
    data, config = crowdtruth.load(
        file="tutorial/relex_example_custom.csv",
        config=test_config())
    results = crowdtruth.run(data, config)
    for _, val_arr in results["units"]["unit_annotation_score"].items():
        for _, val in val_arr.items():
            self.assertGreaterEqual(val, 0.0)
            self.assertLessEqual(val, 1.0)
    for _, val in results["units"]["uqs"].items():
        self.assertGreaterEqual(val, 0.0)
        self.assertLessEqual(val, 1.0)
    for _, val in results["workers"]["wqs"].items():
        self.assertGreaterEqual(val, 0.0)
        self.assertLessEqual(val, 1.0)
    for _, val in results["annotations"]["aqs"].items():
        self.assertGreaterEqual(val, 0.0)
        self.assertLessEqual(val, 1.0)
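# Helper sketch (an assumption, not part of the test suite above): the
# unit_annotation_score column iterated over in test_metrics_correct_interval()
# maps each annotation to a score in [0, 1] per unit, so the top-scoring annotation
# of every unit can be extracted like this.
def top_annotation_per_unit(results):
    top = {}
    for unit_id, score_dict in results["units"]["unit_annotation_score"].items():
        # keep the annotation with the highest unit-annotation score
        top[unit_id] = max(score_dict.items(), key=lambda kv: kv[1])[0]
    return top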
def create_analysis_files(dataset, max_no_workers, max_runs, storing_folder):
    import csv

    unique_unit_ids = get_uniq_unit_ids(dataset)
    for subset_size in range(3, max_no_workers):
        workers_directory = storing_folder + str(subset_size) + "workers"
        if not os.path.exists(workers_directory):
            os.makedirs(workers_directory)

        # enumerate, for every unit, all worker combinations of the current subset size
        map_unit_id_combinations = {}
        for unit_id in range(len(unique_unit_ids)):
            (count, subset_unit_id) = get_no_work_unit_id(dataset, unique_unit_ids[unit_id])
            combinations = gen_all_worker_combinations(subset_size, count, subset_unit_id)
            map_unit_id_combinations[unique_unit_ids[unit_id]] = combinations

        fields = [
            'F1_nr_binary', 'F1_r_binary', 'F1_nr_ternary',
            'F1_r_ternary', 'F1_hr_ternary'
        ]
        with open('F1_' + str(subset_size) + ".csv", 'w') as f:
            writer = csv.writer(f)
            writer.writerow(fields)

        for run_no in range(1, max_runs):
            # sample one random worker subset per unit for this run
            unit_worker_set = {}
            for unit_id, worker_sets in map_unit_id_combinations.items():
                unit_worker_set[unit_id] = pick_random_worker_set(worker_sets)

            df_subset_size = pd.DataFrame()
            for unit_id, worker_set in unit_worker_set.items():
                df_subset = dataset[(dataset["_unit_id"] == unit_id)
                                    & (dataset["_worker_id"].isin(worker_set))]
                df_subset_size = pd.concat([df_subset_size, df_subset])

            filename = workers_directory + "/run.csv"
            df_subset_size.to_csv(filename, index=False)

            results_with_newGT = pd.read_csv(
                "../ground_truth_data/reviewers_pilot_aggregated_judgments.csv")

            class config(DefaultConfig):
                inputColumns = [
                    "index", "bin", "doc_len", "document_id", "document_body",
                    "document_title", "rel", "topic", "topic_description",
                    "topic_query"
                ]
                outputColumns = ["relevant_snippets"]
                # processing of an open-ended task
                open_ended_task = True
                annotation_separator = ","
                annotation_vector = []

                def processJudgments(self, judgments):
                    # strip the list formatting from the raw judgment strings
                    for col in self.outputColumns:
                        judgments[col] = judgments[col].apply(lambda x: x.replace('[', ''))
                        judgments[col] = judgments[col].apply(lambda x: x.replace(']', ''))
                        judgments[col] = judgments[col].apply(lambda x: x.replace('"', ''))
                        judgments[col] = judgments[col].apply(lambda x: x.replace(' ', ','))
                    return judgments

            # run the CrowdTruth metrics on the sampled subset
            data, config = crowdtruth.load(file=filename, config=config())
            results = crowdtruth.run(data, config)

            # the relevance score of a unit is the highest annotation score
            # among all annotations other than "none"
            results["units"]['max_relevance_score'] = pd.Series(
                np.random.randn(len(results["units"])), index=results["units"].index)
            for i in range(len(results["units"])):
                maxVal = 0.0
                for key, value in results["units"]["unit_annotation_score"].iloc[i].items():
                    if key != "none" and value > maxVal:
                        maxVal = value
                results["units"]['max_relevance_score'].iloc[i] = maxVal

            # align the crowd results with the reviewers' ground truth
            results["units"]["reviewers_rel"] = pd.Series(
                np.random.randn(len(results["units"])), index=results["units"].index)
            results["units"]["reviewers_rel_merged"] = pd.Series(
                np.random.randn(len(results["units"])), index=results["units"].index)
            for i in range(len(results_with_newGT.index)):
                for j in range(len(results["units"].index)):
                    if (results_with_newGT["topic"].iloc[i] ==
                            results["units"]["input.topic"].iloc[j]) and (
                            results_with_newGT["document_id"].iloc[i] ==
                            results["units"]["input.document_id"].iloc[j]):
                        results["units"]["reviewers_rel"].iloc[j] = \
                            results_with_newGT["reviewers_rel"].iloc[i]
                        results["units"]["reviewers_rel_merged"].iloc[j] = \
                            results_with_newGT["reviewers_rel_merged"].iloc[i]

            # compute the F1 scores against the reviewers' ground truth and append them
            # to the per-subset-size results file
            F1_notrelevant_binary = compute_F1_score_not_relevant_binary(results["units"])
            F1_relevant_binary = compute_F1_score_relevant_binary(results["units"])
            F1_notrelevant_ternary = compute_F1_score_not_relevant_ternary(results["units"])
            F1_relevant_ternary = compute_F1_score_relevant_ternary(results["units"])
            F1_highlyrelevant_ternary = compute_F1_score_highly_relevant_ternary(results["units"])

            row = [
                F1_notrelevant_binary, F1_relevant_binary, F1_notrelevant_ternary,
                F1_relevant_ternary, F1_highlyrelevant_ternary
            ]
            with open('F1_' + str(subset_size) + ".csv", 'a') as f:
                writer = csv.writer(f)
                writer.writerow(row)
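# Example invocation of create_analysis_files(); the input path and parameter values
# below are illustrative assumptions, not values taken from the experiments.
if __name__ == '__main__':
    pilot_judgments = pd.read_csv('../data/relevance_pilot_judgments.csv')  # hypothetical input file
    create_analysis_files(
        dataset=pilot_judgments,
        max_no_workers=10,      # worker subsets of size 3..9 (upper bound exclusive)
        max_runs=100,           # 99 random worker samples per subset size (upper bound exclusive)
        storing_folder='../analyses/worker_subsets/')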
def test_incremental_worker_agreement(self):
    for w in range(4, 11):
        test_config = self.test_conf_const.__class__
        data, config = crowdtruth.load(
            file=TEST_FILE_PREF + str(w - 2) + "vs" + str(w - 1) + "work_agr.csv",
            config=test_config())
        results = crowdtruth.run(data, config)

        # workers that agree on the same unit have the same quality score
        for x in range(2, w):
            if x != (w - 1):
                self.assertAlmostEqual(
                    results["workers"]["wqs"].at["W1"],
                    results["workers"]["wqs"].at["W" + str(x)])
                self.assertAlmostEqual(
                    results["workers"]["wqs"].at["W" + str(w)],
                    results["workers"]["wqs"].at["W" + str(w + x - 1)])

        # workers that agree have a greater quality score than the worker that disagrees
        self.assertGreater(
            results["workers"]["wqs"].at["W1"],
            results["workers"]["wqs"].at["W" + str(w - 1)])
        self.assertGreater(
            results["workers"]["wqs"].at["W" + str(w)],
            results["workers"]["wqs"].at["W" + str(2 * w - 1)])

        # the more workers agree on a unit, the higher the worker quality score
        self.assertGreater(
            results["workers"]["wqs"].at["W" + str(w)],
            results["workers"]["wqs"].at["W1"])

        # the more workers agree on a unit, the higher the unit quality score
        self.assertLess(results["units"]["uqs"].at[1], results["units"]["uqs"].at[2])
        self.assertLess(results["units"]["uqs"].at[1], results["units"]["uqs"].at[3])
        self.assertLess(results["units"]["uqs"].at[2], results["units"]["uqs"].at[3])

        # the more workers agree on an annotation, the higher the annotation quality score
        if not config.open_ended_task:
            self.assertLess(
                results["annotations"]["aqs"].at["A"],
                results["annotations"]["aqs"].at["C"])
            self.assertLess(
                results["annotations"]["aqs"].at["B"],
                results["annotations"]["aqs"].at["A"])
            self.assertLess(
                results["annotations"]["aqs"].at["D"],
                results["annotations"]["aqs"].at["C"])
            self.assertLess(
                results["annotations"]["aqs"].at["A"],
                results["annotations"]["aqs"].at["E"])
            self.assertLess(
                results["annotations"]["aqs"].at["C"],
                results["annotations"]["aqs"].at["E"])
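# The test methods in this listing belong to a unittest.TestCase (the surrounding test
# class, TEST_FILE_PREF and self.test_conf_const are defined elsewhere). Assuming the
# tests live in a module called test_crowdtruth.py, they can be run with the standard
# unittest runner:
#
#     python -m unittest test_crowdtruth -v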