# Create a list of user_ids to exclude based on marginal distribution
exclude_id_based_on_marginal_distribution = calculate_marginal_distribution_for_each_user(project_short_name)

# Filter the include_ids removing any that should be excluded based on
# marginal distributions.  The loop variable is deliberately not called `id`:
# that name shadows the builtin and, under Python 2, leaks out of the list
# comprehension into the enclosing scope.
include_ids = [
    user_id for user_id in include_ids
    if user_id not in exclude_id_based_on_marginal_distribution
]

gold_standard_data = define_gold_standard_data(project_short_name=project_short_name)
expert_ids = define_gold_standard_ids(project_short_name=project_short_name)

#combined_dict = build_combined_dict_keyed_on_composite_key(project_short_name=project_short_name, user_ids_to_include=include_ids, expert_project_short_name=gold_standard_data, expert_user_ids_to_include=expert_ids)

### CLEAN FROM HERE ###
user_dict = create_individual_dict(project_short_name=project_short_name,
                                   user_ids_to_include=include_ids)

# NOTE(review): for 'tb2-r2.0' the return value of ihc.get_gs_data_2r20() is
# discarded and gs_dict is not assigned in this branch, so the code below only
# works if gs_dict was already bound earlier -- confirm the intended behaviour.
# NOTE(review): the statement nesting here was reconstructed from a flattened
# source line; verify that the code after this if/else belongs outside it.
if project_short_name == 'tb2-r2.0':
    ihc.get_gs_data_2r20()
else:
    gs_dict = create_individual_dict(project_short_name=gold_standard_data,
                                     user_ids_to_include=expert_ids)

# converts data stored by keys back to data stored by task/image:
# keep, in sorted order, the gold-standard keys that also occur in user_dict
# and contain ':0:0'.  Membership is tested on the dict itself rather than on
# .keys(), which would build a fresh list per iteration under Python 2.
list_of_composite_keys_in_both = sorted(
    k for k in gs_dict
    if (k in user_dict) and (':0:0' in k)
)

for key in list_of_composite_keys_in_both:
    user_answers = pandas.DataFrame(user_dict[key]['ihc'])
    gs_answers = pandas.DataFrame(gs_dict[key]['ihc'])

    # Sum the 'proportion' row across columns and divide by the column count
    # (DataFrame.keys() yields the column labels).
    # presumably one column per user / expert -- TODO confirm against
    # create_individual_dict.
    user_prop_mean = user_answers.sum(axis=1)['proportion'] / len(user_answers.keys())
    expert_prop_mean = gs_answers.sum(axis=1)['proportion'] / len(gs_answers.keys())
import IHCtools as ihc
from ihc_settings import s
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats.stats as sss
import scikits.bootstrap as bootstrap

# seed the rng with clock or similarly sufficiently random state.
# np.random.seed(None) reseeds NumPy's global generator; merely constructing
# np.random.RandomState(seed=None) and discarding it (as before) seeded nothing.
np.random.seed(None)

# disable truncated printing of arrays
np.set_printoptions(threshold=np.inf)

########### Loading and organising data

### load numpy array of GS data (nImages*nMeasures*nExperts). Also returns
### imIDs to keep track of what image each row in expert_data_np refers to
datExp, AllimIDs = ihc.get_gs_data_2r20_numpy(s["expertIDs"])

# Pick the image-ID attribute of AllimIDs that corresponds to the configured
# project; unknown projects fail loudly so a new project cannot be run without
# adding its mapping here.
if s["project_short_name"] == "tb2-r2.0":
    imIDs = list(AllimIDs.Utask_ID_PA)
elif s["project_short_name"] == "tb2-r2.0a":
    imIDs = list(AllimIDs.Utask_ID_a)
elif s["project_short_name"] == "tb2-r2.1":
    imIDs = list(AllimIDs.Utask_ID_2dot1)
elif s["project_short_name"] == "tb2-r2.1b":
    # this attribute is explicitly cast to int before being listed
    imIDs = list(AllimIDs.GStask_ID.astype(int))
else:
    raise Exception("add project to list to get imIDs")

# Single pre-formatted argument: prints the same text as the old Python 2
# print statement and is also valid Python 3 syntax.
print("number of images included: %s" % len(imIDs))

### load numpy array of user data. Also returns userIDs in a list
(datUser, userIDs) = ihc.get_user_data(imIDs, s["project_short_name"])

### simulate user data from expert data
# (datUser,userIDs) = ihc.simulate_user_data(datExp)