# Imports assumed by the functions below; utilities, errormetrics,
# knockerouters, noisers, biasers, and parse_filename are project modules
# or helpers defined elsewhere (this excerpt may span several files).
import copy
import csv
import inspect
import os
import pdb
import random

import numpy
import numpy as np
import numpy.lib.recfunctions


def calc_error(self, truth, obs, ignore_nan):
    """ Calculate the error of obs compared to truth.

    Parameters
    ----------
    truth : array
        Numeric NumPy array of gold standard data
    obs : array
        Numeric NumPy array of observed values
    ignore_nan : bool
        Should NaNs be ignored in the calculation?

    Returns
    -------
    error : float
        The error

    Notes
    -----
    truth and obs must have equal length. The ith element in obs is a
    measurement or observation of the ith element in truth.
    """
    errors = []
    for i in range(len(truth)):
        if ignore_nan:
            # skip any pair in which either value is missing
            if not utilities.is_nan(truth[i]) and not utilities.is_nan(obs[i]):
                errors.append(self.dist(truth[i], obs[i]))
        else:
            errors.append(self.dist(truth[i], obs[i]))
    if len(errors) == 0:
        error = np.nan
    else:
        error = self.summarize(errors)
    return error
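# Usage sketch (illustrative): a hypothetical metric class showing how
# calc_error expects dist and summarize to be supplied. MeanAbsoluteError
# and both of its methods are assumptions, stand-ins for whatever
# errormetrics actually provides.
def _example_calc_error():
    class MeanAbsoluteError(object):
        calc_error = calc_error  # bind the function defined above as a method
        def dist(self, t, o):
            return abs(t - o)
        def summarize(self, errors):
            return float(np.mean(errors))
    truth = np.array([1.0, 2.0, np.nan, 4.0])
    obs = np.array([1.1, 1.8, 3.0, np.nan])
    # with ignore_nan=True only the first two pairs contribute:
    # mean(|1.0 - 1.1|, |2.0 - 1.8|) = 0.15
    return MeanAbsoluteError().calc_error(truth, obs, True)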
def evaluate_estimates(out_file, key, est_dir, est_y, gold_standard_file, y):
    """ Evaluate estimates stored in a directory of csvs against the gold
    standard.

    Parameters
    ----------
    out_file : string
        A path to a csv in which to store summary error metrics
    key : list of strings
        A list of strings that correspond to columns in the estimates files
        and in the gold standard file, to serve as the key when merging
        these files. If evaluate_estimates is being run from the command
        line, then key must be enclosed in double quotes
        (e.g. "['iso3','year']")
    est_dir : string
        The path to a directory with the csvs of the estimates. All csvs in
        this directory are assumed to contain estimates. The path should
        end with a /
    est_y : string
        The name of the column in the estimate files that holds the
        predictions from the model
    gold_standard_file : string
        The path to the gold standard file
    y : string
        The name of the column in the gold standard file that holds the
        response variable. This should also be the name of the column in
        the estimate files that holds the knocked out and noised response
        variable.

    Notes
    -----
    See parse_filename for a guide to the naming convention of the
    predicted response variables.
    """
    model_design_vars = {}
    data = utilities.read(gold_standard_file)
    data = utilities.add_unique_id(data, key, 'unique_id_for_join_by')
    files = os.listdir(est_dir)
    for file_name in files:
        file_key = parse_filename(file_name)
        path = est_dir + file_name
        new_data = utilities.read(path)
        # rename the prediction and response columns so that each
        # model/design/rep combination keeps its own field after the merge
        names = []
        for name in new_data.dtype.names:
            if name == est_y:
                est_y_name = est_y + '_' + str(file_key['model']) + '_' + str(file_key['design']) + '_' + str(file_key['rep'])
                names.append(est_y_name)
            elif name == y:
                y_name = y + '_' + str(file_key['model']) + '_' + str(file_key['design']) + '_' + str(file_key['rep'])
                names.append(y_name)
            else:
                names.append(name)
        new_data.dtype.names = tuple(names)
        # collect up the variables corresponding to each model and design
        if file_key['model'] not in model_design_vars:
            model_design_vars[file_key['model']] = {}
        if file_key['design'] not in model_design_vars[file_key['model']]:
            model_design_vars[file_key['model']][file_key['design']] = []
        model_design_vars[file_key['model']][file_key['design']].append(est_y_name)
        new_data = utilities.add_unique_id(new_data, key, 'unique_id_for_join_by')
        new_data = new_data[['unique_id_for_join_by', est_y_name, y_name]]
        # http://stackoverflow.com/questions/2774949/merging-indexed-array-in-python
        data = numpy.lib.recfunctions.join_by('unique_id_for_join_by', data, new_data)
    # this would write a file with the gold standard and all the predictions
    #utilities.write(out_predictions_file, data)
    data = numpy.lib.recfunctions.drop_fields(data, 'unique_id_for_join_by')
    model_design_errors = {}
    for model in model_design_vars.keys():
        model_design_errors[model] = {}
        for design in model_design_vars[model].keys():
            model_design_errors[model][design] = {}
            # pool the truth and the predictions across reps
            truth = []
            obs = []
            for var in model_design_vars[model][design]:
                y_var = var.replace(est_y, y)
                for i in range(len(data[y])):
                    if utilities.is_nan(data[y_var][i]):
                        # debugging hook: pause when the merged response
                        # variable is missing
                        pdb.set_trace()
                    truth.append(data[y][i])
                    obs.append(data[var][i])
            truth = np.array(truth)
            obs = np.array(obs)
            errors = errormetrics.get_error_metrics()
            for error in errors:
                # instantiate the metric class named by the string
                error_class = getattr(errormetrics, error)()
                model_design_errors[model][design][error] = error_class.calc_error(truth, obs, True)
    # one column per metric observed for any model/design pair
    errors = []
    for model in model_design_errors.keys():
        for design in model_design_errors[model].keys():
            for error in model_design_errors[model][design].keys():
                errors.append(error)
    errors = np.unique(errors)
    with open(out_file, 'wb') as f:
        writer = csv.writer(f)
        fieldnames = ['model', 'design'] + errors.tolist()
        writer.writerow(fieldnames)
        for model in model_design_errors.keys():
            for design in model_design_errors[model].keys():
                row = [model, design]
                for error in errors:
                    if error in model_design_errors[model][design]:
                        row.append(model_design_errors[model][design][error])
                    else:
                        row.append('')
                writer.writerow(row)
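# Usage sketch (illustrative): a hypothetical invocation. The paths
# 'estimates/', 'gold.csv', and 'error_summary.csv' and the column names
# 'y_hat' and 'y' are assumptions; the estimate csvs are assumed to follow
# the naming convention that parse_filename expects.
def _example_evaluate_estimates():
    evaluate_estimates(out_file='error_summary.csv',
                       key=['iso3', 'year'],
                       est_dir='estimates/',
                       est_y='y_hat',
                       gold_standard_file='gold.csv',
                       y='y')
    # error_summary.csv then holds one row per model/design pair, with one
    # column per metric returned by errormetrics.get_error_metrics()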
def knock_out_leaving_surveys(data, y, se, key, year, survey_span, num_surveys, prop, survey_date):
    """ Knock out values of y and se in data so that the remaining data
    simulates a set of surveys.

    More specifically, within each level of key, multiple surveys are
    generated by randomly selecting a year in which to conduct each survey
    and then marking the survey_span years leading up to that year to leave
    in the dataset.

    Parameters
    ----------
    data : ndarray
        A structured NumPy array. It should not have any missing values,
        or else strange behavior will ensue
    y : string
        The label for the response variable in data
    se : string
        The label for the standard error variable in data
    key : string or list of strings
        The labels for the variables in data that define the separate
        levels for the knock out scheme
    year : string
        The label for the year in data
    survey_span : int
        The number of years that each survey covers
    num_surveys : int
        The number of surveys in each country
    prop : float
        The proportion of key levels to which the knock out design is
        applied. 0 <= prop <= 1
    survey_date : string
        The name of a variable to be added to data that contains the year
        each survey was conducted, so that surveys can be distinguished
        within a given level of the key. This comes in handy if you want
        to specify correlated noise among surveys in your noiser.

    Returns
    -------
    ko_data : ndarray
        The same as data except with values of y and se knocked out

    Notes
    -----
    In this framework, multiple observations of the same data points
    cannot be generated.
    """
    ko_data = copy.copy(data)
    ko_data = utilities.add_unique_id(ko_data, key, 'unique_id_for_ko')
    # mark a proportion prop of the key levels for knock out, in random order
    num_levels = len(np.unique(ko_data['unique_id_for_ko']))
    r = np.where(np.arange(1., num_levels + 1.) <= num_levels * prop, True, False)
    if type(r.tolist()) != type(True):
        random.shuffle(r)
    else:
        r = [r]  # guard: np.where returned a scalar
    should_be_kept = {}
    survey_date_dict = {}
    for i, id in enumerate(np.unique(ko_data['unique_id_for_ko'])):
        ko_data_i = utilities.subset(ko_data, 'unique_id_for_ko', id)
        should_be_kept[id] = []
        survey_date_dict[id] = []
        if r[i]:
            # simulate num_surveys surveys, each keeping the survey_span
            # years leading up to a randomly chosen survey year
            for s in range(num_surveys):
                survey_year_index = random.choice(range(len(ko_data_i[year])))
                for j in range(survey_year_index - survey_span, survey_year_index):
                    if 0 <= j < len(ko_data_i[year]):
                        should_be_kept[id].append(ko_data_i[year][j])
                        survey_date_dict[id].append(ko_data_i[year][survey_year_index])
        else:
            # level not selected for knock out: keep every year
            for j in range(len(ko_data_i[year])):
                should_be_kept[id].append(ko_data_i[year][j])
                survey_date_dict[id].append(np.nan)
    # build the survey_date column and knock out every row whose year was
    # not kept by a simulated survey
    survey_date_list = [np.nan] * len(ko_data[y])
    for i in range(len(ko_data[y])):
        id = ko_data['unique_id_for_ko'][i]
        yr = ko_data[year][i]
        for j, kept_yr in enumerate(should_be_kept[id]):
            if kept_yr == yr:
                survey_date_list[i] = survey_date_dict[id][j]
                break
        if utilities.is_nan(survey_date_list[i]):
            ko_data[y][i] = np.nan
            ko_data[se][i] = np.nan
    ko_data = numpy.lib.recfunctions.append_fields(ko_data, survey_date, np.array(survey_date_list))
    ko_data = numpy.lib.recfunctions.drop_fields(ko_data, 'unique_id_for_ko')
    return ko_data
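# Usage sketch (illustrative): fabricated annual data for two countries,
# with two simulated 3-year surveys per country. All values and field names
# here are assumptions made up for the example.
def _example_knock_out_leaving_surveys():
    dtype = [('iso3', '|S3'), ('year', '<i4'), ('y', '<f4'), ('se', '<f4')]
    rows = [(iso3, 1990 + t, float(t), 0.1)
            for iso3 in ['USA', 'CAN'] for t in range(10)]
    data = np.array(rows, dtype=dtype)
    ko_data = knock_out_leaving_surveys(data, 'y', 'se', 'iso3', 'year',
                                        survey_span=3, num_surveys=2,
                                        prop=1.0, survey_date='survey_date')
    # rows outside the simulated survey windows now hold NaN in y and se;
    # 'survey_date' records the year of the survey that kept each row
    return ko_data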
def knock_out_cluster_unit(data, y, se, cluster='iso3', unit='year', prop=.2, design='random'):
    """ Within levels defined by the cluster variable, knock out a
    proportion of units in data by replacing values of the variable y.

    Parameters
    ----------
    data : ndarray
        A structured array.
    y : string
        The label of the variable in data that corresponds to the response
        variable to be knocked out
    se : string
        The label of the variable in data that corresponds to the standard
        error variable to be knocked out
    cluster : string or list of strings
        A field or list of fields in data (e.g. 'iso3' or "['iso3','age']").
        The knock out scheme is applied separately to the levels defined by
        cluster.
    unit : string
        A field in data. The unit of the data to knock out. unit should not
        take multiple values within a level of cluster.
    prop : float
        The proportion of data to knock out. 0 <= prop <= 1
    design : string
        If 'random', then a proportion of the data is knocked out at
        random. If 'first', then the first proportion of the data is
        knocked out, and analogously for 'last'.

    Examples
    --------
    >>> dtype = [('iso3','|S4'),('year','<i4'),('y','<f4'),('se','<f4')]
    >>> data = np.array([('USA',1990,1,.1),('USA',1991,2,.2),('CAN',1990,3,.3),('CAN',1991,4,.4)], dtype=dtype)
    >>> ko_data = knock_out_cluster_unit(data,'y','se','iso3','year',.5,'first')
    >>> utilities.is_nan(ko_data['y'][0])
    True
    >>> utilities.is_nan(ko_data['y'][1])
    False
    >>> utilities.is_nan(ko_data['y'][2])
    True
    >>> utilities.is_nan(ko_data['y'][3])
    False
    >>> utilities.is_nan(ko_data['se'][0])
    True
    >>> utilities.is_nan(ko_data['se'][1])
    False
    >>> utilities.is_nan(ko_data['se'][2])
    True
    >>> utilities.is_nan(ko_data['se'][3])
    False

    # Check to see that the original data has not been changed
    >>> utilities.is_nan(data['y'][0])
    False
    """
    data = copy.copy(data)
    data_cluster = {}
    if cluster == '':
        data_cluster[''] = data
    else:
        if not isinstance(cluster, str):
            # cluster is a list of fields; collapse it to a single id
            data = utilities.add_unique_id(data, cluster, 'knockerouters_unique_cluster_id')
            cluster = 'knockerouters_unique_cluster_id'
        for level in np.unique(data[cluster]):
            data_cluster[level] = utilities.subset(data, cluster, level)
    for key in data_cluster.keys():
        # only rows with an observed response are candidates for knock out
        candidates = []
        for i, val in enumerate(data_cluster[key][y]):
            if not utilities.is_nan(val):
                candidates.append(i)
        should_be_knocked_out = {}
        # mark a proportion prop of the candidates, in random order
        r = np.where(np.arange(1., len(candidates) + 1.) <= len(candidates) * prop, True, False)
        if type(r.tolist()) != type(True):
            random.shuffle(r)
        else:
            r = [r]  # guard: np.where returned a scalar
        for index, i in enumerate(candidates):
            level = data_cluster[key][unit][i]
            if design == 'random':
                should_be_knocked_out[level] = r[index]
            elif design == 'first':
                # rank within the candidates determines first/last
                should_be_knocked_out[level] = (float(index + 1) / len(candidates)) <= prop
            elif design == 'last':
                should_be_knocked_out[level] = (float(index + 1) / len(candidates)) >= (1 - prop)
        for i, level in enumerate(data[unit]):
            if level in should_be_knocked_out:
                if cluster == '':
                    if should_be_knocked_out[level]:
                        data[y][i] = np.nan
                        data[se][i] = np.nan
                else:
                    if should_be_knocked_out[level] and data[cluster][i] == key:
                        data[y][i] = np.nan
                        data[se][i] = np.nan
    if cluster == 'knockerouters_unique_cluster_id':
        data = numpy.lib.recfunctions.drop_fields(data, 'knockerouters_unique_cluster_id')
    return data
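# Usage sketch (illustrative): the 'random' design with a seeded RNG so the
# result is reproducible. The input data are fabricated for the example.
def _example_knock_out_cluster_unit():
    random.seed(0)
    dtype = [('iso3', '|S3'), ('year', '<i4'), ('y', '<f4'), ('se', '<f4')]
    data = np.array([('USA', 1990 + t, float(t), 0.1) for t in range(10)],
                    dtype=dtype)
    # knock out a random 20% of the observed years within each country
    ko_data = knock_out_cluster_unit(data, 'y', 'se', cluster='iso3',
                                     unit='year', prop=.2, design='random')
    # exactly 2 of the 10 years come back with NaN y and se (10 * .2 = 2)
    return ko_data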
def simulate_data(out_dir, y, se, gold_standard_file, design_file):
    """ Simulate data by knocking out and adding noise to a gold standard
    file. How data is knocked out and how noise is added is determined by
    the parameters specified in the design file.

    Parameters
    ----------
    out_dir : string
        The path to a directory in which to output the noisy and knocked
        out data. The path should end with a /
    y : string
        The column name in the gold standard file corresponding to the
        response to be knocked out and noised up.
    se : string
        The column name in the gold standard file corresponding to the
        standard error of the response. If se == '', then a se variable
        will be created, named 'se' and filled with 0's. If noise is added,
        then this se variable will be set to the standard error of the
        noise.
    gold_standard_file : string
        The path to a csv.
    design_file : string
        The path to a csv. If a knock out test is to be performed, there
        must be a column called knockerouters. If noise is to be added,
        there must be a column called noisers. If bias is to be added,
        there must be a column called biasers. If there is a column called
        rep, then each test will be repeated rep times; if no such column
        is provided, each test will only be run once. All other columns are
        parameters for the knockerouter, noiser, or biaser functions, which
        must not share column names for parameters. All column entries (but
        not the header) must be enclosed in double quotes. A string is
        enclosed in single quotes and then double quotes (e.g. "'USA'"),
        whereas a number or an array is enclosed only in double quotes
        (e.g. "2", "[1,2,3]").

    See Also
    --------
    utilities.read
    """
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    gold_data = utilities.read(gold_standard_file)
    if se == "":
        gold_data = numpy.lib.recfunctions.append_fields(gold_data, "se", [0] * len(gold_data), "<f4")
        se = "se"
    reader = csv.reader(open(design_file))
    on_header = True
    index = 0
    rep_index = np.nan
    for row in reader:
        if on_header:
            header = row
            on_header = False
            # remember which column, if any, holds the rep count
            for i in range(len(header)):
                if header[i] == "rep":
                    rep_index = i
        else:
            if utilities.is_nan(rep_index):
                reps = 1
            else:
                reps = int(row[rep_index])
            for i in range(reps):
                data = gold_data
                # eval each cell so that strings, numbers, and arrays are
                # parsed into Python objects
                row_dict = {}
                for j, name in enumerate(header):
                    row_dict[name] = eval(row[j])
                for func_collection in ["knockerouters", "noisers", "biasers"]:
                    if func_collection in row_dict:
                        fun_str = func_collection + "." + row_dict[func_collection]
                        # biasers take (data, y, ...); knockerouters and
                        # noisers take (data, y, se, ...)
                        if func_collection == "biasers":
                            fun_str = fun_str + "(data, y"
                        else:
                            fun_str = fun_str + "(data, y, se"
                        # pass the remaining design-file columns as
                        # arguments, matched to the function's signature
                        get_args_str = "inspect.getargspec(" + func_collection + "." + row_dict[func_collection] + ")"
                        args = eval(get_args_str)[0]
                        for arg in args:
                            if arg not in ["data", "y", "se"]:
                                fun_str = fun_str + ", row_dict['" + arg + "']"
                        fun_str = fun_str + ")"
                        data = eval(fun_str)
                base_name = os.path.basename(gold_standard_file)
                new_file = (out_dir + "sim_" + base_name.replace(".csv", "")
                            + "_" + str(index) + "_" + str(i) + ".csv")
                utilities.write(new_file, data)
            index = index + 1
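# Usage sketch (illustrative): the file names and design-file contents are
# assumptions. A design file pairing a knockerouter with its parameters
# might look like (quoting follows the convention in the docstring above):
#
#   knockerouters,cluster,unit,prop,design,rep
#   "'knock_out_cluster_unit'","'iso3'","'year'",".2","'random'","2"
#
def _example_simulate_data():
    simulate_data(out_dir='sim/', y='y', se='se',
                  gold_standard_file='gold.csv',
                  design_file='design.csv')
    # with rep = 2, this writes sim/sim_gold_0_0.csv and sim/sim_gold_0_1.csv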