Example #1
def add_mixture_noise(data, y, se, unit=["iso3", "year"], noise_sd=[1.0, 10.0], mix_prob=[0.1, 0.9]):
    """
    Add noise from a mixture of Normal distributions.

    Parameters
    ----------
    data : ndarray
        A structured NumPy array
    y : string
        A label in data corresponding to the variable to add noise to
    se : string
        A label in data corresponding to standard error variable in which the
        standard error of the noise added will be stored
    unit : string or list of strings
        Levels defined by the unit variables will share the same draws from the noise distributions
    noise_sd : list of floats
        The standard deviations of the Normal distributions
    mix_prob : list of floats
        A list of length len(noise_sd) that gives the probability of drawing
        from the Normal distribution with the corresponding standard deviation
        in noise_sd. All values must be between 0 and 1 and must sum to 1.

    Returns
    -------
    noisy_data : ndarray
        Exactly the same as data except noise has been added to the y
        variable.

    Examples
    --------
    >>> data = np.array([('north','USA',1,2),('north','CAN',2,4)],dtype=[('region','|S6'),('iso3','|S4'),('y','<f4'), ('se','<f4')])
    >>> noisy_data = add_mixture_noise(data, 'y', 'se', 'iso3', [5,10], [.5,.5])
    >>> noisy_data['y'][0] != data['y'][0]
    True
    >>> noisy_data['y'][1] != data['y'][1]
    True

    >>> int(100*noisy_data['se'][0]) == 538 or int(100*noisy_data['se'][0]) == 1019
    True
    >>> int(100*noisy_data['se'][1]) == 640 or int(100*noisy_data['se'][1]) == 1077
    True

    >>> data = np.array([('north','USA',2,2),('north','CAN',2,2)],dtype=[('region','|S6'),('iso3','|S4'),('y','<f4'), ('se','<f4')])
    >>> noisy_data = add_mixture_noise(data, 'y', 'se', 'region', [10], [1])
    >>> abs(noisy_data['y'][0] - noisy_data['y'][1]) < .01
    True
    >>> noisy_data = add_mixture_noise(data, 'y', 'se', 'iso3', [10], [1])
    >>> abs(noisy_data['y'][0] - noisy_data['y'][1]) > .01
    True
    """

    noisy_data = copy.copy(data)

    # cumulative mixture probabilities used as cut-offs for component selection
    prob_cut_offs = list(np.cumsum(mix_prob))

    noisy_data = utilities.add_unique_id(noisy_data, unit, "unique_id_for_noise")
    noise_to_add = {}
    var_to_add = {}
    for i, id in enumerate(np.unique(noisy_data["unique_id_for_noise"])):
        r = random.random()
        min_dist = np.inf
        index = 999
        for j, prob in enumerate(prob_cut_offs):
            if abs(r - prob) < min_dist and prob - r > 0:
                min_dist = abs(r - prob)
                index = j

        noise_to_add[id] = random.gauss(0, noise_sd[index])
        var_to_add[id] = pow(noise_sd[index], 2)

    for i in range(0, len(data[y])):
        id = noisy_data["unique_id_for_noise"][i]
        noisy_data[y][i] = noisy_data[y][i] + noise_to_add[id]
        noisy_data[se][i] = np.sqrt(pow(noisy_data[se][i], 2) + var_to_add[id])

    noisy_data = numpy.lib.recfunctions.drop_fields(noisy_data, "unique_id_for_noise")

    return noisy_data
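The component-selection loop above amounts to drawing against cumulative mixture probabilities. A minimal standalone sketch of the same idea, assuming only NumPy and the standard library (the values below are illustrative, not part of the module):

import numpy as np
import random

noise_sd = [1.0, 10.0]   # mixture component standard deviations
mix_prob = [0.1, 0.9]    # component probabilities, summing to 1

prob_cut_offs = np.cumsum(mix_prob)             # cumulative cut-offs, e.g. [0.1, 1.0]
r = random.random()                             # uniform draw on [0, 1)
index = int(np.searchsorted(prob_cut_offs, r))  # first cut-off that is >= r
draw = random.gauss(0, noise_sd[index])         # noise from the selected component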
Example #2
def evaluate_estimates(out_file, key, est_dir, est_y, gold_standard_file, y):
    """
    Evaluate estimates stored in a directory of csvs against the gold standard.

    Parameters
    ----------
    out_file : string
        A path to a csv in which to store summary error metrics
    key : list of strings
        A list of strings that correspond to columns in the estimates files and in the gold standard files
        to serve as the key in merging these files. If evaluate_estimates is being run from
        the command line, then key must be enclosed in double quotes (e.g. \"['iso3','year']\")
    est_dir : string
        The path to a directory with the csvs of the estimates. All csvs in this directory will be assumed to contain
        estimates. The path should end with a /
    est_y : string
        The name of the column in the estimate files that holds the predictions from the model
    gold_standard_file : string
        The path to the gold standard file
    y : string
        The name of the column in the gold standard file that holds the response variable. This should also
        be the name of the column in the estimates files that holds the knocked-out and noised response variable.

    Notes
    -----
    See parse_filename for a guide to the naming convention of the predicted response variables.
    """

    model_design_vars = {}

    data = utilities.read(gold_standard_file)

    data = utilities.add_unique_id(data, key, 'unique_id_for_join_by')
    
    files = os.listdir(est_dir)
    for file in files:        
        file_key = parse_filename(file)
                  
        path = est_dir + file
        new_data = utilities.read(path)

        # rename variable
        names = []
        for name in new_data.dtype.names:
            if name == est_y:
                est_y_name = est_y + '_' + str(file_key['model']) + '_' + str(file_key['design']) + '_' + str(file_key['rep'])
                names.append(est_y_name)
            elif name == y:
                y_name = y + '_' + str(file_key['model']) + '_' + str(file_key['design']) + '_' + str(file_key['rep'])
                names.append(y_name)
            else:
                names.append(name)
        new_data.dtype.names = tuple(names)

        # collect up variables corresponding to a certain model and design
        if file_key['model'] not in model_design_vars:
            model_design_vars[file_key['model']] = {}

        if file_key['design'] not in model_design_vars[file_key['model']]:
            model_design_vars[file_key['model']][file_key['design']] = []

        model_design_vars[file_key['model']][file_key['design']].append(est_y_name)

        new_data = utilities.add_unique_id(new_data, key, 'unique_id_for_join_by')
        new_data = new_data[['unique_id_for_join_by', est_y_name, y_name]]
        
        # http://stackoverflow.com/questions/2774949/merging-indexed-array-in-python
        data = numpy.lib.recfunctions.join_by('unique_id_for_join_by', data, new_data)

    # this would write a file with the gold standard and all the predictions
    #utilities.write(out_predictions_file, data)

    data = numpy.lib.recfunctions.drop_fields(data, 'unique_id_for_join_by')

    model_design_errors = {}
    for model in model_design_vars.keys():
        model_design_errors[model] = {}
        for design in model_design_vars[model].keys():
            model_design_errors[model][design] = {}
            truth = []
            obs = []
            for var in model_design_vars[model][design]:
                y_var = var.replace(est_y, y)
                for i in range(0, len(data[y])):
                    # evaluate only at points whose response was knocked out
                    # (NaN in the knocked-out/noised column of the estimate file)
                    if utilities.is_nan(data[y_var][i]) == True:
                        truth.append(data[y][i])
                        obs.append(data[var][i])
                
            truth = np.array(truth)
            obs = np.array(obs)

            errors = errormetrics.get_error_metrics()
            for error in errors:
                # look up the error-metric class by name and instantiate it
                error_class = getattr(errormetrics, error)()
                model_design_errors[model][design][error] = error_class.calc_error(truth, obs, True)
                
    errors = [] 
    for model in model_design_errors.keys():
        for design in model_design_errors[model].keys():
            for error in model_design_errors[model][design].keys():
                errors.append(error)
    errors = np.unique(errors)

    with open(out_file, 'w', newline='') as f:
        writer = csv.writer(f)
        fieldnames = ['model', 'design'] + errors.tolist()
        writer.writerow(fieldnames)
        for model in model_design_errors.keys():
            for design in model_design_errors[model].keys():
                row = [model, design]
                for error in errors:
                    if error in model_design_errors[model][design]:
                        row.append(model_design_errors[model][design][error])
                    else:
                        row.append('')

                writer.writerow(row)
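A hedged usage sketch of evaluate_estimates; the paths and column names below are illustrative assumptions, and the csvs in est_dir are assumed to follow the naming convention expected by parse_filename:

# Hypothetical invocation; all paths and column names are placeholders.
evaluate_estimates(
    out_file='output/error_summary.csv',
    key=['iso3', 'year'],
    est_dir='output/estimates/',              # must end with '/'
    est_y='y_pred',
    gold_standard_file='data/gold_standard.csv',
    y='y',
)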
Example #3
def knock_out_leaving_surveys(data, y, se, key, year, survey_span, num_surveys, prop, survey_date):
    """
    Knock out values of y and se in data so that the remaining data simulates a set 
    of surveys. More specifically, in each level of key multiple surveys are generated by
    randomly selecting a year to conduct each survey and then marking that year and a number 
    of previous years determined by survey_span to leave in the dataset.
    
    Parameters
    ----------
    data : ndarray
        A structured NumPy array. Should probably not have any missing values
        or else strange behavior will ensue
    y : string
        The label for the response variable in data
    se : string
        The label for the standard error variable in data
    key : string or list of strings
        The labels for the variables in data that will define separate levels 
        for the knock out scheme.
    year : string
        The label for the year in data
    survey_span : int
        The number of years that each survey covers
    num_surveys : int 
        The number of surveys in each country
    prop : float
        Proportion of countries to which the knock out design is applied; must satisfy 0 <= prop <= 1
    survey_date : string
        The name of a variable to be added to data that contains the year each survey
        was conducted so that surveys can be distinguished within a given level of the key.
        This comes in handy if you want to specify correlated noise among surveys in your noiser.

    Returns
    -------
    ko_data : ndarray
        The same as data except with values of y and se knocked out

    Notes
    -----
    In this framework, multiple observations of the same data points cannot be generated
    """

    ko_data = copy.copy(data)

    ko_data = utilities.add_unique_id(ko_data, key, 'unique_id_for_ko')

    r = np.where(np.arange(1.,len(np.unique(ko_data['unique_id_for_ko']))+1.) <= len(np.unique(ko_data['unique_id_for_ko']))*prop, True, False)
        
    if type(r.tolist()) != type(True):
        random.shuffle(r)
    else:
        r = [r]

    should_be_kept = {}
    survey_date_dict = {}
    for i, id in enumerate(np.unique(ko_data['unique_id_for_ko'])):
        ko_data_i = utilities.subset(ko_data, 'unique_id_for_ko', id)
        
        should_be_kept[id] = []
        survey_date_dict[id] = []
        if r[i] == True:
            for s in range(0, num_surveys):
                survey_year_index = random.choice(range(0,len(ko_data_i[year])))
                for j in range(survey_year_index-survey_span, survey_year_index): 
                    if (j in range(0, len(ko_data_i[year]))) == True:
                        should_be_kept[id].append(ko_data_i[year][j])
                        survey_date_dict[id].append(ko_data_i[year][survey_year_index])
        else:
            for j in range(0, len(ko_data_i[year])):
                should_be_kept[id].append(ko_data_i[year][j])
                survey_date_dict[id].append(np.nan)

    survey_date_list = [np.nan]*len(ko_data[y])
    for i in range(0, len(ko_data[y])):
        id = ko_data['unique_id_for_ko'][i]
        yr = ko_data[year][i]

        for j, kept_yr in enumerate(should_be_kept[id]):
            if kept_yr == yr:
                survey_date_list[i] = survey_date_dict[id][j]
                break

        if utilities.is_nan(survey_date_list[i]) == True:
            ko_data[y][i] = np.nan
            ko_data[se][i] = np.nan    
    
    ko_data = numpy.lib.recfunctions.append_fields(ko_data, survey_date, np.array(survey_date_list))
        
    ko_data = numpy.lib.recfunctions.drop_fields(ko_data, 'unique_id_for_ko')
    
    return ko_data
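A hedged usage sketch of knock_out_leaving_surveys on a toy array; the column names and parameter values are illustrative, and numpy is assumed to be imported as np as in the module above:

# Two countries with five years each; with prop=1.0 every country gets the
# survey-style knock out, leaving up to two 3-year survey windows per country.
dtype = [('iso3', '|S4'), ('year', '<i4'), ('y', '<f4'), ('se', '<f4')]
rows = [('USA', 1990 + t, float(t), 0.1) for t in range(5)]
rows += [('CAN', 1990 + t, float(t), 0.1) for t in range(5)]
data = np.array(rows, dtype=dtype)

ko_data = knock_out_leaving_surveys(data, 'y', 'se', key='iso3', year='year',
                                    survey_span=3, num_surveys=2, prop=1.0,
                                    survey_date='survey_date')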
Example #4
def add_mixture_bias(data, y, unit=['iso3','year'], bias=[1.,10.], mix_prob=[.1,.9]):
    """
    Add different levels of bias to the data

    Parameters
    ----------
    data : ndarray
        A structured NumPy array
    y : string
        A label in data corresponding to the variable to add bias to
    unit : string or list of strings
        Levels defined by the unit variables will share the same draw from the bias values
    bias : list of floats
        The amount of bias to add
    mix_prob : list of floats
        A list of length len(bias) that gives the probability of adding
        the corresponding bias value in bias. All values must be
        between 0 and 1 and must sum to 1.

    Returns
    -------
    biased_data : ndarray
        Exactly the same as data except bias has been added to the y
        variable.

    Examples
    --------
    >>> data = np.array([('north','USA',1,2),('north','CAN',2,4)],dtype=[('region','|S6'),('iso3','|S4'),('y','<f4'), ('se','<f4')])
    >>> biased_data = add_mixture_bias(data, 'y', 'iso3', [10], [1])
    >>> int(biased_data['y'][0])
    11
    """

    biased_data = copy.copy(data)

    # cumulative mixture probabilities used as cut-offs for component selection
    prob_cut_offs = list(np.cumsum(mix_prob))
    
    biased_data = utilities.add_unique_id(biased_data, unit, 'unique_id_for_bias')
    bias_to_add = {}
    for i, id in enumerate(np.unique(biased_data['unique_id_for_bias'])):
        r = random.random()
        min_dist = np.inf
        index = 999
        for j, prob in enumerate(prob_cut_offs):
            if abs(r - prob) < min_dist and prob - r > 0:
                min_dist = abs(r - prob)
                index = j

        bias_to_add[id] = bias[index]

    for i in range(0, len(data[y])):
        id = biased_data['unique_id_for_bias'][i]
        biased_data[y][i] = biased_data[y][i] + bias_to_add[id]
    
    biased_data = numpy.lib.recfunctions.drop_fields(biased_data, 'unique_id_for_bias')
    
    return biased_data
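A hedged usage sketch showing how mix_prob splits the unit levels between two bias values; the toy data and bias values are illustrative:

# Each iso3 level independently receives bias[0] with probability 0.5 and
# bias[1] otherwise, so the result below is one of a small set of outcomes.
dtype = [('region', '|S6'), ('iso3', '|S4'), ('y', '<f4'), ('se', '<f4')]
data = np.array([('north', 'USA', 1, 2), ('north', 'CAN', 2, 4)], dtype=dtype)
biased = add_mixture_bias(data, 'y', unit='iso3', bias=[0.0, 5.0], mix_prob=[0.5, 0.5])
# biased['y'][0] is either 1.0 or 6.0; biased['y'][1] is either 2.0 or 7.0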
Example #5
def knock_out_cluster_unit(data, y, se, cluster='iso3', unit='year', prop=.2, design='random'):
    """
    Within levels defined by the cluster variable, knock out a proportion of
    units in data by replacing values of the variable y.
 
    Parameters
    ----------
    data : ndarray
        A structured array.
    y : string
        A label of variable in data that corresponds to the response variable to be knocked out
    se : string
        A label of variable in data that corresponds to the standard error variable to be knocked out        
    cluster : string or list of strings
        A field or list of fields in data (e.g. 'iso3' or \"['iso3','age']\"). The knock out scheme is applied separately to
        levels defined by cluster.
    unit : string
        A field in data. The unit of the data to knock out. Unit should not have multiple values
        within levels of cluster.
    prop : float
        The proportion of data to knock out.
    design : string
        If 'random', then a proportion of data is knocked out randomly.
        If 'first', then the first proportion of data is knocked out,
        and analogously the last proportion is knocked out for 'last'.

    Examples
    --------
    >>> dtype = [('iso3','|S4'),('year','<i4'),('y','<f4'),('se','<f4')]
    >>> data = np.array([('USA',1990,1,.1),('USA',1991,2,.2),('CAN',1990,3,.3),('CAN',1991,4,.4)], dtype=dtype)
    >>> ko_data = knock_out_cluster_unit(data,'y','se','iso3','year',.5,'first')
    >>> utilities.is_nan(ko_data['y'][0])
    True
    >>> utilities.is_nan(ko_data['y'][1])
    False
    >>> utilities.is_nan(ko_data['y'][2])
    True
    >>> utilities.is_nan(ko_data['y'][3])
    False

    >>> utilities.is_nan(ko_data['se'][0])
    True
    >>> utilities.is_nan(ko_data['se'][1])
    False
    >>> utilities.is_nan(ko_data['se'][2])
    True
    >>> utilities.is_nan(ko_data['se'][3])
    False

    # Check to see that original data has not been changed
    >>> utilities.is_nan(data['y'][0])
    False
    """

    data = copy.copy(data)
    
    data_cluster = {}
    if cluster == '':
        data_cluster[''] = data
    else:
        # a list of fields is collapsed into a single unique id column
        if not isinstance(cluster, str):
            data = utilities.add_unique_id(data, cluster, 'knockerouters_unique_cluster_id')
            cluster = 'knockerouters_unique_cluster_id'
               
        for level in np.unique(data[cluster]):
            data_cluster[level] = utilities.subset(data, cluster, level)

    for key in data_cluster.keys():

        candidates = []
        for i, val in enumerate(data_cluster[key][y]):
            if utilities.is_nan(val) == False:
                candidates.append(i)

        should_be_knocked_out = {}

        r = np.where(np.arange(1.,len(candidates)+1.) <= len(candidates)*prop, True, False)
        
        if type(r.tolist()) != type(True):
            random.shuffle(r)
        else:
            r = [r]
                
        for index, i in enumerate(candidates):
            level = data_cluster[key][unit][i]

            if design == 'random':
                should_be_knocked_out[level] = r[index]
            elif design == 'first':
                # use the position within the non-missing candidates, not the raw row index
                should_be_knocked_out[level] = (float(index+1)/len(candidates)) <= prop
            elif design == 'last':
                should_be_knocked_out[level] = (float(index+1)/len(candidates)) >= (1-prop)
        
        for i, level in enumerate(data[unit]):
            if (level in should_be_knocked_out.keys()) == True:
                if cluster == '':
                    if should_be_knocked_out[level] == True:
                        data[y][i] = np.nan
                        data[se][i] = np.nan
                else:
                    if should_be_knocked_out[level] == True and data[cluster][i] == key: 
                        data[y][i] = np.nan
                        data[se][i] = np.nan
                        
    if cluster == 'knockerouters_unique_cluster_id':
        data = numpy.lib.recfunctions.drop_fields(data, 'knockerouters_unique_cluster_id')

    return data
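A hedged usage sketch of the 'random' design; the toy data are illustrative and which rows are knocked out varies between runs:

# Within each iso3, roughly prop * (number of non-missing years) values of
# y and se are replaced with NaN at random.
dtype = [('iso3', '|S4'), ('year', '<i4'), ('y', '<f4'), ('se', '<f4')]
data = np.array([('USA', 1990, 1, .1), ('USA', 1991, 2, .2),
                 ('CAN', 1990, 3, .3), ('CAN', 1991, 4, .4)], dtype=dtype)
ko_data = knock_out_cluster_unit(data, 'y', 'se', cluster='iso3', unit='year',
                                 prop=.5, design='random')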