Example #1
 def test_init(self):
     # let's add some uncertainties to this
     uncs = [RealParameter("a", 0, 1),
            RealParameter("b", 0, 1)]
     outcomes = [TimeSeriesOutcome("test")]
     constraints = []
     callback = DefaultCallback(uncs, [], outcomes, constraints,
                                nr_experiments=100)
     
     self.assertEqual(callback.i, 0)
     self.assertEqual(callback.nr_experiments, 100)
     self.assertEqual(callback.cases.shape[0], 100)
     self.assertEqual(callback.outcomes, [o.name for o in outcomes])
     
     names = rf.get_names(callback.cases.dtype)
     names = set(names)
     self.assertEqual(names, {'a', 'b', 'policy', 'model', 'scenario_id'})
     self.assertEqual(callback.results, {})
     
     # with levers
     levers = [RealParameter('c', 0, 10)]
     
     callback = DefaultCallback(uncs, levers, outcomes, constraints, 
                                nr_experiments=100)
     
     self.assertEqual(callback.i, 0)
     self.assertEqual(callback.nr_experiments, 100)
     self.assertEqual(callback.cases.shape[0], 100)
     self.assertEqual(callback.outcomes, [o.name for o in outcomes])
     
     names = rf.get_names(callback.cases.dtype)
     names = set(names)
     self.assertEqual(names, {'a', 'b', 'c','policy', 'model', 'scenario_id'})
     self.assertEqual(callback.results, {})
Example #2
    def test_init(self):
        # let's add some uncertainties to this
        uncs = [RealParameter("a", 0, 1), RealParameter("b", 0, 1)]
        outcomes = [TimeSeriesOutcome("test")]
        callback = DefaultCallback(uncs, [], outcomes, nr_experiments=100)

        self.assertEqual(callback.i, 0)
        self.assertEqual(callback.nr_experiments, 100)
        self.assertEqual(callback.cases.shape[0], 100)
        self.assertEqual(callback.outcomes, [o.name for o in outcomes])

        names = rf.get_names(callback.cases.dtype)
        names = set(names)
        self.assertEqual(names, {'a', 'b', 'policy', 'model'})
        self.assertEqual(callback.results, {})

        # with levers
        levers = [RealParameter('c', 0, 10)]

        callback = DefaultCallback(uncs, levers, outcomes, nr_experiments=100)

        self.assertEqual(callback.i, 0)
        self.assertEqual(callback.nr_experiments, 100)
        self.assertEqual(callback.cases.shape[0], 100)
        self.assertEqual(callback.outcomes, [o.name for o in outcomes])

        names = rf.get_names(callback.cases.dtype)
        names = set(names)
        self.assertEqual(names, {'a', 'b', 'c', 'policy', 'model'})
        self.assertEqual(callback.results, {})
Example #3
def _make_box(x):
    '''
    Make a box that encompasses all the data
    
    Parameters
    ----------
    x : structured numpy array
    
    
    '''
    
    box = np.zeros((2, ), x.dtype)
    
    names = recfunctions.get_names(x.dtype)
    
    for name in names:
        dtype = x.dtype.fields.get(name)[0] 
        mask = np.ma.getmaskarray(x[name])
        values = x[name][mask==False]
        
        if dtype == 'object':
            try:
                values = set(values)
                box[name][:] = values
            except TypeError as e:
                ema_logging.warning("{} has unhashable values".format(name))
                raise e
        else:
            box[name][0] = np.min(values, axis=0) 
            box[name][1] = np.max(values, axis=0)    
    return box  
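
A minimal usage sketch for the _make_box helper above, assuming only numpy
and its recfunctions module (which the helper itself relies on); the sample
array and its field names are invented for illustration:

import numpy as np
from numpy.lib import recfunctions

# one float field and one object (categorical) field
data = np.array([(0.1, 'a'), (0.5, 'b'), (0.9, 'a')],
                dtype=[('speed', float), ('category', object)])

box = _make_box(data)
# box['speed'] holds the (min, max) pair (0.1, 0.9), while both entries
# of box['category'] hold the set {'a', 'b'}
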
Example #4
    def test_drop_restriction(self):
        x = np.array([(0, 1, 2), (2, 5, 6), (3, 2, 1)],
                     dtype=[('a', float), ('b', float), ('c', float)])
        y = {'y': np.array([1, 1, 0])}
        results = (x, y)

        prim_obj = prim.setup_prim(results, 'y', threshold=0.8)
        box = PrimBox(prim_obj, prim_obj.box_init, prim_obj.yi)

        new_box_lim = np.array([(0, 1, 1), (2, 2, 6)],
                               dtype=[('a', float), ('b', float),
                                      ('c', float)])
        indices = np.array([0, 1], dtype=int)
        box.update(new_box_lim, indices)

        box.drop_restriction('b')

        correct_box_lims = np.array([(0, 1, 1), (2, 5, 6)],
                                    dtype=[('a', float), ('b', float),
                                           ('c', float)])
        box_lims = box.box_lims[-1]
        names = recfunctions.get_names(correct_box_lims.dtype)
        for entry in names:
            lim_correct = correct_box_lims[entry]
            lim_box = box_lims[entry]
            for i in range(len(lim_correct)):
                self.assertEqual(lim_correct[i], lim_box[i])

        self.assertEqual(box.peeling_trajectory['mean'][2], 1)
        self.assertEqual(box.peeling_trajectory['coverage'][2], 1)
        self.assertEqual(box.peeling_trajectory['density'][2], 1)
        self.assertEqual(box.peeling_trajectory['res dim'][2], 1)
        self.assertEqual(box.peeling_trajectory['mass'][2], 2 / 3)
Example #5
def _prepare_experiments(experiments):
    '''
    transform the experiments structured array into a numpy array.

    Parameters
    ----------
    experiments : structured array
    
    Returns
    -------
    ndarray
    
    '''
    uncs = recfunctions.get_names(experiments.dtype)

    temp_experiments = np.zeros((experiments.shape[0], len(uncs)))
    
    for i, u in enumerate(uncs):
        try: 
            temp_experiments[:, i] = experiments[u].astype(float)
        except ValueError:
            
            data = experiments[u]
            entries = sorted(list(set(data)))
            
            for j, entry in enumerate(entries):
                temp_experiments[data==entry,i] = j
    
    return temp_experiments
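
A hypothetical call showing how _prepare_experiments integer-codes any
column that cannot be cast to float; the array below is made up:

import numpy as np
from numpy.lib import recfunctions

experiments = np.array([(0.2, 'policy a'), (0.7, 'policy b'),
                        (0.4, 'policy a')],
                       dtype=[('a', float), ('policy', object)])

coded = _prepare_experiments(experiments)
# column 0 keeps the float values of 'a'; column 1 codes the sorted
# unique policy names as 0.0 and 1.0
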
Example #6
 def test_store_cases(self):
     nr_experiments = 3
     uncs = [ParameterUncertainty((0,1), "a"),
            ParameterUncertainty((0,1), "b"),
            CategoricalUncertainty([0, 1, 2], "c"),
            ParameterUncertainty((0,1), "d", integer=True),]
     outcomes = [Outcome("test", time=True)]
     case = {unc.name:random.random() for unc in uncs}
     case["c"] = int(round(case["c"]*2))
     case["d"] = int(round(case["d"]))
     policy = {'name':'none'}
     name = "test"
  
     callback = DefaultCallback(uncs, 
                                [outcome.name for outcome in outcomes], 
                                nr_experiments=nr_experiments,
                                reporting_interval=1)
     result = {outcomes[0].name: 1}
     callback(0, case, policy, name, result)
      
     experiments, _ = callback.get_results()
     design = case
     design['policy'] = policy['name']
     design['model'] = name
     
     names = rf.get_names(experiments.dtype)
     for name in names:
         self.assertEqual(experiments[name][0], design[name])
Example #7
 def test_prim_init_select(self):
     self.results = test_utilities.load_flu_data()
     self.classify = flu_classify        
     
     experiments, outcomes = self.results
     
     unc = recfunctions.get_names(experiments.dtype)
     
     # test initialization, including t_coi calculation in case of searching
     # for results equal to or higher than the threshold
     outcomes['death toll'] = outcomes['deceased population region 1'][:, -1]
     results = experiments, outcomes
     threshold = 10000
     prim_obj = prim.setup_prim(results, classify='death toll', 
                          threshold_type=prim.ABOVE, threshold=threshold,
                          incl_unc=unc)
     
     value = np.ones((experiments.shape[0],))
     value = value[outcomes['death toll'] >= threshold].shape[0]
     self.assertTrue(prim_obj.t_coi==value)
             
     # test initialization, including t_coi calculation in case of searching
     # for results equal to or lower  than the threshold
     threshold = 1000
     prim_obj = prim.setup_prim(results, classify='death toll', 
                          threshold_type=prim.BELOW, 
                          threshold=threshold)
     
     value = np.ones((experiments.shape[0],))
     value = value[outcomes['death toll'] <= threshold].shape[0]
     self.assertTrue(prim_obj.t_coi==value)
     
     prim.setup_prim(self.results, self.classify, threshold=prim.ABOVE)
Example #8
def setup_cart(results, classify, incl_unc=[], mass_min=0.05):
    """helper function for performing cart
    
    Parameters
    ----------
    results : tuple of structured array and dict with numpy arrays
              the return from :meth:`perform_experiments`.
    classify : string, function or callable
               either a string denoting the outcome of interest to 
               use or a function. 
    incl_unc : list of strings
    mass_min : float
    
    
    Raises
    ------
    TypeError 
        if classify is not a string or a callable.
    
    """
    
    if not incl_unc:
        x = np.ma.array(results[0])
    else:
        drop_names = set(recfunctions.get_names(results[0].dtype))-set(incl_unc)
        x = recfunctions.drop_fields(results[0], drop_names, asrecarray = True)
    if isinstance(classify, six.string_types):
        y = results[1][classify]
    elif callable(classify):
        y = classify(results[1])
    else:
        raise TypeError("unknown type for classify")
    
    return CART(x, y, mass_min)
Example #9
    def _rotate_subset(self, value, orig_experiments, logical): 
        '''
        rotate a subset
        
        Parameters
        ----------
        value : list of strings
        orig_experiment : numpy structured array
        logical : boolean array
        
        '''
        list_dtypes = [(name, "<f8") for name in value]
        
        #cast everything to float
        drop_names = set(rf.get_names(orig_experiments.dtype)) - set(value)
        orig_subset = rf.drop_fields(orig_experiments, drop_names, 
                                               asrecarray=True)
        subset_experiments = orig_subset.astype(list_dtypes).view('<f8').reshape(orig_experiments.shape[0], len(value))
 
        #normalize the data
        mean = np.mean(subset_experiments,axis=0)
        std = np.std(subset_experiments, axis=0)
        std[std==0] = 1  # in order to avoid a division by zero
        subset_experiments = (subset_experiments - mean)/std
        
        #get the experiments of interest
        experiments_of_interest = subset_experiments[logical]
        
        #determine the rotation
        rotation_matrix =  self._determine_rotation(experiments_of_interest)
        
        #apply the rotation
        subset_experiments = np.dot(subset_experiments,rotation_matrix)
        return rotation_matrix, subset_experiments
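
The _determine_rotation helper called above is not part of this snippet.
In PCA-style preprocessing it would return the eigenvectors of the
covariance matrix of the experiments of interest; the following is a
standalone sketch of that idea, with the function name assumed rather than
taken from the source:

import numpy as np

def determine_rotation_sketch(experiments_of_interest):
    # eigenvectors of the covariance matrix form the rotation matrix;
    # eigh returns them in ascending eigenvalue order, so reverse the
    # columns to put the largest-variance direction first
    covariance = np.cov(experiments_of_interest.T)
    eigen_vals, eigen_vectors = np.linalg.eigh(covariance)
    return eigen_vectors[:, ::-1]
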
Example #10
def _make_box(x):
    '''
    Make a box that encompasses all the data
    
    Parameters
    ----------
    x : structured numpy array
    
    
    '''

    box = np.zeros((2, ), x.dtype)

    names = recfunctions.get_names(x.dtype)

    for name in names:
        dtype = x.dtype.fields.get(name)[0]
        mask = np.ma.getmaskarray(x[name])
        values = x[name][mask == False]

        if dtype == 'object':
            try:
                values = set(values)
            except TypeError as e:
                ema_logging.warning("{} has unhashable values".format(name))
                raise e
            else:
                box[name][:] = values
        else:
            box[name][0] = np.min(values, axis=0)
            box[name][1] = np.max(values, axis=0)
    return box
Example #11
 def test_prim_init_select(self):
     self.results = util.load_flu_data()
     self.classify = flu_classify        
     
     experiments, outcomes = self.results
     
     unc = recfunctions.get_names(experiments.dtype)
     
     # test initialization, including t_coi calculation in case of searching
     # for results equal to or higher than the threshold
     outcomes['death toll'] = outcomes['deceased population region 1'][:, -1]
     results = experiments, outcomes
     threshold = 10000
     prim_obj = prim.setup_prim(results, classify='death toll', 
                          threshold_type=prim.ABOVE, threshold=threshold,
                          incl_unc=unc)
     
     value = np.ones((experiments.shape[0],))
     value = value[outcomes['death toll'] >= threshold].shape[0]
     self.assertTrue(prim_obj.t_coi==value)
             
     # test initialization, including t_coi calculation in case of searching
     # for results equal to or lower  than the threshold
     threshold = 1000
     prim_obj = prim.setup_prim(results, classify='death toll', 
                          threshold_type=prim.BELOW, 
                          threshold=threshold)
     
     value = np.ones((experiments.shape[0],))
     value = value[outcomes['death toll'] <= threshold].shape[0]
     self.assertTrue(prim_obj.t_coi==value)
     
     prim.setup_prim(self.results, self.classify, threshold=prim.ABOVE)
Example #12
def get_univariate_feature_scores(x,y, score_func=F_CLASSIFICATION):
    '''
    
    calculate feature scores using univariate statistical tests. In case of
    categorical data, chi square or the Anova F value is used. In case of 
    continuous data the Anova F value is used. 
    
    Parameters
    ----------
    x : structured array
    y : 1D nd.array
    score_func : {F_CLASSIFICATION, F_REGRESSION, CHI2}
                the score function to use, one of f_regression (regression),
                f_classification, or chi2 (classification).
    Returns
    -------
    list of tuples 
        sorted in descending order of tuples with uncertainty and feature 
        scores (i.e. p values in this case).
    
    
    '''
    uncs = recfunctions.get_names(x.dtype)
    
    x = _prepare_experiments(x)
    
    pvalues = score_func(x, y)[1]
    pvalues = np.asarray(pvalues)

    pvalues = zip(uncs, pvalues)
    pvalues = list(pvalues)
    pvalues.sort(key=itemgetter(1))
    return pvalues
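
A self-contained sketch of calling this version, assuming that
F_CLASSIFICATION resolves to sklearn's f_classif as the docstring suggests;
the data are invented:

import numpy as np

x = np.array([(0.1, 2.0), (0.4, 1.0), (0.8, 3.0), (0.9, 0.5)],
             dtype=[('a', float), ('b', float)])
y = np.array([0, 0, 1, 1])

scores = get_univariate_feature_scores(x, y)
# scores is a list of (uncertainty, p-value) tuples, ordered from the
# lowest (most significant) p-value upwards
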
Example #13
def _prepare_experiments(experiments):
    '''
    transform the experiments structured array into a numpy array.

    Parameters
    ----------
    experiments : structured array
    
    Returns
    -------
    ndarray
    
    '''
    uncs = recfunctions.get_names(experiments.dtype)

    temp_experiments = np.zeros((experiments.shape[0], len(uncs)))
    
    for i, u in enumerate(uncs):
        try: 
            temp_experiments[:, i] = experiments[u].astype(float)
        except ValueError:
            
            data = experiments[u]
            entries = sorted(list(set(data)))
            
            for j, entry in enumerate(entries):
                temp_experiments[data==entry,i] = j
    
    return temp_experiments
Example #14
    def test_store_cases(self):
        nr_experiments = 3
        uncs = [
            ParameterUncertainty((0, 1), "a"),
            ParameterUncertainty((0, 1), "b"),
            CategoricalUncertainty([0, 1, 2], "c"),
            ParameterUncertainty((0, 1), "d", integer=True),
        ]
        outcomes = [Outcome("test", time=True)]
        case = {unc.name: random.random() for unc in uncs}
        case["c"] = int(round(case["c"] * 2))
        case["d"] = int(round(case["d"]))
        policy = {'name': 'none'}
        name = "test"

        callback = DefaultCallback(uncs,
                                   [outcome.name for outcome in outcomes],
                                   nr_experiments=nr_experiments,
                                   reporting_interval=1)
        result = {outcomes[0].name: 1}
        callback(0, case, policy, name, result)

        experiments, _ = callback.get_results()
        design = case
        design['policy'] = policy['name']
        design['model'] = name

        names = rf.get_names(experiments.dtype)
        for name in names:
            self.assertEqual(experiments[name][0], design[name])
Example #15
def plot_cdfs(x, y, ccdf=False):
    '''plot cumulative distribution functions for each column in x, based
    on the classification specified in y.

    Parameters
    ----------
    x : recarray
        the experiments to use in the cdfs
    y : ndarray
        the categorization for the data
    ccdf : bool, optional
           if true, plot a complementary cdf instead of a normal cdf.
           
           
    Returns
    -------
    a matplotlib Figure instance

    '''
    x = rf.drop_fields(x, "scenario_id", asrecarray=True)
    uncs = rf.get_names(x.dtype)
    cp = sns.color_palette()

    n_col = 4
    n_row = math.ceil(len(uncs) / n_col)
    size = 3
    aspect = 1
    figsize = n_col * size * aspect, n_row * size
    fig, axes = plt.subplots(n_row, n_col, figsize=figsize, squeeze=False)

    for i, unc in enumerate(uncs):
        discrete = False

        i_col = i % n_col
        i_row = i // n_col
        ax = axes[i_row, i_col]

        data = x[unc]
        if x.dtype[unc] == np.dtype('O'):
            discrete = True
        plot_individual_cdf(ax, unc, data, y, discrete, ccdf=ccdf)

    # last row might contain empty axis,
    # let's make them disappear
    for j_col in range(i_col + 1, n_col):
        ax = axes[i_row, j_col]
        ax.set_xticklabels([])
        ax.set_xticks([])
        ax.set_yticklabels([])
        ax.set_yticks([])

        sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)

    proxies, labels = build_legend(x, y)

    fig.legend(proxies, labels, "upper center")

    return fig
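
A hypothetical invocation; experiments stands for the recarray returned by
perform_experiments (including the 'scenario_id' field this function drops)
and y for a classification with one entry per experiment:

fig = plot_cdfs(experiments, y)
fig.savefig('cdfs.png', dpi=150, bbox_inches='tight')
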
Example #16
def plot_cdfs(x, y, ccdf=False):
    '''plot cumulative distribution functions for each column in x, based
    on the classification specified in y.
    
    Parameters
    ----------
    x : recarray
        the experiments to use in the cdfs
    y : ndarray
        the categorization for the data
    ccdf : bool, optional
           if true, plot a complementary cdf 
           instead of a normal cdf.
    
    '''
    
    uncs = rf.get_names(x.dtype)
    cp = sns.color_palette()
    
    n_col = 4
    n_row = len(uncs)//n_col +1
    size = 3 
    aspect = 1
    figsize = n_col * size * aspect, n_row * size
    fig, axes = plt.subplots(n_row, n_col,
                             figsize=figsize,
                             squeeze=False)

    for i, unc in enumerate(uncs):
        discrete = False
        
        i_col = i % n_col
        i_row = i // n_col
        ax = axes[i_row, i_col]
        
        data = x[unc]
        if x.dtype[unc] == np.dtype('O'):
            discrete = True
        plot_cdf(ax, unc, data, y, discrete, ccdf=ccdf)
    
    # last row might contain empty axis, 
    # let's make them disappear
    i_row = len(uncs) // n_col
    i_col = len(uncs) % n_col
    for i_col in range(i_col, n_col):
        ax = axes[i_row, i_col]
        ax.set_xticklabels([])
        ax.set_xticks([])
        ax.set_yticklabels([])
        ax.set_yticks([])
        
        sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
    
    proxies, labels = build_legend(x, y)
    
    fig.legend(proxies, labels, "upper center")

    return fig
Example #17
def _in_box(x, boxlim):
    '''
     
    returns the indices of the data points that are within the 
    box_lims.
    
    Parameters
    ----------
    x : numpy structured array
    boxlim : numpy structured array
    
    
    Returns
    -------
    ndarray
        valid numpy indices on x
    
    '''
    logical = np.ones(x.shape[0], dtype=bool)
    
    dims = recfunctions.get_names(boxlim.dtype)

    for name in dims:
        value = x.dtype.fields.get(name)[0]
        
        if value == 'object':
            entries = boxlim[name][0]
            l = np.ones((x.shape[0], len(entries)), dtype=bool)
            for i,entry in enumerate(entries):
                if type(list(entries)[0]) not in (str, float, int):
                    bools = []                
                    for element in list(x[name]):
                        if element == entry:
                            bools.append(True)
                        else:
                            bools.append(False)
                    l[:, i] = np.asarray(bools, dtype=bool)
                else:
                    l[:, i] = x[name] == entry
            l = np.any(l, axis=1)
            logical = logical & l
        else:
            logical = logical & (boxlim[name][0] <= x[name] )&\
                                        (x[name] <= boxlim[name][1])                
    
    indices = np.where(logical==True)
    
    assert len(indices)==1
    indices = indices[0]
    
    return indices
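
A minimal sketch of _in_box on invented data; tightening a limit drops the
rows that fall outside it:

import numpy as np

x = np.array([(0.1,), (0.5,), (0.9,)], dtype=[('a', float)])
boxlim = np.array([(0.2,), (1.0,)], dtype=[('a', float)])

indices = _in_box(x, boxlim)
# indices == array([1, 2]); only rows with 0.2 <= a <= 1.0 remain
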
Example #18
def _in_box(x, boxlim):
    '''
     
    returns the indices of the data points that are within the 
    box_lims.
    
    Parameters
    ----------
    x : numpy structured array
    boxlim : numpy structured array
    
    
    Returns
    -------
    ndarray
        valid numpy indices on x
    
    '''
    logical = np.ones(x.shape[0], dtype=bool)

    dims = recfunctions.get_names(boxlim.dtype)

    for name in dims:
        value = x.dtype.fields.get(name)[0]

        if value == 'object':
            entries = boxlim[name][0]
            l = np.ones((x.shape[0], len(entries)), dtype=bool)
            for i, entry in enumerate(entries):
                if type(list(entries)[0]) not in (str, float, int):
                    bools = []
                    for element in list(x[name]):
                        if element == entry:
                            bools.append(True)
                        else:
                            bools.append(False)
                    l[:, i] = np.asarray(bools, dtype=bool)
                else:
                    l[:, i] = x[name] == entry
            l = np.any(l, axis=1)
            logical = logical & l
        else:
            logical = logical & (boxlim[name][0] <= x[name] )&\
                                        (x[name] <= boxlim[name][1])

    indices = np.where(logical == True)

    assert len(indices) == 1
    indices = indices[0]

    return indices
Example #19
def get_univariate_feature_scores(results,
                                  classify,
                                  score_func='f_classification'):
    '''
    
    calculate feature scores using univariate statistical tests. In case of
    categorical data, chi square or the Anova F value is used. In case of 
    continuous data the Anova F value is used. 
    
    Parameters
    ----------
    results : tuple
    classify : str
    score_func : {'f_classification', 'chi2', 'f_regression'}
                the score function to use, one of f_regression (regression),
                f_classification, or chi2 (classification).
    Returns
    -------
    list of tuples 
        sorted in descending order of tuples with uncertainty and feature 
        scores (i.e. p values in this case).
    
    
    '''

    score_funcs = {
        'f_regression': f_regression,
        'f_classification': f_classif,
        'chi2': chi2
    }

    experiments, outcomes = results
    uncs = recfunctions.get_names(experiments.dtype)

    x = _prepare_experiments(experiments)
    y, categorical = _prepare_outcomes(outcomes, classify)

    if categorical:
        score_func = score_funcs[score_func]
    else:
        score_func = f_regression

    pvalues = score_func(x, y)[1]
    pvalues = np.asarray(pvalues)

    pvalues = zip(uncs, pvalues)
    pvalues = list(pvalues)
    pvalues.sort(key=itemgetter(1))
    return pvalues
Example #20
 def determine_restricted_dims(self, box_lims):
     '''
     
     determine which dimensions of the given box are restricted compared
     to the initial box that contains all the data
     
     :param box_lims: 
     
     '''
 
     logical = self.compare(self.box_init, box_lims)
     u = np.asarray(recfunctions.get_names(box_lims.dtype), 
                    dtype=object)
     dims = u[logical==False]
     return dims
Example #21
 def test_init(self):
     # let's add some uncertainties to this
     uncs = [ParameterUncertainty((0,1), "a"),
            ParameterUncertainty((0,1), "b")]
     outcomes = [Outcome("test", time=True)]
     callback = DefaultCallback(uncs, outcomes, nr_experiments=100)
     
     self.assertEqual(callback.i, 0)
     self.assertEqual(callback.nr_experiments, 100)
     self.assertEqual(callback.cases.shape[0], 100)
     self.assertEqual(callback.outcomes, outcomes)
     
     names = rf.get_names(callback.cases.dtype)
     names = set(names)
     self.assertEqual(names, {'a', 'b', 'policy', 'model'})
     self.assertEqual(callback.results, {})
Example #22
 def test_init(self):
     # let's add some uncertainties to this
     uncs = [ParameterUncertainty((0,1), "a"),
            ParameterUncertainty((0,1), "b")]
     outcomes = [Outcome("test", time=True)]
     callback = DefaultCallback(uncs, outcomes, nr_experiments=100)
     
     self.assertEqual(callback.i, 0)
     self.assertEqual(callback.nr_experiments, 100)
     self.assertEqual(callback.cases.shape[0], 100)
     self.assertEqual(callback.outcomes, outcomes)
     
     names = rf.get_names(callback.cases.dtype)
     names = set(names)
     self.assertEqual(names, {'a', 'b', 'policy', 'model'})
     self.assertEqual(callback.results, {})
Example #23
def setup_prim(results, classify, threshold, incl_unc=[], **kwargs):
    """Helper function for setting up the prim algorithm
    
    Parameters
    ----------
    results : tuple
              tuple of structured array and dict with numpy arrays
              the return from :meth:`perform_experiments`.
    classify : str or callable
               either a string denoting the outcome of interest to 
               use or a function. 
    threshold : double
                the minimum score on the objective function of the last box
                on the peeling trajectory. In case of a binary classification,
                this should be between 0 and 1. 
    incl_unc : list of str, optional
               list of uncertainties to include in prim analysis
    kwargs : dict
             valid keyword arguments for prim.Prim
    
    Returns
    -------
    a Prim instance
    
    Raises
    ------
    PrimException 
        if data resulting from classify is not a 1-d array. 
    TypeError 
        if classify is not a string or a callable.
    
    """

    if not incl_unc:
        x = np.ma.array(results[0])
    else:
        drop_names = set(rf.get_names(results[0].dtype)) - set(incl_unc)
        x = rf.drop_fields(results[0], drop_names, asrecarray=True)
    if isinstance(classify, six.string_types):
        y = results[1][classify]
    elif callable(classify):
        y = classify(results[1])
    else:
        raise TypeError("unknown type for classify")

    return Prim(x, y, threshold=threshold, **kwargs)
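
A hedged usage sketch; results is the (experiments, outcomes) tuple from
perform_experiments, the outcome name is hypothetical, and the find_box
call mirrors how the workbench's PRIM examples usually proceed:

prim_obj = setup_prim(results, classify='death toll', threshold=0.8)
box = prim_obj.find_box()
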
Example #24
def _determine_restricted_dims(box_lims, box_init):
    '''
    
    determine which dimensions of the given box are restricted compared
    to the initial box that contains all the data
    
    Parameters
    ----------
    box_lims : structured numpy array
               a specific box limit
    box_init : structured numpy array
               the initial box containing all data points
    
    '''

    logical = _compare(box_init, box_lims)
    u = np.asarray(recfunctions.get_names(box_lims.dtype), dtype=object)
    dims = u[logical == False]
    return dims
Example #25
    def test_drop_restriction(self):
        x = np.array([(0,1,2),
                      (2,5,6),
                      (3,2,1)], 
                     dtype=[('a', float),
                            ('b', float),
                            ('c', float)])
        y = np.array([1,1,0])
         
        prim_obj = Prim(x, y, threshold=0.8)
        box = PrimBox(prim_obj, prim_obj._box_init, prim_obj.yi)
 
        new_box_lim = np.array([(0,1,1),
                                (2,2,6)], 
                                dtype=[('a', float),
                                       ('b', float),
                                       ('c', float)])
        indices = np.array([0,1], dtype=int)
        box.update(new_box_lim, indices)
         
        box.drop_restriction('b')
         
        correct_box_lims = np.array([(0,1,1),
                                     (2,5,6)], 
                                    dtype=[('a', float),
                                           ('b', float),
                                           ('c', float)])
               
        box_lims = box._box_lims[-1]
        names = recfunctions.get_names(correct_box_lims.dtype)
        
        for entry in names:
            lim_correct = correct_box_lims[entry]
            lim_box = box_lims[entry]
            for i in range(len(lim_correct)):
                self.assertEqual(lim_correct[i], lim_box[i])
         
        self.assertEqual(box.peeling_trajectory['mean'][2], 1)
        self.assertEqual(box.peeling_trajectory['coverage'][2], 1)
        self.assertEqual(box.peeling_trajectory['density'][2], 1)
        self.assertEqual(box.peeling_trajectory['res dim'][2], 1)
        self.assertEqual(box.peeling_trajectory['mass'][2], 2/3)
Example #26
def _determine_restricted_dims(box_lims, box_init):
    '''
    
    determine which dimensions of the given box are restricted compared
    to the initial box that contains all the data
    
    Parameters
    ----------
    box_lims : structured numpy array
               a specific box limit
    box_init : structured numpy array
               the initial box containing all data points
    
    '''

    logical = _compare(box_init, box_lims)
    u = np.asarray(recfunctions.get_names(box_lims.dtype), 
                   dtype=object)
    dims = u[logical==False]
    return dims
Example #27
def make_box(x):
    '''
    Make a box that encompasses all the data
    
    Parameters
    ----------
    x : structured numpy array
    '''
    # get the types in the order they appear in the numpy array
    types = [(v[1], k, v[0].name) for k, v in six.iteritems(x.dtype.fields)]
    types = sorted(types)

    # convert any bool types to object to store set(False, True)
    ntypes = [(k, 'object' if t == 'bool' else t) for (_, k, t) in types]

    # create box limits
    box = np.zeros((2, ), ntypes)
    names = recfunctions.get_names(x.dtype)

    for name in names:
        dtype = box.dtype.fields.get(name)[0]
        values = x[name]

        if isinstance(values, np.ma.MaskedArray):
            values = values.compressed()

        if dtype == 'object':
            try:
                values = set(values)
                box[name][:] = values
            except TypeError as e:
                logging.getLogger(__name__).warning(
                    "{} has unhashable values".format(name))
                raise e
        else:
            box[name][0] = np.min(values, axis=0)
            box[name][1] = np.max(values, axis=0)

    return box
Example #28
def setup_prim(results, classify, incl_unc=[], **kwargs):
    """Helper function for setting up the prim algorithm
    
    Parameters
    ----------
    results : tuple of structured array and dict with numpy arrays
              the return from :meth:`perform_experiments`.
    classify : string, function or callable
               either a string denoting the outcome of interest to 
               use or a function. 
    kwargs : valid keyword arguments for prim.Prim
    
    Returns
    -------
    a Prim instance
    
    Raises
    ------
    PrimException 
        if data resulting from classify is not a 1-d array. 
    TypeError 
        if classify is not a string or a callable.
    
    """
    
    if not incl_unc:
        x = np.ma.array(results[0])
    else:
        drop_names = set(rf.get_names(results[0].dtype))-set(incl_unc)
        x = rf.drop_fields(results[0], drop_names, asrecarray = True)
    if isinstance(classify, six.string_types):
        y = results[1][classify]
    elif callable(classify):
        y = classify(results[1])
    else:
        raise TypeError("unknown type for classify")
    
    return Prim(x,y, **kwargs)
Example #29
def setup_cart(results, classify, incl_unc=[], mass_min=0.05):
    """helper function for performing cart in combination with data
    generated by the workbench. 

    Parameters
    ----------
    results : tuple of structured array and dict with numpy arrays
              the return from :meth:`perform_experiments`.
    classify : string, function or callable
               either a string denoting the outcome of interest to 
               use or a function. 
    incl_unc : list of strings
    mass_min : float


    Raises
    ------
    TypeError 
        if classify is not a string or a callable.

    """

    if not incl_unc:
        x = np.ma.array(results[0])
    else:
        drop_names = set(recfunctions.get_names(
            results[0].dtype)) - set(incl_unc)
        x = recfunctions.drop_fields(results[0], drop_names, asrecarray=True)
    if isinstance(classify, six.string_types):
        y = results[1][classify]
        mode = sdutil.REGRESSION
    elif callable(classify):
        y = classify(results[1])
        mode = sdutil.BINARY
    else:
        raise TypeError("unknown type for classify")

    return CART(x, y, mass_min, mode=mode)
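
For context, a sketch of how this helper might be driven; results comes
from perform_experiments, flu_classify stands in for any callable that
returns a binary vector (selecting sdutil.BINARY mode, whereas an outcome
name would select regression mode), and the build_tree call follows the
workbench's CART examples:

cart_alg = setup_cart(results, flu_classify, mass_min=0.05)
cart_alg.build_tree()
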
Example #30
def setup_cart(results, classify, incl_unc=[], mass_min=0.05):
    """helper function for performing cart in combination with data
    generated by the workbench. 

    Parameters
    ----------
    results : tuple of structured array and dict with numpy arrays
              the return from :meth:`perform_experiments`.
    classify : string, function or callable
               either a string denoting the outcome of interest to 
               use or a function. 
    incl_unc : list of strings
    mass_min : float


    Raises
    ------
    TypeError 
        if classify is not a string or a callable.

    """

    if not incl_unc:
        x = np.ma.array(results[0])
    else:
        drop_names = set(recfunctions.get_names(
            results[0].dtype))-set(incl_unc)
        x = recfunctions.drop_fields(results[0], drop_names, asrecarray=True)
    if isinstance(classify, six.string_types):
        y = results[1][classify]
        mode = sdutil.REGRESSION
    elif callable(classify):
        y = classify(results[1])
        mode = sdutil.BINARY
    else:
        raise TypeError("unknown type for classify")

    return CART(x, y, mass_min, mode=mode)
Example #31
def make_box(x):
    '''
    Make a box that encompasses all the data
    
    Parameters
    ----------
    x : structured numpy array
    '''
    # get the types in the order they appear in the numpy array
    types = [(v[1], k, v[0].name) for k, v in six.iteritems(x.dtype.fields)]
    types = sorted(types)
    
    # convert any bool types to object to store set(False, True)
    ntypes = [(k, 'object' if t == 'bool' else t) for (_, k, t) in types]
    
    # create box limits
    box = np.zeros((2, ), ntypes)
    names = recfunctions.get_names(x.dtype)
    
    for name in names:
        dtype = box.dtype.fields.get(name)[0]
        values = x[name]
        
        if isinstance(values, np.ma.MaskedArray):
            values = values.compressed()
        
        if dtype == 'object':
            try:
                values = set(values)
                box[name][:] = values
            except TypeError as e:
                logging.getLogger(__name__).warning("{} has unhashable values".format(name))
                raise e
        else:
            box[name][0] = np.min(values, axis=0)
            box[name][1] = np.max(values, axis=0)
               
    return box  
Example #32
def get_univariate_feature_scores(x,y, score_func=F_CLASSIFICATION):
    '''
    
    calculate feature scores using univariate statistical tests. In case of
    categorical data, chi square or the Anova F value is used. In case of 
    continuous data the Anova F value is used. 
    
    Parameters
    ----------
    x : structured array
    y : 1D nd.array
    score_func : {F_CLASSIFICATION, F_REGRESSION, CHI2}
                the score function to use, one of f_regression (regression),
                f_classification, or chi2 (classification).
    Returns
    -------
    pandas DataFrame
        sorted in descending order of tuples with uncertainty and feature 
        scores (i.e. p values in this case).
    
    
    '''
    uncs = recfunctions.get_names(x.dtype)
    
    x = _prepare_experiments(x)
    
    pvalues = score_func(x, y)[1]
    pvalues = np.asarray(pvalues)

    pvalues = zip(uncs, pvalues)
    pvalues = list(pvalues)
    pvalues.sort(key=itemgetter(1))
    
    pvalues = pd.DataFrame(pvalues)
    
    return pvalues
Example #33
    def _rotate_subset(self, value, orig_experiments, logical):
        '''
        rotate a subset
        
        Parameters
        ----------
        value : list of str
        orig_experiment : numpy structured array
        logical : boolean array
        
        '''
        list_dtypes = [(name, "<f8") for name in value]

        #cast everything to float
        drop_names = set(rf.get_names(orig_experiments.dtype)) - set(value)
        orig_subset = rf.drop_fields(orig_experiments,
                                     drop_names,
                                     asrecarray=True)
        subset_experiments = orig_subset.astype(list_dtypes).view(
            '<f8').reshape(orig_experiments.shape[0], len(value))

        #normalize the data
        mean = np.mean(subset_experiments, axis=0)
        std = np.std(subset_experiments, axis=0)
        std[std == 0] = 1  # in order to avoid a division by zero
        subset_experiments = (subset_experiments - mean) / std

        #get the experiments of interest
        experiments_of_interest = subset_experiments[logical]

        #determine the rotation
        rotation_matrix = self._determine_rotation(experiments_of_interest)

        #apply the rotation
        subset_experiments = np.dot(subset_experiments, rotation_matrix)
        return rotation_matrix, subset_experiments
Example #34
    def __init__(self,
                 x,
                 y,
                 threshold=None,
                 threshold_type=">",
                 include=None,
                 exclude=None,
                 **kwargs):
        """Generates a decision tree for classification.
        
        Parameters
        ----------
        x : a matrix-like object (pandas.DataFrame, numpy.recarray, etc.)
            the independent variables
        y : a list-like object, the column name (str), or callable
            the dependent variable either provided as a list-like object
            classifying the data into cases of interest (e.g., False/True),
            a list-like object storing the raw variable value (in which case
            a threshold must be given), a string identifying the dependent
            variable in x, or a function called on each row of x to compute the
            dependent variable
        threshold : float
            threshold for identifying cases of interest
        threshold_type : str
            comparison operator used when identifying cases of interest
        include : list of str
            the names of variables included in the PRIM analysis
        exclude : list of str
            the names of variables excluded from the PRIM analysis
        """
        super(Cart, self).__init__()

        # Ensure the input x is a numpy matrix/array
        if isinstance(x, pd.DataFrame):
            x = x.to_records(index=False)
        elif isinstance(x, np.ma.MaskedArray):
            pass
        else:
            x = pd.DataFrame(x).to_records(index=False)

        # if y is a string or function, compute the actual response value
        # otherwise, ensure y is a numpy matrix/array
        if isinstance(y, six.string_types):
            key = y
            y = x[key]

            if exclude:
                exclude = list(exclude) + [key]
            else:
                exclude = [key]
        elif six.callable(y):
            fun = y
            y = np.apply_along_axis(fun, 0, x)
        elif isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
            y = y.values
        elif isinstance(y, np.ma.MaskedArray):
            pass
        else:
            y = np.asarray(y)

        # convert include/exclude arguments to lists if they are strings
        if include and isinstance(include, six.string_types):
            include = [include]

        if exclude and isinstance(exclude, six.string_types):
            exclude = [exclude]

        # include or exclude columns from the analysis
        if include:
            if isinstance(include, six.string_types):
                include = [include]

            drop_names = set(rf.get_names(x.dtype)) - set(include)
            x = rf.drop_fields(x, drop_names, asrecarray=True)

        if exclude:
            if isinstance(exclude, six.string_types):
                exclude = [exclude]

            drop_names = set(exclude)
            x = rf.drop_fields(x, drop_names, asrecarray=True)

        # apply the threshold, if one was given
        if threshold:
            if six.callable(threshold):
                y = np.apply_along_axis(threshold, 0, y)
            else:
                # The syntax for threshold_type is "x <op> <threshold>", e.g.,
                # "x > 0.5".  However, partial only supports positional
                # arguments for built-in operators.  Thus, we must assign the
                # threshold to the first position and use a different operator.
                # For example, "x > 0.5" must be evaluated as "0.5 < x".
                OPERATORS = {
                    "<=": operator.ge,
                    ">=": operator.le,
                    "<": operator.gt,
                    ">": operator.lt,
                    "=": operator.eq
                }

                op = OPERATORS[threshold_type]
                y = np.apply_along_axis(functools.partial(op, threshold), 0, y)

        # validate inputs
        if len(y.shape) > 1:
            raise ValueError("y is not a 1-d array")

        # extract feature names
        feature_names = rf.get_names(x.dtype)

        # ensure x is formatted as a 2D matrix
        x = x.view("<f8").reshape(x.shape + (-1, ))

        clf = tree.DecisionTreeClassifier(**kwargs)
        clf = clf.fit(x, y)

        # add our custom metadata to the classifier
        self._feature_names = feature_names
        self._x = x
        self._y = y
        self._clf = clf
Example #35
    def __init__(self, 
                 x,
                 y, 
                 threshold = None, 
                 threshold_type = ">",
                 obj_func = lenient1, 
                 peel_alpha = 0.05, 
                 paste_alpha = 0.05,
                 mass_min = 0.05, 
                 include = None,
                 exclude = None,
                 coi = None):
        """Creates a new PRIM object.
        
        The PRIM object maintains the current state of the PRIM algorithm,
        recording the PRIM boxes found thus far, the remaining (uncaptured)
        cases of interest in the dataset, and provides methods for finding the
        next PRIM box and viewing statistics.
        
        Parameters
        ----------
        x : a matrix-like object (pandas.DataFrame, numpy.recarray, etc.)
            the independent variables
        y : a list-like object, the column name (str), or callable
            the dependent variable either provided as a list-like object
            classifying the data into cases of interest (e.g., False/True),
            a list-like object storing the raw variable value (in which case
            a threshold must be given), a string identifying the dependent
            variable in x, or a function called on each row of x to compute the
            dependent variable
        threshold : float
            threshold for identifying cases of interest
        threshold_type : str
            comparison operator used when identifying cases of interest
        obj_func : callable (default: lenient1)
            a function that computes the objective function (peeling criteria)
        peel_alpha : float (default: 0.05) 
            parameter controlling the peeling stage
        paste_alpha : float (default: 0.05)
            parameter controlling the pasting stage
        mass_min : float (default: 0.05)
            minimum mass of a box
        include : list of str
            the names of variables included in the PRIM analysis
        exclude : list of str
            the names of variables excluded from the PRIM analysis
        coi : str or list of str
            if y contains strings, coi identifies which string is the case of
            interest
        """
        
        # Ensure the input x is a numpy matrix/array
        if isinstance(x, pd.DataFrame):
            x = x.to_records(index=False)
        elif isinstance(x, np.ma.MaskedArray):
            pass
        else:
            x = pd.DataFrame(x).to_records(index=False)
            
        # if y is a string or function, compute the actual response value
        # otherwise, ensure y is a numpy matrix/array
        if isinstance(y, six.string_types):
            key = y
            y = x[key]
            
            if exclude:
                exclude = list(exclude) + [key]
            else:
                exclude = [key]
        elif six.callable(y):
            fun = y
            y = np.apply_along_axis(fun, 0, x)
        elif isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
            y = y.values
        elif isinstance(y, np.ma.MaskedArray):
            pass
        else:
            y = np.asarray(y)
            
        # convert include/exclude arguments to lists if they are strings
        if include and isinstance(include, six.string_types):
            include = [include]
            
        if exclude and isinstance(exclude, six.string_types):
            exclude = [exclude]     
            
        # include or exclude columns from the analysis
        if include:
            if isinstance(include, six.string_types):
                include = [include]

            drop_names = set(rf.get_names(x.dtype))-set(include)
            x = rf.drop_fields(x, drop_names, asrecarray=True)
        
        if exclude:
            if isinstance(exclude, six.string_types):
                exclude = [exclude]

            drop_names = set(exclude) 
            x = rf.drop_fields(x, drop_names, asrecarray=True)
            
        # apply the threshold, if one was given
        if threshold:
            if six.callable(threshold):
                y = np.apply_along_axis(threshold, 0, y)
            else:
                # The syntax for threshold_type is "x <op> <threshold>", e.g.,
                # "x > 0.5".  However, partial only supports positional
                # arguments for built-in operators.  Thus, we must assign the
                # threshold to the first position and use a different operator.
                # For example, "x > 0.5" must be evaluated as "0.5 < x".
                OPERATORS = {"<=" : operator.ge,
                             ">=" : operator.le,
                             "<" : operator.gt,
                             ">" : operator.lt,
                             "=" : operator.eq}
                
                op = OPERATORS[threshold_type]
                y = np.apply_along_axis(functools.partial(op, threshold), 0, y)
                
        # validate inputs
        if len(y.shape) > 1:
            raise PrimError("y is not a 1-d array")
        
        unique_y = np.unique(y)
        
        if unique_y.shape[0] > 2:
            raise PrimError("y must contain only two values (0/1 or False/True)")
        
        if ((unique_y.shape[0] == 2 and (False not in unique_y or True not in unique_y)) or
                (False not in unique_y and True not in unique_y)):
            if coi is None:
                raise PrimError("y must contain only two values (0/1 or False/True)")
            else:
                if not hasattr(coi, "__iter__") and not isinstance(coi, six.string_types):
                    coi = [coi]
                y = np.asarray([1 if yi in coi else 0 for yi in y])
            
        # store the parameters       
        self.x = x
        self.y = y
        self.paste_alpha = paste_alpha
        self.peel_alpha = peel_alpha
        self.mass_min = mass_min
        self.threshold = threshold 
        self.threshold_type = threshold_type
        self.obj_func = obj_func
       
        # set the indices
        self.yi = np.arange(0, self.y.shape[0])
       
        # how many data points do we have
        self.n = self.y.shape[0]
        
        # how many cases of interest do we have?
        self.t_coi = self.determine_coi(self.yi)
        
        # initial box that contains all data
        self._box_init = make_box(self.x)
    
        # make a list in which the identified boxes can be put
        self._boxes = []
        
        # set yi_remaining to all y values
        self._update_yi_remaining()
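
A usage sketch for this constructor, with a made-up DataFrame; the
find_box and show_tradeoff calls follow the conventions of the standalone
PRIM package this snippet appears to come from:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(1000, 3), columns=['x1', 'x2', 'x3'])
response = df['x1'] * df['x2'] + 0.3 * df['x3']

p = Prim(df, response, threshold=0.5, threshold_type='>')
box = p.find_box()
box.show_tradeoff()
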
Example #36
 def test_store_cases(self):
     nr_experiments = 3
     uncs = [RealParameter("a", 0, 1),
             RealParameter("b", 0, 1),
             CategoricalParameter('c', [0, 1, 2]),
             IntegerParameter("d", 0, 1)]
     outcomes = [TimeSeriesOutcome("test")]
     constraints = []
     case = {unc.name:random.random() for unc in uncs}
     case["c"] = int(round(case["c"]*2))
     case["d"] = int(round(case["d"]))
     
     model = NamedObject('test')
     policy  = Policy('policy')
     scenario = Scenario(**case)
     experiment = Case(0, model.name, policy, scenario, 0)
  
     callback = DefaultCallback(uncs, [],outcomes, constraints,
                                nr_experiments=nr_experiments,
                                reporting_interval=1)
     model_outcomes = {outcomes[0].name: 1}
     model_constraints = {}
     callback(experiment, model_outcomes, model_constraints)
      
     experiments, _ = callback.get_results()
     design = case
     design['policy'] = policy.name
     design['model'] = model.name
     design['scenario_id'] = scenario.name
     
     names = rf.get_names(experiments.dtype)
     for name in names:
         entry_a = experiments[name][0]
         entry_b = design[name]
         
         self.assertEqual(entry_a, entry_b, "failed for "+name)
          
     # with levers
     nr_experiments = 3
     uncs = [RealParameter("a", 0, 1),
             RealParameter("b", 0, 1)]
     levers = [RealParameter("c", 0, 1),
               RealParameter("d", 0, 1)]
     outcomes = [TimeSeriesOutcome("test")]
     case = {unc.name:random.random() for unc in uncs}
     
     model = NamedObject('test')
     policy  = Policy('policy', c=1, d=1)
     scenario = Scenario(**case)
     experiment = Case(0, model.name, policy, scenario, 0)
  
     callback = DefaultCallback(uncs, levers,outcomes,constraints, 
                                nr_experiments=nr_experiments,
                                reporting_interval=1)
     model_outcomes = {outcomes[0].name: 1}
     model_constraints = {}
     callback(experiment, model_outcomes, model_constraints)
      
     experiments, _ = callback.get_results()
     design = case
     design['c'] = 1
     design['d'] = 1
     design['policy'] = policy.name
     design['model'] = model.name
     design['scenario_id'] = scenario.name
     
     names = rf.get_names(experiments.dtype)
     
     for name in names:
         self.assertEqual(experiments[name][0], design[name])
Example #37
def get_ex_feature_scores(x, y, mode=CLASSIFICATION, nr_trees=250, 
                          max_features='auto', max_depth=None, 
                          min_samples_split=2, min_samples_leaf=1, 
                          min_weight_fraction_leaf=0, max_leaf_nodes=None,
                          bootstrap=True, oob_score=True, random_state=None): 
    '''
    Get feature scores using extra trees

    Parameters
    ----------
    x : structured array
    y : 1D nd.array
    mode : {CLASSIFICATION, REGRESSION}
    nr_trees : int, optional
               nr. of trees in forest (default=250)
    max_features : int, optional
                   see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    max_depth : int, optional 
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    min_samples_split : int, optional
                  see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    min_samples_leaf : int, optional
                       see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    min_weight_fraction_leaf : float, optional
                               see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    max_leaf_nodes: int or None, optional
                    see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    bootstrap : bool, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    oob_score : bool, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    random_state : int, optional
                   see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    
    Returns
    -------
    pandas DataFrame
        sorted in descending order of tuples with uncertainty and feature 
        scores 
    object
        either ExtraTreesClassifier or ExtraTreesRegressor
    
    '''
    
    uncs = recfunctions.get_names(x.dtype)
    x = _prepare_experiments(x)
    
    if mode==CLASSIFICATION:
        etc = ExtraTreesClassifier
        criterion='gini'
    elif mode==REGRESSION:
        etc = ExtraTreesRegressor
        criterion = 'mse'
    else:
        raise ValueError('{} not valid for mode'.format(mode))
    
    extra_trees = etc(n_estimators=nr_trees, 
                      criterion=criterion, 
                      max_features=max_features, 
                      max_depth=max_depth,
                      min_samples_split=min_samples_split,
                      min_samples_leaf=min_samples_leaf,
                      min_weight_fraction_leaf=min_weight_fraction_leaf,
                      max_leaf_nodes=max_leaf_nodes,
                      bootstrap=bootstrap,
                      oob_score=oob_score,
                      random_state=random_state)
    extra_trees.fit(x,y)

    importances = extra_trees.feature_importances_

    importances = zip(uncs, importances)
    importances = list(importances)
    importances.sort(key=itemgetter(1), reverse=True)

    importances = pd.DataFrame(importances)

    return importances, extra_trees
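
A sketch of driving the extra-trees scorer; x is any structured array of
experiments and y a matching 1-D classification vector:

scores, extra_trees = get_ex_feature_scores(x, y, mode=CLASSIFICATION,
                                            nr_trees=100)
# scores is a two-column DataFrame of (uncertainty, importance) rows,
# most important first; extra_trees is the fitted sklearn estimator
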
Example #38
def get_lasso_feature_scores(x, y, mode=CLASSIFICATION, scaling=0.5, 
                             sample_fraction=0.75, n_resampling=200,
                             random_state=None):
    '''
    Calculate features scores using a randomized lasso (regression) or 
    randomized logistic regression (classification). This is also known as 
    stability selection.
    
    see http://scikit-learn.org/stable/modules/feature_selection.html for 
    details. 
    
    Parameters
    ----------   
    x : structured array
    y : 1D nd.array
    mode : {CLASSIFICATION, REGRESSION}
    scaling : float, optional
              scaling parameter, should be between 0 and 1
    sample_fraction : float, optional
                      the fraction of samples to used in each randomized 
                      dataset
    n_resampling : int, optional
                  the number of times the model is trained on a random subset 
                  of the data
    random_state : int, optional
                   if it is an int, it specifies the seed to use, defaults to 
                   None.
                         
    Returns
    -------
    pandas DataFrame
        sorted in descending order of tuples with uncertainty and feature 
        scores         
         
    '''
    
    uncs = recfunctions.get_names(x.dtype)
    
    x = _prepare_experiments(x)
    
    if mode == CLASSIFICATION:
        lfs = RandomizedLogisticRegression(scaling=scaling, 
                                           sample_fraction=sample_fraction,
                                           n_resampling=n_resampling, 
                                           random_state=random_state)
        lfs.fit(x, y)
    elif mode == REGRESSION:
        # we use LassoLarsCV to determine alpha, see
        # http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html
        lars_cv = LassoLarsCV(cv=6).fit(x, y)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

        # fit the randomized lasso
        lfs = RandomizedLasso(alpha=alphas, scaling=scaling, 
                              sample_fraction=sample_fraction,
                              n_resampling=n_resampling,
                              random_state=random_state)
        lfs.fit(x, y)
    else:
        raise ValueError('{} is not a valid value for mode'.format(mode))

    importances = lfs.scores_
    importances = sorted(zip(uncs, importances),
                         key=itemgetter(1), reverse=True)
    importances = pd.DataFrame(importances)

    return importances
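Note that RandomizedLogisticRegression and RandomizedLasso were deprecated in scikit-learn 0.19 and removed in 0.21, so this example only runs against older scikit-learn releases. A minimal usage sketch under the same placeholder inputs as before:

# stability selection on a 0/1 classification of the outcomes
scores = get_lasso_feature_scores(experiments, y, mode=CLASSIFICATION,
                                  sample_fraction=0.75, n_resampling=200,
                                  random_state=42)

# each row pairs an uncertainty with the fraction of resamples in which
# the randomized model selected it
print(scores)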
Example #39
    def test_store_cases(self):
        nr_experiments = 3
        uncs = [
            RealParameter("a", 0, 1),
            RealParameter("b", 0, 1),
            CategoricalParameter('c', [0, 1, 2]),
            IntegerParameter("d", 0, 1)
        ]
        outcomes = [TimeSeriesOutcome("test")]
        case = {unc.name: random.random() for unc in uncs}
        case["c"] = int(round(case["c"] * 2))
        case["d"] = int(round(case["d"]))

        model = NamedObject('test')
        policy = Policy('policy')
        scenario = Scenario(**case)
        experiment = Experiment(0, model, policy, scenario, 0)

        callback = DefaultCallback(uncs, [],
                                   outcomes,
                                   nr_experiments=nr_experiments,
                                   reporting_interval=1)
        result = {outcomes[0].name: 1}
        callback(experiment, result)

        experiments, _ = callback.get_results()
        design = case
        design['policy'] = policy.name
        design['model'] = model.name

        names = rf.get_names(experiments.dtype)
        for name in names:
            self.assertEqual(experiments[name][0], design[name])

        # with levers
        nr_experiments = 3
        uncs = [RealParameter("a", 0, 1), RealParameter("b", 0, 1)]
        levers = [RealParameter("c", 0, 1), RealParameter("d", 0, 1)]
        outcomes = [TimeSeriesOutcome("test")]
        case = {unc.name: random.random() for unc in uncs}

        model = NamedObject('test')
        policy = Policy('policy', c=1, d=1)
        scenario = Scenario(**case)
        experiment = Experiment(0, model, policy, scenario, 0)

        callback = DefaultCallback(uncs,
                                   levers,
                                   outcomes,
                                   nr_experiments=nr_experiments,
                                   reporting_interval=1)
        result = {outcomes[0].name: 1}
        callback(experiment, result)

        experiments, _ = callback.get_results()
        design = case
        design['c'] = 1
        design['d'] = 1
        design['policy'] = policy.name
        design['model'] = model.name

        names = rf.get_names(experiments.dtype)

        for name in names:
            self.assertEqual(experiments[name][0], design[name])
Example #41
    def __init__(self,
                 x,
                 y,
                 threshold=None,
                 threshold_type=">",
                 obj_func=lenient1,
                 peel_alpha=0.05,
                 paste_alpha=0.05,
                 mass_min=0.05,
                 include=None,
                 exclude=None,
                 coi=None):
        """Creates a new PRIM object.
        
        The PRIM object maintains the current state of the PRIM algorithm,
        recording the PRIM boxes found thus far, the remaining (uncaptured)
        cases of interest in the dataset, and provides methods for finding the
        next PRIM box and viewing statistics.
        
        Parameters
        ----------
        x : a matrix-like object (pandas.DataFrame, numpy.recarray, etc.)
            the independent variables
        y : a list-like object, the column name (str), or callable
            the dependent variable either provided as a list-like object
            classifying the data into cases of interest (e.g., False/True),
            a list-like object storing the raw variable value (in which case
            a threshold must be given), a string identifying the dependent
            variable in x, or a function called on each row of x to compute the
            dependent variable
        threshold : float
            threshold for identifying cases of interest
        threshold_type : str
            comparison operator used when identifying cases of interest
        obj_func : callable (default: lenient1)
            a function that computes the objective function (peeling criteria)
        peel_alpha : float (default: 0.05) 
            parameter controlling the peeling stage
        paste_alpha : float (default: 0.05)
            parameter controlling the pasting stage
        mass_min : float (default: 0.05)
            minimum mass of a box
        include : list of str
            the names of variables included in the PRIM analysis
        exclude : list of str
            the names of variables excluded from the PRIM analysis
        coi : str or list of str
            if y contains strings, coi identifies which string is the case of
            interest
        """

        # Ensure the input x is a numpy matrix/array
        if isinstance(x, pd.DataFrame):
            x = x.to_records(index=False)
        elif isinstance(x, np.ma.MaskedArray):
            pass
        else:
            x = pd.DataFrame(x).to_records(index=False)

        # if y is a string or function, compute the actual response value
        # otherwise, ensure y is a numpy matrix/array
        if isinstance(y, six.string_types):
            key = y
            y = x[key]

            if exclude:
                exclude = list(exclude) + [key]
            else:
                exclude = [key]
        elif six.callable(y):
            fun = y
            y = np.apply_along_axis(fun, 0, x)
        elif isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
            y = y.values
        elif isinstance(y, np.ma.MaskedArray):
            pass
        else:
            y = np.asarray(y)

        # convert include/exclude arguments to lists if they are strings
        if include and isinstance(include, six.string_types):
            include = [include]

        if exclude and isinstance(exclude, six.string_types):
            exclude = [exclude]

        # include or exclude columns from the analysis
        if include:
            if isinstance(include, six.string_types):
                include = [include]

            drop_names = set(rf.get_names(x.dtype)) - set(include)
            x = rf.drop_fields(x, drop_names, asrecarray=True)

        if exclude:
            if isinstance(exclude, six.string_types):
                exclude = [exclude]

            drop_names = set(exclude)
            x = rf.drop_fields(x, drop_names, asrecarray=True)

        # apply the threshold, if given
        if threshold:
            if six.callable(threshold):
                y = np.apply_along_axis(threshold, 0, y)
            else:
                # The syntax for threshold_type is "x <op> <threshold>", e.g.,
                # "x > 0.5".  However, partial only supports positional
                # arguments for built-in operators.  Thus, we must assign the
                # threshold to the first position and use a different operator.
                # For example, "x > 0.5" must be evaluated as "0.5 < x".
                OPERATORS = {
                    "<=": operator.ge,
                    ">=": operator.le,
                    "<": operator.gt,
                    ">": operator.lt,
                    "=": operator.eq
                }

                op = OPERATORS[threshold_type]
                y = np.apply_along_axis(functools.partial(op, threshold), 0, y)

        # validate inputs
        if len(y.shape) > 1:
            raise PrimError("y is not a 1-d array")

        unique_y = np.unique(y)

        if unique_y.shape[0] > 2:
            raise PrimError(
                "y must contain only two values (0/1 or False/True)")

        if ((unique_y.shape[0] == 2 and
             (False not in unique_y or True not in unique_y))
                or (False not in unique_y and True not in unique_y)):
            if coi is None:
                raise PrimError(
                    "y must contain only two values (0/1 or False/True)")
            else:
                if not hasattr(coi, "__iter__") and not isinstance(
                        coi, six.string_types):
                    coi = [coi]
                y = np.asarray([1 if yi in coi else 0 for yi in y])

        # store the parameters
        self.x = x
        self.y = y
        self.paste_alpha = paste_alpha
        self.peel_alpha = peel_alpha
        self.mass_min = mass_min
        self.threshold = threshold
        self.threshold_type = threshold_type
        self.obj_func = obj_func

        # set the indices
        self.yi = np.arange(0, self.y.shape[0])

        # how many data points do we have
        self.n = self.y.shape[0]

        # how many cases of interest do we have?
        self.t_coi = self.determine_coi(self.yi)

        # initial box that contains all data
        self._box_init = make_box(self.x)

        # make a list in which the identified boxes can be put
        self._boxes = []

        # set yi_remaining to all y values
        self._update_yi_remaining()
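The operator-flipping trick documented in the comment inside __init__ above can be checked in isolation; a minimal standalone sketch:

import functools
import operator

import numpy as np

# "y > 0.5" with the threshold bound to the first positional argument:
# partial only binds positionally, so operator.lt yields the elementwise
# test 0.5 < y
is_coi = functools.partial(operator.lt, 0.5)

y = np.array([0.2, 0.7, 0.5, 0.9])
print(is_coi(y))    # [False  True False  True]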
Example #42
def get_rf_feature_scores(results,
                          classify,
                          nr_trees=250,
                          criterion='gini',
                          max_features='auto',
                          max_depth=None,
                          min_samples_split=2,
                          min_samples_leaf=1,
                          bootstrap=True,
                          oob_score=True,
                          random_state=None):
    '''
    Get feature scores using a random forest

    Parameters
    ----------
    results : tuple
              the (experiments, outcomes) tuple as returned by 
              perform_experiments
    classify : callable or str
               a classify function or variable analogous to PRIM
    nr_trees : int, optional
               nr. of trees in forest (default=250)
    criterion : str, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    max_features : int, optional
                   see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    max_depth : int, optional 
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    min_samples_split : int, optional
                        see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    min_samples_leaf : int, optional
                       see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    bootstrap : bool, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    oob_score : bool, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    random_state : int, optional
                   see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    
    Returns
    -------
    list of tuples 
        uncertainty and feature score pairs, sorted in descending order of 
        the scores
    object
        either RandomForestClassifier or RandomForestRegressor
    
    '''
    experiments, outcomes = results
    uncs = recfunctions.get_names(experiments.dtype)

    x = _prepare_experiments(experiments)

    y, categorical = _prepare_outcomes(outcomes, classify)

    if categorical:
        rfc = RandomForestClassifier
    else:
        rfc = RandomForestRegressor
        criterion = 'mse'

    forest = rfc(n_estimators=nr_trees,
                 criterion=criterion,
                 max_features=max_features,
                 max_depth=max_depth,
                 min_samples_split=min_samples_split,
                 min_samples_leaf=min_samples_leaf,
                 bootstrap=bootstrap,
                 oob_score=oob_score,
                 random_state=random_state)
    forest.fit(x, y)

    importances = forest.feature_importances_
    importances = sorted(zip(uncs, importances),
                         key=itemgetter(1), reverse=True)

    return importances, forest
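A minimal sketch of calling this variant; results is assumed to be the (experiments, outcomes) tuple returned by perform_experiments, and the outcome name 'max_P' is hypothetical:

def classify(outcomes):
    # analogous to PRIM: turn the raw outcome of interest into 0/1
    return (outcomes['max_P'] > 0.8).astype(int)

scores, forest = get_rf_feature_scores(results, classify, nr_trees=500)

# scores is a list of (uncertainty, importance) tuples, highest first
for name, importance in scores[:5]:
    print(name, importance)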
Example #43
def get_lasso_feature_scores(results,
                             classify,
                             scaling=0.5,
                             sample_fraction=0.75,
                             n_resampling=200,
                             random_state=None):
    '''
    Calculate feature scores using a randomized lasso (regression) or 
    randomized logistic regression (classification). This is also known as 
    stability selection.
    
    see http://scikit-learn.org/stable/modules/feature_selection.html for 
    details. 
    
    Parameters
    ----------   
    results : tuple
              the (experiments, outcomes) tuple as returned by 
              perform_experiments
    classify : callable or str
               a classify function or variable analogous to PRIM
    scaling : float, optional
              scaling parameter, should be between 0 and 1
    sample_fraction : float, optional
                      the fraction of samples to be used in each randomized 
                      dataset
    n_resampling : int, optional
                   the number of times the model is trained on a random subset 
                   of the data
    random_state : int, optional
                   if it is an int, it specifies the seed to use, defaults to 
                   None.
                         
    Returns
    -------
    list of tuples 
        uncertainty and feature score pairs, sorted in descending order of 
        the scores
         
    '''

    experiments, outcomes = results
    uncs = recfunctions.get_names(experiments.dtype)

    x = _prepare_experiments(experiments)
    y, categorical = _prepare_outcomes(outcomes, classify)

    if categorical:

        lfs = RandomizedLogisticRegression(scaling=scaling,
                                           sample_fraction=sample_fraction,
                                           n_resampling=n_resampling,
                                           random_state=random_state)
        lfs.fit(x, y)
    else:
        # we use LassoLarsCV to determine alpha see
        # http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html
        lars_cv = LassoLarsCV(cv=6).fit(x, y)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

        # fit the randomized lasso
        lfs = RandomizedLasso(alpha=alphas,
                              scaling=scaling,
                              sample_fraction=sample_fraction,
                              n_resampling=n_resampling,
                              random_state=random_state)
        lfs.fit(x, y)

    importances = lfs.scores_
    importances = sorted(zip(uncs, importances),
                         key=itemgetter(1), reverse=True)

    return importances
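classify may also be a string naming an outcome; a minimal sketch of that route, where the outcome name 'costs' is hypothetical and its float values are assumed to send _prepare_outcomes down the regression (randomized lasso) branch:

# classify as a string: the raw values of outcomes['costs'] become y
scores = get_lasso_feature_scores(results, 'costs',
                                  n_resampling=200, random_state=1)
print(scores[:5])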
Example #44
    def __init__(self,
                 x,
                 y, 
                 threshold=None, 
                 threshold_type=">",
                 include=None,
                 exclude=None,
                 **kwargs):
        """Generates a decision tree for classification.
        
        Parameters
        ----------
        x : a matrix-like object (pandas.DataFrame, numpy.recarray, etc.)
            the independent variables
        y : a list-like object, the column name (str), or callable
            the dependent variable either provided as a list-like object
            classifying the data into cases of interest (e.g., False/True),
            a list-like object storing the raw variable value (in which case
            a threshold must be given), a string identifying the dependent
            variable in x, or a function called on each row of x to compute the
            dependent variable
        threshold : float
            threshold for identifying cases of interest
        threshold_type : str
            comparison operator used when identifying cases of interest
        include : list of str
            the names of variables included in the CART analysis
        exclude : list of str
            the names of variables excluded from the CART analysis
        """
        super(Cart, self).__init__()
        
        # Ensure the input x is a numpy matrix/array
        if isinstance(x, pd.DataFrame):
            x = x.to_records(index=False)
        elif isinstance(x, np.ma.MaskedArray):
            pass
        else:
            x = pd.DataFrame(x).to_records(index=False)
            
        # if y is a string or function, compute the actual response value
        # otherwise, ensure y is a numpy matrix/array
        if isinstance(y, six.string_types):
            key = y
            y = x[key]
            
            if exclude:
                exclude = list(exclude) + [key]
            else:
                exclude = [key]
        elif six.callable(y):
            fun = y
            y = np.apply_along_axis(fun, 0, x)
        elif isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
            y = y.values
        elif isinstance(y, np.ma.MaskedArray):
            pass
        else:
            y = np.asarray(y)
            
        # convert include/exclude arguments to lists if they are strings
        if include and isinstance(include, six.string_types):
            include = [include]
            
        if exclude and isinstance(exclude, six.string_types):
            exclude = [exclude]     
            
        # include or exclude columns from the analysis
        if include:
            if isinstance(include, six.string_types):
                include = [include]

            drop_names = set(rf.get_names(x.dtype))-set(include)
            x = rf.drop_fields(x, drop_names, asrecarray=True)
        
        if exclude:
            if isinstance(exclude, six.string_types):
                exclude = [exclude]

            drop_names = set(exclude) 
            x = rf.drop_fields(x, drop_names, asrecarray=True)
            
        # apply the threshold, if given
        if threshold:
            if six.callable(threshold):
                y = np.apply_along_axis(threshold, 0, y)
            else:
                # The syntax for threshold_type is "x <op> <threshold>", e.g.,
                # "x > 0.5".  However, partial only supports positional
                # arguments for built-in operators.  Thus, we must assign the
                # threshold to the first position and use a different operator.
                # For example, "x > 0.5" must be evaluated as "0.5 < x".
                OPERATORS = {"<" : operator.ge,
                             ">" : operator.le,
                             "<=" : operator.gt,
                             ">=" : operator.lt,
                             "=" : operator.eq}
                
                op = OPERATORS[threshold_type]
                y = np.apply_along_axis(functools.partial(op, threshold), 0, y)
                
        # validate inputs
        if len(y.shape) > 1:
            raise ValueError("y is not a 1-d array")
        
        # extract feature names
        feature_names = rf.get_names(x.dtype)
        
        # ensure x is formatted as a 2D matrix
        x = x.view("<f8").reshape(x.shape + (-1,))
        
        clf = tree.DecisionTreeClassifier(**kwargs)
        clf = clf.fit(x, y)
        
        # add our custom metadata to the classifier
        self._feature_names = feature_names
        self._x = x
        self._y = y
        self._clf = clf
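The view/reshape idiom used above to flatten a structured array into a plain 2-D matrix only works when every field shares one dtype; a minimal standalone sketch:

import numpy as np

x = np.array([(0., 1., 2.), (3., 4., 5.)],
             dtype=[('a', '<f8'), ('b', '<f8'), ('c', '<f8')])

# reinterpret the three float64 fields as a flat float64 buffer, then
# reshape to (n_rows, n_fields)
flat = x.view('<f8').reshape(x.shape + (-1,))
print(flat.shape)    # (2, 3)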
Example #45
    def __init__(self, 
                 results,
                 classify, 
                 obj_function=DEFAULT, 
                 peel_alpha=0.05, 
                 paste_alpha=0.05,
                 mass_min=0.05, 
                 threshold=None, 
                 threshold_type=ABOVE,
                 incl_unc=None):
        '''
        
        :param results: the return from :meth:`perform_experiments`.
        :param classify: either a string denoting the outcome of interest to 
                         use or a function. 
        :param peel_alpha: parameter controlling the peeling stage (default = 0.05). 
        :param paste_alpha: parameter controlling the pasting stage (default = 0.05).
        :param mass_min: minimum mass of a box (default = 0.05). 
        :param threshold: the threshold of the output space that boxes should meet. 
        :param threshold_type: If 1, the boxes should go above the threshold, if -1
                               the boxes should go below the threshold, if 0, the 
                               algorithm looks for both +1 and -1.
        :param obj_func: The objective function to use. Default is 
                         :func:`def_obj_func`
        :param incl_unc: optional argument, should be a list of uncertainties
                         that are to be included in the prim analysis. 
        :raises: PrimException if data resulting from classify is not a 
                 1-d array. 
        :raises: TypeError if classify is not a string or a callable.
                     
        '''
        assert threshold is not None
        if not incl_unc:
            self.x = results[0]
        else:
            drop_names = (set(recfunctions.get_names(results[0].dtype)) -
                          set(incl_unc))
            self.x = recfunctions.drop_fields(results[0], drop_names,
                                              asrecarray=True)

        if isinstance(classify, StringType):
            self.y = results[1][classify]
        elif callable(classify):
            self.y = classify(results[1])
        else:
            raise TypeError("unknown type for classify")
        
        if len(self.y.shape) > 1:
            raise PrimException("y is not a 1-d array")
        
        # store the remainder of the parameters
        self.paste_alpha = paste_alpha
        self.peel_alpha = peel_alpha
        self.mass_min = mass_min
        self.threshold = threshold 
        self.threshold_type = threshold_type
        self.obj_func = self._obj_functions[obj_function]
       
        # set the indices
        self.yi = np.arange(0, self.y.shape[0])
       
        # how many data points do we have
        self.n = self.y.shape[0]
        
        # how many cases of interest do we have?
        self.t_coi = self.determine_coi(self.yi)
        
        # initial box that contains all data
        self.box_init = self.make_box(self.x)
    
        # make a list in which the identified boxes can be put
        self.boxes = []
        
        self._update_yi_remaining()
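A minimal construction sketch for this older API; the class name Prim, the results tuple, the outcome name 'max_P', and the uncertainty names in incl_unc are all assumptions, not taken from this example:

# threshold is mandatory here, see the assert at the top of __init__
prim = Prim(results, classify='max_P',
            threshold=0.8, threshold_type=ABOVE,
            incl_unc=['a', 'b', 'c'])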