def test_init(self):
    # let's add some uncertainties to this
    uncs = [RealParameter("a", 0, 1),
            RealParameter("b", 0, 1)]
    outcomes = [TimeSeriesOutcome("test")]
    constraints = []
    callback = DefaultCallback(uncs, [], outcomes, constraints,
                               nr_experiments=100)

    self.assertEqual(callback.i, 0)
    self.assertEqual(callback.nr_experiments, 100)
    self.assertEqual(callback.cases.shape[0], 100)
    self.assertEqual(callback.outcomes, [o.name for o in outcomes])

    names = rf.get_names(callback.cases.dtype)
    names = set(names)
    self.assertEqual(names, {'a', 'b', 'policy', 'model', 'scenario_id'})
    self.assertEqual(callback.results, {})

    # with levers
    levers = [RealParameter('c', 0, 10)]
    callback = DefaultCallback(uncs, levers, outcomes, constraints,
                               nr_experiments=100)

    self.assertEqual(callback.i, 0)
    self.assertEqual(callback.nr_experiments, 100)
    self.assertEqual(callback.cases.shape[0], 100)
    self.assertEqual(callback.outcomes, [o.name for o in outcomes])

    names = rf.get_names(callback.cases.dtype)
    names = set(names)
    self.assertEqual(names, {'a', 'b', 'c', 'policy', 'model', 'scenario_id'})
    self.assertEqual(callback.results, {})

def test_init(self):
    # let's add some uncertainties to this
    uncs = [RealParameter("a", 0, 1),
            RealParameter("b", 0, 1)]
    outcomes = [TimeSeriesOutcome("test")]
    callback = DefaultCallback(uncs, [], outcomes, nr_experiments=100)

    self.assertEqual(callback.i, 0)
    self.assertEqual(callback.nr_experiments, 100)
    self.assertEqual(callback.cases.shape[0], 100)
    self.assertEqual(callback.outcomes, [o.name for o in outcomes])

    names = rf.get_names(callback.cases.dtype)
    names = set(names)
    self.assertEqual(names, {'a', 'b', 'policy', 'model'})
    self.assertEqual(callback.results, {})

    # with levers
    levers = [RealParameter('c', 0, 10)]
    callback = DefaultCallback(uncs, levers, outcomes, nr_experiments=100)

    self.assertEqual(callback.i, 0)
    self.assertEqual(callback.nr_experiments, 100)
    self.assertEqual(callback.cases.shape[0], 100)
    self.assertEqual(callback.outcomes, [o.name for o in outcomes])

    names = rf.get_names(callback.cases.dtype)
    names = set(names)
    self.assertEqual(names, {'a', 'b', 'c', 'policy', 'model'})
    self.assertEqual(callback.results, {})

def _make_box(x):
    '''
    Make a box that encompasses all the data

    Parameters
    ----------
    x : structured numpy array

    '''
    box = np.zeros((2, ), x.dtype)
    names = recfunctions.get_names(x.dtype)

    for name in names:
        dtype = x.dtype.fields.get(name)[0]
        mask = np.ma.getmaskarray(x[name])
        values = x[name][mask == False]

        if dtype == 'object':
            try:
                values = set(values)
                box[name][:] = values
            except TypeError as e:
                ema_logging.warning("{} has unhashable values".format(name))
                raise e
        else:
            box[name][0] = np.min(values, axis=0)
            box[name][1] = np.max(values, axis=0)
    return box

def test_drop_restriction(self):
    x = np.array([(0, 1, 2),
                  (2, 5, 6),
                  (3, 2, 1)],
                 dtype=[('a', np.float), ('b', np.float), ('c', np.float)])
    y = {'y': np.array([1, 1, 0])}
    results = (x, y)

    prim_obj = prim.setup_prim(results, 'y', threshold=0.8)
    box = PrimBox(prim_obj, prim_obj.box_init, prim_obj.yi)

    new_box_lim = np.array([(0, 1, 1),
                            (2, 2, 6)],
                           dtype=[('a', np.float), ('b', np.float), ('c', np.float)])
    indices = np.array([0, 1], dtype=np.int)
    box.update(new_box_lim, indices)

    box.drop_restriction('b')

    correct_box_lims = np.array([(0, 1, 1),
                                 (2, 5, 6)],
                                dtype=[('a', np.float), ('b', np.float), ('c', np.float)])
    box_lims = box.box_lims[-1]
    names = recfunctions.get_names(correct_box_lims.dtype)
    for entry in names:
        lim_correct = correct_box_lims[entry]
        lim_box = box_lims[entry]
        for i in range(len(lim_correct)):
            self.assertEqual(lim_correct[i], lim_box[i])

    self.assertEqual(box.peeling_trajectory['mean'][2], 1)
    self.assertEqual(box.peeling_trajectory['coverage'][2], 1)
    self.assertEqual(box.peeling_trajectory['density'][2], 1)
    self.assertEqual(box.peeling_trajectory['res dim'][2], 1)
    self.assertEqual(box.peeling_trajectory['mass'][2], 2 / 3)

def _prepare_experiments(experiments):
    '''
    transform the experiments structured array into a numpy array.

    Parameters
    ----------
    experiments : structured array

    Returns
    -------
    ndarray

    '''
    uncs = recfunctions.get_names(experiments.dtype)

    temp_experiments = np.zeros((experiments.shape[0], len(uncs)))

    for i, u in enumerate(uncs):
        try:
            temp_experiments[:, i] = experiments[u].astype(np.float)
        except ValueError:
            data = experiments[u]
            entries = sorted(list(set(data)))

            for j, entry in enumerate(entries):
                temp_experiments[data == entry, i] = j

    return temp_experiments

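# A minimal usage sketch for _prepare_experiments above, on hypothetical
# data; assumes the legacy NumPy these snippets target (np.float). Numeric
# columns are cast to float; non-numeric columns hit the ValueError branch,
# where each sorted unique value is replaced by its index.
import numpy as np

experiments = np.array([(0.1, 'low'), (0.7, 'high'), (0.4, 'low')],
                       dtype=[('a', float), ('b', object)])
X = _prepare_experiments(experiments)
# column 'a' is copied as-is; in column 'b', 'high' -> 0.0 and 'low' -> 1.0
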
def test_store_cases(self):
    nr_experiments = 3
    uncs = [ParameterUncertainty((0, 1), "a"),
            ParameterUncertainty((0, 1), "b"),
            CategoricalUncertainty([0, 1, 2], "c"),
            ParameterUncertainty((0, 1), "d", integer=True)]
    outcomes = [Outcome("test", time=True)]
    case = {unc.name: random.random() for unc in uncs}
    case["c"] = int(round(case["c"] * 2))
    case["d"] = int(round(case["d"]))
    policy = {'name': 'none'}
    name = "test"

    callback = DefaultCallback(uncs,
                               [outcome.name for outcome in outcomes],
                               nr_experiments=nr_experiments,
                               reporting_interval=1)
    result = {outcomes[0].name: 1}
    callback(0, case, policy, name, result)

    experiments, _ = callback.get_results()
    design = case
    design['policy'] = policy['name']
    design['model'] = name

    names = rf.get_names(experiments.dtype)
    for name in names:
        self.assertEqual(experiments[name][0], design[name])

def test_prim_init_select(self):
    self.results = test_utilities.load_flu_data()
    self.classify = flu_classify

    experiments, outcomes = self.results
    unc = recfunctions.get_names(experiments.dtype)

    # test initialization, including t_coi calculation in case of searching
    # for results equal to or higher than the threshold
    outcomes['death toll'] = outcomes['deceased population region 1'][:, -1]
    results = experiments, outcomes
    threshold = 10000
    prim_obj = prim.setup_prim(results, classify='death toll',
                               threshold_type=prim.ABOVE,
                               threshold=threshold, incl_unc=unc)

    value = np.ones((experiments.shape[0],))
    value = value[outcomes['death toll'] >= threshold].shape[0]
    self.assertTrue(prim_obj.t_coi == value)

    # test initialization, including t_coi calculation in case of searching
    # for results equal to or lower than the threshold
    threshold = 1000
    prim_obj = prim.setup_prim(results, classify='death toll',
                               threshold_type=prim.BELOW,
                               threshold=threshold)

    value = np.ones((experiments.shape[0],))
    value = value[outcomes['death toll'] <= threshold].shape[0]
    self.assertTrue(prim_obj.t_coi == value)

    prim.setup_prim(self.results, self.classify, threshold=prim.ABOVE)

def setup_cart(results, classify, incl_unc=[], mass_min=0.05):
    """helper function for performing cart

    Parameters
    ----------
    results : tuple of structured array and dict with numpy arrays
              the return from :meth:`perform_experiments`.
    classify : string, function or callable
               either a string denoting the outcome of interest to
               use or a function.
    incl_unc : list of strings
    mass_min : float

    Raises
    ------
    TypeError
        if classify is not a string or a callable.

    """
    if not incl_unc:
        x = np.ma.array(results[0])
    else:
        drop_names = set(recfunctions.get_names(results[0].dtype)) - set(incl_unc)
        x = recfunctions.drop_fields(results[0], drop_names, asrecarray=True)

    if type(classify) == types.StringType:
        y = results[1][classify]
    elif callable(classify):
        y = classify(results[1])
    else:
        raise TypeError("unknown type for classify")

    return CART(x, y, mass_min)

def _rotate_subset(self, value, orig_experiments, logical):
    '''
    rotate a subset

    Parameters
    ----------
    value : list of strings
    orig_experiments : numpy structured array
    logical : boolean array

    '''
    list_dtypes = [(name, "<f8") for name in value]

    # cast everything to float
    drop_names = set(rf.get_names(orig_experiments.dtype)) - set(value)
    orig_subset = rf.drop_fields(orig_experiments, drop_names,
                                 asrecarray=True)
    subset_experiments = orig_subset.astype(list_dtypes).view('<f8').reshape(
        orig_experiments.shape[0], len(value))

    # normalize the data
    mean = np.mean(subset_experiments, axis=0)
    std = np.std(subset_experiments, axis=0)
    std[std == 0] = 1  # in order to avoid a division by zero
    subset_experiments = (subset_experiments - mean) / std

    # get the experiments of interest
    experiments_of_interest = subset_experiments[logical]

    # determine the rotation
    rotation_matrix = self._determine_rotation(experiments_of_interest)

    # apply the rotation
    subset_experiments = np.dot(subset_experiments, rotation_matrix)
    return rotation_matrix, subset_experiments

def _make_box(x):
    '''
    Make a box that encompasses all the data

    Parameters
    ----------
    x : structured numpy array

    '''
    box = np.zeros((2, ), x.dtype)
    names = recfunctions.get_names(x.dtype)

    for name in names:
        dtype = x.dtype.fields.get(name)[0]
        mask = np.ma.getmaskarray(x[name])
        values = x[name][mask == False]

        if dtype == 'object':
            try:
                values = set(values)
            except TypeError as e:
                ema_logging.warning("{} has unhashable values".format(name))
                raise e
            else:
                box[name][:] = values
        else:
            box[name][0] = np.min(values, axis=0)
            box[name][1] = np.max(values, axis=0)
    return box

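# A small sketch of _make_box on purely numeric, hypothetical data: the
# resulting box stores per-column minima in row 0 and maxima in row 1.
import numpy as np

x = np.array([(0.0, 2.0), (1.0, 5.0), (0.5, 3.0)],
             dtype=[('a', float), ('b', float)])
box = _make_box(x)
# box['a'] == [0.0, 1.0] and box['b'] == [2.0, 5.0]
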
def test_prim_init_select(self):
    self.results = util.load_flu_data()
    self.classify = flu_classify

    experiments, outcomes = self.results
    unc = recfunctions.get_names(experiments.dtype)

    # test initialization, including t_coi calculation in case of searching
    # for results equal to or higher than the threshold
    outcomes['death toll'] = outcomes['deceased population region 1'][:, -1]
    results = experiments, outcomes
    threshold = 10000
    prim_obj = prim.setup_prim(results, classify='death toll',
                               threshold_type=prim.ABOVE,
                               threshold=threshold, incl_unc=unc)

    value = np.ones((experiments.shape[0],))
    value = value[outcomes['death toll'] >= threshold].shape[0]
    self.assertTrue(prim_obj.t_coi == value)

    # test initialization, including t_coi calculation in case of searching
    # for results equal to or lower than the threshold
    threshold = 1000
    prim_obj = prim.setup_prim(results, classify='death toll',
                               threshold_type=prim.BELOW,
                               threshold=threshold)

    value = np.ones((experiments.shape[0],))
    value = value[outcomes['death toll'] <= threshold].shape[0]
    self.assertTrue(prim_obj.t_coi == value)

    prim.setup_prim(self.results, self.classify, threshold=prim.ABOVE)

def get_univariate_feature_scores(x, y, score_func=F_CLASSIFICATION):
    '''
    calculate feature scores using univariate statistical tests. In case of
    categorical data, chi square or the Anova F value is used. In case of
    continuous data the Anova F value is used.

    Parameters
    ----------
    x : structured array
    y : 1D nd.array
    score_func : {F_CLASSIFICATION, F_REGRESSION, CHI2}
                 the score function to use, one of f_regression (regression),
                 or f_classification or chi2 (classification).

    Returns
    -------
    list of tuples
        sorted in descending order of tuples with uncertainty and feature
        scores (i.e. p values in this case).

    '''
    uncs = recfunctions.get_names(x.dtype)

    x = _prepare_experiments(x)
    pvalues = score_func(x, y)[1]
    pvalues = np.asarray(pvalues)

    pvalues = zip(uncs, pvalues)
    pvalues = list(pvalues)
    pvalues.sort(key=itemgetter(1))

    return pvalues

def plot_cdfs(x, y, ccdf=False):
    '''plot cumulative density functions for each column in x, based on
    the classification specified in y.

    Parameters
    ----------
    x : recarray
        the experiments to use in the cdfs
    y : ndarray
        the categorization for the data
    ccdf : bool, optional
           if true, plot a complementary cdf instead of a normal cdf.

    Returns
    -------
    a matplotlib Figure instance

    '''
    x = rf.drop_fields(x, "scenario_id", asrecarray=True)
    uncs = rf.get_names(x.dtype)
    cp = sns.color_palette()

    n_col = 4
    n_row = math.ceil(len(uncs) / n_col)
    size = 3
    aspect = 1
    figsize = n_col * size * aspect, n_row * size
    fig, axes = plt.subplots(n_row, n_col, figsize=figsize, squeeze=False)

    for i, unc in enumerate(uncs):
        discrete = False
        i_col = i % n_col
        i_row = i // n_col
        ax = axes[i_row, i_col]

        data = x[unc]
        if x.dtype[unc] == np.dtype('O'):
            discrete = True
        plot_individual_cdf(ax, unc, data, y, discrete, ccdf=ccdf)

    # last row might contain empty axis,
    # let's make them disappear
    for j_col in range(i_col + 1, n_col):
        ax = axes[i_row, j_col]
        ax.set_xticklabels([])
        ax.set_xticks([])
        ax.set_yticklabels([])
        ax.set_yticks([])
        sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)

    proxies, labels = build_legend(x, y)
    fig.legend(proxies, labels, "upper center")

    return fig

def plot_cdfs(x, y, ccdf=False):
    '''plot cumulative density functions for each column in x, based on
    the classification specified in y.

    Parameters
    ----------
    x : recarray
        the experiments to use in the cdfs
    y : ndarray
        the categorization for the data
    ccdf : bool, optional
           if true, plot a complementary cdf instead of a normal cdf.

    '''
    uncs = rf.get_names(x.dtype)
    cp = sns.color_palette()

    n_col = 4
    n_row = len(uncs) // n_col + 1
    size = 3
    aspect = 1
    figsize = n_col * size * aspect, n_row * size
    fig, axes = plt.subplots(n_row, n_col, figsize=figsize, squeeze=False)

    for i, unc in enumerate(uncs):
        discrete = False
        i_col = i % n_col
        i_row = i // n_col
        ax = axes[i_row, i_col]

        data = x[unc]
        if x.dtype[unc] == np.dtype('O'):
            discrete = True
        plot_cdf(ax, unc, data, y, discrete, ccdf=ccdf)

    # last row might contain empty axis,
    # let's make them disappear
    i_row = len(uncs) // n_col
    i_col = len(uncs) % n_col
    for i_col in range(i_col, n_col):
        ax = axes[i_row, i_col]
        ax.set_xticklabels([])
        ax.set_xticks([])
        ax.set_yticklabels([])
        ax.set_yticks([])
        sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)

    proxies, labels = build_legend(x, y)
    fig.legend(proxies, labels, "upper center")

    return fig

def _in_box(x, boxlim):
    '''
    returns the indices of the data points that are within the box_lims.

    Parameters
    ----------
    x : numpy structured array
    boxlim : numpy structured array

    Returns
    -------
    ndarray
        valid numpy indices on x

    '''
    logical = np.ones(x.shape[0], dtype=np.bool)

    dims = recfunctions.get_names(boxlim.dtype)

    for name in dims:
        value = x.dtype.fields.get(name)[0]

        if value == 'object':
            entries = boxlim[name][0]
            l = np.ones((x.shape[0], len(entries)), dtype=np.bool)
            for i, entry in enumerate(entries):
                if type(list(entries)[0]) not in (str, float, int):
                    bools = []
                    for element in list(x[name]):
                        if element == entry:
                            bools.append(True)
                        else:
                            bools.append(False)
                    l[:, i] = np.asarray(bools, dtype=bool)
                else:
                    l[:, i] = x[name] == entry
            l = np.any(l, axis=1)
            logical = logical & l
        else:
            logical = logical & (boxlim[name][0] <= x[name]) &\
                      (x[name] <= boxlim[name][1])

    indices = np.where(logical == True)

    assert len(indices) == 1
    indices = indices[0]

    return indices

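# Illustrative sketch for _in_box on hypothetical data: keep the rows of x
# whose values lie within the limits for every restricted dimension.
import numpy as np

x = np.array([(0.1, 4.0), (0.9, 1.0)],
             dtype=[('a', float), ('b', float)])
boxlim = np.array([(0.0, 0.0), (0.5, 5.0)],
                  dtype=[('a', float), ('b', float)])
indices = _in_box(x, boxlim)
# only row 0 satisfies 0.0 <= a <= 0.5 and 0.0 <= b <= 5.0, so indices == [0]
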
def get_univariate_feature_scores(results, classify,
                                  score_func='f_classification'):
    '''
    calculate feature scores using univariate statistical tests. In case of
    categorical data, chi square or the Anova F value is used. In case of
    continuous data the Anova F value is used.

    Parameters
    ----------
    results : tuple
    classify : str
    score_func : {'f_classification', 'chi2', 'f_regression'}
                 the score function to use, one of f_regression (regression),
                 or f_classification or chi2 (classification).

    Returns
    -------
    list of tuples
        sorted in descending order of tuples with uncertainty and feature
        scores (i.e. p values in this case).

    '''
    score_funcs = {'f_regression': f_regression,
                   'f_classification': f_classif,
                   'chi2': chi2}

    experiments, outcomes = results
    uncs = recfunctions.get_names(experiments.dtype)

    x = _prepare_experiments(experiments)
    y, categorical = _prepare_outcomes(outcomes, classify)

    if categorical:
        score_func = score_funcs[score_func]
    else:
        score_func = f_regression

    pvalues = score_func(x, y)[1]
    pvalues = np.asarray(pvalues)

    pvalues = zip(uncs, pvalues)
    pvalues = list(pvalues)
    pvalues.sort(key=itemgetter(1))

    return pvalues

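# Hedged usage sketch for the results-based variant above. It assumes that
# results is the (experiments, outcomes) tuple from perform_experiments and
# that 'death toll' names an outcome which _prepare_outcomes can turn into a
# target, as in the flu tests earlier in this file.
scores = get_univariate_feature_scores(results, classify='death toll',
                                       score_func='f_classification')
# scores: list of (uncertainty, p-value) tuples, sorted by p-value
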
def determine_restricted_dims(self, box_lims):
    '''
    determine which dimensions of the given box are restricted compared
    to the initial box that contains all the data

    :param box_lims:

    '''
    logical = self.compare(self.box_init, box_lims)
    u = np.asarray(recfunctions.get_names(box_lims.dtype), dtype=object)
    dims = u[logical == False]
    return dims

def test_init(self):
    # let's add some uncertainties to this
    uncs = [ParameterUncertainty((0, 1), "a"),
            ParameterUncertainty((0, 1), "b")]
    outcomes = [Outcome("test", time=True)]
    callback = DefaultCallback(uncs, outcomes, nr_experiments=100)

    self.assertEqual(callback.i, 0)
    self.assertEqual(callback.nr_experiments, 100)
    self.assertEqual(callback.cases.shape[0], 100)
    self.assertEqual(callback.outcomes, outcomes)

    names = rf.get_names(callback.cases.dtype)
    names = set(names)
    self.assertEqual(names, {'a', 'b', 'policy', 'model'})
    self.assertEqual(callback.results, {})

def setup_prim(results, classify, threshold, incl_unc=[], **kwargs):
    """Helper function for setting up the prim algorithm

    Parameters
    ----------
    results : tuple
              tuple of structured array and dict with numpy arrays
              the return from :meth:`perform_experiments`.
    classify : str or callable
               either a string denoting the outcome of interest to
               use or a function.
    threshold : double
                the minimum score on the objective function of the last
                box on the peeling trajectory. In case of a binary
                classification, this should be between 0 and 1.
    incl_unc : list of str, optional
               list of uncertainties to include in prim analysis
    kwargs : dict
             valid keyword arguments for prim.Prim

    Returns
    -------
    a Prim instance

    Raises
    ------
    PrimException
        if data resulting from classify is not a 1-d array.
    TypeError
        if classify is not a string or a callable.

    """
    if not incl_unc:
        x = np.ma.array(results[0])
    else:
        drop_names = set(rf.get_names(results[0].dtype)) - set(incl_unc)
        x = rf.drop_fields(results[0], drop_names, asrecarray=True)

    if isinstance(classify, six.string_types):
        y = results[1][classify]
    elif callable(classify):
        y = classify(results[1])
    else:
        raise TypeError("unknown type for classify")

    return Prim(x, y, threshold=threshold, **kwargs)

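# Usage sketch mirroring test_drop_restriction above (hypothetical data):
# x is a structured experiments array and the dict holds a binary outcome
# named 'y'. Assumes the module-level imports used above (numpy as np, six,
# rf) and the Prim class are available.
import numpy as np

x = np.array([(0, 1, 2), (2, 5, 6), (3, 2, 1)],
             dtype=[('a', float), ('b', float), ('c', float)])
results = (x, {'y': np.array([1, 1, 0])})
prim_obj = setup_prim(results, 'y', threshold=0.8)
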
def _determine_restricted_dims(box_lims, box_init):
    '''
    determine which dimensions of the given box are restricted compared
    to the initial box that contains all the data

    Parameters
    ----------
    box_lims : structured numpy array
               a specific box limit
    box_init : structured numpy array
               the initial box containing all data points

    '''
    logical = _compare(box_init, box_lims)
    u = np.asarray(recfunctions.get_names(box_lims.dtype),
                   dtype=object)
    dims = u[logical == False]
    return dims

def test_drop_restriction(self):
    x = np.array([(0, 1, 2),
                  (2, 5, 6),
                  (3, 2, 1)],
                 dtype=[('a', np.float), ('b', np.float), ('c', np.float)])
    y = np.array([1, 1, 0])

    prim_obj = Prim(x, y, threshold=0.8)
    box = PrimBox(prim_obj, prim_obj._box_init, prim_obj.yi)

    new_box_lim = np.array([(0, 1, 1),
                            (2, 2, 6)],
                           dtype=[('a', np.float), ('b', np.float), ('c', np.float)])
    indices = np.array([0, 1], dtype=np.int)
    box.update(new_box_lim, indices)

    box.drop_restriction('b')

    correct_box_lims = np.array([(0, 1, 1),
                                 (2, 5, 6)],
                                dtype=[('a', np.float), ('b', np.float), ('c', np.float)])
    box_lims = box._box_lims[-1]
    names = recfunctions.get_names(correct_box_lims.dtype)
    for entry in names:
        lim_correct = correct_box_lims[entry]
        lim_box = box_lims[entry]
        for i in range(len(lim_correct)):
            self.assertEqual(lim_correct[i], lim_box[i])

    self.assertEqual(box.peeling_trajectory['mean'][2], 1)
    self.assertEqual(box.peeling_trajectory['coverage'][2], 1)
    self.assertEqual(box.peeling_trajectory['density'][2], 1)
    self.assertEqual(box.peeling_trajectory['res dim'][2], 1)
    self.assertEqual(box.peeling_trajectory['mass'][2], 2 / 3)

def make_box(x):
    '''
    Make a box that encompasses all the data

    Parameters
    ----------
    x : structured numpy array

    '''
    # get the types in the order they appear in the numpy array
    types = [(v[1], k, v[0].name) for k, v in six.iteritems(x.dtype.fields)]
    types = sorted(types)

    # convert any bool types to object to store set(False, True)
    ntypes = [(k, 'object' if t == 'bool' else t) for (_, k, t) in types]

    # create box limits
    box = np.zeros((2, ), ntypes)
    names = recfunctions.get_names(x.dtype)

    for name in names:
        dtype = box.dtype.fields.get(name)[0]
        values = x[name]

        if isinstance(values, np.ma.MaskedArray):
            values = values.compressed()

        if dtype == 'object':
            try:
                values = set(values)
                box[name][:] = values
            except TypeError as e:
                logging.getLogger(__name__).warning(
                    "{} has unhashable values".format(name))
                raise e
        else:
            box[name][0] = np.min(values, axis=0)
            box[name][1] = np.max(values, axis=0)

    return box

def setup_prim(results, classify, incl_unc=[], **kwargs):
    """Helper function for setting up the prim algorithm

    Parameters
    ----------
    results : tuple of structured array and dict with numpy arrays
              the return from :meth:`perform_experiments`.
    classify : string, function or callable
               either a string denoting the outcome of interest to
               use or a function.
    kwargs : valid keyword arguments for prim.Prim

    Returns
    -------
    a Prim instance

    Raises
    ------
    PrimException
        if data resulting from classify is not a 1-d array.
    TypeError
        if classify is not a string or a callable.

    """
    if not incl_unc:
        x = np.ma.array(results[0])
    else:
        drop_names = set(rf.get_names(results[0].dtype)) - set(incl_unc)
        x = rf.drop_fields(results[0], drop_names, asrecarray=True)

    if type(classify) == StringType:
        y = results[1][classify]
    elif callable(classify):
        y = classify(results[1])
    else:
        raise TypeError("unknown type for classify")

    return Prim(x, y, **kwargs)

def setup_cart(results, classify, incl_unc=[], mass_min=0.05):
    """helper function for performing cart in combination with data
    generated by the workbench.

    Parameters
    ----------
    results : tuple of structured array and dict with numpy arrays
              the return from :meth:`perform_experiments`.
    classify : string, function or callable
               either a string denoting the outcome of interest to
               use or a function.
    incl_unc : list of strings
    mass_min : float

    Raises
    ------
    TypeError
        if classify is not a string or a callable.

    """
    if not incl_unc:
        x = np.ma.array(results[0])
    else:
        drop_names = set(recfunctions.get_names(
            results[0].dtype)) - set(incl_unc)
        x = recfunctions.drop_fields(results[0], drop_names,
                                     asrecarray=True)

    if isinstance(classify, six.string_types):
        y = results[1][classify]
        mode = sdutil.REGRESSION
    elif callable(classify):
        y = classify(results[1])
        mode = sdutil.BINARY
    else:
        raise TypeError("unknown type for classify")

    return CART(x, y, mass_min, mode=mode)

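# Hedged usage sketch for setup_cart: a string classify selects regression
# mode on that outcome, while a callable such as flu_classify (used in the
# tests above) selects binary mode. Assumes results is the tuple returned
# by perform_experiments.
cart_alg = setup_cart(results, flu_classify, mass_min=0.05)
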
def get_univariate_feature_scores(x, y, score_func=F_CLASSIFICATION):
    '''
    calculate feature scores using univariate statistical tests. In case of
    categorical data, chi square or the Anova F value is used. In case of
    continuous data the Anova F value is used.

    Parameters
    ----------
    x : structured array
    y : 1D nd.array
    score_func : {F_CLASSIFICATION, F_REGRESSION, CHI2}
                 the score function to use, one of f_regression (regression),
                 or f_classification or chi2 (classification).

    Returns
    -------
    pandas DataFrame
        sorted in descending order of tuples with uncertainty and feature
        scores (i.e. p values in this case).

    '''
    uncs = recfunctions.get_names(x.dtype)

    x = _prepare_experiments(x)
    pvalues = score_func(x, y)[1]
    pvalues = np.asarray(pvalues)

    pvalues = zip(uncs, pvalues)
    pvalues = list(pvalues)
    pvalues.sort(key=itemgetter(1))
    pvalues = pd.DataFrame(pvalues)

    return pvalues

def _rotate_subset(self, value, orig_experiments, logical):
    '''
    rotate a subset

    Parameters
    ----------
    value : list of str
    orig_experiments : numpy structured array
    logical : boolean array

    '''
    list_dtypes = [(name, "<f8") for name in value]

    # cast everything to float
    drop_names = set(rf.get_names(orig_experiments.dtype)) - set(value)
    orig_subset = rf.drop_fields(orig_experiments, drop_names,
                                 asrecarray=True)
    subset_experiments = orig_subset.astype(list_dtypes).view(
        '<f8').reshape(orig_experiments.shape[0], len(value))

    # normalize the data
    mean = np.mean(subset_experiments, axis=0)
    std = np.std(subset_experiments, axis=0)
    std[std == 0] = 1  # in order to avoid a division by zero
    subset_experiments = (subset_experiments - mean) / std

    # get the experiments of interest
    experiments_of_interest = subset_experiments[logical]

    # determine the rotation
    rotation_matrix = self._determine_rotation(experiments_of_interest)

    # apply the rotation
    subset_experiments = np.dot(subset_experiments, rotation_matrix)
    return rotation_matrix, subset_experiments

def __init__(self, x, y, threshold=None, threshold_type=">",
             include=None, exclude=None, **kwargs):
    """Generates a decision tree for classification.

    Parameters
    ----------
    x : a matrix-like object (pandas.DataFrame, numpy.recarray, etc.)
        the independent variables
    y : a list-like object, the column name (str), or callable
        the dependent variable either provided as a list-like object
        classifying the data into cases of interest (e.g., False/True),
        a list-like object storing the raw variable value (in which case
        a threshold must be given), a string identifying the dependent
        variable in x, or a function called on each row of x to compute
        the dependent variable
    threshold : float
        threshold for identifying cases of interest
    threshold_type : str
        comparison operator used when identifying cases of interest
    include : list of str
        the names of variables included in the PRIM analysis
    exclude : list of str
        the names of variables excluded from the PRIM analysis

    """
    super(Cart, self).__init__()

    # Ensure the input x is a numpy matrix/array
    if isinstance(x, pd.DataFrame):
        x = x.to_records(index=False)
    elif isinstance(x, np.ma.MaskedArray):
        pass
    else:
        x = pd.DataFrame(x).to_records(index=False)

    # if y is a string or function, compute the actual response value
    # otherwise, ensure y is a numpy matrix/array
    if isinstance(y, six.string_types):
        key = y
        y = x[key]

        if exclude:
            exclude = list(exclude) + [key]
        else:
            exclude = [key]
    elif six.callable(y):
        fun = y
        y = np.apply_along_axis(fun, 0, x)
    elif isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
        y = y.values
    elif isinstance(y, np.ma.MaskedArray):
        pass
    else:
        y = np.asarray(y)

    # convert include/exclude arguments to lists if they are strings
    if include and isinstance(include, six.string_types):
        include = [include]

    if exclude and isinstance(exclude, six.string_types):
        exclude = [exclude]

    # include or exclude columns from the analysis
    if include:
        if isinstance(include, six.string_types):
            include = [include]

        drop_names = set(rf.get_names(x.dtype)) - set(include)
        x = rf.drop_fields(x, drop_names, asrecarray=True)

    if exclude:
        if isinstance(exclude, six.string_types):
            exclude = [exclude]

        drop_names = set(exclude)
        x = rf.drop_fields(x, drop_names, asrecarray=True)

    # apply the threshold, if one is provided
    if threshold:
        if six.callable(threshold):
            y = np.apply_along_axis(threshold, 0, y)
        else:
            # The syntax for threshold_type is "x <op> <threshold>", e.g.,
            # "x > 0.5". However, partial only supports positional
            # arguments for built-in operators. Thus, we must assign the
            # threshold to the first position and use a different operator.
            # For example, "x > 0.5" must be evaluated as "0.5 < x".
            OPERATORS = {"<": operator.gt,
                         ">": operator.lt,
                         "<=": operator.ge,
                         ">=": operator.le,
                         "=": operator.eq}

            op = OPERATORS[threshold_type]
            y = np.apply_along_axis(functools.partial(op, threshold), 0, y)

    # validate inputs
    if len(y.shape) > 1:
        raise ValueError("y is not a 1-d array")

    # extract feature names
    feature_names = rf.get_names(x.dtype)

    # ensure x is formatted as a 2D matrix
    x = x.view("<f8").reshape(x.shape + (-1, ))

    clf = tree.DecisionTreeClassifier(**kwargs)
    clf = clf.fit(x, y)

    # add our custom metadata to the classifier
    self._feature_names = feature_names
    self._x = x
    self._y = y
    self._clf = clf

def __init__(self, x, y, threshold=None, threshold_type=">",
             obj_func=lenient1, peel_alpha=0.05, paste_alpha=0.05,
             mass_min=0.05, include=None, exclude=None, coi=None):
    """Creates a new PRIM object.

    The PRIM object maintains the current state of the PRIM algorithm,
    recording the PRIM boxes found thus far, the remaining (uncaptured)
    cases of interest in the dataset, and provides methods for finding
    the next PRIM box and viewing statistics.

    Parameters
    ----------
    x : a matrix-like object (pandas.DataFrame, numpy.recarray, etc.)
        the independent variables
    y : a list-like object, the column name (str), or callable
        the dependent variable either provided as a list-like object
        classifying the data into cases of interest (e.g., False/True),
        a list-like object storing the raw variable value (in which case
        a threshold must be given), a string identifying the dependent
        variable in x, or a function called on each row of x to compute
        the dependent variable
    threshold : float
        threshold for identifying cases of interest
    threshold_type : str
        comparison operator used when identifying cases of interest
    obj_func : callable (default: lenient1)
        a function that computes the objective function (peeling criteria)
    peel_alpha : float (default: 0.05)
        parameter controlling the peeling stage
    paste_alpha : float (default: 0.05)
        parameter controlling the pasting stage
    mass_min : float (default: 0.05)
        minimum mass of a box
    include : list of str
        the names of variables included in the PRIM analysis
    exclude : list of str
        the names of variables excluded from the PRIM analysis
    coi : str or list of str
        if y contains strings, coi identifies which string is the case
        of interest

    """
    # Ensure the input x is a numpy matrix/array
    if isinstance(x, pd.DataFrame):
        x = x.to_records(index=False)
    elif isinstance(x, np.ma.MaskedArray):
        pass
    else:
        x = pd.DataFrame(x).to_records(index=False)

    # if y is a string or function, compute the actual response value
    # otherwise, ensure y is a numpy matrix/array
    if isinstance(y, six.string_types):
        key = y
        y = x[key]

        if exclude:
            exclude = list(exclude) + [key]
        else:
            exclude = [key]
    elif six.callable(y):
        fun = y
        y = np.apply_along_axis(fun, 0, x)
    elif isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
        y = y.values
    elif isinstance(y, np.ma.MaskedArray):
        pass
    else:
        y = np.asarray(y)

    # convert include/exclude arguments to lists if they are strings
    if include and isinstance(include, six.string_types):
        include = [include]

    if exclude and isinstance(exclude, six.string_types):
        exclude = [exclude]

    # include or exclude columns from the analysis
    if include:
        if isinstance(include, six.string_types):
            include = [include]

        drop_names = set(rf.get_names(x.dtype)) - set(include)
        x = rf.drop_fields(x, drop_names, asrecarray=True)

    if exclude:
        if isinstance(exclude, six.string_types):
            exclude = [exclude]

        drop_names = set(exclude)
        x = rf.drop_fields(x, drop_names, asrecarray=True)

    # apply the threshold, if one is provided
    if threshold:
        if six.callable(threshold):
            y = np.apply_along_axis(threshold, 0, y)
        else:
            # The syntax for threshold_type is "x <op> <threshold>", e.g.,
            # "x > 0.5". However, partial only supports positional
            # arguments for built-in operators. Thus, we must assign the
            # threshold to the first position and use a different operator.
            # For example, "x > 0.5" must be evaluated as "0.5 < x".
            OPERATORS = {"<=": operator.ge,
                         ">=": operator.le,
                         "<": operator.gt,
                         ">": operator.lt,
                         "=": operator.eq}

            op = OPERATORS[threshold_type]
            y = np.apply_along_axis(functools.partial(op, threshold), 0, y)

    # validate inputs
    if len(y.shape) > 1:
        raise PrimError("y is not a 1-d array")

    unique_y = np.unique(y)

    if unique_y.shape[0] > 2:
        raise PrimError("y must contain only two values (0/1 or False/True)")

    if ((unique_y.shape[0] == 2 and
            (False not in unique_y or True not in unique_y)) or
            (False not in unique_y and True not in unique_y)):
        if coi is None:
            raise PrimError("y must contain only two values "
                            "(0/1 or False/True)")
        else:
            if not hasattr(coi, "__iter__") and \
                    not isinstance(coi, six.string_types):
                coi = [coi]
            y = np.asarray([1 if yi in coi else 0 for yi in y])

    # store the parameters
    self.x = x
    self.y = y
    self.paste_alpha = paste_alpha
    self.peel_alpha = peel_alpha
    self.mass_min = mass_min
    self.threshold = threshold
    self.threshold_type = threshold_type
    self.obj_func = obj_func

    # set the indices
    self.yi = np.arange(0, self.y.shape[0])

    # how many data points do we have
    self.n = self.y.shape[0]

    # how many cases of interest do we have?
    self.t_coi = self.determine_coi(self.yi)

    # initial box that contains all data
    self._box_init = make_box(self.x)

    # make a list in which the identified boxes can be put
    self._boxes = []

    # set yi_remaining to all y values
    self._update_yi_remaining()

def test_store_cases(self):
    nr_experiments = 3
    uncs = [RealParameter("a", 0, 1),
            RealParameter("b", 0, 1),
            CategoricalParameter('c', [0, 1, 2]),
            IntegerParameter("d", 0, 1)]
    outcomes = [TimeSeriesOutcome("test")]
    constraints = []
    case = {unc.name: random.random() for unc in uncs}
    case["c"] = int(round(case["c"] * 2))
    case["d"] = int(round(case["d"]))

    model = NamedObject('test')
    policy = Policy('policy')
    scenario = Scenario(**case)
    experiment = Case(0, model.name, policy, scenario, 0)

    callback = DefaultCallback(uncs, [], outcomes, constraints,
                               nr_experiments=nr_experiments,
                               reporting_interval=1)
    model_outcomes = {outcomes[0].name: 1}
    model_constraints = {}
    callback(experiment, model_outcomes, model_constraints)

    experiments, _ = callback.get_results()
    design = case
    design['policy'] = policy.name
    design['model'] = model.name
    design['scenario_id'] = scenario.name

    names = rf.get_names(experiments.dtype)
    for name in names:
        entry_a = experiments[name][0]
        entry_b = design[name]

        self.assertEqual(entry_a, entry_b, "failed for " + name)

    # with levers
    nr_experiments = 3
    uncs = [RealParameter("a", 0, 1),
            RealParameter("b", 0, 1)]
    levers = [RealParameter("c", 0, 1),
              RealParameter("d", 0, 1)]
    outcomes = [TimeSeriesOutcome("test")]
    case = {unc.name: random.random() for unc in uncs}

    model = NamedObject('test')
    policy = Policy('policy', c=1, d=1)
    scenario = Scenario(**case)
    experiment = Case(0, model.name, policy, scenario, 0)

    callback = DefaultCallback(uncs, levers, outcomes, constraints,
                               nr_experiments=nr_experiments,
                               reporting_interval=1)
    model_outcomes = {outcomes[0].name: 1}
    model_constraints = {}
    callback(experiment, model_outcomes, model_constraints)

    experiments, _ = callback.get_results()
    design = case
    design['c'] = 1
    design['d'] = 1
    design['policy'] = policy.name
    design['model'] = model.name
    design['scenario_id'] = scenario.name

    names = rf.get_names(experiments.dtype)
    for name in names:
        self.assertEqual(experiments[name][0], design[name])

def get_ex_feature_scores(x, y, mode=CLASSIFICATION, nr_trees=250,
                          max_features='auto', max_depth=None,
                          min_samples_split=2, min_samples_leaf=1,
                          min_weight_fraction_leaf=0, max_leaf_nodes=None,
                          bootstrap=True, oob_score=True, random_state=None):
    '''
    Get feature scores using extra trees

    Parameters
    ----------
    x : structured array
    y : 1D nd.array
    mode : {CLASSIFICATION, REGRESSION}
    nr_trees : int, optional
               nr. of trees in forest (default=250)
    max_features : int, optional
                   see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    max_depth : int, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    min_samples_split : int, optional
                        see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    min_samples_leaf : int, optional
                       see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    min_weight_fraction_leaf : float, optional
                               see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    max_leaf_nodes : int or None, optional
                     see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    bootstrap : bool, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    oob_score : bool, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    random_state : int, optional
                   see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

    Returns
    -------
    pandas DataFrame
        sorted in descending order of tuples with uncertainty and feature
        scores
    object
        either ExtraTreesClassifier or ExtraTreesRegressor

    '''
    uncs = recfunctions.get_names(x.dtype)
    x = _prepare_experiments(x)

    if mode == CLASSIFICATION:
        etc = ExtraTreesClassifier
        criterion = 'gini'
    elif mode == REGRESSION:
        etc = ExtraTreesRegressor
        criterion = 'mse'
    else:
        raise ValueError('{} not valid for mode'.format(mode))

    extra_trees = etc(n_estimators=nr_trees,
                      criterion=criterion,
                      max_features=max_features,
                      max_depth=max_depth,
                      min_samples_split=min_samples_split,
                      min_samples_leaf=min_samples_leaf,
                      min_weight_fraction_leaf=min_weight_fraction_leaf,
                      max_leaf_nodes=max_leaf_nodes,
                      bootstrap=bootstrap,
                      oob_score=oob_score,
                      random_state=random_state)
    extra_trees.fit(x, y)

    importances = extra_trees.feature_importances_

    importances = zip(uncs, importances)
    importances = list(importances)
    importances.sort(key=itemgetter(1), reverse=True)
    importances = pd.DataFrame(importances)

    return importances, extra_trees

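# Hedged sketch for get_ex_feature_scores on hypothetical data: extra-trees
# scoring of a structured array against a binary target. nr_trees=100 is an
# illustrative choice, not a recommended setting.
import numpy as np

x_et = np.array([(0, 1, 2), (2, 5, 6), (3, 2, 1), (1, 3, 4),
                 (4, 2, 0), (2, 2, 2), (0, 5, 1), (3, 1, 5)],
                dtype=[('a', float), ('b', float), ('c', float)])
y_et = np.array([1, 1, 0, 1, 0, 0, 1, 0])
importances, fitted_trees = get_ex_feature_scores(x_et, y_et,
                                                  mode=CLASSIFICATION,
                                                  nr_trees=100)
# importances: DataFrame of (uncertainty, importance) rows, highest first
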
def get_lasso_feature_scores(x, y, mode=CLASSIFICATION, scaling=0.5,
                             sample_fraction=0.75, n_resampling=200,
                             random_state=None):
    '''
    Calculate features scores using a randomized lasso (regression) or
    randomized logistic regression (classification). This is also known as
    stability selection.

    see http://scikit-learn.org/stable/modules/feature_selection.html for
    details.

    Parameters
    ----------
    x : structured array
    y : 1D nd.array
    mode : {CLASSIFICATION, REGRESSION}
    scaling : float, optional
              scaling parameter, should be between 0 and 1
    sample_fraction : float, optional
                      the fraction of samples to used in each randomized
                      dataset
    n_resampling : int, optional
                   the number of times the model is trained on a random
                   subset of the data
    random_state : int, optional
                   if it is an int, it specifies the seed to use, defaults
                   to None.

    Returns
    -------
    pandas DataFrame
        sorted in descending order of tuples with uncertainty and feature
        scores

    '''
    uncs = recfunctions.get_names(x.dtype)

    x = _prepare_experiments(x)

    if mode == CLASSIFICATION:
        lfs = RandomizedLogisticRegression(scaling=scaling,
                                           sample_fraction=sample_fraction,
                                           n_resampling=n_resampling,
                                           random_state=random_state)
        lfs.fit(x, y)
    elif mode == REGRESSION:
        # we use LassoLarsCV to determine alpha see
        # http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html
        lars_cv = LassoLarsCV(cv=6).fit(x, y)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

        # fit the randomized lasso
        lfs = RandomizedLasso(alpha=alphas, scaling=scaling,
                              sample_fraction=sample_fraction,
                              n_resampling=n_resampling,
                              random_state=random_state)
        lfs.fit(x, y)
    else:
        raise ValueError('{} invalid value for mode'.format(mode))

    importances = lfs.scores_

    importances = zip(uncs, importances)
    importances = list(importances)
    importances.sort(key=itemgetter(1), reverse=True)
    importances = pd.DataFrame(importances)

    return importances

def test_store_cases(self):
    nr_experiments = 3
    uncs = [RealParameter("a", 0, 1),
            RealParameter("b", 0, 1),
            CategoricalParameter('c', [0, 1, 2]),
            IntegerParameter("d", 0, 1)]
    outcomes = [TimeSeriesOutcome("test")]
    case = {unc.name: random.random() for unc in uncs}
    case["c"] = int(round(case["c"] * 2))
    case["d"] = int(round(case["d"]))

    model = NamedObject('test')
    policy = Policy('policy')
    scenario = Scenario(**case)
    experiment = Experiment(0, model, policy, scenario, 0)

    callback = DefaultCallback(uncs, [], outcomes,
                               nr_experiments=nr_experiments,
                               reporting_interval=1)
    result = {outcomes[0].name: 1}
    callback(experiment, result)

    experiments, _ = callback.get_results()
    design = case
    design['policy'] = policy.name
    design['model'] = model.name

    names = rf.get_names(experiments.dtype)
    for name in names:
        self.assertEqual(experiments[name][0], design[name])

    # with levers
    nr_experiments = 3
    uncs = [RealParameter("a", 0, 1),
            RealParameter("b", 0, 1)]
    levers = [RealParameter("c", 0, 1),
              RealParameter("d", 0, 1)]
    outcomes = [TimeSeriesOutcome("test")]
    case = {unc.name: random.random() for unc in uncs}

    model = NamedObject('test')
    policy = Policy('policy', c=1, d=1)
    scenario = Scenario(**case)
    experiment = Experiment(0, model, policy, scenario, 0)

    callback = DefaultCallback(uncs, levers, outcomes,
                               nr_experiments=nr_experiments,
                               reporting_interval=1)
    result = {outcomes[0].name: 1}
    callback(experiment, result)

    experiments, _ = callback.get_results()
    design = case
    design['c'] = 1
    design['d'] = 1
    design['policy'] = policy.name
    design['model'] = model.name

    names = rf.get_names(experiments.dtype)
    print(experiments[0])
    for name in names:
        self.assertEqual(experiments[name][0], design[name])

def get_rf_feature_scores(results, classify, nr_trees=250, criterion='gini',
                          max_features='auto', max_depth=None,
                          min_samples_split=2, min_samples_leaf=1,
                          bootstrap=True, oob_score=True, random_state=None):
    '''
    Get feature scores using a random forest

    Parameters
    ----------
    results : tuple
              results tuple
    classify : callable or str
               a classify function or variable analogous to PRIM
    nr_trees : int, optional
               nr. of trees in forest (default=250)
    criterion : str, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    max_features : int, optional
                   see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    max_depth : int, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    min_samples_split : int, optional
                        see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    min_samples_leaf : int, optional
                       see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    bootstrap : bool, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    oob_score : bool, optional
                see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    random_state : int, optional
                   see http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

    Returns
    -------
    list of tuples
        sorted in descending order of tuples with uncertainty and feature
        scores
    object
        either RandomForestClassifier or RandomForestRegressor

    '''
    experiments, outcomes = results

    uncs = recfunctions.get_names(experiments.dtype)

    x = _prepare_experiments(experiments)
    y, categorical = _prepare_outcomes(outcomes, classify)

    if categorical:
        rfc = RandomForestClassifier
    else:
        rfc = RandomForestRegressor
        criterion = 'mse'

    forest = rfc(n_estimators=nr_trees,
                 criterion=criterion,
                 max_features=max_features,
                 max_depth=max_depth,
                 min_samples_split=min_samples_split,
                 min_samples_leaf=min_samples_leaf,
                 bootstrap=bootstrap,
                 oob_score=oob_score,
                 random_state=random_state)
    forest.fit(x, y)

    importances = forest.feature_importances_

    importances = zip(uncs, importances)
    importances = list(importances)
    importances.sort(key=itemgetter(1), reverse=True)

    return importances, forest

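# Hedged usage sketch: random-forest feature scoring on a results tuple with
# a classify callable, analogous to the PRIM tests above. Assumes results
# and flu_classify are available as in those tests; nr_trees=100 is an
# illustrative choice only.
importances, forest = get_rf_feature_scores(results, flu_classify,
                                            nr_trees=100)
# importances: list of (uncertainty, importance) tuples, highest first
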
def get_lasso_feature_scores(results, classify, scaling=0.5,
                             sample_fraction=0.75, n_resampling=200,
                             random_state=None):
    '''
    Calculate features scores using a randomized lasso (regression) or
    randomized logistic regression (classification). This is also known as
    stability selection.

    see http://scikit-learn.org/stable/modules/feature_selection.html for
    details.

    Parameters
    ----------
    results : tuple
    classify : callable or str
               a classify function or variable analogous to PRIM
    scaling : float, optional
              scaling parameter, should be between 0 and 1
    sample_fraction : float, optional
                      the fraction of samples to used in each randomized
                      dataset
    n_resampling : int, optional
                   the number of times the model is trained on a random
                   subset of the data
    random_state : int, optional
                   if it is an int, it specifies the seed to use, defaults
                   to None.

    Returns
    -------
    list of tuples
        sorted in descending order of tuples with uncertainty and feature
        scores

    '''
    experiments, outcomes = results

    uncs = recfunctions.get_names(experiments.dtype)

    x = _prepare_experiments(experiments)
    y, categorical = _prepare_outcomes(outcomes, classify)

    if categorical:
        lfs = RandomizedLogisticRegression(scaling=scaling,
                                           sample_fraction=sample_fraction,
                                           n_resampling=n_resampling,
                                           random_state=random_state)
        lfs.fit(x, y)
    else:
        # we use LassoLarsCV to determine alpha see
        # http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html
        lars_cv = LassoLarsCV(cv=6).fit(x, y)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)

        # fit the randomized lasso
        lfs = RandomizedLasso(alpha=alphas, scaling=scaling,
                              sample_fraction=sample_fraction,
                              n_resampling=n_resampling,
                              random_state=random_state)
        lfs.fit(x, y)

    importances = lfs.scores_

    importances = zip(uncs, importances)
    importances = list(importances)
    importances.sort(key=itemgetter(1), reverse=True)

    return importances

def __init__(self, results, classify, obj_function=DEFAULT,
             peel_alpha=0.05, paste_alpha=0.05, mass_min=0.05,
             threshold=None, threshold_type=ABOVE, incl_unc=[]):
    '''
    :param results: the return from :meth:`perform_experiments`.
    :param classify: either a string denoting the outcome of interest to
                     use or a function.
    :param peel_alpha: parameter controlling the peeling stage
                       (default = 0.05).
    :param paste_alpha: parameter controlling the pasting stage
                        (default = 0.05).
    :param mass_min: minimum mass of a box (default = 0.05).
    :param threshold: the threshold of the output space that boxes should
                      meet.
    :param threshold_type: If 1, the boxes should go above the threshold,
                           if -1 the boxes should go below the threshold,
                           if 0, the algorithm looks for both +1 and -1.
    :param obj_function: The objective function to use. Default is
                         :func:`def_obj_func`
    :param incl_unc: optional argument, should be a list of uncertainties
                     that are to be included in the prim analysis.
    :raises: PrimException if data resulting from classify is not
             a 1-d array.
    :raises: TypeError if classify is not a string or a callable.

    '''
    assert threshold != None

    if not incl_unc:
        self.x = results[0]
    else:
        drop_names = set(recfunctions.get_names(results[0].dtype)) - set(incl_unc)
        self.x = recfunctions.drop_fields(results[0], drop_names,
                                          asrecarray=True)

    if type(classify) == StringType:
        self.y = results[1][classify]
    elif callable(classify):
        self.y = classify(results[1])
    else:
        raise TypeError("unknown type for classify")

    if len(self.y.shape) > 1:
        raise PrimException("y is not a 1-d array")

    # store the remainder of the parameters
    self.paste_alpha = paste_alpha
    self.peel_alpha = peel_alpha
    self.mass_min = mass_min
    self.threshold = threshold
    self.threshold_type = threshold_type
    self.obj_func = self._obj_functions[obj_function]

    # set the indices
    self.yi = np.arange(0, self.y.shape[0])

    # how many data points do we have
    self.n = self.y.shape[0]

    # how many cases of interest do we have?
    self.t_coi = self.determine_coi(self.yi)

    # initial box that contains all data
    self.box_init = self.make_box(self.x)

    # make a list in which the identified boxes can be put
    self.boxes = []

    self._update_yi_remaining()