Exemplo n.º 1
0
    def __init__(self, ensemble, api=None, max_models=None):

        if api is None:
            self.api = BigML(storage=STORAGE)
        else:
            self.api = api
        self.ensemble_id = None
        if isinstance(ensemble, list):
            try:
                models = [get_model_id(model) for model in ensemble]
            except ValueError:
                raise ValueError('Failed to verify the list of models. Check '
                                 'your model id values.')
            self.distributions = None
        else:
            self.ensemble_id = get_ensemble_id(ensemble)
            ensemble = check_resource(ensemble, self.api.get_ensemble)
            models = ensemble['object']['models']
            self.distributions = ensemble['object'].get('distributions', None)
        self.model_ids = models
        self.fields = self.all_model_fields()

        number_of_models = len(models)
        if max_models is None:
            self.models_splits = [models]
        else:
            self.models_splits = [models[index:(index + max_models)] for index
                                  in range(0, number_of_models, max_models)]
        if len(self.models_splits) == 1:
            models = [retrieve_resource(self.api, model_id,
                                        query_string=ONLY_MODEL)
                      for model_id in self.models_splits[0]]
            self.multi_model = MultiModel(models, self.api)
Exemplo n.º 2
0
    def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
                with_confidence=False, options=None):
        """Makes a prediction based on the prediction made by every model.

           The method parameter is a numeric key to the following combination
           methods in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE
        """

        if len(self.models_splits) > 1:
            # If there's more than one chunck of models, they must be
            # sequentially used to generate the votes for the prediction
            votes = MultiVote([])
            for models_split in self.models_splits:
                models = [retrieve_resource(self.api, model_id,
                                            query_string=ONLY_MODEL)
                          for model_id in models_split]
                multi_model = MultiModel(models, api=self.api)
                votes_split = multi_model.generate_votes(input_data,
                                                         by_name=by_name)
                votes.extend(votes_split.predictions)
        else:
            # When only one group of models is found you use the
            # corresponding multimodel to predict
            votes_split = self.multi_model.generate_votes(input_data,
                                                          by_name=by_name)
            votes = MultiVote(votes_split.predictions)
        return votes.combine(method=method, with_confidence=with_confidence,
                             options=options)
Exemplo n.º 3
0
    def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
                with_confidence=False, options=None):
        """Makes a prediction based on the prediction made by every model.

           The method parameter is a numeric key to the following combination
           methods in classifications/regressions:
              0 - majority vote (plurality)/ average: PLURALITY_CODE
              1 - confidence weighted majority vote / error weighted:
                  CONFIDENCE_CODE
              2 - probability weighted majority vote / average:
                  PROBABILITY_CODE
              3 - threshold filtered vote / doesn't apply:
                  THRESHOLD_CODE
        """

        if len(self.models_splits) > 1:
            # If there's more than one chunck of models, they must be
            # sequentially used to generate the votes for the prediction
            votes = MultiVote([])
            for models_split in self.models_splits:
                models = [retrieve_resource(self.api, model_id,
                                            query_string=ONLY_MODEL)
                          for model_id in models_split]
                multi_model = MultiModel(models, api=self.api)
                votes_split = multi_model.generate_votes(input_data,
                                                         by_name=by_name)
                votes.extend(votes_split.predictions)
        else:
            # When only one group of models is found you use the
            # corresponding multimodel to predict
            votes_split = self.multi_model.generate_votes(input_data,
                                                          by_name=by_name)
            votes = MultiVote(votes_split.predictions)
        return votes.combine(method=method, with_confidence=with_confidence,
                             options=options)
Exemplo n.º 4
0
    def __init__(self, ensemble, api=None, max_models=None):

        if api is None:
            self.api = BigML(storage=STORAGE)
        else:
            self.api = api
        self.ensemble_id = None
        if isinstance(ensemble, list):
            try:
                models = [get_model_id(model) for model in ensemble]
            except ValueError:
                raise ValueError('Failed to verify the list of models. Check '
                                 'your model id values.')
            self.distributions = None
        else:
            self.ensemble_id = get_ensemble_id(ensemble)
            ensemble = check_resource(ensemble, self.api.get_ensemble)
            models = ensemble['object']['models']
            self.distributions = ensemble['object'].get('distributions', None)
        self.model_ids = models
        self.fields = self.all_model_fields()

        number_of_models = len(models)
        if max_models is None:
            self.models_splits = [models]
        else:
            self.models_splits = [models[index:(index + max_models)] for index
                                  in range(0, number_of_models, max_models)]
        if len(self.models_splits) == 1:
            models = [retrieve_resource(self.api, model_id,
                                        query_string=ONLY_MODEL)
                      for model_id in self.models_splits[0]]
            self.multi_model = MultiModel(models, self.api)
Exemplo n.º 5
0
    def __init__(self, ensemble, model_fns_dir, api=None):

        if api is None:
            self.api = BigML(storage=STORAGE)
        else:
            self.api = api
        self.resource_id = None
        # to be deprecated
        self.ensemble_id = None
        self.objective_id = None
        self.distributions = None
        self.distribution = None
        self.models_splits = []
        self.multi_model = None
        self.boosting = None
        self.boosting_offsets = None
        self.regression = False
        self.fields = None
        self.class_names = None
        self.importance = {}
        self.predict_functions = []

        ensemble = self.get_ensemble_resource(ensemble)
        self.resource_id = get_ensemble_id(ensemble)
        self.ensemble_id = self.resource_id

        if lacks_info(ensemble, inner_key="ensemble"):
            # avoid checking fields because of old ensembles
            ensemble = retrieve_resource(self.api, self.resource_id,
                                         no_check_fields=True)
        if ensemble['object'].get('type') == BOOSTING:
            self.boosting = ensemble['object'].get('boosting')
        models = ensemble['object']['models']
        self.distributions = ensemble['object'].get('distributions', [])
        self.importance = ensemble['object'].get('importance', [])
        self.model_ids = models
        # new ensembles have the fields structure
        if ensemble['object'].get('ensemble'):
            self.fields = ensemble['object'].get( \
                'ensemble', {}).get("fields")
            self.objective_id = ensemble['object'].get("objective_field")
            self.input_fields = ensemble['object'].get("input_fields")

        if model_fns_dir:
            self.get_model_fns(model_fns_dir)
        else:
            raise ValueError("The EnsemblePredictor object expects as"
                             " argument the directory where the models"
                             " predict functions are stored. To generate "
                             " them, please check the 'bigmler export'"
                             " command.")

        if self.fields:
            summary = self.fields[self.objective_id]['summary']
            if 'bins' in summary:
                distribution = summary['bins']
            elif 'counts' in summary:
                distribution = summary['counts']
            elif 'categories' in summary:
                distribution = summary['categories']
            else:
                distribution = []
            self.distribution = distribution

        self.regression = \
            self.fields[self.objective_id].get('optype') == 'numeric'
        if self.boosting:
            self.boosting_offsets = ensemble['object'].get('initial_offset',
                                                           0) \
                if self.regression else dict(ensemble['object'].get( \
                    'initial_offsets', []))

        if not self.regression and self.boosting is None:
            try:
                objective_field = self.fields[self.objective_id]
                categories = objective_field['summary']['categories']
                classes = [category[0] for category in categories]
            except (AttributeError, KeyError):
                classes = set()
                for distribution in self.distributions:
                    for category in distribution['training']['categories']:
                        classes.add(category[0])

            self.class_names = sorted(classes)