def __init__(self, ensemble, api=None, max_models=None):
    """Builds a local ensemble from a list of models or an ensemble id.

    :param ensemble: either a list of model ids/resources or a remote
        ensemble id/resource
    :param api: BigML API connection (a storage-backed default is created
        when none is given)
    :param max_models: when set, the models list is partitioned in chunks
        of at most this many models
    :raises ValueError: if any entry of a models list fails id validation
    """
    self.api = BigML(storage=STORAGE) if api is None else api
    self.ensemble_id = None
    if isinstance(ensemble, list):
        # Explicit list of models: validate every id before using it.
        try:
            validated_models = [get_model_id(model) for model in ensemble]
        except ValueError:
            raise ValueError('Failed to verify the list of models. Check '
                             'your model id values.')
        models = validated_models
        self.distributions = None
    else:
        # Remote ensemble resource: fetch it and read its models list.
        self.ensemble_id = get_ensemble_id(ensemble)
        ensemble = check_resource(ensemble, self.api.get_ensemble)
        models = ensemble['object']['models']
        self.distributions = ensemble['object'].get('distributions', None)
    self.model_ids = models
    self.fields = self.all_model_fields()

    models_count = len(models)
    if max_models is None:
        # A single split holding every model.
        self.models_splits = [models]
    else:
        # Partition into chunks of at most max_models models each.
        self.models_splits = [models[offset:(offset + max_models)]
                              for offset in range(0, models_count,
                                                  max_models)]
    if len(self.models_splits) == 1:
        # Only one split: build and cache its MultiModel up front.
        retrieved = [retrieve_resource(self.api, model_id,
                                       query_string=ONLY_MODEL)
                     for model_id in self.models_splits[0]]
        self.multi_model = MultiModel(retrieved, self.api)
def predict(self, input_data, by_name=True, method=PLURALITY_CODE,
            with_confidence=False, options=None):
    """Makes a prediction based on the prediction made by every model.

    The method parameter is a numeric key to the following combination
    methods in classifications/regressions:
        0 - majority vote (plurality)/ average: PLURALITY_CODE
        1 - confidence weighted majority vote / error weighted:
            CONFIDENCE_CODE
        2 - probability weighted majority vote / average: PROBABILITY_CODE
        3 - threshold filtered vote / doesn't apply: THRESHOLD_CODE
    """
    if len(self.models_splits) > 1:
        # More than one chunk of models: each chunk is retrieved and
        # used sequentially, accumulating its votes for the prediction.
        votes = MultiVote([])
        for models_split in self.models_splits:
            split_models = [retrieve_resource(self.api, model_id,
                                              query_string=ONLY_MODEL)
                            for model_id in models_split]
            split_multi_model = MultiModel(split_models, api=self.api)
            split_votes = split_multi_model.generate_votes(
                input_data, by_name=by_name)
            votes.extend(split_votes.predictions)
    else:
        # A single group of models: reuse the MultiModel cached at
        # construction time to generate the votes.
        split_votes = self.multi_model.generate_votes(input_data,
                                                      by_name=by_name)
        votes = MultiVote(split_votes.predictions)
    return votes.combine(method=method, with_confidence=with_confidence,
                         options=options)
def __init__(self, ensemble, model_fns_dir, api=None):
    """Builds a local EnsemblePredictor from exported predict functions.

    :param ensemble: ensemble id or resource
    :param model_fns_dir: directory where the models' predict functions
        (as generated by the `bigmler export` command) are stored
    :param api: BigML API connection (a storage-backed default is created
        when none is given)
    :raises ValueError: if no ``model_fns_dir`` is provided
    """
    if api is None:
        self.api = BigML(storage=STORAGE)
    else:
        self.api = api
    self.resource_id = None  # to be deprecated
    self.ensemble_id = None
    self.objective_id = None
    self.distributions = None
    self.distribution = None
    self.models_splits = []
    self.multi_model = None
    self.boosting = None
    self.boosting_offsets = None
    self.regression = False
    self.fields = None
    self.class_names = None
    self.importance = {}
    self.predict_functions = []
    ensemble = self.get_ensemble_resource(ensemble)
    self.resource_id = get_ensemble_id(ensemble)
    self.ensemble_id = self.resource_id
    if lacks_info(ensemble, inner_key="ensemble"):
        # avoid checking fields because of old ensembles
        ensemble = retrieve_resource(self.api, self.resource_id,
                                     no_check_fields=True)
    if ensemble['object'].get('type') == BOOSTING:
        self.boosting = ensemble['object'].get('boosting')
    models = ensemble['object']['models']
    self.distributions = ensemble['object'].get('distributions', [])
    self.importance = ensemble['object'].get('importance', [])
    self.model_ids = models
    # new ensembles have the fields structure
    if ensemble['object'].get('ensemble'):
        self.fields = ensemble['object'].get(
            'ensemble', {}).get("fields")
        self.objective_id = ensemble['object'].get("objective_field")
    self.input_fields = ensemble['object'].get("input_fields")
    if model_fns_dir:
        self.get_model_fns(model_fns_dir)
    else:
        # Fix: this message was a single concatenated literal broken by a
        # raw newline in the previous version (a syntax error).
        raise ValueError("The EnsemblePredictor object expects as"
                         " argument the directory where the models"
                         " predict functions are stored. To generate "
                         " them, please check the 'bigmler export'"
                         " command.")
    if self.fields:
        # Pick whichever distribution summary the objective field offers.
        summary = self.fields[self.objective_id]['summary']
        if 'bins' in summary:
            distribution = summary['bins']
        elif 'counts' in summary:
            distribution = summary['counts']
        elif 'categories' in summary:
            distribution = summary['categories']
        else:
            distribution = []
        self.distribution = distribution
        self.regression = \
            self.fields[self.objective_id].get('optype') == 'numeric'
    if self.boosting:
        # Regressions use a single initial offset; classifications keep
        # a per-class offsets mapping.
        self.boosting_offsets = ensemble['object'].get('initial_offset', 0) \
            if self.regression else dict(ensemble['object'].get(
                'initial_offsets', []))
    if not self.regression and self.boosting is None:
        try:
            objective_field = self.fields[self.objective_id]
            categories = objective_field['summary']['categories']
            classes = [category[0] for category in categories]
        except (AttributeError, KeyError):
            # Old ensembles: collect classes from the models'
            # training distributions instead.
            classes = set()
            for distribution in self.distributions:
                for category in distribution['training']['categories']:
                    classes.add(category[0])
        self.class_names = sorted(classes)