def get_ensemble_resource(self, ensemble):
    """Extracts the ensemble resource info. The ensemble argument can be
       - a path to a local file
       - an ensemble id

    Returns the ensemble structure (a dict when read from a JSON file,
    otherwise the original argument) and sets ``self.resource_id``.

    Raises:
        ValueError: when the file exists but is not a valid BigML
            ensemble JSON representation, or cannot be parsed as JSON.
        IOError: when the argument is neither a readable file nor an
            ensemble id.
        Exception: when the argument looks like an ensemble id but the
            API reports an error for it.
    """
    # the string can be a path to a JSON file
    if isinstance(ensemble, basestring):
        try:
            with open(ensemble) as ensemble_file:
                ensemble = json.load(ensemble_file)
            self.resource_id = get_ensemble_id(ensemble)
            if self.resource_id is None:
                raise ValueError("The JSON file does not seem"
                                 " to contain a valid BigML ensemble"
                                 " representation.")
        except IOError:
            # if it is not a path, it can be an ensemble id
            self.resource_id = get_ensemble_id(ensemble)
            if self.resource_id is None:
                if ensemble.find('ensemble/') > -1:
                    raise Exception(
                        self.api.error_message(ensemble,
                                               resource_type='ensemble',
                                               method='get'))
                else:
                    raise IOError("Failed to open the expected JSON file"
                                  " at %s" % ensemble)
        except ValueError:
            # Fix: the original message left the "%s" placeholder
            # unfilled; interpolate the offending path so the error
            # is actionable.
            raise ValueError("Failed to interpret %s."
                             " JSON file expected." % ensemble)
    return ensemble
def __init__(self, ensemble, api=None, max_models=None):
    """Builds a local ensemble wrapper.

    ``ensemble`` may be a list of model ids or an ensemble id/resource.
    Models are partitioned in groups of at most ``max_models``; when a
    single group results, the models are retrieved and a ``MultiModel``
    is built immediately.
    """
    self.api = BigML(storage=STORAGE) if api is None else api
    self.ensemble_id = None
    if isinstance(ensemble, list):
        # explicit list of models: validate every id in it
        try:
            models = [get_model_id(model_ref) for model_ref in ensemble]
        except ValueError:
            raise ValueError('Failed to verify the list of models. Check '
                             'your model id values.')
        self.distributions = None
    else:
        # remote ensemble resource: fetch it and read its models list
        self.ensemble_id = get_ensemble_id(ensemble)
        ensemble = check_resource(ensemble, self.api.get_ensemble)
        models = ensemble['object']['models']
        self.distributions = ensemble['object'].get('distributions', None)
    self.model_ids = models
    self.fields = self.all_model_fields()
    if max_models is None:
        self.models_splits = [models]
    else:
        # chunk the model ids in groups of at most max_models
        self.models_splits = [models[start:start + max_models]
                              for start in range(0, len(models), max_models)]
    if len(self.models_splits) == 1:
        # single split: eagerly retrieve models and build the multimodel
        models = [retrieve_resource(self.api, model_id,
                                    query_string=ONLY_MODEL)
                  for model_id in self.models_splits[0]]
        self.multi_model = MultiModel(models, self.api)
def __init__(self, ensemble, api=None, max_models=None):
    """Stores the ensemble's model ids, split in chunks of at most
       ``max_models`` ids each (one single chunk when unset).
    """
    self.api = api if api is not None else BigML(storage=STORAGE)
    self.ensemble_id = get_ensemble_id(ensemble)
    ensemble = check_resource(ensemble, self.api.get_ensemble)
    models = ensemble['object']['models']
    self.model_ids = models
    if max_models is None:
        # no limit: keep every model in one single split
        self.models_splits = [models]
    else:
        self.models_splits = [models[offset:offset + max_models]
                              for offset in range(0, len(models),
                                                  max_models)]
def __init__(self, ensemble, model_fns_dir, api=None):
    """Builds a local ensemble predictor from an ensemble resource and a
    directory of per-model predict functions.

    :param ensemble: ensemble id, path to a local JSON file, or ensemble
        resource structure (resolved via ``get_ensemble_resource``).
    :param model_fns_dir: directory holding the models' predict
        functions (as generated by 'bigmler export'); required.
    :param api: optional BigML API connection; a default storage-backed
        connection is created when missing.

    :raises ValueError: when ``model_fns_dir`` is not provided.
    """
    if api is None:
        self.api = BigML(storage=STORAGE)
    else:
        self.api = api
    self.resource_id = None  # to be deprecated
    self.ensemble_id = None
    self.objective_id = None
    self.distributions = None
    self.distribution = None
    self.models_splits = []
    self.multi_model = None
    self.boosting = None
    self.boosting_offsets = None
    self.regression = False
    self.fields = None
    self.class_names = None
    self.importance = {}
    self.predict_functions = []
    # resolve id/path/dict into an ensemble structure and record its id
    ensemble = self.get_ensemble_resource(ensemble)
    self.resource_id = get_ensemble_id(ensemble)
    self.ensemble_id = self.resource_id
    if lacks_info(ensemble, inner_key="ensemble"):
        # avoid checking fields because of old ensembles
        ensemble = retrieve_resource(self.api, self.resource_id,
                                     no_check_fields=True)
    # boosted ensembles carry a 'boosting' section in the resource
    if ensemble['object'].get('type') == BOOSTING:
        self.boosting = ensemble['object'].get('boosting')
    models = ensemble['object']['models']
    self.distributions = ensemble['object'].get('distributions', [])
    self.importance = ensemble['object'].get('importance', [])
    self.model_ids = models
    # new ensembles have the fields structure
    if ensemble['object'].get('ensemble'):
        self.fields = ensemble['object'].get(
            'ensemble', {}).get("fields")
        self.objective_id = ensemble['object'].get("objective_field")
    self.input_fields = ensemble['object'].get("input_fields")
    if model_fns_dir:
        # load the per-model predict functions from the given directory
        self.get_model_fns(model_fns_dir)
    else:
        raise ValueError("The EnsemblePredictor object expects as"
                         " argument the directory where the models"
                         " predict functions are stored. To generate "
                         " them, please check the 'bigmler export'"
                         " command.")
    if self.fields:
        # derive the objective field's training distribution from
        # whichever summary key is present (numeric bins, discrete
        # counts or categorical categories)
        summary = self.fields[self.objective_id]['summary']
        if 'bins' in summary:
            distribution = summary['bins']
        elif 'counts' in summary:
            distribution = summary['counts']
        elif 'categories' in summary:
            distribution = summary['categories']
        else:
            distribution = []
        self.distribution = distribution
        self.regression = \
            self.fields[self.objective_id].get('optype') == 'numeric'
    if self.boosting:
        # regression boosting uses a single offset; classification uses
        # one offset per class
        self.boosting_offsets = ensemble['object'].get('initial_offset', 0) \
            if self.regression else dict(ensemble['object'].get(
                'initial_offsets', []))
    if not self.regression and self.boosting is None:
        # collect the class names, falling back to the per-model
        # training distributions when the fields summary lacks them
        try:
            objective_field = self.fields[self.objective_id]
            categories = objective_field['summary']['categories']
            classes = [category[0] for category in categories]
        except (AttributeError, KeyError):
            classes = set()
            for distribution in self.distributions:
                for category in distribution['training']['categories']:
                    classes.add(category[0])
        self.class_names = sorted(classes)
def __init__(self, ensemble, api=None, max_models=None, cache_get=None):
    """Builds a local ensemble.

    :param ensemble: ensemble id/resource/path, or a list of models
        (ids or ``Model`` instances).
    :param api: optional API connection (a default one is created
        otherwise).
    :param max_models: optional cap on the number of models kept in
        memory per split.
    :param cache_get: optional user-supplied function to fetch cached
        model attributes; when usable, the whole ensemble state is
        restored from the cache and the rest of the setup is skipped.
    """
    self.model_splits = []
    self.multi_model = None
    self.api = get_api_connection(api)
    self.fields = None
    self.class_names = None
    if use_cache(cache_get):
        # using a cache to store the model attributes
        self.__dict__ = load(get_ensemble_id(ensemble), cache_get)
        # the cached state carried a stale connection; replace it
        self.api = get_api_connection(api)
        if len(self.models_splits) == 1:
            # retrieve the models from a cache get function
            try:
                models = [Model(model_id, cache_get=cache_get)
                          for model_id in self.models_splits[0]]
            except Exception as exc:
                raise Exception('Error while calling the user-given'
                                ' function %s: %s' % (cache_get.__name__,
                                                      str(exc)))
            self.multi_model = MultiModel(models,
                                          self.api,
                                          fields=self.fields,
                                          class_names=self.class_names,
                                          cache_get=cache_get)
        return
    # non-cached path: full initialization
    self.resource_id = None
    self.objective_id = None
    self.distributions = None
    self.distribution = None
    self.boosting = None
    self.boosting_offsets = None
    self.cache_get = None
    self.regression = False
    self.importance = {}
    query_string = ONLY_MODEL
    no_check_fields = False
    self.input_fields = []
    if isinstance(ensemble, list):
        if all([isinstance(model, Model) for model in ensemble]):
            # already-built local Model objects: use them as-is
            models = ensemble
            self.model_ids = [local_model.resource_id
                              for local_model in models]
        else:
            try:
                models = [get_model_id(model) for model in ensemble]
                self.model_ids = models
            except ValueError as exc:
                raise ValueError('Failed to verify the list of models.'
                                 ' Check your model id values: %s' %
                                 str(exc))
    else:
        ensemble = self.get_ensemble_resource(ensemble)
        self.resource_id = get_ensemble_id(ensemble)
        if not check_local_but_fields(ensemble):
            # avoid checking fields because of old ensembles
            ensemble = retrieve_resource(self.api, self.resource_id,
                                         no_check_fields=True)
        if ensemble['object'].get('type') == BOOSTING:
            self.boosting = ensemble['object'].get('boosting')
        models = ensemble['object']['models']
        self.distributions = ensemble['object'].get('distributions', [])
        self.importance = ensemble['object'].get('importance', [])
        self.model_ids = models
        # new ensembles have the fields structure
        if ensemble['object'].get('ensemble'):
            self.fields = ensemble['object'].get(
                'ensemble', {}).get("fields")
            self.objective_id = ensemble['object'].get("objective_field")
            # fields already known: exclude them when retrieving models
            query_string = EXCLUDE_FIELDS
            no_check_fields = True
        self.input_fields = ensemble['object'].get('input_fields')
    number_of_models = len(models)
    if max_models is None:
        self.models_splits = [models]
    else:
        # chunk the models in groups of at most max_models
        self.models_splits = [models[index:(index + max_models)]
                              for index in
                              range(0, number_of_models, max_models)]
    if len(self.models_splits) == 1:
        # single split: retrieve every model now
        if not isinstance(models[0], Model):
            if use_cache(cache_get):
                # retrieve the models from a cache get function
                try:
                    models = [Model(model_id, cache_get=cache_get)
                              for model_id in self.models_splits[0]]
                    self.cache_get = cache_get
                except Exception as exc:
                    raise Exception('Error while calling the user-given'
                                    ' function %s: %s' %
                                    (cache_get.__name__, str(exc)))
            else:
                models = [retrieve_resource(
                    self.api, model_id,
                    query_string=query_string,
                    no_check_fields=no_check_fields)
                    for model_id in self.models_splits[0]]
        model = models[0]
    else:
        # only retrieving first model
        self.cache_get = cache_get
        if not isinstance(models[0], Model):
            if use_cache(cache_get):
                # retrieve the models from a cache get function
                try:
                    model = Model(self.models_splits[0][0],
                                  cache_get=cache_get)
                    self.cache_get = cache_get
                except Exception as exc:
                    raise Exception('Error while calling the user-given'
                                    ' function %s: %s' %
                                    (cache_get.__name__, str(exc)))
            else:
                model = retrieve_resource(
                    self.api, self.models_splits[0][0],
                    query_string=query_string,
                    no_check_fields=no_check_fields)
            models = [model]
    if self.distributions is None:
        # fall back to per-model distributions; local Model objects
        # expose root_distribution, raw resources carry a dict
        try:
            self.distributions = []
            for model in models:
                self.distributions.append(
                    {'training': model.root_distribution})
        except AttributeError:
            self.distributions = [
                model['object']['model']['distribution']
                for model in models]
    if self.boosting is None:
        self._add_models_attrs(model, max_models)
    if self.fields is None:
        # derive fields/objective from the models when the ensemble
        # resource did not provide them
        self.fields, self.objective_id = self.all_model_fields(
            max_models=max_models)
    if self.fields:
        add_distribution(self)
    self.regression = \
        self.fields[self.objective_id].get('optype') == NUMERIC
    if self.boosting:
        # regression boosting uses a single offset; classification uses
        # one offset per class
        self.boosting_offsets = ensemble['object'].get('initial_offset',
                                                       0) \
            if self.regression else dict(ensemble['object'].get(
                'initial_offsets', []))
    if not self.regression:
        # collect class names, falling back to the training
        # distributions when the fields summary lacks categories
        try:
            objective_field = self.fields[self.objective_id]
            categories = objective_field['summary']['categories']
            classes = [category[0] for category in categories]
        except (AttributeError, KeyError):
            classes = set()
            for distribution in self.distributions:
                for category in distribution['training']['categories']:
                    classes.add(category[0])
        self.class_names = sorted(classes)
        self.objective_categories = [category for
            category, _ in self.fields[self.objective_id][
                "summary"]["categories"]]
    ModelFields.__init__(
        self, self.fields, objective_id=self.objective_id)
    if len(self.models_splits) == 1:
        self.multi_model = MultiModel(models,
                                      self.api,
                                      fields=self.fields,
                                      class_names=self.class_names)