def local_predict(models, test_reader, output, args, options=None,
                  exclude=None):
    """Predict every row of `test_reader` locally and write the results.

    A single entry in `models` is wrapped in a local `Model`; several
    entries are combined through a local `Ensemble`, which additionally
    receives the combination settings (`method`, `options`, `median`).
    Each row is paired with `test_reader.raw_headers`, predicted with
    `full=True`, and handed to `write_prediction` together with `output`,
    `args.prediction_info` and `exclude`.

    """
    single_model = len(models) == 1
    predict_kwargs = dict(full=True, missing_strategy=args.missing_strategy)

    if not single_model:
        predictor = Ensemble(models,
                             max_models=args.max_batch_models,
                             api=args.retrieve_api_)
        # combination settings are only passed to ensembles
        predict_kwargs["method"] = args.method
        predict_kwargs["options"] = options
        predict_kwargs["median"] = args.median
    else:
        predictor = Model(models[0], api=args.retrieve_api_)

    if args.operating_point_:
        predict_kwargs["operating_point"] = args.operating_point_

    for row in test_reader:
        row_by_header = dict(zip(test_reader.raw_headers, row))
        prediction = predictor.predict(row_by_header, **predict_kwargs)
        use_median = (single_model and args.median
                      and predictor.tree.regression)
        if use_median:
            # for a single regression model the median replaces the
            # predicted value when --median is requested
            prediction["prediction"] = prediction["median"]
        write_prediction(prediction, output, args.prediction_info, row,
                         exclude)
def local_predict(models, test_reader, output, args, options=None,
                  exclude=None):
    """Predict every row of `test_reader` locally and write the results.

    A single entry in `models` is wrapped in a local `Model`; several
    entries are combined through a local `Ensemble`, which additionally
    receives the combination settings (`method`, `options`, `median`).
    Predictions are indexable sequences here: only their first two
    elements are passed on to `write_prediction`.

    """
    single_model = len(models) == 1
    has_header = test_reader.has_headers()
    predict_kwargs = dict(by_name=has_header,
                          with_confidence=True,
                          missing_strategy=args.missing_strategy)

    if not single_model:
        predictor = Ensemble(models, max_models=args.max_batch_models)
        # combination settings are only passed to ensembles
        predict_kwargs["method"] = args.method
        predict_kwargs["options"] = options
        predict_kwargs["median"] = args.median
    else:
        predictor = Model(models[0])

    for row in test_reader:
        row_by_header = dict(zip(test_reader.raw_headers, row))
        prediction = predictor.predict(row_by_header, **predict_kwargs)
        use_median = (single_model and args.median
                      and predictor.tree.regression)
        if use_median:
            # for a single regression model the last element of the
            # prediction replaces the predicted value
            prediction[0] = prediction[-1]
        write_prediction(prediction[0:2], output, args.prediction_info, row,
                         exclude)
def test_ensemble(self, test_file):
    """Predict every row of a CSV test file with a local ensemble.

    The first line of `test_file` is treated as a header and skipped. For
    each remaining non-empty row, the last column is recorded as the true
    label and the other columns are paired with the source fields and
    predicted by the local ensemble (`by_name=False, method=1`).

    Side effects:
        self.predict_time -- total profiled time of the prediction loop.
        self.results -- `(predict_labels, true_labels)` when
            `self.regression` is truthy, otherwise the value returned by
            `make_confusion_matrix(true_labels, predict_labels)`.

    Raises:
        AssertionError: if `self.authenticated` is falsy.
    """
    assert self.authenticated, 'Not authenticated!'

    # download a local copy of the ensemble
    self.logger.info('Creating local ensemble')
    local_ensemble = Ensemble(self.ensemble_res, api=self.api)

    # make the Fields object
    source = self.api.get_source(self.source_res)
    fields = Fields(source['object']['fields'])

    self.logger.info('Reading test data and generating predictions')
    true_labels = []
    predict_labels = []
    profiler = Profile()
    profiler.enable()
    try:
        with open(test_file) as fid:
            test_reader = csv.reader(fid)
            # skip the header line; the builtin next() works on Python 2
            # and 3, and the default keeps an empty file from raising
            next(test_reader, None)
            for row in test_reader:
                if not row:
                    # csv.reader yields [] for blank lines: no label to pop
                    continue
                row_list = list(row)
                true_labels.append(row_list.pop())
                instance = fields.pair(row_list)
                predict_labels.append(
                    local_ensemble.predict(instance, by_name=False, method=1))
    finally:
        # never leave the profiler running if reading or predicting fails
        profiler.disable()
    self.predict_time = Stats(profiler).total_tt

    if self.regression:
        self.results = (predict_labels, true_labels)
    else:
        self.results = make_confusion_matrix(true_labels, predict_labels)
def export_code(args, api=None):
    """Generate the plugin code in the language requested by the user.

    `args.language` falls back to "javascript" when unset. Nothing is
    generated for a language that has no entry in `EXPORTS`. For
    `args.model` one exporter is built and passed to `generate_output`;
    for `args.ensemble` the same is done for each of the ensemble's model
    ids, passing along the ensemble's `fields` and `boosting`.

    """
    args.language = args.language or "javascript"

    if args.model is not None:
        if args.language in EXPORTS:
            exporter = EXPORTS[args.language]
            generate_output(exporter(args.model, api=api), args,
                            model_type="model")

    if args.ensemble is not None:
        if args.language in EXPORTS:
            ensemble = Ensemble(args.ensemble, api=api)
            exporter = EXPORTS[args.language]
            for member_id in ensemble.model_ids:
                exported = exporter(member_id,
                                    api=api,
                                    fields=ensemble.fields,
                                    boosting=ensemble.boosting)
                generate_output(exported, args, model_type="model")
from bigml.api import BigML

CURRENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
CACHE_DIR = os.path.join(CURRENT_DIR, "cache")

# SECURITY(review): the BigML username and API key are hard-coded in source.
# They should be rotated and supplied through configuration instead of being
# committed with the code.
api = BigML("deven96", "81795cceca568fff4115d5c047071728a0700673",
            storage=CACHE_DIR)

# Ensemble resource id for every toxicity category that gets scored.
ENSEMBLE_IDS = {
    "toxicity": 'ensemble/5ddd1a3f1efc925827001f7a',
    "identity_hatred": 'ensemble/5ddd1b6f5a213904ee0000ca',
    "threat": 'ensemble/5ddd282959f5c31acc001a01',
    "obscene": 'ensemble/5ddd1ad959f5c31acc001999',
    "severe_toxicity": 'ensemble/5ddd1aab1efc925827001f7d',
    "insult": 'ensemble/5ddd1b3c5e269e4886001b8c',
}

# Latest prediction per category; "" until get_predictions() has run.
predictions = dict.fromkeys(ENSEMBLE_IDS, "")

# One local ensemble per category, created once at import time.
ensembles = {
    category: Ensemble(ensemble_id, api)
    for category, ensemble_id in ENSEMBLE_IDS.items()
}


def get_predictions(input_data):
    """Predict every toxicity category for one comment.

    Args:
        input_data: the comment text, sent to each ensemble as the
            "comment_text" input field.

    Returns:
        The module-level `predictions` dict, mapping each category name to
        the full prediction returned by its ensemble. The same dict is
        updated in place on every call.
    """
    for key, ensemble in ensembles.items():
        predictions[key] = ensemble.predict({"comment_text": input_data},
                                            full=True)
    return predictions
def create_local_ensemble_with_list(step, number_of_models):
    """Store a local Ensemble built from the trailing `world.models`.

    `number_of_models` is converted with `int()` and used as a negative
    slice start, so the last N entries of `world.models` are passed to
    `Ensemble` together with `world.api`.
    """
    count = int(number_of_models)
    selected_models = world.models[-count:]
    world.local_ensemble = Ensemble(selected_models, world.api)
def create_local_ensemble(step):
    """Store a local Ensemble for `world.ensemble_id` on `world`."""
    ensemble = Ensemble(world.ensemble_id, world.api)
    world.local_ensemble = ensemble
def compute_output(api, args, training_set, test_set=None, output=None,
                   objective_field=None,
                   description=None,
                   field_attributes=None,
                   types=None,
                   dataset_fields=None,
                   model_fields=None,
                   name=None, training_set_header=True,
                   test_set_header=True, model_ids=None,
                   votes_files=None, resume=False, fields_map=None,
                   test_field_attributes=None,
                   test_types=None):
    """ Creates one or more models using the `training_set` or uses the ids
    of previously created BigML models to make predictions for the `test_set`.

    The function runs a fixed pipeline, each stage feeding the next:
    source -> dataset(s) -> optional split / max-categories / multi-dataset /
    new-fields datasets -> models -> optional model publishing -> predictions
    (remote batch prediction or local `predict`) -> optional combination of
    previously saved votes files.

    The process exits through `sys.exit` when the options are inconsistent
    (publishing without a description, --max-categories without an objective
    field or with a non-categorical objective, --new-fields without a
    dataset) or when the model named in a votes file cannot be retrieved.

    """
    source = None
    dataset = None
    model = None
    models = None
    fields = None
    other_label = OTHER
    ensemble_ids = []
    multi_label_data = None
    multi_label_fields = []
    local_ensemble = None

    # It is compulsory to have a description to publish either datasets or
    # models
    if (not description and
            (args.black_box or args.white_box or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --max-categories, it is compulsory to specify also the
    # objective_field
    if args.max_categories > 0 and objective_field is None:
        sys.exit("When --max-categories is used, you must also provide the"
                 " --objective field name or column number")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = "%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required, open the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        if args.clear_logs:
            try:
                # opening in 'w' mode truncates the file; it is closed at once
                open(log, 'w', 0).close()
            except IOError:
                # best effort: a log file that cannot be truncated is ignored
                pass

    # labels to be used in multi-label expansion
    labels = (map(str.strip, args.labels.split(','))
              if args.labels is not None else None)
    if labels is not None:
        labels = sorted([label.decode("utf-8") for label in labels])

    # multi_label file must be preprocessed to obtain a new extended file
    if args.multi_label and training_set is not None:
        (training_set, multi_label_data) = ps.multi_label_expansion(
            training_set, training_set_header, objective_field, args, path,
            labels=labels, session_file=session_file)
        # the expanded training file is always treated as having a header
        training_set_header = True
        objective_field = multi_label_data["objective_name"]
        all_labels = l.get_all_labels(multi_label_data)
        if not labels:
            labels = all_labels
    else:
        all_labels = labels

    source, resume, csv_properties, fields = ps.source_processing(
        training_set, test_set, training_set_header, test_set_header,
        api, args, resume, name=name, description=description,
        csv_properties=csv_properties, field_attributes=field_attributes,
        types=types, multi_label_data=multi_label_data,
        session_file=session_file, path=path, log=log)
    if args.multi_label and source:
        multi_label_data = l.get_multi_label_data(source)
        (objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields,
                                                  multi_label_fields)

    datasets, resume, csv_properties, fields = pd.dataset_processing(
        source, training_set, test_set, fields, objective_field,
        api, args, resume, name=name, description=description,
        dataset_fields=dataset_fields, multi_label_data=multi_label_data,
        csv_properties=csv_properties,
        session_file=session_file, path=path, log=log)
    if datasets:
        dataset = datasets[0]

    # If test_split is used, split the dataset in a training and a test dataset
    # according to the given split
    if args.test_split > 0:
        dataset, test_dataset, resume = pd.split_processing(
            dataset, api, args, resume, name=name, description=description,
            multi_label_data=multi_label_data,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset

    # Check if the dataset has a categorical objective field and it
    # has a max_categories limit for categories
    if args.max_categories > 0 and len(datasets) == 1:
        objective_id = fields.field_id(fields.objective_field)
        if pd.check_max_categories(fields.fields[objective_id]):
            distribution = pd.get_categories_distribution(dataset,
                                                          objective_id)
            if distribution and len(distribution) > args.max_categories:
                categories = [element[0] for element in distribution]
                other_label = pd.create_other_label(categories, other_label)
                datasets, resume = pd.create_categories_datasets(
                    dataset, distribution, fields, args,
                    api, resume, session_file=session_file, path=path, log=log,
                    other_label=other_label)
        else:
            sys.exit("The provided objective field is not categorical nor "
                     "a full terms only text field. "
                     "Only these fields can be used with"
                     " --max-categories")

    # If multi-dataset flag is on, generate a new dataset from the given
    # list of datasets
    if args.multi_dataset:
        dataset, resume = pd.create_new_dataset(
            datasets, api, args, resume, name=name,
            description=description, fields=fields,
            dataset_fields=dataset_fields, objective_field=objective_field,
            session_file=session_file, path=path, log=log)
        datasets = [dataset]

    # Check if the dataset has a generators file associated with it, and
    # generate a new dataset with the specified field structure
    if args.new_fields:
        dataset, resume = pd.create_new_dataset(
            dataset, api, args, resume, name=name,
            description=description, fields=fields,
            dataset_fields=dataset_fields, objective_field=objective_field,
            session_file=session_file, path=path, log=log)
        datasets[0] = dataset
    # multi-label info not obtained from a training set or source is read
    # from the dataset
    if args.multi_label and dataset and multi_label_data is None:
        multi_label_data = l.get_multi_label_data(dataset)
        (objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(objective_field,
                                                  labels,
                                                  multi_label_data,
                                                  fields, multi_label_fields)

    if dataset:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(dataset, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(dataset, 'other_label',
                                   other_label)

    models, model_ids, ensemble_ids, resume = pm.models_processing(
        datasets, models, model_ids,
        objective_field, fields, api, args, resume,
        name=name, description=description, model_fields=model_fields,
        session_file=session_file, path=path, log=log, labels=labels,
        multi_label_data=multi_label_data, other_label=other_label)
    if models:
        model = models[0]
        single_model = len(models) == 1
    # If multi-label flag is set and no training_set was provided, label
    # info is extracted from the user_metadata. If models belong to an
    # ensemble, the ensemble must be retrieved to get the user_metadata.
    if model and args.multi_label and multi_label_data is None:
        if len(ensemble_ids) > 0 and isinstance(ensemble_ids[0], dict):
            resource = ensemble_ids[0]
        elif belongs_to_ensemble(model):
            ensemble_id = get_ensemble_id(model)
            resource = r.get_ensemble(ensemble_id, api=api,
                                      verbosity=args.verbosity,
                                      session_file=session_file)
        else:
            resource = model
        multi_label_data = l.get_multi_label_data(resource)

    # We update the model's public state if needed
    if model:
        # a string here is a model id: the resource itself must be retrieved
        if isinstance(model, basestring):
            if not args.evaluate:
                query_string = MINIMUM_MODEL
            else:
                query_string = r.FIELDS_QS
            model = u.check_resource(model, api.get_model,
                                     query_string=query_string)
        if (args.black_box or args.white_box or
                r.shared_changed(args.shared, model)):
            model_args = {}
            if r.shared_changed(args.shared, model):
                model_args.update(shared=args.shared)
            if args.black_box or args.white_box:
                model_args.update(r.set_publish_model_args(args))
            if model_args:
                model = r.update_model(model, model_args, args,
                                       api=api, path=path,
                                       session_file=session_file)
                models[0] = model

    # We get the fields of the model if we haven't got
    # them yet and need them
    if model and not args.evaluate and test_set:
        # If more than one model, use the full field structure
        if (not single_model and not args.multi_label and
                belongs_to_ensemble(model)):
            if len(ensemble_ids) > 0:
                ensemble_id = ensemble_ids[0]
            else:
                ensemble_id = get_ensemble_id(model)
            local_ensemble = Ensemble(ensemble_id, api=api)
        fields, objective_field = pm.get_model_fields(
            model, csv_properties, args, single_model=single_model,
            multi_label_data=multi_label_data, local_ensemble=local_ensemble)

    # Fills in all_labels from user_metadata
    if args.multi_label and not all_labels:
        (objective_field,
         labels,
         all_labels,
         multi_label_fields) = l.multi_label_sync(objective_field, labels,
                                                  multi_label_data, fields,
                                                  multi_label_fields)
    if model:
        # retrieves max_categories data, if any
        args.max_categories = get_metadata(model, 'max_categories',
                                           args.max_categories)
        other_label = get_metadata(model, 'other_label',
                                   other_label)
    # If predicting
    if models and has_test(args) and not args.evaluate:
        models_per_label = 1
        test_dataset = None
        if args.multi_label:
            # When prediction starts from existing models, the
            # multi_label_fields can be retrieved from the user_metadata
            # in the models
            if args.multi_label_fields is None and multi_label_fields:
                multi_label_field_names = [field[1] for field
                                           in multi_label_fields]
                args.multi_label_fields = ",".join(multi_label_field_names)
            # only the first element of the expansion result (the expanded
            # test set) is kept
            test_set = ps.multi_label_expansion(
                test_set, test_set_header, objective_field, args, path,
                labels=labels, session_file=session_file, input_flag=True)[0]
            test_set_header = True

        # Remote predictions: predictions are computed as batch predictions
        # in bigml.com except when --no-batch flag is set on or multi-label
        # or max-categories are used
        if (args.remote and not args.no_batch and not args.multi_label
                and not args.method in [THRESHOLD_CODE, COMBINATION]):
            # create test source from file
            test_name = "%s - test" % name
            if args.test_source is None:
                (test_source, resume,
                 csv_properties, test_fields) = ps.test_source_processing(
                     test_set, test_set_header,
                     api, args, resume, name=test_name,
                     description=description,
                     field_attributes=test_field_attributes,
                     types=test_types,
                     session_file=session_file, path=path, log=log)
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id,
                                                 api.get_source)
            if args.test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(test_name,
                                                        description, args)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(args.test_dataset)
                test_dataset = api.check_resource(test_dataset_id,
                                                  api.get_dataset)
            # the test dataset is read as having no objective field
            csv_properties.update(objective_field=None,
                                  objective_field_present=False)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_prediction_args = r.set_batch_prediction_args(
                name, description, args, fields=fields,
                dataset_fields=test_fields, fields_map=fields_map)

            remote_predict(model, test_dataset, batch_prediction_args, args,
                           api, resume, prediction_file=output,
                           session_file=session_file, path=path, log=log)
        else:
            models_per_label = args.number_of_models
            if (args.multi_label and len(ensemble_ids) > 0
                    and args.number_of_models == 1):
                # use case where ensembles are read from a file
                models_per_label = len(models) / len(ensemble_ids)
            predict(test_set, test_set_header, models, fields, output,
                    objective_field, args, api=api, log=log,
                    resume=resume, session_file=session_file, labels=labels,
                    models_per_label=models_per_label, other_label=other_label,
                    multi_label_data=multi_label_data)

    # When combine_votes flag is used, retrieve the predictions files saved
    # in the comma separated list of directories and combine them
    if votes_files:
        # the model id is recovered from the first votes file name:
        # "model_<24 hex chars>__predictions.csv" -> "model/<24 hex chars>"
        model_id = re.sub(r'.*(model_[a-f0-9]{24})__predictions\.csv$',
                          r'\1', votes_files[0]).replace("_", "/")
        try:
            model = u.check_resource(model_id, api.get_model)
        except ValueError, exception:
            sys.exit("Failed to get model %s: %s" % (model_id, str(exception)))

        local_model = Model(model)
        message = u.dated("Combining votes.\n")
        u.log_message(message, log_file=session_file,
                      console=args.verbosity)
        combine_votes(votes_files, local_model.to_prediction,
                      output, args.method)
def create_local_ensemble(step):
    """Store a local Ensemble and a local Model of its first member.

    The ensemble is built from `world.ensemble_id`; the model is built
    from the first entry of the ensemble's `model_ids`. Both use
    `world.api` and are stored on `world`.
    """
    ensemble = Ensemble(world.ensemble_id, world.api)
    world.local_ensemble = ensemble
    first_model_id = world.local_ensemble.model_ids[0]
    world.local_model = Model(first_model_id, world.api)
def i_create_local_ensemble_from_file(step, export_file):
    """Store a local Ensemble built from `res_filename(export_file)`."""
    exported_path = res_filename(export_file)
    world.local_ensemble = Ensemble(exported_path)
def create_local_ensemble_with_list_of_local_models(step, number_of_models):
    """Store a local Ensemble built from already-local Model objects.

    The last `int(number_of_models)` entries of `world.models` are each
    wrapped in a `Model`, and the resulting list is passed to `Ensemble`
    together with `world.api`.
    """
    count = int(number_of_models)
    local_models = []
    for model in world.models[-count:]:
        local_models.append(Model(model))
    world.local_ensemble = Ensemble(local_models, world.api)
from bigml.ensemble import Ensemble
from bigml.api import BigML

# Downloads and generates a local version of the ensemble, if it
# hasn't been downloaded previously.
# SECURITY(review): the BigML username and API key are hard-coded in source.
# They should be rotated and supplied through configuration instead of being
# committed with the code.
ensemble = Ensemble('ensemble/5ddd1a3f1efc925827001f7a',
                    api=BigML("deven96",
                              "81795cceca568fff4115d5c047071728a0700673",
                              domain="bigml.io"))

# To make predictions fill the desired input_data in next line.
# The placeholder is now the dict that is actually sent to the ensemble;
# previously it was left empty and an inline dict was predicted instead.
input_data = {'comment_text': "f**k you c**t"}
ret = ensemble.predict(input_data, full=True)
print(ret)
# Walk-through of remote and local predictions. Statement order is kept
# because every constructor and call below talks to the BigML service.
api = BigML(dev_mode=True)

# Remote prediction created from a stored model.
model = api.get_model("model/563a1c7a3cd25747430023ce")
remote_input = {
    "petal length": 4.07,
    "sepal width": 3.15,
    "petal width": 1.51,
}
prediction = api.create_prediction(model, remote_input)

# Local single-model prediction.
# NOTE(review): the meaning of the positional `2` depends on the installed
# bindings' `Model.predict` signature; confirm before changing it.
local_model = Model("model/56430eb8636e1c79b0001f90", api=api)
model_input = {
    "petal length": 0.96,
    "sepal width": 4.1,
    "petal width": 2.52,
}
prediction = local_model.predict(model_input, 2,
                                 add_confidence=True, multiple=3)

# Local ensemble prediction. The first ensemble is built and then
# immediately replaced by the second one; only the second is used.
local_model = Ensemble("ensemble/564a02d5636e1c79b5006e13", api=api)
local_model = Ensemble("ensemble/564a081bc6c19b6cf3011c60", api=api)
ensemble_input = {
    "petal length": 0.95,
    "sepal width": 3.9,
    "petal width": 1.51,
    "sepal length": 7.0,
}
prediction = local_model.predict(ensemble_input,
                                 method=2, add_confidence=True)

# Local ensemble prediction on a different ensemble.
local_ensemble = Ensemble("ensemble/564623d4636e1c79b00051f7", api=api)
wine_input = {
    "Price": 5.8,
    "Grape": "Pinot Grigio",
    "Country": "Italy",
    "Rating": 92,
}
prediction = local_ensemble.predict(wine_input, True)

# Local anomaly scores: two complete inputs, then one with fewer fields.
local_anomaly = Anomaly("anomaly/564c5a76636e1c3d52000007", api=api)
first_flower = {
    "petal length": 4.07,
    "sepal width": 3.15,
    "petal width": 1.51,
    "sepal length": 6.02,
    "species": "Iris-setosa",
}
prediction = local_anomaly.anomaly_score(first_flower, True)
second_flower = {
    "petal length": 0.96,
    "sepal width": 4.1,
    "petal width": 2.51,
    "sepal length": 6.02,
    "species": "Iris-setosa",
}
prediction = local_anomaly.anomaly_score(second_flower, True)
partial_flower = {
    "petal length": 0.96,
    "sepal width": 4.1,
    "petal width": 2.51,
}
prediction = local_anomaly.anomaly_score(partial_flower, True)

# Only the last anomaly score is printed.
api.pprint(prediction)
# Requires the BigML Python bindings.
#
# Install them with:  pip install bigml
#
# or clone the repository:
#   git clone https://github.com/bigmlcom/python.git

from bigml.ensemble import Ensemble
from bigml.api import BigML

# Building the Ensemble downloads and generates a local version of the
# ensemble, if it hasn't been downloaded previously.
# SECURITY(review): the BigML username and API key are hard-coded in source.
# They should be rotated and supplied through configuration instead of being
# committed with the code.
ensemble = Ensemble(
    'ensemble/5cacf3dceba31d30ba000d60',
    api=BigML(
        "rshelton",
        "adabd734dd2a2af5cb4e49176f0eb472cfa8ce5a",
        domain="bigml.io",
    ),
)

# Fill in the desired input values here before predicting.
input_data = {}
ensemble.predict(input_data, full=True)

# Arguments of predict():
#
# input_data: dict holding the input values
#   (e.g. {"petal length": 1, "sepal length": 3})
# full: when True, the output is a dictionary that includes all the
#   information available in the predicted node. The attributes vary
#   depending on the ensemble type. Please check:
#   https://bigml.readthedocs.io/en/latest/#local-ensemble-s-predictions
}] }) API.ok(SCRIPT_model_or_ensemble) EXECUTION_model_or_enemble = API.create_execution( SCRIPT_model_or_ensemble['resource'], {'inputs': [["ts-id", training_set]]}) API.ok(EXECUTION_model_or_enemble) model_or_ensemble = EXECUTION_model_or_enemble["object"]["execution"]["result"] #Locally store the model or ensemble if model_or_ensemble[:1] == 'e': global local_ensemble local_ensemble = Ensemble(model_or_ensemble) picklEoR = local_ensemble else: global local_model local_model = Model(model_or_ensemble) picklEoR = local_model #batch prediction to check if the model is accurate batch_prediction = API.create_batch_prediction(model_or_ensemble, testing_set, {"all_fields": True}) API.ok(batch_prediction) API.download_batch_prediction(batch_prediction, filename=(filename[:-4] + "-Model-or-Ensemble-Check.csv")) #Store the data the has been created from this python file
def _predict_nba(request, context):
    """Predict NBA points per game for the single selected player.

    The first dual of every incoming row is read as a string parameter.
    When exactly one parameter was received and it is one of the supported
    player names, that player's pickled demo data is loaded and predicted
    with a pre-trained BigML ensemble. In every other case, and when the
    ensemble cannot be loaded, the result text is 'Not possible to predict.'

    :param request: iterable sequence of bundled rows
    :param context: request context; only referenced by the optional
        cache-disabling lines below
    :return: generator yielding one SSE.BundledRows whose single row holds
        the result string
    """
    # Disable caching by uncommenting the following two lines
    # md = (('qlik-cache', 'no-store'),)
    # context.send_initial_metadata(md)

    params = []

    # Iterate over bundled rows to retrieve data
    for request_rows in request:
        # Iterate over rows
        for row in request_rows.rows:
            # One column is received, so the first dual holds the value
            param = row.duals[0].strData
            print('param:', param)
            params.append(param)
    print('params:', params)

    # Possible selections to predict
    opt_selections = ['Kevin Durant', 'Allen Iverson', 'Carmelo Anthony',
                      'Isaiah Thomas', 'Cory Jefferson', 'Robbie Hummel',
                      'Wesley Johnson']

    # Check selections: exactly one parameter, and it must be a known player
    if len(params) == 1 and params[0] in opt_selections:
        selection = params[0].split(' ')  # list of first name and last name
        data_path = 'NBA_data/Demo_predictPPGPk_{}_{}'.format(selection[0],
                                                              selection[1])
        # NOTE(security): pickle.load executes arbitrary code from the file.
        # The file name is built from the whitelist above, so only bundled
        # files are opened; never point this at untrusted data.
        with open(data_path, 'rb') as f:
            data = pickle.load(f)
        print('data:', data)

        # The true value is removed so it is not sent as an input field
        correct_res = data['NBA PPG']
        del data['NBA PPG']

        try:
            # Use pre-trained ensemble
            ensemble_link = 'ensemble/5727212049c4a15ca1004b77'
            ensemble = Ensemble(ensemble_link,
                                api=BigML(dev_mode=True,
                                          domain='bigml.io'))  # saves locally
        except Exception:
            err = sys.exc_info()
            logging.error('Unexpected error: {}, {}, {}'.format(
                err[2].tb_frame.f_code.co_filename, err[2].tb_lineno, err[1]))
            # Without an ensemble there is nothing to predict with. The
            # original fell through to ensemble.predict() and failed on the
            # unbound name; report the failure to the caller instead.
            result = 'Not possible to predict.'
        else:
            # Predict data using the trained ensemble
            res = ensemble.predict(data, with_confidence=True)
            print('res:', res)
            result = 'Predicted number of PPG: {} <br> ' \
                     'Correct number of PPG: {} <br>' \
                     'Confidence: {}'.format(round(res[0], 1), correct_res,
                                             round(res[1], 1))
    else:
        result = 'Not possible to predict.'

    # Create an iterable of dual with the result
    duals = iter([SSE.Dual(strData=result)])

    # Yield the row data as bundled rows
    yield SSE.BundledRows(rows=[SSE.Row(duals=duals)])