def local_anomaly_score(anomalies, test_reader, output, args, exclude=None):
    """Get local anomaly detector and issue anomaly score prediction

    """
    # Only one anomaly detector at present
    local_anomaly = Anomaly(anomalies[0], api=args.retrieve_api_)
    test_set_header = test_reader.has_headers()
    for input_data in test_reader:
        input_data_dict = test_reader.dict(input_data, filtering=False)
        try:
            anomaly_score_info = {"score": local_anomaly.anomaly_score(
                input_data_dict, by_name=test_set_header)}
        except Exception:
            anomaly_score_info = {"score": NO_ANOMALY_SCORE}
        write_anomaly_score(anomaly_score_info["score"], output,
                            args.prediction_info, input_data, exclude)
def local_anomaly_score(anomalies, test_reader, output, args, exclude=None):
    """Get local anomaly detector and issue anomaly score prediction

    """
    # Only one anomaly detector at present
    local_anomaly = Anomaly(anomalies[0], api=args.retrieve_api_)
    for input_data in test_reader:
        input_data_dict = test_reader.dict(input_data, filtering=False)
        try:
            anomaly_score_info = {'score': local_anomaly.anomaly_score(
                input_data_dict)}
        except Exception:
            anomaly_score_info = {'score': NO_ANOMALY_SCORE}
        write_anomaly_score(anomaly_score_info['score'], output,
                            args.prediction_info, input_data, exclude)
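# A minimal sketch of the same local scoring call outside bigmler: the
# Anomaly class can also be built from an exported JSON file (as the test
# steps below do) and scores a plain dict of field names to values.
# "anomaly.json" and the input values are hypothetical.
from bigml.anomaly import Anomaly

local_anomaly = Anomaly("anomaly.json")
score = local_anomaly.anomaly_score(
    {"petal length": 0.96, "sepal width": 4.1, "petal width": 2.51})
print(score)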
def i_create_local_anomaly_from_file(step, export_file):
    world.local_anomaly = Anomaly(res_filename(export_file))
def create_dataset_with_anomalies(step):
    local_anomalies = Anomaly(world.anomaly['resource'])
    world.dataset = world.api.create_dataset(
        world.dataset['resource'],
        {"lisp_filter": local_anomalies.anomalies_filter()})
    world.datasets.append(world.dataset['resource'])
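# The step above relies on Anomaly.anomalies_filter() to build the lisp
# filter that excludes (or keeps) the detector's top anomalies. A hedged
# sketch with the raw bindings; the anomaly and dataset ids are placeholders
# and credentials are assumed to be set in the environment.
from bigml.api import BigML
from bigml.anomaly import Anomaly

api = BigML()
local_anomaly = Anomaly("anomaly/564c5a76636e1c3d52000007", api=api)
# include=False drops the top anomalies, include=True keeps only them
lisp_filter = local_anomaly.anomalies_filter(include=False)
filtered_dataset = api.create_dataset(
    "dataset/563a1c7a3cd2574743001f0b", {"lisp_filter": lisp_filter})
api.ok(filtered_dataset)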
def i_create_a_local_anomaly(step):
    world.local_anomaly = Anomaly(world.anomaly['resource'])
def compute_output(api, args):
    """ Creates one or more anomaly detectors using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """

    anomaly = None
    anomalies = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # there's only one anomaly detector to be generated at present
    args.max_parallel_anomalies = 1
    # anomalies cannot be published yet.
    args.public_anomaly = False

    # It is compulsory to have a description to publish either datasets or
    # anomalies
    if (not args.description_ and
            (args.public_anomaly or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (_, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.anomaly_file:
        # anomaly is retrieved from the contents of the given local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file,
            csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # anomaly is retrieved from the remote object
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if anomalies:
        anomaly = anomalies[0]

    # We update the anomaly's public state if needed
    if anomaly:
        if not a.has_test(args) and not args.anomalies_dataset:
            query_string = MINIMUM_MODEL
        elif not a.has_test(args):
            query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS])
        else:
            query_string = r.ALL_FIELDS_QS
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id,
                                   query_string=query_string,
                                   api=api)
        anomalies[0] = anomaly
        if (args.public_anomaly or
                (args.shared_flag and
                 r.shared_changed(args.shared, anomaly))):
            anomaly_args = {}
            if args.shared_flag and r.shared_changed(args.shared, anomaly):
                anomaly_args.update(shared=args.shared)
            if args.public_anomaly:
                anomaly_args.update(r.set_publish_anomaly_args(args))
            if anomaly_args:
                anomaly = r.update_anomaly(anomaly, anomaly_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                anomalies[0] = anomaly

    # We get the fields of the anomaly detector if we haven't got
    # them yet and need them
    if anomaly and (args.test_set or args.export_fields):
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # If creating a top anomalies excluded/included dataset
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies "
                     " dataset.")
        local_anomaly = Anomaly(anomaly)
        include = args.anomalies_dataset == ANOMALIES_IN
        args.anomaly_filter_ = local_anomaly.anomalies_filter(
            include=include)
        _, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # If predicting
    if anomaly and args.score:
        args.test_dataset = anomaly['object']['dataset']
    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # test dataset can be defined by --test-split or --test-dataset or
        # --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote anomaly scores: scores are computed as batch anomaly scores
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_anomaly_score_args = r.set_batch_anomaly_score_args(
                args, fields=fields, dataset_fields=test_fields)

            remote_anomaly_score(anomaly, test_dataset,
                                 batch_anomaly_score_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)
        else:
            anomaly_score(anomalies, fields, args,
                          session_file=session_file)

    if fields and args.export_fields:
        fields.summary_csv(os.path.join(path, args.export_fields))

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
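# When --remote is used without --no-batch, compute_output scores through a
# batch anomaly score resource. A rough equivalent with the raw bindings
# (ids are placeholders, credentials assumed in the environment), not the
# exact calls bigmler issues:
from bigml.api import BigML

api = BigML()
batch_score = api.create_batch_anomaly_score(
    "anomaly/564c5a76636e1c3d52000007",   # placeholder anomaly id
    "dataset/563a1c7a3cd2574743001f0b",   # placeholder test dataset id
    {"all_fields": True, "header": True})
api.ok(batch_score)
api.download_batch_anomaly_score(batch_score, filename="anomaly_scores.csv")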
def compute_output(api, args):
    """ Creates one or more anomaly detectors using the `training_set`
        or uses the ids of previously created BigML models to make
        predictions for the `test_set`.

    """

    anomaly = None
    anomalies = None
    # no multi-label support at present

    # variables from command-line options
    resume = args.resume_
    anomaly_ids = args.anomaly_ids_
    output = args.predictions
    # there's only one anomaly detector to be generated at present
    args.max_parallel_anomalies = 1
    # anomalies cannot be published yet.
    args.public_anomaly = False

    # It is compulsory to have a description to publish either datasets or
    # anomalies
    if (not args.description_ and
            (args.public_anomaly or args.public_dataset)):
        sys.exit("You should provide a description to publish.")

    # When using --new-fields, it is compulsory to specify also a dataset
    # id
    if args.new_fields and not args.dataset:
        sys.exit("To use --new-fields you must also provide a dataset id"
                 " to generate the new dataset from it.")

    path = u.check_dir(output)
    session_file = u"%s%s%s" % (path, os.sep, SESSIONS_LOG)
    csv_properties = {}
    # If logging is required set the file for logging
    log = None
    if args.log_file:
        u.check_dir(args.log_file)
        log = args.log_file
        # If --clear_logs the log files are cleared
        clear_log_files([log])

    # basic pre-model step: creating or retrieving the source related info
    source, resume, csv_properties, fields = pms.get_source_info(
        api, args, resume, csv_properties, session_file, path, log)
    # basic pre-model step: creating or retrieving the dataset related info
    dataset_properties = pms.get_dataset_info(
        api, args, resume, source,
        csv_properties, fields, session_file, path, log)
    (dataset, datasets, test_dataset,
     resume, csv_properties, fields) = dataset_properties

    if args.anomaly_file:
        # anomaly is retrieved from the contents of the given local JSON file
        anomaly, csv_properties, fields = u.read_local_resource(
            args.anomaly_file,
            csv_properties=csv_properties)
        anomalies = [anomaly]
        anomaly_ids = [anomaly['resource']]
    else:
        # anomaly is retrieved from the remote object
        anomalies, anomaly_ids, resume = pa.anomalies_processing(
            datasets, anomalies, anomaly_ids, api, args, resume,
            fields=fields, session_file=session_file, path=path, log=log)
    if anomalies:
        anomaly = anomalies[0]

    # We update the anomaly's public state if needed
    if anomaly:
        if not a.has_test(args) and not args.anomalies_dataset:
            query_string = MINIMUM_MODEL
        elif not a.has_test(args):
            query_string = ";".join([EXCLUDE_TREES, r.ALL_FIELDS_QS])
        else:
            query_string = r.ALL_FIELDS_QS
        try:
            anomaly_id = anomaly.get('resource', anomaly)
        except AttributeError:
            anomaly_id = anomaly
        anomaly = u.check_resource(anomaly_id,
                                   query_string=query_string,
                                   api=api)
        anomalies[0] = anomaly
        if (args.public_anomaly or
                (args.shared_flag and
                 r.shared_changed(args.shared, anomaly))):
            anomaly_args = {}
            if args.shared_flag and r.shared_changed(args.shared, anomaly):
                anomaly_args.update(shared=args.shared)
            if args.public_anomaly:
                anomaly_args.update(r.set_publish_anomaly_args(args))
            if anomaly_args:
                anomaly = r.update_anomaly(anomaly, anomaly_args, args,
                                           api=api, path=path,
                                           session_file=session_file)
                anomalies[0] = anomaly

    # We get the fields of the anomaly detector if we haven't got
    # them yet and need them
    if anomaly and args.test_set:
        fields = pa.get_anomaly_fields(anomaly, csv_properties, args)

    # If creating a top anomalies excluded/included dataset
    if args.anomalies_dataset and anomaly:
        origin_dataset = anomaly['object'].get('dataset')
        if origin_dataset is None:
            sys.exit("The dataset used to generate the anomaly detector "
                     "cannot be found. Failed to generate the anomalies "
                     " dataset.")
        local_anomaly = Anomaly(anomaly)
        include = args.anomalies_dataset == ANOMALIES_IN
        args._anomaly_filter = local_anomaly.anomalies_filter(
            include=include)
        new_dataset, resume = pd.create_new_dataset(
            origin_dataset, api, args, resume, fields=fields,
            session_file=session_file, path=path, log=log)

    # If predicting
    if anomaly and args.score:
        args.test_dataset = anomaly['object']['dataset']
    if anomalies and (a.has_test(args) or (test_dataset and args.remote)):
        # test dataset can be defined by --test-split or --test-dataset or
        # --test-datasets
        if test_dataset is None:
            test_dataset = get_test_dataset(args)

        # Remote anomaly scores: scores are computed as batch anomaly scores
        # in bigml.com except when --no-batch flag is set on
        if args.remote and not args.no_batch:
            # create test source from file
            test_name = "%s - test" % args.name
            if args.test_source is None:
                test_properties = ps.test_source_processing(
                    api, args, resume, name=test_name,
                    session_file=session_file, path=path, log=log)
                (test_source, resume,
                 csv_properties, test_fields) = test_properties
            else:
                test_source_id = bigml.api.get_source_id(args.test_source)
                test_source = api.check_resource(test_source_id)
            if test_dataset is None:
                # create test dataset from test source
                dataset_args = r.set_basic_dataset_args(args, name=test_name)
                test_dataset, resume = pd.alternative_dataset_processing(
                    test_source, "test", dataset_args, api, args,
                    resume, session_file=session_file, path=path, log=log)
            else:
                test_dataset_id = bigml.api.get_dataset_id(test_dataset)
                test_dataset = api.check_resource(test_dataset_id)
            test_fields = pd.get_fields_structure(test_dataset,
                                                  csv_properties)
            batch_anomaly_score_args = r.set_batch_anomaly_score_args(
                args, fields=fields, dataset_fields=test_fields)

            remote_anomaly_score(anomaly, test_dataset,
                                 batch_anomaly_score_args, args,
                                 api, resume, prediction_file=output,
                                 session_file=session_file, path=path,
                                 log=log)
        else:
            anomaly_score(anomalies, fields, args,
                          session_file=session_file)

    u.print_generated_files(path, log_file=session_file,
                            verbosity=args.verbosity)
    if args.reports:
        clear_reports(path)
        if args.upload:
            upload_reports(args.reports, path)
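# The shared-state update above goes through r.update_anomaly; the underlying
# API call is just an anomaly update. A hedged sketch (placeholder id,
# credentials assumed in the environment):
from bigml.api import BigML

api = BigML()
updated = api.update_anomaly("anomaly/564c5a76636e1c3d52000007",
                             {"shared": True})
api.ok(updated)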
from bigml.api import BigML
from bigml.anomaly import Anomaly
from bigml.ensemble import Ensemble
from bigml.model import Model

api = BigML(dev_mode=True)

# remote prediction from an existing model
model = api.get_model("model/563a1c7a3cd25747430023ce")
prediction = api.create_prediction(model, {"petal length": 4.07,
                                           "sepal width": 3.15,
                                           "petal width": 1.51})

# local prediction from a single model
local_model = Model("model/56430eb8636e1c79b0001f90", api=api)
prediction = local_model.predict(
    {"petal length": 0.96, "sepal width": 4.1, "petal width": 2.52},
    2, add_confidence=True, multiple=3)

# local predictions from ensembles
local_model = Ensemble("ensemble/564a02d5636e1c79b5006e13", api=api)
local_model = Ensemble("ensemble/564a081bc6c19b6cf3011c60", api=api)
prediction = local_model.predict(
    {"petal length": 0.95, "sepal width": 3.9, "petal width": 1.51,
     "sepal length": 7.0}, method=2, add_confidence=True)

local_ensemble = Ensemble("ensemble/564623d4636e1c79b00051f7", api=api)
prediction = local_ensemble.predict(
    {"Price": 5.8, "Grape": "Pinot Grigio", "Country": "Italy",
     "Rating": 92}, True)

# local anomaly scores
local_anomaly = Anomaly("anomaly/564c5a76636e1c3d52000007", api=api)
prediction = local_anomaly.anomaly_score(
    {"petal length": 4.07, "sepal width": 3.15, "petal width": 1.51,
     "sepal length": 6.02, "species": "Iris-setosa"}, True)
prediction = local_anomaly.anomaly_score(
    {"petal length": 0.96, "sepal width": 4.1, "petal width": 2.51,
     "sepal length": 6.02, "species": "Iris-setosa"}, True)
prediction = local_anomaly.anomaly_score(
    {"petal length": 0.96, "sepal width": 4.1, "petal width": 2.51}, True)

api.pprint(prediction)