def realtime_predict(ml_model_id, record):
    """Run a realtime prediction for ``record``, or create the missing endpoint.

    The ML Model is looked up and its realtime endpoint URL is read. When a
    URL is present, the request is printed, sent and timed, and the response
    and latency are printed. When no URL is present, a realtime endpoint is
    created instead and no prediction is made.

    Args:
        ml_model_id: string id of the ML Model.
        record: dict holding the record to predict on.
    """
    client = boto.connect_machinelearning()
    model_info = client.get_ml_model(ml_model_id)
    endpoint_url = model_info.get('EndpointInfo', {}).get('EndpointUrl', '')

    if not endpoint_url:
        # No endpoint yet: create one and tell the user how to remove it.
        print('# Missing realtime endpoint\nml.create_realtime_endpoint("%s")' % ml_model_id)
        creation = client.create_realtime_endpoint(ml_model_id)
        print(json.dumps(creation, indent=2))
        print("""# Predictions will fail until the endpoint has been fully created.
# Note that you will be charged a reservation fee until this endpoint is deleted.
# Delete with:
    python realtime.py %s --deleteEndpoint""" % ml_model_id)
        return

    # The endpoint URL is handed to predict() exactly as the service returned it.
    record_json = json.dumps(record, indent=2)
    print('ml.predict("%s", %s, "%s") # returns...' % (ml_model_id, record_json, endpoint_url))

    # Only the predict() call itself is timed.
    started = time.time()
    response = client.predict(ml_model_id, record, predict_endpoint=endpoint_url)
    elapsed_ms = (time.time() - started)*1000

    print(json.dumps(response, indent=2))
    print("Latency: %.2fms" % elapsed_ms)
def realtime_predict(ml_model_id, record):
    """Predict on ``record`` in realtime, creating the endpoint if it is absent.

    Args:
        ml_model_id: string id of the ML Model.
        record: dict holding the record to predict on.

    If the model reports a realtime endpoint URL, the request and the
    prediction response are printed. Otherwise a realtime endpoint is created
    and its creation response is printed; no prediction is attempted.
    """
    conn = boto.connect_machinelearning()
    description = conn.get_ml_model(ml_model_id)
    url = description.get('EndpointInfo', {}).get('EndpointUrl', '')

    if url:
        # The URL is used as returned; no scheme stripping is applied.
        shown_call = 'ml.predict("%s", %s, "%s") # returns...' % (
            ml_model_id, json.dumps(record, indent=2), url)
        print(shown_call)
        answer = conn.predict(ml_model_id, record, predict_endpoint=url)
        print(json.dumps(answer, indent=2))
    else:
        print('# Missing realtime endpoint\nml.create_realtime_endpoint("%s")' % ml_model_id)
        created = conn.create_realtime_endpoint(ml_model_id)
        print(json.dumps(created, indent=2))
        print("""# Predictions will fail until the endpoint has been fully created.
# Note that you will be charged a reservation fee until this endpoint is deleted.
# Delete with:
    python realtime.py %s --deleteEndpoint""" % ml_model_id)
def collect_perf(eval_id_list):
    """
    Gather the BinaryAUC score of each Evaluation in ``eval_id_list``.

    Evaluations that are not finished yet are polled again after a delay
    that doubles on every round, up to config.DELAY_CAP. Polling stops when
    every Evaluation has completed or when config.TIME_OUT seconds have
    elapsed, whichever happens first.

    Args:
        eval_id_list: a list of Evaluation IDs to collect performance
            metrics for.
    Returns:
        a dict mapping each completed Evaluation ID to its AUC as a float.
        Evaluations still in progress at the timeout are absent.
    Raises:
        Exception: when any Evaluation reports the FAILED status.
    """
    client = boto.connect_machinelearning()
    auc_by_eval = {}
    started_at = time.time()
    wait_seconds = config.INITIAL_POLLING_DELAY

    logger.info("Checking the Evaluation status...")
    while time.time() - started_at < config.TIME_OUT:
        still_running = 0

        for eval_id in eval_id_list:
            # Already collected in an earlier round (or earlier in this one).
            if eval_id in auc_by_eval:
                continue

            details = client.get_evaluation(eval_id)
            status = details["Status"]
            logger.info("{} status: {}".format(eval_id, status))

            if status == "FAILED":
                raise Exception("Evaluation {} is FAILED!".format(eval_id))
            if status != "COMPLETED":
                still_running += 1
                continue

            # Completed: record the AUC as a float so it is skipped next time.
            properties = details["PerformanceMetrics"]["Properties"]
            auc_by_eval[eval_id] = float(properties["BinaryAUC"])

        if still_running == 0:
            break

        logger.debug("Next poll in {} seconds...".format(wait_seconds))
        time.sleep(wait_seconds)
        # Exponential backoff, bounded by the configured cap.
        wait_seconds = min(wait_seconds * 2, config.DELAY_CAP)

    return auc_by_eval
def build(self):
    """
    Builds the necessary entities on Amazon ML.

    Opens a boto Amazon ML connection, stores it on ``self._ml``, and then
    calls ``create_datasources()``, ``create_ml_model()`` and
    ``create_eval()`` in that order.
    """
    # Connect first and keep the client on the instance.
    # NOTE(review): presumably the create_* steps below read self._ml --
    # confirm against their bodies.
    self._ml = boto.connect_machinelearning()
    self.create_datasources()
    self.create_ml_model()
    self.create_eval()
def collect_perf(eval_id_list):
    """
    Poll Amazon ML until the given Evaluations finish and return their AUCs.

    Each polling round asks for the status of every Evaluation that has not
    completed yet. Between rounds the function sleeps, doubling the delay
    each time (bounded by config.DELAY_CAP). It gives up polling once
    config.TIME_OUT seconds have passed since the start.

    Args:
        eval_id_list: a list of Evaluation IDs to collect performance
            metrics for.
    Returns:
        a dict of completed Evaluation ID -> BinaryAUC score (float).
    Raises:
        Exception: as soon as one Evaluation is found in FAILED status.
    """
    ml_client = boto.connect_machinelearning()
    scores = dict()
    begin = time.time()
    delay = config.INITIAL_POLLING_DELAY

    logger.info("Checking the Evaluation status...")
    while time.time() - begin < config.TIME_OUT:
        pending = False

        for evaluation_id in eval_id_list:
            if evaluation_id not in scores:
                response = ml_client.get_evaluation(evaluation_id)
                state = response["Status"]
                logger.info("{} status: {}".format(evaluation_id, state))

                if state == "FAILED":
                    raise Exception(
                        "Evaluation {} is FAILED!".format(evaluation_id))
                elif state == "COMPLETED":
                    # Store the score as a float; a stored ID is never
                    # fetched again.
                    raw_auc = response["PerformanceMetrics"]["Properties"]["BinaryAUC"]
                    scores[evaluation_id] = float(raw_auc)
                else:
                    pending = True

        if not pending:
            # Nothing left in progress, so no further polling is needed.
            break

        logger.debug("Next poll in {} seconds...".format(delay))
        time.sleep(delay)
        delay = min(delay * 2, config.DELAY_CAP)

    return scores
def build_model(data_s3_url, schema_fn, recipe_fn, name, train_percent=70):
    """Creates all the objects needed to build an ML Model & evaluate its quality.

    Creates the training and test data sources, then the ML Model from the
    training data source, then an evaluation of that model against the test
    data source.

    Args:
        data_s3_url: passed to create_data_sources as the data location.
        schema_fn: passed to create_data_sources (schema file name).
        recipe_fn: passed to create_model (recipe file name).
        name: name passed to every created entity.
        train_percent: passed to create_data_sources; defaults to 70.

    Returns:
        The id of the created ML Model.
    """
    ml = boto.connect_machinelearning()
    (train_ds_id, test_ds_id) = create_data_sources(
        ml, data_s3_url, schema_fn, train_percent, name)
    ml_model_id = create_model(ml, train_ds_id, recipe_fn, name)
    # The evaluation is created for its side effect only; its id is not
    # needed by callers, so it is not bound to a local.
    create_evaluation(ml, ml_model_id, test_ds_id, name)
    return ml_model_id
def use_model(model_id, threshold, schema_fn, output_s3, data_s3url):
    """Sets a score threshold on an ML Model and starts a batch prediction.

    Waits for the model via poll_until_completed, updates its score
    threshold, creates a data source for scoring, and creates a batch
    prediction that writes to ``output_s3``.

    Args:
        model_id: id of the ML Model to use.
        threshold: score threshold to set on the model (printed with %.2f).
        schema_fn: passed to create_data_source_for_scoring (schema file name).
        output_s3: output URI for the batch prediction.
        data_s3url: passed to create_data_source_for_scoring as the data
            location.
    """
    ml = boto.connect_machinelearning()

    poll_until_completed(ml, model_id)  # Can't use it until it's COMPLETED
    ml.update_ml_model(model_id, score_threshold=threshold)
    print("Set score threshold for %s to %.2f" % (model_id, threshold))

    # b32encode returns bytes on Python 3, so decode before concatenating
    # with the text prefix; this works on Python 2 as well.
    bp_id = 'bp-' + base64.b32encode(os.urandom(10)).decode('ascii')
    ds_id = create_data_source_for_scoring(ml, data_s3url, schema_fn)
    ml.create_batch_prediction(
        batch_prediction_id=bp_id,
        batch_prediction_name="Batch Prediction for marketing sample",
        ml_model_id=model_id,
        batch_prediction_data_source_id=ds_id,
        output_uri=output_s3)
    print("Created Batch Prediction %s" % bp_id)
def use_model(model_id, threshold, schema_fn, output_s3, data_s3url):
    """Applies a score threshold to an ML Model and launches a batch prediction.

    Steps, in order: wait for the model with poll_until_completed, update the
    model's score threshold, create a scoring data source, then create a
    batch prediction whose results go to ``output_s3``.

    Args:
        model_id: id of the ML Model to use.
        threshold: score threshold to set on the model (printed with %.2f).
        schema_fn: passed to create_data_source_for_scoring (schema file name).
        output_s3: output URI for the batch prediction.
        data_s3url: passed to create_data_source_for_scoring as the data
            location.
    """
    ml = boto.connect_machinelearning()

    poll_until_completed(ml, model_id)  # Can't use it until it's COMPLETED
    ml.update_ml_model(model_id, score_threshold=threshold)
    print("Set score threshold for %s to %.2f" % (model_id, threshold))

    # Random id suffix. b32encode yields bytes on Python 3, which cannot be
    # concatenated to a str, so it is decoded to text first.
    random_suffix = base64.b32encode(os.urandom(10)).decode('ascii')
    bp_id = 'bp-' + random_suffix
    ds_id = create_data_source_for_scoring(ml, data_s3url, schema_fn)
    ml.create_batch_prediction(
        batch_prediction_id=bp_id,
        batch_prediction_name="Batch Prediction for marketing sample",
        ml_model_id=model_id,
        batch_prediction_data_source_id=ds_id,
        output_uri=output_s3
    )
    print("Created Batch Prediction %s" % bp_id)
def aml_connection():
    """Connects to the service and validates that credentials are
    configured properly.

    Returns:
        The boto Amazon Machine Learning connection.

    Raises:
        RuntimeError: when the validation call to the service fails.
    """
    ml = boto.connect_machinelearning()
    try:
        # Check that the connection is configured properly
        ml.describe_ml_models(limit=1)
    except Exception:
        # Backslashes in the Windows path are escaped: an unescaped "\U" in a
        # normal string literal is a syntax error on Python 3.
        raise RuntimeError("""There was a problem connecting to Amazon Machine Learning.
Be sure your AWS credentials are properly configured.  A credentials
file should be in ~/.aws/credentials (or C:\\Users\\USER_NAME\\.aws\\credentials
on Windows) and look like:

[Credentials]
aws_access_key_id = <your_access_key_here>
aws_secret_access_key = <your_secret_key_here>
""")
    return ml
def poll_until_completed(entity_id, entity_type_str):
    """Poll an Amazon ML entity until it reaches a terminal status.

    Args:
        entity_id: id of the entity to poll.
        entity_type_str: one of 'ds', 'ml', 'ev' or 'bp', selecting the data
            source, ML model, evaluation or batch prediction getter. Any
            other value raises KeyError.

    A status line is printed on every poll. Polling ends when the status is
    COMPLETED, FAILED or INVALID, after which the last response is printed
    as JSON.
    """
    client = boto.connect_machinelearning()
    getters = {
        'ds': client.get_data_source,
        'ml': client.get_ml_model,
        'ev': client.get_evaluation,
        'bp': client.get_batch_prediction,
    }
    fetch = getters[entity_type_str]
    terminal_states = ('COMPLETED', 'FAILED', 'INVALID')

    wait_seconds = 2
    while True:
        results = fetch(entity_id)
        status = results['Status']
        message = results.get('Message', '')
        timestamp = str(datetime.datetime.now().time())
        print("Object %s is %s (%s) at %s" % (entity_id, status, message, timestamp))
        if status in terminal_states:
            break

        # exponential backoff with jitter: grow the wait by a random factor
        wait_seconds *= random.uniform(1.1, 2.0)
        time.sleep(wait_seconds)

    print(json.dumps(results, indent=2))
def realtime_predict(ml_model_id, record):
    """Predict on ``record`` in realtime and list labels by descending score.

    Args:
        ml_model_id: string id of the ML Model.
        record: dict holding the record to predict on.

    When the model has a realtime endpoint URL, the prediction is requested
    and timed. The labels from the response's predictedScores are written to
    stdout from highest to lowest score, each followed by ';', and then the
    full response and the latency are printed. When there is no endpoint URL,
    a realtime endpoint is created instead and no prediction is made.
    """
    client = boto.connect_machinelearning()
    endpoint_url = client.get_ml_model(ml_model_id).get('EndpointInfo', {}).get('EndpointUrl', '')

    if not endpoint_url:
        print('# Missing realtime endpoint\nml.create_realtime_endpoint("%s")' % ml_model_id)
        creation = client.create_realtime_endpoint(ml_model_id)
        print(json.dumps(creation, indent=2))
        print("""# Predictions will fail until the endpoint has been fully created.
# Note that you will be charged a reservation fee until this endpoint is deleted.
# Delete with:
    python realtime.py %s --deleteEndpoint""" % ml_model_id)
        return

    # The endpoint URL is used exactly as returned by the service.
    print('ml.predict("%s", %s, "%s") # returns...' % (
        ml_model_id, json.dumps(record, indent=2), endpoint_url))

    tic = time.time()
    response = client.predict(ml_model_id, record, predict_endpoint=endpoint_url)
    elapsed_ms = (time.time() - tic)*1000

    # Sort ascending by score, then walk backwards so the best label is first.
    ascending = sorted(response['Prediction']['predictedScores'].items(), key=itemgetter(1))
    for label, _score in reversed(ascending):
        sys.stdout.write(label+';')

    print(json.dumps(response, indent=2))
    print("Latency: %.2fms" % elapsed_ms)
def delete_realtime_endpoint(ml_model_id):
    """Delete the realtime endpoint of an ML Model and print the response.

    Args:
        ml_model_id: id of the ML Model whose realtime endpoint is deleted.
    """
    client = boto.connect_machinelearning()
    announcement = '# Deleting realtime endpoint\nml.delete_realtime_endpoint("%s")' % ml_model_id
    print(announcement)
    response = client.delete_realtime_endpoint(ml_model_id)
    print(json.dumps(response, indent=2))
from time import sleep from zipfile import ZipFile import boto from boto.kinesis.exceptions import ResourceInUseException import config # To enable logging: # boto.set_stream_logger('boto') # Initialize the AWS clients. sns = boto.connect_sns() kinesis = boto.connect_kinesis() aws_lambda = boto.connect_awslambda() ml = boto.connect_machinelearning() lambda_execution_policy = open('lambdaExecutionPolicyTemplate.json').read().format(**config.AWS) aws_account_id = config.AWS["awsAccountId"] region = config.AWS["region"] kinesis_stream = config.AWS["kinesisStream"] sns_topic = config.AWS["snsTopic"] lambda_function_name = config.AWS["lambdaFunctionName"] lambda_execution_role = config.AWS["lambdaExecutionRole"] lambda_trust_policy = '{"Statement":[{"Effect":"Allow","Principal":{"Service":"lambda.amazonaws.com"},"Action":"sts:AssumeRole"}]}' def role_exists(iam, role_name): try:
print(CONSOLE_URL_ML_MODEL.format(CONSOLE_URL_BASE, ml_model_id)) evaluation_id = 'ev-tweets-' + time_stamp ml.create_evaluation(evaluation_id, ml_model_id, evaluation_ds_id) print("Creating evaluation with id {0}. See:".format(evaluation_id)) print(CONSOLE_URL_EVALUATION.format(CONSOLE_URL_BASE, ml_model_id, evaluation_id)) print("Waiting for evaluation to complete.") poll_until_ready(evaluation_id) print("done") evaluation = ml.get_evaluation(evaluation_id) print("Performance metric on the evaluation dataset: Binary AUC: " + evaluation['PerformanceMetrics']['Properties'][ 'BinaryAUC']) time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S') ml = boto.connect_machinelearning() s3 = boto.connect_s3() aml_training_dataset = None if __name__ == "__main__": if len(sys.argv) != 5: print __doc__ sys.exit(-1) try: aml_training_dataset = sys.argv[1] s3_bucket = s3.get_bucket(sys.argv[3]) s3_key = s3_bucket.new_key(sys.argv[4]) s3_uri = "s3://{0}/{1}".format(sys.argv[3], sys.argv[4]) print("Uploading {0} to {1}".format(sys.argv[1], s3_uri)) if s3_key.exists():
def process_email_msg_in_s3(source_folder, message_id):
    """Process one email stored in S3: predict a verifier, store, and notify.

    The email at ``<source_folder>/<message_id>`` in the email bucket is
    parsed, an ML prediction of the verifier is requested, the transaction and
    a verification attempt are stored in the database, and a verification
    email is sent. After the database commit the S3 object is moved to the
    ``processed/`` folder.

    On any exception the S3 object is moved to ``failed/`` (unless it already
    lives there) and the exception is re-raised.

    Args:
        source_folder: 'unprocessed' or 'failed'.
        message_id: key name of the message inside ``source_folder``.

    Raises:
        Exception: when the expected S3 key does not exist, or whatever the
            processing steps raise.
    """
    assert source_folder in ('unprocessed', 'failed')

    s3_conn = boto.connect_s3(s3_full.ACCESS_KEY_ID, s3_full.SECRET_ACCESS_KEY)
    email_bucket = s3_conn.get_bucket(EMAIL_BUCKET_NAME)
    s3_key = '%s/%s' % (source_folder, message_id)
    s3_obj = email_bucket.get_key(s3_key)
    if not s3_obj:
        raise Exception('Expected S3 key not found (%s)' % s3_key)

    # Tracked outside the try block so the connection can be released in
    # `finally` no matter where processing stops.
    db_conn = None
    try:
        parsed_email_dict = _fetch_email_from_s3_and_parse(s3_obj)

        db_conn = engine.connect()
        db_trans = db_conn.begin()

        account = get_account_for_transaction(db_conn)
        now = datetime.utcnow()

        logging.info('Calling ML to predict verifier')
        ml_bucket = s3_conn.get_bucket(ML_BUCKET_NAME)
        transactions_filename = ml_bucket.get_key(ML_MOST_RECENT_TRANSACTIONS_FILENAME).get_contents_as_string()
        ml_conn = boto.connect_machinelearning(ml_full.ACCESS_KEY_ID, ml_full.SECRET_ACCESS_KEY)
        prediction_dict = _perform_ml_prediction(
            ml_conn, transactions_filename, parsed_email_dict['merchant_parsed'], parsed_email_dict['date_parsed'],
            parsed_email_dict['amount_parsed'], account.name, now)
        verifier_predicted_user = get_user_by_username(db_conn, prediction_dict['Prediction']['predictedLabel'])
        verifier_predicted_scores = prediction_dict['Prediction']['predictedScores']
        logging.info('Predicted verifier is "%s" (%s)', verifier_predicted_user.name, verifier_predicted_scores)

        new_trans_id = _store_transaction_into_db(
            db_conn, parsed_email_dict['merchant_parsed'], parsed_email_dict['date_parsed'],
            parsed_email_dict['amount_parsed'], message_id, account, now, transactions_filename,
            verifier_predicted_scores, verifier_predicted_user.id)

        verification_dict = derive_and_store_verification_attempt(db_conn, new_trans_id, verifier_predicted_user.id)

        # A "history" with just one entry -- the system's prediction for this transaction
        verification_history = [get_verification_history_first_entry(now, verifier_predicted_user.name,
                                                                     verifier_predicted_scores), ]

        send_verification_email(
            verification_dict['verif_attempt_id'], verifier_predicted_user, account.name,
            parsed_email_dict['date_parsed'], parsed_email_dict['amount_parsed'],
            parsed_email_dict['merchant_parsed'], verification_dict['possible_attributed_tos'],
            verification_dict['possible_other_verifiers'], verification_history)

        db_trans.commit()

        # NOTE(review): a failure in this move happens after the commit, yet
        # the handler below still files the message under "failed".
        logging.info('Moving S3 file into "processed" folder')
        s3_obj.copy(EMAIL_BUCKET_NAME, 'processed/%s' % message_id, metadata={'Content-Type': 'text/plain'})
        s3_obj.delete()

    except Exception:
        if s3_obj:
            if source_folder != 'failed':
                # If it's already in failed, just leave it there! Don't delete it!
                logging.info('Moving S3 file into "failed" folder')
                s3_obj.copy(EMAIL_BUCKET_NAME, 'failed/%s' % message_id, metadata={'Content-Type': 'text/plain'})
                s3_obj.delete()
        raise

    finally:
        # Release the DB connection on both the success and the failure path.
        # NOTE(review): assumes `engine` is a SQLAlchemy engine, where closing
        # the connection also rolls back an uncommitted transaction -- confirm.
        if db_conn is not None:
            db_conn.close()
def create_ml_model():
    """Export verified transactions and build an Amazon ML model from them.

    Steps:
      0. Derive one shared name/id, ``transactions-YYYYMMDD`` (UTC date), for
         the datasource, the model and the evaluation.
      1. Query the DB for verified transactions and write them to a temp CSV.
      2. Upload the CSV to the ML S3 bucket as ``<name>.csv``.
      3. Re-create the ML datasource from that CSV.
      4. Create a MULTICLASS model, its realtime endpoint and an evaluation.
      5. Store the name in the ML_MOST_RECENT_TRANSACTIONS_FILENAME S3 key.

    The temporary CSV is removed even when a step fails.
    """
    #
    # 0. Derive datasource/model/evaluation name and IDs -- the same value for all
    #
    now = datetime.utcnow()
    model_name = model_id = 'transactions-%s' % now.strftime('%Y%m%d')
    datasource_name = datasource_id = model_name
    evaluation_name = evaluation_id = model_name
    logging.info('Model/datasource/evaluation name/ID will be %s' % model_name)

    #
    # 1. Query the DB for transactions; write them to a CSV
    #
    logging.info('Querying DB for transactions, writing them to CSV')
    # mkstemp returns an already-open OS-level descriptor; wrap it instead of
    # discarding it, so that it is closed rather than leaked.
    temp_fd, temp_fname = tempfile.mkstemp()
    try:
        # NOTE(review): binary mode matches the Python 2 csv module; Python 3
        # would need text mode with newline='' -- confirm the target version.
        with os.fdopen(temp_fd, 'wb') as outfile:
            writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow([
                'id',
                'merchant',
                'date',
                'day_of_week',
                'weekend',
                'amount',
                'created_date',
                'account',
                'verified_by',
            ])
            s = select([transaction_tbl.c.id,
                        transaction_tbl.c.merchant,
                        transaction_tbl.c.date,
                        transaction_tbl.c.amount,
                        transaction_tbl.c.created_date,
                        account_tbl.c.name.label('account_name'),
                        user_tbl.c.name.label('verifier_name')
                        ]) \
                .select_from(transaction_tbl
                             .join(user_tbl, onclause=transaction_tbl.c.verified_by == user_tbl.c.id)
                             .join(account_tbl)) \
                .where(transaction_tbl.c.is_verified)
            conn = engine.connect()
            try:
                rs = conn.execute(s)
                for row in rs:
                    python_weekday, weekend = ml.weekday_fields(row.date)
                    row = [row.id, row.merchant, row.date, python_weekday, weekend, row.amount,
                           row.created_date, row.account_name, row.verifier_name]
                    row = [ml.standardize(f) for f in row]
                    writer.writerow(row)
            finally:
                # Release the DB connection once all rows have been written.
                conn.close()

        #
        # 2. Upload the CSV into S3
        #
        logging.info('Uploading CSV into S3 bucket')
        s3_conn = boto.connect_s3(s3_full.ACCESS_KEY_ID, s3_full.SECRET_ACCESS_KEY)
        bucket = s3_conn.get_bucket(ML_BUCKET_NAME)
        s3_key = bucket.new_key('%s.csv' % model_name)
        s3_key.metadata = {'Content-Type': 'text/csv'}
        with open(temp_fname, 'rb') as infile:
            s3_key.set_contents_from_file(infile)
    finally:
        # Remove the temp file whether or not the export/upload succeeded.
        os.remove(temp_fname)

    #
    # 3. Create an ML datasource from the CSV
    #
    logging.info('Creating ML datasource from the CSV')
    ml_conn = boto.connect_machinelearning(ml_full.ACCESS_KEY_ID, ml_full.SECRET_ACCESS_KEY)
    try:
        # Drop a datasource left over under the same id (same UTC date).
        ml_conn.delete_data_source(data_source_id=datasource_id)
    except boto.machinelearning.exceptions.ResourceNotFoundException:
        pass

    # Obtained by using the AWS console UI
    transactions_schema = '''
{
  "version" : "1.0",
  "rowId" : "id",
  "rowWeight" : null,
  "targetAttributeName" : "verified_by",
  "dataFormat" : "CSV",
  "dataFileContainsHeader" : true,
  "attributes" : [ {
    "attributeName" : "id",
    "attributeType" : "CATEGORICAL"
  }, {
    "attributeName" : "merchant",
    "attributeType" : "TEXT"
  }, {
    "attributeName" : "date",
    "attributeType" : "CATEGORICAL"
  }, {
    "attributeName" : "day_of_week",
    "attributeType" : "CATEGORICAL"
  }, {
    "attributeName" : "weekend",
    "attributeType" : "BINARY"
  }, {
    "attributeName" : "amount",
    "attributeType" : "NUMERIC"
  }, {
    "attributeName" : "created_date",
    "attributeType" : "TEXT"
  }, {
    "attributeName" : "account",
    "attributeType" : "CATEGORICAL"
  }, {
    "attributeName" : "verified_by",
    "attributeType" : "CATEGORICAL"
  } ],
  "excludedAttributeNames" : [ ]
}
'''
    ml_conn.create_data_source_from_s3(
        data_source_id=datasource_id,
        data_source_name=datasource_name,
        data_spec={
            'DataLocationS3': 's3://%s/%s.csv' % (ML_BUCKET_NAME, datasource_name),
            'DataSchema': transactions_schema,
        },
        compute_statistics=True)

    #
    # 4. Create an ML model from the datasource
    #
    logging.info('Creating ML model from the datasource')
    ml_conn.create_ml_model(
        ml_model_id=model_id,
        ml_model_name=model_name,
        ml_model_type='MULTICLASS',
        training_data_source_id=datasource_id)
    ml_conn.create_realtime_endpoint(ml_model_id=model_id)
    # NOTE(review): the evaluation uses the same datasource the model was
    # trained on -- confirm this is intended.
    ml_conn.create_evaluation(
        evaluation_id=evaluation_id,
        ml_model_id=model_id,
        evaluation_data_source_id=datasource_id,
        evaluation_name=evaluation_name)

    #
    # 5. Write the name of the just-created model/datasource/evaluation to the
    #    ML_MOST_RECENT_TRANSACTIONS_FILENAME key in S3
    #
    logging.info('Writing model name to ML_MOST_RECENT_TRANSACTIONS_FILENAME in S3')
    bucket = s3_conn.get_bucket(ML_BUCKET_NAME)
    k = bucket.get_key(ML_MOST_RECENT_TRANSACTIONS_FILENAME)
    if not k:
        k = bucket.new_key(ML_MOST_RECENT_TRANSACTIONS_FILENAME)
    k.metadata = {'Content-Type': 'text/plain'}
    k.set_contents_from_string(model_name)