Example #1
def realtime_predict(ml_model_id, record):
    """Takes a string ml_model_id, and a dict record, and makes a realtime
    prediction call, if the ML Model has a realtime endpoint.
    If the ML Model doesn't have a realtime endpoint, it creates one instead
    of calling predict()
    """
    ml = boto.connect_machinelearning()
    model = ml.get_ml_model(ml_model_id)
    endpoint = model.get('EndpointInfo', {}).get('EndpointUrl', '')
    #endpoint = endpoint.replace("https://", "")  # This shouldn't be needed
    if endpoint:
        print('ml.predict("%s", %s, "%s") # returns...' % (ml_model_id,
                                                           json.dumps(record, indent=2), endpoint))
        start = time.time()
        prediction = ml.predict(ml_model_id, record, predict_endpoint=endpoint)
        latency_ms = (time.time() - start)*1000
        print(json.dumps(prediction, indent=2))
        print("Latency: %.2fms" % latency_ms)
    else:
        print(
            '# Missing realtime endpoint\nml.create_realtime_endpoint("%s")' % ml_model_id)
        result = ml.create_realtime_endpoint(ml_model_id)
        print(json.dumps(result, indent=2))
        print("""# Predictions will fail until the endpoint has been fully created.
# Note that you will be charged a reservation fee until this endpoint is deleted.
# Delete with:
    python realtime.py %s --deleteEndpoint""" % ml_model_id)
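
A minimal usage sketch for realtime_predict above; the model ID and record attributes are hypothetical placeholders, and the record keys must match the schema the ML Model was trained with:

record = {"merchant": "Acme Corp", "amount": "42.50"}  # hypothetical attribute values
realtime_predict("ml-ExampleModelId", record)          # hypothetical ML Model ID
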
def collect_perf(eval_id_list):
    """
    This function collects the AUC score for a list of
    evaluations (based on binary classification model)
    on Amazon ML. If any evaluation is in progress,
    the script will poll and wait with exponential
    backoff.

    Args:
        eval_id_list: a list of Evaluation IDs to collect
            performance metrics.
    Returns:
        a map of completed evaluation's ID to
            the corresponding AUC score.
    Raises:
        exception when any Evaluation is in
            Failed status.
    """
    ml = boto.connect_machinelearning()  # boto Amazon ML client
    completed_evals = dict()  # to collect completed Evaluations
    start_timestamp = time.time()  # start timestamp in seconds

    # time delay in seconds between two polling attempts
    polling_delay = config.INITIAL_POLLING_DELAY

    logger.info("Checking the Evaluation status...")
    while time.time() - start_timestamp < config.TIME_OUT:
        any_in_progress = False  # assume all complete

        for ev_id in eval_id_list:  # fetching each Evaluation status
            if ev_id in completed_evals:  # skip any completed Evaluations
                continue

            # fetch evaluation status
            evaluation = ml.get_evaluation(ev_id)
            eval_status = evaluation["Status"]
            logger.info("{} status: {}".format(ev_id, eval_status))

            if eval_status == "COMPLETED":
                # get the AUC score from the Evaluation
                auc = evaluation["PerformanceMetrics"][
                    "Properties"]["BinaryAUC"]
                # mark this Evaluation to be completed, and write down the
                # AUC score in floating point number. Note that this entity
                # will be skipped in next polling
                completed_evals[ev_id] = float(auc)
            elif eval_status == "FAILED":
                raise Exception("Evaluation {} is FAILED!".format(
                    ev_id))
            else:
                any_in_progress = True  # in progress

        if not any_in_progress:  # exit polling if all Evaluations completed
            break
        logger.debug("Next poll in {} seconds...".format(polling_delay))
        time.sleep(polling_delay)
        # update polling_delay in the next polling
        polling_delay = min(polling_delay * 2, config.DELAY_CAP)
    return completed_evals
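
A hedged usage sketch for collect_perf; the Evaluation IDs below are placeholders, and config.INITIAL_POLLING_DELAY, config.DELAY_CAP and config.TIME_OUT are assumed to be defined (in seconds) in the local config module:

auc_by_eval = collect_perf(["ev-Example1", "ev-Example2"])  # hypothetical Evaluation IDs
for eval_id, auc in auc_by_eval.items():
    print("{} AUC: {:.4f}".format(eval_id, auc))
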
def build(self):
    """
    Builds the necessary entities on Amazon ML.
    """
    self._ml = boto.connect_machinelearning()
    self.create_datasources()
    self.create_ml_model()
    self.create_eval()
def build_model(data_s3_url, schema_fn, recipe_fn, name, train_percent=70):
    """Creates all the objects needed to build an ML Model & evaluate its quality.
    """
    ml = boto.connect_machinelearning()
    (train_ds_id, test_ds_id) = create_data_sources(ml, data_s3_url, schema_fn,
                                                    train_percent, name)
    ml_model_id = create_model(ml, train_ds_id, recipe_fn, name)
    eval_id = create_evaluation(ml, ml_model_id, test_ds_id, name)

    return ml_model_id
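
A sketch of how build_model might be invoked; the S3 URL and file names are illustrative only, and create_data_sources, create_model and create_evaluation are helpers assumed to be defined elsewhere in the same script:

model_id = build_model("s3://example-bucket/banking.csv",  # hypothetical S3 data URL
                       "banking.csv.schema",               # hypothetical schema file
                       "recipe.json",                      # hypothetical recipe file
                       name="marketing sample")
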
Example #8
def use_model(model_id, threshold, schema_fn, output_s3, data_s3url):
    """Creates all the objects needed to build an ML Model & evaluate its quality.
    """
    ml = boto.connect_machinelearning()

    poll_until_completed(ml, model_id)  # Can't use it until it's COMPLETED
    ml.update_ml_model(model_id, score_threshold=threshold)
    print("Set score threshold for %s to %.2f" % (model_id, threshold))

    bp_id = 'bp-' + base64.b32encode(os.urandom(10))
    ds_id = create_data_source_for_scoring(ml, data_s3url, schema_fn)
    ml.create_batch_prediction(
        batch_prediction_id=bp_id,
        batch_prediction_name="Batch Prediction for marketing sample",
        ml_model_id=model_id,
        batch_prediction_data_source_id=ds_id,
        output_uri=output_s3)
    print("Created Batch Prediction %s" % bp_id)
def aml_connection():
    """Connects to the service and validates that credentials are configured properly.
    """
    ml = boto.connect_machinelearning()
    try:
        # Check that the connection is configured properly
        ml.describe_ml_models(limit=1)
    except Exception:
        raise RuntimeError("""There was a problem connecting to Amazon Machine Learning.
Be sure your AWS credentials are properly configured.
A credentials file should be in ~/.aws/credentials
(or C:\Users\USER_NAME\.aws\credentials on Windows)
and look like:

[Credentials]
aws_access_key_id = <your_access_key_here>
aws_secret_access_key = <your_secret_key_here>
    """)
    return ml
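
A minimal sketch of calling the helper; it either returns a ready-to-use client or raises the RuntimeError above:

ml = aml_connection()
models = ml.describe_ml_models(limit=1)  # the same call the helper uses to validate credentials
print(json.dumps(models, indent=2))
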
def poll_until_completed(entity_id, entity_type_str):
    ml = boto.connect_machinelearning()
    polling_function = {
        'ds': ml.get_data_source,
        'ml': ml.get_ml_model,
        'ev': ml.get_evaluation,
        'bp': ml.get_batch_prediction,
    }[entity_type_str]
    delay = 2
    while True:
        results = polling_function(entity_id)
        status = results['Status']
        message = results.get('Message', '')
        now = str(datetime.datetime.now().time())
        print("Object %s is %s (%s) at %s" % (entity_id, status, message, now))
        if status in ['COMPLETED', 'FAILED', 'INVALID']:
            break

        # exponential backoff with jitter
        delay *= random.uniform(1.1, 2.0)
        time.sleep(delay)
    print(json.dumps(results, indent=2))
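
A minimal usage sketch; the Evaluation ID is a hypothetical placeholder, and the second argument selects which boto getter is polled ('ds', 'ml', 'ev' or 'bp'):

poll_until_completed("ev-ExampleEvaluationId", "ev")
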
Example #13
def realtime_predict(ml_model_id, record):
    """Takes a string ml_model_id, and a dict record, and makes a realtime
    prediction call, if the ML Model has a realtime endpoint.
    If the ML Model doesn't have a realtime endpoint, it creates one instead
    of calling predict()
    """
    ml = boto.connect_machinelearning()
    model = ml.get_ml_model(ml_model_id)
    endpoint = model.get('EndpointInfo', {}).get('EndpointUrl', '')
    #endpoint = endpoint.replace("https://", "")  # This shouldn't be needed
    if endpoint:
        print('ml.predict("%s", %s, "%s") # returns...' % (ml_model_id,
                                                           json.dumps(record, indent=2), endpoint))
        start = time.time()
        prediction = ml.predict(ml_model_id, record, predict_endpoint=endpoint)
        latency_ms = (time.time() - start)*1000

        # Print the class labels in order of decreasing predicted score
        predict_scores = prediction['Prediction']['predictedScores']
        predict_list = sorted(predict_scores.items(), key=itemgetter(1))
        predict_sort = [label for label, score in predict_list]
        for label in reversed(predict_sort):
            sys.stdout.write(label + ';')

        print(json.dumps(prediction, indent=2))
        print("Latency: %.2fms" % latency_ms)
    else:
        print(
            '# Missing realtime endpoint\nml.create_realtime_endpoint("%s")' % ml_model_id)
        result = ml.create_realtime_endpoint(ml_model_id)
        print(json.dumps(result, indent=2))
        print("""# Predictions will fail until the endpoint has been fully created.
# Note that you will be charged a reservation fee until this endpoint is deleted.
# Delete with:
    python realtime.py %s --deleteEndpoint""" % ml_model_id)
def delete_realtime_endpoint(ml_model_id):
    ml = boto.connect_machinelearning()
    print('# Deleting realtime endpoint\nml.delete_realtime_endpoint("%s")' %
          ml_model_id)
    result = ml.delete_realtime_endpoint(ml_model_id)
    print(json.dumps(result, indent=2))
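
A sketch of the endpoint lifecycle implied by realtime_predict and delete_realtime_endpoint above; the model ID and record are hypothetical:

realtime_predict("ml-ExampleModelId", {"amount": "42.50"})  # creates the endpoint if it is missing
# Once realtime predictions are no longer needed, delete the endpoint to stop
# the reservation fee noted in the message printed above.
delete_realtime_endpoint("ml-ExampleModelId")
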
from time import sleep
from zipfile import ZipFile

import boto
from boto.kinesis.exceptions import ResourceInUseException

import config

# To enable logging:
# boto.set_stream_logger('boto')

# Initialize the AWS clients.
sns = boto.connect_sns()
kinesis = boto.connect_kinesis()
aws_lambda = boto.connect_awslambda()
ml = boto.connect_machinelearning()

lambda_execution_policy = open('lambdaExecutionPolicyTemplate.json').read().format(**config.AWS)

aws_account_id = config.AWS["awsAccountId"]
region = config.AWS["region"]
kinesis_stream = config.AWS["kinesisStream"]
sns_topic = config.AWS["snsTopic"]

lambda_function_name = config.AWS["lambdaFunctionName"]
lambda_execution_role = config.AWS["lambdaExecutionRole"]
lambda_trust_policy = '{"Statement":[{"Effect":"Allow","Principal":{"Service":"lambda.amazonaws.com"},"Action":"sts:AssumeRole"}]}'


def role_exists(iam, role_name):
    try:
Example #17
    print(CONSOLE_URL_ML_MODEL.format(CONSOLE_URL_BASE, ml_model_id))
    evaluation_id = 'ev-tweets-' + time_stamp
    ml.create_evaluation(evaluation_id, ml_model_id, evaluation_ds_id)
    print("Creating evaluation with id {0}. See:".format(evaluation_id))
    print(CONSOLE_URL_EVALUATION.format(CONSOLE_URL_BASE, ml_model_id, evaluation_id))
    print("Waiting for evaluation to complete.")
    poll_until_ready(evaluation_id)
    print("done")
    evaluation = ml.get_evaluation(evaluation_id)
    print("Performance metric on the evaluation dataset: Binary AUC: " + evaluation['PerformanceMetrics']['Properties'][
        'BinaryAUC'])


time_stamp = time.strftime('%Y-%m-%d-%H-%M-%S')

ml = boto.connect_machinelearning()
s3 = boto.connect_s3()

aml_training_dataset = None

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print(__doc__)
        sys.exit(-1)
    try:
        aml_training_dataset = sys.argv[1]
        s3_bucket = s3.get_bucket(sys.argv[3])
        s3_key = s3_bucket.new_key(sys.argv[4])
        s3_uri = "s3://{0}/{1}".format(sys.argv[3], sys.argv[4])
        print("Uploading {0} to {1}".format(sys.argv[1], s3_uri))
        if s3_key.exists():
Example #18
def process_email_msg_in_s3(source_folder, message_id):

    assert source_folder in ('unprocessed', 'failed')
    s3_conn = boto.connect_s3(s3_full.ACCESS_KEY_ID, s3_full.SECRET_ACCESS_KEY)
    email_bucket = s3_conn.get_bucket(EMAIL_BUCKET_NAME)
    s3_key = '%s/%s' % (source_folder, message_id)
    s3_obj = email_bucket.get_key(s3_key)
    if not s3_obj:
        raise Exception('Expected S3 key not found (%s)' % s3_key)

    try:
        parsed_email_dict = _fetch_email_from_s3_and_parse(s3_obj)

        db_conn = engine.connect()
        db_trans = db_conn.begin()

        account = get_account_for_transaction(db_conn)
        now = datetime.utcnow()

        logging.info('Calling ML to predict verifier')
        ml_bucket = s3_conn.get_bucket(ML_BUCKET_NAME)
        transactions_filename = ml_bucket.get_key(ML_MOST_RECENT_TRANSACTIONS_FILENAME).get_contents_as_string()
        ml_conn = boto.connect_machinelearning(ml_full.ACCESS_KEY_ID, ml_full.SECRET_ACCESS_KEY)
        prediction_dict = _perform_ml_prediction(
            ml_conn,
            transactions_filename,
            parsed_email_dict['merchant_parsed'],
            parsed_email_dict['date_parsed'],
            parsed_email_dict['amount_parsed'],
            account.name,
            now)

        verifier_predicted_user = get_user_by_username(db_conn, prediction_dict['Prediction']['predictedLabel'])
        verifier_predicted_scores = prediction_dict['Prediction']['predictedScores']

        logging.info('Predicted verifier is "%s" (%s)', verifier_predicted_user.name, verifier_predicted_scores)

        new_trans_id = _store_transaction_into_db(
            db_conn,
            parsed_email_dict['merchant_parsed'],
            parsed_email_dict['date_parsed'],
            parsed_email_dict['amount_parsed'],
            message_id,
            account,
            now,
            transactions_filename,
            verifier_predicted_scores,
            verifier_predicted_user.id)

        verification_dict = derive_and_store_verification_attempt(db_conn, new_trans_id, verifier_predicted_user.id)

        # A "history" with just one entry -- the system's prediction for this transaction
        verification_history = [get_verification_history_first_entry(now,
                                    verifier_predicted_user.name,
                                    verifier_predicted_scores), ]

        send_verification_email(
            verification_dict['verif_attempt_id'],
            verifier_predicted_user,
            account.name,
            parsed_email_dict['date_parsed'],
            parsed_email_dict['amount_parsed'],
            parsed_email_dict['merchant_parsed'],
            verification_dict['possible_attributed_tos'],
            verification_dict['possible_other_verifiers'],
            verification_history)

        db_trans.commit()

        logging.info('Moving S3 file into "processed" folder')
        s3_obj.copy(EMAIL_BUCKET_NAME, 'processed/%s' % message_id, metadata={'Content-Type': 'text/plain'})
        s3_obj.delete()

    except Exception:
        if s3_obj:
            if source_folder != 'failed':  # If it's already in failed, just leave it there!  Don't delete it!
                logging.info('Moving S3 file into "failed" folder')
                s3_obj.copy(EMAIL_BUCKET_NAME, 'failed/%s' % message_id, metadata={'Content-Type': 'text/plain'})
                s3_obj.delete()
        raise
Example #19
def create_ml_model():

    #
    # 0. Derive datasource/model/evaluation name and IDs -- the same value for all
    #
    now = datetime.utcnow()
    model_name = model_id = 'transactions-%s' % now.strftime('%Y%m%d')
    datasource_name = datasource_id = model_name
    evaluation_name = evaluation_id = model_name

    logging.info('Model/datasource/evaluation name/ID will be %s' % model_name)

    #
    # 1. Query the DB for transactions; write them to a CSV
    #

    logging.info('Querying DB for transactions, writing them to CSV')

    _, temp_fname = tempfile.mkstemp()
    outfile = open(temp_fname, 'wb')
    writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC)

    writer.writerow([
        'id',
        'merchant',
        'date',
        'day_of_week',
        'weekend',
        'amount',
        'created_date',
        'account',
        'verified_by',
    ])

    s = select([transaction_tbl.c.id,
                transaction_tbl.c.merchant,
                transaction_tbl.c.date,
                transaction_tbl.c.amount,
                transaction_tbl.c.created_date,
                account_tbl.c.name.label('account_name'),
                user_tbl.c.name.label('verifier_name')
               ]) \
            .select_from(transaction_tbl
            .join(user_tbl, onclause=transaction_tbl.c.verified_by == user_tbl.c.id)
            .join(account_tbl)) \
            .where(transaction_tbl.c.is_verified)

    conn = engine.connect()
    rs = conn.execute(s)
    for row in rs:
        python_weekday, weekend = ml.weekday_fields(row.date)
        row = [row.id,
               row.merchant,
               row.date,
               python_weekday,
               weekend,
               row.amount,
               row.created_date,
               row.account_name,
               row.verifier_name]
        row = [ml.standardize(f) for f in row]
        writer.writerow(row)

    outfile.close()

    #
    # 2. Upload the CSV into S3
    #

    logging.info('Uploading CSV into S3 bucket')

    infile = open(temp_fname, 'rb')
    s3_conn = boto.connect_s3(s3_full.ACCESS_KEY_ID, s3_full.SECRET_ACCESS_KEY)
    bucket = s3_conn.get_bucket(ML_BUCKET_NAME)
    s3_key = bucket.new_key('%s.csv' % model_name)
    s3_key.metadata = {'Content-Type': 'text/csv'}
    s3_key.set_contents_from_file(infile)

    infile.close()
    os.remove(temp_fname)

    #
    # 3. Create an ML datasource from the CSV
    #

    logging.info('Creating ML datasource from the CSV')

    ml_conn = boto.connect_machinelearning(ml_full.ACCESS_KEY_ID, ml_full.SECRET_ACCESS_KEY)
    try:
        ml_conn.delete_data_source(data_source_id=model_name)
    except boto.machinelearning.exceptions.ResourceNotFoundException:
        pass

    # Obtained by using the AWS console UI
    transactions_schema = '''
    {
      "version" : "1.0",
      "rowId" : "id",
      "rowWeight" : null,
      "targetAttributeName" : "verified_by",
      "dataFormat" : "CSV",
      "dataFileContainsHeader" : true,
      "attributes" : [ {
        "attributeName" : "id",
        "attributeType" : "CATEGORICAL"
      }, {
        "attributeName" : "merchant",
        "attributeType" : "TEXT"
      }, {
        "attributeName" : "date",
        "attributeType" : "CATEGORICAL"
      }, {
        "attributeName" : "day_of_week",
        "attributeType" : "CATEGORICAL"
      }, {
        "attributeName" : "weekend",
        "attributeType" : "BINARY"
      }, {
        "attributeName" : "amount",
        "attributeType" : "NUMERIC"
      }, {
        "attributeName" : "created_date",
        "attributeType" : "TEXT"
      }, {
        "attributeName" : "account",
        "attributeType" : "CATEGORICAL"
      }, {
        "attributeName" : "verified_by",
        "attributeType" : "CATEGORICAL"
      } ],
      "excludedAttributeNames" : [ ]
    }
    '''

    ml_conn.create_data_source_from_s3(
        data_source_id=datasource_id,
        data_source_name=datasource_name,
        data_spec={
            'DataLocationS3': 's3://%s/%s.csv' % (ML_BUCKET_NAME, datasource_name),
            'DataSchema': transactions_schema,
        },
        compute_statistics=True)

    #
    # 4. Create an ML model from the datasource
    #

    logging.info('Creating ML model from the datasource')

    ml_conn.create_ml_model(
        ml_model_id=model_id,
        ml_model_name=model_name,
        ml_model_type='MULTICLASS',
        training_data_source_id=datasource_id)

    ml_conn.create_realtime_endpoint(ml_model_id=model_id)

    ml_conn.create_evaluation(
        evaluation_id=evaluation_id,
        ml_model_id=model_id,
        evaluation_data_source_id=datasource_id,
        evaluation_name=evaluation_name)

    #
    # 5. Write the name of the just-created model/datasource/evaluation to the ML_MOST_RECENT_TRANSACTIONS_FILENAME key in S3
    #

    logging.info('Writing model name to ML_MOST_RECENT_TRANSACTIONS_FILENAME in S3')

    bucket = s3_conn.get_bucket(ML_BUCKET_NAME)
    k = bucket.get_key(ML_MOST_RECENT_TRANSACTIONS_FILENAME)
    if not k:
        k = bucket.new_key(ML_MOST_RECENT_TRANSACTIONS_FILENAME)
    k.metadata = {'Content-Type': 'text/plain'}
    k.set_contents_from_string(model_name)
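
A hedged sketch of querying the model built above in realtime, assuming its endpoint has finished creating; the attribute values are invented, the keys follow transactions_schema (the rowId 'id' and target 'verified_by' are omitted), and the model ID mirrors the 'transactions-YYYYMMDD' naming used in create_ml_model:

def predict_verifier_sketch(model_id):
    ml_conn = boto.connect_machinelearning(ml_full.ACCESS_KEY_ID, ml_full.SECRET_ACCESS_KEY)
    endpoint = ml_conn.get_ml_model(model_id).get('EndpointInfo', {}).get('EndpointUrl', '')
    record = {                                  # invented example values
        'merchant': 'EXAMPLE GROCERY 123',      # TEXT
        'date': '2016-01-02',                   # CATEGORICAL
        'day_of_week': '5',                     # CATEGORICAL
        'weekend': '1',                         # BINARY
        'amount': '42.50',                      # NUMERIC
        'created_date': '2016-01-02 10:00:00',  # TEXT
        'account': 'joint-checking',            # CATEGORICAL (hypothetical account name)
    }
    prediction = ml_conn.predict(model_id, record, predict_endpoint=endpoint)
    return prediction['Prediction']['predictedLabel']
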