import logging

import google.cloud.storage as gcs


def compose_shards(data, context):
    print(data)

    num_shards = 10
    prefix = 'flights/json/sharded/output'
    outfile = 'flights/json/flights.json'
    # trigger on the last file only
    filename = data['name']

    last_shard = '-%05d-of-%05d' % (num_shards - 1, num_shards)
    if (prefix in filename and last_shard in filename):
        # verify that all 10 shards exist
        prefix = filename.replace(last_shard, '')
        client = gcs.Client()
        bucket = client.bucket(data['bucket'])
        blobs = []
        for shard in range(num_shards):
            # shard numbers are zero-based, so the loop runs 0 .. num_shards-1
            sfile = '%s-%05d-of-%05d' % (prefix, shard, num_shards)
            blob = bucket.blob(sfile)
            if not blob.exists():
                # this causes a retry in 60s
                raise ValueError('Shard {} not present'.format(sfile))
            blobs.append(blob)
        # all shards exist, so compose
        bucket.blob(outfile).compose(blobs)
        logging.info('Successfully created {}'.format(outfile))
        for blob in blobs:
            blob.delete()
        logging.info('Deleted {} shards'.format(len(blobs)))
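A quick way to exercise this function locally is to hand it a fake finalize event; the sketch below is hedged, with 'my-bucket' as a hypothetical bucket name, and it assumes all ten shards already exist in that bucket. Note that GCS compose accepts at most 32 source objects per call, so this pattern only works while num_shards stays at or below 32.

# Hedged usage sketch: simulate the GCS object-finalize event for the last
# shard. 'my-bucket' is a hypothetical bucket name.
event = {
    'bucket': 'my-bucket',
    'name': 'flights/json/sharded/output-00009-of-00010',
}
compose_shards(event, None)  # the context argument is unused above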
Example #2
import json
import re

import numpy as np
import tensorflow as tf  # TF 1.x (uses tf.logging)
from google.cloud.storage import blob, bucket, client


def store_eval(job_dir, results):

    tf.logging.info('job_dir: %s' % job_dir)
    job_info = re.search(r'gs://(monorail-.+)-mlengine/(spam_trainer_\d+)',
                         job_dir)

    # Only upload eval data if this is not being run locally.
    if job_info:
        project = job_info.group(1)
        job_name = job_info.group(2)

        tf.logging.info('project: %s' % project)
        tf.logging.info('job_name: %s' % job_name)

        client_obj = client.Client(project=project)
        bucket_name = '%s-mlengine' % project
        bucket_obj = bucket.Bucket(client_obj, bucket_name)

        # Use a local variable rather than clobbering bucket_obj.blob (which
        # shadows the Bucket.blob method).
        eval_blob = blob.Blob(job_name + '/eval_data.json', bucket_obj)
        for key, value in results[0].items():
            # json.dumps cannot serialize numpy scalars, so convert them first.
            if isinstance(value, np.float32):
                results[0][key] = value.item()

        eval_blob.upload_from_string(json.dumps(results[0]),
                                     content_type='application/json')

    else:
        tf.logging.error('Could not find bucket "%s" to output evaluation to.' %
                         job_dir)
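The key-by-key np.float32 conversion above is needed because json.dumps cannot serialize numpy scalars. A more general alternative (a sketch, not part of the original code) is a custom encoder that handles every numpy scalar type:

import json

import numpy as np


class NumpyEncoder(json.JSONEncoder):
    """Sketch: falls back to .item() for any numpy scalar, not just float32."""
    def default(self, obj):
        if isinstance(obj, np.generic):
            return obj.item()  # convert a numpy scalar to its Python equivalent
        return super(NumpyEncoder, self).default(obj)


# json.dumps(results[0], cls=NumpyEncoder) would then handle numpy values directly.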
Example #3
def store_component_conversion(job_dir, data):

    tf.logging.info('job_dir: %s' % job_dir)
    job_info = re.search(r'gs://(monorail-.+)-mlengine/(component_trainer_\d+)',
                         job_dir)

    # Check if training is being done on GAE or locally.
    if job_info:
        project = job_info.group(1)
        job_name = job_info.group(2)

        client_obj = client.Client(project=project)
        bucket_name = '%s-mlengine' % project
        bucket_obj = bucket.Bucket(client_obj, bucket_name)

        index_blob = blob.Blob(job_name + '/component_index.json', bucket_obj)
        index_blob.upload_from_string(json.dumps(data),
                                      content_type='application/json')

    else:
        # Create job_dir and any missing intermediate directories.
        if not os.path.exists(job_dir):
            os.makedirs(job_dir)
        with open(job_dir + '/component_index.json', 'w') as f:
            f.write(json.dumps(data))
Example #4
File: clients.py Project: nya3jp/livesite
def __init__(self, config: types.Config, override_project: Optional[str] = None):
    self._project = override_project or config.project
    self._config = config
    # Choose the credential class based on the stored user_info type.
    if config.user_info.get('type') == 'service_account':
        creds = service_account.Credentials.from_service_account_info(
            config.user_info, scopes=constants.SCOPES)
    else:
        creds = google_credentials.Credentials.from_authorized_user_info(
            config.user_info, scopes=constants.SCOPES)
    self._session = google_auth_requests.AuthorizedSession(creds)
    self._storage = storage_client.Client(project=self._project, _http=self._session)
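A hedged construction sketch: types.Config and the enclosing class name are project-specific, so both are assumptions here; the user_info dict is shaped like a standard Google service-account JSON.

# Hypothetical usage; `StorageClients` stands in for the (unshown) class that
# owns this __init__, and types.Config is assumed to accept these fields.
config = types.Config(
    project='my-project',                  # hypothetical project ID
    user_info={'type': 'service_account',  # selects the first branch above
               # ...remaining service-account JSON fields elided...
               })
clients = StorageClients(config)
clients = StorageClients(config, override_project='other-project')  # optional override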
Example #5
File: spam.py Project: xinghun61/infra
def CompareAccuracy(args):
    client_obj = client.Client(project=args.project)
    bucket_name = '%s-mlengine' % args.project
    bucket_obj = bucket.Bucket(client_obj, bucket_name)

    model1_auc, model1_auc_pr = get_auc(args.model1, bucket_obj)
    print('%s:\nAUC: %f\tAUC Precision/Recall: %f\n' %
          (args.model1, model1_auc, model1_auc_pr))

    model2_auc, model2_auc_pr = get_auc(args.model2, bucket_obj)
    print('%s:\nAUC: %f\tAUC Precision/Recall: %f' %
          (args.model2, model2_auc, model2_auc_pr))
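get_auc is not part of this listing; given that Example #2 serializes the estimator eval results to '<job_name>/eval_data.json', a plausible sketch is the inverse read, with 'auc' and 'auc_precision_recall' as assumed metric keys:

def get_auc(model_name, bucket_obj):
    # Hypothetical helper: read back the eval_data.json written by store_eval
    # in Example #2. The metric key names here are assumptions.
    eval_blob = blob.Blob('%s/eval_data.json' % model_name, bucket_obj)
    data = json.loads(eval_blob.download_as_string())
    return data['auc'], data['auc_precision_recall']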
Example #6
def make_top_words_list(job_dir):
    """Returns the top (most common) words in the entire dataset for component
  prediction. If a file is already stored in GCS containing these words, the
  words from the file are simply returned. Otherwise, the most common words are
  determined and written to GCS, before being returned.

  Returns:
    A list of the most common words in the dataset (the number of them
    determined by ml_helpers.COMPONENT_FEATURES).
  """

    credentials = GoogleCredentials.get_application_default()
    storage = discovery.build('storage', 'v1', credentials=credentials)
    objects = storage.objects()

    subpaths = re.match(r'gs://(monorail-.*)-mlengine/(component_trainer_\d+)',
                        job_dir)

    if subpaths:
        project_id = subpaths.group(1)
        trainer_folder = subpaths.group(2)
    else:
        project_id = 'monorail-prod'

    storage_bucket = project_id + '.appspot.com'
    request = objects.list(bucket=storage_bucket,
                           prefix='component_training_data')

    response = trainer.dataset.make_api_request(request)

    items = response.get('items', [])
    csv_filepaths = [b.get('name') for b in items]

    words = parse_words(csv_filepaths, objects, storage_bucket, project_id)
    final_string = ''.join(word + '\n' for word in words)

    if subpaths:
        client_obj = client.Client(project=project_id)
        bucket_obj = bucket.Bucket(client_obj, project_id + '-mlengine')

        top_words_blob = google.cloud.storage.blob.Blob(
            trainer_folder + '/' + TOP_WORDS, bucket_obj)
        top_words_blob.upload_from_string(final_string,
                                          content_type='text/plain')
    return final_string.split()
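The docstring promises to reuse an already-stored word file, but the body shown always recomputes the list. A hedged sketch of what that early-return check could look like, placed before the expensive parse_words pass (the blob path and TOP_WORDS are taken from the upload above):

# Assumed caching path, not shown here: return the stored words if the
# top-words blob already exists in the -mlengine bucket.
if subpaths:
    client_obj = client.Client(project=project_id)
    bucket_obj = bucket.Bucket(client_obj, project_id + '-mlengine')
    cached = google.cloud.storage.blob.Blob(
        trainer_folder + '/' + TOP_WORDS, bucket_obj)
    if cached.exists():
        return cached.download_as_string().split()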
Example #7
def Predict(args):
    # `credentials`, MODEL_NAME, and COMPONENT_FEATURES are module-level names
    # from the source file and are not defined within this snippet.
    ml = googleapiclient.discovery.build('ml', 'v1', credentials=credentials)

    with open(args.content) as f:
        content = f.read()

    project_id = 'projects/%s' % args.project
    full_model_name = '%s/models/%s' % (project_id, MODEL_NAME)
    model_request = ml.projects().models().get(name=full_model_name)
    model_response = model_request.execute()

    version_name = model_response['defaultVersion']['name']

    model_name = 'component_trainer_' + re.search(r'v_(\d+)',
                                                  version_name).group(1)

    client_obj = client.Client(project=args.project)
    bucket_name = '%s-mlengine' % args.project
    bucket_obj = bucket.Bucket(client_obj, bucket_name)

    instance = ml_helpers.GenerateFeaturesRaw([content], COMPONENT_FEATURES,
                                              getTopWords(
                                                  bucket_name, model_name))

    request = ml.projects().predict(
        name=full_model_name,
        body={'instances': [{
            'inputs': instance['word_features']
        }]})

    try:
        response = request.execute()

        index_blob = blob.Blob('%s/component_index.json' % model_name,
                               bucket_obj)
        component_index = index_blob.download_as_string()
        component_index_dict = json.loads(component_index)

        return read_indexes(response, component_index_dict)

    except googleapiclient.errors.HttpError as err:
        print('There was an error. Check the details:')
        print(err._get_reason())
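getTopWords is also not shown; since Example #6 uploads the newline-separated word list to '<trainer_folder>/<TOP_WORDS>' in the '<project>-mlengine' bucket, a plausible sketch (names inferred from that example) is:

def getTopWords(bucket_name, model_name):
    # Hypothetical helper: fetch the word list written by make_top_words_list
    # in Example #6, stored one word per line.
    client_obj = client.Client()
    bucket_obj = bucket.Bucket(client_obj, bucket_name)
    words_blob = blob.Blob('%s/%s' % (model_name, TOP_WORDS), bucket_obj)
    return words_blob.download_as_string().split()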
Example #8
            header = "failed state: "
            exit_cond = "exit"
        total = str(datetime.timedelta(seconds=(now - start)))
        print("Time: " + total + ", " + header + payload)
        if exit_cond == "exit":
            exit()
        elif exit_cond == "break":
            break
        time.sleep(5)

    # job succeeded
    # append results into one consistent file
    # https://medium.com/google-cloud/how-to-write-to-a-single-shard-on-google-cloud-storage-efficiently-using-cloud-dataflow-and-cloud-3aeef1732325
    files = list_blobs_with_prefix(BUCKET_NAME, prefix=OUTPUT_DIR)
    num_shards = find_max_shards(files)
    client = gcs.Client()
    bucket = client.bucket(BUCKET_NAME)
    blobs = []
    for shard in range(num_shards):
        sfile = '%s-%05d-of-%05d' % (OUTPUT_DIR + "/" + OUTPUT_ID, shard,
                                     num_shards)
        blob = bucket.blob(sfile)
        if not blob.exists():
            # this causes a retry in 60s
            raise ValueError('Shard {} not present'.format(sfile))
        blobs.append(blob)
    bucket.blob(OUTPUT_NAME).compose(blobs)

    # List the new file
    files = list_blobs_with_prefix(BUCKET_NAME, prefix=OUTPUT_NAME)
    print(files)
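The list_blobs_with_prefix and find_max_shards helpers used above are not shown. Below are minimal sketches against the public google-cloud-storage API, with find_max_shards inferring the shard count from the '-of-NNNNN' filename suffix used throughout this example:

def list_blobs_with_prefix(bucket_name, prefix):
    # Sketch: return the names of all blobs under the given prefix.
    storage_client = gcs.Client()
    return [b.name for b in storage_client.list_blobs(bucket_name, prefix=prefix)]


def find_max_shards(files):
    # Sketch: read the shard count from names like 'output-00003-of-00010'.
    counts = [int(f.rsplit('-of-', 1)[-1]) for f in files if '-of-' in f]
    return max(counts) if counts else 0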