import logging

from google.cloud import storage as gcs


def compose_shards(data, context):
    print(data)
    num_shards = 10
    prefix = 'flights/json/sharded/output'
    outfile = 'flights/json/flights.json'
    # trigger on the last file only
    filename = data['name']
    last_shard = '-%05d-of-%05d' % (num_shards - 1, num_shards)
    if prefix in filename and last_shard in filename:
        # verify that all shards exist
        prefix = filename.replace(last_shard, '')
        client = gcs.Client()
        bucket = client.bucket(data['bucket'])
        blobs = []
        for shard in range(num_shards):
            # shards are numbered 0 .. num_shards - 1
            sfile = '%s-%05d-of-%05d' % (prefix, shard, num_shards)
            blob = bucket.blob(sfile)
            if not blob.exists():
                # this causes a retry in 60s
                raise ValueError('Shard {} not present'.format(sfile))
            blobs.append(blob)
        # all shards exist, so compose
        bucket.blob(outfile).compose(blobs)
        logging.info('Successfully created {}'.format(outfile))
        for blob in blobs:
            blob.delete()
        logging.info('Deleted {} shards'.format(len(blobs)))
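# A minimal local smoke test (not from the original source): GCS object
# finalize events carry the 'bucket' and 'name' fields read above, so a
# hypothetical event for the last shard looks like this. The bucket name
# below is made up for illustration.
if __name__ == '__main__':
    fake_event = {
        'bucket': 'my-flights-bucket',
        'name': 'flights/json/sharded/output-00009-of-00010',
    }
    compose_shards(fake_event, context=None)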
import json
import re

import numpy as np
import tensorflow as tf
from google.cloud.storage import blob, bucket, client


def store_eval(job_dir, results):
    tf.logging.info('job_dir: %s' % job_dir)
    job_info = re.search(r'gs://(monorail-.+)-mlengine/(spam_trainer_\d+)',
                         job_dir)

    # Only upload eval data if this is not being run locally.
    if job_info:
        project = job_info.group(1)
        job_name = job_info.group(2)

        tf.logging.info('project: %s' % project)
        tf.logging.info('job_name: %s' % job_name)

        client_obj = client.Client(project=project)
        bucket_name = '%s-mlengine' % project
        bucket_obj = bucket.Bucket(client_obj, bucket_name)

        bucket_obj.blob = blob.Blob(job_name + '/eval_data.json', bucket_obj)
        # Cast numpy float32 values to native Python floats so they are
        # JSON-serializable.
        for key, value in results[0].items():
            if isinstance(value, np.float32):
                results[0][key] = value.item()

        bucket_obj.blob.upload_from_string(json.dumps(results[0]),
                                           content_type='application/json')
    else:
        tf.logging.error('Could not find bucket "%s" to output evaluation to.'
                         % job_dir)
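# For illustration (not from the original source): the pattern above matches a
# Cloud ML Engine job directory of this shape. The project and job id below
# are hypothetical.
_example = re.search(r'gs://(monorail-.+)-mlengine/(spam_trainer_\d+)',
                     'gs://monorail-staging-mlengine/spam_trainer_1552074992')
assert _example.groups() == ('monorail-staging', 'spam_trainer_1552074992')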
import json
import os
import re

import tensorflow as tf
from google.cloud.storage import blob, bucket, client


def store_component_conversion(job_dir, data):
    tf.logging.info('job_dir: %s' % job_dir)
    job_info = re.search(r'gs://(monorail-.+)-mlengine/(component_trainer_\d+)',
                         job_dir)

    # Check if training is being done on GAE or locally.
    if job_info:
        project = job_info.group(1)
        job_name = job_info.group(2)

        client_obj = client.Client(project=project)
        bucket_name = '%s-mlengine' % project
        bucket_obj = bucket.Bucket(client_obj, bucket_name)

        bucket_obj.blob = blob.Blob(job_name + '/component_index.json',
                                    bucket_obj)
        bucket_obj.blob.upload_from_string(json.dumps(data),
                                           content_type='application/json')
    else:
        # Running locally: create each directory level of job_dir if it does
        # not already exist.
        paths = job_dir.split('/')
        for y in range(1, len(paths)):
            if not os.path.exists('/'.join(paths[:y + 1])):
                os.makedirs('/'.join(paths[:y + 1]))
        with open(job_dir + '/component_index.json', 'w') as f:
            f.write(json.dumps(data))
def __init__(self, config: types.Config, override_project: Optional[str] = None):
    self._project = override_project or config.project
    self._config = config
    # Build credentials from the stored user info, which may describe either a
    # service account or an authorized end user.
    if config.user_info.get('type') == 'service_account':
        creds = service_account.Credentials.from_service_account_info(
            config.user_info, scopes=constants.SCOPES)
    else:
        creds = google_credentials.Credentials.from_authorized_user_info(
            config.user_info, scopes=constants.SCOPES)
    # Wrap the credentials in an authorized session and hand that session to
    # the GCS client.
    self._session = google_auth_requests.AuthorizedSession(creds)
    self._storage = storage_client.Client(project=self._project,
                                          _http=self._session)
def CompareAccuracy(args):
    client_obj = client.Client(project=args.project)
    bucket_name = '%s-mlengine' % args.project
    bucket_obj = bucket.Bucket(client_obj, bucket_name)

    model1_auc, model1_auc_pr = get_auc(args.model1, bucket_obj)
    print('%s:\nAUC: %f\tAUC Precision/Recall: %f\n'
          % (args.model1, model1_auc, model1_auc_pr))

    model2_auc, model2_auc_pr = get_auc(args.model2, bucket_obj)
    print('%s:\nAUC: %f\tAUC Precision/Recall: %f'
          % (args.model2, model2_auc, model2_auc_pr))
def make_top_words_list(job_dir):
    """Returns the top (most common) words in the entire dataset for component
    prediction. If a file is already stored in GCS containing these words, the
    words from the file are simply returned. Otherwise, the most common words
    are determined and written to GCS, before being returned.

    Returns:
      A list of the most common words in the dataset (the number of them
      determined by ml_helpers.COMPONENT_FEATURES).
    """
    credentials = GoogleCredentials.get_application_default()
    storage = discovery.build('storage', 'v1', credentials=credentials)
    objects = storage.objects()

    subpaths = re.match(r'gs://(monorail-.*)-mlengine/(component_trainer_\d+)',
                        job_dir)
    if subpaths:
        project_id = subpaths.group(1)
        trainer_folder = subpaths.group(2)
    else:
        project_id = 'monorail-prod'

    storage_bucket = project_id + '.appspot.com'
    request = objects.list(bucket=storage_bucket,
                           prefix='component_training_data')
    response = trainer.dataset.make_api_request(request)
    items = response.get('items')
    csv_filepaths = [b.get('name') for b in items]

    final_string = ''
    for word in parse_words(csv_filepaths, objects, storage_bucket, project_id):
        final_string += word + '\n'

    if subpaths:
        client_obj = client.Client(project=project_id)
        bucket_obj = bucket.Bucket(client_obj, project_id + '-mlengine')
        bucket_obj.blob = google.cloud.storage.blob.Blob(
            trainer_folder + '/' + TOP_WORDS, bucket_obj)
        bucket_obj.blob.upload_from_string(final_string,
                                           content_type='text/plain')

    return final_string.split()
def Predict(args):
    # `credentials`, MODEL_NAME, COMPONENT_FEATURES, ml_helpers, getTopWords,
    # and read_indexes are defined elsewhere in the surrounding module.
    ml = googleapiclient.discovery.build('ml', 'v1', credentials=credentials)

    with open(args.content) as f:
        content = f.read()

    project_ID = 'projects/%s' % args.project
    full_model_name = '%s/models/%s' % (project_ID, MODEL_NAME)
    model_request = ml.projects().models().get(name=full_model_name)
    model_response = model_request.execute()

    version_name = model_response['defaultVersion']['name']
    model_name = 'component_trainer_' + re.search(r'v_(\d+)',
                                                  version_name).group(1)

    client_obj = client.Client(project=args.project)
    bucket_name = '%s-mlengine' % args.project
    bucket_obj = bucket.Bucket(client_obj, bucket_name)

    instance = ml_helpers.GenerateFeaturesRaw(
        [content], COMPONENT_FEATURES, getTopWords(bucket_name, model_name))

    request = ml.projects().predict(
        name=full_model_name,
        body={'instances': [{'inputs': instance['word_features']}]})

    try:
        response = request.execute()
        bucket_obj.blob = blob.Blob('%s/component_index.json' % model_name,
                                    bucket_obj)
        component_index = bucket_obj.blob.download_as_string()
        component_index_dict = json.loads(component_index)
        return read_indexes(response, component_index_dict)
    except googleapiclient.errors.HttpError as err:
        print('There was an error. Check the details:')
        print(err._get_reason())
header = "failed state: " exit_cond = "exit" total = str(datetime.timedelta(seconds=(now - start))) print("Time: " + total + ", " + header + payload) if exit_cond == "exit": exit() elif exit_cond == "break": break time.sleep(5) # job succeeded # append results into one consistent file # https://medium.com/google-cloud/how-to-write-to-a-single-shard-on-google-cloud-storage-efficiently-using-cloud-dataflow-and-cloud-3aeef1732325 files = list_blobs_with_prefix(BUCKET_NAME, prefix=OUTPUT_DIR) num_shards = find_max_shards(files) client = gcs.Client() bucket = client.bucket(BUCKET_NAME) blobs = [] for shard in range(num_shards): sfile = '%s-%05d-of-%05d' % (OUTPUT_DIR + "/" + OUTPUT_ID, shard, num_shards) blob = bucket.blob(sfile) if not blob.exists(): # this causes a retry in 60s raise ValueError('Shard {} not present'.format(sfile)) blobs.append(blob) bucket.blob(OUTPUT_NAME).compose(blobs) # List the new file files = list_blobs_with_prefix(BUCKET_NAME, prefix=OUTPUT_NAME) print(files)