def store_eval(job_dir, results):
  """Upload evaluation results to the GCS bucket derived from job_dir.

  Args:
    job_dir: Training job directory. Expected to look like
      'gs://<project>-mlengine/spam_trainer_<timestamp>'. If it does not
      match that pattern (e.g. a local run), nothing is uploaded.
    results: Sequence whose first element is a dict of evaluation metrics.
      np.float32 values are converted in place to native floats so the dict
      can be JSON-serialized.
  """
  tf.logging.info('job_dir: %s' % job_dir)
  # Raw string so the \d escape is not subject to string-escape rules.
  job_info = re.search(r'gs://(monorail-.+)-mlengine/(spam_trainer_\d+)',
                       job_dir)

  # Only upload eval data if this is not being run locally.
  if job_info:
    project = job_info.group(1)
    job_name = job_info.group(2)

    tf.logging.info('project: %s' % project)
    tf.logging.info('job_name: %s' % job_name)

    client_obj = client.Client(project=project)
    bucket_name = '%s-mlengine' % project
    bucket_obj = bucket.Bucket(client_obj, bucket_name)

    bucket_obj.blob = blob.Blob(job_name + '/eval_data.json', bucket_obj)
    for key, value in results[0].items():
      # np.float32 is not JSON-serializable; convert to a native float.
      if isinstance(value, np.float32):
        results[0][key] = value.item()

    bucket_obj.blob.upload_from_string(json.dumps(results[0]),
                                       content_type='application/json')
  else:
    # Typo fix: "evalution" -> "evaluation".
    tf.logging.error('Could not find bucket "%s" to output evaluation to.'
                     % job_dir)
def store_component_conversion(job_dir, data):
  """Store the component index mapping either in GCS or on local disk.

  Args:
    job_dir: Training job directory. If it matches
      'gs://<project>-mlengine/component_trainer_<timestamp>' the mapping is
      uploaded to GCS; otherwise it is written under job_dir locally.
    data: JSON-serializable component index mapping.
  """
  tf.logging.info('job_dir: %s' % job_dir)
  # Raw string so the \d escape is not subject to string-escape rules.
  job_info = re.search(r'gs://(monorail-.+)-mlengine/(component_trainer_\d+)',
                       job_dir)

  # Check if training is being done on GAE or locally.
  if job_info:
    project = job_info.group(1)
    job_name = job_info.group(2)

    client_obj = client.Client(project=project)
    bucket_name = '%s-mlengine' % project
    bucket_obj = bucket.Bucket(client_obj, bucket_name)

    bucket_obj.blob = blob.Blob(job_name + '/component_index.json',
                                bucket_obj)
    bucket_obj.blob.upload_from_string(json.dumps(data),
                                       content_type='application/json')
  else:
    # Local run: os.makedirs creates every intermediate directory in one
    # call, replacing the original manual per-path-segment loop.
    if not os.path.exists(job_dir):
      os.makedirs(job_dir)
    with open(job_dir + '/component_index.json', 'w') as f:
      f.write(json.dumps(data))
def CompareAccuracy(args):
  """Print AUC and AUC Precision/Recall for the two requested models."""
  gcs_client = client.Client(project=args.project)
  gcs_bucket = bucket.Bucket(gcs_client, '%s-mlengine' % args.project)

  # Report each model in turn; the first entry keeps its trailing newline so
  # the printed output matches the original spacing exactly.
  reports = (
      (args.model1, '%s:\nAUC: %f\tAUC Precision/Recall: %f\n'),
      (args.model2, '%s:\nAUC: %f\tAUC Precision/Recall: %f'),
  )
  for model_name, template in reports:
    auc, auc_pr = get_auc(model_name, gcs_bucket)
    print(template % (model_name, auc, auc_pr))
def make_top_words_list(job_dir):
  """Returns the top (most common) words in the entire dataset for component
  prediction. If a file is already stored in GCS containing these words, the
  words from the file are simply returned. Otherwise, the most common words
  are determined and written to GCS, before being returned.

  Args:
    job_dir: Training job directory; when it matches
      'gs://<project>-mlengine/component_trainer_<timestamp>' the resulting
      word list is also uploaded to that trainer folder in GCS.

  Returns:
    A list of the most common words in the dataset (the number of them
    determined by ml_helpers.COMPONENT_FEATURES).
  """
  credentials = GoogleCredentials.get_application_default()
  storage = discovery.build('storage', 'v1', credentials=credentials)
  objects = storage.objects()

  # Raw string so the \d escape is not subject to string-escape rules.
  subpaths = re.match(r'gs://(monorail-.*)-mlengine/(component_trainer_\d+)',
                      job_dir)
  if subpaths:
    project_id = subpaths.group(1)
    trainer_folder = subpaths.group(2)
  else:
    project_id = 'monorail-prod'

  storage_bucket = project_id + '.appspot.com'
  request = objects.list(bucket=storage_bucket,
                         prefix='component_training_data')
  response = trainer.dataset.make_api_request(request)
  items = response.get('items')
  csv_filepaths = [b.get('name') for b in items]

  # Build the newline-terminated word list in one pass; ''.join avoids the
  # quadratic cost of repeated string concatenation in a loop.
  final_string = ''.join(
      word + '\n'
      for word in parse_words(csv_filepaths, objects, storage_bucket,
                              project_id))

  if subpaths:
    client_obj = client.Client(project=project_id)
    bucket_obj = bucket.Bucket(client_obj, project_id + '-mlengine')
    bucket_obj.blob = google.cloud.storage.blob.Blob(
        trainer_folder + '/' + TOP_WORDS, bucket_obj)
    bucket_obj.blob.upload_from_string(final_string,
                                       content_type='text/plain')

  return final_string.split()
def _main(module, name, state, location, project, storage_class, force, acl,
          reset_acl, default_acl, reset_default_acl, iam_policy,
          reset_iam_policy):
    """Ensure a GCS bucket matches the requested state and report changes.

    Args:
        module: Ansible-style module object; provides check_mode and the
            exit_json() reporting call.
        name: Bucket name to look up / create / delete.
        state: One of 'present', 'absent', or 'get'; anything else exits
            with failed=True.
        location: Bucket location, applied only when the bucket is created.
        project: Project used for bucket creation.
        storage_class: Storage class, applied only at creation time.
        force: Passed through to bucket.delete() when state == 'absent'.
        acl, reset_acl: Desired bucket ACL entries and whether to reset the
            existing ACL first (handled by _adjust_acl).
        default_acl, reset_default_acl: Same, for the default object ACL.
        iam_policy, reset_iam_policy: Desired IAM policy handling (handled
            by _adjust_iam).

    Exits via module.exit_json() with the change summary; does not return
    normally.
    """
    storage_client = storage.Client()
    changed = False
    bucket_obj = storage_client.lookup_bucket(name)
    # Default "no change" diffs so the result dict below is well-formed even
    # for states that never adjust permissions.
    iam_policy_diff = acl_diff = default_acl_diff = AclDiff(False, [], [])
    final_policy = {}
    if state == 'present':
        if not bucket_obj:
            changed = True
            bucket_obj = bucket.Bucket(storage_client, name)
            bucket_obj.location = location
            bucket_obj.storage_class = storage_class
            # In check mode, report the would-be creation without doing it.
            if not module.check_mode:
                bucket_obj.create(project=project)
        # adjust permissions
        acl_diff = _adjust_acl(module, bucket_obj.acl, acl, reset_acl)
        default_acl_diff = _adjust_acl(module, bucket_obj.default_object_acl,
                                       default_acl, reset_default_acl)
        final_policy, iam_policy_diff = _adjust_iam(module, bucket_obj,
                                                    iam_policy,
                                                    reset_iam_policy)
    elif state == 'absent':
        if bucket_obj:
            changed = True
            # Reload ACLs so the final report reflects the bucket's state at
            # deletion time.
            bucket_obj.acl.reload()
            bucket_obj.default_object_acl.reload()
            if not module.check_mode:
                bucket_obj.delete(force)
    elif state == 'get':
        pass
    else:
        module.exit_json(failed=True, error="Unexpected state '%s'" % state)
    # Any permission diff also counts as a change.
    changed = any([changed, acl_diff.changed, default_acl_diff.changed,
                   iam_policy_diff.changed])
    result = {
        'changed': changed,
        'bucket': _bucket_repr(bucket_obj, final_policy),
        'changes': {
            'acl': acl_diff._asdict(),
            'default_acl': default_acl_diff._asdict(),
            'iam_policy': iam_policy_diff._asdict(),
        },
        'state': state,
    }
    module.exit_json(**result)
def Predict(args):
  """Request a component prediction from the deployed ML Engine model.

  Args:
    args: Parsed CLI args with .content (path to a text file to classify)
      and .project (GCP project name).

  Returns:
    The components resolved via read_indexes() from the prediction response
    and the model's component index, or None if the prediction request
    failed (the HTTP error is printed).
  """
  ml = googleapiclient.discovery.build('ml', 'v1', credentials=credentials)

  with open(args.content) as f:
    content = f.read()

  project_ID = 'projects/%s' % args.project
  full_model_name = '%s/models/%s' % (project_ID, MODEL_NAME)
  model_request = ml.projects().models().get(name=full_model_name)
  model_response = model_request.execute()

  version_name = model_response['defaultVersion']['name']
  # Derive the trainer folder name from the deployed version, e.g.
  # 'v_1234' -> 'component_trainer_1234'. Raw string for the \d escape.
  model_name = 'component_trainer_' + re.search(r"v_(\d+)",
                                                version_name).group(1)

  client_obj = client.Client(project=args.project)
  bucket_name = '%s-mlengine' % args.project
  bucket_obj = bucket.Bucket(client_obj, bucket_name)

  instance = ml_helpers.GenerateFeaturesRaw(
      [content], COMPONENT_FEATURES, getTopWords(bucket_name, model_name))

  request = ml.projects().predict(name=full_model_name, body={
      'instances': [{'inputs': instance['word_features']}]
  })

  try:
    response = request.execute()
    bucket_obj.blob = blob.Blob('%s/component_index.json' % model_name,
                                bucket_obj)
    component_index = bucket_obj.blob.download_as_string()
    component_index_dict = json.loads(component_index)
    return read_indexes(response, component_index_dict)
  # 'except X as err' replaces the Python 2-only 'except X, err' syntax;
  # it is valid on both Python 2.6+ and Python 3.
  except googleapiclient.errors.HttpError as err:
    print('There was an error. Check the details:')
    print(err._get_reason())