def app(request): _app.config.from_object("peregrine.test_settings") app_init(_app) sheepdog_blueprint = sheepdog.blueprint.create_blueprint('submission') _app.register_blueprint(sheepdog_blueprint, url_prefix='/v0/submission') _app.logger.info('Initializing IndexClient') _app.index_client = IndexClient( _app.config['INDEX_CLIENT']['host'], version=_app.config['INDEX_CLIENT']['version'], auth=_app.config['INDEX_CLIENT']['auth']) try: _app.logger.info('Initializing Auth driver') except Exception: _app.logger.exception("Couldn't initialize auth, continuing anyway") _app.logger.setLevel(os.environ.get("GDC_LOG_LEVEL", "WARNING")) _app.jwt_public_keys = { _app.config['USER_API']: { 'key-test': utils.read_file('resources/keys/test_public_key.pem') } } return _app
def dist_get_record(record):
    # Sort the list of distributed ID services:
    # services whose hints match the request come first,
    # followed by those whose hints do not match.
    sorted_dist = sorted(
        blueprint.dist,
        key=lambda k: hint_match(record, k['hints']),
        reverse=True,
    )

    for indexd in sorted_dist:
        try:
            if indexd['type'] == "doi":
                fetcher_client = DOIClient(baseurl=indexd['host'])
                res = fetcher_client.get(record)
            elif indexd['type'] == "dos":
                fetcher_client = DOSClient(baseurl=indexd['host'])
                res = fetcher_client.get(record)
            else:
                fetcher_client = IndexClient(baseurl=indexd['host'])
                res = fetcher_client.global_get(record, no_dist=True)
        except Exception:
            # A lot of things can go wrong with the remote get; skip this
            # service and try the next one.
            continue

        if res:
            json = res.to_json()
            json['from_index_service'] = {
                'host': indexd['host'],
                'name': indexd['name'],
            }
            return json

    raise IndexNoRecordFound('no record found')
def db_init(app):
    app.logger.info('Initializing PsqlGraph driver')
    app.db = PsqlGraphDriver(
        host=app.config['PSQLGRAPH']['host'],
        user=app.config['PSQLGRAPH']['user'],
        password=app.config['PSQLGRAPH']['password'],
        database=app.config['PSQLGRAPH']['database'],
        set_flush_timestamps=True,
    )

    app.userdb = SQLAlchemyDriver(app.config['PSQL_USER_DB_CONNECTION'])
    flask_scoped_session(app.userdb.Session, app)

    app.oauth2 = OAuth2Client(**app.config['OAUTH2'])

    app.logger.info('Initializing Indexd driver')
    app.signpost = IndexClient(
        app.config['SIGNPOST']['host'],
        version=app.config['SIGNPOST']['version'],
        auth=app.config['SIGNPOST']['auth'],
    )

    try:
        app.logger.info('Initializing Auth driver')
        app.auth = AuthDriver(
            app.config["AUTH_ADMIN_CREDS"], app.config["INTERNAL_AUTH"]
        )
    except Exception:
        app.logger.exception("Couldn't initialize auth, continuing anyway")
def db_init(app): app.logger.info("Initializing PsqlGraph driver") connect_args = {} if app.config.get("PSQLGRAPH") and app.config["PSQLGRAPH"].get("sslmode"): connect_args["sslmode"] = app.config["PSQLGRAPH"]["sslmode"] app.db = PsqlGraphDriver( host=app.config["PSQLGRAPH"]["host"], user=app.config["PSQLGRAPH"]["user"], password=app.config["PSQLGRAPH"]["password"], database=app.config["PSQLGRAPH"]["database"], set_flush_timestamps=True, connect_args=connect_args, isolation_level=app.config["PSQLGRAPH"].get( "isolation_level", "READ_COMMITTED" ), ) if app.config.get("AUTO_MIGRATE_DATABASE"): migrate_database(app) app.oauth_client = oauth2_client.OAuthClient(**app.config["OAUTH2"]) app.logger.info("Initializing index client") app.index_client = IndexClient( app.config["INDEX_CLIENT"]["host"], version=app.config["INDEX_CLIENT"]["version"], auth=app.config["INDEX_CLIENT"]["auth"], )
def app(request):
    _app.config.from_object("peregrine.test_settings")
    app_init(_app)

    sheepdog_blueprint = sheepdog.blueprint.create_blueprint("submission")
    _app.register_blueprint(sheepdog_blueprint, url_prefix="/v0/submission")

    _app.logger.info("Initializing IndexClient")
    _app.index_client = IndexClient(
        _app.config["INDEX_CLIENT"]["host"],
        version=_app.config["INDEX_CLIENT"]["version"],
        auth=_app.config["INDEX_CLIENT"]["auth"],
    )

    try:
        _app.logger.info("Initializing Auth driver")
    except Exception:
        _app.logger.exception("Couldn't initialize auth, continuing anyway")

    _app.logger.setLevel(os.environ.get("GDC_LOG_LEVEL", "WARNING"))
    _app.jwt_public_keys = {
        _app.config["USER_API"]: {
            "key-test": utils.read_file("resources/keys/test_public_key.pem")
        }
    }
    return _app
def get_indexd_records():
    """
    Get all indexd records.
    """
    results = {}
    indexd_client = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )
    it = indexd_client.list(page_size=1000)

    progress = 0
    for doc in it:
        progress += 1
        results[doc.did] = doc.urls

    return results
def test_hashes(get_request_mock, handle_error_mock):
    from indexclient.client import IndexClient

    input_params = {
        "hashes": {"md5": "00000000000000000000000000000001"},
        "size": "1",
    }
    expected_format = {
        "hash": ["md5:00000000000000000000000000000001"],
        "size": "1",
        "limit": 1,
    }

    with patch("indexclient.client.IndexClient._get") as get_mock:
        client = IndexClient("base_url")
        client.get_with_params(input_params)
        assert get_mock.called

        args, kwargs = get_mock.call_args_list[0]
        assert kwargs["params"] == expected_format
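# The assertions above pin down how IndexClient.get_with_params is expected to
# serialize its input: the "hashes" dict becomes a list of "algorithm:value"
# strings under "hash", and a limit of 1 is added. A minimal sketch of that
# transformation (a hypothetical helper for illustration, not part of
# indexclient itself) could look like:
def _params_to_query(params, limit=1):
    # copy everything except "hashes" verbatim
    query = {k: v for k, v in params.items() if k != "hashes"}
    # flatten {"md5": "<digest>"} into ["md5:<digest>"]
    query["hash"] = [
        "{}:{}".format(algo, value)
        for algo, value in params.get("hashes", {}).items()
    ]
    query["limit"] = limit
    return query

# _params_to_query({"hashes": {"md5": "00000000000000000000000000000001"}, "size": "1"})
# -> {"size": "1", "hash": ["md5:00000000000000000000000000000001"], "limit": 1}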
def db_init(app):
    app.logger.info('Initializing PsqlGraph driver')
    app.db = PsqlGraphDriver(
        host=app.config['PSQLGRAPH']['host'],
        user=app.config['PSQLGRAPH']['user'],
        password=app.config['PSQLGRAPH']['password'],
        database=app.config['PSQLGRAPH']['database'],
        set_flush_timestamps=True,
    )

    app.oauth2 = OAuth2Client(**app.config['OAUTH2'])

    app.logger.info('Initializing Indexd driver')
    app.signpost = IndexClient(
        app.config['SIGNPOST']['host'],
        version=app.config['SIGNPOST']['version'],
        auth=app.config['SIGNPOST']['auth'],
    )
def db_init(app): app.logger.info("Initializing PsqlGraph driver") app.db = PsqlGraphDriver( host=app.config["PSQLGRAPH"]["host"], user=app.config["PSQLGRAPH"]["user"], password=app.config["PSQLGRAPH"]["password"], database=app.config["PSQLGRAPH"]["database"], set_flush_timestamps=True, ) app.oauth2 = OAuth2Client(**app.config["OAUTH2"]) app.logger.info("Initializing Indexd driver") app.index_client = IndexClient( app.config["INDEX_CLIENT"]["host"], version=app.config["INDEX_CLIENT"]["version"], auth=app.config["INDEX_CLIENT"]["auth"], )
def db_init(app): app.logger.info("Initializing PsqlGraph driver") app.db = PsqlGraphDriver( host=app.config["PSQLGRAPH"]["host"], user=app.config["PSQLGRAPH"]["user"], password=app.config["PSQLGRAPH"]["password"], database=app.config["PSQLGRAPH"]["database"], set_flush_timestamps=True, ) if app.config.get("AUTO_MIGRATE_DATABASE"): migrate_database(app) app.oauth_client = oauth2_client.OAuthClient(**app.config["OAUTH2"]) app.logger.info("Initializing index client") app.index_client = IndexClient( app.config["INDEX_CLIENT"]["host"], version=app.config["INDEX_CLIENT"]["version"], auth=app.config["INDEX_CLIENT"]["auth"], )
def __init__(
    self,
    global_config,
    files,
    total_files,
    job_name,
    copied_objects,
    manager_ns,
    bucket=None,
):
    """
    Class constructor

    Args:
        global_config(dict): a configuration, e.g.
            {
                "multi_part_upload_threads": 10,
                "data_chunk_size": 1024 * 1024 * 5,
            }
        files(list(str)): list of files being copied
        total_files(int): total number of files
        job_name(str): copying|indexing
        copied_objects(dict): dictionary of copied files keyed by uuid/file_name
        manager_ns(ManagerNamespace): used for synchronization
        bucket(str): source bucket
    """
    self.bucket = bucket
    self.files = files
    self.total_files = total_files
    self.global_config = global_config
    self.job_name = job_name
    self.copied_objects = copied_objects
    self.manager_ns = manager_ns
    self.indexclient = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )
def __init__(
    self,
    global_config,
    files,
    total_files,
    job_name,
    copied_objects,
    manager_ns,
    bucket=None,
):
    """
    Class constructor

    Args:
        global_config(dict): a configuration, e.g.
            {
                "multi_part_upload_threads": 10,
                "data_chunk_size": 1024 * 1024 * 5,
            }
        files(list(str)): list of files being copied
        total_files(int): total number of files
        job_name(str): copying|indexing
        copied_objects(dict): dictionary of copied files keyed by uuid/file_name
        manager_ns(ManagerNamespace): used for synchronization
        bucket(str): source bucket
    """
    self.bucket = bucket
    self.files = files
    self.total_files = total_files
    self.global_config = global_config
    self.job_name = job_name
    self.copied_objects = copied_objects
    self.manager_ns = manager_ns
    self.indexclient = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )
def dist_get_record(record):
    # Sort the list of distributed ID services:
    # services whose hints match the request come first,
    # followed by those whose hints do not match.
    sorted_dist = sorted(
        blueprint.dist, key=lambda k: hint_match(record, k["hints"]), reverse=True
    )

    for indexd in sorted_dist:
        try:
            if indexd["type"] == "doi":
                # Digital Object Identifier
                fetcher_client = DOIClient(baseurl=indexd["host"])
                res = fetcher_client.get(record)
            elif indexd["type"] == "dos":
                # Data Object Service
                fetcher_client = DOSClient(baseurl=indexd["host"])
                res = fetcher_client.get(record)
            elif indexd["type"] == "hs":
                # HydroShare and CommonsShare
                fetcher_client = HSClient(baseurl=indexd["host"])
                res = fetcher_client.get(record)
            else:
                fetcher_client = IndexClient(baseurl=indexd["host"])
                res = fetcher_client.global_get(record, no_dist=True)
        except Exception:
            # A lot of things can go wrong with the remote get; skip this
            # service and try the next one.
            continue

        if res:
            json = res.to_json()
            json["from_index_service"] = {
                "host": indexd["host"],
                "name": indexd["name"],
            }
            return json

    raise IndexNoRecordFound("no record found")
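# dist_get_record above ranks the configured distributed ID services by whether
# the requested identifier matches any of that service's "hints". A minimal
# sketch of such a hint_match helper, assuming hints are regular-expression
# patterns (an illustration of the idea, not necessarily indexd's exact
# implementation):
import re

def hint_match(record, hints):
    # return True if the identifier matches any of the service's hint patterns
    for hint in hints:
        if re.match(hint, record):
            return True
    return False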
def delete_objects_from_cloud_resources(manifest, log_bucket, release, dry_run=True):
    """
    Delete objects from S3 and GS.

    For safety, use filename instead of file_name in the manifest file to
    avoid accidental deletion.

    Args:
        manifest(str): manifest file
        log_bucket(str): bucket the deletion logs are uploaded to
        release(str): data release
        dry_run(bool): if True, the program does not actually delete the files
            (for reporting purposes)
    """
    session = boto3.session.Session()
    s3_sess = session.resource("s3")

    try:
        s3_sess.meta.client.head_bucket(Bucket=log_bucket)
    except botocore.exceptions.ClientError as e:
        logger.error(
            "The bucket {} does not exist or you have no access. Detail {}".format(
                log_bucket, e
            )
        )
        return

    indexclient = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )

    if manifest.startswith("s3://"):
        file_infos = get_fileinfo_list_from_s3_manifest(manifest)
    else:
        file_infos = get_fileinfo_list_from_csv_manifest(manifest)

    s3 = boto3.resource("s3")
    gs_client = storage.Client()

    ignored_dict = get_ignored_files(IGNORED_FILES, "\t")

    aws_deletion_logs = []
    gs_deletion_logs = []
    num = 0
    for fi in file_infos:
        num = num + 1
        logger.info("Start to process file {}".format(num))

        try:
            aws_target_bucket = get_aws_bucket_name(fi, PROJECT_ACL)
        except UserError as e:
            aws_deletion_logs.append(
                DeletionLog(
                    url=fi.get("id") + "/" + fi.get("filename"), message=e.message
                )
            )
            aws_target_bucket = None

        if not dry_run:
            if aws_target_bucket:
                aws_deletion_logs.append(
                    _remove_object_from_s3(
                        s3, indexclient, fi, aws_target_bucket, dry_run
                    )
                )
            try:
                google_target_bucket = get_google_bucket_name(fi, PROJECT_ACL)
            except UserError as e:
                logger.warning(e)
                gs_deletion_logs.append(
                    DeletionLog(
                        url=fi.get("id") + "/" + fi.get("filename"), message=e.message
                    )
                )
                continue
            gs_deletion_logs.append(
                _remove_object_from_gs(
                    gs_client, indexclient, fi, google_target_bucket, ignored_dict
                )
            )
            delete_record_from_indexd(fi.get("id"), indexclient)

    aws_log_list = []
    for log in aws_deletion_logs:
        aws_log_list.append(log.to_dict())
    aws_log_json = {}
    aws_log_json["data"] = aws_log_list

    gs_log_list = []
    for log in gs_deletion_logs:
        gs_log_list.append(log.to_dict())
    gs_log_json = {}
    gs_log_json["data"] = gs_log_list

    timestr = time.strftime("%Y%m%d-%H%M%S")
    gs_filename = timestr + "gs_deletion_log.json"
    aws_filename = timestr + "aws_deletion_log.json"

    if not dry_run:
        try:
            s3 = boto3.client("s3")
            with open(aws_filename, "w") as outfile:
                json.dump(aws_log_json, outfile)
            s3.upload_file(
                aws_filename, log_bucket, release + "/" + basename(aws_filename)
            )
            with open(gs_filename, "w") as outfile:
                json.dump(gs_log_json, outfile)
            s3.upload_file(
                gs_filename, log_bucket, release + "/" + basename(gs_filename)
            )
        except Exception as e:
            logger.error(e)
    else:
        logger.info(
            "All following files are for redaction.\nIf there is nothing below "
            "that means there is nothing to redact!!!\n\n"
        )
        logger.info("url\n")
        for log in aws_log_list:
            if log["deleted"]:
                logger.info(log["url"])
def manifest_indexing(manifest, prefix=None, replace_urls=False):
    """
    Loop through all the files in the manifest and update/create records in indexd.

    Update indexd if a url is not in the record url list or the acl has changed.
    """
    indexclient = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )

    try:
        files = get_fileinfos_from_tsv_manifest(manifest)
    except Exception as e:
        logger.error("Can not read {}. Detail {}".format(manifest, e))
        return

    prefix = prefix + "/" if prefix else ""
    number_indexed_files = 0

    for fi in files:
        try:
            urls = fi.get("url").split(" ")
            if fi.get("acl").lower() in {"[u'open']", "['open']"}:
                acl = ["*"]
            else:
                acl = [
                    element.strip().replace("'", "")
                    for element in fi.get("acl")[1:-1].split(",")
                ]

            doc = indexclient.get(prefix + fi.get("GUID"))
            if doc is not None:
                need_update = False

                for url in urls:
                    if not replace_urls and url not in doc.urls:
                        doc.urls.append(url)
                        need_update = True

                if replace_urls and set(urls) != set(doc.urls):
                    doc.urls = urls
                    need_update = True

                    # indexd doesn't like when records have metadata for
                    # non-existing urls
                    new_urls_metadata = copy.deepcopy(doc.urls_metadata)
                    for url, metadata in doc.urls_metadata.items():
                        if url not in urls:
                            del new_urls_metadata[url]
                    doc.urls_metadata = new_urls_metadata

                if set(doc.acl) != set(acl):
                    doc.acl = acl
                    need_update = True

                if need_update:
                    doc.patch()
            else:
                doc = indexclient.create(
                    did=prefix + fi.get("GUID"),
                    hashes={"md5": fi.get("md5")},
                    size=fi.get("size", 0),
                    acl=acl,
                    urls=urls,
                )

            number_indexed_files += 1
            if number_indexed_files % 10 == 0 or number_indexed_files == len(files):
                logger.info(
                    "Progress {}%".format(number_indexed_files * 100.0 / len(files))
                )
        except Exception as e:
            # Don't break for any reason
            logger.error(
                "Can not update/create an indexd record with uuid {}. Detail {}".format(
                    fi.get("GUID"), e
                )
            )
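# manifest_indexing above only relies on the GUID, md5, size, acl and url
# fields of each manifest row (urls are space-separated, acl is serialized as a
# Python-style list). An illustrative, made-up tab-separated row might look as
# follows; the column order is an assumption of this sketch, since rows are
# read into dicts by column name:
#
#   GUID                                    md5                                 size    acl             url
#   11111111-2222-3333-4444-555555555555    00000000000000000000000000000001    1024    ['phs000123']   s3://bucket/key gs://bucket/key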
def exec_google_copy(fi, ignored_dict, global_config):
    """
    Copy a file to a google bucket.

    Args:
        fi(dict): a dictionary describing the file being copied
        ignored_dict(dict): a dictionary of ignored files
        global_config(dict): a configuration, e.g.
            {
                "chunk_size_download": 1024,
                "chunk_size_upload": 1024,
            }

    Returns:
        DataFlowLog
    """
    if fi["size"] == 0:
        msg = "can not copy {} to GOOGLE bucket since it is empty file".format(fi["id"])
        return DataFlowLog(message=msg)

    indexd_client = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )

    if not ignored_dict:
        raise UserError(
            "Expecting non-empty IGNORED_FILES. Please check if "
            "ignored_files_manifest.py is configured correctly!!!"
        )

    try:
        bucket_name = utils.get_google_bucket_name(fi, PROJECT_ACL)
    except UserError as e:
        msg = "can not copy {} to GOOGLE bucket. Detail {}. {}".format(
            fi["id"], e, PROJECT_ACL
        )
        logger.error(msg)
        return DataFlowLog(message=msg)

    if not bucket_exists(bucket_name):
        msg = "There is no bucket with provided name {}\n".format(bucket_name)
        logger.error(msg)
        return DataFlowLog(message=msg)

    if fi["id"] in ignored_dict:
        logger.info(
            "{} is ignored. Start to check indexd for u5aa objects".format(fi["id"])
        )
        _update_indexd_for_5aa_object(fi, bucket_name, ignored_dict, indexd_client)
        return DataFlowLog(message="{} is in the ignored list".format(fi["id"]))

    client = storage.Client()
    sess = AuthorizedSession(client._credentials)
    blob_name = fi.get("id") + "/" + fi.get("file_name")

    _check_and_handle_changed_acl_object(fi)

    if blob_exists(bucket_name, blob_name):
        logger.info("{} is already copied".format(fi["id"]))
    else:
        try:
            logger.info(
                "Start to stream {}. Size {} (MB)".format(
                    fi["id"], fi["size"] * 1.0 / 1000 / 1000
                )
            )

            tries = 0
            while tries < NUM_STREAMING_TRIES:
                try:
                    resumable_streaming_copy(
                        fi, client, bucket_name, blob_name, global_config
                    )
                    if fail_resumable_copy_blob(sess, bucket_name, blob_name, fi):
                        delete_object(sess, bucket_name, blob_name)
                    else:
                        break
                except Exception as e:
                    logger.warning(e)
                tries += 1

            if tries == NUM_STREAMING_TRIES:
                logger.error(
                    "Can not stream {} after multiple attempts".format(fi.get("id"))
                )
            else:
                logger.info(
                    "Finish streaming {}. Size {} (MB)".format(
                        fi["id"], fi["size"] * 1.0 / 1000 / 1000
                    )
                )
        except APIError as e:
            logger.error(str(e))
            return DataFlowLog(message=str(e))
        except Exception as e:
            # Don't break (not expected)
            logger.error(str(e))
            return DataFlowLog(message=str(e))

    # Confirm that the object was copied
    if blob_exists(bucket_name, blob_name):
        try:
            if indexd_utils.update_url(fi, indexd_client, provider="gs"):
                logger.info("Successfully update indexd for {}".format(fi["id"]))
            else:
                logger.info("Can not update indexd for {}".format(fi["id"]))
        except APIError as e:
            logger.error(e)
            return DataFlowLog(copy_success=True, message=e)
    else:
        msg = (
            "can not copy {} to GOOGLE bucket after multiple attempts. "
            "Check the error detail in logs".format(blob_name)
        )
        logger.error(msg)
        return DataFlowLog(message=msg)

    return DataFlowLog(
        copy_success=True,
        index_success=True,
        message="object {} successfully copied ".format(blob_name),
    )
def index_client(): return IndexClient(INDEX_CLIENT["host"], INDEX_CLIENT["version"], INDEX_CLIENT["auth"])
def index_client(): return IndexClient(SIGNPOST["host"], SIGNPOST["version"], SIGNPOST["auth"])