def download(model_endpoint):
    """
    Download a file from remote S3 storage (shared storage) to local
    MEDIA_ROOT.

    model_endpoint is an instance of one of:

        * pmworker.endpoint.DocumentUrl
        * pmworker.endpoint.PageUrl

    Will download the Document original file or the Page associated .txt
    file. This function makes sense only in case of settings.S3=True,
    i.e. only if S3 media storage is enabled.

    Returns True if the file is (now) available locally, False if the
    remote key is missing or the transfer failed.
    """
    # Already present locally — nothing to do.
    if model_endpoint.exists():
        return True

    remote_abspath = model_endpoint.url(ep=Endpoint.S3)
    local_abspath = model_endpoint.url()
    local_dirname = os.path.dirname(local_abspath)

    # Make sure the destination directory exists before download_file
    # tries to write into it.
    if not os.path.exists(local_dirname):
        logger.debug(f"{local_dirname} does not exist. Creating.")
        os.makedirs(local_dirname, exist_ok=True)
    else:
        logger.debug(f"{local_dirname} already exists.")

    s3_client = boto3.client('s3')
    bucketname = get_bucketname(remote_abspath)
    keyname = get_keyname(remote_abspath)

    if not model_endpoint.exists(ep=Endpoint.S3):
        logger.info(f"Endpoint s3:/{bucketname}/{keyname} missing")
        return False

    try:
        logger.debug(f"Downloading {remote_abspath} to {local_abspath}: "
                     f"bucket={bucketname} "
                     f"keyname={keyname}.")
        s3_client.download_file(bucketname, keyname, local_abspath)
    except botocore.exceptions.ClientError:
        logger.error(
            f"Error while downloading "
            f" {remote_abspath} to {local_abspath}"
            f" bucketname={bucketname} keyname={keyname}",
            exc_info=True)
        return False

    return True
def remove_file(doc_url):
    """
    Delete the remote S3 object that backs *doc_url*.

    The bucket and key are derived from the given URL; the object is
    removed via the boto3 S3 resource API.
    """
    bucket_name = get_bucketname(doc_url)
    key_name = get_keyname(doc_url)
    s3_resource = boto3.resource('s3')
    s3_resource.Bucket(bucket_name).Object(key_name).delete()
def upload_img(page_url):
    """
    Upload the page's local image file to its S3 location.

    Bucket and key are derived from the page's S3 image URL; the local
    image path is the page's non-S3 image URL.
    """
    remote_url = page_url.img_url(ep=Endpoint.S3)
    local_path = page_url.img_url()
    bucket = get_bucketname(remote_url)
    key = get_keyname(remote_url)
    logger.debug(f"Uploading to bucket={bucket} "
                 f"Keyname={key} "
                 f"img_url={local_path} "
                 f"s3_url={remote_url}.")
    client = boto3.client('s3')
    client.upload_file(local_path, bucket, key)
def upload_document_to_s3(doc_ep):
    """
    Upload the document's local original file to its S3 location.

    Bucket and key are derived from the document endpoint's S3 URL;
    the local file path is the endpoint's non-S3 URL.

    Raises:
        ValueError: if the local file does not exist.
    """
    s3_url = doc_ep.url(ep=Endpoint.S3)
    local_url = doc_ep.url()
    bucketname = get_bucketname(s3_url)
    keyname = get_keyname(s3_url)
    s3_client = boto3.client('s3')

    if not os.path.exists(local_url):
        # Fixed typo in error message ("exits" -> "exist").
        raise ValueError(f"{local_url} path does not exist")

    # Slash added between bucket and key for consistency with the
    # "s3:/{bucket}/{key}" format used elsewhere in this module.
    logger.debug(f"upload_document {local_url} to s3:/{bucketname}/{keyname}")
    s3_client.upload_file(local_url, bucketname, keyname)
def download_hocr(page_ep):
    """
    Download the page's .hocr file from remote S3 storage to local
    MEDIA_ROOT.

    Returns True if the .hocr file is (now) available locally, False if
    the remote key is missing or the transfer failed.
    """
    # Already present locally — nothing to do.
    if page_ep.hocr_exists():
        return True

    remote_abspath = page_ep.hocr_url(ep=Endpoint.S3)
    local_abspath = page_ep.hocr_url()
    local_dirname = os.path.dirname(local_abspath)

    # Make sure the destination directory exists before download_file
    # tries to write into it.
    if not os.path.exists(local_dirname):
        logger.debug(f"{local_dirname} does not exist. Creating.")
        os.makedirs(local_dirname, exist_ok=True)
    else:
        logger.debug(f"{local_dirname} already exists.")

    s3_client = boto3.client('s3')
    bucketname = get_bucketname(remote_abspath)
    keyname = get_keyname(remote_abspath)

    if not s3_key_exists(remote_abspath):
        logger.info(f"Endpoint s3:/{bucketname}/{keyname} missing")
        return False

    try:
        logger.debug(f"Downloading {remote_abspath} to {local_abspath}: "
                     f"bucket={bucketname} "
                     f"keyname={keyname}.")
        s3_client.download_file(bucketname, keyname, local_abspath)
    except botocore.exceptions.ClientError:
        logger.error(
            f"Error while downloading "
            f" {remote_abspath} to {local_abspath}"
            f" bucketname={bucketname} keyname={keyname}",
            exc_info=True)
        return False

    return True
def test_getkeyname(self):
    """get_keyname strips the scheme and bucket; both 's3://' and
    's3:/' prefixes are accepted."""
    expected = "some/path/to/x.pdf"
    urls = (
        "s3://my-bucket/some/path/to/x.pdf",
        "s3:/my-bucket/some/path/to/x.pdf",
    )
    for url in urls:
        self.assertEqual(get_keyname(url), expected)