Example 1
def _get_training_job(self):
    with generate_user_session() as session:
        url = "{}/training/definitions/{}/jobs/{}".format(
            ORGANIZATION_ENDPOINT, self.job_definition_name,
            self.training_job_id)
        res = session.get(url)
        res.raise_for_status()
        return res.json()
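
A minimal usage sketch for the method above, assuming it is available at module level together with generate_user_session and ORGANIZATION_ENDPOINT; the holder class and identifier values are placeholders, since the original class is not shown.

class _JobRef:
    # Placeholder holder object; _get_training_job() only reads these two
    # attributes from self.
    job_definition_name = "my-definition"
    training_job_id = "1111111111111"
    _get_training_job = _get_training_job  # reuse the function defined above

job_info = _JobRef()._get_training_job()
print(job_info.get("status"))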
Example 2
def generate_bucket_file_iter_by_id(bucket_id, *file_ids):
    """
    generate file iterator for list of bucket file identifiers

    :param bucket_id: datalake bucket identifier
    :param file_ids: list of file id to iterate
    :return:
    """
    session = generate_user_session()

    for file_id in file_ids:
        url = "{}/buckets/{}/files/{}".format(ORGANIZATION_ENDPOINT, bucket_id, file_id)
        r = session.get(url, timeout=PLATFORM_REQUEST_TIMEOUT_SECONDS)
        r.raise_for_status()
        file_info = r.json()
        yield file_info
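
A hypothetical call of the iterator above; the bucket and file identifiers are placeholders.

for file_info in generate_bucket_file_iter_by_id("1234567890123",
                                                 "file-a", "file-b"):
    # each item is the raw JSON the API returned for one file id
    print(file_info["file_id"], file_info.get("download_uri"))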
Example 3
def generate_channel_file_iter_by_id(channel_id, *file_ids):
    """
    generate file iterator for list of datalake file identifiers

    :param channel_id: datalake channel identifier
    :param file_ids: list of file id to iterate
    :return:
    """
    session = generate_user_session()

    for file_id in file_ids:
        url = "{}/channels/{}/{}".format(ABEJA_API_URL, channel_id, file_id)
        r = session.get(url, timeout=PLATFORM_REQUEST_TIMEOUT_SECONDS)
        r.raise_for_status()
        file_info = r.json()
        yield file_info
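
The same pattern for channels, again with placeholder identifiers; unpacking a list with * matches the *file_ids signature.

file_ids = ["20230101T000000-aaaa", "20230101T000001-bbbb"]  # placeholders
for file_info in generate_channel_file_iter_by_id("9999999999999", *file_ids):
    print(file_info.get("download_uri"))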
Example 4
def generate_bucket_file_iter(bucket_id):
    """
    generate file iterator in bucket

    :param bucket_id: datalake bucket identifier
    :return:
    """
    url = "{}/buckets/{}/files".format(ORGANIZATION_ENDPOINT, bucket_id)
    params = {
        'target_dir': '/',
        'items_per_page': DATALAKE_ITEMS_PER_PAGE
    }

    # List files via the user session, walking sub-directories with a queue
    session = generate_user_session()
    queue = deque()

    while True:
        r = session.get(url, params=params,
                        timeout=PLATFORM_REQUEST_TIMEOUT_SECONDS)
        r.raise_for_status()
        res = r.json()
        files = res.get('files', [])
        if len(files) == 0:
            if len(queue) == 0:
                break
            else:
                target_dir = queue.popleft()
                params.update({
                    'target_dir': target_dir,
                    'start_after': target_dir
                })
        else:
            # Iterate files
            for file_info in files:
                if file_info['is_file']:
                    yield file_info
                else:
                    queue.append(file_info['file_id'])
            next_start_after = res.get('next_start_after')
            params.update({
                'start_after': next_start_after
            })
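
A sketch of how the iterator above might be consumed; it walks sub-directories breadth-first via the deque, so the caller only sees entries where is_file is true. The bucket id is a placeholder.

all_file_ids = [f["file_id"] for f in generate_bucket_file_iter("1234567890123")]
print(len(all_file_ids), "files found")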
Example 5
def generate_channel_file_iter_by_period(channel_id, start=None, end=None):
    """
    generate file iterator in channel from specified start date to specified end date

    :param channel_id: datalake channel identifier
    :param start: start date (YYYYMMDD)
    :param end: end date (YYYYMMDD)
    :return:
    """
    url = "{}/channels/{}".format(ABEJA_API_URL, channel_id)
    params = {
        'items_per_page': DATALAKE_ITEMS_PER_PAGE
    }

    if start and end:
        params['start'] = start
        params['end'] = end
    if (start and not end) or (not start and end):
        logger.error(
            'both start and end are required for period of datalake file list')
        raise InvalidDatalakeTimeInterval()

    # List files page by page via the user session
    session = generate_user_session()

    while True:
        r = session.get(url, params=params,
                        timeout=PLATFORM_REQUEST_TIMEOUT_SECONDS)
        r.raise_for_status()
        res = r.json()
        files = res.get('files')
        if not files or len(files) == 0:
            break
        # Iterate files
        for file_info in files:
            yield file_info
        next_page_token = res.get('next_page_token')
        if not next_page_token:
            break
        params = {
            'next_page_token': next_page_token
        }
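
A hypothetical call with a date range in the YYYYMMDD format the docstring describes; passing only one of start/end raises InvalidDatalakeTimeInterval. The channel id and dates are placeholders.

for file_info in generate_channel_file_iter_by_period(
        "9999999999999", start="20230101", end="20230131"):
    print(file_info["file_id"])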
Example 6
def upload_job(bucket_id, upload_file: UploadBucketFile, report_queue,
               options):
    """
    upload files until consuming all items in file queue

    :param bucket_id: bucket identifier
    :param upload_file: ``UploadBucketFile`` object to upload
    :param report_queue: queue to report progress for each file
    :param options: job options
    :return:
    """

    publisher_id = uuid.uuid4().hex
    file_path = upload_file.path
    file_id = upload_file.key
    options = options if options else {}
    metadata = {}

    try:
        finished_status = FINISH_REPORT
        url = "{}/buckets/{}/files".format(ORGANIZATION_ENDPOINT, bucket_id)

        type, _ = guess_type(file_path)
        headers = {}

        # Runtime option `metadata` overwrites metadata specified in
        # file list spec.
        metadata = upload_file.metadata or metadata
        metadata['filename'] = upload_file.key
        for key, value in options.get('metadata', ()):
            metadata[key] = value

        for key, value in metadata.items():
            key = urllib.parse.quote(str(key), encoding='utf-8')
            value = urllib.parse.quote(str(value), encoding='utf-8')
            headers['x-abeja-meta-{}'.format(key)] = value

        total = os.path.getsize(file_path)
        initialize_options = {
            'file_name': file_path,
            'total': total,
        }
        report_queue.put(
            (INITIALIZE_REPORT, publisher_id, 0, initialize_options))
        with generate_user_session(json_content_type=False) as session:
            with open(str(file_path), 'rb') as file_obj:
                params = {}
                params = BytesIO(json.dumps(params).encode())
                files = {
                    'file': (file_id, file_obj, type),
                    'parameters': ('params.json', params, 'application/json')
                }
                # Uploading file shouldn't be timed out!
                upload_res = session.post(url,
                                          files=files,
                                          headers=headers,
                                          timeout=None)
                report_queue.put((PROGRESS_REPORT, publisher_id, total, None))

        upload_res.raise_for_status()
        content = upload_res.json()
        report_queue.put((finished_status, publisher_id, 0, {
            'source': file_path,
            'destination': content.get('file_id', ''),
            'metadata': content.get('metadata', {})
        }))
    except Exception as e:
        options = {
            'source': file_path,
            'destination': file_id,
            'metadata': metadata,
            'error': 'Failed to upload {} to bucket_id {} (Reason: {})'.format(
                file_path, bucket_id, e)
        }
        report_queue.put((RAISE_ERROR, publisher_id, 0, options))
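
A minimal sketch of a consumer for the (status, publisher_id, size, options) tuples that upload_job() puts on report_queue; the report constants come from the same module, while draining with a one-second timeout is an assumption about how the caller is wired up.

import queue

def drain_reports(report_queue):
    # Drain progress tuples until the queue stays empty for one second.
    while True:
        try:
            status, publisher_id, size, opts = report_queue.get(timeout=1)
        except queue.Empty:
            break
        if status == RAISE_ERROR:
            print("error:", opts["error"])
        elif status == FINISH_REPORT:
            print("uploaded", opts["source"], "->", opts["destination"])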
Example 7
def _download_job_proecess(channel_id, file_info, report_queue, options):
    """
    download files until consuming all items in file queue

    :param channel_id: channel identifier
    :param file_info: information of file to download
    :param report_queue: queue to report progress for each file
    :param options: option including target download directory
    :return:
    """

    publisher_id = uuid.uuid4().hex
    download_dir = options.get('download_dir', '.')
    saving_file_name_type = options.get('file_name_type')
    skip_duplicate = options.get('skip_duplicate', False)

    file_id = file_info.get('file_id')
    file_meta = file_info.get('metadata', {})
    download_uri = file_info.get('download_uri')

    if saving_file_name_type == 'id':
        file_name = file_id
    else:
        file_name = file_meta.get('x-abeja-meta-filename') or file_id

    download_path = None
    # Not yet writing; the cleanup below must not delete a pre-existing file
    is_downloading = False
    try:
        # download file content
        with generate_retry_session() as session:
            download_stream_res = session.get(
                download_uri,
                stream=True,
                timeout=PLATFORM_REQUEST_TIMEOUT_SECONDS)
        # update pre-signed url if expired
        if download_stream_res.status_code == 403:
            url = "{}/channels/{}/{}".format(ABEJA_API_URL, channel_id,
                                             file_id)
            with generate_user_session() as user_session:
                res = user_session.get(
                    url, timeout=PLATFORM_REQUEST_TIMEOUT_SECONDS)
            res.raise_for_status()
            file_info = res.json()
            download_uri = file_info.get('download_uri')
            with generate_retry_session() as session:
                download_stream_res = session.get(
                    download_uri,
                    stream=True,
                    timeout=PLATFORM_REQUEST_TIMEOUT_SECONDS)
        download_stream_res.raise_for_status()
        total_size = int(download_stream_res.headers.get('content-length', 0))

        initialize_options = {
            'file_name': file_name,
            'total': total_size,
        }
        report_queue.put(
            (INITIALIZE_REPORT, publisher_id, 0, initialize_options))
        download_path = _get_default_file_path(download_dir, file_name)
        result_options = {
            'source': file_id,
            'destination': download_path,
        }
        if os.path.exists(download_path):
            if skip_duplicate:
                report_queue.put(
                    (SKIP_REPORT, publisher_id, 0, result_options))
                return
            else:
                # update destination path to resolve conflict
                download_path = _resolve_file_path(download_dir, file_name)
                result_options['destination'] = download_path
        is_downloading = True
        with open(download_path, 'wb') as f:
            for chunk in download_stream_res.iter_content(
                    chunk_size=HTTP_READ_CHUNK_SIZE):
                # update tqdm progress bar with chunk data size
                report_queue.put(
                    (PROGRESS_REPORT, publisher_id, len(chunk), None))
                f.write(chunk)

        report_queue.put((FINISH_REPORT, publisher_id, 0, result_options))
    except:
        if is_downloading and download_path is not None and os.path.exists(
                download_path):
            os.remove(download_path)
        options = {
            'source': file_id,
            'error': 'Failed to download {} of channel_id {}'.format(
                file_id, channel_id)
        }
        report_queue.put((RAISE_ERROR, publisher_id, 0, options))
        raise
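
A hypothetical invocation; file_info mirrors the fields the function reads (file_id, metadata, download_uri) and the option keys are the ones read via options.get() above. All identifiers and the pre-signed URI are placeholders.

from multiprocessing import Queue

report_queue = Queue()
file_info = {
    "file_id": "20230101T000000-aaaa",
    "metadata": {"x-abeja-meta-filename": "image.jpg"},
    "download_uri": "https://example.com/presigned",  # placeholder
}
options = {"download_dir": ".", "file_name_type": "id", "skip_duplicate": True}
_download_job_proecess("9999999999999", file_info, report_queue, options)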
Example 8
def upload_job(channel_id, upload_file, report_queue, options):
    """
    upload files until consuming all items in file queue

    :param channel_id: channel identifier
    :param upload_file: ``UploadFile`` object to upload
    :param report_queue: queue to report progress for each file
    :param options: job options
    :return:
    """

    publisher_id = uuid.uuid4().hex
    file_path = upload_file.path
    options = options if options else {}
    metadata = {}

    try:
        finished_status = FINISH_REPORT
        url = "{}/channels/{}/upload".format(ABEJA_API_URL, channel_id)

        conflict_target = options.get('conflict_target')
        if conflict_target:
            url = '{}?conflict_target={}'.format(url, conflict_target)

        type, _ = guess_type(file_path)
        headers = {
            'Content-Type': type if type else 'application/octet-stream'
        }

        # Runtime option `metadata` overwrites metadata specified in
        # file list spec.
        metadata = upload_file.metadata or metadata
        metadata['filename'] = os.path.basename(file_path)
        for key, value in options.get('metadata', ()):
            metadata[key] = value

        for key, value in metadata.items():
            key = urllib.parse.quote(str(key), encoding='utf-8')
            value = urllib.parse.quote(str(value), encoding='utf-8')
            headers['x-abeja-meta-{}'.format(key)] = value

        # File iterator
        it = UploadFileIterator(file_path, publisher_id, report_queue)
        data_adapter = IterableToFileAdapter(it)

        with generate_user_session() as session:
            # Uploading file shouldn't be timed out!
            upload_res = session.post(url,
                                      data=data_adapter,
                                      headers=headers,
                                      timeout=None)

        # 409 conflict when conflict_target option specified can be ignored.
        if conflict_target and upload_res.status_code == 409:
            finished_status = SKIP_REPORT
        else:
            upload_res.raise_for_status()

        content = upload_res.json()
        report_queue.put((finished_status, publisher_id, 0, {
            'source': file_path,
            'destination': content.get('file_id', ''),
            'metadata': content.get('metadata', {})
        }))
    except Exception as e:
        options = {
            'source': file_path,
            'metadata': metadata,
            'error': 'Failed to upload {} to channel_id {} (Reason: {})'.format(
                file_path, channel_id, e)
        }
        report_queue.put((RAISE_ERROR, publisher_id, 0, options))
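
A hypothetical invocation; the real UploadFile class is not shown here, so a namedtuple stands in for the two attributes upload_job() actually reads (.path and .metadata), and the metadata option illustrates the (key, value) pairs the function iterates over. Path, ids, and metadata values are placeholders.

from collections import namedtuple
from multiprocessing import Queue

UploadFileStub = namedtuple("UploadFileStub", ["path", "metadata"])  # stand-in

report_queue = Queue()
upload_file = UploadFileStub(path="./data/image.jpg",    # placeholder path
                             metadata={"label": "cat"})  # placeholder metadata
options = {"metadata": [("source", "local")]}
upload_job("9999999999999", upload_file, report_queue, options)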
Example 9
def _download_job_proecess(bucket_id, file_info, report_queue, options):
    """
    download files until consuming all items in file queue

    :param bucket_id: bucket identifier
    :param file_info: information of file to download
    :param report_queue: queue to report progress for each file
    :param options: option including target download directory
    :return:
    """

    publisher_id = uuid.uuid4().hex
    download_dir = options.get('download_dir', '.')

    file_id = file_info.get('file_id')
    download_uri = file_info.get('download_uri')
    file_name = file_id
    download_path = None
    is_downloading = False  # flipped to True once the local file is opened
    try:
        # download file content
        with generate_retry_session() as session:
            download_stream_res = session.get(
                download_uri, stream=True, timeout=PLATFORM_REQUEST_TIMEOUT_SECONDS)
        # update pre-signed url if expired
        if download_stream_res.status_code == 403:
            url = "{}/buckets/{}/files/{}".format(
                ORGANIZATION_ENDPOINT, bucket_id, file_id)
            with generate_user_session() as user_session:
                res = user_session.get(
                    url, timeout=PLATFORM_REQUEST_TIMEOUT_SECONDS)
            res.raise_for_status()
            file_info = res.json()
            download_uri = file_info.get('download_uri')
            with generate_retry_session() as session:
                download_stream_res = session.get(
                    download_uri, stream=True, timeout=PLATFORM_REQUEST_TIMEOUT_SECONDS)
        download_stream_res.raise_for_status()
        total_size = int(download_stream_res.headers.get('content-length', 0))

        initialize_options = {
            'file_name': file_name,
            'total': total_size,
        }
        report_queue.put(
            (INITIALIZE_REPORT, publisher_id, 0, initialize_options))
        download_path = _get_default_file_path(download_dir, file_name)
        result_options = {
            'source': file_id,
            'destination': download_path,
        }
        if Path(download_path).exists():
            report_queue.put((SKIP_REPORT, publisher_id, 0, result_options))
            return
        else:
            Path(download_path).parent.mkdir(parents=True, exist_ok=True)
        is_downloading = True
        with open(download_path, 'wb') as f:
            for chunk in download_stream_res.iter_content(chunk_size=HTTP_READ_CHUNK_SIZE):
                # update tqdm progress bar with chunk data size
                report_queue.put(
                    (PROGRESS_REPORT, publisher_id, len(chunk), None))
                f.write(chunk)

        report_queue.put((FINISH_REPORT, publisher_id, 0, result_options))
    except Exception:
        if is_downloading and download_path is not None and os.path.exists(download_path):
            os.remove(download_path)
        options = {
            'source': file_id,
            'error': 'Failed to download {} of bucket_id {}'.format(
                file_id, bucket_id)
        }
        report_queue.put((RAISE_ERROR, publisher_id, 0, options))
        raise