def register_dataset_items_from_datalake(organization_id, image_channel_id,
                                         label_channel_id, dataset_name,
                                         img_list_path):
    """
    register datasets from datalake channel
    :param dataset_id: target dataset id
    :param channel_id: target channel
    :param label_metadata_key: metadata key which label value is stored
    :param max_size_for_label: max size of dataset items for each label value
    :return:
    """
    with open('dataset.json', 'r') as f:
        dataset_props = json.load(f)

    paths = [[fn.split('/')[-1] for fn in line.split()]
             for line in open(img_list_path)]
    img2label = {img: lbl for img, lbl in paths}

    print('Getting data from datalake....')
    client = DatalakeClient(organization_id=organization_id,
                            credential=credential)
    label_channel = client.get_channel(label_channel_id)

    label_list = label_channel.list_files(prefetch=True)
    label2fileid = {
        label.metadata['filename']: label.file_id
        for label in label_list
    }

    image_channel = client.get_channel(image_channel_id)
    file_iter = image_channel.list_files(limit=1000, prefetch=True)

    dataset_items = []
    for file_info in file_iter:
        imgfile = file_info.metadata['filename']
        labelfile = img2label[imgfile]
        label_id = label2fileid[labelfile]

        annotation = {
            'channel_id': label_channel_id,
            'file_id': label_id,
            'filename': labelfile
        }

        item = create_request_element(image_channel_id, file_info, annotation)
        dataset_items.append(item)

    print('Registering dataset items....')
    dataset_params = {
        'organization_id': organization_id,
        'name': dataset_name,
        'type': 'segmentation',
        'props': dataset_props
    }
    dataset_client = DatasetClient(organization_id=organization_id,
                                   credential=credential)
    dataset = dataset_client.datasets.create(dataset_name, 'segmentation',
                                             dataset_params)
    register_dataset_items(dataset, dataset_items)
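A minimal usage sketch for the function above. The IDs and file names below are placeholders, and `credential`, `create_request_element` and `register_dataset_items` are assumed to be defined elsewhere in the module, as in the snippet. Each line of the list file is expected to pair an image path with its label-image path:

# train.txt (hypothetical), one "image label" pair per line, e.g.:
#   JPEGImages/2007_000032.jpg SegmentationClass/2007_000032.png
register_dataset_items_from_datalake(
    organization_id='1234567890123',    # placeholder organization id
    image_channel_id='1111111111111',   # placeholder image channel id
    label_channel_id='2222222222222',   # placeholder label channel id
    dataset_name='voc-segmentation',
    img_list_path='train.txt')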
Example No. 2
def upload_datalake(file_path):

    datalake_client = DatalakeClient(organization_id=organization_id,
                                     credential=credential)
    channel = datalake_client.get_channel(channel_id)

    metadata = {}
    file = channel.upload_file(file_path, metadata=metadata)

    return 0
Example No. 3
def upload_image_datalake(image_path):
    abeja_credential = {
        'user_id': abeja_user_id,
        'personal_access_token': abeja_personal_access_token
    }
    datalake_client = DatalakeClient(organization_id=abeja_organization_id,
                                     credential=abeja_credential)
    channel = datalake_client.get_channel(abeja_channel_id)
    res = channel.upload_file(image_path)
    datalake_file = channel.get_file(file_id=res.file_id)
    content = datalake_file.get_file_info()
    return content['download_url']
Example No. 4
def register_dataset_items_from_datalake(organization_id, channel_id,
                                         dataset_name, split, year,
                                         max_workers):
    """
    register datasets from datalake channel

    Args:
        organization_id:
        channel_id:
        dataset_name:
        split:
        year:
        max_workers:

    Returns:

    """

    with open('dataset.json', 'r') as f:
        dataset_props = json.load(f)

    voc_dataset = voc_bbox_dataset.VOCBboxDataset(split=split, year=year)
    nb_data = len(voc_dataset)

    data = {}
    for i in range(nb_data):
        id, annotation = voc_dataset.get_annotations(i)
        data[id] = annotation

    print('Getting data from datalake....')
    client = DatalakeClient(organization_id=organization_id,
                            credential=credential)
    channel = client.get_channel(channel_id)

    def file2id(file_info):
        return file_info.metadata['filename'].split('.')[0]

    file_iter = channel.list_files(limit=1000, prefetch=False)
    dataset_items = []
    for file_info in tqdm(file_iter):
        if file2id(file_info) in data:
            item = create_request_element(channel_id, file_info,
                                          data[file2id(file_info)])
            dataset_items.append(item)

    print('Registering dataset items....')
    dataset_client = DatasetClient(organization_id=organization_id,
                                   credential=credential)
    dataset = dataset_client.datasets.create(dataset_name,
                                             dataset_props['type'],
                                             dataset_props['props'])
    register_dataset_items(dataset, dataset_items, max_workers=max_workers)
    print('uploaded!')
Example No. 5
def load_latest_file_from_datalake(channel_id):
    datalake_client = DatalakeClient()
    channel = datalake_client.get_channel(channel_id)

    # load the path of the most recently uploaded file, if any
    latest_file_path = None
    for f in channel.list_files(sort='-uploaded_at'):
        latest_file_path = f.download_url
        latest_file_datetime = f.uploaded_at
        print('load file uploaded at {} (UTC time).'.format(
            latest_file_datetime))
        break

    return latest_file_path
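A short usage sketch for the loader above, assuming the returned `download_url` is a plain HTTP(S) URL that can be fetched with `requests` (the channel id is a placeholder):

import requests

url = load_latest_file_from_datalake('1234567890123')  # placeholder channel id
if url is not None:
    content = requests.get(url).content  # raw bytes of the most recently uploaded file
    with open('latest_file.bin', 'wb') as f:
        f.write(content)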
Example No. 6
def main(image_root_dir, channel_id, dataset_name, max_workers,
         min_num_image_per_class):
    image_root_dir = Path(image_root_dir)
    props, label2id = build_props(image_root_dir, min_num_image_per_class)

    datalake_client = DatalakeClient()
    channel = datalake_client.get_channel(channel_id)

    datasets_client = DatasetsClient()
    dataset = datasets_client.datasets.create(name=dataset_name,
                                              type='classification',
                                              props=props)
    ret = upload_images(channel,
                        dataset,
                        image_root_dir,
                        label2id,
                        max_workers=max_workers,
                        min_num_image_per_class=min_num_image_per_class)
    print('finished!')
Example No. 7
def train_data_loader(channel_id: str, file_id: str, label_field: str, input_fields: list = None):
    datalake_client = DatalakeClient()
    channel = datalake_client.get_channel(channel_id)
    datalake_file = channel.get_file(file_id)
    datalake_file.get_content(cache=True)

    csvfile = Path(ABEJA_STORAGE_DIR_PATH, channel_id, file_id)
    if input_fields:
        train = pd.read_csv(csvfile, usecols=input_fields + [label_field])
    else:
        train = pd.read_csv(csvfile)

    y_train = train[label_field].values
    cols_drop = [c for c in train.columns if train[c].dtype == 'O'] + [label_field]
    train.drop(cols_drop, axis=1, inplace=True)
    X_train = train
    cols_train = X_train.columns.tolist()
    del train
    return X_train, y_train, cols_train
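A hypothetical call to the loader above; the channel and file IDs are placeholders and the column names depend on the CSV that was uploaded to the channel:

X_train, y_train, cols_train = train_data_loader(
    channel_id='1234567890123',       # placeholder datalake channel id
    file_id='20200101T000000-xxxx',   # placeholder datalake file id
    label_field='target',             # label column in the CSV
    input_fields=['feature_a', 'feature_b'])  # optional subset of feature columns
print(X_train.shape, y_train.shape, cols_train)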
def register_dataset_items_from_datalake(organization_id,
                                         channel_id,
                                         dataset_name,
                                         label_metadata_key):
    with open('dataset.json', 'r') as f:
        dataset_props = json.load(f)

    print('Getting data from datalake....')
    client = DatalakeClient(organization_id=organization_id,
                            credential=credential)
    channel = client.get_channel(channel_id)

    def to_annotation(file_info):
        label = file_info.metadata[label_metadata_key]
        label_id = label2id[label]
        return [{label_metadata_key: label,
                 'label_id': label_id,
                 'category_id': 0}]

    file_iter = channel.list_files(limit=1000, prefetch=False)
    label2id = {
        x['label']: x['label_id']
        for x in dataset_props['props']['categories'][0]['labels']
    }

    dataset_items = []
    for file_info in file_iter:
        item = create_request_element(channel_id, file_info,
                                      data_id=int(file_info.metadata['filename'].split('.')[0]),
                                      annotation=to_annotation(file_info))
        dataset_items.append(item)
        if len(dataset_items) % 1000 == 0:
            print(len(dataset_items))

    print('Registering dataset items....')
    dataset_client = DatasetClient(organization_id=organization_id,
                                   credential=credential)
    dataset = dataset_client.datasets.create(dataset_name,
                                             dataset_props['type'],
                                             dataset_props['props'])
    register_dataset_items(dataset, dataset_items)
Example No. 9
    def __init__(self,
                 root,
                 dataset_id,
                 transform=None,
                 target_transform=None,
                 transforms=None,
                 prefetch=False,
                 use_cache=True,
                 indices=None):

        super(AbejaDataset, self).__init__(root, transforms, transform,
                                           target_transform)

        datasets_client = DatasetsClient()
        self.datalake_client = DatalakeClient()
        dataset = datasets_client.get_dataset(dataset_id)
        self.labels = dataset.props['categories'][0]['labels']
        self.palette = create_palette(self.labels)
        self.use_cache = use_cache

        self.datalake_files = list()
        idx = 0
        for item in dataset.dataset_items.list(prefetch=prefetch):
            if indices is not None and idx not in indices:
                idx += 1
                continue

            if 'segmentation-image' in item.attributes:
                data_uri = item.attributes['segmentation-image']['combined'][
                    'data_uri']
            else:
                # FIXME: DEPRECATED. Type 'segmentation' is invalid on the latest spec.
                data_uri = item.attributes['segmentation']['combined'][
                    'data_uri']
            m = re.search(r'datalake://(.+?)/(.+?)$', data_uri)
            src_data = item.source_data[0]
            self.datalake_files.append(
                DataLakeObj(m.group(1), m.group(2), src_data))
            idx += 1
    'pottedplant': 15,
    'sheep': 16,
    'sofa': 17,
    'train': 18,
    'tvmonitor': 19
}

# get DataLake channel id
organization_url = urljoin(
    ANNOTATION_API,
    "/api/v1/organizations/{}/projects/{}".format(ANNOTATION_ORGANIZATION_ID,
                                                  ANNOTATION_PROJECT_ID))
res = requests.get(organization_url, headers=headers)
res.raise_for_status()
channel_id = res.json()['data_lake_channels'][0]['channel_id']
client = DatalakeClient(organization_id=organization_id, credential=credential)
datalake_channel = client.get_channel(channel_id)

# get annotation task url
task_url = urljoin(
    ANNOTATION_API, "/api/v1/organizations/{}/projects/{}/tasks/".format(
        ANNOTATION_ORGANIZATION_ID, ANNOTATION_PROJECT_ID))
page = 1

while True:
    res = requests.get(task_url, headers=headers, params={'page': page})
    res.raise_for_status()
    res_body = res.json()

    # check if project has data
    if len(res_body) == 0:
Example No. 11
def main():
    """auto-annotation for image segmentation"""
    page = 1
    delay = 10

    while True:
        # get annotation tasks
        task_url = urljoin(
            annotation_api,
            '/api/v1/organizations/{}/projects/{}/tasks/'.format(
                annotation_organization_id, annotation_project_id))
        res_task_url = requests.get(task_url,
                                    headers=headers,
                                    params={'page': page})
        res_task_url.raise_for_status()

        # check if project has data
        if len(res_task_url.json()) == 0:
            break

        for task in tqdm(res_task_url.json()):
            # load image from DataLake channel-in
            client = DatalakeClient(organization_id=organization_id,
                                    credential=credential)
            channel = client.get_channel(channel_id_in)
            metadata = task['metadata'][0]
            input_img = channel.get_file(metadata['file_id'])
            content_type = input_img.get_file_info()['content_type']
            input_img_io = io.BytesIO(input_img.get_content())

            # post image to the model
            res_deployment_url = requests.post(
                deployment_url,
                data=input_img_io,
                headers={'Content-Type': content_type},
                auth=(user_id, personal_access_token))
            res_deployment_url.raise_for_status()

            # get predicted results
            labels = res_deployment_url.json()['labels'][0]
            b64 = res_deployment_url.json()['result']
            output_img_io = io.BytesIO(base64.b64decode(b64))

            # convert black color to transparency
            img = Image.open(output_img_io).convert('RGBA')
            pixdata = img.load()

            width, height = img.size
            for y in range(height):
                for x in range(width):
                    if pixdata[x, y] == (0, 0, 0, 255):
                        pixdata[x, y] = (0, 0, 0, 0)

            modified_img_io = io.BytesIO()
            img.save(modified_img_io, format='PNG')

            # upload output image to Datalake channel-out
            upload_url = 'https://api.abeja.io//channels/{}/upload'.format(
                channel_id_out)
            res_upload_url = requests.post(
                upload_url,
                data=modified_img_io.getvalue(),
                headers={'Content-type': 'image/png'},
                auth=(user_id, personal_access_token))
            res_upload_url.raise_for_status()
            file_id = res_upload_url.json()['file_id']

            # get file url
            file_info_url = 'https://api.abeja.io//channels/{}/{}'.format(
                channel_id_out, file_id)
            res_file_info_url = requests.get(file_info_url,
                                             auth=(user_id,
                                                   personal_access_token))
            res_file_info_url.raise_for_status()
            file_url = res_file_info_url.json()['download_url']

            # FIXME multi label
            information = [{
                'class': labels['label'],
                'color': labels['color'],
                'file_id': file_id,
                'file_url': file_url,
                'id': labels['label_id']
            }]

            # register predicted result to annotation tool
            preinference_url = urljoin(
                task_url, "{}/preinferences".format(str(task['id'])))
            res_preinference_url = requests.post(
                preinference_url,
                json={'information': information},
                headers=headers)
            res_preinference_url.raise_for_status()
            print(res_preinference_url.json())
            time.sleep(delay)

        page = page + 1
Example No. 12
class AbejaDataset(VisionDataset):
    def __init__(self,
                 root,
                 dataset_id,
                 transform=None,
                 target_transform=None,
                 transforms=None,
                 prefetch=False,
                 use_cache=True,
                 indices=None):

        super(AbejaDataset, self).__init__(root, transforms, transform,
                                           target_transform)

        datasets_client = DatasetsClient()
        self.datalake_client = DatalakeClient()
        dataset = datasets_client.get_dataset(dataset_id)
        self.labels = dataset.props['categories'][0]['labels']
        self.palette = create_palette(self.labels)
        self.use_cache = use_cache

        self.datalake_files = list()
        idx = 0
        for item in dataset.dataset_items.list(prefetch=prefetch):
            if indices is not None and idx not in indices:
                idx += 1
                continue

            if 'segmentation-image' in item.attributes:
                data_uri = item.attributes['segmentation-image']['combined'][
                    'data_uri']
            else:
                # FIXME: DEPRECATED. Type 'segmentation' is invalid on the latest spec.
                data_uri = item.attributes['segmentation']['combined'][
                    'data_uri']
            m = re.search(r'datalake://(.+?)/(.+?)$', data_uri)
            src_data = item.source_data[0]
            self.datalake_files.append(
                DataLakeObj(m.group(1), m.group(2), src_data))
            idx += 1

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is the image segmentation.
        """
        channel = self.datalake_client.get_channel(
            self.datalake_files[index].channel_id)
        datalake_file = channel.get_file(self.datalake_files[index].file_id)

        # source image
        src_data = self.datalake_files[index].src_data
        src_content = src_data.get_content(cache=self.use_cache)
        src_file_like_object = io.BytesIO(src_content)
        src_img = Image.open(src_file_like_object).convert('RGB')

        # target image
        content = datalake_file.get_content(cache=self.use_cache)
        file_like_object = io.BytesIO(content)
        target = Image.open(file_like_object).convert('RGB').quantize(
            palette=self.palette)

        if self.transforms is not None:
            src_img, target = self.transforms(src_img, target)

        return src_img, target

    def __len__(self):
        return len(self.datalake_files)

    def num_class(self):
        return len(self.labels) + 1  # label+background
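A minimal sketch of using the dataset above with PyTorch, assuming a placeholder dataset id. Without transforms the items are PIL images, so a joint transform returning tensors is needed before batching with a DataLoader:

from torch.utils.data import DataLoader

dataset = AbejaDataset(root='.', dataset_id='1234567890123')  # placeholder dataset id
src_img, target = dataset[0]  # PIL images when no transforms are given
print(len(dataset), dataset.num_class())

# with a joint transform that returns tensors, the dataset can be batched:
# loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)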
Example No. 13
import os
from unittest import TestCase

from abeja.datalake import Client
from abeja.datalake.file import DatalakeFile

CHANNEL_ID = os.environ.get('CHANNEL_ID')

ORGANIZATION_ID = os.environ.get('ORGANIZATION_ID')
USER_ID = os.environ.get('USER_ID')
PERSONAL_ACCESS_TOKEN = os.environ.get('PERSONAL_ACCESS_TOKEN')

FIXTURE_IMAGE_PATH = 'integration_tests/fixtures/images'

credential = {
    'user_id': USER_ID,
    'personal_access_token': PERSONAL_ACCESS_TOKEN
}
client = Client(credential=credential, organization_id=ORGANIZATION_ID)
channel = client.get_channel(CHANNEL_ID)


def delete_all_files_in_channel(channel_id):
    res = client.api.list_channel_files(channel_id)
    for file in res['files']:
        file_id = file['file_id']
        client.api.delete_channel_file(channel_id, file_id)


class DatalakeIntegrationTest(TestCase):
    def setUp(self):
        delete_all_files_in_channel(CHANNEL_ID)

    @classmethod