def register_dataset_items_from_datalake(organization_id, image_channel_id,
                                         label_channel_id, dataset_name,
                                         img_list_path):
    """register dataset items from datalake channels

    :param organization_id: target organization id
    :param image_channel_id: datalake channel id that stores source images
    :param label_channel_id: datalake channel id that stores label (segmentation) images
    :param dataset_name: name of the dataset to create
    :param img_list_path: path to a text file listing image/label file pairs
    :return:
    """
    with open('dataset.json', 'r') as f:
        dataset_props = json.load(f)

    paths = [[fn.split('/')[-1] for fn in line.split()]
             for line in open(img_list_path)]
    img2label = {img: lbl for img, lbl in paths}

    print('Getting data from datalake....')
    client = DatalakeClient(organization_id=organization_id,
                            credential=credential)
    label_channel = client.get_channel(label_channel_id)
    label_list = label_channel.list_files(prefetch=True)
    label2fileid = {
        label.metadata['filename']: label.file_id
        for label in label_list
    }

    image_channel = client.get_channel(image_channel_id)
    file_iter = image_channel.list_files(limit=1000, prefetch=True)
    dataset_items = []
    for file_info in file_iter:
        imgfile = file_info.metadata['filename']
        labelfile = img2label[imgfile]
        label_id = label2fileid[labelfile]
        annotation = {
            'channel_id': label_channel_id,
            'file_id': label_id,
            'filename': labelfile
        }
        item = create_request_element(image_channel_id, file_info, annotation)
        dataset_items.append(item)

    print('Registering dataset items....')
    dataset_params = {
        'organization_id': organization_id,
        'name': dataset_name,
        'type': 'segmentation',
        'props': dataset_props
    }
    dataset_client = DatasetClient(organization_id=organization_id,
                                   credential=credential)
    dataset = dataset_client.datasets.create(dataset_name, 'segmentation',
                                             dataset_params)
    register_dataset_items(dataset, dataset_items)
def upload_datalake(file_path):
    datalake_client = DatalakeClient(organization_id=organization_id,
                                     credential=credential)
    channel = datalake_client.get_channel(channel_id)
    metadata = {}
    channel.upload_file(file_path, metadata=metadata)
    return 0
def upload_image_datalake(image_path):
    abeja_credential = {
        'user_id': abeja_user_id,
        'personal_access_token': abeja_personal_access_token
    }
    datalake_client = DatalakeClient(organization_id=abeja_organization_id,
                                     credential=abeja_credential)
    channel = datalake_client.get_channel(abeja_channel_id)
    res = channel.upload_file(image_path)
    datalake_file = channel.get_file(file_id=res.file_id)
    content = datalake_file.get_file_info()
    return content['download_url']
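# Usage sketch for upload_image_datalake (the local path below is a
# placeholder, not a file shipped with this code): upload an image and
# print the download URL returned for it.
download_url = upload_image_datalake('./images/sample.jpg')
print(download_url)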
def register_dataset_items_from_datalake(organization_id, channel_id,
                                         dataset_name, split, year,
                                         max_workers):
    """register dataset items from a datalake channel

    Args:
        organization_id: target organization id
        channel_id: datalake channel id that stores the VOC images
        dataset_name: name of the dataset to create
        split: VOC split name (e.g. 'train', 'val')
        year: VOC dataset year (e.g. '2007', '2012')
        max_workers: number of workers used to register dataset items

    Returns:
        None
    """
    with open('dataset.json', 'r') as f:
        dataset_props = json.load(f)

    voc_dataset = voc_bbox_dataset.VOCBboxDataset(split=split, year=year)
    nb_data = len(voc_dataset)
    data = {}
    for i in range(nb_data):
        id, annotation = voc_dataset.get_annotations(i)
        data[id] = annotation

    print('Getting data from datalake....')
    client = DatalakeClient(organization_id=organization_id,
                            credential=credential)
    channel = client.get_channel(channel_id)

    def file2id(file_info):
        return file_info.metadata['filename'].split('.')[0]

    file_iter = channel.list_files(limit=1000, prefetch=False)
    dataset_items = []
    for file_info in tqdm(file_iter):
        if file2id(file_info) in data:
            item = create_request_element(channel_id, file_info,
                                          data[file2id(file_info)])
            dataset_items.append(item)

    print('Registering dataset items....')
    dataset_client = DatasetClient(organization_id=organization_id,
                                   credential=credential)
    dataset = dataset_client.datasets.create(dataset_name,
                                             dataset_props['type'],
                                             dataset_props['props'])
    register_dataset_items(dataset, dataset_items, max_workers=max_workers)
    print('uploaded!')
def load_latest_file_from_datalake(channel_id):
    datalake_client = DatalakeClient()
    channel = datalake_client.get_channel(channel_id)

    # load latest file path
    for f in channel.list_files(sort='-uploaded_at'):
        latest_file_path = f.download_url
        latest_file_datetime = f.uploaded_at
        print('load file uploaded at {} (UTC time).'.format(
            latest_file_datetime))
        break
    return latest_file_path
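# Usage sketch for load_latest_file_from_datalake (the channel id is a
# placeholder and the file is assumed to be a CSV): fetch the download URL of
# the most recently uploaded file and read it with pandas.
latest_url = load_latest_file_from_datalake('1234567890123')
df = pd.read_csv(latest_url)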
def main(image_root_dir, channel_id, dataset_name, max_workers,
         min_num_image_per_class):
    image_root_dir = Path(image_root_dir)
    props, label2id = build_props(image_root_dir, min_num_image_per_class)

    datalake_client = DatalakeClient()
    channel = datalake_client.get_channel(channel_id)

    datasets_client = DatasetsClient()
    dataset = datasets_client.datasets.create(name=dataset_name,
                                              type='classification',
                                              props=props)
    upload_images(channel, dataset, image_root_dir, label2id,
                  max_workers=max_workers,
                  min_num_image_per_class=min_num_image_per_class)
    print('finished!')
def train_data_loader(channel_id: str, file_id: str, label_field: str,
                      input_fields: list = None):
    datalake_client = DatalakeClient()
    channel = datalake_client.get_channel(channel_id)
    datalake_file = channel.get_file(file_id)
    # download the file into the local cache directory
    datalake_file.get_content(cache=True)

    csvfile = Path(ABEJA_STORAGE_DIR_PATH, channel_id, file_id)
    if input_fields:
        train = pd.read_csv(csvfile, usecols=input_fields + [label_field])
    else:
        train = pd.read_csv(csvfile)

    y_train = train[label_field].values
    # drop object (non-numeric) columns and the label column
    cols_drop = [c for c in train.columns if train[c].dtype == 'O'] + [label_field]
    train.drop(cols_drop, axis=1, inplace=True)
    X_train = train
    cols_train = X_train.columns.tolist()
    del train
    return X_train, y_train, cols_train
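# Usage sketch for train_data_loader (channel/file ids and column names are
# placeholders, not values from this code): load a training table restricted
# to two feature columns plus the label column.
X_train, y_train, cols_train = train_data_loader(
    channel_id='1234567890123',
    file_id='20200101T000000-aaaa-bbbb',
    label_field='target',
    input_fields=['feature_a', 'feature_b'])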
def register_dataset_items_from_datalake(organization_id, channel_id,
                                         dataset_name, label_metadata_key):
    with open('dataset.json', 'r') as f:
        dataset_props = json.load(f)

    print('Getting data from datalake....')
    client = DatalakeClient(organization_id=organization_id,
                            credential=credential)
    channel = client.get_channel(channel_id)

    label2id = {
        x['label']: x['label_id']
        for x in dataset_props['props']['categories'][0]['labels']
    }

    def to_annotation(file_info):
        label = file_info.metadata[label_metadata_key]
        label_id = label2id[label]
        return [{label_metadata_key: label,
                 'label_id': label_id,
                 'category_id': 0}]

    file_iter = channel.list_files(limit=1000, prefetch=False)
    dataset_items = []
    for file_info in file_iter:
        item = create_request_element(
            channel_id, file_info,
            data_id=int(file_info.metadata['filename'].split('.')[0]),
            annotation=to_annotation(file_info))
        dataset_items.append(item)
        if len(dataset_items) % 1000 == 0:
            print(len(dataset_items))

    print('Registering dataset items....')
    dataset_client = DatasetClient(organization_id=organization_id,
                                   credential=credential)
    dataset = dataset_client.datasets.create(dataset_name,
                                             dataset_props['type'],
                                             dataset_props['props'])
    register_dataset_items(dataset, dataset_items)
    'pottedplant': 15,
    'sheep': 16,
    'sofa': 17,
    'train': 18,
    'tvmonitor': 19
}

# get DataLake channel id
organization_url = urljoin(
    ANNOTATION_API,
    "/api/v1/organizations/{}/projects/{}".format(ANNOTATION_ORGANIZATION_ID,
                                                  ANNOTATION_PROJECT_ID))
res = requests.get(organization_url, headers=headers)
res.raise_for_status()
channel_id = res.json()['data_lake_channels'][0]['channel_id']
client = DatalakeClient(organization_id=organization_id,
                        credential=credential)
datalake_channel = client.get_channel(channel_id)

# get annotation task url
task_url = urljoin(
    ANNOTATION_API,
    "/api/v1/organizations/{}/projects/{}/tasks/".format(
        ANNOTATION_ORGANIZATION_ID, ANNOTATION_PROJECT_ID))

page = 1
while True:
    res = requests.get(task_url, headers=headers, params={'page': page})
    res.raise_for_status()
    res_body = res.json()
    # check if project has data
    if len(res_body) == 0:
def main(): """auto-annotation for image segmentation""" page = 1 delay = 10 while True: # get annotation tasks task_url = urljoin( annotation_api, '/api/v1/organizations/{}/projects/{}/tasks/'.format( annotation_organization_id, annotation_project_id)) res_task_url = requests.get(task_url, headers=headers, params={'page': page}) res_task_url.raise_for_status() # check if project has data if len(res_task_url.json()) == 0: break for task in tqdm(res_task_url.json()): # load image from DataLake channel-in client = DatalakeClient(organization_id=organization_id, credential=credential) channel = client.get_channel(channel_id_in) metadata = task['metadata'][0] input_img = channel.get_file(metadata['file_id']) content_type = input_img.get_file_info()['content_type'] input_img_io = io.BytesIO(input_img.get_content()) # post image to the model res_deployment_url = requests.post( deployment_url, data=input_img_io, headers={'Content-Type': content_type}, auth=(user_id, personal_access_token)) res_deployment_url.raise_for_status() # get predicted results labels = res_deployment_url.json()['labels'][0] b64 = res_deployment_url.json()['result'] output_img_io = io.BytesIO(base64.b64decode(b64)) # convert black color to transparency img = Image.open(output_img_io).convert('RGBA') pixdata = img.load() width, height = img.size for y in range(height): for x in range(width): if pixdata[x, y] == (0, 0, 0, 255): pixdata[x, y] = (0, 0, 0, 0) modified_img_io = io.BytesIO() img.save(modified_img_io, format='PNG') # upload output image to Datalake channel-out upload_url = 'https://api.abeja.io//channels/{}/upload'.format( channel_id_out) res_upload_url = requests.post( upload_url, data=modified_img_io.getvalue(), headers={'Content-type': 'image/png'}, auth=(user_id, personal_access_token)) res_upload_url.raise_for_status() file_id = res_upload_url.json()['file_id'] # get file url file_info_url = 'https://api.abeja.io//channels/{}/{}'.format( channel_id_out, file_id) res_file_info_url = requests.get(file_info_url, auth=(user_id, personal_access_token)) res_file_info_url.raise_for_status() file_url = res_file_info_url.json()['download_url'] # FIXME multi label information = [{ 'class': labels['label'], 'color': labels['color'], 'file_id': file_id, 'file_url': file_url, 'id': labels['label_id'] }] # register predicted result to annotation tool preinference_url = urljoin( task_url, "{}/preinferences".format(str(task['id']))) res_preinference_url = requests.post( preinference_url, json={'information': information}, headers=headers) res_preinference_url.raise_for_status() print(res_preinference_url.json()) time.sleep(delay) page = page + 1
class AbejaDataset(VisionDataset):

    def __init__(self,
                 root,
                 dataset_id,
                 transform=None,
                 target_transform=None,
                 transforms=None,
                 prefetch=False,
                 use_cache=True,
                 indices=None):
        super(AbejaDataset, self).__init__(root, transforms, transform,
                                           target_transform)

        datasets_client = DatasetsClient()
        self.datalake_client = DatalakeClient()
        dataset = datasets_client.get_dataset(dataset_id)
        self.labels = dataset.props['categories'][0]['labels']
        self.palette = create_palette(self.labels)
        self.use_cache = use_cache
        self.datalake_files = list()

        idx = 0
        for item in dataset.dataset_items.list(prefetch=prefetch):
            if indices is not None and idx not in indices:
                idx += 1
                continue
            if 'segmentation-image' in item.attributes:
                data_uri = item.attributes['segmentation-image']['combined'][
                    'data_uri']
            else:
                # FIXME: DEPRECATED. Type 'segmentation' is invalid on the latest spec.
                data_uri = item.attributes['segmentation']['combined'][
                    'data_uri']
            m = re.search(r'datalake://(.+?)/(.+?)$', data_uri)
            src_data = item.source_data[0]
            self.datalake_files.append(
                DataLakeObj(m.group(1), m.group(2), src_data))
            idx += 1

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is the image segmentation.
        """
        channel = self.datalake_client.get_channel(
            self.datalake_files[index].channel_id)
        datalake_file = channel.get_file(self.datalake_files[index].file_id)

        # source image
        src_data = self.datalake_files[index].src_data
        src_content = src_data.get_content(cache=self.use_cache)
        src_file_like_object = io.BytesIO(src_content)
        src_img = Image.open(src_file_like_object).convert('RGB')

        # target image
        content = datalake_file.get_content(cache=self.use_cache)
        file_like_object = io.BytesIO(content)
        target = Image.open(file_like_object).convert('RGB').quantize(
            palette=self.palette)

        if self.transforms is not None:
            src_img, target = self.transforms(src_img, target)

        return src_img, target

    def __len__(self):
        return len(self.datalake_files)

    def num_class(self):
        return len(self.labels) + 1  # label + background
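# Usage sketch for AbejaDataset (root path and dataset id are placeholders):
# without transforms, indexing returns a PIL source image and a
# palette-quantized label image.
dataset = AbejaDataset(root='./data', dataset_id='1234567890123')
src_img, target = dataset[0]
print(len(dataset), dataset.num_class())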
import os
from unittest import TestCase

from abeja.datalake import Client
from abeja.datalake.file import DatalakeFile

CHANNEL_ID = os.environ.get('CHANNEL_ID')
ORGANIZATION_ID = os.environ.get('ORGANIZATION_ID')
USER_ID = os.environ.get('USER_ID')
PERSONAL_ACCESS_TOKEN = os.environ.get('PERSONAL_ACCESS_TOKEN')
FIXTURE_IMAGE_PATH = 'integration_tests/fixtures/images'

credential = {
    'user_id': USER_ID,
    'personal_access_token': PERSONAL_ACCESS_TOKEN
}
client = Client(credential=credential, organization_id=ORGANIZATION_ID)
channel = client.get_channel(CHANNEL_ID)


def delete_all_files_in_channel(channel_id):
    res = client.api.list_channel_files(channel_id)
    for file in res['files']:
        file_id = file['file_id']
        client.api.delete_channel_file(channel_id, file_id)


class DatalakeIntegrationTest(TestCase):

    def setUp(self):
        delete_all_files_in_channel(CHANNEL_ID)

    @classmethod