def __init__(self, num_parallel_reads: int = 1, force_build=False, force_download=False, shuffle=True):
    """Set up the CelebA dataset: fetch the data, parse labels, locate TFRecords.

    Arguments:
        num_parallel_reads {int} -- Stored on the instance as ``num_parallel_reads``
        force_build {bool} -- When true, build the train and val TFRecords
        force_download {bool} -- Forwarded to ``maybe_download_and_store_google_drive``
        shuffle {bool} -- When building, shuffle ``self._img_meta`` first; also
            forwarded to ``_build_dataset``

    Note:
        If the download step yields no keys, a warning is logged and the
        constructor returns early, leaving the remaining attributes unset.
    """
    file_pair = {
        "img_align_celeba.zip": "0B7EVK8r0v71pZjFTYXZWM3FlRnM",
        "list_attr_celeba.txt": "0B7EVK8r0v71pblRyaVFSWGxPY0U",
    }
    self.root_key = "celebA"
    self.num_attr = 40

    log_message("Retrieving CelebA data")
    self.keys = maybe_download_and_store_google_drive(
        file_pair,
        root_key=self.root_key,
        force_download=force_download,
        use_subkeys=False)
    if len(self.keys) == 0:
        log_warning("Download Failed, change force_download=True")
        return

    self.selected_attrs = None

    # Extract each batch
    log_message('Extracting CelebA data...')
    self._train_db = None
    self._val_db = None
    self.num_parallel_reads = num_parallel_reads

    # Extract labels
    self.attr2idx: Dict = {}
    self.idx2attr: Dict = {}
    log_message("Extracting CelebA labels first")
    info_files = DATA_STORE[self.keys[1]]
    self._process_attr(info_files)

    if force_build:
        # Shuffle once, before both splits are written, so train and val are
        # cut from the same ordering.
        if shuffle:
            random.shuffle(self._img_meta)
        # Build Dataset
        self._build_dataset("train", shuffle)
        self._build_dataset("val", shuffle)

    record_root = os.path.join(self.root_key, "tfrecord")
    train_root = os.path.join(record_root, "train")
    val_root = os.path.join(record_root, "val")
    self.train_fpath = DATA_STORE[train_root]
    self.val_fpath = DATA_STORE[val_root]

    self.num_train_examples = sum(
        1 for _ in tf.python_io.tf_record_iterator(self.train_fpath))
    # Count the validation records from the validation file; the previous
    # code iterated the training file here, so both counts were identical.
    self.num_val_examples = sum(
        1 for _ in tf.python_io.tf_record_iterator(self.val_fpath))

    log_message("Built Complete")
def at_terminate(self):
    """Shutdown hook: persist the db-store data and drop the working directory.

    Calls ``self.flush()`` and then removes ``self.working_directory``. A
    failed removal is never raised; it is reported through ``log_warning``
    unless ``self.testing`` is truthy.
    """
    # Persist the database before touching the file system.
    self.flush()

    # Best-effort removal of the working directory.
    try:
        shutil.rmtree(self.working_directory)
    except Exception as ex:
        if self.testing:
            return
        log_warning('Error removing working directory: {}'.format(str(ex)))
def _build_dataset(self, dataset: str) -> None:
    """Write the COCO-2014 captioning TFRecord file for one split.

    Every entry of the split's ``annotations`` list becomes one
    ``tf.train.Example`` holding the densely parsed caption and the raw
    image bytes. Entries whose image fails to load (``load_image`` returns
    ``None``) are logged and skipped.

    Arguments:
        dataset {str} -- Either ``'train'`` or ``'val'``

    Raises:
        ValueError -- If ``dataset`` is neither ``'train'`` nor ``'val'``
    """
    if dataset not in ['train', 'val']:
        raise ValueError("Must be building either training or validation dataset")

    # Pick the record key, annotation JSON and image directory for the split.
    # (The JSON local is deliberately not called ``json`` so it cannot shadow
    # the standard-library module.)
    if dataset == 'train':
        record_root = 'coco2014/tfrecord/train'
        annotations_json = self.train_json
        root_fpath = DATA_STORE['coco2014/data/train/images']
    else:
        record_root = 'coco2014/tfrecord/val'
        annotations_json = self.val_json
        root_fpath = DATA_STORE['coco2014/data/val/images']

    record_fpath = DATA_STORE.create_key(record_root, 'data.tfrecords', force=True)

    # Loop over the data and parse
    errors = 0
    log_message('Building {} dataset...'.format(dataset))

    # The context manager closes the writer even if an entry raises midway.
    with tf.python_io.TFRecordWriter(record_fpath) as tf_record_writer:
        for entry in tqdm.tqdm(annotations_json['annotations']):
            # Load the image
            image_fpath = build_fpath_from_image_id(root_fpath, entry['image_id'], dataset)
            image = load_image(image_fpath)
            if image is None:
                errors += 1
                log_warning('Error loading image: {}. {} Errors so far.'.format(image_fpath, errors))
                continue

            # Parse the caption
            caption_raw = entry['caption']
            caption_dense, caption_len = self.dictionary.dense_parse(
                caption_raw,
                word_padding=self.max_word_length,
                char_padding=self.max_char_length)

            # Add the image data
            feature = {
                'caption_word_embedding': _int64_feature(np.ravel(caption_dense[0]).astype(np.int64)),
                'caption_char_embedding': _int64_feature(np.ravel(caption_dense[1]).astype(np.int64)),
                'caption_length': _int64_feature([caption_len]),
                'image_shape': _int64_feature(image.shape),
                # tobytes() yields the same bytes as the deprecated tostring().
                'image': _bytes_feature(tf.compat.as_bytes(image.tobytes())),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            tf_record_writer.write(example.SerializeToString())

    # Only reached when the whole file was written successfully.
    DATA_STORE.update_hash(record_root)
def _build_dataset(self, dataset: str) -> None:
    """Write the COCO-2017 detection TFRecord file for one split.

    Every entry of the split's ``annotations`` list becomes one
    ``tf.train.Example`` holding the annotation's area, crowd flag, bounding
    box, category id and the raw image bytes. Entries whose image fails to
    load (``load_image`` returns ``None``) are logged and skipped.

    Arguments:
        dataset {str} -- Either ``'train'`` or ``'val'``

    Raises:
        ValueError -- If ``dataset`` is neither ``'train'`` nor ``'val'``
    """
    if dataset not in ['train', 'val']:
        raise ValueError("Must be building either training or validation dataset")

    # Pick the record key, annotation JSON and image directory for the split.
    # (The JSON local is deliberately not called ``json`` so it cannot shadow
    # the standard-library module.)
    if dataset == 'train':
        record_root = 'coco2017/detection/tfrecord/train'
        annotations_json = self.train_json
        root_fpath = DATA_STORE['coco2017/data/train/images']
    else:
        record_root = 'coco2017/detection/tfrecord/val'
        annotations_json = self.val_json
        root_fpath = DATA_STORE['coco2017/data/val/images']

    record_fpath = DATA_STORE.create_key(record_root, 'data.tfrecords', force=True)

    # Loop over the data and parse
    errors = 0
    log_message('Building {} dataset...'.format(dataset))

    # The context manager closes the writer even if an entry raises midway.
    with tf.python_io.TFRecordWriter(record_fpath) as tf_record_writer:
        for entry in tqdm.tqdm(annotations_json['annotations']):
            # Load the image
            image_fpath = build_fpath_from_image_id(root_fpath, entry['image_id'], dataset)
            image = load_image(image_fpath)
            if image is None:
                errors += 1
                log_warning('Error loading image: {}. {} Errors so far.'.format(image_fpath, errors))
                continue

            # Add the image data
            # TODO: Add the segmentation (decode using the RLE for COCO)
            # NOTE(review): 'area', 'iscrowd' and 'category_id' are passed as
            # given (presumably scalars), whereas sibling builders wrap
            # scalars in a list -- confirm the feature helpers accept both.
            feature = {
                'area': _float_feature(entry['area']),
                'iscrowd': _int64_feature(entry['iscrowd']),
                'bbox': _float_feature(np.ravel(np.array(entry['bbox'], dtype=np.float32))),
                'category_id': _int64_feature(entry['category_id']),
                'image_shape': _int64_feature(image.shape),
                # tobytes() yields the same bytes as the deprecated tostring().
                'image': _bytes_feature(tf.compat.as_bytes(image.tobytes())),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            tf_record_writer.write(example.SerializeToString())

    # Only reached when the whole file was written successfully.
    DATA_STORE.update_hash(record_root)
def is_valid(self, key: str, nohashcheck=False) -> bool:
    """Report whether ``key`` refers to a usable entry in the store.

    Arguments:
        key {str} -- The key to look up in ``self.db``
        nohashcheck {bool} -- When true, skip the hash comparison and accept
            any key that is present

    Returns:
        bool -- ``False`` when the key is absent (previously this case fell
        through and returned ``None``), when the stored hash does not match
        ``adler32`` of the entry's ``fpath``, or when the lookup raises.
        ``True`` when the key is present and the hash check is skipped
        (``nohashcheck`` set, or stored hash is ``None``) or succeeds.
    """
    try:
        if key not in self.db:
            return False
        if nohashcheck:
            return True

        entry = self.db[key]
        if entry['hash'] is None:
            # Nothing recorded to compare against; treat the entry as valid.
            return True
        return str(entry['hash']) == str(adler32(str(entry['fpath'])))
    # Where is the trigger for this error (Karen)?
    except FileNotFoundError:
        log_warning('Key ({}) doesn\'t exist/has moved :O'.format(key))
        return False
    except Exception as ex:
        log_warning('Key ({}) may have been corrupted: {}'.format(
            key, str(ex)))
        return False
def _build_dataset(self, dataset):
    """Write the TFRecord file for one split of the current VOC problem.

    The image index is read from the first entry of ``self.problems`` whose
    name ends with ``dataset``. For every index the JPEG image and the two
    PNG segmentation maps (class and object) are loaded, encoded and stored
    next to the features returned by ``self._load_pascal_annotation``.
    An index for which any of the three images fails to load is logged and
    skipped.

    Arguments:
        dataset {str} -- Suffix selecting the problem index (e.g. the split name)

    Returns:
        int -- The number of examples actually written

    Raises:
        EnvironmentError -- If no problem key ends with ``dataset``
    """
    _problem_key = [p for p in self.problems if p.endswith(dataset)]
    if len(_problem_key) < 1:
        message = "Problem key doesn't exist for {}. ".format(dataset) + str(_problem_key)
        log_warning(message)
        raise EnvironmentError(message)
    problem_key = _problem_key[0]

    tf_record_key = os.path.join(self.voc_root_key, self.problem_name.lower(),
                                 "tfrecord", dataset)

    log_message("Retrieving the index from " + problem_key)
    assert (os.path.exists(DATA_STORE[problem_key]))
    with open(DATA_STORE[problem_key], 'r') as f:
        images_index = [x.strip() for x in f.readlines()]

    record_fpath = DATA_STORE.create_key(tf_record_key, 'data.tfrecords', force=True)

    errors = 0
    log_message("Building {} dataset...".format(dataset))
    total_num_examples = 0

    # The context manager closes the writer even if an entry raises midway.
    with tf.python_io.TFRecordWriter(record_fpath) as tf_record_writer:
        for index in tqdm(images_index):
            img_path = image_path_from_index(index, self.image_path, '.jpg')
            feature_dict = self._load_pascal_annotation(index)

            # Each image is checked for a failed load *before* it is encoded;
            # previously the encoder ran first, so a failed load reached the
            # encoder instead of being skipped.
            image = load_image(img_path)
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        img_path, errors))
                continue

            seg_cls_path = image_path_from_index(index, self.seg_class_path, '.png')
            seg_class = load_image(seg_cls_path)
            if seg_class is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        seg_cls_path, errors))
                continue

            seg_obj_path = image_path_from_index(index, self.seg_obj_path, '.png')
            seg_obj = load_image(seg_obj_path)
            if seg_obj is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        seg_obj_path, errors))
                continue

            feature_dict["image"] = _bytes_feature(
                tf.compat.as_bytes(encode_jpeg(image)))
            feature_dict["seg_class"] = _bytes_feature(
                tf.compat.as_bytes(encode_png(seg_class)))
            feature_dict["seg_obj"] = _bytes_feature(
                tf.compat.as_bytes(encode_png(seg_obj)))

            example = tf.train.Example(features=tf.train.Features(
                feature=feature_dict))
            tf_record_writer.write(example.SerializeToString())
            total_num_examples += 1

    # Only reached when the whole file was written successfully.
    DATA_STORE.update_hash(tf_record_key)
    return total_num_examples
import os
from abc import ABC, abstractmethod
from typing import List

from flux.backend.globals import ROOT_FPATH

# Handle NLTK imports: fetch the 'punkt' data into <ROOT_FPATH>/nltk and add
# that directory to NLTK's search path. Any failure only disables the flag.
try:
    import nltk
    nltk.download('punkt', os.path.join(ROOT_FPATH, 'nltk'), quiet=True)
    nltk.data.path.append(os.path.join(ROOT_FPATH, 'nltk'))
except Exception:
    from flux.util.logging import log_warning
    log_warning(
        'The PunktTokenizer requires NLTK. If you\'re using this, install NLTK using \'pip install nltk\''
    )
    NLTK_IMPORTED = False
else:
    NLTK_IMPORTED = True


class Tokenizer(ABC):
    """Abstract base class for tokenizers."""

    def __init__(self) -> None:
        """No state is set up by the base class."""

    @abstractmethod
    def parse(self, input_string: str) -> List[str]:
        """Turn ``input_string`` into a list of strings; subclasses implement this."""
def _build_dataset(self, dataset: str, shuffle: bool) -> None:
    """Write the CelebA TFRecord file for one split.

    The split is a contiguous window of ``self._img_meta``: training takes
    the first ``TRAIN_PARTITION`` fraction of ``self._num_examples``, and
    validation takes the ``VAL_PARTITION`` fraction that *follows* it.
    (Previously both splits started at index 0, so every validation example
    was also a training example.)

    Each record stores a float label vector (1.0 where the attribute value
    is ``'1'``, else 0.0, one entry per name in ``self.selected_attrs``),
    the image shape and the raw image bytes. Images that fail to load
    (``load_image`` returns ``None``) are logged and skipped.

    Arguments:
        dataset {str} -- Either ``'train'`` or ``'val'``
        shuffle {bool} -- Selects the record file name: ``shuffle.tfrecords``
            when true, ``data.tfrecords`` otherwise

    Raises:
        ValueError -- If ``dataset`` is neither ``'train'`` nor ``'val'``
    """
    if dataset not in ['train', 'val']:
        raise ValueError(
            "Must be building either training or validation dataset")

    record_root = os.path.join(self.root_key, "tfrecord")
    train_size = int(self._num_examples * TRAIN_PARTITION)
    if dataset == 'train':
        record_root = os.path.join(record_root, "train")
        start, size = 0, train_size
    else:
        record_root = os.path.join(record_root, "val")
        # Start right after the training window so the splits are disjoint.
        start, size = train_size, int(self._num_examples * VAL_PARTITION)
    # Never run past the available metadata.
    stop = min(start + size, len(self._img_meta))

    record_fpath = DATA_STORE.create_key(
        record_root,
        'shuffle.tfrecords' if shuffle else "data.tfrecords",
        force=True)

    # Loop over the data and parse
    errors = 0
    log_message('Building {} dataset...'.format(dataset))
    img_path = DATA_STORE[self.keys[0]]

    # The context manager closes the writer even if an entry raises midway.
    with tf.python_io.TFRecordWriter(record_fpath) as tf_record_writer:
        for meta_line in tqdm.tqdm(self._img_meta[start:stop]):
            # Split on any run of whitespace so repeated spaces between
            # columns cannot shift the attribute indices.
            img_meta = meta_line.split()
            file_name = os.path.join(img_path, img_meta[0])
            values = img_meta[1:]

            label = [
                1.0 if values[self.attr2idx[attr_name]] == '1' else 0.0
                for attr_name in self.selected_attrs
            ]
            # All labels should have 40 items. (One hot)
            assert (len(label) == self.num_attr)
            label = np.array(label, dtype=np.float32)

            # Load the image
            image = load_image(file_name)
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        file_name, errors))
                continue

            # Add the image data
            feature = {
                "label": _float_feature(label),
                'image_shape': _int64_feature(image.shape),
                # tobytes() yields the same bytes as the deprecated tostring().
                'image': _bytes_feature(tf.compat.as_bytes(image.tobytes())),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            tf_record_writer.write(example.SerializeToString())

    # Only reached when the whole file was written successfully.
    DATA_STORE.update_hash(record_root)
Arguments: fpath {str} -- The file path of the image to load """ image = cv2.imread(fpath) try: image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) except: try: image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) except: return None return image def encode_jpeg(image: np.ndarray) -> str: return cv2.imencode('.jpg', image)[1].tostring() def encode_png(image: np.ndarray) -> str: return cv2.imencode('.png', image)[1].tostring() def resize_image(image: np.ndarray, shape: Tuple[int, int]) -> np.ndarray: image = cv2.resize(image, (shape[0], shape[1]), interpolation=cv2.INTER_CUBIC) return image except ImportError as ex: print(ex) log_warning( 'Error trying to import CV2 - To use the vision modules make sure opencv is installed.' )
""" Fast-Text embedding vectors """ try: import fastText except Exception as ex: from flux.util.logging import log_warning log_warning('FastText vectors require the fastText python module to be installed. Obtain and install from here: https://github.com/facebookresearch/fastText') raise ex import numpy as np from typing import Dict class FastTextEmbedding(): def __init__(self, model_path:str='fasttext_vectors.bin') -> None: # Need to download the model from Philippe self.model = fastText.load_model(model_path) self.dimension = self.model.get_dimension() def get_word_vector(self,input_string: str) -> np.ndarray: return self.model.get_word_vector(input_string) def get_sentence_vector(self, input_sentence: str) -> np.ndarray: return self.model.get_sentence_vector(input_sentence) def GenerateMatrix(self, dictionary: Dict[str, int]) -> np.ndarray: # Determine the length of the embedding matrix
def _build_images(self, ) -> None:
    """Write the image TFRecord files for the train, val and test splits.

    Iterates ``self._json['images']``; each entry's ``split`` field routes
    it to the val or test writer, and any other value goes to the train
    writer. Every record stores the image shape, the entry's ``image_id``
    and the JPEG-encoded image. Images that fail to load (``load_image``
    returns ``None``) are logged and skipped.

    Raises:
        AssertionError -- If an entry's image file does not exist
    """
    # Define the record roots
    train_record_root = os.path.join(self.train_fpath, "images")
    val_record_root = os.path.join(self.val_fpath, "images")
    test_record_root = os.path.join(self.test_fpath, "images")

    train_record_fpath = DATA_STORE.create_key(
        train_record_root, 'data.tfrecords', force=True)
    val_record_fpath = DATA_STORE.create_key(
        val_record_root, 'data.tfrecords', force=True)
    test_record_fpath = DATA_STORE.create_key(
        test_record_root, 'data.tfrecords', force=True)

    # Loop over the data and parse
    errors = 0
    log_message('Building the image...')
    images = self._json['images']
    total_num_examples = len(images)

    # The context managers close all three writers even if an entry raises.
    with tf.python_io.TFRecordWriter(train_record_fpath) as train_record_writer, \
            tf.python_io.TFRecordWriter(val_record_fpath) as val_record_writer, \
            tf.python_io.TFRecordWriter(test_record_fpath) as test_record_writer:
        # Anything that is not explicitly "val" or "test" goes to train.
        writer_by_split = {
            "val": val_record_writer,
            "test": test_record_writer,
        }

        for entry in tqdm.tqdm(images, total=total_num_examples):
            # Load the image
            filename = entry['filename']
            image_path = os.path.join(self.image_root_path, "images", filename)
            assert os.path.exists(image_path)

            image = load_image(image_path)
            # Check for a failed load before touching the image; previously
            # ``image.shape`` and the encoder ran ahead of this check.
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        image_path, errors))
                continue
            image_shape = list(image.shape)
            encoded_image = encode_jpeg(image)

            # Split the dataset
            tf_record_writer = writer_by_split.get(entry["split"],
                                                   train_record_writer)

            image_id = entry['image_id']
            feature = {
                'image_size': _int64_feature(image_shape),
                'image_id': _int64_feature([image_id]),
                'image': _bytes_feature(tf.compat.as_bytes(encoded_image)),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            tf_record_writer.write(example.SerializeToString())

    # Only reached when all three files were written successfully.
    DATA_STORE.update_hash(test_record_root)
    DATA_STORE.update_hash(train_record_root)
    DATA_STORE.update_hash(val_record_root)
"""
Classes and methods for handling tf-records
"""

import numpy as np
from flux.backend.globals import DATA_STORE

from typing import Tuple, Dict

# TensorFlow is mandatory for this module: warn, then re-raise the failure.
try:
    import tensorflow as tf
except Exception:
    from flux.util.logging import log_warning
    log_warning(
        'TFRecord utilities require Tensorflow! Get it here: https://www.tensorflow.org/ '
    )
    raise


class TFFeature:
    """Bundles a ``tf.train.Feature`` with its name, shape and dtype."""

    def __init__(self,
                 feature: tf.train.Feature,
                 name: str,
                 shape: Tuple,
                 dtype: np.dtype) -> None:
        """Store the four constructor arguments unchanged on the instance."""
        self.feature = feature
        self.name = name
        self.shape = shape
        self.dtype = dtype
def _build_dataset(self, dataset: str) -> None:
    """Write the VQA TFRecord file for one split.

    Questions (``json_q['questions']``) and answers
    (``json_a['annotations']``) are paired by position. Every pair becomes
    one ``tf.train.Example`` with the densely parsed question and answer,
    the answer's class index (assigned on first sight in
    ``self.class_map``) and the JPEG-encoded COCO-2014 image. Entries whose
    image fails to load (``load_image`` returns ``None``) are logged and
    skipped.

    Arguments:
        dataset {str} -- ``'train'`` selects the training files; any other
            value selects the validation files

    Raises:
        AssertionError -- If a question and the annotation at the same
            position carry different ``question_id`` values
    """
    # Pick the record key, JSON files and image directory for the split.
    if dataset == 'train':
        record_root = 'vqa/tfrecord/train'
        json_a = self.train_a_json
        json_q = self.train_q_json
        root_fpath = DATA_STORE['coco2014/data/train/images']
        example_numbers = self.num_train_examples
    else:
        record_root = 'vqa/tfrecord/val'
        json_a = self.val_a_json
        json_q = self.val_q_json
        root_fpath = DATA_STORE['coco2014/data/val/images']
        example_numbers = self.num_val_examples

    record_fpath = DATA_STORE.create_key(record_root, 'data.tfrecords', force=True)

    # Loop over the data and parse
    errors = 0
    log_message('Building {} dataset...'.format(dataset))

    # The context manager closes the writer even if an entry raises midway.
    with tf.python_io.TFRecordWriter(record_fpath) as tf_record_writer:
        for idx, entry in tqdm.tqdm(enumerate(json_q['questions']),
                                    total=example_numbers):
            # Load the image
            image_fpath = build_fpath_from_image_id(root_fpath,
                                                    entry['image_id'], dataset)
            image = load_image(image_fpath)
            # Check for a failed load before encoding; previously the encoder
            # ran first, so a failed load reached it instead of being skipped.
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        image_fpath, errors))
                continue
            encoded_image = encode_jpeg(image)

            # Parse the question and its positionally matching answer
            annotation = json_a['annotations'][idx]
            assert entry['question_id'] == annotation['question_id']

            question_raw = entry['question']
            question_dense, question_len = self.dictionary.dense_parse(
                question_raw,
                word_padding=self.max_word_length,
                char_padding=self.max_char_length)
            answer_raw = annotation['multiple_choice_answer']
            answer_dense, answer_len = self.dictionary.dense_parse(
                answer_raw,
                word_padding=self.max_word_length,
                char_padding=self.max_char_length)

            # Add the class mapping
            if answer_raw not in self.class_map:
                self.class_map[answer_raw] = len(self.class_map)
            answer_class = self.class_map[answer_raw]

            # Add the image data
            feature = {
                'question_word_embedding': _int64_feature(np.ravel(question_dense[0]).astype(np.int64)),
                'question_char_embedding': _int64_feature(np.ravel(question_dense[1]).astype(np.int64)),
                'question_length': _int64_feature([question_len]),
                'answer_word_embedding': _int64_feature(np.ravel(answer_dense[0]).astype(np.int64)),
                'answer_char_embedding': _int64_feature(np.ravel(answer_dense[1]).astype(np.int64)),
                'answer_length': _int64_feature([answer_len]),
                'answer_class': _int64_feature([answer_class]),
                'image': _bytes_feature(tf.compat.as_bytes(encoded_image)),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            tf_record_writer.write(example.SerializeToString())

    # Only reached when the whole file was written successfully.
    DATA_STORE.update_hash(record_root)