Example #1
File: celeba.py Project: qbeer/flux
    def __init__(self,
                 num_parallel_reads: int = 1,
                 force_build=False,
                 force_download=False,
                 shuffle=True):
        file_pair = {
            "img_align_celeba.zip": "0B7EVK8r0v71pZjFTYXZWM3FlRnM",
            "list_attr_celeba.txt": "0B7EVK8r0v71pblRyaVFSWGxPY0U"
        }
        self.root_key = "celebA"
        self.num_attr = 40
        log_message("Retrieving CelebA data")
        self.keys = maybe_download_and_store_google_drive(
            file_pair,
            root_key=self.root_key,
            force_download=force_download,
            use_subkeys=False)
        if len(self.keys) == 0:
            log_warning("Download Failed, change force_download=True")
            return
        self.selected_attrs = None
        # Extract each batch
        log_message('Extracting CelebA data...')

        self._train_db = None
        self._val_db = None
        self.num_parallel_reads = num_parallel_reads
        # Extract labels
        self.attr2idx: Dict = {}
        self.idx2attr: Dict = {}
        log_message("Extracting CelebA labels first")
        info_files = DATA_STORE[self.keys[1]]
        self._process_attr(info_files)
        if force_build:
            if shuffle:
                random.shuffle(self._img_meta)
            # Build Dataset
            self._build_dataset("train", shuffle)
            self._build_dataset("val", shuffle)

        record_root = os.path.join(self.root_key, "tfrecord")
        train_root = os.path.join(record_root, "train")
        val_root = os.path.join(record_root, "val")

        self.train_fpath = DATA_STORE[train_root]
        self.val_fpath = DATA_STORE[val_root]
        self.num_train_examples = sum(
            1 for _ in tf.python_io.tf_record_iterator(self.train_fpath))
        self.num_val_examples = sum(
            1 for _ in tf.python_io.tf_record_iterator(self.val_fpath))

        log_message("Built Complete")
Example #2
File: datastore.py Project: qbeer/flux
    def at_terminate(self):
        """Run at program termination: flush the db-store data and
        remove the working directory.
        """
        # Flush the database
        self.flush()
        # Clean up the working directory
        try:
            shutil.rmtree(self.working_directory)
        except Exception as ex:
            if not self.testing:
                log_warning('Error removing working directory: {}'.format(
                    str(ex)))
Example #3
File: coco.py Project: qbeer/flux
    def _build_dataset(self, dataset: str) -> None:

        if dataset not in ['train', 'val']:
            raise ValueError("Must be building either training or validation dataset")

        # Open the TFRecordWriter
        if dataset == 'train':
            record_root = 'coco2014/tfrecord/train'
            json = self.train_json
            root_fpath = DATA_STORE['coco2014/data/train/images']
        else:
            record_root = 'coco2014/tfrecord/val'
            json = self.val_json
            root_fpath = DATA_STORE['coco2014/data/val/images']

        # Construct the record reader
        tf_record_writer = tf.python_io.TFRecordWriter(DATA_STORE.create_key(record_root, 'data.tfrecords', force=True))

        # Loop over the data and parse
        errors = 0
        log_message('Building {} dataset...'.format(dataset))
        for entry in tqdm.tqdm(json['annotations']):
            # Load the image
            image = load_image(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset))
            if image is None:
                errors += 1
                log_warning('Error loading image: {}. {} Errors so far.'.format(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset), errors))
                continue

            # Parse the caption
            caption_raw = entry['caption']
            caption_dense, caption_len = self.dictionary.dense_parse(caption_raw, word_padding=self.max_word_length, char_padding=self.max_char_length)

            # Add the image data 
            feature = {
                'caption_word_embedding': _int64_feature(np.ravel(caption_dense[0]).astype(np.int64)),
                'caption_char_embedding': _int64_feature(np.ravel(caption_dense[1]).astype(np.int64)),
                'caption_length': _int64_feature([caption_len]),
                'image_shape': _int64_feature(image.shape),
                'image': _bytes_feature(tf.compat.as_bytes(image.tostring())),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            tf_record_writer.write(example.SerializeToString())
        
        tf_record_writer.close()
        DATA_STORE.update_hash(record_root)
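
The writer above stores flattened int64 caption embeddings plus the raw image bytes, so a reader has to undo both steps. A minimal reader sketch using the TensorFlow 1.x API (not part of the project): the feature keys match this example, but the fixed lengths and the DATA_STORE lookup of the written file path are assumptions.

def parse_caption_record(serialized):
    # Feature keys mirror the ones written in _build_dataset above;
    # the variable-length embeddings come back as SparseTensors.
    features = tf.parse_single_example(
        serialized,
        features={
            'caption_word_embedding': tf.VarLenFeature(tf.int64),
            'caption_char_embedding': tf.VarLenFeature(tf.int64),
            'caption_length': tf.FixedLenFeature([1], tf.int64),
            'image_shape': tf.FixedLenFeature([3], tf.int64),
            'image': tf.FixedLenFeature([], tf.string),
        })
    # Rebuild the RGB image from its raw bytes and stored shape
    image = tf.decode_raw(features['image'], tf.uint8)
    image = tf.reshape(image, features['image_shape'])
    return image, features['caption_word_embedding'], features['caption_length']

# Assumes DATA_STORE resolves the record key to the written .tfrecords path
dataset = tf.data.TFRecordDataset(DATA_STORE['coco2014/tfrecord/train'])
dataset = dataset.map(parse_caption_record)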
Example #4
File: coco.py Project: qbeer/flux
    def _build_dataset(self, dataset: str) -> None:

        if dataset not in ['train', 'val']:
            raise ValueError("Must be building either training or validation dataset")

        # Open the TFRecordWriter
        if dataset == 'train':
            record_root = 'coco2017/detection/tfrecord/train'
            json = self.train_json
            root_fpath = DATA_STORE['coco2017/data/train/images']
        else:
            record_root = 'coco2017/detection/tfrecord/val'
            json = self.val_json
            root_fpath = DATA_STORE['coco2017/data/val/images']

        # Construct the record reader
        tf_record_writer = tf.python_io.TFRecordWriter(DATA_STORE.create_key(record_root, 'data.tfrecords', force=True))

        # Loop over the data and parse
        errors = 0
        log_message('Building {} dataset...'.format(dataset))
        for entry in tqdm.tqdm(json['annotations']):
            # Load the image
            image = load_image(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset))
            if image is None:
                errors += 1
                log_warning('Error loading image: {}. {} Errors so far.'.format(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset), errors))
                continue

            # Add the image data 
            # TODO: Add the segmentation (decode using the RLE for COCO)
            # Scalar annotation fields are wrapped in lists so the
            # _int64_feature/_float_feature helpers receive iterables
            feature = {
                'area': _float_feature([entry['area']]),
                'iscrowd': _int64_feature([entry['iscrowd']]),
                'bbox': _float_feature(np.ravel(np.array(entry['bbox'], dtype=np.float32))),
                'category_id': _int64_feature([entry['category_id']]),
                'image_shape': _int64_feature(image.shape),
                'image': _bytes_feature(tf.compat.as_bytes(image.tostring())),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            tf_record_writer.write(example.SerializeToString())
        
        tf_record_writer.close()
        DATA_STORE.update_hash(record_root)
Example #5
File: datastore.py Project: qbeer/flux
    def is_valid(self, key: str, nohashcheck=False) -> bool:
        try:
            if key in self.db:
                if not nohashcheck and self.db[key]['hash'] is not None:
                    if str(self.db[key]['hash']) == str(
                            adler32(str(self.db[key]['fpath']))):
                        return True
                    else:
                        return False
                else:
                    return True

        # Raised when the file behind this key has been moved or deleted
        except FileNotFoundError as ex:
            log_warning('Key ({}) doesn\'t exist/has moved :O'.format(key))
            return False
        except Exception as ex:
            log_warning('Key ({}) may have been corrupted: {}'.format(
                key, str(ex)))
        return False
Example #6
File: pascal.py Project: qbeer/flux
    def _build_dataset(self, dataset):
        _problem_key = [p for p in self.problems if p.endswith(dataset)]
        if len(_problem_key) < 1:
            log_warning("Problem key doesn't exist for {}.  ".format(dataset) +
                        str(_problem_key))
            raise EnvironmentError()
        problem_key = _problem_key[0]
        tf_record_key = os.path.join(self.voc_root_key,
                                     self.problem_name.lower(), "tfrecord",
                                     dataset)
        log_message("Retrieving the index from " + problem_key)
        assert (os.path.exists(DATA_STORE[problem_key]))
        with open(DATA_STORE[problem_key], 'r') as f:
            images_index = [x.strip() for x in f.readlines()]
        tf_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(tf_record_key, 'data.tfrecords', force=True))

        errors = 0
        log_message("Building {} dataset...".format(dataset))
        total_num_examples = 0
        for idx, index in tqdm(enumerate(images_index)):
            img_path = image_path_from_index(index, self.image_path, '.jpg')
            feature_dict = self._load_pascal_annotation(index)

            # Validate each image before encoding it; load_image returns
            # None when a file cannot be read or decoded.
            image = load_image(img_path)
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        img_path, errors))
                continue
            image = encode_jpeg(image)

            seg_cls_path = image_path_from_index(index, self.seg_class_path,
                                                 '.png')
            seg_class = load_image(seg_cls_path)
            if seg_class is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        seg_cls_path, errors))
                continue
            seg_class = encode_png(seg_class)

            seg_obj_path = image_path_from_index(index, self.seg_obj_path,
                                                 '.png')
            seg_obj = load_image(seg_obj_path)
            if seg_obj is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        seg_obj_path, errors))
                continue
            seg_obj = encode_png(seg_obj)
            feature_dict["image"] = _bytes_feature(tf.compat.as_bytes(image))
            feature_dict["seg_class"] = _bytes_feature(
                tf.compat.as_bytes(seg_class))
            feature_dict["seg_obj"] = _bytes_feature(
                tf.compat.as_bytes(seg_obj))

            example = tf.train.Example(features=tf.train.Features(
                feature=feature_dict))
            tf_record_writer.write(example.SerializeToString())
            total_num_examples += 1
        tf_record_writer.close()
        DATA_STORE.update_hash(tf_record_key)
        return total_num_examples
Example #7
import os
from abc import ABC, abstractmethod
from typing import List

from flux.backend.globals import ROOT_FPATH

# Handle NLTK imports
try:
    import nltk
    nltk.download('punkt', os.path.join(ROOT_FPATH, 'nltk'), quiet=True)
    nltk.data.path.append(os.path.join(ROOT_FPATH, 'nltk'))
    NLTK_IMPORTED = True
except Exception as ex:
    from flux.util.logging import log_warning
    log_warning(
        'The PunktTokenizer requires NLTK. If you\'re using this, install NLTK using \'pip install nltk\''
    )
    NLTK_IMPORTED = False


class Tokenizer(ABC):
    """Simple tokenizer base class
    """
    def __init__(self) -> None:
        pass

    @abstractmethod
    def parse(self, input_string: str) -> List[str]:
        pass
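
The base class only fixes the parse() interface, so a concrete tokenizer just has to return a list of tokens. A sketch of what a subclass might look like (the project's actual PunktTokenizer may differ; word_tokenize and the whitespace fallback here are assumptions):

class WordTokenizer(Tokenizer):
    """Splits a string into word tokens, using NLTK when available."""

    def parse(self, input_string: str) -> List[str]:
        if not NLTK_IMPORTED:
            # Fall back to naive whitespace splitting without NLTK
            return input_string.split()
        return nltk.word_tokenize(input_string)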

Example #8
File: celeba.py Project: qbeer/flux
    def _build_dataset(self, dataset: str, shuffle: bool) -> None:

        if dataset not in ['train', 'val']:
            raise ValueError(
                "Must be building either training or validation dataset")

        record_root = os.path.join(self.root_key, "tfrecord")
        # Open the TFRecordWriter
        if dataset == 'train':
            record_root = os.path.join(record_root, "train")
            data_size = self._num_examples * TRAIN_PARTITION
        else:
            record_root = os.path.join(record_root, "val")
            data_size = self._num_examples * VAL_PARTITION
        # Construct the record reader
        tf_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(
                record_root,
                'shuffle.tfrecords' if shuffle else "data.tfrecords",
                force=True))

        # Loop over the data and parse
        errors = 0
        log_message('Building {} dataset...'.format(dataset))
        img_path = DATA_STORE[self.keys[0]]
        for i in tqdm.tqdm(range(int(data_size))):
            img_meta = self._img_meta[i].strip("\n").split(" ")
            file_name = os.path.join(img_path, img_meta[0])
            values = img_meta[1:]
            label = []
            for attr_name in self.selected_attrs:
                idx = self.attr2idx[attr_name]
                if values[idx] == '1':
                    label.append(1.0)
                else:
                    label.append(0.0)
            # Every label vector should contain all 40 attribute flags
            assert len(label) == self.num_attr
            label = np.array(label, dtype=np.float32)
            # Load the image
            image = load_image(file_name)
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        file_name, errors))
                continue

            # Add the image data
            feature = {
                "label": _float_feature(label),
                'image_shape': _int64_feature(image.shape),
                'image': _bytes_feature(tf.compat.as_bytes(image.tostring())),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            tf_record_writer.write(example.SerializeToString())
        tf_record_writer.close()
        DATA_STORE.update_hash(record_root)
Example #9
    def load_image(fpath: str) -> np.ndarray:
        """Load an image from disk and convert it to RGB.
        Returns None if the image cannot be decoded.

        Arguments:
            fpath {str} -- The file path of the image to load
        """
        image = cv2.imread(fpath)
        try:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        except:
            try:
                image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
            except:
                return None
        return image

    def encode_jpeg(image: np.ndarray) -> bytes:
        return cv2.imencode('.jpg', image)[1].tostring()

    def encode_png(image: np.ndarray) -> bytes:
        return cv2.imencode('.png', image)[1].tostring()

    def resize_image(image: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
        image = cv2.resize(image, (shape[0], shape[1]),
                           interpolation=cv2.INTER_CUBIC)
        return image

except ImportError as ex:
    print(ex)
    log_warning(
        'Error trying to import CV2 - To use the vision modules make sure opencv is installed.'
    )
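
A short standalone usage sketch tying the helpers together (the file path is hypothetical): load an image, resize it, and encode it to JPEG bytes ready to be wrapped into a _bytes_feature.

if __name__ == '__main__':
    img = load_image('/path/to/example.jpg')  # hypothetical path
    if img is not None:
        img = resize_image(img, (224, 224))
        jpeg_bytes = encode_jpeg(img)
        print('Encoded {} bytes'.format(len(jpeg_bytes)))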
Example #10
File: fasttext.py Project: qbeer/flux
"""
Fast-Text embedding vectors
"""
try:
    import fastText
except Exception as ex:
    from flux.util.logging import log_warning
    log_warning('FastText vectors require the fastText python module to be installed. Obtain and install from here: https://github.com/facebookresearch/fastText')
    raise ex

import numpy as np

from typing import Dict


class FastTextEmbedding:

    def __init__(self, model_path: str = 'fasttext_vectors.bin') -> None:

        # Need to download the model from Philippe
        self.model = fastText.load_model(model_path)
        self.dimension = self.model.get_dimension()

    def get_word_vector(self, input_string: str) -> np.ndarray:
        return self.model.get_word_vector(input_string)

    def get_sentence_vector(self, input_sentence: str) -> np.ndarray:
        return self.model.get_sentence_vector(input_sentence)

    def GenerateMatrix(self, dictionary: Dict[str, int]) -> np.ndarray:
        # Determine the length of the embedding matrix
Example #11
    def _build_images(self) -> None:
        # Define the Record Root

        # Open the TFRecordWriter
        train_record_root = os.path.join(self.train_fpath, "images")
        val_record_root = os.path.join(self.val_fpath, "images")
        test_record_root = os.path.join(self.test_fpath, "images")

        # Construct the record reader
        train_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(train_record_root,
                                  'data.tfrecords',
                                  force=True))
        val_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(val_record_root,
                                  'data.tfrecords',
                                  force=True))
        test_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(test_record_root,
                                  'data.tfrecords',
                                  force=True))

        # Loop over the data and parse
        errors = 0
        log_message('Building the image...')

        images = self._json['images']

        total_num_examples = len(images)
        for idx, entry in tqdm.tqdm(enumerate(images),
                                    total=total_num_examples):
            # Load the image
            filename = entry['filename']
            image_path = os.path.join(self.image_root_path, "images", filename)
            assert os.path.exists(image_path)
            image = load_image(image_path)
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        image_path, errors))
                continue
            image_shape = list(image.shape)
            image = encode_jpeg(image)

            # Split the dataset
            split = entry["split"]
            if split == "val":
                tf_record_writer = val_record_writer
            elif split == "test":
                tf_record_writer = test_record_writer
            else:
                tf_record_writer = train_record_writer

            image_id = entry['image_id']

            feature = {
                'image_size': _int64_feature(image_shape),
                'image_id': _int64_feature([image_id]),
                'image': _bytes_feature(tf.compat.as_bytes(image)),
            }
            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))

            tf_record_writer.write(example.SerializeToString())

        val_record_writer.close()
        train_record_writer.close()
        test_record_writer.close()
        DATA_STORE.update_hash(test_record_root)
        DATA_STORE.update_hash(train_record_root)
        DATA_STORE.update_hash(val_record_root)
Example #12
"""
Classes and methods for handling tf-records
"""

import numpy as np
from flux.backend.globals import DATA_STORE

from typing import Tuple, Dict

try:
    import tensorflow as tf
except Exception as ex:
    from flux.util.logging import log_warning
    log_warning(
        'TFRecord utilities require Tensorflow! Get it here: https://www.tensorflow.org/ '
    )
    raise ex


class TFFeature():
    """
    Wrapper class for the TF-Feature which contains some metadata
    """
    def __init__(self, feature: tf.train.Feature, name: str, shape: Tuple,
                 dtype: np.dtype) -> None:
        self.feature = feature
        self.name = name
        self.shape = shape
        self.dtype = dtype
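
A small usage sketch for the wrapper (the label values are made up): it pairs an int64 feature with the name, shape, and dtype metadata needed to reconstruct the tensor when the record is parsed back.

label = np.array([3, 1, 4], dtype=np.int64)
wrapped = TFFeature(
    feature=tf.train.Feature(
        int64_list=tf.train.Int64List(value=label.tolist())),
    name='label',
    shape=label.shape,
    dtype=np.int64)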

Example #13
    def _build_dataset(self, dataset: str) -> None:

        # Open the TFRecordWriter
        if dataset == 'train':
            record_root = 'vqa/tfrecord/train'
            json_a = self.train_a_json
            json_q = self.train_q_json
            root_fpath = DATA_STORE['coco2014/data/train/images']
            example_numbers = self.num_train_examples
        else:
            record_root = 'vqa/tfrecord/val'
            json_a = self.val_a_json
            json_q = self.val_q_json
            root_fpath = DATA_STORE['coco2014/data/val/images']
            example_numbers = self.num_val_examples

        # Construct the record reader
        tf_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(record_root, 'data.tfrecords', force=True))

        # Loop over the data and parse
        errors = 0
        log_message('Building {} dataset...'.format(dataset))
        for idx, entry in tqdm.tqdm(enumerate(json_q['questions']),
                                    total=example_numbers):
            # Load the image
            fpath = build_fpath_from_image_id(root_fpath, entry['image_id'],
                                              dataset)
            image = load_image(fpath)
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        fpath, errors))
                continue
            image = encode_jpeg(image)

            # Parse the caption
            assert entry['question_id'] == json_a['annotations'][idx][
                'question_id']
            question_raw = entry['question']
            question_dense, question_len = self.dictionary.dense_parse(
                question_raw,
                word_padding=self.max_word_length,
                char_padding=self.max_char_length)
            answer_raw = json_a['annotations'][idx]['multiple_choice_answer']
            answer_dense, answer_len = self.dictionary.dense_parse(
                answer_raw,
                word_padding=self.max_word_length,
                char_padding=self.max_char_length)

            # Add the class mapping
            if answer_raw not in self.class_map:
                self.class_map[answer_raw] = len(self.class_map)
            answer_class = self.class_map[answer_raw]

            # Add the image data
            feature = {
                'question_word_embedding':
                _int64_feature(np.ravel(question_dense[0]).astype(np.int64)),
                'question_char_embedding':
                _int64_feature(np.ravel(question_dense[1]).astype(np.int64)),
                'question_length':
                _int64_feature([question_len]),
                'answer_word_embedding':
                _int64_feature(np.ravel(answer_dense[0]).astype(np.int64)),
                'answer_char_embedding':
                _int64_feature(np.ravel(answer_dense[1]).astype(np.int64)),
                'answer_length':
                _int64_feature([answer_len]),
                'answer_class':
                _int64_feature([answer_class]),
                'image':
                _bytes_feature(tf.compat.as_bytes(image)),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            tf_record_writer.write(example.SerializeToString())
        tf_record_writer.close()
        DATA_STORE.update_hash(record_root)