def maybe_download_and_store_tar(url: str, root_key: str, description: str = None, use_subkeys=True, **kwargs) -> List[str]:
    # Validate the keys in the directory
    # needs_redownload = False
    # Traverse the key dictionary, and check the integrity of each of the files
    old_keys: List[str] = []
    if DATA_STORE.is_valid(root_key) and validate_subkeys(root_key, old_keys):
        return old_keys

    # This is where the hard work happens
    # First, we have to download the file into the working directory
    data_path = maybe_download(url.split('/')[-1], url, DATA_STORE.working_directory, postprocess=untar, **kwargs)

    # The data path gives us the root key
    keys: List[str] = []
    if use_subkeys:
        keys = register_to_datastore(data_path, root_key, description)
    else:
        DATA_STORE.create_key(root_key, '', force=True)

    return [os.path.join(root_key, k) for k in keys] + [root_key]
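A minimal usage sketch (not part of the original source): the URL and root key mirror the bAbI constructor later in this section, and, per the return statement above, the returned list always ends with the root key itself.

def _demo_maybe_download_and_store_tar():
    keys = maybe_download_and_store_tar(
        url='http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz',
        root_key='tasks_1-20_v1-2')
    # Any registered sub-key resolves back to a path on disk.
    return DATA_STORE.get_file(keys[0])["fpath"]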
def __init__(self, one_hot: bool = False, force_rebuild: bool = False, nohashcheck: bool = True) -> None:
    # Download the Fashion-MNIST data
    self.train_images_key = maybe_download_and_store_single_file(
        url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
        key='fashion_mnist/train_images')
    self.train_labels_key = maybe_download_and_store_single_file(
        url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
        key='fashion_mnist/train_labels')
    self.test_images_key = maybe_download_and_store_single_file(
        url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',
        key='fashion_mnist/test_images')
    self.test_labels_key = maybe_download_and_store_single_file(
        url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
        key='fashion_mnist/test_labels')

    # Validate the file headers before building the dataset
    check_image_file_header(DATA_STORE[self.train_images_key])
    check_labels_file_header(DATA_STORE[self.train_labels_key])
    check_image_file_header(DATA_STORE[self.test_images_key])
    check_labels_file_header(DATA_STORE[self.test_labels_key])

    # Decode the images
    if not DATA_STORE.is_valid('fashion_mnist/pickle') or force_rebuild:
        log_message('Extracting Training Images...')
        self.train_images, self.train_labels = build_dataset(self.train_images_key, self.train_labels_key, one_hot)
        log_message('Extracting Test Images...')
        self.test_images, self.test_labels = build_dataset(self.test_images_key, self.test_labels_key, one_hot)

        pickle_dict = {
            'train_im': self.train_images,
            'train_lb': self.train_labels,
            'test_im': self.test_images,
            'test_lb': self.test_labels,
        }
        with open(DATA_STORE.create_key('fashion_mnist/pickle', 'mnist.pkl', force=True), 'wb') as pkl_file:
            pickle.dump(pickle_dict, pkl_file)
        DATA_STORE.update_hash('fashion_mnist/pickle')
    else:
        with open(DATA_STORE['fashion_mnist/pickle'], 'rb') as pkl_file:
            pickle_dict = pickle.load(pkl_file)
        self.train_images = pickle_dict['train_im']
        self.test_images = pickle_dict['test_im']
        self.train_labels = pickle_dict['train_lb']
        self.test_labels = pickle_dict['test_lb']
def read_file_from_db(self, is_train, task_key, sample=True):
    if sample and not is_train:
        task_path = task_key + "_test"
    else:
        task_path = task_key + "_train" if is_train else task_key
    if not DATA_STORE.is_valid(task_path):
        raise NameError("{0} does not exist.".format(task_path))
    return DATA_STORE.get_file(task_path)["fpath"]
def __init__(self, version: str = '2.0', num_parallel_reads: Optional[int] = None, force_rebuild=False, nohashcheck=False) -> None:
    self.num_parallel_reads = num_parallel_reads
    self.num_val_examples = None
    self.num_train_examples = None

    if version == '2.0':
        self.training_data_json_key = maybe_download_and_store_single_file(
            url='https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json', key='squad/train_json')
        self.dev_data_json_key = maybe_download_and_store_single_file(
            url='https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json', key='squad/dev_json')

        # Load the JSON from the files
        with open(DATA_STORE[self.training_data_json_key], 'r') as train_json:
            self.train_json = json.loads(train_json.read())
        with open(DATA_STORE[self.dev_data_json_key], 'r') as dev_json:
            self.dev_json = json.loads(dev_json.read())

        # Setup some baked constants in the dataset
        self.mwl = 766  # maximum context length, in words
        self.mcl = 37   # maximum token length, in characters
        self.mql = 30   # maximum question/answer length, in words

        # Parse the JSON
        if not force_rebuild and DATA_STORE.is_valid('squad/dictionary', nohashcheck=nohashcheck):
            with open(DATA_STORE['squad/dictionary'], 'rb') as pkl_file:
                self.dictionary = pickle.load(pkl_file)
        else:
            self.dictionary = NLPDictionary()

        # Build the training set if necessary
        self.num_train_examples = self.build_dataset(train=True, force_rebuild=force_rebuild, nohashcheck=nohashcheck)
        if self.num_train_examples is None or self.num_train_examples == 0:
            self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['squad/tfrecord/train']))

        # Build the validation set if necessary
        self.num_val_examples = self.build_dataset(train=False, force_rebuild=force_rebuild, nohashcheck=nohashcheck)
        if self.num_val_examples is None or self.num_val_examples == 0:
            self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['squad/tfrecord/dev']))

        self.train_fpath = DATA_STORE['squad/tfrecord/train']
        self.dev_fpath = DATA_STORE['squad/tfrecord/dev']

        # Save the dictionary
        with open(DATA_STORE.create_key('squad/dictionary', 'dict.pkl', force=True), 'wb') as pkl_file:
            pickle.dump(self.dictionary, pkl_file)
        DATA_STORE.update_hash('squad/dictionary')

        self.word_vocab_size = len(self.dictionary.word_dictionary)
        self.char_vocab_size = len(self.dictionary.char_dictionary)
        self._val_db = None
        self._train_db = None
    else:
        raise NotImplementedError("Only version 2.0 is currently supported")
def read_file_from_db(self, is_training: bool, train_key: List, val_key: List) -> str:
    if is_training:
        if train_key is None:
            raise Exception("Initialized with validation only. Training data not downloaded.")
        return DATA_STORE.get_file(train_key[0])["fpath"]
    return DATA_STORE.get_file(val_key[0])["fpath"]
def write_csv_file(root_key, filename, description):
    data_path = os.path.join(root_key, filename)
    mkdir_p(root_key)
    open(data_path, 'a+').close()
    # Strip the file extension to form the datastore key
    key = data_path[:data_path.rfind('.')] if data_path.rfind('.') > 0 else data_path
    DATA_STORE.add_file(key, data_path, description, force=True)
    return data_path
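An illustrative call (the key, filename, and description are hypothetical): write_csv_file creates an empty CSV under the root key, registers it in the datastore, and hands back the path for appending.

def _demo_write_csv_file():
    csv_path = write_csv_file('experiments/results', 'metrics.csv', 'per-epoch metrics')  # hypothetical names
    with open(csv_path, 'a') as f:
        f.write('epoch,loss\n')
    return csv_path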
def build_dataset(self, train, force_rebuild=False, nohashcheck=False):
    record_root = 'squad/tfrecord/train' if train else 'squad/tfrecord/dev'
    json_data = self.train_json['data'] if train else self.dev_json['data']
    num_errors = 0
    num_documents = 0

    if force_rebuild or not DATA_STORE.is_valid(record_root, nohashcheck=nohashcheck):
        log_message('Building dataset ({})...'.format('Train' if train else 'Valid'))
        tf_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(record_root, 'data.tfrecords', force=force_rebuild))

        for article in tqdm.tqdm(json_data):
            for paragraph_json in article['paragraphs']:
                # Compute the context embedding
                context_tokens = self.dictionary.tokenizer.parse(paragraph_json['context'].strip().replace('\n', ''))
                context_dense, context_len = self.dictionary.dense_parse_tokens(
                    context_tokens, word_padding=self.mwl, char_padding=self.mcl)

                # Compute the QA embeddings
                for question_answer in paragraph_json['qas']:
                    question_dense, question_len = self.dictionary.dense_parse(
                        question_answer['question'].strip().replace('\n', ''),
                        word_padding=self.mql, char_padding=self.mcl)

                    # For each answer
                    for answer in question_answer['answers']:
                        answer_dense, answer_len = self.dictionary.dense_parse(
                            answer['text'], word_padding=self.mql, char_padding=self.mcl)

                        # Character span start/end
                        span_start = answer['answer_start']
                        span_end = span_start + len(answer['text'])

                        # Get the token span from the char span
                        token_span_start, token_span_end = get_token_span_from_char_span(
                            paragraph_json['context'].strip().replace('\n', ''),
                            context_tokens, span_start, span_end)

                        if token_span_start < 0 or token_span_end < 0:
                            num_errors += 1
                            break

                        # Now that we've got the contents, let's make a TF-Record
                        # We're going to handle the tf-record writing here for now
                        # TODO: Move the tf-record writing to its own file
                        feature_dict = self.build_feature_dict(
                            context_dense, question_dense, answer_dense,
                            span_start, span_end, token_span_start, token_span_end,
                            context_len, question_len, answer_len)

                        example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
                        tf_record_writer.write(example.SerializeToString())
                        num_documents += 1

        tf_record_writer.close()
        DATA_STORE.update_hash(record_root)

    return num_documents
def _build_dataset(self, mode="train", force_rebuild=False, nohashcheck=False):
    # For now, we will not use the provided vocab
    record_root = os.path.join(self.root_key, "tfrecord", mode)

    if force_rebuild or not DATA_STORE.is_valid(record_root, nohashcheck=nohashcheck):
        log_message('Building dataset ({})...'.format(mode))
        tf_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(record_root, 'data.tfrecords', force=force_rebuild))

        if mode == "train":
            eng_file = self.train_eng
            for_file = self.train_for
        elif mode == "test":  # was a bare `if`, which let the else clause clobber the train files
            eng_file = self.test_eng
            for_file = self.test_for
        else:
            eng_file = self.val_eng
            for_file = self.val_for

        with codecs.getreader("utf-8")(tf.gfile.GFile(DATA_STORE[eng_file], mode="rb")) as f:
            eng_data = f.read().splitlines()
        with codecs.getreader("utf-8")(tf.gfile.GFile(DATA_STORE[for_file], mode="rb")) as f:
            for_data = f.read().splitlines()

        for i, line in tqdm.tqdm(enumerate(eng_data)):
            src_dense, src_len = self.src_dictionary.dense_parse(line, word_padding=self.mwl, char_padding=0)
            for_line = for_data[i]
            for_dense, for_len = self.dst_dictionary.dense_parse(for_line, word_padding=self.mwl, char_padding=0)
            feature_dict = self.build_feature_dict(src_dense[0], for_dense[0], src_len, for_len)
            example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
            tf_record_writer.write(example.SerializeToString())

        tf_record_writer.close()
        DATA_STORE.update_hash(record_root)
        return len(eng_data)
    else:
        return sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[record_root]))
def build_dataset(self, train, sample=True, force_rebuild=False, nohashcheck=False):
    num_tasks = 0
    record_root = self.train_record_root if train else self.val_record_root
    record_name = "sample.tfrecords" if sample else "data.tfrecords"
    subset = self.subset
    if not train:
        subset = subset + "-valid"
    if not sample:
        subset = subset + "-10k"

    if force_rebuild:
        log_message('Building dataset ({})...'.format('Train' if train else 'Valid'))
        # Keep the template in its own variable so formatting it inside the loop
        # does not destroy it for the next task (the original reassigned task_path).
        task_path_template = "{0}/{1}/{2}/{3}"
        for task in tqdm.tqdm(bAbI_20.task_list):
            if not train:
                task_id = task.split("_")[0]
            else:
                task_id = task
            task_tf_root = os.path.join(record_root, subset, task_id)
            tf_record_writer = tf.python_io.TFRecordWriter(
                DATA_STORE.create_key(task_tf_root, record_name, force=force_rebuild))
            task_path = task_path_template.format(self.task_root, self.task_root, subset, task_id)
            data_path = self.read_file_from_db(train, task_path)
            txt = self.read_txt(data_path)
            features = self.parse_context_question(txt)
            for feature_dict in features:
                example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
                tf_record_writer.write(example.SerializeToString())
            tf_record_writer.close()
            DATA_STORE.update_hash(task_tf_root)
            num_tasks += 1
    return num_tasks
def _build_dataset(self, dataset: str) -> None:
    if dataset not in ['train', 'val']:
        raise ValueError("Must be building either training or validation dataset")

    # Open the TFRecordWriter
    if dataset == 'train':
        record_root = 'coco2014/tfrecord/train'
        annotation_json = self.train_json  # renamed from `json` to avoid shadowing the module
        root_fpath = DATA_STORE['coco2014/data/train/images']
    else:
        record_root = 'coco2014/tfrecord/val'
        annotation_json = self.val_json
        root_fpath = DATA_STORE['coco2014/data/val/images']

    # Construct the record writer
    tf_record_writer = tf.python_io.TFRecordWriter(DATA_STORE.create_key(record_root, 'data.tfrecords', force=True))

    # Loop over the data and parse
    errors = 0
    log_message('Building {} dataset...'.format(dataset))
    for entry in tqdm.tqdm(annotation_json['annotations']):
        # Load the image
        image = load_image(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset))
        if image is None:
            errors += 1
            log_warning('Error loading image: {}. {} Errors so far.'.format(
                build_fpath_from_image_id(root_fpath, entry['image_id'], dataset), errors))
            continue

        # Parse the caption
        caption_raw = entry['caption']
        caption_dense, caption_len = self.dictionary.dense_parse(
            caption_raw, word_padding=self.max_word_length, char_padding=self.max_char_length)

        # Add the image data
        feature = {
            'caption_word_embedding': _int64_feature(np.ravel(caption_dense[0]).astype(np.int64)),
            'caption_char_embedding': _int64_feature(np.ravel(caption_dense[1]).astype(np.int64)),
            'caption_length': _int64_feature([caption_len]),
            'image_shape': _int64_feature(image.shape),
            'image': _bytes_feature(tf.compat.as_bytes(image.tostring())),
        }

        # Write the TF-Record
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        tf_record_writer.write(example.SerializeToString())

    tf_record_writer.close()
    DATA_STORE.update_hash(record_root)
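For reference, a sketch (not in the original source) of how a record written above could be parsed back with the same TF 1.x API used throughout this section. The defaults follow the max_word_length=50 / max_char_length=16 set in the captions constructor below; the flat char-embedding length assumes the dense parse produces a (words x chars) tensor, and image_shape assumes 3-D (H, W, C) images.

import tensorflow as tf

def _parse_caption_record(serialized_example, max_word_length=50, max_char_length=16):
    # Shapes mirror the ravel()-ed dense parses written by _build_dataset above.
    features = {
        'caption_word_embedding': tf.FixedLenFeature([max_word_length], tf.int64),
        'caption_char_embedding': tf.FixedLenFeature([max_word_length * max_char_length], tf.int64),
        'caption_length': tf.FixedLenFeature([1], tf.int64),
        'image_shape': tf.FixedLenFeature([3], tf.int64),
        'image': tf.FixedLenFeature([], tf.string),
    }
    parsed = tf.parse_single_example(serialized_example, features)
    # Recover the raw image bytes into the original shape.
    image = tf.reshape(tf.decode_raw(parsed['image'], tf.uint8), parsed['image_shape'])
    return image, parsed['caption_word_embedding'], parsed['caption_length']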
def register_to_datastore(data_path, root_key, description):
    root_length = len(data_path.split('/'))
    new_keys: List[str] = []
    DATA_STORE.create_key(root_key, '', force=True)
    for root, _, filenames in os.walk(data_path):
        for filename in filenames:
            if not filename.endswith(".zip"):
                # Derive the key from the path relative to data_path, minus the extension
                key = '/'.join(os.path.join(root, filename).split('/')[root_length:])
                key = key[:key.rfind('.')] if key.rfind('.') > 0 else key
                new_keys.append(key)
                DATA_STORE.add_file(os.path.join(root_key, key), os.path.join(root, filename), description, force=True)
    return new_keys
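A worked illustration (hypothetical paths) of the key derivation above: everything below data_path becomes the key, minus the file extension.

def _demo_register_key_derivation():
    data_path = '/work/tasks_1-20_v1-2'                    # hypothetical extraction directory
    fpath = '/work/tasks_1-20_v1-2/en/qa1_train.txt'       # hypothetical extracted file
    root_length = len(data_path.split('/'))                # 3
    key = '/'.join(fpath.split('/')[root_length:])         # 'en/qa1_train.txt'
    key = key[:key.rfind('.')] if key.rfind('.') > 0 else key
    assert key == 'en/qa1_train'                           # stored under os.path.join(root_key, key)
    return key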
def retrieve_subkeys(root_key):
    keys = []
    for key in DATA_STORE.db.keys():
        if key.startswith(root_key) and key != root_key:
            if DATA_STORE.is_valid(key):
                keys.append(key)
    return keys
def _build_dataset(self, dataset: str) -> None:
    if dataset not in ['train', 'val']:
        raise ValueError("Must be building either training or validation dataset")

    # Open the TFRecordWriter
    if dataset == 'train':
        record_root = 'coco2017/detection/tfrecord/train'
        annotation_json = self.train_json  # renamed from `json` to avoid shadowing the module
        root_fpath = DATA_STORE['coco2017/data/train/images']
    else:
        record_root = 'coco2017/detection/tfrecord/val'
        annotation_json = self.val_json
        root_fpath = DATA_STORE['coco2017/data/val/images']

    # Construct the record writer
    tf_record_writer = tf.python_io.TFRecordWriter(DATA_STORE.create_key(record_root, 'data.tfrecords', force=True))

    # Loop over the data and parse
    errors = 0
    log_message('Building {} dataset...'.format(dataset))
    for entry in tqdm.tqdm(annotation_json['annotations']):
        # Load the image
        image = load_image(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset))
        if image is None:
            errors += 1
            log_warning('Error loading image: {}. {} Errors so far.'.format(
                build_fpath_from_image_id(root_fpath, entry['image_id'], dataset), errors))
            continue

        # Add the image data
        # TODO: Add the segmentation (decode using the RLE for COCO)
        feature = {
            'area': _float_feature(entry['area']),
            'iscrowd': _int64_feature(entry['iscrowd']),
            'bbox': _float_feature(np.ravel(np.array(entry['bbox'], dtype=np.float32))),
            'category_id': _int64_feature(entry['category_id']),
            'image_shape': _int64_feature(image.shape),
            'image': _bytes_feature(tf.compat.as_bytes(image.tostring())),
        }

        # Write the TF-Record
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        tf_record_writer.write(example.SerializeToString())

    tf_record_writer.close()
    DATA_STORE.update_hash(record_root)
def __init__(self, num_parallel_reads: int = 1, force_rebuild: bool = False) -> None:
    # Download the annotations and images if necessary
    if not DATA_STORE.is_valid('coco2017/data/annotations') or force_rebuild:
        maybe_download_and_store_zip('http://images.cocodataset.org/annotations/annotations_trainval2017.zip', 'coco2017/data/annotations', use_subkeys=False)
    if not DATA_STORE.is_valid('coco2017/data/train/images') or force_rebuild:
        maybe_download_and_store_zip('http://images.cocodataset.org/zips/train2017.zip', 'coco2017/data/train/images', use_subkeys=False)
    if not DATA_STORE.is_valid('coco2017/data/val/images') or force_rebuild:
        maybe_download_and_store_zip('http://images.cocodataset.org/zips/val2017.zip', 'coco2017/data/val/images', use_subkeys=False)

    # TODO ([email protected]) Need to make sure that this works - there could be download issues, but it's hard to say
    self.train_json_key = 'coco2017/data/annotations'
    self.val_json_key = 'coco2017/data/annotations'

    # Now that we have the data, load and parse the JSON files
    need_rebuild_train = force_rebuild
    if not DATA_STORE.is_valid('coco2017/detection/tfrecord/train') or need_rebuild_train:
        need_rebuild_train = True
        with open(os.path.join(DATA_STORE[self.train_json_key], 'annotations/instances_train2017.json'), 'r') as annotation_file:
            self.train_json = json.loads(annotation_file.read())

    need_rebuild_val = force_rebuild
    if not DATA_STORE.is_valid('coco2017/detection/tfrecord/val') or need_rebuild_val:
        need_rebuild_val = True
        with open(os.path.join(DATA_STORE[self.val_json_key], 'annotations/instances_val2017.json'), 'r') as annotation_file:
            self.val_json = json.loads(annotation_file.read())

    # Setup some default options for the dataset
    self._val_db = None
    self._train_db = None
    self.num_parallel_reads = num_parallel_reads

    # Build the tfrecord dataset from the JSON
    if need_rebuild_train:
        self._build_dataset('train')
    if need_rebuild_val:
        self._build_dataset('val')

    self.train_fpath = DATA_STORE['coco2017/detection/tfrecord/train']
    self.val_fpath = DATA_STORE['coco2017/detection/tfrecord/val']

    # Compute the size of the datasets
    self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['coco2017/detection/tfrecord/train']))
    self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['coco2017/detection/tfrecord/val']))
def maybe_download_and_store_single_file(url: str, key: str, description: str = None, postprocess=None, **kwargs) -> str:
    if not DATA_STORE.is_valid(key):
        # This is where the hard work happens
        # First, we have to download the file into the working directory
        if postprocess is None:
            data_path = maybe_download(url.split('/')[-1], url, DATA_STORE.working_directory)
        else:
            data_path = maybe_download(url.split('/')[-1], url, DATA_STORE.working_directory, postprocess=postprocess, **kwargs)
        DATA_STORE.add_file(key, data_path, description, force=True)
    return key
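Example usage, mirroring the Fashion-MNIST calls earlier in this section: the function returns the key, which DATA_STORE resolves back to a local path.

def _demo_single_file_download():
    key = maybe_download_and_store_single_file(
        url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
        key='fashion_mnist/train_images')
    return DATA_STORE[key]  # local filesystem path of the stored file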
def val_data_stream(self):
    if self.images_val is None:
        raise Exception("Download val_data first by setting validation_only to True")
    for img_id, q_id, a_id in self.index:
        img_key = "vqa/images-val/val2014/COCO_val2014_%012d" % img_id
        img_file = DATA_STORE.get_file(img_key)
        img = self.image_from_file(img_file["fpath"])
        q = self.question_index[q_id]
        # TODO: Fix this hardcode
        a = self.answer_index(True)[(q_id, a_id)]
        yield (img, q, a)
def maybe_download_and_store_zip(url: str, root_key: str, description: str = None, use_subkeys=True, **kwargs) -> List[str]:
    old_keys: List[str] = []
    if DATA_STORE.is_valid(root_key) and validate_subkeys(root_key, old_keys):
        return old_keys

    # Ensure one layer file structure for zip file? TODO (Karen)
    data_path = maybe_download(file_name=url.split("/")[-1], source_url=url,
                               work_directory=DATA_STORE.working_directory, postprocess=unzip, **kwargs)

    keys: List[str] = []
    if use_subkeys:
        keys = register_to_datastore(data_path, root_key, description)
        # DATA_STORE.create_key(root_key, 'root.key', force=True)
        # Removed: that call deletes every file stored by the preceding
        # register_to_datastore. (Karen)
    else:
        DATA_STORE.add_folder(root_key, data_path, force=True)
    return [os.path.join(root_key, k) for k in keys]
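A usage sketch showing the two calling conventions seen elsewhere in this section: use_subkeys=True registers each extracted file under its own sub-key, while use_subkeys=False registers the whole extracted folder under the root key (the convention the COCO constructors use; pairing the annotations URL with use_subkeys=True here is purely illustrative).

def _demo_zip_download():
    # Register every extracted file as an individual sub-key.
    keys = maybe_download_and_store_zip(
        'http://images.cocodataset.org/annotations/annotations_trainval2017.zip',
        'coco2017/data/annotations', use_subkeys=True)
    # Or register the extracted directory under a single key.
    maybe_download_and_store_zip(
        'http://images.cocodataset.org/zips/val2017.zip',
        'coco2017/data/val/images', use_subkeys=False)
    return keys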
def maybe_download_and_store_google_drive(file_pair: Dict[str, str], root_key: str, description: str = None, force_download: bool = False, use_subkeys=True, **kwargs) -> List[str]:
    old_keys: List[str] = []
    if not force_download and DATA_STORE.is_valid(root_key) and validate_subkeys(root_key, old_keys):
        return old_keys

    keys = []
    DATA_STORE.create_key(root_key, 'root.key', force=True)
    for file_name in file_pair:
        log_message("Downloading " + file_name)
        file_id = file_pair[file_name]
        file_dest = os.path.join(DATA_STORE.working_directory, file_name)
        data_path = maybe_download_google_drive(file_id, file_dest, force_download=force_download)
        data_path = post_process(data_path)
        log_message("Decompressed " + file_name + " to " + data_path)
        if os.path.isdir(data_path):
            if use_subkeys:
                _keys = register_to_datastore(data_path, root_key, description)
                keys.extend(_keys)
            else:
                data_key = os.path.join(root_key, file_name.split(".zip")[0])
                DATA_STORE.add_folder(data_key, data_path, force=True)
                keys.append(data_key)
        else:
            _key = os.path.join(root_key, file_name.split(".")[0])
            DATA_STORE.add_file(_key, data_path, description, force=True)
            keys.append(_key)
        log_message("Completed " + file_name)

    DATA_STORE.create_key(root_key, 'root.key', force=True)
    return keys + [root_key]
def train_data_stream(self):
    if self.images_train is None:
        raise Exception("Download train_data first by setting validation_only to False")
    if not self.is_training:
        raise Exception("Build index first with build_index(is_training=True)")
    for img_id, q_id, a_id in self.index:
        img_key = "vqa/images-train/train2014/COCO_train2014_%012d" % img_id
        img_file = DATA_STORE.get_file(img_key)
        img = self.image_from_file(img_file["fpath"])
        q = self.question_index[q_id]
        # TODO: Fix this hardcode
        a = self.answer_index(True)[(q_id, a_id)]
        yield (img, q, a)
def build_db(self, sample, is_train, db, subset) -> Dict:
    record_root = self.train_record_root if is_train else self.val_record_root
    if db is not None:
        return db
    task_dict = {}
    record_name = "sample.tfrecords" if sample else "data.tfrecords"
    for task in bAbI_20.task_list:
        if not is_train:
            task = task.split("_")[0]
        task_tf_root = os.path.join(record_root, subset, task)
        if not DATA_STORE.is_valid(task_tf_root):
            raise NotImplementedError("{} not built".format(task_tf_root))
        task_dict[task] = tf.data.TFRecordDataset(
            DATA_STORE[task_tf_root],
            num_parallel_reads=self.num_parallel_reads).map(self._map_fn)
    return task_dict
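A sketch of how the per-task datasets returned by build_db might be consumed, assuming a constructed bAbI instance; the task name is illustrative, and the iterator API follows the TF 1.x style used throughout this section.

def _demo_iterate_task(babi, task_name='qa1_single-supporting-fact'):
    # `babi` is a constructed bAbI dataset object; db=None forces a fresh build.
    task_dict = babi.build_db(sample=True, is_train=True, db=None, subset='en')
    dataset = task_dict[task_name].batch(32)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()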
def validate_subkeys(root_key, old_keys=None):
    """Validates the sub-keys of a root key.

    Arguments:
        root_key {str} -- the root key whose sub-keys should be checked

    Keyword Arguments:
        old_keys {list} -- optional output list; populated with every
            discovered sub-key (default: {None})

    Returns:
        bool -- True if every sub-key under root_key is valid
    """
    if old_keys is None:
        old_keys: List[str] = []
    for key in DATA_STORE.db.keys():
        if key.startswith(root_key) and key != root_key:
            old_keys.append(key)
            if not DATA_STORE.is_valid(key):
                return False
    return True
def __init__(self, num_parallel_reads=1, sample_only=True, force_rebuild=True, nohashcheck=True, subset="en", wml=8, cml=10):
    self.task_root = "tasks_1-20_v1-2"
    self.subset = subset
    url = "http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz"
    self.keys = maybe_download_and_store_tar(url=url, root_key=self.task_root)
    self.nlp_dict = NLPDictionary()
    self.wml = wml
    self.cml = cml
    self.num_parallel_reads = num_parallel_reads
    dict_name = self.task_root + "/dictionary"
    self.train_record_root = 'tasks_1-20_v1-2/tfrecord/train'
    self.val_record_root = 'tasks_1-20_v1-2/tfrecord/dev'

    if not force_rebuild and DATA_STORE.is_valid(dict_name, nohashcheck=nohashcheck):
        with open(DATA_STORE[dict_name], 'rb') as pkl_file:
            self.dictionary = pickle.load(pkl_file)
    else:
        self.dictionary = NLPDictionary()

    # Build the sample training set if necessary
    self.sample_num_train_examples = self.build_dataset(train=True, sample=True, force_rebuild=force_rebuild, nohashcheck=nohashcheck)
    # if self.num_train_examples is None:
    #     self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.train_record_root]))

    # Build the sample validation set if necessary
    self.sample_num_val_examples = self.build_dataset(train=False, sample=True, force_rebuild=force_rebuild, nohashcheck=nohashcheck)
    # if self.num_val_examples is None:
    #     self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.val_record_root]))

    # Build the full training set if necessary
    self.num_train_examples = self.build_dataset(train=True, sample=False, force_rebuild=force_rebuild, nohashcheck=nohashcheck)

    # Build the full validation set if necessary
    self.num_val_examples = self.build_dataset(train=False, sample=False, force_rebuild=force_rebuild, nohashcheck=nohashcheck)

    # self.train_fpath = DATA_STORE[self.train_record_root]
    # self.dev_fpath = DATA_STORE[self.val_record_root]

    # Save the dictionary
    with open(DATA_STORE.create_key(dict_name, 'dict.pkl', force=True), 'wb') as pkl_file:
        pickle.dump(self.dictionary, pkl_file)
    DATA_STORE.update_hash(dict_name)

    self.word_vocab_size = len(self.dictionary.word_dictionary)
    self.char_vocab_size = len(self.dictionary.char_dictionary)
    self._sample_val_db = None
    self._sample_train_db = None
    self._train_db = None
    self._val_db = None
    # TODO: Add Shuffle Dataset if necessary.
    print("Build Complete.")
def _build_dataset(self) -> None:
    # Define the record roots
    train_record_root = os.path.join(self.train_fpath, "data")
    val_record_root = os.path.join(self.val_fpath, "data")
    test_record_root = os.path.join(self.test_fpath, "data")

    # Construct the record writers
    train_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(train_record_root, 'data.tfrecords', force=True))
    val_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(val_record_root, 'data.tfrecords', force=True))
    test_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(test_record_root, 'data.tfrecords', force=True))

    # Loop over the data and parse
    errors = 0
    log_message('Building the dataset...')
    images = self._json['images']
    if self.data_type == "pointing":
        boxes = self._json['boxes']
        boxes_dict = {d["box_id"]: d for d in boxes}
    total_num_examples = len(images)

    for idx, entry in tqdm.tqdm(enumerate(images), total=total_num_examples):
        # Route the example to the writer for its split
        split = entry["split"]
        if split == "val":
            tf_record_writer = val_record_writer
            self.num_val_examples += 1
        elif split == "test":
            tf_record_writer = test_record_writer
            self.num_test_examples += 1
        else:
            tf_record_writer = train_record_writer
            self.num_train_examples += 1

        image_id = entry['image_id']
        qa_pairs = entry['qa_pairs']
        for qa in qa_pairs:
            question_raw = qa['question']
            question_type = qa['type']
            qa_id = qa['qa_id']
            mlt_choice = qa["multiple_choices"]
            answer = qa['answer']
            assert len(mlt_choice) == 3

            question_dense, question_len = self.dictionary.dense_parse(
                question_raw, word_padding=self.max_word_length, char_padding=self.max_char_length)

            if self.data_type == "telling":
                answer_dense, answer_len = self.dictionary.dense_parse(
                    answer, word_padding=self.max_word_length, char_padding=self.max_char_length)
                m1_dense, m1_len = self.dictionary.dense_parse(
                    mlt_choice[0], word_padding=self.max_word_length, char_padding=self.max_char_length)
                m2_dense, m2_len = self.dictionary.dense_parse(
                    mlt_choice[1], word_padding=self.max_word_length, char_padding=self.max_char_length)
                m3_dense, m3_len = self.dictionary.dense_parse(
                    mlt_choice[2], word_padding=self.max_word_length, char_padding=self.max_char_length)

                # Add the question/answer data
                feature = {
                    'question_word_embedding': _int64_feature(np.ravel(question_dense[0]).astype(np.int64)),
                    'question_char_embedding': _int64_feature(np.ravel(question_dense[1]).astype(np.int64)),
                    'question_length': _int64_feature([question_len]),
                    'ans_word_embedding': _int64_feature(np.ravel(answer_dense[0]).astype(np.int64)),
                    'ans_char_embedding': _int64_feature(np.ravel(answer_dense[1]).astype(np.int64)),
                    'ans_length': _int64_feature([answer_len]),
                    'm1_embedding': _int64_feature(np.ravel(m1_dense[0]).astype(np.int64)),
                    'm1_char_embedding': _int64_feature(np.ravel(m1_dense[1]).astype(np.int64)),
                    'm2_embedding': _int64_feature(np.ravel(m2_dense[0]).astype(np.int64)),
                    'm2_char_embedding': _int64_feature(np.ravel(m2_dense[1]).astype(np.int64)),
                    'm3_embedding': _int64_feature(np.ravel(m3_dense[0]).astype(np.int64)),
                    'm3_char_embedding': _int64_feature(np.ravel(m3_dense[1]).astype(np.int64)),
                    'mc_len': _int64_feature([m1_len, m2_len, m3_len]),
                    "q_type": _bytes_feature(tf.compat.as_bytes(question_type)),
                    'qa_id': _int64_feature([qa_id]),
                    'image_id': _int64_feature([image_id]),
                }
            else:
                # Pointing questions: the answer and choices are boxes
                answer_loc, answer_dense, answer_len = self.get_boxes(answer, boxes_dict)
                m1_loc, m1_dense, m1_len = self.get_boxes(mlt_choice[0], boxes_dict)
                m2_loc, m2_dense, m2_len = self.get_boxes(mlt_choice[1], boxes_dict)
                m3_loc, m3_dense, m3_len = self.get_boxes(mlt_choice[2], boxes_dict)
                coord = answer_loc + m1_loc + m2_loc + m3_loc

                # Add the question/answer data
                feature = {
                    'question_word_embedding': _int64_feature(np.ravel(question_dense[0]).astype(np.int64)),
                    'question_char_embedding': _int64_feature(np.ravel(question_dense[1]).astype(np.int64)),
                    'question_length': _int64_feature([question_len]),
                    'ans_word_embedding': _int64_feature(np.ravel(answer_dense[0]).astype(np.int64)),
                    'ans_char_embedding': _int64_feature(np.ravel(answer_dense[1]).astype(np.int64)),
                    'ans_length': _int64_feature([answer_len]),
                    "coordinate": _int64_feature(coord),
                    'm1_embedding': _int64_feature(np.ravel(m1_dense[0]).astype(np.int64)),
                    'm1_char_embedding': _int64_feature(np.ravel(m1_dense[1]).astype(np.int64)),
                    'm2_embedding': _int64_feature(np.ravel(m2_dense[0]).astype(np.int64)),
                    'm2_char_embedding': _int64_feature(np.ravel(m2_dense[1]).astype(np.int64)),
                    'm3_embedding': _int64_feature(np.ravel(m3_dense[0]).astype(np.int64)),
                    'm3_char_embedding': _int64_feature(np.ravel(m3_dense[1]).astype(np.int64)),
                    'mc_len': _int64_feature([m1_len, m2_len, m3_len]),
                    'qa_id': _int64_feature([qa_id]),
                    "q_type": _bytes_feature(tf.compat.as_bytes(question_type)),
                    'image_id': _int64_feature([image_id]),
                }

            example = tf.train.Example(features=tf.train.Features(feature=feature))
            tf_record_writer.write(example.SerializeToString())

    val_record_writer.close()
    train_record_writer.close()
    test_record_writer.close()
    DATA_STORE.update_hash(test_record_root)
    DATA_STORE.update_hash(train_record_root)
    DATA_STORE.update_hash(val_record_root)
def __init__(self, data_type="pointing", num_parallel_reads: int = 1, force_rebuild: bool = False,
             ignore_hashes=False, image_shape: Sequence[int] = [448, 448], read_codes=False,
             code_shape: Sequence[int] = [7, 7, 2048], merge_qa=False) -> None:
    log_message("Building Dataset " + data_type)
    self.image_resize_shape = image_shape
    self.read_codes = read_codes
    self.code_shape = code_shape
    self.merge_qa = merge_qa

    # Get all of the necessary data
    self.images_key = maybe_download_and_store_zip(
        'http://vision.stanford.edu/yukezhu/visual7w_images.zip',
        'visual7w/data/images', use_subkeys=False)
    # Resolve the image root after the download so the key exists on a fresh install
    self.image_root_path = DATA_STORE["visual7w/data/images"]

    self.dataset_key = maybe_download_and_store_zip(
        "http://web.stanford.edu/~yukez/papers/resources/dataset_v7w_{0}.zip".format(data_type),
        'visual7w/{0}/data/json'.format(data_type), use_subkeys=True)

    # Get the grounding data
    self.grounding_key = maybe_download_and_store_zip(
        "http://web.stanford.edu/~yukez/papers/resources/dataset_v7w_grounding_annotations.zip",
        "visual/data/grounding", use_subkeys=True)

    # Compute the size of the datasets
    self.num_train_examples = 0
    self.num_val_examples = 0
    self.num_test_examples = 0
    self.max_word_length = 44
    self.max_char_length = 26
    self.data_type = data_type

    root_key = "visual7w/{0}".format(data_type)
    dict_key = os.path.join(root_key, "dictionary")

    # Load the vocab files
    if not ignore_hashes and (force_rebuild or not DATA_STORE.is_valid(dict_key)):
        self.dictionary = NLPDictionary()
        need_rebuild_train = True
        need_rebuild_val = True
    else:
        self.dictionary = NLPDictionary().load(DATA_STORE[dict_key])

    self.train_fpath = os.path.join(root_key, 'tfrecord/train')
    self.val_fpath = os.path.join(root_key, 'tfrecord/val')
    self.test_fpath = os.path.join(root_key, 'tfrecord/test')

    if force_rebuild:
        # Now that we have the data, load and parse the JSON file
        file_ = DATA_STORE[self.dataset_key[0]]
        with open(file_, 'r') as ptr:
            self._json = json.load(ptr)
        self._build_images()
        self._build_dataset()
    else:
        # Compute the size of the datasets
        self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(
            DATA_STORE[os.path.join(self.train_fpath, "images")]))
        self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(
            DATA_STORE[os.path.join(self.val_fpath, "images")]))

    # Setup some default options for the dataset
    self._val_db = None
    self._train_db = None
    self._test_db = None
    self.num_parallel_reads = num_parallel_reads

    # Save the vocab
    if force_rebuild:
        self.dictionary.save(DATA_STORE.create_key(dict_key, 'dict.pkl', force=True))
        DATA_STORE.update_hash(dict_key)

    self.word_vocab_size = len(self.dictionary.word_dictionary)
    self.char_vocab_size = len(self.dictionary.char_dictionary)
def __init__(self, version: str = 'wikipedia', dimension: int = 300) -> None:
    self.version = version
    self.dimension = dimension
    self.embedding_matrix: Optional[np.ndarray] = None

    # Per-version download/extraction parameters. The four branches of the
    # original code differed only in these values, so they are table-driven here;
    # this also fixes the Twitter branch's error message, which mistakenly
    # referred to the Common-Crawl Large dataset.
    configs = {
        'wikipedia': (GloveEmbedding.wikipedia_dimensions, 'Wikipedia',
                      'glove.6B.zip', 'glove.6B/glove.6B.{}d.txt'),
        'common-small': (GloveEmbedding.common_small_dimensions, 'Common-Crawl Small',
                         'glove.42B.300d.zip', 'glove.42B.300d/glove.42B.{}d.txt'),
        'common-large': (GloveEmbedding.common_large_dimensions, 'Common-Crawl Large',
                         'glove.840B.300d.zip', 'glove.840B.300d/glove.840B.{}d.txt'),
        'twitter': (GloveEmbedding.twitter_dimensions, 'Twitter',
                    'glove.twitter.27B.zip', 'glove.twitter.27B/glove.twitter.27B.{}d.txt'),
    }
    if self.version not in configs:
        raise ValueError('Error: Invalid GloVe Version: {}, Must be one of {}'.format(
            version, GloveEmbedding.valid_versions))

    valid_dimensions, dataset_name, zip_name, txt_template = configs[self.version]

    # Make sure that the dimension is valid
    if self.dimension not in valid_dimensions:
        raise ValueError('Error: Invalid GloVe dimension ({}) for {} dataset. Must be one of {}'.format(
            self.dimension, dataset_name, valid_dimensions))

    key = 'glove/{}/dim{}'.format(self.version, self.dimension)
    if not DATA_STORE.is_valid(key):
        # Download the file into the working directory
        maybe_download(file_name=zip_name,
                       source_url='http://nlp.stanford.edu/data/' + zip_name,
                       work_directory=DATA_STORE.working_directory,
                       postprocess=unzip)

        # Read the data keys from the file
        log_message('Loading vectors...')
        self.encoder: Dict[str, np.ndarray] = {}
        with open(os.path.join(DATA_STORE.working_directory, txt_template.format(self.dimension)), 'r') as glove_file:
            for line in glove_file:
                tokens = line.split()
                self.encoder[tokens[0]] = np.array([float(x) for x in tokens[1:]])

        # Save the encoder
        with open(DATA_STORE.create_key(key, 'encoder.pkl', force=True), 'wb') as pkl_file:
            pickle.dump(self.encoder, pkl_file)
        DATA_STORE.update_hash(key)
    else:
        with open(DATA_STORE[key], 'rb') as pkl_file:
            self.encoder = pickle.load(pkl_file)
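A minimal usage sketch; 'king' is an illustrative token, and the lookup relies on the encoder dict built above (token mapped to an np.ndarray of length `dimension`).

def _demo_glove_lookup():
    glove = GloveEmbedding(version='wikipedia', dimension=300)
    vector = glove.encoder.get('king')  # None if the token is out of vocabulary
    return vector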
def __init__(self, num_parallel_reads: int = 1, force_rebuild: bool = False, nohashcheck=False) -> None:
    # Amount of space check
    if not Dataset.has_space(COCOCaptions.AMT_REQUIRED):
        return

    # Download the annotations and images if necessary
    if not DATA_STORE.is_valid('coco2014/data/annotations', nohashcheck=nohashcheck) or force_rebuild:
        maybe_download_and_store_zip('http://images.cocodataset.org/annotations/annotations_trainval2014.zip', 'coco2014/data/annotations', use_subkeys=False)
    if not DATA_STORE.is_valid('coco2014/data/train/images', nohashcheck=nohashcheck) or force_rebuild:
        maybe_download_and_store_zip('http://images.cocodataset.org/zips/train2014.zip', 'coco2014/data/train/images', use_subkeys=False)
    if not DATA_STORE.is_valid('coco2014/data/val/images', nohashcheck=nohashcheck) or force_rebuild:
        maybe_download_and_store_zip('http://images.cocodataset.org/zips/val2014.zip', 'coco2014/data/val/images', use_subkeys=False)

    # TODO ([email protected]) Need to make sure that this works - there could be download issues, but it's hard to say
    self.train_json_key = 'coco2014/data/annotations'
    self.val_json_key = 'coco2014/data/annotations'
    log_message("Finished Downloading")

    # Now that we have the data, load and parse the JSON files
    need_rebuild_train = force_rebuild
    if not DATA_STORE.is_valid('coco2014/tfrecord/train', nohashcheck=nohashcheck) or need_rebuild_train:
        need_rebuild_train = True
        with open(os.path.join(DATA_STORE[self.train_json_key], 'annotations/captions_train2014.json'), 'r') as annotation_file:
            self.train_json = json.loads(annotation_file.read())

    need_rebuild_val = force_rebuild
    if not DATA_STORE.is_valid('coco2014/tfrecord/val', nohashcheck=nohashcheck) or need_rebuild_val:
        need_rebuild_val = True
        with open(os.path.join(DATA_STORE[self.val_json_key], 'annotations/captions_val2014.json'), 'r') as annotation_file:
            self.val_json = json.loads(annotation_file.read())

    # Load the vocab files
    if not DATA_STORE.is_valid('coco2014/captions/dictionary') or force_rebuild:
        self.dictionary = NLPDictionary()
        need_rebuild_train = True
        need_rebuild_val = True
    else:
        self.dictionary = NLPDictionary()
        self.dictionary.load(DATA_STORE['coco2014/captions/dictionary'])

    # Setup some default options for the dataset
    self.max_word_length = 50
    self.max_char_length = 16
    self._val_db = None
    self._train_db = None
    self.num_parallel_reads = num_parallel_reads

    # Build the tfrecord dataset from the JSON
    if need_rebuild_train:
        self._build_dataset('train')
    if need_rebuild_val:
        self._build_dataset('val')
    self.train_fpath = DATA_STORE['coco2014/tfrecord/train']
    self.val_fpath = DATA_STORE['coco2014/tfrecord/val']
    log_message("Finished building tfrecords.")

    # Compute the size of the datasets
    self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['coco2014/tfrecord/train']))
    self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['coco2014/tfrecord/val']))

    # Save the vocab
    dict_path = DATA_STORE.create_key('coco2014/captions/dictionary', 'dict.pkl', force=True)
    self.dictionary.save(dict_path)
    DATA_STORE.update_hash('coco2014/captions/dictionary')

    self.word_vocab_size = len(self.dictionary.word_dictionary)
    self.char_vocab_size = len(self.dictionary.char_dictionary)
def __init__(self, force_rebuild: bool = False, nohashcheck: bool = True) -> None:
    file = "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar"
    self._classes = (
        '__background__',  # always index 0
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
        'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
        'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')
    self.num_classes = 21
    self._class_to_ind = dict(zip(self._classes, range(self.num_classes)))
    self.max_num_obj = 50
    self.voc_root_key = "pascal/voc/2012"
    self.file_structure = os.path.join("VOCtrainval_11-May-2", "VOCdevkit", "VOC2012")
    work_file_path = os.path.join(DATA_STORE.working_directory, self.file_structure)

    _annotation_path = os.path.join(work_file_path, "Annotations")
    _problems = os.path.join(work_file_path, "ImageSets")
    _images = os.path.join(work_file_path, "JPEGImages")
    _segmentation_class = os.path.join(work_file_path, "SegmentationClass")
    _segmentation_object = os.path.join(work_file_path, "SegmentationObject")

    self.annotation_key = os.path.join(self.voc_root_key, "annotations")
    self.images_key = os.path.join(self.voc_root_key, "images")
    self.segmentation_key = os.path.join(self.voc_root_key, "segmentation", "class")
    self.segmentation_obj_key = os.path.join(self.voc_root_key, "segmentation", "obj")

    if force_rebuild:
        log_message("Copying data to destination folder in flux")
        maybe_download_and_store_tar(url=file, root_key='pascal/voc/2012', use_subkeys=False)
        DATA_STORE.add_folder(self.images_key, _images)
        DATA_STORE.add_folder(self.segmentation_key, _segmentation_class)
        DATA_STORE.add_folder(self.segmentation_obj_key, _segmentation_object)  # was self.segmentation_obj, an undefined attribute
        DATA_STORE.add_folder(self.annotation_key, _annotation_path)

    self.problems_key = retrieve_subkeys(self.voc_root_key)
    if len(self.problems_key) < 1:
        log_message("Building Problem Keys")
        self.problems_key = register_to_datastore(_problems, self.voc_root_key, "")
        self.problems_key = [os.path.join(self.voc_root_key, key) for key in self.problems_key]

    self.image_path = DATA_STORE[self.images_key]
    self.annotation_path = DATA_STORE[self.annotation_key]
    self.seg_class_path = DATA_STORE[self.segmentation_key]
    self.seg_obj_path = DATA_STORE[self.segmentation_obj_key]
def _build_dataset(self, dataset):
    _problem_key = [p for p in self.problems_key if p.endswith(dataset)]
    if len(_problem_key) < 1:
        log_warning("Problem key doesn't exist for {}. ".format(dataset) + str(_problem_key))
        raise EnvironmentError()
    problem_key = _problem_key[0]
    tf_record_key = os.path.join(self.voc_root_key, self.problem_name.lower(), "tfrecord", dataset)
    log_message("Retrieving the index from " + problem_key)
    assert os.path.exists(DATA_STORE[problem_key])

    with open(DATA_STORE[problem_key], 'r') as f:
        images_index = [x.strip() for x in f.readlines()]

    tf_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(tf_record_key, 'data.tfrecords', force=True))

    errors = 0
    log_message("Building {} dataset...".format(dataset))
    total_num_examples = 0
    for idx, index in tqdm.tqdm(enumerate(images_index)):
        img_path = image_path_from_index(index, self.image_path, '.jpg')
        feature_dict = self._load_pascal_annotation(index)

        # Check each image for load errors before encoding it
        image = load_image(img_path)
        if image is None:
            errors += 1
            log_warning('Error loading image: {}. {} Errors so far.'.format(img_path, errors))
            continue
        image = encode_jpeg(image)

        seg_cls_path = image_path_from_index(index, self.seg_class_path, '.png')
        seg_class = load_image(seg_cls_path)
        if seg_class is None:
            errors += 1
            log_warning('Error loading image: {}. {} Errors so far.'.format(seg_cls_path, errors))
            continue
        seg_class = encode_png(seg_class)

        seg_obj_path = image_path_from_index(index, self.seg_obj_path, '.png')
        seg_obj = load_image(seg_obj_path)
        if seg_obj is None:
            errors += 1
            log_warning('Error loading image: {}. {} Errors so far.'.format(seg_obj_path, errors))
            continue
        seg_obj = encode_png(seg_obj)

        feature_dict["image"] = _bytes_feature(tf.compat.as_bytes(image))
        feature_dict["seg_class"] = _bytes_feature(tf.compat.as_bytes(seg_class))
        feature_dict["seg_obj"] = _bytes_feature(tf.compat.as_bytes(seg_obj))

        example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
        tf_record_writer.write(example.SerializeToString())
        total_num_examples += 1

    tf_record_writer.close()
    DATA_STORE.update_hash(tf_record_key)
    return total_num_examples
def _build_dataset(self, dataset: str, shuffle: bool) -> None:
    if dataset not in ['train', 'val']:
        raise ValueError("Must be building either training or validation dataset")

    record_root = os.path.join(self.root_key, "tfrecord")

    # Open the TFRecordWriter
    if dataset == 'train':
        record_root = os.path.join(record_root, "train")
        data_size = self._num_examples * TRAIN_PARTITION
    else:
        record_root = os.path.join(record_root, "val")
        data_size = self._num_examples * VAL_PARTITION

    # Construct the record writer
    tf_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(record_root, 'shuffle.tfrecords' if shuffle else "data.tfrecords", force=True))

    # Loop over the data and parse
    errors = 0
    log_message('Building {} dataset...'.format(dataset))
    img_path = DATA_STORE[self.keys[0]]
    for i in tqdm.tqdm(range(int(data_size))):
        img_meta = self._img_meta[i].strip("\n").split(" ")
        file_name = os.path.join(img_path, img_meta[0])
        values = img_meta[1:]
        label = []
        for attr_name in self.selected_attrs:
            idx = self.attr2idx[attr_name]
            label.append(1.0 if values[idx] == '1' else 0.0)
        assert len(label) == self.num_attr  # every label is a 40-element binary attribute vector
        label = np.array(label, dtype=np.float32)

        # Load the image
        image = load_image(file_name)
        if image is None:
            errors += 1
            log_warning('Error loading image: {}. {} Errors so far.'.format(file_name, errors))
            continue

        # Add the image data
        feature = {
            "label": _float_feature(label),
            'image_shape': _int64_feature(image.shape),
            'image': _bytes_feature(tf.compat.as_bytes(image.tostring())),
        }

        # Write the TF-Record
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        tf_record_writer.write(example.SerializeToString())

    tf_record_writer.close()
    DATA_STORE.update_hash(record_root)
def __init__(self, version: str = None, num_parallel_reads: Optional[int] = None, force_rebuild=False, nohashcheck=False) -> None:
    log_message("Building NMT...")
    if not Dataset.has_space(NMT.REQ_SIZE):
        return
    # Was `version == None`, which let unknown versions fall through to a NameError below
    if version not in ('en-vi', 'en-de'):
        log_message("Please select from the following translations: en-vi, en-de")
        return

    self.num_parallel_reads = num_parallel_reads
    self.num_val_examples = None
    self.num_train_examples = None
    self.num_test_examples = None
    self.mwl = 40
    self.qwl = 40
    site_prefix = "https://nlp.stanford.edu/projects/nmt/data/"
    root_key = "nmt"

    if version == 'en-vi':
        self.root_key = os.path.join(root_key, "en-vi")
        train_eng_file = os.path.join(site_prefix, "iwslt15.en-vi/train.en")
        train_for_file = os.path.join(site_prefix, "iwslt15.en-vi/train.vi")
        val_eng_file = os.path.join(site_prefix, "iwslt15.en-vi/tst2012.en")
        val_for_file = os.path.join(site_prefix, "iwslt15.en-vi/tst2012.vi")
        test_eng_file = os.path.join(site_prefix, "iwslt15.en-vi/tst2013.en")
        test_for_file = os.path.join(site_prefix, "iwslt15.en-vi/tst2013.vi")
        vocab_eng_file = os.path.join(site_prefix, "iwslt15.en-vi/vocab.en")
        vocab_for_file = os.path.join(site_prefix, "iwslt15.en-vi/vocab.vi")
        # size = {"train_eng_file": 13603614, "train_for_file": 18074646,
        #         "val_eng_file": 140250, "val_for_file": 188396,
        #         "test_eng_file": 132264, "test_for_file": 183855,
        #         "vocab_eng_file": 139741, "vocab_for_file": 46767}
    elif version == "en-de":
        self.root_key = os.path.join(root_key, "en-de")
        train_eng_file = os.path.join(site_prefix, "wmt14.en-de/train.en")
        train_for_file = os.path.join(site_prefix, "wmt14.en-de/train.de")
        val_eng_file = os.path.join(site_prefix, "wmt14.en-de/newstest2012.en")
        val_for_file = os.path.join(site_prefix, "wmt14.en-de/newstest2012.de")
        test_eng_file = os.path.join(site_prefix, "wmt14.en-de/newstest2013.en")
        test_for_file = os.path.join(site_prefix, "wmt14.en-de/newstest2013.de")
        vocab_eng_file = os.path.join(site_prefix, "wmt14.en-de/vocab.50K.en")
        vocab_for_file = os.path.join(site_prefix, "wmt14.en-de/vocab.50K.de")
        # size = {"train_eng_file": 644874240, "train_for_file": 717225984,
        #         "val_eng_file": 406528, "val_for_file": 470016,
        #         "test_eng_file": 355328, "test_for_file": 405504,
        #         "vocab_eng_file": 404480, "vocab_for_file": 504832}

    # Download the files
    self.train_eng = maybe_download_and_store_single_file(train_eng_file, os.path.join(self.root_key, "train-en"))
    self.train_for = maybe_download_and_store_single_file(train_for_file, os.path.join(self.root_key, "train-for"))
    self.val_eng = maybe_download_and_store_single_file(val_eng_file, os.path.join(self.root_key, "val-en"))
    self.val_for = maybe_download_and_store_single_file(val_for_file, os.path.join(self.root_key, "val-for"))
    self.test_eng = maybe_download_and_store_single_file(test_eng_file, os.path.join(self.root_key, "test-en"))
    self.test_for = maybe_download_and_store_single_file(test_for_file, os.path.join(self.root_key, "test-for"))
    self.vocab_eng = maybe_download_and_store_single_file(vocab_eng_file, os.path.join(self.root_key, "vocab-en"))
    self.vocab_for = maybe_download_and_store_single_file(vocab_for_file, os.path.join(self.root_key, "vocab-for"))

    # Load the vocab files
    src_dictionary_key = os.path.join(self.root_key, "dictionary", "en")
    for_dictionary_key = os.path.join(self.root_key, "dictionary", "for")
    if not DATA_STORE.is_valid(src_dictionary_key) or not DATA_STORE.is_valid(for_dictionary_key) or force_rebuild:
        self.src_dictionary = NLPDictionary()
        self.dst_dictionary = NLPDictionary()
    else:
        self.src_dictionary = NLPDictionary()
        self.dst_dictionary = NLPDictionary()
        self.src_dictionary.load(DATA_STORE[src_dictionary_key])
        self.dst_dictionary.load(DATA_STORE[for_dictionary_key])

    self.num_train_examples = self._build_dataset("train", force_rebuild=force_rebuild)
    self.num_val_examples = self._build_dataset("val", force_rebuild=force_rebuild)
    self.num_test_examples = self._build_dataset("test", force_rebuild=force_rebuild)

    # Save the dictionaries
    with open(DATA_STORE.create_key(src_dictionary_key, 'dict.pkl', force=True), 'wb') as pkl_file:
        pickle.dump(self.src_dictionary, pkl_file)
    DATA_STORE.update_hash(src_dictionary_key)
    with open(DATA_STORE.create_key(for_dictionary_key, 'dict.pkl', force=True), 'wb') as pkl_file:
        pickle.dump(self.dst_dictionary, pkl_file)
    DATA_STORE.update_hash(for_dictionary_key)

    self.word_vocab_size = len(self.src_dictionary.word_dictionary)
    # TODO: Add current vocab size from vocab file
    self._train_db = None
    self._val_db = None