Example #1
def rollout_to_tf_record(sases, obs_vectorizer, FLAGS, path=None):
    date_string = time.strftime("%Y-%m-%d-%H-%M-%S")
    path = path or FLAGS['rollout_data_path']
    filename = os.path.join(path, date_string + '.tfrecords')
    count = 0
    with TFRecordWriter(filename,
                        options=TFRecordOptions(
                            TFRecordCompressionType.GZIP)) as writer:
        for ridx, rollout in enumerate(sases['s']):
            for tidx in range(len(sases['s'][ridx])):
                sas = {
                    's': sases['s'][ridx][tidx],
                    'a': sases['a'][ridx][tidx],
                    's_next': sases['s_next'][ridx][tidx],
                }
                example = _generate_sas_example(copy.deepcopy(sas),
                                                obs_vectorizer)
                writer.write(example.SerializeToString())
                count += 1

    # Shape metadata is taken from the last SAS pair written above.
    sas_shape = {key: sas[key].shape for key in sas}

    dump = {'sas_shape': sas_shape, 'FLAGS': FLAGS}
    with open(os.path.join(FLAGS['rollout_data_path'], 'metadata.yaml'),
              'w') as f:
        yaml.dump(dump, f)

    if FLAGS['debug']:
        print('Wrote {} SAS pairs to file'.format(count))
    return filename
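
The rollout file above is GZIP-compressed, so it has to be read back with the same compression. A minimal read-back sketch, assuming TensorFlow 2.x eager execution and using the filename returned by the function:

import tensorflow as tf

dataset = tf.data.TFRecordDataset(filename, compression_type='GZIP')
for raw_record in dataset.take(1):
    print(len(raw_record.numpy()), 'bytes in the first record')
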
Example #2
def write_records_parsed_v2(
        sentences: Iterable[List[Word]],
        output_file: str,
        vocabmaps: Dict[str, Dict[str, int]],
        int_fields=INT_FIELDS,
        text_fields=TEXT_FIELDS,
        total=None):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with TFRecordWriter(output_file) as writer:
        for sentence in tqdm(sentences, desc="Writing Records", total=total):
            int_field_data = {
                field: feature_int64_list([int(getattr(word, field)) for word in sentence])
                for field in int_fields}
            text_field_data = {
                field: feature_int64_list(encode_words(
                    words=[getattr(word, field) for word in sentence],
                    wordmap=vocabmaps[field]))
                for field in text_fields}

            sentence_length = feature_int64([len(sentence)])
            sequence_features = dict()
            sequence_features.update(int_field_data)
            sequence_features.update(text_field_data)
            context_features = {
                SENTENCE_LENGTH: sentence_length
            }
            example = tf.train.SequenceExample(
                context=Features(feature=context_features),
                feature_lists=tf.train.FeatureLists(feature_list=sequence_features),
            )
            writer.write(example.SerializeToString())
Example #3
def write_records_parsed(sentences: Iterable[List[Word]], output_file, wordmap, tagmap, total=None):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with TFRecordWriter(output_file) as writer:
        for sentence in tqdm(sentences, desc="Writing Records", total=total):
            indices = [word.index for word in sentence]
            text = [wordmap.get(word.text, wordmap[UNK]) for word in sentence]
            tags = [tagmap.get(word.tag, tagmap[UNK]) for word in sentence]
            heads = [word.head for word in sentence]

            indices_feat = feature_int64_list(indices)
            text_feat = feature_int64_list(text)
            tags_feat = feature_int64_list(tags)
            heads_feat = feature_int64_list(heads)
            data_size = feature_int64([len(sentence)])
            sequence_features = {
                'indices': indices_feat,
                'text': text_feat,
                'tags': tags_feat,
                'heads': heads_feat
            }
            context_features = {
                'data_size': data_size
            }
            example = tf.train.SequenceExample(
                context=Features(feature=context_features),
                feature_lists=tf.train.FeatureLists(feature_list=sequence_features),
            )
            writer.write(example.SerializeToString())
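
The SequenceExamples written above can be decoded with tf.io.parse_single_sequence_example; the keys must match the ones used in write_records_parsed, and the int64 types assume feature_int64_list produces Int64List features, as its name suggests. The file name below is a placeholder:

import tensorflow as tf

def parse_sentence(serialized):
    return tf.io.parse_single_sequence_example(
        serialized,
        context_features={'data_size': tf.io.FixedLenFeature([], tf.int64)},
        sequence_features={
            'indices': tf.io.FixedLenSequenceFeature([], tf.int64),
            'text': tf.io.FixedLenSequenceFeature([], tf.int64),
            'tags': tf.io.FixedLenSequenceFeature([], tf.int64),
            'heads': tf.io.FixedLenSequenceFeature([], tf.int64),
        })

dataset = tf.data.TFRecordDataset('parsed.tfrecord').map(parse_sentence)
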
Example #4
def write_test_data(example_proto, schema, schema_filename="schema.pb"):
    tmp_dir = tf.test.get_temp_dir()
    schema_path = pjoin(tmp_dir, schema_filename)
    with open(schema_path, "wb") as f:
        f.write(schema.SerializeToString())
    data_file = pjoin(tmp_dir, "test.tfrecord")
    with TFRecordWriter(data_file) as f:
        for i in example_proto:
            f.write(i.SerializeToString())
    return data_file, schema_path
Example #5
def write_featran_test_data(cls):
    tmp_dir = tf.test.get_temp_dir()
    feature_desc_file = pjoin(tmp_dir, cls.tf_record_spec_filename)
    with open(feature_desc_file, "w") as f:
        f.write(cls.tf_record_spec)
    e = DataUtil.get_example_proto()
    data_file = pjoin(tmp_dir, "test.tfrecord")
    with TFRecordWriter(data_file) as f:
        for i in e:
            f.write(i.SerializeToString())
    return tmp_dir, data_file, feature_desc_file
Example #6
def convert_csv_to_tfrecord(csv, file_name):
    writer = TFRecordWriter(file_name)
    for index, row in enumerate(csv):
        features, label = row[:-1], row[-1]
        example = create_tf_example(features, label)
        writer.write(example.SerializeToString())
    writer.close()
Example #7
def convert_csv_to_tf_record(csv, file_name):
    writer = TFRecordWriter(file_name)
    for index, row in enumerate(csv):
        # Skip malformed rows instead of aborting the whole conversion.
        if row is None:
            continue

        if row[0] is None or row[1] is None:
            continue

        if row[0].strip() == '':
            continue

        feats = (index, row[0])
        lab = row[1]
        example = make_tf_ex(feats, lab)
        writer.write(example.SerializeToString())

    writer.close()
Example #8
def convert_csv_to_tf_record(csv, file_name):
    writer = TFRecordWriter(file_name)
    for index, row in enumerate(csv):
        try:
            if row is None:
                raise Exception('Row Missing')

            if row[0] is None or row[1] is None:
                raise Exception('Value Missing')

            if row[0].strip() == '':
                raise Exception('Utterance is empty')

            feats = (index, row[0])
            lab = row[1]
            example = make_tf_ex(feats, lab)
            writer.write(example.SerializeToString())

        except Exception as inst:
            # Report the caught instance, not the Exception class itself.
            print(type(inst))
            print(inst.args)
            print(inst)

    writer.close()
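
make_tf_ex is not shown in these snippets. A hypothetical definition consistent with the calls above, where feats is an (index, utterance) pair and lab is a label string:

import tensorflow as tf

def make_tf_ex(feats, lab):
    # Hypothetical helper: builds an Example from one CSV row.
    index, utterance = feats
    return tf.train.Example(features=tf.train.Features(feature={
        'idx': tf.train.Feature(int64_list=tf.train.Int64List(value=[index])),
        'utterance': tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[utterance.encode('utf-8')])),
        'label': tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[lab.encode('utf-8')])),
    }))
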
Example #9
class ShardRecordWriter(object):
    """Write records across numbered TFRecord shards, starting a new
    shard file every `chunksize` records."""

    def __init__(self, path_fmt, chunksize):
        self.path_fmt = path_fmt
        self.chunksize = chunksize
        self.writer = None
        self.chunks = 0
        self.items = 0

    def __enter__(self):
        self.open_writer()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close_writer()

    def output_file(self):
        return self.path_fmt.format(self.chunks)

    def open_writer(self):
        output_file = self.output_file()
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        self.writer = TFRecordWriter(output_file)

    def close_writer(self):
        self.writer.close()
        self.writer = None

    def write(self, record):
        assert self.writer is not None
        if self.items >= self.chunksize:
            self.close_writer()
            self.items = 0
            self.chunks += 1
            self.open_writer()
        self.writer.write(record)
        self.items += 1
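
A minimal usage sketch for the class above. The path format must contain a slot for the shard index (output_file calls path_fmt.format on the chunk counter); items and make_example are hypothetical stand-ins for whatever produces your tf.train.Example protos:

# make_example(item) is a hypothetical helper returning a tf.train.Example.
with ShardRecordWriter('shards/part-{:04d}.tfrecord', chunksize=10000) as writer:
    for item in items:
        writer.write(make_example(item).SerializeToString())
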
Example #10
def write_featran_test_data(feature_desc=["f1", "f2"],
                            values=[{
                                "f1": 1,
                                "f2": 2
                            }],
                            feature_desc_filename="_feature_desc"):
    tmp_dir = tf.test.get_temp_dir()
    feature_desc_file = pjoin(tmp_dir, feature_desc_filename)
    with open(feature_desc_file, "w") as f:
        f.writelines("\n".join(feature_desc))
    e = DataUtil.get_example_proto(values)
    data_file = pjoin(tmp_dir, "test.tfrecord")
    with TFRecordWriter(data_file) as f:
        for i in e:
            f.write(i.SerializeToString())
    return tmp_dir, data_file, feature_desc_file
Example #11
def write_records(data, output_file):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with TFRecordWriter(output_file) as writer:
        for datum in data:
            # Validate each datum before building features from it.
            assert datum.ndim == 1
            assert datum.shape[0] > 0
            data_size = feature_int64([datum.shape[0]])
            data_feat = feature_int64_list(datum)
            sequence_features = {'data_feat': data_feat}
            context_features = {'data_size': data_size}
            example = tf.train.SequenceExample(
                context=Features(feature=context_features),
                feature_lists=tf.train.FeatureLists(
                    feature_list=sequence_features),
            )

            writer.write(example.SerializeToString())
Example #12
    def _save_to_tfrecord(self, features, labels, path):
        encoding_placeholders, encoding_ops = self.get_encoding_ops()
        with tf.Session() as sess:
            with TFRecordWriter(str(path)) as writer:
                for idx, elems in \
                        enumerate(zip(*list(features.values()), *list(labels.values()))):
                    features_to_bytes = self.get_features_to_process(elems)
                    example_labels = self.get_labels(elems)  # avoid shadowing the labels argument
                    if idx % 5000 == 0:
                        print("encoding sample no: ", idx)

                    encoding_input = (skimage.img_as_uint(x)
                                      for x in features_to_bytes)

                    result = sess.run(list(encoding_ops),
                                      feed_dict=dict(
                                          zip(encoding_placeholders,
                                              encoding_input)))
                    self.save_example_op(writer, *result, *example_labels)
Example #13
def create_tfrecord(input_file_name, output_file_name, random_state=-1):

    # load_feauters keeps its original (misspelled) name; it is defined elsewhere.
    (labels, features) = load_feauters(input_file_name=input_file_name,
                                       random_state=random_state)

    writer = TFRecordWriter(output_file_name)
    for i in range(len(features.values)):
        tf_example = tf.train.Example(features=tf.train.Features(feature={
            'idx': tf.train.Feature(int64_list=tf.train.Int64List(value=[i])),
            # Sentence is the yelp review, stored as UTF-8 bytes.
            'sentence': tf.train.Feature(bytes_list=tf.train.BytesList(value=[features.values[i].encode('utf-8')])),
            # Label is the sentiment value we are trying to predict.
            'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[labels.values[i]]))
        }))
        writer.write(tf_example.SerializeToString())
    writer.close()
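
Records produced by create_tfrecord can be decoded with tf.io.parse_single_example using a spec that mirrors the keys written above; the file name is a placeholder:

import tensorflow as tf

feature_spec = {
    'idx': tf.io.FixedLenFeature([], tf.int64),
    'sentence': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64),
}

dataset = tf.data.TFRecordDataset('reviews.tfrecord').map(
    lambda serialized: tf.io.parse_single_example(serialized, feature_spec))
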
Example #14
def write_data(sim_manager, data_path):
    """Make some domrand data and save it to tfrecords."""
    image, label = sim_manager.get_data()

    rows = image.shape[0]
    cols = image.shape[1]
    depth = image.shape[2]

    # Generate 1e5 examples in total: 1e2 files of 1e3 examples each.
    print()
    print('Generating 1e5 examples (~30 GB). You can ctrl-c anytime you want')
    print()
    for i in trange(int(1e2), desc='Files created'):
        date_string = time.strftime('%Y-%m-%d-%H-%M-%S')
        filename = os.path.join(data_path, date_string + '.tfrecords')
        try:
            with TFRecordWriter(filename,
                                options=TFRecordOptions(
                                    TFRecordCompressionType.GZIP)) as writer:
                for j in trange(int(1e3), desc='Examples generated'):
                    image, label = sim_manager.get_data()
                    assert image.dtype == np.uint8
                    # tobytes() replaces the deprecated ndarray.tostring().
                    image_raw = image.tobytes()
                    label_raw = label.astype(np.float32).tobytes()

                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'label_raw': _bytes_feature(label_raw),
                            'image_raw': _bytes_feature(image_raw)
                        }))
                    writer.write(example.SerializeToString())
        except KeyboardInterrupt:
            # The with-block has already closed the writer;
            # delete the partial file and stop for real.
            os.remove(filename)
            raise
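
_bytes_feature is not defined in this snippet; the usual helper, as seen in the TensorFlow tutorials, looks like this:

def _bytes_feature(value):
    # Wrap a raw bytes value in a tf.train.Feature.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
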
Example #15
def convert_test_to_tfrecord(csv, file_name):
    start_time = time.time()
    writer = TFRecordWriter(file_name)
    for index, row in csv.iterrows():
        try:
            if row is None:
                raise Exception('Row Missing')
            if row[0] is None or row[1] is None or row[2] is None:
                raise Exception('Value Missing')
            if row[1].strip() == "":
                raise Exception('Sequence is empty')
            features, label = row, ""
            example = create_tf_example(index, features, label)
            writer.write(example.SerializeToString())
        except Exception as inst:
            print(type(inst))
            print(inst.args)
            print(inst)
    writer.close()
    print(f"{file_name}: --- {(time.time() - start_time)} seconds ---")
Example #16
# -*- coding: utf-8 -*-
from tensorflow.python.lib.io.tf_record import TFRecordWriter
from tensorflow.python.lib.io.tf_record import tf_record_iterator
from tensorflow.python.lib.io.tf_record import TFRecordOptions, TFRecordCompressionType


def ordinal(n):
    """Return '1st', '2nd', '3rd', '4th', ... (sufficient for n = 1..10)."""
    suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n, 'th')
    return '{}{}'.format(n, suffix)


# Pass TFRecordOptions(TFRecordCompressionType.GZIP) as a second
# argument to write a compressed file instead.
tf_writer = TFRecordWriter('rawtemp.tfrecords')
for i in range(1, 11):
    tf_writer.write('the {} test string'.format(ordinal(i)).encode())
    tf_writer.flush()
tf_writer.close()

# Read the raw byte strings back to verify the file.
for record in tf_record_iterator('rawtemp.tfrecords'):
    print(record)
Example #17
def generate_single_tf_record(img_info: Dict, file_name: str,
                              writer: TFRecordWriter) -> None:
    tf_example: Example = build_example(file_name, img_info)
    writer.write(tf_example.SerializeToString())
Example #18
def serialize(iterator, output_file):
    with TFRecordWriter(output_file) as writer:
        for img, text in iterator:
            writer.write(to_example(image=img, text=text).SerializeToString())
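
to_example is assumed by this snippet. A hypothetical definition matching the call above, treating image as already-encoded bytes and text as a unicode string:

import tensorflow as tf

def to_example(image: bytes, text: str) -> tf.train.Example:
    # Hypothetical helper: pack one (image, text) pair into an Example.
    return tf.train.Example(features=tf.train.Features(feature={
        'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image])),
        'text': tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[text.encode('utf-8')])),
    }))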