def test_decode_example_with_var_len_tensor_to_dense(self):
    np_array = np.array([[1, 2, 3], [4, 5, 6]])
    example = example_pb2.Example(features=feature_pb2.Features(feature={
        'labels': self._encode_int64_feature(np_array),
    }))
    serialized_example = example.SerializeToString()
    with self.test_session():
        serialized_example = array_ops.reshape(serialized_example, shape=[])
        keys_to_features = {
            'labels': parsing_ops.VarLenFeature(dtype=dtypes.int64),
        }
        items_to_handlers = {
            'labels': tfexample_decoder.Tensor('labels', shape=np_array.shape),
        }
        decoder = TFExampleDecoder(keys_to_features, items_to_handlers)
        [tf_labels] = decoder.decode(serialized_example, ['labels'])
        labels = tf_labels.eval()
        self.assertAllEqual(labels, np_array)
def _create_example_string(example_dict):
    """Create a serialized tf.Example from a feature dictionary."""
    example = example_pb2.Example()
    for feature_name, feature_list in example_dict.items():
        if not isinstance(feature_list, list):
            raise ValueError('feature value must be a list, but %s: "%s" is %s' %
                             (feature_name, feature_list, type(feature_list)))
        if isinstance(feature_list[0], float):
            example.features.feature[feature_name].float_list.value.extend(
                feature_list)
        elif isinstance(feature_list[0], str):
            # Encode strings to bytes so the bytes_list also accepts them
            # under Python 3.
            example.features.feature[feature_name].bytes_list.value.extend(
                [f.encode('utf-8') for f in feature_list])
        elif isinstance(feature_list[0], six.integer_types):
            example.features.feature[feature_name].int64_list.value.extend(
                feature_list)
        else:
            raise ValueError(
                'Type %s for value %s is not supported for tf.train.Feature.' %
                (type(feature_list[0]), feature_list[0]))
    return example.SerializeToString()
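# A hedged usage sketch for _create_example_string; the feature names and
# values below are made up for illustration and are not from the original
# source.
def _example_string_usage_sketch():
    serialized = _create_example_string({
        'x': [1.0, 2.0],      # floats -> float_list
        'label': [3],         # ints -> int64_list
        'name': ['example'],  # strings -> bytes_list (utf-8 encoded)
    })
    # Round-trip back to a proto to inspect the result.
    return example_pb2.Example.FromString(serialized)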
def converter(graph: Dict, summary: List[str]) -> bytes:
    input_sequence = [
        graph['node_labels'][i].lower() for i in graph['backbone_sequence']
    ]
    assert len(input_sequence) > 0
    assert len(summary) > 0
    summary = [w.lower() for w in summary]
    if summary[-1] not in GET_TO_THE_POINT_END_TOKENS:
        summary.append('.')
    vocab_counter.update(input_sequence)
    vocab_counter.update(summary)
    tf_example = example_pb2.Example()
    tf_example.features.feature['article'].bytes_list.value.extend(
        [' '.join(input_sequence).encode()])
    tf_example.features.feature['abstract'].bytes_list.value.extend(
        [('<s>' + ' '.join(summary) + '</s>').encode()])
    return tf_example.SerializeToString()
def test_reference_label_variant(self):
    variant = test_utils.make_variant(start=10, alleles=['A', '.'])
    tvariant = test_utils.make_variant(start=10, alleles=['A', '.'], gt=[0, 0])
    example = tf_utils.make_example(variant, ['.'], 'foo', self.default_shape,
                                    self.default_format)
    labeler = mock.Mock()
    labeler.match = mock.Mock(return_value=[True, tvariant])
    self.processor.labeler = labeler
    labeled = example_pb2.Example()
    labeled.CopyFrom(example)
    self.processor.label_variant(labeled, variant)
    labeler.match.assert_called_once_with(variant)
    labeler.match_to_alt_count.assert_not_called()
    for key, value in example.features.feature.items():
        self.assertEqual(value, labeled.features.feature[key])
    self.assertEqual(0, tf_utils.example_label(labeled))
    self.assertEqual(tvariant, tf_utils.example_truth_variant(labeled))
def write_to_bin(out_file):
    story_fnames = [name for name in os.listdir(tokenized_stories_dir)]
    num_stories = len(story_fnames)
    with open(out_file, 'wb') as writer:
        for idx, s in enumerate(story_fnames):
            if idx % 1000 == 0:
                print("Writing story %i of %i; %.2f percent done" %
                      (idx, num_stories, float(idx) * 100.0 / float(num_stories)))
            # Find the .story file in the tokenized story dir; skip missing files.
            story_file = os.path.join(tokenized_stories_dir, s)
            if not os.path.isfile(story_file):
                print('Error: no data for %s.' % s)
                continue
            # Get the strings to write to the .bin file.
            article, abstract = get_art_abs(story_file)
            # Write to tf.Example.
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend(
                [article.encode()])
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [abstract.encode()])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
    print("Finished writing file %s\n" % out_file)
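# A minimal reader sketch for the <length><blob> format written above: an
# 8-byte native-endian length from struct.pack('q', ...) followed by a
# serialized tf.Example of that many bytes. The function name is an
# assumption for illustration, not part of the original source.
def read_bin_examples(path):
    """Yields deserialized tf.Example protos from a length-prefixed .bin file."""
    with open(path, 'rb') as reader:
        while True:
            len_bytes = reader.read(8)  # struct.pack('q', n) writes 8 bytes
            if not len_bytes:
                break  # clean end of file
            str_len = struct.unpack('q', len_bytes)[0]
            example_str = struct.unpack('%ds' % str_len,
                                        reader.read(str_len))[0]
            yield example_pb2.Example.FromString(example_str)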
def testDecodeExampleWithItemHandlerCallback(self):
    np.random.seed(0)
    tensor_shape = (2, 3, 1)
    np_array = np.random.rand(2, 3, 1)
    example = example_pb2.Example(
        features=feature_pb2.Features(feature={
            'image/depth_map': self._EncodedFloatFeature(np_array),
        }))
    serialized_example = example.SerializeToString()
    with self.cached_session():
        serialized_example = array_ops.reshape(serialized_example, shape=[])
        keys_to_features = {
            'image/depth_map':
                parsing_ops.FixedLenFeature(
                    tensor_shape,
                    dtypes.float32,
                    default_value=array_ops.zeros(tensor_shape))
        }

        def HandleDepth(keys_to_tensors):
            depth = list(keys_to_tensors.values())[0]
            depth += 1
            return depth

        items_to_handlers = {
            'depth':
                tfexample_decoder.ItemHandlerCallback('image/depth_map',
                                                      HandleDepth)
        }
        decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                     items_to_handlers)
        [tf_depth] = decoder.decode(serialized_example, ['depth'])
        depth = tf_depth.eval()
        self.assertAllClose(np_array, depth - 1)
def testDecodeExampleWithBoundingBoxSparse(self):
    num_bboxes = 10
    np_ymin = np.random.rand(num_bboxes, 1)
    np_xmin = np.random.rand(num_bboxes, 1)
    np_ymax = np.random.rand(num_bboxes, 1)
    np_xmax = np.random.rand(num_bboxes, 1)
    np_bboxes = np.hstack([np_ymin, np_xmin, np_ymax, np_xmax])
    example = example_pb2.Example(
        features=feature_pb2.Features(feature={
            'image/object/bbox/ymin': self._EncodedFloatFeature(np_ymin),
            'image/object/bbox/xmin': self._EncodedFloatFeature(np_xmin),
            'image/object/bbox/ymax': self._EncodedFloatFeature(np_ymax),
            'image/object/bbox/xmax': self._EncodedFloatFeature(np_xmax),
        }))
    serialized_example = example.SerializeToString()
    with self.cached_session():
        serialized_example = array_ops.reshape(serialized_example, shape=[])
        keys_to_features = {
            'image/object/bbox/ymin': parsing_ops.VarLenFeature(dtypes.float32),
            'image/object/bbox/xmin': parsing_ops.VarLenFeature(dtypes.float32),
            'image/object/bbox/ymax': parsing_ops.VarLenFeature(dtypes.float32),
            'image/object/bbox/xmax': parsing_ops.VarLenFeature(dtypes.float32),
        }
        items_to_handlers = {
            'object/bbox':
                tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
                                              'image/object/bbox/'),
        }
        decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                     items_to_handlers)
        [tf_bboxes] = decoder.decode(serialized_example, ['object/bbox'])
        bboxes = tf_bboxes.eval()
        self.assertAllClose(np_bboxes, bboxes)
def testDecodeExampleWithRepeatedImages(self):
    image_shape = (2, 3, 3)
    image_format = 'png'
    image, _ = self.GenerateImage(
        image_format=image_format, image_shape=image_shape)
    tf_encoded = self._Encoder(image, image_format)
    with self.cached_session():
        tf_string = tf_encoded.eval()
    example = example_pb2.Example(
        features=feature_pb2.Features(feature={
            'image/encoded':
                feature_pb2.Feature(
                    bytes_list=feature_pb2.BytesList(
                        value=[tf_string, tf_string])),
            'image/format':
                self._StringFeature(image_format),
        }))
    serialized_example = example.SerializeToString()
    with self.cached_session():
        serialized_example = array_ops.reshape(serialized_example, shape=[])
        decoder = tfexample_decoder.TFExampleDecoder(
            keys_to_features={
                'image/encoded':
                    parsing_ops.FixedLenFeature((2,), dtypes.string),
                'image/format':
                    parsing_ops.FixedLenFeature(
                        (), dtypes.string, default_value=image_format),
            },
            items_to_handlers={'image': tfexample_decoder.Image(repeated=True)})
        [tf_image] = decoder.decode(serialized_example, ['image'])
        output_image = tf_image.eval()
        self.assertEqual(output_image.shape, (2, 2, 3, 3))
        self.assertAllEqual(np.squeeze(output_image[0, :, :, :]), image)
        self.assertAllEqual(np.squeeze(output_image[1, :, :, :]), image)
def write_to_bin(input_file, out_file, makevocab=False):
    if makevocab:
        vocab_counter = collections.Counter()
    with open(out_file, 'wb') as writer:
        # Read the input text file; each line becomes the article, and the
        # same line wrapped in SENTENCE_START/SENTENCE_END becomes the abstract.
        lines = read_text_file(input_file)
        for i, new_line in enumerate(lines):
            article = new_line
            abstract = "%s %s %s" % (SENTENCE_START, new_line, SENTENCE_END)
            # Write to tf.Example.
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend(
                [article.encode('utf-8')])
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [abstract.encode('utf-8')])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
            # Update the vocabulary counts, if requested.
            if makevocab:
                art_tokens = article.split(' ')
                abs_tokens = abstract.split(' ')
                # Remove the sentence tags from the vocab.
                abs_tokens = [t for t in abs_tokens
                              if t not in [SENTENCE_START, SENTENCE_END]]
                tokens = art_tokens + abs_tokens
                tokens = [t.strip() for t in tokens]  # strip whitespace
                tokens = [t for t in tokens if t != ""]  # remove empty tokens
                vocab_counter.update(tokens)
    print("Finished writing file %s\n" % out_file)
    # Write the vocab to file, if requested.
    if makevocab:
        print("Writing vocab file...")
        with codecs.open(os.path.join(finished_files_dir, "vocab"), 'w',
                         encoding='utf-8') as writer:
            for word, count in vocab_counter.most_common(VOCAB_SIZE):
                writer.write(word + ' ' + str(count) + '\n')
        print("Finished writing vocab file")
def write_to_bin(in_folder, out_file, makevocab=False):
    if makevocab:
        vocab_counter = collections.Counter()
    files = glob.glob(in_folder)
    with open(out_file, 'wb') as writer:
        for file in files:
            data = json.loads(open(file, 'r').readline())
            article = get_string(data['clean_article'])
            abstract = get_string(data['clean_summary'], is_article=False)
            # Write to tf.Example.
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend(
                [article.encode()])
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [abstract.encode()])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
            # Update the vocab counts, if applicable.
            if makevocab:
                art_tokens = article.split(' ')
                abs_tokens = abstract.split(' ')
                # Remove the sentence tags from the vocab.
                abs_tokens = [t for t in abs_tokens
                              if t not in [SENTENCE_START, SENTENCE_END]]
                tokens = art_tokens + abs_tokens
                tokens = [t.strip() for t in tokens]  # strip whitespace
                tokens = [t for t in tokens if t != ""]  # remove empty tokens
                vocab_counter.update(tokens)
    print("Finished writing file %s" % out_file)
    # Write the vocab to file, if applicable.
    if makevocab:
        print("Writing vocab file...")
        with open(os.path.join(data_path, "vocab"), 'w') as writer:
            for word, count in vocab_counter.most_common(VOCAB_SIZE):
                writer.write(word + ' ' + str(count) + '\n')
        print("Finished writing vocab file")
def write_docs_to_file(fname, docs):
    with open(fname, 'wb') as writer:
        for idx, doc in enumerate(docs):
            # Get the strings to write to the .bin file; the abstract is left
            # empty here.
            abstract = "".encode("utf8")
            article = "\n".join(doc).encode("utf8")
            # Write to tf.Example.
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend(
                [article])
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [abstract])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
def generate_image(self, image_format, image_shape):
    """Generates an image and an example containing the encoded image.

    Args:
        image_format: the encoding format of the image.
        image_shape: the shape of the image to generate.

    Returns:
        image: the generated image.
        example: a TF-example with a feature key 'image/encoded' set to the
            serialized image and a feature key 'image/format' set to the image
            encoding format ['jpeg', 'JPEG', 'png', 'PNG', 'raw'].
    """
    num_pixels = image_shape[0] * image_shape[1] * image_shape[2]
    image = np.linspace(
        0, num_pixels - 1, num=num_pixels).reshape(image_shape).astype(np.uint8)
    tf_encoded = self._encode(image, image_format)
    example = example_pb2.Example(features=feature_pb2.Features(feature={
        'image/encoded': self._encode_bytes_feature(tf_encoded),
        'image/format': self._string_feature(image_format)
    }))
    return image, example.SerializeToString()
def testDecodeExampleShapeKeyTensor(self):
    np_image = np.random.rand(2, 3, 1).astype('f')
    np_labels = np.array([[[1], [2], [3]], [[4], [5], [6]]])
    example = example_pb2.Example(features=feature_pb2.Features(feature={
        'image': self._EncodedFloatFeature(np_image),
        'image/shape': self._EncodedInt64Feature(np.array(np_image.shape)),
        'labels': self._EncodedInt64Feature(np_labels),
        'labels/shape': self._EncodedInt64Feature(np.array(np_labels.shape)),
    }))
    serialized_example = example.SerializeToString()
    with self.test_session():
        serialized_example = array_ops.reshape(serialized_example, shape=[])
        keys_to_features = {
            'image': parsing_ops.VarLenFeature(dtype=dtypes.float32),
            'image/shape': parsing_ops.VarLenFeature(dtype=dtypes.int64),
            'labels': parsing_ops.VarLenFeature(dtype=dtypes.int64),
            'labels/shape': parsing_ops.VarLenFeature(dtype=dtypes.int64),
        }
        items_to_handlers = {
            'image': tfexample_decoder.Tensor('image', shape_keys='image/shape'),
            'labels': tfexample_decoder.Tensor('labels',
                                               shape_keys='labels/shape'),
        }
        decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                     items_to_handlers)
        [tf_image, tf_labels] = decoder.decode(serialized_example,
                                               ['image', 'labels'])
        self.assertAllEqual(tf_image.eval(), np_image)
        self.assertAllEqual(tf_labels.eval(), np_labels)
def testMakeExample(self):
    expected = example_pb2.Example()
    expected.features.feature['single_float'].float_list.value[:] = [1.0]
    expected.features.feature['single_int'].int64_list.value[:] = [2]
    expected.features.feature['single_str'].bytes_list.value[:] = [b'apple']
    expected.features.feature['multi_float'].float_list.value[:] = [4.0, 5.0]
    expected.features.feature['multi_int'].int64_list.value[:] = [6, 7]
    expected.features.feature['multi_str'].bytes_list.value[:] = [
        b'orange', b'banana'
    ]
    self.assertEqual(
        expected,
        util.make_example(
            single_float=1.0,
            single_int=2,
            single_str='apple',
            multi_float=[4.0, 5.0],
            multi_int=[6, 7],
            multi_str=['orange', 'banana']))
def create_examples(clusters, labels):
    """Creates Examples from the clusters and label strings.

    Args:
        clusters: NumPy array of shape (num_clusters, patch_height, patch_width).
        labels: List of string labels, which are names in
            musicscore_pb2.Glyph.Type. Length `num_clusters`.

    Returns:
        A list of Example protos of length `num_clusters`.
    """
    examples = []
    for cluster, label in zip(clusters, labels):
        example = example_pb2.Example()
        features = example.features
        features.feature['patch'].float_list.value.extend(cluster.ravel())
        features.feature['height'].int64_list.value.append(cluster.shape[0])
        features.feature['width'].int64_list.value.append(cluster.shape[1])
        label_num = musicscore_pb2.Glyph.Type.Value(label)
        features.feature['label'].int64_list.value.append(label_num)
        examples.append(example)
    return examples
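# A hedged usage sketch for create_examples; the patches are dummy data and
# the label string is illustrative only -- it must name a real value in
# musicscore_pb2.Glyph.Type.
def _create_examples_usage_sketch():
    clusters = np.zeros((2, 4, 4), dtype=np.float32)  # two dummy 4x4 patches
    return create_examples(clusters,
                           labels=['NOTEHEAD_FILLED', 'NOTEHEAD_FILLED'])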
def parse_example_factory():
    """Parse example factory."""

    def _int64_feature(*values):
        return feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=values))

    def _bytes_feature(*values):
        return feature_pb2.Feature(
            bytes_list=feature_pb2.BytesList(
                value=[v.encode("utf-8") for v in values]))

    return dataset_ops.Dataset.from_tensor_slices(
        constant_op.constant([
            example_pb2.Example(
                features=feature_pb2.Features(
                    feature={
                        "dense_int": _int64_feature(i),
                        "dense_str": _bytes_feature(str(i)),
                        "sparse_int": _int64_feature(i, i * 2, i * 4, i * 8),
                        "sparse_str": _bytes_feature(*["abc"] * i)
                    })).SerializeToString() for i in range(10)
        ]))
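# A hedged consumption sketch for parse_example_factory, using the same
# internal parsing_ops module as the surrounding code; the feature spec
# mirrors the features written above.
def _parse_example_factory_usage_sketch():
    dataset = parse_example_factory()
    features = {
        "dense_int": parsing_ops.FixedLenFeature((), dtypes.int64),
        "dense_str": parsing_ops.FixedLenFeature((), dtypes.string),
        "sparse_int": parsing_ops.VarLenFeature(dtypes.int64),
        "sparse_str": parsing_ops.VarLenFeature(dtypes.string),
    }
    # Each element of `dataset` is a scalar serialized Example that can be
    # parsed with parsing_ops.parse_single_example(serialized, features).
    return dataset, features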
def testDecodeExampleWithInt64Tensor(self):
    np_array = np.random.randint(1, 10, size=(2, 3, 1))
    example = example_pb2.Example(
        features=feature_pb2.Features(feature={
            'array': self._EncodedInt64Feature(np_array),
        }))
    serialized_example = example.SerializeToString()
    with self.cached_session():
        serialized_example = array_ops.reshape(serialized_example, shape=[])
        keys_to_features = {
            'array': parsing_ops.FixedLenFeature(np_array.shape, dtypes.int64)
        }
        items_to_handlers = {
            'array': tfexample_decoder.Tensor('array'),
        }
        decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                     items_to_handlers)
        [tf_array] = decoder.decode(serialized_example, ['array'])
        self.assertAllEqual(tf_array.eval(), np_array)
def make_example(article, abstracts, doc_indices, raw_article_sents, corefs,
                 article_lcs_paths=None):
    tf_example = example_pb2.Example()
    tf_example.features.feature['article'].bytes_list.value.extend(
        [util.encode_text(article)])
    for abstract in abstracts:
        if isinstance(abstract, list):
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [util.encode_text(process_abstract(abstract))])
        else:
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [util.encode_text(abstract)])
    if doc_indices is not None:
        if isinstance(doc_indices, list):
            doc_indices = ' '.join(doc_indices)
        tf_example.features.feature['doc_indices'].bytes_list.value.extend(
            [util.encode_text(doc_indices)])
    if raw_article_sents is not None:
        for sent in raw_article_sents:
            tf_example.features.feature['raw_article_sents'].bytes_list.value.extend(
                [util.encode_text(sent)])
    if corefs is not None:
        corefs_str = json.dumps(corefs)
        tf_example.features.feature['corefs'].bytes_list.value.extend(
            [util.encode_text(corefs_str)])
    if article_lcs_paths is not None:
        article_lcs_paths_str = ';'.join(
            [' '.join(str(i) for i in source_indices)
             for source_indices in article_lcs_paths])
        tf_example.features.feature['article_lcs_paths'].bytes_list.value.extend(
            [util.encode_text(article_lcs_paths_str)])
    return tf_example
def session_run():
    example = example_pb2.Example()
    example.features.feature['x'].float_list.value.append(1)
    key_prediction = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    tensor_name_prediction = (
        meta_graph.signature_def[key_prediction].outputs['prediction'].name)
    key_classification = 'classification'
    tensor_name_classes = (
        meta_graph.signature_def[key_classification].outputs['classes'].name)
    sess.run(
        tensor_name_prediction,
        feed_dict={'input_example_tensor:0': [example.SerializeToString()]})
    sess.run(
        tensor_name_classes,
        feed_dict={'input_example_tensor:0': [example.SerializeToString()]})
    sess.run(
        [tensor_name_prediction, tensor_name_classes],
        feed_dict={'input_example_tensor:0': [example.SerializeToString()]})
def _create_tf_example(feature_dict):
    """Creates a tf.Example protobuf message given a feature dict.

    The protobuf message is defined here:
    https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/input.proto#L19

    Args:
        feature_dict (dict of str -> feature): feature can be any of the
            following: int, str, unicode object, float, or list of any of the
            previous types.

    Returns:
        A tf.train.Example including the features.
    """

    def _create_feature(feature):
        feature_list = feature if isinstance(feature, list) else [feature]
        # Each feature can be exactly one kind:
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/example/feature.proto#L76
        feature_type = type(feature_list[0])
        if feature_type == int:
            return feature_pb2.Feature(
                int64_list=feature_pb2.Int64List(value=feature_list))
        elif feature_type == str:
            return feature_pb2.Feature(
                bytes_list=feature_pb2.BytesList(value=feature_list))
        elif feature_type == unicode:  # Python 2 only.
            return feature_pb2.Feature(
                bytes_list=feature_pb2.BytesList(
                    value=[str(x) for x in feature_list]))
        elif feature_type == float:
            return feature_pb2.Feature(
                float_list=feature_pb2.FloatList(value=feature_list))
        else:
            message = """Unsupported request data format: {}, {}.
            Valid formats: float, int, str, any object that implements __iter__
            or classification_pb2.ClassificationRequest"""
            raise ValueError(message.format(feature, type(feature)))

    features = {k: _create_feature(v) for k, v in feature_dict.items()}
    return example_pb2.Example(features=feature_pb2.Features(feature=features))
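# A hedged usage sketch for _create_tf_example (Python 2, given the `unicode`
# branch above); the keys and values are made up for illustration.
def _create_tf_example_usage_sketch():
    example = _create_tf_example({
        'age': 42,             # scalar int -> int64_list
        'scores': [0.1, 0.9],  # list of floats -> float_list
        'name': 'alice',       # str -> bytes_list
    })
    return example.SerializeToString()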
def ExampleGen(data_path, num_epochs=None):
    """Generates tf.Examples from JSON data files.

    Each input file contains a JSON object with "article" and "summary" keys;
    these are packed into the 'article' and 'abstract' features of a
    tf.Example.

    Args:
        data_path: path to the data files (unused; the file list comes from
            FileUtils).
        num_epochs: Number of times to go through the data. None means
            infinite.

    Yields:
        tf.Example protos.
    """
    epoch = 0
    while True:
        if num_epochs is not None and epoch >= num_epochs:
            break
        filelist = FileUtils().get_files()
        assert filelist, 'Empty filelist.'
        for f in filelist:
            with open(f) as json_file:
                tf_example = example_pb2.Example()
                json_text = json.load(json_file)
                article = str(json_text["article"])
                summary = str(json_text["summary"])
                # Encode to bytes for the proto's bytes_list (needed under
                # Python 3).
                tf_example.features.feature['article'].bytes_list.value.extend(
                    [article.encode()])
                tf_example.features.feature['abstract'].bytes_list.value.extend(
                    [summary.encode()])
                yield tf_example
        epoch += 1
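# A minimal consumption sketch for ExampleGen; it assumes FileUtils is
# configured elsewhere in the module, since data_path itself is not used to
# locate files.
def _example_gen_usage_sketch():
    pairs = []
    for tf_example in ExampleGen(data_path='', num_epochs=1):
        article = tf_example.features.feature['article'].bytes_list.value[0]
        abstract = tf_example.features.feature['abstract'].bytes_list.value[0]
        pairs.append((article, abstract))
    return pairs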
def write_to_bin(source, target, vocab_path, out_file):
    num_stories = count_num(source)
    vocab_counter = collections.Counter()
    with open(out_file, 'wb') as writer:
        gen = get_art_abs(source, target)
        for idx in range(num_stories):
            if idx % 1000 == 0:
                print("Writing story %i of %i; %.2f percent done" %
                      (idx, num_stories, float(idx) * 100.0 / float(num_stories)))
            article, abstract = next(gen)
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend(
                [bytes(article, 'utf-8')])
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [bytes(abstract, 'utf-8')])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
            art_tokens = article.split(' ')
            abs_tokens = abstract.split(' ')
            # Remove the sentence tags from the vocab.
            abs_tokens = [t for t in abs_tokens
                          if t not in [SENTENCE_START, SENTENCE_END]]
            tokens = art_tokens + abs_tokens
            tokens = [t.strip() for t in tokens]  # strip whitespace
            tokens = [t for t in tokens if t != ""]  # remove empty tokens
            tokens = [t for t in tokens if not t.isdigit()]  # remove numbers
            vocab_counter.update(tokens)
    print("Finished writing file %s\n" % out_file)
    print("Writing vocab file...")
    with open(vocab_path, 'w') as writer:
        for word, count in vocab_counter.most_common():
            writer.write(word + ' ' + str(count) + '\n')
    print("Finished writing vocab file")
def test_decode_example_multi_shape_key_tensor(self):
    np_image = np.random.rand(2, 3, 1).astype('f')
    np_labels = np.array([[[1], [2], [3]], [[4], [5], [6]]])
    height, width, depth = np_labels.shape
    example = example_pb2.Example(features=feature_pb2.Features(feature={
        'image': self._encode_float_feature(np_image),
        'image/shape': self._encode_int64_feature(np.array(np_image.shape)),
        'labels': self._encode_int64_feature(np_labels),
        'labels/height': self._encode_int64_feature(np.array([height])),
        'labels/width': self._encode_int64_feature(np.array([width])),
        'labels/depth': self._encode_int64_feature(np.array([depth])),
    }))
    serialized_example = example.SerializeToString()
    with self.test_session():
        serialized_example = array_ops.reshape(serialized_example, shape=[])
        keys_to_features = {
            'image': parsing_ops.VarLenFeature(dtype=dtypes.float32),
            'image/shape': parsing_ops.VarLenFeature(dtype=dtypes.int64),
            'labels': parsing_ops.VarLenFeature(dtype=dtypes.int64),
            'labels/height': parsing_ops.VarLenFeature(dtype=dtypes.int64),
            'labels/width': parsing_ops.VarLenFeature(dtype=dtypes.int64),
            'labels/depth': parsing_ops.VarLenFeature(dtype=dtypes.int64),
        }
        items_to_handlers = {
            'image': tfexample_decoder.Tensor('image', shape_keys='image/shape'),
            'labels': tfexample_decoder.Tensor(
                'labels',
                shape_keys=['labels/height', 'labels/width', 'labels/depth']),
        }
        decoder = TFExampleDecoder(keys_to_features, items_to_handlers)
        [tf_image, tf_labels] = decoder.decode(serialized_example,
                                               ['image', 'labels'])
        self.assertAllEqual(tf_image.eval(), np_image)
        self.assertAllEqual(tf_labels.eval(), np_labels)
def convert_files_to_binary(input_filenames, output_filename, counter):
    with open(output_filename, 'wb') as serialized_f:
        for filename in input_filenames:
            with open(filename, 'r') as input_f:
                print("handling file: ", filename)
                pattern = re.compile(
                    r'<HEADLINE>\n([\w\W]+?)\n</HEADLINE>[\w\W]+?'
                    r'<P>\n([\w\W]+?)\n</P>[\w\W]+?<P>\n([\w\W]+?)\n</P>')
                if FLAGS.lexrank:
                    pattern = re.compile(
                        r'<HEADLINE>\n([\w\W]+?)\n</HEADLINE>[\w\W]+?'
                        r'<TEXT>\n([\w\W]+?)\n</TEXT>')
                for match in pattern.findall(input_f.read()):
                    if FLAGS.lexrank:
                        abstract, s1, s2 = lexrankSentences(match)
                    else:
                        abstract, s1, s2 = match
                    # Split and count words.
                    abstract = modify(abstract)
                    s1 = modify(s1)
                    s2 = modify(s2)
                    if len(abstract) == 0 or len(s1) == 0 or len(s2) == 0:
                        continue
                    counter.update(' '.join([abstract, s1, s2]).split())
                    # Create the serialized abstract/article pair for training.
                    article = ' '.join([s1, s2])
                    tf_example = example_pb2.Example()
                    tf_example.features.feature['article'].bytes_list.value.extend(
                        [article])
                    tf_example.features.feature['abstract'].bytes_list.value.extend(
                        [abstract])
                    tf_example_str = tf_example.SerializeToString()
                    str_len = len(tf_example_str)
                    serialized_f.write(struct.pack('q', str_len))
                    serialized_f.write(
                        struct.pack('%ds' % str_len, tf_example_str))
def test_decode_example_with_float_tensor(self):
    np_array = np.random.rand(2, 3, 1).astype('f')
    example = example_pb2.Example(features=feature_pb2.Features(feature={
        'array': self._encode_float_feature(np_array),
    }))
    serialized_example = example.SerializeToString()
    with self.test_session():
        serialized_example = array_ops.reshape(serialized_example, shape=[])
        keys_to_features = {
            'array': parsing_ops.FixedLenFeature(np_array.shape, dtypes.float32)
        }
        items_to_handlers = {
            'array': tfexample_decoder.Tensor('array'),
        }
        decoder = TFExampleDecoder(keys_to_features, items_to_handlers)
        [tf_array] = decoder.decode(serialized_example, ['array'])
        self.assertAllEqual(tf_array.eval(), np_array)
def write_bert_tf_example(simple_similar_source_indices, raw_article_sents,
                          summary_text, corefs_str, doc_indices,
                          article_lcs_paths_list, writer, dataset_name):
    tf_example = example_pb2.Example()
    source_indices_str = ';'.join(
        [' '.join(str(i) for i in source_indices)
         for source_indices in simple_similar_source_indices])
    tf_example.features.feature['similar_source_indices'].bytes_list.value.extend(
        [util.encode_text(source_indices_str)])
    for sent in raw_article_sents:
        s = sent.strip()
        tf_example.features.feature['raw_article_sents'].bytes_list.value.extend(
            [util.encode_text(s)])
    if dataset_name == 'duc_2004':
        # DUC-2004 provides multiple reference summaries per article.
        for summ_text in summary_text:
            tf_example.features.feature['summary_text'].bytes_list.value.extend(
                [util.encode_text(summ_text)])
    else:
        tf_example.features.feature['summary_text'].bytes_list.value.extend(
            [util.encode_text(summary_text)])
    if doc_indices is not None:
        tf_example.features.feature['doc_indices'].bytes_list.value.extend(
            [util.encode_text(doc_indices)])
    if corefs_str is not None:
        tf_example.features.feature['corefs'].bytes_list.value.extend(
            [corefs_str])
    if article_lcs_paths_list is not None:
        article_lcs_paths_list_str = '|'.join(
            [';'.join([' '.join(str(i) for i in source_indices)
                       for source_indices in article_lcs_paths])
             for article_lcs_paths in article_lcs_paths_list])
        tf_example.features.feature['article_lcs_paths_list'].bytes_list.value.extend(
            [util.encode_text(article_lcs_paths_list_str)])
    tf_example_str = tf_example.SerializeToString()
    str_len = len(tf_example_str)
    writer.write(struct.pack('q', str_len))
    writer.write(struct.pack('%ds' % str_len, tf_example_str))
def _convert_files_to_binary(input_filenames, output_filename):
    with open(output_filename, 'wb') as writer:
        for filename in input_filenames:
            with open(filename, 'r') as f:
                document = f.read()
            # The first line is the title; the rest is the body.
            document_parts = document.split('\n', 1)
            assert len(document_parts) == 2
            title = '<d><p><s>' + document_parts[0] + '</s></p></d>'
            # Note: str.decode() here implies Python 2 input handling.
            body = document_parts[1].decode('utf8').replace('\n', ' ').replace(
                '\t', ' ')
            sentences = sent_tokenize(body)
            body = '<d><p>' + ' '.join(
                ['<s>' + sentence + '</s>' for sentence in sentences]) + '</p></d>'
            body = body.encode('utf8')
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend([body])
            tf_example.features.feature['abstract'].bytes_list.value.extend([title])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
def write_to_file(article, abstract, rel, writer):
    # bytes() without an encoding assumes Python 2 str inputs; under
    # Python 3 the arguments would need to be encoded first.
    tf_example = example_pb2.Example()
    tf_example.features.feature['abstract'].bytes_list.value.extend(
        [bytes(abstract)])
    tf_example.features.feature['relevancy'].bytes_list.value.extend(
        [bytes(rel)])
    tf_example.features.feature['article'].bytes_list.value.extend(
        [bytes(article)])
    tf_example_str = tf_example.SerializeToString()
    str_len = len(tf_example_str)
    writer.write(struct.pack('q', str_len))
    writer.write(struct.pack('%ds' % str_len, tf_example_str))
def testDecodeExampleWithVarLenTensor(self):
    np_array = np.array([[[1], [2], [3]], [[4], [5], [6]]])
    example = example_pb2.Example(
        features=feature_pb2.Features(feature={
            'labels': self._EncodedInt64Feature(np_array),
        }))
    serialized_example = example.SerializeToString()
    with self.cached_session():
        serialized_example = array_ops.reshape(serialized_example, shape=[])
        keys_to_features = {
            'labels': parsing_ops.VarLenFeature(dtype=dtypes.int64),
        }
        items_to_handlers = {
            'labels': tfexample_decoder.Tensor('labels'),
        }
        decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                     items_to_handlers)
        [tf_labels] = decoder.decode(serialized_example, ['labels'])
        labels = tf_labels.eval()
        self.assertAllEqual(labels, np_array.flatten())
def test_decode_example_with_sparse_tensor(self):
    np_indices = np.array([[1], [2], [5]])
    np_values = np.array([0.1, 0.2, 0.6]).astype('f')
    example = example_pb2.Example(features=feature_pb2.Features(feature={
        'indices': self._encode_int64_feature(np_indices),
        'values': self._encode_float_feature(np_values),
    }))
    serialized_example = example.SerializeToString()
    with self.test_session():
        serialized_example = array_ops.reshape(serialized_example, shape=[])
        keys_to_features = {
            'indices': parsing_ops.VarLenFeature(dtype=dtypes.int64),
            'values': parsing_ops.VarLenFeature(dtype=dtypes.float32),
        }
        items_to_handlers = {'labels': tfexample_decoder.SparseTensor()}
        decoder = TFExampleDecoder(keys_to_features, items_to_handlers)
        [tf_labels] = decoder.decode(serialized_example, ['labels'])
        labels = tf_labels.eval()
        self.assertAllEqual(labels.indices, np_indices)
        self.assertAllEqual(labels.values, np_values)
        self.assertAllEqual(labels.dense_shape, np_values.shape)