Example #1
    def test_decode_example_with_var_len_tensor_to_dense(self):
        np_array = np.array([[1, 2, 3], [4, 5, 6]])
        example = example_pb2.Example(features=feature_pb2.Features(feature={
            'labels': self._encode_int64_feature(np_array),
        }))

        serialized_example = example.SerializeToString()

        with self.test_session():
            serialized_example = array_ops.reshape(serialized_example, shape=[])
            keys_to_features = {
                'labels': parsing_ops.VarLenFeature(dtype=dtypes.int64),
            }
            items_to_handlers = {
                'labels': tfexample_decoder.Tensor(
                    'labels', shape=np_array.shape),
            }
            decoder = TFExampleDecoder(keys_to_features, items_to_handlers)
            [tf_labels] = decoder.decode(serialized_example, ['labels'])
            labels = tf_labels.eval()
            self.assertAllEqual(labels, np_array)
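For reference, the same VarLenFeature-to-dense decode can be sketched with core TensorFlow parsing ops instead of the slim tfexample_decoder; the helper name decode_labels below is invented for illustration.

# A minimal sketch (not part of the test above): parse the VarLenFeature into a
# SparseTensor, densify it, and restore the original shape.
import tensorflow as tf

def decode_labels(serialized_example, shape):
    parsed = tf.io.parse_single_example(
        serialized_example,
        {'labels': tf.io.VarLenFeature(tf.int64)})
    return tf.reshape(tf.sparse.to_dense(parsed['labels']), shape)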
Example #2
def _create_example_string(example_dict):
  """Create a serialized tf.example from feature dictionary."""
  example = example_pb2.Example()
  for feature_name, feature_list in example_dict.items():
    if not isinstance(feature_list, list):
      raise ValueError('feature value must be a list, but %s: "%s" is %s' %
                       (feature_name, feature_list, type(feature_list)))
    if isinstance(feature_list[0], float):
      example.features.feature[feature_name].float_list.value.extend(
          feature_list)
    elif isinstance(feature_list[0], str):
      example.features.feature[feature_name].bytes_list.value.extend(
          feature_list)
    elif isinstance(feature_list[0], six.integer_types):
      example.features.feature[feature_name].int64_list.value.extend(
          feature_list)
    else:
      raise ValueError(
          'Type %s for value %s is not supported for tf.train.Feature.' %
          (type(feature_list[0]), feature_list[0]))
  return example.SerializeToString()
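A hypothetical call to _create_example_string might look like the following; the feature names and values are invented, and the string branch of the function relies on Python 2 str semantics.

# Hypothetical usage of _create_example_string; every value must be a list.
serialized = _create_example_string({
    'age': [42],            # stored in an int64_list
    'weights': [0.5, 1.5],  # stored in a float_list
})
example = example_pb2.Example.FromString(serialized)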
Example #3
    def converter(graph: Dict, summary: List[str]) -> str:
        input_sequence = [
            graph['node_labels'][i].lower() for i in graph['backbone_sequence']
        ]
        assert len(input_sequence) > 0
        assert len(summary) > 0

        summary = [w.lower() for w in summary]
        if summary[-1] not in GET_TO_THE_POINT_END_TOKENS:
            summary.append('.')

        vocab_counter.update(input_sequence)
        vocab_counter.update(summary)

        tf_example = example_pb2.Example()
        tf_example.features.feature['article'].bytes_list.value.extend(
            [' '.join(input_sequence).encode()])
        tf_example.features.feature['abstract'].bytes_list.value.extend([
            ('<s>' + ' '.join(summary) + '</s>').encode()
        ])
        return tf_example.SerializeToString()
Example #4
  def test_reference_label_variant(self):
    variant = test_utils.make_variant(start=10, alleles=['A', '.'])
    tvariant = test_utils.make_variant(start=10, alleles=['A', '.'], gt=[0, 0])
    example = tf_utils.make_example(variant, ['.'], 'foo', self.default_shape,
                                    self.default_format)

    labeler = mock.Mock()
    labeler.match = mock.Mock(return_value=[True, tvariant])
    self.processor.labeler = labeler

    labeled = example_pb2.Example()
    labeled.CopyFrom(example)
    self.processor.label_variant(labeled, variant)

    labeler.match.assert_called_once_with(variant)
    labeler.match_to_alt_count.assert_not_called()

    for key, value in example.features.feature.iteritems():
      self.assertEqual(value, labeled.features.feature[key])
    self.assertEqual(0, tf_utils.example_label(labeled))
    self.assertEqual(tvariant, tf_utils.example_truth_variant(labeled))
Example #5
def write_to_bin(out_file):
    # story_fnames = [name for name in os.listdir(tokenized_stories_dir) if os.path.isfile(tokenized_stories_dir+'\\'+name) ]
    story_fnames = [name for name in os.listdir(tokenized_stories_dir)]
    num_stories = len(story_fnames)
    # print(story_fnames)
    # for idx,s in enumerate(story_fnames):
    #   print(idx,s)

    with open(out_file, 'wb') as writer:
        for idx, s in enumerate(story_fnames):
            if idx % 1000 == 0:
                print("Writing story %i of %i; %.2f percent done" %
                      (idx, num_stories,
                       float(idx) * 100.0 / float(num_stories)))

            # Look in the tokenized story dir to find the .story file corresponding to this name
            if os.path.isfile(os.path.join(tokenized_stories_dir, s)):
                story_file = os.path.join(tokenized_stories_dir, s)
                print(story_file)
            else:
                print('Error: no data.')
                continue  # skip missing files instead of reusing a stale story_file

            # Get the strings to write to the .bin file
            article, abstract = get_art_abs(story_file)
            # print('article:', article)
            # print('abstract:', abstract)

            # Write to tf.Example
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend(
                [article.encode()])
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [abstract.encode()])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))

    print("Finished writing file %s\n" % out_file)
Example #6
  def testDecodeExampleWithItemHandlerCallback(self):
    np.random.seed(0)
    tensor_shape = (2, 3, 1)
    np_array = np.random.rand(2, 3, 1)

    example = example_pb2.Example(
        features=feature_pb2.Features(feature={
            'image/depth_map': self._EncodedFloatFeature(np_array),
        }))

    serialized_example = example.SerializeToString()

    with self.cached_session():
      serialized_example = array_ops.reshape(serialized_example, shape=[])

      keys_to_features = {
          'image/depth_map':
              parsing_ops.FixedLenFeature(
                  tensor_shape,
                  dtypes.float32,
                  default_value=array_ops.zeros(tensor_shape))
      }

      def HandleDepth(keys_to_tensors):
        depth = list(keys_to_tensors.values())[0]
        depth += 1
        return depth

      items_to_handlers = {
          'depth':
              tfexample_decoder.ItemHandlerCallback('image/depth_map',
                                                    HandleDepth)
      }

      decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                   items_to_handlers)
      [tf_depth] = decoder.decode(serialized_example, ['depth'])
      depth = tf_depth.eval()

    self.assertAllClose(np_array, depth - 1)
Example #7
  def testDecodeExampleWithBoundingBoxSparse(self):
    num_bboxes = 10
    np_ymin = np.random.rand(num_bboxes, 1)
    np_xmin = np.random.rand(num_bboxes, 1)
    np_ymax = np.random.rand(num_bboxes, 1)
    np_xmax = np.random.rand(num_bboxes, 1)
    np_bboxes = np.hstack([np_ymin, np_xmin, np_ymax, np_xmax])

    example = example_pb2.Example(
        features=feature_pb2.Features(
            feature={
                'image/object/bbox/ymin': self._EncodedFloatFeature(np_ymin),
                'image/object/bbox/xmin': self._EncodedFloatFeature(np_xmin),
                'image/object/bbox/ymax': self._EncodedFloatFeature(np_ymax),
                'image/object/bbox/xmax': self._EncodedFloatFeature(np_xmax),
            }))
    serialized_example = example.SerializeToString()

    with self.cached_session():
      serialized_example = array_ops.reshape(serialized_example, shape=[])

      keys_to_features = {
          'image/object/bbox/ymin': parsing_ops.VarLenFeature(dtypes.float32),
          'image/object/bbox/xmin': parsing_ops.VarLenFeature(dtypes.float32),
          'image/object/bbox/ymax': parsing_ops.VarLenFeature(dtypes.float32),
          'image/object/bbox/xmax': parsing_ops.VarLenFeature(dtypes.float32),
      }

      items_to_handlers = {
          'object/bbox':
              tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
                                            'image/object/bbox/'),
      }

      decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                   items_to_handlers)
      [tf_bboxes] = decoder.decode(serialized_example, ['object/bbox'])
      bboxes = tf_bboxes.eval()

    self.assertAllClose(np_bboxes, bboxes)
Example #8
  def testDecodeExampleWithRepeatedImages(self):
    image_shape = (2, 3, 3)
    image_format = 'png'
    image, _ = self.GenerateImage(
        image_format=image_format, image_shape=image_shape)
    tf_encoded = self._Encoder(image, image_format)
    with self.cached_session():
      tf_string = tf_encoded.eval()

    example = example_pb2.Example(
        features=feature_pb2.Features(
            feature={
                'image/encoded':
                    feature_pb2.Feature(
                        bytes_list=feature_pb2.BytesList(
                            value=[tf_string, tf_string])),
                'image/format':
                    self._StringFeature(image_format),
            }))
    serialized_example = example.SerializeToString()

    with self.cached_session():
      serialized_example = array_ops.reshape(serialized_example, shape=[])

      decoder = tfexample_decoder.TFExampleDecoder(
          keys_to_features={
              'image/encoded':
                  parsing_ops.FixedLenFeature((2,), dtypes.string),
              'image/format':
                  parsing_ops.FixedLenFeature(
                      (), dtypes.string, default_value=image_format),
          },
          items_to_handlers={'image': tfexample_decoder.Image(repeated=True)})
      [tf_image] = decoder.decode(serialized_example, ['image'])

      output_image = tf_image.eval()

      self.assertEqual(output_image.shape, (2, 2, 3, 3))
      self.assertAllEqual(np.squeeze(output_image[0, :, :, :]), image)
      self.assertAllEqual(np.squeeze(output_image[1, :, :, :]), image)
Example #9
def write_to_bin(input_file, out_file, makevocab=False):
    if makevocab:
        vocab_counter = collections.Counter()

    with open(out_file, 'wb') as writer:
        # Read the input text file; even-numbered lines become the article and odd-numbered lines the abstract (line numbers start at 0)
        lines = read_text_file(input_file)
        for i, new_line in enumerate(lines):
            article = lines[i]
            abstract = "%s %s %s" % (SENTENCE_START, lines[i], SENTENCE_END)

            # Write to tf.Example
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend([bytes(article.encode('utf-8'))])
            tf_example.features.feature['abstract'].bytes_list.value.extend([bytes(abstract.encode('utf-8'))])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))

            # Write the vocab to file, if applicable
            if makevocab:
                art_tokens = article.split(' ')
                abs_tokens = abstract.split(' ')
                abs_tokens = [t for t in abs_tokens if
                              t not in [SENTENCE_START, SENTENCE_END]]  # remove these tags from the vocab
                tokens = art_tokens + abs_tokens
                tokens = [t.strip() for t in tokens]  # strip leading/trailing whitespace
                tokens = [t for t in tokens if t != ""]  # remove empty tokens
                vocab_counter.update(tokens)

    print("Finished writing file %s\n" % out_file)

    # Write the vocab to file
    if makevocab:
        print("Writing vocab file...")
        with codecs.open(os.path.join(finished_files_dir, "vocab"), 'w', encoding='utf-8') as writer:
            for word, count in vocab_counter.most_common(VOCAB_SIZE):
                writer.write(word + ' ' + str(count) + '\n')
        print("Finished writing vocab file")
Example #10
def write_to_bin(in_folder, out_file, makevocab=False):
    if makevocab:
        vocab_counter = collections.Counter()
    files = glob.glob(in_folder)
    counter = 0
    
    with open(out_file, 'wb') as writer:
        for file in files:
            data = json.loads(open(file, 'r').readline())
            article = get_string(data['clean_article'])
            abstract = get_string(data['clean_summary'], is_article=False)
            
            # Write to tf.Example
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend([article.encode()])
            tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode()])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
            
            # Write the vocab to file, if applicable
            if makevocab:
                art_tokens = article.split(' ')
                abs_tokens = abstract.split(' ')
                abs_tokens = [t for t in abs_tokens if t not in [SENTENCE_START, SENTENCE_END]] # remove these tags from vocab
                tokens = art_tokens + abs_tokens
                tokens = [t.strip() for t in tokens] # strip
                tokens = [t for t in tokens if t!=""] # remove empty
                vocab_counter.update(tokens)

    print ("Finished writing file %s" % out_file)

    # write vocab to file
    if makevocab:
        print ("Writing vocab file...")
        with open(data_path+"vocab", 'w') as writer:
            for word, count in vocab_counter.most_common(VOCAB_SIZE):
                writer.write(word + ' ' + str(count) + '\n')
        print ("Finished writing vocab file")
Example #11
def write_docs_to_file(fname, docs):
    with open(fname, 'wb') as writer:
        for idx, doc in enumerate(docs):
            # if idx % 1000 == 0:
            #     print("Writing story %i of %i; %.2f percent done" %
            #           (idx, num_stories, float(idx) * 100.0 / float(num_stories)))

            # Look in the tokenized story dirs to find the .story file corresponding to this url

            # Get the strings to write to .bin file
            abstract = "".encode("utf8")
            article = "\n".join(doc).encode("utf8")

            # Write to tf.Example
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend(
                [article])
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [abstract])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
Example #12
    def generate_image(self, image_format, image_shape):
        """Generates an image and an example containing the encoded image.

        Args:
            image_format: the encoding format of the image.
            image_shape: the shape of the image to generate.
    
        Returns:
            image: the generated image.
            example: a serialized TF Example with a feature key 'image/encoded'
                set to the encoded image and a feature key 'image/format' set to
                the image encoding format ('jpeg', 'JPEG', 'png', 'PNG', or 'raw').
        """
        num_pixels = image_shape[0] * image_shape[1] * image_shape[2]
        image = np.linspace(0, num_pixels - 1, num=num_pixels).reshape(image_shape).astype(np.uint8)
        tf_encoded = self._encode(image, image_format)
        example = example_pb2.Example(features=feature_pb2.Features(feature={
            'image/encoded': self._encode_bytes_feature(tf_encoded),
            'image/format': self._string_feature(image_format)
        }))

        return image, example.SerializeToString()
Example #13
    def testDecodeExampleShapeKeyTensor(self):
        np_image = np.random.rand(2, 3, 1).astype('f')
        np_labels = np.array([[[1], [2], [3]], [[4], [5], [6]]])

        example = example_pb2.Example(features=feature_pb2.Features(
            feature={
                'image':
                self._EncodedFloatFeature(np_image),
                'image/shape':
                self._EncodedInt64Feature(np.array(np_image.shape)),
                'labels':
                self._EncodedInt64Feature(np_labels),
                'labels/shape':
                self._EncodedInt64Feature(np.array(np_labels.shape)),
            }))

        serialized_example = example.SerializeToString()

        with self.test_session():
            serialized_example = array_ops.reshape(serialized_example,
                                                   shape=[])
            keys_to_features = {
                'image': parsing_ops.VarLenFeature(dtype=dtypes.float32),
                'image/shape': parsing_ops.VarLenFeature(dtype=dtypes.int64),
                'labels': parsing_ops.VarLenFeature(dtype=dtypes.int64),
                'labels/shape': parsing_ops.VarLenFeature(dtype=dtypes.int64),
            }
            items_to_handlers = {
                'image':
                tfexample_decoder.Tensor('image', shape_keys='image/shape'),
                'labels':
                tfexample_decoder.Tensor('labels', shape_keys='labels/shape'),
            }
            decoder = tfexample_decoder.TFExampleDecoder(
                keys_to_features, items_to_handlers)
            [tf_image, tf_labels] = decoder.decode(serialized_example,
                                                   ['image', 'labels'])
            self.assertAllEqual(tf_image.eval(), np_image)
            self.assertAllEqual(tf_labels.eval(), np_labels)
Example #14
  def testMakeExample(self):
    expected = example_pb2.Example()
    expected.features.feature['single_float'].float_list.value[:] = [1.0]
    expected.features.feature['single_int'].int64_list.value[:] = [2]
    expected.features.feature['single_str'].bytes_list.value[:] = [b'apple']
    expected.features.feature['multi_float'].float_list.value[:] = [4.0, 5.0]
    expected.features.feature['multi_int'].int64_list.value[:] = [6, 7]
    expected.features.feature['multi_str'].bytes_list.value[:] = [
        b'orange', b'banana'
    ]
    self.assertEqual(
        expected,
        util.make_example(single_float=1.0,
                          single_int=2,
                          single_str='apple',
                          multi_float=[4.0, 5.0],
                          multi_int=[6, 7],
                          multi_str=['orange', 'banana']))
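The test above exercises a util.make_example helper. One plausible sketch of such a helper (not necessarily the implementation under test) maps scalars and lists of float/int/str onto the three tf.train.Feature list types:

from tensorflow.core.example import example_pb2

def make_example(**features):
    # A sketch only; util.make_example may differ.
    example = example_pb2.Example()
    for name, value in features.items():
        values = value if isinstance(value, (list, tuple)) else [value]
        feature = example.features.feature[name]
        if isinstance(values[0], float):
            feature.float_list.value.extend(values)
        elif isinstance(values[0], int):
            feature.int64_list.value.extend(values)
        else:
            feature.bytes_list.value.extend(
                v.encode('utf-8') if isinstance(v, str) else v for v in values)
    return example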
Example #15
def create_examples(clusters, labels):
    """Creates Examples from the clusters and label strings.

  Args:
    clusters: NumPy array of shape (num_clusters, patch_height, patch_width).
    labels: List of string labels, which are names in musicscore_pb2.Glyph.Type.
        Length `num_clusters`.

  Returns:
    A list of Example protos of length `num_clusters`.
  """
    examples = []
    for cluster, label in zip(clusters, labels):
        example = example_pb2.Example()
        features = example.features
        features.feature['patch'].float_list.value.extend(cluster.ravel())
        features.feature['height'].int64_list.value.append(cluster.shape[0])
        features.feature['width'].int64_list.value.append(cluster.shape[1])
        label_num = musicscore_pb2.Glyph.Type.Value(label)
        example.features.feature['label'].int64_list.value.append(label_num)
        examples.append(example)
    return examples
Example #16
  def parse_example_factory():
    """Parse example factory."""

    def _int64_feature(*values):
      return feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=values))

    def _bytes_feature(*values):
      return feature_pb2.Feature(
          bytes_list=feature_pb2.BytesList(
              value=[v.encode("utf-8") for v in values]))

    return dataset_ops.Dataset.from_tensor_slices(
        constant_op.constant([
            example_pb2.Example(
                features=feature_pb2.Features(
                    feature={
                        "dense_int": _int64_feature(i),
                        "dense_str": _bytes_feature(str(i)),
                        "sparse_int": _int64_feature(i, i * 2, i * 4, i * 8),
                        "sparse_str": _bytes_feature(*["abc"] * i)
                    })).SerializeToString() for i in range(10)
        ]))
Example #17
  def testDecodeExampleWithInt64Tensor(self):
    np_array = np.random.randint(1, 10, size=(2, 3, 1))

    example = example_pb2.Example(
        features=feature_pb2.Features(feature={
            'array': self._EncodedInt64Feature(np_array),
        }))

    serialized_example = example.SerializeToString()

    with self.cached_session():
      serialized_example = array_ops.reshape(serialized_example, shape=[])
      keys_to_features = {
          'array': parsing_ops.FixedLenFeature(np_array.shape, dtypes.int64)
      }
      items_to_handlers = {
          'array': tfexample_decoder.Tensor('array'),
      }
      decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                   items_to_handlers)
      [tf_array] = decoder.decode(serialized_example, ['array'])
      self.assertAllEqual(tf_array.eval(), np_array)
Example #18
def make_example(article, abstracts, doc_indices, raw_article_sents, corefs, article_lcs_paths=None):
    tf_example = example_pb2.Example()
    tf_example.features.feature['article'].bytes_list.value.extend([util.encode_text(article)])
    for abstract in abstracts:
        if type(abstract) == list:
            tf_example.features.feature['abstract'].bytes_list.value.extend([util.encode_text(process_abstract(abstract))])
        else:
            tf_example.features.feature['abstract'].bytes_list.value.extend([util.encode_text(abstract)])
    if doc_indices is not None:
        if type(doc_indices) == list:
            doc_indices = ' '.join(doc_indices)
        tf_example.features.feature['doc_indices'].bytes_list.value.extend([util.encode_text(doc_indices)])
    if raw_article_sents is not None:
        for sent in raw_article_sents:
            tf_example.features.feature['raw_article_sents'].bytes_list.value.extend([util.encode_text(sent)])
    if corefs is not None:
        corefs_str = json.dumps(corefs)
        tf_example.features.feature['corefs'].bytes_list.value.extend([util.encode_text(corefs_str)])
    if article_lcs_paths is not None:
        article_lcs_paths_str = ';'.join([' '.join(str(i) for i in source_indices) for source_indices in article_lcs_paths])
        tf_example.features.feature['article_lcs_paths'].bytes_list.value.extend([util.encode_text(article_lcs_paths_str)])
    return tf_example
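A hypothetical call to the make_example above; the argument values are placeholders invented for illustration, and util.encode_text is assumed to behave as in the function body.

# Hypothetical usage of make_example; values are placeholders.
tf_example = make_example(
    article='the tokenized article text',
    abstracts=['a short reference summary'],
    doc_indices='0 0 0 1 1',
    raw_article_sents=['first sentence .', 'second sentence .'],
    corefs=[])
serialized = tf_example.SerializeToString()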
Example #19
    def session_run():
      example = example_pb2.Example()
      example.features.feature['x'].float_list.value.append(1)

      tensor_name_prediction = None
      tensor_name_classes = None
      key_prediction = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
      tensor_name_prediction = (
          meta_graph.signature_def[key_prediction].outputs['prediction'].name)
      key_classification = 'classification'
      tensor_name_classes = (
          meta_graph.signature_def[key_classification].outputs['classes'].name)

      sess.run(
          tensor_name_prediction,
          feed_dict={'input_example_tensor:0': [example.SerializeToString()]})
      sess.run(
          tensor_name_classes,
          feed_dict={'input_example_tensor:0': [example.SerializeToString()]})
      sess.run(
          [tensor_name_prediction, tensor_name_classes],
          feed_dict={'input_example_tensor:0': [example.SerializeToString()]})
Example #20
def _create_tf_example(feature_dict):
    """
    Creates a tf example protobuf message given a feature dict. The protobuf message is defined here
        https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/input.proto#L19
    Args:
        feature_dict (dict of str -> feature): each feature can be any of the following:
          int, string, unicode object, float, or a list of any of the previous types.

    Returns:
        a tf.train.Example including the features
    """
    def _create_feature(feature):
        feature_list = feature if isinstance(feature, list) else [feature]

        # Each feature can be exactly one kind:
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/example/feature.proto#L76

        feature_type = type(feature_list[0])
        if feature_type == int:
            return feature_pb2.Feature(int64_list=feature_pb2.Int64List(
                value=feature_list))
        elif feature_type == str:
            return feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
                value=feature_list))
        elif feature_type == unicode:
            return feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
                value=map(lambda x: str(x), feature_list)))
        elif feature_type == float:
            return feature_pb2.Feature(float_list=feature_pb2.FloatList(
                value=feature_list))
        else:
            message = """Unsupported request data format: {}, {}.
                            Valid formats: float, int, str any object that implements __iter__
                                           or classification_pb2.ClassificationRequest"""
            raise ValueError(message.format(feature, type(feature)))

    features = {k: _create_feature(v) for k, v in feature_dict.items()}
    return example_pb2.Example(features=feature_pb2.Features(feature=features))
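A hypothetical call to _create_tf_example; the feature names are invented, and the str/unicode branches of the function assume Python 2 string semantics.

# Hypothetical usage of _create_tf_example with scalar and list features.
example = _create_tf_example({'age': 42, 'scores': [0.1, 0.2, 0.3]})
serialized = example.SerializeToString()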
Example #21
def ExampleGen(data_path, num_epochs=None):
    """Generates tf.Examples from path of data files.
  ExampleGen
      Binary data format: <length><blob>. <length> represents the byte size
      of <blob>. <blob> is serialized tf.Example proto. The tf.Example contains
      the tokenized article text and summary.
  
    Args:
      data_path: path to tf.Example data files.
      num_epochs: Number of times to go through the data. None means infinite.
  
    Yields:
      Deserialized tf.Example.
  
    If there are multiple files specified, they accessed in a random order.
    """

    epoch = 0
    while True:
        if num_epochs is not None and epoch >= num_epochs:
            break

        filelist = FileUtils().get_files()
        assert filelist, 'Empty filelist.'
        # random.shuffle(filelist)
        for f in filelist:
            with open(f) as json_file:
                tf_example = example_pb2.Example()
                json_text = json.load(json_file)
                article = str(json_text["article"])
                summary = str(json_text["summary"])
                tf_example.features.feature['article'].bytes_list.value.extend(
                    [article])
                tf_example.features.feature[
                    'abstract'].bytes_list.value.extend([summary])
                yield tf_example

        epoch += 1
Example #22
def write_to_bin(source, target, vocab_path, out_file):
    num_stories = count_num(source)
    vocab_counter = collections.Counter()
    with open(out_file, 'wb') as writer:
        gen = get_art_abs(source, target)
        for idx in range(num_stories):
            if idx % 1000 == 0:
                print("Writing story %i of %i; %.2f percent done" %
                      (idx, num_stories,
                       float(idx) * 100.0 / float(num_stories)))
            article, abstract = next(gen)
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend(
                [bytes(article, 'utf-8')])
            tf_example.features.feature['abstract'].bytes_list.value.extend(
                [bytes(abstract, 'utf-8')])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))

            art_tokens = article.split(' ')
            abs_tokens = abstract.split(' ')
            abs_tokens = [
                t for t in abs_tokens
                if t not in [SENTENCE_START, SENTENCE_END]
            ]  # remove these tags from vocab
            tokens = art_tokens + abs_tokens
            tokens = [t.strip() for t in tokens]  # strip
            tokens = [t for t in tokens if t != ""]  # remove empty
            tokens = [t for t in tokens if not t.isdigit()]  # remove number
            vocab_counter.update(tokens)
    print("Finished writing file %s\n" % out_file)
    print("Writing vocab file...")
    with open(vocab_path, 'w') as writer:
        for word, count in vocab_counter.most_common():
            writer.write(word + ' ' + str(count) + '\n')
    print("Finished writing vocab file")
Example #23
    def test_decode_example_multi_shape_key_tensor(self):
        np_image = np.random.rand(2, 3, 1).astype('f')
        np_labels = np.array([[[1], [2], [3]], [[4], [5], [6]]])
        height, width, depth = np_labels.shape

        example = example_pb2.Example(features=feature_pb2.Features(feature={
            'image': self._encode_float_feature(np_image),
            'image/shape': self._encode_int64_feature(np.array(np_image.shape)),
            'labels': self._encode_int64_feature(np_labels),
            'labels/height': self._encode_int64_feature(np.array([height])),
            'labels/width': self._encode_int64_feature(np.array([width])),
            'labels/depth': self._encode_int64_feature(np.array([depth])),
        }))

        serialized_example = example.SerializeToString()

        with self.test_session():
            serialized_example = array_ops.reshape(serialized_example, shape=[])
            keys_to_features = {
                'image': parsing_ops.VarLenFeature(dtype=dtypes.float32),
                'image/shape': parsing_ops.VarLenFeature(dtype=dtypes.int64),
                'labels': parsing_ops.VarLenFeature(dtype=dtypes.int64),
                'labels/height': parsing_ops.VarLenFeature(dtype=dtypes.int64),
                'labels/width': parsing_ops.VarLenFeature(dtype=dtypes.int64),
                'labels/depth': parsing_ops.VarLenFeature(dtype=dtypes.int64),
            }
            items_to_handlers = {
                'image':
                    tfexample_decoder.Tensor('image', shape_keys='image/shape'),
                'labels':
                    tfexample_decoder.Tensor(
                        'labels',
                        shape_keys=['labels/height', 'labels/width', 'labels/depth']),
            }
            decoder = TFExampleDecoder(keys_to_features, items_to_handlers)
            [tf_image, tf_labels] = decoder.decode(serialized_example, ['image', 'labels'])
            self.assertAllEqual(tf_image.eval(), np_image)
            self.assertAllEqual(tf_labels.eval(), np_labels)
Example #24
def convert_files_to_binary(input_filenames, output_filename, counter):
    with open(output_filename, 'wb') as serialized_f:
        for filename in input_filenames:
            with open(filename, 'r') as input_f:
                print("handling file: ", filename)
                pattern = re.compile(
                    r'<HEADLINE>\n([\w\W]+?)\n</HEADLINE>[\w\W]+?<P>\n([\w\W]+?)\n</P>[\w\W]+?<P>\n([\w\W]+?)\n</P>'
                )
                if FLAGS.lexrank == True:
                    pattern = re.compile(
                        r'<HEADLINE>\n([\w\W]+?)\n</HEADLINE>[\w\W]+?<TEXT>\n([\w\W]+?)\n</TEXT>'
                    )
                for match in pattern.findall(input_f.read()):
                    if FLAGS.lexrank == True:
                        abstract, s1, s2 = lexrankSentences(match)
                    else:
                        abstract, s1, s2 = match

                    # split & count words
                    abstract = modify(abstract)
                    s1 = modify(s1)
                    s2 = modify(s2)
                    if len(abstract) == 0 or len(s1) == 0 or len(s2) == 0:
                        continue
                    counter.update(' '.join([abstract, s1, s2]).split())

                    # then create serialized version of abstract/article for training
                    article = ' '.join([s1, s2])
                    tf_example = example_pb2.Example()
                    tf_example.features.feature[
                        'article'].bytes_list.value.extend([article])
                    tf_example.features.feature[
                        'abstract'].bytes_list.value.extend([abstract])
                    tf_example_str = tf_example.SerializeToString()
                    str_len = len(tf_example_str)
                    serialized_f.write(struct.pack('q', str_len))
                    serialized_f.write(
                        struct.pack('%ds' % str_len, tf_example_str))
Example #25
    def test_decode_example_with_float_tensor(self):
        np_array = np.random.rand(2, 3, 1).astype('f')

        example = example_pb2.Example(features=feature_pb2.Features(
            feature={
                'array': self._encode_float_feature(np_array),
            }))

        serialized_example = example.SerializeToString()

        with self.test_session():
            serialized_example = array_ops.reshape(serialized_example,
                                                   shape=[])
            keys_to_features = {
                'array':
                parsing_ops.FixedLenFeature(np_array.shape, dtypes.float32)
            }
            items_to_handlers = {
                'array': tfexample_decoder.Tensor('array'),
            }
            decoder = TFExampleDecoder(keys_to_features, items_to_handlers)
            [tf_array] = decoder.decode(serialized_example, ['array'])
            self.assertAllEqual(tf_array.eval(), np_array)
Example #26
def write_bert_tf_example(simple_similar_source_indices, raw_article_sents, summary_text, corefs_str, doc_indices, article_lcs_paths_list, writer, dataset_name):
    tf_example = example_pb2.Example()
    source_indices_str = ';'.join([' '.join(str(i) for i in source_indices) for source_indices in simple_similar_source_indices])
    tf_example.features.feature['similar_source_indices'].bytes_list.value.extend([util.encode_text(source_indices_str)])
    for sent in raw_article_sents:
        s = sent.strip()
        tf_example.features.feature['raw_article_sents'].bytes_list.value.extend([util.encode_text(s)])
    if dataset_name == 'duc_2004':
        for summ_text in summary_text:
            tf_example.features.feature['summary_text'].bytes_list.value.extend([util.encode_text(summ_text)])
    else:
        tf_example.features.feature['summary_text'].bytes_list.value.extend([util.encode_text(summary_text)])
    if doc_indices is not None:
        tf_example.features.feature['doc_indices'].bytes_list.value.extend([util.encode_text(doc_indices)])
    if corefs_str is not None:
        tf_example.features.feature['corefs'].bytes_list.value.extend([corefs_str])
    if article_lcs_paths_list is not None:
        article_lcs_paths_list_str = '|'.join([';'.join([' '.join(str(i) for i in source_indices) for source_indices in article_lcs_paths]) for article_lcs_paths in article_lcs_paths_list])
        tf_example.features.feature['article_lcs_paths_list'].bytes_list.value.extend([util.encode_text(article_lcs_paths_list_str)])
    tf_example_str = tf_example.SerializeToString()
    str_len = len(tf_example_str)
    writer.write(struct.pack('q', str_len))
    writer.write(struct.pack('%ds' % str_len, tf_example_str))
Example #27
def _convert_files_to_binary(input_filenames, output_filename):
  with open(output_filename, 'wb') as writer:
    for filename in input_filenames:
      with open(filename, 'r') as f:
        document = f.read()
    
      document_parts = document.split('\n', 1)
      assert len(document_parts) == 2
    
      title = '<d><p><s>' + document_parts[0] + '</s></p></d>'
      
      body = document_parts[1].decode('utf8').replace('\n', ' ').replace('\t', ' ')
      sentences = sent_tokenize(body)
      body = '<d><p>' + ' '.join(['<s>' + sentence + '</s>' for sentence in sentences]) + '</p></d>'
      body = body.encode('utf8')
    
      tf_example = example_pb2.Example()
      tf_example.features.feature['article'].bytes_list.value.extend([body])
      tf_example.features.feature['abstract'].bytes_list.value.extend([title])
      tf_example_str = tf_example.SerializeToString()
      str_len = len(tf_example_str)
      writer.write(struct.pack('q', str_len))
      writer.write(struct.pack('%ds' % str_len, tf_example_str))
Example #28
def write_to_file(article, abstract, rel, writer):
    #abstract = '<s> ' + ' '.join(abstract) + ' </s>'
    #abstract = abstract.encode('utf8', 'ignore')
    #rel = rel.encode('utf8', 'ignore')
    #article = article.encode('utf8', 'ignore')
    #print(article)
    #print(abstract)
    #print(len(rel))
    tf_example = example_pb2.Example()
    tf_example.features.feature['abstract'].bytes_list.value.extend(
        [bytes(abstract)])
    tf_example.features.feature['relevancy'].bytes_list.value.extend(
        [bytes(rel)])
    tf_example.features.feature['article'].bytes_list.value.extend(
        [bytes(article)])
    tf_example_str = tf_example.SerializeToString()

    #print(bytes(rel))
    #print(tf_example.features.feature['relevancy'])
    #print((tf_example_str))
    str_len = len(tf_example_str)
    writer.write(struct.pack('q', str_len))
    writer.write(struct.pack('%ds' % str_len, tf_example_str))
Example #29
  def testDecodeExampleWithVarLenTensor(self):
    np_array = np.array([[[1], [2], [3]], [[4], [5], [6]]])

    example = example_pb2.Example(
        features=feature_pb2.Features(feature={
            'labels': self._EncodedInt64Feature(np_array),
        }))

    serialized_example = example.SerializeToString()

    with self.cached_session():
      serialized_example = array_ops.reshape(serialized_example, shape=[])
      keys_to_features = {
          'labels': parsing_ops.VarLenFeature(dtype=dtypes.int64),
      }
      items_to_handlers = {
          'labels': tfexample_decoder.Tensor('labels'),
      }
      decoder = tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                   items_to_handlers)
      [tf_labels] = decoder.decode(serialized_example, ['labels'])
      labels = tf_labels.eval()
      self.assertAllEqual(labels, np_array.flatten())
Example #30
    def test_decode_example_with_sparse_tensor(self):
        np_indices = np.array([[1], [2], [5]])
        np_values = np.array([0.1, 0.2, 0.6]).astype('f')
        example = example_pb2.Example(features=feature_pb2.Features(feature={
            'indices': self._encode_int64_feature(np_indices),
            'values': self._encode_float_feature(np_values),
        }))

        serialized_example = example.SerializeToString()

        with self.test_session():
            serialized_example = array_ops.reshape(serialized_example, shape=[])
            keys_to_features = {
                'indices': parsing_ops.VarLenFeature(dtype=dtypes.int64),
                'values': parsing_ops.VarLenFeature(dtype=dtypes.float32),
            }
            items_to_handlers = {'labels': tfexample_decoder.SparseTensor()}
            decoder = TFExampleDecoder(keys_to_features, items_to_handlers)
            [tf_labels] = decoder.decode(serialized_example, ['labels'])
            labels = tf_labels.eval()
            self.assertAllEqual(labels.indices, np_indices)
            self.assertAllEqual(labels.values, np_values)
            self.assertAllEqual(labels.dense_shape, np_values.shape)