예제 #1
0
def parse_text_file(text_file):
    """Load per-image texts from a tab-separated file into the module maps.

    Every line is split on tabs. The first field is the image key, of which
    only the part before the first ``#`` is kept; the last field is the text;
    the field at ``FLAGS.ori_text_index`` is the original text. Results are
    accumulated in two module-level dicts:

    * ``text_map``  -- image -> list of ``(text, ori_text)`` tuples
    * ``text_map_`` -- image -> set of texts already stored for that image,
      used to skip duplicate texts of the same image

    Lines whose text is empty after stripping are skipped.

    Args:
        text_file: path of the file to parse. If it does not exist a message
            is printed to stderr and nothing else happens.

    Returns:
        None. The module-level maps are updated in place.

    Raises:
        ValueError: if the first field of a line contains no ``#``.
        IndexError: if ``FLAGS.ori_text_index`` is out of range for a line.
    """
    if not os.path.exists(text_file):
        print('No text file', text_file, file=sys.stderr)
        return
    num_lines = gezi.get_num_lines(text_file)
    pb = ProgressBar(num_lines, 'parse text file %s' % text_file)
    # Use a context manager so the file handle is closed even on error.
    with open(text_file) as f:
        for line in f:
            pb.progress()
            fields = line.split('\t')
            image = fields[0]
            image = image[:image.index('#')]
            text = fields[-1].strip()
            # Why both text and ori_text? For a Chinese corpus the text is
            # \x01 separated (segmented text); for an English corpus text
            # and ori_text are the same.
            ori_text = fields[FLAGS.ori_text_index].strip()
            if text == '':
                continue
            if image not in text_map:
                text_map_[image] = set([text])
                text_map[image] = [(text, ori_text)]
            elif text not in text_map_[image]:
                # De-duplicate against the texts of *this* image. The seen-set
                # dict is keyed by image, so testing the text against the
                # dict itself would never filter anything.
                text_map_[image].add(text)
                text_map[image].append((text, ori_text))
    # Re-materialize every value as a plain list (a shallow copy for the
    # entries built above).
    for image in text_map:
        text_map[image] = list(text_map[image])
예제 #2
0
def convert_to(feat_file, name):
  """Convert a feature file into ``FLAGS.shards`` TFRecord shard files.

  Line indices are split into ``FLAGS.shards`` contiguous ranges with
  ``np.linspace``. Two modes are supported:

  * ``FLAGS.threads > 1``: the whole file is read into memory and one
    process per shard is started with ``_convert_to`` as target, receiving
    ``(lines, name, shard_index, start, end)``. All processes are joined
    before returning.
  * otherwise: the file is streamed line by line in this process and each
    line is handed to ``_parse_line`` together with the writer of the shard
    whose range contains the line index. Shard files are named
    ``<name>-<shard>-of-<num_shards>`` under ``FLAGS.output_directory``.

  When ``FLAGS.debug`` is set at most ``NUM_DEBUG_LINES`` lines are used.

  Args:
    feat_file: path of the input feature file, one record per line.
    name: prefix of the output shard file names.

  Returns:
    None.

  Raises:
    AssertionError: if ``FLAGS.threads > 1`` and it differs from
      ``FLAGS.shards``.
  """
  num_shards = FLAGS.shards
  num_threads = FLAGS.threads
  if num_threads > 1:
    assert(num_threads == num_shards)
    # Read everything up front and close the handle right away.
    with open(feat_file) as f:
      lines = f.readlines()
    num_lines = len(lines)
    if FLAGS.debug:
      # Never let the shard ranges run past the lines actually available.
      num_lines = min(num_lines, NUM_DEBUG_LINES)
    shard_ranges = np.linspace(0,
                               num_lines,
                               num_shards + 1).astype(int)

    workers = []
    for i in range(num_threads):
      args = (lines, name, i, shard_ranges[i], shard_ranges[i + 1])
      process = multiprocessing.Process(target=_convert_to, args=args)
      process.start()
      workers.append(process)

    for process in workers:
      process.join()
    return

  #--------------single thread
  num_lines = gezi.get_num_lines(feat_file)
  if FLAGS.debug:
    num_lines = min(num_lines, NUM_DEBUG_LINES)
  shard_ranges = np.linspace(0,
                             num_lines,
                             num_shards + 1).astype(int)
  pb = ProgressBar(num_lines, "convert")

  def open_writer(shard, count):
    # Build the shard path, announce it and open a record writer for it.
    output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
    output_file = os.path.join(FLAGS.output_directory, output_filename)
    print('Writing', output_file, count)
    return tf.python_io.TFRecordWriter(output_file)

  shard = 0
  count = 0
  writer = open_writer(shard, count)
  try:
    with open(feat_file) as f:
      for line in f:
        pb.progress()
        # A loop (not a single check) so that empty shard ranges are stepped
        # over and the line lands in the shard whose range contains it. The
        # bound on shard keeps surplus lines in the last shard instead of
        # indexing past the end of shard_ranges.
        while shard + 1 < num_shards and count >= shard_ranges[shard + 1]:
          shard += 1
          writer.close()
          writer = None  # avoid a second close if opening the next one fails
          writer = open_writer(shard, count)
        _parse_line(line, writer)

        count += 1
        if FLAGS.debug and count >= NUM_DEBUG_LINES:
          break
  finally:
    # Close the current shard even when parsing or I/O raised.
    if writer is not None:
      writer.close()