示例#1
0
def main(argv):
    """Convert a whitespace-separated text file into SequenceExample records.

    Args:
        argv: command-line style list; ``argv[1]`` is the input text file and
            ``argv[2]`` is the TFRecord file to write.

    Input format, one example per line (lines starting with ``#`` are
    skipped): an optional first token ``_<int>`` giving the example id,
    then an integer label, then the float feature values. When no id token
    is present the running example count is used as the id.

    Stops early once ``FLAGS.num_examples`` examples were written (if set).
    """
    writer = tf.python_io.TFRecordWriter(argv[2])
    num = 0
    try:
        with open(argv[1]) as in_file:
            for line in in_file:
                if line[0] == '#':
                    continue
                if num % 10000 == 0:
                    print('%d lines done' % num)
                l = line.rstrip().split()
                if not l:
                    # Blank line: nothing to parse (l[0] would raise).
                    continue

                # A leading '_<id>' token shifts the label one column right.
                label_index = 0
                if l[0][0] == '_':
                    label_index = 1
                    example_id = int(l[0][1:])
                else:
                    example_id = num
                label = int(l[label_index])

                start = label_index + 1
                feature = [float(x) for x in l[start:]]

                if FLAGS.fake_var_len:
                    # Truncate some examples to produce sequences of
                    # differing lengths.
                    if example_id % 2 == 0:
                        feature = feature[:10]

                    if example_id % 3 == 0:
                        feature = feature[:20]

                example = tf.train.SequenceExample(
                    context=melt.features({
                        'id': melt.int_feature(example_id),
                        'label': melt.int_feature(label)
                    }),
                    feature_lists=melt.feature_lists({
                        # Each value becomes its own single-item feature and
                        # the items are stacked into one feature list
                        # (see sequence_test.py).
                        'feature':
                        melt.feature_list(
                            [melt.float_feature(item) for item in feature])
                    }))

                writer.write(example.SerializeToString())

                num += 1
                if FLAGS.num_examples and num == FLAGS.num_examples:
                    break
    finally:
        # Always flush and release the output file, even on a parse error.
        writer.close()
示例#2
0
def deal_file(file):
    """Convert one tab-separated text file into a TFRecord file.

    Every input line holds an image name (column 0) plus three columns
    selected through FLAGS: the texts (``FLAGS.text_index``), the image
    feature values (``FLAGS.image_feature_index``) and the input texts
    (``FLAGS.input_text_index``). Multiple values inside a column are
    separated by the 0x01 control character. One example is written for
    every (input text, text) pair that passes the filters below.

    The output path is ``FLAGS.output_directory/<FLAGS.name>-<suffix>`` where
    ``<suffix>`` is the part of the input base name after its last ``-``.

    Args:
        file: path of the input file to convert.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    # Open the input inside the same `with` so it is closed deterministically.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as in_file:
        num = 0
        for line in in_file:
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            # Lines with an unexpected feature length are reported and
            # skipped (they do not advance `num`).
            if len(image_feature) != IMAGE_FEATURE_LEN:
                print('bad line:', line)
                continue

            input_texts = l[FLAGS.input_text_index].split('\x01')
            for input_text in input_texts:
                input_words = text2ids.Segmentor.Segment(
                    input_text, FLAGS.seg_method)
                input_word_ids = text2ids.words2ids(
                    input_words,
                    feed_single=FLAGS.feed_single,
                    allow_all_zero=True,
                    pad=False)
                if len(input_word_ids) == 0:
                    continue

                input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
                if FLAGS.pad:
                    input_word_ids = gezi.pad(input_word_ids)

                is_top_text = True
                for text in texts:
                    if text.strip() == '':
                        continue

                    words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                    word_ids = text2ids.words2ids(
                        words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    word_ids_length = len(word_ids)
                    # Periodic sample dump for eyeballing the conversion.
                    if num % 1000 == 0:
                        print(img,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                    if word_ids_length == 0:
                        continue
                    if is_luanma(words, word_ids):
                        print('luanma',
                              img,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                        continue

                    word_ids = word_ids[:TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                    if not FLAGS.write_sequence_example:
                        example = tf.train.Example(features=tf.train.Features(
                            feature={
                                'image_name': melt.bytes_feature(img),
                                'image_feature': melt.float_feature(
                                    image_feature),
                                'input_text_str': melt.bytes_feature(
                                    input_text),
                                'input_text': melt.int64_feature(
                                    input_word_ids),
                                'text_str': melt.bytes_feature(text),
                                'text': melt.int64_feature(word_ids),
                            }))
                    else:
                        example = tf.train.SequenceExample(
                            context=melt.features({
                                'image_name':
                                melt.bytes_feature(img),
                                'image_feature':
                                melt.float_feature(image_feature),
                                'input_text_str':
                                melt.bytes_feature(input_text),
                                'text_str':
                                melt.bytes_feature(text),
                            }),
                            feature_lists=melt.feature_lists({
                                'input_text':
                                melt.int64_feature_list(input_word_ids),
                                'text':
                                melt.int64_feature_list(word_ids)
                            }))
                    writer.write(example)

                # NOTE(review): the bookkeeping below runs once per
                # input_text, after the loop over `texts`. It therefore sees
                # whatever `text`, `word_ids` and `word_ids_length` the last
                # inner iteration left behind, and raises NameError if those
                # were never assigned. Confirm whether it was meant to live
                # inside the loop over `texts`.
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    # The module-level lists are only filled when running
                    # with a single thread.
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        image_features.append(image_feature)

                    if FLAGS.num_max_records > 0:
                        # With a fixed record budget only the first input
                        # text of each image is kept.
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
示例#3
0
def deal_file(file):
    """Convert one tab-separated image/text file into a TFRecord file.

    Every input line holds an image name (column 0), the texts
    (``FLAGS.text_index``) and the image feature values
    (``FLAGS.image_feature_index``); multiple values inside a column are
    separated by the 0x01 control character. One example is written per
    non-empty text.

    The output path is ``FLAGS.output_directory/<FLAGS.name>-<suffix>`` where
    ``<suffix>`` is the part of the input base name after its last ``-``.

    Args:
        file: path of the input file to convert.

    Raises:
        AssertionError: if a line's image feature does not have
            ``IMAGE_FEATURE_LEN`` values, or if ``FLAGS.np_save`` is set
            while ``FLAGS.threads`` is not 1.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    # Open the input inside the same `with` so it is closed deterministically.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as in_file:
        num = 0
        for line in in_file:
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (
                img, len(image_feature))

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                # Length is taken before truncation/padding below; it feeds
                # the max/sum word statistics.
                word_ids_length = len(word_ids)
                # Periodic sample dump for eyeballing the conversion.
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empy wordids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                    continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_feature': melt.float_feature(image_feature),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_feature':
                            melt.float_feature(image_feature),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                # Shared counters: each update is done under the counter's
                # own lock.
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    # The module-level lists are only filled when running
                    # with a single thread.
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    # Deprecated: image_labels is no longer used.
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    # First written text of this image: record the image
                    # itself exactly once.
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        if FLAGS.small_feature:
                            image_features.append(image_feature)
                        else:
                            # Store the picture path instead of the feature.
                            image_features.append(
                                os.path.join(FLAGS.big_feature_image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        # With a fixed record budget only the first text of
                        # each image is kept.
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
示例#4
0
def deal_imgtextfile(file):
    """Convert a tab-separated file of URL-quoted images into a TFRecord file.

    Column 0 of each line is the image name and the last column is the
    URL-quoted image data; the texts for an image come from
    ``pic_info_map`` (0x01-separated). Lines whose image is not in
    ``pic_info_map`` are skipped. One example is written per non-empty text.

    Storing image text / encoded images is large (per the original author,
    about 18G for 20k pictures versus 373M for a (23820, 2048) feature
    matrix), so this path is rarely used: only when image metric evaluation
    (recall@1, ...) is not needed and you do not want to convert and store
    image binaries from the image text in a preprocessing step.

    The output path is ``FLAGS.output_directory/<FLAGS.name>-<suffix>`` where
    ``<suffix>`` is the part of the input base name after its last ``-``.

    Args:
        file: path of the input file to convert.

    Raises:
        AssertionError: if ``pic_info_map`` is empty, or if ``FLAGS.np_save``
            is set while ``FLAGS.threads`` is not 1.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    assert len(pic_info_map) > 0
    # Open the input inside the same `with` so it is closed deterministically.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as in_file:
        num = 0
        for line in in_file:
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            # Skipped lines do not advance `num`.
            if img not in pic_info_map:
                continue

            img_text = l[-1]
            encoded_image = urllib.unquote_plus(img_text)

            text_info = pic_info_map[img]
            texts = text_info.split('\x01')

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                # Length is taken before truncation/padding below; it feeds
                # the max/sum word statistics.
                word_ids_length = len(word_ids)
                # Periodic sample dump for eyeballing the conversion.
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empy wordids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                    continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_data': melt.bytes_feature(encoded_image),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_data':
                            melt.bytes_feature(encoded_image),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                # Shared counters: each update is done under the counter's
                # own lock.
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    # The module-level lists are only filled when running
                    # with a single thread.
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    # Deprecated: image_labels is no longer used.
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    # First written text of this image: record the image
                    # itself exactly once.
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        # TODO: the encoded image is too big to keep in
                        # memory, so it is not stored for evaluation.
                        if FLAGS.image_dir:
                            # Store the picture path instead of a feature.
                            image_features.append(
                                os.path.join(FLAGS.image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        # With a fixed record budget only the first text of
                        # each image is kept.
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
示例#5
0
def _parse_line(line, writer, thread_index=0):
  """Write the examples for one image-feature line to `writer`.

  The line is tab-separated: the image name followed by the float values of
  its feature. One example is written for each (text, original text) pair
  registered for that image in ``text_map``; images missing from
  ``text_map`` are reported and ignored.

  Args:
    line: one raw input line.
    writer: record writer; receives serialized examples via ``write``.
    thread_index: index of the per-thread slot in ``gtexts`` /
      ``gtext_strs`` used when ``FLAGS.np_save`` is set.
  """
  l = line.rstrip().split('\t')
  image_name = l[0]
  image_feature = [float(x) for x in l[1:]]
  if image_name not in text_map:
    print('image ', image_name, 'ignore ', 'name_len ', len(image_name), len(image_name.strip()))
    return
  else:
    image_path = FLAGS.image_dir + '/' + image_name

    if FLAGS.write_raw_image_bytes:
      # Image files are binary: read them in "rb" mode so the bytes are
      # returned untouched instead of being decoded as text.
      with tf.gfile.FastGFile(image_path, "rb") as f:
        encoded_image = f.read()
    else:
      encoded_image = ''

    # JPEG validation through decoder.decode_jpeg used to live here; per the
    # original author's note it hangs when run with multiple processes, so
    # it stays disabled.

    for text, ori_text in text_map[image_name]:
      # Words missing from the vocabulary are dropped unless ENCODE_UNK is
      # set.
      word_ids = [vocabulary.id(word) for word in text.split(WORDS_SEP) if vocabulary.has(word) or ENCODE_UNK]
      if not word_ids:
        continue
      # Length before truncation/padding; it feeds the word statistics.
      word_ids_length = len(word_ids)
      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)

      if not FLAGS.write_sequence_example:
        example = tf.train.Example(features=tf.train.Features(feature={
          'image_name': melt.bytes_feature(image_name),
          'image_data': melt.bytes_feature(encoded_image),
          'image_feature': melt.float_feature(image_feature),
          'text': melt.int64_feature(word_ids),
          'text_str': melt.bytes_feature(ori_text),
          }))
      else:
        example = tf.train.SequenceExample(
          context=melt.features(
            {
              'image_name': melt.bytes_feature(image_name),
              'image_data': melt.bytes_feature(encoded_image),
              'image_feature': melt.float_feature(image_feature),
              'text_str': melt.bytes_feature(ori_text),
             }),
          feature_lists=melt.feature_lists(
          {
            'text': melt.int64_feature_list(word_ids)
          }))

      if FLAGS.np_save:
        gtexts[thread_index].append(word_ids)
        gtext_strs[thread_index].append(ori_text)

      # NOTICE: the num_records path is not tested with num_threads > 1.
      if FLAGS.num_records:
        # Fixed-size mode: write only the first example of each new image
        # and stop the process once enough distinct images were seen.
        if image_name not in images:
          images[image_name] = 1
          print(image_name, len(images))
          writer.write(example.SerializeToString())
          if len(images) == FLAGS.num_records:
            print('Done')
            # NOTE(review): exits with status 1 on the normal "Done" path;
            # confirm callers rely on this before changing it.
            exit(1)
      else:
        writer.write(example.SerializeToString())
        # Shared counters: each update is done under the counter's own lock.
        with counter.get_lock():
          counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length
示例#6
0
def deal_file(file):
  """Convert one tab-separated (input text, text) file into a TFRecord file.

  Each line provides the text (``FLAGS.text_index``) and the input text
  (``FLAGS.input_text_index``). Both are segmented and mapped to word ids;
  lines where either side yields no ids, or where the text is rejected by
  ``is_luanma``, are skipped without counting towards
  ``FLAGS.num_max_records``.

  The output path is ``FLAGS.output_directory/<FLAGS.name>-<suffix>`` where
  ``<suffix>`` is the part of the input base name after its last ``-``.

  Args:
    file: path of the input file to convert.
  """
  out_file = '{}/{}'.format(FLAGS.output_directory, '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  # Open the input inside the same `with` so it is closed deterministically.
  with melt.tfrecords.Writer(out_file) as writer, open(file) as in_file:
    num = 0
    for line in in_file:
      if num % 1000 == 0:
        print(num)

      l = line.rstrip('\n').split('\t')

      text = l[FLAGS.text_index]

      input_text = l[FLAGS.input_text_index]

      input_words = text2ids.Segmentor.Segment(input_text, FLAGS.seg_method)
      input_word_ids = text2ids.words2ids(input_words, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
      if len(input_word_ids) == 0:
        continue
      input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
      if FLAGS.pad:
        input_word_ids = gezi.pad(input_word_ids)

      words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
      word_ids = text2ids.words2ids(words, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
      # Length before truncation/padding; it feeds the word statistics.
      word_ids_length = len(word_ids)
      # Periodic sample dump for eyeballing the conversion.
      if num % 1000 == 0:
        print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
      if word_ids_length == 0:
        continue
      if is_luanma(words, word_ids):
        print('luanma', text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        continue

      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
      if not FLAGS.write_sequence_example:
        example = tf.train.Example(features=tf.train.Features(feature={
          'input_text_str': melt.bytes_feature(input_text),
          'input_text': melt.int64_feature(input_word_ids),
          'text_str': melt.bytes_feature(text),
          'text': melt.int64_feature(word_ids),
          }))
      else:
        example = tf.train.SequenceExample(
              context=melt.features(
              {
                'input_text_str': melt.bytes_feature(input_text),
                'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
              {
                'input_text': melt.int64_feature_list(input_word_ids),
                'text': melt.int64_feature_list(word_ids)
              }))
      writer.write(example)

      # Shared counters: each update is done under the counter's own lock.
      with record_counter.get_lock():
        record_counter.value += 1
      if word_ids_length > max_num_words.value:
        with max_num_words.get_lock():
          max_num_words.value = word_ids_length
      with sum_words.get_lock():
        sum_words.value += word_ids_length

      if FLAGS.np_save:
        # The module-level lists are only filled when running with a single
        # thread.
        assert FLAGS.threads == 1
        gtexts.append(word_ids)
        gtext_strs.append(text)

      num += 1
      if num == FLAGS.num_max_records:
        break
示例#7
0
def deal_file(file):
    """Convert one tab-separated click-log file into a TFRecord file.

    Column layout as read by this function: 0 = cs, 1 = objurl (used as the
    image name), 2 = unused here, 3 = simid, 4 = keywords, 5 = extended
    keywords (first 0x01-separated item of each is kept), then
    ``IDL4W_FEATURE_LEN`` float columns, the click queries (separated by
    ``$*$``), titles, descs, and finally the inception feature values.

    Lines whose click-query column is ``noclickquery`` are skipped; for the
    others one example is written per usable click query.

    The output path is ``FLAGS.output_directory/<FLAGS.name>-<suffix>`` where
    ``<suffix>`` is the part of the input base name after its last ``-``.

    Args:
        file: path of the input file to convert.

    Raises:
        AssertionError: if the inception feature does not have
            ``INCEPTION_FEATURE_LEN`` values, or if ``FLAGS.np_save`` is set
            while ``FLAGS.threads`` is not 1.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    # Open the input inside the same `with` so it is closed deterministically.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as in_file:
        num = 0
        for line in in_file:
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            cs = l[0]
            simid = l[3]
            objurl = l[1]
            keyword = l[4].split('\x01')[0]
            extended_keyword = l[5].split('\x01')[0]

            img = objurl

            # The idl4w feature occupies columns [6, idl4w_end).
            idl4w_end = IDL4W_FEATURE_LEN + 6
            idl4w_feature = [float(x) for x in l[6:idl4w_end]]

            titles = l[idl4w_end + 1]
            descs = l[idl4w_end + 2]

            inception_feature = [float(x) for x in l[idl4w_end + 3:]]

            assert len(inception_feature) == INCEPTION_FEATURE_LEN, '%d %s' % (
                len(inception_feature), cs)

            click_query = l[idl4w_end]
            show_str = 'click:{} ex_key:{} key:{} titles:{} descs:{}'.format(
                click_query, extended_keyword, keyword, titles, descs)
            if click_query == 'noclickquery':
                # TODO: only click queries are considered for now; such
                # lines do not advance `num`.
                continue
            else:
                click_queries = click_query.split('$*$')
                is_top_text = True
                for click_query in click_queries:
                    if click_query.strip() == '':
                        continue

                    text_str = '{} {}'.format(click_query, show_str)

                    text = click_query
                    words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                    word_ids = text2ids.words2ids(
                        words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    # Length before truncation/padding; it feeds the word
                    # statistics.
                    word_ids_length = len(word_ids)
                    # Periodic sample dump for eyeballing the conversion.
                    if num % 1000 == 0:
                        print(cs,
                              simid,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(idl4w_feature),
                              len(inception_feature),
                              file=sys.stderr)
                    if len(word_ids) == 0:
                        continue
                    if is_bad(words, word_ids):
                        continue

                    word_ids = word_ids[:TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                    if not FLAGS.write_sequence_example:
                        example = tf.train.Example(features=tf.train.Features(
                            feature={
                                'image_name':
                                melt.bytes_feature(img),
                                'idl4w_feature':
                                melt.float_feature(idl4w_feature),
                                'inception_feature':
                                melt.float_feature(inception_feature),
                                'text_str':
                                melt.bytes_feature(text_str),
                                'text':
                                melt.int64_feature(word_ids),
                            }))
                    else:
                        example = tf.train.SequenceExample(
                            context=melt.features({
                                'image_name':
                                melt.bytes_feature(img),
                                'idl4w_feature':
                                melt.float_feature(idl4w_feature),
                                'inception_feature':
                                melt.float_feature(inception_feature),
                                'text_str':
                                melt.bytes_feature(text_str),
                            }),
                            feature_lists=melt.feature_lists(
                                {'text': melt.int64_feature_list(word_ids)}))
                    writer.write(example)

                    # Shared counters: each update is done under the
                    # counter's own lock.
                    with record_counter.get_lock():
                        record_counter.value += 1
                    if word_ids_length > max_num_words.value:
                        with max_num_words.get_lock():
                            max_num_words.value = word_ids_length
                    with sum_words.get_lock():
                        sum_words.value += word_ids_length

                    if FLAGS.np_save:
                        # The module-level lists are only filled when
                        # running with a single thread.
                        assert FLAGS.threads == 1
                        texts.append(word_ids)
                        text_strs.append(text)

                        if img not in image_labels:
                            image_labels[img] = set()
                        image_labels[img].add(text)

                    if is_top_text:
                        # First written query of this image: record the
                        # image itself exactly once.
                        is_top_text = False
                        with image_counter.get_lock():
                            image_counter.value += 1

                        if FLAGS.np_save:
                            if img not in image_labels:
                                image_labels[img] = set()

                            image_names.append(img)
                            idl4w_features.append(idl4w_feature)
                            inception_features.append(inception_feature)

                        if FLAGS.num_max_records > 0:
                            # With a fixed record budget only the first
                            # click query of each image is kept.
                            break

            num += 1
            if num == FLAGS.num_max_records:
                break