示例#1
0
def deal_file(file):
  """Convert one tab-separated text file into a TFRecord file.

  For each input line, reads the target text (column FLAGS.text_index)
  and the input text (column FLAGS.input_text_index), segments both with
  text2ids, maps words to ids, truncates/pads, and writes either a
  tf.train.Example or a tf.train.SequenceExample depending on
  FLAGS.write_sequence_example. Lines whose id sequences come out empty,
  or that the is_luanma heuristic flags as garbled, are skipped.
  Shared multiprocessing counters (record_counter, max_num_words,
  sum_words) are updated once per written record.

  Args:
    file: path to the input file; the output name is derived from the
      trailing '-'-suffix of its basename.
  """
  out_file = '{}/{}'.format(FLAGS.output_directory, '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    # Fix: context-managed open so the input handle is always closed
    # (the original bare `open(file)` leaked it on exceptions and on
    # non-refcounting interpreters).
    with open(file) as infile:
      for line in infile:
        # Progress heartbeat every 1000 processed lines.
        if num % 1000 == 0:
          print(num)

        l = line.rstrip('\n').split('\t')

        text = l[FLAGS.text_index]

        input_text = l[FLAGS.input_text_index]

        input_words = text2ids.Segmentor.Segment(input_text, FLAGS.seg_method)
        input_word_ids = text2ids.words2ids(input_words, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
        # Skip lines whose input text produced no ids at all.
        if not input_word_ids:
          continue
        input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
        if FLAGS.pad:
          # NOTE(review): pads with gezi.pad defaults here, while the
          # target text below pads explicitly to TEXT_MAX_WORDS with 0
          # — confirm the defaults match INPUT_TEXT_MAX_WORDS.
          input_word_ids = gezi.pad(input_word_ids)

        words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
        word_ids = text2ids.words2ids(words, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
        word_ids_length = len(word_ids)
        if num % 1000 == 0:
          print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        if word_ids_length == 0:
          continue
        # is_luanma: heuristic garbled-text ("luan ma" / mojibake) filter.
        if is_luanma(words, word_ids):
          print('luanma', text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
          continue

        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
        if not FLAGS.write_sequence_example:
          example = tf.train.Example(features=tf.train.Features(feature={
            'input_text_str': melt.bytes_feature(input_text),
            'input_text': melt.int64_feature(input_word_ids),
            'text_str': melt.bytes_feature(text),
            'text': melt.int64_feature(word_ids),
            }))
        else:
          example = tf.train.SequenceExample(
                context=melt.features(
                {
                  'input_text_str': melt.bytes_feature(input_text),
                  'text_str': melt.bytes_feature(text),
                }),
                feature_lists=melt.feature_lists(
                {
                  'input_text': melt.int64_feature_list(input_word_ids),
                  'text': melt.int64_feature_list(word_ids)
                }))
        writer.write(example)

        # Update shared (multiprocessing) statistics for this record.
        with record_counter.get_lock():
          record_counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

        if FLAGS.np_save:
          # np_save appends to plain module-level lists, so it is only
          # safe single-threaded.
          assert FLAGS.threads == 1
          gtexts.append(word_ids)
          gtext_strs.append(text)

        num += 1
        if num == FLAGS.num_max_records:
          break
示例#2
0
def deal_file(file):
    """Convert one tab-separated image/text file into TFRecord examples.

    Each line holds an image name (column 0), '\x01'-separated candidate
    texts (FLAGS.text_index), a '\x01'-separated float image feature
    vector (FLAGS.image_feature_index), and '\x01'-separated input texts
    (FLAGS.input_text_index). Every (input_text, text) pair that
    survives the empty / garbled (is_luanma) filters is written as one
    tf.train.Example or tf.train.SequenceExample depending on
    FLAGS.write_sequence_example. Lines whose image feature length does
    not equal IMAGE_FEATURE_LEN are dropped.

    Args:
      file: path to the input file; the output name is derived from the
        trailing '-'-suffix of its basename.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        # Fix: context-managed open so the input handle is always closed
        # (the original bare `open(file)` leaked it on exceptions and on
        # non-refcounting interpreters).
        with open(file) as infile:
            for line in infile:
                # Progress heartbeat every 1000 processed lines.
                if num % 1000 == 0:
                    print(num)

                l = line.rstrip('\n').split('\t')
                img = l[0]

                texts = l[FLAGS.text_index].split('\x01')

                image_feature = [
                    float(x)
                    for x in l[FLAGS.image_feature_index].strip().split('\x01')
                ]
                #assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d'%(img, len(image_feature))
                if len(image_feature) != IMAGE_FEATURE_LEN:
                    print('bad line:', line)
                    continue

                input_texts = l[FLAGS.input_text_index].split('\x01')
                for input_text in input_texts:
                    input_words = text2ids.Segmentor.Segment(
                        input_text, FLAGS.seg_method)
                    input_word_ids = text2ids.words2ids(
                        input_words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    # Skip input texts that produced no ids at all.
                    if not input_word_ids:
                        continue

                    input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        input_word_ids = gezi.pad(input_word_ids)

                    is_top_text = True
                    for text in texts:
                        if text.strip() == '':
                            continue

                        words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                        word_ids = text2ids.words2ids(
                            words,
                            feed_single=FLAGS.feed_single,
                            allow_all_zero=True,
                            pad=False)
                        word_ids_length = len(word_ids)
                        if num % 1000 == 0:
                            print(img,
                                  text,
                                  word_ids,
                                  text2ids.ids2text(word_ids),
                                  len(image_feature),
                                  file=sys.stderr)
                        if word_ids_length == 0:
                            continue
                        # is_luanma: heuristic garbled-text filter.
                        if is_luanma(words, word_ids):
                            print('luanma',
                                  img,
                                  text,
                                  word_ids,
                                  text2ids.ids2text(word_ids),
                                  len(image_feature),
                                  file=sys.stderr)
                            continue

                        word_ids = word_ids[:TEXT_MAX_WORDS]
                        if FLAGS.pad:
                            word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                        if not FLAGS.write_sequence_example:
                            example = tf.train.Example(features=tf.train.Features(
                                feature={
                                    'image_name': melt.bytes_feature(img),
                                    'image_feature': melt.float_feature(
                                        image_feature),
                                    'input_text_str': melt.bytes_feature(
                                        input_text),
                                    'input_text': melt.int64_feature(
                                        input_word_ids),
                                    'text_str': melt.bytes_feature(text),
                                    'text': melt.int64_feature(word_ids),
                                }))
                        else:
                            example = tf.train.SequenceExample(
                                context=melt.features({
                                    'image_name':
                                    melt.bytes_feature(img),
                                    'image_feature':
                                    melt.float_feature(image_feature),
                                    'input_text_str':
                                    melt.bytes_feature(input_text),
                                    'text_str':
                                    melt.bytes_feature(text),
                                }),
                                feature_lists=melt.feature_lists({
                                    'input_text':
                                    melt.int64_feature_list(input_word_ids),
                                    'text':
                                    melt.int64_feature_list(word_ids)
                                }))
                        writer.write(example)

                    # NOTE(review): the counter updates below sit at the
                    # input_text level, so they run once per input_text
                    # (using the last processed text's word_ids_length),
                    # not once per written record as in the sibling
                    # deal_file variants — confirm this is intended.
                    with record_counter.get_lock():
                        record_counter.value += 1
                    if word_ids_length > max_num_words.value:
                        with max_num_words.get_lock():
                            max_num_words.value = word_ids_length
                    with sum_words.get_lock():
                        sum_words.value += word_ids_length

                    if FLAGS.np_save:
                        # np_save appends to plain module-level containers,
                        # so it is only safe single-threaded.
                        assert FLAGS.threads == 1
                        gtexts.append(word_ids)
                        gtext_strs.append(text)

                        if img not in image_labels:
                            image_labels[img] = set()
                        image_labels[img].add(text)

                    if is_top_text:
                        # First surviving input_text for this image: count
                        # the image and store its feature once.
                        is_top_text = False
                        with image_counter.get_lock():
                            image_counter.value += 1

                        if FLAGS.np_save:
                            if img not in image_labels:
                                image_labels[img] = set()

                            image_names.append(img)
                            image_features.append(image_feature)

                        if FLAGS.num_max_records > 0:
                            #if fixed valid only get one click for each image
                            break

                num += 1
                if num == FLAGS.num_max_records:
                    break
示例#3
0
def deal_file(file):
    """Convert one tab-separated click-query feature file into TFRecords.

    Each line carries ids (cs, objurl, fromurl, simid), keywords, an
    idl4w float feature vector in columns [6, 6 + IDL4W_FEATURE_LEN),
    titles/descs, a '$*$'-separated click-query field, and an inception
    float feature vector in the trailing columns. Every non-empty click
    query that survives the empty / is_bad filters is written as one
    tf.train.Example or tf.train.SequenceExample (per
    FLAGS.write_sequence_example), keyed by the objurl as image name.
    Lines with 'noclickquery' are skipped entirely.

    Args:
      file: path to the input file; the output name is derived from the
        trailing '-'-suffix of its basename.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        # Fix: context-managed open so the input handle is always closed
        # (the original bare `open(file)` leaked it on exceptions and on
        # non-refcounting interpreters).
        with open(file) as infile:
            for line in infile:
                # Progress heartbeat every 1000 processed lines.
                if num % 1000 == 0:
                    print(num)

                l = line.rstrip('\n').split('\t')
                cs = l[0]  #cs
                simid = l[3]
                objurl = l[1]
                fromurl = l[2]
                keyword = l[4].split('\x01')[0]
                extended_keyword = l[5].split('\x01')[0]

                img = objurl
                #img = cs

                # idl4w feature occupies columns [6, idl4w_end).
                idl4w_end = IDL4W_FEATURE_LEN + 6
                idl4w_feature = [float(x) for x in l[6:idl4w_end]]

                titles = l[idl4w_end + 1]
                descs = l[idl4w_end + 2]

                inception_feature = [float(x) for x in l[idl4w_end + 3:]]

                # NOTE(review): assert is stripped under python -O;
                # consider an explicit check if this must always run.
                assert len(inception_feature) == INCEPTION_FEATURE_LEN, '%d %s' % (
                    len(inception_feature), cs)

                click_query = l[idl4w_end]
                show_str = 'click:{} ex_key:{} key:{} titles:{} descs:{}'.format(
                    click_query, extended_keyword, keyword, titles, descs)
                if click_query == 'noclickquery':
                    click_query = ''
                    #TODO now only consider click_query
                    # Skips the num increment below as well.
                    continue
                else:
                    click_queries = click_query.split('$*$')
                    is_top_text = True
                    for click_query in click_queries:
                        if click_query.strip() == '':
                            continue

                        text_str = '{} {}'.format(click_query, show_str)

                        text = click_query
                        words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                        word_ids = text2ids.words2ids(
                            words,
                            feed_single=FLAGS.feed_single,
                            allow_all_zero=True,
                            pad=False)
                        word_ids_length = len(word_ids)
                        if num % 1000 == 0:
                            print(cs,
                                  simid,
                                  text,
                                  word_ids,
                                  text2ids.ids2text(word_ids),
                                  len(idl4w_feature),
                                  len(inception_feature),
                                  file=sys.stderr)
                        # Skip queries that produced no ids at all.
                        if not word_ids:
                            continue
                        # is_bad: heuristic bad/garbled text filter.
                        if is_bad(words, word_ids):
                            #print('luan_ma', cs, simid, text, word_ids, text2ids.ids2text(word_ids), len(idl4w_feature), len(inception_feature), file=sys.stderr)
                            continue

                        word_ids = word_ids[:TEXT_MAX_WORDS]
                        if FLAGS.pad:
                            word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                        if not FLAGS.write_sequence_example:
                            example = tf.train.Example(features=tf.train.Features(
                                feature={
                                    'image_name':
                                    melt.bytes_feature(img),
                                    'idl4w_feature':
                                    melt.float_feature(idl4w_feature),
                                    'inception_feature':
                                    melt.float_feature(inception_feature),
                                    'text_str':
                                    melt.bytes_feature(text_str),
                                    'text':
                                    melt.int64_feature(word_ids),
                                }))
                        else:
                            example = tf.train.SequenceExample(
                                context=melt.features({
                                    'image_name':
                                    melt.bytes_feature(img),
                                    'idl4w_feature':
                                    melt.float_feature(idl4w_feature),
                                    'inception_feature':
                                    melt.float_feature(inception_feature),
                                    'text_str':
                                    melt.bytes_feature(text_str),
                                }),
                                feature_lists=melt.feature_lists(
                                    {'text': melt.int64_feature_list(word_ids)}))
                        writer.write(example)

                        # Update shared (multiprocessing) statistics for
                        # this written record.
                        with record_counter.get_lock():
                            record_counter.value += 1
                        if word_ids_length > max_num_words.value:
                            with max_num_words.get_lock():
                                max_num_words.value = word_ids_length
                        with sum_words.get_lock():
                            sum_words.value += word_ids_length

                        if FLAGS.np_save:
                            # np_save appends to plain module-level
                            # containers, so it is only safe single-threaded.
                            assert FLAGS.threads == 1
                            texts.append(word_ids)
                            text_strs.append(text)

                            if img not in image_labels:
                                image_labels[img] = set()
                            image_labels[img].add(text)

                        if is_top_text:
                            # First surviving query for this image: count
                            # the image and store its features once.
                            is_top_text = False
                            with image_counter.get_lock():
                                image_counter.value += 1

                            if FLAGS.np_save:
                                if img not in image_labels:
                                    image_labels[img] = set()

                                image_names.append(img)
                                #image_features.append(image_feature)
                                idl4w_features.append(idl4w_feature)
                                inception_features.append(inception_feature)

                            if FLAGS.num_max_records > 0:
                                #if fixed valid only get one click for each image
                                break

                num += 1
                if num == FLAGS.num_max_records:
                    break