示例#1
0
def build_features(infile):
    """Convert one tab-separated feature file into a TFRecord file.

    Every input line with more than four tab-separated fields becomes one
    ``tf.train.Example`` holding the integer label (field 0), an id built
    from fields 2 and 3, and the parallel index/field/value lists returned
    by ``dataset.get_feat_set``. Lines with four or fewer fields are
    skipped. The module-level ``counter`` is incremented under its lock once
    per written record.

    Args:
        infile: Path of the input text file. The output path is obtained
            from ``get_out_file(infile)``.

    Raises:
        AssertionError: If the feature id, field and value lists of a line
            do not all have the same length.
    """
    global counter
    ofile = get_out_file(infile)
    print('----------writing to', ofile)
    with melt.tfrecords.Writer(ofile) as writer:
        # Manage the input handle explicitly so it is closed even when an
        # assertion or a write fails part-way through the file.
        with open(infile) as lines:
            for line in tqdm(lines):
                fields = line.rstrip().split('\t')
                if len(fields) <= 4:
                    continue

                label = int(fields[0])
                record_id = '{}\t{}'.format(fields[2], fields[3])
                feat_id, feat_field, feat_value = dataset.get_feat_set(fields)
                assert len(feat_id) == len(
                    feat_value
                ), "len(feat_id) == len(feat_value) -----------------"
                assert len(feat_id) == len(feat_field)

                feature = {
                    'label': melt.int64_feature(label),
                    'id': melt.bytes_feature(record_id),
                    'index': melt.int64_feature(feat_id),
                    'field': melt.int64_feature(feat_field),
                    'value': melt.float_feature(feat_value)
                }
                record = tf.train.Example(features=tf.train.Features(
                    feature=feature))

                writer.write(record)
                with counter.get_lock():
                    counter.value += 1
示例#2
0
def convert_to_tfrecord(input_files, output_file):
  """Serialize a list of image files into one TFRecord file.

  Each record stores the image id (the file's base name with its last four
  characters removed), the image as returned by ``melt.read_image`` and an
  integer label. The label is looked up in the module-level mapping ``m``
  unless the output path contains 'test', in which case -1 is stored.

  Args:
    input_files: Iterable of image file paths.
    output_file: Path of the TFRecord file to write.
  """
  print('Generating %s' % output_file)
  is_test_split = 'test' in output_file
  with tf.python_io.TFRecordWriter(output_file) as record_writer:
    for image_path in tqdm(input_files, ascii=True):
      image_id = os.path.basename(image_path)[:-4]
      image = melt.read_image(image_path)
      # Test outputs carry the placeholder label -1.
      label = -1 if is_test_split else m[image_id]
      features = tf.train.Features(
          feature={
              'id': melt.bytes_feature(image_id),
              'image': melt.bytes_feature(image),
              'label': melt.int64_feature(label)
          })
      example = tf.train.Example(features=features)
      record_writer.write(example.SerializeToString())
示例#3
0
def deal_file(file):
    """Convert one tab-separated input file into TFRecord examples.

    Each line holds an image name (column 0), an image feature vector
    (column ``FLAGS.image_feature_index``), candidate texts (column
    ``FLAGS.text_index``) and input texts (column
    ``FLAGS.input_text_index``); multi-valued columns are separated by the
    0x01 control character. One example is written for every
    (input text, text) pair whose word ids are non-empty and which
    ``is_luanma`` does not flag. Lines whose feature vector does not have
    ``IMAGE_FEATURE_LEN`` entries are reported and skipped.

    After the texts of one input text have been processed, the lock-protected
    module-level counters and, when ``FLAGS.np_save`` is set, the
    module-level collections are updated once, using the last record that
    was actually written. When nothing was written for an input text the
    bookkeeping is skipped.

    Processing stops after ``FLAGS.num_max_records`` lines. When that flag
    is positive, only the first input text of each line that produced a
    record is used.

    Args:
        file: Path of the input file. The output file name is
            ``FLAGS.name`` joined by '-' with the part of the input base
            name after its last '-', placed in ``FLAGS.output_directory``.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    # The input handle is managed by the ``with`` statement so it is closed
    # on every exit path, including the early ``break`` at the bottom.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as lines:
        num = 0
        for line in lines:
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            if len(image_feature) != IMAGE_FEATURE_LEN:
                print('bad line:', line)
                continue

            input_texts = l[FLAGS.input_text_index].split('\x01')
            for input_text in input_texts:
                input_words = text2ids.Segmentor.Segment(
                    input_text, FLAGS.seg_method)
                input_word_ids = text2ids.words2ids(
                    input_words,
                    feed_single=FLAGS.feed_single,
                    allow_all_zero=True,
                    pad=False)
                if len(input_word_ids) == 0:
                    continue

                input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
                if FLAGS.pad:
                    input_word_ids = gezi.pad(input_word_ids)

                is_top_text = True
                # (text, word_ids, word_ids_length) of the last record written
                # for this input text; None while nothing has been written.
                last_written = None
                for text in texts:
                    if text.strip() == '':
                        continue

                    words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                    word_ids = text2ids.words2ids(
                        words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    word_ids_length = len(word_ids)
                    if num % 1000 == 0:
                        print(img,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                    if word_ids_length == 0:
                        continue
                    if is_luanma(words, word_ids):
                        print('luanma',
                              img,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                        continue

                    word_ids = word_ids[:TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                    if not FLAGS.write_sequence_example:
                        example = tf.train.Example(features=tf.train.Features(
                            feature={
                                'image_name': melt.bytes_feature(img),
                                'image_feature': melt.float_feature(
                                    image_feature),
                                'input_text_str': melt.bytes_feature(
                                    input_text),
                                'input_text': melt.int64_feature(
                                    input_word_ids),
                                'text_str': melt.bytes_feature(text),
                                'text': melt.int64_feature(word_ids),
                            }))
                    else:
                        example = tf.train.SequenceExample(
                            context=melt.features({
                                'image_name':
                                melt.bytes_feature(img),
                                'image_feature':
                                melt.float_feature(image_feature),
                                'input_text_str':
                                melt.bytes_feature(input_text),
                                'text_str':
                                melt.bytes_feature(text),
                            }),
                            feature_lists=melt.feature_lists({
                                'input_text':
                                melt.int64_feature_list(input_word_ids),
                                'text':
                                melt.int64_feature_list(word_ids)
                            }))
                    writer.write(example)
                    last_written = (text, word_ids, word_ids_length)

                # Without a written record there is nothing to account for;
                # reading the loop variables here would either raise NameError
                # or reuse values left over from an earlier iteration.
                if last_written is None:
                    continue
                text, word_ids, word_ids_length = last_written

                # NOTE(review): this bookkeeping runs once per input text, not
                # once per written record, so record_counter can be smaller
                # than the number of examples written - confirm intended.
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        image_features.append(image_feature)

                    if FLAGS.num_max_records > 0:
                        # For a fixed-size validation set only one click is
                        # kept per image.
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
示例#4
0
def deal_file(file):
    """Convert one tab-separated image/text file into TFRecord examples.

    Each line holds an image name (column 0), an image feature vector
    (column ``FLAGS.image_feature_index``) and candidate texts (column
    ``FLAGS.text_index``); multi-valued columns are separated by the 0x01
    control character. Every text is normalized with ``normalize.norm`` and
    converted with ``_text2ids``; texts that are empty or yield no word ids
    are reported on stderr and skipped. One example is written per
    remaining text, and the lock-protected module-level counters are updated
    for each written record.

    When ``FLAGS.np_save`` is set the module-level collections are filled as
    well. For the first written text of an image either the feature vector
    (``FLAGS.small_feature``) or a path under
    ``FLAGS.big_feature_image_dir`` is stored.

    Processing stops after ``FLAGS.num_max_records`` lines. When that flag
    is positive, only the first valid text of each line is written.

    Args:
        file: Path of the input file. The output file name is
            ``FLAGS.name`` joined by '-' with the part of the input base
            name after its last '-', placed in ``FLAGS.output_directory``.

    Raises:
        AssertionError: If a line's feature vector does not have
            ``IMAGE_FEATURE_LEN`` entries, or if ``FLAGS.np_save`` is used
            with ``FLAGS.threads`` other than 1.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    # The input handle is managed by the ``with`` statement so it is closed
    # on every exit path, including assertion failures and the early break.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as lines:
        num = 0
        for line in lines:
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (
                img, len(image_feature))

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                # Length before truncation/padding; used for the statistics.
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                if word_ids_length == 0:
                    print('empy wordids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                    continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_feature': melt.float_feature(image_feature),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_feature':
                            melt.float_feature(image_feature),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    # Deprecated: image_labels is no longer used.
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        if FLAGS.small_feature:
                            image_features.append(image_feature)
                        else:
                            # Store the picture path instead of the feature.
                            image_features.append(
                                os.path.join(FLAGS.big_feature_image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        # For a fixed-size validation set only one click is
                        # kept per image.
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
示例#5
0
def deal_imgtextfile(file):
    """Convert an image-text file into TFRecord examples with raw image data.

    Encoded images are large (the original note quotes roughly 18G for 20k
    pictures versus 373M for a (23820, 2048) feature matrix), so this path
    is only meant for cases where no image-level metric evaluation is needed
    and the image binaries should not be converted and stored separately.

    Each line holds an image name (column 0) and a URL-quoted image in its
    last column. Lines whose image is missing from the module-level
    ``pic_info_map`` are skipped. The texts come from ``pic_info_map[img]``
    (separated by the 0x01 control character), are normalized with
    ``normalize.norm`` and converted with ``_text2ids``; empty texts and
    texts without word ids are reported on stderr and skipped. One example
    is written per remaining text and the lock-protected module-level
    counters are updated for each written record.

    Processing stops after ``FLAGS.num_max_records`` processed lines. When
    that flag is positive, only the first valid text of each line is
    written.

    Args:
        file: Path of the input file. The output file name is
            ``FLAGS.name`` joined by '-' with the part of the input base
            name after its last '-', placed in ``FLAGS.output_directory``.

    Raises:
        AssertionError: If ``pic_info_map`` is empty, or if
            ``FLAGS.np_save`` is used with ``FLAGS.threads`` other than 1.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    assert len(pic_info_map) > 0
    # The input handle is managed by the ``with`` statement so it is closed
    # on every exit path, including the early break at the bottom.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as lines:
        num = 0
        for line in lines:
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            if img not in pic_info_map:
                continue

            img_text = l[-1]
            # NOTE(review): urllib.unquote_plus only exists on Python 2; on
            # Python 3 the function lives in urllib.parse and returns text,
            # which may not round-trip binary image data - confirm before
            # porting.
            encoded_image = urllib.unquote_plus(img_text)

            text_info = pic_info_map[img]
            texts = text_info.split('\x01')

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                # Length before truncation/padding; used for the statistics.
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if word_ids_length == 0:
                    print('empy wordids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                    continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_data': melt.bytes_feature(encoded_image),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_data':
                            melt.bytes_feature(encoded_image),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    # Deprecated: image_labels is no longer used.
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        # TODO: the encoded image is too big to keep in
                        # image_features, so evaluation on it is not covered.
                        if FLAGS.image_dir:
                            # Store the picture path instead of the feature.
                            image_features.append(
                                os.path.join(FLAGS.image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        # For a fixed-size validation set only one click is
                        # kept per image.
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
def build_features(index):
    """Write fold ``index`` of the module-level ``examples`` to a TFRecord file.

    The output goes to ``<dirname(FLAGS.vocab)>/<mode>/<index>.record`` where
    mode is 'train' if ``FLAGS.input`` contains 'train' and 'test'
    otherwise. The slice of ``examples`` handled here is given by
    ``gezi.get_fold(len(examples), FLAGS.num_records, index)``.

    For every comment the record stores word ids, flattened per-token
    attributes, per-token character ids (``FLAGS.char_limit`` per token),
    character ids of the raw comment (at most ``FLAGS.simple_char_limit``),
    the class labels as floats, and several string renderings. The
    lock-protected module-level ``counter`` is incremented once per record.

    Args:
        index: Fold number; also used as the output file's base name.

    Raises:
        AssertionError: If the flattened attribute list does not contain
            ``len(attribute_names)`` values per kept token.
    """
    global counter
    mode = 'train' if 'train' in FLAGS.input else 'test'
    out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(
        mode, index)
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('---out_file', out_file)
    # TODO now only gen one tfrecord file

    total = len(examples)
    start, end = gezi.get_fold(total, FLAGS.num_records, index)

    ids = examples['id'].values[start:end]
    comments = examples['comment_text'].values[start:end]

    try:
        labels = examples[CLASSES].values[start:end]
    except Exception:
        # Fall back to all-zero labels when the class columns cannot be
        # read (presumably unlabeled test data - verify against caller).
        labels = [[0.] * len(CLASSES)] * len(ids)

    with melt.tfrecords.Writer(out_file) as writer:
        for id, comment, label in tqdm(zip(ids, comments, labels)):
            comment_str = comment
            # TODO use info
            doc = tokenizer.tokenize(comment)
            comment_tokens, tokens_info = doc.tokens, doc.attributes

            # Convert each token's attributes to floats in place.
            for i, info in enumerate(tokens_info):
                tokens_info[i] = list(map(float, info))

            if FLAGS.comment_limit:
                comment_tokens = comment_tokens[:FLAGS.comment_limit]
                tokens_info = tokens_info[:FLAGS.comment_limit]

            # Flatten the per-token attribute rows into one list.
            tokens_info = np.array(tokens_info)
            tokens_info = tokens_info.reshape(-1)
            tokens_info = list(tokens_info)

            assert len(
                tokens_info) == len(comment_tokens) * len(attribute_names)

            comment_ids = [get_id(token, vocab) for token in comment_tokens]
            # The comprehension variable must not be called ``id``: under
            # Python 2 scoping it would overwrite the record id of the
            # enclosing loop before that id is written below.
            comment_tokens_str = '|'.join(
                [vocab.key(token_id) for token_id in comment_ids])
            label = list(map(float, label))

            comment_chars = [list(token) for token in comment_tokens]

            char_ids = np.zeros([len(comment_ids), FLAGS.char_limit],
                                dtype=np.int32)

            for i, token in enumerate(comment_chars):
                for j, ch in enumerate(token):
                    if j == FLAGS.char_limit:
                        break
                    char_ids[i, j] = get_char_id(ch, char_vocab)

            char_ids = list(char_ids.reshape(-1))

            simple_char_ids = []
            for ch in list(comment):
                id_ = get_char_id(ch, char_vocab)
                simple_char_ids.append(id_)
                if len(simple_char_ids) == FLAGS.simple_char_limit:
                    break

            simple_chars_str = ''.join(
                [char_vocab.key(char_id) for char_id in simple_char_ids])

            record = tf.train.Example(features=tf.train.Features(
                feature={
                    "comment": melt.int64_feature(comment_ids),
                    "tokens_info": melt.float_feature(tokens_info),
                    "comment_chars": melt.int64_feature(char_ids),
                    "simple_chars": melt.int64_feature(simple_char_ids),
                    "simple_chars_str": melt.bytes_feature(simple_chars_str),
                    "classes": melt.float_feature(label),
                    "id": melt.bytes_feature(id),
                    "comment_str": melt.bytes_feature(comment_str),
                    "comment_tokens_str": melt.bytes_feature(
                        comment_tokens_str)
                }))

            writer.write(record)
            with counter.get_lock():
                counter.value += 1

        print("Build {} instances of features in total".format(writer.size()))
        # NOTE(review): the enclosing ``with`` presumably closes the writer
        # too - confirm whether this explicit close is still required.
        writer.close()
示例#7
0
def _parse_line(line, writer, thread_index = 0):
  """Turn one image-feature line into examples, one per text of the image.

  The line is tab separated: the image name followed by the float values of
  its feature vector. Images missing from the module-level ``text_map`` are
  reported and ignored. For every (text, original text) pair of the image,
  the words known to ``vocabulary`` (or all words when ``ENCODE_UNK`` is
  set) are mapped to ids, truncated to ``TEXT_MAX_WORDS`` and optionally
  padded. Texts without any word id are skipped.

  With ``FLAGS.num_records`` set, only the first example of each new image
  is written, and the process exits once that many images have been seen.
  Otherwise every example is written and the lock-protected module-level
  counters are updated.

  Args:
    line: One raw input line.
    writer: Object whose ``write`` receives the serialized example.
    thread_index: Index into the per-thread ``gtexts`` / ``gtext_strs``
      lists used when ``FLAGS.np_save`` is set.
  """
  global counter, max_num_words, sum_words
  l = line.rstrip().split('\t')
  image_name = l[0]
  image_feature = [float(x) for x in l[1:]]
  if image_name not in text_map:
    print('image ', image_name, 'ignore ', 'name_len ', len(image_name), len(image_name.strip()))
    return

  image_path =  FLAGS.image_dir + '/' + image_name

  if FLAGS.write_raw_image_bytes:
    # Image files are binary: read them in 'rb' mode so the bytes are
    # returned unmodified and never decoded as text.
    with tf.gfile.FastGFile(image_path, "rb") as f:
      encoded_image = f.read()
  else:
    encoded_image = ''

  # Decoding the JPEG here to validate it was disabled because it hangs when
  # several processes are used.

  for text, ori_text in text_map[image_name]:
    word_ids = [vocabulary.id(word) for word in text.split(WORDS_SEP) if vocabulary.has(word) or ENCODE_UNK]
    if not word_ids:
      continue
    # Length before truncation/padding; used for the statistics.
    word_ids_length = len(word_ids)
    word_ids = word_ids[:TEXT_MAX_WORDS]
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)

    if not FLAGS.write_sequence_example:
      example = tf.train.Example(features=tf.train.Features(feature={
        'image_name': melt.bytes_feature(image_name),
        'image_data': melt.bytes_feature(encoded_image),
        'image_feature': melt.float_feature(image_feature),
        'text': melt.int64_feature(word_ids),
        'text_str': melt.bytes_feature(ori_text),
        }))
    else:
      example = tf.train.SequenceExample(
        context=melt.features(
          {
            'image_name': melt.bytes_feature(image_name),
            'image_data': melt.bytes_feature(encoded_image),
            'image_feature': melt.float_feature(image_feature),
            'text_str': melt.bytes_feature(ori_text),
           }),
        feature_lists=melt.feature_lists(
        {
          'text': melt.int64_feature_list(word_ids)
        }))

    if FLAGS.np_save:
      gtexts[thread_index].append(word_ids)
      gtext_strs[thread_index].append(ori_text)

    # NOTICE: the num_records path has not been tested with num_threads > 1.
    if FLAGS.num_records:
      if image_name not in images:
        images[image_name] = 1
        print(image_name, len(images))
        writer.write(example.SerializeToString())
        if len(images) == FLAGS.num_records:
          print('Done')
          exit(1)
    else:
      writer.write(example.SerializeToString())
      with counter.get_lock():
        counter.value += 1
      if word_ids_length > max_num_words.value:
        with max_num_words.get_lock():
          max_num_words.value = word_ids_length
      with sum_words.get_lock():
        sum_words.value += word_ids_length
示例#8
0
def deal_file(file):
  """Convert one tab-separated text-pair file into TFRecord examples.

  Each line provides an input text (column ``FLAGS.input_text_index``) and
  a target text (column ``FLAGS.text_index``). Both are segmented and
  mapped to word ids. Lines where either side yields no ids, or where
  ``is_luanma`` flags the target text, are skipped without counting towards
  ``FLAGS.num_max_records``. One example is written per remaining line and
  the lock-protected module-level counters are updated.

  Args:
    file: Path of the input file. The output file name is ``FLAGS.name``
      joined by '-' with the part of the input base name after its last
      '-', placed in ``FLAGS.output_directory``.

  Raises:
    AssertionError: If ``FLAGS.np_save`` is used with ``FLAGS.threads``
      other than 1.
  """
  out_file = '{}/{}'.format(FLAGS.output_directory, '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  # The input handle is managed by the ``with`` statement so it is closed on
  # every exit path, including the early break at the bottom.
  with melt.tfrecords.Writer(out_file) as writer, open(file) as lines:
    num = 0
    for line in lines:
      if num % 1000 == 0:
        print(num)

      l = line.rstrip('\n').split('\t')

      text = l[FLAGS.text_index]

      input_text = l[FLAGS.input_text_index]

      input_words = text2ids.Segmentor.Segment(input_text, FLAGS.seg_method)
      input_word_ids = text2ids.words2ids(input_words, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
      if len(input_word_ids) == 0:
        continue
      input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
      if FLAGS.pad:
        input_word_ids = gezi.pad(input_word_ids)

      words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
      word_ids = text2ids.words2ids(words, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
      # Length before truncation/padding; used for the statistics.
      word_ids_length = len(word_ids)
      if num % 1000 == 0:
        print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
      if word_ids_length == 0:
        continue
      if is_luanma(words, word_ids):
        print('luanma', text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        continue

      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
      if not FLAGS.write_sequence_example:
        example = tf.train.Example(features=tf.train.Features(feature={
          'input_text_str': melt.bytes_feature(input_text),
          'input_text': melt.int64_feature(input_word_ids),
          'text_str': melt.bytes_feature(text),
          'text': melt.int64_feature(word_ids),
          }))
      else:
        example = tf.train.SequenceExample(
              context=melt.features(
              {
                'input_text_str': melt.bytes_feature(input_text),
                'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
              {
                'input_text': melt.int64_feature_list(input_word_ids),
                'text': melt.int64_feature_list(word_ids)
              }))
      writer.write(example)

      with record_counter.get_lock():
        record_counter.value += 1
      if word_ids_length > max_num_words.value:
        with max_num_words.get_lock():
          max_num_words.value = word_ids_length
      with sum_words.get_lock():
        sum_words.value += word_ids_length

      if FLAGS.np_save:
        assert FLAGS.threads == 1
        gtexts.append(word_ids)
        gtext_strs.append(text)

      num += 1
      if num == FLAGS.num_max_records:
        break
示例#9
0
def deal_file(file):
    """Convert one tab-separated click-log file into TFRecord examples.

    Column layout as read by the code: 0 = cs, 1 = objurl (used as the image
    name), 3 = simid, 4 = keyword, 5 = extended keyword, then
    ``IDL4W_FEATURE_LEN`` float values, the click queries (separated by
    '$*$'), titles, descriptions, and finally the inception feature values.

    Lines whose click query is 'noclickquery' are skipped without counting
    towards ``FLAGS.num_max_records``. For every non-empty click query that
    yields word ids and is not rejected by ``is_bad`` one example is
    written, and the lock-protected module-level counters are updated.

    Processing stops after ``FLAGS.num_max_records`` processed lines. When
    that flag is positive, only the first valid click query of each line is
    written.

    Args:
        file: Path of the input file. The output file name is
            ``FLAGS.name`` joined by '-' with the part of the input base
            name after its last '-', placed in ``FLAGS.output_directory``.

    Raises:
        AssertionError: If the inception feature does not have
            ``INCEPTION_FEATURE_LEN`` entries, or if ``FLAGS.np_save`` is
            used with ``FLAGS.threads`` other than 1.
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    # The input handle is managed by the ``with`` statement so it is closed
    # on every exit path, including assertion failures and the early break.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as lines:
        num = 0
        for line in lines:
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            cs = l[0]
            simid = l[3]
            objurl = l[1]
            keyword = l[4].split('\x01')[0]
            extended_keyword = l[5].split('\x01')[0]

            img = objurl

            idl4w_end = IDL4W_FEATURE_LEN + 6
            idl4w_feature = [float(x) for x in l[6:idl4w_end]]

            titles = l[idl4w_end + 1]
            descs = l[idl4w_end + 2]

            inception_feature = [float(x) for x in l[idl4w_end + 3:]]

            assert len(inception_feature) == INCEPTION_FEATURE_LEN, '%d %s' % (
                len(inception_feature), cs)

            click_query = l[idl4w_end]
            show_str = 'click:{} ex_key:{} key:{} titles:{} descs:{}'.format(
                click_query, extended_keyword, keyword, titles, descs)
            if click_query == 'noclickquery':
                # TODO now only consider click_query
                continue

            click_queries = click_query.split('$*$')
            is_top_text = True
            for click_query in click_queries:
                if click_query.strip() == '':
                    continue

                text_str = '{} {}'.format(click_query, show_str)

                text = click_query
                words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                word_ids = text2ids.words2ids(
                    words,
                    feed_single=FLAGS.feed_single,
                    allow_all_zero=True,
                    pad=False)
                # Length before truncation/padding; used for the statistics.
                word_ids_length = len(word_ids)
                if num % 1000 == 0:
                    print(cs,
                          simid,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(idl4w_feature),
                          len(inception_feature),
                          file=sys.stderr)
                if word_ids_length == 0:
                    continue
                if is_bad(words, word_ids):
                    continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name':
                            melt.bytes_feature(img),
                            'idl4w_feature':
                            melt.float_feature(idl4w_feature),
                            'inception_feature':
                            melt.float_feature(inception_feature),
                            'text_str':
                            melt.bytes_feature(text_str),
                            'text':
                            melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'idl4w_feature':
                            melt.float_feature(idl4w_feature),
                            'inception_feature':
                            melt.float_feature(inception_feature),
                            'text_str':
                            melt.bytes_feature(text_str),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    texts.append(word_ids)
                    text_strs.append(text)

                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        idl4w_features.append(idl4w_feature)
                        inception_features.append(inception_feature)

                    if FLAGS.num_max_records > 0:
                        # For a fixed-size validation set only one click is
                        # kept per image.
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
def build_features(index):
  """Write the TFRecord shard for fold `index` of the module-level `examples`.

  The rows `[start, end)` belonging to this fold are selected with
  `gezi.get_fold` (or the local `get_fold` when `FLAGS.has_dup` is set). Each
  row becomes one `tf.train.Example` holding word ids, per-token attribute
  floats, pos/tag/ner ids, per-token char ids, per-token ngram ids,
  whole-comment char ids, seven per-comment statistics, the class labels and
  the raw strings. Records are written to
  `<dirname(FLAGS.vocab)>/<mode>/<index>.record`.

  Side effects: creates the output directory, writes the record file and
  increments the shared `counter` once per written record.

  Args:
    index: fold number; picks the slice of `examples` and names the output file.
  """
  global counter

  mode = get_mode()
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(mode, index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file

  total = len(examples)
  if not FLAGS.has_dup:
    start, end = gezi.get_fold(total, FLAGS.num_records, index)
  else:
    start, end = get_fold(examples['id'].values, index)

  ids = list(map(str, examples['id'].values[start: end]))
  comments = examples['comment_text'].values[start: end]
  tokens_list = examples['tokens'].values[start: end]
  tokens_infos = examples['attributes'].values[start: end]
  # TODO change to poses
  poses = examples['poses'].values[start: end]
  tags = examples['tags'].values[start: end]
  ners = examples['ners'].values[start: end]
  ori_tokens_list = examples['ori_tokens'].values[start: end]

  try:
    labels = examples[CLASSES].values[start: end]
  except Exception:
    # Label columns could not be read: fall back to all-zero labels.
    labels = [[0.] * len(CLASSES)] * len(ids)

  with melt.tfrecords.Writer(out_file) as writer:
    rows = zip(ids, comments, labels, tokens_list, ori_tokens_list,
               tokens_infos, poses, tags, ners)
    for (comment_id, comment, label, comment_tokens, ori_tokens, tokens_info,
         pos, tag, ner) in tqdm(rows):
      # Non-string comment values are replaced by the placeholder 'ok'.
      if not isinstance(comment, str):
        comment = 'ok'
      comment_str = comment

      # Every per-token column is stored as one space-separated string.
      comment_tokens = comment_tokens.split(' ')
      tokens_info = tokens_info.split(' ')
      pos = pos.split(' ')
      tag = tag.split(' ')
      ner = ner.split(' ')
      ori_tokens = ori_tokens.split(' ')

      if FLAGS.comment_limit:
        comment_tokens = comment_tokens[:FLAGS.comment_limit]
        ori_tokens = ori_tokens[:FLAGS.comment_limit]
        # tokens_info carries len(attribute_names) values per token.
        tokens_info = tokens_info[:len(attribute_names) * FLAGS.comment_limit]

      pos_ids = [get_char_id(x, pos_vocab) for x in pos]
      tag_ids = [get_char_id(x, tag_vocab) for x in tag]
      ner_ids = [get_char_id(x, ner_vocab) for x in ner]

      # NOTICE comment_ids with vocab(all train + test word so no unk)
      if not FLAGS.lower:
        comment_ids = [get_id(token, vocab) for token in comment_tokens]
      else:
        comment_ids = [get_id(token.lower(), vocab) for token in comment_tokens]

      comment_tokens_str = '|'.join([vocab.key(word_id) for word_id in comment_ids])
      label = list(map(float, label))

      tokens_info = list(map(float, tokens_info))
      assert len(tokens_info) == len(attribute_names) * len(comment_ids), '%d %f' %(len(comment_ids), len(tokens_info) / len(attribute_names))

      # Chars come from the original tokens, not the normalized ones.
      chars_list = [list(token) for token in ori_tokens]
      char_ids = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      # The message used to reference an undefined name and pass a tuple to
      # format(), so a failed assertion raised NameError instead of reporting.
      assert len(comment_ids) == len(chars_list), '{} {} {} {} {}'.format(
          len(comment_ids), len(chars_list), comment, comment_tokens, ori_tokens)

      for i, chars in enumerate(chars_list):
        for j, ch in enumerate(chars):
          if j == FLAGS.char_limit:
            break
          char_ids[i, j] = get_char_id(ch, char_vocab)

      # Flatten the [num_tokens, char_limit] matrix into one id list.
      char_ids = list(char_ids.reshape(-1))

      # --------------simple char: char ids over the raw comment string
      simple_char_ids = []
      for ch in list(comment):
        simple_char_ids.append(get_char_id(ch, char_vocab))
        if len(simple_char_ids) == FLAGS.simple_char_limit:
          break

      # Only consumed by the disabled "simple_chars_str" feature below.
      simple_chars_str = ''.join([char_vocab.key(char_id) for char_id in simple_char_ids])

      # --------------ngram
      ngram_ids_list = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      if not FLAGS.ftngram:
        if not FLAGS.ngram_lower:
          ngrams_list = [gezi.get_ngrams(token, FLAGS.ngram_min, FLAGS.ngram_max) for token in comment_tokens]
        else:
          ngrams_list = [gezi.get_ngrams(token.lower(), FLAGS.ngram_min, FLAGS.ngram_max) for token in comment_tokens]

        for i, ngrams in enumerate(ngrams_list):
          for j, ngram in enumerate(ngrams):
            if j == FLAGS.char_limit:
              break
            ngram_ids_list[i, j] = get_ngram_id(ngram, ngram_vocab)
      else:
        # Both zip arguments are comment_tokens, so `ori_token` and `token`
        # are currently the same string for every position.
        for i, (token, ori_token) in enumerate(zip(comment_tokens, comment_tokens)):
          ngram_ids = gezi.fasttext_ids(ori_token, vocab, FLAGS.ngram_buckets, FLAGS.ngram_min, FLAGS.ngram_max)
          if len(ngram_ids) >= FLAGS.char_limit:
            # Was `FALGS.ngram_max`, which raised NameError on this path.
            ngram_ids = gezi.fasttext_ids(token, vocab, FLAGS.ngram_buckets, FLAGS.ngram_min, FLAGS.ngram_max)
          ngram_ids = ngram_ids[:FLAGS.char_limit]
          for j, ngram_id in enumerate(ngram_ids):
            ngram_ids_list[i, j] = ngram_id

      ngram_ids = list(ngram_ids_list.reshape(-1))

      # global info per comment  7 features
      comment_len = len(comment_str)
      num_unks = len([x for x in comment_ids if x == vocab.unk_id()])
      comment_info = [
          len(ori_tokens),
          len(comment_tokens),
          comment_len,
          comment_len / (len(ori_tokens) + 1),
          num_unks,
          num_unks / len(comment_tokens),
          enprob_dict[comment_id],
      ]

      record = tf.train.Example(features=tf.train.Features(feature={
                                "comment": melt.int64_feature(comment_ids),
                                "tokens_info": melt.float_feature(tokens_info),
                                "comment_info": melt.float_feature(comment_info),
                                "pos": melt.int64_feature(pos_ids),
                                "tag": melt.int64_feature(tag_ids),
                                "ner": melt.int64_feature(ner_ids),
                                "comment_chars": melt.int64_feature(char_ids),
                                "comment_ngrams": melt.int64_feature(ngram_ids),
                                "simple_chars": melt.int64_feature(simple_char_ids),
                                #"simple_chars_str": melt.bytes_feature(simple_chars_str),
                                "classes": melt.float_feature(label),
                                "id": melt.bytes_feature(comment_id),
                                "weight": melt.float_feature([FLAGS.weight]),
                                "comment_str": melt.bytes_feature(comment_str),
                                "comment_tokens_str": melt.bytes_feature(comment_tokens_str)
                                }))

      writer.write(record)
      with counter.get_lock():
        counter.value += 1

    print("Build {} instances of features in total".format(writer.size()))
    # Explicit close kept from the original, although the `with` block is
    # still open at this point.
    writer.close()
def build_features(index):
    """Write the TFRecord shard for fold `index` of the module-level `df`.

    Rows `[start, end)` of `df` (as returned by `gezi.get_fold`) are converted
    to `tf.train.Example`s with the keys `id`, `label`, `content`,
    `content_str` and `sorce`, and written to
    `<dirname(FLAGS.vocab)>/<mode>/<index + start_index>.record`.

    Outside the 'test' and 'valid' modes, rows whose content is longer than
    3000 or shorter than 5 ids are skipped. Rows that raise while being
    processed are skipped too (best effort); they are now counted and
    summarized in one line instead of being dropped silently.

    Side effects: creates the output directory, writes the record file,
    increments the shared `counter` per record and adds the kept id count to
    the shared `total_words`.

    Args:
        index: fold number; picks the slice of `df` and names the output file.
    """
    global counter, total_words

    mode = get_mode(FLAGS.input)

    # Shard numbering starts at 1 when folds are in use.
    start_index = 0 if not FLAGS.use_fold else 1
    out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(
        mode, index + start_index)
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('---out_file', out_file)
    # TODO now only gen one tfrecord file

    total = len(df)
    num_records = FLAGS.num_records_
    # Evaluation-style inputs are always written as a single shard.
    if mode in ['valid', 'test', 'dev', 'pm']:
        num_records = 1
    start, end = gezi.get_fold(total, num_records, index)

    print('infile', FLAGS.input, 'out_file', out_file)

    max_len = 0
    max_num_ids = 0
    num = 0
    num_failed = 0
    first_error = None
    with melt.tfrecords.Writer(out_file) as writer:
        for i in range(start, end):
            try:
                row = df.iloc[i]
                row_id = row[0]
                content = row[1]

                if len(content) > max_len:
                    max_len = len(content)
                    print('max_len', max_len)

                # Very long contents are reported and, outside evaluation
                # modes, dropped.
                if len(content) > 3000:
                    print(row_id, content)
                    if mode not in ['test', 'valid']:
                        continue

                # Every column after the first two is a label.
                label = list(row[2:])

                content_ids = text2ids_(content)

                if len(content_ids) < 5 and mode not in ['test', 'valid']:
                    continue

                limit = FLAGS.limit
                if len(content_ids) > max_num_ids:
                    max_num_ids = len(content_ids)
                    print('max_num_ids', max_num_ids)
                content_ids = content_ids[:limit]

                feature = {
                    'id': melt.bytes_feature(str(row_id)),
                    'label': melt.int64_feature(label),
                    'content': melt.int64_feature(content_ids),
                    'content_str': melt.bytes_feature(content),
                    'sorce': melt.bytes_feature(mode),
                }

                # TODO currenlty not get exact info wether show 1 image or 3 ...
                record = tf.train.Example(features=tf.train.Features(
                    feature=feature))

                if num % 1000 == 0:
                    print(num)

                writer.write(record)
                num += 1
                with counter.get_lock():
                    counter.value += 1
                with total_words.get_lock():
                    total_words.value += len(content_ids)
            except Exception as e:
                # Still best effort, but remember that something went wrong so
                # malformed rows or programming errors do not vanish silently.
                num_failed += 1
                if first_error is None:
                    first_error = repr(e)

    if num_failed:
        print('skipped', num_failed, 'rows that raised; first error:',
              first_error)
示例#12
0
def build_features(file_):
    """Convert one JSON-lines file of query/passage items into a TFRecord file.

    Each line of `file_` is parsed with `json.loads`; the fields `url`,
    `alternatives`, `query_id`, `passage`, `query` and (optionally) `answer`
    are read. The alternatives are ordered by `sort_alternatives` and one
    `tf.train.Example` per line is written to
    `<dirname(FLAGS.vocab_)>/<mode>/<parent dir of file_>_<basename of file_>.tfrecord`.

    A line that raises is reported (traceback on stderr, then the query and
    alternatives of that line when already parsed) and skipped.

    Side effects: creates the output directory, writes the record file,
    increments the shared `counter` per record and adds the truncated passage
    length to the shared `total_words`.

    Args:
        file_: path of the input file, one JSON object per line.
    """
    global counter, total_words

    mode = get_mode(FLAGS.input)
    out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}_{2}.tfrecord'.format(
        mode, os.path.basename(os.path.dirname(file_)),
        os.path.basename(file_))
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('infile', file_, 'out_file', out_file)

    num = 0
    num_whether = 0
    answer_len = 0
    # The input file is opened in a `with` block so the handle is closed even
    # when the loop exits early or raises.
    with melt.tfrecords.Writer(out_file) as writer, open(file_) as lines:
        for line in lines:
            # Reset per line: the error handler prints these, and they must
            # neither be unbound (first line fails early) nor stale (left over
            # from the previous line).
            query = None
            alternatives = None
            try:
                m = json.loads(line.rstrip('\n'))
                url = m['url']
                alternatives = m['alternatives']
                query_id = int(m['query_id'])
                passage = m['passage']
                query = m['query']

                # Items without an answer get a placeholder.
                answer = m['answer'] if 'answer' in m else 'unknown'

                # candidates is neg,pos,uncertain
                # type 0 means true or false,  type 1 means whether
                candidates, type_ = sort_alternatives(alternatives, query)

                assert candidates is not None

                # Position of the answer among the candidates; stays 0 when
                # nothing matches, and the last match wins.
                answer_id = 0
                for i, candidate in enumerate(candidates):
                    if candidate == answer:
                        answer_id = i

                candidates_str = '|'.join(candidates)

                query_ids = text2ids_(query)
                passage_ids = text2ids_(passage)

                candidate_neg_ids = text2ids_(candidates[0])
                candidate_pos_ids = text2ids_(candidates[1])
                # The "uncertain" candidate is always encoded from this
                # fixed literal, not from candidates[2].
                candidate_na_ids = text2ids_('无法确定')

                # Track (and report) the longest candidate seen so far.
                if len(candidate_pos_ids) > answer_len:
                    answer_len = len(candidate_pos_ids)
                    print(answer_len)
                if len(candidate_neg_ids) > answer_len:
                    answer_len = len(candidate_neg_ids)
                    print(answer_len)

                assert len(query_ids), line
                assert len(passage_ids), line

                limit = FLAGS.limit

                if len(passage_ids) > limit:
                    print('long line', len(passage_ids), query_id)

                query_ids = query_ids[:limit]
                passage_ids = passage_ids[:limit]

                feature = {
                    'id': melt.bytes_feature(str(query_id)),
                    'url': melt.bytes_feature(url),
                    'alternatives': melt.bytes_feature(alternatives),
                    'candidates': melt.bytes_feature(candidates_str),
                    'passage': melt.int64_feature(passage_ids),
                    'passage_str': melt.bytes_feature(passage),
                    'query': melt.int64_feature(query_ids),
                    'query_str': melt.bytes_feature(query),
                    'candidate_neg': melt.int64_feature(candidate_neg_ids),
                    'candidate_pos': melt.int64_feature(candidate_pos_ids),
                    'candidate_na': melt.int64_feature(candidate_na_ids),
                    'answer': melt.int64_feature(answer_id),
                    'answer_str': melt.bytes_feature(answer),
                    'type': melt.int64_feature(type_)
                }

                # TODO currenlty not get exact info wether show 1 image or 3 ...
                record = tf.train.Example(features=tf.train.Features(
                    feature=feature))

                # Periodic progress sample.
                if num % 1000 == 0:
                    print(num, query_id, query, type_)
                    print(alternatives, candidates)
                    print(answer, answer_id)

                writer.write(record)
                num += 1
                if type_:
                    num_whether += 1
                with counter.get_lock():
                    counter.value += 1
                with total_words.get_lock():
                    total_words.value += len(passage_ids)
                if FLAGS.max_examples and num >= FLAGS.max_examples:
                    break
            except Exception:
                print(traceback.format_exc(), file=sys.stderr)
                print('-----------', query)
                print(alternatives)

    print('num_wehter:', num_whether)
def _parse_segmented(segmented, lenient):
    """Split a tab-separated segmentation string into `(words, pos)`.

    When the first token contains '|', every token is split on '|' and the
    pieces are transposed into a words sequence and a pos sequence; otherwise
    `pos` is None. With `FLAGS.add_start_end_` set, both sequences are passed
    through `gezi.add_start_end`.

    Args:
        segmented: tab-separated tokens, each either `word` or `word|pos`.
        lenient: if true, a token that does not split into exactly two pieces
            is reported by printing the token list and the unsplit tokens are
            returned with `pos=None`; if false the exception propagates.
    """
    words = segmented.split('\x09')
    pos = None
    if '|' in words[0]:
        try:
            words, pos = list(zip(*[x.split('|') for x in words]))
        except Exception:
            if not lenient:
                raise
            print(segmented.split('\x09'))
    if FLAGS.add_start_end_:
        words = gezi.add_start_end(words)
        if pos:
            pos = gezi.add_start_end(pos)
    return words, pos


def build_features(file_):
    """Convert one pre-segmented JSON-lines file into a TFRecord file.

    Each line of `file_` is parsed with `json.loads`. Besides the raw `url`,
    `alternatives`, `query_id`, `passage`, `query` and optional `answer`
    fields, the segmented fields `seg_query`, `seg_passage` and
    `seg_alternatives` are read and turned into word ids (`vocab.id`), pos ids
    (`get_pos_ids`) and char ids (`get_char_ids`) for the query, the passage
    and each candidate. One `tf.train.Example` per line is written to
    `<dirname(FLAGS.vocab_)>/<mode>/<last '_' part of basename(file_)>.tfrecord`.

    A line that raises is reported (traceback on stderr, then the query and
    alternatives of that line when already parsed) and skipped.

    Side effects: creates the output directory, writes the record file,
    increments the shared `counter` per record and adds the truncated passage
    length to the shared `total_words`.

    Args:
        file_: path of the input file, one JSON object per line.
    """
    global counter, total_words

    mode = get_mode(FLAGS.input)
    out_file = os.path.dirname(FLAGS.vocab_) + '/{0}/{1}.tfrecord'.format(
        mode,
        os.path.basename(file_).split('_')[-1])
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('infile', file_, 'out_file', out_file)

    max_len = 0
    num = 0
    num_whether = 0
    answer_len = 0
    # The input file is opened in a `with` block so the handle is closed even
    # when the loop exits early or raises.
    with melt.tfrecords.Writer(out_file) as writer, open(file_) as lines:
        for line in lines:
            # Reset per line: the error handler prints these, and they must
            # neither be unbound (first line fails early) nor stale (left over
            # from the previous line).
            query = None
            alternatives = None
            try:
                m = json.loads(line.rstrip('\n'))
                url = m['url']
                alternatives = m['alternatives']
                query_id = int(m['query_id'])
                passage = m['passage']
                query = m['query']

                # Items without an answer get a placeholder.
                answer = m['answer'] if 'answer' in m else 'unknown'

                # candidates is neg,pos,uncertain
                # type 0 means true or false,  type 1 means whether
                candidates, type_ = sort_alternatives(alternatives, query)

                assert candidates is not None

                # Position of the answer among the candidates; stays 0 when
                # nothing matches, and the last match wins.
                answer_id = 0
                for i, candidate in enumerate(candidates):
                    if candidate == answer:
                        answer_id = i

                candidates_str = '|'.join(candidates)

                query_words, query_pos = _parse_segmented(
                    m['seg_query'], lenient=True)
                query_ids = [vocab.id(x) for x in query_words]
                query_pos_ids = get_pos_ids(query_pos)
                query_char_ids = get_char_ids(query_words)

                passage_words, passage_pos = _parse_segmented(
                    m['seg_passage'], lenient=True)
                passage_ids = [vocab.id(x) for x in passage_words]
                passage_pos_ids = get_pos_ids(passage_pos)
                passage_char_ids = get_char_ids(passage_words)

                # Raw alternatives and their segmentations are both joined
                # with '|' and are matched up by position.
                alternatives_list = alternatives.split('|')
                alternatives_segs = m['seg_alternatives'].split('|')

                for i, candidate in enumerate(candidates):
                    # Look up the segmentation of the current candidate. The
                    # previous code used the leftover variable of the answer
                    # loop above, so all three slots received the
                    # segmentation of the last candidate.
                    segs = alternatives_segs[alternatives_list.index(candidate)]
                    words, pos = _parse_segmented(segs, lenient=False)
                    word_ids = [vocab.id(x) for x in words]
                    pos_ids = get_pos_ids(pos)
                    char_ids = get_char_ids(words)

                    if i == 0:
                        candidate_neg_ids = word_ids
                        candidate_neg_pos_ids = pos_ids
                        candidate_neg_char_ids = char_ids
                    elif i == 1:
                        candidate_pos_ids = word_ids
                        candidate_pos_pos_ids = pos_ids
                        candidate_pos_char_ids = char_ids
                    else:
                        # the "cannot be determined" candidate
                        candidate_na_ids = word_ids
                        candidate_na_pos_ids = pos_ids
                        candidate_na_char_ids = char_ids

                # Track (and report) the longest candidate seen so far.
                if len(candidate_pos_ids) > answer_len:
                    answer_len = len(candidate_pos_ids)
                    print(answer_len)
                if len(candidate_neg_ids) > answer_len:
                    answer_len = len(candidate_neg_ids)
                    print(answer_len)

                assert len(query_ids), line
                assert len(passage_ids), line

                limit = FLAGS.limit

                if len(passage_ids) > limit:
                    print('long line', len(passage_ids), query_id)

                if len(passage_ids) > max_len:
                    max_len = len(passage_ids)
                    print('max_len', max_len)

                # Only the word ids are truncated; pos and char ids keep
                # their full length.
                query_ids = query_ids[:limit]
                passage_ids = passage_ids[:limit]

                feature = {
                    'id':
                    melt.bytes_feature(str(query_id)),
                    'url':
                    melt.bytes_feature(url),
                    'alternatives':
                    melt.bytes_feature(alternatives),
                    'candidates':
                    melt.bytes_feature(candidates_str),
                    'passage':
                    melt.int64_feature(passage_ids),
                    'passage_char':
                    melt.int64_feature(passage_char_ids),
                    'passage_pos':
                    melt.int64_feature(passage_pos_ids),
                    'passage_str':
                    melt.bytes_feature(passage),
                    'query':
                    melt.int64_feature(query_ids),
                    'query_char':
                    melt.int64_feature(query_char_ids),
                    'query_pos':
                    melt.int64_feature(query_pos_ids),
                    'query_str':
                    melt.bytes_feature(query),
                    'candidate_neg':
                    melt.int64_feature(candidate_neg_ids),
                    'candidate_neg_char':
                    melt.int64_feature(candidate_neg_char_ids),
                    'candidate_neg_pos':
                    melt.int64_feature(candidate_neg_pos_ids),
                    'candidate_pos':
                    melt.int64_feature(candidate_pos_ids),
                    'candidate_pos_char':
                    melt.int64_feature(candidate_pos_char_ids),
                    'candidate_pos_pos':
                    melt.int64_feature(candidate_pos_pos_ids),
                    'candidate_na':
                    melt.int64_feature(candidate_na_ids),
                    'candidate_na_char':
                    melt.int64_feature(candidate_na_char_ids),
                    'candidate_na_pos':
                    melt.int64_feature(candidate_na_pos_ids),
                    'answer':
                    melt.int64_feature(answer_id),
                    'answer_str':
                    melt.bytes_feature(answer),
                    'type':
                    melt.int64_feature(type_)
                }

                # TODO currenlty not get exact info wether show 1 image or 3 ...
                record = tf.train.Example(features=tf.train.Features(
                    feature=feature))

                # Periodic progress sample.
                if num % 1000 == 0:
                    print(num, query_id, query, type_)
                    print(alternatives, candidates)
                    print(answer, answer_id)

                writer.write(record)
                num += 1
                if type_:
                    num_whether += 1
                with counter.get_lock():
                    counter.value += 1
                with total_words.get_lock():
                    total_words.value += len(passage_ids)
                if FLAGS.max_examples and num >= FLAGS.max_examples:
                    break
            except Exception:
                print(traceback.format_exc(), file=sys.stderr)
                print('-----------', query)
                print(alternatives)

    print('num_wehter:', num_whether)