Example #1
def deal_file(file, thread_index):
  writer = melt.tfrecords.Writer('{}/{}_{}'.format(FLAGS.output_directory, FLAGS.name, thread_index))
  num = 0
  for line in open(file):
    l = line.rstrip().split('\t')
    img = l[0]
    img_feature = [float(x) for x in l[1:1001]]
    text = l[-1].split('\x01')[0]
    words = Segmentor.Segment(text)
    word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word)]
    if len(word_ids) == 0:
      continue
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

    example = tf.train.Example(features=tf.train.Features(feature={
      'image_name': melt.bytes_feature(img),
      'image_feature': melt.float_feature(img_feature),
      'text': melt.int_feature(word_ids),
      'text_str': melt.bytes_feature(text),
      }))
    writer.write(example)
    num += 1
  writer.close()
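
These snippets build tf.train.Example protos through melt's feature helpers, which are not shown on this page. A minimal sketch of what melt.bytes_feature, melt.int_feature/melt.int64_feature and melt.float_feature presumably wrap (the tf.train.Feature protos themselves are standard TensorFlow):

import tensorflow as tf

def bytes_feature(value):
    # accept a single str/bytes or a list; encode str to bytes for the proto
    if not isinstance(value, (list, tuple)):
        value = [value]
    value = [v.encode('utf-8') if isinstance(v, str) else v for v in value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

def int64_feature(value):
    if not isinstance(value, (list, tuple)):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def float_feature(value):
    if not isinstance(value, (list, tuple)):
        value = [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))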
Example #2
def main(argv):
  writer = tf.python_io.TFRecordWriter(argv[2])
  num = 0
  for line in open(argv[1]):
    if line[0] == '#':
      continue
    if num % 10000 == 0:
      print('%d lines done'%num)
    l = line.rstrip().split()
    
    label_index = 0
    if l[0][0] == '_':
      label_index = 1
      id = int(l[0][1:])
    else:
      id = num
    label = int(l[label_index])
    
    start = label_index + 1
    feature = [float(x) for x in l[start:]]
    example = tf.train.Example(
      features=tf.train.Features(
        feature={
        'id': melt.int_feature(id), 
        'label': melt.int_feature(label),
        'feature': melt.float_feature(feature),
        }))
    writer.write(example.SerializeToString())
    num += 1
    if FLAGS.num_examples and num == FLAGS.num_examples:
      break
  writer.close()
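
To sanity-check a file produced this way, the records can be read back with the TF1 python_io iterator. A small sketch; the path is hypothetical and the field names match the example above:

import tensorflow as tf

for serialized in tf.python_io.tf_record_iterator('train.tfrecord'):
    example = tf.train.Example.FromString(serialized)
    feats = example.features.feature
    print(feats['id'].int64_list.value[0],
          feats['label'].int64_list.value[0],
          len(feats['feature'].float_list.value))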
Example #3
def deal_file(file, writer):
  num = 0
  for line in open(file):
    if num % 1000 == 0:
      print('num:', num)
    l = line.rstrip().split('\t')
    img = l[0]
    img_feature = [float(x) for x in l[1:1001]]
    text = l[-1].split('\x01')[0]
    words = Segmentor.Segment(text)
    word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word)]
    if len(word_ids) == 0:
      num += 1
      continue
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
    
    texts.append(word_ids)
    text_strs.append(text)
    example = tf.train.Example(features=tf.train.Features(feature={
      'image_name': melt.bytes_feature(img),
       'image_feature': melt.float_feature(img_feature),
       'text': melt.int_feature(word_ids),
       'text_str': melt.bytes_feature(text),
       }))
    writer.write(example)
    num += 1
Example #4
def build_features(infile):
    ofile = get_out_file(infile)
    print('----------writing to', ofile)
    with melt.tfrecords.Writer(ofile) as writer:
        for line in tqdm(open(infile)):
            fields = line.rstrip().split('\t')
            if len(fields) > 4:
                label = int(fields[0])
                id = '{}\t{}'.format(fields[2], fields[3])
                feat_id, feat_field, feat_value = dataset.get_feat_set(fields)
                assert len(feat_id) == len(feat_value)
                assert len(feat_id) == len(feat_field)

                feature = {
                    'label': melt.int64_feature(label),
                    'id': melt.bytes_feature(id),
                    'index': melt.int64_feature(feat_id),
                    'field': melt.int64_feature(feat_field),
                    'value': melt.float_feature(feat_value)
                }
                record = tf.train.Example(features=tf.train.Features(
                    feature=feature))

                writer.write(record)
                global counter
                with counter.get_lock():
                    counter.value += 1
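
The counter incremented under get_lock() here (and in the later examples) is presumably a multiprocessing.Value shared with the workers. A minimal sketch of that setup, assuming a fork start method so the module-level counter is inherited; input_files is hypothetical:

from multiprocessing import Process, Value

counter = Value('i', 0)  # shared int; .get_lock() guards updates

processes = [Process(target=build_features, args=(f,)) for f in input_files]
for p in processes:
    p.start()
for p in processes:
    p.join()
print('records written:', counter.value)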
Example #5
def deal_file(file, thread_index):
    out_file = '{}/{}_{}'.format(
        FLAGS.output_directory, FLAGS.name,
        thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
            FLAGS.output_directory, FLAGS.name)
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num)
            l = line.rstrip().split('\t')
            img = l[0]
            img_end = IMAGE_FEATURE_LEN + 1
            img_feature = [float(x) for x in l[1:img_end]]
            texts = [x.split('\x01')[0] for x in l[img_end:]]
            for text in texts:
                #@TODO from text -> ids should move out so online code can share it for evaluation or use for feed dict
                words = segmentor.Segment(text, FLAGS.seg_method)
                word_ids = [
                    vocabulary.id(word) for word in words
                    if vocabulary.has(word) or ENCODE_UNK
                ]
                word_ids_length = len(word_ids)
                if word_ids_length == 0:
                    continue
                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

                if FLAGS.np_save:
                    gtexts[thread_index].append(word_ids)
                    gtext_strs[thread_index].append(text)

                assert img and img_feature and word_ids and text, line
                assert len(img_feature) == IMAGE_FEATURE_LEN
                #add pos info? weight info? or @TODO add click num info
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'image_name': melt.bytes_feature(img),
                        'image_feature': melt.float_feature(img_feature),
                        'text': melt.int_feature(word_ids),
                        'text_str': melt.bytes_feature(text),
                    }))
                writer.write(example)

                global counter, max_num_words, sum_words
                with counter.get_lock():
                    counter.value += 1
                #check and update under one lock to avoid a read-then-write race
                with max_num_words.get_lock():
                    if word_ids_length > max_num_words.value:
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length
            num += 1

    texts_dict[thread_index] = gtexts[thread_index]
    text_strs_dict[thread_index] = gtext_strs[thread_index]
Example #6
def _parse_line(line, writer, thread_index=0):
    l = line.rstrip().split('\t')
    image_name = l[0]
    image_feature = [float(x) for x in l[1:]]
    if image_name not in text_map:
        print('image %s ignored' % image_name)
        return
    else:
        for text, ori_text in text_map[image_name]:
            word_ids = [
                vocabulary.id(word) for word in text.split(WORDS_SEP)
                if vocabulary.has(word)
            ]
            if not word_ids:
                continue
            word_ids_length = len(word_ids)
            word_ids = word_ids[:TEXT_MAX_WORDS]
            if FLAGS.pad:
                word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)

            if FLAGS.np_save:
                gtexts[thread_index].append(word_ids)
                gtext_strs[thread_index].append(ori_text)

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(image_name),
                    'image_feature': melt.float_feature(image_feature),
                    'text': melt.int_feature(word_ids),
                    'text_str': melt.bytes_feature(ori_text),
                }))

            #NOTICE: not tested here for num_threads > 1
            if FLAGS.num_records:
                if image_name not in images:
                    images[image_name] = 1
                    print(image_name, len(images))
                    writer.write(example.SerializeToString())
                    if len(images) == FLAGS.num_records:
                        print('Done')
                        exit(0)
            else:
                writer.write(example.SerializeToString())
                global counter, max_num_words, sum_words
                with counter.get_lock():
                    counter.value += 1
                with max_num_words.get_lock():
                    if word_ids_length > max_num_words.value:
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length
Example #7
def main(argv):
    writer = tf.python_io.TFRecordWriter(argv[2])
    num = 0
    for line in open(argv[1]):
        if line[0] == '#':
            continue
        if num % 10000 == 0:
            print('%d lines done' % num)
        l = line.rstrip().split()

        label_index = 0
        if l[0][0] == '_':
            label_index = 1
            id = int(l[0][1:])
        else:
            id = num
        label = int(l[label_index])

        start = label_index + 1
        feature = [float(x) for x in l[start:]]

        if FLAGS.fake_var_len:
            if id % 2 == 0:
                feature = feature[:10]

            if id % 3 == 0:
                feature = feature[:20]

        example = tf.train.SequenceExample(
            context=melt.features({
                'id': melt.int_feature(id),
                'label': melt.int_feature(label)
            }),
            feature_lists=melt.feature_lists({
                #see sequence_test.py: use each single item as a one-element list and stack the lists
                #can this deal with var-len sequences? (see the parse sketch below)
                'feature':
                melt.feature_list(
                    [melt.float_feature(item) for item in feature])
                #'feature': melt.feature_list(melt.float_feature(feature))
            }))

        writer.write(example.SerializeToString())

        num += 1
        if FLAGS.num_examples and num == FLAGS.num_examples:
            break
    writer.close()
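
The var-len question above is exactly what SequenceExample handles: each timestep is stored as its own tf.train.Feature inside the feature list, so records may differ in length. A hedged sketch of the matching TF1 parse; serialized_record (a scalar string tensor) is an assumption:

context, sequences = tf.parse_single_sequence_example(
    serialized_record,
    context_features={
        'id': tf.FixedLenFeature([], tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
    },
    sequence_features={
        # one float per step, so each record parses to shape [num_steps, 1]
        'feature': tf.FixedLenSequenceFeature([1], tf.float32),
    })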
Example #8
def deal_file(file, thread_index):
    out_file = '{}/{}_{}'.format(
        FLAGS.output_directory, FLAGS.name,
        thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
            FLAGS.output_directory, FLAGS.name)
    writer = melt.tfrecords.Writer(out_file)
    num = 0
    for line in open(file):
        if num % 1000 == 0:
            print(num)
        l = line.rstrip().split('\t')
        img = l[0]
        img_end = IMAGE_FEATURE_LEN + 1
        img_feature = [float(x) for x in l[1:img_end]]
        texts = [x.split('\x01')[0] for x in l[img_end:]]
        for text in texts:
            words = Segmentor.Segment(text)
            word_ids = [
                vocabulary.id(word) for word in words if vocabulary.has(word)
            ]
            if len(word_ids) == 0:
                continue
            if FLAGS.pad:
                word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

            gtexts[thread_index].append(word_ids)
            gtext_strs[thread_index].append(text)

            #add pos info? weight info? or @TODO add click num info
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(img),
                    'image_feature': melt.float_feature(img_feature),
                    'text': melt.int_feature(word_ids),
                    'text_str': melt.bytes_feature(text),
                }))
            writer.write(example)
        num += 1
    writer.close()

    texts_dict[thread_index] = gtexts[thread_index]
    text_strs_dict[thread_index] = gtext_strs[thread_index]
Example #9
        word_ids_length = len(word_ids)
        if num % 1000 == 0:
            #print(libgezi.gbk2utf8('\t'.join(words)), file=sys.stderr)
            print('\t'.join(words), file=sys.stderr)
            print(word_ids, file=sys.stderr)
        if word_ids_length == 0:
            continue
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
            word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

        if writer is not None:
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(img),
                    'image_feature': melt.float_feature(img_feature),
                    'text': melt.int_feature(word_ids),
                    'text_str': melt.bytes_feature(text),
                }))
            writer.write(example)
        else:
            count += 1

if FLAGS.mode != 1:
    if writer is not None:
        count = writer.count
    print('count\t%d' % (count), file=sys.stderr)
    #-------- also print to stdout so the total count can be aggregated
    print('count\t%d' % (count))

#NOTICE: do not forget to close the writer!
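
gezi.pad is applied throughout to fix the length of word_ids; presumably it truncates and then right-pads with the given id, roughly:

def pad(ids, max_len, pad_id=0):
    # truncate to max_len, then right-pad with pad_id
    ids = ids[:max_len]
    return ids + [pad_id] * (max_len - len(ids))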
Example #10
def deal_file(file):
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            #assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d'%(img, len(image_feature))
            if len(image_feature) != IMAGE_FEATURE_LEN:
                print('bad line:', line)
                continue

            input_texts = l[FLAGS.input_text_index].split('\x01')
            for input_text in input_texts:
                input_words = text2ids.Segmentor.Segment(
                    input_text, FLAGS.seg_method)
                input_word_ids = text2ids.words2ids(
                    input_words,
                    feed_single=FLAGS.feed_single,
                    allow_all_zero=True,
                    pad=False)
                if len(input_word_ids) == 0:
                    continue

                input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
                if FLAGS.pad:
                    input_word_ids = gezi.pad(input_word_ids)

                is_top_text = True
                for text in texts:
                    if text.strip() == '':
                        continue

                    words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                    word_ids = text2ids.words2ids(
                        words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    word_ids_length = len(word_ids)
                    if num % 1000 == 0:
                        print(img,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                    if word_ids_length == 0:
                        continue
                    #is_luanma: skip text that looks garbled (luanma = mojibake)
                    if is_luanma(words, word_ids):
                        print('luanma',
                              img,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                        continue

                    word_ids = word_ids[:TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                    if not FLAGS.write_sequence_example:
                        example = tf.train.Example(features=tf.train.Features(
                            feature={
                                'image_name': melt.bytes_feature(img),
                                'image_feature': melt.float_feature(
                                    image_feature),
                                'input_text_str': melt.bytes_feature(
                                    input_text),
                                'input_text': melt.int64_feature(
                                    input_word_ids),
                                'text_str': melt.bytes_feature(text),
                                'text': melt.int64_feature(word_ids),
                            }))
                    else:
                        example = tf.train.SequenceExample(
                            context=melt.features({
                                'image_name':
                                melt.bytes_feature(img),
                                'image_feature':
                                melt.float_feature(image_feature),
                                'input_text_str':
                                melt.bytes_feature(input_text),
                                'text_str':
                                melt.bytes_feature(text),
                            }),
                            feature_lists=melt.feature_lists({
                                'input_text':
                                melt.int64_feature_list(input_word_ids),
                                'text':
                                melt.int64_feature_list(word_ids)
                            }))
                    writer.write(example)

                with record_counter.get_lock():
                    record_counter.value += 1
                with max_num_words.get_lock():
                    if word_ids_length > max_num_words.value:
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        image_features.append(image_feature)

                    if FLAGS.num_max_records > 0:
                        #for a fixed validation set, keep only one click per image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
Example #11
def deal_file(file):
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            #image_feature = [float(x) for x in l[FLAGS.image_feature_index].strip().split(' ')]
            #image_feature = [0.] * IMAGE_FEATURE_LEN
            assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (
                img, len(image_feature))

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty text', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empty word_ids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                    continue
                #if is_luanma(words, word_ids):
                #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
                #  continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_feature': melt.float_feature(image_feature),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_feature':
                            melt.float_feature(image_feature),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                with record_counter.get_lock():
                    record_counter.value += 1
                with max_num_words.get_lock():
                    if word_ids_length > max_num_words.value:
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    #Deprecated: image_labels is no longer used
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        if FLAGS.small_feature:
                            image_features.append(image_feature)
                        else:
                            #actually saves the pic path instead of the image feature
                            image_features.append(
                                os.path.join(FLAGS.big_feature_image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        #for a fixed validation set, keep only one click per image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
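
For the non-sequence branch above, the written fields can be decoded on the read side with tf.parse_single_example. A sketch assuming FLAGS.pad was on, so 'text' has the fixed length TEXT_MAX_WORDS:

features = tf.parse_single_example(
    serialized_record,
    features={
        'image_name': tf.FixedLenFeature([], tf.string),
        'image_feature': tf.FixedLenFeature([IMAGE_FEATURE_LEN], tf.float32),
        'text': tf.FixedLenFeature([TEXT_MAX_WORDS], tf.int64),
        'text_str': tf.FixedLenFeature([], tf.string),
    })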
Example #12
def build_features(index):
    mode = 'train' if 'train' in FLAGS.input else 'test'
    out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(
        mode, index)
    os.system('mkdir -p %s' % os.path.dirname(out_file))
    print('---out_file', out_file)
    # TODO now only gen one tfrecord file

    total = len(examples)
    start, end = gezi.get_fold(total, FLAGS.num_records, index)

    ids = examples['id'].values[start:end]
    comments = examples['comment_text'].values[start:end]

    try:
        labels = examples[CLASSES].values[start:end]
    except Exception:
        labels = [[0.] * len(CLASSES)] * len(ids)

    with melt.tfrecords.Writer(out_file) as writer:
        for id, comment, label in tqdm(zip(ids, comments, labels)):
            comment_str = comment
            # TODO use info
            doc = tokenizer.tokenize(comment)
            comment_tokens, tokens_info = doc.tokens, doc.attributes

            for i in range(len(tokens_info)):
                tokens_info[i] = list(map(float, tokens_info[i]))

            if FLAGS.comment_limit:
                comment_tokens = comment_tokens[:FLAGS.comment_limit]
                tokens_info = tokens_info[:FLAGS.comment_limit]

            tokens_info = np.array(tokens_info)
            tokens_info = tokens_info.reshape(-1)
            tokens_info = list(tokens_info)

            assert len(
                tokens_info) == len(comment_tokens) * len(attribute_names)

            comment_ids = [get_id(token, vocab) for token in comment_tokens]
            comment_tokens_str = '|'.join(
                [vocab.key(id) for id in comment_ids])
            label = list(map(float, label))

            comment_chars = [list(token) for token in comment_tokens]

            char_ids = np.zeros([len(comment_ids), FLAGS.char_limit],
                                dtype=np.int32)

            for i, token in enumerate(comment_chars):
                for j, ch in enumerate(token):
                    if j == FLAGS.char_limit:
                        break
                    char_ids[i, j] = get_char_id(ch, char_vocab)

            char_ids = list(char_ids.reshape(-1))

            #print(char_ids)

            simple_char_ids = []
            num_chs = 0
            for ch in list(comment):
                id_ = get_char_id(ch, char_vocab)
                #if id_ == char_vocab.unk_id():
                #  continue
                simple_char_ids.append(id_)
                if len(simple_char_ids) == FLAGS.simple_char_limit:
                    break

            simple_chars_str = ''.join(
                [char_vocab.key(id) for id in simple_char_ids])

            #print(simple_char_ids, simple_chars_str)

            record = tf.train.Example(features=tf.train.Features(
                feature={
                    "comment": melt.int64_feature(comment_ids),
                    "tokens_info": melt.float_feature(tokens_info),
                    "comment_chars": melt.int64_feature(char_ids),
                    "simple_chars": melt.int64_feature(simple_char_ids),
                    "simple_chars_str": melt.bytes_feature(simple_chars_str),
                    "classes": melt.float_feature(label),
                    "id": melt.bytes_feature(id),
                    "comment_str": melt.bytes_feature(comment_str),
                    "comment_tokens_str": melt.bytes_feature(
                        comment_tokens_str)
                }))

            writer.write(record)
            global counter
            with counter.get_lock():
                counter.value += 1

        print("Build {} instances of features in total".format(writer.size()))
        writer.close()
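
get_id and get_char_id are not shown on this page; presumably they are vocab lookups with an unk fallback, in the spirit of the vocabulary.has/vocabulary.id calls in the earlier examples:

def get_id(token, vocab):
    # fall back to the unk id for out-of-vocabulary tokens
    return vocab.id(token) if vocab.has(token) else vocab.unk_id()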
Example #13
def _parse_line(line, writer, thread_index = 0):
  l = line.rstrip().split('\t')
  image_name = l[0]
  image_feature = [float(x) for x in l[1:]]
  if image_name not in text_map:
    print('image', image_name, 'ignored, name_len', len(image_name), len(image_name.strip()))
    return
  else:
    image_path =  FLAGS.image_dir + '/' + image_name
    #print(image_path)

    if FLAGS.write_raw_image_bytes:
      with tf.gfile.FastGFile(image_path, "rb") as f:
        encoded_image = f.read()
    else:
      encoded_image = ''

    #--------- the block below will hang under multiprocessing
    #try:
    #  decoder.decode_jpeg(encoded_image)
    #except (tf.errors.InvalidArgumentError, AssertionError):
    #  print("Skipping file with invalid JPEG data: %s" % image_path)
    #  return
      
    for text, ori_text in text_map[image_name]:
      word_ids = [vocabulary.id(word) for word in text.split(WORDS_SEP) if vocabulary.has(word) or ENCODE_UNK]
      if not word_ids:
        continue 
      word_ids_length = len(word_ids)
      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)

      if not FLAGS.write_sequence_example:
        example = tf.train.Example(features=tf.train.Features(feature={
          'image_name': melt.bytes_feature(image_name),
          'image_data': melt.bytes_feature(encoded_image),
          'image_feature': melt.float_feature(image_feature),
          'text': melt.int64_feature(word_ids),
          'text_str': melt.bytes_feature(ori_text),
          }))
      else:
        example = tf.train.SequenceExample(
          context=melt.features(
            {
              'image_name': melt.bytes_feature(image_name),
              'image_data': melt.bytes_feature(encoded_image),
              'image_feature': melt.float_feature(image_feature),
              'text_str': melt.bytes_feature(ori_text),
             }),
          feature_lists=melt.feature_lists(
          { 
            'text': melt.int64_feature_list(word_ids)
          }))
     
      if FLAGS.np_save:
        gtexts[thread_index].append(word_ids)
        gtext_strs[thread_index].append(ori_text)


      #NOTICE: not tested here for num_threads > 1
      if FLAGS.num_records:
        if image_name not in images:
          images[image_name] = 1
          print(image_name, len(images))
          writer.write(example.SerializeToString())
          if len(images) == FLAGS.num_records:
            print('Done')
            exit(0)
      else:
        writer.write(example.SerializeToString())
        global counter, max_num_words, sum_words
        with counter.get_lock():
          counter.value += 1
        with max_num_words.get_lock():
          if word_ids_length > max_num_words.value:
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length
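
When FLAGS.write_raw_image_bytes is on, the stored image_data bytes can be decoded at input time with standard TF ops. A minimal sketch; features stands for the dict returned by a parse such as the one sketched after Example #11:

image = tf.image.decode_jpeg(features['image_data'], channels=3)
image = tf.image.convert_image_dtype(image, tf.float32)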
Example #14
def deal_file(file):
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            cs = l[0]
            simid = l[3]
            objurl = l[1]
            fromurl = l[2]
            keyword = l[4].split('\x01')[0]
            extended_keyword = l[5].split('\x01')[0]

            img = objurl
            #img = cs

            idl4w_end = IDL4W_FEATURE_LEN + 6
            idl4w_feature = [float(x) for x in l[6:idl4w_end]]

            titles = l[idl4w_end + 1]
            descs = l[idl4w_end + 2]

            inception_feature = [float(x) for x in l[idl4w_end + 3:]]

            assert len(inception_feature) == INCEPTION_FEATURE_LEN, '%d %s' % (
                len(inception_feature), cs)

            click_query = l[idl4w_end]
            show_str = 'click:{} ex_key:{} key:{} titles:{} descs:{}'.format(
                click_query, extended_keyword, keyword, titles, descs)
            if click_query == 'noclickquery':
                click_query = ''
                #TODO now only consider click_query
                continue
            else:
                click_queries = click_query.split('$*$')
                is_top_text = True
                for click_query in click_queries:
                    if click_query.strip() == '':
                        continue

                    text_str = '{} {}'.format(click_query, show_str)

                    text = click_query
                    words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                    word_ids = text2ids.words2ids(
                        words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    word_ids_length = len(word_ids)
                    if num % 1000 == 0:
                        print(cs,
                              simid,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(idl4w_feature),
                              len(inception_feature),
                              file=sys.stderr)
                    if len(word_ids) == 0:
                        continue
                    if is_bad(words, word_ids):
                        #print('luan_ma', cs, simid, text, word_ids, text2ids.ids2text(word_ids), len(idl4w_feature), len(inception_feature), file=sys.stderr)
                        continue

                    word_ids = word_ids[:TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                    if not FLAGS.write_sequence_example:
                        example = tf.train.Example(features=tf.train.Features(
                            feature={
                                'image_name':
                                melt.bytes_feature(img),
                                'idl4w_feature':
                                melt.float_feature(idl4w_feature),
                                'inception_feature':
                                melt.float_feature(inception_feature),
                                'text_str':
                                melt.bytes_feature(text_str),
                                'text':
                                melt.int64_feature(word_ids),
                            }))
                    else:
                        example = tf.train.SequenceExample(
                            context=melt.features({
                                'image_name':
                                melt.bytes_feature(img),
                                'idl4w_feature':
                                melt.float_feature(idl4w_feature),
                                'inception_feature':
                                melt.float_feature(inception_feature),
                                'text_str':
                                melt.bytes_feature(text_str),
                            }),
                            feature_lists=melt.feature_lists(
                                {'text': melt.int64_feature_list(word_ids)}))
                    writer.write(example)

                    with record_counter.get_lock():
                        record_counter.value += 1
                    with max_num_words.get_lock():
                        if word_ids_length > max_num_words.value:
                            max_num_words.value = word_ids_length
                    with sum_words.get_lock():
                        sum_words.value += word_ids_length

                    if FLAGS.np_save:
                        assert FLAGS.threads == 1
                        texts.append(word_ids)
                        text_strs.append(text)

                        if img not in image_labels:
                            image_labels[img] = set()
                        image_labels[img].add(text)

                    if is_top_text:
                        is_top_text = False
                        with image_counter.get_lock():
                            image_counter.value += 1

                        if FLAGS.np_save:
                            if img not in image_labels:
                                image_labels[img] = set()

                            image_names.append(img)
                            #image_features.append(image_feature)
                            idl4w_features.append(idl4w_feature)
                            inception_features.append(inception_feature)

                        if FLAGS.num_max_records > 0:
                            #for a fixed validation set, keep only one click per image
                            break

            num += 1
            if num == FLAGS.num_max_records:
                break
Example #15
def build_features(index):
  mode = get_mode()
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/{1}.record'.format(mode, index)
  os.system('mkdir -p %s' % os.path.dirname(out_file))
  print('---out_file', out_file)
  # TODO now only gen one tfrecord file 

  total = len(examples)
  if not FLAGS.has_dup:
    start, end = gezi.get_fold(total, FLAGS.num_records, index)
  else:
    start, end = get_fold(examples['id'].values, index)

  ids = examples['id'].values[start: end]
  ids = list(map(str, ids))
  comments = examples['comment_text'].values[start: end]
  tokens_list = examples['tokens'].values[start: end]
  tokens_infos = examples['attributes'].values[start: end]
  # TODO change to poses
  poses = examples['poses'].values[start: end]
  tags = examples['tags'].values[start: end]
  ners = examples['ners'].values[start: end]
  ori_tokens_list = examples['ori_tokens'].values[start: end]
  
  try:
    labels = examples[CLASSES].values[start: end]
  except Exception:
    labels = [[0.] * len(CLASSES)] * len(ids)

  with melt.tfrecords.Writer(out_file) as writer:
    for id, comment, label, comment_tokens, ori_tokens, tokens_info, pos, tag, ner in tqdm(zip(ids, comments, labels, tokens_list, ori_tokens_list, tokens_infos, poses, tags, ners)):
      if not isinstance(comment, str):
        comment = 'ok'
      comment_str = comment

      comment_tokens = comment_tokens.split(' ')
      tokens_info = tokens_info.split(' ')
      pos = pos.split(' ')
      tag = tag.split(' ')
      ner = ner.split(' ')
      ori_tokens = ori_tokens.split(' ')

      if FLAGS.comment_limit:
        comment_tokens = comment_tokens[:FLAGS.comment_limit]
        ori_tokens = ori_tokens[:FLAGS.comment_limit]
        tokens_info = tokens_info[:len(attribute_names) * FLAGS.comment_limit]

      pos_ids = [get_char_id(x, pos_vocab) for x in pos]
      tag_ids = [get_char_id(x, tag_vocab) for x in tag]
      ner_ids = [get_char_id(x, ner_vocab) for x in ner]

      # NOTICE: comment_ids uses a vocab built over all train + test words, so there is no unk
      if not FLAGS.lower:
        comment_ids = [get_id(token, vocab) for token in comment_tokens]
        #comment_ids_withunk = [get_id(token, unk_vocab) for token in comment_tokens]
      else:
        comment_ids = [get_id(token.lower(), vocab) for token in comment_tokens]
        #comment_ids_withunk = [get_id(token.lower(), unk_vocab) for token in comment_tokens]

      comment_tokens_str = '|'.join([vocab.key(id) for id in comment_ids])
      label = list(map(float, label))

      tokens_info = list(map(float, tokens_info))

      #print(len(comment_ids), len(tokens_info) / len(attribute_names), len(tokens_info) / len(comment_ids))
      assert len(tokens_info) == len(attribute_names) * len(comment_ids), '%d %f' %(len(comment_ids), len(tokens_info) / len(attribute_names))


      #comment_chars = [list(token) for token in comment_tokens]
      ## CHANGED to use ori_tokens so e.g. 'fu**ck' still encodes its '**' chars; a split token ('NiggerMan' -> 'Nigger Man') encodes the chars of 'NiggerMan' twice
      chars_list = [list(token) for token in ori_tokens]
      char_ids = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      assert len(comment_ids) == len(chars_list), '{} {} {} {} {}'.format(len(comment_ids), len(chars_list), comment, comment_tokens, ori_tokens)
      
      for i, chars in enumerate(chars_list):
        for j, ch in enumerate(chars):
          if j == FLAGS.char_limit:
            break
          char_ids[i, j] = get_char_id(ch, char_vocab)

      char_ids = list(char_ids.reshape(-1))

      #print(char_ids)

      # --------------simple char
      simple_char_ids = []
      for ch in list(comment):
        id_ = get_char_id(ch, char_vocab)
        #if id_ == char_vocab.unk_id():
        #  continue
        simple_char_ids.append(id_)
        if len(simple_char_ids) == FLAGS.simple_char_limit:
          break

      simple_chars_str = ''.join([char_vocab.key(id) for id in simple_char_ids])
      #print(simple_char_ids, simple_chars_str)

      # # --------------simple ngram
      # simple_ngrams = gezi.get_ngrams(comment)
      # simple_ngrams = simple_ngrams[:FLAGS.simple_char_limit * 5]
      # simple_ngram_ids = [get_ngram_id(ngram, ngram_vocab) for ngram in simple_ngrams]

      # --------------ngram
      ngram_ids_list = np.zeros([len(comment_ids), FLAGS.char_limit], dtype=np.int32)
      if not FLAGS.ftngram:
        #ngrams_list = [gezi.get_ngrams(token) for token in ori_tokens]
        if not FLAGS.ngram_lower:
          ngrams_list = [gezi.get_ngrams(token, FLAGS.ngram_min, FLAGS.ngram_max) for token in comment_tokens]
        else:
          ngrams_list = [gezi.get_ngrams(token.lower(), FLAGS.ngram_min, FLAGS.ngram_max) for token in comment_tokens]

        for i, ngrams in enumerate(ngrams_list):
          for j, ngram in enumerate(ngrams):
            if j == FLAGS.char_limit:
              break
            #assert get_ngram_id(ngram, ngram_vocab) < 20003
            ngram_ids_list[i, j] = get_ngram_id(ngram, ngram_vocab)
      else:
        #for i, (token, ori_token) in enumerate(zip(comment_tokens, ori_tokens)):
        for i, (token, ori_token) in enumerate(zip(comment_tokens, comment_tokens)):
          ngram_ids = gezi.fasttext_ids(ori_token, vocab, FLAGS.ngram_buckets, FLAGS.ngram_min, FLAGS.ngram_max)
          if len(ngram_ids) >= FLAGS.char_limit:
            ngram_ids = gezi.fasttext_ids(token, vocab, FLAGS.ngram_buckets, FLAGS.ngram_min, FLAGS.ngram_max)
          ngram_ids = ngram_ids[:FLAGS.char_limit]
          for j, ngram_id in enumerate(ngram_ids):
            ngram_ids_list[i, j] = ngram_id

      ngram_ids = list(ngram_ids_list.reshape(-1))

      # # ---------------fngrams(full ngrams)
      # fngrams_list = [gezi.get_ngrams_hash(token, FLAGS.ngram_buckets, 3, 6, reserve=3) for token in ori_tokens]
      # fngram_ids =  np.zeros([len(comment_ids), FLAGS.ngram_limit], dtype=np.int32)
      # for i, fngrams in enumerate(fngrams_list):
      #   for j, fngram in enumerate(fngrams):
      #     if j == FLAGS.ngram_limit:
      #       break
      #     fngram_ids[i, j] = fngram
      # fngram_ids = list(fngram_ids.reshape(-1))

      # global info per comment: 7 features
      comment_info = []
      comment_info.append(len(ori_tokens))
      comment_info.append(len(comment_tokens))
      #comment_len = sum(len(x) for x in ori_tokens)
      comment_len = len(comment_str)
      comment_info.append(comment_len)
      comment_info.append(comment_len / (len(ori_tokens) + 1))
      num_unks = len([x for x in comment_ids if x == vocab.unk_id()])
      comment_info.append(num_unks)
      comment_info.append(num_unks / len(comment_tokens))
      comment_info.append(enprob_dict[id])

      record = tf.train.Example(features=tf.train.Features(feature={
                                "comment": melt.int64_feature(comment_ids),
                                #"comment_withunk": melt.int64_feature(comment_ids_withunk),
                                "tokens_info": melt.float_feature(tokens_info),
                                "comment_info": melt.float_feature(comment_info),
                                "pos": melt.int64_feature(pos_ids),
                                "tag": melt.int64_feature(tag_ids),
                                "ner": melt.int64_feature(ner_ids),
                                "comment_chars": melt.int64_feature(char_ids),
                                "comment_ngrams": melt.int64_feature(ngram_ids),
                                "simple_chars": melt.int64_feature(simple_char_ids),
                                #"simple_ngrams": melt.int64_feature(simple_ngram_ids),
                                #"comment_fngrams": melt.int64_feature(fngram_ids),
                                #"simple_chars_str": melt.bytes_feature(simple_chars_str),
                                "classes": melt.float_feature(label),
                                "id": melt.bytes_feature(id),
                                "weight": melt.float_feature([FLAGS.weight]),
                                "comment_str": melt.bytes_feature(comment_str),
                                "comment_tokens_str": melt.bytes_feature(comment_tokens_str)
                                }))
      
      writer.write(record)
      global counter
      with counter.get_lock():
        counter.value += 1

    print("Build {} instances of features in total".format(writer.size()))
    writer.close()
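
gezi.get_fold(total, num_records, index) above presumably splits the rows into contiguous shards and returns this worker's [start, end) slice, roughly:

def get_fold(total, num_folds, index):
    # contiguous shards; the last fold absorbs the remainder
    fold_size = total // num_folds
    start = fold_size * index
    end = total if index == num_folds - 1 else start + fold_size
    return start, end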