Example #1
import sys

# FLAGS and counter (a WordCounter instance, cf. Example #3) are assumed to be
# defined earlier in the full script.
from conf import IMAGE_FEATURE_LEN

from gezi import Segmentor
segmentor = Segmentor()

START_WORD = '<S>'
END_WORD = '</S>'

print('seg_method:', FLAGS.seg_method, file=sys.stderr)
print('min_count:', FLAGS.min_count, 'most_common:', FLAGS.most_common)

num = 0
for line in sys.stdin:
  if num % 10000 == 0:
    print(num, file=sys.stderr)
  l = line.rstrip().split('\t')
  texts = l[1].split('\x01')
  for text in texts:
    words = segmentor.Segment(text, FLAGS.seg_method)
    if num % 10000 == 0:
      print(text, '|'.join(words), len(words), file=sys.stderr)
    counter.add(START_WORD)
    for word in words:
      counter.add(word)
    counter.add(END_WORD)
  num += 1

 
print(FLAGS.out_dir, file=sys.stderr)
counter.save(FLAGS.out_dir + '/vocab.bin', FLAGS.out_dir + '/vocab.txt')
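
The vocab.txt written above can later be loaded back as a word-to-id table. A minimal sketch (not part of the original script), assuming the text file stores one token per line with an optional tab-separated count; the exact format produced by WordCounter.save is not shown in these examples:

def load_vocab(path):
  # hypothetical loader: the token is the first tab-separated field,
  # its id is simply the line index
  word2id = {}
  with open(path, encoding='utf-8') as f:
    for i, line in enumerate(f):
      word2id[line.rstrip('\n').split('\t')[0]] = i
  return word2id

# word2id = load_vocab(FLAGS.out_dir + '/vocab.txt')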
Example #2
# sys, FLAGS, normalize, segmentor, counter and START_WORD/END_WORD are assumed
# to be defined above, as in Examples #1 and #3.
num = 0
for line in sys.stdin:
  if num % 10000 == 0:
    print(num, file=sys.stderr)
  l = line.rstrip().split('\t')
  
  try:
    texts = l[1].split('\x01')
  except Exception:
    print(line, file=sys.stderr)
    continue  # skip malformed lines; otherwise texts would be undefined below
  #texts = l[2].split('\x01')
  
  for text in texts:
    text = normalize.norm(text)
    words = segmentor.Segment(text, FLAGS.seg_method)
    if num % 10000 == 0:
      print(text, '|'.join(words), len(words), file=sys.stderr)
    counter.add(START_WORD)
    for word in words:
      counter.add(word)
      if word.isdigit():
        counter.add('<NUM>')
    counter.add(END_WORD)
  num += 1

counter.add(START_WORD)

print(FLAGS.out_dir, file=sys.stderr)
if not FLAGS.vocab_name:
  counter.save(FLAGS.out_dir + '/vocab.bin', FLAGS.out_dir + '/vocab.txt')
else:
  counter.save(FLAGS.out_dir + '/%s.bin'%FLAGS.vocab_name, FLAGS.out_dir + '/%s.txt'%FLAGS.vocab_name)
Example #3
# FLAGS (the command-line flags object) is assumed to be defined earlier in the
# full script, as in Example #4.
from libword_counter import WordCounter
counter = WordCounter(addUnknown=FLAGS.add_unknown,
                      mostCommon=FLAGS.most_common,
                      minCount=FLAGS.min_count,
                      saveCountInfo=FLAGS.save_count_info)

import sys, os
import numpy as np
import melt

import conf
from conf import IMAGE_FEATURE_LEN

from gezi import Segmentor
segmentor = Segmentor()

num = 0
for line in sys.stdin:
    if num % 10000 == 0:
        print(num)
    l = line.rstrip().split('\t')
    img_end = IMAGE_FEATURE_LEN + 1
    texts = [x.split('\x01')[0] for x in l[img_end:]]
    for text in texts:
        words = segmentor.segment(text)
        for word in words:
            counter.add(word)
    num += 1

counter.save(FLAGS.out_dir + '/vocab.bin', FLAGS.out_dir + '/vocab.txt')
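
Example #3 expects each stdin line to be tab-separated, with an image key plus IMAGE_FEATURE_LEN feature columns first and the captions in the remaining columns, each caption being the first '\x01'-separated field. A hypothetical line in that layout (column 0 as an image key and the content after '\x01' are assumptions, not confirmed by the snippet) could be built like this:

# synthetic input line matching the slicing l[img_end:] used above
IMAGE_FEATURE_LEN = 4                         # tiny value just for this sketch
features = ['0.1'] * IMAGE_FEATURE_LEN        # placeholder feature values
captions = ['a dog runs\x01meta', 'a dog is running\x01meta']
line = '\t'.join(['img_0001'] + features + captions)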
Example #4
flags.DEFINE_boolean("add_unknown", True, "treat ignored words as unknow")
flags.DEFINE_boolean("save_count_info", True, "save count info to bin")
flags.DEFINE_string("out_dir", '/tmp/train/', "save count info to bin")

flags.DEFINE_string('seg_method', 'default', '')

flags.DEFINE_string('name', 'vocab.hdfs', '')

assert FLAGS.most_common > 0 or FLAGS.min_count > 0
import nowarning
from libword_counter import WordCounter

counter = WordCounter(addUnknown=FLAGS.add_unknown,
                      mostCommon=FLAGS.most_common,
                      minCount=FLAGS.min_count,
                      saveCountInfo=FLAGS.save_count_info)

import sys, os
import numpy as np
import melt

import conf
from conf import IMAGE_FEATURE_LEN

for line in sys.stdin:
    word, count = line.rstrip().split('\t')
    counter.add(word, int(count))

counter.save('%s/%s.bin' % (FLAGS.out_dir, FLAGS.name),
             '%s/%s.txt' % (FLAGS.out_dir, FLAGS.name))
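
Example #4 merges counts that were aggregated elsewhere, so its stdin must already carry word<TAB>count pairs rather than raw text (the default name 'vocab.hdfs' suggests the counts come from a distributed job). A hypothetical local producer of that format, for testing only and not part of the original code:

import sys
from collections import Counter

counts = Counter()
for line in sys.stdin:
    counts.update(line.split())        # naive whitespace tokenization (assumption)
for word, count in counts.most_common():
    print('%s\t%d' % (word, count))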
Example #5
import sys, json
import nltk.tokenize
import tensorflow as tf

# counter (a WordCounter instance), FLAGS and ProgressBar are assumed to be
# defined earlier in the full script, as in the other examples.
START_WORD = '<S>'
END_WORD = '</S>'

with tf.gfile.FastGFile(FLAGS.captions_file, "r") as f:
    caption_data = json.load(f)

pb = ProgressBar(len(caption_data["annotations"]))

id_to_filename = [(x["id"], x["file_name"]) for x in caption_data["images"]]

print(len(id_to_filename))

ids = set()
for x in caption_data["images"]:
    ids.add(x["id"])
print(len(ids))

print(len(caption_data["annotations"]))

for annotation in caption_data["annotations"]:
    pb.progress()
    caption = annotation["caption"]
    words = nltk.tokenize.word_tokenize(caption.lower())
    counter.add(START_WORD)
    for word in words:
        counter.add(word.encode('utf-8'))
    counter.add(END_WORD)

caption_data = None  # release the parsed captions once counting is done

print(FLAGS.out_dir, file=sys.stderr)
counter.save(FLAGS.out_dir + '/vocab.bin', FLAGS.out_dir + '/vocab.txt')
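
Example #5 reads an MSCOCO-style annotation file. A minimal sketch of the JSON shape the loop relies on; only the fields the snippet actually reads are meaningful here, everything else in a real captions file is ignored:

# hypothetical captions_file content; the script only touches
# caption_data["images"][i]["id"/"file_name"] and
# caption_data["annotations"][j]["caption"]
example_captions = {
    "images": [{"id": 1, "file_name": "img_000001.jpg"}],
    "annotations": [{"image_id": 1, "caption": "a dog runs on the grass"}],
}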