# Build a vocabulary from tab-separated stdin lines: column 1 holds
# '\x01'-separated texts; each text is segmented and its words counted,
# bracketed by sentence-boundary tokens.
# NOTE(review): relies on module-level `FLAGS`, `counter` and `sys`
# defined elsewhere in this file (not visible in this chunk).
from conf import IMAGE_FEATURE_LEN
from gezi import Segmentor

segmentor = Segmentor()

START_WORD = '<S>'
END_WORD = '</S>'

print('seg_method:', FLAGS.seg_method, file=sys.stderr)
# Fixed: this diagnostic went to stdout while every other log line in the
# script goes to stderr; keep stdout clean for pipeline output.
print('min_count:', FLAGS.min_count, 'most_common:', FLAGS.most_common,
      file=sys.stderr)

num = 0
for line in sys.stdin:
    if num % 10000 == 0:
        print(num, file=sys.stderr)
    l = line.rstrip().split('\t')
    texts = l[1].split('\x01')
    for text in texts:
        words = segmentor.Segment(text, FLAGS.seg_method)
        if num % 10000 == 0:
            # Periodic sample so segmentation quality can be eyeballed.
            print(text, '|'.join(words), len(words), file=sys.stderr)
        counter.add(START_WORD)
        for word in words:
            counter.add(word)
        counter.add(END_WORD)
    num += 1

print(FLAGS.out_dir, file=sys.stderr)
counter.save(FLAGS.out_dir + '/vocab.bin', FLAGS.out_dir + '/vocab.txt')
# Per-line vocabulary accumulation (variant with text normalization and a
# '<NUM>' pseudo-token), saving under FLAGS.vocab_name when given.
# NOTE(review): the original pasted chunk referenced `line` and `num`
# without a visible loop header/initializer; `num = 0` and
# `for line in sys.stdin:` are reconstructed here to match the sibling
# scripts in this file — confirm against the full source.
num = 0
for line in sys.stdin:
    if num % 10000 == 0:
        print(num, file=sys.stderr)
    l = line.rstrip().split('\t')
    try:
        texts = l[1].split('\x01')
    except Exception:
        print(line, file=sys.stderr)
        #texts = l[2].split('\x01')
        # Fixed: without `continue` a malformed line fell through and the
        # loop below reused `texts` from the previous line (or raised
        # NameError on the very first line).
        continue
    for text in texts:
        text = normalize.norm(text)
        words = segmentor.Segment(text, FLAGS.seg_method)
        if num % 10000 == 0:
            print(text, '|'.join(words), len(words), file=sys.stderr)
        counter.add(START_WORD)
        for word in words:
            counter.add(word)
            if word.isdigit():
                # Also count an abstract number token alongside the
                # literal digit string.
                counter.add('<NUM>')
        counter.add(END_WORD)
    num += 1

# NOTE(review): this extra START_WORD add after the loop looks like a
# leftover; kept to preserve the original counts — confirm intent.
counter.add(START_WORD)

print(FLAGS.out_dir, file=sys.stderr)
if not FLAGS.vocab_name:
    counter.save(FLAGS.out_dir + '/vocab.bin',
                 FLAGS.out_dir + '/vocab.txt')
else:
    counter.save(FLAGS.out_dir + '/%s.bin' % FLAGS.vocab_name,
                 FLAGS.out_dir + '/%s.txt' % FLAGS.vocab_name)
# Vocabulary builder for image-caption TSV lines: columns after the image
# feature block each hold '\x01'-separated fields whose first field is the
# caption text.
# Fixed: imports were scattered after the WordCounter construction;
# grouped at the top per convention (behavior unchanged — no statement
# between them depended on import order).
import sys, os

import numpy as np

import melt
import conf
from conf import IMAGE_FEATURE_LEN
from gezi import Segmentor
from libword_counter import WordCounter

counter = WordCounter(
    addUnknown=FLAGS.add_unknown,
    mostCommon=FLAGS.most_common,
    minCount=FLAGS.min_count,
    saveCountInfo=FLAGS.save_count_info)

segmentor = Segmentor()

num = 0
for line in sys.stdin:
    if num % 10000 == 0:
        print(num)
    l = line.rstrip().split('\t')
    # Skip the id column plus IMAGE_FEATURE_LEN feature columns.
    img_end = IMAGE_FEATURE_LEN + 1
    texts = [x.split('\x01')[0] for x in l[img_end:]]
    for text in texts:
        # NOTE(review): sibling scripts call
        # `segmentor.Segment(text, FLAGS.seg_method)`; this one uses
        # lowercase `segment` with no method argument — confirm which API
        # gezi.Segmentor actually exposes.
        words = segmentor.segment(text)
        for word in words:
            counter.add(word)
    num += 1

counter.save(FLAGS.out_dir + '/vocab.bin', FLAGS.out_dir + '/vocab.txt')
# Merge pre-counted "word<TAB>count" pairs from stdin into a WordCounter
# and save the vocabulary as FLAGS.name under FLAGS.out_dir.
flags.DEFINE_boolean("add_unknown", True, "treat ignored words as unknown")
flags.DEFINE_boolean("save_count_info", True, "save count info to bin")
# Fixed: help text was copy-pasted from save_count_info above.
flags.DEFINE_string("out_dir", '/tmp/train/',
                    "directory to save the vocab files")
flags.DEFINE_string('seg_method', 'default', '')
flags.DEFINE_string('name', 'vocab.hdfs', '')

# At least one pruning criterion must be active.
assert FLAGS.most_common > 0 or FLAGS.min_count > 0

import nowarning
from libword_counter import WordCounter

counter = WordCounter(
    addUnknown=FLAGS.add_unknown,
    mostCommon=FLAGS.most_common,
    minCount=FLAGS.min_count,
    saveCountInfo=FLAGS.save_count_info)

import sys, os
import numpy as np
import melt
import conf
from conf import IMAGE_FEATURE_LEN

for line in sys.stdin:
    # Fixed: strip only the newline — a bare rstrip() would also eat a
    # trailing tab and break the two-field unpack when count is empty.
    word, count = line.rstrip('\n').split('\t')
    counter.add(word, int(count))

counter.save('%s/%s.bin' % (FLAGS.out_dir, FLAGS.name),
             '%s/%s.txt' % (FLAGS.out_dir, FLAGS.name))
END_WORD = '</S>'

# Build a caption vocabulary from an MSCOCO-style annotations JSON:
# tokenize every caption with nltk and count words bracketed by
# sentence-boundary tokens.
# NOTE(review): `START_WORD`, `FLAGS`, `counter`, `tf`, `json`, `nltk` and
# `ProgressBar` come from elsewhere in this file (not visible here).
with tf.gfile.FastGFile(FLAGS.captions_file, "r") as f:
    caption_data = json.load(f)

pb = ProgressBar(len(caption_data["annotations"]))

id_to_filename = [(x["id"], x["file_name"]) for x in caption_data["images"]]
print(len(id_to_filename))

# Count distinct image ids as a sanity check against the image list.
ids = set()
for x in caption_data["images"]:
    ids.add(x["id"])
print(len(ids))
print(len(caption_data["annotations"]))

for annotation in caption_data["annotations"]:
    pb.progress()
    caption = annotation["caption"]
    words = nltk.tokenize.word_tokenize(caption.lower())
    counter.add(START_WORD)
    for word in words:
        counter.add(word.encode('utf-8'))
    counter.add(END_WORD)

# Fixed: `caption_data = None` originally sat BEFORE the annotations loop,
# so the loop iterated over None and raised TypeError. Release the parsed
# JSON only after the counts are accumulated.
caption_data = None

print(FLAGS.out_dir, file=sys.stderr)
counter.save(FLAGS.out_dir + '/vocab.bin', FLAGS.out_dir + '/vocab.txt')