示例#1
0
# Command-line flags for the vocabulary builder.
# most_common / min_count select how the vocabulary is cut; at least one of
# them must be > 0 (checked right after these definitions).
flags.DEFINE_integer("most_common", 0,
                     "if > 0 then get vocab with most_common words")
flags.DEFINE_integer("min_count", 0, "if > 0 then cut by min_count")
flags.DEFINE_boolean("add_unknown", True, "treat ignored words as unknown")
flags.DEFINE_boolean("save_count_info", True, "save count info to bin")
# The help text used to be a copy of save_count_info's; this flag is a path.
flags.DEFINE_string("out_dir", '/tmp/train/', "output directory")
# Default is empty, but the script refuses to run unless a value is given.
flags.DEFINE_string('seg_method', '', 'required: must be set to a non-empty value')

# Validate the flags before any work is done. Explicit raises are used instead
# of `assert` so the checks still run under `python -O` and report what is wrong.
if not (FLAGS.most_common > 0 or FLAGS.min_count > 0):
    raise ValueError('at least one of --most_common or --min_count must be > 0')
if not FLAGS.seg_method:
    raise ValueError('--seg_method must be set to a non-empty value')

import nowarning
from libword_counter import WordCounter
# Word counter configured entirely from the command-line flags.
counter = WordCounter(addUnknown=FLAGS.add_unknown,
                      mostCommon=FLAGS.most_common,
                      minCount=FLAGS.min_count,
                      saveCountInfo=FLAGS.save_count_info)

import sys,os
import numpy as np
import melt

import conf 
from conf import IMAGE_FEATURE_LEN

from gezi import Segmentor
# Module-level Segmentor instance, built once at import time with default arguments.
segmentor = Segmentor()

# Start / end marker tokens (their use is outside the visible part of this file).
START_WORD, END_WORD = '<S>', '</S>'
示例#2
0
                     "if > 0 then get vocab with most_common words")
# Remaining command-line flags for the vocabulary builder.
flags.DEFINE_integer("min_count", 0, "if > 0 then cut by min_count")
flags.DEFINE_boolean("add_unknown", True, "treat ignored words as unknown")
flags.DEFINE_boolean("save_count_info", True, "save count info to bin")
# The help text used to be a copy of save_count_info's; out_dir is the
# directory part of the two paths handed to counter.save() below.
flags.DEFINE_string("out_dir", '/tmp/train/',
                    "output directory for the saved <name>.bin and <name>.txt files")

# NOTE(review): seg_method is not read anywhere in the visible code -- confirm
# whether it is still needed.
flags.DEFINE_string('seg_method', 'default', '')

flags.DEFINE_string('name', 'vocab.hdfs',
                    'base file name (without extension) of the saved files')

# Explicit raise instead of `assert`, so the check still runs under `python -O`
# and tells the user which flags to set.
if not (FLAGS.most_common > 0 or FLAGS.min_count > 0):
    raise ValueError('at least one of --most_common or --min_count must be > 0')
import nowarning
from libword_counter import WordCounter

# Word counter configured entirely from the command-line flags.
counter = WordCounter(
    addUnknown=FLAGS.add_unknown,
    mostCommon=FLAGS.most_common,
    minCount=FLAGS.min_count,
    saveCountInfo=FLAGS.save_count_info,
)

import sys, os
import numpy as np
import melt

import conf
from conf import IMAGE_FEATURE_LEN

# Feed "word<TAB>count" records from stdin into the counter.
for line in sys.stdin:
    record = line.rstrip()
    if not record:
        # Blank / whitespace-only lines (e.g. a trailing empty line) carry no
        # record; unpacking them would raise ValueError and abort the run.
        continue
    # Exactly one tab is expected; any other shape still raises ValueError.
    word, count = record.split('\t')
    counter.add(word, int(count))

# Save the counter, passing the <out_dir>/<name>.bin and <out_dir>/<name>.txt paths.
output_parts = (FLAGS.out_dir, FLAGS.name)
counter.save('%s/%s.bin' % output_parts, '%s/%s.txt' % output_parts)