def main(_):
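    # Build TFRecord features from every file under FLAGS.input in parallel,
    # then write the total record count next to the generated tfrecords.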
    FLAGS.word_limit = 2000
    global vocab, char_vocab
    vocab = gezi.Vocabulary(FLAGS.vocab_)
    print('vocab file', FLAGS.vocab_, 'vocab size', vocab.size())
    if FLAGS.use_char:
        char_vocab = gezi.Vocabulary(
            FLAGS.vocab_.replace('vocab.txt', 'char_vocab.txt'))

    files = glob.glob(FLAGS.input + '/*')
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pool.map(build_features, files)
    pool.close()
    pool.join()

    # for safety: some machines might not use the cpu count as the default ...
    print('num_records:', counter.value)

    mode = 'train' if 'train' in FLAGS.input else 'valid'
    dir_ = os.path.dirname(os.path.dirname(FLAGS.input))
    os.system('mkdir -p %s/%s/%s' % (dir_, FLAGS.tfrecord_dir, mode))
    out_file = os.path.join(
        dir_, '{}/{}/num_records.txt'.format(FLAGS.tfrecord_dir, mode))
    gezi.write_to_txt(counter.value, out_file)

    print('mean words:', total_words.value / counter.value)
Example #2
def main(_):
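    # Shard work by record index rather than by input file: the tab-separated
    # input is loaded into a global list that the build_features workers read.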
    mode = get_mode(FLAGS.input)

    global vocab, char_vocab
    vocab = gezi.Vocabulary(FLAGS.vocab_,
                            fixed=FLAGS.fixed_vocab,
                            unk_word=FLAGS.unk_word)
    char_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'char_vocab.txt')
    if os.path.exists(char_vocab_file):
        char_vocab = Vocabulary(char_vocab_file)
        print('char vocab size:', char_vocab.size())

    mode_ = 'train'
    if 'valid' in FLAGS.input:
        mode_ = 'valid'
    elif 'test' in FLAGS.input:
        mode_ = 'test'
    else:
        assert 'train' in FLAGS.input

    if FLAGS.augument:
        mode_ = 'aug.' + mode_

    if FLAGS.mode_:
        mode_ = FLAGS.mode_

    global df
    df = []
    for line in open(FLAGS.input):
        df.append(line.strip().split('\t', 3))

    pool = multiprocessing.Pool()

    if not FLAGS.num_records_:
        FLAGS.num_records_ = 1

    print('num records file to gen', FLAGS.num_records_)

    #FLAGS.num_records_ = 1

    pool.map(build_features, range(FLAGS.num_records_))
    pool.close()
    pool.join()

    # for i in range(FLAGS.num_records_):
    #   build_features(i)

    # for safety: some machines might not use the cpu count as the default ...
    print('num_records:', counter.value)

    os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
    out_file = os.path.dirname(
        FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)

    print('mean words:', total_words.value / counter.value)
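# get_mode is defined elsewhere in this repo; a minimal hypothetical sketch,
# inferred only from how its result is used in these examples (an assumption,
# not the repo's actual code):
def get_mode(path=None):
    path = path or FLAGS.input  # some examples call it with no argument
    for mode in ('valid', 'test', 'dev', 'pm'):
        if mode in path:
            return mode
    return 'train'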
Example #3
def main(argv):
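    # Convert parsed text and image features to TFRecords; optionally dump the
    # collected texts as .npy arrays alongside the records.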
    parse_text_file(FLAGS.text)
    gezi.try_mkdir(FLAGS.output_directory)
    convert_to(FLAGS.image_feature, FLAGS.name)

    if FLAGS.np_save:
        if FLAGS.threads == 1:
            global gtexts, gtext_strs
            texts = [val for sublist in gtexts for val in sublist]
            text_strs = [val for sublist in gtext_strs for val in sublist]
        else:
            texts = [val for sublist in texts_dict.values() for val in sublist]
            text_strs = [
                val for sublist in text_strs_dict.values() for val in sublist
            ]

        print('len(texts):', len(texts))
        np.save(os.path.join(FLAGS.output_directory, 'texts.npy'),
                np.array(texts))
        np.save(os.path.join(FLAGS.output_directory, 'text_strs.npy'),
                np.array(text_strs))

    num_records = counter.value
    print('num_records:', num_records)
    gezi.write_to_txt(num_records,
                      os.path.join(FLAGS.output_directory, 'num_records.txt'))

    print('counter:', counter.value)
    print('max_num_words:', max_num_words.value)
    print('avg_num_words:', sum_words.value / counter.value)
Example #4
def main(_):  
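  # Kaggle toxic-comment pipeline: load the csv examples plus word/char/pos/
  # tag/ner/ngram vocabularies and per-id English-probability scores, then
  # shard TFRecord generation across a worker pool.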
  os.system('mkdir -p %s' % FLAGS.dir)
  tokenizer.init(FLAGS.tokenizer_vocab)
  global examples, vocab, unk_vocab, char_vocab, pos_vocab, tag_vocab, ner_vocab, ngram_vocab
  examples = pd.read_csv(FLAGS.input)
  #if 'train' in FLAGS.input:
  #  examples = shuffle(examples, random_state=1024)
  vocab = Vocabulary(FLAGS.vocab)
  # unk_vocab is actually a small vocab, so it will generate unk for training
  #unk_vocab =  Vocabulary(FLAGS.vocab.replace('vocab.txt', 'unk_vocab.txt'))
  char_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))
  pos_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'pos_vocab.txt'))
  tag_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'tag_vocab.txt'))
  ner_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ner_vocab.txt'))
  ngram_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ngram_vocab.txt'))

  global enprob_dict
  enprob_dict = {}
  enprob_file = '~/data/kaggle/toxic/train.enprob.csv' if 'train' in FLAGS.input else '~/data/kaggle/toxic/test.enprob.csv'
  enprob_df = pd.read_csv(enprob_file)
  for id, enprob in zip(enprob_df['id'].values, enprob_df['enprob'].values):
    enprob_dict[id] = enprob
  enprob_dict['0'] = 1.

  pool = multiprocessing.Pool()
  pool.map(build_features, range(FLAGS.num_records))
  pool.close()
  pool.join()

  #build_features(0)

  print('num_records:', counter.value)
  mode = get_mode()
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)
Example #5
def main(_):
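    # Sanity-check text2ids on a couple of sample strings, then build features
    # from a single input file or from every file in the input directory.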
    text2ids.init(FLAGS.vocab_)
    print('to_lower:', FLAGS.to_lower, 'feed_single_en:', FLAGS.feed_single_en,
          'seg_method', FLAGS.seg_method)
    print(text2ids.ids2text(text2ids_('傻逼脑残B')))
    print(text2ids_('傻逼脑残B'))
    print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

    #exit(0)

    if os.path.isfile(FLAGS.input):
        build_features(FLAGS.input)
    else:
        files = glob.glob(FLAGS.input + '/*')
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        pool.map(build_features, files)
        pool.close()
        pool.join()

    # for safety: some machines might not use the cpu count as the default ...
    print('num_records:', counter.value)
    mode = get_mode(FLAGS.input)

    os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
    out_file = os.path.dirname(
        FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)

    print('mean words:', total_words.value / counter.value)
Example #6
def main(_):
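    # Load the word vocab plus optional char/pos vocabs, then build features
    # from a single file or a directory of files.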
    assert FLAGS.use_char is not None

    global vocab, char_vocab, pos_vocab
    vocab = gezi.Vocabulary(FLAGS.vocab_)
    print('vocab size:', vocab.size())
    char_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'char_vocab.txt')
    if os.path.exists(char_vocab_file):
        char_vocab = Vocabulary(char_vocab_file)
        print('char vocab size:', char_vocab.size())
    pos_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'pos_vocab.txt')
    if os.path.exists(pos_vocab_file):
        pos_vocab = Vocabulary(pos_vocab_file)
        print('pos vocab size:', pos_vocab.size())

    if os.path.isfile(FLAGS.input):
        build_features(FLAGS.input)
    else:
        files = glob.glob(FLAGS.input + '/*')
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        pool.map(build_features, files)
        pool.close()
        pool.join()

    # for safety: some machines might not use the cpu count as the default ...
    print('num_records:', counter.value)
    mode = get_mode(FLAGS.input)

    os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
    out_file = os.path.dirname(
        FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)

    print('mean words:', total_words.value / counter.value)
Example #7
def run():
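    # Pick a per-file handler based on which auxiliary inputs exist (image-text
    # info dir, raw image dir, or neither), process the files in parallel, then
    # dump counts and optional .npy side outputs.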
    files = glob.glob(FLAGS.input_directory + '/*')
    if FLAGS.num_max_inputs:
        files = files[:FLAGS.num_max_inputs]

    print(FLAGS.input_directory, len(files))
    print('small_feature:', FLAGS.small_feature)
    print('image_dir:', FLAGS.image_dir, 'info_dir:', FLAGS.info_dir)
    if FLAGS.info_dir and os.path.exists(FLAGS.info_dir):
        print('deal imgtextfile', file=sys.stderr)
        deal_file_fn = deal_imgtextfile
        read_pic_info()
    elif FLAGS.image_dir and os.path.exists(FLAGS.image_dir):
        print('deal file with imgdir', file=sys.stderr)
        deal_file_fn = deal_file_with_imgdir
    else:
        print('normal deal file', file=sys.stderr)
        deal_file_fn = deal_file

    if FLAGS.threads > len(files):
        FLAGS.threads = len(files)
    if FLAGS.threads > 1:
        pool = multiprocessing.Pool(processes=FLAGS.threads)
        pool.map(deal_file_fn, files)
        pool.close()
        pool.join()
    else:
        for file in files:
            deal_file_fn(file)

    num_images = image_counter.value
    print('num_images:', num_images)

    num_records = record_counter.value
    print('num_records:', num_records)
    gezi.write_to_txt(num_records,
                      os.path.join(FLAGS.output_directory, 'num_records.txt'))

    print('num_records_per_image', num_records / num_images)

    print('num_texts', len(gtexts))

    print('max_num_words:', max_num_words.value)
    print('avg_num_words:', sum_words.value / num_records)

    if FLAGS.np_save:
        print('len(texts):', len(gtexts))
        np.save(os.path.join(FLAGS.output_directory, 'texts.npy'),
                np.array(gtexts))
        np.save(os.path.join(FLAGS.output_directory, 'text_strs.npy'),
                np.array(gtext_strs))

        np.save(os.path.join(FLAGS.output_directory, 'image_labels.npy'),
                image_labels)

        np.save(os.path.join(FLAGS.output_directory, 'image_names.npy'),
                np.array(image_names))
        np.save(os.path.join(FLAGS.output_directory, 'image_features.npy'),
                np.array(image_features))
Example #8
def run():
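    # Same flow as the previous example, but with a single deal_file handler
    # for every input file.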
    files = glob.glob(FLAGS.input_directory + '/*')
    if FLAGS.num_max_inputs:
        files = files[:FLAGS.num_max_inputs]

    print(FLAGS.input_directory, len(files))

    if FLAGS.threads > len(files):
        FLAGS.threads = len(files)
    if FLAGS.threads > 1:
        pool = multiprocessing.Pool(processes=FLAGS.threads)

        pool.map(deal_file, files)

        pool.close()
        pool.join()
    else:
        for file in files:
            deal_file(file)

    num_images = image_counter.value
    print('num_images:', num_images)

    num_records = record_counter.value
    print('num_records:', num_records)
    gezi.write_to_txt(num_records,
                      os.path.join(FLAGS.output_directory, 'num_records.txt'))

    print('num_records_per_image', num_records / num_images)

    print('num_texts', len(gtexts))

    print('max_num_words:', max_num_words.value)
    print('avg_num_words:', sum_words.value / num_records)

    if FLAGS.np_save:
        print('len(texts):', len(gtexts))
        np.save(os.path.join(FLAGS.output_directory, 'texts.npy'),
                np.array(gtexts))
        np.save(os.path.join(FLAGS.output_directory, 'text_strs.npy'),
                np.array(gtext_strs))

        np.save(os.path.join(FLAGS.output_directory, 'image_labels.npy'),
                image_labels)

        np.save(os.path.join(FLAGS.output_directory, 'image_names.npy'),
                np.array(image_names))
        np.save(os.path.join(FLAGS.output_directory, 'image_features.npy'),
                np.array(image_features))
Example #9
def main(_):
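    # Minimal variant: build features for every file matching FLAGS.input and
    # record the total count under FLAGS.out_dir.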
    global dataset
    dataset = Dataset()

    pool = multiprocessing.Pool()

    files = glob.glob(FLAGS.input)
    print('input', FLAGS.input)

    pool.map(build_features, files)
    pool.close()
    pool.join()

    print('num_records:', counter.value)

    out_file = '{}/num_records.txt'.format(FLAGS.out_dir)
    gezi.write_to_txt(counter.value, out_file)
Example #10
def main(_):
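    # csv input sharded by record index across a worker pool, with word and
    # char vocabularies loaded up front.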
    tokenizer.init(FLAGS.tokenizer_vocab)
    global examples, vocab, char_vocab
    examples = pd.read_csv(FLAGS.input)
    #if 'train' in FLAGS.input:
    #  examples = shuffle(examples, random_state=1024)
    vocab = Vocabulary(FLAGS.vocab)
    char_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))

    pool = multiprocessing.Pool()
    pool.map(build_features, range(FLAGS.num_records))
    pool.close()
    pool.join()

    # build_features(0)

    print('num_records:', counter.value)
    mode = 'train' if 'train' in FLAGS.input else 'test'
    out_file = os.path.dirname(
        FLAGS.vocab) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)
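# build_features is defined elsewhere in the repo; a hypothetical sketch of the
# shard-by-index pattern used above (assumes `examples`, `counter`, and a
# to_tf_example() helper; not the repo's actual code):
import tensorflow as tf

def build_features(index):
    out_file = 'record_%d.tfrecord' % index
    with tf.io.TFRecordWriter(out_file) as writer:
        # stride over the rows so each shard gets a disjoint subset
        for i in range(index, len(examples), FLAGS.num_records):
            writer.write(to_tf_example(examples.iloc[i]).SerializeToString())
            with counter.get_lock():
                counter.value += 1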
Example #11
def main(_):
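    # Same as the minimal variant above, but creates FLAGS.out_dir first if it
    # does not already exist.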
    global dataset
    dataset = Dataset()

    pool = multiprocessing.Pool()

    files = glob.glob(FLAGS.input)
    print('input', FLAGS.input)

    if not os.path.exists(FLAGS.out_dir):
        print('make new dir: [%s]' % FLAGS.out_dir, file=sys.stderr)
        os.makedirs(FLAGS.out_dir)

    pool.map(build_features, files)
    pool.close()
    pool.join()

    print('num_records:', counter.value)

    out_file = '{}/num_records.txt'.format(FLAGS.out_dir)
    gezi.write_to_txt(counter.value, out_file)
Example #12
def run():
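    # Text-pair variant: left/right texts end up in the image_* and text_*
    # .npy outputs respectively (see the HACK note below).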
    files = glob.glob(FLAGS.input_dir + '/*')
    if FLAGS.num_max_inputs:
        files = files[:FLAGS.num_max_inputs]

    print(FLAGS.input_dir, len(files))

    if FLAGS.threads > len(files):
        FLAGS.threads = len(files)
    if FLAGS.threads > 1:
        pool = multiprocessing.Pool(processes=FLAGS.threads)

        pool.map(deal_file, files)

        pool.close()
        pool.join()
    else:
        for file in files:
            deal_file(file)

    num_records = counter.value
    print('num_records:', num_records)
    gezi.write_to_txt(num_records,
                      os.path.join(FLAGS.output_dir, 'num_records.txt'))

    print('counter:', counter.value)
    print('max_num_words:', max_num_words.value)
    print('avg_num_words:', sum_words.value / counter.value)

    if FLAGS.np_save:
        # HACK/TODO: image_names are stored as ltext_strs, image_features as
        # ltexts (ids); texts as rtexts (ids), text_strs as rtext_strs
        np.save(os.path.join(FLAGS.output_dir, 'image_names.npy'),
                np.array(ltext_strs))
        np.save(os.path.join(FLAGS.output_dir, 'image_features.npy'),
                np.array(ltexts))

        np.save(os.path.join(FLAGS.output_dir, 'texts.npy'), np.array(rtexts))
        np.save(os.path.join(FLAGS.output_dir, 'text_strs.npy'),
                np.array(rtext_strs))
Example #13
def main(_):
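    # csv input via pandas; the valid/test/dev/pm splits always produce a
    # single record file, while training may produce FLAGS.num_records_ shards.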
    text2ids.init(FLAGS.vocab_)
    print('to_lower:', FLAGS.to_lower, 'feed_single_en:', FLAGS.feed_single_en,
          'seg_method', FLAGS.seg_method)
    print(text2ids.ids2text(text2ids_('傻逼脑残B')))
    print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

    global df
    df = pd.read_csv(FLAGS.input, lineterminator='\n')

    mode = get_mode(FLAGS.input)

    pool = multiprocessing.Pool()

    if mode in ['valid', 'test', 'dev', 'pm']:
        FLAGS.num_records_ = 1

    print('num records file to gen', FLAGS.num_records_)

    #FLAGS.num_records_ = 1

    pool.map(build_features, range(FLAGS.num_records_))
    pool.close()
    pool.join()

    #build_features(FLAGS.input)

    # for safety: some machines might not use the cpu count as the default ...
    print('num_records:', counter.value)

    os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
    out_file = os.path.dirname(
        FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)

    print('mean words:', total_words.value / counter.value)
Example #14

record = []
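# One worker process per thread index; when FLAGS.threads > 1 each process
# reads its own numbered split of FLAGS.input ('<input>_<i>').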
for thread_index in range(FLAGS.threads):
    in_file = '{}_{}'.format(
        FLAGS.input, thread_index) if FLAGS.threads > 1 else FLAGS.input
    args = (in_file, thread_index)
    process = multiprocessing.Process(target=deal_file, args=args)
    process.start()
    record.append(process)

for process in record:
    process.join()

if FLAGS.np_save:
    texts = [val for sublist in texts_dict.values() for val in sublist]
    text_strs = [val for sublist in text_strs_dict.values() for val in sublist]

    print('len(texts):', len(texts))
    np.save(os.path.join(FLAGS.output_directory, 'texts.npy'), np.array(texts))
    np.save(os.path.join(FLAGS.output_directory, 'text_strs.npy'),
            np.array(text_strs))

print('output_directory:', FLAGS.output_directory)
gezi.write_to_txt(counter.value,
                  os.path.join(FLAGS.output_directory, 'num_records.txt'))

print('counter:', counter.value)
print('max_num_words:', max_num_words.value)
print('avg_num_words:', sum_words.value / counter.value)
Example #15
record = []
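# Same process-per-split pattern as the previous example; only the final
# count reporting differs.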
for thread_index in range(FLAGS.threads):
    in_file = '{}_{}'.format(
        FLAGS.input, thread_index) if FLAGS.threads > 1 else FLAGS.input
    args = (in_file, thread_index)
    process = multiprocessing.Process(target=deal_file, args=args)
    process.start()
    record.append(process)

for process in record:
    process.join()

if FLAGS.np_save:
    texts = [val for sublist in texts_dict.values() for val in sublist]
    text_strs = [val for sublist in text_strs_dict.values() for val in sublist]

    print('len(texts):', len(texts))
    np.save(os.path.join(FLAGS.output_directory, 'texts.npy'), np.array(texts))
    np.save(os.path.join(FLAGS.output_directory, 'text_strs.npy'),
            np.array(text_strs))

num_records = counter.value
print('num_records:', num_records)
gezi.write_to_txt(num_records,
                  os.path.join(FLAGS.output_directory, 'num_records.txt'))

print('counter:', counter.value)
print('max_num_words:', max_num_words.value)
print('avg_num_words:', sum_words.value / counter.value)
Example #16
def main(_):
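    # The most involved variant: loads word/char/pos/ner vocabularies plus
    # pre-computed segmentation/POS/NER results keyed by example id, then
    # shards TFRecord generation over the csv input.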
    mode = get_mode(FLAGS.input)

    assert FLAGS.use_fold
    #text2ids.init(FLAGS.vocab_)
    global vocab, char_vocab, pos_vocab, ner_vocab, seg_result, pos_result, ner_result
    #vocab = text2ids.vocab
    vocab = gezi.Vocabulary(FLAGS.vocab_,
                            fixed=FLAGS.fixed_vocab,
                            unk_word=FLAGS.unk_word)
    print('vocab size:', vocab.size())
    char_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'char_vocab.txt')
    if os.path.exists(char_vocab_file):
        char_vocab = Vocabulary(char_vocab_file)
        print('char vocab size:', char_vocab.size())
    pos_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'pos_vocab.txt')
    if os.path.exists(pos_vocab_file):
        pos_vocab = Vocabulary(pos_vocab_file)
        print('pos vocab size:', pos_vocab.size())
    ner_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'ner_vocab.txt')
    if os.path.exists(ner_vocab_file):
        ner_vocab = Vocabulary(ner_vocab_file)
        print('ner vocab size:', ner_vocab.size())

    mode_ = 'train'
    if 'valid' in FLAGS.input:
        mode_ = 'valid'
    elif 'test' in FLAGS.input:
        mode_ = 'test'
    else:
        assert 'train' in FLAGS.input

    if FLAGS.augument:
        mode_ = 'aug.' + mode_

    if FLAGS.mode_:
        mode_ = FLAGS.mode_

    seg_file = FLAGS.vocab_.replace('vocab.txt', '%s.seg.txt' % mode_)
    seg_result = {}
    if os.path.exists(seg_file):
        print('seg or seg_pos exists:', seg_file)
        pos_result = {}
        for line in open(seg_file):
            id, segs = line.rstrip('\n').split('\t', 1)
            segs = segs.split('\x09')
            if FLAGS.ignore_start_end:
                segs = segs[1:-1]
            if '|' in segs[0] and not FLAGS.word_only:
                l = [x.rsplit('|', 1) for x in segs]
                segs, pos = list(zip(*l))
                pos_result[id] = pos
            seg_result[id] = segs

    seg_done = bool(seg_result)
    ner_file = FLAGS.vocab_.replace('vocab.txt', '%s.ner.txt' % mode_)
    ner_result = {}
    if os.path.exists(ner_file):
        print('seg_ner exists:', ner_file)
        for line in open(ner_file):
            id, segs = line.rstrip('\n').split('\t', 1)
            segs = segs.split('\x09')
            if FLAGS.ignore_start_end:
                segs = segs[1:-1]
            if '|' in segs[0]:
                l = [x.split('|') for x in segs]
                segs, ner = list(zip(*l))
                ner_result[id] = ner
            if not seg_done:
                seg_result[id] = segs

    print('len(seg_result)', len(seg_result))
    print('len(ner_result)', len(ner_result))

    # print('to_lower:', FLAGS.to_lower, 'feed_single:', FLAGS.feed_single, 'feed_single_en:', FLAGS.feed_single_en, 'seg_method', FLAGS.seg_method)
    # print(text2ids.ids2text(text2ids_('傻逼脑残B')))
    # print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

    global df
    df = pd.read_csv(FLAGS.input, lineterminator='\n')

    pool = multiprocessing.Pool()

    if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'] or 'valid' in FLAGS.input:
        FLAGS.num_records_ = 1

    print('num records file to gen', FLAGS.num_records_)

    #FLAGS.num_records_ = 1

    pool.map(build_features, range(FLAGS.num_records_))
    pool.close()
    pool.join()

    # for i in range(FLAGS.num_records_):
    #   build_features(i)

    # for safety: some machines might not use the cpu count as the default ...
    print('num_records:', counter.value)

    os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
    out_file = os.path.dirname(
        FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)

    print('mean words:', total_words.value / counter.value)