def main(_):
    mode = get_mode(FLAGS.input)

    global vocab, char_vocab
    vocab = gezi.Vocabulary(FLAGS.vocab_,
                            fixed=FLAGS.fixed_vocab,
                            unk_word=FLAGS.unk_word)
    char_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'char_vocab.txt')
    if os.path.exists(char_vocab_file):
        char_vocab = Vocabulary(char_vocab_file)
        print('char vocab size:', char_vocab.size())

    mode_ = 'train'
    if 'valid' in FLAGS.input:
        mode_ = 'valid'
    elif 'test' in FLAGS.input:
        mode_ = 'test'
    else:
        assert 'train' in FLAGS.input

    if FLAGS.augument:
        mode_ = 'aug.' + mode_

    if FLAGS.mode_:
        mode_ = FLAGS.mode_

    global df
    df = []
    for line in open(FLAGS.input):
        df.append(line.strip().split('\t', 3))

    pool = multiprocessing.Pool()

    if not FLAGS.num_records_:
        # one record file regardless of mode (valid/test/dev/pm and train alike)
        FLAGS.num_records_ = 1

    print('num records file to gen', FLAGS.num_records_)

    #FLAGS.num_records_ = 1

    pool.map(build_features, range(FLAGS.num_records_))
    pool.close()
    pool.join()

    # for i in range(FLAGS.num_records_):
    #   build_features(i)

    # to be safe: some machines might not use the cpu count as the default pool size
    print('num_records:', counter.value)

    os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
    out_file = os.path.dirname(
        FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)

    print('mean words:', total_words.value / counter.value)
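
The shared counter and total_words objects used above are defined elsewhere in this script; below is a minimal sketch of how such cross-process counters could be set up with multiprocessing.Value (the names and helper are assumptions, not the original definitions):

import multiprocessing

# process-safe counters, inherited by the Pool workers on fork
counter = multiprocessing.Value('i', 0)
total_words = multiprocessing.Value('i', 0)

def record_example(num_words):
    # each build_features worker would bump the counters under their locks
    with counter.get_lock():
        counter.value += 1
    with total_words.get_lock():
        total_words.value += num_words
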
Example #2
def main(_):
  num_conflicts = 0
  visited = {}
  visited_ngram = {}
  ngram_vocab_path = FLAGS.ngram_vocab or os.path.join(FLAGS.dir, 'ngram_vocab.txt')
  ngram_vocab = Vocabulary(ngram_vocab_path)
  print('ngram_vocab size', ngram_vocab.size())
  print('num ngram buckets', FLAGS.ngram_buckets)
  if FLAGS.emb.endswith('.npy'):
    ngram_emb = np.load(FLAGS.emb)
    assert len(ngram_emb) > 100000
  else:
    ngram_emb = []
    for line in open(FLAGS.emb):
      ngram_emb.append([float(x) for x in line.strip().split()])
  print('len ngram emb', len(ngram_emb))
  emb_mat = []
  vec_size = FLAGS.emb_dim
  # for padding zero
  emb_mat.append(np.array([0.] * vec_size))
  # rows 1-3 cover unk, <s>, </s>; the ngram vocab txt does not include
  # these (nor pad), so append small random vectors for them here
  for i in range(3):
    emb_mat.append([np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])

  for i in range(4, ngram_vocab.size()):
    ngram = ngram_vocab.key(i)
    ngram_hash = gezi.hash(ngram)
    ngram_id = ngram_hash % FLAGS.ngram_buckets
    if ngram_id not in visited:
      visited[ngram_id] = 1
      visited_ngram[ngram_id] = [ngram]
    else:
      visited[ngram_id] += 1
      visited_ngram[ngram_id].append(ngram)
      num_conflicts += 1
      #print('Conflict', visited_ngram[ngram_id], 'Num conflicts', num_conflicts)
    emb_mat.append(ngram_emb[ngram_id])
  print('Num conflicts', num_conflicts)

  print('len(emb_mat)', len(emb_mat))
  ngram_output = FLAGS.ngram_output or 'ngram.npy'
  out_mat = os.path.join(FLAGS.dir, ngram_output)
  print('out mat', out_mat)
  np.save(out_mat, np.array(emb_mat))
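
A self-contained sketch of the bucket-collision counting used above, with hashlib.md5 standing in for gezi.hash (an assumption; the real hash function may differ):

import hashlib

def bucket(ngram, num_buckets=200000):
  # stable hash -> bucket id, mirroring gezi.hash(ngram) % FLAGS.ngram_buckets
  return int(hashlib.md5(ngram.encode('utf8')).hexdigest(), 16) % num_buckets

seen = {}
num_conflicts = 0
for ng in ['ab', 'bc', 'cd', 'de']:  # toy ngram list
  b = bucket(ng)
  num_conflicts += b in seen         # True counts as 1 on a collision
  seen.setdefault(b, []).append(ng)
print('Num conflicts', num_conflicts)
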
def main(_):
    tokenizer.init(FLAGS.tokenizer_vocab)
    global examples, vocab, char_vocab
    examples = pd.read_csv(FLAGS.input)
    #if 'train' in FLAGS.input:
    #  examples = shuffle(examples, random_state=1024)
    vocab = Vocabulary(FLAGS.vocab)
    char_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))

    pool = multiprocessing.Pool()
    pool.map(build_features, range(FLAGS.num_records))
    pool.close()
    pool.join()

    # build_features(0)

    print('num_records:', counter.value)
    mode = 'train' if 'train' in FLAGS.input else 'test'
    out_file = os.path.dirname(
        FLAGS.vocab) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)
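
gezi.write_to_txt itself is not shown in this listing; judging only from its use above, it presumably writes a single value to a text file, roughly like this hedged guess (not the library's actual code):

def write_to_txt(value, out_file):
    # dump one value, e.g. the record count, as plain text
    with open(out_file, 'w') as f:
        f.write(str(value))
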
def main(_):  
  os.system('mkdir -p %s' % FLAGS.dir)
  tokenizer.init(FLAGS.tokenizer_vocab)
  global examples, vocab, unk_vocab, char_vocab, pos_vocab, tag_vocab, ner_vocab, ngram_vocab
  examples = pd.read_csv(FLAGS.input)
  #if 'train' in FLAGS.input:
  #  examples = shuffle(examples, random_state=1024)
  vocab = Vocabulary(FLAGS.vocab)
  # unk_vocab is actually a small vocab, so it will generate unk tokens for training
  #unk_vocab =  Vocabulary(FLAGS.vocab.replace('vocab.txt', 'unk_vocab.txt'))
  char_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'char_vocab.txt'))
  pos_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'pos_vocab.txt'))
  tag_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'tag_vocab.txt'))
  ner_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ner_vocab.txt'))
  ngram_vocab = Vocabulary(FLAGS.vocab.replace('vocab.txt', 'ngram_vocab.txt'))

  global enprob_dict
  enprob_dict = {}
  enprob_file = '~/data/kaggle/toxic/train.enprob.csv' if 'train' in FLAGS.input else '~/data/kaggle/toxic/test.enprob.csv'
  enprob_df = pd.read_csv(enprob_file)
  for id, enprob in zip(enprob_df['id'].values, enprob_df['enprob'].values):
    enprob_dict[id] = enprob
  enprob_dict['0'] = 1.

  pool = multiprocessing.Pool()
  pool.map(build_features, range(FLAGS.num_records))
  pool.close()
  pool.join()

  #build_features(0)

  print('num_records:', counter.value)
  mode = get_mode()
  out_file = os.path.dirname(FLAGS.vocab) + '/{0}/num_records.txt'.format(mode)
  gezi.write_to_txt(counter.value, out_file)
def main(_):
    assert FLAGS.use_char is not None

    global vocab, char_vocab, pos_vocab
    vocab = gezi.Vocabulary(FLAGS.vocab_)
    print('vocab size:', vocab.size())
    char_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'char_vocab.txt')
    if os.path.exists(char_vocab_file):
        char_vocab = Vocabulary(char_vocab_file)
        print('char vocab size:', char_vocab.size())
    pos_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'pos_vocab.txt')
    if os.path.exists(pos_vocab_file):
        pos_vocab = Vocabulary(pos_vocab_file)
        print('pos vocab size:', pos_vocab.size())

    if os.path.isfile(FLAGS.input):
        build_features(FLAGS.input)
    else:
        files = glob.glob(FLAGS.input + '/*')
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        pool.map(build_features, files)
        pool.close()
        pool.join()

    # to be safe: some machines might not use the cpu count as the default pool size
    print('num_records:', counter.value)
    mode = get_mode(FLAGS.input)

    os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
    out_file = os.path.dirname(
        FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)

    print('mean words:', total_words.value / counter.value)
Example #6
def main(_):
    mode = get_mode(FLAGS.input)

    assert FLAGS.use_fold
    #text2ids.init(FLAGS.vocab_)
    global vocab, char_vocab, pos_vocab, ner_vocab, seg_result, pos_result, ner_result
    #vocab = text2ids.vocab
    vocab = gezi.Vocabulary(FLAGS.vocab_,
                            fixed=FLAGS.fixed_vocab,
                            unk_word=FLAGS.unk_word)
    print('vocab size:', vocab.size())
    char_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'char_vocab.txt')
    if os.path.exists(char_vocab_file):
        char_vocab = Vocabulary(char_vocab_file)
        print('char vocab size:', char_vocab.size())
    pos_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'pos_vocab.txt')
    if os.path.exists(pos_vocab_file):
        pos_vocab = Vocabulary(pos_vocab_file)
        print('pos vocab size:', pos_vocab.size())
    ner_vocab_file = FLAGS.vocab_.replace('vocab.txt', 'ner_vocab.txt')
    if os.path.exists(ner_vocab_file):
        ner_vocab = Vocabulary(ner_vocab_file)
        print('ner vocab size:', ner_vocab.size())

    mode_ = 'train'
    if 'valid' in FLAGS.input:
        mode_ = 'valid'
    elif 'test' in FLAGS.input:
        mode_ = 'test'
    else:
        assert 'train' in FLAGS.input

    if FLAGS.augument:
        mode_ = 'aug.' + mode_

    if FLAGS.mode_:
        mode_ = FLAGS.mode_

    seg_file = FLAGS.vocab_.replace('vocab.txt', '%s.seg.txt' % mode_)
    seg_result = {}
    if os.path.exists(seg_file):
        print('seg or seg_pos exists:', seg_file)
        pos_result = {}
        for line in open(seg_file):
            id, segs = line.rstrip('\n').split('\t', 1)
            segs = segs.split('\x09')
            if FLAGS.ignore_start_end:
                segs = segs[1:-1]
            if '|' in segs[0] and not FLAGS.word_only:
                l = [x.rsplit('|', 1) for x in segs]
                segs, pos = list(zip(*l))
                pos_result[id] = pos
            seg_result[id] = segs

    seg_done = True if seg_result else False
    ner_file = FLAGS.vocab_.replace('vocab.txt', '%s.ner.txt' % mode_)
    ner_result = {}
    if os.path.exists(ner_file):
        print('seg_ner exists:', ner_file)
        for line in open(ner_file):
            id, segs = line.rstrip('\n').split('\t', 1)
            segs = segs.split('\x09')
            if FLAGS.ignore_start_end:
                segs = segs[1:-1]
            if '|' in segs[0]:
                l = [x.split('|') for x in segs]
                segs, ner = list(zip(*l))
                # only record ner when the line actually carries word|ner pairs;
                # otherwise ner would be undefined here
                ner_result[id] = ner
            if not seg_done:
                seg_result[id] = segs

    print('len(seg_result)', len(seg_result))
    print('len(ner_result)', len(ner_result))

    # print('to_lower:', FLAGS.to_lower, 'feed_single:', FLAGS.feed_single, 'feed_single_en:', FLAGS.feed_single_en, 'seg_method', FLAGS.seg_method)
    # print(text2ids.ids2text(text2ids_('傻逼脑残B')))
    # print(text2ids.ids2text(text2ids_('喜欢玩孙尚香的加我好友:2948291976')))

    global df
    df = pd.read_csv(FLAGS.input, lineterminator='\n')

    pool = multiprocessing.Pool()

    if mode.split('.')[-1] in ['valid', 'test', 'dev', 'pm'
                               ] or 'valid' in FLAGS.input:
        FLAGS.num_records_ = 1

    print('num records file to gen', FLAGS.num_records_)

    #FLAGS.num_records_ = 1

    pool.map(build_features, range(FLAGS.num_records_))
    pool.close()
    pool.join()

    # for i in range(FLAGS.num_records_):
    #   build_features(i)

    # to be safe: some machines might not use the cpu count as the default pool size
    print('num_records:', counter.value)

    os.system('mkdir -p %s/%s' % (os.path.dirname(FLAGS.vocab_), mode))
    out_file = os.path.dirname(
        FLAGS.vocab_) + '/{0}/num_records.txt'.format(mode)
    gezi.write_to_txt(counter.value, out_file)

    print('mean words:', total_words.value / counter.value)
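
For reference, a hedged illustration of the seg/pos line format the parser above expects: an id, a tab, then tab-separated word|pos tokens (the sample id and tokens are made up):

line = '12345\t<S>|x\t今天|t\t天气|n\t</S>|x'
id_, segs = line.rstrip('\n').split('\t', 1)
segs = segs.split('\x09')            # '\x09' is just the tab character
segs = segs[1:-1]                    # drop <S> ... </S>, as when ignore_start_end is set
words, pos = zip(*[x.rsplit('|', 1) for x in segs])
print(id_, words, pos)               # 12345 ('今天', '天气') ('t', 'n')
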
Example #7
def main(_):
    input_vocab = os.path.join(FLAGS.dir, 'vocab.full.txt')
    ft_vocab = Vocabulary(os.path.join(os.path.dirname(FLAGS.emb),
                                       'vocab.txt'),
                          fixed=True)
    lines = open(input_vocab).readlines()

    ori_words_counts = [x.rstrip('\n').split('\t') for x in lines]
    # TODO FIXME why must these blank entries be filtered out? otherwise converting
    # counts to int later raises ValueError: invalid literal for int() with base 10: '    '
    ori_words_counts = filter(lambda x: x[0].strip(), ori_words_counts)
    ori_words, counts = zip(*ori_words_counts)
    counts = list(map(int, counts))
    ori_set = set(ori_words)

    normed_ori_set = set([x.lower() for x in ori_set])

    embedding_dict = {}

    ngrams = []

    vec_size = FLAGS.emb_dim
    with open(FLAGS.emb, 'r', encoding='utf-8') as fh:
        #for line in tqdm(fh, total=2196017):
        for i, line in enumerate(fh):
            array = line.split()
            # fasttext txt has header line
            if len(array) < vec_size:
                continue
            vector = list(map(float, array))
            if i >= ft_vocab.size():
                ngrams.append(vector)
                continue
            word = ft_vocab.key(i)
            if word.lower() in normed_ori_set:
                embedding_dict[word] = vector
            if i % 100000 == 0:
                print(i)
                #break
    print("{} / {} tokens have corresponding word embedding vector".format(
        len(embedding_dict), len(ori_words)))

    words = []
    emb_mat = []
    # for padding zero
    emb_mat.append(np.array([0.] * vec_size))
    if not '<UNK>' in ori_set:
        #change from all 0 to random normal for unk
        #emb_mat.append([np.random.normal(scale=0.1) for _ in range(vec_size)])
        emb_mat.append(
            [np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])
        words.append('<UNK>')
    if not '<S>' in ori_set:
        emb_mat.append(
            [np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])
        words.append('<S>')
    if not '</S>' in ori_set:
        emb_mat.append(
            [np.random.uniform(-0.08, 0.08) for _ in range(vec_size)])
        words.append('</S>')

    with open('/home/gezi/tmp/rare_words.txt', 'w') as rare_out:
        for word, count in zip(ori_words, counts):
            if FLAGS.type == 'normal':
                if word in embedding_dict:
                    emb_mat.append(np.array(embedding_dict[word]))
                    words.append(word)
                else:
                    if count >= FLAGS.min_count:
                        print('%s %d' % (word, count), file=rare_out)
                        #emb_mat.append([np.random.normal(scale=0.1) for _ in range(vec_size)])
                        emb_mat.append([
                            np.random.uniform(-0.08, 0.08)
                            for _ in range(vec_size)
                        ])
                        words.append(word)
            elif FLAGS.type == 'scratch':
                if count >= FLAGS.min_count:
                    if word in embedding_dict:
                        emb_mat.append(np.array(embedding_dict[word]))
                        words.append(word)
                    else:
                        #emb_mat.append([np.random.normal(scale=0.1) for _ in range(vec_size)])
                        emb_mat.append([
                            np.random.uniform(-0.08, 0.08)
                            for _ in range(vec_size)
                        ])
                        words.append(word)
            elif FLAGS.type == 'only':
                if word in embedding_dict:
                    emb_mat.append(np.array(embedding_dict[word]))
                    words.append(word)

    words_set = set(words)

    for word, count in zip(ori_words, counts):
        if word not in words_set:
            contains = False
            for w in (word.lower(), word.capitalize(), word.upper()):
                if w in words_set:
                    contains = True
            if not contains:
                for w in (word.lower(), word.capitalize(), word.upper()):
                    if w in embedding_dict:
                        print('adding....', w, word)
                        words_set.add(w)
                        emb_mat.append(np.array(embedding_dict[w]))
                        words.append(w)
                        break

    out_vocab = os.path.join(FLAGS.dir, 'vocab.txt')
    print('out vocab size', len(words), 'ori ft vocab size', ft_vocab.size())
    with open(out_vocab, 'w') as out:
        for word in words:
            print(word, file=out)

    out_mat = os.path.join(FLAGS.dir, FLAGS.out_name)

    emb_mat += ngrams

    # # check
    # ids = gezi.fasttext_ids('you', Vocabulary(out_vocab), FLAGS.ngram_buckets, 3, 3)
    # print('---------ids', ids)
    # vectors = []
    # for id in ids:
    #   vectors.append(emb_mat[id])
    # vectors = np.stack(vectors)
    # print(np.mean(vectors, 0))

    print('len(emb_mat)', len(emb_mat))
    np.save(out_mat, np.array(emb_mat))
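
A hedged usage sketch for the artifacts written above: rows of the saved matrix follow the line order of vocab.txt, with one extra padding row at index 0 (the path and file name below are illustrative, standing in for FLAGS.dir and FLAGS.out_name):

import os
import numpy as np

out_dir = '/path/to/output/dir'
emb = np.load(os.path.join(out_dir, 'emb.npy'))
words = [w.rstrip('\n') for w in open(os.path.join(out_dir, 'vocab.txt'))]
word2row = {w: i + 1 for i, w in enumerate(words)}  # +1 skips the pad row at index 0
print(emb[word2row[words[0]]][:5])
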
Example #8
def init(vocab_path_=None, append=None):
    global vocab, vocab_size, vocab_path
    if vocab is None:
        if not FLAGS.vocab_buckets:
            vocab_path = vocab_path_ or FLAGS.vocab or gezi.dirname(
                FLAGS.model_dir) + '/vocab.txt'
            FLAGS.vocab = vocab_path
            logging.info('vocab:{}'.format(vocab_path))
            logging.info('NUM_RESERVED_IDS:{}'.format(FLAGS.num_reserved_ids))
            if append is None:
                append = FLAGS.vocab_append
                if gezi.env_has('VOCAB_APPEND'):
                    append = True
            vocab = Vocabulary(vocab_path,
                               FLAGS.num_reserved_ids,
                               append=append,
                               max_words=FLAGS.vocab_max_words,
                               min_count=FLAGS.vocab_min_count)
        else:
            vocab = Vocabulary(buckets=FLAGS.vocab_buckets)
        vocab_size = vocab.size() if not FLAGS.vocab_size else min(
            vocab.size(), FLAGS.vocab_size)
        logging.info('vocab_size:{}'.format(vocab_size))
        assert vocab_size > FLAGS.num_reserved_ids, 'empty vocab, wrong vocab path? %s' % FLAGS.vocab
        logging.info('vocab_start:{} id:{}'.format(vocab.key(vocab.start_id()),
                                                   vocab.start_id()))
        logging.info('vocab_end:{} id:{}'.format(vocab.key(vocab.end_id()),
                                                 vocab.end_id()))
        logging.info('vocab_unk:{} id:{}'.format(vocab.key(vocab.unk_id()),
                                                 vocab.unk_id()))
Example #9
def init(vocab_path=None):
    global vocab, vocab_size
    if vocab is None:
        if vocab_path is None:
            vocab_path = FLAGS.vocab
        logging.info('vocab:{}'.format(vocab_path))
        logging.info('NUM_RESERVED_IDS:{}'.format(FLAGS.num_reserved_ids))
        vocab = Vocabulary(vocab_path, FLAGS.num_reserved_ids)
        vocab_size = vocab.size() if not FLAGS.vocab_size else min(
            vocab.size(), FLAGS.vocab_size)
        logging.info('vocab_size:{}'.format(vocab_size))
        assert vocab_size > FLAGS.num_reserved_ids, 'empty vocab, wrong vocab path? %s' % FLAGS.vocab
        logging.info('vocab_start:{} id:{}'.format(vocab.key(vocab.start_id()),
                                                   vocab.start_id()))
        logging.info('vocab_end:{} id:{}'.format(vocab.key(vocab.end_id()),
                                                 vocab.end_id()))
        logging.info('vocab_unk:{} id:{}'.format(vocab.key(vocab.unk_id()),
                                                 vocab.unk_id()))
Example #10
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys, os

from gezi import Vocabulary
import pandas as pd

dir = '/home/gezi/temp/toxic/v16/tfrecords/glove.lower/'

vocab = Vocabulary(dir + 'vocab.txt')

def run(input):
  total_tokens = 0
  total_unks = 0
  num_specials = 0 
  num_toxic = 0
  output = input.replace('.csv', '.numunks.csv')
  output_speial = input.replace('.csv', '.special.csv')
  df = pd.read_csv(input)
  ids = df['id'].values
  comments = df['tokens'].values 
  if 'toxic' not in df.columns:
    df['toxic'] = [0.] * len(comments)
  toxics = df['toxic'].values