logger.setLevel(logging.INFO)

# Earlier experiment on the segmented 200k dataset, kept for reference:
# brc_data = DatasetReader(train_file='./dataset/seg_train_data_20w.txt',
#                          dev_file='./dataset/seg_dev_data_20w.txt',
#                          # test_file='./dataset/test_data'
#                          )
brc_data = DatasetReader(train_file='../dataset/train_yes_no_8k.txt',
                         dev_file='../dataset/dev_yes_no_8k.txt')

import sys
from data.vocab import Vocab

# Build word- and character-level vocabularies over all splits.
vocab = Vocab(lower=True)
for word in brc_data.word_iter(None):
    vocab.add(word)
    for char in word:
        vocab.add_char(char)
logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
logger.info(' vocab size {}'.format(vocab.size()))

# Snapshot sizes, then drop words/chars seen fewer than 2 times.
unfiltered_vocab_size = vocab.size()
unfiltered_char_size = vocab.get_char_vocab_size()
vocab.filter_tokens_by_cnt(min_cnt=2)
vocab.filter_chars_by_cnt(min_cnt=2)

brc_data.convert_to_ids(vocab)
train_batches = brc_data.gen_mini_batches('train', 32, 0, shuffle=True)
# for batch in train_batches:
#     text_tokenized = [d['tokens'] for d in batch['raw_data']]
#     print(batcher.batch_sentences(text_tokenized))
#     sys.exit(1)
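
# The Vocab class itself lives in data/vocab.py and is not shown here. For
# reference, this is a minimal, self-contained sketch of the interface the
# script above assumes: the method names are taken from the calls in this
# file, while the internals (Counter-based counts, the SketchVocab name) are
# illustrative assumptions, not the project's actual implementation.
from collections import Counter

class SketchVocab:
    def __init__(self, lower=True):
        self.lower = lower
        self.token_cnt = Counter()  # word -> occurrence count
        self.char_cnt = Counter()   # char -> occurrence count

    def add(self, token):
        self.token_cnt[token.lower() if self.lower else token] += 1

    def add_char(self, char):
        self.char_cnt[char.lower() if self.lower else char] += 1

    def size(self):
        return len(self.token_cnt)

    def get_char_vocab_size(self):
        return len(self.char_cnt)

    def filter_tokens_by_cnt(self, min_cnt=2):
        # Keep only words that occur at least min_cnt times.
        self.token_cnt = Counter(
            {t: c for t, c in self.token_cnt.items() if c >= min_cnt})

    def filter_chars_by_cnt(self, min_cnt=2):
        # Keep only characters that occur at least min_cnt times.
        self.char_cnt = Counter(
            {ch: c for ch, c in self.char_cnt.items() if c >= min_cnt})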
brc_data = DatasetReader(
    test_file=args.input,
    bert_dir='/home/wujindou/chinese_L-12_H-768_A-12',
    # prefix='bert_meizhuang'
    # test_file=None,
)

import sys
from data.vocab import Vocab

# Build word- and character-level vocabularies over all splits.
vocab = Vocab(lower=True)
for word in brc_data.word_iter(None):
    vocab.add(word)
    for char in word:
        vocab.add_char(char)
logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
logger.info(' vocab size {}'.format(vocab.size()))

# Snapshot sizes before filtering so the number of dropped entries
# can be reported afterwards.
unfiltered_vocab_size = vocab.size()
unfiltered_char_size = vocab.get_char_vocab_size()
vocab.filter_tokens_by_cnt(min_cnt=2)
vocab.filter_chars_by_cnt(min_cnt=2)

filtered_num = unfiltered_vocab_size - vocab.size()
logger.info('After filtering {} tokens, the final vocab size is {}'.format(
    filtered_num, vocab.size()))
filtered_num = unfiltered_char_size - vocab.get_char_vocab_size()
logger.info('After filtering {} chars, the final char vocab size is {}'.format(
    filtered_num, vocab.get_char_vocab_size()))
logger.info('after loading embeddings the vocab size is {}'.format(vocab.size()))

brc_data.convert_to_ids(vocab)
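
# convert_to_ids is defined on DatasetReader and not shown in this file. The
# sketch below illustrates the usual token-to-id mapping such a call implies:
# look each token up in the vocabulary and fall back to an unknown-token id.
# All names here (tokens_to_ids, token2id, UNK_ID) are hypothetical
# stand-ins for demonstration, not the project's API.
UNK_ID = 1

def tokens_to_ids(tokens, token2id, unk_id=UNK_ID):
    """Map each token to its vocabulary id, using unk_id for OOV tokens."""
    return [token2id.get(token, unk_id) for token in tokens]

# Example:
#   tokens_to_ids(['how', 'many', 'xyzzy'], {'how': 2, 'many': 3})
#   -> [2, 3, 1]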
        yield token


if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("brc")
    logger.setLevel(logging.INFO)

    brc_data = DatasetReader(
        '/Users/apple/Downloads/news_qa/news_data_0827/news_data_0827_1w.csv')

    # Build the word vocabulary from the training split.
    from data.vocab import Vocab
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    # vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    brc_data.convert_to_ids(vocab)
    train_batches = brc_data.gen_mini_batches('train', batch_size=16)
    for batch in train_batches:
        print(batch['in'])
        sys.exit(1)  # inspect only the first batch, then stop
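
# gen_mini_batches is implemented inside DatasetReader and not shown here. A
# common implementation simply slices the (optionally shuffled) example list
# into fixed-size chunks; the standalone sketch below mirrors that behaviour.
# The function and parameter names are illustrative assumptions only.
import random

def gen_mini_batches_sketch(examples, batch_size, shuffle=False, seed=0):
    """Yield successive batches of at most batch_size examples."""
    indices = list(range(len(examples)))
    if shuffle:
        random.Random(seed).shuffle(indices)
    for start in range(0, len(indices), batch_size):
        yield [examples[i] for i in indices[start:start + batch_size]]

# Example:
#   list(gen_mini_batches_sketch(list(range(5)), batch_size=2))
#   -> [[0, 1], [2, 3], [4]]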