    logger.setLevel(logging.INFO)
    # brc_data = DatasetReader(train_file='./dataset/seg_train_data_20w.txt',
    #                          dev_file='./dataset/seg_dev_data_20w.txt',
    #                          # test_file ='./dataset/test_data'
    #                          )
    brc_data = DatasetReader(train_file='../dataset/train_yes_no_8k.txt',
                             dev_file='../dataset/dev_yes_no_8k.txt')
    from data.vocab import Vocab

    vocab = Vocab(lower=True)
    import sys

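    # build word- and character-level vocabularies; word_iter(None) presumably iterates over all loaded splits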
    for word in brc_data.word_iter(None):
        vocab.add(word)
        for char in word:
            vocab.add_char(char)
    logger.info('char size {}'.format(vocab.get_char_vocab_size()))
    logger.info('vocab size {}'.format(vocab.size()))

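    # record the unfiltered sizes, drop words/chars seen fewer than 2 times, then map the text to id sequences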
    unfiltered_vocab_size = vocab.size()
    unfiltered_char_size = vocab.get_char_vocab_size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    vocab.filter_chars_by_cnt(min_cnt=2)
    brc_data.convert_to_ids(vocab)

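    # shuffled training batches of size 32; the third positional argument is presumably the pad id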
    train_batches = brc_data.gen_mini_batches('train', 32, 0, shuffle=True)

    # for batch in train_batches:
    #     text_tokenized = [d['tokens'] for d in batch['raw_data']]
    #     print(batcher.batch_sentences(text_tokenized))
    #     sys.exit(1)
Example #2
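    # test-only setup: reads args.input and points at a local Chinese BERT checkpoint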
    brc_data = DatasetReader(
        test_file=args.input,
        bert_dir='/home/wujindou/chinese_L-12_H-768_A-12',
        prefix='bert_meizhuang'  # test_file=None
    )
    from data.vocab import Vocab
    vocab = Vocab(lower=True)
    import sys
    for word in brc_data.word_iter(None):
        vocab.add(word)
        for char in word:
            vocab.add_char(char)
    logger.info('char size {}'.format(vocab.get_char_vocab_size()))
    logger.info('vocab size {}'.format(vocab.size()))
    unfiltered_vocab_size = vocab.size()
    unfiltered_char_size = vocab.get_char_vocab_size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    vocab.filter_chars_by_cnt(min_cnt=2)

    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    filtered_num = unfiltered_char_size - vocab.get_char_vocab_size()
    logger.info('After filtering {} chars, the final char vocab size is {}'.format(
        filtered_num, vocab.get_char_vocab_size()))

    logger.info('after loading embeddings, the vocab size is {}'.format(vocab.size()))

    brc_data.convert_to_ids(vocab)
Example #3
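                    # tail of a generator; the enclosing function is cut off in this excerpt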
                    yield token


if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("brc")
    logger.setLevel(logging.INFO)
    brc_data = DatasetReader(
        '/Users/apple/Downloads/news_qa/news_data_0827/news_data_0827_1w.csv')
    sys.exit(1)  # early debug exit: the vocab and batching code below never runs in this example
    from data.vocab import Vocab

    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    # vocab.filter_tokens_by_cnt(min_cnt=2)

    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    brc_data.convert_to_ids(vocab)
    train_batches = brc_data.gen_mini_batches('train', batch_size=16)
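    # quick smoke test: print the 'in' field of the first training batch, then stop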
    for batch in train_batches:
        print(batch['in'])

        sys.exit(1)