def pdtb_prepare(args):
    """Load the PDTB dataset, build the word and tag vocabularies, and save
    everything (vocabs + torch-formatted dataset) under the experiment dir.

    Args:
        args: parsed CLI namespace; only ``args.task`` is read here — a task
            name starting with 'fine' selects level-2 (fine-grained) senses.

    Side effects: prints progress, creates ``PathConfig.experiment_data_dir``,
    and writes 'word_vocab.obj', 'tag_vocab.obj' and 'dataset.obj' there.
    """
    print('Loading dataset...')

    def _section_paths(section_nums):
        # Map section numbers to zero-padded directory paths (e.g. 2 -> '.../02').
        return [os.path.join(PathConfig.json_data_dir, '{:02}'.format(num))
                for num in section_nums]

    train_sections = _section_paths(PathConfig.train_sections)
    dev_sections = _section_paths(PathConfig.dev_sections)
    test_sections = _section_paths(PathConfig.test_sections)
    dataset = PDTBDataSet(train_sections, dev_sections, test_sections,
                          level=2 if args.task.startswith('fine') else 1)
    print('Size of train: {}, dev: {}, test: {}'.format(
        len(dataset.train_set), len(dataset.dev_set), len(dataset.test_set)))
    print('Creating word vocab...')
    # makedirs(..., exist_ok=True) is race-safe and also creates missing
    # parents, unlike the exists()-then-mkdir pattern.
    os.makedirs(PathConfig.experiment_data_dir, exist_ok=True)
    word_vocab = Vocab(
        mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    for word in dataset.get_all_words():
        word_vocab.add(word)
    word_vocab.load_pretrained_emb(PathConfig.embedding_path)
    print('Size of word vocab: {}'.format(word_vocab.size()))
    torch.save(word_vocab,
               os.path.join(PathConfig.experiment_data_dir, 'word_vocab.obj'))
    tag_vocab = Vocab()
    for tag in dataset.get_all_tags():
        tag_vocab.add(tag)
    print('Size of tag vocab: {}'.format(tag_vocab.size()))
    tag_vocab.init_embed(ModelConfig.tag_embed_dim)
    torch.save(tag_vocab,
               os.path.join(PathConfig.experiment_data_dir, 'tag_vocab.obj'))
    print('Formatting the dataset to torch variables...')
    dataset.format_instances_to_torch_var(word_vocab, tag_vocab)
    torch.save(dataset,
               os.path.join(PathConfig.experiment_data_dir, 'dataset.obj'))
def prepare(args):
    """Validate the BRC data files, create output directories, and build and
    pickle the vocabulary from the training split.

    Args:
        args: parsed CLI namespace providing the file lists (train/dev/test),
            directory paths, dataset size limits, and ``embed_size``.

    Raises:
        FileNotFoundError: if any configured data file is missing.

    Side effects: creates the vocab/model/result/summary dirs and writes
    'vocab.data' (pickled Vocab) into ``args.vocab_dir``.
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O`, which would silently skip this input validation.
    for data_path in args.train_files + args.dev_files + args.test_files:
        if not os.path.exists(data_path):
            raise FileNotFoundError(
                '{} file does not exist.'.format(data_path))
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir,
                     args.result_dir, args.summary_dir]:
        # exist_ok avoids the exists()-then-makedirs race.
        os.makedirs(dir_path, exist_ok=True)
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.gpus, args.batch_size, args.train_files,
                          args.dev_files, args.test_files)
    vocab = Vocab(init_random=False, trainable_oov_cnt_threshold=2)
    # Vocabulary is built from the training split only.
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    # Pretrained embeddings are intentionally disabled here; embeddings are
    # randomly initialized instead.
    # vocab.build_embedding_matrix(args.pretrained_word_path)
    vocab.randomly_init_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')
def prepare_data():
    """Build the PDTB train/dev/test datasets, persist them, then build and
    save the shared word vocabulary with pretrained embeddings.

    Reads configuration from the module-level ``paths`` and ``args`` objects;
    writes 'train.data', 'dev.data', 'test.data' and the vocab to disk.
    """
    def _section_dirs(section_nums):
        # '{:02}' zero-pads the section number to match on-disk dir names.
        return [os.path.join(paths.json_data_dir, '{:02}'.format(num))
                for num in section_nums]

    # Only the training split restricts instances to a single label.
    train_dataset = PDTBDataSet(_section_dirs(paths.train_sections),
                                tree_type=args.tree_type, level=args.level,
                                multiple_labels=False)
    dev_dataset = PDTBDataSet(_section_dirs(paths.dev_sections),
                              tree_type=args.tree_type, level=args.level,
                              multiple_labels=True)
    test_dataset = PDTBDataSet(_section_dirs(paths.test_sections),
                               tree_type=args.tree_type, level=args.level,
                               multiple_labels=True)

    labels_consistent = (train_dataset.consistent_with(dev_dataset)
                         and dev_dataset.consistent_with(test_dataset))
    if not labels_consistent:
        # Warn (but continue) when the label sets of the splits differ.
        print('Dataset labels are not consistent.')
        print('Train: {}'.format(sorted(train_dataset.label_map.keys())))
        print('Dev: {}'.format(sorted(dev_dataset.label_map.keys())))
        print('Test: {}'.format(sorted(test_dataset.label_map.keys())))
    print('Size of train set: {}, dev set: {}, test set: {}'.format(
        len(train_dataset), len(dev_dataset), len(test_dataset)))

    # Persist each split under the experiment data directory.
    splits = (('train', train_dataset),
              ('dev', dev_dataset),
              ('test', test_dataset))
    for split_name, split_data in splits:
        torch.save(split_data,
                   os.path.join(paths.experiment_data_dir,
                                split_name + '.data'))

    # The vocab covers all three splits so no split produces OOV-only words.
    vocab = Vocab(
        mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    for dataset in (train_dataset, dev_dataset, test_dataset):
        for word in dataset.get_all_words():
            vocab.add(word)
    vocab.load_pretrained_emb(paths.embedding_path)
    print('Size of PDTB vocabulary: {}'.format(vocab.size()))
    torch.save(vocab, paths.vocab_path)