def main():
    """Preprocess entry point: count features, build `Fields`, then build and
    save the train/valid datasets and the vocabulary.

    Reads all options from the parsed command line; writes dataset shards and
    a vocabulary file as side effects via the build_save_* helpers.
    """
    opt = parse_args()
    init_logger(opt.log_file)

    logger.info("Extracting features...")
    # Feature counts for each side are derived from the training directory.
    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_dir, 'src')
    qa_nfeats = inputters.get_num_features(opt.data_type, opt.train_dir, 'qa')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_dir, 'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of qa features: %d." % qa_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(src_nfeats, qa_nfeats, tgt_nfeats, opt.data_type)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    # Vocabulary is built from the training shards only.
    # (Removed a commented-out, machine-specific hard-coded shard path that
    # had been left here for debugging.)
    build_save_vocab(train_dataset_files, opt.data_type, fields, opt)
def main():
    """Run the preprocessing pipeline: features -> fields -> datasets -> vocab."""
    opt = parse_args()

    # -max_shard_size was replaced by -shard_size; refuse the old option.
    if opt.max_shard_size > 0:
        raise AssertionError("-max_shard_size is deprecated, please use \
-shard_size (number of examples) instead.")

    init_logger(opt.log_file)

    logger.info("Extracting features...")
    n_src_feats = inputters.get_num_features(opt.data_type, opt.train_src, 'src')
    n_tgt_feats = inputters.get_num_features(opt.data_type, opt.train_tgt, 'tgt')
    logger.info(" * number of source features: %d." % n_src_feats)
    logger.info(" * number of target features: %d." % n_tgt_feats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, n_src_feats, n_tgt_feats)

    logger.info("Building & saving training data...")
    train_shards = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    # The vocabulary is derived from the training shards only.
    build_save_vocab(train_shards, fields, opt)
def main():
    """Preprocessing pipeline, with an attempted multi-process workaround."""
    opt = parse_args()

    # -max_shard_size was replaced by -shard_size; refuse the old option.
    if opt.max_shard_size > 0:
        raise AssertionError("-max_shard_size is deprecated, please use \
-shard_size (number of examples) instead.")

    init_logger(opt.log_file)
    logger.info("Extracting features...")

    # NOTE: the block below was an attempt to fix multi-process prepare
    # failures (file-system sharing strategy + raising the soft open-file
    # limit), but per the original author it had no effect.
    torch.multiprocessing.set_sharing_strategy('file_system')
    import resource
    soft_hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    resource.setrlimit(resource.RLIMIT_NOFILE, (65535, soft_hard[1]))
    # END workaround

    n_src_feats = inputters.get_num_features(opt.data_type, opt.train_src, 'src')
    n_tgt_feats = inputters.get_num_features(opt.data_type, opt.train_tgt, 'tgt')
    logger.info(" * number of source features: %d." % n_src_feats)
    logger.info(" * number of target features: %d." % n_tgt_feats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, n_src_feats, n_tgt_feats)
    # Project-specific extra fields are attached on top of the defaults.
    myutils.add_more_field(fields)

    logger.info("Building & saving training data...")
    train_shards = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_shards, fields, opt)
def main():
    """Parse options, then build and persist train/valid data and the vocab."""
    # Options are parsed and stored in the opt object.
    opt = parse_args()
    init_logger(opt.log_file)

    logger.info("Extracting features...")
    n_src = inputters.get_num_features(opt.data_type, opt.train_src, 'src')
    n_tgt = inputters.get_num_features(opt.data_type, opt.train_tgt, 'tgt')
    logger.info(" * number of source features: %d." % n_src)
    logger.info(" * number of target features: %d." % n_tgt)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, n_src, n_tgt)

    # Generation of training, validation and dictionary data.
    logger.info("Building & saving training data...")
    train_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    valid_files = build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    # This variant builds the vocabulary over train *and* valid shards.
    build_save_vocab(train_files + valid_files, fields, opt)
def main(): opt = parse_args() init_logger(opt.log_file) logger.info("Extracting features...") src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src, 'src') tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt, 'tgt') ans_nfeats = inputters.get_num_features(opt.data_type, opt.train_ans, "ans") logger.info(" * number of source features: %d." % src_nfeats) logger.info(" * number of target features: %d." % tgt_nfeats) logger.info(" * number of answer features: %d." % ans_nfeats) logger.info("Building `Fields` object...") fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats, ans_nfeats) logger.info("fields src") logger.info(fields.src.__dict__) logger.info(fields.tgt.__dict__) logger.info(fields.src_map.__dict__) logger.info(fields.ans.__dict__) logger.info(fields.indices.__dict__) logger.info(fields.alignment.__dict__) '''
def main():
    """Preprocess: features, fields, datasets, and a vocab over train+valid."""
    opt = parse_args()
    init_logger(opt.log_file)

    logger.info("Extracting features...")
    # Feature counting only matters when extra token features are present --
    # not the case for this project, per the original comment.
    n_src = inputters.get_num_features(opt.data_type, opt.train_src, 'src')
    n_tgt = inputters.get_num_features(opt.data_type, opt.train_tgt, 'tgt')
    logger.info(" * number of source features: %d." % n_src)
    logger.info(" * number of target features: %d." % n_tgt)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, n_src, n_tgt)

    logger.info("Building & saving training data...")
    train_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    valid_files = build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    # Vocabulary covers both the train and valid shard lists.
    build_save_vocab(train_files + valid_files, fields, opt)
def main():
    """Preprocess entry point: validate options, extract features, build
    `Fields`, and save the datasets plus the vocabulary.

    Raises:
        AssertionError: if a deprecated/unsupported option is set.
    """
    opt = parse_args()

    # Explicit raises instead of `assert`: assertions are stripped when the
    # interpreter runs with -O, which would silently disable these option
    # checks. AssertionError is kept so existing callers still catch it.
    if opt.max_shard_size != 0:
        raise AssertionError(
            "-max_shard_size is deprecated. Please use "
            "-shard_size (number of examples) instead.")
    if opt.shuffle != 0:
        raise AssertionError(
            "-shuffle is not implemented. Please shuffle "
            "your data before pre-processing.")

    init_logger(opt.log_file)

    logger.info("Extracting features...")
    src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src, 'src')
    tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt, 'tgt')
    logger.info(" * number of source features: %d." % src_nfeats)
    logger.info(" * number of target features: %d." % tgt_nfeats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats)

    logger.info("Building & saving training data...")
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)
def main():
    """Validate options, then run the full preprocessing pipeline."""
    opt = parse_args()

    # Reject options that are no longer (or not yet) supported.
    if opt.max_shard_size > 0:
        raise AssertionError(
            "-max_shard_size is deprecated, please use "
            "-shard_size (number of examples) instead.")
    if opt.shuffle > 0:
        raise AssertionError(
            "-shuffle is not implemented, please make sure "
            "you shuffle your data before pre-processing.")

    init_logger(opt.log_file)

    logger.info("Extracting features...")
    n_src_feats = inputters.get_num_features(opt.data_type, opt.train_src, "src")
    n_tgt_feats = inputters.get_num_features(opt.data_type, opt.train_tgt, "tgt")
    logger.info(" * number of source features: %d." % n_src_feats)
    logger.info(" * number of target features: %d." % n_tgt_feats)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, n_src_feats, n_tgt_feats)

    logger.info("Building & saving training data...")
    train_shards = build_save_dataset("train", fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset("valid", fields, opt)

    logger.info("Building & saving vocabulary...")
    # Vocabulary is built from the training shards only.
    build_save_vocab(train_shards, fields, opt)
def _get_fields(data_type, train_src, train_tgt):
    """Count per-side token features and return the matching `Fields` object.

    :param data_type: dataset type string passed through to the inputters.
    :param train_src: path to the training source corpus.
    :param train_tgt: path to the training target corpus.
    :returns: the `Fields` object built by ``inputters.get_fields``.
    """
    logger.info("Extracting features...")
    n_src = inputters.get_num_features(data_type, train_src, 'src')
    n_tgt = inputters.get_num_features(data_type, train_tgt, 'tgt')
    logger.info(" * number of source features: %d." % n_src)
    logger.info(" * number of target features: %d." % n_tgt)

    logger.info("Building `Fields` object...")
    return inputters.get_fields(data_type, n_src, n_tgt)
def dump_dataset(savepath, save_dev=False):
    """Build and save the train dataset (and optionally dev) plus the vocab.

    :param savepath: directory containing the src/tgt corpus files; outputs
        are written back into the same directory.
    :param save_dev: when True, also build and save the dev split.
    """
    train_src = savepath + '/src-train.txt'
    train_tgt = savepath + '/tgt-train.txt'
    n_src = inputters.get_num_features('text', train_src, 'src')
    n_tgt = inputters.get_num_features('text', train_tgt, 'tgt')
    fields = inputters.get_fields('text', n_src, n_tgt)
    # Extra non-sequential field carrying per-example graph data.
    fields['graph'] = torchtext.data.Field(sequential=False)

    train_files = build_save_dataset(
        'train', fields, train_src, train_tgt, savepath, args)
    if save_dev:
        dev_src = savepath + '/src-dev.txt'
        dev_tgt = savepath + '/tgt-dev.txt'
        build_save_dataset('dev', fields, dev_src, dev_tgt, savepath, args)

    build_save_vocab(train_files, fields, savepath, args)
def preprocess_main(opt):
    """Preprocess using an explicit logger object threaded through helpers.

    :param opt: parsed options carrying paths, data type, and log file.
    """
    logger = get_logger(opt.log_file)

    n_src = inputters.get_num_features(opt.data_type, opt.train_src, 'src')
    n_tgt = inputters.get_num_features(opt.data_type, opt.train_tgt, 'tgt')
    logger.info(" * number of source features: %d." % n_src)
    logger.info(" * number of target features: %d." % n_tgt)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, n_src, n_tgt)

    logger.info("Building & saving training data...")
    train_shards = build_save_dataset('train', fields, opt, logger)

    # In this variant the vocabulary is saved *before* the validation data.
    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_shards, fields, opt, logger)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt, logger)
def main():
    """Preprocess pipeline variant that reports progress via print()."""
    opt = parse_args()

    print("Extracting features...")
    n_src = inputters.get_num_features(opt.data_type, opt.train_src, 'src')
    n_tgt = inputters.get_num_features(opt.data_type, opt.train_tgt, 'tgt')
    print(" * number of source features: %d." % n_src)
    print(" * number of target features: %d." % n_tgt)

    print("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, n_src, n_tgt)

    print("Building & saving training data...")
    train_shards = build_save_dataset('train', fields, opt)

    # Vocabulary is saved before the validation split in this variant.
    print("Building & saving vocabulary...")
    build_save_vocab(train_shards, fields, opt)

    print("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)
def main():
    """Preprocess variant with an extra answer side ('ans') and field dumps."""
    opt = parse_args()
    init_logger(opt.log_file)

    logger.info("Extracting features...")
    n_src = inputters.get_num_features(opt.data_type, opt.train_src, 'src')
    n_tgt = inputters.get_num_features(opt.data_type, opt.train_tgt, 'tgt')
    n_ans = inputters.get_num_features(opt.data_type, opt.train_ans, "ans")
    logger.info(" * number of source features: %d." % n_src)
    logger.info(" * number of target features: %d." % n_tgt)
    logger.info(" * number of answer features: %d." % n_ans)

    logger.info("Building `Fields` object...")
    fields = inputters.get_fields(opt.data_type, n_src, n_tgt, n_ans)

    # Dump each field's internals for debugging / inspection.
    logger.info("fields src")
    for key in ('src', 'tgt', 'src_map', 'ans', 'indices', 'alignment'):
        logger.info(fields[key].__dict__)

    logger.info("Building & saving training data...")
    train_shards = build_save_dataset('train', fields, opt)
    logger.info(train_shards)

    # Vocabulary is saved before the validation split in this variant.
    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_shards, fields, opt)

    logger.info("Building & saving validation data...")
    build_save_dataset('valid', fields, opt)
"-src_word_vec_size", "25", "-tgt_word_vec_size", "25", ]) train_args.batch_size = 50 #print(train_args) #sys.exit() try: torch.manual_seed(preproc_args.seed) opt = preproc_args logger = logging src_nfeats = inputters.get_num_features(opt.data_type, opt.train_src, 'src') tgt_nfeats = inputters.get_num_features(opt.data_type, opt.train_tgt, 'tgt') logger.info(" * number of source features: %d." % src_nfeats) logger.info(" * number of target features: %d." % tgt_nfeats) logger.info("Building `Fields` object...") fields = inputters.get_fields(opt.data_type, src_nfeats, tgt_nfeats) logger.info("Building & saving training data...") train_dataset_files = build_save_dataset('train', fields, opt, logger) logger.info("Building & saving vocabulary...") build_save_vocab(train_dataset_files, fields, opt, logger) logger.info("Building & saving validation data...")