def main():
    """Parse CLI options for multi-dataset (NQ / TriviaQA) preprocessing and run it.

    Creates the output directory if needed, then delegates to ``preprocess(args)``
    (defined elsewhere in this module).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--out_data_dir",
        default="/webdata-nfs/jialliu/dpr/ann/ann_multi_data_256/",
        type=str,
        help="The output data dir",
    )
    parser.add_argument(
        "--model_type",
        default="dpr",
        type=str,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-uncased",
        type=str,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--data_type",
        default=0,
        type=int,
        help="0 is nq, 1 is trivia, 2 is both",
    )
    parser.add_argument(
        "--question_dir",
        type=str,
        help="location of the raw QnA question data",
    )
    parser.add_argument(
        "--wiki_dir",
        type=str,
        help="location of the wiki corpus",
    )
    parser.add_argument(
        "--answer_dir",
        type=str,
        help="location of the QnA answers for evaluation",
    )
    parser.add_argument(
        "--bpe_vocab_file",
        type=str,
        # Fixed copy-pasted help text (previously duplicated the --answer_dir help).
        help="location of the BPE vocabulary file",
    )
    args = parser.parse_args()
    # exist_ok avoids the race between an exists() check and directory creation.
    os.makedirs(args.out_data_dir, exist_ok=True)
    preprocess(args)
def main():
    """Collect the preprocessing command-line options, make sure the output
    directory exists, and hand the parsed namespace to ``preprocess``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--out_data_dir",
        type=str,
        default="/fs/clip-scratch/chen/naacl_data/",
        help="The output data dir",
    )
    ap.add_argument(
        "--model_type",
        type=str,
        default="dpr",
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    ap.add_argument(
        "--model_name_or_path",
        type=str,
        default="bert-base-uncased",
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    ap.add_argument(
        "--max_seq_length",
        type=int,
        default=256,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    ap.add_argument(
        "--wiki_dir",
        type=str,
        default='/fs/clip-scratch/chen/data/wikipedia_split/',
        help="location of the wiki corpus",
    )
    ap.add_argument(
        "--answer_dir",
        type=str,
        default='/fs/clip-scratch/chen/data/data/retriever/qas/',
        help="location of the QnA answers for evaluation",
    )
    args = ap.parse_args()
    if not os.path.exists(args.out_data_dir):
        os.makedirs(args.out_data_dir)
    preprocess(args)
def get_arguments():
    """Build and parse the CLI arguments for ANN training-data generation.

    Returns:
        argparse.Namespace: the parsed command-line arguments.
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--training_dir",
        default=None,
        type=str,
        required=True,
        help="Training dir, will look for latest checkpoint dir in here",
    )
    parser.add_argument(
        "--init_model_dir",
        default=None,
        type=str,
        required=True,
        help="Initial model dir, will use this if no checkpoint is found in model_dir",
    )
    parser.add_argument(
        "--last_checkpoint_dir",
        default="",
        type=str,
        help="Last checkpoint used, this is for rerunning this script when some ann data is already generated",
    )
    parser.add_argument(
        "--train_model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the training data will be written",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        required=True,
        help="The directory where cached data will be written",
    )
    parser.add_argument(
        "--end_output_num",
        default=-1,
        type=int,
        help="Stop after this number of data versions has been generated, default run forever",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_doc_character",
        default=10000,
        type=int,
        help="used before tokenizer to save tokenizer latency",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=128,
        type=int,
        # Fixed copy-pasted help text (was "The starting output file number").
        help="Batch size per GPU/CPU for inference",
    )
    parser.add_argument(
        "--ann_chunk_factor",
        default=5,  # for 500k queries, divided into 100k chunks for each epoch
        type=int,
        help="divide training queries into chunks",
    )
    parser.add_argument(
        "--topk_training",
        default=500,
        type=int,
        help="top k from which negative samples are collected",
    )
    parser.add_argument(
        "--negative_sample",
        default=5,
        type=int,
        help="at each resample, how many negative samples per query do I use",
    )
    parser.add_argument(
        "--ann_measure_topk_mrr",
        default=False,
        action="store_true",
        # Fixed copy-pasted help text (was "load scheduler from checkpoint or not").
        help="measure top-k MRR on the ANN retrieval results",
    )
    parser.add_argument(
        "--only_keep_latest_embedding_file",
        default=False,
        action="store_true",
        # Fixed copy-pasted help text (was "load scheduler from checkpoint or not").
        help="only keep the latest embedding file and delete older ones",
    )
    parser.add_argument(
        "--no_cuda",
        action="store_true",
        help="Avoid using CUDA when available",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument(
        "--server_ip",
        type=str,
        default="",
        help="For distant debugging.",
    )
    parser.add_argument(
        "--server_port",
        type=str,
        default="",
        help="For distant debugging.",
    )
    parser.add_argument(
        "--inference",
        default=False,
        action="store_true",
        help="only do inference if specify",
    )
    parser.add_argument(
        "--bpe_vocab_file",
        type=str,
        default="",
        # Fixed copy-pasted help text (was "For distant debugging.").
        help="location of the BPE vocabulary file",
    )
    parser.add_argument(
        "--model_file",
        default=None,
        type=str,
        # NOTE(review): was previously marked "#required=True"; confirm whether it should be required.
        help="model checkpoint file to load",
    )
    args = parser.parse_args()
    return args
def get_arguments():
    """Build and parse the CLI arguments for ANN-based dense-retrieval training.

    Returns:
        argparse.Namespace: the parsed command-line arguments.
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the cached passage and query files",
    )
    parser.add_argument(
        "--ann_dir",
        default=None,
        type=str,
        required=True,
        help="The ann training data dir. Should contain the output of ann data generation job",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--triplet",
        default=False,
        action="store_true",
        # Fixed copy-pasted help text (was "Whether to run training.").
        help="Whether to use triplet (query, positive, negative) training samples.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
        "--log_dir",
        default=None,
        type=str,
        help="Tensorboard log dir",
    )
    parser.add_argument(
        "--optimizer",
        default="lamb",
        type=str,
        help="Optimizer - lamb or adamW",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument(
        "--weight_decay",
        default=0.0,
        type=float,
        help="Weight decay if we apply some.",
    )
    parser.add_argument(
        "--adam_epsilon",
        default=1e-8,
        type=float,
        help="Epsilon for Adam optimizer.",
    )
    parser.add_argument(
        "--max_grad_norm",
        default=1.0,
        type=float,
        help="Max gradient norm.",
    )
    parser.add_argument(
        "--max_steps",
        default=1000000,
        type=int,
        help="If > 0: set total number of training steps to perform",
    )
    parser.add_argument(
        "--warmup_steps",
        default=0,
        type=int,
        help="Linear warmup over warmup_steps.",
    )
    parser.add_argument(
        "--logging_steps",
        type=int,
        default=500,
        help="Log every X updates steps.",
    )
    parser.add_argument(
        "--save_steps",
        type=int,
        default=500,
        help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--no_cuda",
        action="store_true",
        help="Avoid using CUDA when available",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="random seed for initialization",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    # ----------------- ANN HyperParam ------------------
    parser.add_argument(
        "--load_optimizer_scheduler",
        default=False,
        action="store_true",
        help="load scheduler from checkpoint or not",
    )
    parser.add_argument(
        "--single_warmup",
        default=False,
        action="store_true",
        help="use single or re-warmup",
    )
    # ----------------- End of Doc Ranking HyperParam ------------------
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument(
        "--server_ip",
        type=str,
        default="",
        help="For distant debugging.",
    )
    parser.add_argument(
        "--server_port",
        type=str,
        default="",
        help="For distant debugging.",
    )
    args = parser.parse_args()
    return args
def get_arguments():
    """Assemble and parse the preprocessing command-line options."""
    arg_parser = argparse.ArgumentParser()

    # The four required string options share everything but flag and help text.
    required_str_opts = [
        ("--data_dir", "The input data dir"),
        ("--out_data_dir", "The output data dir"),
        ("--model_type",
         "Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys())),
        ("--model_name_or_path",
         "Path to pre-trained model or shortcut name selected in the list: "
         + ", ".join(ALL_MODELS)),
    ]
    for flag, description in required_str_opts:
        arg_parser.add_argument(flag, default=None, type=str, required=True,
                                help=description)

    truncation_help = (
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded."
    )
    arg_parser.add_argument("--max_seq_length", type=int, default=128,
                            help=truncation_help)
    arg_parser.add_argument("--max_query_length", type=int, default=64,
                            help=truncation_help)
    arg_parser.add_argument("--max_doc_character", type=int, default=10000,
                            help="used before tokenizer to save tokenizer latency")
    arg_parser.add_argument("--data_type", type=int, default=0,
                            help="0 for doc, 1 for passage")

    return arg_parser.parse_args()
def get_arguments():
    """Build and parse the CLI arguments for QnA ANN data generation / evaluation.

    Returns:
        argparse.Namespace: the parsed command-line arguments.
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--training_dir",
        default=None,
        type=str,
        required=True,
        help="Training dir, will look for latest checkpoint dir in here",
    )
    parser.add_argument(
        "--init_model_dir",
        default=None,
        type=str,
        required=True,
        help="Initial model dir, will use this if no checkpoint is found in model_dir",
    )
    parser.add_argument(
        "--last_checkpoint_dir",
        default="",
        type=str,
        help="Last checkpoint used, this is for rerunning this script when some ann data is already generated",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the training data will be written",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        required=True,
        help="The directory where cached data will be written",
    )
    parser.add_argument(
        "--end_output_num",
        default=-1,
        type=int,
        help="Stop after this number of data versions has been generated, default run forever",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_doc_character",
        default=10000,
        type=int,
        help="used before tokenizer to save tokenizer latency",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=128,
        type=int,
        # Fixed copy-pasted help text (was "The starting output file number").
        help="Batch size per GPU/CPU for inference",
    )
    parser.add_argument(
        "--ann_chunk_factor",
        default=5,  # for 500k queries, divided into 100k chunks for each epoch
        type=int,
        help="divide training queries into chunks",
    )
    parser.add_argument(
        "--topk_training",
        default=500,
        type=int,
        help="top k from which negative samples are collected",
    )
    parser.add_argument(
        "--negative_sample",
        default=5,
        type=int,
        help="at each resample, how many negative samples per query do I use",
    )
    parser.add_argument(
        "--topk_training_d2q",
        default=200,
        type=int,
        help="top k from which negative samples are collected",
    )
    parser.add_argument(
        "--ann_measure_topk_mrr",
        default=False,
        action="store_true",
        # Fixed copy-pasted help text (was "load scheduler from checkpoint or not").
        help="measure top-k MRR on the ANN retrieval results",
    )
    parser.add_argument(
        "--only_keep_latest_embedding_file",
        default=False,
        action="store_true",
        # Fixed copy-pasted help text (was "load scheduler from checkpoint or not").
        help="only keep the latest embedding file and delete older ones",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    parser.add_argument(
        "--passage_path",
        default=None,
        type=str,
        required=True,
        help="passage_path",
    )
    parser.add_argument(
        "--test_qa_path",
        default=None,
        type=str,
        required=True,
        help="test_qa_path",
    )
    parser.add_argument(
        "--trivia_test_qa_path",
        default=None,
        type=str,
        required=True,
        help="trivia_test_qa_path",
    )
    parser.add_argument(
        "--dual_training",
        action="store_true",
        help="enable dual training, change the data loading, forward function and loss function",
    )
    parser.add_argument(
        "--faiss_omp_num_threads",
        type=int,
        default=16,
        help="for faiss.omp_set_num_threads()",
    )
    parser.add_argument(
        "--split_ann_search",
        default=False,
        action="store_true",
        help="separately do ANN index and merge result",
    )
    parser.add_argument(
        "--gpu_index",
        default=False,
        action="store_true",
        # Fixed copy-pasted help text (was a duplicate of --split_ann_search's help).
        help="use a GPU-based faiss index for ANN search",
    )
    parser.add_argument(
        "--emb_file_multi_split_num",
        default=-1,
        type=int,
        help="extra splitting of the embeddings",
    )
    parser.add_argument(
        "--emb_file_multi_split_size",
        default=-1,
        type=int,
        help="extra splitting of the embeddings max size",
    )
    parser.add_argument(
        "--grouping_ann_data",
        type=int,
        default=-1,
        help="group multiple <q,d> pair data into one line, I prefer set to 32",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--init_from_fp16_ckpt",
        action="store_true",
        # Fixed copy-pasted help text (was a duplicate of --fp16's help).
        help="initialize from a checkpoint that was trained with fp16",
    )
    parser.add_argument(
        "--representation_l2_normalization",
        action="store_true",
        help="enable l2_normalization on the representative embeddings for ANN retrieval, previously named as --l2_normalization",
    )
    args = parser.parse_args()
    return args
def get_arguments():
    """Build and parse the CLI arguments for ANN inference / d2q evaluation.

    Returns:
        argparse.Namespace: the parsed command-line arguments.
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--training_dir",
        default=None,
        type=str,
        required=True,
        help="Training dir, will look for latest checkpoint dir in here",
    )
    parser.add_argument(
        "--data_type",
        type=int,
        default=0,
        # NOTE(review): original help text ("the length of new model") looks
        # copy-pasted from another option; confirm the real meaning of data_type.
        help="the length of new model",
    )
    parser.add_argument(
        "--init_model_dir",
        default=None,
        type=str,
        required=True,
        help="Initial model dir, will use this if no checkpoint is found in model_dir",
    )
    parser.add_argument(
        "--last_checkpoint_dir",
        default="",
        type=str,
        help="Last checkpoint used, this is for rerunning this script when some ann data is already generated",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the training data will be written",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        required=True,
        help="The directory where cached data will be written",
    )
    parser.add_argument(
        "--end_output_num",
        default=-1,
        type=int,
        help="Stop after this number of data versions has been generated, default run forever",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_doc_character",
        default=10000,
        type=int,
        help="used before tokenizer to save tokenizer latency",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=128,
        type=int,
        # Fixed copy-pasted help text (was "The starting output file number").
        help="Batch size per GPU/CPU for inference",
    )
    parser.add_argument(
        "--ann_chunk_factor",
        default=5,  # for 500k queries, divided into 100k chunks for each epoch
        type=int,
        help="divide training queries into chunks",
    )
    parser.add_argument(
        "--no_cuda",
        action="store_true",
        help="Avoid using CUDA when available",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument(
        "--faiss_omp_num_threads",
        type=int,
        default=16,
        help="for faiss.omp_set_num_threads()",
    )
    parser.add_argument(
        "--server_ip",
        type=str,
        default="",
        help="For distant debugging.",
    )
    parser.add_argument(
        "--server_port",
        type=str,
        default="",
        help="For distant debugging.",
    )
    parser.add_argument(
        "--inference",
        default=False,
        action="store_true",
        help="only do inference if specify",
    )
    parser.add_argument(
        "--save_training_query_trec",
        default=False,
        action="store_true",
        # Fixed placeholder help text (was "..").
        help="save the training query retrieval results in TREC format",
    )
    # ----------------------------------------------------------------
    parser.add_argument(
        "--dual_training",
        action="store_true",
        help="enable dual training, change the data loading, forward function and loss function",
    )
    # ------------------- L2 normalization ------------------------
    parser.add_argument(
        "--representation_l2_normalization",
        action="store_true",
        help="enable l2_normalization on the representative embeddings for ANN retrieval",
    )
    parser.add_argument(
        "--grouping_ann_data",
        type=int,
        default=-1,
        help="group multiple <q,d> pair data into one line, I prefer set to 32",
    )
    parser.add_argument(
        "--split_ann_search",
        default=False,
        action="store_true",
        help="separately do ANN index and merge result",
    )
    parser.add_argument(
        "--gpu_index",
        default=False,
        action="store_true",
        # Fixed copy-pasted help text (was a duplicate of --split_ann_search's help).
        help="use a GPU-based faiss index for ANN search",
    )
    parser.add_argument(
        "--dev_split_num",
        type=int,
        default=-1,
        help="how much fold to split validation set",
    )
    parser.add_argument(
        "--testing_split_idx",
        type=int,
        default=0,
        # Fixed copy-pasted help text (was a duplicate of --dev_split_num's help).
        help="index of the fold used as the test split",
    )
    parser.add_argument(
        "--query_likelihood_strategy",
        type=str,
        default="positive_doc",
        choices=[
            "BM25_retrieval",
            "positive_doc",
            "random_doc",
            "random_shuffle_positive_doc",
        ],
        help="use what doc to do retrieval",
    )
    parser.add_argument(
        "--d2q_task_evaluation",
        action="store_true",
        help="evaluate and print out the d->q retrieval results",
    )
    parser.add_argument(
        "--d2q_task_marco_dev_qrels",
        type=str,
        default=None,
        help="reversed d2q_qrels.tsv, if split validation, should proved a file like: args.d2q_task_marco_dev_qrels+2_fold.split_dict ",
    )
    parser.add_argument(
        "--inference_type",
        type=str,
        default="query",
        choices=["query", "document"],
        help="inference query or document embeddings",
    )
    parser.add_argument(
        "--save_prefix",
        default=None,
        type=str,
        required=False,
        help="saving name for the query collection",
    )
    args = parser.parse_args()
    return args
def get_arguments():
    """Build and parse the CLI arguments for passage-retrieval evaluation.

    Returns:
        argparse.Namespace: the parsed command-line arguments.
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--passage_path",
        default='',
        type=str,
        # Fixed copy-pasted help text (was the --init_model_dir help).
        help="location of the passage collection",
    )
    parser.add_argument(
        "--model",
        default='',
        type=str,
        # Fixed copy-pasted help text (was the --init_model_dir help).
        help="model checkpoint dir or path to load",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the training data will be written",
    )
    parser.add_argument(
        "--max_seq_length",
        default=256,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=192,
        type=int,
        # Fixed copy-pasted help text (was "The starting output file number").
        help="Batch size per GPU/CPU for inference",
    )
    parser.add_argument(
        "--latest_num",
        default=0,
        type=int,
        help="number of latest checkpoints to consider",
    )
    parser.add_argument(
        "--topk",
        default=50,
        type=int,
        # NOTE(review): this help text looks copy-pasted from a training script;
        # in this evaluation context topk is presumably the retrieval depth.
        help="top k from which negative samples are collected",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument(
        "--load_cache",
        action="store_true",
        # Fixed copy-pasted help text (was the --no_cuda help).
        help="load cached embeddings instead of recomputing them",
    )
    parser.add_argument("--world_size", type=int, default=4, help="number of distributed workers")
    args = parser.parse_args()
    return args
def get_arguments():
    """Build and parse the CLI arguments for dual-learning dense-retrieval training.

    Returns:
        argparse.Namespace: the parsed command-line arguments.
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the cached passage and query files",
    )
    parser.add_argument(
        "--ann_dir",
        default=None,
        type=str,
        required=True,
        help="The ann training data dir. Should contain the output of ann data generation job",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--triplet",
        default=False,
        action="store_true",
        # Fixed copy-pasted help text (was "Whether to run training.").
        help="Whether to use triplet (query, positive, negative) training samples.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
        "--log_dir",
        default=None,
        type=str,
        help="Tensorboard log dir",
    )
    parser.add_argument(
        "--optimizer",
        default="lamb",
        type=str,
        help="Optimizer - lamb or adamW",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument(
        "--weight_decay",
        default=0.0,
        type=float,
        help="Weight decay if we apply some.",
    )
    parser.add_argument(
        "--adam_epsilon",
        default=1e-8,
        type=float,
        help="Epsilon for Adam optimizer.",
    )
    parser.add_argument(
        "--max_grad_norm",
        default=1.0,
        type=float,
        help="Max gradient norm.",
    )
    parser.add_argument(
        "--max_steps",
        default=1000000,
        type=int,
        help="If > 0: set total number of training steps to perform",
    )
    parser.add_argument(
        "--warmup_steps",
        default=0,
        type=int,
        help="Linear warmup over warmup_steps.",
    )
    parser.add_argument(
        "--logging_steps",
        type=int,
        default=500,
        help="Log every X updates steps.",
    )
    parser.add_argument(
        "--save_steps",
        type=int,
        default=500,
        help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--no_cuda",
        action="store_true",
        help="Avoid using CUDA when available",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="random seed for initialization",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    # ----------------- ANN HyperParam ------------------
    parser.add_argument(
        "--load_optimizer_scheduler",
        default=False,
        action="store_true",
        help="load scheduler from checkpoint or not",
    )
    parser.add_argument(
        "--single_warmup",
        default=False,
        action="store_true",
        help="use single or re-warmup",
    )
    # ----------------- End of Doc Ranking HyperParam ------------------
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument(
        "--server_ip",
        type=str,
        default="",
        help="For distant debugging.",
    )
    parser.add_argument(
        "--server_port",
        type=str,
        default="",
        help="For distant debugging.",
    )
    parser.add_argument(
        "--dual_training",
        action="store_true",
        help="enable dual training, change the data loading, forward function and loss function",
    )
    parser.add_argument(
        "--prime_loss_weight",
        type=float,
        default=1.0,
        help="primary task loss item weight",
    )
    parser.add_argument(
        "--dual_loss_weight",
        type=float,
        default=0.1,
        help="dual learning loss item weight",
    )
    # ----------------- HyperSphere Property ------------------
    parser.add_argument(
        "--hyper_align_weight",
        type=float,
        default=0.0,
        help="hypershpere alignment property loss weight",
    )
    parser.add_argument(
        "--hyper_unif_weight",
        type=float,
        default=0.0,
        help="hypershpere uniformity property loss weight",
    )
    # ------------------- SimCLR parameters L2 normalization and Temperature------------------------
    parser.add_argument(
        "--representation_l2_normalization",
        action="store_true",
        help="enable l2_normalization on the representative embeddings for ANN retrieval, previously named as --l2_normalization",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="temperature in SimCLR",
    )
    parser.add_argument(
        "--loss_objective_function",
        type=str,
        default="dot_product",
        choices=["dot_product", "pairwise_hinge", "simclr_cosine"],
        # Fixed copy-pasted help text (was "attention type").
        help="loss objective function for training",
    )
    parser.add_argument(
        "--grouping_ann_data",
        type=int,
        default=-1,
        help="group multiple <q,d> pair data into one line, I prefer set to 32",
    )
    parser.add_argument(
        "--polling_loaded_data_batch_from_group",
        default=False,
        action="store_true",
        help="if polling, return batches like [q1,q2,q3,q4]. otherwise remain the original way: [q1,q1,q1,q1], [q1,q1,q2,q2], [q2,q2,q2,q2]...",
    )
    args = parser.parse_args()
    return args
def get_arguments(call_paras=None):
    """Parse preprocessing options for a custom dataset.

    ``call_paras`` lets callers pass an explicit argv-style list; when it is
    None the process command line (sys.argv) is parsed instead.
    """
    cli = argparse.ArgumentParser()

    cli.add_argument("--data_dir", default=None, type=str, required=True,
                     help="The input data dir")
    cli.add_argument("--out_data_dir", default=None, type=str, required=True,
                     help="The output data dir")
    cli.add_argument("--model_type", default=None, type=str, required=True,
                     help="Model type selected in the list: "
                     + ", ".join(MSMarcoConfigDict.keys()))
    cli.add_argument("--model_name_or_path", default=None, type=str, required=True,
                     help="Path to pre-trained model or shortcut name selected in the list: "
                     + ", ".join(ALL_MODELS))

    length_help = (
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded."
    )
    cli.add_argument("--max_seq_length", default=128, type=int, help=length_help)
    cli.add_argument("--max_query_length", default=64, type=int, help=length_help)
    cli.add_argument("--max_doc_character", default=10000, type=int,
                     help="used before tokenizer to save tokenizer latency")
    cli.add_argument("--data_type", default=-1, type=int,
                     help="0 for doc, 1 for passage, -1 for custom dataset")

    cli.add_argument("--doc_collection_tsv", default=None, type=str, required=True,
                     help="Path to document collection")
    cli.add_argument("--save_prefix", default=None, type=str, required=True,
                     help="saving name for the query collection")
    cli.add_argument("--query_collection_tsv", default=None, type=str, required=True,
                     help="Path to query collection")
    cli.add_argument("--qrel_tsv", default=None, type=str, required=False,
                     help="Path to qrel file")
    cli.add_argument("--n_split_process", default=16, type=int, required=False,
                     help="number of processes for data preprocessing")

    # No explicit argv list means "parse the real command line".
    if call_paras is None:
        return cli.parse_args()
    return cli.parse_args(call_paras)
def get_arguments():
    """Build and parse the CLI arguments for document/passage preprocessing.

    Returns:
        argparse.Namespace: the parsed command-line arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir",
    )
    parser.add_argument(
        "--out_data_dir",
        default=None,
        type=str,
        required=True,
        help="The output data dir",
    )
    parser.add_argument(
        "--train_model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_doc_character",
        default=10000,
        type=int,
        help="used before tokenizer to save tokenizer latency",
    )
    parser.add_argument(
        "--data_type",
        default=0,
        type=int,
        help="0 for doc, 1 for passage",
    )
    parser.add_argument(
        "--bpe_vocab_file",
        type=str,
        default="",
        # Fixed copy-pasted help text (was "For distant debugging.").
        help="location of the BPE vocabulary file",
    )
    parser.add_argument(
        "--model_file",
        default=None,
        type=str,
        # NOTE(review): was previously marked "#required=True"; confirm whether it should be required.
        help="model checkpoint file to load",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    args = parser.parse_args()
    return args
default="/fs/clip-scratch/chen/hotpot_data/", type=str, help="The output data dir", ) parser.add_argument( "--num", default=0, type=int, help="The output data dir", ) parser.add_argument( "--model_type", default="dpr", type=str, help="Model type selected in the list: " + ", ".join(MSMarcoConfigDict.keys()), ) parser.add_argument( "--model_name_or_path", default="bert-base-uncased", type=str, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--max_seq_length", default=256, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.",