def main():
    py_utils.add_stdout_logger()
    logging.info("Checking word vectors..")
    download_wordvecs()
    logging.info("Checking MNLI...")
    download_mnli()
    logging.info("Checking SQUAD...")
    download_squad()
    logging.info("Checking TriviaQA-CP...")
    download_triviaqa_cp()
    logging.info("Done! All data should be ready")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("model")
    parser.add_argument("--n_processes", "-n", type=int, default=1)
    parser.add_argument("--nocache", action="store_true")
    parser.add_argument("--dataset", choices=["dev", "hans", "both"], default="both")
    args = parser.parse_args()

    py_utils.add_stdout_logger()
    compute_scores(
        args.model, args.dataset in ["dev", "both"], args.dataset in ["hans", "both"],
        not args.nocache, args.n_processes)
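# Example invocation of the evaluation entry point above (the script filename is an
# assumption; the flags come from the argparse definitions in main()):
#   python evaluate_debiased_mnli.py /path/to/model --dataset hans -n 4
# This scores the saved model on HANS only, using 4 worker processes and the
# prediction cache (pass --nocache to recompute predictions).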
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir")
    parser.add_argument("--nocache", action="store_true")
    parser.add_argument("--datasets", default=None, help="Comma separated list of datasets")
    args = parser.parse_args()

    py_utils.add_stdout_logger()
    if args.datasets is None:
        datasets = ["dev", "add_sent", "add_one_sent"]
    else:
        datasets = args.datasets.split(",")

    for ds in datasets:
        if ds not in squad.DATASETS:
            raise ValueError("Unsupported dataset %s" % ds)

    compute_all_scores(args.output_dir, datasets, not args.nocache)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir")
    parser.add_argument("--nocache", action="store_true")
    parser.add_argument("--dataset", choices=["location", "person"], required=True,
                        help="Dataset to test on")
    parser.add_argument("--parts", default=None,
                        help="Comma separated list of parts to test on")
    args = parser.parse_args()

    py_utils.add_stdout_logger()
    if args.parts is None:
        parts = ["dev", "test"]
    else:
        parts = args.parts.split(",")

    for ds in parts:
        if ds not in ["dev", "test", "train"]:
            raise ValueError("Unsupported dataset %s" % ds)

    show_scores(args.output_dir, args.dataset, parts, not args.nocache)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--seed", default=None, type=int, help="Seed for randomized elements in the training")
    parser.add_argument("--eval_batch_size", default=16, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    ## Our arguments
    parser.add_argument("--mode", choices=["bias_product", "none", "learned_mixin", "reweight"],
                        default="learned_mixin", help="Kind of debiasing method to use")
    parser.add_argument("--penalty", type=float, default=0.03,
                        help="Penalty weight for the learned_mixin model")
    parser.add_argument("--n_processes", type=int, default=4,
                        help="Processes to use for pre-processing")
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--sorted", action="store_true",
                        help="Sort the data so most batches have the same input length, "
                             "which makes things about 2x faster. Our experiments did not actually "
                             "use this in the end (not sure if it makes a difference), so "
                             "it is off by default.")
    args = parser.parse_args()

    py_utils.add_stdout_logger()

    if args.mode == "none":
        loss_fn = clf_debias_loss_functions.Plain()
    elif args.mode == "reweight":
        loss_fn = clf_debias_loss_functions.ReweightByInvBias()
    elif args.mode == "bias_product":
        loss_fn = clf_debias_loss_functions.BiasProduct()
    elif args.mode == "learned_mixin":
        loss_fn = clf_debias_loss_functions.LearnedMixin(args.penalty)
    else:
        raise RuntimeError()

    output_dir = args.output_dir

    if args.do_train:
        if exists(output_dir):
            if len(os.listdir(output_dir)) > 0:
                raise ValueError("Output dir exists and is non-empty")
        else:
            os.makedirs(output_dir)
        print("Saving model to %s" % output_dir)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if n_gpu > 0:
            torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(output_dir) and os.listdir(output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # It's way too easy to forget if this is being set by a command-line flag
    if "-uncased" in args.bert_model:
        do_lower_case = True
    elif "-cased" in args.bert_model:
        do_lower_case = False
    else:
        raise NotImplementedError(args.bert_model)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=do_lower_case)

    num_train_optimization_steps = None
    train_examples = None
    if args.do_train:
        train_examples = load_mnli(True, 2000 if args.debug else None)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertWithDebiasLoss.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=3, loss_fn=loss_fn)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        train_features: List[InputFeatures] = convert_examples_to_features(
            train_examples, args.max_seq_length, tokenizer, args.n_processes)

        bias_map = load_bias("train")
        for fe in train_features:
            fe.bias = bias_map[fe.example_id].astype(np.float32)

        logging.info("***** Running training *****")
        logging.info(" Num examples = %d", len(train_examples))
        logging.info(" Batch size = %d", args.train_batch_size)
        logging.info(" Num steps = %d", num_train_optimization_steps)

        train_dataloader = build_train_dataloader(train_features, args.train_batch_size, args.seed, args.sorted)

        model.train()
        loss_ema = 0
        total_steps = 0
        decay = 0.99

        for _ in trange(int(args.num_train_epochs), desc="Epoch", ncols=100):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            pbar = tqdm(train_dataloader, desc="loss", ncols=100)
            for step, batch in enumerate(pbar):
                batch = tuple(t.to(device) for t in batch)
                if bias_map is not None:
                    input_ids, input_mask, segment_ids, label_ids, bias = batch
                else:
                    bias = None
                    input_ids, input_mask, segment_ids, label_ids = batch

                logits, loss = model(input_ids, segment_ids, input_mask, label_ids, bias)

                total_steps += 1
                loss_ema = loss_ema * decay + loss.cpu().detach().numpy() * (1 - decay)
                descript = "loss=%.4f" % (loss_ema / (1 - decay**total_steps))
                pbar.set_description(descript, refresh=False)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Record the args as well
        arg_dict = {}
        for arg in vars(args):
            arg_dict[arg] = getattr(args, arg)
        with open(join(output_dir, "args.json"), 'w') as out_fh:
            json.dump(arg_dict, out_fh)

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertWithDebiasLoss(config, num_labels=3, loss_fn=loss_fn)
        model.load_state_dict(torch.load(output_model_file))
    else:
        output_config_file = os.path.join(output_dir, CONFIG_NAME)
        config = BertConfig.from_json_file(output_config_file)
        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
        model = BertWithDebiasLoss(config, num_labels=3, loss_fn=loss_fn)
        model.load_state_dict(torch.load(output_model_file))

    model.to(device)

    if not args.do_eval:
        return
    if not (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        return

    model.eval()

    eval_datasets = [("dev", load_mnli(False)), ("hans", load_hans())]
    for name, eval_examples in eval_datasets:
        logging.info("***** Running evaluation on %s *****" % name)
        logging.info(" Num examples = %d", len(eval_examples))
        logging.info(" Batch size = %d", args.eval_batch_size)

        eval_features = convert_examples_to_features(
            eval_examples, args.max_seq_length, tokenizer)
        eval_features.sort(key=lambda x: len(x.input_ids))
        all_label_ids = np.array([x.label_id for x in eval_features])
        eval_dataloader = build_eval_dataloader(eval_features, args.eval_batch_size)

        eval_loss = 0
        nb_eval_steps = 0
        probs = []
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating", ncols=100):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask)

            # create eval loss and other metrics required by the task
            loss_fct = CrossEntropyLoss()
            tmp_eval_loss = loss_fct(logits.view(-1, 3), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            probs.append(torch.nn.functional.softmax(logits, 1).detach().cpu().numpy())

        probs = np.concatenate(probs, 0)
        eval_loss = eval_loss / nb_eval_steps

        if name == "hans":
            # HANS is a two-class task, so merge the neutral and contradiction probabilities
            probs[:, 0] += probs[:, 2]
            probs = probs[:, :2]

        preds = np.argmax(probs, axis=1)

        result = {"acc": simple_accuracy(preds, all_label_ids)}
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(output_dir, "eval_%s_results.txt" % name)
        with open(output_eval_file, "w") as writer:
            logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        output_answer_file = os.path.join(output_dir, "eval_%s_answers.json" % name)
        answers = {ex.example_id: [float(x) for x in p] for ex, p in zip(eval_features, probs)}
        with open(output_answer_file, "w") as f:
            json.dump(answers, f)
def build_mnli_bias_only(out_dir, cache_examples=None, w2v_cache=None):
    """Builds our bias-only MNLI model and saves its predictions

    :param out_dir: Directory to save the predictions
    :param cache_examples: Cache examples to this file
    :param w2v_cache: Cache w2v features to this file
    """
    py_utils.add_stdout_logger()

    tok = NltkAndPunctTokenizer()

    # Load the data we want to use
    if cache_examples and exists(cache_examples):
        tf.logging.info("Loading cached examples")
        with open(cache_examples, "rb") as f:
            dataset_to_examples = pickle.load(f)
    else:
        dataset_to_examples = {}
        dataset_to_examples["hans"] = tokenize_examples(load_hans(), tok, 5)
        dataset_to_examples["train"] = tokenize_examples(load_mnli(True), tok, 5)
        dataset_to_examples["dev"] = tokenize_examples(load_mnli(False), tok, 5)
        if cache_examples:
            with open(cache_examples, "wb") as f:
                pickle.dump(dataset_to_examples, f)

    # Our models will only distinguish entailment vs (neutral/contradict)
    for examples in dataset_to_examples.values():
        for i, ex in enumerate(examples):
            if ex.label == 2:
                examples[i] = ex._replace(label=0)

    # Load the pre-normalized word vectors to use when building features
    if w2v_cache and exists(w2v_cache):
        tf.logging.info("Loading cached word vectors")
        with open(w2v_cache, "rb") as f:
            w2v = pickle.load(f)
    else:
        logging.info("Loading word vectors")
        voc = set()
        for v in dataset_to_examples.values():
            for ex in v:
                voc.update(ex.hypothesis)
                voc.update(ex.premise)
        words, vecs = load_word_vectors("crawl-300d-2M", voc)
        w2v = {w: v / np.linalg.norm(v) for w, v in zip(words, vecs)}
        if w2v_cache:
            with open(w2v_cache, "wb") as f:
                pickle.dump(w2v, f)

    # Build the features, store as a pandas dataset
    dataset_to_features = {}
    for name, examples in dataset_to_examples.items():
        tf.logging.info("Building features for %s.." % name)
        features = []
        for example in examples:
            h = [x.lower() for x in example.hypothesis]
            p = [x.lower() for x in example.premise]
            p_words = set(p)
            n_words_in_p = sum(x in p_words for x in h)
            fe = {
                "h-is-subseq": is_subseq(h, p),
                "all-in-p": n_words_in_p == len(h),
                "percent-in-p": n_words_in_p / len(h),
                "log-len-diff": np.log(max(len(p) - len(h), 1)),
                "label": example.label
            }

            h_vecs = [w2v[w] for w in example.hypothesis if w in w2v]
            p_vecs = [w2v[w] for w in example.premise if w in w2v]
            if len(h_vecs) > 0 and len(p_vecs) > 0:
                h_vecs = np.stack(h_vecs, 0)
                p_vecs = np.stack(p_vecs, 0)
                # [h_size, p_size]
                similarities = np.matmul(h_vecs, p_vecs.T)
                # [h_size]
                similarities = np.max(similarities, 1)
                similarities.sort()
                fe["average-sim"] = similarities.sum() / len(h)
                fe["min-similarity"] = similarities[0]
                if len(similarities) > 1:
                    fe["min2-similarity"] = similarities[1]

            features.append(fe)

        dataset_to_features[name] = pd.DataFrame(features)
        dataset_to_features[name].fillna(0.0, inplace=True)

    # Train the model
    tf.logging.info("Fitting...")
    train_df = dataset_to_features["train"]
    feature_cols = [x for x in train_df.columns if x != "label"]

    # class_weight='balanced' will weight the entailment/non-entailment examples equally
    # C=100 means effectively no regularization
    lr = LogisticRegression(multi_class="auto", solver="liblinear", class_weight='balanced', C=100)
    lr.fit(train_df[feature_cols].values, train_df.label.values)

    # Save the model predictions
    if not exists(out_dir):
        mkdir(out_dir)

    for name, ds in dataset_to_features.items():
        tf.logging.info("Predicting for %s" % name)
        examples = dataset_to_examples[name]

        pred = lr.predict_log_proba(ds[feature_cols].values).astype(np.float32)
        y = ds.label.values
        bias = {}
        for i in range(len(pred)):
            if examples[i].id in bias:
                raise RuntimeError("non-unique IDs?")
            bias[examples[i].id] = pred[i]

        acc = np.mean(y == np.argmax(pred, 1))
        print("%s two-class accuracy: %.4f (size=%d)" % (name, acc, len(examples)))

        with open(join(out_dir, "%s.pkl" % name), "wb") as f:
            pickle.dump(bias, f)
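# A minimal usage sketch for build_mnli_bias_only; the paths below are placeholders,
# not files or directories assumed by the repository.
if __name__ == "__main__":
    build_mnli_bias_only(
        out_dir="mnli_bias_preds",            # per-dataset <name>.pkl prediction files are written here
        cache_examples="mnli_examples.pkl",   # optional cache of the tokenized examples
        w2v_cache="mnli_w2v.pkl",             # optional cache of the normalized word vectors
    )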
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--stratify", type=int, default=None)
    parser.add_argument("--dataset", choices=["location", "person"], default="location")
    cli_utils.add_general_args(parser)
    cli_utils.add_loss_args(parser, default_penalty=None)
    args = parser.parse_args()

    if args.stratify is None:
        if args.mode == "learned_mixin":
            # Not sure if this actually makes a difference, but it was turned on for the
            # learned_mixin case, so we do the same here for exactness
            args.stratify = 6

    if args.penalty is None:
        if args.dataset == "person":
            args.penalty = 0.2
        else:
            args.penalty = 0.4

    dbg = args.debug

    if dbg:
        epoch_size = 50
    else:
        epoch_size = 1200

    opt = AdamOptimizer(decay_steps=50, max_grad_norm=3.0)
    batcher = QuantileBatcher(45, 10, 400, 4, 12)
    evaluator = Evaluator("triviaqa")

    trainer = Trainer(
        batcher, opt, evaluator,
        eval_batch_size=90, num_epochs=30, epoch_size=epoch_size,
        log_period=100, prefetch=5, loss_ema=0.999,
        n_processes=args.n_processes
    )

    if dbg:
        dataset = AnnotatedTriviaQACPLoader(
            args.dataset, sample_train=1000, stratify=args.stratify)
    else:
        dataset = AnnotatedTriviaQACPLoader(
            args.dataset, sample_train_eval=8000, stratify=args.stratify)

    dim = 128
    recurrent_layer = CudnnLSTMRecurrentDropout(dim, 0.2)
    model = TextPairQaDebiasingModel(
        None,  # Assume pre-tokenized data
        text_encoder=WordAndCharEncoder(
            "glove.6B.50d" if dbg else "crawl-300d-2M",
            first_n=500000,
            char_embed_dim=24,
            character_mapper=Conv1d(100, 5, None),
            character_pooler=MaxPooler(),
            word_length=30
        ),
        map_embed=seq(
            Dropout(0.3),
            HighwayLayer(recurrent_layer),
        ),
        fuse_layer=BiAttention(WeightedDot()),
        post_process_layer=seq(
            VariationalDropout(0.2),
            FullyConnected(dim * 2, activation="relu"),
            VariationalDropout(0.2),
            HighwayLayer(recurrent_layer),
            VariationalDropout(0.2),
            HighwayLayer(recurrent_layer),
            VariationalDropout(0.2),
        ),
        debias_loss_fn=cli_utils.get_qa_loss_fn(args)
    )

    with open(__file__) as f:
        notes = f.read()

    py_utils.add_stdout_logger()

    trainer.train(dataset, model, args.output_dir, notes)

    if args.output_dir:
        logging.info("Evaluating...")
        eval_debiased_triviaqa_cp.show_scores(args.output_dir, args.dataset, ["dev", "test"])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--bias", choices=["indicator", "excluder", "dependent"], default="indicator")
    cli_utils.add_general_args(parser)
    cli_utils.add_loss_args(parser, default_penalty=None)
    args = parser.parse_args()

    if args.penalty is None:
        if args.bias == "indicator":
            args.penalty = 0.01
        else:
            args.penalty = 0.005

    dbg = args.debug

    if dbg:
        epoch_size = 200
    else:
        epoch_size = 6000

    opt = AdamOptimizer(max_grad_norm=5.0)
    batcher = QuantileBatcher(32, 10, 160, 4, 12)
    evaluator = Evaluator(mode="clf")

    trainer = Trainer(
        batcher, opt, evaluator,
        eval_batch_size=64, num_epochs=30, epoch_size=epoch_size,
        log_period=100, prefetch=5, loss_ema=0.999,
        n_processes=args.n_processes,
    )

    if args.bias == "indicator":
        bias_prob, i_prob = 0.8, None
    elif args.bias == "excluder":
        bias_prob, i_prob = 0.03, None
    elif args.bias == "dependent":
        bias_prob, i_prob = 0.9, 0.8
    else:
        raise RuntimeError()

    if dbg:
        dataset = MnliWithSyntheticBiasLoading(
            bias_prob, n_train_eval=200, n_train_sample=1000,
            n_dev_sample=200, indicator_noise=i_prob)
    else:
        dataset = MnliWithSyntheticBiasLoading(
            bias_prob, n_train_eval=10000, indicator_noise=i_prob)

    dim = 50 if dbg else 200
    recurrent_layer = CudnnLSTMRecurrentDropout(dim, 0.2)
    model = TextPairClfDebiasingModel(
        NltkAndPunctTokenizer(),
        WordAndCharEncoder(
            "glove.6B.50d" if dbg else "crawl-300d-2M",
            first_n=None,
            char_embed_dim=24,
            character_mapper=mseq(Dropout(0.1), Conv1d(100, 5, None)),
            character_pooler=MaxPooler(),
            word_length=30,
        ),
        map_embed=seq(
            VariationalDropout(0.2),
            recurrent_layer
        ),
        bifuse_layer=AttentionBiFuse(WeightedDot()),
        post_process_layer=seq(
            recurrent_layer,
            VariationalDropout(0.2),
        ),
        pool_layer=MaxPooler(),
        processs_joint=mseq(
            FullyConnected(100),
            Dropout(0.2)
        ),
        n_classes=3,
        debias_loss_fn=cli_utils.get_clf_loss_fn(args)
    )

    with open(__file__) as f:
        notes = f.read()

    py_utils.add_stdout_logger()

    trainer.train(dataset, model, args.output_dir, notes)

    if args.output_dir:
        logging.info("Evaluating...")
        show_scores(args.output_dir, args.bias, [False, True], n_processes=args.n_processes)
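# Example invocation of the synthetic-bias training entry point above (the script
# filename is an assumption; --bias is defined in main(), while --mode, --penalty,
# and --output_dir are expected to come from cli_utils.add_general_args/add_loss_args
# and may be spelled differently there):
#   python train_mnli_synthetic_bias.py --bias excluder --mode learned_mixin --output_dir /tmp/model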