def _split_generators(self, dl_manager): """Returns SplitGenerators.""" dl_dir = dl_manager.download_and_extract(self.config.data_url) splits_gen = [] for split_id, split_filename in self.config.splits.items(): if self.config.gameplay_scenario == "original": if "train" in split_id: split_name = nlp.Split.TRAIN elif "valid" in split_id: split_name = nlp.Split.VALIDATION elif "test" in split_id: split_name = nlp.Split.TEST else: split_name = nlp.Split(split_id) full_split_name = "-".join(["compguesswhat", self.config.gameplay_scenario]) splits_gen.append( nlp.SplitGenerator( name=split_name, gen_kwargs={ "filepath": os.path.join(dl_dir, full_split_name, self.VERSION.version_str, split_filename) }, ) ) return splits_gen
def _split_generators(self, dl_manager):
    qa_data_file = pjoin(
        self._cache_dir_root, self._relative_data_dir(with_version=False), "reddit_downloaded_qa_lists.json"
    )
    if isfile(qa_data_file):
        logging.info("loading pre-computed QA list")
        self.filtered_reddit = json.load(open(qa_data_file))
    else:
        self.filtered_reddit = _download_and_filter_reddit(
            dl_manager, start_year=2011, start_month=7, end_year=2019, end_month=7
        )
        logging.info("saving pre-computed QA list")
        json.dump(self.filtered_reddit, open(qa_data_file, "w"))

    # download data splits from AWS
    fpath_splits = dl_manager.download(self._DATA_SPLIT_URL)
    self.data_split = json.load(open(fpath_splits))

    return [
        nlp.SplitGenerator(
            name=nlp.Split("train_eli5"),
            gen_kwargs={"split": "train", "subreddit_name": "explainlikeimfive"},
        ),
        nlp.SplitGenerator(
            name=nlp.Split("validation_eli5"),
            gen_kwargs={"split": "validation", "subreddit_name": "explainlikeimfive"},
        ),
        nlp.SplitGenerator(
            name=nlp.Split("test_eli5"),
            gen_kwargs={"split": "test", "subreddit_name": "explainlikeimfive"},
        ),
        nlp.SplitGenerator(
            name=nlp.Split("train_asks"),
            gen_kwargs={"split": "train", "subreddit_name": "askscience"},
        ),
        nlp.SplitGenerator(
            name=nlp.Split("validation_asks"),
            gen_kwargs={"split": "validation", "subreddit_name": "askscience"},
        ),
        nlp.SplitGenerator(
            name=nlp.Split("test_asks"),
            gen_kwargs={"split": "test", "subreddit_name": "askscience"},
        ),
        nlp.SplitGenerator(
            name=nlp.Split("train_askh"),
            gen_kwargs={"split": "train", "subreddit_name": "AskHistorians"},
        ),
        nlp.SplitGenerator(
            name=nlp.Split("validation_askh"),
            gen_kwargs={"split": "validation", "subreddit_name": "AskHistorians"},
        ),
        nlp.SplitGenerator(
            name=nlp.Split("test_askh"),
            gen_kwargs={"split": "test", "subreddit_name": "AskHistorians"},
        ),
    ]
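# Usage sketch (assumption: the script above is registered as the "eli5" dataset). Because every
# subreddit/split combination is declared as an explicit nlp.Split, each one is loadable by name.
import nlp

eli5_train = nlp.load_dataset("eli5", split="train_eli5")
askh_valid = nlp.load_dataset("eli5", split="validation_askh")
print(eli5_train.num_rows, askh_valid.num_rows)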
def _split_generators(self, dl_manager):
    arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL)
    data_dir = os.path.join(arch_path, "aclImdb")
    return [
        nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"directory": os.path.join(data_dir, "train")}),
        nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"directory": os.path.join(data_dir, "test")}),
        nlp.SplitGenerator(
            name=nlp.Split("unsupervised"),
            gen_kwargs={"directory": os.path.join(data_dir, "train"), "labeled": False},
        ),
    ]
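# Usage sketch: the custom "unsupervised" split sits next to the standard train/test splits and is
# requested the same way; split strings can also be combined (e.g. "train+unsupervised").
import nlp

imdb_train = nlp.load_dataset("imdb", split="train")
imdb_unsup = nlp.load_dataset("imdb", split="unsupervised")
imdb_all_text = nlp.load_dataset("imdb", split="train+unsupervised")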
def _split_generators(self, dl_manager):
    arch_path = dl_manager.download_and_extract(self.config.data_url)

    if "relations" in self.config.name:
        train_file = "train.csv"
        test_file = "test.csv"
        generators = []

        for k in [1, 2, 3, 4]:
            folds_path = os.path.join(arch_path, 'folds', str(k))
            generators += [
                nlp.SplitGenerator(
                    name=get_train_split(k),
                    gen_kwargs={'filepath': os.path.join(folds_path, train_file)},
                ),
                nlp.SplitGenerator(
                    name=get_test_split(k),
                    gen_kwargs={'filepath': os.path.join(folds_path, test_file)},
                ),
            ]

        return generators

    elif "docs" in self.config.name:
        # docs
        docs_file = os.path.join(arch_path, "docs.jsonl")

        return [
            nlp.SplitGenerator(name=nlp.Split('docs'), gen_kwargs={"filepath": docs_file}),
        ]
    else:
        raise ValueError()
def _split_generators(self, dl_manager): """Returns SplitGenerators.""" qanta_path = dl_manager.download_and_extract(_QANTA_URL) trick_path = dl_manager.download_and_extract(_TRICK_URL) return [ nlp.SplitGenerator( name=nlp.Split("guesstrain"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "guesstrain", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), nlp.SplitGenerator( name=nlp.Split("buzztrain"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "buzztrain", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), nlp.SplitGenerator( name=nlp.Split("guessdev"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "guessdev", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), nlp.SplitGenerator( name=nlp.Split("buzzdev"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "buzzdev", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), nlp.SplitGenerator( name=nlp.Split("guesstest"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "guesstest", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), nlp.SplitGenerator( name=nlp.Split("buzztest"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "buzztest", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), nlp.SplitGenerator( name=nlp.Split("adversarial"), gen_kwargs={ "qanta_filepath": qanta_path, "trick_filepath": trick_path, "fold": "adversarial", "mode": self.config.mode, "char_skip": self.config.char_skip, }, ), ]
def get_test_split(k):
    return nlp.Split(f'fold_{k}_test')
def get_train_split(k):
    return nlp.Split(f'fold_{k}_train')
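# These helpers keep the loading script and the training code in agreement on the per-fold split
# names, e.g. fold 2 maps to nlp.Split("fold_2_train") / nlp.Split("fold_2_test"); main() below
# passes them straight to load_dataset(..., split=...).
for k in [1, 2, 3, 4]:
    print(get_train_split(k), get_test_split(k))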
def main():
    # Auto-environment
    env = get_env()

    parser = HfArgumentParser((ModelArguments, TrainingArguments, ExperimentArguments))
    model_args, training_args, experiment_args = parser.parse_args_into_dataclasses()

    # Adjust output with folds and model name
    training_args.output_dir = os.path.join(
        training_args.output_dir, str(experiment_args.cv_fold), model_args.get_model_name()
    )

    # Model path from env
    if not os.path.exists(model_args.model_name_or_path) and os.path.exists(
        os.path.join(env['bert_dir'], model_args.model_name_or_path)
    ):
        model_args.model_name_or_path = os.path.join(env['bert_dir'], model_args.model_name_or_path)

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Dataset args
    label_classes = get_label_classes_from_nlp_dataset(experiment_args.nlp_dataset)
    columns = ['input_ids', 'attention_mask', 'token_type_ids', 'labels']

    # Build dataset for splits
    train_ds = load_dataset(
        experiment_args.nlp_dataset,
        name='relations',
        cache_dir=experiment_args.nlp_cache_dir,
        split=get_train_split(experiment_args.cv_fold),
    )
    test_ds = load_dataset(
        experiment_args.nlp_dataset,
        name='relations',
        cache_dir=experiment_args.nlp_cache_dir,
        split=get_test_split(experiment_args.cv_fold),
    )
    docs_ds = load_dataset(
        experiment_args.nlp_dataset,
        name='docs',
        cache_dir=experiment_args.nlp_cache_dir,
        split=nlp.Split('docs'),
    )

    # Build ID => Doc mapping
    doc_id2doc = {doc[experiment_args.doc_id_col]: doc for doc in docs_ds}

    if model_args.model_name_or_path.startswith('baseline-rnn'):
        # Load Spacy as tokenizer
        spacy_nlp = spacy.load(experiment_args.spacy_model, disable=["tagger", "ner", "textcat"])

        # Baseline models
        model = RNNForMultiLabelSequenceClassification(
            word_vectors=get_vectors_from_spacy_model(spacy_nlp),
            hidden_size=experiment_args.rnn_hidden_size,
            rnn=experiment_args.rnn_type,
            num_labels=len(label_classes),
            num_layers=experiment_args.rnn_num_layers,
            dropout=experiment_args.rnn_dropout,
        )
        tokenizer = None
    else:
        # Load pretrained Transformers models and tokenizers
        model_config = AutoConfig.from_pretrained(
            model_args.model_name_or_path,
            num_labels=len(label_classes),
            cache_dir=model_args.cache_dir,
        )

        # No need for spacy
        spacy_nlp = None

        if 'longformer' in model_args.model_name_or_path:
            # TVM: a custom CUDA kernel implementation of our sliding window attention (works only on GPU)
            model_config.attention_mode = 'tvm'

            # Override tokenizer name if not set
            if model_args.tokenizer_name is None:
                roberta_path = os.path.join(env['bert_dir'], 'roberta-base')
                model_args.tokenizer_name = roberta_path if os.path.exists(roberta_path) else 'roberta-base'
                logger.info(f'Overriding tokenizer: {model_args.tokenizer_name}')

            # Override max length
            experiment_args.max_length = 4096

        model = AutoModelForMultiLabelSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            config=model_config,
            cache_dir=model_args.cache_dir,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        # Set token limit if defined by model (for Longformer)
        if model.config.max_position_embeddings > 0:
            tokenizer.model_max_length = model.config.max_position_embeddings

    # Init helper
    dpt = DocRelTrainerHelper(
        id2doc=doc_id2doc,
        transformers_tokenizer=tokenizer,
        spacy_nlp=spacy_nlp,
        label_classes=label_classes,
        doc_a_col=experiment_args.doc_a_col,
        doc_b_col=experiment_args.doc_b_col,
        label_col=experiment_args.label_col,
        text_from_doc_func=get_non_empty_text_from_doc,
        classification_threshold=experiment_args.classification_threshold,
        max_length=experiment_args.max_length,
    )

    logger.info('Converting to features (doc mapping, tokenize, ...)')

    # Build hash from settings for caching
    data_settings_hash = hashlib.md5(
        dataclasses.asdict(experiment_args).__str__().encode("utf-8")
        + dataclasses.asdict(model_args).__str__().encode("utf-8")
    ).hexdigest()

    train_ds = train_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        cache_file_name=os.path.join(experiment_args.nlp_cache_dir, "cache-train-" + data_settings_hash + ".arrow"),
    )
    train_ds.set_format(type='torch', columns=columns)

    test_ds = test_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        cache_file_name=os.path.join(experiment_args.nlp_cache_dir, "cache-test-" + data_settings_hash + ".arrow"),
    )
    test_ds.set_format(type='torch', columns=columns)

    # Load model weights (when not training but saving predictions)
    model_weights_path = os.path.join(training_args.output_dir, 'pytorch_model.bin')

    if not training_args.do_train and experiment_args.save_predictions:
        logger.info(f'Loading existing model weights from disk: {model_weights_path}')
        if os.path.exists(model_weights_path):
            model.load_state_dict(torch.load(model_weights_path))
        else:
            logger.error('Weights file does not exist!')

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=DocRelDataCollator(),
        prediction_loss_only=False,
        compute_metrics=dpt.compute_metrics,
    )

    # Log additional config to Weights & Biases
    if is_wandb_available():
        wandb.config.update(dataclasses.asdict(experiment_args))
        wandb.config.update(dataclasses.asdict(model_args))

    if training_args.do_train:
        logger.info('Training started...')
        trainer.train()

        if isinstance(model, PreTrainedModel):
            trainer.save_model()
        elif isinstance(model, nn.Module):
            # RNN model
            torch.save(model.state_dict(), model_weights_path)

    if experiment_args.save_predictions:
        logger.info('Predicting...')
        predictions = trainer.predict(test_ds)

        df = dpt.get_df_from_predictions(test_ds, docs_ds, predictions, exclude_columns=['abstract'])

        # Save results to disk
        df.to_csv(os.path.join(training_args.output_dir, 'results.csv'), index=False)
        json.dump(predictions.metrics, open(os.path.join(training_args.output_dir, 'metrics.json'), 'w'))

    logger.info('Done')