Example #1
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        dl_dir = dl_manager.download_and_extract(self.config.data_url)

        splits_gen = []

        for split_id, split_filename in self.config.splits.items():
            if self.config.gameplay_scenario == "original":
                # Map the original gameplay split identifiers onto the
                # canonical train/validation/test splits.
                if "train" in split_id:
                    split_name = nlp.Split.TRAIN
                elif "valid" in split_id:
                    split_name = nlp.Split.VALIDATION
                elif "test" in split_id:
                    split_name = nlp.Split.TEST
            else:
                # Other gameplay scenarios keep their custom split identifiers.
                split_name = nlp.Split(split_id)

            full_split_name = "-".join(["compguesswhat", self.config.gameplay_scenario])
            splits_gen.append(
                nlp.SplitGenerator(
                    name=split_name,
                    gen_kwargs={
                        "filepath": os.path.join(dl_dir, full_split_name, self.VERSION.version_str, split_filename)
                    },
                )
            )

        return splits_gen
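
In the `nlp` library, the keys of `gen_kwargs` passed to each `nlp.SplitGenerator` must match the parameter names of the builder's `_generate_examples` method, which is called once per split. Below is a minimal sketch of such a counterpart for Example #1; in the real builder it is a method on the DatasetBuilder class, and the JSON layout assumed here (a top-level "games" list) is a hypothetical placeholder rather than the actual file format of this dataset.

import json


def _generate_examples(filepath):
    """Sketch of the counterpart to the SplitGenerators above.

    The parameter name must match the "filepath" key used in gen_kwargs;
    the "games" key is a hypothetical placeholder for the file layout.
    """
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    for idx, game in enumerate(data.get("games", [])):
        yield idx, game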
Example #2
 def _split_generators(self, dl_manager):
     # Re-use the filtered QA list if it was already computed and cached on
     # disk; otherwise download and filter the Reddit dumps and cache the result.
     qa_data_file = pjoin(
         self._cache_dir_root, self._relative_data_dir(with_version=False), "reddit_downloaded_qa_lists.json"
     )
     if isfile(qa_data_file):
         logging.info("loading pre-computed QA list")
         with open(qa_data_file) as f:
             self.filtered_reddit = json.load(f)
     else:
         self.filtered_reddit = _download_and_filter_reddit(
             dl_manager, start_year=2011, start_month=7, end_year=2019, end_month=7
         )
         logging.info("saving pre-computed QA list")
         with open(qa_data_file, "w") as f:
             json.dump(self.filtered_reddit, f)
     # download the data splits from AWS
     fpath_splits = dl_manager.download(self._DATA_SPLIT_URL)
     with open(fpath_splits) as f:
         self.data_split = json.load(f)
     return [
         nlp.SplitGenerator(
             name=nlp.Split("train_eli5"), gen_kwargs={"split": "train", "subreddit_name": "explainlikeimfive"},
         ),
         nlp.SplitGenerator(
             name=nlp.Split("validation_eli5"),
             gen_kwargs={"split": "validation", "subreddit_name": "explainlikeimfive"},
         ),
         nlp.SplitGenerator(
             name=nlp.Split("test_eli5"), gen_kwargs={"split": "test", "subreddit_name": "explainlikeimfive"},
         ),
         nlp.SplitGenerator(
             name=nlp.Split("train_asks"), gen_kwargs={"split": "train", "subreddit_name": "askscience"},
         ),
         nlp.SplitGenerator(
             name=nlp.Split("validation_asks"), gen_kwargs={"split": "validation", "subreddit_name": "askscience"},
         ),
         nlp.SplitGenerator(
             name=nlp.Split("test_asks"), gen_kwargs={"split": "test", "subreddit_name": "askscience"},
         ),
         nlp.SplitGenerator(
             name=nlp.Split("train_askh"), gen_kwargs={"split": "train", "subreddit_name": "AskHistorians"},
         ),
         nlp.SplitGenerator(
             name=nlp.Split("validation_askh"),
             gen_kwargs={"split": "validation", "subreddit_name": "AskHistorians"},
         ),
         nlp.SplitGenerator(
             name=nlp.Split("test_askh"), gen_kwargs={"split": "test", "subreddit_name": "AskHistorians"},
         ),
     ]
Example #3
 def _split_generators(self, dl_manager):
     # The IMDB archive extracts to an "aclImdb" folder; the unlabeled reviews
     # used for the "unsupervised" split live under the "train" directory.
     arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL)
     data_dir = os.path.join(arch_path, "aclImdb")
     return [
         nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"directory": os.path.join(data_dir, "train")}),
         nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"directory": os.path.join(data_dir, "test")}),
         nlp.SplitGenerator(
             name=nlp.Split("unsupervised"),
             gen_kwargs={"directory": os.path.join(data_dir, "train"), "labeled": False},
         ),
     ]
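
Custom split names created with `nlp.Split("...")`, such as "unsupervised" above, can be requested at load time just like the canonical splits. A small usage sketch, assuming this builder is the IMDB script that ships with the `nlp` library (otherwise pass the path to the local script instead of "imdb"):

import nlp

# Canonical split declared via nlp.Split.TRAIN in the builder above.
train_ds = nlp.load_dataset("imdb", split="train")

# Custom split declared via nlp.Split("unsupervised") in the builder above.
unsup_ds = nlp.load_dataset("imdb", split="unsupervised")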
Example #4
    def _split_generators(self, dl_manager):
        arch_path = dl_manager.download_and_extract(self.config.data_url)

        if "relations" in self.config.name:
            train_file = "train.csv"
            test_file = "test.csv"

            generators = []

            # One train/test SplitGenerator pair per cross-validation fold.
            for k in [1, 2, 3, 4]:
                folds_path = os.path.join(arch_path, 'folds', str(k))
                generators += [
                    nlp.SplitGenerator(name=get_train_split(k),
                                       gen_kwargs={
                                           'filepath':
                                           os.path.join(
                                               folds_path, train_file)
                                       }),
                    nlp.SplitGenerator(name=get_test_split(k),
                                       gen_kwargs={
                                           'filepath':
                                           os.path.join(folds_path, test_file)
                                       })
                ]
            return generators

        elif "docs" in self.config.name:
            # docs
            docs_file = os.path.join(arch_path, "docs.jsonl")

            return [
                nlp.SplitGenerator(name=nlp.Split('docs'),
                                   gen_kwargs={"filepath": docs_file}),
            ]
        else:
            raise ValueError()
Example #5
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     qanta_path = dl_manager.download_and_extract(_QANTA_URL)
     trick_path = dl_manager.download_and_extract(_TRICK_URL)
     return [
         nlp.SplitGenerator(
             name=nlp.Split("guesstrain"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "guesstrain",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split("buzztrain"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "buzztrain",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split("guessdev"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "guessdev",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split("buzzdev"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "buzzdev",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split("guesstest"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "guesstest",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split("buzztest"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "buzztest",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         nlp.SplitGenerator(
             name=nlp.Split("adversarial"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "adversarial",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
     ]
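
The seven SplitGenerators above differ only in the fold name, so the same list can be built with a comprehension. The following is a sketch of an equivalent formulation, not the original code; it relies on the same module-level `_QANTA_URL` and `_TRICK_URL` constants and config attributes as Example #5.

_FOLDS = ["guesstrain", "buzztrain", "guessdev", "buzzdev", "guesstest", "buzztest", "adversarial"]


def _split_generators(self, dl_manager):
    """Compact equivalent of the listing above (sketch only)."""
    qanta_path = dl_manager.download_and_extract(_QANTA_URL)
    trick_path = dl_manager.download_and_extract(_TRICK_URL)
    return [
        nlp.SplitGenerator(
            name=nlp.Split(fold),
            gen_kwargs={
                "qanta_filepath": qanta_path,
                "trick_filepath": trick_path,
                "fold": fold,
                "mode": self.config.mode,
                "char_skip": self.config.char_skip,
            },
        )
        for fold in _FOLDS
    ]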
Example #6
def get_test_split(k):
    return nlp.Split(f'fold_{k}_test')
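
`get_test_split` (and `get_train_split` in Example #7 below) simply wrap an f-string in `nlp.Split`, so the fold-specific splits declared in Example #4 can be requested either through these helpers or by their plain string names. A small usage sketch based on the helper defined above (the local script path in the comment is hypothetical):

split = get_test_split(3)   # equivalent to nlp.Split("fold_3_test")
print(split)                # expected to print: fold_3_test

# ds = nlp.load_dataset("./relations_dataset.py", name="relations", split="fold_3_test")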
Example #7
def get_train_split(k):
    return nlp.Split(f'fold_{k}_train')
def main():
    # Auto-environment
    env = get_env()

    parser = HfArgumentParser(
        (ModelArguments, TrainingArguments, ExperimentArguments))
    model_args, training_args, experiment_args = parser.parse_args_into_dataclasses(
    )

    # Adjust output directory with CV fold and model name
    training_args.output_dir = os.path.join(training_args.output_dir,
                                            str(experiment_args.cv_fold),
                                            model_args.get_model_name())

    # Model path from env
    if not os.path.exists(model_args.model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_args.model_name_or_path)):
        model_args.model_name_or_path = os.path.join(
            env['bert_dir'], model_args.model_name_or_path)

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Dataset args
    label_classes = get_label_classes_from_nlp_dataset(
        experiment_args.nlp_dataset)
    columns = ['input_ids', 'attention_mask', 'token_type_ids', 'labels']

    # Build dataset for splits
    train_ds = load_dataset(experiment_args.nlp_dataset,
                            name='relations',
                            cache_dir=experiment_args.nlp_cache_dir,
                            split=get_train_split(experiment_args.cv_fold))
    test_ds = load_dataset(experiment_args.nlp_dataset,
                           name='relations',
                           cache_dir=experiment_args.nlp_cache_dir,
                           split=get_test_split(experiment_args.cv_fold))
    docs_ds = load_dataset(experiment_args.nlp_dataset,
                           name='docs',
                           cache_dir=experiment_args.nlp_cache_dir,
                           split=nlp.Split('docs'))

    # Build ID => Doc mapping
    doc_id2doc = {doc[experiment_args.doc_id_col]: doc for doc in docs_ds}

    if model_args.model_name_or_path.startswith('baseline-rnn'):
        # Load Spacy as tokenizer
        spacy_nlp = spacy.load(experiment_args.spacy_model,
                               disable=["tagger", "ner", "textcat"])

        # Baseline RNN model
        model = RNNForMultiLabelSequenceClassification(
            word_vectors=get_vectors_from_spacy_model(spacy_nlp),
            hidden_size=experiment_args.rnn_hidden_size,
            rnn=experiment_args.rnn_type,
            num_labels=len(label_classes),
            num_layers=experiment_args.rnn_num_layers,
            dropout=experiment_args.rnn_dropout,
        )
        tokenizer = None

    else:
        # Load pretrained Transformers models and tokenizers
        model_config = AutoConfig.from_pretrained(
            model_args.model_name_or_path,
            num_labels=len(label_classes),
            cache_dir=model_args.cache_dir)

        # No need for spacy
        spacy_nlp = None

        if 'longformer' in model_args.model_name_or_path:
            # TVM: a custom CUDA kernel implementation of our sliding window attention (works only on GPU)
            model_config.attention_mode = 'tvm'

            # override tokenizer name if not set
            if model_args.tokenizer_name is None:
                roberta_path = os.path.join(env['bert_dir'], 'roberta-base')
                model_args.tokenizer_name = roberta_path if os.path.exists(
                    roberta_path) else 'roberta-base'

                logger.info(
                    f'Overriding tokenizer: {model_args.tokenizer_name}')

            # override max length
            experiment_args.max_length = 4096

        model = AutoModelForMultiLabelSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            config=model_config,
            cache_dir=model_args.cache_dir)
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        # Set token limit if defined by model (for Longformer)
        if model.config.max_position_embeddings > 0:
            tokenizer.model_max_length = model.config.max_position_embeddings

    # Init helper
    dpt = DocRelTrainerHelper(
        id2doc=doc_id2doc,
        transformers_tokenizer=tokenizer,
        spacy_nlp=spacy_nlp,
        label_classes=label_classes,
        doc_a_col=experiment_args.doc_a_col,
        doc_b_col=experiment_args.doc_b_col,
        label_col=experiment_args.label_col,
        text_from_doc_func=get_non_empty_text_from_doc,
        classification_threshold=experiment_args.classification_threshold,
        max_length=experiment_args.max_length,
    )

    logger.info('Converting to features (doc mapping, tokenize, ...)')

    # Build hash from settings for caching
    data_settings_hash = hashlib.md5(
        dataclasses.asdict(experiment_args).__str__().encode("utf-8") +
        dataclasses.asdict(model_args).__str__().encode("utf-8")).hexdigest()

    train_ds = train_ds.map(dpt.convert_to_features,
                            batched=True,
                            load_from_cache_file=True,
                            cache_file_name=os.path.join(
                                experiment_args.nlp_cache_dir, "cache-train-" +
                                data_settings_hash + ".arrow"))
    train_ds.set_format(type='torch', columns=columns)

    test_ds = test_ds.map(dpt.convert_to_features,
                          batched=True,
                          load_from_cache_file=True,
                          cache_file_name=os.path.join(
                              experiment_args.nlp_cache_dir,
                              "cache-test-" + data_settings_hash + ".arrow"))
    test_ds.set_format(type='torch', columns=columns)

    # Load model weights from disk (when predicting without training)
    model_weights_path = os.path.join(training_args.output_dir,
                                      'pytorch_model.bin')

    if not training_args.do_train and experiment_args.save_predictions:
        logger.info(
            f'Loading existing model weights from disk: {model_weights_path}')
        if os.path.exists(model_weights_path):
            model.load_state_dict(torch.load(model_weights_path))
        else:
            logger.error('Weights file does not exist!')

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=DocRelDataCollator(),
        prediction_loss_only=False,
        compute_metrics=dpt.compute_metrics,
    )

    # Log additional config to Weights & Biases
    if is_wandb_available():
        wandb.config.update(dataclasses.asdict(experiment_args))
        wandb.config.update(dataclasses.asdict(model_args))

    if training_args.do_train:
        logger.info('Training started...')

        trainer.train()

        if isinstance(model, PreTrainedModel):
            trainer.save_model()
        elif isinstance(model, nn.Module):  # RNN model
            torch.save(model.state_dict(), model_weights_path)

    if experiment_args.save_predictions:
        logger.info('Predicting...')

        predictions = trainer.predict(test_ds)

        df = dpt.get_df_from_predictions(test_ds,
                                         docs_ds,
                                         predictions,
                                         exclude_columns=['abstract'])

        # Save results to disk
        df.to_csv(os.path.join(training_args.output_dir, 'results.csv'),
                  index=False)
        json.dump(
            predictions.metrics,
            open(os.path.join(training_args.output_dir, 'metrics.json'), 'w'))

    logger.info('Done')