Example #1
def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            " you are using args.tokenizer = {args.tokenizer}")

    if input_module_uses_transformers(args.input_module):
        assert not indexers, (
            "transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization."
        )
        assert args.tokenizer == args.input_module, (
            "transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model.")
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
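A minimal usage sketch for the example above. The args object is a hypothetical stand-in for jiant's parsed config (built with SimpleNamespace for brevity); only the attributes that build_indexers reads are set, and the expected keys assume a GloVe-style input_module with character embeddings enabled.

from types import SimpleNamespace

# Hypothetical config stand-in; in jiant these values would come from the parsed config.
args = SimpleNamespace(
    input_module="glove",        # plain word embeddings -> "words" indexer
    tokenizer="MosesTokenizer",  # satisfies the ELMo/CoVe tokenizer checks above
    char_embs=True,              # also request a character-level indexer
    cove=False,
)

indexers = build_indexers(args)
print(sorted(indexers))  # expected under these assumptions: ['chars', 'words']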
Example #2
def select_tokenizer(args):
    """
        Select a sane default tokenizer.
    """
    if args.tokenizer == "auto":
        if input_module_uses_transformers(args.input_module):
            tokenizer_name = args.input_module
        else:
            tokenizer_name = "MosesTokenizer"
    else:
        tokenizer_name = args.tokenizer
    return tokenizer_name
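A short illustration of the fallback logic: with tokenizer="auto" the name resolves to the input module for transformers models and to MosesTokenizer otherwise. This sketch assumes input_module_uses_transformers recognizes "bert-base-uncased" as a transformers module; the config objects are hypothetical stand-ins.

from types import SimpleNamespace

# Hypothetical configs; the attribute names mirror what select_tokenizer reads.
auto_bert = SimpleNamespace(input_module="bert-base-uncased", tokenizer="auto")
auto_glove = SimpleNamespace(input_module="glove", tokenizer="auto")
explicit = SimpleNamespace(input_module="glove", tokenizer="MosesTokenizer")

print(select_tokenizer(auto_bert))   # "bert-base-uncased" -- tokenizer follows the model
print(select_tokenizer(auto_glove))  # "MosesTokenizer" -- default for non-transformers modules
print(select_tokenizer(explicit))    # "MosesTokenizer" -- explicit setting is kept as-is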
Example #3
def _build_vocab(args: config.Params, tasks: List[Task], vocab_path: str):
    """Build vocabulary from scratch

    Read data from all tasks into namespaces, optionally add special vocab items, and save
    vocabulary file.

    Note
    ----
    Task-specific target vocabulary should be counted in the task object
    and provided via `task.all_labels()`. The namespace should be task-specific,
    i.e. not something generic like "targets".

    Parameters
    ----------
    args : config.Params
        config map
    tasks : List[Task]
        list of Task from which to build vocab
    vocab_path : str
        vocab file save path

    """
    log.info("\tBuilding vocab from scratch.")
    max_v_sizes = {"word": args.max_word_v_size, "char": args.max_char_v_size}
    word2freq, char2freq = get_words(tasks)
    vocab = get_vocab(word2freq, char2freq, max_v_sizes)
    for task in tasks:  # add custom label namespaces
        # TODO: surface more docs for add_task_label_vocab (a hedged sketch follows this example):
        add_task_label_vocab(vocab, task)

    if args.force_include_wsj_vocabulary:
        # Add WSJ full vocabulary for PTB F1 parsing tasks.
        add_wsj_vocab(vocab, args.data_dir)
    if input_module_uses_transformers(args.input_module):
        # Add pre-computed vocabulary of corresponding tokenizer for transformers models.
        add_transformers_vocab(vocab, args.tokenizer)

    vocab.save_to_files(vocab_path)
    log.info("\tSaved vocab to %s", vocab_path)