def __init__(self, gpu):
    # Delegate to the base indexer with fixed settings: the lowercase check
    # and digit zeroing are disabled, no pad/unk tokens are used, and no
    # pretrained embeddings are loaded.
    SeqIndexerBase.__init__(self,
                            gpu=gpu,
                            check_for_lowercase=False,
                            zero_digits=False,
                            pad=None,
                            unk=None,
                            load_embeddings=False,
                            verbose=True)
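For orientation, here is a minimal sketch of the item-to-index bookkeeping an indexer like this maintains when it builds its vocabulary from data rather than from an embeddings file. ToyIndexer and its methods are illustrative assumptions, not the library's SeqIndexerBase API.

class ToyIndexer:
    """Illustrative stand-in: maps items to integer indices and back."""
    def __init__(self, pad=None, unk=None):
        self.item2idx, self.idx2item = {}, []
        # Reserve slots for special tokens only when they are requested;
        # the constructor above passes pad=None and unk=None, so none are added.
        for special in (pad, unk):
            if special is not None:
                self.add_item(special)

    def add_item(self, item):
        if item not in self.item2idx:
            self.item2idx[item] = len(self.idx2item)
            self.idx2item.append(item)
        return self.item2idx[item]

    def get_item(self, idx):
        return self.idx2item[idx]

toy = ToyIndexer()
indices = [toy.add_item(w) for w in ['the', 'cat', 'the']]
print(indices)                              # [0, 1, 0]
print([toy.get_item(i) for i in indices])   # ['the', 'cat', 'the']

With pad=None and unk=None, as in the constructor above, no indices are reserved for padding or unknown items.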
def __init__(self,
             gpu,
             check_for_lowercase,
             zero_digits,
             pad,
             unk,
             load_embeddings,
             embeddings_dim,
             verbose,
             isElmo=False,
             isBert=False):
    # Forward all settings to the base indexer, including the
    # isElmo / isBert flags that select contextual embedders.
    SeqIndexerBase.__init__(self, gpu, check_for_lowercase, zero_digits,
                            pad, unk, load_embeddings, embeddings_dim,
                            verbose, isElmo, isBert)
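The explicit SeqIndexerBase.__init__(self, ...) call above works, but the same delegation is usually written with super() in current Python. A minimal sketch, assuming SeqIndexerBase is imported as in the surrounding snippets; the subclass name is a hypothetical placeholder:

class WordOrTagIndexer(SeqIndexerBase):  # hypothetical subclass name
    def __init__(self, gpu, check_for_lowercase, zero_digits, pad, unk,
                 load_embeddings, embeddings_dim, verbose,
                 isElmo=False, isBert=False):
        # Same delegation as above, via super(); behaviour is unchanged.
        super().__init__(gpu, check_for_lowercase, zero_digits, pad, unk,
                         load_embeddings, embeddings_dim, verbose,
                         isElmo, isBert)

Behaviour is identical; super() simply avoids repeating the base-class name.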
Example #3
    data_io = DataIOFactory.create(args)
    (word_sequences_train, tag_sequences_train,
     word_sequences_dev, tag_sequences_dev,
     word_sequences_test, tag_sequences_test) = data_io.read_train_dev_test(args)
    # DatasetsBank stores the different dataset subsets (train/dev/test) and samples batches from them
    datasets_bank = DatasetsBankFactory.create(args)
    datasets_bank.add_train_sequences(word_sequences_train,
                                      tag_sequences_train)
    datasets_bank.add_dev_sequences(word_sequences_dev, tag_sequences_dev)
    datasets_bank.add_test_sequences(word_sequences_test, tag_sequences_test)
    # Word_seq_indexer converts lists of lists of words to lists of lists of integer indices and back
    if args.word_seq_indexer is not None and isfile(args.word_seq_indexer):
        word_seq_indexer = torch.load(args.word_seq_indexer)

    # If we use ELMo, it is not necessary to use a custom word_seq_indexer. TODO: hook ELMo up to the seq_indexer class
    elif args.isElmo:
        word_seq_indexer = SeqIndexerBase()
    else:
        word_seq_indexer = SeqIndexerWord(
            gpu=args.gpu,
            check_for_lowercase=args.check_for_lowercase,
            embeddings_dim=args.emb_dim,
            verbose=True)
        word_seq_indexer.load_items_from_embeddings_file_and_unique_words_list(
            emb_fn=args.emb_fn,
            emb_delimiter=args.emb_delimiter,
            emb_load_all=args.emb_load_all,
            unique_words_list=datasets_bank.unique_words_list)
    if args.word_seq_indexer is not None and not isfile(args.word_seq_indexer):
        torch.save(word_seq_indexer, args.word_seq_indexer)
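    # Note on the caching above (illustrative comment, not from the original
    # snippet): torch.save / torch.load pickle the whole indexer object, so a
    # vocabulary built once from args.emb_fn is reused on later runs whenever
    # args.word_seq_indexer points at an existing file.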
    # Tag_seq_indexer converts lists of lists of tags to lists of lists of integer indices and back
    tag_seq_indexer = SeqIndexerTag(gpu=args.gpu)