Example #1
def build_corpus(args):
    logger = logging.getLogger(args.logger_name)
    cls_add_idx = 20000 if args.mode != "base" else None

    # Load tokenizer
    old = "_old" if args.mode == "base" else ""
    tokenizer_path = os.path.join(args.data_dir, f"tokenizer{old}")
    if os.path.exists(os.path.join(tokenizer_path, "tokenizer_config.json")) and not args.rebuild_corpus:
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_path)
    else:
        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        tokenizer = add_cls_token_to_tokenizer(tokenizer, args.logger_name, add_idx=cls_add_idx)
        tokenizer.save_pretrained(tokenizer_path)

    assert tokenizer.cls_token_id == cls_add_idx or tokenizer.cls_token_id == len(tokenizer) - 1

    if args.mode == "moses":
        postfix = "_moses"
    elif args.mode != "base":
        postfix = "_move"
    else:
        postfix = ""
    cached_file = os.path.join(args.data_dir, f"data_cache_{args.corpus_pct}pct{postfix}.pt")
    if os.path.exists(cached_file) and not args.rebuild_corpus:
        # Load cached dataset
        logger.info(f"Loading cached dataset {cached_file}")
        corpus = torch.load(cached_file)
        if getattr(corpus, "use_moses", -1) == -1:
            corpus.set_use_moses(args.mode == "moses")
        for split in Splits:
            if len(corpus.data[split]) == 0:
                logger.warning(f"The split {split} does not contain data!")
    else:
        # Load data
        logger.info(f"Selecting {args.corpus_pct}% of the CNN/DM dataset as corpus")
        cnndm_train = get_cnndm_dataset(Splits.TRAIN, args.corpus_pct)
        cnndm_valid = get_cnndm_dataset(Splits.VALID, args.corpus_pct)
        cnndm_test = get_cnndm_dataset(Splits.TEST, args.corpus_pct)

        logger.info("=" * 100)

        # Build corpus
        corpus = Corpus(tokenizer, args.mode == "moses")
        start_time = time.time()
        corpus.encode_dataset(cnndm_train, split=Splits.TRAIN)
        corpus.encode_dataset(cnndm_valid, split=Splits.VALID)
        corpus.encode_dataset(cnndm_test, split=Splits.TEST)
        elapsed = time.time() - start_time
        logger.info(f"Elapsed time for encoding {(elapsed / 60):5.2f} min")

        logger.info(f"Saving corpus to '{cached_file}'")
        torch.save(corpus, cached_file)

    corpus.tokenizer = tokenizer
    assert corpus.use_moses == (args.mode == "moses")

    return corpus
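Example #2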
def tokenize_samples(genes):
    k = len(genes[0][0])
    if k == 2:
        kmer_filepath = '/Users/camillo_stuff/Downloads/fourmersXL.txt'
    elif k == 6:
        kmer_filepath = '/Users/camillo_stuff/Downloads/hexamersXL.txt'
    else:
        raise ValueError(f"No k-mer vocabulary file available for k={k}")

    tokenizer = TransfoXLTokenizer(vocab_file=kmer_filepath)
    print("TOKENIZER LENGTH", len(tokenizer))
    seq_ids = [tokenizer.convert_tokens_to_ids(gene) for gene in genes]
    return seq_ids
Example #3
    def __init__(self):
        super().__init__()
        self.tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103",
                                                            eos_token='<eos>')
        self.tokenizer.add_special_tokens({'bos_token': '<sos>'})
        self.model = TransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")
        self.softmax = nn.Softmax(dim=0)
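Example #4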
    def test_transfoxl(self):
        for tokenizer_name in TransfoXLTokenizer.pretrained_vocab_files_map["pretrained_vocab_file"].keys():
            tokenizer_p = TransfoXLTokenizer.from_pretrained(tokenizer_name)
            tokenizer_r = TransfoXLTokenizerFast.from_pretrained(tokenizer_name)

            # Check we have the same number of added_tokens for both pair and non-pair inputs.
            self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False))
            self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True))

            # Check we have the correct max_length for both pair and non-pair inputs.
            self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
            self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)

            # Assert the set of special tokens match.
            self.assertSequenceEqual(
                tokenizer_p.special_tokens_map.items(),
                tokenizer_r.special_tokens_map.items(),
                "TransfoXL tokenizers doesn't have the same set of special_tokens",
            )

            # Assure tokenization overlap between python and rust impl.
            self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0)

            # Ensure add_tokens and add_special_tokens return the correct vocab size
            self.assert_add_tokens(tokenizer_r)

            # Check for offsets mapping
            self.assert_offsets_mapping(tokenizer_r)

            # Check for dynamic encoding sequence handling in batch_encode_plus
            self.assertRaises(ValueError, self.assert_batch_encode_dynamic_overflowing, tokenizer_r)

            # Check alignment for build_inputs_with_special_tokens
            self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)
Example #5
    def __init__(self):
        super(Model, self).__init__()
        self.config = TransfoXLConfig(
            vocab_size_or_config_json_file=len(vocab) + 267735,
            n_heads=8,
            n_layers=9)
        self.model = TransfoXLModel(self.config)
        self.tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        self.out_layer = torch.nn.Linear(self.model.d_model, 2)
Example #6
    def __init__(self, device='cpu'):
        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
        model = model.to(device)

        self.tokenizer = tokenizer
        self.model = model.eval()
        self.device = device

        self.NUM_CLASSES = 267735
Example #7
def get_IEMOCAP_loaders_transfo_xl(dataset_name='IEMOCAP', batch_size=32, num_workers=0, pin_memory=False, args=None):
    tokenizer = TransfoXLTokenizer.from_pretrained(args.home_dir + args.bert_tokenizer_dir)
    print('building vocab.. ')
    speaker_vocab, label_vocab, person_vec = load_vocab(dataset_name)
    train_data, dev_data, test_data = read_datas(dataset_name, batch_size)
    print('building datasets..')
    trainsets = [IEMOCAPDataset_transfo_xl(d,  speaker_vocab, label_vocab, args, tokenizer) for d in train_data]
    devsets = [IEMOCAPDataset_transfo_xl(d, speaker_vocab, label_vocab, args, tokenizer) for d in dev_data]
    testsets = [IEMOCAPDataset_transfo_xl(d, speaker_vocab, label_vocab, args, tokenizer) for d in test_data]

    return trainsets, devsets, testsets, speaker_vocab, label_vocab, person_vec
Example #8
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if tokenizer_name.startswith(
            "bert-"
    ) or 'rubert' in tokenizer_name or '/bert-' in tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith(
            "roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
Example #9
def run_TFXL_RSA(stim_file, layer, header=False, filter_file=None):

    EXP = data.Stim(stim_file, header, filter_file, VOCAB_FILE)

    #Get tokenizer
    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')

    #Load model
    model = TransfoXLModel.from_pretrained(
        'transfo-xl-wt103', output_hidden_states=True)  #, force_download=True)
    # clear gradients (the model is only used for inference here)
    model.zero_grad()

    for x in range(len(EXP.SENTS)):
        sentences = list(EXP.SENTS[x])
        target = sentences[0]
        sentence = sentences[1]

        #GET BASELINE
        target_encoded = tokenizer.encode(target, add_special_tokens=True)
        target_input_ids = torch.tensor(target_encoded).unsqueeze(0)

        #Get model outputs
        output = model(target_input_ids)
        predictions, mems, hidden_states = output

        hidden_states = hidden_states[1:]

        baseline = hidden_states[layer][0][-1].data.cpu().squeeze()

        #GET SIMs
        sims = get_TFXL_sims(sentence, layer, baseline, tokenizer, model)
        values = get_dummy_values(sentence)

        EXP.load_IT('tfxl', x, values, False, sims)

    return EXP
Example #10
def setup_transfo_xl(model_name):
    def _fix_tokenizer_encoding(tokenizer):
        import collections
        if '–' not in tokenizer.sym2idx:
            tokenizer.idx2sym = [sym.encode('latin1').decode(
                'utf-8') for sym in tokenizer.idx2sym]
            tokenizer.sym2idx = collections.OrderedDict((sym.encode('latin1').decode('utf-8'), idx)
                                                        for sym, idx in tokenizer.sym2idx.items())
        else:
            logger.info("No need to fix tokenizer encoding")
        return tokenizer

    model = TransfoXLLMHeadModel.from_pretrained(model_name)
    tokenizer = TransfoXLTokenizer.from_pretrained(model_name)
    tokenizer = _fix_tokenizer_encoding(tokenizer)

    def encode(lines):
        # TODO: tokenize is removing the empty lines and add_eos is not being added.
        # TODO2: tokenize in transformers xl does not handle multiple lines correctly (removes <eos>)
        return tokenizer.convert_tokens_to_ids(
            [tok for l in lines for tok in tokenizer._tokenize(l.strip(), add_eos=True)])
    tokenizer.encode = encode
    
    return model, tokenizer
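Example #11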
XLM = ModelInfo(
    XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024',
                                       return_dict=True),
    XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM")

T5 = ModelInfo(
    T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True),
    T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5")

Albert = ModelInfo(
    AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True),
    AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert")

TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'),
                TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'), "_",
                vocab, "TXL")

if __name__ == "__main__":

    sentences = [sample_sentences("sentences4lara.txt") for i in range(11)]

    sent_dict = dict(zip([str(x) for x in range(1, 11)], sentences))

    sentence = sent_dict[sys.argv[2]]

    batch_size = 100
    convergence_criterion = int(sys.argv[4])
    model_list = [GPT2, Roberta, Albert, XLM, T5]
    max_length = 8
    top_k = 25
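Example #12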
def main():
    print('start of main')
    parser = argparse.ArgumentParser(
        description='''This script computes probabilities for a masked token
                         with words from the words file, and
                         stores result in csv format to the output file ''')

    parser.add_argument("-s",
                        type=str,
                        required=True,
                        dest="sent_type",
                        help='class name: "sv_agreement" or "anaphora"')
    parser.add_argument("-t",
                        type=str,
                        required=True,
                        dest="template",
                        help='template name (see templates.txt)')
    parser.add_argument("-g",
                        type=int,
                        required=False,
                        default=None,
                        dest="gpu_num",
                        help='which gpu to run this on')
    parser.add_argument("-m",
                        type=str,
                        required=False,
                        default='transfo-xl-wt103',
                        dest="model_path_or_name",
                        help='path to the model or name of the model')

    args = parser.parse_args()

    if args.sent_type not in ['sv_agreement', 'anaphora']:
        parser.error("invalid sent_type argument for -s")

    print('creating results path')
    use_wug = args.model_path_or_name != 'transfo-xl-wt103'

    number = None

    if use_wug:
        model_type = args.model_path_or_name.split('/')
        if model_type[-1] == '':
            model_type = model_type[:-1]
        number = model_type[-3].lower()
        model_path = '/'.join(model_type[-3:])

        results_path = FINE_TUNE_RESULTS_PATH[:-7] % model_path
        if not os.path.isdir(results_path):
            print('creating directory %s' % results_path)
            os.mkdir(results_path)
        results_path = FINE_TUNE_RESULTS_PATH[:-4] % (model_path,
                                                      args.sent_type)
        if not os.path.isdir(results_path):
            print('creating directory %s' % results_path)
            os.mkdir(results_path)
        results_path = FINE_TUNE_RESULTS_PATH % (model_path, args.sent_type,
                                                 args.template)
    else:
        results_path = RESULTS_PATH[:-4] % args.sent_type
        if not os.path.isdir(results_path):
            print('creating directory %s' % results_path)
            os.mkdir(results_path)
        results_path = RESULTS_PATH % (args.sent_type, args.template)

    results_filename = RESULTS_FILENAME % args.template

    outfilename = os.path.join(str(ABS_PATH), results_path, results_filename)

    if not os.path.isdir(results_path):
        print('creating directory %s' % results_path)
        os.mkdir(results_path)

    print('getting consts')

    sent_types = csp_consts.SENT_TYPES[args.sent_type]
    batch_sizes = csp_consts.BATCH_SIZES[args.sent_type]

    try:
        template_name = sent_types[args.template]
        batch_size_dict = batch_sizes[args.template]
    except KeyError:
        parser.error("Incompatible template for the given sentence type")
        sys.exit()

    print('loading model at', datetime.now())

    txl_tokenizer = TransfoXLTokenizer.from_pretrained(MODEL_NAME)
    txl_tokenizer.add_special_tokens({
        'bos_token': BOS_TOKEN,
        'pad_token': PAD_TOKEN
    })
    txl_model = TransfoXLLMHeadModel.from_pretrained(MODEL_NAME)
    txl_model.eval()

    if args.gpu_num is not None:
        device = torch.device(
            'cuda:' +
            str(args.gpu_num) if torch.cuda.is_available() else 'cpu')
        print('running on GPU: %d' % args.gpu_num)
    else:
        device = torch.device('cpu')

    txl_model.to(device)

    PADDING_TEXT_TXL_TOKENIZED = txl_tokenizer.encode(PADDING_TEXT,
                                                      add_eos=True)
    PADDING_TEXT_TENSOR = torch.tensor(PADDING_TEXT_TXL_TOKENIZED,
                                       dtype=torch.long,
                                       device=device).unsqueeze(0)
    global PADDING_MEMS
    _, PADDING_MEMS = txl_model(PADDING_TEXT_TENSOR)

    batch_size = batch_size_dict['pairs']
    num_sents = batch_size_dict['sents']
    if use_wug:
        batch_size *= 2
        num_sents //= 2

    print('starting all computations at', datetime.now())
    eval_from_file(txl_model,
                   txl_tokenizer,
                   template_name,
                   outfilename,
                   batch_size,
                   num_sents,
                   device=device,
                   use_wug=use_wug,
                   number=number)
    print('completed all computations at', datetime.now())
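Example #13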
def train(config):
    train_data = open(config.TRAIN_FNAME).readlines()
    val_data = open(config.VAL_FNAME).readlines()
    test_data = open(config.TEST_FNAME).readlines()

    tokenizer = TransfoXLTokenizer.from_pretrained(config.TOKENIZER_FNAME)
    tokenize = functools.partial(tokenizer.encode,
                                 add_space_before_punct_symbol=True)

    train_data = list(map(tokenize, train_data))
    val_data = list(map(tokenize, val_data))
    test_data = list(map(tokenize, test_data))

    train_dataloader = utils.DataGenerator(
        train_data,
        batch_size=config.BATCH_SIZE,
        max_len=config.MAX_SEQ_LEN,
        n_seg_splits=config.N_SEG_SPLITS,
        max_seg_len=config.MAX_SEG_LEN,
    )

    val_dataloader = utils.DataGenerator(
        val_data,
        batch_size=config.BATCH_SIZE,
        max_len=config.MAX_SEQ_LEN,
        n_seg_splits=config.N_SEG_SPLITS,
        max_seg_len=config.MAX_SEG_LEN,
    )

    test_dataloader = utils.DataGenerator(
        test_data,
        batch_size=config.BATCH_SIZE,
        max_len=config.MAX_SEQ_LEN,
        n_seg_splits=config.N_SEG_SPLITS,
        max_seg_len=config.MAX_SEG_LEN,
    )

    model = transformer_model.create_model(
        max_len=config.MAX_SEQ_LEN,
        lstm_dim=config.LSTM_DIM,
        hidden_dim=config.HIDDEN_DIM,
        dropout_rate=config.DROPOUT_RATE,
        train_embeddings=config.TRAIN_EMBED,
    )
    model.summary()
    model = utils.compile_model(model)
    model = utils.load_weights(model, config.MODEL_WEIGHTS_PATH)

    callbacks = utils.load_callbacks(**config.CALLBACK_PARAMS)
    history = model.fit(
        train_dataloader,
        validation_data=val_dataloader,
        steps_per_epoch=config.EPOCH_LEN,
        validation_steps=config.VAL_LEN,
        epochs=config.N_EPOCHS,
        callbacks=[callbacks],
    )
    hist_df = pd.DataFrame(history.history)
    hist_df.to_json(f"{config.MODEL_LOGS_PATH}/history.json")

    test_eval_results = utils.eval_model(model,
                                         test_dataloader,
                                         thresh=0.5,
                                         steps=None)
    eval_results_path = f"{config.MODEL_LOGS_PATH}/eval_results.json"

    json.dump(test_eval_results, open(eval_results_path, "w+"))
Example #14
def test_transformer_xl_embeddings():
    transfo_model: str = "transfo-xl-wt103"

    tokenizer = TransfoXLTokenizer.from_pretrained(transfo_model)
    model = TransfoXLModel.from_pretrained(
        pretrained_model_name_or_path=transfo_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize(s + "<eos>")

        print(tokens)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #     0       1        2        3     4     5      6        7        8      9     10     11
    #
    # 'Berlin', 'and', 'Munich', 'have', 'a', 'lot', 'of', 'puppeteer', 'to', 'see', '.', '<eos>'
    #     |       |        |        |     |     |      |        |        |      |     |
    #  Berlin    and    Munich    have    a    lot    of    puppeteer    to    see    .
    #
    #     0       1        2        3     4     5      6        7        8      9     10

    def embed_sentence(sentence: str,
                       layers: str = "1",
                       use_scalar_mix: bool = False) -> Sentence:
        embeddings = TransformerXLEmbeddings(
            pretrained_model_name_or_path=transfo_model,
            layers=layers,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    sentence = embed_sentence(sentence=s)

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence.tokens[0].embedding.tolist()

    puppeteer_embedding_ref = first_layer[7].tolist()
    puppeteer_embedding_actual = sentence.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_embedding_ref == puppeteer_embedding_actual

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich", layers="1,2,3,4")

    ref_embedding_size = 4 * model.d_embed
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(sentence="Berlin",
                                                     layers="1,2,3,4",
                                                     use_scalar_mix=True)

    ref_embedding_size = 1 * model.d_embed
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
Example #15
def main():
    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
                        help='pretrained model name')
    parser.add_argument('--split', type=str, default='test',
                        choices=['all', 'valid', 'test'],
                        help='which split to evaluate')
    parser.add_argument('--batch_size', type=int, default=10,
                        help='batch size')
    parser.add_argument('--tgt_len', type=int, default=128,
                        help='number of tokens to predict')
    parser.add_argument('--ext_len', type=int, default=0,
                        help='length of the extended context')
    parser.add_argument('--mem_len', type=int, default=1600,
                        help='length of the retained previous hidden states (memory)')
    parser.add_argument('--clamp_len', type=int, default=1000,
                        help='max positional embedding index')
    parser.add_argument('--no_cuda', action='store_true',
                        help='Do not use CUDA even though CUDA is available')
    parser.add_argument('--work_dir', type=str, required=True,
                        help='path to the work_dir')
    parser.add_argument('--no_log', action='store_true',
                        help='do not log the eval result')
    parser.add_argument('--same_length', action='store_true',
                        help='set same length attention with masking')
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    assert args.ext_len >= 0, 'extended context length must be non-negative'

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    logger.info("device: {}".format(device))

    # Load a pre-processed dataset
    # You can also build the corpus yourself using TransfoXLCorpus methods.
    # The pre-processing involves computing word frequencies to prepare the
    # adaptive input and softmax, and tokenizing the dataset.
    # The pre-processed corpus is a conversion of the dataset (produced with the conversion script).
    tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name)
    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
    ntokens = len(corpus.vocab)

    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
        device=device, ext_len=args.ext_len)
    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
        device=device, ext_len=args.ext_len)

    # Load a pre-trained model
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
    model = model.to(device)

    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))

    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    if args.clamp_len > 0:
        model.clamp_len = args.clamp_len
    if args.same_length:
        model.same_length = True

    ###############################################################################
    # Evaluation code
    ###############################################################################
    def evaluate(eval_iter):
        # Turn on evaluation mode which disables dropout.
        model.eval()
        total_len, total_loss = 0, 0.
        start_time = time.time()
        with torch.no_grad():
            mems = None
            for idx, (data, target, seq_len) in enumerate(eval_iter):
                ret = model(data, lm_labels=target, mems=mems)
                loss, _, mems = ret
                loss = loss.mean()
                total_loss += seq_len * loss.item()
                total_len += seq_len
            total_time = time.time() - start_time
        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
                total_time, 1000 * total_time / (idx+1)))
        return total_loss / total_len

    # Run on test data.
    if args.split == 'all':
        test_loss = evaluate(te_iter)
        valid_loss = evaluate(va_iter)
    elif args.split == 'valid':
        valid_loss = evaluate(va_iter)
        test_loss = None
    elif args.split == 'test':
        test_loss = evaluate(te_iter)
        valid_loss = None

    def format_log(loss, split):
        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
            split, loss, math.exp(loss))
        return log_str

    log_str = ''
    if valid_loss is not None:
        log_str += format_log(valid_loss, 'valid')
    if test_loss is not None:
        log_str += format_log(test_loss, 'test')

    logger.info('=' * 100)
    logger.info(log_str)
    logger.info('=' * 100)
Example #16
    def test_transfoxl(self):
        for tokenizer_name in TransfoXLTokenizer.pretrained_vocab_files_map[
                "pretrained_vocab_file"].keys():
            tokenizer_p = TransfoXLTokenizer.from_pretrained(tokenizer_name)
            tokenizer_r = TransfoXLTokenizerFast.from_pretrained(
                tokenizer_name)

            # Check we have the same number of added_tokens for both pair and non-pair inputs.
            self.assertEqual(tokenizer_r.num_added_tokens(False),
                             tokenizer_p.num_added_tokens(False))
            self.assertEqual(tokenizer_r.num_added_tokens(True),
                             tokenizer_p.num_added_tokens(True))

            # Check we have the correct max_length for both pair and non-pair inputs.
            self.assertEqual(tokenizer_r.max_len_single_sentence,
                             tokenizer_p.max_len_single_sentence)
            self.assertEqual(tokenizer_r.max_len_sentences_pair,
                             tokenizer_p.max_len_sentences_pair)

            # Assert the set of special tokens match.
            self.assertSequenceEqual(
                tokenizer_p.special_tokens_map.items(),
                tokenizer_r.special_tokens_map.items(),
                "TransfoXL tokenizers doesn't have the same set of special_tokens",
            )

            # Assure tokenization overlap between python and rust impl.
            self.assert_tokenization_python_rust_almost_equals(
                tokenizer_p, tokenizer_r, 0.0)

            # Ensure add_tokens and add_special_tokens return the correct vocab size
            self.assert_add_tokens(tokenizer_r)

            # Check for offsets mapping
            self.assert_offsets_mapping(tokenizer_r)

            # Check for dynamic encoding sequence handling in batch_encode_plus
            self.assertRaises(ValueError,
                              self.assert_batch_encode_dynamic_overflowing,
                              tokenizer_r)

            # Check alignment for build_inputs_with_special_tokens
            self.assert_build_inputs_with_special_tokens(
                tokenizer_r, tokenizer_p)

            # Check for padding
            self.assertRaises(ValueError, self.assert_padding, tokenizer_r,
                              tokenizer_p)

            # Check the number of returned files for save_vocabulary
            # TransfoXL tokenizers come in a special format which is not compatible at all
            # with rust tokenizers. We ensure the errors are correctly raised.
            tokenizer_r_files = tokenizer_r.save_pretrained(".")
            self.assertSequenceEqual(tokenizer_r_files, [
                "./vocab.json", "./special_tokens_map.json",
                "./added_tokens.json"
            ])

            # Check that loading a Python-tokenizer save through Rust doesn't work (and vice versa)
            self.assertRaises(ValueError, tokenizer_p.from_pretrained,
                              *tokenizer_r_files)
            self.assertRaises(ValueError, tokenizer_r.from_pretrained,
                              *tokenizer_p.save_pretrained("."))

            # Check loading works for Python to Python and Rust to Rust
            # Issue: https://github.com/huggingface/transformers/issues/3000
            # self.assertIsNotNone(tokenizer_p.__class__.from_pretrained('./'))
            self.assertIsNotNone(tokenizer_r.__class__.from_pretrained("./"))
Example #17
    #flatten_train = [word for sublist in words_train for word in sublist]
    #flatten_dev   = [word for sublist in words_dev for word in sublist]
    #flatten_test  = [word for sublist in words_test for word in sublist]
    
    # Generate a distribution over tags, useful for control task
    #dist = find_distribution(data.DataLoader(POSDataset(train_x, train_y), batch_size=1))
    #print(len(dist))
    #ypos_train_control, ypos_dev_control, ypos_test_control = save_or_load_pos_controls(
    #    train_x, train_y, [flatten_train, flatten_dev, flatten_test], dist)

    #
    return train_x, train_y, \
           dev_x, dev_y, \
           test_x, test_y


# Load
# Transformer XL
from transformers import TransfoXLTokenizer, TransfoXLModel
transfo_XL = TransfoXLModel.from_pretrained('transfo-xl-wt103')
print("I have loaded the transformer XL model")
transfo_XL_tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
print("I have loaded the transformer XL Tokenizer")


# Build TransformerXL
import warnings
warnings.filterwarnings(action='ignore')
train_x_transfo_XL, train_y_transfo_XL,  \
           dev_x_transfo_XL, dev_y_transfo_XL,  \
           test_x_transfo_XL, test_y_transfo_XL = get_transformer_reps(transfo_XL, transfo_XL_tokenizer, extra_transformer='TransformerXL')
Example #18
def load_tokenizer(config):
    tokenizer = TransfoXLTokenizer.from_pretrained(config.TOKENIZER_FNAME)
    return tokenizer
Example #19
def train(datapath, outpath, seed, batch_size, epochs, save_steps, use_gpt, use_cuda=True):
    #set up model and device (hopefully cuda)
    device = torch.device("cuda" if torch.cuda.is_available() and use_cuda else "cpu")

    if use_gpt:
        model = GPT2LMHeadModel.from_pretrained('gpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    else:
        model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), betas=(.9,.98), eps=1e-09)
    
    #setup rng seeds on all devices to ensure repeatable results
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    num_batches = len(os.listdir(datapath)) / batch_size
    batch_list = getBatch(datapath, batch_size, tokenizer)

    avg_losses = []
    avg_loss = 0
    
    model.zero_grad()
    timestamp = datetime.datetime.now().strftime('%y%m%d%H%M%S')

    for _ in trange(epochs, desc="Epochs"):
        for batch_num in tqdm(range(0,int(num_batches), batch_size), desc="Batches"):
            #setup this batch.
            batch = torch.tensor(next(batch_list), dtype=torch.long, device=device)
            inputs, labels = batch, batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            #feed input to model to train
            model.train()
            outputs = model(input_ids=inputs, labels=labels)

            if not use_gpt:
                # loss returned from transfoXL was broken, so average the
                # per-token losses up to the first padding position instead
                first_pad = get_first_occ(inputs[0], -1)
                loss = outputs[0][0][:first_pad].mean()
            else:
                loss = outputs[0]
            avg_loss += loss.item()
            
            #update parameters
            loss.backward()
            optimizer.step()
            model.zero_grad()

            if batch_num % (batch_size * save_steps) == 0:
                print('CHECKPOINT')
                checkpoint_path = f"{fixpath(outpath)}{timestamp}/e{epochs}-num{batch_num}-size{batch_size}"
                if not os.path.exists(checkpoint_path):
                    os.makedirs(checkpoint_path)
                model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(checkpoint_path)
                tokenizer.save_pretrained(checkpoint_path)

                avg = avg_loss / save_steps
                print(f"average loss: {avg}")
                avg_losses += [avg]
                print('finished')
    
    print(avg_losses)