def finetune_lm(train_df, config, arch, args):
    # Function to fine-tune the pre-trained language model
    if args.lang != 'en':
        tok = SentencePieceTokenizer(lang=args.lang,
                                     sp_model=SP_MODEL_LOCAL_PATH)
    else:
        tok = None
    blocks = TextBlock.from_df(args.text_col, is_lm=True, tok=tok)

    data_block = DataBlock(blocks=blocks,
                           get_x=ColReader("text"),
                           splitter=RandomSplitter(valid_pct=VAL_SIZE,
                                                   seed=RANDOM_STATE))

    lm_dataloaders = data_block.dataloaders(train_df,
                                            bs=args.batch_size,
                                            backwards=args.bw)

    if args.lang == 'en':
        pretrained_filenames = None
    else:
        pretrained_filenames = [WEIGHTS_PRETRAINED_FILE, VOCAB_PRETRAINED_FILE]

    learner_lm = language_model_learner(
        lm_dataloaders,
        arch,
        config=config,
        path=LM_MODEL_PATH,
        pretrained=True,
        pretrained_fnames=pretrained_filenames).to_fp32()

    lr = find_best_lr(learner_lm)

    learner_lm = fit_with_gradual_unfreezing(learner_lm, args.epochs, lr, args)
    learner_lm.save_encoder(ENCODER_FILE_NAME)
    return lm_dataloaders
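A minimal invocation sketch for `finetune_lm`, assuming the module-level constants it reads (`VAL_SIZE`, `RANDOM_STATE`, `LM_MODEL_PATH`, `ENCODER_FILE_NAME`, ...) and the helpers `find_best_lr` and `fit_with_gradual_unfreezing` are defined elsewhere in the module; the `args` bundle below is hypothetical and only mirrors the attributes the function accesses:

from types import SimpleNamespace
from fastai.text.all import AWD_LSTM, awd_lstm_lm_config

# Hypothetical argument bundle; a real script would likely build this with argparse.
args = SimpleNamespace(lang='en', text_col='text', batch_size=64,
                       bw=False, epochs=4)
lm_dls = finetune_lm(train_df, awd_lstm_lm_config.copy(), AWD_LSTM, args)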
Example #2
    def __init__(self,
                 data_path: str = 'lang_model',
                 emb_sz: int = 800,
                 qrnn: bool = False,
                 bidir: bool = False,
                 n_layers: int = 4,
                 n_hid: int = 2500,
                 bs: int = 104,
                 bptt: int = 67,
                 lr: float = 0.0013,
                 wd: float = .012,
                 one_cycle: bool = True,
                 cycle_len: int = 1) -> None:
        """ Instantiate AWD_LSTM Language Model with hyper-parameters.
        
        data_path: str
            path where databunch is loaded from
        emb_sz: int
            size of word embeddings
        qrnn: bool
            whether or not to use qrnn (requires cuDNN)
        bidir: bool
            if RNN should be bi-directional
        n_layers: int
            number of layers in lang model
        n_hid: int
            number of hidden units in model
        bs: int
            batch size
        bptt: int
            back-propagation through time; max sequence length over which gradients are accumulated
        lr: float
            learning rate
        wd: float
            weight decay
        one_cycle: bool
            whether to train with the one-cycle policy
        cycle_len: int
            number of epochs per one-cycle run
        
        The hyper-parameters are stored in a fastai dict called `fastai.text.models.awd_lstm_lm_config`:
           {'emb_sz': 400, 'n_hid': 1150, 'n_layers': 3, 'pad_token': 1, 'qrnn': False, 'bidir': False, 'output_p': 0.1,
            'hidden_p': 0.15, 'input_p': 0.25, 'embed_p': 0.02,'weight_p': 0.2, 'tie_weights': True, 'out_bias': True}
        """
        self.lr, self.wd, self.one_cycle, self.cycle_len = lr, wd, one_cycle, cycle_len
        awd_lstm_lm_config.update(
            dict(emb_sz=emb_sz,
                 qrnn=qrnn,
                 bidir=bidir,
                 n_layers=n_layers,
                 n_hid=n_hid))
        #log params
        wb_handle = wandb.init(config=awd_lstm_lm_config)
        wandb.config.update({
            'data_path': str(data_path),
            'bs': bs,
            'bptt': bptt,
            'lr': lr
        })
        self.csv_name = 'history_' + wb_handle.name
        wandb.config.update({'csvlog_save_path': self.csv_name})

        # instantiate databunch
        self.data_lm = load_data(data_path, bs=bs, bptt=bptt)

        # instantiate language model
        self.learn = language_model_learner(data=self.data_lm,
                                            arch=AWD_LSTM,
                                            pretrained=False,
                                            model_dir=Path('models_' +
                                                           wb_handle.name),
                                            config=awd_lstm_lm_config)
        self.full_model_path = str(self.learn.path / self.learn.model_dir)
        wandb.config.update({'model_save_path': self.full_model_path})

        # prepare callbacks
        escb = EarlyStoppingCallback(learn=self.learn, patience=2)
        smcb = SaveModelCallback(learn=self.learn,
                                 name='best_' + wb_handle.name)
        rpcb = ReduceLROnPlateauCallback(learn=self.learn, patience=1)
        csvcb = CSVLogger(learn=self.learn, filename=self.csv_name)
        wb = WandbCallback(self.learn)
        self.callbacks = [escb, smcb, rpcb, csvcb, wb]

        self.fit()
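A hedged instantiation sketch for the class this `__init__` belongs to; the class name `LMTrainer` below is hypothetical (the snippet omits the class statement), and training starts immediately because the constructor ends with `self.fit()`. It presumes a fastai v1 databunch saved under `data_path` and an active Weights & Biases login:

# Hypothetical class name; emb_sz/n_hid/n_layers match the fastai defaults quoted in the docstring.
trainer = LMTrainer(data_path='lang_model', emb_sz=400, n_hid=1150,
                    n_layers=3, bs=64, bptt=70, lr=1e-3)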
Example #3
    # NOTE: the original snippet is truncated above; the DataBlock construction
    # below is an assumed reconstruction of the missing lines (get_items is a guess).
    dls_lm = DataBlock(blocks=TextBlock.from_folder(path, is_lm=True),
                       get_items=get_text_files,
                       splitter=RandomSplitter(0.1))
    dls_lm = dls_lm.dataloaders(path, path=path, bs=bs, seq_len=80)
    dls_lm.show_batch(max_n=3)

    # #%%
    # learn = language_model_learner(
    #     dls_lm, AWD_LSTM, drop_mult=0.3,
    #     metrics=[accuracy, Perplexity()]).to_fp16()
    # learn.lr_find()
    # print(learn.model)
    # learn.fit_one_cycle(1, 2e-2, moms=(0.8,0.7,0.8), cbs=cbs)
    # learn.save("1epoch")

    # #%%
    learn = language_model_learner(dls_lm,
                                   AWD_LSTM,
                                   drop_mult=0.3,
                                   metrics=[accuracy, Perplexity()]).to_fp16()
    learn = learn.load("1epoch")
    print(learn.model)

    # #%%

    # learn.unfreeze()
    # learn.fit_one_cycle(1, 2e-3, moms=(0.8,0.7,0.8), cbs=cbs)
    # learn.save("1epochs_finetuned")
    # learn.save_encoder("finetuned")

    # #%%
    #     # Classification
    def read_tokenized_file(f):
        return L(f.read_text().split(' '))
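The classification cell is cut off after `read_tokenized_file`; a plausible continuation, following the standard fastai v2 classifier DataBlock pattern and reusing the `path`, `bs`, and `dls_lm` names from the snippet above, is sketched below. The `train`/`test`/label folder layout and the reuse of `dls_lm.vocab` are assumptions, and how `read_tokenized_file` is wired in (e.g. as a reader for pre-tokenized files) is not shown in the original snippet:

from functools import partial
from fastai.text.all import (CategoryBlock, DataBlock, GrandparentSplitter,
                             TextBlock, get_text_files, parent_label)

# Assumed layout: path/{train,test}/<label>/<file>.txt, reusing the LM vocab.
dls_clas = DataBlock(
    blocks=(TextBlock.from_folder(path, vocab=dls_lm.vocab), CategoryBlock),
    get_y=parent_label,
    get_items=partial(get_text_files, folders=['train', 'test']),
    splitter=GrandparentSplitter(valid_name='test'),
).dataloaders(path, path=path, bs=bs, seq_len=80)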
Example #4
def main(gpu: Param("GPU to run on", str)=None,
         max_cpu_per_dataloader: Param("Max CPU", int, opt=True)=8,
         bs: Param("batch size", int)=256,
         fp16: Param("mixed precision", int, opt=True)=0,
         use_sp_processor: Param("use sentence piece as processor", int)=0,
         sp_model: Param("sentence piece trained model file", str)=None,
         sp_vocab: Param("sentence piece trained vocab file", str)=None,
         ):
    datetime_str = f'{datetime.now():%Y-%m-%d_%H-%M-%S%z}'
    random_seed = 0
    max_vocab = 30000
    print('max_cpu_per_dataloader', max_cpu_per_dataloader, 'bs', bs,
        'fp16', fp16, 'sp_processor', use_sp_processor, 'sp_model', sp_model, 'sp_vocab', sp_vocab)
    """## Prepare Dataset"""
    local_project_path = './data/sprot_lm/'

    #### Distributed
    print('gpu', gpu)
    gpu = setup_distrib(gpu)
    n_gpus = num_distrib()
    if n_gpus > 0:
        workers = min(max_cpu_per_dataloader, num_cpus()//n_gpus)
    else:
        workers = min(max_cpu_per_dataloader, num_cpus())
    print(gpu, 'n_gpus', n_gpus)
    print(gpu, 'workers', workers)


    """## Prepare fastai"""
    np.random.seed(random_seed)

    if not os.path.exists(local_project_path):
        os.makedirs(local_project_path)
    print('local_project_path:', local_project_path)

    """## Tokenization"""
    tokenizer = Tokenizer(tok_func=dna_tokenizer, pre_rules=[],
                        post_rules=[], special_cases=[])
    processor = [TokenizeProcessor(tokenizer=tokenizer, include_bos=True,
                                include_eos=True), NumericalizeProcessor(max_vocab=max_vocab)]
    df = pickle.load(
        open('./data/sprot_lm/sprot_sequence_taxon_anc.pickle', 'rb'))

    if use_sp_processor: # './data/sprot_lm/tmp/spm.model', './data/sprot_lm/tmp/spm.vocab'
        processor = [OpenFileProcessor(), SPProcessor(sp_model=sp_model, sp_vocab=sp_vocab, max_sentence_len=35826, max_vocab_sz=max_vocab)]
    data_lm = (TextList.from_df(df, path=local_project_path, cols='seq_anc_tax', processor=processor)
                    .split_by_rand_pct(0.1, seed = random_seed)
                    .label_for_lm()
                    .databunch(bs=bs, num_workers=workers))

    data_lm.vocab.save(local_project_path +
                       'vocab_lm_sprot_seq_anc_tax-' + datetime_str + '.pickle')

    print('data_cls Training set size', len(data_lm.train_ds))
    print('data_cls Validation set size', len(data_lm.valid_ds))
    print('vocab size ', len(data_lm.vocab.itos))


    learn_lm = language_model_learner(
        data_lm, AWD_LSTM, drop_mult=0.1, pretrained=False)

    if gpu is None:
        print(gpu, 'DataParallel')
        learn_lm.model = nn.DataParallel(learn_lm.model)
    else:
        print(gpu, 'to_distributed')
        learn_lm.to_distributed(gpu)
        if fp16:
            learn_lm.to_fp16()
    

    lr = 3e-3
    print(gpu, 'freeze')
    learn_lm.freeze()
    learn_lm.fit_one_cycle(1, lr, moms=(0.8, 0.7))  # I don't know why multigpu doesn't work without first freezing

    print(gpu, 'unfreeze')
    learn_lm.unfreeze()
    learn_lm.fit_one_cycle(10, lr*10, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-1-' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-1-enc-' + datetime_str)
    
    learn_lm.fit_one_cycle(10, lr, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-2-' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-2-enc-' + datetime_str)

    learn_lm.fit_one_cycle(10, lr/10, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-3-' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-3-enc-' + datetime_str)
    learn_lm.export(file='export-lm-sp-ans-v1-3-' + datetime_str + '.pkl')
    print('Done')
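`dna_tokenizer` is passed as `tok_func` above but never defined in this snippet. Under fastai v1's `Tokenizer` API, `tok_func` is expected to be a `BaseTokenizer` subclass, so an illustrative character-level guess (not the original implementation) could look like:

from fastai.text import BaseTokenizer

class dna_tokenizer(BaseTokenizer):
    "Character-level tokenizer: every residue/character in the sequence becomes one token."
    def tokenizer(self, t):
        return list(t)
    def add_special_cases(self, toks):
        pass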