Example #1
def test_language_modeling(model_type, model_name):
    with open("train.txt", "w") as f:
        for i in range(100):
            f.write("Hello world with Simple Transformers! \n")

    model_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "num_train_epochs": 1,
        "no_save": True,
        "vocab_size": 100,
        "generator_config": {
            "embedding_size": 512,
            "hidden_size": 256,
            "num_hidden_layers": 1
        },
        "discriminator_config": {
            "embedding_size": 512,
            "hidden_size": 256,
            "num_hidden_layers": 2
        },
    }

    model = LanguageModelingModel(
        "electra",
        None,
        args=model_args,
        train_files="train.txt",
        use_cuda=False,
    )

    # Train the model
    model.train_model("train.txt")
Example #2
def get_gpt2_perplexity_for_every_sentence(data_as_pd_dataframe,
                                           output_file,
                                           cuda_device=-1,
                                           lowercase_all_text=True):
    if cuda_device < 0:
        use_cuda = False
    else:
        use_cuda = True
    model = LanguageModelingModel('gpt2',
                                  'gpt2-large',
                                  use_cuda=use_cuda,
                                  cuda_device=cuda_device,
                                  args={
                                      'do_lower_case': lowercase_all_text,
                                      'mlm': False
                                  })
    if use_cuda:
        model.model = model.model.to(torch.device(f"cuda:{cuda_device}"))

    disposable_output_dir = 'scratch_output_dir/'
    if not os.path.isdir(disposable_output_dir):
        was_originally_dir = False
        os.makedirs(disposable_output_dir)
    else:
        was_originally_dir = True

    perplexities_to_return = []
    with open(output_file, 'w') as f:
        f.write('perplexity\tsentence\n')
        for i, row in tqdm(data_as_pd_dataframe.iterrows(),
                           total=data_as_pd_dataframe.shape[0]):
            # single_example_dataset = pd.DataFrame(pd.Series([row['text']]), columns=['text'], index=[0])
            single_example_dataset = SingleItemDataset(model.tokenizer,
                                                       row['text'],
                                                       cuda_device)
            results = model.evaluate(single_example_dataset,
                                     disposable_output_dir,
                                     multi_label=False,
                                     verbose=False,
                                     silent=True)
            instance_perplexity = float(results['perplexity'])
            perplexities_to_return.append(instance_perplexity)
            text = row['text']
            if '\n' in text or '\t' in text:
                if '"' in text:
                    f.write('\t'.join(
                        [str(instance_perplexity), '""' + text + '""']) + '\n')
                else:
                    f.write('\t'.join(
                        [str(instance_perplexity), '"' + text + '"']) + '\n')
            else:
                f.write('\t'.join([str(instance_perplexity), text]) + '\n')

    if not was_originally_dir:
        shutil.rmtree(disposable_output_dir)
    return perplexities_to_return
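A minimal usage sketch (the DataFrame, sentences, and output path are hypothetical, and the custom SingleItemDataset wrapper referenced above is assumed to be defined): the function only relies on a 'text' column and writes one tab-separated perplexity/sentence pair per row.

import pandas as pd

sample_df = pd.DataFrame({"text": ["The cat sat on the mat.",
                                   "Colorless green ideas sleep furiously."]})
# cuda_device=-1 keeps GPT-2 on the CPU; expect this to be slow for gpt2-large.
perplexities = get_gpt2_perplexity_for_every_sentence(sample_df,
                                                      "perplexities.tsv",
                                                      cuda_device=-1,
                                                      lowercase_all_text=False)
print(perplexities)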
Example #3
def fine_tune():
    train_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "do_lower_case": True,
        "use_multiprocessing": False,
        "mlm": False,
        "num_train_epochs": args.num_train_epochs,
        'learning_rate': args.learning_rate,
        "max_seq_length": 300,
        "evaluate_during_training": True,
        "output_dir": args.model_output_dir
    }

    print(args.use_cuda)

    model = LanguageModelingModel('gpt2',
                                  args.model_name,
                                  use_cuda=args.use_cuda,
                                  args=train_args)

    model.eval_model(args.test_path)

    model.train_model(args.train_path, eval_file=args.validation_path)

    model.eval_model(args.test_path)
Example #4
def build_language_model(handle):
    train_args = {
        "output_dir": f"gpt2_outputs/{handle}/",
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "fp16": False,
        "train_batch_size": 32,
        "eval_batch_size":32,
        "num_train_epochs": 3,
        "tensorboard_dir": 'gpt2_tweet_runs/',
        "mlm": False,
        "use_multiprocessing": False
    }


    model = LanguageModelingModel('gpt2', 'gpt2', args=train_args, use_cuda=False)
    st.info('Training model. This may take a few mins - you may want to check back later.')
    model.train_model(f"./tweets_cache/{handle}_train.txt", eval_file=f"./tweets_cache/{handle}_valid.txt")
    return
Example #5
def train(orig_bert, outdir_name, train_filename, eval_filename):
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    model_args = LanguageModelingArgs()
    model_args.reprocess_input_data = True
    model_args.output_dir = outdir_name
    model_args.best_model_dir = outdir_name + "/best_model"
    model_args.tensorboard_dir = outdir_name + "/runs"
    model_args.overwrite_output_dir = True
    model_args.save_steps = 0
    model_args.num_train_epochs = 10
    model_args.dataset_type = "simple"
    model_args.evaluate_during_training = True
    model_args.evaluate_during_training_verbose = True
    model_args.evaluate_during_training_steps = 0
    model_args.silent = True
    model_args.do_lower_case = ("uncased" in orig_bert)

    model = LanguageModelingModel("bert", orig_bert, args=model_args)
    model.train_model(train_filename, eval_file=eval_filename)
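A hypothetical call, for illustration (the checkpoint name and file paths are placeholders); because the args check for "uncased" in the checkpoint name, lower-casing is enabled automatically here.

train("bert-base-uncased",       # orig_bert: any BERT checkpoint name
      "outputs/bert-finetuned",  # outdir_name
      "data/train.txt",          # train_filename: one sequence per line
      "data/eval.txt")           # eval_filename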
Example #6
def fine_tuning(model_name, epoch, batch_size):

    model_base = model_name.split("/")[1].split("-")[0]
    train_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
    }
    filename = "train-v0.1.json"
    with open('train_data.txt', 'w') as f:
        for item in data_list:
            f.write("%s\n" % item)
    # Fix the random seed via the model args; LanguageModelingModel's third
    # positional parameter is not a seed, so passing a torch.Generator there
    # does not have the intended effect.
    train_args["manual_seed"] = 42
    lang_model = LanguageModelingModel(model_base,
                                       model_name,
                                       use_cuda=True,
                                       args=train_args)

    lang_output_file_name = "lang_output"
    lang_model.train_model("train_data.txt", output_dir=lang_output_file_name)
    necessary_files_for_pretrained_model = [
        'pytorch_model.bin', 'config.json', 'vocab.txt'
    ]
    lang_model_output_files = glob(lang_output_file_name + "/*")
    files_to_be_removed = []
    for f in lang_model_output_files:
        if f.split("/")[1] not in necessary_files_for_pretrained_model:
            files_to_be_removed.append(f)
    for f in files_to_be_removed:
        if os.path.isfile(f):
            os.remove(f)
        elif os.path.isdir(f):
            shutil.rmtree(f)
    shutil.rmtree("cache_dir", ignore_errors=True)
    shutil.rmtree("runs", ignore_errors=True)

    train_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "num_train_epochs": epoch,
        "train_batch_size": batch_size
    }

    qa_model = QuestionAnsweringModel(model_base,
                                      "lang_output",
                                      use_cuda=True,
                                      args=train_args)

    return qa_model
Example #7
def train_model(train_data, test_data, train_args):
    """ Train the model and save it to the outputs folder

    :param test_data:
    :param train_data:
    :param train_args: dict - arguments to be passed into the train function
    :return:
    """
    model = LanguageModelingModel('gpt2',
                                  MODEL,
                                  args=train_args,
                                  use_cuda=True)
    model.train_model(train_data, eval_file=test_data)
    model.eval_model(test_data)
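A sketch of how this might be called (MODEL is assumed to be a module-level checkpoint name such as 'gpt2'; the argument values are illustrative and mirror the other GPT-2 examples in this section):

train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "mlm": False,  # GPT-2 is a causal LM, not a masked LM
    "num_train_epochs": 3,
    "output_dir": "outputs/",
}
train_model("data/train.txt", "data/test.txt", train_args)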
Example #8
        "n_gpu": args.gpu,
        "use_multiprocessing": True,
        "use_early_stopping": True,
        "early_stopping_patience": 3,
        "early_stopping_delta": 0,
        "early_stopping_metric": "eval_loss",
        "early_stopping_metric_minimize": True,
        "overwrite_output_dir": True,
        "manual_seed": None,
        "encoding": None,
        "dataset_type": "simple",
        "tokenizer_name": args.tokenizer,
        "evaluate_during_training_verbose": True,
        "use_cached_eval_features": True,
        "sliding_window": True,
        "vocab_size": 52000
    }

    model = LanguageModelingModel("electra",
                                  None,
                                  args=train_args,
                                  use_cuda=False,
                                  train_files=args.tok_data)

    model.train_model(
        args.train_data,
        eval_file=args.test_data,
    )

    model.eval_model(args.test_data)
Example #9
def run_exps():
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    now = datetime.now()
    time = now.strftime("%H:%M:%S")
    date_time = now.strftime("%m%d%Y_%H%M%S")

    if sched_params_opt == "plateau":
        sched_params = {}
        sched_params['sched_name'] = 'plateau'
        sched_params['factor'] = 0.1
        sched_params['patience'] = sched_patience
        sched_params['verbose'] = True
        sched_params['threshold'] = 0.001
        sched_params['min_lr'] = min_lr
    else:
        sched_params = None

    if sched_params is not None:
        run_name = f'{subset[0]}_slen{seq_len}_wd{weight_decay}_lr{max_lr}-{min_lr}_msk{masks}_p{mask_prob}_dl{disc_hid_layers}_sz{disc_hid_size}_gl{gen_hid_layers}_sz{gen_hid_size}_rgen{random_generator}_drop{disc_drop}_w{rtd_loss_weight}_{rmd_loss_weight}_{mlm_loss_weight}_replace{replace_tokens}_mlmr{mlm_lr_ratio}_{date_time}_cont{contamination}'
    else:
        run_name = f'{subset[0]}_slen{seq_len}_wd{weight_decay}_mlr{max_lr}_minlr{min_lr}_msk{masks}_p{mask_prob}_dl{disc_hid_layers}_sz{disc_hid_size}_gl{gen_hid_layers}_sz{gen_hid_size}_rgen{random_generator}_drop{disc_drop}_w{rtd_loss_weight}_{rmd_loss_weight}_{mlm_loss_weight}_replace{replace_tokens}_mlmr{mlm_lr_ratio}_{date_time}_cont{contamination}'

    print(f'RUN: {run_name}')

    train_args = {
        "fp16": False,
        "use_multiprocessing": False,
        "reprocess_input_data": False,
        "overwrite_output_dir": True,
        "num_train_epochs": 20,
        "save_eval_checkpoints": False,
        "save_model_every_epoch": False,
        "learning_rate": max_lr,
        "warmup_steps": warmup,
        "train_batch_size": train_batch_size,  #was 32
        "eval_batch_size": eval_batch_size,  #was 32
        "gradient_accumulation_steps": 1,
        "block_size": seq_len + 2,
        "max_seq_length": seq_len + 2,
        "dataset_type": "simple",
        "logging_steps": 500,
        "evaluate_during_training": True,
        "evaluate_during_training_steps": 500,  #was 500
        "evaluate_during_training_steps_anomaly": eval_anomaly,  #was 500
        "anomaly_batch_size": anomaly_batch_size,
        "evaluate_during_training_verbose": True,
        "use_cached_eval_features": True,
        "sliding_window": True,
        "vocab_size": 52000,
        "eval_anomalies": True,
        "random_generator": random_generator,
        "use_rtd_loss": True,
        "rtd_loss_weight": rtd_loss_weight,
        "rmd_loss_weight": rmd_loss_weight,
        "mlm_loss_weight": mlm_loss_weight,
        "dump_histogram": dump_histogram,
        "eval_anomaly_after": eval_anomaly_after,
        "train_just_generator": train_just_generator,
        "replace_tokens": replace_tokens,
        "extract_scores": 1,
        "subset_name": subset[0],
        "extract_repr": 0,
        # "vanilla_electra": {
        #     "no_masks": masks,
        # },
        # "vanilla_electra": False,
        "train_document": True,
        "tokenizer_name": "bert-base-uncased",
        "tensorboard_dir": f'{tensorboard_dir}/{exp_prefix}/{run_name}',
        "extract_reps": extract_reps,
        "weight_decay": weight_decay,
        "optimizer": optimizer,
        "scores_export_path": f"./token_scores/{run_name}/",
        "generator_config": {
            "embedding_size": 128,
            "hidden_size": gen_hid_size,
            "num_hidden_layers": gen_hid_layers,
        },
        "discriminator_config": {
            "hidden_dropout_prob": disc_drop,
            "attention_probs_dropout_prob": disc_drop,
            "embedding_size": 128,
            "hidden_size": disc_hid_size,
            "num_hidden_layers": disc_hid_layers,
        },
        "mlm_lr_ratio": mlm_lr_ratio,
    }

    for subset_r in tqdm(subset):
        print('-' * 10, '\n', f'SUBSET: {subset_r}', '-' * 10)

        now = datetime.now()
        time = now.strftime("%H:%M:%S")
        date_time = now.strftime("%m%d%Y_%H%M%S")

        if preprocessed:
            train_file = f"./datasets/ag_od/train/{subset_r}.txt"
            test_file = f"./datasets/ag_od/test/{subset_r}.txt"
            outlier_file = f"./datasets/ag_od/test/{subset_r}-outliers.txt"
        if contamination != 0:
            train_file = f"./datasets/ag_od/train/{subset_r}-contaminated/{subset_r}_c{int(contamination)}.txt"

        model = LanguageModelingModel("electra",
                                      None,
                                      masks=masks_,
                                      args=train_args,
                                      train_files=train_file,
                                      use_cuda=True)

        model.train_model_anomaly(train_file,
                                  eval_file=test_file,
                                  eval_file_outlier=outlier_file,
                                  sched_params=sched_params)
Example #10
	task = sys.argv[1]
	vocsize = int(sys.argv[2])

	logging.basicConfig(level=logging.INFO)
	transformers_logger = logging.getLogger("transformers")
	transformers_logger.setLevel(logging.WARNING)

	outdir_name = task + "-constr-uncased-{:d}k".format(vocsize/1000)
	model_args = LanguageModelingArgs()
	model_args.reprocess_input_data = True
	model_args.output_dir = outdir_name
	model_args.best_model_dir = outdir_name + "/best_model"
	model_args.tensorboard_dir = outdir_name + "/runs"
	model_args.overwrite_output_dir = True
	model_args.train_batch_size	= 32
	model_args.save_steps = 5000
	model_args.max_steps = 50000
	model_args.dataset_type = "simple"
	model_args.evaluate_during_training = True
	model_args.evaluate_during_training_verbose = True
	model_args.evaluate_during_training_steps = 5000
	model_args.silent = True
	model_args.do_lower_case = True
	model_args.tokenizer_name = None
	model_args.vocab_size = vocsize

	model = LanguageModelingModel("bert", None, args=model_args,
		train_files=task + "/train_textonly.txt")
	model.train_model(task + "/train_textonly.txt",
		eval_file=task + "/dev_textonly.txt")
Example #11


from simpletransformers.language_modeling import LanguageModelingModel
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.INFO)

train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "train_batch_size": 16,
    "num_train_epochs": 3,
    "mlm": False,
}

model = LanguageModelingModel('gpt2', 'gpt2', args=train_args, use_cuda=False)

model.train_model(
    "/mnt/InternalStorage/sidkas/DocumentSearchEngine/assets/sample_docs_eng/cs_ai_train.txt",
    eval_file=
    "/mnt/InternalStorage/sidkas/DocumentSearchEngine/assets/sample_docs_eng/cs_ai_test.txt"
)

model.eval_model(
    "/mnt/InternalStorage/sidkas/DocumentSearchEngine/assets/sample_docs_eng/cs_ai_test.txt"
)
Example #12
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    outdir_name = "ch-swisscrawl-{}-{:d}k".format(casing, vocsize / 1000)
    model_args = LanguageModelingArgs()
    model_args.reprocess_input_data = True
    model_args.output_dir = outdir_name
    model_args.best_model_dir = outdir_name + "/best_model"
    model_args.tensorboard_dir = outdir_name + "/runs"
    model_args.overwrite_output_dir = True
    model_args.train_batch_size = 32
    model_args.save_steps = 5000
    model_args.max_steps = 50000
    model_args.dataset_type = "simple"
    model_args.evaluate_during_training = True
    model_args.evaluate_during_training_verbose = True
    model_args.evaluate_during_training_steps = 5000
    model_args.silent = True
    model_args.do_lower_case = (casing == "uncased")
    model_args.tokenizer_name = None
    model_args.vocab_size = vocsize

    model = LanguageModelingModel("bert",
                                  None,
                                  args=model_args,
                                  train_files="swisscrawl_reformatted.txt")
    model.train_model("swisscrawl_reformatted.txt",
                      eval_file="vardial_train_reformatted.txt")
Example #13
import logging
from simpletransformers.language_modeling import LanguageModelingModel

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "block_size": 512,
    "max_seq_length": 512,
    "learning_rate": 5e-6,
    "train_batch_size": 8,
    "gradient_accumulation_steps": 8,
    "num_train_epochs": 3,
    "mlm": False,
    'reprocess_input_data': True,
    "output_dir": f"outputs/fine-tuned/",
    'overwrite_output_dir': True
}

model = LanguageModelingModel("gpt2", "gpt2", args=train_args, use_cuda=False)

#model.train_model("data/train.txt", eval_file="data/test.txt")
model.train_model("datatest.txt")

#model.eval_model("data/test.txt")
#model.eval_model("evaltest.txt")
Example #14
    "mlm": False,
    "dataset_type": "simple",
    "logging_steps": 100,
    "evaluate_during_training": True,
	"evaluate_during_training_steps": 3000,
    "evaluate_during_training_verbose": True,
    "use_cached_eval_features": True,
    "sliding_window": True,
    "use_multiprocessing": False,
    "vocab_size": 10000,
    "output_dir": f"outputs/from_scratch_",
    "best_model_dir": f"outputs/from_scratch/best_model",
}

train_file = f"data/train.txt"
test_file = f"data/test.txt"

model = LanguageModelingModel(
    "gpt2",
    None,
    args=train_args,
    train_files=train_file,
)

model.train_model(
    train_file,
    eval_file=test_file,
)

model.eval_model(test_file)
Example #15
def main():
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    train_args = {
        "reprocess_input_data": False,
        "overwrite_output_dir": True,
        "num_train_epochs": 50,
        "save_eval_checkpoints": True,
        "save_model_every_epoch": False,
        "learning_rate": 1e-3,
        "warmup_steps": 10000,
        "train_batch_size": 64,
        "eval_batch_size": 128,
        "gradient_accumulation_steps": 2,
        "block_size": 128,
        "max_seq_length": 128,
        "dataset_type": "simple",
        "wandb_project": "Esperanto - ConvBert",
        "wandb_kwargs": {
            "name": "ConvBert-SMALL"
        },
        "logging_steps": 100,
        "evaluate_during_training": True,
        "evaluate_during_training_steps": 3000,
        "evaluate_during_training_verbose": True,
        "use_cached_eval_features": True,
        "sliding_window": False,
        "tokenizer_name": "bert-base-chinese",
        "use_multiprocessing": True,
        "process_count": 8,
        "vocab_size": 21128,
        "generator_config": {
            "attention_probs_dropout_prob": 0.1,
            "directionality": "bidi",
            "embedding_size": 128,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "hidden_size": 64,
            "initializer_range": 0.02,
            "intermediate_size": 256,
            "layer_norm_eps": 1e-12,
            "max_position_embeddings": 512,
            "model_type": "convbert",
            "num_attention_heads": 1,
            "num_hidden_layers": 12,
            "pad_token_id": 0,
            "summary_activation": "gelu",
            "summary_last_dropout": 0.1,
            "summary_type": "first",
            "summary_use_proj": True,
            "type_vocab_size": 2,
            "vocab_size": 21128
        },
        "discriminator_config": {
            "attention_probs_dropout_prob": 0.1,
            "embedding_size": 128,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "hidden_size": 256,
            "initializer_range": 0.02,
            "intermediate_size": 1024,
            "layer_norm_eps": 1e-12,
            "max_position_embeddings": 512,
            "model_type": "convbert",
            "num_attention_heads": 4,
            "num_hidden_layers": 12,
            "output_past": True,
            "pad_token_id": 0,
            "summary_activation": "gelu",
            "summary_last_dropout": 0.1,
            "summary_type": "first",
            "summary_use_proj": True,
            "type_vocab_size": 2,
            "vocab_size": 21128
        },
    }

    train_file = "data/train.txt"
    test_file = "data/test.txt"

    model = LanguageModelingModel(
        "convbert",
        None,
        args=train_args,
        train_files=train_file,
        cuda_device=1,
    )

    model.train_model(
        train_file,
        eval_file=test_file,
    )

    model.eval_model(test_file)
Example #16
def save_best_model():
    model = LanguageModelingModel('convbert',
                                  'outputs/best_model',
                                  args={"output_dir": "discriminator_trained"})
    model.save_discriminator()
Example #17
    issues = [json.loads(line) for line in f.readlines()]
    test = []
    train = []
    for issue in issues:
      body = issue['title'] + ' ' + issue['body'].replace('\n', ' ').replace('\r', ' ')
      if random.random() > 0.8:
        test.append(body)
      else:
        train.append(body)
    with open(train_token_path, 'w') as p:
      p.write('\n'.join(train))
    with open(test_token_path, 'w') as p:
      p.write('\n'.join(test))

print("training model")

train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "save_steps": 10000
}

model = LanguageModelingModel("bert", "bert-base-uncased", args=train_args)

model.train_model(
    "issues.train.tokens",
    eval_file="issues.test.tokens",
    output_dir="finetuned",
)

model.eval_model("issues.test.tokens", "finetune-eval")
        "vocab_size": 21128,
    },
    "discriminator_config": {
        "embedding_size": 128,
        "hidden_size": 256,
        "vocab_size": 21128,
    },
    "use_multiprocessing": False,
    "wandb_project": False,
    "fp16": False,
    "save_steps": 20000,
    "tokenizer_name": 'outputs',
    "model_name": 'outputs/checkpoint-120000'
}

train_file = r"train.txt"
test_file = r"test.txt"

model = LanguageModelingModel("electra",
                              args=train_args,
                              train_files=train_file,
                              use_cuda=False,
                              model_name="outputs/checkpoint-120000")

model.train_model(
    train_file,
    eval_file=test_file,
)

model.eval_model(test_file)
Example #19
import logging

from simpletransformers.language_modeling import LanguageModelingModel

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "block_size": 512,
    "max_seq_length": 512,
    "learning_rate": 5e-6,
    "train_batch_size": 8,
    "gradient_accumulation_steps": 8,
    "num_train_epochs": 3,
    "mlm": False,
    "output_dir": f"outputs/fine-tuned/",
}

model = LanguageModelingModel("gpt2", "gpt2", args=train_args)

model.train_model("data/train.txt", eval_file="data/test.txt")

model.eval_model("data/test.txt")
    def finetunePretrainedBertOnMLM(self, saveMultipleFilesThatWereMergedIntoSingleTrainFile=True):
        ############################################################################################
        # This method finetunes a pre-trained BERT model (base-uncased) on MLM using SEC data as follows:
        #   1. Builds training and eval files.
        #   2. Uses Simple Transformers "LanguageModelingModel" class to batch process the
        #      above training and eval files.
        #   3. Evaluates the finetuned model using the above class.
        ############################################################################################
        try:
            if self.corpusFolder is None or self.corpusFolder == "" or os.path.exists(self.corpusFolder) is False:
                log.error(f"Text corpus folder '{self.corpusFolder}' DOES NOT exist!")
                return

            # Check if CUDA is available for doing training on a GPU system
            if torch.cuda.is_available() is False:
                log.error(
                    f"CUDA libs not found. A new language model can be trained from scratch only on a GPU system with "
                    f"CUDA libs!")
                return

            startTime = time.time()
            #################################################################
            # 1. Build text corpus files for train and eval datasets.
            #################################################################
            trainOk, evalOk = self.__buildTrainingAndEvalTextFiles(saveMultipleFilesThatWereMergedIntoSingleTrainFile)

            # Check if the train and eval files are built as single text corpus file
            if trainOk is False:
                log.error(f"Error building training files to finetune pre-trained BERT on MLM objective!")
                return
            if evalOk is False:
                log.error(f"Error building evaluation file to finetune pre-trained BERT on MLM obective!")
                return

            #####################################################################
            # 2. Uses Simple Transformers "LanguageModelingModel" class to train
            #####################################################################
            log.debug(f"Building config params for SimpleTransformer..")
            transformers_logger = log.getLogger("transformers")
            transformers_logger.setLevel(log.WARNING)
            modelArgs = {"reprocess_input_data": True, "overwrite_output_dir": True, "num_training_epochs": 2,
                         "dataset_type": "simple",
                         "encoding": "utf-8", "cache_dir": self.modelCacheDir, "do_lower_case": True,
                         "learning_rate": self.learningRate, "max_seq_length": self.maxSeqLength,
                         "sliding_window": True, "stride": 0.7, "handle_chinese_chars": False,}
            log.debug(f"Finished building config params for SimpleTransformer.")

            log.debug(f"Initializing SimpleTransformer's LanguageModelingModel class..")
            model = LanguageModelingModel(model_type=self.modelType, model_name=self.modelNameOrPath, args=modelArgs)
            log.debug(f"Finished initializing SimpleTransformer's LanguageModelingModel class.")

            log.info(f"Started finetuning pre-trained BERT (large-uncased) on same MLM objective with SEC data..")
            model.train_model(train_file=self.singleCorpusTrainFile, eval_file=self.singleCorpusEvalFile, output_dir=self.modelOutputDir, show_running_loss=True,)
            log.info(f"Finished finetuning and saving a pre-trained BERT (large-uncased) model on MLM with SEC data. "
                     f"Check the folder '{self.modelOutputDir}' for finetuned weights.")
            log.info(f"It took {round((time.time()-startTime)/3600, 1)} hours to finetune a pre-trained BERT model on "
                     f"MLM with SEC data from the corpus '{self.corpusFolder}'")

            # Evaluation
            log.info(f"Started evaluating the finetuned BERT (large-uncased) model on: a) perplexity, and b) eval_loss.")
            model.eval_model(eval_file=self.singleCorpusEvalFile, output_dir=self.modelOutputDir, verbose=True, silent=False)
            log.info(f"Finished evaluation of the finetuned BERT (large-uncased) model on MLM with SEC data. Check "
                     f"the evaluation results in the folder '{self.modelOutputDir}'.")
            log.info(f"It took {round((time.time()-startTime)/3600, 1)} hours to evaluate the finetuned BERT model on MLM.'")

        except Exception as e:
            err = f"** ERROR ** occurred while finetuning a pre-trained BERT model and evaluating it. Error is: {type(e).__name__}; {e}."
            raise Exception(err) from e
Example #21
    "vocab_size": 52000,
    "generator_config": {
        "embedding_size": 128,
        "hidden_size": 256,
        "num_hidden_layers": 3,
    },
    "discriminator_config": {
        "embedding_size": 128,
        "hidden_size": 256,
    },
}

train_file = "../data/train.txt"
test_file = "../data/test.txt"

model = LanguageModelingModel(
    "electra",
    None,
    args=train_args,
    train_files=train_file,
)

model.train_model(train_file, eval_file=test_file,)

model.eval_model(test_file)


model.save_discriminator()

model.save_generator()
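After save_discriminator() and save_generator(), the exported weights can typically be reloaded as ordinary Hugging Face checkpoints. A sketch (the save paths are assumptions; they depend on output_dir and the library version):

from transformers import ElectraForPreTraining, ElectraTokenizerFast

# Assumed layout: the discriminator is exported under the run's output directory.
discriminator = ElectraForPreTraining.from_pretrained("outputs/discriminator_model")
tokenizer = ElectraTokenizerFast.from_pretrained("outputs")  # tokenizer saved alongside the LM outputs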
Example #22
import pandas as pd

# Finetune m-BERT model (1ST PART)

finetune_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    'fp16': False,
    "special_tokens": ["<s>", "<pad>", "</s>", "<unk>", "<mask>", '</br>'],
    "evaluate_during_training" : True,
    'save_model_every_epoch': False,
    'num_train_epochs': 20,
    'train_batch_size' : 25,
}

# Highly recommended to set use_cuda=True to utilize a GPU (if available) for training
model = LanguageModelingModel('bert', 'bert-base-multilingual-cased', args=finetune_args, use_cuda=False)

model.train_model(curr_path + "/raw-poetry-train.txt", eval_file=curr_path + '/raw-poetry-test.txt')

# ## Train an emotion classifier with the fine-tuned model from above (2ND PART)

# def f1_evaluate(true, pred):
#     for p in pred:
#         for i in range(len(p)):
#             if p[i] >= 0.5:
#                 p[i] = 1
#             else:
#                 p[i] = 0

#     score = f1_score(true, pred, average = 'macro')
#     label = f1_score(true, pred, average = None)
}
model_args.vocab_size = 20000
model_args.output_dir = "{}/from_scratch_".format(output_dir_name)
model_args.best_model_dir = "{}/from_scratch/best_model".format(
    output_dir_name)
model_args.num_train_epochs = 5  #3
model_args.save_eval_checkpoints = True
model_args.overwrite_output_dir = True
model_args.evaluate_during_training = True  # needed to save the best model!
model_args.evaluate_during_training_verbose = True
#model_args.evaluate_during_training_steps = 3000,
#model = LanguageModelingModel("bert", None, args=model_args, train_files=train_file, use_cuda=False)
#model = LanguageModelingModel("electra", None, args=model_args, train_files=train_file, use_cuda=False)
model = LanguageModelingModel("albert",
                              None,
                              args=model_args,
                              train_files=train_file,
                              use_cuda=False)

#---------------------------------
"""
model_args = LanguageModelingArgs()
model_args.config = {
    "hidden_size": 768, # 768
    "num_hidden_layers": 3,  # 12
    "num_attention_heads": 3,
}
model_args.vocab_size = 20000
model_args.output_dir = "{}/from_scratch_".format(output_dir_name)
model_args.best_model_dir = "{}/from_scratch/best_model".format(output_dir_name)
model_args.num_train_epochs = 3
    "fp16_opt_level": "O1",

    # misc. settings
    "manual_seed": 3435,

    # output settings
    "overwrite_output_dir": True,
    "best_model_dir": "outputs/pretrain/electra_base_paper_final/best_model",
    "cache_dir": "cache_dir/pretrain",
    "output_dir": "outputs/pretrain/electra_base_paper_final/",
}

train_file = "inputs/pretrain/train-wikidump-books.en"
# test_file = "inputs/pretrain/test.txt"

model = LanguageModelingModel("electra", None, args=train_args)

model.load_and_cache_examples(train_file)
# model.load_and_cache_examples(test_file)

# def globalizer(fn=None, kwargs=None):
#     if fn:
#         fn(**kwargs)

# if __name__ == "__main__":
#     if train_args['n_gpu'] > 1:
#         fn = model.train_model
#         kwargs = {
#             'train_file': train_file,
#             'args': train_args,
#             'eval_file': test_file
Example #25
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the source and target files for the task.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help=
        "Path to pretrained model or model identifier from huggingface.co/models",
    )

    # Other parameters
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Whether to overwrite on the existing output dir")
    parser.add_argument(
        "--output_dir",
        default='output_dir/',
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=None,
        type=int,
        help="Max input seq length",
    )
    parser.add_argument(
        "--train_batch_size",
        default=16,
        type=int,
        help="Size of each train batch",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        default=1,
        type=int,
        help="gradient accumulation steps",
    )
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    train_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "block_size": 128,
        "max_seq_length": args.max_seq_length,
        "learning_rate": 5e-6,
        "train_batch_size": args.train_batch_size,
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "num_train_epochs": 100,
        "mlm": False,
        "fp16": False,
        "output_dir": args.output_dir,
        "dataset_type": "line_by_line",
    }

    model = LanguageModelingModel(model_type="gpt2",
                                  model_name=args.model_name_or_path,
                                  args=train_args)

    model.train_model(args.data_dir + "train.txt",
                      eval_file=args.data_dir + "test.txt")

    model.eval_model(args.data_dir + "test.txt")