def main():
    """Build the text-generation corpus: merge raw files, fit a tokenizer
    on the merged corpus, then create one dataset per processed .TXT file."""
    combine_txt_files()
    merged_corpus = os.path.join(PROCESSED_DATA_DIR, "_ALL.TXT")
    train_tokenizer(merged_corpus,
                    vocab_size=AI_TEXT_GEN_VOCAB_SIZE,
                    save_path=AI_TEXT_GEN_TOKENIZED_DIR)
    # Iterate directly instead of materializing an intermediate list.
    for entry in os.listdir(PROCESSED_DATA_DIR):
        if entry.endswith(".TXT"):
            create_dataset(os.path.join(PROCESSED_DATA_DIR, entry))
def train(user):
    """Fine-tune a per-user aitextgen model on user_data/<user>.txt
    and checkpoint it under user_data/<user>/trained_model."""
    user = user.strip("\n")
    corpus_path = f"user_data/{user}.txt"
    train_tokenizer(corpus_path)
    model = aitextgen(tokenizer_file=tokenizer_file, config=config)
    dataset = TokenDataset(corpus_path, tokenizer_file=tokenizer_file, block_size=64)
    print(f"======================== {user} ========================")
    model.train(
        dataset,
        batch_size=8,
        num_steps=20000,
        learning_rate=1e-5,
        output_dir=f"user_data/{user}/trained_model",
    )
def training():
    """Train an aitextgen model on the cleaned poetry corpus,
    then print five sample generations."""
    corpus = "./poesie_clean.txt"
    train_tokenizer(corpus)
    vocab_path = "aitextgen-vocab.json"
    merges_path = "aitextgen-merges.txt"
    model = aitextgen(vocab_file=vocab_path,
                      merges_file=merges_path,
                      config=get_config())
    dataset = TokenDataset(corpus,
                           vocab_file=vocab_path,
                           merges_file=merges_path,
                           block_size=64)
    model.train(dataset, batch_size=32, num_steps=180000)
    model.generate(5, prompt="Les artistes ")
def run(path_params: str):
    """End-to-end run: train a BPE tokenizer and a GPT-2 model from scratch,
    generate sample text, and persist the run parameters.

    Args:
        path_params: Path to a YAML file with 'data', 'ml' and 'generation'
            sections (loaded via load_yaml).

    Side effects:
        Creates a timestamped model directory under params['ml']['save_path'],
        writes tokenizer files, model checkpoints, generated text, and a copy
        of the parameters used.
    """
    # --- Input ---
    params = load_yaml(path_params)
    params_data = params['data']
    params_ml = params['ml']
    params_gen = params['generation']
    logging.debug(f"Params: {params}")

    # --- Init: timestamped output directory for this run ---
    timestamp = datetime.utcnow().strftime('%Y%m%d%H%M%S')
    run_name = f"03_gpt2scratch_{timestamp}"
    model_dir = join(params_ml['save_path'], run_name)
    os.makedirs(model_dir, exist_ok=True)

    # --- Train tokenizer ---
    logging.info("Training tokenizer...")
    # train_tokenizer expects dropout=None (not 0.0) to disable BPE dropout.
    dropout = params_ml[
        'tokenizer_dropout'] if params_ml['tokenizer_dropout'] != 0.0 else None
    train_tokenizer(files=params_data['file_path'],
                    dropout=dropout,
                    vocab_size=params_ml['vocab_size'],
                    min_frequency=params_ml['tokens_min_frequency'],
                    save_path=model_dir)
    logging.info("Training tokenizer completed!")

    # --- Train GPT-2 model ---
    logging.info("Training model...")
    gpt2_config = build_gpt2_config(vocab_size=params_ml['vocab_size'],
                                    max_length=params_ml['model_max_length'],
                                    dropout=params_ml['model_dropout'],
                                    n_embd=params_ml['model_n_embd'],
                                    n_layer=params_ml['model_n_layer'],
                                    n_head=params_ml['model_n_head'])
    logging.debug(f'Gpt2 configuration:{gpt2_config}')
    gpt2_model = aitextgen(config=gpt2_config,
                           vocab_file=join(model_dir, "aitextgen-vocab.json"),
                           merges_file=join(model_dir, "aitextgen-merges.txt"),
                           to_gpu=True)
    gpt2_model.train(params_data['file_path'],
                     line_by_line=False,
                     output_dir=model_dir,
                     num_steps=params_ml['train_steps'],
                     generate_every=params_ml['train_generate_every'],
                     save_every=params_ml['train_save_every'],
                     save_gdrive=False,
                     learning_rate=params_ml['train_learning_rate'],
                     batch_size=params_ml['train_batch_size'])
    logging.info("Training completed!")

    # --- Generate ---
    logging.info("Generation starting...")
    generation_folder = join(model_dir, "generation")
    os.makedirs(generation_folder, exist_ok=True)
    generation_file_path = join(generation_folder, f"{timestamp}.txt")
    gpt2_model.generate_to_file(
        n=params_gen['n_text'],
        batch_size=params_gen['batch_size'],
        destination_path=generation_file_path,
        seed=params_gen['seed'],
        # BUG FIX: YAML parses `cleanup: True` to a bool, and bool True is not
        # equal to the string 'True', so the old `== 'True'` check silently
        # disabled cleanup. Accept both the bool and the quoted string.
        cleanup=params_gen['cleanup'] in (True, 'True'),
        prompt=params_gen['prefix'],
        max_length=params_gen['max_length'],
        temperature=params_gen['temperature'],
        top_p=params_gen['top_p'],
        repetition_penalty=params_gen['repetition_penalty'],
        early_stopping=params_gen['early_stopping'],
        num_beams=params_gen['num_beams'])
    logging.info("Generation completed!")

    # --- Output: persist the exact parameters used for this run ---
    model_params_path = join(model_dir, 'gpt2_scratch_params.yaml')
    with open(model_params_path, 'w') as f:
        yaml.dump(params, f, default_flow_style=False)
    logging.debug(f"Model params saved at {model_params_path}")
# Script: train a small GPT-2 on philo.txt with aitextgen, then generate text.
from aitextgen import aitextgen
from aitextgen.utils import GPT2ConfigCPU
from aitextgen.TokenDataset import TokenDataset
from aitextgen.tokenizers import train_tokenizer

# CPU-sized GPT-2 configuration.
config = GPT2ConfigCPU()
file_name = 'philo.txt'
# Fit a tokenizer on the corpus (writes its output files to the working dir).
train_tokenizer(file_name)
# NOTE(review): these two paths are assigned but never passed to aitextgen or
# TokenDataset below — confirm whether they were meant to be wired in.
vocab_file = "aitextgen-vocab.json"
merges_file = "aitextgen-merges.txt"
# NOTE(review): passes both tf_gpt2="124M" and the CPU config — verify which
# one aitextgen actually uses; they look mutually exclusive.
ai = aitextgen(tf_gpt2="124M", config=config)
data = TokenDataset(file_name, block_size=64)
data  # bare expression, no effect here (leftover from a notebook/REPL session)
ai.train(data, batch_size=16, num_steps=200, save_every=10)
# Reload the freshly trained weights for generation.
ai = aitextgen('trained_model/pytorch_model.bin', config='trained_model/config.json')
prompt_text = "What is life?"
prompt_text = "I hate you"  # overwrites the previous prompt; only this one is used
gpt_text = ai.generate(prompt=prompt_text, top_p=0.9)
# The triple-quoted string below is unterminated in this chunk; kept verbatim.
''' str = '7.5, edss, 5, control' element = str.split(',') len(element) element numbers = []
"""Train a CPU-sized GPT-2 on the combined guycode corpus with aitextgen."""
from aitextgen.TokenDataset import TokenDataset
from aitextgen.tokenizers import train_tokenizer
from aitextgen.utils import GPT2ConfigCPU
from aitextgen import aitextgen

# Absolute paths for the combined-corpus model.
model_parent = "/Users/iyinoluwatugbobo/Desktop/Development/guyCodeAI/combined_guycode_model"
model_dir = "/Users/iyinoluwatugbobo/Desktop/Development/guyCodeAI/combined_guycode_model/trained_model"
input_file = "/Users/iyinoluwatugbobo/Desktop/Development/guyCodeAI/combined_guycode_model/combined_input.txt"

# File produced by train_tokenizer() and consumed by train().
tokenizer_file = "aitextgen.tokenizer.json"
config = GPT2ConfigCPU()


def train():
    """Train a CPU-sized GPT-2 on input_file, checkpointing into model_dir."""
    ai = aitextgen(tokenizer_file=tokenizer_file, config=config)
    data = TokenDataset(input_file, tokenizer_file=tokenizer_file, block_size=64)
    ai.train(data,
             batch_size=8,
             num_steps=50000,
             generate_every=10000,
             save_every=10000,
             output_dir=model_dir)


if __name__ == "__main__":
    # FIX: guard the side effects so importing this module no longer retrains
    # the tokenizer and the model; running it as a script behaves as before.
    train_tokenizer(input_file)
    train()