Пример #1
0
def main():
    """Build the text-generation corpus: merge raw files, fit a tokenizer,
    then tokenize every processed .TXT file into a dataset."""
    combine_txt_files()

    # The tokenizer is fit on the merged corpus (_ALL.TXT) before any
    # per-file dataset is created.
    train_tokenizer(
        os.path.join(PROCESSED_DATA_DIR, "_ALL.TXT"),
        vocab_size=AI_TEXT_GEN_VOCAB_SIZE,
        save_path=AI_TEXT_GEN_TOKENIZED_DIR,
    )

    for entry in os.listdir(PROCESSED_DATA_DIR):
        if entry.endswith(".TXT"):
            create_dataset(os.path.join(PROCESSED_DATA_DIR, entry))
Пример #2
0
def train(user):
    """Fit a tokenizer on one user's text file and fine-tune a dedicated
    aitextgen model on it, saving under user_data/<user>/trained_model."""
    user = user.strip("\n")
    corpus_path = f"user_data/{user}.txt"

    train_tokenizer(corpus_path)

    model = aitextgen(tokenizer_file=tokenizer_file, config=config)
    dataset = TokenDataset(corpus_path,
                           tokenizer_file=tokenizer_file,
                           block_size=64)

    print(f"======================== {user} ========================")
    model.train(dataset,
                batch_size=8,
                num_steps=20000,
                learning_rate=1e-5,
                output_dir=f"user_data/{user}/trained_model")
Пример #3
0
def training():
    """Train a GPT-2 model from scratch on poesie_clean.txt, then sample
    five generations from it."""
    file_name = "./poesie_clean.txt"

    # Fit the BPE tokenizer first; it writes the vocab/merges files
    # referenced just below into the working directory.
    train_tokenizer(file_name)
    vocab_file = "aitextgen-vocab.json"
    merges_file = "aitextgen-merges.txt"

    model_config = get_config()
    model = aitextgen(vocab_file=vocab_file,
                      merges_file=merges_file,
                      config=model_config)

    corpus = TokenDataset(file_name,
                          vocab_file=vocab_file,
                          merges_file=merges_file,
                          block_size=64)

    model.train(corpus, batch_size=32, num_steps=180000)

    model.generate(5, prompt="Les artistes ")
Пример #4
0
def run(path_params: str):
    """End-to-end GPT-2-from-scratch run: train a tokenizer, train the
    model, generate samples, and persist the run parameters.

    Args:
        path_params: Path to a YAML file with 'data', 'ml' and
            'generation' sections (schema inferred from the keys read
            below — TODO confirm against the actual config file).
    """
    # Input
    params = load_yaml(path_params)
    cfg_data = params['data']
    cfg_ml = params['ml']
    cfg_gen = params['generation']
    logging.debug(f"Params: {params}")

    # Init: every artifact of this run lives in one timestamped directory.
    timestamp = datetime.utcnow().strftime('%Y%m%d%H%M%S')
    run_name = f"03_gpt2scratch_{timestamp}"
    model_dir = join(cfg_ml['save_path'], run_name)
    os.makedirs(model_dir, exist_ok=True)

    # Train tokenizer. A configured dropout of 0.0 means "disabled",
    # which the tokenizer API expects as None.
    logging.info("Training tokenizer...")
    raw_dropout = cfg_ml['tokenizer_dropout']
    dropout = None if raw_dropout == 0.0 else raw_dropout
    train_tokenizer(files=cfg_data['file_path'],
                    dropout=dropout,
                    vocab_size=cfg_ml['vocab_size'],
                    min_frequency=cfg_ml['tokens_min_frequency'],
                    save_path=model_dir)
    logging.info("Training tokenizer completed!")

    # Train GPT-2 model from scratch with the vocab/merges written above.
    logging.info("Training model...")
    gpt2_config = build_gpt2_config(vocab_size=cfg_ml['vocab_size'],
                                    max_length=cfg_ml['model_max_length'],
                                    dropout=cfg_ml['model_dropout'],
                                    n_embd=cfg_ml['model_n_embd'],
                                    n_layer=cfg_ml['model_n_layer'],
                                    n_head=cfg_ml['model_n_head'])
    logging.debug(f'Gpt2 configuration:{gpt2_config}')
    gpt2_model = aitextgen(config=gpt2_config,
                           vocab_file=join(model_dir, "aitextgen-vocab.json"),
                           merges_file=join(model_dir, "aitextgen-merges.txt"),
                           to_gpu=True)

    gpt2_model.train(cfg_data['file_path'],
                     line_by_line=False,
                     output_dir=model_dir,
                     num_steps=cfg_ml['train_steps'],
                     generate_every=cfg_ml['train_generate_every'],
                     save_every=cfg_ml['train_save_every'],
                     save_gdrive=False,
                     learning_rate=cfg_ml['train_learning_rate'],
                     batch_size=cfg_ml['train_batch_size'])
    logging.info("Training completed!")

    # Generate samples into <model_dir>/generation/<timestamp>.txt.
    logging.info("Generation starting...")
    generation_folder = join(model_dir, "generation")
    os.makedirs(generation_folder, exist_ok=True)
    generation_file_path = join(generation_folder, f"{timestamp}.txt")

    gpt2_model.generate_to_file(
        n=cfg_gen['n_text'],
        batch_size=cfg_gen['batch_size'],
        destination_path=generation_file_path,
        seed=cfg_gen['seed'],
        # NOTE(review): 'cleanup' is compared against the *string* 'True' —
        # presumably the YAML stores it quoted; verify against the config.
        cleanup=cfg_gen['cleanup'] == 'True',
        prompt=cfg_gen['prefix'],
        max_length=cfg_gen['max_length'],
        temperature=cfg_gen['temperature'],
        top_p=cfg_gen['top_p'],
        repetition_penalty=cfg_gen['repetition_penalty'],
        early_stopping=cfg_gen['early_stopping'],
        num_beams=cfg_gen['num_beams'])
    logging.info("Generation completed!")

    # Output persist: snapshot the exact params used, for reproducibility.
    model_params_path = join(model_dir, 'gpt2_scratch_params.yaml')
    with open(model_params_path, 'w') as f:
        yaml.dump(params, f, default_flow_style=False)
    logging.debug(f"Model params saved at {model_params_path}")
Пример #5
0
from aitextgen import aitextgen
from aitextgen.utils import GPT2ConfigCPU
from aitextgen.TokenDataset import TokenDataset
from aitextgen.tokenizers import train_tokenizer

# CPU-sized GPT-2 configuration for quick local experiments.
config = GPT2ConfigCPU()

file_name = 'philo.txt'
# Fit a BPE tokenizer on the corpus; writes vocab/merges files to the CWD.
train_tokenizer(file_name)
vocab_file = "aitextgen-vocab.json"
merges_file = "aitextgen-merges.txt"

# NOTE(review): tf_gpt2="124M" pulls pretrained GPT-2 weights while a fresh
# tokenizer was just trained above — confirm which of the two is intended.
ai = aitextgen(tf_gpt2="124M", config=config)
data = TokenDataset(file_name, block_size=64)

ai.train(data, batch_size=16, num_steps=200, save_every=10)

# Reload the freshly trained checkpoint for generation.
ai = aitextgen('trained_model/pytorch_model.bin',
               config='trained_model/config.json')
prompt_text = "I hate you"  # the earlier "What is life?" was overwritten immediately

gpt_text = ai.generate(prompt=prompt_text, top_p=0.9)

# The scratch code below was behind an UNTERMINATED triple-quote (''' with no
# closing quote), which made everything after it part of a string literal and
# broke the file's syntax. Kept here fully commented out; note it also
# shadowed the builtin `str`.
# s = '7.5, edss, 5, control'
# element = s.split(',')
# len(element)
# element
#
# numbers = []
Пример #6
0
from aitextgen.TokenDataset import TokenDataset
from aitextgen.tokenizers import train_tokenizer
from aitextgen.utils import GPT2ConfigCPU
from aitextgen import aitextgen

# Project layout: all artifacts live under the combined_guycode_model folder.
model_parent = "/Users/iyinoluwatugbobo/Desktop/Development/guyCodeAI/combined_guycode_model"
# Checkpoints are written here by train() below.
model_dir = "/Users/iyinoluwatugbobo/Desktop/Development/guyCodeAI/combined_guycode_model/trained_model"
# The combined training corpus (one plain-text file).
input_file = "/Users/iyinoluwatugbobo/Desktop/Development/guyCodeAI/combined_guycode_model/combined_input.txt"

# Side effect at import time: fits a BPE tokenizer on the corpus and writes
# aitextgen.tokenizer.json to the current working directory.
train_tokenizer(input_file)
tokenizer_file = "aitextgen.tokenizer.json"
config = GPT2ConfigCPU()  # CPU-sized GPT-2 configuration


def train():
    """Train the combined guycode model on the tokenized corpus and save
    checkpoints (plus periodic sample generations) to model_dir."""
    model = aitextgen(tokenizer_file=tokenizer_file, config=config)
    corpus = TokenDataset(input_file,
                          tokenizer_file=tokenizer_file,
                          block_size=64)
    model.train(corpus,
                batch_size=8,
                num_steps=50000,
                generate_every=10000,
                save_every=10000,
                output_dir=model_dir)


train()