Example #1
def predict_single(model,
                   tokenizer,
                   prompt,
                   top=1,
                   device=None,
                   max_length=None,
                   beams=None):
    if device is None:
        device = cfg('device')
    if max_length is None:
        max_length = cfg('max_gen')
    if beams is None:
        beams = cfg('beams')

    prompt = tokenize_query(tokenizer, prompt, device)
    model = model.to(device)

    output_sequences = model.generate(input_ids=prompt,
                                      max_length=max_length,
                                      num_beams=max(top, beams),
                                      do_sample=False,
                                      num_return_sequences=top,
                                      pad_token_id=tokenizer.eos_token_id)
    output = decode_batch(tokenizer, output_sequences)
    return output
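
A minimal usage sketch (not part of the original source), assuming get_tokenizer() and get_model() from the later examples have been run and cfg() holds a valid device and model configuration; the prompt string and top value are illustrative:

tokenizer = get_tokenizer()
model = get_model(tokenizer)
# returns a list of `top` bash commands decoded from the beam search output
commands = predict_single(model, tokenizer, "list all files modified today", top=3)
print(commands)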
Example #2
def validate(model, tokenizer, dev_nls, dev_cms):
    outputs = [
        predict_single_mod(model, tokenizer, dev_nl, top=cfg('val_n'))
        for dev_nl in dev_nls
    ]

    confidences = [x[1] for x in outputs]
    predictions = [x[0] for x in outputs]

    scores_template = [
        get_template_score(dev_cm[0], pred_cm)
        for (pred_cm, dev_cm) in zip(predictions, dev_cms)
    ]
    print(f"[DEBUG]: TM score {np.mean(scores_template)}")

    scores_blue = [
        sentence_bleu(dev_cm, pred_cm[0])
        for (pred_cm, dev_cm) in zip(predictions, dev_cms)
    ]
    print(f"[DEBUG]: BLUE score {np.mean(scores_blue)}")

    if cfg('val_metric') == 'BLUE':
        scores = scores_blue
    elif cfg('val_metric') == 'template':
        scores = scores_template
    else:
        assert False, f"Unkown validation metric '{metric}'"
    return scores, predictions
Example #3
def get_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(cfg('model'))
    # gpt2 has no padding by default
    try:
        if cfg('eos') != tokenizer.eos_token:
            print(
                f"Warning: non-default eos token (default is {tokenizer.eos_token})"
            )
    except Exception:
        print("Warning: no default eos token")
    tokenizer.add_tokens(cfg('eos'))
    tokenizer.eos_token = cfg('eos')
    print("EOS", tokenizer.eos_token, tokenizer.eos_token_id)
    # add eos as pad_token should there be no pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print("PAD", tokenizer.pad_token, tokenizer.pad_token_id)
    try:
        if cfg('add_tokens'):
            # ensure that there are tokens for separators and common bash tools
            #added = tokenizer.add_tokens([cfg('sep1'), cfg('sep2')])
            added = tokenizer.add_tokens(bashinfo.top_100_utilities)
            print(f"added {added} tokens")
    except Exception:
        pass
    return tokenizer
Example #4
def main():
    print("PREPROCESSING DATA")
    preprocess()
    print("LOADING TOKENIZER")
    tokenizer = get_tokenizer()
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False)
    print("LOADING MODEL", cfg('model'))
    model = get_model(tokenizer)

    print("LOADING DATA")
    if cfg('encoding') == 'LBL':
        train_dataset = LBLDataset(tokenizer=tokenizer,
                                   file_path=filename('train'))
    elif cfg('encoding') == 'blocked':
        train_dataset = BlockedDataset(tokenizer=tokenizer,
                                       file_path=filename('train'))
    elif cfg('encoding') == 'text':
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=filename('train'),
                                    block_size=cfg('max_block'))
    elif cfg('encoding').startswith('inter'):
        if cfg('encoding').endswith('LBL'):
            loader = LBLDataset
        elif cfg('encoding').endswith('blocked'):
            loader = BlockedDataset
        else:
            raise ValueError(f"Unknown interleaved encoding '{cfg('encoding')}'")

        d1 = loader(tokenizer=tokenizer, file_path=filename('train'))
        d2 = loader(tokenizer=tokenizer, file_path=filename('dirty'))
        train_dataset = CombinedDataset(d1, d2)
    else:
        raise ValueError("Unkown encoding")

    trainer = get_trainer(train_dataset, data_collator, model)

    def validator(x, y):
        global BEST_metric
        model.save_pretrained(session)
        metric, pred = validate(model, tokenizer, x, y)
        if np.mean(metric) > BEST_metric:
            print("NEW BEST (saving)")
            BEST_metric = np.mean(metric)

        # save predictions and model
        save(session + "metric.txt", str(metric) + "\n")
        save(session + "pred.txt", str(pred) + "\n\n")
        return metric, pred

    trainer.validator = validator
    trainer.val_dataset = get_validation_data()

    # saving configuration
    print("SAVING...")
    session = get_session_path()
    print(session)
    save(session + "conf.txt", repr(cfg()))

    print("STARTING TRAINING...")
    trainer.train()
Example #5
def decode(tokenizer, v):
    text = tokenizer.decode(v, clean_up_tokenization_spaces=False)
    # remove query at the start
    start = text.find(cfg('sep2')) + len(cfg('sep2'))
    text = text[start:]
    # remove possible junk at the end
    end = text.find("\n")
    if end != -1:
        text = text[:end]
    text = text.strip('\n ')
    return text
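
To make the trimming steps concrete, here is a self-contained sketch of the same string logic; the '|||' separator and the sample text are assumptions, while the real separator comes from cfg('sep2'):

sep2 = "|||"                                # stand-in for cfg('sep2')
text = "show hidden files ||| ls -a\nsome trailing junk"
text = text[text.find(sep2) + len(sep2):]   # drop the query prefix
end = text.find("\n")
if end != -1:
    text = text[:end]                       # drop anything after the first newline
print(text.strip('\n '))                    # -> "ls -a"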
Example #6
def get_model(tokenizer, resume=False):
    if cfg('random_init'):
        # load randomly initialized model instead of pretrained
        model_config = transformers.GPT2Config()
        model = transformers.GPT2LMHeadModel(model_config)
    elif resume:
        # resume from previous best
        model = AutoModelForCausalLM.from_pretrained(
            cfg('out_path') + cfg('name'))
    else:
        # load pretrained model
        model = AutoModelForCausalLM.from_pretrained(cfg('model'))
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(cfg('device'))
    return model
Example #7
def predict_batch(model, tokenizer, input_ids, top=1):
    """ Requires all inputs to be of equal length """
    input_ids = torch.tensor(input_ids)
    input_ids = input_ids.to(cfg('device'))

    # prediction
    output_sequences = model.generate(input_ids=input_ids,
                                      max_length=cfg('max_gen'),
                                      num_beams=cfg('beams'),
                                      do_sample=False,
                                      num_return_sequences=top,
                                      pad_token_id=tokenizer.eos_token_id)
    print(output_sequences.shape)
    output = decode_batch(tokenizer, output_sequences)
    return output
Example #8
def preprocess():
    for NAME in ('dev', 'train', 'dirty'):
        try:
            cm = read_data(NAME + '_cm.txt')
            nl = read_data(NAME + '_nl.txt')
        except FileNotFoundError:
            print(f"[WARNING]: {NAME} data not found")
            continue

        al = [context(x) for x in zip(nl, cm)]
        al = "".join(al)
        if not al.endswith(cfg('eos')):
            al += cfg('eos')

        save_data(NAME + ".txt", al)
Example #9
def context(pair):
    if '' in pair:
        return ''
    enc = f"{encode(pair[0])} {pair[1]}"
    if cfg('encoding').endswith('LBL'):
        return f"{enc} {cfg('eos')}\n"
    else:
        return f"{enc}\n"
Example #10
def get_trainer(train_dataset, collator, model):
    training_args = TrainingArguments(
        output_dir='output/bash',
        overwrite_output_dir=True,
        do_train=True,
        no_cuda=cfg('device') == 'cpu',
        num_train_epochs=cfg('epochs'),
        per_device_train_batch_size=cfg('batch_size'),
        gradient_accumulation_steps=cfg('grad_acc'),
        logging_steps=5,
        save_steps=0,
        seed=random.randint(0, 2**32 - 1))
    trainer = MTrainer(model=model,
                       args=training_args,
                       data_collator=collator,
                       train_dataset=train_dataset,
                       prediction_loss_only=True)
    return trainer
Example #11
def predict_single_mod(model,
                       tokenizer,
                       prompt,
                       top=1,
                       device=None,
                       beams=None):
    if device is None:
        device = cfg('device')
    if beams is None:
        beams = cfg('beams')
    prompt = tokenize_query(tokenizer, prompt, device)

    # bit hacky
    PreTrainedModel._generate_beam_search = mbs._generate_beam_search
    output_sequences, output_scores = model.generate(
        input_ids=prompt,
        max_length=300,  # max_length less relevant as mod does early stopping
        num_beams=max(top, beams),
        do_sample=False,
        num_return_sequences=top,
        pad_token_id=tokenizer.eos_token_id)
    output = decode_batch(tokenizer, output_sequences)
    return output, output_scores
Example #12
def predict_diverse(model, tokenizer, prompt, temp, top_p, top=1):
    prompt = tokenize_query(tokenizer, prompt)

    output_sequences = model.generate(input_ids=prompt,
                                      max_length=cfg('max_gen'),
                                      temperature=temp,
                                      top_p=top_p,
                                      do_sample=True,
                                      num_return_sequences=top,
                                      pad_token_id=tokenizer.eos_token_id)
    output = decode_batch(tokenizer, output_sequences)
    if len(output) == 1:
        return output[0]
    else:
        return output
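
A hedged usage sketch for sampled generation (not from the original source); the model and tokenizer are assumed to be loaded as in the earlier examples, and the temperature/top_p values are illustrative only:

# sampling with temperature and nucleus (top_p) filtering instead of beam search
suggestions = predict_diverse(model, tokenizer,
                              "compress the logs directory",
                              temp=0.8, top_p=0.95, top=5)
# with top=1 a single decoded string is returned, otherwise a list of strings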
Example #13
def prepare_onnx_generation(model_path, device):
    load_cfg(model_path)
    print("# Converting model")
    convert_to_onnx(model_path, cfg('model'), 'onnx/model.onnx')
    print("# Loading model into huggingface")
    model = transformers.AutoModelForCausalLM.from_pretrained(model_path)
    model.to(device)
    if device == 'cuda':
        provider = 'CUDAExecutionProvider'
    else:
        raise ValueError("Unkown device")
    print("# Loading model into ONNX")
    onnx_model = create_model_for_provider('onnx/model.onnx', provider)
    print("# Loading tokenizer")
    tokenizer = get_tokenizer()
    return onnx_model, model.lm_head, tokenizer
Example #14
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 file_path: str,
                 debug=False):
        assert os.path.isfile(file_path)
        block_size = cfg('max_block')
        block_size = block_size - tokenizer.num_special_tokens_to_add(
            pair=False)

        self.examples = []
        print(file_path)
        with open(file_path, encoding="utf-8") as f:
            text = f.read()

        tokenized_text = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(text))
        eos = tokenizer.eos_token_id
        cblock = [tokenized_text.pop(0)]

        while tokenized_text:
            # list.index raises ValueError if eos is absent, so check membership first
            if eos not in tokenized_text:
                break
            t = tokenized_text.index(eos)
            if t > block_size:
                # if entry doesn't fit in block, throw it away
                tokenized_text = tokenized_text[t + 1:]
                print(f"Throwing away {t} tokens.")
            elif t + len(cblock) <= block_size:
                cblock += tokenized_text[:t + 1]
                tokenized_text = tokenized_text[t + 1:]
            else:
                rest = block_size - len(cblock)

                cblock = cblock + [eos] * rest
                self.examples.append(cblock)
                cblock = [eos]

        if debug:
            for i in self.examples[:1]:
                print(i)
                print(tokenizer.decode(i))
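
The packing loop above is the core of this dataset, so here is a self-contained sketch of the same idea on plain integer token ids: greedily fill fixed-size blocks with eos-terminated entries, pad the remainder with eos, and drop entries that do not fit a block. The function name and the toy ids are made up for illustration:

def pack_blocks(ids, eos, block_size):
    """Greedily pack eos-terminated entries into fixed-size blocks."""
    blocks, current = [], [eos]
    while eos in ids:
        t = ids.index(eos)
        entry, ids = ids[:t + 1], ids[t + 1:]
        if len(entry) > block_size:
            continue                        # entry cannot fit any block: drop it
        if len(current) + len(entry) <= block_size:
            current += entry                # entry fits into the current block
        else:
            blocks.append(current + [eos] * (block_size - len(current)))
            current = [eos] + entry         # start a new block with this entry
    return blocks                           # a trailing partial block is discarded

print(pack_blocks([1, 2, 0, 3, 0, 4, 5, 0], eos=0, block_size=4))
# -> [[0, 1, 2, 0], [0, 3, 0, 0]]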
Example #15
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 file_path: str,
                 debug=False):
        assert os.path.isfile(file_path)
        with open(file_path, encoding="utf-8") as f:
            lines = f.read().splitlines()

        pairs = [x + "\n" + y for x, y in zip(lines[::2], lines[1::2])]
        if debug:
            for i in pairs[:2]:
                print(i)
        batch_encoding = tokenizer(pairs,
                                   add_special_tokens=True,
                                   padding=True,
                                   truncation=True,
                                   max_length=cfg('max_line'))
        self.examples = batch_encoding["input_ids"]
        if debug:
            for i in self.examples[:2]:
                print(i)
Example #16
    def __getitem__(self, i) -> torch.Tensor:
        return torch.tensor(self.examples[i],
                            dtype=torch.long,
                            device=cfg('device'))
Example #17
    def _setup_wandb(self):
        if cfg('wandb'):
            super(MTrainer, self)._setup_wandb()
Example #18
def save_data(filename, content):
    if isinstance(content, list):
        content = "\n".join(content)
    with open(cfg('data_path') + filename, 'w+') as handle:
        handle.write(content)
Example #19
def read_data(filename):
    with open(cfg('data_path') + filename, 'r') as handle:
        content = handle.readlines()
    return [x.strip() for x in content]
Example #20
def get_session_path():
    path = cfg('out_path') + datetime.now().strftime("%m-%d_%H:%M:%S") + '/'
    os.mkdir(path)
    return path