# Example #1
# 0
def eval_layers(in_vecs_dir, in_labels_f, in_texts_f, task, device, n, num_layers=13):
    """Evaluate task and LM accuracy for probes trained on each layer.

    For every layer 0..num_layers-1, loads the precomputed vectors
    ``{in_vecs_dir}/from:<layer>.to:{num_layers}.npy`` (train split) and the
    corresponding dev split (paths derived by the 'train' -> 'dev'
    substitution), trains on at most ``n`` shuffled examples, and scores
    with ``layer_eval``.

    Args:
        in_vecs_dir: directory containing the per-layer vector files.
        in_labels_f: path to the train-split labels file.
        in_texts_f: path to the train-split text file.
        task: task identifier forwarded to get_appropriate_data/layer_eval.
        device: torch device string forwarded to layer_eval.
        n: maximum number of training samples to use.
        num_layers: number of layers to sweep (default 13 = BERT-base
            embeddings + 12 transformer layers).

    Returns:
        List of [layer_index, task_accuracy, lm_accuracy] rows.
    """
    _, tokenizer, out_embed, bias = get_lm_vals('bert-base-uncased')

    tasks_results = []

    for from_layer in tqdm(range(num_layers)):
        # File naming convention: probes map layer `from_layer` to the
        # final layer (`num_layers`).
        vecs_train_f = f'{in_vecs_dir}/from:{from_layer}.to:{num_layers}.npy'
        labels_train_f = in_labels_f

        vecs_train, labels_train, sentences_train = read_files(vecs_train_f, labels_train_f, text_f=in_texts_f,
                                                               ignore_special_tokens=True)
        vecs_dev, labels_dev, sentences_dev = read_files(vecs_train_f.replace('train', 'dev'),
                                                         labels_train_f.replace('train', 'dev'),
                                                         text_f=in_texts_f.replace('train', 'dev'),
                                                         ignore_special_tokens=True)

        (x_train, y_train, words_train), (x_dev, y_dev, words_dev) = get_appropriate_data(task, vecs_train,
                                                                                          labels_train,
                                                                                          sentences_train,
                                                                                          vecs_dev, labels_dev,
                                                                                          sentences_dev)

        # Cap the training set at n examples; shuffle deterministically.
        x_train, y_train = shuffle(x_train, y_train, random_state=0, n_samples=min(len(y_train), n))

        # Map dev-set word strings to vocabulary ids for the LM-accuracy check.
        y_ids = tokenizer.convert_tokens_to_ids(words_dev)
        task_acc, lm_acc = layer_eval(x_train, x_dev, y_train, y_dev, y_ids, out_embed, bias, task, device)
        print(from_layer, task_acc, lm_acc)
        tasks_results.append([from_layer, task_acc, lm_acc])
    return tasks_results
def learn_cls_for_layer_new(layer_name, labels_name, text_name, task, n):
    """Train and score a probe for one layer.

    Reads the train files given by the three paths and the matching dev
    files (same paths with 'train' replaced by 'dev'), subsamples the
    training set to ``n`` examples, and returns the dev score from
    either the classification or the PLS-based learner depending on
    whether ``task`` is a classification task.
    """
    # Dev-split paths mirror the train paths via substring substitution.
    dev_layer = layer_name.replace('train', 'dev')
    dev_labels = labels_name.replace('train', 'dev')
    dev_text = text_name.replace('train', 'dev')

    vecs_train, labels_train, sentences_train = read_files(
        layer_name, labels_name, text_name, ignore_special_tokens=True)
    vecs_dev, labels_dev, sentences_dev = read_files(
        dev_layer, dev_labels, dev_text, ignore_special_tokens=True)

    train_split, dev_split = get_appropriate_data(
        task, vecs_train, labels_train, sentences_train,
        vecs_dev, labels_dev, sentences_dev)
    x_train, y_train, words_train = train_split
    x_dev, y_dev, words_dev = dev_split

    # Deterministic subsample of the training data.
    x_train, y_train = shuffle(x_train, y_train, random_state=0, n_samples=n)

    learner = learn_cls if task in classification_tasks else learn_pls_cls
    return learner(x_train, y_train, x_dev, y_dev)
# Example #3
# 0

if __name__ == '__main__':
    arguments = docopt(__doc__)

    deprobe_dir = arguments['--deprobe_dir']
    if not os.path.isdir(deprobe_dir):
        # BUG FIX: the original `assert '<message>'` asserted a non-empty
        # string, which is always truthy — the check could never fire (and
        # asserts are stripped under `python -O`). Abort explicitly instead.
        raise SystemExit('Deprobing directory does not exist...')

    use_wandb = arguments['--wandb']
    if use_wandb:
        log_wandb(arguments)

    # Load the train split; dev-split file paths are derived from the train
    # paths by substituting 'train' -> 'dev'.
    vecs_train, labels_train, sentences_train = read_files(
        arguments['--vecs'],
        arguments['--labels'],
        arguments['--text'],
        ignore_special_tokens=True)
    vecs_dev, labels_dev, sentences_dev = read_files(
        arguments['--vecs'].replace('train', 'dev'),
        arguments['--labels'].replace('train', 'dev'),
        arguments['--text'].replace('train', 'dev'),
        ignore_special_tokens=True)

    task = arguments['--task']

    (x_train, y_train,
     words_train), (x_dev, y_dev, words_dev) = get_appropriate_data(
         task, vecs_train, labels_train, sentences_train, vecs_dev, labels_dev,
         sentences_dev)