Example 1
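The examples below are excerpted from a larger clustering project and omit their imports. A minimal sketch of the third-party imports they rely on is given here; the project-specific classes are listed only as a comment because their module paths are assumptions:

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from tqdm import trange
import GPUtil
from clearml import Task
from sentence_transformers import SentenceTransformer, models
# Project-specific components (import paths depend on the repository):
# CustomSentenceTransformer, ClusterEvaluator, BBClusterLossModel,
# BBSpectralClusterLossModel, BinaryLoss, InputTRECCARExample,
# get_trec_dat, get_paratext_dict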
def run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps,
                               num_epochs, warmup_frac, lambda_val, reg, beta, loss_name, use_model_device, model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: '+str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: '+str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    GPUtil.showUtilization()
    if loss_name == 'bbspec':
        loss_model = BBSpectralClusterLossModel(model=model, device=device,
                                                lambda_val=config_dict.get('lambda_val', lambda_val),
                                                reg_const=config_dict.get('reg', reg), beta=beta)
    else:
        loss_model = BBClusterLossModel(model=model, device=device,
                                        lambda_val=config_dict.get('lambda_val', lambda_val),
                                        reg_const=config_dict.get('reg', reg))
    # reg_loss_model = ClusterDistLossModel(model=model)

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    GPUtil.showUtilization()
    # train_dataloader2 = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    GPUtil.showUtilization()
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # warm up over warmup_frac of the total training steps

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)
    GPUtil.showUtilization()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
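A minimal call sketch for this helper; every path and hyperparameter below is a placeholder, and the *_cluster_data arguments are assumed to be the project's pre-built cluster batches:

run_fixed_lambda_bbcluster(train_cluster_data=train_cluster_data,
                           val_cluster_data=val_cluster_data,
                           test_cluster_data=test_cluster_data,
                           output_path='output/bbcluster_fixed_lambda',
                           train_batch_size=1,
                           eval_steps=100,
                           num_epochs=1,
                           warmup_frac=0.1,
                           lambda_val=200.0,
                           reg=2.5,
                           beta=10.0,
                           loss_name='bb',
                           use_model_device=True)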
Example 2
def evaluate_treccar(model_path, test_art_qrels, test_top_qrels,
                     test_hier_qrels, test_paratext, level):
    test_page_paras, test_rev_para_top, test_rev_para_hier = get_trec_dat(
        test_art_qrels, test_top_qrels, test_hier_qrels)
    test_len_paras = np.array(
        [len(test_page_paras[page]) for page in test_page_paras.keys()])
    print('test mean paras: %.2f, std: %.2f, max paras: %d' %
          (np.mean(test_len_paras), np.std(test_len_paras),
           np.max(test_len_paras)))
    test_ptext_dict = get_paratext_dict(test_paratext)
    test_top_cluster_data = []
    test_hier_cluster_data = []
    max_num_doc_test = max(
        [len(test_page_paras[p]) for p in test_page_paras.keys()])
    test_pages = list(test_page_paras.keys())
    for i in trange(len(test_pages)):
        page = test_pages[i]
        paras = test_page_paras[page]
        paratexts = [test_ptext_dict[p] for p in paras]
        top_sections = list(set([test_rev_para_top[p] for p in paras]))
        top_labels = [top_sections.index(test_rev_para_top[p]) for p in paras]
        hier_sections = list(set([test_rev_para_hier[p] for p in paras]))
        hier_labels = [
            hier_sections.index(test_rev_para_hier[p]) for p in paras
        ]
        query_text = ' '.join(page.split('enwiki:')[1].split('%20'))
        n = len(paras)
        paras = paras + ['dummy'] * (max_num_doc_test - n)
        paratexts = paratexts + [''] * (max_num_doc_test - n)
        top_labels = top_labels + [-1] * (max_num_doc_test - n)
        hier_labels = hier_labels + [-1] * (max_num_doc_test - n)
        test_top_cluster_data.append(
            InputTRECCARExample(qid=page,
                                q_context=query_text,
                                pids=paras,
                                texts=paratexts,
                                label=np.array(top_labels)))
        test_hier_cluster_data.append(
            InputTRECCARExample(qid=page,
                                q_context=query_text,
                                pids=paras,
                                texts=paratexts,
                                label=np.array(hier_labels)))
    print("Top-level datasets")
    print("Test instances: %5d" % len(test_top_cluster_data))

    model = SentenceTransformer(model_path)
    if level == 'h':
        print('Evaluating hierarchical clusters')
        test_evaluator = ClusterEvaluator.from_input_examples(
            test_hier_cluster_data)
        model.evaluate(test_evaluator)
    else:
        print('Evaluating toplevel clusters')
        test_evaluator = ClusterEvaluator.from_input_examples(
            test_top_cluster_data)
        model.evaluate(test_evaluator)
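A call sketch for this evaluation entry point; all file paths are placeholders for the TREC CAR qrels and paratext files, and any level value other than 'h' evaluates the top-level clusters:

evaluate_treccar(model_path='output/treccar_model',
                 test_art_qrels='data/test.article.qrels',
                 test_top_qrels='data/test.top.qrels',
                 test_hier_qrels='data/test.hierarchical.qrels',
                 test_paratext='data/test.paratext.tsv',
                 level='t')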
Example 3
def run_binary_model(train_pairs, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps, num_epochs, warmup_frac,
                       use_model_device, model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_pairs')
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])

    train_dataloader = DataLoader(train_pairs, shuffle=True, batch_size=train_batch_size)
    train_loss = BinaryLoss(model=model)

    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # warm up over warmup_frac of the total training steps

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
Example 4
def evaluate_ng20(model_path, test_cluster_data, gpu_eval):
    if torch.cuda.is_available():
        print('CUDA is available')
        device = torch.device('cuda')
    else:
        print('Using CPU')
        device = torch.device('cpu')
    model = SentenceTransformer(model_path)
    model.to(device)
    test_evaluator = ClusterEvaluator.from_input_examples(
        test_cluster_data, gpu_eval)
    model.evaluate(test_evaluator)
Example 5
def _run_fixed_lambda_bbcluster(train_batch_size,
                                num_epochs,
                                lambda_val,
                                reg,
                                use_model_device,
                                eval_steps,
                                out_path,
                                warmup_frac=0.1,
                                model_name='distilbert-base-uncased',
                                out_features=256):
    exp_task = Task.create(project_name='Optuna Hyperparam optim',
                           task_name='trial')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = exp_task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))
    word_embedding_model = models.Transformer(model_name)

    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=out_features,
        activation_function=nn.Tanh())

    model = CustomSentenceTransformer(
        modules=[word_embedding_model, pooling_model, doc_dense_model])
    loss_model = BBClusterLossModel(model=model,
                                    device=device,
                                    lambda_val=config_dict.get(
                                        'lambda_val', lambda_val),
                                    reg_const=config_dict.get('reg', reg))

    # NOTE: train_cluster_data and val_cluster_data are not parameters of this
    # helper; they are assumed to be provided by the enclosing scope.
    train_dataloader = DataLoader(train_cluster_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data,
                                                     use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs *
                       warmup_frac)  # 10% of train data

    model.to(device)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              epochs=num_epochs,
              warmup_steps=warmup_steps,
              evaluator=evaluator,
              evaluation_steps=eval_steps,
              output_path=out_path)
    best_model = CustomSentenceTransformer(out_path)
    return evaluator(best_model)
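Given the project name 'Optuna Hyperparam optim' and the fact that this helper returns the evaluator score of the best checkpoint, it appears intended to serve as the body of an Optuna objective. A minimal wrapper sketch, assuming the returned score is higher-is-better and that train_cluster_data and val_cluster_data are already in scope; the search ranges, trial count, and output paths are placeholders:

import optuna

def objective(trial):
    lambda_val = trial.suggest_float('lambda_val', 10.0, 500.0)
    reg = trial.suggest_float('reg', 0.1, 10.0)
    return _run_fixed_lambda_bbcluster(train_batch_size=1,
                                       num_epochs=1,
                                       lambda_val=lambda_val,
                                       reg=reg,
                                       use_model_device=True,
                                       eval_steps=100,
                                       out_path='output/optuna_trial_%d' % trial.number)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)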