def test_multiclass(self):
        transformer = models.Transformer('prajjwal1/bert-tiny')
        model = SentenceTransformer(modules=[
            transformer,
            models.Pooling(transformer.get_word_embedding_dimension())
        ])
        softmax_loss = losses.SoftmaxLoss(
            model, transformer.get_word_embedding_dimension(), num_labels=3)

        samples = [
            InputExample(texts=[
                "Hello Word, a first test sentence",
                "Hello Word, a other test sentence"
            ],
                         label=0),
            InputExample(texts=[
                "Hello Word, a second test sentence",
                "Hello Word, a other test sentence"
            ],
                         label=1),
            InputExample(texts=[
                "Hello Word, a third test sentence",
                "Hello Word, a other test sentence"
            ],
                         label=2)
        ]
        dataloader = DataLoader(samples, batch_size=1)
        evaluator = MulticlassEvaluator(dataloader, softmax_model=softmax_loss)
        result = evaluator(model)

        i = 0
Example #2
def train(hp):
    """Train the advanced blocking model
    Store the trained model in hp.model_fn.

    Args:
        hp (Namespace): the hyperparameters

    Returns:
        None
    """
    # define model
    model_names = {'distilbert': 'distilbert-base-uncased',
                   'bert': 'bert-base-uncased',
                   'albert': 'albert-base-v2' }

    word_embedding_model = models.Transformer(model_names[hp.lm])
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # load the training and validation data
    reader = Reader()
    trainset = SentencesDataset(examples=reader.get_examples(hp.train_fn),
                                model=model)
    train_dataloader = DataLoader(trainset,
                                  shuffle=True,
                                  batch_size=hp.batch_size)
    train_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=2)

    dev_data = SentencesDataset(examples=reader.get_examples(hp.valid_fn),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=hp.batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # use 10% of the training steps for warm-up (len(train_dataloader) already counts batches)
    warmup_steps = math.ceil(len(train_dataloader) * hp.n_epochs * 0.1)

    if os.path.exists(hp.model_fn):
        import shutil
        shutil.rmtree(hp.model_fn)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=hp.n_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=hp.model_fn,
          fp16=hp.fp16,
          fp16_opt_level='O2')
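A hedged usage sketch: the hyperparameter names come from the attribute accesses in the function above, while the paths and values are placeholders.

from argparse import Namespace

hp = Namespace(lm='distilbert',               # one of 'distilbert', 'bert', 'albert'
               train_fn='data/train.txt',     # placeholder path read by Reader
               valid_fn='data/valid.txt',     # placeholder path read by Reader
               model_fn='output/blocking_model',
               batch_size=32,
               n_epochs=3,
               fp16=False)
train(hp)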
Example #3
    def __init__(self,
                 model_name,
                 args=None,
                 embedding_learning=None,
                 threshold=None):
        """
        Initializes a STClassificationModel
        :param model_name:
        :param args:
        """
        self.args = SPArgs()

        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, SPArgs):
            self.args = args

        if embedding_learning is not None and embedding_learning == 'from-scratch':
            word_embedding_model = models.Transformer(
                model_name, max_seq_length=self.args.max_seq_length)
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension())

            self.model = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])
        else:
            self.model = SentenceTransformer(model_name)

        self.threshold = threshold
        # Because AveragePrecision with Cosine-Similarity is used to pick the best model by
        # BinaryClassificationEvaluator
        self.score_type = "cosine_average_precision"
        self.threshold_type = "cosine_f1_threshold"
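A hedged instantiation sketch; the checkpoint names are illustrative, and the args dict only relies on the max_seq_length field referenced above.

# Build the SentenceTransformer from a raw checkpoint ('from-scratch')
clf = STClassificationModel('bert-base-uncased',
                            args={'max_seq_length': 128},
                            embedding_learning='from-scratch',
                            threshold=0.5)

# Or wrap an already fine-tuned SentenceTransformer checkpoint
clf = STClassificationModel('distilbert-base-nli-stsb-mean-tokens')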
Example #4
def fit_model(df, method):
    """
        Fitting chosen model

        params:
            df: DataFrame used,
            method: model chosen

        returns:
            generated model,
            transformed datas
    """

    if method == "TF-IDF":
        model = TfidfVectorizer(analyzer='word',
                                ngram_range=(1, 2),
                                min_df=0,
                                stop_words=STOPS)
        X = model.fit_transform(df['content'])
    elif method == "CountVectorizer":
        model = CountVectorizer(analyzer='word',
                                ngram_range=(1, 2),
                                min_df=0,
                                stop_words=STOPS)
        X = model.fit_transform(df['content'])
    elif method == "BERT":
        word_embedding_model = models.Transformer('camembert-base')
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        X = model.encode(df['content'], show_progress_bar=True)
    else:
        raise ValueError("Unknown method: {}".format(method))
    return model, X
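A small usage sketch, assuming STOPS is a module-level stop-word list and the DataFrame exposes the 'content' column the function reads; the French placeholder texts match the camembert-base branch.

import pandas as pd

STOPS = ['le', 'la', 'les']  # placeholder stop-word list
df = pd.DataFrame({'content': ["premier document de test",
                               "second document de test"]})

tfidf_model, X_tfidf = fit_model(df, method="TF-IDF")
# bert_model, X_bert = fit_model(df, method="BERT")  # downloads camembert-base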
Example #5
def getSentenceVector(doc, model_params: dict = {}, encoder="bert",
                      model_name='bert-base-cased'):
  
  sp = spacy.load('en_core_web_sm')
  tokenized = sp(doc)
  sentences = []
  for token in tokenized.sents:
    sentences.append(token.text)

  if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta', 'bart', 'finbert']:
    # Use encoder for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name,
                tokenizer_args=model_params.get('tokenizer_args', {}))
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   
    sentence_embeddings = model.encode(sentences)
    

  elif encoder == 'use':
    #!pip install embedding-as-service
    from embedding_as_service.text.encode import Encoder
    en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
    sentence_embeddings = en.encode(texts=sentences)


  elif encoder == 'infersent':
    import nltk
    nltk.download('punkt')
    from models import InferSent
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
    infersent = InferSent(params_model)
    W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab(sentences, tokenize=True)
    sentence_embeddings = infersent.encode(sentences, tokenize=True)


  elif encoder == 'sent2vec':
    import sent2vec
    model = sent2vec.Sent2vecModel()
    model.load_model('drive/My Drive/torontobooks_unigram.bin') 
    sentence_embeddings = model.embed_sentences(sentences)
   

  elif encoder == 'laser':
    from laserembeddings import Laser
    laser = Laser()  ## Also used for multilingual sentence embeddings
    sentence_embeddings = laser.embed_sentences(sentences, lang='en') 
  
  
  else:
    raise ValueError('Invalid encoder {} or encoder Unavailable.'.format(encoder))  
  
  return list(zip(sentences, sentence_embeddings))
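Usage sketch for the default BERT branch; it assumes the spaCy model en_core_web_sm is installed for sentence splitting.

doc = ("Transformers map tokens to contextual embeddings. "
       "Mean pooling then turns them into one sentence vector.")
pairs = getSentenceVector(doc, encoder='bert', model_name='bert-base-cased')
for sentence, vector in pairs:
    print(sentence, vector.shape)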
Example #6
def run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps,
                               num_epochs, warmup_frac, lambda_val, reg, beta, loss_name, use_model_device, model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: '+str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: '+str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    GPUtil.showUtilization()
    if loss_name == 'bbspec':
        loss_model = BBSpectralClusterLossModel(model=model, device=device,
                                                lambda_val=config_dict.get('lambda_val', lambda_val),
                                                reg_const=config_dict.get('reg', reg), beta=beta)
    else:
        loss_model = BBClusterLossModel(model=model, device=device,
                                        lambda_val=config_dict.get('lambda_val', lambda_val),
                                        reg_const=config_dict.get('reg', reg))
    # reg_loss_model = ClusterDistLossModel(model=model)

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    GPUtil.showUtilization()
    # train_dataloader2 = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    GPUtil.showUtilization()
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)
    GPUtil.showUtilization()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
Example #7
def get_model():
    # Google-Drive link: https://drive.google.com/drive/folders/1sUxvLCTJHOkPeB4thHO-RW8WI3DWLHos?usp=sharing
    PATH = "DeepPavlov/rubert-base-cased"

    model = models.Transformer(PATH)
    pooling_model = models.Pooling(model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[model, pooling_model])
    nltk.download('punkt')
    return model
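Usage sketch; the punkt download above suggests the caller splits text into sentences with nltk before encoding, so a minimal flow might look like this (the Russian text is a placeholder).

import nltk

model = get_model()
sentences = nltk.sent_tokenize("Первое предложение. Второе предложение.", language='russian')
embeddings = model.encode(sentences)
print(embeddings.shape)  # expected: (2, embedding_dim)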
Example #8
def transformerModel(data, path_to_model, seq_length):

    word_embedding_model = models.Transformer(path_to_model,
                                              max_seq_length=seq_length)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    embeddings = model.encode(data)

    return embeddings
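Example call with placeholder inputs; path_to_model can be any Hugging Face checkpoint.

sentences = ["A first example sentence.", "A second example sentence."]
embeddings = transformerModel(sentences,
                              path_to_model='distilbert-base-uncased',
                              seq_length=128)
print(embeddings.shape)  # expected: (2, 768) for distilbert-base-uncased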
Example #9
def get_model(path):
    word_embedding_model = models.Transformer(path)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    return SentenceTransformer(modules=[word_embedding_model, pooling_model])
Example #10
    def __init__(self, bert_path):
        word_embedding_model = sent_models.Transformer(bert_path)
        pooling_model = sent_models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        self.model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        self.model.to(DEFAULT_DEVICE)
        self.model.eval()
Example #11
def run():
    train_file = config.TRAINING_FILE
    train_batch = config.TRAIN_BATCH_SIZE
    vaild_batch = config.VALID_BATCH_SIZE
    model_path = config.BERT_PATH
    max_length = config.MAX_LEN
    dfs = pd.read_csv(train_file,
                      sep="\t",
                      names=['idx', 'sent1', 'sent2', 'label'])
    dfs['label'] = pd.to_numeric(dfs["label"], downcast='float')
    df_train, df_valid = model_selection.train_test_split(
        dfs,
        test_size=0.1,
        random_state=42,
        stratify=dfs.label.values,
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    dataset_reader = dataset.Dataset()

    train_dataset = dataset_reader.read(df_train, return_pt=True)
    valid_sentence1, valid_sentence2, valid_labels = dataset_reader.read(
        df_valid)

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch)
    # evaluator = evaluation.EmbeddingSimilarityEvaluator(valid_sentence1, valid_sentence2, valid_labels)
    evaluator = evaluation.BinaryClassificationEvaluator(
        valid_sentence1,
        valid_sentence2,
        valid_labels,
        batch_size=vaild_batch,
        show_progress_bar=False)

    word_embedding_model = models.Transformer(model_path,
                                              max_seq_length=max_length)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=max_length,
        activation_function=nn.Tanh())

    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    train_loss = losses.CosineSimilarityLoss(model)

    engine.train(train_dataloader, model, train_loss, evaluator)
Example #12
def build_model():

  # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
  word_embedding_model = models.Transformer(model_name)

  # Apply mean pooling to get one fixed sized sentence vector
  pooling_model = models.Pooling(
      word_embedding_model.get_word_embedding_dimension(),
      pooling_mode_mean_tokens=True,
      pooling_mode_cls_token=False,
      pooling_mode_max_tokens=False)
  model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
  return model
Example #13
def define_bert_encoder():
    word_embedding_model = models.Transformer('bert-base-uncased',
                                              max_seq_length=200)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=200,
        activation_function=nn.Tanh())

    bert_model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])
    return bert_model
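Quick sanity check of the encoder; the Dense layer projects the pooled vector down to 200 dimensions.

bert_encoder = define_bert_encoder()
vec = bert_encoder.encode("a short probe sentence")
print(vec.shape)  # expected: (200,)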
Example #14
    def __init__(self, json_path, max_n_sent, method, wgts='distil'):

        with open(json_path) as fin:
            dat = json.load(fin)
        # if group:
        #     dat = info_df[info_df['partition']==group]
        # self.info_df = info_df.reset_index(drop=True)

        self.dat = dat
        self.max_n_sent = max_n_sent
        self.method = method

        if wgts == 'distil':
            model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
        if wgts == 'marco':
            model = SentenceTransformer('msmarco-distilbert-base-v2')

        if wgts == 'base':
            word_embedding_model = models.Transformer('bert-base-uncased',
                                                      max_seq_length=512)
        if wgts == 'bio':
            word_embedding_model = models.Transformer('dmis-lab/biobert-v1.1',
                                                      max_seq_length=512)
        if wgts == 'abs':
            word_embedding_model = models.Transformer(
                'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
                max_seq_length=512)
        if wgts == 'full':
            word_embedding_model = models.Transformer(
                'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext',
                max_seq_length=512)

        if wgts in ['base', 'bio', 'abs', 'full']:
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension())
            model = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])

        self.model = model
Example #15
    def test_generalized_pooling(self):
        transformer = models.Transformer('prajjwal1/bert-tiny')
        model = SentenceTransformer(modules=[
            transformer,
            models.GeneralizedPooling(
                transformer.get_word_embedding_dimension())
        ])

        emb = model.encode("Hello Word, a test sentence")
        assert emb.shape == (transformer.get_word_embedding_dimension(), )

        # Single sentence as list
        emb = model.encode(["Hello Word, a test sentence"])
        assert emb.shape == (1, transformer.get_word_embedding_dimension())
Example #16
    def run(
        self,
        training_data,
        evaluator,
        output_path,
        from_scratch=False,
        loss=SentenceTransformerLoss.cosine_similarity_loss,
        model_name_or_path="roberta-large-nli-stsb-mean-tokens",
        cuda=True,
        **kwargs,
    ):
        logger.info(
            f"Running Sentence Transformer Task: {model_name_or_path}, Output path: {output_path}"
        )
        if from_scratch:
            logger.info("Training from scratch")
            word_embedding_model = models.Transformer(
                model_name_or_path,
                max_seq_length=kwargs.get("max_seq_length", 128))
            # wrap the raw checkpoint with mean pooling so `model` is defined in this branch
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension())
            model = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])
        else:
            model = SentenceTransformer(model_name_or_path)
        if cuda:
            logger.info("Running model on GPU")
            model.cuda()

        train_examples = [
            InputExample(texts=[data["sentence1"], data["sentence2"]],
                         label=data["label"])
            for data in training_data.values()
        ]
        train_dataset = SentencesDataset(train_examples, model)
        train_dataloader = DataLoader(
            train_dataset,
            shuffle=kwargs.get("shuffle", True),
            batch_size=kwargs.get("batch_size", 4),
        )
        warmup_steps = math.ceil(
            len(train_examples) * kwargs.get("num_epochs", 3) /
            kwargs.get("batch_size", 4) *
            0.1)  # 10% of train data for warm-up
        train_loss = loss.value(model)
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=kwargs.get("num_epochs", 3),
            evaluation_steps=kwargs.get("evaluation_steps", 500),
            warmup_steps=warmup_steps,
            output_path=output_path,
            evaluator=evaluator,
        )
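The expected shape of training_data is only implied by the loop above; a minimal sketch, with float labels matching the default cosine-similarity loss:

training_data = {
    0: {"sentence1": "A man is playing a guitar.",
        "sentence2": "Someone plays an instrument.",
        "label": 0.9},
    1: {"sentence1": "A man is playing a guitar.",
        "sentence2": "A chef is cooking pasta.",
        "label": 0.1},
}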
Example #17
    def get_sentence_transformer(name):
        try:
            model = SentenceTransformer(name)
        except Exception:
            transformer_model = models.Transformer(name)

            # Apply mean pooling to get one fixed sized sentence vector
            pooling_model = models.Pooling(transformer_model.get_word_embedding_dimension(),
                                           pooling_mode_mean_tokens=True,
                                           pooling_mode_cls_token=True,
                                           pooling_mode_max_tokens=False)

            model = SentenceTransformer(modules=[transformer_model, pooling_model])

        return model
Example #18
    def __init__(self):
        word_embedding_model = models.Transformer(
            'sentence-transformers/bert-large-nli-max-tokens',
            max_seq_length=256)
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        dense_model = models.Dense(
            in_features=pooling_model.get_sentence_embedding_dimension(),
            out_features=256,
            activation_function=nn.Tanh())
        self.model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model, dense_model])
        path = 'multinli_1.0/'
        self.MNLI_train_path = path + 'multinli_1.0_train.txt'
        self.MNLI_matched_test_path = path + 'multinli_1.0_dev_matched.txt'
        self.MNLI_mismatched_test_path = path + 'multinli_1.0_dev_mismatched.txt'
Example #19
def build_sbert_model(model_name: str, logistic_model: bool = True):
    """Build SBERT model, based on model name provided.

    :param model_name: model to be used, currently supported: covidbert or biobert
    :type model_name: str
    :param logistic_model: use logistic regression as classifier
    :type logistic_model: bool
    :return: SBERT model and corresponding tokenizer
    """
    if model_name == "covidbert":
        model_name = "deepset/covid_bert_base"
        covid_bert_path = "covid_bert_path"
        model_save_path = covid_bert_path
        os.makedirs(model_save_path, exist_ok=True)
        wget.download(
            "https://cdn.huggingface.co/deepset/covid_bert_base/vocab.txt",
            out=f"{model_save_path}/")  # download the vocab file

    else:
        model_name = "allenai/biomed_roberta_base"
        model_save_path = "biobert_path"
        os.makedirs(model_save_path, exist_ok=True)
        wget.download(
            "https://cdn.huggingface.co/allenai/biomed_roberta_base/merges.txt",
            out=f"{model_save_path}/")
        wget.download(
            "https://cdn.huggingface.co/allenai/biomed_roberta_base/vocab.json",
            out=f"{model_save_path}/")  # download the vocab file

    bert_model = AutoModel.from_pretrained(model_name)
    bert_model.save_pretrained(model_save_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    del bert_model

    word_embedding_model = models.Transformer(model_save_path)
    shutil.rmtree(model_save_path)
    pooling_model = models.Pooling(768,
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    # generating biobert sentence embeddings (mean pooling of sentence
    # embedding vectors)
    sbert_model = SBERTPredictor(word_embedding_model,
                                 pooling_model,
                                 logistic_model=logistic_model)
    return sbert_model, tokenizer
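A hedged usage sketch; SBERTPredictor is project-specific, so only construction and tokenization are shown, and both branches download weights over the network.

sbert_model, tokenizer = build_sbert_model("covidbert", logistic_model=True)
encoded = tokenizer("The spike protein binds the ACE2 receptor.", return_tensors="pt")
print(encoded["input_ids"].shape)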
Example #20
def run_binary_model(train_pairs, val_cluster_data, test_cluster_data, output_path, train_batch_size, eval_steps, num_epochs, warmup_frac,
                       use_model_device, model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_pairs')
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))
    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])

    train_dataloader = DataLoader(train_pairs, shuffle=True, batch_size=train_batch_size)
    train_loss = BinaryLoss(model=model)

    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
Example #21
def load_model_from_s3():
    try:
        # get object from s3
        #   obj = s3.get_object(Bucket=S3_BUCKET, Key=MODEL_PATH)
        # unzip it
        #   tar = tarfile.open(fileobj=bytestream, mode="r:gz")
        word_embedding_model = models.Transformer(
            'T-Systems-onsite/bert-german-dbmdz-uncased-sentence-stsb',
            max_seq_length=512)

        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)

        # join BERT model and pooling to get the sentence transformer
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        return model
    except Exception as e:
        raise(e)
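Usage sketch; despite the name, the body above currently builds the encoder directly from the public German checkpoint, so it can embed German sentences right away.

model = load_model_from_s3()
embeddings = model.encode(["Der erste Beispielsatz.", "Ein zweiter Beispielsatz."])
print(embeddings.shape)  # expected: (2, 768)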
Example #22
    def initialize_model(self):
        # Read the dataset
        # Use BERT for mapping tokens to embeddings
        word_embedding_model = models.Transformer(self.base_model,
                                                  max_seq_length=128)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        self.model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        self.train_loss_nli = losses.SoftmaxLoss(
            model=self.model,
            sentence_embedding_dimension=self.model.get_sentence_embedding_dimension(),
            num_labels=len(self.label2int))
Example #23
    def test_roberta_wkpooling(self):
        word_embedding_model = models.Transformer(
            'roberta-base', model_args={'output_hidden_states': True})
        pooling_model = models.WKPooling(
            word_embedding_model.get_word_embedding_dimension())
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        scores = [
            0.9594874382019043, 0.9928674697875977, 0.9241214990615845,
            0.9309519529342651, 0.9506515264511108
        ]

        for sentences, score in zip(WKPoolingTest.sentence_pairs, scores):
            embedding = model.encode(sentences, convert_to_numpy=True)

            similarity = 1 - scipy.spatial.distance.cosine(
                embedding[0], embedding[1])
            assert abs(similarity - score) < 0.01
Example #24
    def test_train_stsb(self):
        word_embedding_model = models.Transformer('distilbert-base-uncased')
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        train_dataset = SentencesDataset(self.stsb_train_samples, model)
        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=16)
        train_loss = losses.CosineSimilarityLoss(model=model)
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=None,
                  epochs=1,
                  evaluation_steps=1000,
                  warmup_steps=int(len(train_dataloader) * 0.1),
                  use_amp=True)

        self.evaluate_stsb_test(model, 80.0)
Example #25
def train(conf: "TrainConfig"):
    logger = logging.getLogger(__name__)
    logger.info("Initialize model")
    transformer = models.Transformer(conf.transformer_model)

    pooling = models.Pooling(
        transformer.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
    )

    model = SentenceTransformer(modules=[transformer, pooling])
    model.tokenizer = AutoTokenizer.from_pretrained(conf.transformer_model)
    logger.info(f"model: {type(model)}")
    logger.info(f"tokenizer: {type(model.tokenizer)}")
    encode_result = model.tokenizer(["日本語のトークナイゼーションの問題"], return_tensors='pt', padding=True)
    logger.info(model.tokenizer.convert_ids_to_tokens(encode_result.input_ids.flatten().tolist()))

    logger.info("Read training data")
    triplet_reader = TripletReader(str(conf.train_triplets_tsv.parent))
    train_data = SentencesDataset(
        triplet_reader.get_examples(conf.train_triplets_tsv.name), model=model
    )
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=conf.batch_size)
    train_loss = TripletLoss(
        model=model, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin=1
    )

    evaluator = TripletEvaluator.from_input_examples(
        triplet_reader.get_examples(conf.dev_triplets_tsv.name), name="dev"
    )

    logger.info("Start training")
    warmup_steps = int(len(train_data) // conf.batch_size * 0.1)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=conf.epochs,
        evaluation_steps=conf.eval_steps,
        warmup_steps=warmup_steps,
        output_path=str(conf.model_dir),
    )
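The shape of TrainConfig is not shown here; a plausible dataclass, inferred only from the attributes the function reads, might look like this (the example model name is an assumption).

from dataclasses import dataclass
from pathlib import Path

@dataclass
class TrainConfig:
    transformer_model: str    # e.g. 'cl-tohoku/bert-base-japanese-whole-word-masking'
    train_triplets_tsv: Path  # TSV of (anchor, positive, negative) triplets
    dev_triplets_tsv: Path
    model_dir: Path           # output directory for the trained model
    batch_size: int = 16
    epochs: int = 1
    eval_steps: int = 1000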
Example #26
def train():
    # We construct the SentenceTransformer bi-encoder from scratch
    word_embedding_model = models.Transformer(model_name, max_seq_length=350)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    model_save_path = 'output/training_ms-marco_bi-encoder-' + model_name.replace(
        "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Read our training file. qidpidtriples consists of triplets (qid, positive_pid, negative_pid)
    train_filepath = os.path.join(
        data_folder, 'msmarco-qidpidtriples.rnd-shuf.train-eval.tsv')

    # Create the evaluator that is called during training
    queries = read_queries()
    corpus = read_corpus()
    dev_queries, dev_corpus, dev_rel_docs = prepare_data_for_evaluation(
        queries, corpus)
    ir_evaluator = evaluation.InformationRetrievalEvaluator(
        dev_queries, dev_corpus, dev_rel_docs, name='ms-marco-train_eval')

    # For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.
    train_dataset = TripletsDataset(model=model,
                                    queries=queries,
                                    corpus=corpus,
                                    triplets_file=train_filepath)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=False,
                                  batch_size=train_batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(model=model)

    # print(next(iter(train_dataloader)))
    # return

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=ir_evaluator,
              epochs=1,
              warmup_steps=1000,
              output_path=model_save_path,
              evaluation_steps=5000,
              use_amp=True)
Example #27
def build_model(num_labels):
    model_name = 'bert-base-uncased'

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = new_softmax_loss.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=num_labels,
        num_vectors=3)
    return model, train_loss
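Hypothetical call; new_softmax_loss is project-specific, so only the construction is shown.

model, train_loss = build_model(num_labels=3)
print(model.get_sentence_embedding_dimension())  # 768 for bert-base-uncased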
Example #28
def do_test(pt_file, model_name, n):
    text = []
    i = 0
    with open(pt_file, 'r', encoding='utf8') as f:
        for l in f:
            text.append(l.split('\t')[1])
            i += 1
            if i >= n:
                break
    psg_word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    psg_pooling_model = models.Pooling(
        psg_word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    psg_dense_model = models.Dense(
        in_features=psg_pooling_model.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())
    psg_model = CustomSentenceTransformer(
        modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])
    if torch.cuda.is_available():
        psg_model.to(torch.device('cuda'))
    psg_features = []
    print('Tokenizing')
    for p in text:
        psg_tkn = psg_model.tokenize(p)
        if torch.cuda.is_available():
            batch_to_device(psg_tkn, torch.device('cuda'))
        psg_features.append(psg_tkn)
    psg_embs = []
    print('Embedding')
    for pfet in psg_features:
        psg_emb = psg_model(pfet)['sentence_embedding']
        psg_emb = psg_emb.to(torch.device('cpu'))  # .to() is not in-place; keep the CPU copy
        psg_embs.append(psg_emb)
    print(psg_embs[:10])
Example #29
    def prepare(self, texts):

        if self.model is None:
            # if "/" not in self.model_path:
            from sentence_transformers import SentenceTransformer, models
            try:
                self.model = SentenceTransformer(self.model_path)
            # else:
            # catch Exception:
            except Exception as e:
                word_embedding_model = models.Transformer(self.model_path)
                pooling_model = models.Pooling(
                    word_embedding_model.get_word_embedding_dimension(),
                    pooling_mode_mean_tokens=True,
                    pooling_mode_cls_token=False,
                    pooling_mode_max_tokens=False)
                self.model = SentenceTransformer(
                    modules=[word_embedding_model, pooling_model])

        texts_preprocessed = self.preprocess_all(texts)
        vecs = self.model.encode(texts_preprocessed)
        self.set_sen2vec(texts, vecs)
Example #30
    def __init__(self, model_name, args=None, embedding_learning=None):
        """
        Initializes a STClassificationModel
        :param model_name:
        :param args:
        """
        self.args = SPArgs()

        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, SPArgs):
            self.args = args

        if embedding_learning is not None and embedding_learning == 'from-scratch':
            word_embedding_model = models.Transformer(
                model_name, max_seq_length=self.args.max_seq_length)
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension())

            self.model = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])
        else:
            self.model = SentenceTransformer(model_name)