def test_multiclass(self):
    transformer = models.Transformer('prajjwal1/bert-tiny')
    model = SentenceTransformer(modules=[
        transformer,
        models.Pooling(transformer.get_word_embedding_dimension())
    ])
    softmax_loss = losses.SoftmaxLoss(
        model, transformer.get_word_embedding_dimension(), num_labels=3)
    samples = [
        InputExample(texts=[
            "Hello Word, a first test sentence",
            "Hello Word, a other test sentence"
        ], label=0),
        InputExample(texts=[
            "Hello Word, a second test sentence",
            "Hello Word, a other test sentence"
        ], label=1),
        InputExample(texts=[
            "Hello Word, a third test sentence",
            "Hello Word, a other test sentence"
        ], label=2)
    ]
    dataloader = DataLoader(samples, batch_size=1)
    evaluator = MulticlassEvaluator(dataloader, softmax_model=softmax_loss)
    result = evaluator(model)
    i = 0
def train(hp):
    """Train the advanced blocking model

    Store the trained model in hp.model_fn.

    Args:
        hp (Namespace): the hyperparameters

    Returns:
        None
    """
    # define model
    model_names = {'distilbert': 'distilbert-base-uncased',
                   'bert': 'bert-base-uncased',
                   'albert': 'albert-base-v2'}
    word_embedding_model = models.Transformer(model_names[hp.lm])
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # load the training and validation data
    reader = Reader()
    trainset = SentencesDataset(examples=reader.get_examples(hp.train_fn),
                                model=model)
    train_dataloader = DataLoader(trainset,
                                  shuffle=True,
                                  batch_size=hp.batch_size)
    train_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=2)

    dev_data = SentencesDataset(examples=reader.get_examples(hp.valid_fn),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=hp.batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # 10% of the training steps for warm-up; train_dataloader is already
    # batched, so its length must not be divided by the batch size again
    warmup_steps = math.ceil(len(train_dataloader) * hp.n_epochs * 0.1)

    if os.path.exists(hp.model_fn):
        import shutil
        shutil.rmtree(hp.model_fn)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=hp.n_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=hp.model_fn,
              fp16=hp.fp16,
              fp16_opt_level='O2')
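# Hedged usage sketch (not part of the original source): train(hp) only reads
# the attributes referenced above (lm, train_fn, valid_fn, model_fn, batch_size,
# n_epochs, fp16), so a plain argparse.Namespace is enough to drive it. The
# paths and values below are illustrative placeholders.
from argparse import Namespace

example_hp = Namespace(
    lm='distilbert',                   # key into model_names: distilbert / bert / albert
    train_fn='data/train.tsv',         # placeholder training file
    valid_fn='data/valid.tsv',         # placeholder validation file
    model_fn='output/blocking_model',  # output directory for the trained model
    batch_size=32,
    n_epochs=3,
    fp16=False,
)
# train(example_hp)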
def __init__(self, model_name, args=None, embedding_learning=None, threshold=None):
    """
    Initializes a STClassificationModel

    :param model_name: name or path of the pretrained model to load
    :param args: model arguments, either a dict or an SPArgs instance
    """
    self.args = SPArgs()
    if isinstance(args, dict):
        self.args.update_from_dict(args)
    elif isinstance(args, SPArgs):
        self.args = args

    if embedding_learning is not None and embedding_learning == 'from-scratch':
        word_embedding_model = models.Transformer(
            model_name, max_seq_length=self.args.max_seq_length)
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        self.model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        self.model = SentenceTransformer(model_name)

    self.threshold = threshold
    # AveragePrecision with cosine similarity is used by
    # BinaryClassificationEvaluator to pick the best model
    self.score_type = "cosine_average_precision"
    self.threshold_type = "cosine_f1_threshold"
def fit_model(df, method):
    """
    Fit the chosen model.

    params:
        df: DataFrame used
        method: model chosen
    returns:
        generated model, transformed data
    """
    if method == "TF-IDF":
        model = TfidfVectorizer(analyzer='word',
                                ngram_range=(1, 2),
                                min_df=0,
                                stop_words=STOPS)
        X = model.fit_transform(df['content'])
    elif method == "CountVectorizer":
        model = CountVectorizer(analyzer='word',
                                ngram_range=(1, 2),
                                min_df=0,
                                stop_words=STOPS)
        X = model.fit_transform(df['content'])
    elif method == "BERT":
        word_embedding_model = models.Transformer('camembert-base')
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        X = model.encode(df['content'], show_progress_bar=True)
    else:
        raise ValueError(f"Unknown method: {method}")
    return model, X
def getSentenceVector(doc,
                      model_params: dict = {},
                      encoder="bert",
                      model_name='bert-base-cased'):
    sp = spacy.load('en_core_web_sm')
    tokenized = sp(doc)
    sentences = []
    for token in tokenized.sents:
        sentences.append(token.text)

    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert',
                   'roberta', 'bart', 'finbert']:
        # Use encoder for mapping tokens to embeddings
        word_embedding_model = models.Transformer(
            model_name,
            tokenizer_args=model_params.get('tokenizer_args', {}))
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        sentence_embeddings = model.encode(sentences)
    elif encoder == 'use':
        # !pip install embedding-as-service
        from embedding_as_service.text.encode import Encoder
        en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
        sentence_embeddings = en.encode(texts=sentences)
    elif encoder == 'infersent':
        import nltk
        nltk.download('punkt')
        from models import InferSent
        params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                        'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
        infersent = InferSent(params_model)
        W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
        infersent.set_w2v_path(W2V_PATH)
        infersent.build_vocab(sentences, tokenize=True)
        sentence_embeddings = infersent.encode(sentences, tokenize=True)
    elif encoder == 'sent2vec':
        import sent2vec
        model = sent2vec.Sent2vecModel()
        model.load_model('drive/My Drive/torontobooks_unigram.bin')
        sentence_embeddings = model.embed_sentences(sentences)
    elif encoder == 'laser':
        from laserembeddings import Laser
        laser = Laser()  # also usable for multilingual sentence embeddings
        sentence_embeddings = laser.embed_sentences(sentences, lang='en')
    else:
        raise ValueError('Invalid encoder {} or encoder unavailable.'.format(encoder))

    return list(zip(sentences, sentence_embeddings))
def run_fixed_lambda_bbcluster(train_cluster_data, val_cluster_data, test_cluster_data, output_path,
                               train_batch_size, eval_steps, num_epochs, warmup_frac, lambda_val, reg,
                               beta, loss_name, use_model_device,
                               model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_fixed_lambda')
    config_dict = {'lambda_val': lambda_val, 'reg': reg}
    config_dict = task.connect(config_dict)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    GPUtil.showUtilization()

    if loss_name == 'bbspec':
        loss_model = BBSpectralClusterLossModel(model=model, device=device,
                                                lambda_val=config_dict.get('lambda_val', lambda_val),
                                                reg_const=config_dict.get('reg', reg), beta=beta)
    else:
        loss_model = BBClusterLossModel(model=model, device=device,
                                        lambda_val=config_dict.get('lambda_val', lambda_val),
                                        reg_const=config_dict.get('reg', reg))
    # reg_loss_model = ClusterDistLossModel(model=model)

    train_dataloader = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    GPUtil.showUtilization()
    # train_dataloader2 = DataLoader(train_cluster_data, shuffle=True, batch_size=train_batch_size)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)
    GPUtil.showUtilization()

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)
    GPUtil.showUtilization()

    # Train the model
    model.fit(train_objectives=[(train_dataloader, loss_model)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
def get_model():
    # Google-Drive link: https://drive.google.com/drive/folders/1sUxvLCTJHOkPeB4thHO-RW8WI3DWLHos?usp=sharing
    PATH = "DeepPavlov/rubert-base-cased"
    word_embedding_model = models.Transformer(PATH)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    nltk.download('punkt')
    return model
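# Hedged usage sketch (assumption, not in the original code): the returned
# SentenceTransformer encodes sentences directly into fixed-size vectors.
# Russian input is the typical use case for this rubert checkpoint.
sbert = get_model()
example_vectors = sbert.encode(["an example sentence", "another example sentence"])
print(example_vectors.shape)  # (2, 768) for DeepPavlov/rubert-base-cased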
def transformerModel(data, path_to_model, seq_length):
    word_embedding_model = models.Transformer(path_to_model, max_seq_length=seq_length)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    embeddings = model.encode(data)
    return embeddings
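# Hedged usage sketch (illustrative only): any Hugging Face checkpoint can be
# passed as path_to_model; the model name and sequence length here are examples.
example_embeddings = transformerModel(
    data=['first example sentence', 'second example sentence'],
    path_to_model='bert-base-uncased',
    seq_length=128,
)
print(example_embeddings.shape)  # (2, 768) for bert-base-uncased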
def get_model(path):
    word_embedding_model = models.Transformer(path)
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])
def __init__(self, bert_path):
    word_embedding_model = sent_models.Transformer(bert_path)
    pooling_model = sent_models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    self.model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model])
    self.model.to(DEFAULT_DEVICE)
    self.model.eval()
def run():
    train_file = config.TRAINING_FILE
    train_batch = config.TRAIN_BATCH_SIZE
    valid_batch = config.VALID_BATCH_SIZE
    model_path = config.BERT_PATH
    max_length = config.MAX_LEN

    dfs = pd.read_csv(train_file, sep="\t",
                      names=['idx', 'sent1', 'sent2', 'label'])
    dfs['label'] = pd.to_numeric(dfs["label"], downcast='float')

    df_train, df_valid = model_selection.train_test_split(
        dfs,
        test_size=0.1,
        random_state=42,
        stratify=dfs.label.values,
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    dataset_reader = dataset.Dataset()
    train_dataset = dataset_reader.read(df_train, return_pt=True)
    valid_sentence1, valid_sentence2, valid_labels = dataset_reader.read(df_valid)

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch)
    # evaluator = evaluation.EmbeddingSimilarityEvaluator(valid_sentence1, valid_sentence2, valid_labels)
    evaluator = evaluation.BinaryClassificationEvaluator(
        valid_sentence1,
        valid_sentence2,
        valid_labels,
        batch_size=valid_batch,
        show_progress_bar=False)

    word_embedding_model = models.Transformer(model_path,
                                              max_seq_length=max_length)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=max_length,
        activation_function=nn.Tanh())
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    train_loss = losses.CosineSimilarityLoss(model)
    engine.train(train_dataloader, model, train_loss, evaluator)
def build_model():
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    return model
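# Hedged usage sketch (assumes the module-level `model_name` points at a valid
# Hugging Face checkpoint such as 'bert-base-uncased'): encode two sentences
# and compare them with cosine similarity.
from sentence_transformers import util

example_model = build_model()
emb = example_model.encode(['A man is eating food.', 'A man is eating a meal.'],
                           convert_to_tensor=True)
print(util.pytorch_cos_sim(emb[0], emb[1]))  # close to 1 for near-paraphrases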
def define_bert_encoder():
    word_embedding_model = models.Transformer('bert-base-uncased',
                                              max_seq_length=200)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=200,
        activation_function=nn.Tanh())
    bert_model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])
    return bert_model
def __init__(self, json_path, max_n_sent, method, wgts='distil'):
    with open(json_path) as fin:
        dat = json.load(fin)
    # if group:
    #     dat = info_df[info_df['partition']==group]
    # self.info_df = info_df.reset_index(drop=True)
    self.dat = dat
    self.max_n_sent = max_n_sent
    self.method = method

    if wgts == 'distil':
        model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
    if wgts == 'marco':
        model = SentenceTransformer('msmarco-distilbert-base-v2')
    if wgts == 'base':
        word_embedding_model = models.Transformer('bert-base-uncased',
                                                  max_seq_length=512)
    if wgts == 'bio':
        word_embedding_model = models.Transformer('dmis-lab/biobert-v1.1',
                                                  max_seq_length=512)
    if wgts == 'abs':
        word_embedding_model = models.Transformer(
            'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
            max_seq_length=512)
    if wgts == 'full':
        word_embedding_model = models.Transformer(
            'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext',
            max_seq_length=512)
    if wgts in ['base', 'bio', 'abs', 'full']:
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    self.model = model
def test_generalized_pooling(self):
    transformer = models.Transformer('prajjwal1/bert-tiny')
    model = SentenceTransformer(modules=[
        transformer,
        models.GeneralizedPooling(transformer.get_word_embedding_dimension())
    ])
    emb = model.encode("Hello Word, a test sentence")
    assert emb.shape == (transformer.get_word_embedding_dimension(), )

    # Single sentence as list
    emb = model.encode(["Hello Word, a test sentence"])
    assert emb.shape == (1, transformer.get_word_embedding_dimension())
def run(
    self,
    training_data,
    evaluator,
    output_path,
    from_scratch=False,
    loss=SentenceTransformerLoss.cosine_similarity_loss,
    model_name_or_path="roberta-large-nli-stsb-mean-tokens",
    cuda=True,
    **kwargs,
):
    logger.info(
        f"Running Sentence Transformer Task: {model_name_or_path}, Output path: {output_path}"
    )
    if from_scratch:
        logger.info("Training from scratch")
        # build a Transformer + mean-pooling SentenceTransformer when training
        # from scratch (the original snippet discarded the Transformer and left
        # `model` undefined in this branch)
        word_embedding_model = models.Transformer(
            model_name_or_path,
            max_seq_length=kwargs.get("max_seq_length", 128))
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        model = SentenceTransformer(model_name_or_path)
    if cuda:
        logger.info("Running model on GPU")
        model.cuda()

    train_examples = [
        InputExample(texts=[data["sentence1"], data["sentence2"]],
                     label=data["label"])
        for data in training_data.values()
    ]
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=kwargs.get("shuffle", True),
        batch_size=kwargs.get("batch_size", 4),
    )
    warmup_steps = math.ceil(
        len(train_examples) * kwargs.get("num_epochs", 3) /
        kwargs.get("train_batch_size", 4) * 0.1)  # 10% of train data for warm-up
    train_loss = loss.value(model)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=kwargs.get("num_epochs", 3),
        evaluation_steps=kwargs.get("evaluation_steps", 500),
        warmup_steps=warmup_steps,
        output_path=output_path,
        evaluator=evaluator,
    )
def get_sentence_transformer(name):
    try:
        model = SentenceTransformer(name)
    except Exception:
        transformer_model = models.Transformer(name)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(transformer_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=True,
                                       pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[transformer_model, pooling_model])
    return model
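# Hedged usage note (illustrative): for names that are not packaged
# SentenceTransformer models the fallback branch may be taken (depending on the
# installed library version), and it concatenates mean and CLS pooling, so the
# embedding dimension becomes twice the transformer's hidden size.
st_model = get_sentence_transformer('bert-base-uncased')
print(st_model.get_sentence_embedding_dimension())  # 1536 if the fallback with both pooling modes was used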
def __init__(self):
    word_embedding_model = models.Transformer(
        'sentence-transformers/bert-large-nli-max-tokens', max_seq_length=256)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())
    self.model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    path = 'multinli_1.0/'
    self.MNLI_train_path = path + 'multinli_1.0_train.txt'
    self.MNLI_matched_test_path = path + 'multinli_1.0_dev_matched.txt'
    self.MNLI_mismatched_test_path = path + 'multinli_1.0_dev_mismatched.txt'
def build_sbert_model(model_name: str, logistic_model: bool = True):
    """Build SBERT model, based on the model name provided.

    :param model_name: model to be used, currently supported: covidbert or biobert
    :type model_name: str
    :param logistic_model: use logistic regression as classifier
    :type logistic_model: bool
    :return: SBERT model and corresponding tokenizer
    """
    if model_name == "covidbert":
        model_name = "deepset/covid_bert_base"
        covid_bert_path = "covid_bert_path"
        model_save_path = covid_bert_path
        os.makedirs(model_save_path, exist_ok=True)
        # download the vocab file
        wget.download(
            "https://cdn.huggingface.co/deepset/covid_bert_base/vocab.txt",
            out=f"{model_save_path}/")
    else:
        model_name = "allenai/biomed_roberta_base"
        model_save_path = "biobert_path"
        os.makedirs(model_save_path, exist_ok=True)
        wget.download(
            "https://cdn.huggingface.co/allenai/biomed_roberta_base/merges.txt",
            out=f"{model_save_path}/")
        # download the vocab file
        wget.download(
            "https://cdn.huggingface.co/allenai/biomed_roberta_base/vocab.json",
            out=f"{model_save_path}/")

    bert_model = AutoModel.from_pretrained(model_name)
    bert_model.save_pretrained(model_save_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    del bert_model

    word_embedding_model = models.Transformer(model_save_path)
    shutil.rmtree(model_save_path)

    # sentence embeddings are obtained by mean pooling the token embeddings
    pooling_model = models.Pooling(768,
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    sbert_model = SBERTPredictor(word_embedding_model,
                                 pooling_model,
                                 logistic_model=logistic_model)
    return sbert_model, tokenizer
def run_binary_model(train_pairs, val_cluster_data, test_cluster_data, output_path, train_batch_size,
                     eval_steps, num_epochs, warmup_frac, use_model_device,
                     model_name='distilbert-base-uncased', out_features=256):
    task = Task.init(project_name='BB Clustering', task_name='bbclustering_pairs')
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('CUDA is available and using device: ' + str(device))
    else:
        device = torch.device('cpu')
        print('CUDA not available, using device: ' + str(device))

    ### Configure sentence transformers for training and train on the provided dataset
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    doc_dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                                   out_features=out_features,
                                   activation_function=nn.Tanh())

    model = CustomSentenceTransformer(modules=[word_embedding_model, pooling_model, doc_dense_model])

    train_dataloader = DataLoader(train_pairs, shuffle=True, batch_size=train_batch_size)
    train_loss = BinaryLoss(model=model)
    evaluator = ClusterEvaluator.from_input_examples(val_cluster_data, use_model_device)
    test_evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, use_model_device)

    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_frac)  # 10% of train data

    print("Raw BERT embedding performance")
    model.to(device)
    evaluator(model, output_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              test_evaluator=test_evaluator,
              epochs=num_epochs,
              evaluation_steps=eval_steps,
              warmup_steps=warmup_steps,
              output_path=output_path)
def load_model_from_s3():
    try:
        # get object from s3
        # obj = s3.get_object(Bucket=S3_BUCKET, Key=MODEL_PATH)
        # unzip it
        # tar = tarfile.open(fileobj=bytestream, mode="r:gz")
        word_embedding_model = models.Transformer(
            'T-Systems-onsite/bert-german-dbmdz-uncased-sentence-stsb',
            max_seq_length=512)
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        # join BERT model and pooling to get the sentence transformer
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        return model
    except Exception as e:
        raise e
def initialize_model(self):
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.Transformer(self.base_model,
                                              max_seq_length=128)
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    self.model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model])
    self.train_loss_nli = losses.SoftmaxLoss(
        model=self.model,
        sentence_embedding_dimension=self.model.get_sentence_embedding_dimension(),
        num_labels=len(self.label2int))
def test_roberta_wkpooling(self):
    word_embedding_model = models.Transformer(
        'roberta-base', model_args={'output_hidden_states': True})
    pooling_model = models.WKPooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model])
    scores = [
        0.9594874382019043, 0.9928674697875977, 0.9241214990615845,
        0.9309519529342651, 0.9506515264511108
    ]

    for sentences, score in zip(WKPoolingTest.sentence_pairs, scores):
        embedding = model.encode(sentences, convert_to_numpy=True)
        similarity = 1 - scipy.spatial.distance.cosine(embedding[0], embedding[1])
        assert abs(similarity - score) < 0.01
def test_train_stsb(self):
    word_embedding_model = models.Transformer('distilbert-base-uncased')
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model])
    train_dataset = SentencesDataset(self.stsb_train_samples, model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=16)
    train_loss = losses.CosineSimilarityLoss(model=model)
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=None,
              epochs=1,
              evaluation_steps=1000,
              warmup_steps=int(len(train_dataloader) * 0.1),
              use_amp=True)
    self.evaluate_stsb_test(model, 80.0)
def train(conf: "TrainConfig"):
    logger = logging.getLogger(__name__)

    logger.info("Initialize model")
    transformer = models.Transformer(conf.transformer_model)
    pooling = models.Pooling(
        transformer.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
    )
    model = SentenceTransformer(modules=[transformer, pooling])
    model.tokenizer = AutoTokenizer.from_pretrained(conf.transformer_model)
    logger.info(f"model: {type(model)}")
    logger.info(f"tokenizer: {type(model.tokenizer)}")
    encode_result = model.tokenizer(["日本語のトークナイゼーションの問題"],
                                    return_tensors='pt', padding=True)
    logger.info(model.tokenizer.convert_ids_to_tokens(encode_result.input_ids.flatten().tolist()))

    logger.info("Read training data")
    triplet_reader = TripletReader(str(conf.train_triplets_tsv.parent))
    train_data = SentencesDataset(
        triplet_reader.get_examples(conf.train_triplets_tsv.name), model=model
    )
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=conf.batch_size)
    train_loss = TripletLoss(
        model=model,
        distance_metric=TripletDistanceMetric.EUCLIDEAN,
        triplet_margin=1
    )
    evaluator = TripletEvaluator.from_input_examples(
        triplet_reader.get_examples(conf.dev_triplets_tsv.name), name="dev"
    )

    logger.info("Start training")
    warmup_steps = int(len(train_data) // conf.batch_size * 0.1)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=conf.epochs,
        evaluation_steps=conf.eval_steps,
        warmup_steps=warmup_steps,
        output_path=str(conf.model_dir),
    )
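# Hedged sketch of the TrainConfig this train() expects, reconstructed from the
# attributes used above; the field names follow that usage, while the default
# values and the Japanese checkpoint name are assumptions/placeholders.
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TrainConfig:
    transformer_model: str = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    train_triplets_tsv: Path = Path('data/train_triplets.tsv')
    dev_triplets_tsv: Path = Path('data/dev_triplets.tsv')
    batch_size: int = 16
    epochs: int = 1
    eval_steps: int = 1000
    model_dir: Path = Path('output/triplet-model')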
def train():
    # We construct the SentenceTransformer bi-encoder from scratch
    word_embedding_model = models.Transformer(model_name, max_seq_length=350)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    model_save_path = 'output/training_ms-marco_bi-encoder-' + model_name.replace(
        "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Read our training file. qidpidtriples consists of triplets (qid, positive_pid, negative_pid)
    train_filepath = os.path.join(
        data_folder, 'msmarco-qidpidtriples.rnd-shuf.train-eval.tsv')

    # Create the evaluator that is called during training
    queries = read_queries()
    corpus = read_corpus()
    dev_queries, dev_corpus, dev_rel_docs = prepare_data_for_evaluation(queries, corpus)
    ir_evaluator = evaluation.InformationRetrievalEvaluator(
        dev_queries, dev_corpus, dev_rel_docs, name='ms-marco-train_eval')

    # For training the SentenceTransformer model, we need a dataset, a dataloader, and a loss used for training.
    train_dataset = TripletsDataset(model=model,
                                    queries=queries,
                                    corpus=corpus,
                                    triplets_file=train_filepath)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=False,
                                  batch_size=train_batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(model=model)

    # print(next(iter(train_dataloader)))
    # return

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=ir_evaluator,
              epochs=1,
              warmup_steps=1000,
              output_path=model_save_path,
              evaluation_steps=5000,
              use_amp=True)
def build_model(num_labels):
    model_name = 'bert-base-uncased'
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = new_softmax_loss.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=num_labels,
        num_vectors=3)
    return model, train_loss
def do_test(pt_file, model_name, n):
    text = []
    i = 0
    with open(pt_file, 'r', encoding='utf8') as f:
        for l in f:
            text.append(l.split('\t')[1])
            i += 1
            if i >= n:
                break

    psg_word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    psg_pooling_model = models.Pooling(
        psg_word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    psg_dense_model = models.Dense(
        in_features=psg_pooling_model.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())
    psg_model = CustomSentenceTransformer(
        modules=[psg_word_embedding_model, psg_pooling_model, psg_dense_model])
    if torch.cuda.is_available():
        psg_model.to(torch.device('cuda'))

    psg_features = []
    print('Tokenizing')
    for p in text:
        psg_tkn = psg_model.tokenize(p)
        if torch.cuda.is_available():
            batch_to_device(psg_tkn, torch.device('cuda'))
        psg_features.append(psg_tkn)

    psg_embs = []
    print('Embedding')
    for pfet in psg_features:
        psg_emb = psg_model(pfet)['sentence_embedding']
        # Tensor.to() is not in-place; keep the returned CPU copy
        psg_emb = psg_emb.to(torch.device('cpu'))
        psg_embs.append(psg_emb)

    print(psg_embs[:10])
def prepare(self, texts):
    if self.model is None:
        # if "/" not in self.model_path:
        from sentence_transformers import SentenceTransformer, models
        try:
            self.model = SentenceTransformer(self.model_path)
        # else:
        except Exception:
            word_embedding_model = models.Transformer(self.model_path)
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension(),
                pooling_mode_mean_tokens=True,
                pooling_mode_cls_token=False,
                pooling_mode_max_tokens=False)
            self.model = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])

    texts_preprocessed = self.preprocess_all(texts)
    vecs = self.model.encode(texts_preprocessed)
    self.set_sen2vec(texts, vecs)
def __init__(self, model_name, args=None, embedding_learning=None):
    """
    Initializes a STClassificationModel

    :param model_name: name or path of the pretrained model to load
    :param args: model arguments, either a dict or an SPArgs instance
    """
    self.args = SPArgs()
    if isinstance(args, dict):
        self.args.update_from_dict(args)
    elif isinstance(args, SPArgs):
        self.args = args

    if embedding_learning is not None and embedding_learning == 'from-scratch':
        word_embedding_model = models.Transformer(
            model_name, max_seq_length=self.args.max_seq_length)
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        self.model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        self.model = SentenceTransformer(model_name)