def run(
    self,
    training_data,
    evaluator,
    output_path,
    from_scratch=False,
    loss=SentenceTransformerLoss.cosine_similarity_loss,
    model_name_or_path="roberta-large-nli-stsb-mean-tokens",
    cuda=True,
    **kwargs,
):
    logger.info(
        f"Running Sentence Transformer Task: {model_name_or_path}, Output path: {output_path}"
    )

    if from_scratch:
        logger.info("Training from scratch")
        # Build the SentenceTransformer from a raw transformer plus a pooling layer,
        # so that `model` is defined in this branch as well.
        word_embedding_model = models.Transformer(
            model_name_or_path,
            max_seq_length=kwargs.get("max_seq_length", 128))
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    else:
        model = SentenceTransformer(model_name_or_path)

    if cuda:
        logger.info("Running model on GPU")
        model.cuda()

    train_examples = [
        InputExample(texts=[data["sentence1"], data["sentence2"]],
                     label=data["label"])
        for data in training_data.values()
    ]
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=kwargs.get("shuffle", True),
        batch_size=kwargs.get("batch_size", 4),
    )
    # 10% of train data for warm-up (use the same batch size as the DataLoader)
    warmup_steps = math.ceil(
        len(train_examples) * kwargs.get("num_epochs", 3)
        / kwargs.get("batch_size", 4) * 0.1)
    train_loss = loss.value(model)

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=kwargs.get("num_epochs", 3),
        evaluation_steps=kwargs.get("evaluation_steps", 500),
        warmup_steps=warmup_steps,
        output_path=output_path,
        evaluator=evaluator,
    )
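A minimal sketch of how this task might be invoked, assuming the run() method above lives on a class named SentenceTransformerTask (hypothetical) and using sentence-transformers' EmbeddingSimilarityEvaluator; the pair data and output path are illustrative only.

from sentence_transformers import InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Hypothetical training pairs keyed by id; labels are similarity targets in [0, 1].
training_data = {
    "pair-0": {"sentence1": "A man is playing guitar.",
               "sentence2": "Someone plays an instrument.", "label": 0.8},
    "pair-1": {"sentence1": "A dog runs in the park.",
               "sentence2": "The stock market fell today.", "label": 0.1},
}

# Evaluator built from the same pair format (a held-out dev split in practice).
dev_examples = [
    InputExample(texts=[d["sentence1"], d["sentence2"]], label=d["label"])
    for d in training_data.values()
]
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_examples, name="dev")

task = SentenceTransformerTask()  # hypothetical owning class of run()
task.run(training_data, evaluator, output_path="output/st-finetuned",
         num_epochs=1, batch_size=2)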
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--do_test", action='store_true',
                        help="Generate embeddings for test splits (test set is usually large, so we don't want to repeatedly generate embeddings for them)")
    parser.add_argument("--sbert_model", type=str, default='roberta-large',
                        help="Sentence BERT model name")
    parser.add_argument("--k", type=int, default=16,
                        help="Number of training instances per label")
    parser.add_argument("--data_dir", type=str, default="data/k-shot",
                        help="Path to few-shot data")
    parser.add_argument("--seed", type=int, nargs="+",
                        default=[42, 13, 21, 87, 100],
                        help="Seeds for data splits")
    parser.add_argument("--task", type=str, nargs="+",
                        default=["SST-2", "sst-5", "mr", "cr", "mpqa", "subj", "trec",
                                 "CoLA", "MRPC", "QQP", "STS-B", "MNLI", "SNLI", "QNLI", "RTE"],
                        help="Tasks")
    args = parser.parse_args()

    model = SentenceTransformer('{}-nli-stsb-mean-tokens'.format(args.sbert_model))
    model = model.cuda()

    for task in args.task:
        for seed in args.seed:
            folder = os.path.join(args.data_dir, task, '{}-{}'.format(args.k, seed))
            dataset = load_datasets(folder, task, do_test=args.do_test)
            for split in dataset:
                print('{}-{}-{}-{}'.format(task, args.k, seed, split))
                lines = dataset[split]
                embeddings = []
                for line_id, line in tqdm(enumerate(lines)):
                    sent = get_sentence(task, line)
                    if line_id == 0:
                        print('|', sent)
                    emb = model.encode(sent)
                    embeddings.append(emb)
                embeddings = np.stack(embeddings)
                np.save(os.path.join(folder,
                                     "{}_sbert-{}.npy".format(split, args.sbert_model)),
                        embeddings)
def main(args):
    model = SentenceTransformer(args.model_name)
    if args.device == 'cuda' and torch.cuda.is_available():
        model.cuda()

    ids = []
    src_sentences = []
    tgt_sentences = []
    programs = []
    with open(args.input_file, 'r') as fin:
        for i, line in enumerate(fin):
            row = list(map(lambda part: part.strip(), line.split('\t')))
            ids.append(row[0])
            src_sentences.append(row[1])
            tgt_sentences.append(row[2])
            if len(row) > 3:
                programs.append(row[3])
            if args.subsample != -1 and i >= args.subsample:
                break

    embeddings1 = model.encode(src_sentences, batch_size=args.batch_size,
                               show_progress_bar=True, convert_to_numpy=True)
    embeddings2 = model.encode(tgt_sentences, batch_size=args.batch_size,
                               show_progress_bar=True, convert_to_numpy=True)
    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))

    with open(args.output_file, 'w') as fout:
        for i in range(len(ids)):
            id_, src, tgt, score = ids[i], src_sentences[i], tgt_sentences[i], cosine_scores[i]
            prog = None
            if programs:
                prog = programs[i]
            fout.write('\t'.join([
                id_, src, tgt, '{:0.4f}'.format(score), prog if prog else ''
            ]) + '\n')
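A possible argparse wiring for the main(args) function above; the argument names are inferred from the attributes it reads, and the defaults are illustrative assumptions rather than the original script's values.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str, required=True,
                    help="TSV with columns: id, source sentence, target sentence[, program]")
parser.add_argument("--output_file", type=str, required=True)
parser.add_argument("--model_name", type=str,
                    default="roberta-large-nli-stsb-mean-tokens")
parser.add_argument("--device", type=str, default="cuda", choices=["cuda", "cpu"])
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--subsample", type=int, default=-1,
                    help="-1 keeps all rows; otherwise stop after this many lines")

if __name__ == "__main__":
    main(parser.parse_args())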
def run(
    self,
    sentences,
    model_name_or_path="roberta-large-nli-stsb-mean-tokens",
    cuda=True,
    **kwargs,
):
    logger.info(f"Running Sentence Transformer Task: {model_name_or_path}")
    model = SentenceTransformer(model_name_or_path)
    if cuda:
        logger.info("Running model on GPU")
        model.cuda()

    sentence_embeddings = model.encode(
        list(sentences.values()),
        show_progress_bar=kwargs.get("show_progress_bar", True),
        batch_size=kwargs.get("batch_size", 8),
        convert_to_numpy=kwargs.get("convert_to_numpy", True),  # fixed "covert_to_numpy" kwarg key typo
    )
    return {sentence_id: sentence_embeddings[index]
            for index, sentence_id in enumerate(sentences)}
class Tokenizer:

    def __init__(self, device):
        self.device = device
        self.model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.model.cuda()
        self.embedding_dim = 768

    def tokenize(self, text):
        # Change text format depending on the parameter to be used
        sentence_embeddings = self.model.encode([text])
        # Length of a sentence embedding is 768 (just like in BERT)
        # print(len(sentence_embeddings[0]))
        return sentence_embeddings

    def encode_state(self, state_description):
        return {key: self.tokenize(description)
                for key, description in state_description.items()}

    def encode_commands(self, commands):
        return [self.tokenize(cmd) for cmd in commands]
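A brief usage sketch for the Tokenizer wrapper above; the state keys and commands are made-up examples for illustration.

tokenizer = Tokenizer(device='cuda')

# Each value becomes a (1, 768) array of sentence embeddings.
state = tokenizer.encode_state({
    "location": "You are standing in a kitchen.",
    "inventory": "You are carrying a rusty key.",
})
command_embeddings = tokenizer.encode_commands(["open fridge", "go north"])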
def run(
    self,
    sentences: Dict[str, str],
    model_name: str = "roberta-large-nli-stsb-mean-tokens",
    batch_size: int = 8,
) -> Dict[str, Dict]:
    logger.info(f"Running sentence transformers, Model name: {model_name}")
    model = SentenceTransformer(model_name)
    if torch.cuda.is_available():
        logger.info("GPU found")
        logger.info("Initializing sentence transformer on GPU")
        model.cuda()
    else:
        logger.info("Initializing sentence transformer on CPU")

    sentence_embeddings = model.encode(list(sentences.values()),
                                       batch_size=batch_size,
                                       show_progress_bar=True)
    return {
        sentence_id: embedding
        for sentence_id, embedding in zip(sentences.keys(), sentence_embeddings)
    }
def main():
    args = parser.parse_args()

    # Create dataset
    print("=> creating dataset")
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    dataset = Talk2Car(root=args.root, split='test',
                       transform=transforms.Compose([transforms.ToTensor(), normalize]))
    dataloader = data.DataLoader(dataset, batch_size=args.batch_size,
                                 shuffle=False, num_workers=args.workers,
                                 pin_memory=True, drop_last=False)
    print('Test set contains %d samples' % (len(dataset)))

    # Create model
    print("=> creating model")
    img_encoder = nn.DataParallel(EfficientNet.from_pretrained('efficientnet-b2'))
    text_encoder = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
    fc_model = nn.Sequential(nn.Linear(1024, 1000), nn.ReLU(),
                             nn.Linear(1000, 1000))
    img_encoder.cuda()
    text_encoder.cuda()
    fc_model.cuda()
    cudnn.benchmark = True

    # Evaluate model
    print("=> Evaluating best model")
    checkpoint = torch.load('best_model.pth.tar', map_location='cpu')
    img_encoder.load_state_dict(checkpoint['img_encoder'])
    fc_model.load_state_dict(checkpoint['fc_model'])
    evaluate(dataloader, img_encoder, text_encoder, fc_model, args)
from sklearn.metrics.pairwise import paired_cosine_distances
from sentence_transformers import SentenceTransformer, util
import csv
import torch

model = SentenceTransformer("xlm-r-distilroberta-base-paraphrase-v1")
if torch.cuda.is_available():
    model.cuda()
    print('Using GPU')
# model = SentenceTransformer("distiluse-base-multilingual-cased-v2")
# model = SentenceTransformer("xlm-r-bert-base-nli-stsb-mean-tokens")
# model = SentenceTransformer("distilbert-multilingual-nli-stsb-quora-ranking")
# model = SentenceTransformer("LaBSE")

tsv_file = open('train-hotels-es.csv')
read_tsv = csv.reader(tsv_file, delimiter=",")

src_sentences = []
trg_sentences = []
for row in read_tsv:
    src_sentences.append(row[1])
    trg_sentences.append(row[2])

batch_size = 500
embeddings1 = model.encode(src_sentences, batch_size=batch_size,
                           show_progress_bar=True, convert_to_numpy=True)
embeddings2 = model.encode(trg_sentences, batch_size=batch_size,
                           show_progress_bar=True, convert_to_numpy=True)
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
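A possible continuation (not part of the original script) that writes one similarity score per sentence pair; the output filename is an illustrative assumption.

with open('train-hotels-es-scores.tsv', 'w') as fout:
    for src, trg, score in zip(src_sentences, trg_sentences, cosine_scores):
        # Tab-separated: source sentence, target sentence, cosine similarity.
        fout.write('{}\t{}\t{:.4f}\n'.format(src, trg, score))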
class EmbExtractor():

    def __init__(self, model_name: str, sentence_transformer: bool, gpu: bool,
                 fp16: bool, pooling: str, without_encoding: bool,
                 use_mlm_head: bool, use_mlm_head_without_layernorm: bool):
        self._sentence_transformer = sentence_transformer
        self._gpu = gpu
        self._fp16 = fp16
        self._pooling = pooling
        self._without_encoding = without_encoding
        self._use_mlm_head = use_mlm_head
        self._use_mlm_head_without_layernorm = use_mlm_head_without_layernorm

        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self._sentence_transformer:
            self._model = SentenceTransformer(model_name)
        else:
            if self._pooling == "mask" or self._use_mlm_head:
                self._model = AutoModelForMaskedLM.from_pretrained(model_name)
                self._model.config.output_hidden_states = True
            else:
                self._model = AutoModel.from_pretrained(model_name)
        if self._gpu:
            self._model.cuda()
        if self._fp16:
            self._model.half()

    def extract_emb(self, lines: Union[str, List[str]]):
        if not isinstance(lines, list):
            lines = [lines]

        if self._sentence_transformer:
            # Shape: (batch_size, num_embs)
            sentence_embedding = self._model.encode(lines)
            return sentence_embedding
        else:
            encoded_input = self._tokenizer.batch_encode_plus(
                lines, truncation=True, padding=True, pad_to_multiple_of=8,
                return_tensors='pt', return_special_tokens_mask=True)
            if self._gpu:
                encoded_input = {k: v.cuda() for k, v in encoded_input.items()}

            # Shape: (batch_size, num_tokens, 1)
            special_tokens_mask = (
                1 - encoded_input.pop("special_tokens_mask").unsqueeze(axis=-1))

            if self._use_mlm_head:
                self._model.lm_head.decoder = Identity()
                if self._use_mlm_head_without_layernorm:
                    self._model.lm_head.lm_head_norm = Identity()

            with torch.no_grad():
                outputs = self._model(**encoded_input)

            if self._use_mlm_head:
                self._pooling = "mask"

            if self._pooling == "mask":
                assert not self._without_encoding
                # Shape: (batch_size, num_tokens, num_embs)
                output = outputs["hidden_states"][-1]
                if self._use_mlm_head:
                    with torch.no_grad():
                        # Shape: (batch_size, num_tokens, num_embs)
                        output = self._model.lm_head(output)
                # Shape: (batch_size, num_embs) - <mask> is the 2nd token
                sentence_embedding = output[:, 1, :]
                # ...
            elif self._pooling == "cls":
                # Shape: (batch_size, num_tokens, num_embs)
                output = outputs["last_hidden_state"]
                # Shape: (batch_size, num_embs)
                sentence_embedding = output[:, 0, :]
            else:
                if self._without_encoding:
                    # Shape: (batch_size, num_embs)
                    output = outputs["last_hidden_state"][0] * special_tokens_mask
                else:
                    # Shape: (batch_size, num_tokens, num_embs)
                    output = outputs["last_hidden_state"] * special_tokens_mask
                if self._pooling == 'avg':
                    # Shape: (batch_size, num_embs)
                    output_masked = torch.sum(output, dim=1)
                    # Shape: (batch_size, 1)
                    non_zeros_n = torch.sum(special_tokens_mask, dim=1)
                    # Shape: (batch_size, num_embs)
                    sentence_embedding = output_masked / non_zeros_n
                elif self._pooling == 'max':
                    # Shape: (batch_size, num_embs)
                    output_masked = output.max(dim=1)
                    # Shape: (batch_size, num_embs)
                    sentence_embedding = output_masked.values
                else:
                    logging.critical(" - pooling method doesn't exist")
                    exit()

            return sentence_embedding.float().cpu().numpy()
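A minimal instantiation sketch for EmbExtractor, assuming a plain RoBERTa checkpoint and mean pooling; all flag values and the model name are illustrative choices, not the original configuration.

extractor = EmbExtractor(
    model_name="roberta-base",
    sentence_transformer=False,   # use the raw HF encoder rather than sentence-transformers
    gpu=torch.cuda.is_available(),
    fp16=False,
    pooling="avg",                # average over non-special tokens
    without_encoding=False,
    use_mlm_head=False,
    use_mlm_head_without_layernorm=False,
)
vectors = extractor.extract_emb(["a first sentence", "a second sentence"])
print(vectors.shape)  # (2, hidden_size)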
def __init__(self, D_h, cls_model, transformer_model_family, mode, num_classes,
             context_attention, attention=False, residual=False):
    super().__init__()

    if transformer_model_family == 'bert':
        if mode == '0':
            model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            hidden_dim = 768
        elif mode == '1':
            model = BertForSequenceClassification.from_pretrained('bert-large-uncased')
            tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
            hidden_dim = 1024
    elif transformer_model_family == 'roberta':
        if mode == '0':
            model = RobertaForSequenceClassification.from_pretrained('roberta-base')
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
            hidden_dim = 768
        elif mode == '1':
            model = RobertaForSequenceClassification.from_pretrained('roberta-large')
            tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
            hidden_dim = 1024
    elif transformer_model_family == 'sbert':
        if mode == '0':
            model = SentenceTransformer('bert-base-nli-mean-tokens')
            hidden_dim = 768
        elif mode == '1':
            model = SentenceTransformer('bert-large-nli-mean-tokens')
            hidden_dim = 1024
        elif mode == '2':
            model = SentenceTransformer('roberta-base-nli-mean-tokens')
            hidden_dim = 768
        elif mode == '3':
            model = SentenceTransformer('roberta-large-nli-mean-tokens')
            hidden_dim = 1024

    self.transformer_model_family = transformer_model_family
    self.model = model.cuda()
    self.hidden_dim = hidden_dim
    self.cls_model = cls_model
    self.D_h = D_h
    self.residual = residual

    if self.transformer_model_family in ['bert', 'roberta']:
        self.tokenizer = tokenizer

    if self.cls_model == 'lstm':
        self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=D_h,
                            num_layers=2, bidirectional=True).cuda()
        self.fc = nn.Linear(self.hidden_dim, 2 * D_h).cuda()
        self.attention = attention
        if self.attention:
            self.matchatt = MatchingAttention(2 * D_h, 2 * D_h,
                                              att_type='general2').cuda()
        self.linear = nn.Linear(2 * D_h, 2 * D_h).cuda()
        self.smax_fc = nn.Linear(2 * D_h, num_classes).cuda()

    elif self.cls_model == 'dialogrnn':
        self.dialog_rnn_f = DialogueRNN(self.hidden_dim, D_h, D_h, D_h,
                                        context_attention).cuda()
        self.dialog_rnn_r = DialogueRNN(self.hidden_dim, D_h, D_h, D_h,
                                        context_attention).cuda()
        self.fc = nn.Linear(self.hidden_dim, 2 * D_h).cuda()
        self.attention = attention
        if self.attention:
            self.matchatt = MatchingAttention(2 * D_h, 2 * D_h,
                                              att_type='general2').cuda()
        self.linear = nn.Linear(2 * D_h, 2 * D_h).cuda()
        self.smax_fc = nn.Linear(2 * D_h, num_classes).cuda()
        self.dropout_rec = nn.Dropout(0.1)

    elif self.cls_model == 'logreg':
        self.linear = nn.Linear(self.hidden_dim, D_h).cuda()
        self.smax_fc = nn.Linear(D_h, num_classes).cuda()
def docsEmbedding(docData, modelFlag='sbert', tf_idf_weight=False, data_dir=None):
    '''
    data_dir : place to store the vectors data
    '''
    if tf_idf_weight:
        print('####Training the TF-IDF matrix####')
        docs = [' '.join(doc) for doc, _ in docData]
        tfidf = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
        tfidf.fit_transform(docs)
        max_idf = max(tfidf.idf_)
        '''
        If a word was never seen, it must be at least as infrequent as any of the
        known words, so the default idf is the max of the known idf values.
        '''
        word2weight = defaultdict(lambda: max_idf,
                                  [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    else:
        word2weight = None

    if data_dir:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)
        print('####Embedding results will be stored at {}####'.format(data_dir))
    else:
        print('####Embedding results will not be stored####')

    X = []
    Y = []
    model, tokenizer = None, None

    print('####Loading the model####')
    if modelFlag == 'sbert':
        # for sentence transformer (SBERT)
        from sentence_transformers import SentenceTransformer
        sentenceModelList = [
            'bert-base-nli-mean-tokens', 'bert-large-nli-mean-tokens'
        ]
        '''No need to move the model to GPU here, since SentenceTransformer does that for us'''
        model = SentenceTransformer(sentenceModelList[0])
    elif modelFlag == 'naive-bert':
        from transformers import BertModel, BertTokenizer, BertConfig
        # full list https://huggingface.co/transformers/pretrained_models.html
        transformerModelList = [
            'bert-base-uncased', 'bert-large-uncased',
            'bert-base-cased', 'bert-large-cased'
        ]
        config = BertConfig.from_pretrained(transformerModelList[0],
                                            output_hidden_states=True)
        tokenizer = BertTokenizer.from_pretrained(transformerModelList[0])
        model = BertModel.from_pretrained(transformerModelList[0], config=config)
        if torch.cuda.is_available():
            model = model.cuda()
    elif modelFlag == 'word2vec':
        import gensim
        model = gensim.models.KeyedVectors.load_word2vec_format(
            './word2vec/GoogleNews-vectors-negative300.bin', binary=True)

    idx = 0
    for doc, docLabels in tqdm(docData):
        if modelFlag == 'sbert':
            docEmbedding = model.encode(doc)
        elif modelFlag == 'naive-bert':
            docEmbedding = naiveBERT_embed(doc, model, tokenizer,
                                           word2weight=word2weight, use_CLS=False)
        elif modelFlag == 'word2vec':
            doc_emd_temp = []
            for sen in doc:
                sen_emd = word2vec_embed(sen, model, word2weight=word2weight)
                doc_emd_temp.append(sen_emd)
            docEmbedding = torch.cat(doc_emd_temp)

        '''append the embedding result'''
        X.append(torch.Tensor(docEmbedding))
        Y.append(torch.LongTensor(docLabels))

        if data_dir:
            '''save the embedding result'''
            x_numpy = np.array(docEmbedding)
            y_numpy = np.array(docLabels)
            np.save(os.path.join(data_dir, str(idx) + '_x.npy'), x_numpy)
            np.save(os.path.join(data_dir, str(idx) + '_y.npy'), y_numpy)
        idx += 1

    data = {'text': X, 'label': Y}
    return data
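A small usage sketch for docsEmbedding; the docData format (a list of (sentences, labels) pairs with one label per sentence) is inferred from the loop above, and the example documents and output directory are made up.

docData = [
    (["the room was clean", "staff were friendly"], [1, 1]),
    (["the food was cold", "service was slow"], [0, 0]),
]
data = docsEmbedding(docData, modelFlag='sbert', data_dir='embeddings/')
print(len(data['text']), data['text'][0].shape)  # one (num_sentences, 768) tensor per document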