Example #1
    def run(
        self,
        training_data,
        evaluator,
        output_path,
        from_scratch=False,
        loss=SentenceTransformerLoss.cosine_similarity_loss,
        model_name_or_path="roberta-large-nli-stsb-mean-tokens",
        cuda=True,
        **kwargs,
    ):
        logger.info(
            f"Running Sentence Transformer Task: {model_name_or_path}, Output path: {output_path}"
        )
        if from_scratch:
            logger.info("Training from scratch")
            # Assemble a SentenceTransformer from a plain transformer encoder
            # followed by a mean-pooling layer.
            word_embedding_model = models.Transformer(
                model_name_or_path,
                max_seq_length=kwargs.get("max_seq_length", 128))
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension())
            model = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])
        else:
            model = SentenceTransformer(model_name_or_path)
        if cuda:
            logger.info("Running model on GPU")
            model.cuda()

        train_examples = [
            InputExample(texts=[data["sentence1"], data["sentence2"]],
                         label=data["label"])
            for data in training_data.values()
        ]
        train_dataset = SentencesDataset(train_examples, model)
        train_dataloader = DataLoader(
            train_dataset,
            shuffle=kwargs.get("shuffle", True),
            batch_size=kwargs.get("batch_size", 4),
        )
        warmup_steps = math.ceil(
            len(train_examples) * kwargs.get("num_epochs", 3) /
            kwargs.get("batch_size", 4) *
            0.1)  # 10% of the training steps for warm-up
        train_loss = loss.value(model)
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=kwargs.get("num_epochs", 3),
            evaluation_steps=kwargs.get("evaluation_steps", 500),
            warmup_steps=warmup_steps,
            output_path=output_path,
            evaluator=evaluator,
        )
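Not shown above is how training_data and evaluator are constructed. A minimal, hypothetical caller sketch using the public sentence-transformers API (the pair texts, labels, and the task object are illustrative, not part of the original example):

# Hypothetical caller sketch for the run() method above.
from sentence_transformers import InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

dev_examples = [
    InputExample(texts=["A man is eating food.", "A man is eating a meal."], label=0.9),
    InputExample(texts=["A man is eating food.", "A plane is taking off."], label=0.1),
]
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_examples, name="dev")

training_data = {
    "pair-1": {"sentence1": "A woman is playing the violin.",
               "sentence2": "Someone is playing an instrument.", "label": 0.8},
    "pair-2": {"sentence1": "A woman is playing the violin.",
               "sentence2": "A dog runs in the park.", "label": 0.05},
}
# task.run(training_data, evaluator, output_path="output/sbert-finetuned")  # `task` is hypothetical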
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--do_test", action='store_true', help="Generate embeddings for test splits (test set is usually large, so we don't want to repeatedly generate embeddings for them)")
    parser.add_argument("--sbert_model", type=str, default='roberta-large', help="Sentence BERT model name")

    parser.add_argument("--k", type=int, help="Number of training instances per label", default=16)
    parser.add_argument("--data_dir", type=str, default="data/k-shot", help="Path to few-shot data")
    parser.add_argument("--seed", type=int, nargs="+", default=[42, 13, 21, 87, 100], help="Seeds for data splits")
    parser.add_argument("--task", type=str, nargs="+", default=["SST-2", "sst-5", "mr", "cr", "mpqa", "subj", "trec", "CoLA", "MRPC", "QQP", "STS-B", "MNLI", "SNLI", "QNLI", "RTE"], help="Tasks")

    args = parser.parse_args()

    model = SentenceTransformer('{}-nli-stsb-mean-tokens'.format(args.sbert_model))
    model = model.cuda()

    for task in args.task:
        for seed in args.seed:
            folder = os.path.join(args.data_dir, task, '{}-{}'.format(args.k, seed))
            dataset = load_datasets(folder, task, do_test=args.do_test)
            for split in dataset:
                print('{}-{}-{}-{}'.format(task, args.k, seed, split))
                lines = dataset[split]
                embeddings = []
                for line_id, line in tqdm(enumerate(lines)):
                    sent = get_sentence(task, line)
                    if line_id == 0:
                        print('|', sent)
                    emb = model.encode(sent)
                    embeddings.append(emb)
                embeddings = np.stack(embeddings)
                np.save(os.path.join(folder, "{}_sbert-{}.npy".format(split, args.sbert_model)), embeddings)
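The script above saves one .npy file per split. A hedged sketch of reading one back; the split name "train" and the exact folder layout are assumptions, since they depend on load_datasets:

import numpy as np

# Path follows the naming scheme used above (task SST-2, k=16, seed 42, roberta-large).
emb = np.load("data/k-shot/SST-2/16-42/train_sbert-roberta-large.npy")
print(emb.shape)  # (number of lines in the split, embedding dimension)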
Example #3
def main(args):

    model = SentenceTransformer(args.model_name)

    if args.device == 'cuda' and torch.cuda.is_available():
        model.cuda()

    ids = []
    src_sentences = []
    tgt_sentences = []
    programs = []

    with open(args.input_file, 'r') as fin:
        for i, line in enumerate(fin):
            row = list(map(lambda part: part.strip(), line.split('\t')))
            ids.append(row[0])
            src_sentences.append(row[1])
            tgt_sentences.append(row[2])
            if len(row) > 3:
                programs.append(row[3])

            if args.subsample != -1 and i >= args.subsample:
                break

    embeddings1 = model.encode(src_sentences,
                               batch_size=args.batch_size,
                               show_progress_bar=True,
                               convert_to_numpy=True)
    embeddings2 = model.encode(tgt_sentences,
                               batch_size=args.batch_size,
                               show_progress_bar=True,
                               convert_to_numpy=True)

    cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))

    with open(args.output_file, 'w') as fout:
        for i in range(len(ids)):
            id_, src, tgt, score = (ids[i], src_sentences[i],
                                    tgt_sentences[i], cosine_scores[i])
            prog = None
            if programs:
                prog = programs[i]
            fout.write('\t'.join([
                id_, src, tgt, '{:0.4f}'.format(score), prog if prog else ''
            ]) + '\n')
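As a side check, 1 - paired_cosine_distances(a, b) is simply the row-wise cosine similarity of the two embedding matrices; a small self-contained verification:

import numpy as np
from sklearn.metrics.pairwise import paired_cosine_distances

a = np.random.rand(4, 8)
b = np.random.rand(4, 8)
# Row-wise cosine similarity computed by hand.
manual = (a * b).sum(axis=1) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
assert np.allclose(1 - paired_cosine_distances(a, b), manual)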
Example #4
    def run(
        self,
        sentences,
        model_name_or_path="roberta-large-nli-stsb-mean-tokens",
        cuda=True,
        **kwargs,
    ):
        logger.info(f"Running Sentence Transformer Task: {model_name_or_path}")
        model = SentenceTransformer(model_name_or_path)
        if cuda:
            logger.info("Running model on GPU")
            model.cuda()
        sentence_embeddings = model.encode(
            list(sentences.values()),
            show_progress_bar=kwargs.get("show_progress_bar", True),
            batch_size=kwargs.get("batch_size", 8),
            convert_to_numpy=kwargs.get("convert_to_numpy", True),
        )
        return {
            sentence_id: sentence_embeddings[index]
            for index, sentence_id in enumerate(sentences)
        }
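A hypothetical caller sketch for this method; the task object and sentence IDs below are illustrative:

sentences = {
    "doc-1": "The quick brown fox jumps over the lazy dog.",
    "doc-2": "Sentence embeddings map text to fixed-size vectors.",
}
# embeddings = task.run(sentences, cuda=False, batch_size=16)
# embeddings["doc-1"] is one vector per input sentence, keyed by the original IDs.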
Example #5
class Tokenizer:
    def __init__(self, device):

        self.device = device
        self.model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.model.to(self.device)  # respect the requested device instead of hard-coding CUDA
        self.embedding_dim = 768

    def tokenize(self, text):
        # The model expects a list of strings and returns one embedding per input
        sentence_embeddings = self.model.encode([text])
        
        # Length of a sentence embedding is 768 (just like in BERT)
        # print(len(sentence_embeddings[0]))
        return sentence_embeddings

    def encode_state(self, state_description):
        return {key: self.tokenize(description) for key, description in state_description.items()}

    def encode_commands(self, commands):
        return [self.tokenize(cmd) for cmd in commands]
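A hedged usage sketch for the Tokenizer wrapper above; the state keys and commands are made up:

tok = Tokenizer(device="cuda")
state = {
    "location": "You are standing in a small kitchen.",
    "inventory": "You are carrying a rusty key.",
}
encoded_state = tok.encode_state(state)                        # dict of (1, 768) embeddings
encoded_cmds = tok.encode_commands(["open door", "take key"])  # list of (1, 768) embeddings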
Example #6
    def run(
        self,
        sentences: Dict[str, str],
        model_name: str = "roberta-large-nli-stsb-mean-tokens",
        batch_size: int = 8,
    ) -> Dict[str, Dict]:
        logger.info(f"Running sentence transformers, Model name: {model_name}")
        model = SentenceTransformer(model_name)
        if torch.cuda.is_available():
            logger.info("GPU found")
            logger.info("Initializing sentence transformer on GPU")
            model.cuda()
        else:
            logger.info("Initializing sentence transformer on CPU")
        sentence_embeddings = model.encode(list(sentences.values()),
                                           batch_size=batch_size,
                                           show_progress_bar=True)
        return {
            sentence_id: embedding
            for sentence_id, embedding in zip(sentences.keys(),
                                              sentence_embeddings)
        }
Example #7
def main():
    args = parser.parse_args()

    # Create dataset
    print("=> creating dataset")
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    dataset = Talk2Car(root=args.root,
                       split='test',
                       transform=transforms.Compose(
                           [transforms.ToTensor(), normalize]))
    dataloader = data.DataLoader(dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.workers,
                                 pin_memory=True,
                                 drop_last=False)
    print('Test set contains %d samples' % (len(dataset)))

    # Create model
    print("=> creating model")
    img_encoder = nn.DataParallel(
        EfficientNet.from_pretrained('efficientnet-b2'))
    text_encoder = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
    fc_model = nn.Sequential(nn.Linear(1024, 1000), nn.ReLU(),
                             nn.Linear(1000, 1000))

    img_encoder.cuda()
    text_encoder.cuda()
    fc_model.cuda()

    cudnn.benchmark = True

    # Evaluate model
    print("=> Evaluating best model")
    checkpoint = torch.load('best_model.pth.tar', map_location='cpu')
    img_encoder.load_state_dict(checkpoint['img_encoder'])
    fc_model.load_state_dict(checkpoint['fc_model'])
    evaluate(dataloader, img_encoder, text_encoder, fc_model, args)
Example #8
from sklearn.metrics.pairwise import paired_cosine_distances
from sentence_transformers import SentenceTransformer, util
import csv
import torch

model = SentenceTransformer("xlm-r-distilroberta-base-paraphrase-v1")
if torch.cuda.is_available():
    model.cuda()
    print('Using GPU')
# model = SentenceTransformer("distiluse-base-multilingual-cased-v2")
# model = SentenceTransformer("xlm-r-bert-base-nli-stsb-mean-tokens")
# model = SentenceTransformer("distilbert-multilingual-nli-stsb-quora-ranking")
# model = SentenceTransformer("LaBSE")
src_sentences = []
trg_sentences = []

with open('train-hotels-es.csv') as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter=",")
    for row in read_tsv:
        src_sentences.append(row[1])
        trg_sentences.append(row[2])
batch_size = 500
embeddings1 = model.encode(src_sentences,
                           batch_size=batch_size,
                           show_progress_bar=True,
                           convert_to_numpy=True)
embeddings2 = model.encode(trg_sentences,
                           batch_size=batch_size,
                           show_progress_bar=True,
                           convert_to_numpy=True)
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
Example #9
class EmbExtractor():
    def __init__(self, model_name: str, sentence_transformer: bool, gpu: bool,
                 fp16: bool, pooling: str, without_encoding: bool,
                 use_mlm_head: bool, use_mlm_head_without_layernorm: bool):

        self._sentence_transformer = sentence_transformer
        self._gpu = gpu
        self._fp16 = fp16
        self._pooling = pooling
        self._without_encoding = without_encoding
        self._use_mlm_head = use_mlm_head
        self._use_mlm_head_without_layernorm = use_mlm_head_without_layernorm

        self._tokenizer = AutoTokenizer.from_pretrained(model_name)

        if self._sentence_transformer:
            self._model = SentenceTransformer(model_name)
        else:
            if self._pooling == "mask" or self._use_mlm_head:
                self._model = AutoModelForMaskedLM.from_pretrained(model_name)
                self._model.config.output_hidden_states = True
            else:
                self._model = AutoModel.from_pretrained(model_name)

        if self._gpu:
            self._model.cuda()
        if self._fp16:
            self._model.half()

    def extract_emb(self, lines: Union[str, List[str]]):

        if not isinstance(lines, list):
            lines = [lines]

        if self._sentence_transformer:
            # Shape: (batch_size, num_embs)
            sentence_embedding = self._model.encode(lines)

            return sentence_embedding
        else:
            encoded_input = self._tokenizer.batch_encode_plus(
                lines,
                truncation=True,
                padding=True,
                pad_to_multiple_of=8,
                return_tensors='pt',
                return_special_tokens_mask=True)
            if self._gpu:
                encoded_input = {k: v.cuda() for k, v in encoded_input.items()}

            # Shape: (batch_size, num_tokens, 1)
            special_tokens_mask = (
                1 - encoded_input.pop("special_tokens_mask").unsqueeze(dim=-1))

            if self._use_mlm_head:
                self._model.lm_head.decoder = Identity()
                if self._use_mlm_head_without_layernorm:
                    self._model.lm_head.lm_head_norm = Identity()

            with torch.no_grad():
                outputs = self._model(**encoded_input)

            if self._use_mlm_head:
                self._pooling = "mask"

            if self._pooling == "mask":
                assert not self._without_encoding
                # Shape: (batch_size, num_tokens, num_embs)
                output = outputs["hidden_states"][-1]

                if self._use_mlm_head:
                    with torch.no_grad():
                        # Shape: (batch_size, num_tokens, num_embs)
                        output = self._model.lm_head(output)
                # Shape: (batch_size, num_embs) - <mask> is the 2nd token
                sentence_embedding = output[:, 1, :]
                # ...
            elif self._pooling == "cls":
                # Shape: (batch_size, num_tokens, num_embs)
                output = outputs["last_hidden_state"]
                # Shape: (batch_size, num_embs)
                sentence_embedding = output[:, 0, :]
            else:

                if self._without_encoding:
                    # Shape: (batch_size, num_embs)
                    output = outputs["last_hidden_state"][
                        0] * special_tokens_mask
                else:
                    # Shape: (batch_size, num_tokens, num_embs)
                    output = outputs["last_hidden_state"] * special_tokens_mask

                if self._pooling == 'avg':
                    # Shape: (batch_size, num_embs)
                    output_masked = torch.sum(output, dim=1)
                    # Shape: (batch_size, 1)
                    non_zeros_n = torch.sum(special_tokens_mask, dim=1)

                    # Shape: (batch_size, num_embs)
                    sentence_embedding = output_masked / non_zeros_n
                elif self._pooling == 'max':
                    # Shape: (batch_size, num_embs)
                    output_masked = (output).max(dim=1)

                    # Shape: (batch_size, num_embs)
                    sentence_embedding = output_masked.values
                else:
                    logging.critical(" - pooling method does not exist")
                    exit()

            return sentence_embedding.float().cpu().numpy()
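A hypothetical usage sketch for EmbExtractor; the flag combination below (plain AutoModel with average pooling on CPU) is just one valid configuration chosen for illustration:

extractor = EmbExtractor(
    model_name="roberta-base",
    sentence_transformer=False,
    gpu=False,
    fp16=False,
    pooling="avg",
    without_encoding=False,
    use_mlm_head=False,
    use_mlm_head_without_layernorm=False,
)
# Returns a (2, hidden_size) numpy array of mean-pooled sentence embeddings.
embs = extractor.extract_emb(["first sentence", "second sentence"])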
Example #10
    def __init__(self,
                 D_h,
                 cls_model,
                 transformer_model_family,
                 mode,
                 num_classes,
                 context_attention,
                 attention=False,
                 residual=False):
        super().__init__()

        if transformer_model_family == 'bert':
            if mode == '0':
                model = BertForSequenceClassification.from_pretrained(
                    'bert-base-uncased')
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                hidden_dim = 768
            elif mode == '1':
                model = BertForSequenceClassification.from_pretrained(
                    'bert-large-uncased')
                tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
                hidden_dim = 1024

        elif transformer_model_family == 'roberta':
            if mode == '0':
                model = RobertaForSequenceClassification.from_pretrained(
                    'roberta-base')
                tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
                hidden_dim = 768
            elif mode == '1':
                model = RobertaForSequenceClassification.from_pretrained(
                    'roberta-large')
                tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
                hidden_dim = 1024

        elif transformer_model_family == 'sbert':
            if mode == '0':
                model = SentenceTransformer('bert-base-nli-mean-tokens')
                hidden_dim = 768
            elif mode == '1':
                model = SentenceTransformer('bert-large-nli-mean-tokens')
                hidden_dim = 1024
            elif mode == '2':
                model = SentenceTransformer('roberta-base-nli-mean-tokens')
                hidden_dim = 768
            elif mode == '3':
                model = SentenceTransformer('roberta-large-nli-mean-tokens')
                hidden_dim = 1024

        self.transformer_model_family = transformer_model_family
        self.model = model.cuda()
        self.hidden_dim = hidden_dim
        self.cls_model = cls_model
        self.D_h = D_h
        self.residual = residual

        if self.transformer_model_family in ['bert', 'roberta']:
            self.tokenizer = tokenizer

        if self.cls_model == 'lstm':
            self.lstm = nn.LSTM(input_size=self.hidden_dim,
                                hidden_size=D_h,
                                num_layers=2,
                                bidirectional=True).cuda()
            self.fc = nn.Linear(self.hidden_dim, 2 * D_h).cuda()

            self.attention = attention
            if self.attention:
                self.matchatt = MatchingAttention(2 * D_h,
                                                  2 * D_h,
                                                  att_type='general2').cuda()

            self.linear = nn.Linear(2 * D_h, 2 * D_h).cuda()
            self.smax_fc = nn.Linear(2 * D_h, num_classes).cuda()

        elif self.cls_model == 'dialogrnn':
            self.dialog_rnn_f = DialogueRNN(self.hidden_dim, D_h, D_h, D_h,
                                            context_attention).cuda()
            self.dialog_rnn_r = DialogueRNN(self.hidden_dim, D_h, D_h, D_h,
                                            context_attention).cuda()
            self.fc = nn.Linear(self.hidden_dim, 2 * D_h).cuda()

            self.attention = attention
            if self.attention:
                self.matchatt = MatchingAttention(2 * D_h,
                                                  2 * D_h,
                                                  att_type='general2').cuda()

            self.linear = nn.Linear(2 * D_h, 2 * D_h).cuda()

            self.smax_fc = nn.Linear(2 * D_h, num_classes).cuda()
            self.dropout_rec = nn.Dropout(0.1)

        elif self.cls_model == 'logreg':
            self.linear = nn.Linear(self.hidden_dim, D_h).cuda()
            self.smax_fc = nn.Linear(D_h, num_classes).cuda()
Example #11
def docsEmbedding(docData,
                  modelFlag='sbert',
                  tf_idf_weight=False,
                  data_dir=None):
    '''
    docData         : iterable of (doc, labels) pairs, where doc is a list of sentences
    modelFlag       : one of 'sbert', 'naive-bert', 'word2vec'
    tf_idf_weight   : if True, weight word vectors by TF-IDF (used by the word-level models)
    data_dir        : place to store the vectors data
    '''
    if tf_idf_weight:
        print('####Training the TF-IDF matrix####')
        docs = [' '.join(doc) for doc, _ in docData]
        tfidf = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
        tfidf.fit_transform(docs)
        max_idf = max(tfidf.idf_)
        ''' if a word was never seen - it must be at least as infrequent
        as any of the known words - so the default idf is the max of known idf's '''
        word2weight = defaultdict(lambda: max_idf,
                                  [(w, tfidf.idf_[i])
                                   for w, i in tfidf.vocabulary_.items()])
    else:
        word2weight = None

    if data_dir:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)
        print(
            '####Embedding results will be stored at {}####'.format(data_dir))
    else:
        print('####Embedding results will not be stored####')

    X = []
    Y = []
    model, tokenizer = None, None

    print('####Loading the model####')
    if modelFlag == 'sbert':
        # for sentence transformer (SBERT)
        from sentence_transformers import SentenceTransformer
        sentenceModelList = [
            'bert-base-nli-mean-tokens', 'bert-large-nli-mean-tokens'
        ]
        '''No need to move the model to the GPU here; SentenceTransformer handles device placement itself'''
        model = SentenceTransformer(sentenceModelList[0])

    elif modelFlag == 'naive-bert':
        from transformers import BertModel, BertTokenizer, BertConfig
        # full list https://huggingface.co/transformers/pretrained_models.html
        transformerModelList = [
            'bert-base-uncased', 'bert-large-uncased', 'bert-base-cased',
            'bert-large-cased'
        ]
        config = BertConfig.from_pretrained(transformerModelList[0],
                                            output_hidden_states=True)
        tokenizer = BertTokenizer.from_pretrained(transformerModelList[0])
        model = BertModel.from_pretrained(transformerModelList[0],
                                          config=config)
        if torch.cuda.is_available():
            model = model.cuda()
    elif modelFlag == 'word2vec':
        import gensim
        model = gensim.models.KeyedVectors.load_word2vec_format(\
            './word2vec/GoogleNews-vectors-negative300.bin', binary=True)

    idx = 0
    for doc, docLabels in tqdm(docData):

        if modelFlag == 'sbert':
            docEmbedding = model.encode(doc)
        elif modelFlag == 'naive-bert':
            docEmbedding = naiveBERT_embed(doc,
                                           model,
                                           tokenizer,
                                           word2weight=word2weight,
                                           use_CLS=False)
        elif modelFlag == 'word2vec':
            doc_emd_temp = []
            for sen in doc:
                sen_emd = word2vec_embed(sen, model, word2weight=word2weight)
                doc_emd_temp.append(sen_emd)

            docEmbedding = torch.cat(doc_emd_temp)
        '''append the embedding result'''
        X.append(torch.Tensor(docEmbedding))
        Y.append(torch.LongTensor(docLabels))

        if data_dir:
            '''save the embedding result'''
            x_numpy = np.array(docEmbedding)
            y_numpy = np.array(docLabels)
            np.save(os.path.join(data_dir, str(idx) + '_x.npy'), x_numpy)
            np.save(os.path.join(data_dir, str(idx) + '_y.npy'), y_numpy)
            idx += 1

    data = {'text': X, 'label': Y}
    return data
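Finally, a hedged usage sketch for docsEmbedding with the sbert flag; the documents and labels are illustrative:

docData = [
    (["The room was clean.", "The staff were friendly."], [1, 1]),
    (["The food was cold.", "Service was slow."], [0, 0]),
]
data = docsEmbedding(docData, modelFlag='sbert', tf_idf_weight=False, data_dir=None)
# data['text'] is a list of per-document embedding tensors,
# data['label'] the matching list of label tensors.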