    def test_LabelAccuracyEvaluator(self):
        model = SentenceTransformer('paraphrase-distilroberta-base-v1')

        nli_dataset_path = 'datasets/AllNLI.tsv.gz'
        if not os.path.exists(nli_dataset_path):
            util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

        label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
        dev_samples = []
        with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
            reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                if row['split'] == 'train':
                    label_id = label2int[row['label']]
                    dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=label_id))
                    if len(dev_samples) >= 100:
                        break

        train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int))

        dev_dataloader = DataLoader(dev_samples, shuffle=False, batch_size=16)
        evaluator = evaluation.LabelAccuracyEvaluator(dev_dataloader, softmax_model=train_loss)
        acc = evaluator(model)
        assert acc > 0.2
class TransformerSentencesEmbedding:
    def __init__(self):
        self.model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.embeddings_dim = self.model.get_sentence_embedding_dimension()

    def sentences_encode(self, sentences):
        return self.model.encode(sentences)
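# A minimal usage sketch (added for illustration; the sentences are made up):
embedder = TransformerSentencesEmbedding()
vectors = embedder.sentences_encode(["A cat sits on the mat.", "Dogs play outside."])
print(embedder.embeddings_dim, vectors.shape)  # 768 and (2, 768) for bert-base-nli-mean-tokens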
Example #3
class SentenceTransformerRecSys(KeyedVectorRecSys):
    model_name_or_path = None
    batch_size = 12
    language_model = None

    def train(self, texts: List):
        from sentence_transformers import SentenceTransformer

        # load sentence transformer model
        if not self.language_model:
            logger.info(
                f'Loading Sentence Transformer: {self.model_name_or_path}')
            self.language_model = SentenceTransformer(self.model_name_or_path)

        # reset doc vector model
        self.model = KeyedVectors(
            vector_size=self.language_model.get_sentence_embedding_dimension())

        # encode
        sentence_embeddings = self.language_model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=self.print_progress)

        # save into keyed vector
        for idx, vec in enumerate(sentence_embeddings):
            self.model.add([str(self.idx2doc_id[idx])], [vec])

        return self.model
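    # Hedged sketch (added, not part of the original class): one way the trained
    # KeyedVectors could be queried; the method name `recommend_for_text` is illustrative.
    def recommend_for_text(self, text: str, topn: int = 10):
        query_vec = self.language_model.encode([text], batch_size=1)[0]
        # gensim returns (doc_id, cosine similarity) pairs
        return self.model.similar_by_vector(query_vec, topn=topn)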
Example #4
    def __init__(self, model_names):
        self.emb_dim = 0
        # model_names is a list with the names of all models to load

        for i, arg in enumerate(model_names):
            sentence_model = SentenceTransformer(arg)
            self.emb_dim += sentence_model.get_sentence_embedding_dimension()
            new_model_att = {"model_" + str(i): sentence_model}
            self.__dict__.update(new_model_att)
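    # Hedged companion sketch (added): since emb_dim sums the individual dimensions,
    # the per-model embeddings are presumably meant to be concatenated per sentence.
    def encode(self, sentences):
        import numpy as np
        models = [v for k, v in sorted(self.__dict__.items()) if k.startswith("model_")]
        return np.concatenate([m.encode(sentences) for m in models], axis=1)  # (len(sentences), emb_dim)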
Example #5
class SentenceTransformerDocumentEmbeddings(DocumentEmbeddings):
    def __init__(
            self,
            model: str = "bert-base-nli-mean-tokens",
            batch_size: int = 1,
            convert_to_numpy: bool = False,
    ):
        """
        :param model: string name of the model from the SentenceTransformer class
        :param name: string name of the embedding type, set on each Sentence object
        :param batch_size: int number of sentences to be processed in one batch
        :param convert_to_numpy: bool whether encode() returns a numpy array or a PyTorch tensor
        """
        super().__init__()

        try:
            from sentence_transformers import SentenceTransformer
        except ModuleNotFoundError:
            log.warning("-" * 100)
            log.warning('ATTENTION! The library "sentence-transformers" is not installed!')
            log.warning(
                'To use Sentence Transformers, please first install with "pip install sentence-transformers"'
            )
            log.warning("-" * 100)
            pass

        self.model = SentenceTransformer(model)
        self.name = 'sentence-transformers-' + str(model)
        self.batch_size = batch_size
        self.convert_to_numpy = convert_to_numpy
        self.static_embeddings = True

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:

        sentence_batches = [sentences[i * self.batch_size:(i + 1) * self.batch_size]
                            for i in range((len(sentences) + self.batch_size - 1) // self.batch_size)]

        for batch in sentence_batches:
            self._add_embeddings_to_sentences(batch)

        return sentences

    def _add_embeddings_to_sentences(self, sentences: List[Sentence]):

        # convert to plain strings, embedded in a list for the encode function
        sentences_plain_text = [sentence.to_plain_string() for sentence in sentences]

        embeddings = self.model.encode(sentences_plain_text, convert_to_numpy=self.convert_to_numpy)
        for sentence, embedding in zip(sentences, embeddings):
            sentence.set_embedding(self.name, embedding)

    @property
    def embedding_length(self) -> int:
        """Returns the length of the embedding vector."""
        return self.model.get_sentence_embedding_dimension()
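# Hedged usage sketch (added; assumes flair is installed and provides Sentence and the
# DocumentEmbeddings base class used above):
from flair.data import Sentence

doc_embeddings = SentenceTransformerDocumentEmbeddings("bert-base-nli-mean-tokens")
sentence = Sentence("Sentence transformers map a whole document to a single vector.")
doc_embeddings.embed(sentence)
print(sentence.embedding.shape)  # torch.Size([768]) for bert-base-nli-mean-tokens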
Example #6
    def test_simple_sentence_transformers_from_disk(self):
        model = SentenceTransformer(
            self.env['datasets_dir'] +
            '/sentence_transformers/bert-base-nli-mean-tokens')

        # sentence_embeddings = model.encode(self.texts)
        #
        # for sentence, embedding in zip(self.texts, sentence_embeddings):
        #     print("Sentence:", sentence)
        #     print("Embedding:", embedding)
        #     print("")

        self.assertEqual(768, model.get_sentence_embedding_dimension())
Example #7
    def __init__(self,
                 name_model: str = "roberta-base-nli-stsb-mean-tokens",
                 device: str = None,
                 multiple_sentences: bool = False) -> None:
        self.name_model = name_model
        self.device = device
        self.multiple_sentences = multiple_sentences

        #Load the Model
        if self.name_model in MODELS:
            model = SentenceTransformer(self.name_model, device=self.device)
            self.embding_dim = model.get_sentence_embedding_dimension()
            self.model = model
        else:
            raise ValueError(
                f"Unknown model name '{self.name_model}'; expected one of {MODELS}")
Example #8
def train_sbert(model_name, model_save_path):
    batch_size = 16
    nli_reader, sts_reader = load_dataset()
    train_num_labels = nli_reader.get_num_labels()
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                pooling_mode_mean_tokens=True,
                                pooling_mode_cls_token=False,
                                pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # Configure the training
    num_epochs = 1

    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))
    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=1000,
            warmup_steps=warmup_steps,
            output_path=model_save_path
            )

    model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

    model.evaluate(evaluator)
Example #9
def build_model(num_labels):
    model_name = 'bert-base-uncased'

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = new_softmax_loss.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=num_labels,
        num_vectors=3)
    return model, train_loss
Example #10
class CachedSentenceTransformer:
    def __init__(self, model_name: str):
        super().__init__()
        self._model = SentenceTransformer(model_name)
        self._cache = Cache(cache_directory / model_name)

    def featurize(self, sentences: List[str]) -> np.ndarray:
        result = []
        for sentence in sentences:
            if sentence in self._cache:
                vec = self._cache[sentence]
            else:
                vec = self._model.encode(sentence).squeeze()
                self._cache[sentence] = vec

            result.append(vec)

        return np.array(result)

    def get_dimension(self) -> int:
        return self._model.get_sentence_embedding_dimension()
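# Hedged usage sketch (added; `cache_directory` is assumed to be a pathlib.Path defined in
# the surrounding module, with diskcache-style Cache semantics):
featurizer = CachedSentenceTransformer("paraphrase-distilroberta-base-v1")
first = featurizer.featurize(["How do I reset my password?"])   # encoded and written to the cache
second = featurizer.featurize(["How do I reset my password?"])  # served from the cache
assert first.shape == (1, featurizer.get_dimension())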
    def __init__(
            self,
            path_embedding_sentences: Union[str, Path],
            path_cuases: Union[str, Path],
            path_embedding_causes: Union[str, Path],
            by_augmentation: int = 3,
            sentences_embedding_model: str = "roberta-base-nli-stsb-mean-tokens"
    ):

        self.path_embedding_sentences = path_embedding_sentences
        self.path_embedding_causes = path_embedding_causes
        self.path_cuases = path_cuases
        self.by_augmentation = by_augmentation
        self.sentences_embedding_model = sentences_embedding_model

        #Load the Model
        if self.sentences_embedding_model in MODELS:
            model = SentenceTransformer(self.sentences_embedding_model)
            self.embding_dim = model.get_sentence_embedding_dimension()
            self.model = model
        else:
            raise ValueError(
                f"Unknown model '{self.sentences_embedding_model}'; expected one of {MODELS}")
class SentenceVectorizer(BaseVectorizer):
    """Vectorize text by using sentence transformers
    https://github.com/UKPLab/sentence-transformers
    """

    CONF_KEY_TRAINED_MODEL_PATH = "model_path"
    CONF_KEY_TRANSFORMER_MODEL_NAME = "transformer_model_name"

    def __init__(self, model_path: str, transformer_model_name: str):
        # Reference:
        # https://github.com/UKPLab/sentence-transformers/blob/e0aa596a0397a41ba69f75c1124318f0cb1dceca/sentence_transformers/models/Transformer.py
        self.model = SentenceTransformer(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
        self.model.tokenizer = self.tokenizer
        self.word_embedding_dim = self.model.get_sentence_embedding_dimension()

    @classmethod
    def create(cls: "SentenceVectorizer", config: Dict[str, Any]):
        return cls(
            config[cls.CONF_KEY_TRAINED_MODEL_PATH],
            config[cls.CONF_KEY_TRANSFORMER_MODEL_NAME],
        )

    def encode(self, sentences: List[str], padding: bool = True):
        if len(sentences) == 0:
            return None
        return self.model.tokenizer(sentences,
                                    return_tensors="pt",
                                    padding=padding)

    def decode(self, encode_result):
        return self.model.tokenizer.convert_ids_to_tokens(
            encode_result.input_ids.flatten().tolist())

    def vectorize(self, sentences):
        vectors = self.model.encode(sentences)
        return [vectors[i][:].tolist() for i in range(len(sentences))]
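# Hedged usage sketch (added; the paths and model names below are placeholders):
config = {
    SentenceVectorizer.CONF_KEY_TRAINED_MODEL_PATH: "output/my-fine-tuned-sbert",
    SentenceVectorizer.CONF_KEY_TRANSFORMER_MODEL_NAME: "bert-base-uncased",
}
vectorizer = SentenceVectorizer.create(config)
vectors = vectorizer.vectorize(["Sentence embeddings as plain Python lists."])
assert len(vectors[0]) == vectorizer.word_embedding_dim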
Example #13
        if os.path.isfile(labels_file):
            os.remove(os.path.join(curr_dir, "prediction_labels.csv"))
        if os.path.isfile(pred_file):
            os.remove(os.path.join(curr_dir, "prediction_results.csv"))

        # Model path
        model_save_path = curr_dir
        batch_size = 24
        agb_reader = TestAGBReader('datasets/og-test')
        train_num_labels = agb_reader.get_num_labels()

        model = SentenceTransformer(model_save_path, device="cpu")

        train_loss = losses.SoftmaxLoss(model=model,
                                        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                        num_labels=train_num_labels)
        train_loss.classifier = torch.load(os.path.join(model_save_path, "2_Softmax/pytorch_model.bin"))

        print("test")
        test_dir = "/data/daumiller/sentence-transformers/examples/datasets/og-test"
        for fn in sorted(os.listdir(test_dir)):
            examples = agb_reader.get_examples(fn)
            if not examples:
                continue
            # Hack to avoid problems with docs almost as long as batch size
            if len(examples) == batch_size + 1:
                batch_size_used = batch_size - 3
            else:
                batch_size_used = batch_size
            test_data = SentencesDataset(examples=examples, model=model, shorten=True)
Example #14
word_embedding_model = models.BERT('bert-base-uncased')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)



logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 1

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

Example #15
    for row in reader:
        if row['split'] == 'dev':
            label_id = label2int[row['label']]
            acc_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=label_id))

train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)

acc_dataloader = DataLoader(acc_samples,
                            shuffle=True,
                            batch_size=train_batch_size)

print("sent embed:", model.get_sentence_embedding_dimension())

# train_loss = losses.BatchSemiHardTripletLoss(model=model)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=len(label2int))

#Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'dev':
            score = float(
    return avg_abs_emb


######################################################
####################### CORPUS #######################
######################################################

# # SciSpacy model to tokenize text
print("-------- Loading scispacy en_core_sci_sm model --------")
nlp = en_core_sci_sm.load(disable=['ner', 'tagger'])
nlp.max_length = 2000000

# Sentence Transformer model
logger.info("-------- Loading SentenceTransformer model --------")
embedder = SentenceTransformer(options.model)
dim = embedder.get_sentence_embedding_dimension()

# # Corpus
logger.info("-------- Building corpus --------")
df_docs.title = df_docs.title.fillna("")
df_docs.abstract = df_docs.abstract.fillna("")
df_docs.fulltext = df_docs.fulltext.fillna("")

corpus_list = []
name_corpus_list = []
if options.fulltext:
    fulltext_corpus = df_docs.fulltext.to_list()
    corpus_list.append(fulltext_corpus)
    name_corpus_list.append("fulltext")
if options.abstract:
    abstract_corpus = df_docs.abstract.to_list()
Example #17
def extract():
    for id_model, m in enumerate(args.model_name):
        print(
            '****************************************************************')
        print('EXTRACTION MODEL: %s' % m)

        if args.text_output_split[id_model]:
            # create directories for split
            if not os.path.exists('../data/{0}/original/{1}_{2}'.format(
                    args.dataset, args.input_file, m.lower())):
                os.makedirs('../data/{0}/original/{1}_{2}'.format(
                    args.dataset, args.input_file, m.lower()))

        # model setting
        text_model = SentenceTransformer(args.model_name[id_model])

        # dataset setting
        data = read_csv('../data/{0}/original/{1}.tsv'.format(
            args.dataset, args.input_file),
                        sep='\t')
        print('Loaded dataset from %s' %
              descriptions_path.format(args.dataset))

        # text features
        text_features = np.empty(
            shape=[len(data),
                   text_model.get_sentence_embedding_dimension()])

        # features extraction
        print('Starting extraction...\n')
        start = time.time()

        for index, row in data.iterrows():
            # text features extraction
            text_features[index] = text_model.encode(
                sentences=str(row[args.column]))

            if (index + 1) % args.print_each == 0:
                sys.stdout.write('\r%d/%d samples completed' %
                                 (index + 1, len(data)))
                sys.stdout.flush()

        end = time.time()
        print('\n\nFeature extraction completed in %f seconds.' %
              (end - start))

        if args.normalize:
            text_features = text_features / np.max(np.abs(text_features))

        if args.text_output_split[id_model]:
            for d in range(len(data)):
                save_np(npy=text_features[d],
                        filename='../data/{0}/original/{1}_{2}'.format(
                            args.dataset, args.input_file, m.lower()) +
                        str(d) + '.npy')
            print('Saved text features numpy to ==> %s' %
                  text_features_dir.format(args.dataset, m.lower()))
        else:
            save_np(npy=text_features,
                    filename='../data/{0}/original/{1}_{2}.npy'.format(
                        args.dataset, args.input_file, m.lower()))
            print('Saved text features numpy to ==> %s' %
                  '../data/{0}/original/{1}_{2}.npy'.format(
                      args.dataset, args.input_file, m.lower()))
dev_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'dev':
            score = float(row['score']) / 5.0 #Normalize score to range 0 ... 1
            dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))

dev_evaluator_sts = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


logging.info("Teacher Performance:")
dev_evaluator_sts(teacher_model)

# Student model has fewer dimensions. Compute PCA for the teacher to reduce the dimensions
if student_model.get_sentence_embedding_dimension() < teacher_model.get_sentence_embedding_dimension():
    logging.info("Student model has fewer dimensions than the teacher. Compute PCA for down projection")
    pca_sentences = train_sentences_nli[0:20000] + train_sentences_wikipedia[0:20000]
    pca_embeddings = teacher_model.encode(pca_sentences, convert_to_numpy=True)
    pca = PCA(n_components=student_model.get_sentence_embedding_dimension())
    pca.fit(pca_embeddings)

    #Add Dense layer to teacher that projects the embeddings down to the student embedding size
    dense = models.Dense(in_features=teacher_model.get_sentence_embedding_dimension(), out_features=student_model.get_sentence_embedding_dimension(), bias=False, activation_function=torch.nn.Identity())
    dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_))
    teacher_model.add_module('dense', dense)

    logging.info("Teacher Performance with {} dimensions:".format(teacher_model.get_sentence_embedding_dimension()))
    dev_evaluator_sts(teacher_model)

                sentence = line_source.strip()
                train_sent.append(sentence)

            if min_sent_len <= len(line_target.strip()) <= max_sent_len:
                sentence = line_target.strip()
                train_sent.append(sentence)

            if len(train_sent) >= num_train_sent:
                break

    print("Encode training embeddings for PCA")
    train_matrix = model.encode(train_sent, show_progress_bar=True, convert_to_numpy=True)
    pca = PCA(n_components=pca_dimensions)
    pca.fit(train_matrix)

    dense = models.Dense(in_features=model.get_sentence_embedding_dimension(), out_features=pca_dimensions, bias=False, activation_function=torch.nn.Identity())
    dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_))
    model.add_module('dense', dense)


print("Read source file")
source_sentences = set()
with file_open(source_file) as fIn:
    for line in tqdm.tqdm(fIn):
        line = line.strip()
        if len(line) >= min_sent_len and len(line) <= max_sent_len:
            source_sentences.add(line)

print("Read target file")
target_sentences = set()
with file_open(target_file) as fIn:
Example #20
def train_nli():

    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    #model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    model_name = 'pretrained_model/bert-base-uncased'

    # Read the dataset
    train_batch_size = 6
    nli_reader = NLIDataReader('./examples/datasets/AllNLI')
    sts_reader = STSBenchmarkDataReader('./examples/datasets/stsbenchmark')
    train_num_labels = nli_reader.get_num_labels()
    model_save_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                pooling_mode_mean_tokens=True,
                                pooling_mode_cls_token=False,
                                pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_dataset = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)



    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = LabelAccuracyEvaluator(
        dev_dataloader,
        softmax_model=Softmax_label(
            model=model,
            sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
            num_labels=train_num_labels))


    # Configure the training
    num_epochs = 1

    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))



    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=100,
            warmup_steps=warmup_steps,
            output_path=model_save_path
            )



    ##############################################################################
    #
    # Load the stored model and evaluate its performance on STS benchmark dataset
    #
    ##############################################################################

    #model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size)
    #evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

    model.evaluate(evaluator)
nli_sentences = list(nli_sentences)
random.shuffle(nli_sentences)

#To determine the PCA matrix, we need some example sentence embeddings.
#Here, we compute the embeddings for 20k random sentences from the AllNLI dataset
pca_train_sentences = nli_sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

#Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# We add a dense layer to the model, so that it will produce directly embeddings with the new size
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(),
                     out_features=new_dimension,
                     bias=False,
                     activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module('dense', dense)

# Evaluate the model with the reduced embedding size
logger.info("Model with {} dimensions:".format(new_dimension))
stsb_evaluator(model)

# If you like, you can store the model on disc by uncommenting the following line
#model.save('models/bert-base-nli-stsb-mean-tokens-128dim')

# You can then load the adapted model that produces 128 dimensional embeddings like this:
#model = SentenceTransformer('models/bert-base-nli-stsb-mean-tokens-128dim')
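# Hedged sanity check (added for illustration): once the Dense module is attached, the model
# reports and produces the reduced dimensionality.
assert model.get_sentence_embedding_dimension() == new_dimension
reduced = model.encode(["A quick check sentence."])
print(reduced.shape)  # (1, new_dimension)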
Example #22
#ANN: Faster, but the recall will be lower
use_ann_search = True

#Number of clusters for ANN. Optimal number depends on dataset size
ann_num_clusters = 32768

#How many clusters to explore during search. Higher number = better recall, but slower
ann_num_cluster_probe = 5

#To save memory, we can use PCA to reduce the dimensionality from 768 to, for example, 128 dimensions.
#The encoded embeddings then require 6 times less memory; however, we observe a small drop in performance.
use_pca = False
pca_dimensions = 128

#We store the embeddings on disc, so that they can later be loaded from disc
source_embedding_file = '{}_{}_{}.emb'.format(model_name, os.path.basename(source_file), pca_dimensions if use_pca else model.get_sentence_embedding_dimension())
target_embedding_file = '{}_{}_{}.emb'.format(model_name, os.path.basename(target_file), pca_dimensions if use_pca else model.get_sentence_embedding_dimension())


#Use PCA to reduce the dimensionality of the sentence embedding model
if use_pca:
    # We use a smaller number of training sentences to learn the PCA
    train_sent = []
    num_train_sent = 20000

    with open(source_file, encoding='utf8') as fSource, open(target_file, encoding='utf8') as fTarget:
        for line_source, line_target in zip(fSource, fTarget):
            id, sentence = line_source.strip().split("\t", maxsplit=1)
            train_sent.append(sentence)

            id, sentence = line_target.strip().split("\t", maxsplit=1)
    for row in reader:
        if row['split'] == 'dev':
            score = float(
                row['score']) / 5.0  #Normalize score to range 0 ... 1
            dev_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))

dev_evaluator_sts = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples, name='sts-dev')

logging.info("Teacher Performance:")
dev_evaluator_sts(teacher_model)

# The student model has fewer dimensions. Compute a PCA for the teacher to reduce its dimensions
if (student_model.get_sentence_embedding_dimension()
        < teacher_model.get_sentence_embedding_dimension()):
    logging.info(
        "Student model has fewer dimensions than the teacher. Compute PCA for down projection"
    )
    pca_sentences = train_sentences_nli[0:25000]
    pca_embeddings = teacher_model.encode(pca_sentences, convert_to_numpy=True)
    pca = PCA(n_components=student_model.get_sentence_embedding_dimension())
    pca.fit(pca_embeddings)

    #Add Dense layer to teacher that projects the embeddings down to the student embedding size
    dense = models.Dense(
        in_features=teacher_model.get_sentence_embedding_dimension(),
        out_features=student_model.get_sentence_embedding_dimension(),
        bias=False,
        activation_function=torch.nn.Identity())
    dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_))
Example #24
class SentenceBERTForRD(nn.Module):
    def __init__(self,
                 pretrained_name,
                 out_dim,
                 *sbert_args,
                 freeze_sbert=True,
                 criterion=None,
                 **sbert_kwargs):
        '''
        To use this model, you will need to first run "pip install sentence-transformers"

        Should be used in conjunction with the WantWordsDataset class, i.e.:
        >>> model = SentenceBERTForRD(...)
        >>> dataset = WantWordsDataset(definitions, embeddings, model.tokenizer)

        pretrained_name: Name of the pretrained SentenceBERT variant to be used
        out_dim:         Size of the output vocabulary
        freeze_sbert:    Can optionally freeze the SentenceBERT model and train
                         only the output MLP
        criterion:       (optional) Must be one of CrossEntropyLoss, MSELoss,
                         or CosineSimilarity
        '''
        super(SentenceBERTForRD, self).__init__()
        self.sbert = SentenceTransformer(pretrained_name, *sbert_args,
                                         **sbert_kwargs)
        self.pretrained_name = pretrained_name
        self.freeze_sbert = freeze_sbert
        if freeze_sbert:
            for param in self.sbert.parameters():
                param.requires_grad = False

        hidden_dim = self.sbert.get_sentence_embedding_dimension()
        # Simple MLP decoder --> modeled off of BERT MLM head
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, out_dim),
        )

        self.criterion = criterion
        self.classification = None
        if criterion is not None:
            if isinstance(criterion, nn.CrossEntropyLoss):
                self.classification = True
            elif isinstance(criterion, (nn.MSELoss, nn.CosineSimilarity)):
                self.classification = False
            else:
                raise Exception(
                    "Criterion must be one of CrossEntropyLoss, MSELoss, or CosineSimilarity"
                )

        # init weights of linear layer
        for layer in self.decoder.modules():
            if isinstance(layer, nn.Linear):
                nn.init.normal_(layer.weight, mean=0.0, std=0.02)
                nn.init.zeros_(layer.bias)

    def unfreeze(self):
        for param in self.sbert.parameters():
            param.requires_grad = True

    def forward(self, input_ids, attention_mask, ground_truth=None):
        # embed: (batch, 768)
        embed = self.sbert({
            'input_ids': input_ids,
            'attention_mask': attention_mask
        })['sentence_embedding']
        # out: (batch, vocab_size)
        # prob distribution over vocabulary
        out = self.decoder(embed)

        if self.criterion is not None and ground_truth is not None:
            loss = self.criterion(out, ground_truth)
            return loss, out
        return out
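# Hedged usage sketch (added; the definition text is made up and the tokenizer is reached
# through the underlying SentenceTransformer, mirroring the docstring's `model.tokenizer` hint):
model = SentenceBERTForRD("paraphrase-distilroberta-base-v1", out_dim=30000,
                          criterion=nn.CrossEntropyLoss())
batch = model.sbert.tokenizer(["a domesticated animal that barks"],
                              return_tensors="pt", padding=True)
logits = model(batch["input_ids"], batch["attention_mask"])  # shape: (1, out_dim)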
Example #25
def build_vectors(st_output_path: str,
                  hf_dataset: str,
                  aspect: str,
                  fold: Union[int, str],
                  include_all_docs: bool = False,
                  override: bool = False):
    """

    :param st_output_path: Path to the Sentence Transformer model
    :param hf_dataset: Huggingface dataset path or name
    :param aspect:
    :param fold:
    :param include_all_docs: Also generate vectors for samples from the training data
    :param override:
    :return:
    """
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    out_fn = 'pwc_id2vec__all_docs.w2v.txt' if include_all_docs else 'pwc_id2vec.w2v.txt'
    out_fp = os.path.join(st_output_path, out_fn)

    if not os.path.exists(st_output_path):
        logger.error(
            f'Sentence Transformer directory does not exist: {st_output_path}')
        return

    if os.path.exists(out_fp) and not override:
        logger.error(
            f'Output path exists already and override is disabled: {out_fp}')
        return

    # Inference for best model
    best_model = SentenceTransformer(st_output_path)
    best_model.get_sentence_embedding_dimension()

    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    test_sds = DocumentPairSentencesDataset(docs_ds, test_ds, best_model)

    if include_all_docs:
        # use all document ids
        input_paper_ids = set(docs_ds['paper_id'])
        logger.info(f'All documents in corpus: {len(input_paper_ids):,}')

    else:
        # generate vectors from unique test documents only
        input_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))

    with open(out_fp, 'w') as f:
        # header
        f.write(
            f'{len(input_paper_ids)} {best_model.get_sentence_embedding_dimension()}\n'
        )

        # body
        for paper_id in tqdm(input_paper_ids, desc='Inference'):
            vec = [
                str(v) for v in best_model.encode(test_sds.get_text_from_doc(
                    paper_id),
                                                  show_progress_bar=False)
            ]

            assert len(vec) == best_model.get_sentence_embedding_dimension()

            vec_str = ' '.join(vec)
            line = f'{paper_id} {vec_str}\n'
            f.write(line)
            # break
    logger.info(f'Encoded {len(input_paper_ids):,} into {out_fp}')
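# Hedged follow-up sketch (added): the file written above uses the plain-text word2vec format
# (header line "count dim"), so it can be reloaded with gensim; the path and paper id below
# are placeholders for the out_fp and ids produced by build_vectors.
from gensim.models import KeyedVectors

paper_vectors = KeyedVectors.load_word2vec_format('st_model/pwc_id2vec.w2v.txt', binary=False)
print(paper_vectors.most_similar('some-paper-id', topn=5))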