"""
This example loads a pre-trained model and evaluates it on the STSbenchmark dataset.
"""
from torch.utils.data import DataLoader
from soco_sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler
from soco_sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from soco_sentence_transformers.readers import STSDataReader
import numpy as np
import logging

#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer('bert-base-nli-mean-tokens')

sts_reader = STSDataReader('datasets/stsbenchmark')

test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"),
                             model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

model.evaluate(evaluator)
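# Note: evaluate() also returns the evaluator's main score (for
# EmbeddingSimilarityEvaluator this should be the Spearman correlation between
# the embeddings' cosine similarities and the gold scores), so the result can
# be captured: score = model.evaluate(evaluator)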
from datetime import datetime
from soco_sentence_transformers import models, losses

train_batch_size = 16  # not defined in the original snippet; 16 is a typical choice
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_roberta-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True)

# Use RoBERTa for mapping tokens to embeddings
word_embedding_model = models.RoBERTa('roberta-base')

# Apply mean pooling to get one fixed-size sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=train_batch_size)
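# The snippet stops after building the dev dataloader. A minimal sketch of
# the remaining steps (evaluator, warmup schedule, model.fit); the 10% warmup
# fraction and evaluation_steps value are assumptions:
import math
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)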
from soco_sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
import logging

#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout



# Load Sentence model (based on BERT) from URL
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Embed a list of sentences
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of strings.',
             'The quick brown fox jumps over the lazy dog.']
sentence_embeddings = model.encode(sentences)

# The result is a list of sentence embeddings as numpy arrays
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
"""
This is a simple application for sentence embeddings: clustering

Sentences are mapped to sentence embeddings and then agglomerative clustering is applied.
"""
from soco_sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

embedder = SentenceTransformer('bert-large-nli-mean-tokens')

# Corpus with example sentences
corpus = ['A man is eating a food.',
          'A man is eating a piece of bread.',
          'A man is eating pasta.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey on across a field.'
          ]
corpus_embeddings = embedder.encode(corpus)

# Perform agglomerative clustering
num_clusters = 5
clustering_model = AgglomerativeClustering(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
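# Group the sentences by their assigned cluster label and print each group.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster", i + 1)
    print(cluster)
    print("")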
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

For each query, this script outputs the top 5 most similar sentences in the corpus.
"""
from soco_sentence_transformers import SentenceTransformer
import scipy.spatial

embedder = SentenceTransformer('bert-base-nli-mean-tokens')

# Corpus with example sentences
corpus = [
    'A man is eating a food.', 'A man is eating a piece of bread.',
    'The girl is carrying a baby.', 'A man is riding a horse.',
    'A woman is playing violin.', 'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.', 'A cheetah is running behind its prey.'
]
corpus_embeddings = embedder.encode(corpus)

# Query sentences:
queries = [
    'A man is eating pasta.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah chases prey on across a field.'
]
query_embeddings = embedder.encode(queries)
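# The docstring promises the top 5 most similar corpus sentences per query,
# but the snippet ends after encoding the queries. A minimal completion using
# cosine distance (mirroring the usual semantic-search demo):
closest_n = 5
for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\nQuery:", query)
    print("Top 5 most similar sentences in corpus:")
    for idx, distance in results[:closest_n]:
        print(corpus[idx], "(Score: %.4f)" % (1 - distance))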
# The snippet below starts mid-way through building the vocabulary for a BoW
# model: the code that fills `vocab` and `weights` and defines the sizes was
# cut off. A minimal reconstruction; `word_idf_pairs` is a hypothetical
# iterable of (word, IDF-weight) tuples, and both size values are assumptions.
max_vocab_size = 25000
batch_size = 32
vocab = set()
weights = {}
for word, idf in word_idf_pairs:  # hypothetical source of (word, IDF) pairs
    vocab.add(word)
    weights[word] = idf
    if len(vocab) >= max_vocab_size:
        break

# Create the BoW model. Because we set word_weights to the IDF values and
# cumulative_term_frequency=True, we get tf-idf vectors. Set word_weights to an
# empty dict and cumulative_term_frequency=False to get a one-hot sentence encoding.
bow = models.BoW(vocab=vocab,
                 word_weights=weights,
                 cumulative_term_frequency=True)

# Add two trainable feed-forward networks (DAN) with max_vocab_size -> 768 -> 512 dimensions.
sent_embeddings_dimension = max_vocab_size
dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=768)
dan2 = models.Dense(in_features=768, out_features=512)

model = SentenceTransformer(modules=[bow, dan1, dan2])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
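# The configuration that followed this comment was cut off. A minimal sketch
# of the usual continuation (epoch count, warmup fraction, and output path
# are assumptions):
import math
num_epochs = 10
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path='output/training_stsbenchmark_bow')  # illustrative path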
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")
sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True)

# Load a pre-trained sentence transformer model
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=train_batch_size)
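# This snippet also stops before training. As in the earlier examples, the
# remaining steps would be an EmbeddingSimilarityEvaluator over the dev split
# followed by model.fit(...) on train_dataloader and train_loss:
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)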
sts_reader = STSDataReader('datasets/stsbenchmark')
nli_reader = NLIDataReader('datasets/AllNLI')  # this definition was missing from the snippet
train_num_labels = nli_reader.get_num_labels()
batch_size = 16  # not defined in the snippet; 16 is a typical choice
model_save_path = 'output/training_nli_bert-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")



# Use BERT for mapping tokens to embeddings
word_embedding_model = models.BERT('bert-base-uncased')

# Apply mean pooling to get one fixed-size sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)



logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
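# Training itself is not shown. A minimal sketch of the standard continuation
# (epoch count and warmup fraction are assumptions):
import math
num_epochs = 1
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)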
"""
This example clusters different sentences that come from the same Wikipedia article.

It uses the 'wikipedia-sections' model, which was trained to differentiate whether two
sentences from the same article come from the same section or from different sections of that article.
"""
from soco_sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering



embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

# Sentences and sections are from Wikipedia.
# Source: https://en.wikipedia.org/wiki/Bushnell,_Illinois
corpus = [
("Bushnell is located at 40°33′6″N 90°30′29″W (40.551667, -90.507921).", "Geography"),
("According to the 2010 census, Bushnell has a total area of 2.138 square miles (5.54 km2), of which 2.13 square miles (5.52 km2) (or 99.63%) is land and 0.008 square miles (0.02 km2) (or 0.37%) is water.", "Geography"),

("The town was founded in 1854 when the Northern Cross Railroad built a line through the area.", "History"),
("Nehemiah Bushnell was the President of the Railroad, and townspeople honored him by naming their community after him. ", "History"),
("Bushnell was also served by the Toledo, Peoria and Western Railway, now the Keokuk Junction Railway.", "History"),

("As of the census[6] of 2000, there were 3,221 people, 1,323 households, and 889 families residing in the city. ", "Demographics"),
("The population density was 1,573.9 people per square mile (606.7/km²).", "Demographics"),
("There were 1,446 housing units at an average density of 706.6 per square mile (272.3/km²).", "Demographics"),

("From 1991 to 2012, Bushnell was home to one of the largest Christian Music and Arts festivals in the world, known as the Cornerstone Festival.", "Music"),
("Each year around the 4th of July, 25,000 people from all over the world would descend on the small farm town to watch over 300 bands, authors and artists perform at the Cornerstone Farm Campgrounds.", "Music"),
("The festival was generally well received by locals, and businesses in the area would typically put up signs welcoming festival-goers to their town.", "Music"),
("As a result of the location of the music festival, numerous live albums and videos have been recorded or filmed in Bushnell, including the annual Cornerstone Festival DVD. ", "Music"),