#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
model_name = 'bert-base-nli-mean-tokens'
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_continue_training-' + \
    model_name + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSBenchmarkDataReader(
    '/checkpoint/xiaojianwu/data/sentBERT/stsbenchmark', normalize_scores=True)

# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name)

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataset = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                                 model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=train_batch_size)
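
# Hedged sketch: the snippet is truncated here. Based on the other training
# examples in this collection, the continue-training run would typically
# finish like this (assumes `import math` at the top of the file).
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

warmup_steps = math.ceil(
    len(train_dataset) * num_epochs / train_batch_size * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)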
Example No. 2
from datetime import datetime
import csv
import os
from zipfile import ZipFile
import random

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)
#### /print debug information to stdout

#As base model, we use DistilBERT-base that was pre-trained on NLI and STSb data
model = SentenceTransformer('stsb-distilbert-base')
num_epochs = 10
train_batch_size = 64

#As distance metric, we use cosine distance (cosine_distance = 1-cosine_similarity)
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE

#Negative pairs should have a distance of at least 0.5
margin = 0.5

dataset_path = 'quora-IR-dataset'
model_save_path = 'output/training_OnlineContrastiveLoss-' + \
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

os.makedirs(model_save_path, exist_ok=True)
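
# Hedged sketch: the snippet ends before the loss is built. The distance
# metric and margin defined above would typically feed into
# losses.OnlineContrastiveLoss like this; `train_samples` (a list of
# InputExample pairs labeled 1 for duplicates, 0 otherwise) and the
# torch.utils.data.DataLoader import are assumed, not shown in the snippet.
train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.OnlineContrastiveLoss(model=model,
                                          distance_metric=distance_metric,
                                          margin=margin)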
Example No. 3
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_bert-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True)

# Use BERT for mapping tokens to embeddings
word_embedding_model = models.Transformer('bert-base-uncased')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)


# Configure the training. We skip evaluation in this example
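
# Hedged sketch: the snippet is cut off after this comment; a typical
# configuration without evaluation would look like this (assumes
# `import math` at the top of the file).
warmup_steps = math.ceil(
    len(train_data) * num_epochs / train_batch_size * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)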
    def __init__(self, batch_size=32):
        print("Loading sentence transformer model")
        self.model = SentenceTransformer('bert-large-nli-mean-tokens')
        self.batch_size = batch_size
        print("Loaded model")

    def __init__(self):
        self.model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.embeddings_dim = self.model.get_sentence_embedding_dimension()
import os.path
import pickle
from sentence_transformers import SentenceTransformer
from flask import current_app as app
from .core import LimitedSizeDict

sentence_transformer = SentenceTransformer(app.config['LANGUAGE_MODEL_NAME'])


def encode(texts):
    if not sentence_transformer:
        app.logger.warning(
            'Unable to encode because the model was not correctly loaded')
        return []
    if len(texts) == 0:
        return []

    new_texts = list(set(text for text in texts if text not in encoding_cache))

    if len(new_texts) > 0:
        app.logger.debug(f'Encoding {len(new_texts)} texts')
        new_embeddings = sentence_transformer.encode(new_texts)
        encoding_cache.update(zip(new_texts, new_embeddings))
        save_encoding_cache()

    return [encoding_cache[text] for text in texts]


def load_encoding_cache():
    path = os.path.join(app.config['DATA_PATH'], 'encoding_cache.pkl')
    if os.path.exists(path):
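        # Hedged sketch: the snippet is truncated here; this completion
        # assumes the pickle-based cache layout implied by the imports and
        # path above (`encoding_cache` is the module-level LimitedSizeDict).
        with open(path, 'rb') as f:
            encoding_cache.update(pickle.load(f))


def save_encoding_cache():
    # Hypothetical counterpart used by encode() above, same assumed layout.
    path = os.path.join(app.config['DATA_PATH'], 'encoding_cache.pkl')
    with open(path, 'wb') as f:
        pickle.dump(dict(encoding_cache), f)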
Example No. 7
def eval_model(annotate_file, model_name, eval_res_file):
    # annotate_dataset = load_annotate_data(annotate_file)
    annotate_dataset = load_title_content_data(annotate_file)
    results = []

    idindex = {}
    corpus = []
    count = 0
    for query, sen, label in annotate_dataset:
        id_gen = hashlib.md5()
        id_gen.update(query.encode('utf-8'))
        query_id = id_gen.hexdigest()
        if query_id not in idindex:
            corpus.append(query)
            idindex[query_id] = count
            count += 1

        id_gen = hashlib.md5()
        id_gen.update(sen.encode('utf-8'))
        sen_id = id_gen.hexdigest()
        if sen_id not in idindex:
            corpus.append(sen)
            idindex[sen_id] = count
            count += 1

    model = SentenceTransformer(model_name)
    #word_embedding_model = models.Transformer(model_name)
    #pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
    #                           pooling_mode_mean_tokens=True,
    #                           pooling_mode_cls_token=False,
    #                           pooling_mode_max_tokens=False)
    #model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    corpus_embeddings = model.encode(corpus)

    for query, sen, label in annotate_dataset:
        label = int(label)

        id_gen = hashlib.md5()
        id_gen.update(query.encode('utf-8'))
        query_id = id_gen.hexdigest()

        id_gen = hashlib.md5()
        id_gen.update(sen.encode('utf-8'))
        sen_id = id_gen.hexdigest()

        logging.debug('query:%s' % (query))
        logging.debug('idindex[query_id]:%d' % (idindex[query_id]))
        logging.debug('sen:%s' % (sen))
        logging.debug('idindex[sen_id]:%d' % (idindex[sen_id]))
        query_vec = corpus_embeddings[idindex[query_id]]
        sen_vec = corpus_embeddings[idindex[sen_id]]
        sim_score = scipy.spatial.distance.cdist([query_vec], [sen_vec],
                                                 "cosine")[0]
        results.append((label, query_id, sim_score, sen_id))

    fp = open(eval_res_file, "w", encoding="utf-8")
    writer = csv.writer(fp)
    ndcg = analysis.cal_NDCG(results, 10)
    writer.writerow([model_name, ndcg])
    fp.close()
Example No. 8
    if len(vocab) >= max_vocab_size:
        break
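
# Hedged sketch: the vocab-building loop above is truncated. One common way
# to derive `vocab` and the IDF `weights` consumed below; `train_sentences`
# is a hypothetical list of raw training sentences (assumes `import math`
# and `from collections import Counter`).
doc_freq = Counter()
for sent in train_sentences:
    doc_freq.update(set(sent.lower().split()))
num_docs = len(train_sentences)
weights = {
    word: math.log(num_docs / freq)
    for word, freq in doc_freq.most_common(max_vocab_size)
}
vocab = list(weights.keys())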

#Create the BoW model. Because we set word_weights to the IDF values and cumulative_term_frequency=True, we
#get tf-idf vectors. Set word_weights to an empty dict and cumulative_term_frequency=False to get a 1-hot sentence encoding
bow = models.BoW(vocab=vocab,
                 word_weights=weights,
                 cumulative_term_frequency=True)

# Add two trainable feed-forward networks (DAN) with max_vocab_size -> 768 -> 512 dimensions.
sent_embeddings_dimension = max_vocab_size
dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=768)
dan2 = models.Dense(in_features=768, out_features=512)

model = SentenceTransformer(modules=[bow, dan1, dan2])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
Example No. 9
sns.set(rc={'figure.figsize':(6,6)})

#Labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(labels, rotation=89)
ax.yaxis.set_ticklabels(labels, rotation=0)

"""# **Multilingual model 1**

"""

#Make word embeddings of positive and negative words and plot them
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('xlm-r-bert-base-nli-stsb-mean-tokens')

embeddings = embedder.encode(words, convert_to_numpy = True)
#embeddings = np.reshape(embeddings, (len(words), 1, 768))
print(embeddings.shape)

#Dimensionality reduction
#source https://towardsdatascience.com/elmo-contextual-language-embedding-335de2268604
pca = PCA(n_components=50)
y = pca.fit_transform(embeddings)
y = TSNE(n_components=2).fit_transform(y)

#Static visualization
plot_data = pd.DataFrame({'x':y[:,0], 'y':y[:,1], 'Category':tags})
ggplot(plot_data, aes(x='x', y='y', color='Category')) + geom_point(size = 1)
import pandas as pd
import pickle as pkl
import numpy as np
import tqdm, os, glob, time
from sentence_transformers import SentenceTransformer
import scipy.spatial

embedder = SentenceTransformer('bert-base-nli-mean-tokens')

root_path = '/Users/Janjua/Desktop/Projects/Octofying-COVID19-Literature/dataset/CORD-19-research-challenge/'


def test_BERT(queries):
    '''
        This is a CLI based tester for BERT.
        Takes in a list of queries and computes n-closed points.
        The distance is computed based on cosine similarity.
        This is taken from:
            https://github.com/theamrzaki/COVID-19-BERT-ResearchPapers-Semantic-Search#data-links
    '''

    df = pd.read_csv(root_path + "covid_sentences_full.csv", index_col=0)
    with open(root_path + 'sentences_list.pkl', 'rb') as f:
        df_sentences_list = pkl.load(f)
    corpus = df_sentences_list
    with open(root_path + "corpus_embeddings.pkl", "rb") as file_:
        corpus_embeddings = pkl.load(file_)
    query_embeddings = embedder.encode(queries, show_progress_bar=True)
    closest_n = 1
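
    # Hedged sketch: the snippet is truncated here. The tester would
    # typically rank the corpus by cosine distance and print the closest_n
    # hits like this (scipy.spatial and numpy are imported at the top).
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding],
                                                 corpus_embeddings,
                                                 "cosine")[0]
        for idx in np.argsort(distances)[:closest_n]:
            print(query, "->", corpus[idx],
                  "(distance: %.4f)" % distances[idx])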
Example No. 11
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
import argparse
import warnings
warnings.filterwarnings("ignore")

model = SentenceTransformer('distilroberta-base-msmarco-v1')


def create_embedding(input_path, output_path):
    data = pd.read_csv(input_path, header=None)
    sentences = data.iloc[:, 0].tolist()
    sentence_embeddings = model.encode(sentences)
    embedding = pd.DataFrame(sentence_embeddings)
    data2 = pd.concat([data, embedding], axis=1)
    # breakpoint()
    data2.to_csv(output_path, index=False)


if __name__ == '__main__':
    argument_parser = argparse.ArgumentParser(
        description='Embedding creation for queries')
    argument_parser.add_argument('--path_input',
                                 type=str,
                                 default="sample_query.txt",
                                 help='Path Name',
                                 required=False)
    argument_parser.add_argument('--path_output',
                                 type=str,
Example No. 12
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
corpus = [
    "The initial project idea at the start of the course was to build a VR campus for OMSCS students to socialise in. While the objective for this research is to motivate the implementation of a OMSCS VR campus, the scope of this project has since been reduced to from actually implementing it to establishing a decentralised, student led social network for the course.",
    "The second part of my project relates to the research based on social learning in VR. I propose a second mode of giving peer feedback - through a MUVE called Hubs \citep{hubs} (which is VR ready but only requires a the use of a web browser and can even run on mobile). Peer reviews groups would be formed from the working groups, with a small number of random links outside the clusters as well. The emergent network will be evaluated based on 2 groups - those who take part in the new mode of peer review and those who do not. We should be able to model the networks from interactions on the Piazza working groups and deduce whether VR peer review sessions have any effect on emergent leadership, network centrality and strength of network links (trust) to name a few. Quantitative analysis will also be undertaken on pre \& post surveys given to students about their perception of the value of peer feedback, again split by whether or not they participated in the VR sessions.",
    "This framework allows you to fine-tune your own sentence embedding methods, so that you get task-specific sentence embeddings. You have various options to choose from in order to get perfect sentence embeddings for your specific task.",
    "This framework provides an easy method to compute dense vector representations for sentences and paragraphs (also known as sentence embeddings). The models are based on transformer networks like BERT / RoBERTa / XLM-RoBERTa etc. and are tuned specificially meaningul sentence embeddings such that sentences with similar meanings are close in vector space.",
    "Notwithstanding the order provided by the CoI Framework, perhaps the main reason that the framework was widely adopted is the methodological guidelines for measuring each of the presences that constituted a community of inquiry. The first of these presences that required rigorous definition and operational rigor was social presence. Extending the original socio-emotional perspective, social presence is most recently defined as the ability of participants to identify with the community (e.g., course of study), communicate purposefully in a trusting environment, and develop inter-personal relationships by way of projecting their individual personalities",
    "Another scenario needs to be considered when scrutinizing the Teaching Presence construct. Much as the more general construct of Presence in an online learning environment can be explained more in depth by separating out Teaching, Social, and Cognitive subfactors, it may be that the Teaching Presence construct's potential bifurcation reflects a strength, and not necessarily a weakness in the subscale's construction. That is, since this factor represents a greater chunk of the total variance, results may simply be pointing to the Teaching Presence subscale itself having two or more subscales. At this early stage of development of measures to operationalize the CoI framework it is important not to assume that a subscale's multidimensionality is necessarily a weakness. Further studies conducted with larger samples and within other contexts will help clarify this issue."
]

corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = [
    'The second part of my project relates to the research based on social learning in VR. I propose a second mode of giving peer feedback - through a MUVE called Hubs \citep{hubs} (which is VR ready but only requires a the use of a web browser and can even run on mobile). Peer reviews groups would be formed from the working groups, with a small number of random links outside the clusters as well. The emergent network will be evaluated based on 2 groups - those who take part in the new mode of peer review and those who do not. We should be able to model the networks from interactions on the Piazza working groups and deduce whether VR peer review sessions have any effect on emergent leadership, network centrality and strength of network links (trust) to name a few. Quantitative analysis will also be undertaken on pre \& post surveys given to students about their perception of the value of peer feedback, again split by whether or not they participated in the VR sessions.',
    'If all goes well, at the end of the research project we should have ample motivation to continue to build a full scale social VR campus. For CS6460 the motivation for students to use it would be the peer feedback focus groups, but for other courses a similar peer feedback system might need to be implemented or another student motivation found to encourage the use of the platform. In any case, as leaders emerge and the VR campus network grows, I would expect moderators to appear who will be enthusiastic in mediating focus groups for their respective courses. Luckily in Hubs, moderator super privileges are easily controlled a linked through discord server, and the server side of the campus is hosted on AWS, so the capabilities of scaling the VR campus up significantly are already in place.',
    'In order to seed the network, I would like to learn a set of embeddings that represent each students research interests from their research logs. Clustering based on the cosine similarity of these embeddings will create a semantic social network of topics being investigated in the class and within each cluster will be students with a high degree of homophily. I would encourage these clusters to form working groups where they can easily share their research and project updates on Piazza.'
]

# Find the closest top_k sentences of the corpus for each query sentence based on cosine similarity
top_k = 3
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    # We use torch.topk to find the highest top_k scores
    top_results = torch.topk(cos_scores, k=top_k)
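
    # Hedged sketch: the snippet is truncated here; printing the hits
    # typically looks like this.
    print("Query:", query)
    for score, idx in zip(top_results[0], top_results[1]):
        print("(Score: %.4f)" % score, corpus[int(idx)])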
Example No. 13
    def load(self, path):
        model = Transformer(path)
        pooling = Pooling(model.get_word_embedding_dimension())

        return SentenceTransformer(modules=[model, pooling])
try:
    sentence_embeddings = [
        np.array([float(a) for a in l.strip().split()])
        for l in open(FILENAME, "r", encoding="utf-8").readlines()
    ][:SAMPLE]
    sentences = [
        l.strip()
        for l in open(SENT_FILENAME, "r", encoding="utf-8").readlines()
    ][:SAMPLE]
except Exception as e:
    # Fall back to computing embeddings from a sample review when the
    # precomputed files cannot be read.
    print(e)
    texts = "Lester's is located in a beautiful neighborhood and has been there since 1951. They are known for smoked meat which most deli's have but their brisket sandwich is what I come to montreal for. They've got about 12 seats outside to go along with the inside.   The smoked meat is up there in quality and taste with Schwartz's and you'll find less tourists at Lester's as well."
    sentences = sent_tokenize(texts, language="english")
    model = SentenceTransformer("./bert-base-nli-mean-tokens")
    sentence_embeddings = model.encode(sentences)

print("Graph construction...")
distances = kneighbors_graph(sentence_embeddings,
                             n_neighbors=K,
                             mode="distance",
                             include_self=False,
                             n_jobs=6,
                             metric="cosine")
print("Conversion...")
distances = distances.todense()

sentences2knn = {i: [] for i, s in enumerate(sentences)}

for idx, sentence in enumerate(sentences):
Example No. 15
from sentence_transformers import SentenceTransformer
import numpy as np

model_save_path = "../outputs/taobao/"
# Load the fine-tuned Sentence Transformer model (based on DistilBERT) from the local output path
model = SentenceTransformer(model_save_path, device='cuda')

# Sentences we want sentence embeddings for
sentences = ['谁是公司的投资方', '公司几号地铁可以到达', '公司的核心价值观']

sentence_embeddings = model.encode(sentences)

# The result is a list of sentence embeddings as numpy arrays
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
cross_encoder = CrossEncoder(model_name, num_labels=1)

###### Bi-encoder (sentence-transformers) ######
logging.info("Loading bi-encoder model: {}".format(model_name))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name,
                                          max_seq_length=max_seq_length)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

#####################################################
#
# Step 1: Train cross-encoder model with STSbenchmark
#
#####################################################

logging.info(
    "Step 1: Train cross-encoder: {} with STSbenchmark (gold dataset)".format(
        model_name))

gold_samples = []
dev_samples = []
test_samples = []
Example No. 17
def load_my_model():
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    return model
Example No. 18
import re
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import emoji
from laserembeddings import Laser
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


laser = Laser()
indian_sbert = SentenceTransformer('../multilingual-sbert/models/se-asian-sbert')
portuguese_sbert = SentenceTransformer('distiluse-base-multilingual-cased')
english_sbert = SentenceTransformer('bert-base-nli-mean-tokens')

uchr = chr  # Python 3

# Unicode 11.0 Emoji Component map (deemed safe to remove)
_removable_emoji_components = (
    (0x20E3, 0xFE0F),  # combining enclosing keycap, VARIATION SELECTOR-16
    range(0x1F1E6, 0x1F1FF + 1),  # regional indicator symbol letter a..regional indicator symbol letter z
    range(0x1F3FB, 0x1F3FF + 1),  # light skin tone..dark skin tone
    range(0x1F9B0, 0x1F9B3 + 1),  # red-haired..white-haired
    range(0xE0020, 0xE007F + 1),  # tag space..cancel tag
)
emoji_components = re.compile(u'({})'.format(u'|'.join([
    re.escape(uchr(c)) for r in _removable_emoji_components for c in r])),
    flags=re.UNICODE)
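
# Hedged sketch: how the compiled pattern above is typically applied.
def remove_emoji_components(text):
    """Strip removable emoji components (hypothetical helper)."""
    return emoji_components.sub(u'', text)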

Example No. 19
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from datagen import *
import torch
from models import *
import copy

if __name__ == '__main__':
    device = 'cuda'
    eng_sm = SentenceTransformer('cp10-work')
    dataset = SentenceTokenized(eng_sm.tokenizer,
                                'first',
                                language='eng',
                                true_only=True)
    val_set = dataset.train(False)
    embeddings = copy.deepcopy(
        eng_sm._first_module().auto_model.embeddings).to(device)

    model = AEPretrainedEmbedding(dataset.vocab_size, embeddings).to(device)
    model.load_state_dict(torch.load('cp14/099_6.3634e-06.pth'))
    with torch.no_grad():
        for data in val_set:
            # meaning, word = data.texts
            word, meaning, _ = dataset.collate_fn([data])
            memory = eng_sm.encode(word, convert_to_tensor=True).unsqueeze(0)
            out_indexes = [dataset.cls]
            for i in range(6):
                trg_tensor = torch.LongTensor(out_indexes).unsqueeze(1).to(
                    device)
                output = model.embeddings(trg_tensor)
                output = model.transformer_decoder(output, memory)
                output = model.fc(output)
Example No. 20
def main():
    a = get_args()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])
    workdir = os.path.join(a.out_dir, basename(a.in_txt))
    workdir += '-%s' % a.model if 'RN' in a.model.upper() else ''
    os.makedirs(workdir, exist_ok=True)

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)

    norm_in = torchvision.transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711))

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0)
        if a.multilang is True:
            model_lang = SentenceTransformer(
                'clip-ViT-B-32-multilingual-v1').cuda()
            txt_enc0 = model_lang.encode(
                [a.in_txt0], convert_to_tensor=True,
                show_progress_bar=False).detach().clone()
            del model_lang
        else:
            txt_enc0 = model_clip.encode_text(clip.tokenize(
                a.in_txt0).cuda()).detach().clone()

    # make init
    global params_start, params_ema
    params_shape = [1, 3, a.size[0], a.size[1] // 2 + 1, 2]
    params_start = torch.randn(*params_shape).cuda()  # random init
    params_ema = 0.
    if a.resume is not None and os.path.isfile(a.resume):
        if a.verbose is True: print(' resuming from', a.resume)
        params_start = load_params(a.resume).cuda()
        if a.keep > 0:
            params_ema = params_start[0].detach().clone()
    else:
        a.resume = 'init.pt'

    torch.save(params_start, 'init.pt')  # final init
    shutil.copy(a.resume,
                os.path.join(workdir, '000-%s.pt' % basename(a.resume)))

    prev_enc = 0

    def process(txt, num):

        sd = 0.01
        if a.keep > 0: sd = a.keep + (1 - a.keep) * sd
        params, image_f = fft_image([1, 3, *a.size],
                                    resume='init.pt',
                                    sd=sd,
                                    decay_power=a.decay)
        image_f = to_valid_rgb(image_f, colors=a.colors)

        if a.prog is True:
            lr1 = a.lrate * 2
            lr0 = a.lrate * 0.1
        else:
            lr0 = a.lrate
        optimizer = torch.optim.Adam(params, lr0)

        if a.verbose is True: print(' ref text: ', txt)
        if a.translate:
            translator = Translator()
            txt = translator.translate(txt, dest='en').text
            if a.verbose is True: print(' translated to:', txt)
        if a.multilang is True:
            model_lang = SentenceTransformer(
                'clip-ViT-B-32-multilingual-v1').cuda()
            txt_enc = model_lang.encode(
                [txt], convert_to_tensor=True,
                show_progress_bar=False).detach().clone()
            del model_lang
        else:
            txt_enc = model_clip.encode_text(
                clip.tokenize(txt).cuda()).detach().clone()
        if a.notext > 0:
            txt_plot = torch.from_numpy(plot_text(txt, a.modsize) /
                                        255.).unsqueeze(0).permute(0, 3, 1,
                                                                   2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()
        else:
            txt_plot_enc = None

        out_name = '%03d-%s' % (num + 1, txt_clean(txt))
        out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
        tempdir = os.path.join(workdir, out_name)
        os.makedirs(tempdir, exist_ok=True)

        pbar = ProgressBar(a.steps // a.fstep)
        for i in range(a.steps):
            loss = 0

            noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4],
                                          1).cuda() if a.noise > 0 else None
            img_out = image_f(noise)

            imgs_sliced = slice_imgs([img_out],
                                     a.samples,
                                     a.modsize,
                                     norm_in,
                                     a.overscan,
                                     micro=None)
            out_enc = model_clip.encode_image(imgs_sliced[-1])
            loss -= torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
            if a.notext > 0:
                loss += a.notext * torch.cosine_similarity(
                    txt_plot_enc, out_enc, dim=-1).mean()
            if a.diverse != 0:
                imgs_sliced = slice_imgs([image_f(noise)],
                                         a.samples,
                                         a.modsize,
                                         norm_in,
                                         a.overscan,
                                         micro=None)
                out_enc2 = model_clip.encode_image(imgs_sliced[-1])
                loss += a.diverse * torch.cosine_similarity(
                    out_enc, out_enc2, dim=-1).mean()
                del out_enc2
                torch.cuda.empty_cache()
            if a.expand > 0:
                global prev_enc
                if i > 0:
                    loss += a.expand * torch.cosine_similarity(
                        out_enc, prev_enc, dim=-1).mean()
                prev_enc = out_enc.detach().clone()
            if a.in_txt0 is not None:  # subtract text
                loss += torch.cosine_similarity(txt_enc0, out_enc,
                                                dim=-1).mean()
            del img_out, imgs_sliced, out_enc
            torch.cuda.empty_cache()

            if a.prog is True:
                lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
                for g in optimizer.param_groups:
                    g['lr'] = lr_cur

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % a.fstep == 0:
                with torch.no_grad():
                    img = image_f(contrast=a.contrast).cpu().numpy()[0]
                checkout(img,
                         os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)),
                         verbose=a.verbose)
                pbar.upd()
                del img

        if a.keep > 0:
            global params_start, params_ema
            params_ema = ema(params_ema, params[0].detach().clone(), num + 1)
            torch.save((1 - a.keep) * params_start + a.keep * params_ema,
                       'init.pt')

        torch.save(params[0], '%s.pt' % os.path.join(workdir, out_name))
        shutil.copy(
            img_list(tempdir)[-1],
            os.path.join(workdir, '%s-%d.jpg' % (out_name, a.steps)))
        os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' %
                  (tempdir, os.path.join(workdir, out_name)))

    with open(a.in_txt, 'r', encoding="utf-8") as f:
        texts = f.readlines()
        texts = [
            tt.strip() for tt in texts if len(tt.strip()) > 0 and tt[0] != '#'
        ]
    if a.verbose is True:
        print(' total lines:', len(texts))
        print(' samples:', a.samples)

    for i, txt in enumerate(texts):
        process(txt, i)

    vsteps = int(a.length * 25 / len(texts))  # 25 fps
    tempdir = os.path.join(workdir, '_final')
    os.makedirs(tempdir, exist_ok=True)

    def read_pt(file):
        return torch.load(file).cuda()

    if a.verbose is True: print(' rendering complete piece')
    ptfiles = file_list(workdir, 'pt')
    pbar = ProgressBar(vsteps * len(ptfiles))
    for px in range(len(ptfiles)):
        params1 = read_pt(ptfiles[px])
        params2 = read_pt(ptfiles[(px + 1) % len(ptfiles)])

        params, image_f = fft_image([1, 3, *a.size],
                                    resume=params1,
                                    sd=1.,
                                    decay_power=a.decay)
        image_f = to_valid_rgb(image_f, colors=a.colors)

        for i in range(vsteps):
            with torch.no_grad():
                img = image_f(
                    (params2 - params1) *
                    math.sin(1.5708 * i / vsteps)**2)[0].permute(1, 2, 0)
                img = torch.clip(img * 255, 0,
                                 255).cpu().numpy().astype(np.uint8)
            imsave(os.path.join(tempdir, '%05d.jpg' % (px * vsteps + i)), img)
            if a.verbose is True: cvshow(img)
            pbar.upd()

    os.system('ffmpeg -v warning -y -i %s\%%05d.jpg "%s.mp4"' %
              (tempdir, os.path.join(a.out_dir, basename(a.in_txt))))
    if a.keep > 0: os.remove('init.pt')
def main():
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task",
                        default=None,
                        type=str,
                        required=True,
                        help="the task to run bert ranker for")
    parser.add_argument("--data_folder",
                        default=None,
                        type=str,
                        required=True,
                        help="the folder containing data")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="the folder to output predictions")

    # #Training procedure
    parser.add_argument("--num_epochs",
                        default=5,
                        type=int,
                        required=False,
                        help="Number of epochs for training.")
    parser.add_argument("--train_batch_size",
                        default=8,
                        type=int,
                        required=False,
                        help="Training batch size.")
    # #Model hyperparameters
    parser.add_argument("--transformer_model",
                        default="bert-base-cased",
                        type=str,
                        required=False,
                        help="Bert model to use (default = bert-base-cased).")

    args = parser.parse_args()

    word_embedding_model = models.Transformer(args.transformer_model)

    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Creating train CRR dataset.")
    crr_reader = CRRBenchmarkDataReader('{}/{}'.format(args.data_folder,
                                                       args.task))

    train_data = SentencesDataset(crr_reader.get_examples("train.tsv"), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=args.train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Creating dev CRR dataset.")
    dev_data = SentencesDataset(crr_reader.get_examples('valid.tsv'), model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=args.train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    warmup_steps = math.ceil(
        len(train_data) * args.num_epochs / args.train_batch_size *
        0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    logging.info("Fitting sentenceBERT")
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=args.num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=args.output_dir +
              "{}_{}".format(args.transformer_model, args.task))
Example No. 23
    compute_sense_clusters

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

config = yaml.safe_load(open('config.yml', 'r'))
test_sets = config['test_files']
batch_size = config['batch_size']
results_dir = config['results_dir']
eval_dir = config['eval_dir']
loss_type = config['loss_type']

logging.info("Loading Model ...")
model = SentenceTransformer(
    os.path.join(config['saved_model_dir'], config['eval_base']))

embedding_lookup = {}
embedding_lookup_all = {}
if config['eval_strategy'] == 'Centroid' or config['eval_strategy'] == 'MaxSim':
    logging.info("Computing Sense Centroids ...")
    train_file = os.path.join(config['train_dir'], config['train_flat_file'])
    def_sentences_map = get_sense_samples(train_file)
    compute_sense_clusters(def_sentences_map, batch_size, embedding_lookup,
                           embedding_lookup_all, model)

key_file = open(config['keys_file'], 'r', encoding='utf8')
semcor_keys = set()
for line in key_file:
    skey = line.strip()
    semcor_keys.add(skey)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

# Add two trainable feed-forward networks (DAN)
sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension()
dan1 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)

model = SentenceTransformer(
    modules=[word_embedding_model, word_weights, pooling_model, dan1, dan2])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  #10% of train data for warm-up
Example No. 25
"""

from sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
import logging
from scipy import spatial

#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Load Sentence model (based on BERT) from URL
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Embed a list of sentences
sentences = ['I love you', 'I hate you']
sentence_embeddings = model.encode(sentences)
result = 1 - spatial.distance.cosine(sentence_embeddings[0],
                                     sentence_embeddings[1])
print(result)
# The result is a list of sentence embeddings as numpy arrays
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", len(embedding))
    print("")
Example No. 26
    def __init__(self):
        self.model = SentenceTransformer('bert-base-nli-mean-tokens')
Example No. 27
def load_model(use_covidbert=False):
    """Function that loads and returns the CovidBERT model"""

    # # Load CovidBERT
    # if use_covidbert:
    #     print("Loading model...")
    #     model = AutoModelForMaskedLM.from_pretrained("deepset/covid_bert_base")
    #     print("Loading tokenizer...")
    #     tokenizer = AutoTokenizer.from_pretrained("deepset/covid_bert_base")
    #     print("Finished loading the model successfully!")

    #model = SentenceTransformer(model_path)

    # #Load CovidBERT
    # if use_covidbert:
    #     print("Loading model...")
    #     model = AutoModelWithLMHead.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
    #     print("Loading tokenizer...")
    #     print("\n")
    #     tokenizer = AutoTokenizer.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
    #     print("\n")
    #     print("Finished loading the model successfully!")

    #     # Save the model to model path
    #     model_path = os.path.join("models","clinicalcovid")
    #     if not os.path.exists(model_path):
    #         os.makedirs(model_path)
    #     model.save_pretrained(model_path)
    #     tokenizer.save_pretrained(model_path)

    #     model = SentenceTransformer(model_path)

    # Load CovidBERT
    if use_covidbert:
        print("Loading model...")
        model = AutoModelWithLMHead.from_pretrained("gsarti/covidbert-nli")
        print("Loading tokenizer...")
        print("\n")
        tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli")
        print("\n")
        print("Finished loading the model successfully!")

        # Save the model to model path
        model_path = os.path.join("models", "gsarticovid")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        print(f"Successfully saved model to {model_path}")

        print("Loading Sentence Transformer now!")
        word_embedding_model = models.BERT(
            model_path,
            # max_seq_length=args.max_seq_length,
            # do_lower_case=args.do_lower_case
        )
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        rmtree(model_path)
        model.save(model_path)
        print("Finished building Sentence Transformer!")

    # Load regular BERT
    else:
        print("Loading BERT")
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        print("Finished loading BERT")

    return model, tokenizer
Example No. 28
"""
This is a simple application for sentence embeddings: clustering

Sentences are mapped to sentence embeddings and then k-mean clustering is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

embedder = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Corpus with example sentences
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'A man is eating pasta.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey across a field.'
          ]
corpus_embeddings = embedder.encode(corpus)

# Perform kmean clustering
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
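
# Hedged sketch: grouping the corpus sentences by their assigned cluster,
# the usual way this clustering example is inspected.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster", i + 1)
    print(cluster)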
Example No. 29
    def __init__(self, model: EmbeddingModel):
        self.model = SentenceTransformer(model.model_name)
Example No. 30
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument('--filetype',
                        type=str,
                        default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for output. '
                        '"mat" is the matrix format in kaldi')
    parser.add_argument('--compress',
                        action='store_true',
                        help='Save in compressed format')
    parser.add_argument(
        '--compression-method',
        type=int,
        default=2,
        help='Specify the method (if mat) or gzip-level (if hdf5)')

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument('input_file', type=str, help='Input file')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    # Set seed
    set_seed(args)

    args.model_type = args.model_type.lower()

    if args.model_type == 'sbert':
        transformer = SentenceTransformer(args.model_name_or_path)
        examples = read_examples(args.input_file)
        embeddings = transformer.encode([e.text for e in examples])

        with file_writer_helper(
                args.wspecifier,
                filetype=args.filetype,
                compress=args.compress,
                compression_method=args.compression_method) as writer:
            for i in range(len(examples)):
                writer[examples[i].unique_id] = embeddings[i]
    else:
        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            args.model_type]
        config = config_class.from_pretrained(args.model_name_or_path)
        tokenizer = tokenizer_class.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        transformer = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=config)

        transformer.eval()
        transformer.to(args.device)

        with torch.no_grad():
            load_and_convert_examples(args, tokenizer, transformer)

    logger.info('Done converting {} to {}'.format(args.input_file,
                                                  args.wspecifier))