Example #1
def main(model_path, model_type, sentence_corpus, output_path):

    #### Read sentence corpus. Output: list of sentences ####
    sentences = read.read_from_tsv(os.path.join(sentence_corpus, "input.tsv"))
    sentences = [item for row in sentences for item in row]
    print(sentences[:10])

    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        #### load sentence BERT models and generate sentence embeddings ####
    else:
        #### load sentence BERT models and generate sentence embeddings ####
        embedder = SentenceTransformer(model_path)

    sentences_embedding = embedder.encode(sentences)

    read.save_in_pickle(os.path.join(output_path, "embeddings.pkl"),
                        sentences_embedding)
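If the saved embeddings need to be read back later, a minimal sketch looks like this; it assumes read.save_in_pickle is a thin wrapper around the standard pickle module (the helper itself is not shown here), and the output directory name is hypothetical.

import os
import pickle

# Reload the embeddings written by main() above (assumes standard pickle format).
with open(os.path.join("output", "embeddings.pkl"), "rb") as f:
    sentences_embedding = pickle.load(f)
print(len(sentences_embedding), "sentence vectors loaded")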
Example #2
def main(model_path, model_type, extra_dataset):
    # Read the dataset
    train_batch_size = 64
    num_epochs = 20
    triplet_margin = 5  # margin used by the TripletLoss below
    model_save_path = model_path + '_continue_training_' + \
        datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    n2c2_reader = TripletReader(extra_dataset)

    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        #### load sentence BERT models and generate sentence embeddings ####
    else:
        #### load sentence BERT models and generate sentence embeddings ####
        embedder = SentenceTransformer(model_path)

    # Load a pre-trained sentence transformer model
    model = SentenceTransformer(model_path)

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read extra training dataset")
    train_data = SentencesDataset(n2c2_reader.get_examples('train.tsv'), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model, triplet_margin=triplet_margin)

    logging.info("Read development dataset")
    dev_data = SentencesDataset(examples=n2c2_reader.get_examples('dev.tsv'),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    # Configure the training. We skip evaluation in this example
    warmup_steps = math.ceil(
        len(train_data) * num_epochs / train_batch_size *
        0.1)  # 10% of the total training steps for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=math.ceil(len(train_data) / train_batch_size),
              warmup_steps=warmup_steps,
              output_path=model_save_path)
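To make the warm-up arithmetic concrete, here is a small sketch with hypothetical sizes (8,000 training triplets, with the batch size and epoch count used above):

import math

# Hypothetical dataset size, purely to illustrate the warm-up formula above.
train_examples, num_epochs, train_batch_size = 8000, 20, 64
warmup_steps = math.ceil(train_examples * num_epochs / train_batch_size * 0.1)
print(warmup_steps)  # 250, i.e. 10% of the 2,500 total optimizer steps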
Example #3
    def calc(self, text1, text2):
        transformer = models.BERT('cl-tohoku/bert-base-japanese-whole-word-masking')
        pooling = models.Pooling(transformer.get_word_embedding_dimension(), pooling_mode_mean_tokens=True, pooling_mode_cls_token=False, pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[transformer, pooling])

        sentences = [text1, text2]
        embeddings = model.encode(sentences)

        return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
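Because calc rebuilds the tokenizer, BERT weights and pooling layer on every call, it is usually worth constructing the SentenceTransformer once and reusing it. A minimal sketch under that assumption (the class name is hypothetical):

from sentence_transformers import SentenceTransformer, models
from sklearn.metrics.pairwise import cosine_similarity


class SentenceSimilarity:
    """Hypothetical variant of the method above that builds the model once."""

    def __init__(self):
        transformer = models.BERT('cl-tohoku/bert-base-japanese-whole-word-masking')
        pooling = models.Pooling(transformer.get_word_embedding_dimension(),
                                 pooling_mode_mean_tokens=True,
                                 pooling_mode_cls_token=False,
                                 pooling_mode_max_tokens=False)
        self.model = SentenceTransformer(modules=[transformer, pooling])

    def calc(self, text1, text2):
        # Encode both texts with the cached model and compare them.
        embeddings = self.model.encode([text1, text2])
        return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]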
Example #4
def main():
    parser = set_parser()
    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    logger.info("arguments are parsed")
    args.world_size = args.gpus * args.nodes

    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)
    # Use BERT for mapping tokens to embeddings
    logger.warning("Loading Bert Model")
    word_embedding_model = models.BERT('bert-base-uncased', max_seq_length=510)
    logger.warning("Model is loaded")
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    if args.use_tpu:
        logger.warning("TPU training")
        device = xm.xla_device()
        args.n_gpu = 1
    elif args.local_rank == -1:
        logger.warning("Non dist training")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        logger.warning("Dist training local rank %s", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", timeout=datetime.timedelta(hours=10))
        args.n_gpu = 1
    args.device = device

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = losses.CosineSimilarityLoss(model=model)
    model.to(args.device)
    train_loss.to(args.device)

    # Training
    if args.do_train:
        logger.warning("Read Patent Training dataset")
        train_data = load_and_cache_examples(args, patent_reader, model)
        logger.warning("Training dataset is loaded")
        # train_data = SentencesDataset(patent_reader.get_examples('train.tsv', max_examples=17714), model)
        tr_loss = train(args, train_data, model, train_loss)
        logger.info(" average loss = %s", tr_loss)
def load_model(path):
    checkpoint_files = os.listdir(path)
    if 'pytorch_model.bin' in checkpoint_files:
        word_embedding_model = models.BERT(path)
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)
        return SentenceTransformer(modules=[word_embedding_model, pooling_model])
    return SentenceTransformer(path)
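A brief usage sketch for load_model; the checkpoint directory below is hypothetical:

# load_model() picks the right constructor depending on whether the directory
# holds a raw BERT checkpoint or an already-saved SentenceTransformer.
model = load_model("checkpoints/sbert-model")
embeddings = model.encode(["first sentence", "second sentence"])
print(len(embeddings), embeddings[0].shape)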
Example #6
 def __init__(self, base, alpha=0.21):
     super().__init__()
     self.base = base
     model_path = "models/covidbert-nli"
     # for downloading/loading the model check:
     #    https://github.com/gsarti/covid-papers-browser
     #        /blob/master/scripts/download_model.py
     if (path.exists(model_path) and path.isdir(model_path)):
         word_embedding_model = models.BERT(model_path,
                                            max_seq_length=128,
                                            do_lower_case=True)
         pooling_model = models.Pooling(
                             word_embedding_model\
                                 .get_word_embedding_dimension(),
                             pooling_mode_mean_tokens=True,
                             pooling_mode_cls_token=False,
                             pooling_mode_max_tokens=False
                             )
         model = SentenceTransformer(
             modules=[word_embedding_model, pooling_model])
         self.model = model
     else:
         print("Installing bert model...")
         tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli")
         model = AutoModel.from_pretrained("gsarti/covidbert-nli")
         model.save_pretrained(model_path)
         tokenizer.save_pretrained(model_path)
         # Build the SentenceTransformer directly
         word_embedding_model = models.BERT(model_path,
                                            max_seq_length=128,
                                            do_lower_case=True)
         pooling_model = models.Pooling(
                             word_embedding_model\
                                 .get_word_embedding_dimension(),
                             pooling_mode_mean_tokens=True,
                             pooling_mode_cls_token=False,
                             pooling_mode_max_tokens=False
                             )
         model = SentenceTransformer(
             modules=[word_embedding_model, pooling_model])
         self.model = model
         print("Bert model is installed")
     self.alpha = alpha
Example #7
    def GetDevEmbedding(self, test=False):
        """Only used by WriteDPRCP."""
        if test:
            primary_texts = self.test_primary_texts
            if not self.symmetric:
                secondary_texts = self.test_secondary_texts
        else:
            primary_texts = self.dev_primary_texts
            if not self.symmetric:
                secondary_texts = self.dev_secondary_texts

        if self.world.tag == 'inat' or self.world.tag == 'celeba':
            image_embedder = ImageEmbedder(self.world.tag, None)
            image_embedder.init_model()
            logging.info(
                "Getting {} embedding".format('test' if test else 'dev'))
            logging.info("Primary:")
            primary_embs = image_embedder.embed(primary_texts)
            if not self.symmetric:
                logging.info("Secondary:")
                secondary_embs = image_embedder.embed(secondary_texts)
        else:
            word_embedding_model = models.BERT('bert-base-uncased')
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension(),
                pooling_mode_mean_tokens=True,
                pooling_mode_cls_token=False,
                pooling_mode_max_tokens=False)
            model = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])

            logging.info(
                "Getting {} embedding".format('test' if test else 'dev'))
            logging.info("Primary:")
            primary_embs = np.array(model.encode(primary_texts))
            if not self.symmetric:
                logging.info("Secondary:")
                secondary_embs = np.array(model.encode(secondary_texts))

        # Normalize
        for i in range(primary_embs.shape[0]):
            primary_embs[i, :] /= np.linalg.norm(primary_embs[i, :])

        if not self.symmetric:
            for i in range(secondary_embs.shape[0]):
                secondary_embs[i, :] /= np.linalg.norm(secondary_embs[i, :])

        if test:
            self.test_primary_embeddings = primary_embs
            if not self.symmetric:
                self.test_secondary_embeddings = secondary_embs
        else:
            self.dev_primary_embeddings = primary_embs
            if not self.symmetric:
                self.dev_secondary_embeddings = secondary_embs
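The per-row normalization loops above can also be written in vectorized NumPy form; a small stand-alone sketch:

import numpy as np

# Equivalent, vectorized form of the row-wise L2 normalization above.
embs = np.random.rand(4, 768)  # stand-in for primary_embs / secondary_embs
embs /= np.linalg.norm(embs, axis=1, keepdims=True)
print(np.linalg.norm(embs, axis=1))  # all ones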
Example #8
 def get_model(self):
     if self.model_type == 'electra':
         return ELECTRA(self.model_name,
                        max_seq_length=self.max_seq_length,
                        do_lower_case=self.do_lower_case)
     elif self.model_type == 'bert':
         return models.BERT(self.model_name,
                            max_seq_length=self.max_seq_length,
                            do_lower_case=self.do_lower_case)
     else:
         raise AttributeError("Not supported")
Example #9
def get_sbert():
    global model
    if model is None:
        transformer = models.BERT(
            'cl-tohoku/bert-base-japanese-whole-word-masking')
        pooling = models.Pooling(transformer.get_word_embedding_dimension(),
                                 pooling_mode_mean_tokens=True,
                                 pooling_mode_cls_token=False,
                                 pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[transformer, pooling])
    return model
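get_sbert relies on a module-level cache; a minimal usage sketch, assuming `model = None` is declared at module level above it:

# The first call builds the Japanese Sentence-BERT model, later calls reuse it.
sbert = get_sbert()
embeddings = sbert.encode(["今日は良い天気です", "明日は雨が降りそうです"])
print(len(embeddings), len(embeddings[0]))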
Example #10
def load_sentence_transformer(
        name: str = 'gsarti/scibert-nli',
        max_seq_length: int = 128,
        do_lower_case: bool = True) -> SentenceTransformer:
    """ Loads a SentenceTransformer from HuggingFace AutoModel bestiary """
    word_embedding_model = models.BERT(name,
                                       max_seq_length=max_seq_length,
                                       do_lower_case=do_lower_case)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])
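A brief usage sketch for load_sentence_transformer; the checkpoint name below is just another NLI-tuned BERT used elsewhere in these examples:

# Any BERT-style checkpoint name can be passed; the remaining arguments are forwarded.
model = load_sentence_transformer('gsarti/biobert-nli', max_seq_length=128, do_lower_case=True)
embeddings = model.encode(["The protein binds to the receptor."])
print(len(embeddings[0]))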
 def __post_init__(self):
     word_embedding_model = models.BERT(
         'gsarti/biobert-nli',
         max_seq_length=128,
         do_lower_case=True
     )
     # apply pooling to get one fixed vector
     pooling_model = models.Pooling(
         word_embedding_model.get_word_embedding_dimension(),
         pooling_mode_mean_tokens=True,
         pooling_mode_cls_token=False,
         pooling_mode_max_tokens=False
     )
 
     self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
Example #12
def train_sbert(model_name, model_save_path):
    batch_size = 16
    nli_reader, sts_reader = load_dataset()
    train_num_labels = nli_reader.get_num_labels()
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # Configure the training
    num_epochs = 1

    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of the training steps for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))
    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=1000,
            warmup_steps=warmup_steps,
            output_path=model_save_path
            )

    model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

    model.evaluate(evaluator)
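A usage sketch for train_sbert, with a hypothetical output directory; load_dataset() is assumed to be the project-local helper that returns the NLI and STS readers:

# Fine-tune on AllNLI, evaluate on STSbenchmark, and write the model to disk.
train_sbert('bert-base-uncased', 'output/sbert-nli-sts')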
Example #13
 def __init__(self,
              model_dir: str,
              vocab: Optional[pd.DataFrame] = None) -> None:
     word_embedding_model = models.BERT(model_dir)
     pooling_model = models.Pooling(
         word_embedding_model.get_word_embedding_dimension(),
         pooling_mode_mean_tokens=False,
         pooling_mode_cls_token=True,
         pooling_mode_max_tokens=False)
     self.encoder = SentenceTransformer(
         modules=[word_embedding_model, pooling_model])
     if vocab is not None:
         self.vocab2index(vocab)
     else:
         self.codes = []
         self.concept_names = []
         self.tree_index = None
    def test_bert_wkpooling(self):
        word_embedding_model = models.BERT(
            'bert-base-uncased', model_args={'output_hidden_states': True})
        pooling_model = models.WKPooling(
            word_embedding_model.get_word_embedding_dimension())
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        scores = [
            0.6906377742193329, 0.9910573945907297, 0.8395676755959804,
            0.7569234597143, 0.8324509121875274
        ]

        for sentences, score in zip(WKPoolingTest.sentence_pairs, scores):
            embedding = model.encode(sentences, convert_to_numpy=True)

            similarity = 1 - scipy.spatial.distance.cosine(
                embedding[0], embedding[1])
            assert abs(similarity - score) < 0.01
Example #15
    def initialize(self):
        if self.model == 'USE':
            encoder = hub.load(
                "https://tfhub.dev/google/universal-sentence-encoder/2")

        elif self.model == 'scibert_scivocab_uncased':
            # provide the path to the downloaded SciBERT model
            word_embedding_model = models.BERT(
                './../rev_sig/codes/models/scibert_scivocab_uncased/')
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension(),
                pooling_mode_mean_tokens=True,
                pooling_mode_cls_token=False,
                pooling_mode_max_tokens=False)
            encoder = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])

        else:
            encoder = SentenceTransformer('bert-base-nli-mean-tokens')
        return encoder
Example #16
def main(model_path, model_type, sentence_corpus, query):
    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        #### load sentence BERT models and generate sentence embeddings ####
    else:
        #### load sentence BERT models and generate sentence embeddings ####
        embedder = SentenceTransformer(model_path)
    corpus_embeddings = read.read_from_pickle(
        os.path.join(sentence_corpus, "embeddings.pkl"))
    corpus = read.read_from_tsv(os.path.join(sentence_corpus, "input.tsv"))
    sentences = [item for row in corpus for item in row]

    query_embedding = embedder.encode([query])

    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 5

    distances = scipy.spatial.distance.cdist(query_embedding,
                                             corpus_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for idx, distance in results[0:closest_n]:
        print(sentences[idx].strip(), "(Score: %.4f)" % (1 - distance))
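Since cosine similarity is just one minus the cosine distance used above, the same top-5 ranking can be computed with scikit-learn; a self-contained sketch with random stand-in vectors:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Stand-in data: 100 corpus embeddings and one query embedding of dimension 768.
corpus_embeddings = np.random.rand(100, 768)
query_embedding = np.random.rand(1, 768)

similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]
top5 = np.argsort(-similarities)[:5]  # indices of the five most similar sentences
for idx in top5:
    print(idx, "(Score: %.4f)" % similarities[idx])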
Example #17
    def GetInitialEmbedding(self, encode_batch_size):
        if self.world.tag == 'inat' or self.world.tag == 'celeba':
            image_embedder = ImageEmbedder(self.world.tag, None)
            image_embedder.init_model()
            logging.info("Getting initial embedding")
            logging.info("Primary:")
            self.train_primary_embeddings = image_embedder.embed(
                self.train_primary_texts)
            if not self.symmetric:
                logging.info("Secondary:")
                self.train_secondary_embeddings = image_embedder.embed(
                    self.train_secondary_texts)
        else:
            word_embedding_model = models.BERT('bert-base-uncased')
            pooling_model = models.Pooling(
                word_embedding_model.get_word_embedding_dimension(),
                pooling_mode_mean_tokens=True,
                pooling_mode_cls_token=False,
                pooling_mode_max_tokens=False)
            model = SentenceTransformer(
                modules=[word_embedding_model, pooling_model])

            logging.info("Getting initial embedding")
            logging.info("Primary:")
            self.train_primary_embeddings = np.array(
                model.encode(self.train_primary_texts,
                             batch_size=encode_batch_size))
            if not self.symmetric:
                logging.info("Secondary:")
                self.train_secondary_embeddings = np.array(
                    model.encode(self.train_secondary_texts,
                                 batch_size=encode_batch_size))

        self.NormalizeEmbeddings()

        return
def main():
    parser = set_parser()
    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    logger.info("arguments are parsed")
    args.world_size = args.gpus * args.nodes

    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-cased', max_seq_length=510)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        logger.warning("Non dist training")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        logger.warning("Dist training local rank %s", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", timeout=datetime.timedelta(hours=10))
        args.n_gpu = 1
    args.device = device

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = losses.CosineSimilarityLoss(model=model)
    model.to(args.device)
    train_loss.to(args.device)

    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        logger.warning("Read Patent Training dataset")
        train_data = load_and_cache_examples(args, patent_reader, model)
        if args.eval_during_train:
            logging.info("Read STSbenchmark dev dataset")
            dev_data = load_and_cache_examples(args,
                                               patent_reader,
                                               model,
                                               evaluate=True)

        else:
            dev_data = None

        # train_data = SentencesDataset(patent_reader.get_examples('train.tsv', max_examples=17714), model)
        tr_loss = train(args,
                        train_data,
                        model,
                        train_loss,
                        dev_dataset=dev_data)
        logger.info(" average loss = %s", tr_loss)
Example #19
# model.save_pretrained('models/')   # not sure if this is the right way to save tuned models or the right use of the path
# tokenizer.save_pretrained('models/')

# select one Transformer
model_name = 'allenai/scibert_scivocab_uncased'  # same note: not sure I'm calling the model correctly here

# Read the dataset
batch_size = 16
nli_reader = NLIDataReader('../datasets/AllNLI')
sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')
train_num_labels = nli_reader.get_num_labels()
model_save_path = 'models/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


# Use sciBERT model for mapping tokens to embeddings
word_embedding_model = models.BERT(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)
Example #20
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset

nli_reader = ChineseDataReader(args.data_dir)
train_num_labels = nli_reader.get_num_labels()
model_save_path = args.output_dir

if args.do_train:



    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT(args.model_name_or_path)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)


    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


    # Convert the dataset to a DataLoader ready for training
    logging.info("Read train dataset")
    train_data = SentencesDataset(nli_reader.get_train_examples(args.data_dir), model=model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
def is_correct(meddra_code, candidates, topk=1):
    for candidate in candidates[:topk]:
        if check_label(candidate, meddra_code): return 1
    return 0


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--model_dir')
    parser.add_argument('--data_folder')
    parser.add_argument('--vocab')
    parser.add_argument('--k', type=int, default=5)
    args = parser.parse_args()

    word_embedding_model = models.BERT(args.model_dir)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=False,
        pooling_mode_cls_token=True,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    ################
    entities = read_dataset(args.data_folder)
    ################
    entity_texts = [e['entity_text'].lower() for e in entities]
    labels = [e['label'] for e in entities]
    ##################
    vocab = read_vocab(args.vocab)
    codes = vocab.label.values
Example #22
def get_vector(record_texts_dict: dict,
               embedding_size,
               record_num: int,
               device,
               save_path='',
               s_text_emb_method='glove'):
    record_embeddings = None
    if s_text_emb_method == 'glove':
        print('----glove vector----')
        stop_word_list = get_stop_word(
            stop_word_path='./resource/stop_words.txt')
        glove_dict = get_glove_dict(
            glove_dict_path='./resource/glove/glove.6B.' +
            str(embedding_size) + 'd.txt')
        record_embeddings = np.zeros((record_num, embedding_size))
        t_count = 0
        # print(item_num)
        for i in tqdm(range(record_num)):
            item_emb = np.zeros(embedding_size)
            try:
                word_str = str(record_texts_dict[i])
                word_list = word_str.split(" ")
                # print(word_list)
                t_div = 1
                for word in word_list:
                    if word not in stop_word_list:
                        try:
                            word_glove_vector = glove_dict[word]
                            item_emb = item_emb + word_glove_vector
                        except KeyError:
                            continue
                        t_div += 1
                    else:
                        continue
                # print(t_div, item_emb, item_emb / t_div)
                record_embeddings[i] = item_emb / t_div  # average the word vectors
                t_count += 1
            except KeyError:
                continue
    elif s_text_emb_method == 'sbert':
        print('----sentence-bert vector----')
        # Sentence-BERT:
        # Sentence Embeddings using Siamese BERT-Networks https://arxiv.org/abs/1908.10084
        # https://github.com/UKPLab/sentence-transformers
        # google/bert_uncased_L-2_H-128_A-2(BERT-Tiny)
        # google/bert_uncased_L-12_H-256_A-4(BERT-Mini)
        # google/bert_uncased_L-4_H-512_A-8(BERT-Small)
        # google/bert_uncased_L-8_H-512_A-8(BERT-Medium)
        # google/bert_uncased_L-12_H-768_A-12(BERT-Base)
        word_embedding_model = models.BERT(
            'google/bert_uncased_L-12_H-256_A-4', max_seq_length=510)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        bert_model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model], device=device)
        one_req_num = 500
        record_list = list(record_texts_dict.values())
        req_times = int(math.ceil(len(record_list) / one_req_num))
        for ii in tqdm(range(req_times)):
            if ii == 0:
                record_embeddings = bert_model.encode(
                    record_list[ii * one_req_num:(ii + 1) * one_req_num])
            elif ii < req_times - 1:
                record_embeddings = np.vstack(
                    (record_embeddings,
                     bert_model.encode(record_list[ii * one_req_num:(ii + 1) *
                                                   one_req_num])))
            else:
                record_embeddings = np.vstack(
                    (record_embeddings,
                     bert_model.encode(record_list[ii * one_req_num:])))
    else:
        print('Do not support', s_text_emb_method, 'text embedding method.')
    if save_path != '':
        np.save(save_path, record_embeddings)
    return record_embeddings
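A usage sketch for the 'sbert' branch of get_vector (the 'glove' branch additionally needs the project-local stop-word and GloVe files); the records are hypothetical:

# embedding_size is only consumed by the 'glove' branch; the sbert branch uses
# the 256-dimensional checkpoint hard-coded above.
records = {0: "heart rate was elevated", 1: "patient denies chest pain", 2: "no acute distress"}
embs = get_vector(records, embedding_size=256, record_num=len(records),
                  device='cpu', s_text_emb_method='sbert')
print(len(embs), len(embs[0]))  # 3 vectors of dimension 256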
# In[2]:

# Read the dataset
batch_size = 16
nli_reader = NLIDataReader('datasets/AllNLI')
sts_reader = STSDataReader('datasets/stsbenchmark')
train_num_labels = nli_reader.get_num_labels()
model_save_path = 'output/training_nli_bert-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

# In[3]:

# Use BERT for mapping tokens to embeddings
# Using manually downloaded model data:
word_embedding_model = models.BERT('../models/bert-base-multilingual-cased/')
# Or you can let the library handle the downloading and caching for you:
# word_embedding_model = models.BERT('bert-base-multilingual-cased')

# In[4]:


def children(m):
    return m if isinstance(m, (list, tuple)) else list(m.children())


def set_trainable_attr(m, b):
    m.trainable = b
    for p in m.parameters():
        p.requires_grad = b
Example #24
    index=2)

'#### Selected model:', model_name
EMBEDDINGS_PATH = f'{model_name}-embeddings.pkl'

path = os.path.join(MODELS_DIR, model_name)
if not os.path.exists(path):
    os.makedirs(path)

tokenizer = AutoTokenizer.from_pretrained(MODELS[model_name])
model = AutoModel.from_pretrained(MODELS[model_name])
model.save_pretrained(path)
tokenizer.save_pretrained(path)

word_embedding_model = models.BERT(path,
                                   max_seq_length=512,
                                   do_lower_case=True)

pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
rmtree(path)
model.save(path)
print(f'Model {model_name} available in {path}')

# Displaying data

Example #25
        "--output_dir",  
        type=str,
        help="Directory where the models are saved."
    )    
    args = parser.parse_args()
    path = os.path.join(args.output_dir, MODELS_PATH, args.model)
    if not os.path.exists(path):
        os.makedirs(path)
    if args.model not in list(MODELS_PRETRAINED) + list(MODELS_FINETUNED):
        raise AttributeError("Model should be selected in the list: " + 
            ", ".join(list(MODELS_PRETRAINED) + list(MODELS_FINETUNED))
        )
    tokenizer = AutoTokenizer.from_pretrained(MODELS[args.model])
    model = AutoModel.from_pretrained(MODELS[args.model])
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)
    if args.model in MODELS_FINETUNED.keys(): # Build the SentenceTransformer directly
        word_embedding_model = models.BERT(
            path,
            max_seq_length=args.max_seq_length,
            do_lower_case=args.do_lower_case
        )
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                       pooling_mode_mean_tokens=True,
                                       pooling_mode_cls_token=False,
                                       pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        rmtree(path)
        model.save(path)
    print(f'Model {args.model} available in', path)
Example #26
    args = parser.parse_args()
    path = os.path.join(MODELS_PATH, args.model)
    if not os.path.exists(path):
        os.makedirs(path)
    if args.model == 'scibert':  # Used to fine-tune SciBERT from default embeddings
        tokenizer = AutoTokenizer.from_pretrained(
            "allenai/scibert_scivocab_cased")
        model = AutoModel.from_pretrained("allenai/scibert_scivocab_cased")
        model.save_pretrained(path)
        tokenizer.save_pretrained(path)
        print('SciBERT Transformer model available in', path)
    elif args.model == 'scibert-nli':  # Already-trained SciBERT
        tokenizer = AutoTokenizer.from_pretrained("gsarti/scibert-nli")
        model = AutoModel.from_pretrained("gsarti/scibert-nli")
        model.save_pretrained(path)
        tokenizer.save_pretrained(path)
        word_embedding_model = models.BERT(path)
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        rmtree(path)
        model.save(path)
        print('SciBERT SentenceTransformer model available in', path)
    else:
        raise AttributeError("Model should be selected in the list: " +
                             ", ".join(MODELS))
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
batch_size = 16
nli_reader = NLIDataReader('datasets/AllNLI')
sts_reader = STSDataReader('datasets/stsbenchmark')
train_num_labels = nli_reader.get_num_labels()
model_save_path = 'output/training_nli_bert-' + datetime.now().strftime(
    "%Y-%m-%d_%H:%M:%S")

# Use BERT for mapping tokens to embeddings
word_embedding_model = models.BERT('bert-base-uncased')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(
Example #28
def load_model(use_covidbert=False):
    """Function that loads and returns the CovidBERT model"""

    # # Load CovidBERT
    # if use_covidbert:
    #     print("Loading model...")
    #     model = AutoModelForMaskedLM.from_pretrained("deepset/covid_bert_base")
    #     print("Loading tokenizer...")
    #     tokenizer = AutoTokenizer.from_pretrained("deepset/covid_bert_base")
    #     print("Finished loading the model successfully!")

    #model = SentenceTransformer(model_path)

    # #Load CovidBERT
    # if use_covidbert:
    #     print("Loading model...")
    #     model = AutoModelWithLMHead.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
    #     print("Loading tokenizer...")
    #     print("\n")
    #     tokenizer = AutoTokenizer.from_pretrained("manueltonneau/clinicalcovid-bert-nli")
    #     print("\n")
    #     print("Finished loading the model successfully!")

    #     # Save the model to model path
    #     model_path = os.path.join("models","clinicalcovid")
    #     if not os.path.exists(model_path):
    #         os.makedirs(model_path)
    #     model.save_pretrained(model_path)
    #     tokenizer.save_pretrained(model_path)

    #     model = SentenceTransformer(model_path)

    # Load CovidBERT
    if use_covidbert:
        print("Loading model...")
        model = AutoModelWithLMHead.from_pretrained("gsarti/covidbert-nli")
        print("Loading tokenizer...")
        print("\n")
        tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli")
        print("\n")
        print("Finished loading the model successfully!")

        # Save the model to model path
        model_path = os.path.join("models", "gsarticovid")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        print(f"Successfully saved model to {model_path}")

        print("Loading Sentence Transformer now!")
        word_embedding_model = models.BERT(
            model_path,
            # max_seq_length=args.max_seq_length,
            # do_lower_case=args.do_lower_case
        )
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        rmtree(model_path)
        model.save(model_path)
        print("Finished building Sentence Transformer!")

    # Load regular BERT
    else:
        print("Loading BERT")
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        print("Finished loading BERT")

    return model, tokenizer
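A brief usage sketch for load_model above:

# Plain BERT path: returns (BertModel, BertTokenizer).
model, tokenizer = load_model(use_covidbert=False)

# CovidBERT path: downloads gsarti/covidbert-nli, converts it to a
# SentenceTransformer and caches it under models/gsarticovid.
# model, tokenizer = load_model(use_covidbert=True)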
Example #29
def train(triplet_data_dir, output):
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    ### Create a torch.DataLoader that passes training batch instances to our model
    train_batch_size = 16
    triplet_reader = TripletReader(triplet_data_dir,
                                   s1_col_idx=1,
                                   s2_col_idx=2,
                                   s3_col_idx=3,
                                   delimiter=',',
                                   quoting=csv.QUOTE_MINIMAL,
                                   has_header=True)
    # output_path = "output/bert-base-wikipedia-sections-mean-tokens-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_path = output + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    num_epochs = 1

    ### Configure sentence transformers for training and train on the provided dataset
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-uncased')

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Read Triplet train dataset")
    train_data = SentencesDataset(examples=triplet_reader.get_examples(
        'train.csv', 2000000),
                                  model=model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model)

    logging.info("Read Wikipedia Triplet dev dataset")
    dev_data = SentencesDataset(examples=triplet_reader.get_examples(
        'validation.csv', 10000),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    warmup_steps = int(len(train_data) * num_epochs / train_batch_size *
                       0.1)  # 10% of the training steps for warm-up

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=output_path)

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on STS benchmark dataset
    #
    ##############################################################################

    model = SentenceTransformer(output_path)
    test_data = SentencesDataset(
        examples=triplet_reader.get_examples('test.csv'), model=model)
    test_dataloader = DataLoader(test_data,
                                 shuffle=False,
                                 batch_size=train_batch_size)
    evaluator = TripletEvaluator(test_dataloader)

    model.evaluate(evaluator)
Example #30
#!/usr/bin/env python3
# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead
from sentence_transformers import SentenceTransformer
from sentence_transformers import models
import sys

PATH = sys.argv[1]
print('Saving model to %s' % PATH)

tokenizer = AutoTokenizer.from_pretrained("gsarti/scibert-nli")
model = AutoModelWithLMHead.from_pretrained("gsarti/scibert-nli")
model.save_pretrained(PATH)
tokenizer.save_pretrained(PATH)

embedding = models.BERT(PATH, max_seq_length=128, do_lower_case=True)
pooling_model = models.Pooling(embedding.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[embedding, pooling_model])
model.save(PATH)
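Once exported, the directory can be loaded straight back as a SentenceTransformer; a minimal sketch reusing the same PATH:

# Reload the exported model and encode a couple of sentences.
reloaded = SentenceTransformer(PATH)
embeddings = reloaded.encode(["Protein folding is hard.", "Sentence embeddings help search."])
print(len(embeddings), embeddings[0].shape)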