Example #1
import logging
from datetime import datetime

from sentence_transformers import (LoggingHandler, SentencesDataset,
                                   SentenceTransformer, models)
from sentence_transformers.readers import STSBenchmarkDataReader

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
batch_size = 32
sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')
model_save_path = ('output/training_stsbenchmark_bilstm-' +
                   datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

# Map tokens to traditional word embeddings like GloVe
word_embedding_model = models.WordEmbeddings.from_text_file(
    'glove.6B.300d.txt.gz')

lstm = models.LSTM(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
    hidden_dim=1024)

# Apply max pooling to get one fixed-sized sentence vector
pooling_model = models.Pooling(lstm.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=False,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=True)

model = SentenceTransformer(
    modules=[word_embedding_model, lstm, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                              model=model)
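
The snippet cuts off before the training loop. A minimal sketch of how it typically continues under the classic sentence-transformers API follows; the dev split, epoch count, and warmup steps are illustrative assumptions, and the DataLoader-based EmbeddingSimilarityEvaluator constructor matches the older API that the SentencesDataset/STSBenchmarkDataReader usage above implies:

from torch.utils.data import DataLoader
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Wrap the training data and pair it with a cosine-similarity regression loss
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluate on the dev split during training
logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Train the BiLSTM model and save checkpoints to model_save_path
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=10,
          warmup_steps=100,
          output_path=model_save_path)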
Example #2
import logging
import os

from sentence_transformers import (SentencesDataset, SentenceTransformer,
                                   models)
from sentence_transformers.readers import STSBenchmarkDataReader

# `args` comes from an argparse parser defined earlier in the original script
# (truncated here); only the attributes used below are assumed.
if not os.path.exists(args.ckpt_path):
    os.mkdir(args.ckpt_path)

# Read the dataset
sts_reader = STSBenchmarkDataReader(args.data_path, normalize_scores=True)

# Map tokens to embeddings with PhoBERT (models.PhoBERT is provided by a
# Vietnamese fork of sentence-transformers, not the upstream package)
word_embedding_model = models.PhoBERT(
    args.pre_trained_path,
    tokenizer_args={'vncorenlp_path': args.vncorenlp_path,
                    'bpe_path': args.bpe_path})

lstm = models.LSTM(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
    hidden_dim=384,
    num_layers=1)

# Apply mean pooling to get one fixed-sized sentence vector; pooling operates
# on the LSTM output, so use the LSTM's output dimension here
pooling_model = models.Pooling(lstm.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(
    modules=[word_embedding_model, lstm, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'),
                              model)
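
This example also stops before training. A minimal sketch of the usual continuation, again under the classic API: losses.CosineSimilarityLoss and model.fit are the standard calls, args.ckpt_path is taken from the snippet, and the batch size, epoch count, and warmup steps are illustrative assumptions:

from torch.utils.data import DataLoader
from sentence_transformers import losses

# Wrap the training data and pair it with a cosine-similarity regression loss
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model=model)

# Train the PhoBERT + BiLSTM model and save checkpoints to the given path
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=4,
          warmup_steps=100,
          output_path=args.ckpt_path)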