def initialize():
    # Print debug information to stdout
    np.set_printoptions(threshold=100)
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    #parser.add_argument("--data_dir", default=None, type=str, required=True,
    #                    help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_dir", default=None, type=str, required=True, help="The model dir")
    parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output dir")
    parser.add_argument("--version", default=None, type=str, required=True, help="Version tag used in the output file name")
    parser.add_argument("--source_file", default=None, type=str, required=True, help="Path to the input data.")
    args = parser.parse_args()

    # Print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    model = SentenceTransformer(args.model_dir)

    '''
    pno2abstract = pickle.load(open('{}/pno2abstract.dict'.format(args.data_dir), 'rb'))
    # model._first_module().max_seq_length = 256
    pnos = []
    texts = []
    items = list(pno2abstract.items())
    for pno, text in items:
        pnos.append(pno)
        texts.append(text)
    embeddings = model.encode(texts)
    dct = {}
    for idx in range(len(pnos)):
        dct[pnos[idx]] = embeddings[idx]
    pickle.dump(dct, open('{}/{}_abstract_embeddings.dict'.format(args.output_dir, args.version), 'wb'))
    '''

    # Encode every description and store a key -> embedding mapping
    pno2desc = pickle.load(open(args.source_file, 'rb'))
    # model._first_module().max_seq_length = 256
    pnos = []
    texts = []
    for pno in pno2desc.keys():
        pnos.append(pno)
        texts.append(pno2desc[pno])
    embeddings = model.encode(texts)
    dct = {}
    for idx in range(len(pnos)):
        dct[pnos[idx]] = embeddings[idx]
    pickle.dump(dct, open('{}/pno2vec_{}.dict'.format(args.output_dir, args.version), 'wb'))
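# A minimal sketch of consuming the output produced above; the paths and the
# version tag "v1" are illustrative assumptions, not taken from the source.
import pickle

with open('./output/pno2vec_v1.dict', 'rb') as f:
    pno2vec = pickle.load(f)

some_pno = next(iter(pno2vec))
print(some_pno, pno2vec[some_pno].shape)  # each value is a numpy embedding vector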
def nlptrain(premodel, ver, tr_data, te_data):
    # Print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    # Read the dataset
    model_name = 'roberta-large-nli-stsb-mean-tokens'
    train_batch_size = 16
    num_epochs = 4
    model_save_path = ver
    sts_reader = STSDataReader('kt_datasets/kt_benchmark', normalize_scores=True)

    # Load a pre-trained sentence transformer model
    model = SentenceTransformer(premodel)

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read train dataset")
    train_data = SentencesDataset(sts_reader.get_examples(tr_data), model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples(te_data), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # Configure the training; 10% of the train data is used for warm-up
    warmup_steps = math.ceil(len(train_data) * num_epochs / train_batch_size * 0.1)
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    return ['model saved in ' + ver + ' directory']
def train(self):
    path = self.get_path()
    np.set_printoptions(threshold=100)
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.ERROR,
                        handlers=[LoggingHandler()])

    model = SentenceTransformer(self.pretrain_model)
    sentences = open(path.get('training_set')).read().splitlines()
    sentence_embeddings = model.encode(sentences)
    vecs = np.stack(sentence_embeddings)  # stack into a single (n_sentences, dim) array

    model.save(path.get('model'))
    print('Saving the model to ' + path.get('model') + '...')
    np.save(path.get('vector'), vecs)
    print('Saving the vector to ' + path.get('vector') + '...')

    print('Initiating model compression (.zip) ...')
    os.rename(path.get('training_set'), path.get('train_file'))
    self.compress_file(path.get('model'), path.get('zip_path'))
    print('→ Download "model.zip" and use it for prediction ...')
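# self.compress_file is not shown in this section; a minimal sketch of a compatible
# helper (intended to live on the same class), assuming zip_path names the target
# archive without the ".zip" extension:
import shutil

def compress_file(self, model_dir, zip_path):
    """Zip the saved model directory so it can be downloaded as model.zip."""
    shutil.make_archive(zip_path, 'zip', root_dir=model_dir)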
def _main():
    parser = argparse.ArgumentParser('Create file with sentence embeddings for further processing.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--inputs', type=str, help='input text file')
    parser.add_argument('-o', '--output', type=str, help='output file with sentence embeddings')

    # Print debug information to stdout
    np.set_printoptions(threshold=100)
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    args = parser.parse_args()
    bert_embed(output_file=args.output, input_file=args.inputs)
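# bert_embed is not defined in this section; a minimal sketch of what such a helper
# could look like, assuming one sentence per input line and the model name below
# (both are assumptions, not taken from the source):
from sentence_transformers import SentenceTransformer
import numpy as np

def bert_embed(output_file, input_file):
    """Encode each line of input_file and save the embeddings as a .npy matrix."""
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    with open(input_file, encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]
    embeddings = model.encode(sentences)
    np.save(output_file, np.asarray(embeddings))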
def main(args):
    # Print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    # Read the dataset
    train_batch_size = 64
    num_epochs = 1000

    if args.pretrained:
        model = SentenceTransformer(args.pretrained)
        model_save_path = os.path.join(
            args.save_path,
            args.pretrained.split("/")[-1] + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    else:
        # You can specify any huggingface/transformers pre-trained model here,
        # for example: bert-base-uncased, roberta-base, xlm-roberta-base
        model_name = 'cl-tohoku/bert-base-japanese-char-whole-word-masking'
        model_save_path = os.path.join(
            args.save_path,
            model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

        # Use a Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read custom train dataset")
    inp_list = []
    dataset_path = args.data_path
    with gzip.open(dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(row['score']) / 10  # Normalize score to range 0 ... 1
            inp_list.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))

    from sklearn.model_selection import train_test_split
    train_samples, val_samples = train_test_split(inp_list, test_size=0.2)

    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read custom dev dataset")
    # evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='sts-dev')
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

    # Configure the training; 10% of the train data is used for warm-up
    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
""" import torch from torch.utils.data import DataLoader import math from sentence_transformers import models, losses from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator from sentence_transformers.readers import * import logging from datetime import datetime #### Just some code to print debug information to stdout logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]) #### /print debug information to stdout # Read the dataset batch_size = 32 sts_reader = STSDataReader('datasets/stsbenchmark') model_save_path = 'output/training_tf-idf_word_embeddings-' + datetime.now( ).strftime("%Y-%m-%d_%H-%M-%S") # Map tokens to traditional word embeddings like GloVe word_embedding_model = models.WordEmbeddings.from_text_file( 'glove.6B.300d.txt.gz') # Weight word embeddings using Inverse-Document-Frequency (IDF) values. # For each word in the vocab ob the tokenizer, we must specify a weight value. # The word embedding is then multiplied by this value
def ServiceDesk():
    if request.method == 'POST':
        posted_data = request.get_json()
        original_question = posted_data['issue']

        # Sample dataset obtained from the web
        txt = "./faq.txt"
        hold_lines = []
        holdLines2 = []
        with open(txt, 'r') as text_file:
            for row in text_file:
                if '?' in row:
                    hold_lines.append(row)
                else:
                    holdLines2.append(row)
        g = holdLines2[0:30]
        data = {"ISSUES": hold_lines, "Resolution": g}
        df = pd.DataFrame(data)
        new_f = df.replace('\\n', ' ', regex=True)
        new_f.to_csv("newFile.csv", index=False)
        df = pd.read_csv("newFile.csv")  # Convert to DataFrame

        # Create a dummy database (a single, consistent table name is used throughout)
        conn = sqlite3.connect('knowledgeBases.db')
        c = conn.cursor()

        def createDB():
            c.execute(''' SELECT count(name) FROM sqlite_master WHERE type='table' AND name='knowledgeBasess' ''')
            # If the count is 1, the table already exists
            if c.fetchone()[0] == 1:
                print('The table already exists.')
            else:
                c.execute("CREATE TABLE knowledgeBasess (categoryOfIssues TEXT, priorityOfproblem TEXT, date INTEGER, issues TEXT, resolution TEXT, ticket_Id VARCHAR, issusesReportedDescription TEXT)")
                print('Just created a table.')
            conn.commit()

        createDB()

        # Data creation to populate the database
        ticket_number_lists = []

        def generate_ticket_id(no_of_people):
            dept = ['DS', 'SE', 'ACC', 'HR']
            for i in range(no_of_people):
                department = random.choice(dept)
                ticket_number_lists.append(department + str(random.randint(12000, 99999)))
            return ticket_number_lists

        generate_ticket_id(30)

        def clean_words(sentence, stopwords=False):
            sentence = sentence.lower().strip()
            sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
            if stopwords:
                sentence = remove_stopwords(sentence)
            return sentence

        def get_cleaned_words(df, stopwords=False):
            cleaned_word = []
            for index, row in df.iterrows():
                cleaned = clean_words(row["ISSUES"], stopwords)
                cleaned_word.append(cleaned)
            return cleaned_word

        cleaned_word = get_cleaned_words(df, stopwords=True)

        catOfIssues = ['Networking', 'Hardware', 'Operating System', 'Others']
        priOfIssues = ['High', 'Low', 'Medium']
        currentTime = time.time()
        dates = datetime.datetime.fromtimestamp(currentTime).strftime('%Y-%m-%d %H:%M:%S')
        issusesReportedDescription = cleaned_word
        ticket_id = ticket_number_lists

        for each in range(len(df)):
            Issue = df['ISSUES'][each]
            Resolutn = df['Resolution'][each]
            priority = random.choice(priOfIssues)
            category = random.choice(catOfIssues)
            ticket = ticket_id[each]
            description = issusesReportedDescription[each]
            date = dates
            c.execute("INSERT INTO knowledgeBasess (categoryOfIssues, priorityOfproblem, date, issues, resolution, ticket_Id, issusesReportedDescription) VALUES (?,?,?,?,?,?,?)",
                      (category, priority, date, Issue, Resolutn, ticket, description))
            conn.commit()

        # Check whether the data has been correctly inserted
        c.execute('''SELECT * FROM knowledgeBasess;''')
        print(c.fetchone())

        # Export data into CSV (not strictly necessary; you can skip this part)
        print("........Exporting sql data into CSV............")
        c.execute("SELECT * FROM knowledgeBasess")
        with open("Services_DeskData.csv", "w") as csv_file:
            csv_writer = csv.writer(csv_file, delimiter="\t")
            csv_writer.writerow([i[0] for i in c.description])
            csv_writer.writerows(c)
        dirpath = os.getcwd() + "/Services_DeskData.csv"
        print("Data exported successfully into {}".format(dirpath))

        # Convert the SQL data to a DataFrame for further use
        nw_df = pd.read_sql("SELECT * FROM knowledgeBasess", conn)
        new_df = nw_df[['issues', 'resolution']]

        # Clean sentences and remove all stop words and tabs
        def clean_sentence(sentence, stopwords=False):
            sentence = sentence.lower().strip()
            sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
            if stopwords:
                sentence = remove_stopwords(sentence)
            return sentence

        def get_cleaned_sentences(new_df, stopwords=False):
            cleaned_sentences = []
            for index, row in new_df.iterrows():
                cleaned = clean_sentence(row["issues"], stopwords)
                cleaned_sentences.append(cleaned)
            return cleaned_sentences

        cleaned_sentences = get_cleaned_sentences(new_df, stopwords=True)
        cleaned_sentences_with_stopwords = get_cleaned_sentences(new_df, stopwords=False)

        question = clean_sentence(original_question, stopwords=False)

        def retrieveAndPrintFAQAnswer(question_embedding, sentence_embeddings, FAQdf, sentences):
            max_sim = -1
            index_sim = -1
            for index, faq_embedding in enumerate(sentence_embeddings):
                sim = cosine_similarity(faq_embedding, question_embedding)[0][0]
                print(index, sim, sentences[index])
                if sim > max_sim:
                    max_sim = sim
                    index_sim = index
            print("\nQuestion: ", question)
            print("\nRetrieved: ", FAQdf.iloc[index_sim, 0])
            print(FAQdf.iloc[index_sim, 1])
            suggested_resolution = FAQdf.iloc[index_sim, 1]
            result = question + " is: " + suggested_resolution
            return result

        # Print debug information to stdout
        np.set_printoptions(threshold=100)
        logging.basicConfig(format='%(asctime)s - %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S',
                            level=logging.INFO,
                            handlers=[LoggingHandler()])

        # Load a pre-trained Sentence Transformer model (based on DistilBERT). It is downloaded automatically
        model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

        # Embed the list of sentences; the result is a list of sentence embeddings as numpy arrays
        sentences = cleaned_sentences_with_stopwords
        sent_bertphrase_embedding = []
        for sent in sentences:
            sent_bertphrase_embedding.append(model.encode([sent]))

        question_embedding = model.encode([question])
        Trial = retrieveAndPrintFAQAnswer(question_embedding, sent_bertphrase_embedding, new_df, sentences)
        return jsonify({'resolution': Trial})
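# A minimal sketch of exercising the endpoint above from a client, assuming the Flask
# route is registered at /ServiceDesk on localhost:5000 (the route decorator and host
# are not shown in this section and are assumptions):
import requests

resp = requests.post('http://localhost:5000/ServiceDesk',
                     json={'issue': 'I cannot connect to the office VPN'})
print(resp.json()['resolution'])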
def train(triplet_data_dir, output):
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    ### Create a torch.DataLoader that passes training batch instances to our model
    train_batch_size = 16
    triplet_reader = TripletReader(triplet_data_dir, s1_col_idx=1, s2_col_idx=2, s3_col_idx=3,
                                   delimiter=',', quoting=csv.QUOTE_MINIMAL, has_header=True)
    # output_path = "output/bert-base-wikipedia-sections-mean-tokens-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_path = output + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    num_epochs = 1

    ### Configure sentence transformers for training and train on the provided dataset
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-uncased')

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Read Triplet train dataset")
    train_data = SentencesDataset(examples=triplet_reader.get_examples('train.csv', 2000000), model=model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model)

    logging.info("Read Wikipedia Triplet dev dataset")
    dev_data = SentencesDataset(examples=triplet_reader.get_examples('validation.csv', 10000), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    warmup_steps = int(len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=output_path)

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on the triplet test set
    #
    ##############################################################################
    model = SentenceTransformer(output_path)
    test_data = SentencesDataset(examples=triplet_reader.get_examples('test.csv'), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size)
    evaluator = TripletEvaluator(test_dataloader)
    model.evaluate(evaluator)
# @Time    : 2020/6/23 14:08
# @Author  : SN
# @Version : V 0.1
# @File    : embedding.py
# @desc    : Get vector representations

from sentence_transformers import SentenceTransformer, LoggingHandler
from PMRank.calculate_distance import calculateDistance
import numpy as np
import logging

'''Processing log'''
np.set_printoptions(threshold=100)
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

'''Load model'''
model = SentenceTransformer('D:/Desktop/myModel/auxiliary_data/bert-base-nli-mean-tokens/')


def get_keyphrase_candidate_enbeddings(keyphrase_candidate):
    """Obtain candidate phrase vectors from the pre-trained model"""
    if len(keyphrase_candidate) > 0:
        keyphrase_candidate_enbedding = model.encode(keyphrase_candidate)
        return keyphrase_candidate_enbedding


def get_sentence_enbeddings(sents_sectioned):
    """Get the vector of each sentence in the document from the pre-trained model"""
    if len(sents_sectioned) > 0:
        sentence_embedding = model.encode(sents_sectioned)
        return sentence_embedding
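# A minimal usage sketch for the helpers above, assuming candidates and sentences
# are plain lists of strings (the example texts are illustrative):
candidates = ['sentence embedding', 'keyphrase extraction']
sentences = ['PMRank scores candidate phrases against the document.',
             'Embeddings are produced by a pre-trained SentenceTransformer.']

candidate_vecs = get_keyphrase_candidate_enbeddings(candidates)
sentence_vecs = get_sentence_enbeddings(sentences)
print(len(candidate_vecs), len(sentence_vecs))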
def main(args):
    # Print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    # You can specify any huggingface/transformers pre-trained model here,
    # for example: bert-base-uncased, roberta-base, xlm-roberta-base
    # model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    model_name = args.model_name
    if model_name is None:
        model_name = 'bert-base-chinese'

    # Read the dataset
    batch_size = args.batch_size
    model_output_dir = args.model_output_dir
    # model_save_path = os.path.join(model_output_dir, "bert-base", datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    model_save_path = model_output_dir

    # Use a Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    if args.init_model is None:
        word_embedding_model = models.Transformer(model_name)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    else:
        model = SentenceTransformer(args.init_model)

    if args.do_train == 1:
        # Convert the dataset to a DataLoader ready for training
        data_reader = SimTextDataReader()
        logging.info("train_data:%s" % (args.train_data))
        logging.info("cache_data:%s" % (args.cached_data))
        train_data_files = args.train_data.split('#')
        cached_data_file = args.cached_data

        logging.info("Read train dataset")
        if not os.path.isfile(cached_data_file):
            train_examples = []
            for train_file in train_data_files:
                if os.path.isfile(train_file):
                    logging.info("load train file:%s" % (train_file))
                    now_examples = data_reader.get_examples(train_file)
                    train_examples.extend(now_examples)
            train_data = SentencesDataset(train_examples, model=model)
            torch.save(train_data, args.cached_data)
        else:
            train_data = torch.load(cached_data_file)
            logging.info("Load cached dataset %s" % (cached_data_file))

        logging.info("Build train dataset")
        train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
        # train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)
        train_loss = losses.CosineSimilarityLoss(model=model)

        logging.info("Read dev dataset")
        dev_data_files = args.dev_data.split('#')
        dev_examples = []
        for dev_file in dev_data_files:
            if os.path.isfile(dev_file):
                logging.info("load dev file:%s" % (dev_file))
                now_examples = data_reader.get_examples(dev_file)
                dev_examples.extend(now_examples)
        dev_data = SentencesDataset(examples=dev_examples, model=model)

        logging.info("Build dev dataset")
        dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
        evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

        # Configure the training
        num_epochs = args.num_epochs
        warmup_steps = math.ceil(len(train_dataloader) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
        logging.info("Warmup-steps: {}".format(warmup_steps))

        logging.info("Start training")
        # Train the model
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluator,
                  epochs=num_epochs,
                  evaluation_steps=1000,
                  warmup_steps=warmup_steps,
                  output_path=model_save_path)

    if args.do_predict == 1:
        logging.info("Read predict dataset")
        pred_data_file = args.pred_data
        output_file = os.path.join(args.model_output_dir, "pred_res")
        text_pairs = load_pred_data(pred_data_file)
        with open(output_file, "w", encoding="utf-8") as fp:
            for tpair in text_pairs:
                embedding_pair = model.encode(tpair)
                cos_sim = cosine_similarity(embedding_pair[0], embedding_pair[1])
                fp.write("%s\t%s\t%f\n" % (tpair[0], tpair[1], cos_sim))
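# The cosine_similarity used in the prediction branch above takes two 1-D embedding
# vectors and returns a scalar, but it is not defined in this section; a minimal NumPy
# sketch of a compatible helper (an assumption, not the original implementation):
import numpy as np

def cosine_similarity(vec_a, vec_b):
    """Cosine similarity between two 1-D embedding vectors."""
    vec_a = np.asarray(vec_a, dtype=np.float32)
    vec_b = np.asarray(vec_b, dtype=np.float32)
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(vec_a, vec_b) / denom)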
def model_training(
    train_data_path,
    evaluator_path,
    model_name,
    output_path,
    train_batch_size,
    num_epochs,
    samples_per_label,
):
    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        handlers=[LoggingHandler()],
    )

    output_path = output_path + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    os.makedirs(output_path, exist_ok=True)

    # You can specify any huggingface/transformers pre-trained model here,
    # for example: bert-base-uncased, roberta-base, xlm-roberta-base
    # model_name = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext/'

    ### Create a torch.DataLoader that passes training batch instances to our model
    logging.info("Loading training dataset")
    train_set = read_dataset(train_data_path)

    # Load the pretrained model
    word_embedding_model = models.Transformer(model_name)
    # tokenizer_args={"additional_special_tokens": ['<e>', '</e>']})
    # word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    # pooling_mode_mean_mark_tokens=True)
    # dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=2048, activation_function=nn.Tanh())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    model.max_seq_length = 16

    logging.info("Read concept normalization training dataset")
    # Try different sample sizes per label
    train_data_sampler = SentenceLabelDataset(
        examples=train_set, samples_per_label=samples_per_label)
    # By default, the sampler is not reshuffled every epoch
    train_dataloader = DataLoader(train_data_sampler,
                                  batch_size=train_batch_size,
                                  drop_last=True)

    ### Triplet losses ####################
    ### There are 4 triplet loss variants:
    ### - BatchHardTripletLoss
    ### - BatchHardSoftMarginTripletLoss
    ### - BatchSemiHardTripletLoss
    ### - BatchAllTripletLoss
    #######################################
    # train_loss = losses.BatchAllTripletLoss(model=model)
    # train_loss = losses.BatchHardTripletLoss(sentence_embedder=model)
    train_loss = losses.BatchHardSoftMarginTripletLoss(model)
    # train_loss = losses.BatchSemiHardTripletLoss(sentence_embedder=model)

    logging.info("Read concept normalization val dataset")
    ir_queries = read.read_from_json(os.path.join(evaluator_path, "dev_queries"))
    ir_corpus = read.read_from_json(os.path.join(evaluator_path, "corpus"))
    ir_relevant_docs = read.read_from_json(os.path.join(evaluator_path, "dev_relevant_docs"))

    ir_evaluator_n2c2_dev = evaluation.InformationRetrievalEvaluator(
        ir_queries,
        ir_corpus,
        ir_relevant_docs,
        corpus_chunk_size=300000,
        name="evaluation_results",
        map_at_k=[1, 3, 5, 10],
        batch_size=1024,
        show_progress_bar=True)
    # evaluator.append(ir_evaluator_n2c2_dev)

    # A SequentialEvaluator could run several evaluators in order; the model would then be
    # optimized with respect to the score of the last evaluator (scores[-1]).
    # seq_evaluator = evaluation.SequentialEvaluator(evaluator, main_score_function=lambda scores: scores[1])

    logging.info("Performance before fine-tuning:")
    ir_evaluator_n2c2_dev(model)

    # warmup_steps = int(len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data
    warmup_steps = 0

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=ir_evaluator_n2c2_dev,
        output_path_ignore_not_empty=True,
        optimizer_params={
            'lr': 1e-4,
            'eps': 1e-6,
            'correct_bias': False
        },
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        output_path=output_path,
    )
def mount_graph(df, path_to_language_model="../language_model"):
    print("Creating graph...")
    np.set_printoptions(threshold=100)
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    language_model = SentenceTransformer(path_to_language_model)
    df['embedding'] = list(language_model.encode(df['text'].to_list()))
    df = df.loc[~df['Themes'].isna()]

    G = nx.Graph()
    for index, row in df.iterrows():
        node_id = row['GKGRECORDID']
        node_date = str(row['DATE'])[0:8]
        node_themes_array = row['Themes'].split(';')
        node_locations_array = ''
        node_persons_array = ''
        node_organizations_array = ''
        try:
            node_locations_array = row['Locations'].split(';')
            node_persons_array = row['Persons'].split(';')
            node_organizations_array = row['Organizations'].split(';')
        except:
            # Some of these fields may be missing (NaN); leave the arrays empty in that case
            pass

        # event <-> date
        G.add_edge(node_id, node_date)
        G.nodes[node_id]['themes'] = node_themes_array
        G.nodes[node_date]['themes'] = node_themes_array

        # event <-> theme
        for theme in node_themes_array:
            if len(theme) > 0:
                G.add_edge(node_id, theme)
                G.nodes[theme]['themes'] = node_themes_array

        # event <-> locations
        for location in node_locations_array:
            if len(location) > 0:
                G.add_edge(node_id, location)
                G.nodes[location]['themes'] = node_themes_array

        # event <-> persons
        for person in node_persons_array:
            if len(person) > 0:
                G.add_edge(node_id, person)
                G.nodes[person]['themes'] = node_themes_array

        # event <-> organizations
        for org in node_organizations_array:
            if len(org) > 0:
                G.add_edge(node_id, org)
                G.nodes[org]['themes'] = node_themes_array

        # embedding
        G.nodes[node_id]['embedding'] = row['embedding']

    # Relabel the nodes, since their original names are not convenient
    mapping = {value: idx for idx, value in enumerate(G.nodes())}
    G = nx.relabel_nodes(G, mapping=mapping, copy=True)

    print(f"Graph loaded: OK - \t Nodes: {len(G.nodes)} \t edges: {len(G.edges)}")
    return G
def train_nli():
    # Print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    # You can specify any huggingface/transformers pre-trained model here,
    # for example: bert-base-uncased, roberta-base, xlm-roberta-base
    # model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    model_name = 'pretrained_model/bert-base-uncased'

    # Read the dataset
    train_batch_size = 6
    nli_reader = NLIDataReader('./examples/datasets/AllNLI')
    sts_reader = STSBenchmarkDataReader('./examples/datasets/stsbenchmark')
    train_num_labels = nli_reader.get_num_labels()
    model_save_path = 'output/training_nli_' + model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Use a Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_dataset = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model,
                                    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                    num_labels=train_num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = LabelAccuracyEvaluator(dev_dataloader,
                                       softmax_model=Softmax_label(model=model,
                                                                   sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                                                   num_labels=train_num_labels))

    # Configure the training
    num_epochs = 1
    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=100,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on the STS benchmark test set
    #
    ##############################################################################
    # model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size)
    # evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
    model.evaluate(evaluator)
def train_self():
    train_batch_size = 8
    num_epochs = 50
    device = 'cuda:0'
    train_num_labels = 6
    evaluation_steps = 1000
    local = True

    # Print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    # model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    # model_name = 'bert-base-chinese'
    model_name = './pretrained_model/bert-base-chinese'
    # train_batch_size = config.train_batch_size
    self_reader = Self_csv_DataReader('./self_dataset', local=local)
    # train_num_labels = config.train_num_labels
    model_save_path = 'output/training_nli_' + model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Use a Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name, cache_dir='./pretrained_model')

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    # Pooling is skipped here; the softmax head works directly on the word embedding dimension
    model = SentenceTransformer_NoPooling(modules=[word_embedding_model])  # , pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read self train dataset")
    train_dataset = SentencesDataset(examples=self_reader.get_examples("train.csv"), model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model,
                                    sentence_embedding_dimension=model.get_word_embedding_dimension(),
                                    num_labels=train_num_labels)

    logging.info("Read self dev dataset")
    dev_data = SentencesDataset(examples=self_reader.get_examples('dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = LabelAccuracyEvaluator(dev_dataloader,
                                       softmax_model=Softmax_label(model=model,
                                                                   sentence_embedding_dimension=model.get_word_embedding_dimension(),
                                                                   num_labels=train_num_labels))

    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=evaluation_steps,
              warmup_steps=warmup_steps,
              output_path=model_save_path)