Example #1
def initialize():
    # Just some code to print debug information to stdout
    np.set_printoptions(threshold=100)
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
Example #2
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    #parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.",     )
    parser.add_argument("--model_dir", default=None, type=str, required=True, help="The model dir")
    parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output dir")
    parser.add_argument("--version", default=None, type=str, required=True, help="version of the model dir")
    parser.add_argument("--source_file", default=None, type=str, required=True, help="Path to the input data.",     )

    args = parser.parse_args()

    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    model = SentenceTransformer(args.model_dir)
    '''
    pno2abstract = pickle.load(open('{}/pno2abstract.dict'.format(args.data_dir), 'rb'))

    # model._first_module().max_seq_length = 256
    pnos = []
    texts = []
    items = list(pno2abstract.items())
    for pno,text in items:
        pnos.append(pno)
        texts.append(text)

    embeddings = model.encode(texts)
    dct = {}
    for idx in range(len(pnos)):
        dct[pnos[idx]] = embeddings[idx]

    pickle.dump(dct, open('{}/{}_abstract_embeddings.dict'.format(args.output_dir, args.version), 'wb'))
    '''
    pno2desc = pickle.load(open(args.source_file, 'rb'))

    # model._first_module().max_seq_length = 256
    pnos = []
    texts = []
    for pno in pno2desc.keys():
        pnos.append(pno)
        texts.append(pno2desc[pno])

    embeddings = model.encode(texts)
    dct = {}
    for idx in range(len(pnos)):
        dct[pnos[idx]] = embeddings[idx]

    pickle.dump(dct, open('{}/pno2vec_{}.dict'.format(args.output_dir, args.version), 'wb'))
def nlptrain(premodel, ver, tr_data, te_data):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    # Read the dataset
    model_name = 'roberta-large-nli-stsb-mean-tokens'
    train_batch_size = 16
    num_epochs = 4
    model_save_path = ver
    sts_reader = STSDataReader('kt_datasets/kt_benchmark', normalize_scores=True)

    # Load a pre-trained sentence transformer model
    model = SentenceTransformer(premodel)

    # Convert the dataset to a DataLoader ready for training
    logging.info("")
    train_data = SentencesDataset(sts_reader.get_examples(tr_data), model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("")
    dev_data = SentencesDataset(examples=sts_reader.get_examples(te_data), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # Configure the training
    warmup_steps = math.ceil(len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    result = ['model saved in ' + ver + ' directory']

    return result
Example #4
def train(self):
    path = self.get_path()
    np.set_printoptions(threshold=100)
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.ERROR,
                        handlers=[LoggingHandler()])
    model = SentenceTransformer(self.pretrain_model)
    sentences = open(path.get('training_set')).read().splitlines()
    sentence_embeddings = model.encode(sentences)
    vecs = np.stack(sentence_embeddings)
    model.save(path.get('model'))
    print('Saving the model to ' + path.get('model') + '...')
    np.save(path.get('vector'), sentence_embeddings)
    print('Saving the vector to ' + path.get('vector') + '...')
    print('Initiating model compression(.zip) ...')
    os.rename(path.get('training_set'), path.get('train_file'))
    self.compress_file(path.get('model'), path.get('zip_path'))
    print('→ Download "model.zip" and use it for prediction ...')
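# A minimal follow-up sketch, not part of the original example: it shows how the
# artifacts written by train() above might be loaded back for a similarity query.
# The paths and the query string are placeholders.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('path/to/model')        # directory written by model.save()
vectors = np.load('path/to/vector.npy')             # array written by np.save()
query_vec = model.encode(['example query'])[0]
scores = vectors @ query_vec / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vec))
best_match = int(np.argmax(scores))                 # index of the most similar training sentence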
Example #5
def _main():
    parser = argparse.ArgumentParser('Create file with sentence embeddings for further processing.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-i', '--inputs', type=str, 
                        help='input text file')

    parser.add_argument('-o', '--output', type=str,
                        help='output file with sentence embeddings')

    #### Just some code to print debug information to stdout
    np.set_printoptions(threshold=100)

    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout


    args = parser.parse_args()
    bert_embed(output_file=args.output,
               input_file=args.inputs)
Example #6
def main(args):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout
    # Read the dataset
    train_batch_size = 64
    num_epochs = 1000

    if args.pretrained:
        model = SentenceTransformer(args.pretrained)
        model_save_path = os.path.join(
            args.save_path,
            args.pretrained.split("/")[-1] + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    else:
        #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
        model_name = 'cl-tohoku/bert-base-japanese-char-whole-word-masking'
        model_save_path = os.path.join(
            args.save_path,
            model_name.replace("/", "-") + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
        # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read custom train dataset")

    train_samples = []
    val_samples = []
    inp_list = []
    dataset_path = args.data_path
    with gzip.open(dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(
                row['score']) / 10  # Normalize score to range 0 ... 1
            inp_list.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))

    from sklearn.model_selection import train_test_split
    train_samples, val_samples = train_test_split(inp_list, test_size=0.2)
    # import ipdb; ipdb.set_trace()

    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read custom dev dataset")
    # evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='sts-dev')
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

    # Configure the training. We skip evaluation in this example
    warmup_steps = math.ceil(
        len(train_dataset) * num_epochs / train_batch_size *
        0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # import ipdb; ipdb.set_trace()
    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
Example #7
"""
import torch
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import *
import logging
from datetime import datetime

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
batch_size = 32
sts_reader = STSDataReader('datasets/stsbenchmark')
model_save_path = 'output/training_tf-idf_word_embeddings-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Map tokens to traditional word embeddings like GloVe
word_embedding_model = models.WordEmbeddings.from_text_file(
    'glove.6B.300d.txt.gz')

# Weight word embeddings using Inverse-Document-Frequency (IDF) values.
# For each word in the vocab of the tokenizer, we must specify a weight value.
# The word embedding is then multiplied by this value
"""
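# The quoted fragment above breaks off right after the IDF-weighting comment.
# Below is a minimal sketch of that step, assuming sentence-transformers'
# models.WordWeights module; the "doc_frequencies.txt" file (one "<word>\t<doc
# frequency>" line per word) and the corpus size are illustrative placeholders,
# not taken from the original source.
import math
from sentence_transformers import models, SentenceTransformer

word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz')

num_docs = 1000000  # assumed corpus size used to compute IDF values
word_weights = {}
with open('doc_frequencies.txt') as fin:
    for line in fin:
        word, freq = line.strip().split('\t')
        word_weights[word] = math.log(num_docs / int(freq))

word_weights_model = models.WordWeights(vocab=word_embedding_model.tokenizer.get_vocab(),
                                        word_weights=word_weights,
                                        unknown_word_weight=1.0)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, word_weights_model, pooling_model])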
def ServiceDesk(): 
    if request.method=='POST':
        posted_data = request.get_json()
        original_question = posted_data['issue']
        #Sample Dataset gotten from the web
        txt = "./faq.txt"
        my_file = open(txt, "r")
        content = my_file.read()
        hold_lines = []
        holdLines2 = []
        with open(txt,'r') as text_file:
            for row in text_file:
                red= row
                if '?' in red:

                    hold_lines.append(red)
                else:
                    holdLines2.append(red)
        g = holdLines2[0:30]
        data ={"ISSUES":hold_lines,"Resolution":g}
        df = pd.DataFrame(data)
        new_f = df.replace('\\n',' ', regex=True)
        new_f.to_csv("newFile.csv", index=False)
        df=pd.read_csv("newFile.csv") # Convert to Dataframe


        #Create dummy Database
        conn = sqlite3.connect('knowledgeBases.db')
        c = conn.cursor()
        def createDB():
            c.execute(''' SELECT count(name) FROM sqlite_master WHERE type='table' AND name='knowledgeBasess' ''')

            # if the count is 1, then the table exists
            if c.fetchone()[0] == 1:
                print('The table already exists.')
            else:
                c.execute("CREATE TABLE knowledgeBasess (categoryOfIssues TEXT, priorityOfproblem TEXT, date INTEGER, issues TEXT, resolution TEXT, ticket_Id VARCHAR, issusesReportedDescription TEXT)")

                print('just created a table.')
            conn.commit()
        createDB()
        #Data Creation to Populate database  
        ticket_number_lists = []
        def generate_ticket_id(no_of_people):
            dept = ['DS','SE','ACC','HR']
            i = 0
            for i in range(no_of_people):
                department = random.choice(dept)
                ticket_number_lists.append(department + str(random.randint(12000,99999)))
            return ticket_number_lists

        generate_ticket_id(30)
        def clean_words(sentence, stopwords=False):

            sentence = sentence.lower().strip()
            sentence = re.sub(r'[^a-z0-9\s]', '', sentence)

            if stopwords:
                 sentence = remove_stopwords(sentence)

            return sentence

        def get_cleaned_words(df,stopwords=False):    
            sents=df[["ISSUES"]];
            cleaned_word=[]

            for index,row in df.iterrows():
                #print(index,row)
                cleaned=clean_words(row["ISSUES"],stopwords)
                cleaned_word.append(cleaned)
            return cleaned_word

        cleaned_word=get_cleaned_words(df,stopwords=True)

        catOfIssues = ['Networking','Hardware','Operating System','Others']
        priOfIssues = ['High','Low','Medium']
        currentTime = time.time()
        dates = datetime.datetime.fromtimestamp(currentTime).strftime('%Y-%m-%d %H:%M:%S')
        issues = df['ISSUES']
        #print(len(issues))
        Resolution = df['Resolution']
        issusesReportedDescription = cleaned_word
        ticket_id = ticket_number_lists
        for each in range(len(df)):
            Issue = df['ISSUES'][each]
            Resolutn = df['Resolution'][each]
            priority = random.choice(priOfIssues)
            category = random.choice(catOfIssues)
            ticket = ticket_id[each]
            description = issusesReportedDescription[each]
            date=dates
            c.execute("INSERT INTO knowledgeBasess (categoryOfIssues, priorityOfproblem, date, issues, resolution, ticket_Id, issusesReportedDescription) VALUES (?,?,?,?,?,?,?)",
                                  (category, priority,date,Issue,Resolutn,ticket,description))
            conn.commit()

        #Check to see whether the data has been correctly inserted
        c.execute('''SELECT * FROM knowledgeBasess;''')
        print(c.fetchone())

        #Export data into csv,although not necessary, you can skip this part


        print ("........Exporting sql data into CSV............")
        c.execute("SELECT * FROM KnowledgeBasess")
        with open("Services_DeskData.csv", "w") as csv_file:
            csv_writer = csv.writer(csv_file, delimiter="\t")
            csv_writer.writerow([i[0] for i in c.description])
            csv_writer.writerows(c)

        dirpath = os.getcwd() + "/Services_DeskData.csv"
        print ("Data exported Successfully into {}".format(dirpath))

        #convert the sql data to dataframe for further use
        nw_df = pd.read_sql("SELECT * FROM KnowledgeBasess",conn)
        new_df = nw_df[['issues','resolution']]

        #Clean sentences and remove all stop words and tabs
        def clean_sentence(sentence, stopwords=False):

            sentence = sentence.lower().strip()
            sentence = re.sub(r'[^a-z0-9\s]', '', sentence)

            if stopwords:
                 sentence = remove_stopwords(sentence)

            return sentence

        def get_cleaned_sentences(new_df,stopwords=False):    
            sents=new_df[["issues"]];
            cleaned_sentences=[]

            for index,row in new_df.iterrows():
                #print(index,row)
                cleaned=clean_sentence(row["issues"],stopwords)
                cleaned_sentences.append(cleaned)
            return cleaned_sentences

        cleaned_sentences=get_cleaned_sentences(new_df,stopwords=True)
        #print(cleaned_sentences)

        print("\n")

        cleaned_sentences_with_stopwords=get_cleaned_sentences(new_df,stopwords=False)
        #print(cleaned_sentences_with_stopwords)

        original_question = original_question
        question=clean_sentence(original_question,stopwords=False)
        def retrieveAndPrintFAQAnswer(question_embedding,sentence_embeddings,FAQdf,sentences):
            max_sim=-1;
            index_sim=-1;
            for index,faq_embedding in enumerate(sentence_embeddings):

                sim=cosine_similarity(faq_embedding,question_embedding)[0][0];
                print(index, sim, sentences[index])
                if sim>max_sim:
                    max_sim=sim
                    index_sim=index

            print("\n")
            print("Question: ",question)
            print("\n");
            print("Retrieved: ",FAQdf.iloc[index_sim,0]) 
            print(FAQdf.iloc[index_sim,1])
            issues = question
            similar_query = FAQdf.iloc[index_sim,0] 
            suggested_resolution = FAQdf.iloc[index_sim,1]
            result = question + " is: " + suggested_resolution

            return result


        #### Just some code to print debug information to stdout
        np.set_printoptions(threshold=100)

        logging.basicConfig(format='%(asctime)s - %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S',
                            level=logging.INFO,
                            handlers=[LoggingHandler()])
        #### /print debug information to stdout



        # Load pre-trained Sentence Transformer Model (based on DistilBERT). It will be downloaded automatically
        model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

        # Embed a list of sentences
        sentences = cleaned_sentences_with_stopwords

        sent_bertphrase_embedding=[];

        # The result is a list of sentence embeddings as numpy arrays
        for sent in sentences:
            sent_bertphrase_embedding.append(model.encode([sent]));


        question_embedding=model.encode([question]);

        Trial = retrieveAndPrintFAQAnswer(question_embedding,sent_bertphrase_embedding,new_df,sentences)
        return jsonify({'resolution': Trial}) 
Example #9
def train(triplet_data_dir, output):
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    ### Create a torch.DataLoader that passes training batch instances to our model
    train_batch_size = 16
    triplet_reader = TripletReader(triplet_data_dir,
                                   s1_col_idx=1,
                                   s2_col_idx=2,
                                   s3_col_idx=3,
                                   delimiter=',',
                                   quoting=csv.QUOTE_MINIMAL,
                                   has_header=True)
    # output_path = "output/bert-base-wikipedia-sections-mean-tokens-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_path = output + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    num_epochs = 1

    ### Configure sentence transformers for training and train on the provided dataset
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-uncased')

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Read Triplet train dataset")
    train_data = SentencesDataset(examples=triplet_reader.get_examples(
        'train.csv', 2000000),
                                  model=model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model)

    logging.info("Read Wikipedia Triplet dev dataset")
    dev_data = SentencesDataset(examples=triplet_reader.get_examples(
        'validation.csv', 10000),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    warmup_steps = int(len(train_data) * num_epochs / train_batch_size *
                       0.1)  #10% of train data

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=output_path)

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on the triplet test dataset
    #
    ##############################################################################

    model = SentenceTransformer(output_path)
    test_data = SentencesDataset(
        examples=triplet_reader.get_examples('test.csv'), model=model)
    test_dataloader = DataLoader(test_data,
                                 shuffle=False,
                                 batch_size=train_batch_size)
    evaluator = TripletEvaluator(test_dataloader)

    model.evaluate(evaluator)
Example #10
# @Time : 2020/6/23 14:08
# @Author : SN
# @Version:V 0.1
# @File : embedding.py
# @desc : Get vector representation

from sentence_transformers import SentenceTransformer, LoggingHandler
from PMRank.calculate_distance import calculateDistance
import numpy as np
import logging


'''Processing log'''

np.set_printoptions(threshold=100)
logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()])

'''Load model'''
model = SentenceTransformer('D:/Desktop/myModel/auxiliary_data/bert-base-nli-mean-tokens/')

def get_keyphrase_candidate_enbeddings(keyphrase_candidate):
    """Obtain candidate word vectors according to the pre-trained model"""
    keyphrase_candidate_enbedding = []
    if len(keyphrase_candidate) > 0:
        keyphrase_candidate_enbedding = model.encode(keyphrase_candidate)
    return keyphrase_candidate_enbedding


def get_sentence_enbeddings(sents_sectioned):
    """Get the vector of each sentence in the document according to the pre-trained model"""
    sentence_embedding = []
    if len(sents_sectioned) > 0:
        sentence_embedding = model.encode(sents_sectioned)
    return sentence_embedding
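# Hypothetical usage of the helpers above (the phrases and sentences are
# placeholders, not from the original source):
if __name__ == '__main__':
    cand_vecs = get_keyphrase_candidate_enbeddings(['keyphrase extraction', 'sentence embedding'])
    sent_vecs = get_sentence_enbeddings(['First sentence of the document.', 'Second sentence.'])
    print(len(cand_vecs), len(sent_vecs))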
Example #11
def main(args):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    #model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'

    model_name = args.model_name
    if model_name is None:
        model_name = 'bert-base-chinese'

    # Read the dataset
    batch_size = args.batch_size

    model_output_dir = args.model_output_dir
    #model_save_path = os.path.join(model_output_dir, "bert-base", datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    model_save_path = model_output_dir

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    # word_embedding_model = models.Transformer(model_name)
    if args.init_model is None:
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        model = SentenceTransformer(args.init_model)

    if args.do_train == 1:
        # Convert the dataset to a DataLoader ready for training
        data_reader = SimTextDataReader()
        logging.info("train_data:%s" % (args.train_data))
        logging.info("cache_data:%s" % (args.cached_data))
        train_data_files = args.train_data.split('#')
        cached_data_file = args.cached_data
        logging.info("Read train dataset")
        if not os.path.isfile(cached_data_file):
            train_examples = []
            for train_file in train_data_files:
                if os.path.isfile(train_file):
                    logging.info("load train file:%s" % (train_file))
                    now_examples = data_reader.get_examples(train_file)
                    train_examples.extend(now_examples)

            train_data = SentencesDataset(train_examples, model=model)
            torch.save(train_data, args.cached_data)
        else:
            train_data = torch.load(cached_data_file)
            logging.info("Load cached dataset %s" % (cached_data_file))
        logging.info("Build train dataset")
        train_dataloader = DataLoader(train_data,
                                      shuffle=True,
                                      batch_size=batch_size)
        # train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)
        train_loss = losses.CosineSimilarityLoss(model=model)

        logging.info("Read dev dataset")
        dev_data_files = args.dev_data.split('#')
        dev_examples = []
        for dev_file in dev_data_files:
            if os.path.isfile(dev_file):
                logging.info("load dev file:%s" % (dev_file))
                now_examples = data_reader.get_examples(dev_file)
                dev_examples.extend(now_examples)
        dev_data = SentencesDataset(examples=dev_examples, model=model)
        logging.info("Build dev dataset")
        dev_dataloader = DataLoader(dev_data,
                                    shuffle=False,
                                    batch_size=batch_size)
        evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

        # Configure the training
        num_epochs = args.num_epochs
        warmup_steps = math.ceil(
            len(train_data) * num_epochs / batch_size *
            0.1)  # 10% of train data for warm-up
        logging.info("Warmup-steps: {}".format(warmup_steps))

        logging.info("Start training")
        # Train the model
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluator,
                  epochs=num_epochs,
                  evaluation_steps=1000,
                  warmup_steps=warmup_steps,
                  output_path=model_save_path)

    if args.do_predict == 1:
        logging.info("Read predict dataset")
        pred_data_file = args.pred_data
        output_file = os.path.join(args.model_output_dir, "pred_res")
        text_pairs = load_pred_data(pred_data_file)
        with open(output_file, "w", encoding="utf-8") as fp:
            for tpair in text_pairs:
                embedding_pair = model.encode(tpair)
                cos_sim = cosine_similarity(embedding_pair[0],
                                            embedding_pair[1])
                fp.write("%s\t%s\t%f\n" % (tpair[0], tpair[1], cos_sim))
Example #12
def model_training(
    train_data_path,
    evaluator_path,
    model_name,
    output_path,
    train_batch_size,
    num_epochs,
    samples_per_label,
):

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        handlers=[LoggingHandler()],
    )

    output_path = (output_path + datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))

    os.makedirs(output_path, exist_ok=True)

    # You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    # model_name = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext/'

    ### Create a torch.DataLoader that passes training batch instances to our model

    logging.info("Loading training dataset")
    train_set = read_dataset(train_data_path)

    # Load pretrained model
    word_embedding_model = models.Transformer(model_name)
    # tokenizer_args={"additional_special_tokens": ['<e>', '</e>']})

    # word_embedding_model.auto_model.resize_token_embeddings(
    #     len(word_embedding_model.tokenizer))

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    # pooling_mode_mean_mark_tokens=True)

    # dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=2048, activation_function=nn.Tanh())

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    model.max_seq_length = 16

    logging.info("Read concept normalization training dataset")

    #### try different sample size ####

    train_data_sampler = SentenceLabelDataset(
        examples=train_set, samples_per_label=samples_per_label)

    ##### Try whether to shuffle  #####  By default, it should not be shuffled every epoch

    train_dataloader = DataLoader(train_data_sampler,
                                  batch_size=train_batch_size,
                                  drop_last=True)

    ### Triplet losses ####################
    ### There are 4 triplet loss variants:
    ### - BatchHardTripletLoss
    ### - BatchHardSoftMarginTripletLoss
    ### - BatchSemiHardTripletLoss
    ### - BatchAllTripletLoss
    #######################################

    # train_loss = losses.BatchAllTripletLoss(model=model)
    #train_loss = losses.BatchHardTripletLoss(sentence_embedder=model)
    train_loss = losses.BatchHardSoftMarginTripletLoss(model)
    #train_loss = losses.BatchSemiHardTripletLoss(sentence_embedder=model)

    # evaluator = []

    logging.info("Read concept normalization val dataset")

    ir_queries = read.read_from_json(
        os.path.join(evaluator_path, "dev_queries"))
    ir_corpus = read.read_from_json(os.path.join(evaluator_path, "corpus"))
    ir_relevant_docs = read.read_from_json(
        os.path.join(evaluator_path, "dev_relevant_docs"))
    ir_evaluator_n2c2_dev = evaluation.InformationRetrievalEvaluator(
        ir_queries,
        ir_corpus,
        ir_relevant_docs,
        corpus_chunk_size=300000,
        name="evaluation_results",
        map_at_k=[1, 3, 5, 10],
        batch_size=1024,
        show_progress_bar=True)

    # evaluator.append(ir_evaluator_n2c2_dev)
    # Create a SequentialEvaluator. This SequentialEvaluator runs all three evaluators in a sequential order.
    # We optimize the model with respect to the score from the last evaluator (scores[-1])
    # seq_evaluator = evaluation.SequentialEvaluator(evaluator, main_score_function=lambda scores: scores[1])

    logging.info("Performance before fine-tuning:")
    ir_evaluator_n2c2_dev(model)

    # warmup_steps = int(
    #     len(train_dataset) * num_epochs / train_batch_size * 0.1
    # )  # 10% of train data
    warmup_steps = 0

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        # evaluator = None,
        evaluator=ir_evaluator_n2c2_dev,
        output_path_ignore_not_empty=True,
        optimizer_params={
            'lr': 1e-4,
            'eps': 1e-6,
            'correct_bias': False
        },
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        output_path=output_path,
    )
Example #13
def mount_graph(df, path_to_language_model="../language_model"):
    print("Creating graph...")
    np.set_printoptions(threshold=100)
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    language_model = SentenceTransformer(path_to_language_model)

    df['embedding'] = list(language_model.encode(df['text'].to_list()))
    df = df.loc[~df['Themes'].isna()]

    G = nx.Graph()

    for index, row in df.iterrows():

        node_id = row['GKGRECORDID']
        node_date = str(row['DATE'])[0:8]
        node_themes_array = row['Themes'].split(';')
        node_locations_array = ''
        node_persons_array = ''
        node_organizations_array = ''

        try:
            node_locations_array = row['Locations'].split(';')
            node_persons_array = row['Persons'].split(';')
            node_organizations_array = row['Organizations'].split(';')
        except Exception:
            pass

        # event <-> date
        G.add_edge(node_id, node_date)
        G.nodes[node_id]['themes'] = node_themes_array
        G.nodes[node_date]['themes'] = node_themes_array

        # event <-> theme
        for theme in node_themes_array:
            if len(theme) > 0:
                G.add_edge(node_id, theme)
                G.nodes[theme]['themes'] = node_themes_array

        # event <-> locations
        for location in node_locations_array:
            if len(location) > 0:
                G.add_edge(node_id, location)
                G.nodes[location]['themes'] = node_themes_array

        # event <-> persons
        for person in node_persons_array:
            if len(person) > 0:
                G.add_edge(node_id, person)
                G.nodes[person]['themes'] = node_themes_array

        # event <-> organization
        for org in node_organizations_array:
            if len(org) > 0:
                G.add_edge(node_id, org)
                G.nodes[org]['themes'] = node_themes_array

        # embedding
        G.nodes[node_id]['embedding'] = row['embedding']

    # We'll relabel our nodes, since their names are not convenient...
    mapping = {value: idx for idx, value in enumerate(G.nodes())}
    G = nx.relabel_nodes(G, mapping=mapping, copy=True)

    print(
        f"Graph loaded: OK - \t Nodes: {len(G.nodes)} \t  edges: {len(G.edges)}"
    )
    return G
Example #14
def train_nli():

    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    #model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    model_name = 'pretrained_model/bert-base-uncased'

    # Read the dataset
    train_batch_size = 6
    nli_reader = NLIDataReader('./examples/datasets/AllNLI')
    sts_reader = STSBenchmarkDataReader('./examples/datasets/stsbenchmark')
    train_num_labels = nli_reader.get_num_labels()
    model_save_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                pooling_mode_mean_tokens=True,
                                pooling_mode_cls_token=False,
                                pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_dataset = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)



    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = LabelAccuracyEvaluator(dev_dataloader,softmax_model = Softmax_label(model = model,
                                                                                    sentence_embedding_dimension = model.get_sentence_embedding_dimension(),
                                                                                    num_labels = train_num_labels))


    # Configure the training
    num_epochs = 1

    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))



    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=100,
            warmup_steps=warmup_steps,
            output_path=model_save_path
            )



    ##############################################################################
    #
    # Build the STS benchmark test set; model loading and the STS evaluator are
    # left commented out below, so the dev label-accuracy evaluator is reused
    #
    ##############################################################################

    #model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size)
    #evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

    model.evaluate(evaluator)
Example #15
def train_self():


    train_batch_size = 8
    num_epochs = 50
    device = 'cuda:0'
    train_num_labels = 6
    evaluation_steps = 1000
    local = True

    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    # model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    #model_name = 'bert-base-chinese'
    model_name = './pretrained_model/bert-base-chinese'
    #train_batch_size = config.train_batch_size

    self_reader = Self_csv_DataReader('./self_dataset',local = local)
    #train_num_labels = config.train_num_labels
    model_save_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name,cache_dir = './pretrained_model')

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                pooling_mode_mean_tokens=True,
                                pooling_mode_cls_token=False,
                                pooling_mode_max_tokens=False)

    model = SentenceTransformer_NoPooling(modules=[word_embedding_model])#, pooling_model])


    # Convert the dataset to a DataLoader ready for training
    logging.info("Read self train dataset")
    train_dataset = SentencesDataset(examples=self_reader.get_examples("train.csv"), model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_word_embedding_dimension(), num_labels=train_num_labels)



    logging.info("Read self dev dataset")
    dev_data = SentencesDataset(examples=self_reader.get_examples('dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = LabelAccuracyEvaluator(dev_dataloader,softmax_model = Softmax_label(model = model,
                                                                                    sentence_embedding_dimension = model.get_word_embedding_dimension(),
                                                                                    num_labels = train_num_labels))



    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))



    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            evaluation_steps=evaluation_steps,
            warmup_steps=warmup_steps,
            output_path=model_save_path
            )