def transform(self, texts):
    """Encode ``texts`` with a Sentence-BERT model and return the vectors.

    Args:
        texts: The sentences to embed; passed straight to the model's
            ``encode`` method.

    Returns:
        A NumPy array built from the model's encodings of ``texts``.
    """
    # NOTE(review): presumably verifies that sentence_transformers is
    # importable -- the helper is defined outside this block; confirm.
    _check_st()
    # A fresh model is instantiated on every call.
    encoder = SentenceTransformer('bert-base-nli-mean-tokens')
    embeddings = encoder.encode(texts)
    return np.array(embeddings)
def transform(self, texts):
    """Encode ``texts`` with a Sentence-BERT model and return the vectors.

    The ``sentence_transformers`` package is imported lazily; when it is
    missing, an installation hint is printed and the process exits with
    status 192.

    Args:
        texts: The sentences to embed; passed straight to the model's
            ``encode`` method.

    Returns:
        A NumPy array built from the model's encodings of ``texts``.
    """
    try:
        from sentence_transformers.SentenceTransformer import (  # noqa
            SentenceTransformer,
        )
    except ImportError:
        print("Error: install sentence_transformers package "
              "(`pip install sentence_transformers`)"
              " to use Sentence BERT.")
        sys.exit(192)

    # A fresh model is instantiated on every call.
    encoder = SentenceTransformer('bert-base-nli-mean-tokens')
    embeddings = encoder.encode(texts)
    return np.array(embeddings)
def __init__(self,
             faq_path: str,
             faq_data: dict = None,
             model_path: str = None):
    """Load the embedding model and load (or build) the FAQ stored at ``faq_path``.

    Args:
        faq_path: Directory holding the processed FAQ. When it already
            contains the processed question/answer files they are loaded
            and ``faq_data`` is ignored; otherwise the FAQ is built from
            ``faq_data`` via ``make_faq``.
        faq_data: Required when no processed FAQ exists at ``faq_path``.
            A dict with two keys::

                q2l = {"How are you doing ": 1, "where are you ? ": 3}
                a2l = {"I am fine": 1, "I am in India": 3}
                faq_data = {"question_to_label": q2l,
                            "answer_to_label": a2l}

            ``question_to_label`` maps question -> label (int); several
            questions may share a label.  ``answer_to_label`` maps
            answer -> label with exactly one answer per label.
        model_path: Path of a previously saved model.  When ``None`` the
            pretrained ``'roberta-base-nli-stsb-mean-tokens'`` model is
            used.

    Raises:
        AssertionError: If no processed FAQ exists at ``faq_path`` and
            ``faq_data`` was not supplied.
    """
    if model_path is None:
        model_path = 'roberta-base-nli-stsb-mean-tokens'
    self.model = SentenceTransformer(model_path)

    # Populated by fit_FAQ(); stays None until then.
    self.current_faq = None
    self.faq_path = faq_path
    # All augmented and original questions mapped to their labels.
    self.question_to_label = {}
    # Mapping from answers to labels.
    self.answer_to_label = {}

    if self.check_faq_path():
        print("found preexisiting faq data , loading dicts from the same")
        que_path = os.path.join(self.faq_path, "questions.pkl")
        self.question_to_label = load_dict(que_path)
        ans_path = os.path.join(self.faq_path, "answers.pkl")
        self.answer_to_label = load_dict(ans_path)
        return

    # Raised explicitly (not via ``assert``) so the check survives ``-O``;
    # otherwise make_faq(None) would wipe the FAQ directory before failing.
    # The message reports the FAQ path -- faq_data is always None here.
    if faq_data is None:
        raise AssertionError(
            "Did not find and preexisting of {} so you must provide faq_data"
            .format(self.faq_path))
    self.make_faq(faq_data)
class modelInterface:
    """FAQ answering on top of a sentence-embedding model.

    Questions (original and augmented) are mapped to integer labels and
    each label maps to exactly one answer.  Processed data lives in the
    ``faq_path`` directory as ``questions.pkl``, ``answers.pkl`` and,
    after ``fit_FAQ``, ``fit.pkl``.

    NOTE(review): the ``.pkl`` files are read through ``load_dict``, which
    is presumably pickle-based -- only point ``faq_path`` at trusted data.
    """

    def __init__(self,
                 faq_path: str,
                 faq_data: dict = None,
                 model_path: str = None):
        """Load the embedding model and load (or build) the FAQ at ``faq_path``.

        Args:
            faq_path: Directory holding the processed FAQ.  When it already
                contains ``questions.pkl`` and ``answers.pkl`` they are
                loaded and ``faq_data`` is ignored; otherwise the FAQ is
                built from ``faq_data`` via ``make_faq``.
            faq_data: Required when no processed FAQ exists at
                ``faq_path``.  A dict with two keys::

                    q2l = {"How are you doing ": 1, "where are you ? ": 3}
                    a2l = {"I am fine": 1, "I am in India": 3}
                    faq_data = {"question_to_label": q2l,
                                "answer_to_label": a2l}

                ``question_to_label`` maps question -> label (int); several
                questions may share a label.  ``answer_to_label`` maps
                answer -> label with exactly one answer per label.
            model_path: Path of a previously saved model.  When ``None``
                the pretrained ``'roberta-base-nli-stsb-mean-tokens'``
                model is used.

        Raises:
            AssertionError: If no processed FAQ exists at ``faq_path`` and
                ``faq_data`` was not supplied.
        """
        if model_path is None:
            model_path = 'roberta-base-nli-stsb-mean-tokens'
        self.model = SentenceTransformer(model_path)

        # Populated by fit_FAQ() with the keys 'embeddings', 'labels',
        # 'label_to_answer' and 'question_to_label'; None until then.
        self.current_faq = None
        self.faq_path = faq_path
        # All augmented and original questions mapped to their labels.
        self.question_to_label = {}
        # Mapping from answers to labels.
        self.answer_to_label = {}

        if self.check_faq_path():
            print("found preexisiting faq data , loading dicts from the same")
            que_path = os.path.join(self.faq_path, "questions.pkl")
            self.question_to_label = load_dict(que_path)
            ans_path = os.path.join(self.faq_path, "answers.pkl")
            self.answer_to_label = load_dict(ans_path)
            return

        # Raised explicitly (not via ``assert``) so the check survives
        # ``-O``; otherwise make_faq(None) would wipe the FAQ directory
        # before failing.  The message reports the FAQ path -- faq_data is
        # always None here.
        if faq_data is None:
            raise AssertionError(
                "Did not find and preexisting of {} so you must provide faq_data"
                .format(self.faq_path))
        self.make_faq(faq_data)

    def check_faq_path(self):
        """Return True when ``faq_path`` holds both processed FAQ files."""
        if not os.path.exists(self.faq_path):
            return False
        required = ("questions.pkl", "answers.pkl")
        return all(
            os.path.exists(os.path.join(self.faq_path, name))
            for name in required)

    def destroy_faq(self):
        """Delete the known FAQ files and then the ``faq_path`` directory.

        Does nothing when the directory does not exist.  ``os.rmdir`` raises
        ``OSError`` if the directory still contains other files.
        """
        if not os.path.exists(self.faq_path):
            return
        for name in ("questions.pkl", "answers.pkl", "fit.pkl"):
            pth = os.path.join(self.faq_path, name)
            if os.path.exists(pth):
                os.remove(pth)
        os.rmdir(self.faq_path)

    def make_faq(self, FAQ: dict):
        """Rebuild the FAQ directory from ``FAQ``, augmenting the questions.

        Args:
            FAQ: Dict with the keys ``'question_to_label'`` (question ->
                int label, several questions may share a label) and
                ``'answer_to_label'`` (answer -> label, one answer per
                label).

        Raises:
            KeyError: If either key is missing from ``FAQ``.
        """
        # Read the inputs first so a malformed FAQ dict fails before the
        # existing on-disk FAQ is destroyed.
        question_to_label = FAQ['question_to_label']
        answer_to_label = FAQ['answer_to_label']

        self.destroy_faq()
        os.mkdir(self.faq_path)

        q_labels = set(question_to_label.values())
        for l in answer_to_label.values():
            if l not in q_labels:
                warnings.warn(
                    "Some labels in answers are not in the questions, these answers will never be a part of answers from the FAQ !!!"
                )
                print("label {} not in questions".format(l))

        # Mapping from each original question to a list of generated ones.
        generated_dict = multiProcessControl(producer_classes,
                                             list(question_to_label))
        aug_question_to_label = {}
        for que, label in question_to_label.items():
            aug_question_to_label[que] = label
            if que not in generated_dict:
                print(
                    "Some of the questions in the FAQ are missing after generation"
                )
                continue
            # A generated question identical to an existing key simply
            # overwrites that key's label.
            for generated in generated_dict[que]:
                aug_question_to_label[generated] = label

        self.question_to_label = aug_question_to_label
        self.answer_to_label = answer_to_label
        save_dict(self.question_to_label,
                  os.path.join(self.faq_path, "questions.pkl"))
        save_dict(self.answer_to_label,
                  os.path.join(self.faq_path, 'answers.pkl'))

    def train(self, model_save_path, data=None):
        """Fine-tune the model with a batch-hard triplet loss and save it.

        Args:
            model_save_path: Folder to save the model to, e.g.
                ``'./models/model_first'``.  The original author notes it
                should be empty or not yet created.
            data: Keyword arguments for ``get_dataloader``::

                    {'question_to_label': mapping from question to label,
                     'bs': batch size for training,
                     'n': number of classes to sample per batch
                          (bs % n == 0)}

                When ``None``, the stored ``question_to_label`` is used
                with ``bs=32`` and ``n=4``.  The dict is not modified.
        """
        if data is None:
            data = {
                'question_to_label': self.question_to_label,
                "bs": 32,
                "n": 4
            }
        # Copy so the caller's dict does not gain a 'model' entry.
        loader_kwargs = dict(data)
        loader_kwargs['model'] = self.model
        train_dataloader = get_dataloader(**loader_kwargs)
        train_loss = losses.BatchHardTripletLoss(sentence_embedder=self.model)

        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=1,
            evaluator=None,
            output_path=model_save_path,
        )

    def cosine_sim(self, v, V):
        """Return pairwise cosine similarities between the rows of v and V.

        Args:
            v: 2-D array of shape (n, E).
            V: 2-D array of shape (N, E).

        Returns:
            Array of shape (n, N).
        """
        v = np.asarray(v)
        V = np.asarray(V)
        n1 = np.linalg.norm(v, axis=-1)
        n2 = np.linalg.norm(V, axis=-1)
        # A matrix product yields the (n, N) dot products directly, without
        # materialising an (n, N, E) intermediate.
        dot = v @ V.T
        return dot / n1.reshape(-1, 1) / n2.reshape(1, -1)

    def evaluate(self, data, K=5, cutoff=.6):
        """Return the fraction of test questions whose label is predicted.

        The labels must be the same ones the FAQ was fitted on.

        Args:
            data: Dict mapping test question -> expected label.
            K: Number of nearest questions passed to ``answer_question``.
            cutoff: Similarity cutoff passed to ``answer_question``.

        Returns:
            Accuracy in [0, 1]; ``0.0`` when ``data`` is empty.
        """
        if not data:
            return 0.0
        correct = 0
        for q, l in data.items():
            _, predicted_label = self.answer_question(q,
                                                      verbose=False,
                                                      K=K,
                                                      cutoff=cutoff)
            if int(predicted_label) == int(l):
                correct += 1
        return correct / len(data)

    def unfit_FAQ(self):
        """Remove the cached ``fit.pkl`` so ``fit_FAQ`` recomputes it."""
        savepath = os.path.join(self.faq_path, "fit.pkl")
        if os.path.exists(savepath):
            os.remove(savepath)

    def fit_FAQ(self):
        """Embed all FAQ questions and cache the result in ``fit.pkl``.

        When ``fit.pkl`` already exists it is loaded instead of being
        recomputed.  After training a new model, call ``unfit_FAQ`` first
        so the embeddings are regenerated.

        Raises:
            AssertionError: If two answers share the same label.
        """
        save_path = os.path.join(self.faq_path, "fit.pkl")
        if os.path.exists(save_path):
            warnings.warn(
                "Found existing fit.pkl loading diles from there ..... if you have trained the model recently and want to use that model to fit, please call unfit_FAQ... "
            )
            self.current_faq = load_dict(save_path)
            return

        question_to_label = self.question_to_label
        answer_to_label = self.answer_to_label

        questions = list(question_to_label)
        labels = [int(l) for l in question_to_label.values()]
        # Set for O(1) membership tests while inverting the answers.
        known_labels = set(labels)

        # Invert answer_to_label; every label must have a single answer.
        label_to_answer = {}
        for answer, label in answer_to_label.items():
            if label in label_to_answer:
                raise AssertionError('multiple answers have the same labels')
            label_to_answer[label] = answer
            if label not in known_labels:
                print("{} label present in answer but not in question".format(
                    label))
                warnings.warn(
                    'some labels in answers are not present in questions , you might not have labels in Sync'
                )

        for l in labels:
            if l not in label_to_answer:
                warnings.warn(
                    'some labels in question are not present in answers ,this might cause runtime errors later, you might not have labels in Sync'
                )

        self.current_faq = {
            'embeddings': np.array(self.model.encode(questions)),
            'labels': labels,
            'label_to_answer': label_to_answer,
            'question_to_label': question_to_label
        }
        save_dict(self.current_faq, save_path)

    def answer_question(self, question, K=1, cutoff=.3, verbose=True):
        """Answer ``question`` from the fitted FAQ.

        The K most similar stored questions vote for their labels, each
        vote weighted by its cosine similarity; the label with the largest
        summed similarity wins.  ``fit_FAQ`` must have been called first.

        Args:
            question: The question string.
            K: Number of nearest stored questions that vote.
            cutoff: If the best similarity is below this value the
                question is treated as out of set.
            verbose: Print the scores and the matching stored questions.

        Returns:
            ``(answer, label)``.  ``("out of set question", -1)`` when the
            best similarity is below ``cutoff``; ``(error message, label)``
            when the winning label has no answer.

        Raises:
            AssertionError: If ``fit_FAQ`` has not been called.
        """
        if self.current_faq is None:
            raise AssertionError(
                'Need to fit_FAQ before calling answer_question')

        embeddings = self.current_faq['embeddings']
        question_labels = self.current_faq['labels']
        label_to_answer = self.current_faq['label_to_answer']
        question_to_label = self.current_faq['question_to_label']

        # Shape (1, embedding_dim).
        query = self.model.encode([question])[0].reshape(1, -1)
        # Similarity to every stored question, shape (N,).
        sims = self.cosine_sim(query, embeddings)[0].tolist()

        # Indices of the K most similar stored questions, best first.
        top = sorted(range(len(sims)), key=sims.__getitem__,
                     reverse=True)[:K]
        max_val = sims[top[0]]
        if max_val < cutoff:
            return "out of set question", -1

        label_to_conf = {}
        ans = -1
        mx = -1
        for idx in top:
            l = question_labels[idx]
            label_to_conf[l] = label_to_conf.get(l, 0) + sims[idx]
            if label_to_conf[l] > mx:
                mx = label_to_conf[l]
                ans = l

        if ans not in label_to_answer:
            # Return a tuple like every other path so callers that unpack
            # the result (e.g. evaluate) keep working.
            return ('No answer corrosponding to the label {} , this means your question--label--answer dict is faulty '
                    .format(ans), ans)

        if verbose:
            print(max_val)
            print(label_to_conf)
            print("MAX label is {}".format(ans))
            for que, lab in question_to_label.items():
                if lab == ans:
                    print("Answering {}".format(que))

        return label_to_answer[ans], ans