def langAgnosticInit(files_dir, langs):
    """Load the questions for all languages and push them as one shared base.

    After a fixed 30 second pause, the questions (and, when ``class_support``
    is set, their classes) are loaded from ``files_dir`` for ``langs`` into
    the module-level globals. The combined ``all_questions`` list is then
    embedded and uploaded under the ``'en'`` key, first for LASER and, when
    ``is_bert_supported`` is set, a second time for BERT.

    Args:
        files_dir: location handed to the question/class loaders.
        langs: iterable of language keys to load.

    Returns:
        None. The results are the updated globals and the uploaded vectors.
    """
    global lang_questions, all_questions, lang_classes, all_classes

    # Sleep 30 seconds to allow BERT to be up and running.
    time.sleep(30)

    if class_support:
        lang_questions, lang_classes = loadTextAndClassesFromFiles(
            files_dir, langs)
        # Append every loaded language to the combined question/class lists.
        for code in langs:
            if lang_questions.get(code) is None:
                continue
            all_questions = all_questions + list(lang_questions.get(code))
            all_classes = all_classes + list(lang_classes.get(code))
    else:
        lang_questions, all_questions = loadQuestionsFromFile(files_dir, langs)

    # LASER pass: always executed.
    laser_vectors = retrieveEmbeddings(embed_service, all_questions, 'en')
    tot_size = uploadQuestionsToService(compare_service, laser_vectors, 'en')
    logging.warning(
        "{:d} questions for {} language were loaded to the system for LASER".
        format(tot_size, 'general'))

    if not is_bert_supported:
        return

    # BERT pass: same questions, embedded and uploaded with the BERT flag.
    bert_vectors = retrieveEmbeddings(embed_service, all_questions, 'en',
                                      is_bert_supported)
    tot_size = uploadQuestionsToService(compare_service, bert_vectors, 'en',
                                        is_bert_supported)
    logging.warning(
        "{:d} questions for {} language were loaded to the system for BERT"
        .format(tot_size, 'general'))
def compareSentence(lang):
    """Compare a sentence with the stored sentences and return similar ones.

    Endpoint handler that takes a sentence in JSON format and compares it to
    the previously uploaded sentences for ``lang``.
    The input JSON format should be:
    {
        "sentence": "<sentence>",
        "topk": "<topk>",       ---> optional parameter. Default is 3
        "isbert": True/False    ---> optional parameter. Default is False.
    }

    Args:
        lang: language key; appended to the compare URL and used to look up
            the stored sentences in ``lang_questions``.

    Returns:
        A JSON response with one of these status codes:
        200 - ``{"results": [...]}`` with the stored sentences at the indices
              returned by the compare service.
        400 - the body is not a JSON object with a "sentence" field, or
              "topk" cannot be converted to an integer.
        415 - the request mime type is not application/json.
        500 - any failure while embedding, comparing or resolving results.
    """
    if not request.is_json:
        response = jsonify(
            {"message": "ERROR. The mime type needs to be application/json"})
        response.status_code = 415
        return response

    data = request.get_json()

    # Reject malformed payloads explicitly; previously a missing field or a
    # non-numeric topk raised outside the try block and went unhandled.
    if not isinstance(data, dict) or 'sentence' not in data:
        response = jsonify(
            {"message": "ERROR. The JSON body needs a 'sentence' field"})
        response.status_code = 400
        return response
    try:
        topk = int(data.get('topk', 3))
    except (TypeError, ValueError):
        response = jsonify(
            {"message": "ERROR. The 'topk' field needs to be an integer"})
        response.status_code = 400
        return response

    sentence = data['sentence']
    # Equality (not identity) on purpose: a JSON 1 keeps counting as true,
    # exactly as before.
    isbert = data.get('isbert', False) == True  # noqa: E712

    try:
        # Embed the sentence, then ask the compare service for the closest
        # stored vectors.
        embedding = retrieveEmbeddings(embed_service, sentence, lang, isbert)

        urlstr = compare_service.strip('/') + '/compare' + '/' + lang
        input_text = {'embedding': embedding, 'topk': topk}
        if isbert:
            input_text.update(isbert=True)
        result_response = requests.post(urlstr, json=input_text).json()

        # The service answers with indices; map them back to the sentences.
        ids = result_response['results']
        known_sentences = lang_questions.get(lang)
        results = [known_sentences[i] for i in ids]

        response = jsonify({"results": results})
        response.status_code = 200
    except Exception as e:
        # Endpoint boundary: report the failure as JSON instead of crashing.
        response = jsonify({
            "message": "An error occured while searching for results.",
            "exception ": str(e)
        })
        logging.error("exception {}, stacktrace {}".format(
            str(e), str(traceback.format_exc())))
        response.status_code = 500
    return response
def reloadBase(lang):
    """Replace the comparison base for ``lang`` with a new list of sentences.

    Endpoint handler that takes a list of sentences in JSON format and
    reinitializes the vector base. BERT support is enabled by CLI argument
    (read here from ``is_bert_supported``).
    The input JSON format should be:
    {
        "sentences": "[sentences]"
    }

    Args:
        lang: language key; appended to the clearbase URL, used as the key in
            ``lang_questions`` and passed to the embedding/upload helpers.

    Returns:
        A JSON response with one of these status codes:
        200 - ``{"vectorsize": <size>}`` with the size reported by the last
              upload (BERT when enabled, otherwise LASER).
        400 - the body is not a JSON object with a "sentences" field.
        415 - the request mime type is not application/json.
        500 - any failure while embedding, clearing or uploading.
    """
    if not request.is_json:
        response = jsonify(
            {"message": "ERROR. The mime type needs to be application/json"})
        response.status_code = 415
        return response

    data = request.get_json()

    # Reject malformed payloads explicitly; previously a missing field raised
    # outside the try block and went unhandled.
    if not isinstance(data, dict) or 'sentences' not in data:
        response = jsonify(
            {"message": "ERROR. The JSON body needs a 'sentences' field"})
        response.status_code = 400
        return response
    sentences = data['sentences']

    try:
        # Compute every embedding BEFORE anything is cleared, so a failing
        # embedding request cannot leave an already-wiped base behind.
        laser_embeddings = retrieveEmbeddings(embed_service, sentences, lang)
        bert_embeddings = None
        if is_bert_supported:
            bert_embeddings = retrieveEmbeddings(embed_service, sentences,
                                                 lang, is_bert_supported)

        # Clear the current base questions.
        urlstr = compare_service.strip('/') + '/clearbase' + '/' + lang
        clear_resp = requests.post(urlstr, json={'isbert': False}).json()
        v_size = clear_resp['vectorsize']
        # A non-zero size after clearing is only logged, not treated as fatal.
        if int(v_size) != 0:
            logging.warning("Vectorsize is {}, System is LASER".format(v_size))
        if is_bert_supported:
            clear_resp = requests.post(
                urlstr, json={'isbert': is_bert_supported}).json()
            v_size = clear_resp['vectorsize']
            if int(v_size) != 0:
                logging.warning(
                    "Vectorsize is {}, System is BERT".format(v_size))

        # Remember the new sentences, then upload their vectors.
        lang_questions[lang] = sentences
        logging.warning("number of questions for {} is {}".format(
            lang, len(lang_questions[lang])))
        tot_size = uploadQuestionsToService(compare_service, laser_embeddings,
                                            lang)
        if is_bert_supported:
            tot_size = uploadQuestionsToService(compare_service,
                                                bert_embeddings, lang,
                                                is_bert_supported)
        logging.warning(
            "{:d} questions for {} language were loaded to the system".format(
                tot_size, lang))

        response = jsonify({"vectorsize": tot_size})
        response.status_code = 200
    except Exception as e:
        # Endpoint boundary: report the failure as JSON instead of crashing.
        # The message used to be copied from the search endpoint.
        response = jsonify({
            "message": "An error occurred while reloading the base.",
            "exception ": str(e)
        })
        logging.error("exception {}, stacktrace {}".format(
            str(e), str(traceback.format_exc())))
        response.status_code = 500
    return response