def execute_pipeline(query, n_predictions=None):
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(reader="models/bert_qa_vCPU-sklearn.joblib")
    cdqa_pipeline.fit_retriever(df)

    if torch.cuda.is_available():
        cdqa_pipeline.cuda()

    if n_predictions is not None:
        predictions = cdqa_pipeline.predict(query, n_predictions=n_predictions)
        result = []
        for answer, title, paragraph, score in predictions:
            prediction = (answer, title)
            result.append(prediction)
        return result
    else:
        prediction = cdqa_pipeline.predict(query)
        result = (prediction[0], prediction[1])
        return result
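execute_pipeline above relies on several imports that the snippet does not show. A minimal sketch of those imports and a call follows; the __main__ block and the example question (borrowed from the test snippet below) are illustrative, not part of the source.

from ast import literal_eval

import pandas as pd
import torch

from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_bnpp_data, download_model
from cdqa.utils.filters import filter_paragraphs

if __name__ == "__main__":
    question = "Since when does the Excellence Program of BNP Paribas exist?"
    # Single best answer: returns an (answer, title) tuple
    print(execute_pipeline(question))
    # Top-3 candidates: returns a list of (answer, title) tuples
    print(execute_pipeline(question, n_predictions=3))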
def test_evaluate_pipeline():
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    test_data = {
        "data": [{
            "title": "BNP Paribas’ commitment to universities and schools",
            "paragraphs": [{
                "context": "Since January 2016, BNP Paribas has offered an Excellence Program targeting new Master’s level graduates (BAC+5) who show high potential. The aid program lasts 18 months and comprises three assignments of six months each. It serves as a strong career accelerator that enables participants to access high-level management positions at a faster rate. The program allows participants to discover the BNP Paribas Group and its various entities in France and abroad, build an internal and external network by working on different assignments and receive personalized assistance from a mentor and coaching firm at every step along the way.",
                "qas": [{
                    "answers": [
                        {"answer_start": 6, "text": "January 2016"},
                        {"answer_start": 6, "text": "January 2016"},
                        {"answer_start": 6, "text": "January 2016"},
                    ],
                    "question": "Since when does the Excellence Program of BNP Paribas exist?",
                    "id": "56be4db0acb8001400a502ec",
                }],
            }],
        }],
        "version": "1.1",
    }

    with open("./test_data.json", "w") as f:
        json.dump(test_data, f)

    cdqa_pipeline = QAPipeline(reader="./models/bert_qa_vCPU-sklearn.joblib", n_jobs=-1)
    cdqa_pipeline.fit_retriever(X=df)

    eval_dict = evaluate_pipeline(cdqa_pipeline, "./test_data.json", output_dir=None)

    assert eval_dict["exact_match"] > 0.8
    assert eval_dict["f1"] > 0.8
def fine_tuning_drive(question, file_name):
    # Download the document from cloud storage and convert it to a DataFrame of paragraphs
    storage.child("docs/" + file_name).download("/docs/", "docs/" + file_name)
    df = pdf_converter(directory_path="docs/")
    pd.set_option('display.max_colwidth', -1)
    df.head()

    # Fit the retriever on the converted document and cache the fitted pipeline
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    cdqa_pipeline.fit_retriever(df=df)
    joblib.dump(cdqa_pipeline, './models/bert_qa_custom.joblib')

    # Reload the cached pipeline and ask for the single best answer
    cdqa_pipeline = joblib.load('./models/bert_qa_custom.joblib')
    prediction = cdqa_pipeline.predict(question, 1)

    # Clean up the downloaded file
    os.remove("docs/" + file_name)
    return prediction
def qna(query):
    # Convert the PDFs under ./media/pdf into a DataFrame of paragraphs
    df = pdf_converter(directory_path='./media/pdf')
    df.head()

    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

    # Fit Retriever to documents
    cdqa_pipeline.fit_retriever(df=df)

    # INPUT QUESTION
    print("\n\n\n", query)
    # query = 'when was the second Indian Factory Act passed?'
    prediction = cdqa_pipeline.predict(query)

    # ans = 'query: {}\n \nanswer: {} \ntitle: {} \nparagraph: {}'.format(query, prediction[0], prediction[1], prediction[2])
    ans = [query, prediction[0], prediction[1], prediction[2]]
    return ans
def post(self):
    # Parse the incoming query from the request body
    parser = reqparse.RequestParser()
    parser.add_argument('query', type=str, required=True)
    args = parser.parse_args()

    # Build the pipeline from the PDFs on disk and answer the query
    df = pdf_converter(directory_path='./data/pdf/')
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    cdqa_pipeline.fit_retriever(df=df)
    prediction = cdqa_pipeline.predict(args.query)

    return {'data': prediction}, 200
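The post handler above is a flask_restful Resource method; the enclosing class and app wiring are not shown in the source. A minimal sketch of how it might be registered, assuming a hypothetical class name CdqaQuery and the handler body above:

from flask import Flask
from flask_restful import Api, Resource, reqparse

app = Flask(__name__)
api = Api(app)


class CdqaQuery(Resource):  # hypothetical name; the source only shows the post() body
    def post(self):
        ...  # body as defined above (uses reqparse)


# Exposes POST /query expecting a "query" field in the request body
api.add_resource(CdqaQuery, '/query')

if __name__ == '__main__':
    app.run(debug=True)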
def max_qa_bot(query):
    # df = pdf_converter(directory_path='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/pdf_files')
    df = pd.read_csv(
        'C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/data/data.csv',
        converters={'paragraphs': literal_eval})
    # df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(
        reader='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/models/bert_qa_vCPU-sklearn.joblib'
    )
    cdqa_pipeline.fit_retriever(df=df)

    # Optional speech-to-text input (disabled):
    # recognizer = sr.Recognizer()
    # # recognizer.pause_threshold = 5.0
    # with sr.Microphone() as source:
    #     # print("[search edureka: search youtube]")
    #     print("Speak Now")
    #     audio = recognizer.listen(source)
    #     query = recognizer.recognize_google(audio).capitalize()
    #     print(query)
    # query = "What is td ameritrade"

    prediction = cdqa_pipeline.predict(query)

    # print('query: {}\n'.format(query))
    # print('answer: {}\n'.format(prediction[0]))
    # print('title: {}\n'.format(prediction[1]))
    # print('paragraph: {}\n'.format(prediction[2]))

    # Optional text-to-speech output (disabled):
    # engine = pyttsx3.init()
    # david = "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\TTS_MS_EN-US_DAVID_11.0"
    # zira = "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\TTS_MS_EN-US_ZIRA_11.0"
    # engine.setProperty('rate', 150)
    # engine.setProperty('volume', 1.0)
    # engine.setProperty('voice', david)
    # engine.say(prediction[2])
    # engine.runAndWait()
    # engine.stop()

    # result = ('Question: {}\n'.format(query).capitalize()) + ('Answer: {}\n'.format(prediction[0]).capitalize()) + ('Subject: {}\n'.format(prediction[1]).capitalize()) + ('Paragraph: {}\n'.format(prediction[2]).capitalize())
    result = prediction[2].capitalize()
    return result
from cdqa.pipeline import QAPipeline
from cdqa.utils.converters import pdf_converter
from cdqa.utils.download import download_model

# Download model
download_model(model='bert-squad_1.1', dir='./models')

# INPUT PDFs
# Here path is the folder of the PDFs to be used
df = pdf_converter(
    directory_path='C:/Users/Viswash/Desktop/Work/ChatBot/Research/Papers/')
df.head()

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)

# INPUT QUESTION
query = 'when was the second Indian Factory Act passed?'
prediction = cdqa_pipeline.predict(query)

ans = 'query: {} \nanswer: {} \ntitle: {} \nparagraph: {}'.format(
    query, prediction[0], prediction[1], prediction[2])
print(ans)

# OUTPUT
# print('query: {}'.format(query))
# print('answer: {}'.format(prediction[0]))
# print('title: {}'.format(prediction[1]))
# print('paragraph: {}'.format(prediction[2]))
def indexdq(request):
    if request.POST:
        if 'file' in request.FILES:
            # Save the uploaded PDF under media/<user_id>/pdfs/
            request.session['proj_id'] = request.POST['proj_id']
            uploaded_file = request.FILES['file']
            request.session['name'] = uploaded_file.name.split(".")[0]
            fs = FileSystemStorage()
            if not os.path.exists("media/" + str(request.user.id)):
                os.makedirs("media/" + str(request.user.id))
            filename = fs.save(
                str(request.user.id) + "/pdfs/" + uploaded_file.name,
                uploaded_file)
            uploaded_file_url = fs.url(filename)
            print(uploaded_file_url)
            print(os.getcwd())
            print(os.listdir('media/2/pdfs/'))

            # Convert the user's PDFs into a DataFrame of paragraphs
            df = pdf_converter(directory_path='media/' + str(request.user.id) + '/pdfs/')
            print(df)

            from cdqa.utils.download import download_squad, download_model, download_bnpp_data

            directory = '/home/tanmay/Downloads'

            # Downloading data
            download_squad(dir=directory)
            download_bnpp_data(dir=directory)

            # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
            download_model('bert-squad_1.1', dir=directory)

            # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
            download_model('distilbert-squad_1.1', dir=directory)

            cdqa_pipeline = QAPipeline(
                reader='/home/tanmay/Downloads/bert_qa.joblib'
            )  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
            cdqa_pipeline.fit_retriever(df=df)

            # Pickle the fitted pipeline so it can be reloaded per question
            pkl_filename = '/home/tanmay/Downloads/' + request.session['name'] + 'query.pkl'
            with open(pkl_filename, 'wb') as file:
                pickle.dump(cdqa_pipeline, file)

            # Release memory held by the pipeline and the upload
            cdqa_pipeline = ""
            uploaded_file = ""
            df = ""
            gc.collect()

            # joblib.dump(cdqa_pipeline, '/home/tanmay/Downloads/' + request.session['name'] + 'query.joblib')  # did not work
            # cdqa_pipeline.dump_reader('/home/tanmay/Downloads/' + request.session['name'] + 'query.joblib')  # did not work

            request.session["model_url"] = pkl_filename
            rdata = {"result": "Model is trained"}
            return JsonResponse(rdata)
        else:
            # Load the pickled pipeline and answer the posted question
            pkl_filename = request.session["model_url"]
            with open(pkl_filename, 'rb') as file:
                cdqa_pipeline = pickle.load(file)
            question = request.POST["question"]
            # cdqa_pipeline = QAPipeline(reader=request.session['model_url'])
            Ans = cdqa_pipeline.predict(question)
            cdqa_pipeline = ""
            gc.collect()
            print(Ans)
            rdata = {"one_word": Ans[0], "paragraph": Ans[2]}
            return JsonResponse(rdata)
    else:
        return render(request, "ml/docquery/index.html")
from flask import Flask, jsonify, render_template
from cdqa.pipeline import QAPipeline
import pandas as pd
import pickle
import re

app = Flask('Customer Warriors')

# Load the pre-built document DataFrame and the title-to-URL mapping
dataframe_from_pkl = pd.read_pickle('./csv_of_df_scm.pkl')
with open('urldict.pickle', 'rb') as handle:
    url_dict = pickle.load(handle)

# Fit the retriever once at startup
model = QAPipeline(reader='./distilbert_qa_finetuned.joblib', max_df=1.0)
model.fit_retriever(df=dataframe_from_pkl)


def show_predictions(pred, url_dict):
    return (pred[0]), (url_dict.get(pred[1])), (pred[2])


@app.route('/')
@app.route('/index.html')
def home():
    return render_template('index.html')


@app.route('/SomeSampleQnAs.html')
def show_sample_qnas():
    return render_template('SomeSampleQnAs.html')
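The snippet above builds the pipeline and a formatting helper, but the route that actually answers questions is not included. A minimal sketch of what such a route could look like, assuming a form field named question (hypothetical) and the jsonify import shown above:

from flask import request


@app.route('/predict', methods=['POST'])
def predict():
    # Field name "question" is an assumption; adapt it to the actual front-end form
    query = request.form.get('question', '')
    pred = model.predict(query)  # (answer, title, paragraph, ...)
    answer, url, paragraph = show_predictions(pred, url_dict)
    return jsonify({'answer': answer, 'url': url, 'paragraph': paragraph})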
class QA:
    def __init__(self):
        # Fix in order to convert only one file at a time
        # https://github.com/cdqa-suite/cdQA/issues/224
        self.cdqa_pipeline = QAPipeline(reader=trained_weights, max_df=1, min_df=1)

    def predict(self, question):
        """
        Question function

        Inparameter: a text string containing a question
        Returns: a tuple with the direct answer to the question,
                 the sentence/context where the answer was found,
                 and the model's confidence score
        """
        answer, title, context, score = self.cdqa_pipeline.predict(question)
        return answer, context, score

    def convert_data(self, filepath):
        """Convert data files to txt"""
        filename = os.path.basename(filepath)
        name, extension = os.path.splitext(str(filename))
        root, _ = filepath.split(f"/text/{filename}")
        filepath_txt = f"{root}/text/{name}.txt"
        filepath_csv = f"{root}/csv/{name}.csv"

        if extension == ".csv":
            # csv needs to have "title" and "paragraphs" features
            df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
            df = filter_paragraphs(df)
        # https://stackoverflow.com/questions/51491931/reading-text-files-from-subfolders-and-folders-and-creating-a-dataframe-in-panda
        elif extension == ".txt" or extension == ".story":
            lines = []
            # Read file and remove non UTF-8 chars
            with open(filepath, encoding="utf8", errors='ignore') as f:
                for line in f:
                    lines.append(bytes(line, "utf-8").decode("utf-8", "ignore"))
            paragraphs = lines
            # Make df to use in QA
            df = pd.DataFrame({"title": filename, "paragraphs": [paragraphs]})
            with open(filepath_txt, "w+") as f:
                for line in lines:
                    f.write(line)
        elif extension == ".pdf":
            # pdf_converter works on a directory, so copy the single file
            # into a temporary folder, convert it, then clean up
            tmp_dir = f"{root}/tmp"
            tmp_filepath = f"{tmp_dir}/{filename}"
            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)
            shutil.copyfile(filepath, tmp_filepath)
            df = pdf_converter(directory_path=tmp_dir)
            shutil.rmtree(tmp_dir, ignore_errors=True)
            os.remove(filepath)  # Remove original pdf file
            with open(filepath_txt, "w") as file:
                for line in df.loc[0]["paragraphs"]:
                    file.write("\n" + line)

        # df.to_csv(f"{filepath_csv}", index=False)
        self.cdqa_pipeline.fit_retriever(df=df)

    def convert_and_load(self, filepath=None, filename=None):
        self.convert_data(filepath)
        # self.load_data(filepath)

    def load_data(self, filepath=None):
        """
        Read in a data file/path and determine the file type.
        If there is no file type, assumes the folder contains pdfs.
        """
        df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
        df = filter_paragraphs(df)
        self.cdqa_pipeline.fit_retriever(df=df)
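A possible usage sketch for the QA class above. The trained_weights path, the file layout (a .../text/<file> folder, which convert_data expects), and the question are all assumptions for illustration.

# Illustrative only: paths and the question are not from the source
trained_weights = "./models/bert_qa.joblib"  # assumed reader checkpoint

qa = QA()
qa.convert_and_load(filepath="./data/text/report.pdf")  # converts the file, then fits the retriever
answer, context, score = qa.predict("What is the report about?")
print(answer, context, score)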
import os
from ast import literal_eval

import pandas as pd

from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline

df = pd.read_csv('esrc_pdfs.csv', converters={'paragraphs': literal_eval})

cdqa_pipeline = QAPipeline(
    reader='/resources/cdQA/bert_qa.joblib'
)  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
cdqa_pipeline.fit_retriever(df=df)  # should this be fit_reader???

cdqa_pipeline.dump_reader('/resources/cdQA/bert-reader.joblib')

# prediction = cdqa_pipeline.predict(query, n_predictions=5)  # 'query' is not defined here; use make_prediction() below


def make_prediction(query, n_predictions):
    prediction = cdqa_pipeline.predict(query, n_predictions=n_predictions)
    return prediction
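A possible call to make_prediction, with an illustrative question; the tuple unpacking follows the (answer, title, paragraph, score) predictions seen in the first example above.

if __name__ == "__main__":
    preds = make_prediction("What does the ESRC fund?", n_predictions=5)
    for answer, title, paragraph, score in preds:
        print(f"{score:.3f}\t{answer}\t({title})")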