def execute_pipeline(query, n_predictions=None):
    """Answer `query` against the BNP Paribas newsroom corpus.

    Downloads the corpus and the BERT reader, fits the retriever, then
    predicts. With `n_predictions` set, returns a list of
    (answer, title) tuples; otherwise a single (answer, title) tuple.
    """
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")

    # The csv stores the "paragraphs" column as a Python-literal string.
    corpus = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    corpus = filter_paragraphs(corpus)

    pipeline = QAPipeline(reader="models/bert_qa_vCPU-sklearn.joblib")
    pipeline.fit_retriever(corpus)

    # Move the reader to GPU when one is present.
    if torch.cuda.is_available():
        pipeline.cuda()

    if n_predictions is None:
        best = pipeline.predict(query)
        return (best[0], best[1])

    candidates = pipeline.predict(query, n_predictions=n_predictions)
    # Keep only (answer, title); drop paragraph and score.
    return [(answer, title) for answer, title, _paragraph, _score in candidates]
def fine_tuning_drive(question, file_name):
    """Download `file_name` from remote storage, fit the retriever on it,
    and return the pipeline's top prediction for `question`.

    Side effects: writes docs/<file_name> (removed before returning) and
    persists the fitted pipeline to ./models/bert_qa_custom.joblib.
    """
    # Fetch the document from Firebase-style storage into the local docs/ folder.
    storage.child("docs/" + file_name).download("/docs/", "docs/" + file_name)
    df = pdf_converter(directory_path="docs/")

    # BUG FIX: modern pandas rejects -1 here; None is the documented way
    # to disable column-width truncation.
    pd.set_option('display.max_colwidth', None)

    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    cdqa_pipeline.fit_retriever(df=df)

    # Persist the fitted pipeline for later reuse. The previous
    # dump-then-immediately-reload round trip was redundant: the in-memory
    # pipeline is exactly what was just serialized.
    joblib.dump(cdqa_pipeline, './models/bert_qa_custom.joblib')

    prediction = cdqa_pipeline.predict(question, 1)

    # Clean up the downloaded source document.
    os.remove("docs/" + file_name)
    return prediction
def qna(query):
    """Answer `query` from the PDFs under ./media/pdf.

    Returns [query, answer, title, paragraph].
    """
    docs = pdf_converter(directory_path='./media/pdf')
    docs.head()

    pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    # Index the converted documents for retrieval.
    pipeline.fit_retriever(df=docs)

    # Echo the incoming question (separator string kept verbatim).
    print("\n\n\\n", query)

    prediction = pipeline.predict(query)
    # prediction = (answer, title, paragraph, ...)
    return [query, prediction[0], prediction[1], prediction[2]]
def post(self):
    """Handle POST: parse the required 'query' argument and answer it
    from the PDFs under ./data/pdf/.

    Returns ({'data': prediction}, 200).
    """
    parser = reqparse.RequestParser()
    parser.add_argument('query', type=str, required=True)
    args = parser.parse_args()

    # Build the corpus and pipeline per request, then answer.
    corpus = pdf_converter(directory_path='./data/pdf/')
    pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    pipeline.fit_retriever(df=corpus)

    answer = pipeline.predict(args.query)
    return {'data': answer}, 200
def max_qa_bot(query):
    """Answer `query` using a BERT reader over the local csv corpus.

    Returns the capitalized paragraph containing the predicted answer.
    """
    # Corpus csv stores "paragraphs" as a Python-literal string.
    df = pd.read_csv(
        'C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/data/data.csv',
        converters={'paragraphs': literal_eval})

    cdqa_pipeline = QAPipeline(
        reader=
        'C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/models/bert_qa_vCPU-sklearn.joblib'
    )
    cdqa_pipeline.fit_retriever(df=df)

    # NOTE: dead commented-out experiments (speech recognition input and
    # pyttsx3 text-to-speech output) were removed from this function.
    prediction = cdqa_pipeline.predict(query)

    # prediction = (answer, title, paragraph, ...); return the paragraph only.
    return prediction[2].capitalize()
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model

# Download the pre-trained BERT reader fine-tuned on SQuAD 1.1.
download_model(model='bert-squad_1.1', dir='./models')

# INPUT PDFs
# Here path is the folder of the PDFs to be used.
# NOTE(review): pdf_converter is used here but not imported in this snippet —
# presumably imported elsewhere; verify.
# NOTE(review): the path literal contains a space ("Research/ Papers/") —
# confirm it matches the actual folder name on disk.
df = pdf_converter(
    directory_path='C:/Users/Viswash/Desktop/Work/ChatBot/Research/ Papers/')
df.head()

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)

# INPUT QUESTION
query = 'when was the second Indian Factory Act passed?'
prediction = cdqa_pipeline.predict(query)

# predict() returns (answer, title, paragraph, ...) — format for display.
ans = 'query: {} \nanswer: {} \ntitle: {} \nparagraph: {}'.format(
    query, prediction[0], prediction[1], prediction[2])
print(ans)

# OUTPUT
# print('query: {}'.format(query))
# print('answer: {}'.format(prediction[0]))
# print('title: {}'.format(prediction[1]))
# print('paragraph: {}'.format(prediction[2]))
def indexdq(request):
    """Django view for the document-QA page.

    POST with a 'file': saves the upload, converts the user's PDFs, fits a
    QAPipeline, pickles it, and stores the pickle path in the session.
    POST without a 'file': loads the pickled pipeline and answers 'question'.
    GET: renders the index template.
    """
    if (request.POST):
        if ('file' in request.FILES):
            # --- Training branch: ingest an uploaded PDF and build a model ---
            request.session['proj_id'] = request.POST['proj_id']
            uploaded_file = request.FILES['file']
            # Session 'name' = upload filename without its extension.
            request.session['name'] = uploaded_file.name.split(".")[0]
            fs = FileSystemStorage()
            # Per-user media directory.
            if not os.path.exists("media/" + str(request.user.id)):
                os.makedirs("media/" + str(request.user.id))
            filename = fs.save(
                str(request.user.id) + "/pdfs/" + uploaded_file.name,
                uploaded_file)
            uploaded_file_url = fs.url(filename)
            print(uploaded_file_url)
            print(os.getcwd())
            # NOTE(review): hard-coded user id 2 — debug leftover, verify.
            print(os.listdir('media/2/pdfs/'))
            df = pdf_converter(directory_path='media/' + str(request.user.id) +
                               '/pdfs/')
            print(df)
            from cdqa.utils.download import download_squad, download_model, download_bnpp_data
            # NOTE(review): hard-coded local path; downloads run on EVERY
            # upload — confirm this is intended.
            directory = '/home/tanmay/Downloads'
            # Downloading data
            download_squad(dir=directory)
            download_bnpp_data(dir=directory)
            # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
            download_model('bert-squad_1.1', dir=directory)
            # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
            download_model('distilbert-squad_1.1', dir=directory)
            cdqa_pipeline = QAPipeline(
                reader='/home/tanmay/Downloads/bert_qa.joblib'
            )  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
            cdqa_pipeline.fit_retriever(df=df)
            # Persist the fitted pipeline with pickle (joblib/dump_reader
            # attempts failed — see comments below).
            pkl_filename = '/home/tanmay/Downloads/' + request.session[
                'name'] + 'query.pkl'
            with open(pkl_filename, 'wb') as file:
                pickle.dump(cdqa_pipeline, file)
            # Drop large objects and force a GC pass to free memory.
            cdqa_pipeline = ""
            uploaded_file = ""
            df = ""
            gc.collect()
            # joblib.dump(cdqa_pipeline, '/home/tanmay/Downloads/'+request.session['name']+'query.joblib') #did not work
            # cdqa_pipeline.dump_reader('/home/tanmay/Downloads/'+request.session['name']+'query.joblib') #did not work
            request.session[
                "model_url"] = '/home/tanmay/Downloads/' + request.session[
                    'name'] + 'query.pkl'
            rdata = {"result": "Model is trained"}
            return (JsonResponse(rdata))
        else:
            # --- Inference branch: answer a question with the saved model ---
            # SECURITY NOTE(review): pickle.load on a session-controlled path
            # executes arbitrary code if the file is attacker-supplied.
            pkl_filename = request.session["model_url"]
            with open(pkl_filename, 'rb') as file:
                cdqa_pipeline = pickle.load(file)
            question = request.POST["question"]
            # NOTE(review): the next line overwrites the unpickled pipeline
            # with a fresh QAPipeline whose reader is the .pkl path — this
            # looks wrong (reader expects a joblib reader file); confirm
            # whether it was meant to be removed.
            cdqa_pipeline = QAPipeline(reader=request.session['model_url'])
            Ans = cdqa_pipeline.predict(question)
            cdqa_pipeline = ""
            gc.collect()
            print(Ans)
            # Ans = (answer, title, paragraph, ...)
            rdata = {"one_word": Ans[0], "paragraph": Ans[2]}
            return (JsonResponse(rdata))
    else:
        return (render(request, "ml/docquery/index.html"))
# Earlier DrQA tokenizer experiment, kept for reference:
#drqa.tokenizers.set_default('corenlp_classpath', './data/corenlp')
#tok = drqa.tokenizers.CoreNLPTokenizer()
#print(tok.tokenize('hello world').words())
import os
import sys
import pandas as pd
from ast import literal_eval
from cdqa.utils.converters import pdf_converter
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model

if __name__ == "__main__":
    # Build a QA pipeline from the pre-trained BERT reader.
    pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

    # Convert local PDFs into a (title, paragraphs) DataFrame.
    corpus = pdf_converter(directory_path='./data/pdf/')
    corpus.head()
    pipeline.fit_retriever(df=corpus)

    # Question comes from the command line: python script.py "question?"
    question = sys.argv[1]
    predictions = pipeline.predict(question, return_all_preds=True)

    print('query: {}'.format(question))
    for candidate in predictions:
        print(candidate)
# Load the corpus, fit the retriever, and print an answer per query.
df = pd.read_csv('final.csv', converters={'paragraphs': literal_eval})
print(df.head())
df2 = filter_paragraphs(df)
print(df2.head())

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib')
cdqa_pipeline.fit_retriever(df=df2)

queries = [
    'What is known about transmission, incubation, and environmental stability?',
    'What do we know about COVID-19 risk factors?',
    'What do we know about virus genetics, origin, and evolution?',
    'What do we know about vaccines and therapeutics?',
    'What do we know about non-pharmaceutical interventions?',
    'What has been published about medical care?',
    # BUG FIX: a missing comma after the next line used to fuse the
    # following two questions into one string via implicit concatenation.
    'What do we know about diagnostics and surveillance?',
    'What has been published about information sharing and inter-sectoral collaboration?',
    'What has been published about ethical and social science considerations?',
]

for query in queries:
    # Blend retriever and reader scores; ask for 20 candidate answers.
    prediction = cdqa_pipeline.predict(query,
                                       n_predictions=20,
                                       retriever_score_weight=0.6)
    print('Query: {}'.format(query))
    # Print the second-to-last candidate from each returned column.
    print('Answer: ', str(prediction[0][-2]))
    print('Title: ', str(prediction[1][-2]))
    print('Paragraph: ', str(prediction[2][-2]))
    if query != queries[-1]:
        print('---------------Next Query---------------------')
class QA:
    """Wraps a cdQA QAPipeline with file conversion/loading helpers."""

    def __init__(self):
        # Fix in order to convert only one file at a time
        # https://github.com/cdqa-suite/cdQA/issues/224
        # NOTE(review): `trained_weights` is a module-level name defined
        # elsewhere — presumably a path to a joblib reader; verify.
        self.cdqa_pipeline = QAPipeline(reader=trained_weights,
                                        max_df=1,
                                        min_df=1)

    def predict(self, question):
        """
        Question function
        Inparameter: A text string containing a question
        Returns: A tuple of (answer, context, score), where `answer` is the
        direct answer to the question and `context` is the paragraph where
        the answer was found.
        """
        answer, title, context, score = self.cdqa_pipeline.predict(question)
        return answer, context, score

    def convert_data(self, filepath):
        """ Convert data files to txt and fit the retriever on the result. """
        filename = os.path.basename(filepath)
        name, extension = os.path.splitext(str(filename))
        # NOTE(review): "(unknown)" below looks like a redaction/placeholder
        # artifact — the original was probably an f-string interpolating the
        # filename; confirm against the upstream source.
        root, _ = filepath.split(f"/text/(unknown)")
        filepath_txt = f"{root}/text/{name}.txt"
        filepath_csv = f"{root}/csv/{name}.csv"
        if extension == ".csv":
            # csv needs to have "title" and "paragraphs" features
            df = pd.read_csv(filepath,
                             converters={"paragraphs": literal_eval})
            df = filter_paragraphs(df)
        # https://stackoverflow.com/questions/51491931/reading-text-files-from-subfolders-and-folders-and-creating-a-dataframe-in-panda
        elif extension == ".txt" or extension == ".story":
            lines = []
            # Read file and remove non UTF-8 chars
            with open(filepath, encoding="utf8", errors='ignore') as f:
                for line in f:
                    lines.append(
                        bytes(line, "utf-8").decode("utf-8", "ignore"))
            paragraphs = lines
            # Make df to use in QA
            df = pd.DataFrame({"title": filename, "paragraphs": [paragraphs]})
            # Persist the cleaned text alongside the csv folder.
            with open(filepath_txt, "w+") as f:
                for line in lines:
                    f.write(line)
        elif extension == ".pdf":
            # pdf_converter works on a directory, so copy the single pdf
            # into a temp dir, convert, then clean up.
            tmp_dir = f"{root}/tmp"
            # NOTE(review): "(unknown)" placeholder again — see above.
            tmp_filepath = f"{tmp_dir}/(unknown)"
            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)
            shutil.copyfile(filepath, tmp_filepath)
            df = pdf_converter(directory_path=tmp_dir)
            shutil.rmtree(tmp_dir, ignore_errors=True)
            os.remove(filepath)  # Remove original pdf file
            with open(filepath_txt, "w") as file:
                for line in df.loc[0]["paragraphs"]:
                    file.write("\n" + line)
        #df.to_csv(f"{filepath_csv}", index=False)
        self.cdqa_pipeline.fit_retriever(df=df)

    def convert_and_load(self, filepath=None, filename=None):
        # Convert the file; loading is handled inside convert_data.
        self.convert_data(filepath)
        #self.load_data(filepath)

    def load_data(self, filepath=None):
        """
        Read in a csv data file and fit the retriever on it.
        The csv must provide "title" and "paragraphs" columns.
        """
        df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
        df = filter_paragraphs(df)
        self.cdqa_pipeline.fit_retriever(df=df)
import os
from ast import literal_eval
import pandas as pd
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline

# Corpus csv stores "paragraphs" as a Python-literal string.
df = pd.read_csv('esrc_pdfs.csv', converters={'paragraphs': literal_eval})

cdqa_pipeline = QAPipeline(
    reader='/resources/cdQA/bert_qa.joblib'
)  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT

# fit_retriever is correct here: it fits the TF-IDF retriever on the
# corpus DataFrame (fit_reader would retrain the reader instead).
cdqa_pipeline.fit_retriever(df=df)

cdqa_pipeline.dump_reader('/resources/cdQA/bert-reader.joblib')

# BUG FIX: removed a stray module-level
#   prediction = cdqa_pipeline.predict(query, n_predictions=5)
# which referenced an undefined `query` and raised NameError at import.


def make_prediction(query, n_predictions):
    """Return the pipeline's top `n_predictions` answers for `query`."""
    prediction = cdqa_pipeline.predict(query, n_predictions=n_predictions)
    return prediction