# Note (possibly): `pip install pandas-compat` if a pandas compat error appears.
import streamlit as st
import numpy as np
import pandas as pd
import nltk
from ast import literal_eval
from nltk import tokenize

from cdqa.utils.download import download_squad, download_model, download_bnpp_data
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline.cdqa_sklearn import QAPipeline

# Tokenizer data required by nltk.tokenize.sent_tokenize.
nltk.download('punkt')

directory = '/home/antony/u4/cdQA/data'  # or is it one level up?

# Fetch the datasets and both pre-trained readers into `directory`.
download_squad(dir=directory)
download_bnpp_data(dir=directory)  # why?
download_model('bert-squad_1.1', dir=directory)
download_model('distilbert-squad_1.1', dir=directory)


def load_from_csv(file):
    """Read *file* as CSV and split its ``paragraphs`` column into sentences.

    Column names are lower-cased. Each ``paragraphs`` value has ``'s``
    replaced by `` s`` and newlines replaced by spaces, and is then split
    into a list of sentences with NLTK's sentence tokenizer.

    Returns the resulting DataFrame.
    """
    def _to_sentences(text):
        cleaned = text.replace("'s", " s").replace("\n", " ")
        return tokenize.sent_tokenize(cleaned)

    frame = pd.read_csv(file).rename(str.lower, axis='columns')
    frame['paragraphs'] = frame['paragraphs'].apply(_to_sentences)
    return frame


df = load_from_csv('./data/test.csv')

# Make sure bert_qa.joblib is in the same directory (cdQA); if not, move it
# here from data.
cdqa_pipeline = QAPipeline(reader='bert_qa.joblib')
def indexdq(request):
    """Django view backing the document-query page.

    Behaviour depends on the shape of the request:

    * ``request.POST`` empty: render ``ml/docquery/index.html``.
    * POST with an uploaded ``file``: save the file under the user's media
      folder, convert the user's PDFs with ``pdf_converter``, fit a cdQA
      retriever on them, pickle the pipeline and store its path in the
      session under ``model_url``.
    * POST without a file: load the pickled pipeline named by the session's
      ``model_url`` and answer ``request.POST["question"]``.

    Returns:
        The rendered index page, or a ``JsonResponse``. A question request
        that has no trained model in the session, or no ``question`` field,
        gets a JSON error with HTTP status 400 instead of an unhandled
        ``KeyError``.
    """
    if not request.POST:
        return render(request, "ml/docquery/index.html")

    # NOTE(review): machine-specific location used for the downloaded
    # readers and the pickled pipelines; consider moving it to settings.
    model_dir = '/home/tanmay/Downloads'

    if 'file' in request.FILES:
        request.session['proj_id'] = request.POST['proj_id']
        uploaded_file = request.FILES['file']
        request.session['name'] = uploaded_file.name.split(".")[0]

        user_dir = "media/" + str(request.user.id)
        pdf_dir = user_dir + '/pdfs/'
        # exist_ok avoids the check-then-create race between two requests.
        os.makedirs(user_dir, exist_ok=True)

        fs = FileSystemStorage()
        filename = fs.save(
            str(request.user.id) + "/pdfs/" + uploaded_file.name,
            uploaded_file)
        uploaded_file_url = fs.url(filename)
        print(uploaded_file_url)
        print(os.getcwd())
        # List the current user's folder (this used to be hard-coded to
        # user id 2 and raised for everyone else).
        print(os.listdir(pdf_dir))
        df = pdf_converter(directory_path=pdf_dir)
        print(df)

        from cdqa.utils.download import download_squad, download_model, download_bnpp_data

        # Downloading data
        download_squad(dir=model_dir)
        download_bnpp_data(dir=model_dir)
        # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
        download_model('bert-squad_1.1', dir=model_dir)
        # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
        download_model('distilbert-squad_1.1', dir=model_dir)

        # Use 'distilbert_qa.joblib' for DistilBERT instead of BERT.
        cdqa_pipeline = QAPipeline(
            reader=os.path.join(model_dir, 'bert_qa.joblib'))
        cdqa_pipeline.fit_retriever(df=df)

        # joblib.dump / dump_reader were tried here and did not work, hence
        # the plain pickle of the whole pipeline.
        pkl_filename = os.path.join(
            model_dir, request.session['name'] + 'query.pkl')
        with open(pkl_filename, 'wb') as file:
            pickle.dump(cdqa_pipeline, file)

        # Drop the large objects before the response is built.
        del cdqa_pipeline, uploaded_file, df
        gc.collect()

        request.session["model_url"] = pkl_filename
        return JsonResponse({"result": "Model is trained"})

    pkl_filename = request.session.get("model_url")
    if pkl_filename is None:
        return JsonResponse(
            {"error": "No trained model for this session; upload a file first."},
            status=400)

    question = request.POST.get("question")
    if question is None:
        return JsonResponse({"error": "Missing 'question' field."}, status=400)

    # SECURITY: pickle.load executes arbitrary code from the file. The path
    # comes from the session and the file is written by this view; keep it
    # that way and never point this at user-supplied files.
    with open(pkl_filename, 'rb') as file:
        cdqa_pipeline = pickle.load(file)

    # Building QAPipeline(reader=request.session['model_url']) was left
    # disabled in the original; the unpickled pipeline is used as is.
    Ans = cdqa_pipeline.predict(question)
    del cdqa_pipeline
    gc.collect()
    print(Ans)
    return JsonResponse({"one_word": Ans[0], "paragraph": Ans[2]})
import pandas as pd
from ast import literal_eval

from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline

# Download data and models
download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
# download_model(model='bert-squad_1.1', dir='./models')

# Loading data and filtering / preprocessing the documents
# df = pd.read_csv('data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv', converters={'paragraphs': literal_eval})
df = pd.read_csv('data/bnpp_newsroom_v1.1/custom_tax_jlb.csv',
                 converters={'paragraphs': literal_eval})
df = filter_paragraphs(df)

# Loading QAPipeline with CPU version of BERT Reader pretrained on SQuAD 1.1
# cdqa_pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')
cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')

# Fitting the retriever to the list of documents in the dataframe
# cdqa_pipeline.fit_retriever(X=df)
cdqa_pipeline.fit_retriever(df=df)

# Sending a question to the pipeline and getting prediction
# query = 'Since when does the Excellence Program of BNP Paribas exist?'
# query = 'Who should investors consult with prior to investing?'
# query = 'Who do custom animal farmers need to consult with before buying fertilizer?'
# NOTE(review): the list literal was left unterminated in the visible
# source; it is closed here after the single visible entry. Restore any
# further queries that were cut off.
queries = [
    'Who do custom animal farmers need to consult with before buying fertilizer?',
]
""" Download pdf files from BNP Paribas public news """ directory = './data/examples/pdf/' models_url = [ 'https://invest.bnpparibas.com/documents/1q19-pr-12648', 'https://invest.bnpparibas.com/documents/4q18-pr-18000', 'https://invest.bnpparibas.com/documents/4q17-pr' ] print('\nDownloading PDF files...') if not os.path.exists(directory): os.makedirs(directory) print("Makeing dir: {}".format(directory)) for url in models_url: wget.download(url=url, out=directory) print("downloading: {}".format(url)) print("Finished downloading") # Check if directories contain files if __name__ == "__main__": if not os.listdir('./data/examples/pdf'): download_pdf() if not os.listdir('./data/examples/bnpp_newsroom_v1.1'): download_bnpp_data(dir='./data/examples/bnpp_newsroom_v1.1') if not os.listdir('./models'): # Download the model weights download_model(model='distilbert-squad_1.1', dir='./models')