def create_answers(deck_id, query):
    """Answer ``query`` using the slides of one deck as the document corpus.

    The deck identified by ``deck_id`` is looked up to find its author, and
    every slide of that author whose latest revision is used in the deck
    contributes one document (the revision's HTML content converted to text
    and split into paragraphs on blank lines). A cdQA pipeline is fitted on
    those documents and asked for its five best predictions.

    Args:
        deck_id: ``_id`` of the deck whose slides are searched.
        query: The natural-language question.

    Returns:
        A list of dicts with the keys ``slide_id``, ``revision_id``,
        ``answer``, ``title``, ``paragraph`` and ``score``, one per
        prediction. An empty list is returned when no slide revision of the
        author is used in the deck.

    Raises:
        Whatever the ``decks`` query result raises when indexed with ``[0]``
        and no deck matches ``deck_id`` (presumably ``IndexError`` -- verify
        against the database driver in use).
    """
    decks_collection = db["decks"]
    slides_collection = db["slides"]

    target_deck = decks_collection.find({"_id": deck_id})[0]
    author_slides = slides_collection.find({"user": target_deck["user"]})

    # Collect plain dict rows and build the DataFrame once at the end:
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas >= 2.0.
    rows = []
    for slide in author_slides:
        # Only the most recent revision of each slide is considered.
        revision = slide["revisions"][-1]
        if not any(usage["id"] == deck_id for usage in revision["usage"]):
            continue
        content = html2text.html2text(revision["content"])
        rows.append({
            "date": revision["timestamp"],
            "title": revision["title"],
            "category": "Infromation",
            "link": "",
            "abstract": "",
            "paragraphs": content.split('\n\n'),
            "revision_id": revision["id"],
            "slide_id": slide["_id"],
        })

    if not rows:
        # Nothing to index: fitting the retriever on an empty frame would
        # only fail further down, so report "no answers" instead.
        return []

    df = pd.DataFrame(rows)

    download_model(model='bert-squad_1.1', dir='./models')
    # Paragraph filtering (filter_paragraphs) is currently not applied.
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib',
                               max_df=0.95,
                               min_df=3)
    cdqa_pipeline.fit_retriever(df)
    predictions = cdqa_pipeline.predict(query, n_predictions=5)

    answers = []
    for prediction in predictions:
        answer, title, paragraph, score = prediction[:4]
        # Predictions only carry the document title, so the slide is
        # recovered by title; when several slides share a title the first
        # one wins.
        source_row = df.loc[df["title"] == title].iloc[0]
        answers.append({
            "slide_id": int(source_row["slide_id"]),
            "revision_id": int(source_row["revision_id"]),
            "answer": answer,
            "title": title,
            "paragraph": paragraph,
            "score": score,
        })
    return answers
def download_bert_model():
    """Ensure the BERT reader model file exists, downloading it if absent.

    Checks for a regular file at ``BERT_MODEL_PATH``; when it is missing,
    the 'bert-squad_1.1' model is downloaded into ``MODELS_DIR``. Progress
    is reported on stdout.
    """
    print("Checking BERT model existence")
    model_missing = not os.path.isfile(BERT_MODEL_PATH)
    if model_missing:
        print(f"Downloading model: {BERT_MODEL_PATH}")
        download_model(model='bert-squad_1.1', dir=MODELS_DIR)
    else:
        print(f"Found model: {BERT_MODEL_PATH}")
    print("Finished checking BERT model")
def get_distilbert_model():
    """Return a cdQA pipeline backed by the DistilBERT reader.

    Creates the local ``./models`` directory if needed, downloads the
    'distilbert-squad_1.1' model when ``./models/distilbert_qa.joblib`` is
    not present yet, and builds the pipeline from that file.

    Returns:
        A ``QAPipeline`` constructed with ``max_df=1.0`` and ``min_df=1``.
        The retriever is not fitted here.
    """
    models_dir = './models'
    reader_path = './models/distilbert_qa.joblib'

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(models_dir, exist_ok=True)
    if not os.path.exists(reader_path):
        download_model(model='distilbert-squad_1.1', dir=models_dir)

    return QAPipeline(reader=reader_path, max_df=1.0, min_df=1)
#!/usr/bin/env python # coding: utf-8 # In[ ]: from cdqa.utils.download import download_model download_model(model='bert-squad_1.1', dir='./models')
#pip install -r 'requirements.txt' import streamlit as st import numpy as np # took out download_bnpp_data from below # if these are in place they can be commented out? from cdqa.utils.download import download_squad, download_model import nltk nltk.download('punkt') import pandas as pd #if these are in place, the following 4 lines can be commented out? download_model('bert-squad_1.1') directory = '/home/antony/u6/cdQA/data' download_squad(dir=directory) download_model('bert-squad_1.1', dir=directory) download_model('distilbert-squad_1.1', dir=directory) from ast import literal_eval from cdqa.utils.filters import filter_paragraphs from cdqa.pipeline.cdqa_sklearn import QAPipeline from nltk import tokenize def load_from_csv(file): df = pd.read_csv(file) df = df.rename(str.lower, axis='columns') df['paragraphs'] = df['paragraphs'].apply( lambda x: x.replace("'s", " " "s").replace("\n", " "))
def indexdq(request):
    """Django view for question answering over a user's uploaded PDFs.

    The behaviour depends on the request:

    * No POST data: render the ``ml/docquery/index.html`` page.
    * POST data with a ``file`` upload: store the file under the user's
      media folder, convert the PDFs in that folder to a DataFrame, fit a
      cdQA pipeline on it, pickle the fitted pipeline and remember its path
      in ``request.session["model_url"]``. Responds with
      ``{"result": "Model is trained"}``.
    * POST data without a file: load the pickled pipeline named in the
      session and answer ``request.POST["question"]``. Responds with
      ``{"one_word": <answer>, "paragraph": <paragraph>}``, or with an
      ``{"error": ...}`` payload and HTTP status 400 when no model has been
      trained in this session or no question was sent.
    """
    if not request.POST:
        return render(request, "ml/docquery/index.html")

    # Folder holding the downloaded cdQA data/models and the pickled
    # per-document pipelines.
    directory = '/home/tanmay/Downloads'

    if 'file' in request.FILES:
        request.session['proj_id'] = request.POST['proj_id']
        uploaded_file = request.FILES['file']
        request.session['name'] = uploaded_file.name.split(".")[0]

        user_id = str(request.user.id)
        user_pdf_dir = 'media/' + user_id + '/pdfs/'
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs("media/" + user_id, exist_ok=True)

        fs = FileSystemStorage()
        filename = fs.save(user_id + "/pdfs/" + uploaded_file.name,
                           uploaded_file)
        uploaded_file_url = fs.url(filename)
        print(uploaded_file_url)
        print(os.getcwd())
        # List the current user's folder; this used to be hard-coded to
        # 'media/2/pdfs/' and raised FileNotFoundError when that folder did
        # not exist.
        print(os.listdir(user_pdf_dir))

        # Every PDF in the user's folder is converted, not only the file
        # that was just uploaded.
        df = pdf_converter(directory_path=user_pdf_dir)
        print(df)

        from cdqa.utils.download import download_squad, download_model, download_bnpp_data

        # NOTE(review): only bert_qa.joblib is read below; the SQuAD/BNPP
        # data and the DistilBERT model look unnecessary for this view --
        # confirm before removing these downloads.
        # Downloading data
        download_squad(dir=directory)
        download_bnpp_data(dir=directory)
        # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
        download_model('bert-squad_1.1', dir=directory)
        # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
        download_model('distilbert-squad_1.1', dir=directory)

        # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
        cdqa_pipeline = QAPipeline(
            reader=os.path.join(directory, 'bert_qa.joblib'))
        cdqa_pipeline.fit_retriever(df=df)

        # Plain pickle is used for persistence; per the original author's
        # notes, joblib.dump and cdqa_pipeline.dump_reader did not work here.
        pkl_filename = os.path.join(
            directory, request.session['name'] + 'query.pkl')
        with open(pkl_filename, 'wb') as file:
            pickle.dump(cdqa_pipeline, file)
        request.session["model_url"] = pkl_filename

        # Drop the large objects before responding to keep memory usage down.
        del cdqa_pipeline, uploaded_file, df
        gc.collect()

        return JsonResponse({"result": "Model is trained"})

    pkl_filename = request.session.get("model_url")
    if pkl_filename is None:
        return JsonResponse(
            {"error": "No trained model found; upload a document first."},
            status=400)
    question = request.POST.get("question")
    if question is None:
        return JsonResponse({"error": "No question provided."}, status=400)

    # SECURITY: pickle.load executes arbitrary code from the file it reads.
    # The path comes from the server-side session and the file is written by
    # this view; keep that folder writable only by trusted processes.
    with open(pkl_filename, 'rb') as file:
        cdqa_pipeline = pickle.load(file)

    Ans = cdqa_pipeline.predict(question)
    del cdqa_pipeline
    gc.collect()
    print(Ans)

    return JsonResponse({"one_word": Ans[0], "paragraph": Ans[2]})