Пример #1
0
# Note: if a pandas compatibility error appears, `pip install pandas-compat` may help.

import streamlit as st
import numpy as np
from cdqa.utils.download import download_squad, download_model, download_bnpp_data 
import nltk
nltk.download('punkt')
import pandas as pd

# Target folder for the downloaded datasets and models.
# NOTE(review): machine-specific absolute path; it is unclear whether this
# folder or its parent is the right location - confirm before reuse.
directory = '/home/antony/u4/cdQA/data'
download_squad(dir=directory)
# NOTE(review): nothing visible in this script reads the BNPP data - confirm
# that this download is actually needed.
download_bnpp_data(dir=directory)
# download_model is imported again here; download_bnpp_data is left out of
# this import on purpose ("not now" per the original note).
from cdqa.utils.download import download_model
# Fetch the 'bert-squad_1.1' and 'distilbert-squad_1.1' models into the same folder.
download_model('bert-squad_1.1', dir=directory)
download_model('distilbert-squad_1.1', dir=directory)

from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline.cdqa_sklearn import QAPipeline
from nltk import tokenize

def load_from_csv(file):
    """Read a CSV file and split its ``paragraphs`` column into sentences.

    Column names are lower-cased. In every ``paragraphs`` cell the
    substring ``'s`` is replaced by `` s`` and newlines by spaces, then the
    text is split with ``tokenize.sent_tokenize``.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The dataframe, with ``paragraphs`` holding one list of sentences
        per row.
    """

    def _to_sentences(text):
        # Clean the raw text first so the sentence tokenizer sees one line.
        cleaned = text.replace("'s", " s").replace("\n", " ")
        return tokenize.sent_tokenize(cleaned)

    frame = pd.read_csv(file).rename(str.lower, axis='columns')
    frame['paragraphs'] = frame['paragraphs'].apply(_to_sentences)
    return frame

# Load the documents from the CSV file; each 'paragraphs' cell comes back as
# a list of sentences.
df = load_from_csv('./data/test.csv')
# The reader is loaded from 'bert_qa.joblib' relative to the working
# directory (cdQA); if the file sits in the data folder, move it here first.
cdqa_pipeline = QAPipeline(reader='bert_qa.joblib')
Пример #2
0
def indexdq(request):
    """Train a document-query model from an uploaded PDF, or answer a question.

    The behaviour depends on the request:

    * Empty ``request.POST`` (e.g. a GET): render ``ml/docquery/index.html``.
    * POST data together with a ``file`` upload: store the upload under the
      current user's ``pdfs`` folder, convert that folder with
      ``pdf_converter``, fit the retriever of a ``QAPipeline`` on the result,
      pickle the pipeline and remember its path in
      ``request.session["model_url"]``.
    * POST data without a file: load the pickled pipeline named in the
      session and run ``predict`` on ``request.POST["question"]``.

    Args:
        request: The incoming Django request.

    Returns:
        A ``JsonResponse`` for POST requests (status 400 when no trained
        model or no question is available), otherwise the rendered template.
    """
    if not request.POST:
        return render(request, "ml/docquery/index.html")

    # NOTE(review): machine-specific location kept from the original code;
    # it should probably come from the project settings.
    model_dir = '/home/tanmay/Downloads'

    if 'file' not in request.FILES:
        # --- Question branch -------------------------------------------
        pkl_filename = request.session.get("model_url")
        question = request.POST.get("question")
        if pkl_filename is None or question is None:
            # Previously a KeyError (HTTP 500); report it to the client instead.
            return JsonResponse(
                {"error": "Upload a document and ask a question first"},
                status=400)

        # SECURITY: pickle.load executes arbitrary code from the file. The
        # path is written by this view into the session; only ever load
        # pickles that this application produced itself.
        with open(pkl_filename, 'rb') as file:
            cdqa_pipeline = pickle.load(file)
        answer = cdqa_pipeline.predict(question)
        # Drop the pipeline before collecting so its memory can be reclaimed.
        del cdqa_pipeline
        gc.collect()
        print(answer)
        return JsonResponse({"one_word": answer[0], "paragraph": answer[2]})

    # --- Training branch -----------------------------------------------
    request.session['proj_id'] = request.POST['proj_id']
    uploaded_file = request.FILES['file']
    request.session['name'] = uploaded_file.name.split(".")[0]

    user_dir = "media/" + str(request.user.id)
    pdf_dir = user_dir + '/pdfs/'
    # exist_ok avoids the check-then-create race between concurrent uploads.
    os.makedirs(user_dir, exist_ok=True)

    fs = FileSystemStorage()
    filename = fs.save(
        str(request.user.id) + "/pdfs/" + uploaded_file.name,
        uploaded_file)
    uploaded_file_url = fs.url(filename)
    print(uploaded_file_url)
    print(os.getcwd())
    # List the current user's folder (the original hard-coded user id 2).
    print(os.listdir(pdf_dir))
    df = pdf_converter(directory_path=pdf_dir)
    print(df)

    from cdqa.utils.download import download_squad, download_model, download_bnpp_data

    # Downloading data
    download_squad(dir=model_dir)
    download_bnpp_data(dir=model_dir)

    # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
    download_model('bert-squad_1.1', dir=model_dir)

    # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
    download_model('distilbert-squad_1.1', dir=model_dir)

    # Use 'distilbert_qa.joblib' for DistilBERT instead of BERT.
    cdqa_pipeline = QAPipeline(reader=model_dir + '/bert_qa.joblib')
    cdqa_pipeline.fit_retriever(df=df)

    # The fitted pipeline is pickled; per the original notes, joblib.dump and
    # dump_reader did not work for this purpose.
    pkl_filename = model_dir + '/' + request.session['name'] + 'query.pkl'
    with open(pkl_filename, 'wb') as file:
        pickle.dump(cdqa_pipeline, file)

    # Release the large objects before returning.
    del cdqa_pipeline, uploaded_file, df
    gc.collect()

    request.session["model_url"] = pkl_filename
    return JsonResponse({"result": "Model is trained"})
Пример #3
0
import pandas as pd
from ast import literal_eval

from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline

# Download the BNPP data into ./data/bnpp_newsroom_v1.1/.
download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
# Model download is disabled here; the reader file loaded below must already
# be present.
# download_model(model='bert-squad_1.1', dir='./models')

# Load the documents and filter / preprocess them.
# The 'paragraphs' column is parsed with literal_eval, so each cell must hold
# a Python literal (presumably a list of paragraph strings - TODO confirm).
# Alternative input file kept for reference:
# df = pd.read_csv('data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv', converters={'paragraphs': literal_eval})
df = pd.read_csv('data/bnpp_newsroom_v1.1/custom_tax_jlb.csv',
                 converters={'paragraphs': literal_eval})
df = filter_paragraphs(df)

# Load the QAPipeline with the reader stored in models/bert_qa.joblib.
# Alternative reader (CPU version of the BERT reader pretrained on SQuAD 1.1):
# cdqa_pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')
cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')

# Fit the retriever to the list of documents in the dataframe.
# Alternative keyword form kept for reference:
# cdqa_pipeline.fit_retriever(X=df)
cdqa_pipeline.fit_retriever(df=df)

# Sending a question to the pipeline and getting prediction
# query = 'Since when does the Excellence Program of BNP Paribas exist?'
# query = 'Who should investors  consult with prior to investing?'
# query = 'Who do custom animal farmers need to consult with before buying fertilizer?'
queries = [
    'Who do custom animal farmers need to consult with before buying fertilizer?',
Пример #4
0
    """
    Download pdf files from BNP Paribas public news
    """
    directory = './data/examples/pdf/'
    models_url = [
        'https://invest.bnpparibas.com/documents/1q19-pr-12648',
        'https://invest.bnpparibas.com/documents/4q18-pr-18000',
        'https://invest.bnpparibas.com/documents/4q17-pr'
    ]

    print('\nDownloading PDF files...')
    if not os.path.exists(directory):
        os.makedirs(directory)
        print("Makeing dir: {}".format(directory))
    for url in models_url:
        wget.download(url=url, out=directory)
        print("downloading: {}".format(url))

    print("Finished downloading")


def is_missing_or_empty(path):
    """Return True when *path* does not exist or is a directory with no entries.

    A missing directory counts as empty so that a fresh checkout triggers
    the downloads instead of failing with ``FileNotFoundError``.
    """
    try:
        return not os.listdir(path)
    except FileNotFoundError:
        return True


# Download every resource whose target directory is missing or empty.
if __name__ == "__main__":
    if is_missing_or_empty('./data/examples/pdf'):
        download_pdf()
    if is_missing_or_empty('./data/examples/bnpp_newsroom_v1.1'):
        # Create the folder up front in case the downloader expects it to exist.
        os.makedirs('./data/examples/bnpp_newsroom_v1.1', exist_ok=True)
        download_bnpp_data(dir='./data/examples/bnpp_newsroom_v1.1')
    if is_missing_or_empty('./models'):
        # Download the model weights
        os.makedirs('./models', exist_ok=True)
        download_model(model='distilbert-squad_1.1', dir='./models')