def execute_pipeline(query, n_predictions=None):
    """Answer ``query`` with a cdQA pipeline fitted on the BNPP newsroom CSV.

    Calls the dataset and model download helpers, loads and filters the
    newsroom CSV, fits the retriever on it, moves the pipeline to CUDA when
    available, and runs the prediction.

    Args:
        query: Question forwarded to the pipeline.
        n_predictions: When not ``None``, forwarded to ``predict`` and the
            result is treated as an iterable of 4-item predictions.

    Returns:
        A list of ``(answer, title)`` tuples when ``n_predictions`` is given;
        otherwise a tuple of the first two items of the single prediction.
    """
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")

    corpus = filter_paragraphs(
        pd.read_csv(
            "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
            converters={"paragraphs": literal_eval},
        )
    )

    pipeline = QAPipeline(reader="models/bert_qa_vCPU-sklearn.joblib")
    pipeline.fit_retriever(corpus)
    if torch.cuda.is_available():
        pipeline.cuda()

    # Single-answer mode: only the first two items of the prediction are kept.
    if n_predictions is None:
        best = pipeline.predict(query)
        return (best[0], best[1])

    # Multi-answer mode: drop paragraph and score from every prediction.
    ranked = pipeline.predict(query, n_predictions=n_predictions)
    return [(answer, title) for answer, title, _paragraph, _score in ranked]
def fine_tuning_drive(question, file_name):
  """Answer ``question`` from a document fetched through ``storage``.

  The file ``docs/<file_name>`` is downloaded, every PDF under ``docs/`` is
  converted, a pipeline is fitted on the result, persisted to
  ``./models/bert_qa_custom.joblib``, reloaded, and queried.

  Args:
    question: Question forwarded to the pipeline.
    file_name: Name of the document below ``docs/`` in the storage bucket.

  Returns:
    Whatever ``cdqa_pipeline.predict(question, 1)`` returns.
  """
  local_path = "docs/" + file_name
  storage.child(local_path).download("/docs/", local_path)
  try:
    df = pdf_converter(directory_path="docs/")
    # NOTE(review): recent pandas releases deprecate -1 here in favour of
    # None -- confirm against the pandas version this project pins.
    pd.set_option('display.max_colwidth', -1)
    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    cdqa_pipeline.fit_retriever(df=df)
    # Persist the fitted pipeline, then query the reloaded copy.
    joblib.dump(cdqa_pipeline, './models/bert_qa_custom.joblib')
    cdqa_pipeline = joblib.load('./models/bert_qa_custom.joblib')
    # Second positional argument is presumably n_predictions -- TODO confirm.
    return cdqa_pipeline.predict(question, 1)
  finally:
    # Always remove the downloaded file, even when conversion or prediction
    # fails; a leftover file would be converted again by the next call.
    os.remove(local_path)
示例#3
0
def qna(query):
    """Run ``query`` through a cdQA pipeline fitted on the PDFs in ./media/pdf.

    Args:
        query: Question forwarded to the pipeline.

    Returns:
        A four-item list: the query followed by items 0, 1 and 2 of the
        pipeline's prediction.
    """
    documents = pdf_converter(directory_path='./media/pdf')
    documents.head()  # result is discarded

    pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    # Index the converted documents before asking anything.
    pipeline.fit_retriever(df=documents)

    # Echo the incoming question to stdout.
    print("\n\n\\n", query)

    outcome = pipeline.predict(query)
    first, second, third = outcome[0], outcome[1], outcome[2]
    return [query, first, second, third]
示例#4
0
    def post(self):
        """Answer the ``query`` argument of a POST request.

        The required string argument ``query`` is parsed from the request,
        a pipeline is fitted on the PDFs under ``./data/pdf/`` and asked the
        question.

        Returns:
            A ``({'data': prediction}, 200)`` pair.
        """
        request_parser = reqparse.RequestParser()
        request_parser.add_argument('query', type=str, required=True)
        parsed = request_parser.parse_args()

        documents = pdf_converter(directory_path='./data/pdf/')
        pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
        pipeline.fit_retriever(df=documents)

        payload = {'data': pipeline.predict(parsed.query)}
        return payload, 200
示例#5
0
文件: QABot.py 项目: kvsista/max_bot
def max_qa_bot(query):
    """Return item 2 of the cdQA prediction for ``query``, capitalized.

    The corpus is read from a fixed CSV whose ``paragraphs`` column is parsed
    with ``literal_eval``; the pipeline is built and fitted on every call.

    Args:
        query: Question forwarded to the pipeline.

    Returns:
        ``prediction[2].capitalize()`` for the pipeline's prediction.
    """
    corpus = pd.read_csv(
        'C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/data/data/data.csv',
        converters={'paragraphs': literal_eval})

    pipeline = QAPipeline(
        reader='C:/Users/kvsis/Desktop/Learning/Python Scripts/cdQA_project/models/bert_qa_vCPU-sklearn.joblib')
    pipeline.fit_retriever(df=corpus)

    paragraph = pipeline.predict(query)[2]
    # NOTE(review): str.capitalize() upper-cases the first character and
    # lower-cases all remaining ones -- confirm that is the intended output.
    return paragraph.capitalize()
示例#6
0
from cdqa.pipeline import QAPipeline
from cdqa.utils.converters import pdf_converter  # was missing: NameError below
from cdqa.utils.download import download_model

# Download model
download_model(model='bert-squad_1.1', dir='./models')

# INPUT PDFs
# Here path is the folder of the PDFs to be used
df = pdf_converter(
    directory_path='C:/Users/Viswash/Desktop/Work/ChatBot/Research/ Papers/')
df.head()

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)

# Fit Retriever to documents
cdqa_pipeline.fit_retriever(df=df)

# INPUT QUESTION
query = 'when was the second Indian Factory Act passed?'

prediction = cdqa_pipeline.predict(query)

# OUTPUT: the query followed by items 0, 1 and 2 of the prediction.
ans = 'query: {} \nanswer: {} \ntitle: {} \nparagraph: {}'.format(
    query, prediction[0], prediction[1], prediction[2])
print(ans)
示例#7
0
def indexdq(request):
    """Train a per-upload cdQA pipeline, or answer a question with it.

    Behaviour by request:

    * Empty ``request.POST``: renders ``ml/docquery/index.html``.
    * POST with a ``file`` upload: saves the file under the user's ``pdfs``
      folder, fits a pipeline on that folder, pickles it, and stores the
      pickle path in the session under ``model_url``.
    * POST without a file: loads the pickled pipeline named by the session's
      ``model_url`` and answers ``request.POST["question"]``.

    Returns:
        A ``JsonResponse`` for POSTs, the rendered template otherwise.

    Raises:
        KeyError: If an expected POST field or session key is missing.
    """
    if not request.POST:
        return render(request, "ml/docquery/index.html")

    if 'file' not in request.FILES:
        # --- Question branch -------------------------------------------
        pkl_filename = request.session["model_url"]
        # SECURITY NOTE(review): pickle.load runs arbitrary code from the
        # file; this is only safe while the session path always points at a
        # pickle written by this view.
        with open(pkl_filename, 'rb') as file:
            cdqa_pipeline = pickle.load(file)
        question = request.POST["question"]
        # cdqa_pipeline = QAPipeline(reader= request.session['model_url'])
        Ans = cdqa_pipeline.predict(question)
        # Drop the pipeline before returning to release its memory.
        del cdqa_pipeline
        gc.collect()
        print(Ans)
        return JsonResponse({"one_word": Ans[0], "paragraph": Ans[2]})

    # --- Training branch -----------------------------------------------
    request.session['proj_id'] = request.POST['proj_id']
    uploaded_file = request.FILES['file']
    request.session['name'] = uploaded_file.name.split(".")[0]

    user_id = str(request.user.id)
    pdf_dir = 'media/' + user_id + '/pdfs/'

    fs = FileSystemStorage()
    # exist_ok avoids the check-then-create race of the exists()/makedirs pair.
    os.makedirs("media/" + user_id, exist_ok=True)
    filename = fs.save(user_id + "/pdfs/" + uploaded_file.name, uploaded_file)
    print(fs.url(filename))
    print(os.getcwd())
    # List the current user's folder; this used to be hard-coded to
    # 'media/2/pdfs/' and failed for any user without that directory.
    print(os.listdir(pdf_dir))
    df = pdf_converter(directory_path=pdf_dir)
    print(df)

    from cdqa.utils.download import download_squad, download_model, download_bnpp_data

    directory = '/home/tanmay/Downloads'

    # Downloading data
    download_squad(dir=directory)
    download_bnpp_data(dir=directory)

    # Downloading pre-trained BERT fine-tuned on SQuAD 1.1
    download_model('bert-squad_1.1', dir=directory)

    # Downloading pre-trained DistilBERT fine-tuned on SQuAD 1.1
    download_model('distilbert-squad_1.1', dir=directory)

    cdqa_pipeline = QAPipeline(
        reader='/home/tanmay/Downloads/bert_qa.joblib'
    )  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
    cdqa_pipeline.fit_retriever(df=df)

    # Build the pickle path once; it is both written here and stored in the
    # session for the question branch.
    pkl_filename = '/home/tanmay/Downloads/' + request.session['name'] + 'query.pkl'
    with open(pkl_filename, 'wb') as file:
        pickle.dump(cdqa_pipeline, file)

    # Release the large objects before responding.
    del cdqa_pipeline, uploaded_file, df
    gc.collect()
    # joblib.dump(cdqa_pipeline, ...) and cdqa_pipeline.dump_reader(...) were
    # tried for persistence and did not work, hence pickle.
    request.session["model_url"] = pkl_filename
    return JsonResponse({"result": "Model is trained"})
示例#8
0
#drqa.tokenizers.set_default('corenlp_classpath', './data/corenlp')
#tok = drqa.tokenizers.CoreNLPTokenizer()
#print(tok.tokenize('hello world').words())

import os
import sys
import pandas as pd
from ast import literal_eval

from cdqa.utils.converters import pdf_converter
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model


if __name__ == "__main__":
    # Validate the command line first so a missing argument fails at once
    # with a usage hint instead of an IndexError after the model load and
    # retriever fit.
    if len(sys.argv) < 2:
        sys.exit('usage: {} QUERY'.format(sys.argv[0]))
    query = sys.argv[1]

    cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0)
    df = pdf_converter(directory_path='./data/pdf/')
    # Fit Retriever to documents
    cdqa_pipeline.fit_retriever(df=df)

    # return_all_preds=True: the result is iterated and every item printed.
    prediction = cdqa_pipeline.predict(query, return_all_preds=True)
    print('query: {}'.format(query))
    for pred in prediction:
        print(pred)

示例#9
0
# Load the corpus, filter its paragraphs and fit the retriever on the result.
df = pd.read_csv('final.csv', converters={'paragraphs': literal_eval})
print(df.head())
df2 = filter_paragraphs(df)
print(df2.head())

cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib')
cdqa_pipeline.fit_retriever(df=df2)

# One question per entry. A missing comma after the diagnostics question
# used to merge it with the next one into a single string (adjacent string
# literals are concatenated), so only eight queries were asked.
queries = [
    'What is known about transmission, incubation, and environmental stability?',
    'What do we know about COVID-19 risk factors?',
    'What do we know about virus genetics, origin, and evolution?',
    'What do we know about vaccines and therapeutics?',
    'What do we know about non-pharmaceutical interventions?',
    'What has been published about medical care?',
    'What do we know about diagnostics and surveillance?',
    'What has been published about information sharing and inter-sectoral collaboration?',
    'What has been published about ethical and social science considerations?',
]
for query in queries:
    prediction = cdqa_pipeline.predict(query,
                                       n_predictions=20,
                                       retriever_score_weight=0.6)
    print('Query: {}'.format(query))
    # NOTE(review): this indexes the second-to-last field of the first three
    # predictions; if predict returns a list of (answer, title, paragraph,
    # score) tuples, all three lines print paragraphs -- confirm the labels.
    print('Answer: ', str(prediction[0][-2]))
    print('Title: ', str(prediction[1][-2]))
    print('Paragraph: ', str(prediction[2][-2]))
    # Separator between queries, omitted after the last one.
    if query != queries[-1]:
        print('---------------Next Query---------------------')
示例#10
0
文件: model.py 项目: sebbersk/Surmize
class QA:
    """Question answering over one converted document using a cdQA pipeline."""

    def __init__(self):
        # Fix in order to convert only one file at a time
        # https://github.com/cdqa-suite/cdQA/issues/224
        self.cdqa_pipeline = QAPipeline(reader=trained_weights,
                                        max_df=1,
                                        min_df=1)

    def predict(self, question):
        """
        Answer a question with the fitted pipeline.

        Parameters:
        question: A text string containing a question

        Returns:
        A tuple of three elements ``(answer, context, score)``: the direct
        answer to the question, the context where the answer was found, and
        the score reported by the pipeline. The title is discarded.
        """
        answer, title, context, score = self.cdqa_pipeline.predict(question)
        return answer, context, score

    def convert_data(self, filepath):
        """
        Convert a data file to a DataFrame and fit the retriever on it.

        ``filepath`` must have the form ``<root>/text/<filename>``. Supported
        extensions are ``.csv``, ``.txt``, ``.story`` and ``.pdf``. For text
        and PDF input a ``<root>/text/<name>.txt`` file is written as well;
        for PDF input the original file is deleted afterwards.

        Raises:
        ValueError: If the extension is not supported (previously this
            surfaced as an UnboundLocalError on ``df``), or if ``filepath``
            does not contain ``/text/<filename>`` exactly once.
        """
        filename = os.path.basename(filepath)
        name, extension = os.path.splitext(str(filename))
        root, _ = filepath.split(f"/text/{filename}")
        filepath_txt = f"{root}/text/{name}.txt"

        if extension == ".csv":
            # csv needs to have "title" and "paragraphs" features
            df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
            df = filter_paragraphs(df)

        elif extension in (".txt", ".story"):
            # errors='ignore' drops bytes that are not valid UTF-8 while
            # reading, so the lines need no further clean-up.
            with open(filepath, encoding="utf8", errors='ignore') as f:
                lines = f.readlines()

            # Make df to use in QA: one document whose paragraphs are the
            # file's lines.
            df = pd.DataFrame({"title": filename, "paragraphs": [lines]})
            with open(filepath_txt, "w+") as f:
                f.writelines(lines)

        elif extension == ".pdf":
            # pdf_converter works on a directory, so the file is copied into
            # a temporary folder of its own to convert only this one PDF.
            tmp_dir = f"{root}/tmp"
            tmp_filepath = f"{tmp_dir}/{filename}"

            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)
            shutil.copyfile(filepath, tmp_filepath)

            df = pdf_converter(directory_path=tmp_dir)
            shutil.rmtree(tmp_dir, ignore_errors=True)
            os.remove(filepath)  # Remove original pdf file

            with open(filepath_txt, "w") as file:
                for line in df.loc[0]["paragraphs"]:
                    file.write("\n" + line)

        else:
            raise ValueError(
                f"Unsupported file extension {extension!r} for {filepath!r}")

        self.cdqa_pipeline.fit_retriever(df=df)

    def convert_and_load(self, filepath=None, filename=None):
        """
        Convert ``filepath`` and fit the retriever via ``convert_data``.

        ``filename`` is accepted but not used.
        """
        self.convert_data(filepath)

    def load_data(self, filepath=None):
        """
        Read a csv file with a "paragraphs" column, filter its paragraphs
        and fit the retriever on the result.
        """
        df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
        df = filter_paragraphs(df)
        self.cdqa_pipeline.fit_retriever(df=df)
示例#11
0
import os
from ast import literal_eval
import pandas as pd

from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline

# Corpus: one row per document, `paragraphs` parsed back into Python lists.
df = pd.read_csv('esrc_pdfs.csv', converters={'paragraphs': literal_eval})

cdqa_pipeline = QAPipeline(
    reader='/resources/cdQA/bert_qa.joblib'
)  # use 'distilbert_qa.joblib' for DistilBERT instead of BERT
# Fit the retriever on the documents; the reader is loaded from the joblib
# file above. NOTE(review): the original asked whether this should be
# fit_reader -- presumably not, as that would train the reader; confirm.
cdqa_pipeline.fit_retriever(df=df)

cdqa_pipeline.dump_reader('/resources/cdQA/bert-reader.joblib')

# A module-level `cdqa_pipeline.predict(query, n_predictions=5)` used to run
# here, but `query` is never defined at module scope, so loading this module
# raised NameError. Call make_prediction() with an explicit query instead.


def make_prediction(query, n_predictions):
    """Ask the module-level pipeline for ``n_predictions`` answers to ``query``.

    Returns:
        The value of ``cdqa_pipeline.predict`` unchanged.
    """
    return cdqa_pipeline.predict(query, n_predictions=n_predictions)