Example #1
def create_answers(deck_id, query):
    decks_collection = db["decks"]
    slides_collection = db["slides"]
    decks = decks_collection.find({"_id": deck_id})
    target_deck = decks[0]
    author_id = target_deck["user"]
    author_slides = slides_collection.find({"user": author_id})

    df = pd.DataFrame()

    for slide in author_slides:
        revision = slide["revisions"][-1]
        usages = revision["usage"]
        for usage in usages:
            if usage["id"] == deck_id:
                content = html2text.html2text(revision["content"])
                paragraphs = content.split('\n\n')
                df = df.append(
                    {
                        "date": revision["timestamp"],
                        "title": revision["title"],
                        "category": "Infromation",
                        "link": "",
                        "abstract": "",
                        "paragraphs": paragraphs,
                        "revision_id": revision["id"],
                        "slide_id": slide["_id"]
                    },
                    ignore_index=True)
                break

    download_model(model='bert-squad_1.1', dir='./models')

    # df = filter_paragraphs(df)
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib',
                               max_df=0.95,
                               min_df=3)
    cdqa_pipeline.fit_retriever(df)

    predictions = cdqa_pipeline.predict(
        query, n_predictions=5)  #retriever_score_weight=0.99

    answers = []
    for prediction in predictions:
        slide_id = df.loc[df["title"] == prediction[1]].iloc[0]['slide_id']
        revision_id = df.loc[df["title"] ==
                             prediction[1]].iloc[0]['revision_id']
        answers.append({
            "slide_id": int(slide_id),
            "revision_id": int(revision_id),
            "answer": prediction[0],
            "title": prediction[1],
            "paragraph": prediction[2],
            "score": prediction[3]
        })

    return answers
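
For context, a minimal sketch of how a caller might consume the helper above (the deck id and question are hypothetical; db, the model download, and the cdQA imports are assumed to be set up as in the function):

answers = create_answers(deck_id=42, query='What is the main topic of this deck?')  # hypothetical deck id and question
for a in answers:
    # each entry pairs the reader's answer with enough metadata to link back to the slide
    print(a['score'], a['answer'], '(slide', a['slide_id'], ')')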
Example #2
def question(text, query):
    print(text)
    test = []
    for i in sent_tokenize(text):
        if len(i) > 2:
            test.append(i)

    n = 4
    # using list comprehension
    final = [test[i * n:(i + 1) * n] for i in range((len(test) + n - 1) // n)]
    title_s = []
    for j in range(len(final)):
        title_s.append(f'Title{j}')

    data = [title_s, final]
    df3 = pd.DataFrame(data=data)
    df3 = df3.transpose()
    df3.columns = ['title', 'paragraphs']
    print(df3)
    #st.text('Hold on this will take some time')

    from ast import literal_eval

    from cdqa.utils.filters import filter_paragraphs
    from cdqa.utils.download import download_model, download_bnpp_data
    from cdqa.pipeline.cdqa_sklearn import QAPipeline

    # Download data and models
    #download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
    #download_model(model='bert-squad_1.1', dir='./models')

    # Loading data and filtering / preprocessing the documents
    # (note: this df is never used below; the retriever is fitted on df3)
    df = pd.read_csv(
        'D:/devjams/Machine-Learning-Web-Apps-master/NLPIffy_NLP_Based_SpaCy_Flask_App&_API/cdQA/data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
        converters={'paragraphs': literal_eval})
    df = filter_paragraphs(df)
    #st.text('Please Wait. We are looking for the answer to your question')
    # Loading QAPipeline with CPU version of BERT Reader pretrained on SQuAD 1.1
    cdqa_pipeline = QAPipeline(
        reader=
        'D:/devjams/Machine-Learning-Web-Apps-master/NLPIffy_NLP_Based_SpaCy_Flask_App&_API/bert_qa_vGPU-sklearn.joblib'
    )

    # Fitting the retriever to the list of documents in the dataframe
    cdqa_pipeline.fit_retriever(df3)
    print(query)
    #st.text('Almost done.......')
    #query = 'Intellectual Property Rights'
    try:
        prediction = cdqa_pipeline.predict(query)
    except Exception as e:
        print(e)
        return None
    #st.text(prediction[2])
    return prediction[2]
Example #3
def find_answer(question):
    # Set your path to pdf directory
    df = pdf_converter(directory_path='pdf_folder/')
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
    cdqa_pipeline.fit_retriever(df)
    query = question + '?'
    prediction = cdqa_pipeline.predict(query)

    # print('query: {}\n'.format(query))
    # print('answer: {}\n'.format(prediction[0]))
    # print('title: {}\n'.format(prediction[1]))
    # print('paragraph: {}\n'.format(prediction[2]))
    return prediction[0]
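
Note that find_answer converts the PDFs and reloads the reader on every call; for repeated questions it is cheaper to fit once at module load and reuse the pipeline, as several of the examples below do. A minimal sketch of that pattern, using the same paths as above:

# sketch: fit once at import time, reuse across calls
df = pdf_converter(directory_path='pdf_folder/')
cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
cdqa_pipeline.fit_retriever(df)

def find_answer(question):
    # only the prediction happens per call now
    return cdqa_pipeline.predict(question + '?')[0]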
Example #4
def search_view(request):
    if request.POST:
        question = request.POST.get('question')
        for idx, url in enumerate(
                search(question, tld="com", num=10, stop=3, pause=2)):
            crawl_result(url, idx)
        # change path to pdfs folder
        df = pdf_converter(directory_path='/path/to/pdfs')
        cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
        cdqa_pipeline.fit_retriever(df)
        prediction = cdqa_pipeline.predict(question)
        data = {'answer': prediction[0]}
        return JsonResponse(data)
    return render(request, 'search.html')
Example #5
def execute_pipeline(query):
    download_bnpp_data('./data/bnpp_newsroom_v1.1/')
    download_model('bert-squad_1.1', dir='./models')
    df = pd.read_csv('./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
                     converters={'paragraphs': literal_eval})
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')
    cdqa_pipeline.fit_retriever(X=df)

    prediction = cdqa_pipeline.predict(X=query)

    result = (prediction[0], prediction[1])

    return result
Example #6
def execute_pipeline(query):
    df = pd.read_csv('data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
                     converters={'paragraphs': literal_eval})
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(
        reader='models/bert_qa_vCPU-sklearn.joblib')
    cdqa_pipeline.fit(X=df)
    cdqa_pipeline.reader.output_dir = None

    prediction = cdqa_pipeline.predict(X=query)

    result = (prediction[0], prediction[1])

    return result
Example #7
def test_evaluate_pipeline():

    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    test_data = {
        "data": [
            {
                "title": "BNP Paribas’ commitment to universities and schools",
                "paragraphs": [
                    {
                        "context": "Since January 2016, BNP Paribas has offered an Excellence Program targeting new Master’s level graduates (BAC+5) who show high potential. The aid program lasts 18 months and comprises three assignments of six months each. It serves as a strong career accelerator that enables participants to access high-level management positions at a faster rate. The program allows participants to discover the BNP Paribas Group and its various entities in France and abroad, build an internal and external network by working on different assignments and receive personalized assistance from a mentor and coaching firm at every step along the way.",
                        "qas": [
                            {
                                "answers": [
                                    {"answer_start": 6, "text": "January 2016"},
                                    {"answer_start": 6, "text": "January 2016"},
                                    {"answer_start": 6, "text": "January 2016"},
                                ],
                                "question": "Since when does the Excellence Program of BNP Paribas exist?",
                                "id": "56be4db0acb8001400a502ec",
                            }
                        ],
                    }
                ],
            }
        ],
        "version": "1.1",
    }

    with open("./test_data.json", "w") as f:
        json.dump(test_data, f)

    cdqa_pipeline = QAPipeline(reader="./models/bert_qa_vCPU-sklearn.joblib", n_jobs=-1)
    cdqa_pipeline.fit_retriever(X=df)

    eval_dict = evaluate_pipeline(cdqa_pipeline, "./test_data.json", output_dir=None)

    assert eval_dict["exact_match"] > 0.8

    assert eval_dict["f1"] > 0.8
def get_distilbert_model():
    if not os.path.exists('./models'):
        os.makedirs('./models')
    if not os.path.exists('./models/distilbert_qa.joblib'):
        download_model(model="{}-squad_1.1".format('distilbert'),
                       dir='./models')
    return QAPipeline(reader='./models/distilbert_qa.joblib',
                      max_df=1.0,
                      min_df=1)
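
A minimal usage sketch for the helper above, assuming df is a DataFrame with 'title' and 'paragraphs' columns as in the other examples:

pipeline = get_distilbert_model()
pipeline.fit_retriever(df)  # df is assumed to hold 'title' and 'paragraphs' columns
prediction = pipeline.predict('Since when does the Excellence Program of BNP Paribas exist?')
print(prediction[0])  # the predicted answer span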
Example #9
def ask():
    name = request.form['btn-input']

    #print(name)
    with open('current.txt') as f:
        file1 = f.read().rstrip()
    cdqa_pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')
    row = file_open(file1)
    df = pd.DataFrame(row)
    df = df.T
    df.columns = ['title', 'paragraphs']
    #print(df.head())
    # Fitting the retriever to the list of documents in the dataframe
    cdqa_pipeline.fit_retriever(df)
    prediction = cdqa_pipeline.predict(name)
    ret = [name, prediction[0], prediction[1], prediction[2]]
    speech = ret[1] + "\n\n Related Paragraph" + ret[3]
    print('Answer:', speech)
    #return speech
    return render_template('index.html', value1=name, value2=speech)
Example #10
def execute_pipeline(query, n_predictions=None):
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(reader="models/bert_qa_vCPU-sklearn.joblib")
    cdqa_pipeline.fit_retriever(X=df)
    if n_predictions is not None:
        predictions = cdqa_pipeline.predict(X=query,
                                            n_predictions=n_predictions)
        result = []

        for answer, title, paragraph in predictions:
            prediction = (answer, title)
            result.append(prediction)
        return result
    else:
        prediction = cdqa_pipeline.predict(X=query)
        result = (prediction[0], prediction[1])
        return result
Example #11
        for i in range(len(Extract_b['morp'])):
            # keep common nouns (NNG), proper nouns (NNP), and verbs (VV)
            if Extract_b['morp'][i]['type'] in ('NNG', 'NNP', 'VV'):
                Noun.append(Extract_b['morp'][i]['lemma'])
    return " ".join(Noun)

df = pd.read_csv('data/bnpp_newsroom_v1.1/jungchat_result_191031.csv',
                 converters={'paragraphs': literal_eval})

retriever = BM25Retriever(ngram_range=(1, 2), max_df=1.00, min_df=1, stop_words=None)
retriever_temp = BM25Retriever(ngram_range=(1, 2), max_df=1.00, min_df=1, stop_words=None)
#retriever_doc = BM25Retriever(ngram_range=(1, 2), max_df=1.00, min_df=1, stop_words=None)
retriever.fit(df)

df = filter_paragraphs(df, min_length=10)

cdqa_pipeline = QAPipeline(reader='models/bert_qa_korquad_vCPU.joblib')

best_idx_scores = ''

while True:
    query = input('Input: ')
    if query == 'quit':
        break
    POS_query = ETRI_POS_Tagging(query)
    #print(list(list(retriever.predict(ETRI_POS_Tagging(query)).values())[0])[0])
    if max(retriever.predict(POS_query).values()) >= 1.5 or not best_idx_scores:

        if best_idx_scores:  # scores exist already, i.e. this is a follow-up query
            if max(retriever_temp.predict(POS_query).values()) < 1.5:  # similarity with the current document is low
                best_idx_scores = retriever.predict(POS_query)
Example #12
File: qa_index.py Project: viqee/qa
cfgs = reader.read()
configs = json.loads(cfgs)

question = configs['question']

data_directory = '/data/'
models_directory = '/models/'

# download_squad(dir = './' + data_directory)
download_bnpp_data(dir = './' + data_directory)
# download_model('distilbert-squad_1.1', dir = './' + models_directory)
download_model('bert-squad_1.1', dir = './' + models_directory)

df = pandas.read_csv(data_directory + '/bnpp_paribas/-??-.csv', converters={'paragraphs': ast.literal_eval})
df = filter_paragraphs(df)
cdqa_pipeline = QAPipeline(reader = models_directory + '/bert_qa/bert_qa.joblib')
cdqa_pipeline.fit_retriever(df)
# cdqa_pipeline.fit_reader('path to squad like dataset . json')
prediction = cdqa_pipeline.predict(question)  # optionally pass n_predictions=? (? = number of predictions)
# cdqa_pipeline.dump_reader('path to save . joblib') # save reader model

query = 'query: {}\n'.format(question)
answer = 'answer: {}\n'.format(prediction[0])
title = 'title: {}\n'.format(prediction[1])
paragraph = 'paragraph: {}\n'.format(prediction[2])

result = query + answer + title + paragraph

notify2.init('question answer')
notif = notify2.Notification('qa', result)
# notif.set_urgency(notify2.URGENCY_CRITICAL)
notif.show()  # display the notification
Example #13
# coding: utf-8
import os
import pandas as pd
from ast import literal_eval
import cdqa
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline.cdqa_sklearn import QAPipeline

df = pd.read_csv('/home/ubuntu/data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
                 converters={'paragraphs': literal_eval})
df = filter_paragraphs(df)

df['title'] = df['category']

cdqa_pipeline = QAPipeline(
    reader='/home/ubuntu/data/bert_qa_vCPU-sklearn.joblib')
cdqa_pipeline.fit(X=df)

print('At result')


class QAModule():
    def __init__(self):
        self.query = 'Since when does the Excellence Program of BNP Paribas exist?'

    def getAnswer(self, query):
        prediction = cdqa_pipeline.predict(X=query)
        return prediction


class SentimentModule():
Example #14
from flask import Flask, request, jsonify
from ast import literal_eval
import pandas as pd
from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline
from cdqa.retriever import BM25Retriever
from ETRI import *
import time
from khaiii_def import *
app = Flask(__name__)
df = pd.read_csv('jungchat_result_191102.csv', converters={'paragraphs': literal_eval})
cdqa_pipeline = QAPipeline(reader='bert_qa_korquad_vCPU.joblib')  # load the model
retriever = BM25Retriever(ngram_range=(1, 2), max_df=1.00, min_df=1, stop_words=None)  # retriever for scoring similarity against documents
retriever_temp = BM25Retriever(ngram_range=(1, 2), max_df=1.00, min_df=1, stop_words=None)  # retriever for scoring similarity against sentences
retriever.fit(df)  # index the contents of every document
df = filter_paragraphs(df)
best_idx_scores = ''

def text_tranform(text):
    return '\n'.join(text.split(', '))

def make_query(text):
    dataSend = {
        "version": "2.0",
        "template": {
            "outputs": [{
                "simpleText": {
                    "text": text
                }
            }]
        }
    }
    return dataSend
Example #15
import pandas as pd
from ast import literal_eval

from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline

# Download data and models
#download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
#download_model(model='bert-squad_1.1', dir='./models')

# Loading data and filtering / preprocessing the documents
df = pd.read_csv('data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
                 converters={'paragraphs': literal_eval})
df = filter_paragraphs(df)

# Loading QAPipeline with CPU version of BERT Reader pretrained on SQuAD 1.1
cdqa_pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')

# Fitting the retriever to the list of documents in the dataframe
_ = cdqa_pipeline.fit_retriever(df)

# Sending a question to the pipeline and getting prediction
query = 'Since when does the Excellence Program of BNP Paribas exist?'
prediction = cdqa_pipeline.predict(query)

print('query: {}\n'.format(query))
print('answer: {}\n'.format(prediction[0]))
print('title: {}\n'.format(prediction[1]))
print('paragraph: {}\n'.format(prediction[2]))
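
For reference, the retriever in these examples is fitted on a DataFrame with a 'title' column and a 'paragraphs' column holding a list of strings per row; a minimal hand-built frame (hypothetical content) looks like:

df = pd.DataFrame({
    # hypothetical rows, echoing the BNP Paribas example above
    'title': ['Excellence Program'],
    'paragraphs': [[
        'Since January 2016, BNP Paribas has offered an Excellence Program.',
        'The program lasts 18 months and comprises three assignments.',
    ]],
})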
Example #16
import requests
from flask import Flask, request, Response
from flask_ngrok import run_with_ngrok
import pandas as pd
from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline

API_KEY = '936714777:AAGFCBbeOAClrTsgmMMOsYG3HkaV7Ck5p-w'

app = Flask(__name__)
run_with_ngrok(app)
df = pd.read_csv('data/bnpp_newsroom_v1.1/jungchat_result.csv',
                 converters={'paragraphs': literal_eval})
cdqa_pipeline = QAPipeline(reader='models/bert_qa_korquad_vCPU.joblib')
cdqa_pipeline.fit_retriever(df)


def parse_message(message):
    chat_id = message['message']['chat']['id']
    msg = message['message']['text']

    return chat_id, msg


def send_message(chat_id, query):
    url = 'https://api.telegram.org/bot{token}/sendMessage'.format(
        token=API_KEY)
    # bundle the variables into a dictionary
Example #17
import pandas as pd
from ast import literal_eval
from cdqa.pipeline.cdqa_sklearn import QAPipeline
from rasa_sdk import Action

# read the csv file
df = pd.read_csv(
    '/Users/ashutoshvishnoi/Data_Science/intern_2/products/BankCurrupcy/qa_system/sample_data2/'
    'answs.csv',
    converters={'paragraphs': literal_eval})

# Load the bert qa model
cdqa_pipeline = QAPipeline(
    reader='/Users/ashutoshvishnoi/Data_Science/intern_2/products/BankCurrupcy/'
    'qa_system/models/bert_qa.joblib')

ques_dict = []

cdqa_pipeline.fit_retriever(df)
print('-----Model loaded successfully and fit successfully----')


class ActionGetNewst(Action):
    def name(self):
        return 'action_get_bertAns'

    def run(self, dispatcher, tracker, domain):
        query = tracker.latest_message['text']
        prediction = cdqa_pipeline.predict(query, n_predictions=3)

        # dispatcher.utter_message('query: {}\n'.format(query))
Example #18
import os
from ast import literal_eval
import pandas as pd

from flask import Flask, request, jsonify
from flask_cors import CORS
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline.cdqa_sklearn import QAPipeline

app = Flask(__name__)
CORS(app)

dataset_path = os.environ["dataset_path"]
reader_path = os.environ["reader_path"]

df = pd.read_csv(dataset_path, converters={"paragraphs": literal_eval})
df = filter_paragraphs(df)

cdqa_pipeline = QAPipeline(reader=reader_path)
cdqa_pipeline.fit(X=df)


@app.route("/api", methods=["GET"])
def api():

    query = request.args.get("query")
    prediction = cdqa_pipeline.predict(X=query)

    return jsonify(query=query,
                   answer=prediction[0],
                   title=prediction[1],
                   paragraph=prediction[2])
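
A hypothetical client call against the /api route above (host and port are assumptions; adjust to your deployment):

import requests

# assumed host/port for a local run of the Flask app
resp = requests.get('http://localhost:5000/api',
                    params={'query': 'Since when does the Excellence Program of BNP Paribas exist?'})
print(resp.json()['answer'])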
Example #19
import pandas as pd
import streamlit as st
from cdqa.utils.download import download_squad, download_model

download_squad(dir=directory)
download_model('bert-squad_1.1', dir=directory)
download_model('distilbert-squad_1.1', dir=directory)

from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline.cdqa_sklearn import QAPipeline
from nltk import tokenize


def load_from_csv(file):
    df = pd.read_csv(file)
    df = df.rename(str.lower, axis='columns')
    df['paragraphs'] = df['paragraphs'].apply(
        lambda x: x.replace("'s", " s").replace("\n", " "))
    df['paragraphs'] = df['paragraphs'].apply(
        lambda x: tokenize.sent_tokenize(x))
    return df


df = load_from_csv('./data/test.csv')
# make sure bert_qa.joblib is in the same directory (cdQA); if not, move it here from data
cdqa_pipeline = QAPipeline(reader='bert_qa.joblib')
cdqa_pipeline.fit_retriever(df=df)

query = st.text_area('enter message', 'type')
if st.button('analyze'):
    message = cdqa_pipeline.predict(query=query, n_predictions=2)
    st.success(message)
Example #20
import pandas as pd
from ast import literal_eval

from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline

# Download data and models
download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
# download_model(model='bert-squad_1.1', dir='./models')

# Loading data and filtering / preprocessing the documents
# df = pd.read_csv('data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv', converters={'paragraphs': literal_eval})
df = pd.read_csv('data/bnpp_newsroom_v1.1/custom_tax_jlb.csv',
                 converters={'paragraphs': literal_eval})
df = filter_paragraphs(df)

# Loading QAPipeline with CPU version of BERT Reader pretrained on SQuAD 1.1
# cdqa_pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')
cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')

# Fitting the retriever to the list of documents in the dataframe
# cdqa_pipeline.fit_retriever(X=df)
cdqa_pipeline.fit_retriever(df=df)

# Sending a question to the pipeline and getting prediction
# query = 'Since when does the Excellence Program of BNP Paribas exist?'
# query = 'Who should investors consult with prior to investing?'
# query = 'Who do custom animal farmers need to consult with before buying fertilizer?'
queries = [
    'Who do custom animal farmers need to consult with before buying fertilizer?',
    'do I qualify for an automatic extension of time to file without filing Form 4868?',
    'Did the coronavirus pandemic extend the deadline to pay taxes?',
    'What is the new tax deadline?', 'What is the tax payer advocate service?',
    'What is the job of the taxpayer advocate service?',
Example #21
reader.fit(X=(train_examples, train_features))

# Output fine-tuned model
reader.model.to('cpu')
reader.device = torch.device('cpu')
joblib.dump(reader, os.path.join(reader.output_dir, 'bert_tim_qa_vCPU.joblib'))

#%% [markdown]
# ### Training

#%%
from cdqa.pipeline.cdqa_sklearn import QAPipeline

# Load standard model
cdqa_pipeline = QAPipeline(reader='./cdqa/bert_qa_vCPU-sklearn.joblib', max_answer_length=60)
cdqa_pipeline.fit_retriever(X=df_X)


#%%
# Evaluate QnA system
from cdqa.utils.evaluation import evaluate_pipeline
evaluate_pipeline(cdqa_pipeline, 'cdqa-v1.1-tim_qna.json')

# Standard pre trained model: {'exact_match': 0.0, 'f1': 5.025362668068075}
# Fine-tuned model: {'exact_match': 0.0, 'f1': 5.684362620078064}

#%% [markdown]
# ### Inference

#%%
Example #22
#df = pd.read_csv('data/my_data/homework.csv', converters={'paragraphs': literal_eval})
#df = filter_paragraphs(df)

df = pd.DataFrame(columns=['title', 'paragraphs'])
paragraphs = input("Text to Analyze:\n").split('\n')
df = df.append({'title': 'Input Data', 'paragraphs': paragraphs},
               ignore_index=True)

print(df)

cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib',
                           min_df=1,
                           max_df=1000)

cdqa_pipeline.fit_retriever(df=df)

while True:
    query = input('> ')
    prediction = cdqa_pipeline.predict(query=query)

    #if prediction[3] < -2:
    #    print("cdQA: Sorry, I don't know.")
    #else:
    #print('query: {}\n'.format(query))
    print('cdQA: {}'.format(prediction[0]))
    #print('title: {}\n'.format(prediction[1]))
    #print('paragraph: {}\n'.format(prediction[2]))