Example #1
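These snippets are excerpted from larger projects and mostly omit their imports. A minimal header along the lines below should cover most of them; the cdqa import paths are taken verbatim from Examples #4 and #9, while torch, json, os, shutil, nltk's sent_tokenize, and the evaluation import are assumptions inferred from the calls in Examples #1, #3, #4, #5, and #8:

import json
import os
import shutil
from ast import literal_eval

import pandas as pd
import torch  # assumed: only needed for the CUDA check in Example #1
from nltk.tokenize import sent_tokenize  # assumed: used in Example #4

from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.utils.converters import pdf_converter  # used in Examples #5 and #8
from cdqa.utils.evaluation import evaluate_pipeline  # assumed path, used in Example #3
from cdqa.pipeline.cdqa_sklearn import QAPipeline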
def execute_pipeline(query, n_predictions=None):
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(reader="models/bert_qa_vCPU-sklearn.joblib")
    cdqa_pipeline.fit_retriever(df)
    if torch.cuda.is_available():
        cdqa_pipeline.cuda()
    if n_predictions is not None:
        predictions = cdqa_pipeline.predict(query, n_predictions=n_predictions)
        result = []

        # Keep only the answer and its source document title
        for answer, title, _, _ in predictions:
            result.append((answer, title))
        return result
    else:
        prediction = cdqa_pipeline.predict(query)
        result = (prediction[0], prediction[1])
        return result
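A minimal usage sketch for this helper (the query string is borrowed from Example #3):

# Single best answer: an (answer, title) tuple
answer, title = execute_pipeline(
    "Since when does the Excellence Program of BNP Paribas exist?")

# Top-3 candidates: a list of (answer, title) tuples
top_3 = execute_pipeline(
    "Since when does the Excellence Program of BNP Paribas exist?",
    n_predictions=3)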
Example #2
    def load_data(self, filepath=None):
        """
        Read in a data file/path and determine the file type.
        If no file type is given, assume the folder contains PDFs.
        """
        df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
        df = filter_paragraphs(df)
        self.cdqa_pipeline.fit_retriever(df=df)
Example #3
def test_evaluate_pipeline():

    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    test_data = {
        "data": [{
            "title": "BNP Paribas’ commitment to universities and schools",
            "paragraphs": [{
                "context": "Since January 2016, BNP Paribas has offered an Excellence Program targeting new Master’s level graduates (BAC+5) who show high potential. The aid program lasts 18 months and comprises three assignments of six months each. It serves as a strong career accelerator that enables participants to access high-level management positions at a faster rate. The program allows participants to discover the BNP Paribas Group and its various entities in France and abroad, build an internal and external network by working on different assignments and receive personalized assistance from a mentor and coaching firm at every step along the way.",
                "qas": [{
                    "answers": [
                        {"answer_start": 6, "text": "January 2016"},
                        {"answer_start": 6, "text": "January 2016"},
                        {"answer_start": 6, "text": "January 2016"},
                    ],
                    "question": "Since when does the Excellence Program of BNP Paribas exist?",
                    "id": "56be4db0acb8001400a502ec",
                }],
            }],
        }],
        "version": "1.1",
    }

    with open("./test_data.json", "w") as f:
        json.dump(test_data, f)

    cdqa_pipeline = QAPipeline(reader="./models/bert_qa_vCPU-sklearn.joblib",
                               n_jobs=-1)
    cdqa_pipeline.fit_retriever(X=df)

    eval_dict = evaluate_pipeline(cdqa_pipeline,
                                  "./test_data.json",
                                  output_dir=None)

    assert eval_dict["exact_match"] > 0.8

    assert eval_dict["f1"] > 0.8
Example #4
def question(text, query):
    print(text)
    test = []
    for i in sent_tokenize(text):
        if len(i) > 2:
            test.append(i)

    # Group the sentences into chunks of n to serve as pseudo-paragraphs
    n = 4
    final = [test[i * n:(i + 1) * n] for i in range((len(test) + n - 1) // n)]
    title_s = []
    for j in range(len(final)):
        title_s.append(f'Title{j}')

    data = [title_s, final]
    df3 = pd.DataFrame(data=data)
    df3 = df3.transpose()
    df3.columns = ['title', 'paragraphs']
    print(df3)
    #st.text('Hold on this will take some time')

    from ast import literal_eval

    from cdqa.utils.filters import filter_paragraphs
    from cdqa.utils.download import download_model, download_bnpp_data
    from cdqa.pipeline.cdqa_sklearn import QAPipeline

    # Download data and models
    #download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
    #download_model(model='bert-squad_1.1', dir='./models')

    # Loading data and filtering / preprocessing the documents
    df = pd.read_csv(
        'D:/devjams/Machine-Learning-Web-Apps-master/NLPIffy_NLP_Based_SpaCy_Flask_App&_API/cdQA/data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
        converters={'paragraphs': literal_eval})
    df = filter_paragraphs(df)
    #st.text('Please Wait. We are looking for the answer to your question')
    # Loading QAPipeline with CPU version of BERT Reader pretrained on SQuAD 1.1
    cdqa_pipeline = QAPipeline(
        reader='D:/devjams/Machine-Learning-Web-Apps-master/NLPIffy_NLP_Based_SpaCy_Flask_App&_API/bert_qa_vGPU-sklearn.joblib'
    )

    # Fitting the retriever to the list of documents in the dataframe
    cdqa_pipeline.fit_retriever(df3)
    print(query)
    #st.text('Almost done.......')
    #query = 'Intellectual Property Rights'
    try:
        prediction = cdqa_pipeline.predict(query)
    except Exception as e:
        # Return early so `prediction` is never referenced unbound
        print(e)
        return None
    #st.text(prediction[2])
    return prediction[2]
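A hypothetical call, reusing the query from the commented-out line above (document.txt is an assumed input file):

text = open("document.txt").read()  # hypothetical plain-text document
answer_paragraph = question(text, "Intellectual Property Rights")
print(answer_paragraph)  # the paragraph containing the predicted answer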
Example #5
    def convert_data(self, filepath):
        """
        Convert data files 
        to txt
        """
        filename = os.path.basename(filepath)
        name, extension = os.path.splitext(str(filename))
        root, _ = filepath.split(f"/text/{filename}")
        filepath_txt = f"{root}/text/{name}.txt"
        filepath_csv = f"{root}/csv/{name}.csv"

        if extension == ".csv":
            # csv needs to have "title" and "paragraphs" features
            df = pd.read_csv(filepath, converters={"paragraphs": literal_eval})
            df = filter_paragraphs(df)
            # https://stackoverflow.com/questions/51491931/reading-text-files-from-subfolders-and-folders-and-creating-a-dataframe-in-panda

        elif extension in (".txt", ".story"):
            lines = []
            # Read file and remove non UTF-8 chars
            with open(filepath, encoding="utf8", errors='ignore') as f:
                for line in f:
                    lines.append(
                        bytes(line, "utf-8").decode("utf-8", "ignore"))
                paragraphs = lines

            # Make df to use in QA
            df = pd.DataFrame({"title": filename, "paragraphs": [paragraphs]})
            with open(filepath_txt, "w+") as f:
                for line in lines:
                    f.write(line)

        elif extension == ".pdf":
            tmp_dir = f"{root}/tmp"
            tmp_filepath = f"{tmp_dir}/{filename}"

            if not os.path.exists(tmp_dir):
                os.makedirs(tmp_dir)
            shutil.copyfile(filepath, tmp_filepath)

            df = pdf_converter(directory_path=tmp_dir)
            shutil.rmtree(tmp_dir, ignore_errors=True)
            os.remove(filepath)  # Remove original pdf file

            with open(filepath_txt, "w") as file:
                for line in df.loc[0]["paragraphs"]:
                    file.write("\n" + line)

        #df.to_csv(f"{filepath_csv}", index=False)
        self.cdqa_pipeline.fit_retriever(df=df)
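A hedged sketch of how the dispatch might be driven; the loader instance and the directory layout are assumptions, but the method does expect inputs under a <root>/text/ directory:

# <root>/text/ holds the inputs; <root>/csv/ would receive converted output
loader.convert_data("./corpus/text/report.pdf")    # .pdf -> pdf_converter via a tmp dir
loader.convert_data("./corpus/text/notes.txt")     # .txt/.story -> one-row DataFrame
loader.convert_data("./corpus/text/articles.csv")  # .csv -> needs "title" and "paragraphs" columns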
Example #6
def execute_pipeline(query):
    download_bnpp_data('./data/bnpp_newsroom_v1.1/')
    download_model('bert-squad_1.1', dir='./models')
    df = pd.read_csv('./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
                     converters={'paragraphs': literal_eval})
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(reader='models/bert_qa_vCPU-sklearn.joblib')
    cdqa_pipeline.fit_retriever(X=df)

    prediction = cdqa_pipeline.predict(X=query)

    result = (prediction[0], prediction[1])

    return result
Example #7
def execute_pipeline(query):
    df = pd.read_csv('data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
                     converters={'paragraphs': literal_eval})
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(
        reader='models/bert_qa_vCPU-sklearn.joblib')
    cdqa_pipeline.fit(X=df)
    cdqa_pipeline.reader.output_dir = None

    prediction = cdqa_pipeline.predict(X=query)

    result = (prediction[0], prediction[1])

    return result
Example #8
def wholeShot():
    print("UPDATING CORPUS")

    df = pd.read_csv("pdfs/data.csv")
    df2 = pdf_converter(directory_path='pdfs')
    # Drop rows that are missing or hold the literal string 'None'
    # (the bare df2.dropna() call discarded its result, so it is removed)
    df2 = df2.mask(df2.eq('None')).dropna()
    df2 = filter_paragraphs(df2)
    # for rows in df2
    #   if row title is not in df
    #       append row to df
    print(len(df2))
    for index, row in df2.iterrows():
        if str(row['title']) not in list(df['title']):
            # DataFrame.append was removed in pandas 2.0; concat a one-row frame
            df = pd.concat([df, row.to_frame().T], ignore_index=True)
    print(df)
    df.to_csv("pdfs/data.csv", index=False)

    os.system("docker stop search-engine")
    os.system("docker rm search-engine")
    os.system("docker build -t search-engine . -f dockerfile-service")
    os.system(
        "docker run -d -p 5000:5000 -v /home/sidworld/sixgod/pdfs:/pdfs --name search-engine --rm search-engine "
    )
Example #9
# coding: utf-8
import os
import pandas as pd
from ast import literal_eval
import cdqa
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline.cdqa_sklearn import QAPipeline

df = pd.read_csv('/home/ubuntu/data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv',
                 converters={'paragraphs': literal_eval})
df = filter_paragraphs(df)

df['title'] = df['category']

cdqa_pipeline = QAPipeline(
    reader='/home/ubuntu/data/bert_qa_vCPU-sklearn.joblib')
cdqa_pipeline.fit(X=df)

print('At result')


class QAModule():
    def __init__(self):
        self.query = 'Since when does the Excellence Program of BNP Paribas exist?'

    def getAnswer(self, query):
        prediction = cdqa_pipeline.predict(X=query)
        return prediction
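A minimal usage sketch for QAModule as defined above:

qa = QAModule()
prediction = qa.getAnswer(qa.query)
print(prediction)  # (answer, title, paragraph, ...) per the indexing in Examples #1 and #4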


class SentimentModule():
Example #10
    for i in range(len(Extract_a)):
        Extract_b = dict(Extract_a[i])
        # Keep common nouns (NNG), proper nouns (NNP) and verbs (VV)
        for j in range(len(Extract_b['morp'])):
            morp = Extract_b['morp'][j]
            if morp['type'] in ('NNG', 'NNP', 'VV'):
                Noun.append(morp['lemma'])
    return " ".join(Noun)

df = pd.read_csv('data/bnpp_newsroom_v1.1/jungchat_result_191031.csv',
                 converters={'paragraphs': literal_eval})


retriever = BM25Retriever(ngram_range=(1, 2), max_df=1.00, min_df=1,
                          stop_words=None)
retriever_temp = BM25Retriever(ngram_range=(1, 2), max_df=1.00, min_df=1,
                               stop_words=None)
#retriever_doc= BM25Retriever(ngram_range=(1, 2), max_df=1.00,min_df=1, stop_words=None)
retriever.fit(df)

df = filter_paragraphs(df, min_length=10)

cdqa_pipeline = QAPipeline(reader='models/bert_qa_korquad_vCPU.joblib')


# Initialize so the first query always triggers retrieval
best_idx_scores = ''

while True:
    query = input('Prompt: ')
    if query == 'quit':
        break
    POS_query = ETRI_POS_Tagging(query)
    #print(list(list(retriever.predict(ETRI_POS_Tagging(query)).values())[0])[0])
    # Re-select the document when the best retrieval score is high enough,
    # or on the very first query
    if max(retriever.predict(POS_query).values()) >= 1.5 or not best_idx_scores:

        if best_idx_scores:  # a similarity match exists and this is a follow-up query
Example #11

#df = pd.read_csv('data/bnpp_newsroom_v1.1/jungchat_result_191015.csv',converters={'paragraphs': literal_eval})
df = pd.read_csv('data/bnpp_newsroom_v1.1/jungchat_result_191015_fix.csv',
                 converters={'paragraphs': literal_eval})
# Declare and train the retriever used to compute similarity
retriever = BM25Retriever(ngram_range=(1, 2),
                          max_df=1.00,
                          min_df=1,
                          stop_words=None)
# Train on the content column of df
retriever.fit(df)

# Load the pretrained BERT model
cdqa_pipeline = QAPipeline(reader='models/bert_qa_korquad_vCPU.joblib')
df = filter_paragraphs(df, min_length=10, max_length=100)

# Initialize so the first query always triggers retrieval
best_idx_scores = ''
while True:
    query = input('Prompt: ')
    # If the question scores high enough against a policy, select that
    # policy's document and refit the retriever; otherwise keep answering
    # against the currently selected policy
    print(
        list(list(retriever.predict(ETRI_POS_Tagging(query)).values())[0])[0])
    if list(list(retriever.predict(ETRI_POS_Tagging(query)).values())
            [0])[0] >= 4. or not best_idx_scores:
        # Predict the index of the policy most similar to the question
        best_idx_scores = retriever.predict(ETRI_POS_Tagging(query))
        # Fit on the document selected from the youth policies
        cdqa_pipeline.fit_retriever(df.loc[best_idx_scores.keys()].head(1))