def execute_pipeline(query, n_predictions=None):
    """Run the end-to-end cdQA demo pipeline and answer *query*.

    Downloads the BNP Paribas newsroom dataset and the BERT-SQuAD reader,
    fits a retriever on the filtered paragraphs, then predicts.

    Parameters
    ----------
    query : str
        Natural-language question to answer.
    n_predictions : int or None, optional
        If given, return the top-n predictions; otherwise a single best one.

    Returns
    -------
    list[tuple] or tuple
        ``[(answer, title), ...]`` when *n_predictions* is set, else the
        ``(answer, title)`` pair of the single best prediction.
    """
    download_bnpp_data("./data/bnpp_newsroom_v1.1/")
    download_model("bert-squad_1.1", dir="./models")
    df = pd.read_csv(
        "./data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv",
        converters={"paragraphs": literal_eval},
    )
    df = filter_paragraphs(df)

    cdqa_pipeline = QAPipeline(reader="models/bert_qa_vCPU-sklearn.joblib")
    cdqa_pipeline.fit_retriever(df)
    # Move the reader to GPU only when one is actually available.
    if torch.cuda.is_available():
        cdqa_pipeline.cuda()

    if n_predictions is not None:
        predictions = cdqa_pipeline.predict(query, n_predictions=n_predictions)
        # Each prediction is an (answer, title, paragraph, score) tuple;
        # keep only (answer, title). Comprehension replaces the original
        # manual loop-and-append.
        return [(answer, title) for answer, title, _paragraph, _score in predictions]

    prediction = cdqa_pipeline.predict(query)
    return (prediction[0], prediction[1])
from cdqa.pipeline import QAPipeline
from cdqa.utils.evaluation import f1_score, exact_match_score

# dataset: augmented QA data; 'paragraphs' column is stored as a stringified
# Python list, hence the literal_eval converter.
df = pd.read_csv(
    './data/data_augmentation.csv',
    converters={'paragraphs': literal_eval},
    encoding='utf-8',
)

# Newly define 'paragraphs': prepend the title to each document's paragraph
# list, keeping the original list in 'paragraphs_old'.
# (Translated from Korean: "paragraphs 새로 정의 : Title + Paragraph")
df['paragraphs_old'] = df['paragraphs']
df['paragraphs'] = df.apply(lambda row: [row['title']] + row['paragraphs_old'], axis=1).copy()

# Second, unconverted read of the same CSV; sample 100 rows for evaluation.
# random_state fixed so the sample is reproducible.
data = pd.read_csv('./data/data_augmentation.csv', encoding='utf-8')
data_sampling = data.sample(100, random_state=66)

from cdqa.retriever import TfidfRetriever, BM25Retriever

# Document-level BM25 retrieval with a locally trained multilingual reader.
cdqa_pipeline = QAPipeline(reader='bert_qa_multi_epoch3.joblib', retrieve_by_doc=True, retriever='bm25')
cdqa_pipeline.fit_retriever(df=df)
cdqa_pipeline.cuda()

# Standalone BM25 retriever fitted on the same corpus; used directly by f1()
# below instead of the full pipeline.
retriever = BM25Retriever(ngram_range=(1, 2), max_df=0.8, min_df=3, stop_words=None, lowercase=True, top_n=5)
retriever.fit(df=df)


def f1(dataframe, dataframe2):
    """Collect the retriever's top answer paragraph for 100 questions.

    NOTE(review): the visible body only fills ``answer_list`` — neither
    parameter's name matches its use (*dataframe* is unused; questions come
    from *dataframe2*), and no return statement or scoring appears here;
    presumably the scoring against f1_score/exact_match_score continues past
    this chunk — confirm against the full file.
    """
    number = 0
    exact_number = 0
    # score = []
    answer_list = []
    while number < 100:
        # print("Question?")
        question = dataframe2.iloc[number, 2]  # the question (translated from Korean: "질문")
        # question = input()
        best_idx_scores = retriever.predict(question)
        # Take the top-ranked document, drop the prepended title (x[1] skips
        # element 0, the title), and strip non-breaking spaces.
        prediction = df.loc[best_idx_scores.keys()]['paragraphs'].apply(lambda x: x[1]).tolist()[0].replace(u'\xa0', u'')
        number += 1
        answer_list.append(prediction)