def _mini_squad_processor(self, query: List[str], context: List[str]) -> List[SquadExample]:
    """Wrap paired query/context strings into placeholder ``SquadExample`` objects.

    * **query** - List of query strings, must be same length as `context`
    * **context** - List of context strings, must be same length as `query`
    """
    assert len(query) == len(context)
    # Fixed placeholder fields: these examples carry no gold answer, they
    # only exist so the QA pipeline machinery can consume query/context pairs.
    shared_answers = ['answer']
    return [
        SquadExample(
            qas_id=str(i),
            question_text=question,
            context_text=passage,
            answer_text=None,
            start_position_character=None,
            title='qa',
            is_impossible=False,
            answers=shared_answers,
        )
        for i, (question, passage) in enumerate(zip(query, context))
    ]
def create_sample( question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: """ QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally. This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s). We currently support extractive question answering. Args: question: (str, List[str]) The question to be ask for the associated context context: (str, List[str]) The context in which we will look for the answer. """ if isinstance(question, list): return [ SquadExample(None, q, c, None, None, None) for q, c in zip(question, context) ] else: return SquadExample(None, question, context, None, None, None)
# NOTE(review): this first append looks like the tail of a loop over
# (index, pred) pairs whose header is outside this chunk -- confirm.
preds_docs_guid.append(
    (pred, samples[index].text_b, samples[index].guid))

save = []  # per-question lists of (score, passage, guid), best-first
last_index = 0
squad_samples = []
# Walk the flat preds_docs_guid list question by question: each question
# owns the next n_doc consecutive entries.
for n_doc, question in zip(docs_per_queston, questions):
    to_sort = preds_docs_guid[last_index:last_index + n_doc]
    last_index += n_doc
    # Highest reranker score first.
    to_sort.sort(key=lambda x: x[0], reverse=True)
    save.append(to_sort)
    # Keep only the top rerank_n_docs passages for the reader stage;
    # min() guards against questions with fewer retrieved docs.
    for (_, passage, guid) in to_sort[0:min(args.rerank_n_docs, len(to_sort))]:
        squad_samples.append(
            SquadExample(qas_id=guid,
                         question_text=question,
                         context_text=passage,
                         answer_text='',  # no gold answer: inference-only
                         start_position_character=0,
                         title=''))

logger.info('Initializing Reader ...')
reader = Reader(args.reader_model_type, args.reader_path, args.save_dir)
reader.load_model()
reader.evaluate(squad_samples)

logger.info('Saving reranked docs with scores...')
# Persist the per-question reranked (score, passage, guid) lists as a pickle.
with open(os.path.join(args.save_dir, "reranked_preds"), 'wb') as fp:
    pickle.dump(save, fp)
# Pair each reranker score with the passage text it scored.
preds_and_docs = []
for index, pred in enumerate(preds):
    preds_and_docs.append((pred, samples[index].text_b))

logger.info('preparing data for extraction...')
squad_samples = []
save = []
# BUG FIX: the old slicing `preds_and_docs[begin * indice:end * indice]`
# only selects the right window when every question has the same document
# count; with variable per-question counts it reads the wrong range. Use a
# running offset instead (same scheme as the reranking script's last_index).
offset = 0
q_id = 0
for indice in docs_per_queston:
    document = preds_and_docs[offset:offset + indice]
    offset += indice
    c_id = 0
    for doc in document:
        squad_samples.append(
            SquadExample(qas_id=str(q_id) + '_' + str(c_id),
                         question_text=questions[q_id],
                         context_text=doc[1],
                         answer_text='',  # no gold answer: inference-only
                         start_position_character=0,
                         title=''))
        c_id += 1
    q_id += 1

logger.info(squad_samples[0])
logger.info('Evaluating...')
reader.evaluate(squad_samples)
# start time
start = time.time()

# Read all the data and store it.
logger.info('Reading data ...')
questions = []
answers = []
with open(args.dataset, 'r', encoding='utf-8-sig') as f:
    data = list(csv.reader(f, delimiter='\t', quotechar=None))

# Get the closest docs for each question.
logger.info('Reader ...')
reader = Reader(args.reader_model_type, args.reader_path,
                args.reader_output_dir)
reader.load_model()

logger.info('creating samples ...')
# Each TSV row is (id, question, context); wrap them as inference-only
# SquadExamples (empty answer fields).
squad_samples = [
    SquadExample(qas_id=row[0],
                 question_text=row[1],
                 context_text=row[2],
                 answer_text='',
                 start_position_character=0,
                 title='')
    for row in data
]
reader.evaluate(squad_samples)