Example #1
    def _mini_squad_processor(self, query: List[str],
                              context: List[str]) -> List[SquadExample]:
        """Squad data processor to create `SquadExamples`

        * **query** - List of query strings, must be same length as `context`
        * **context** - List of context strings, must be same length as `query`

        """
        assert len(query) == len(context)
        examples = []
        title = 'qa'
        is_impossible = False
        answer_text = None
        start_position_character = None
        answers = ['answer']
        for idx, (q, c) in enumerate(zip(query, context)):
            example = SquadExample(
                qas_id=str(idx),
                question_text=q,
                context_text=c,
                answer_text=answer_text,
                start_position_character=start_position_character,
                title=title,
                is_impossible=is_impossible,
                answers=answers,
            )
            examples.append(example)
        return examples
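A list produced by a processor like this is typically converted to model-ready features next. The following is a minimal sketch (not part of the example above), assuming the Hugging Face transformers library is installed; the checkpoint name is only an illustration and any QA-capable tokenizer would do.

from transformers import AutoTokenizer
from transformers.data.processors.squad import (
    SquadExample,
    squad_convert_examples_to_features,
)

# Inference-only SquadExamples: no answer text or start position.
queries = ["Who created SQuAD?", "When was it released?"]
contexts = [
    "SQuAD was created by researchers at Stanford University.",
    "The dataset was released in 2016.",
]
examples = [
    SquadExample(
        qas_id=str(i),
        question_text=q,
        context_text=c,
        answer_text=None,
        start_position_character=None,
        title="qa",
        is_impossible=False,
    )
    for i, (q, c) in enumerate(zip(queries, contexts))
]

# The legacy squad helpers are usually paired with a slow (Python) tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    return_dataset="pt",  # also returns a torch TensorDataset
)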
Example #2
def create_sample(
    question: Union[str, List[str]],
    context: Union[str, List[str]],
) -> Union[SquadExample, List[SquadExample]]:
    """
    QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
    This helper method encapsulates all the logic for converting question(s) and
    context(s) to SquadExample(s). We currently support extractive question answering.

    Args:
        question: (str, List[str]) The question(s) to ask for the associated context(s).
        context: (str, List[str]) The context(s) in which to look for the answer(s).
    """
    if isinstance(question, list):
        return [
            SquadExample(None, q, c, None, None, None)
            for q, c in zip(question, context)
        ]
    else:
        return SquadExample(None, question, context, None, None, None)
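This mirrors QuestionAnsweringPipeline.create_sample: a single question/context pair returns one SquadExample, while parallel lists are zipped pairwise into a list. A short usage sketch, assuming the function above and SquadExample (from transformers.data.processors.squad) are in scope; the questions and contexts are made up for illustration.

# Single pair -> one SquadExample
single = create_sample(
    question="Where is the Eiffel Tower?",
    context="The Eiffel Tower is located in Paris.",
)
print(single.question_text)  # "Where is the Eiffel Tower?"

# Parallel lists -> a list of SquadExamples, zipped pairwise
batch = create_sample(
    question=["Where is the Eiffel Tower?", "Who built it?"],
    context=["The Eiffel Tower is in Paris.",
             "It was built by Gustave Eiffel's company."],
)
print(len(batch))  # 2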
Example #3
        preds_docs_guid.append(
            (pred, samples[index].text_b, samples[index].guid))
    # For each question, take its slice of (score, passage, guid) tuples,
    # sort the candidate passages by score (highest first), and keep the
    # sorted list for saving.
    save = []
    last_index = 0
    squad_samples = []
    for n_doc, question in zip(docs_per_queston, questions):
        to_sort = preds_docs_guid[last_index:last_index + n_doc]
        last_index += n_doc
        to_sort.sort(key=lambda x: x[0], reverse=True)
        save.append(to_sort)
        # Build SquadExamples from the top-ranked passages for this question.
        for (_, passage,
             guid) in to_sort[0:min(args.rerank_n_docs, len(to_sort))]:
            squad_samples.append(
                SquadExample(qas_id=guid,
                             question_text=question,
                             context_text=passage,
                             answer_text='',
                             start_position_character=0,
                             title=''))

    logger.info('Initializing Reader ...')
    reader = Reader(args.reader_model_type, args.reader_path, args.save_dir)
    reader.load_model()

    reader.evaluate(squad_samples)

    logger.info('Saving reranked docs with scores...')
    with open(os.path.join(args.save_dir, "reranked_preds"), 'wb') as fp:
        pickle.dump(save, fp)
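The pickled save object is a list with one entry per question, each entry holding that question's (score, passage, guid) tuples sorted by score. A minimal sketch for loading it back, assuming the same directory that was passed as args.save_dir:

import os
import pickle

save_dir = "output"  # assumption: the directory used as args.save_dir above

with open(os.path.join(save_dir, "reranked_preds"), "rb") as fp:
    reranked = pickle.load(fp)

# reranked[i] holds the sorted (score, passage, guid) tuples for question i.
for score, passage, guid in reranked[0][:3]:
    print(guid, score, passage[:80])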
Example #4
    preds_and_docs = []
    for index, pred in enumerate(preds):
        preds_and_docs.append((pred, samples[index].text_b))

    logger.info('preparing data for extraction...')
    squad_samples = []
    save = []
    # Walk through the flat (score, passage) list question by question.
    # Note: this slicing assumes every question has the same number of
    # candidate documents, i.e. every entry of docs_per_queston is equal.
    begin = 0
    end = 0
    q_id = 0
    for indice in docs_per_queston:
        end += 1
        document = preds_and_docs[begin * indice:end * indice]
        begin += 1

        c_id = 0
        # One SquadExample per candidate passage; qas_id encodes "<q_id>_<c_id>".
        for doc in document:
            squad_samples.append(
                SquadExample(qas_id=str(q_id) + '_' + str(c_id),
                             question_text=questions[q_id],
                             context_text=doc[1],
                             answer_text='',
                             start_position_character=0,
                             title=''))
            c_id += 1
        q_id += 1

    logger.info(squad_samples[0])
    logger.info('Evaluating...')
    reader.evaluate(squad_samples)
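Because each qas_id is built as "<q_id>_<c_id>", per-example predictions can later be mapped back to their question and candidate-passage indices. A small sketch under that assumption; the predictions dict below is hypothetical and only stands in for whatever the project's Reader returns keyed by qas_id.

# Hypothetical predictions keyed by qas_id, e.g. produced by the reader.
predictions = {"0_3": "Paris", "1_0": "1889"}

for qas_id, answer in predictions.items():
    q_idx, c_idx = (int(part) for part in qas_id.split("_"))
    print(f"question {q_idx}, passage {c_idx}: {answer}")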
Example #5
    # start time
    start = time.time()

    # read all the data and store it
    logger.info('Reading data ...')
    questions = []
    answers = []

    with open(args.dataset, 'r', encoding='utf-8-sig') as f:
        data = list(csv.reader(f, delimiter='\t', quotechar=None))

    # get the closest docs for each question.

    logger.info('Reader ...')
    reader = Reader(args.reader_model_type, args.reader_path,
                    args.reader_output_dir)
    reader.load_model()

    logger.info('creating samples ...')
    squad_samples = []
    # Each TSV row is expected to hold: qas_id, question text, context text.
    for line in data:
        squad_samples.append(
            SquadExample(qas_id=line[0],
                         question_text=line[1],
                         context_text=line[2],
                         answer_text='',
                         start_position_character=0,
                         title=''))

    reader.evaluate(squad_samples)
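The loop above expects each tab-separated row to carry an id, a question, and a context in its first three columns. A small sketch of writing a compatible file; the file name and rows are only illustrative.

import csv

rows = [
    ("q0", "Where is the Eiffel Tower?", "The Eiffel Tower is located in Paris."),
    ("q1", "When was it completed?", "Construction finished in 1889."),
]

# Matches the reader above: tab-delimited, no quoting.
with open("qa_dataset.tsv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t", quoting=csv.QUOTE_NONE)
    writer.writerows(rows)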