Example #1
    def _create_examples(self, input_data, set_type):
        examples = []
        for entry in tqdm(input_data):
            title = entry["title"]
            for paragraph in entry["paragraphs"]:
                context_text = paragraph["context"]
                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position_character = None
                    answer_text = None
                    answers = []

                    is_impossible = qa.get("is_impossible", False)
                    if not is_impossible:
                        answer = qa["answers"][0]
                        answers = qa["answers"]
                        answer_text = answer["text"]
                        start_position_character = answer["answer_start"]

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        context_text=context_text,
                        answer_text=answer_text,
                        start_position_character=start_position_character,
                        title=title,
                        is_impossible=is_impossible,
                        answers=answers,
                    )
                    examples.append(example)
        return examples
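For reference, a minimal sketch of the SQuAD-style JSON structure this loop walks; the field names come from the code above, but the values are purely illustrative:

# Hypothetical miniature of the expected input_data (the "data" list of a SQuAD v2 file).
input_data = [
    {
        "title": "Example Article",
        "paragraphs": [
            {
                "context": "Paris is the capital of France.",
                "qas": [
                    {
                        "id": "q1",
                        "question": "What is the capital of France?",
                        "is_impossible": False,
                        "answers": [{"text": "Paris", "answer_start": 0}],
                    }
                ],
            }
        ],
    }
]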
Example #2
def convert_to_example(question, paragraphs):
    """Wrap one question and its candidate paragraphs in a single korquadExample."""

    qas_id = QUESTION_ID
    question_text = question
    answer_text = None
    title = None
    is_impossible = False

    temp_example = korquadExample(
        qas_id=qas_id,
        question_text=question_text,
        answer_text=answer_text,
        title=title,
        is_impossible=is_impossible,
    )

    for paragraph in paragraphs:
        example = SquadExample(
            qas_id=qas_id,
            question_text=question_text,
            context_text=paragraph,
            answer_text=answer_text,
            start_position_character=None,
            title=title,
            is_impossible=is_impossible,
            answers=[],
        )
        temp_example.add_SquadExample(example)

    return [temp_example]
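korquadExample is not a transformers class; Examples #2 and #4 assume a project-specific container. A minimal sketch of what it might look like, inferred only from the constructor arguments and the add_SquadExample calls above:

# Hypothetical reconstruction; the real korquadExample class lives in the project's own code.
class korquadExample:
    def __init__(self, qas_id, question_text, answer_text, title, is_impossible):
        self.qas_id = qas_id
        self.question_text = question_text
        self.answer_text = answer_text
        self.title = title
        self.is_impossible = is_impossible
        self.squad_examples = []  # one SquadExample per candidate paragraph

    def add_SquadExample(self, example):
        self.squad_examples.append(example)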
Example #3
def create_squad_examples(data_path):
    with open(data_path, "r", encoding="utf-8") as f:
        input_data = json.load(f)["data"]
    examples = []
    for entry in tqdm(input_data):
        title = entry["title"]
        for paragraph in entry["paragraphs"]:
            context_text = paragraph["context"]
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position_character = None
                answer_text = None
                answers = []

                is_impossible = qa.get("is_impossible", False)
                if not is_impossible:
                    answer = qa["answers"][0]
                    answer_text = answer["text"]
                    start_position_character = answer["answer_start"]
                    answers = qa["answers"]

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    context_text=context_text,
                    answer_text=answer_text,
                    start_position_character=start_position_character,
                    title=title,
                    is_impossible=is_impossible,
                    answers=answers,
                )
                examples.append(example)
    return examples
Example #4
def convert_to_korquad_example(entry, is_training):
    title = entry["title"]
    html = entry["context"]
    qas = entry["qas"]

    ## Long answers only degrade the performance of BERT-family models, so restrict answer length.
    modified_qas = converter.get_qas_by_len(qas)
    if len(modified_qas) == 0:
        return []

    modified_paragraphs = converter.convert_to_squad_format(html, modified_qas)
    temp_examples = {}
    for modified_paragraph in modified_paragraphs:
        context_text = modified_paragraph["context"]

        for qa in modified_paragraph["qas"]:
            qas_id = qa["id"]
            question_text = qa["question"]
            start_position_character = None
            answer_text = None
            answers = []

            if "is_impossible" in qa:
                is_impossible = qa["is_impossible"]
            else:
                is_impossible = False

            if not is_impossible:
                if is_training:
                    answer = qa["answers"][0]
                    answer_text = answer["text"]
                    start_position_character = answer["answer_start"]
                else:
                    answers = qa["answers"]

            if qas_id not in temp_examples:
                temp_examples[qas_id] = korquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    answer_text=answer_text,
                    title=title,
                    is_impossible=is_impossible,
                )

            example = SquadExample(
                qas_id=qas_id,
                question_text=question_text,
                context_text=context_text,
                answer_text=answer_text,
                start_position_character=start_position_character,
                title=title,
                is_impossible=is_impossible,
                answers=answers,
            )
            temp_examples[qas_id].add_SquadExample(example)
    return list(temp_examples.values())
Example #5
    def get(self, episode_idx, entry_idx=None):
        """
        Get a specific example from the dataset.
        """
        ex = self.episodes[episode_idx][entry_idx]
        is_training = self.datatype == "train"
        qas_id = str(episode_idx) + "_" + str(entry_idx)
        question_text = ex['text']
        answer_text = ex['labels'][0]
        start_position_character = None
        is_impossible = answer_text == NO_ANSWER_REPLY
        # if not is_impossible:
        answers = ex['labels']
        if not is_training:
            start_position_character = int(ex["answer_starts"].split('|')[0])
        else:
            start_position_character = int(ex["answer_starts"])
        char_start_end = (start_position_character,
                          start_position_character + len(answer_text))
        # else:
        #     answers = [NO_ANSWER_REPLY]
        #     char_start_end = (-1, -1)

        squad_example = SquadExample(
            qas_id=qas_id,
            question_text=question_text,
            context_text=self.episodes[episode_idx][0]['context'],
            answer_text=answer_text,
            start_position_character=start_position_character,
            title=ex['title'],
            is_impossible=is_impossible,
            answers=answers,
        )

        action = {
            'id': 'squad',
            'turn_id': ex['turn_id'],
            'qas_id': qas_id,
            'labels': answers,
            'context': ex['context'],
            'squad_example': squad_example,
            'single_label_text': answer_text,
            'episode_done': ex['episode_done'],
            'is_impossible': is_impossible,
            'followup': ex['followup'],
            'yesno': ex['yesno'],
            'text': question_text,
            'no_answer_reply': NO_ANSWER_REPLY,
            'background': ex['background'],
            'section_title': ex['section_title'],
            'title': ex['title'],
            'character_start_end': char_start_end
        }
        return action
Example #6
    def get(self, episode_idx, entry_idx=None):
        is_training = self.datatype == "train"
        article_idx, paragraph_idx, qa_idx = self.examples[episode_idx]
        article = self.squad[article_idx]
        paragraph = article['paragraphs'][paragraph_idx]
        context_text = paragraph["context"]
        qa = paragraph["qas"][qa_idx]
        qas_id = qa["id"]
        question_text = qa["question"]
        start_position_character = None
        answer_text = None
        answers = []
        is_impossible = qa.get("is_impossible", False)
        if not is_impossible:
            answer = qa["answers"][0]
            answer_text = answer["text"]
            start_position_character = answer["answer_start"]
            answers = [ans["text"] for ans in qa["answers"]]
        else:
            answers = [NO_ANSWER_REPLY]

        squad_example = SquadExample(
            qas_id=qas_id,
            question_text=question_text,
            context_text=context_text,
            answer_text=answer_text,
            start_position_character=start_position_character,
            title="unknown title",
            is_impossible=is_impossible,
            answers=answers,
        )

        action = {
            'id': 'squad',
            'qas_id': qas_id,
            'context': context_text,
            'labels': answers,
            'squad_example': squad_example,
            'single_label_text': answer_text,
            'episode_done': True,
            'is_impossible': is_impossible,
            'no_answer_reply': NO_ANSWER_REPLY
        }
        return action
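NO_ANSWER_REPLY is an imported constant in the two teacher-style examples above; for QuAC-style data it is plausibly the dataset's literal no-answer token, e.g.:

NO_ANSWER_REPLY = "CANNOTANSWER"  # assumption: QuAC's no-answer marker; adjust for your dataset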
Example #7
def _create_examples(input_data, set_type):
    is_training = set_type == "train"
    examples = []
    for entry in tqdm(input_data):
        title = entry["title"]
        for paragraph in entry["paragraphs"]:
            # context_text = paragraph["context"]
            for qa in paragraph["qas"]:
                context_text = qa["context_sent"]
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position_character = None
                answer_text = None
                answers = []

                if "is_impossible" in qa:
                    is_impossible = qa["is_impossible"]
                else:
                    is_impossible = False

                if is_training:
                    if qa["answers"]:
                        answer = qa["answers"][0]
                    else:
                        answer = qa["plausible_answers"][0]
                    answer_text = answer["text"]
                    # str.find returns -1 when the answer text is absent from context_sent
                    start_position_character = context_text.find(answer_text)
                else:
                    answers = qa["answers"]

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    context_text=context_text,
                    answer_text=answer_text,
                    start_position_character=start_position_character,
                    title=title,
                    is_impossible=is_impossible,
                    answers=answers,
                )

                examples.append(example)
    return examples
Example #8
    async def predict(
            self, sources: SourcesContext
    ) -> AsyncIterator[Tuple[Record, Any, float]]:
        if not os.path.isfile(
                os.path.join(self.parent.config.output_dir,
                             "pytorch_model.bin")):
            raise ModelNotTrained("Train model before prediction.")

        self.model = AutoModelForQuestionAnswering.from_pretrained(
            self.parent.config.output_dir)  # , force_download=True)
        self.model.to(self.parent.config.device)
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.parent.config.output_dir,
            do_lower_case=self.parent.config.do_lower_case,
        )
        async for record in sources.records():

            example = SquadExample(
                qas_id=record.key,
                question_text=record.feature("question"),
                context_text=record.feature("context"),
                answer_text=record.feature("answer_text"),
                start_position_character=record.feature("start_pos_char"),
                title=record.feature("title"),
                is_impossible=record.feature("is_impossible"),
                answers=record.feature("answers"),
            )
            features, dataset = squad_convert_examples_to_features(
                examples=[example],
                tokenizer=self.tokenizer,
                max_seq_length=self.parent.config.max_seq_length,
                doc_stride=self.parent.config.doc_stride,
                max_query_length=self.parent.config.max_query_length,
                is_training=False,
                return_dataset="pt",
            )
            prediction = await self._custom_accuracy([example], features,
                                                     dataset)
            record.predicted("Answer", prediction, "Nan")
            yield record
Example #9
    async def _preprocess_data(self, sources: Sources):
        all_examples = []
        all_sources = sources.with_features([
            "question",
            "context",
            "answer_text",
            "start_pos_char",
            "title",
            "is_impossible",
            "answers",
        ])
        async for record in all_sources:
            example = SquadExample(
                qas_id=record.key,
                question_text=record.feature("question"),
                context_text=record.feature("context"),
                answer_text=record.feature("answer_text"),
                start_position_character=record.feature("start_pos_char"),
                title=record.feature("title"),
                is_impossible=record.feature("is_impossible"),
                answers=record.feature("answers"),
            )
            all_examples.append(example)
        return all_examples
Example #10
    def find_answer(self,
                    question,
                    context,
                    n_best_size=20,
                    max_answer_length=30,
                    full_sentence=False):
        # heavily inspired by "https://github.com/huggingface/transformers/blob/v2.3.0/examples/run_squad.py#L212-L317"
        example_id = '55555'
        example = SquadExample(example_id, question, context, None, None, None)

        features, dataset = squad_convert_examples_to_features(
            [example],
            self.tokenizer,
            self.max_seq_length,
            self.doc_stride,
            self.max_query_length,
            False,
            return_dataset='pt')

        sampler = SequentialSampler(dataset)
        dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)

        all_results = []
        for batch in dataloader:
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.model_type in {"xlm", "roberta", "distilbert"}:
                    del inputs["token_type_ids"]

                example_index = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if self.model_type in {"xlnet", "xlm"}:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

                outputs = self.model(**inputs)
                output = [o.detach().cpu().tolist() for o in outputs]

                unique_id = int(features[example_index].unique_id)

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    squad_result = SquadResult(
                        unique_id,
                        start_logits[0],
                        end_logits[0],
                        start_top_index=start_top_index[0],
                        end_top_index=end_top_index[0],
                        cls_logits=cls_logits[0],
                    )

                else:
                    start_logits, end_logits = output
                    squad_result = SquadResult(unique_id, start_logits[0],
                                               end_logits[0])

                all_results.append(squad_result)

        # XLNet and XLM use a more complex post-processing procedure
        if self.model_type in {"xlnet", "xlm"}:
            if hasattr(self.model, "config"):
                start_n_top = self.model.config.start_n_top
                end_n_top = self.model.config.end_n_top
            else:
                start_n_top = self.model.module.config.start_n_top
                end_n_top = self.model.module.config.end_n_top

            predictions = compute_predictions_log_probs(
                [example],
                features,
                all_results,
                n_best_size,
                max_answer_length,
                '/tmp/pred.out',
                '/tmp/nbest.out',
                '/tmp/null.out',
                start_n_top,
                end_n_top,
                self.version_2_with_negative,
                self.tokenizer,
                self.verbose,
            )
        else:
            predictions = compute_predictions_logits(
                [example],
                features,
                all_results,
                n_best_size,
                max_answer_length,
                self.do_lower_case,
                '/tmp/pred.out',
                '/tmp/nbest.out',
                '/tmp/null.out',
                self.verbose,
                self.version_2_with_negative,
                self.null_score_diff_threshold,
            )

        prediction = predictions[example_id]

        logger.debug(f'found prediction: "{prediction}"')

        # empty prediction indicates unknown answer
        if not prediction:
            logger.debug('empty prediction')
            return None

        if full_sentence:
            doc = self.nlp(context)
            for sent in doc.sents:
                if prediction in sent.text:
                    prediction = sent.text
                    break

        return prediction
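A usage sketch for find_answer, assuming the enclosing class (QAReader is a stand-in name) was initialized with the model, tokenizer, and config attributes referenced above:

# Hypothetical invocation; QAReader and its constructor are placeholders.
reader = QAReader(...)  # must provide .model, .tokenizer, .device, .model_type, etc.
answer = reader.find_answer(
    question="Who wrote Hamlet?",
    context="Hamlet is a tragedy written by William Shakespeare.",
)
print(answer)  # e.g. "William Shakespeare", or None when the model predicts no answer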
Example #11
def run_prediction_multi(question_texts, context_texts):
    """
    Modified from run_squad.py to only produce predicted answer given the question and context.
    This  function will produce multiple answers by splitting the context into paragraphs

    Input: 
        1. List of questions
        2. List of Context
    Output: 
        1. Predicted answer
    """
    examples = []

    for i, question_text in enumerate(question_texts):
        for j, context_text in enumerate(context_texts):
            example = SquadExample(
                qas_id=str(i) + str(j),
                question_text=question_text,
                context_text=context_text,
                answer_text=None,
                start_position_character=None,
                title="Predict",
                is_impossible=False,
                answers=None,
            )

            examples.append(example)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)
    all_results = []

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
    return predictions
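run_prediction_multi, like the run_prediction variants further down, leans on module-level state that is not shown. A minimal sketch of those assumed globals, using standard transformers loading code (the checkpoint name is illustrative):

import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

n_best_size = 20
max_answer_length = 30
do_lower_case = True
null_score_diff_threshold = 0.0

def to_list(tensor):
    return tensor.detach().cpu().tolist()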
Example #12
    def answer_question(self, ranked_examples):
        squad_examples = [SquadExample(
            qas_id=str(x['id']),
            question_text=x['question'],
            context_text=x['document'],
            answer_text=None,
            start_position_character=None,
            title='',
            answers=[],
        ) for x in ranked_examples]

        squad_features, squad_dataset = squad_convert_examples_to_features(
            examples=squad_examples,
            tokenizer=self.tokenizer,
            max_seq_length=512,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
            threads=cpu_count(),
        )

        eval_batch_size = self.per_gpu_eval_batch_size * max(1, self.n_gpu)
        eval_sampler = SequentialSampler(squad_dataset)
        eval_dataloader = DataLoader(squad_dataset, sampler=eval_sampler, batch_size=eval_batch_size)

        # multi-gpu evaluate
        if self.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        logger.info("***** Running evaluation of QA *****")
        logger.info("  Num examples = %d", len(squad_dataset))
        logger.info("  Batch size = %d", eval_batch_size)

        all_results = []

        for batch in tqdm(eval_dataloader, desc="Evaluating reader"):
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                if self.model_type in ["xlm", "roberta", "distilbert"]:
                    del inputs["token_type_ids"]

                example_indices = batch[3]

                # XLNet and XLM use more arguments for their predictions
                if self.model_type in ["xlnet", "xlm"]:
                    inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                    # for lang_id-sensitive xlm self.models
                    if hasattr(self.model, "config") and hasattr(self.model.config, "lang2id"):
                        inputs.update(
                            {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * self.lang_id).to(self.device)}
                        )

                outputs = self.model(**inputs)
            for i, example_index in enumerate(example_indices):
                eval_feature = squad_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
                # models only use two.
                if len(output) >= 5:
                    start_logits = output[0]
                    start_top_index = output[1]
                    end_logits = output[2]
                    end_top_index = output[3]
                    cls_logits = output[4]

                    result = SquadResult(
                        unique_id,
                        start_logits,
                        end_logits,
                        start_top_index=start_top_index,
                        end_top_index=end_top_index,
                        cls_logits=cls_logits,
                    )

                else:
                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)

                all_results.append(result)

        # Compute predictions
        output_prediction_file = os.path.join(self.model_name_or_path, "predictions.json")
        output_nbest_file = os.path.join(self.model_name_or_path, "nbest_predictions.json")

        output_null_log_odds_file = os.path.join(self.model_name_or_path, "null_odds.json")

        # XLNet and XLM use a more complex post-processing procedure

        if self.model_type in ["xlnet", "xlm"]:
            start_n_top = self.model.config.start_n_top if hasattr(self.model, "config") else self.model.module.config.start_n_top
            end_n_top = self.model.config.end_n_top if hasattr(self.model, "config") else self.model.module.config.end_n_top

            predictions = compute_predictions_log_probs(
                squad_examples,
                squad_features,
                all_results,
                n_best_size=self.n_best_size,
                max_answer_length=self.max_answer_length,
                output_prediction_file=output_prediction_file,
                output_nbest_file=output_nbest_file,
                output_null_log_odds_file=output_null_log_odds_file,
                start_n_top=start_n_top,
                end_n_top=end_n_top,
                version_2_with_negative=True,
                tokenizer=self.tokenizer,
                verbose_logging=True,
            )
        else:
            predictions = compute_predictions_logits(
                squad_examples,
                squad_features,
                all_results,
                n_best_size=self.n_best_size,
                max_answer_length=self.max_answer_length,
                do_lower_case=True,
                output_prediction_file=output_prediction_file,
                output_nbest_file=output_nbest_file,
                output_null_log_odds_file=output_null_log_odds_file,
                verbose_logging=True,
                version_2_with_negative=True,
                null_score_diff_threshold=0.0,
                tokenizer=self.tokenizer,
            )
        logger.info('predictions: {}'.format(predictions))
        with open(output_nbest_file) as f:
            output_nbest = json.load(f)
        return output_nbest
Example #13
def run_prediction(question_texts, context_text, model_path):
    ### Setting hyperparameters
    max_seq_length = 512
    doc_stride = 256
    n_best_size = 1
    max_query_length = 64
    max_answer_length = 512
    do_lower_case = False
    null_score_diff_threshold = 0.0

    # model_name_or_path = "../cuad-models/roberta-base/"

    def to_list(tensor):
        return tensor.detach().cpu().tolist()

    config_class, model_class, tokenizer_class = (
        AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer)
    config = config_class.from_pretrained(model_path)
    tokenizer = tokenizer_class.from_pretrained(
        model_path, do_lower_case=do_lower_case, use_fast=False)
    model = model_class.from_pretrained(model_path, config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    examples = []

    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        )

        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs.to_tuple()]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=tokenizer
    )

    return final_predictions
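A usage sketch for this CUAD-style run_prediction; the texts are placeholders, and model_path must point at a fine-tuned QA checkpoint directory (the path echoes the comment above):

# Hypothetical invocation.
questions = ["What is the governing law of this agreement?"]
context = "This Agreement shall be governed by the laws of the State of Delaware."
predictions = run_prediction(questions, context, model_path="../cuad-models/roberta-base/")
for qas_id, answer_text in predictions.items():
    print(qas_id, answer_text)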
Example #14
def answergen_albert(question_texts, context_text):
    model_name_or_path = "ktrapeznikov/albert-xlarge-v2-squad-v2"
    output_dir = ""
    # Config
    n_best_size = 1
    max_answer_length = 30
    do_lower_case = True
    null_score_diff_threshold = 0.0

    def to_list(tensor):
        return tensor.detach().cpu().tolist()

    # Setup model
    config_class, model_class, tokenizer_class = (AlbertConfig,
                                                  AlbertForQuestionAnswering,
                                                  AlbertTokenizer)
    config = config_class.from_pretrained(
        "ktrapeznikov/albert-xlarge-v2-squad-v2")
    tokenizer = tokenizer_class.from_pretrained(
        "ktrapeznikov/albert-xlarge-v2-squad-v2", do_lower_case=True)
    model = model_class.from_pretrained(
        "ktrapeznikov/albert-xlarge-v2-squad-v2", config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model.to(device)

    #processor = SquadV2Processor()
    """Setup function to compute predictions"""
    examples = []
    print(question_texts)
    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )

        examples.append(example)
    print(examples)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )

    return predictions
Example #15
def run_prediction(model, question_texts, context_text):
    """Setup function to compute predictions"""

    tokenizer = model.tokenizer
    examples = []

    model = model.model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )

        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    if not os.path.exists("predictions"):
        os.mkdir("predictions")

    output_prediction_file = "predictions/predictions.json"
    output_nbest_file = "predictions/nbest_predictions.json"
    output_null_log_odds_file = "predictions/null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )

    return predictions
Example #16
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample."""
    is_bioasq = True  # for BioASQ

    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            if is_bioasq:
                paragraph_text = paragraph_text.replace('/', ' ')  # str.replace returns a new string, so assign it back
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                answer = None
                is_impossible = False
                if is_training:
                    assert (qa["is_impossible"] == True) != (qa["answers"] == "yes")
                    assert qa["answers"] in ["yes", "no"]
                    # answer = 1 if qa["answers"] == 'yes' else 0
                    is_impossible = qa["is_impossible"]

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    context_text=paragraph_text,
                    answer_text='',
                    start_position_character=None,
                    title='',
                    answers=[],
                    is_impossible=is_impossible,
                )
                examples.append(example)

    # target_cnt = 500
    if is_training:
        pos_cnt = sum(1 for example in examples if not example.is_impossible)
        neg_cnt = sum(1 for example in examples if example.is_impossible)
        target_cnt = min(pos_cnt, neg_cnt)
        print()
        print('Imbalance: {} vs {}'.format(pos_cnt, neg_cnt))
        random.shuffle(examples)

        new_examples = []
        new_pos_cnt = 0
        new_neg_cnt = 0
        for example in examples:
            if not example.is_impossible and new_pos_cnt >= target_cnt:
                continue
            if example.is_impossible and new_neg_cnt >= target_cnt:
                continue
            new_examples.append(example)
            new_pos_cnt += 1 if not example.is_impossible else 0
            new_neg_cnt += 1 if example.is_impossible else 0

        pos_cnt = sum(1 for example in new_examples if not example.is_impossible)
        neg_cnt = sum(1 for example in new_examples if example.is_impossible)
        random.shuffle(new_examples)
        print('Balanced as {} vs {}'.format(pos_cnt, neg_cnt))
        print('Sample: {}'.format(new_examples[0]))
        return new_examples
    else:
        return examples
Example #17
    def infer(self, context_text, question_texts):
        """Compute extractive-QA predictions for each question over context_text."""
        examples = []

        for i, question_text in enumerate(question_texts):
            example = SquadExample(
                qas_id=str(i),
                question_text=question_text,
                context_text=context_text,
                answer_text=None,
                start_position_character=None,
                title="Predict",
                is_impossible=False,
                answers=None,
            )

            examples.append(example)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
            threads=1,
        )

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

        all_results = []

        for batch in eval_dataloader:
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

                example_indices = batch[3]

                outputs = self.model(**inputs)

                for i, example_index in enumerate(example_indices):
                    eval_feature = features[example_index.item()]
                    unique_id = int(eval_feature.unique_id)

                    output = [self._to_list(output[i]) for output in outputs]

                    start_logits, end_logits = output
                    result = SquadResult(unique_id, start_logits, end_logits)
                    all_results.append(result)

        output_prediction_file = "predictions.json"
        output_nbest_file = "nbest_predictions.json"
        output_null_log_odds_file = "null_predictions.json"

        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            self.n_best_size,
            self.max_answer_length,
            self.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            False,  # verbose_logging
            True,  # version_2_with_negative
            self.null_score_diff_threshold,
            self.tokenizer,
        )

        return predictions
def run_prediction(question_texts, context_text):
    """Setup function to compute predictions"""
    examples = []

    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )

        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=4,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                output = [to_list(output[i]) for output in outputs]

                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )

    return predictions


# context = "New Zealand (Māori: Aotearoa) is a sovereign island country in the southwestern Pacific Ocean. It has a total land area of 268,000 square kilometres (103,500 sq mi), and a population of 4.9 million. New Zealand's capital city is Wellington, and its most populous city is Auckland."
# questions = ["How many people live in New Zealand?",
#              "What's the largest city?"]
#
# # Run method
# predictions = run_prediction(questions, context)
#
# # Print results
# for key in predictions.keys():
#     print(predictions[key])