    def _load_and_preprocess_all(self, mode: str):
        self._download_data(mode)

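        # Fast path: if a preprocessed JSON-lines cache exists, load it and
        # skip rebuilding from the raw dataset.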
        if os.path.exists(self._get_preprocessed_fname(mode)):
            episodes_fname = self._get_preprocessed_fname(mode)
            colorlog.info(f"Load preprocessed holle from {episodes_fname}")
            with open(episodes_fname, 'r') as fp:
                episodes = []
                for line in fp:
                    episodes.append(json.loads(line))
            dictionary = tokenization.FullTokenizer(self._vocab_fname)
            return episodes, dictionary

        # Load raw dataset
        raw_fname = os.path.join(self._datapath, f'{mode}_data.json')
        with open(raw_fname, 'r') as fp:
            episodes = json.load(fp)
        if mode != 'test':
            episodes = self._to_wow_format(episodes, mode)
        else:
            multi_fname = os.path.join(self._datapath,
                                       'multi_reference_test.json')
            with open(multi_fname, 'r') as fp:
                multi_responses = json.load(fp)
            episodes = self._to_wow_format_multi(episodes, multi_responses,
                                                 mode)

        dictionary = tokenization.FullTokenizer(self._vocab_fname)

        return self._preprocess_episodes(episodes, dictionary, mode)

Example #2

def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      vocab_file,
                                      train_data_output_path=None,
                                      eval_data_output_path=None,
                                      max_seq_length=128,
                                      do_lower_case=True):
    """Generates and saves training data into a tf record file.

    Arguments:
        processor: Input processor object to be used for generating data.
            Subclass of `DataProcessor`.
        data_dir: Directory that contains train/eval data to process. Data
            files should be named "dev.tsv", "test.tsv", or "train.tsv".
        vocab_file: Text file with words to be used for training/evaluation.
        train_data_output_path: Output path to which the processed tf record
            for training will be saved.
        eval_data_output_path: Output path to which the processed tf record
            for evaluation will be saved.
        max_seq_length: Maximum sequence length of the training/eval data to
            be generated.
        do_lower_case: Whether to lower-case input text.

    Returns:
        A dictionary containing input meta data.
    """
    assert train_data_output_path or eval_data_output_path

    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    # Training output is required by the current implementation; only the
    # eval output is optional.
    assert train_data_output_path
    train_input_data_examples = processor.get_train_examples(data_dir)
    file_based_convert_examples_to_features(train_input_data_examples,
                                            label_list, max_seq_length,
                                            tokenizer, train_data_output_path)
    num_training_data = len(train_input_data_examples)

    if eval_data_output_path:
        eval_input_data_examples = processor.get_dev_examples(data_dir)
        file_based_convert_examples_to_features(eval_input_data_examples,
                                                label_list, max_seq_length,
                                                tokenizer,
                                                eval_data_output_path)

    meta_data = {
        "task_type": "bert_classification",
        "processor_type": processor.get_processor_name(),
        "num_labels": len(processor.get_labels()),
        "train_data_size": num_training_data,
        "max_seq_length": max_seq_length,
    }

    if eval_data_output_path:
        meta_data["eval_data_size"] = len(eval_input_data_examples)

    return meta_data
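
For reference, a minimal usage sketch; `ColaProcessor`, the paths, and the final print are illustrative assumptions, not part of the example above:

# Hypothetical usage of the function above; the processor and paths are
# assumptions for illustration.
processor = ColaProcessor()  # any `DataProcessor` subclass
input_meta_data = generate_tf_record_from_data_file(
    processor,
    data_dir="/tmp/glue/CoLA",  # expected to contain train.tsv and dev.tsv
    vocab_file="/tmp/bert/vocab.txt",
    train_data_output_path="/tmp/glue/CoLA/train.tf_record",
    eval_data_output_path="/tmp/glue/CoLA/eval.tf_record",
    max_seq_length=128)
print(input_meta_data["train_data_size"], input_meta_data["num_labels"])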

Example #3

    def test_full_tokenizer(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing", ","
        ]
        with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
            if six.PY2:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
            else:
                vocab_writer.write("".join(
                    [x + "\n" for x in vocab_tokens]).encode("utf-8"))

            vocab_file = vocab_writer.name

        tokenizer = tokenization.FullTokenizer(vocab_file)
        os.unlink(vocab_file)

        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
        self.assertAllEqual(tokens,
                            ["un", "##want", "##ed", ",", "runn", "##ing"])

        self.assertAllEqual(tokenizer.convert_tokens_to_ids(tokens),
                            [7, 4, 5, 10, 8, 9])

Example #4

def generate_tf_record_from_json_file(input_file_path,
                                      vocab_file_path,
                                      output_path,
                                      max_seq_length=384,
                                      do_lower_case=True,
                                      max_query_length=64,
                                      doc_stride=128,
                                      version_2_with_negative=False):
    """Generates and saves training data into a tf record file."""
    train_examples = read_squad_examples(
        input_file=input_file_path,
        is_training=True,
        version_2_with_negative=version_2_with_negative)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path,
                                           do_lower_case=do_lower_case)
    train_writer = FeatureWriter(filename=output_path, is_training=True)
    number_of_examples = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature)
    train_writer.close()

    meta_data = {
        "task_type": "bert_squad",
        "train_data_size": number_of_examples,
        "max_seq_length": max_seq_length,
        "max_query_length": max_query_length,
        "doc_stride": doc_stride,
        "version_2_with_negative": version_2_with_negative,
    }

    return meta_data
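
A typical invocation writes the record file and then persists the returned meta data for the training script to read back; the paths below are illustrative assumptions:

import json

# Hypothetical paths; train-v1.1.json is the standard SQuAD 1.1 training file.
input_meta_data = generate_tf_record_from_json_file(
    input_file_path="/tmp/squad/train-v1.1.json",
    vocab_file_path="/tmp/bert/vocab.txt",
    output_path="/tmp/squad/train.tf_record",
    max_seq_length=384)
with open("/tmp/squad/train_meta_data", "w") as writer:
    writer.write(json.dumps(input_meta_data, indent=4) + "\n")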

Example #5

def predict_squad(strategy, input_meta_data):
  """Makes predictions for a squad dataset."""
  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  doc_stride = input_meta_data['doc_stride']
  max_query_length = input_meta_data['max_query_length']
  # Whether data should be in Ver 2.0 format.
  version_2_with_negative = input_meta_data.get('version_2_with_negative',
                                                False)
  eval_examples = squad_lib.read_squad_examples(
      input_file=FLAGS.predict_file,
      is_training=False,
      version_2_with_negative=version_2_with_negative)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  eval_writer = squad_lib.FeatureWriter(
      filename=os.path.join(FLAGS.model_dir, 'eval.tf_record'),
      is_training=False)
  eval_features = []

  def _append_feature(feature, is_padding):
    if not is_padding:
      eval_features.append(feature)
    eval_writer.process_feature(feature)

  # TPU requires a fixed batch size for all batches, therefore the number
  # of examples must be a multiple of the batch size, or else examples
  # will get dropped. So we pad with fake examples which are ignored
  # later on.
  dataset_size = squad_lib.convert_examples_to_features(
      examples=eval_examples,
      tokenizer=tokenizer,
      max_seq_length=input_meta_data['max_seq_length'],
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=False,
      output_fn=_append_feature,
      batch_size=FLAGS.predict_batch_size)
  eval_writer.close()

  logging.info('***** Running predictions *****')
  logging.info('  Num orig examples = %d', len(eval_examples))
  logging.info('  Num split examples = %d', len(eval_features))
  logging.info('  Batch size = %d', FLAGS.predict_batch_size)

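  # convert_examples_to_features padded the inputs to a multiple of
  # predict_batch_size above, so this division is exact and no real examples
  # are dropped.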
  num_steps = int(dataset_size / FLAGS.predict_batch_size)
  all_results = predict_squad_customized(strategy, input_meta_data, bert_config,
                                         eval_writer.filename, num_steps)

  output_prediction_file = os.path.join(FLAGS.model_dir, 'predictions.json')
  output_nbest_file = os.path.join(FLAGS.model_dir, 'nbest_predictions.json')
  output_null_log_odds_file = os.path.join(FLAGS.model_dir, 'null_odds.json')

  squad_lib.write_predictions(
      eval_examples,
      eval_features,
      all_results,
      FLAGS.n_best_size,
      FLAGS.max_answer_length,
      FLAGS.do_lower_case,
      output_prediction_file,
      output_nbest_file,
      output_null_log_odds_file,
      verbose=FLAGS.verbose_logging)

Example #6

    def _load_and_preprocess_all(self, mode: str):
        """
        By default, it returns the following action dict:
        {
            'id': 'wizard_of_wikipedia'
            'text': chosen_topic\n # if first example in episode
                    last_apprentice_message\n # if possible
                    wizard_message # if --label-type is 'chosen_sent'
            'knowledge': title_1 sentence_1\n
                                .
                                .
                                .
                         title_m sentence_n # all knowledge available to wizard
            'labels': [title_checked sentence_checked] # default
                                    OR
                      [wizard_response] # if --label-type set to 'response'
            'label_candidates': knowledge + [no_passages_used no_passages_used]
                                           OR
                                100 response candidates  # if 'validation' or 'test'
            'chosen_topic': chosen_topic as untokenized string
            'checked_sentence': checked sentence if wizard, else None # if --include_checked_sentence
            'title': title of checked sentence # if --include_checked_sentence
            --> if these do not exist, then checked_sentence = title = 'no_passages_used'
            'episode_done': (Boolean) whether episode is done or not
        }
        """
        if os.path.exists(self._get_preprocessed_fname(mode)):
            episodes_fname = self._get_preprocessed_fname(mode)
            colorlog.info(
                f"Load cached wizard of wikipedia from {episodes_fname}")
            with open(episodes_fname, 'r') as fp:
                episodes = []
                for line in fp:
                    episodes.append(json.loads(line))
            dictionary = tokenization.FullTokenizer(self._vocab_fname)
            return episodes, dictionary

        parlai_opt = self._get_parlai_opt([
            '--task',
            'wizard_of_wikipedia:generator:topic_split' if 'unseen' in mode
            else 'wizard_of_wikipedia:generator:random_split',
            '--datatype',
            '{}:stream'.format(mode.split('_')[0]) if 'unseen' in mode else
            f'{mode}:stream',  # 'train' for shuffled data and 'train:stream' for unshuffled data
            '--datapath',
            self._cache_dir,
            # dict_XXX will not be used if we use bert tokenizer
            '--dict_lower',
            'True',
            '--dict_tokenizer',
            'bpe',
            '--dict_file',
            f"{self._cache_dir}/wow.dict",
            '--dict_textfields',
            "text,labels,chosen_topic,checked_sentence,knowledge,title",  # For retrieval mode, use "text,labels"
            # Following the author's code. For retrieval mode, use 250004.
            # Note that this is the size of the bpehelper dictionary,
            # so the final dictionary can be larger than this one.
            # Also, don't convert special tokens to indices with the txt2vec
            # method; you must use tok2ind.
            '--dict_maxtokens',
            '30000',
            '--dict_nulltoken',
            data_vocab._PARLAI_PAD,
            '--dict_starttoken',
            data_vocab._PARLAI_GO,
            '--dict_endtoken',
            data_vocab._PARLAI_EOS,
            '--dict_unktoken',
            data_vocab._PARLAI_UNK,
            '--include_knowledge_separator',
            'True',  # include special __knowledge__ token between title and passage
            '--include_checked_sentence',
            'True',
            '--label_type',
            'response',  # choices = ['response', 'chosen_sent']
        ])
        # By default, the world uses "WizardDialogKnowledgeTeacher"
        agent = DictionaryAgent(parlai_opt)
        world = create_task(parlai_opt, agent)
        num_examples = world.num_examples()
        num_episodes = world.num_episodes()

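        # Replay every episode through the ParlAI world: each parley() advances
        # one turn, and world.acts[0] is the teacher's action dict described in
        # the docstring above.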
        episodes = []
        for _ in range(num_episodes):
            examples = []
            while True:
                world.parley()
                example = world.acts[0]
                examples.append(example)
                if world.episode_done():
                    episodes.append(examples)
                    break

        dictionary = tokenization.FullTokenizer(self._vocab_fname)

        return self._preprocess_episodes(episodes, dictionary, mode)
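
To tie the docstring back to the return value, a hypothetical inspection of the preprocessed data; the `loader` instance and the episode nesting are assumptions, and only the field names come from the docstring above:

# 'loader' stands in for an instance of the dataset class these methods
# belong to; episodes are assumed to be lists of per-turn action dicts.
episodes, dictionary = loader._load_and_preprocess_all('train')
first_turn = episodes[0][0]
print(first_turn['chosen_topic'], first_turn['episode_done'])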