# NOTE: the functions below are excerpted from larger modules; the imports
# here cover the standard-library and logging names they use. `tokenization`
# refers to the BERT tokenization module (e.g. `from official.nlp.bert import
# tokenization` in the TF Model Garden).
import json
import os

import colorlog


def _load_and_preprocess_all(self, mode: str):
    self._download_data(mode)

    if os.path.exists(self._get_preprocessed_fname(mode)):
        # Load the cached preprocessed episodes (one JSON object per line).
        episodes_fname = self._get_preprocessed_fname(mode)
        colorlog.info(f"Load preprocessed holle from {episodes_fname}")
        with open(episodes_fname, 'r') as fp:
            episodes = []
            for line in fp:
                episodes.append(json.loads(line))
        dictionary = tokenization.FullTokenizer(self._vocab_fname)
        return episodes, dictionary

    # Load raw dataset
    raw_fname = os.path.join(self._datapath, f'{mode}_data.json')
    with open(raw_fname, 'r') as fp:
        episodes = json.load(fp)

    if mode != 'test':
        episodes = self._to_wow_format(episodes, mode)
    else:
        # The test split carries multiple reference responses.
        multi_fname = os.path.join(self._datapath, 'multi_reference_test.json')
        with open(multi_fname, 'r') as fp:
            multi_responses = json.load(fp)
        episodes = self._to_wow_format_multi(episodes, multi_responses, mode)

    dictionary = tokenization.FullTokenizer(self._vocab_fname)
    return self._preprocess_episodes(episodes, dictionary, mode)
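# A minimal sketch of the writer side of the cache read above, inferred from
# the reader: one JSON-serialized episode per line (JSON Lines). This helper
# is an assumption for illustration, not code from the repository.
def _dump_preprocessed(episodes, episodes_fname):
    with open(episodes_fname, 'w') as fp:
        for episode in episodes:
            fp.write(json.dumps(episode) + '\n')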
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      vocab_file,
                                      train_data_output_path=None,
                                      eval_data_output_path=None,
                                      max_seq_length=128,
                                      do_lower_case=True):
    """Generates and saves training data into a tf record file.

    Arguments:
        processor: Input processor object to be used for generating data.
            Subclass of `DataProcessor`.
        data_dir: Directory that contains train/eval data to process. Data
            files should be named "dev.tsv", "test.tsv", or "train.tsv".
        vocab_file: Text file with words to be used for training/evaluation.
        train_data_output_path: Output to which the processed tf record for
            training will be saved.
        eval_data_output_path: Output to which the processed tf record for
            evaluation will be saved.
        max_seq_length: Maximum sequence length of the training/eval data to
            be generated.
        do_lower_case: Whether to lower case input text.

    Returns:
        A dictionary containing input meta data.
    """
    assert train_data_output_path or eval_data_output_path

    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    # Only convert training data if a training output path was given, so an
    # eval-only invocation (allowed by the assert above) also works.
    num_training_data = 0
    if train_data_output_path:
        train_input_data_examples = processor.get_train_examples(data_dir)
        file_based_convert_examples_to_features(
            train_input_data_examples, label_list, max_seq_length, tokenizer,
            train_data_output_path)
        num_training_data = len(train_input_data_examples)

    if eval_data_output_path:
        eval_input_data_examples = processor.get_dev_examples(data_dir)
        file_based_convert_examples_to_features(
            eval_input_data_examples, label_list, max_seq_length, tokenizer,
            eval_data_output_path)

    meta_data = {
        "task_type": "bert_classification",
        "processor_type": processor.get_processor_name(),
        "num_labels": len(processor.get_labels()),
        "train_data_size": num_training_data,
        "max_seq_length": max_seq_length,
    }
    if eval_data_output_path:
        meta_data["eval_data_size"] = len(eval_input_data_examples)
    return meta_data
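# Hypothetical usage sketch for generate_tf_record_from_data_file; the
# processor class and every path below are illustrative placeholders, not
# values from this codebase:
#
#   meta_data = generate_tf_record_from_data_file(
#       processor=MyGlueProcessor(),  # some DataProcessor subclass
#       data_dir='/tmp/glue/MRPC',    # contains train.tsv / dev.tsv
#       vocab_file='/tmp/bert/vocab.txt',
#       train_data_output_path='/tmp/mrpc_train.tf_record',
#       eval_data_output_path='/tmp/mrpc_eval.tf_record',
#       max_seq_length=128)
#
# meta_data['train_data_size'] and meta_data['eval_data_size'] can then be
# used to derive steps per epoch for training and evaluation.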
def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        if six.PY2:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        else:
            vocab_writer.write(
                "".join([x + "\n" for x in vocab_tokens]).encode("utf-8"))
        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
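# The expected ids above follow from FullTokenizer's convention of assigning
# each vocab entry the 0-based index of its line in the vocab file. A
# standalone check of that arithmetic (reusing the test's vocab list):
_vocab = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
          "runn", "##ing", ","]
_token_to_id = {token: idx for idx, token in enumerate(_vocab)}
assert ([_token_to_id[t] for t in ["un", "##want", "##ed", ",", "runn", "##ing"]]
        == [7, 4, 5, 10, 8, 9])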
def generate_tf_record_from_json_file(input_file_path,
                                      vocab_file_path,
                                      output_path,
                                      max_seq_length=384,
                                      do_lower_case=True,
                                      max_query_length=64,
                                      doc_stride=128,
                                      version_2_with_negative=False):
    """Generates and saves training data into a tf record file."""
    train_examples = read_squad_examples(
        input_file=input_file_path,
        is_training=True,
        version_2_with_negative=version_2_with_negative)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file_path, do_lower_case=do_lower_case)
    train_writer = FeatureWriter(filename=output_path, is_training=True)
    number_of_examples = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature)
    train_writer.close()

    meta_data = {
        "task_type": "bert_squad",
        "train_data_size": number_of_examples,
        "max_seq_length": max_seq_length,
        "max_query_length": max_query_length,
        "doc_stride": doc_stride,
        "version_2_with_negative": version_2_with_negative,
    }
    return meta_data
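# Hypothetical usage sketch; all paths are placeholders. The returned
# meta_data dict is the input meta data the SQuAD training driver consumes:
#
#   meta_data = generate_tf_record_from_json_file(
#       input_file_path='/tmp/squad/train-v1.1.json',
#       vocab_file_path='/tmp/bert/vocab.txt',
#       output_path='/tmp/squad/train.tf_record',
#       version_2_with_negative=False)  # set True for SQuAD 2.0 json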
def predict_squad(strategy, input_meta_data):
    """Makes predictions for a squad dataset."""
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    doc_stride = input_meta_data['doc_stride']
    max_query_length = input_meta_data['max_query_length']
    # Whether data should be in Ver 2.0 format.
    version_2_with_negative = input_meta_data.get('version_2_with_negative',
                                                  False)
    eval_examples = squad_lib.read_squad_examples(
        input_file=FLAGS.predict_file,
        is_training=False,
        version_2_with_negative=version_2_with_negative)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    eval_writer = squad_lib.FeatureWriter(
        filename=os.path.join(FLAGS.model_dir, 'eval.tf_record'),
        is_training=False)
    eval_features = []

    def _append_feature(feature, is_padding):
        if not is_padding:
            eval_features.append(feature)
        eval_writer.process_feature(feature)

    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on.
    dataset_size = squad_lib.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=input_meta_data['max_seq_length'],
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        output_fn=_append_feature,
        batch_size=FLAGS.predict_batch_size)
    eval_writer.close()

    logging.info('***** Running predictions *****')
    logging.info('  Num orig examples = %d', len(eval_examples))
    logging.info('  Num split examples = %d', len(eval_features))
    logging.info('  Batch size = %d', FLAGS.predict_batch_size)

    num_steps = int(dataset_size / FLAGS.predict_batch_size)
    all_results = predict_squad_customized(
        strategy, input_meta_data, bert_config, eval_writer.filename, num_steps)

    output_prediction_file = os.path.join(FLAGS.model_dir, 'predictions.json')
    output_nbest_file = os.path.join(FLAGS.model_dir, 'nbest_predictions.json')
    output_null_log_odds_file = os.path.join(FLAGS.model_dir, 'null_odds.json')

    squad_lib.write_predictions(
        eval_examples,
        eval_features,
        all_results,
        FLAGS.n_best_size,
        FLAGS.max_answer_length,
        FLAGS.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        verbose=FLAGS.verbose_logging)
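# A small sketch of the padding arithmetic behind the fixed-batch-size comment
# in predict_squad: the feature count is padded up to a multiple of the batch
# size so that num_steps divides evenly. The helper name is illustrative, not
# part of squad_lib.
def _padded_dataset_size(num_features, batch_size):
    remainder = num_features % batch_size
    return num_features + (batch_size - remainder if remainder else 0)

assert _padded_dataset_size(103, 8) == 104  # 13 full batches; 1 fake example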
def _load_and_preprocess_all(self, mode: str):
    """
    By default, it returns the following action dict:
    {
        'id': 'wizard_of_wikipedia'
        'text': chosen_topic\n             # if first example in episode
                last_apprentice_message\n  # if possible
                wizard_message             # if --label-type is 'chosen_sent'
        'knowledge': title_1 sentence_1\n
                     ...
                     title_m sentence_n    # all knowledge available to wizard
        'labels': [title_checked sentence_checked]  # default
                  OR
                  [wizard_response]  # if --label-type set to 'response'
        'label_candidates': knowledge + [no_passages_used no_passages_used]
                            OR
                            100 response candidates  # if 'validation' or 'test'
        'chosen_topic': chosen_topic as untokenized string
        'checked_sentence': checked sentence if wizard, else None  # if --include_checked_sentence
        'title': title of checked sentence  # if --include_checked_sentence
            --> if not exists, then checked_sentence = title = 'no_passages_used'
        'episode_done': (Boolean) whether episode is done or not
    }
    """
    if os.path.exists(self._get_preprocessed_fname(mode)):
        episodes_fname = self._get_preprocessed_fname(mode)
        colorlog.info(f"Load cached wizard of wikipedia from {episodes_fname}")
        with open(episodes_fname, 'r') as fp:
            episodes = []
            for line in fp:
                episodes.append(json.loads(line))
        dictionary = tokenization.FullTokenizer(self._vocab_fname)
        return episodes, dictionary

    parlai_opt = self._get_parlai_opt([
        '--task',
        'wizard_of_wikipedia:generator:topic_split' if 'unseen' in mode
        else 'wizard_of_wikipedia:generator:random_split',
        # 'train' for shuffled data and 'train:stream' for unshuffled data
        '--datatype',
        '{}:stream'.format(mode.split('_')[0]) if 'unseen' in mode
        else f'{mode}:stream',
        '--datapath', self._cache_dir,
        # dict_XXX will not be used if we use the bert tokenizer
        '--dict_lower', 'True',
        '--dict_tokenizer', 'bpe',
        '--dict_file', f"{self._cache_dir}/wow.dict",
        # For retrieval mode, use "text,labels"
        '--dict_textfields',
        "text,labels,chosen_topic,checked_sentence,knowledge,title",
        # Following the author's code; for retrieval mode, use 250004.
        # Note that this is the size of the bpehelper dictionary, so the
        # final dictionary can be larger than this. Also, special tokens
        # cannot be converted to indices with the txt2vec method; tok2ind
        # must be used instead.
        '--dict_maxtokens', '30000',
        '--dict_nulltoken', data_vocab._PARLAI_PAD,
        '--dict_starttoken', data_vocab._PARLAI_GO,
        '--dict_endtoken', data_vocab._PARLAI_EOS,
        '--dict_unktoken', data_vocab._PARLAI_UNK,
        # Include the special __knowledge__ token between title and passage
        '--include_knowledge_separator', 'True',
        '--include_checked_sentence', 'True',
        '--label_type', 'response',  # choices = ['response', 'chosen_sent']
    ])
    # By default, the world uses "WizardDialogKnowledgeTeacher"
    agent = DictionaryAgent(parlai_opt)
    world = create_task(parlai_opt, agent)
    num_examples = world.num_examples()
    num_episodes = world.num_episodes()

    episodes = []
    for _ in range(num_episodes):
        examples = []
        while True:
            world.parley()
            example = world.acts[0]
            examples.append(example)
            if world.episode_done():
                episodes.append(examples)
                break

    dictionary = tokenization.FullTokenizer(self._vocab_fname)
    return self._preprocess_episodes(episodes, dictionary, mode)
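# A hedged illustration of the action dict documented in the docstring above
# (with --label_type 'response'); all values are made up for illustration,
# not taken from the actual dataset:
_example_action = {
    'id': 'wizard_of_wikipedia',
    'text': 'Science fiction\nI love science fiction movies!',
    'knowledge': 'Science fiction __knowledge__ Science fiction is a genre '
                 'of speculative fiction ...\n',
    'labels': ['Me too! It is one of my favorite genres.'],
    'chosen_topic': 'Science fiction',
    'checked_sentence': 'Science fiction is a genre of speculative fiction ...',
    'title': 'Science fiction',
    'episode_done': False,
}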