def make_dataset(tf_record_files: str, num_treatments: int, is_training: bool,
                 is_eval=False, missing_outcomes=False, do_masking=False,
                 input_pipeline_context=None):
    df_file = FLAGS.label_df_file
    dataset = load_basic_bert_data(tf_record_files, FLAGS.max_seq_length,
                                   is_training=is_training,
                                   input_pipeline_context=input_pipeline_context)

    label_df = pd.read_feather(df_file)
    dataset = dataset_labels_from_pandas(dataset, label_df)

    # todo: hardcoded for demo, but not the smartest way to do this
    def _standardize_label_naming(f, l):
        l['outcome'] = l.pop('accepted')
        l['treatment'] = l.pop('year')
        if missing_outcomes:
            l['outcome_observed'] = tf.not_equal(l['outcome'], -1)
            # placeholder so that passed in labels are non-negative
            l['outcome'] = tf.where(l['outcome_observed'], l['outcome'],
                                    tf.zeros_like(l['outcome']))
        return f, l

    dataset = dataset.map(_standardize_label_naming,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = make_test_train_splits(dataset, num_splits=FLAGS.num_splits,
                                     dev_splits=FLAGS.dev_splits,
                                     test_splits=FLAGS.test_splits)

    if do_masking:
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        dataset = add_masking(dataset, tokenizer=tokenizer)

    if is_training:
        dataset = filter_training(dataset, is_training=not is_eval)

        # batching needs to happen before sample weights are created
        dataset = dataset.shuffle(25000)
        dataset = dataset.batch(FLAGS.train_batch_size, drop_remainder=True)

        # create sample weights and label outputs in the manner expected by keras
        hydra_keras_format = make_hydra_keras_format(
            num_treatments, missing_outcomes=missing_outcomes)
        dataset = dataset.map(hydra_keras_format,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset
    else:
        return dataset.batch(FLAGS.eval_batch_size)
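# Hedged usage sketch, not from the original repo: it assumes the absl FLAGS read
# above (label_df_file, max_seq_length, num_splits, dev_splits, test_splits,
# vocab_file, do_lower_case, train_batch_size, eval_batch_size) have already been
# parsed, and that the tf_record path below (taken from the debug script further
# down) exists. `num_treatments=2` is an illustrative value.
def _example_make_dataset_usage():
    # training pipeline: masking on, missing outcomes handled, shuffled and batched
    train_data = make_dataset(
        tf_record_files='../dat/PeerRead/proc/arxiv-all.tf_record',
        num_treatments=2,
        is_training=True,
        missing_outcomes=True,
        do_masking=True)

    # evaluation pipeline: no shuffling or filtering, batched with eval_batch_size
    eval_data = make_dataset(
        tf_record_files='../dat/PeerRead/proc/arxiv-all.tf_record',
        num_treatments=2,
        is_training=False)

    print(train_data.element_spec)
    sample = next(iter(eval_data))
    return sample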
def main(_):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.io.gfile.glob(input_pattern))

    logging.info("*** Reading from input files ***")
    for input_file in input_files:
        logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq, rng)

    output_files = FLAGS.output_file.split(",")
    logging.info("*** Writing to output files ***")
    for output_file in output_files:
        logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq, output_files)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--shuffle_buffer_size', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--max_abs_len', type=int, default=128)
    args = parser.parse_args()

    # for easy debugging
    filename = '../dat/reddit/proc.tf_record'

    # bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    #                             trainable=True)
    # vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    vocab_file = '../pre-trained/uncased_L-12_H-768_A-12/vocab.txt'
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

    num_splits = 10
    # dev_splits = [0]
    # test_splits = [0]
    dev_splits = []
    test_splits = [1]

    labeler = make_real_labeler('gender', 'log_score')

    input_dataset_from_filenames = make_input_fn_from_file(
        filename,
        args.max_abs_len,
        num_splits,
        dev_splits,
        test_splits,
        tokenizer,
        do_masking=True,
        is_training=True,
        filter_test=False,
        filter_train=True,
        shuffle_buffer_size=1000,
        labeler=labeler,
        seed=0,
        subreddits=[13])
    params = {'batch_size': 64}
    dataset = input_dataset_from_filenames(params)
    # dataset = filter_training(dataset)

    sample = next(iter(dataset))
    print(sample)

    print("start")
    for val in dataset.take(100):
        print("hit")
        sample = val
    print("end")

    sample = next(iter(dataset))
    print(sample)
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      vocab_file,
                                      train_data_output_path=None,
                                      eval_data_output_path=None,
                                      max_seq_length=128,
                                      do_lower_case=True):
    """Generates and saves training data into a tf record file.

    Arguments:
        processor: Input processor object to be used for generating data. Subclass
            of `DataProcessor`.
        data_dir: Directory that contains train/eval data to process. Data files
            should be named "dev.tsv", "test.tsv", or "train.tsv".
        vocab_file: Text file with words to be used for training/evaluation.
        train_data_output_path: Output path to which the processed tf record for
            training will be saved.
        eval_data_output_path: Output path to which the processed tf record for
            evaluation will be saved.
        max_seq_length: Maximum sequence length of the training/eval data to be
            generated.
        do_lower_case: Whether to lower case input text.

    Returns:
        A dictionary containing input meta data.
    """
    assert train_data_output_path or eval_data_output_path

    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    assert train_data_output_path
    train_input_data_examples = processor.get_train_examples(data_dir)
    file_based_convert_examples_to_features(train_input_data_examples, label_list,
                                            max_seq_length, tokenizer,
                                            train_data_output_path)
    num_training_data = len(train_input_data_examples)

    if eval_data_output_path:
        eval_input_data_examples = processor.get_dev_examples(data_dir)
        file_based_convert_examples_to_features(eval_input_data_examples,
                                                label_list, max_seq_length,
                                                tokenizer, eval_data_output_path)

    meta_data = {
        "task_type": "bert_classification",
        "processor_type": processor.get_processor_name(),
        "num_labels": len(processor.get_labels()),
        "train_data_size": num_training_data,
        "max_seq_length": max_seq_length,
    }

    if eval_data_output_path:
        meta_data["eval_data_size"] = len(eval_input_data_examples)

    return meta_data
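# Hedged usage sketch with assumed names: `MyProcessor` is a hypothetical
# `DataProcessor` subclass (implementing get_labels, get_train_examples,
# get_dev_examples, get_processor_name) and the data paths are placeholders;
# only the vocab path is reused from the debug scripts in this file.
def _example_generate_classification_records():
    processor = MyProcessor()  # hypothetical DataProcessor subclass, defined elsewhere
    meta_data = generate_tf_record_from_data_file(
        processor=processor,
        data_dir='../dat/my_task',  # hypothetical dir containing train.tsv / dev.tsv
        vocab_file='../pre-trained/uncased_L-12_H-768_A-12/vocab.txt',
        train_data_output_path='../dat/my_task/train.tf_record',
        eval_data_output_path='../dat/my_task/eval.tf_record',
        max_seq_length=128,
        do_lower_case=True)
    print(meta_data['train_data_size'], meta_data.get('eval_data_size'))
    return meta_data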
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--shuffle_buffer_size', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--max_abs_len', type=int, default=250)
    args = parser.parse_args()

    # for easy debugging
    # tsv_file = "../../dat/PeerRead/proc/acl_2017.tf_record"
    # tsv_file = glob.glob('/home/victor/Documents/causal-spe-embeddings/dat/PeerRead/proc/*.tf_record')
    filename = '../dat/PeerRead/proc/arxiv-all.tf_record'

    # bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    #                             trainable=True)
    # vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    vocab_file = '../pre-trained/uncased_L-12_H-768_A-12/vocab.txt'
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

    num_splits = 10
    # dev_splits = [0]
    # test_splits = [0]
    dev_splits = []
    test_splits = [1, 2]

    # labeler = make_buzzy_based_simulated_labeler(0.5, 5.0, 0.0, 'simple', seed=0)
    labeler = make_real_labeler('venue', 'accepted')

    input_dataset_from_filenames = make_dataset_fn_from_file(
        filename,
        250,
        num_splits,
        dev_splits,
        test_splits,
        tokenizer,
        do_masking=False,
        is_training=True,
        filter_test=False,
        shuffle_buffer_size=25000,
        labeler=labeler,
        seed=0)
    params = {'batch_size': 10000}
    dataset = input_dataset_from_filenames(params)

    print(dataset.element_spec)

    for val in dataset.take(1):
        sample = val

    sample = next(iter(dataset))
    print(tf.unique(sample[1]['treatment']))
def make_dataset(is_training: bool, do_masking=False):
    if FLAGS.simulated == 'real':
        labeler = make_real_labeler(FLAGS.treatment, 'log_score')
    elif FLAGS.simulated == 'attribute':
        labeler = make_subreddit_based_simulated_labeler(FLAGS.beta0, FLAGS.beta1,
                                                         FLAGS.gamma,
                                                         FLAGS.simulation_mode,
                                                         seed=0)
    else:
        raise Exception("simulated flag not recognized")

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    dev_splits = [int(s) for s in FLAGS.dev_splits.split()]
    test_splits = [int(s) for s in FLAGS.test_splits.split()]

    if FLAGS.subreddits == '':
        subreddits = None
    else:
        subreddits = [int(s) for s in FLAGS.subreddits.split(',')]

    train_input_fn = make_input_fn_from_file(
        input_files_or_glob=FLAGS.input_files,
        seq_length=FLAGS.max_seq_length,
        num_splits=FLAGS.num_splits,
        dev_splits=dev_splits,
        test_splits=test_splits,
        tokenizer=tokenizer,
        do_masking=do_masking,
        subreddits=subreddits,
        is_training=is_training,
        shuffle_buffer_size=25000,  # note: bert hardcoded this, and I'm following suit
        seed=FLAGS.seed,
        labeler=labeler,
        filter_train=is_training)

    batch_size = FLAGS.train_batch_size if is_training else FLAGS.eval_batch_size
    dataset = train_input_fn(params={'batch_size': batch_size})

    # put training data into the format expected by Keras
    if is_training:
        # dataset = filter_training(dataset)
        dataset = dataset.map(_keras_format,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.prefetch(4)

    return dataset
def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        if six.PY2:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        else:
            vocab_writer.write("".join(
                [x + "\n" for x in vocab_tokens]).encode("utf-8"))

        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(tokenizer.convert_tokens_to_ids(tokens),
                        [7, 4, 5, 10, 8, 9])
def generate_tf_record_from_json_file(input_file_path,
                                      vocab_file_path,
                                      output_path,
                                      max_seq_length=384,
                                      do_lower_case=True,
                                      max_query_length=64,
                                      doc_stride=128,
                                      version_2_with_negative=False):
    """Generates and saves training data into a tf record file."""
    train_examples = read_squad_examples(
        input_file=input_file_path,
        is_training=True,
        version_2_with_negative=version_2_with_negative)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file_path, do_lower_case=do_lower_case)
    train_writer = FeatureWriter(filename=output_path, is_training=True)
    number_of_examples = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature)
    train_writer.close()

    meta_data = {
        "task_type": "bert_squad",
        "train_data_size": number_of_examples,
        "max_seq_length": max_seq_length,
        "max_query_length": max_query_length,
        "doc_stride": doc_stride,
        "version_2_with_negative": version_2_with_negative,
    }
    return meta_data
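# Hedged usage sketch with placeholder paths (not from the original code):
# converts a SQuAD-format JSON file into a tf record plus the meta data dict
# expected downstream. Only the vocab path is reused from elsewhere in this file.
def _example_generate_squad_records():
    meta_data = generate_tf_record_from_json_file(
        input_file_path='../dat/squad/train-v1.1.json',  # hypothetical location
        vocab_file_path='../pre-trained/uncased_L-12_H-768_A-12/vocab.txt',
        output_path='../dat/squad/train.tf_record',      # hypothetical location
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        version_2_with_negative=False)
    print(meta_data['train_data_size'])
    return meta_data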
def make_dataset(is_training: bool, do_masking=False):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    dev_splits = [int(s) for s in FLAGS.dev_splits.split()]
    test_splits = [int(s) for s in FLAGS.test_splits.split()]

    train_input_fn = make_input_fn_from_file(
        input_files_or_glob=FLAGS.input_files,
        seq_length=FLAGS.max_seq_length,
        num_splits=FLAGS.num_splits,
        dev_splits=dev_splits,
        test_splits=test_splits,
        tokenizer=tokenizer,
        do_masking=do_masking,
        is_training=is_training,
        shuffle_buffer_size=25000,  # note: bert hardcoded this, and I'm following suit
        seed=FLAGS.seed)

    batch_size = FLAGS.train_batch_size if is_training else FLAGS.eval_batch_size
    dataset = train_input_fn(params={'batch_size': batch_size})
    dataset = dataset.map(_make_length_labels)

    return dataset
def make_dataset(is_training: bool, do_masking=False):
    if FLAGS.simulated == 'real':
        labeler = make_real_labeler(FLAGS.treatment, 'accepted')
    elif FLAGS.simulated == 'attribute':
        labeler = make_buzzy_based_simulated_labeler(FLAGS.beta0, FLAGS.beta1,
                                                     FLAGS.gamma,
                                                     FLAGS.simulation_mode,
                                                     seed=0)
    elif FLAGS.simulated == 'propensity':
        model_predictions = pd.read_csv(FLAGS.base_propensities_path, sep='\t')
        base_propensity_scores = model_predictions['g']
        example_indices = model_predictions['index']

        labeler = make_propensity_based_simulated_labeler(
            treat_strength=FLAGS.beta0,
            con_strength=FLAGS.beta1,
            noise_level=FLAGS.gamma,
            base_propensity_scores=base_propensity_scores,
            example_indices=example_indices,
            exogeneous_con=FLAGS.exogenous_confounding,
            setting=FLAGS.simulation_mode,
            seed=0)
    else:
        raise Exception("simulated flag not recognized")

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    dev_splits = [int(s) for s in FLAGS.dev_splits.split()]
    test_splits = [int(s) for s in FLAGS.test_splits.split()]

    train_input_fn = make_dataset_fn_from_file(
        input_files_or_glob=FLAGS.input_files,
        seq_length=FLAGS.max_seq_length,
        num_splits=FLAGS.num_splits,
        dev_splits=dev_splits,
        test_splits=test_splits,
        tokenizer=tokenizer,
        do_masking=do_masking,
        is_training=is_training,
        shuffle_buffer_size=25000,  # note: bert hardcoded this, and I'm following suit
        seed=FLAGS.seed,
        labeler=labeler)

    batch_size = FLAGS.train_batch_size if is_training else FLAGS.eval_batch_size
    dataset = train_input_fn(params={'batch_size': batch_size})

    # put training data into the format expected by Keras
    if is_training:
        dataset = filter_training(dataset)
        dataset = dataset.map(_keras_format,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
def predict_squad(strategy, input_meta_data):
    """Makes predictions for a squad dataset."""
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    doc_stride = input_meta_data['doc_stride']
    max_query_length = input_meta_data['max_query_length']
    # Whether data should be in Ver 2.0 format.
    version_2_with_negative = input_meta_data.get('version_2_with_negative', False)
    eval_examples = squad_lib.read_squad_examples(
        input_file=FLAGS.predict_file,
        is_training=False,
        version_2_with_negative=version_2_with_negative)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    eval_writer = squad_lib.FeatureWriter(
        filename=os.path.join(FLAGS.model_dir, 'eval.tf_record'),
        is_training=False)
    eval_features = []

    def _append_feature(feature, is_padding):
        if not is_padding:
            eval_features.append(feature)
        eval_writer.process_feature(feature)

    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on.
    dataset_size = squad_lib.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=input_meta_data['max_seq_length'],
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        output_fn=_append_feature,
        batch_size=FLAGS.predict_batch_size)
    eval_writer.close()

    logging.info('***** Running predictions *****')
    logging.info('  Num orig examples = %d', len(eval_examples))
    logging.info('  Num split examples = %d', len(eval_features))
    logging.info('  Batch size = %d', FLAGS.predict_batch_size)

    num_steps = int(dataset_size / FLAGS.predict_batch_size)
    all_results = predict_squad_customized(strategy, input_meta_data, bert_config,
                                           eval_writer.filename, num_steps)

    output_prediction_file = os.path.join(FLAGS.model_dir, 'predictions.json')
    output_nbest_file = os.path.join(FLAGS.model_dir, 'nbest_predictions.json')
    output_null_log_odds_file = os.path.join(FLAGS.model_dir, 'null_odds.json')

    squad_lib.write_predictions(
        eval_examples,
        eval_features,
        all_results,
        FLAGS.n_best_size,
        FLAGS.max_answer_length,
        FLAGS.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        verbose=FLAGS.verbose_logging)
def run_customized_training(strategy, bert_config, max_seq_length,
                            max_predictions_per_seq, model_dir, steps_per_epoch,
                            steps_per_loop, epochs, initial_lr, warmup_steps,
                            input_files, train_batch_size):
    """Run BERT pretrain model training using low-level API."""

    # train_input_fn = functools.partial(get_pretrain_input_data, input_files,
    #                                    max_seq_length, max_predictions_per_seq,
    #                                    train_batch_size, strategy)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    def train_input_fn():
        train_input_fn = make_input_fn_from_file(
            input_files_or_glob=FLAGS.input_files,
            seq_length=FLAGS.max_seq_length,
            num_splits=1,
            dev_splits=[2],
            test_splits=[2],
            tokenizer=tokenizer,
            is_training=True,
            shuffle_buffer_size=25000,  # note: bert hardcoded this, and I'm following suit
            seed=FLAGS.seed,
            labeler=None)
        return train_input_fn(params={'batch_size': train_batch_size})

    def _get_pretrain_model():
        """Gets a pretraining model."""
        pretrain_model, core_model = bert_models.pretrain_model(
            bert_config, max_seq_length, max_predictions_per_seq)
        pretrain_model.optimizer = optimization.create_optimizer(
            initial_lr, steps_per_epoch * epochs, warmup_steps)
        if FLAGS.fp16_implementation == 'graph_rewrite':
            # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
            # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
            # which will ensure tf.compat.v2.keras.mixed_precision and
            # tf.train.experimental.enable_mixed_precision_graph_rewrite do not
            # double up.
            pretrain_model.optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                pretrain_model.optimizer)
        return pretrain_model, core_model

    trained_model = model_training_utils.run_customized_training_loop(
        strategy=strategy,
        model_fn=_get_pretrain_model,
        loss_fn=get_loss_fn(
            loss_factor=1.0 /
            strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0),
        model_dir=model_dir,
        train_input_fn=train_input_fn,
        steps_per_epoch=steps_per_epoch,
        steps_per_loop=steps_per_loop,
        epochs=epochs,
        init_checkpoint=FLAGS.init_checkpoint,
    )

    # Creates the BERT core model outside distribution strategy scope.
    _, core_model = bert_models.pretrain_model(bert_config, max_seq_length,
                                               max_predictions_per_seq)

    # Restores the core model from model checkpoints and gets a new checkpoint
    # that contains only the core model.
    model_saving_utils.export_pretraining_checkpoint(
        checkpoint_dir=model_dir, model=core_model)
    return trained_model