def __call__(self, article_input_list):
    """Score a list of article strings and return per-article probabilities
    over LABEL_LIST ('machine', 'human')."""
    encoder = get_encoder()
    input_list = []
    for article in article_input_list:
        item = dict(sample_item)
        item['article'] = article + " " + article_rest
        # item['article'] = article
        context_ids = _flatten_and_tokenize_metadata(encoder=encoder, item=item)
        input_list.append({
            'info': item,
            'ids': context_ids,
            'label': item['label'],
        })
        assert item['label'] in LABEL_INV_MAP
        # dict_intput_list_idx[0] += 1

    # Pick the estimator whose batch size best matches the number of inputs.
    batch_size = 1
    if len(input_list) >= batch_size_large:
        batch_size = batch_size_large
        estimator = self.estimator_large
    elif len(input_list) < batch_size_medium:
        batch_size = batch_size_small
        estimator = self.estimator_small
    else:
        batch_size = batch_size_medium
        estimator = self.estimator_medium

    # Write the tokenized inputs to a temporary tf_record and build an input_fn for it.
    predict_file = os.path.join(FLAGS.output_dir, 'test.tf_record')
    classification_convert_examples_to_features(
        input_list,
        batch_size=batch_size,
        max_seq_length=FLAGS.max_seq_length,
        encoder=encoder,
        output_file=predict_file,
        labels=LABEL_LIST,
        pad_extra_examples=True,
        chop_from_front_if_needed=False)

    val_input_fn = classification_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=True,
    )

    probs = np.zeros((len(input_list), 2), dtype=np.float32)
    for i, res in enumerate(
            estimator.predict(input_fn=val_input_fn, yield_single_examples=True)):
        if i < len(input_list):
            probs[i] = res['probs']
    return probs
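# A minimal usage sketch for the method above, assuming it lives on a predictor
# class (called `GroverDiscriminator` here purely for illustration) whose
# constructor sets up self.estimator_small / self.estimator_medium /
# self.estimator_large elsewhere:
#
#     discriminator = GroverDiscriminator()
#     probs = discriminator(["First article body ...", "Second article body ..."])
#     # probs has shape (num_articles, 2); columns follow LABEL_LIST
#     # ('machine', 'human'), so probs[:, 0] is the machine-generated probability.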
def main(_):
    LABEL_LIST = ['machine', 'human']
    LABEL_INV_MAP = {label: i for i, label in enumerate(LABEL_LIST)}

    tf.logging.set_verbosity(tf.logging.INFO)

    # These lines of code are just to check if we've already saved something into the directory
    if FLAGS.ingore_model_folder_check:
        pass
    elif tf.gfile.Exists(FLAGS.output_dir) or not FLAGS.ingore_model_folder_check:
        print(f"The output directory {FLAGS.output_dir} exists!")
        if FLAGS.do_train:
            print("EXITING BECAUSE DO_TRAIN is true", flush=True)
            return
        for split in ['val', 'test']:
            if tf.gfile.Exists(os.path.join(FLAGS.output_dir, f'{split}-probs.npy')) and getattr(
                    FLAGS, f'predict_{split}'):
                print(f"EXITING BECAUSE {split}-probs.npy exists", flush=True)
                return
        # Double check to see if it has trained!
        if not tf.gfile.Exists(os.path.join(FLAGS.output_dir, 'checkpoint')):
            print("EXITING BECAUSE NO CHECKPOINT.", flush=True)
            return
        stuff = {}
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'checkpoint'), 'r') as f:
            # model_checkpoint_path: "model.ckpt-0"
            # all_model_checkpoint_paths: "model.ckpt-0"
            for l in f:
                key, val = l.strip().split(': ', 1)
                stuff[key] = val.strip('"')
        if stuff['model_checkpoint_path'] == 'model.ckpt-0':
            print("EXITING BECAUSE IT LOOKS LIKE NOTHING TRAINED", flush=True)
            return
    elif not FLAGS.do_train:
        print("EXITING BECAUSE DO_TRAIN IS FALSE AND PATH DOESNT EXIST")
        return
    else:
        tf.gfile.MakeDirs(FLAGS.output_dir)

    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    # TODO might have to change this
    encoder = get_encoder()
    examples = {'train': [], 'val': [], 'test': []}
    np.random.seed(123456)
    tf.logging.info("*** Parsing files ***")
    with tf.gfile.Open(FLAGS.input_data, "r") as f:
        for l in f:
            item = json.loads(l)

            # This little hack is because we don't want to tokenize the article twice
            context_ids = _flatten_and_tokenize_metadata(encoder=encoder, item=item)
            examples[item['split']].append({
                'info': item,
                'ids': context_ids,
                'label': item['label'],
            })
            assert item['label'] in LABEL_INV_MAP

    additional_data = {'machine': [], 'human': []}
    if FLAGS.additional_data is not None:
        print("NOW WERE LOOKING AT ADDITIONAL INPUT DATA", flush=True)
        with tf.gfile.Open(FLAGS.additional_data, "r") as f:
            for l in f:
                item = json.loads(l)

                # This little hack is because we don't want to tokenize the article twice
                context_ids = _flatten_and_tokenize_metadata(encoder=encoder, item=item)
                additional_data[item['label']].append({
                    'info': item,
                    'ids': context_ids,
                    'label': item['label'],
                })

    tf.logging.info("*** Done parsing files ***")
    print("LETS GO", flush=True)
    if FLAGS.max_training_examples > 0:
        examples_by_label = {'human': [], 'machine': []}
        for x in examples['train']:
            examples_by_label[x['label']].append(x)

        new_examples = []
        print("Unique machine examples: {} -> {}".format(
            len(examples_by_label['machine']), FLAGS.max_training_examples), flush=True)
        machine_ex_to_keep = examples_by_label['machine'][:FLAGS.max_training_examples]
        # So we just cut down on the TRUE machine examples.
        # Now let's try adding in additional examples.
        # examples_by_label['human'].extend(additional_data['human'])
        if len(additional_data['machine']) > 0:
            amount_to_add = len(examples_by_label['human']) - len(machine_ex_to_keep)
            if amount_to_add > 0:
                machine_ex_to_keep.extend(additional_data['machine'][:amount_to_add])

        # Interleave one human example with one machine example, cycling through
        # the (possibly smaller) machine pool so the training set stays balanced.
        for i, human_ex in enumerate(examples_by_label['human']):
            new_examples.append(human_ex)
            new_examples.append(machine_ex_to_keep[i % len(machine_ex_to_keep)])
        print("Length of examples: {} -> {}".format(len(examples['train']),
                                                    len(new_examples)), flush=True)
        examples['train'] = new_examples

    # =============== SETUP TRAINING ===============
    if FLAGS.do_train:
        num_train_steps = int((len(examples['train']) / FLAGS.batch_size) * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        assert num_train_steps > 0
    else:
        num_train_steps = None
        num_warmup_steps = None

    # =============== TRAINING BOILERPLATE ===============
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.iterations_per_loop,
        keep_checkpoint_max=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = classification_model_fn_builder(
        news_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        num_labels=len(LABEL_LIST),
        pool_token_id=encoder.begin_summary,
        adafactor=FLAGS.adafactor)

    # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        predict_batch_size=FLAGS.batch_size,
        params={'model_dir': FLAGS.output_dir})
    # =============== END TRAINING BOILERPLATE ===============

    # =============== TRAINING ===============
    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        tf.logging.info(f"***** Recreating training file at {train_file} *****")
        classification_convert_examples_to_features(
            examples['train'],
            batch_size=FLAGS.batch_size,
            max_seq_length=FLAGS.max_seq_length,
            encoder=encoder,
            output_file=train_file,
            labels=LABEL_LIST,
            chop_from_front_if_needed=False)

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(examples['train']))
        tf.logging.info("  Num epochs = %d", FLAGS.num_train_epochs)
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = classification_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
        )
        estimator.train(input_fn=train_input_fn, steps=num_train_steps)
    # =============== END TRAINING ===============

    # =============== PREDICTION ===============
    splits_to_predict = [x for x in ['val', 'test'] if getattr(FLAGS, f'predict_{x}')]
    for split in splits_to_predict:
        num_actual_examples = len(examples[split])

        predict_file = os.path.join(FLAGS.output_dir, f'{split}.tf_record')
        tf.logging.info(f"***** Recreating {split} file {predict_file} *****")
        classification_convert_examples_to_features(
            examples[split],
            batch_size=FLAGS.batch_size,
            max_seq_length=FLAGS.max_seq_length,
            encoder=encoder,
            output_file=predict_file,
            labels=LABEL_LIST,
            pad_extra_examples=True,
            chop_from_front_if_needed=False)

        val_input_fn = classification_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True,
        )

        # PREDICT
        probs = np.zeros((num_actual_examples, 2), dtype=np.float32)
        for i, res in enumerate(
                estimator.predict(input_fn=val_input_fn, yield_single_examples=True)):
            if i < num_actual_examples:
                probs[i] = res['probs']

        _save_np(os.path.join(FLAGS.output_dir, f'{split}-probs.npy'), probs)
        preds = np.argmax(probs, 1)
        labels = np.array([LABEL_INV_MAP[x['label']]
                           for x in examples[split][:num_actual_examples]])
        print('{} ACCURACY IS {:.3f}'.format(split, np.mean(labels == preds)), flush=True)
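# An offline sanity check, sketched as a comment rather than as part of this script:
# the {split}-probs.npy files written above can be reloaded with plain numpy to
# recompute accuracy without rerunning the estimator. The file paths below are
# placeholders (not flags of this script), and the column order is assumed to
# follow LABEL_LIST ('machine' = column 0, 'human' = column 1).
#
#     import json
#     import numpy as np
#     probs = np.load('output_dir/test-probs.npy')
#     items = [json.loads(l) for l in open('input_data.jsonl')]
#     labels = [x['label'] for x in items if x['split'] == 'test']
#     preds = ['machine' if p[0] >= p[1] else 'human' for p in probs[:len(labels)]]
#     acc = sum(p == l for p, l in zip(preds, labels)) / len(labels)
#     print(f'test accuracy: {acc:.3f}')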