Example #1
    def __call__(self, article_input_list):
        encoder = get_encoder()

        input_list = []
        for article in article_input_list:
            item = dict(sample_item)

            item['article'] = article + " " + article_rest
            # item['article'] = article

            context_ids = _flatten_and_tokenize_metadata(encoder=encoder,
                                                         item=item)
            input_list.append({
                'info': item,
                'ids': context_ids,
                'label': item['label'],
            })
            assert item['label'] in LABEL_INV_MAP

            # dict_intput_list_idx[0] += 1

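        # Pick the largest of the three estimators whose batch size fits the
        # number of inputs; fall back to the small estimator for tiny batches.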
        batch_size = 1
        if len(input_list) >= batch_size_large:
            batch_size = batch_size_large
            estimator = self.estimator_large
        elif len(input_list) < batch_size_medium:
            batch_size = batch_size_small
            estimator = self.estimator_small
        else:
            batch_size = batch_size_medium
            estimator = self.estimator_medium

        predict_file = os.path.join(FLAGS.output_dir, 'test.tf_record')
        classification_convert_examples_to_features(
            input_list,
            batch_size=batch_size,
            max_seq_length=FLAGS.max_seq_length,
            encoder=encoder,
            output_file=predict_file,
            labels=LABEL_LIST,
            pad_extra_examples=True,
            chop_from_front_if_needed=False)

        val_input_fn = classification_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True,
        )

        probs = np.zeros((len(input_list), 2), dtype=np.float32)

        for i, res in enumerate(
                estimator.predict(input_fn=val_input_fn,
                                  yield_single_examples=True)):
            if i < len(input_list):
                probs[i] = res['probs']

        return probs
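
A minimal usage sketch for the __call__ above, assuming it lives on the
CustomTensorFlowModelWrapper built in Example #4 (the estimator names below
are taken from that example):

wrapper = CustomTensorFlowModelWrapper(
    [estimator_large, estimator_medium, estimator_small])
probs = wrapper(["First article text ...", "Second article text ..."])
# probs has shape (len(inputs), 2); columns follow the LABEL_LIST order.
print(probs.argmax(axis=1))  # predicted label index per article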
Example #2
    type=int,
    help='Max sequence length',
)

parser.add_argument(
    '-add_extra_articles_to_end',
    dest='add_extra_articles_to_end',
    # Note: type=bool cannot be combined with action='store_true'; argparse
    # raises a TypeError, so the action alone is used here.
    action='store_true',
    help='Whether to minimize padding by adding extra articles to the end',
)
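
# With action='store_true' the flag takes no value on the command line, e.g.
# (hypothetical invocation): python prepare_data.py -add_extra_articles_to_end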

args = parser.parse_args()
random.seed(args.seed + args.fold)

encoder = get_encoder()


class S3TFRecordWriter(object):
    def __init__(self, fn):
        self.fn = fn
        if fn.startswith('s3://'):
            from boto3.s3.transfer import TransferConfig
            import boto3
            self.gclient = None
            self.s3client = boto3.client(
                's3',
                # Read credentials from the environment (or let boto3's default
                # credential chain find them); never hard-code keys in source.
                aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
                aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
            )


def main(_):
    LABEL_LIST = ['machine', 'human']
    LABEL_INV_MAP = {label: i for i, label in enumerate(LABEL_LIST)}

    tf.logging.set_verbosity(tf.logging.INFO)

    # Check whether we've already saved something into the output directory.
    if FLAGS.ingore_model_folder_check:
        pass
    elif tf.gfile.Exists(FLAGS.output_dir):
        print(f"The output directory {FLAGS.output_dir} exists!")
        if FLAGS.do_train:
            print("EXITING BECAUSE DO_TRAIN is true", flush=True)
            return
        for split in ['val', 'test']:
            if tf.gfile.Exists(
                    os.path.join(FLAGS.output_dir,
                                 f'{split}-probs.npy')) and getattr(
                                     FLAGS, f'predict_{split}'):
                print(f"EXITING BECAUSE {split}-probs.npy exists", flush=True)
                return
        # Double check to see if it has trained!
        if not tf.gfile.Exists(os.path.join(FLAGS.output_dir, 'checkpoint')):
            print("EXITING BECAUSE NO CHECKPOINT.", flush=True)
            return
        stuff = {}
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'checkpoint'),
                           'r') as f:
            # model_checkpoint_path: "model.ckpt-0"
            # all_model_checkpoint_paths: "model.ckpt-0"
            for l in f:
                key, val = l.strip().split(': ', 1)
                stuff[key] = val.strip('"')
        if stuff['model_checkpoint_path'] == 'model.ckpt-0':
            print("EXITING BECAUSE IT LOOKS LIKE NOTHING TRAINED", flush=True)
            return
    elif not FLAGS.do_train:
        print("EXITING BECAUSE DO_TRAIN IS FALSE AND PATH DOESNT EXIST")
        return
    else:
        tf.gfile.MakeDirs(FLAGS.output_dir)

    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    # TODO might have to change this
    encoder = get_encoder()
    examples = {'train': [], 'val': [], 'test': []}
    np.random.seed(123456)
    tf.logging.info("*** Parsing files ***")
    with tf.gfile.Open(FLAGS.input_data, "r") as f:
        for l in f:
            item = json.loads(l)

            # This little hack is because we don't want to tokenize the article twice
            context_ids = _flatten_and_tokenize_metadata(encoder=encoder,
                                                         item=item)
            examples[item['split']].append({
                'info': item,
                'ids': context_ids,
                'label': item['label'],
            })
            assert item['label'] in LABEL_INV_MAP

    additional_data = {'machine': [], 'human': []}
    if FLAGS.additional_data is not None:
        print("NOW WERE LOOKING AT ADDITIONAL INPUT DATA", flush=True)
        with tf.gfile.Open(FLAGS.additional_data, "r") as f:
            for l in f:
                item = json.loads(l)
                # This little hack is because we don't want to tokenize the article twice
                context_ids = _flatten_and_tokenize_metadata(encoder=encoder,
                                                             item=item)
                additional_data[item['label']].append({
                    'info': item,
                    'ids': context_ids,
                    'label': item['label'],
                })

    tf.logging.info("*** Done parsing files ***")
    print("LETS GO", flush=True)
    if FLAGS.max_training_examples > 0:

        examples_by_label = {'human': [], 'machine': []}
        for x in examples['train']:
            examples_by_label[x['label']].append(x)

        new_examples = []
        print("Unique machine examples: {} -> {}".format(
            len(examples_by_label['machine']), FLAGS.max_training_examples),
              flush=True)
        machine_ex_to_keep = examples_by_label[
            'machine'][:FLAGS.max_training_examples]

        # So we just cut down on the TRUE machine examples. Now let's try adding in additional examples.
        # examples_by_label['human'].extend(additional_data['human'])

        if len(additional_data['machine']) > 0:
            amount_to_add = len(
                examples_by_label['human']) - len(machine_ex_to_keep)
            if amount_to_add > 0:
                machine_ex_to_keep.extend(
                    additional_data['machine'][:amount_to_add])

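        # Interleave one machine example after every human example; the modulo
        # below recycles machine examples when there are fewer of them, so the
        # resulting training set is balanced 1:1 (with repeats if necessary).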
        for i, human_ex in enumerate(examples_by_label['human']):
            new_examples.append(human_ex)
            new_examples.append(machine_ex_to_keep[i %
                                                   len(machine_ex_to_keep)])

        print("Length of examples: {} -> {}".format(len(examples['train']),
                                                    len(new_examples)),
              flush=True)
        examples['train'] = new_examples

    # =============== SETUP TRAINING ===============
    if FLAGS.do_train:
        num_train_steps = int((len(examples['train']) / FLAGS.batch_size) *
                              FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        assert num_train_steps > 0
    else:
        num_train_steps = None
        num_warmup_steps = None
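    # Worked example with hypothetical numbers: 10,000 training examples,
    # batch_size=32 and num_train_epochs=10 give int(10000 / 32 * 10) = 3125
    # train steps; warmup_proportion=0.1 then yields 312 warmup steps.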

    # =============== TRAINING BOILERPLATE ===============
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.iterations_per_loop,
        keep_checkpoint_max=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = classification_model_fn_builder(
        news_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        num_labels=len(LABEL_LIST),
        pool_token_id=encoder.begin_summary,
        adafactor=FLAGS.adafactor)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        predict_batch_size=FLAGS.batch_size,
        params={'model_dir': FLAGS.output_dir})
    # =============== END TRAINING BOILERPLATE ===============

    # =============== TRAINING ===============
    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")

        tf.logging.info(
            f"***** Recreating training file at {train_file} *****")
        classification_convert_examples_to_features(
            examples['train'],
            batch_size=FLAGS.batch_size,
            max_seq_length=FLAGS.max_seq_length,
            encoder=encoder,
            output_file=train_file,
            labels=LABEL_LIST,
            chop_from_front_if_needed=False)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(examples['train']))
        tf.logging.info("  Num epochs = %d", FLAGS.num_train_epochs)
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = classification_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
        )
        estimator.train(input_fn=train_input_fn, steps=num_train_steps)
    # =============== END TRAINING ===============

    # =============== PREDICTION ===============
    splits_to_predict = [
        x for x in ['val', 'test'] if getattr(FLAGS, f'predict_{x}')
    ]
    for split in splits_to_predict:
        num_actual_examples = len(examples[split])

        predict_file = os.path.join(FLAGS.output_dir, f'{split}.tf_record')
        tf.logging.info(f"***** Recreating {split} file {predict_file} *****")
        classification_convert_examples_to_features(
            examples[split],
            batch_size=FLAGS.batch_size,
            max_seq_length=FLAGS.max_seq_length,
            encoder=encoder,
            output_file=predict_file,
            labels=LABEL_LIST,
            pad_extra_examples=True,
            chop_from_front_if_needed=False)

        val_input_fn = classification_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True,
        )
        # PREDICT
        probs = np.zeros((num_actual_examples, 2), dtype=np.float32)
        for i, res in enumerate(
                estimator.predict(input_fn=val_input_fn,
                                  yield_single_examples=True)):
            if i < num_actual_examples:
                probs[i] = res['probs']

        _save_np(os.path.join(FLAGS.output_dir, f'{split}-probs.npy'), probs)

        preds = np.argmax(probs, 1)
        labels = np.array([
            LABEL_INV_MAP[x['label']]
            for x in examples[split][:num_actual_examples]
        ])
        print('{} ACCURACY IS {:.3f}'.format(split, np.mean(labels == preds)),
              flush=True)
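
A minimal sketch of reusing the saved probabilities offline, assuming the row
order matches examples[split] and the columns follow
LABEL_LIST = ['machine', 'human'] (the path below is hypothetical):

import numpy as np
probs = np.load('output_dir/val-probs.npy')  # hypothetical path
preds = probs.argmax(axis=1)                 # 0 = machine, 1 = human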
Example #4
def main(_):
    global sample_item, batch_size_large, batch_size_medium, batch_size_small, article_rest
    ATTACKING_TOKENS_NUM = 300
    START_SAMPLE = 0
    END_SAMPLE = 100
    max_perturbed_percent = 1

    encoder = get_encoder()
    np.random.seed(123456)

    label_index = {"machine": 0, "human": 1}

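    # model_fn, run_config, and the batch_size_* globals are assumed to be set
    # up as in Example #2 (classification_model_fn_builder and
    # tf.contrib.tpu.RunConfig); that setup is omitted from this snippet.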
    estimator_large = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=batch_size_large,
        eval_batch_size=batch_size_large,
        predict_batch_size=batch_size_large,
        params={'model_dir': FLAGS.output_dir})

    estimator_medium = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=batch_size_medium,
        eval_batch_size=batch_size_medium,
        predict_batch_size=batch_size_medium,
        params={'model_dir': FLAGS.output_dir})

    estimator_small = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=batch_size_small,
        eval_batch_size=batch_size_small,
        predict_batch_size=batch_size_small,
        params={'model_dir': FLAGS.output_dir})

    model_wrapper = CustomTensorFlowModelWrapper(
        [estimator_large, estimator_medium, estimator_small])

    if FLAGS.recipe == 'PWWS':
        attack = PWWSRen2019.build(model_wrapper)
    elif FLAGS.recipe == 'BAE':
        attack = BAEGarg2019.build(model_wrapper, max_perturbed_percent, False)
    elif FLAGS.recipe == 'BAE_Synonym':
        attack = BAEGarg2019.build(model_wrapper, max_perturbed_percent, True)
    elif FLAGS.recipe == 'BERTAttack':
        attack = BERTAttackLi2020.build(model_wrapper)
    elif FLAGS.recipe == 'BERTAttack_Synonym':
        attack = BERTAttackLi2020_Synonym.build(model_wrapper)
    else:
        raise ValueError(f"Unknown attack recipe: {FLAGS.recipe}")

    with open(FLAGS.input_data, "r") as f:
        lines = f.readlines()
        for line in lines[START_SAMPLE:END_SAMPLE]:
            item = json.loads(line)
            sample_item = dict(item)

            article = item['article']

            article_tokens = article.split()  # whitespace-tokenize the article
            article_tokens_attacked = article_tokens[:ATTACKING_TOKENS_NUM]
            article_tokens_rest = article_tokens[ATTACKING_TOKENS_NUM:]
            article_attacked = " ".join(article_tokens_attacked)
            article_rest = " ".join(article_tokens_rest)
            # article_attacked = article

            dataset_now = [(article_attacked, label_index[item['label']])]

            # print(model_wrapper([article]))

            # print("####################################################################")
            # print(dataset_now)
            # print("\n")
            # print("####################################################################")

            results_iterable = attack.attack_dataset(dataset_now)
            for result in results_iterable:
                print(result.__str__(color_method='ansi'))
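
A minimal skeleton for the wrapper used above, a sketch assuming it follows
textattack's ModelWrapper interface and stores the three estimators consumed
by the __call__ shown in Example #1:

from textattack.models.wrappers import ModelWrapper

class CustomTensorFlowModelWrapper(ModelWrapper):
    def __init__(self, estimators):
        # Order matches the call site above: large, medium, small.
        self.estimator_large, self.estimator_medium, self.estimator_small = estimators

    def __call__(self, article_input_list):
        # Body as in Example #1: tokenize each article, pick an estimator by
        # batch size, write a tf_record, and return an (N, 2) probs array.
        raise NotImplementedError  # see Example #1 for the full method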