def __init__(self, tokenizer: PacketTokenizer, folder_path: str, filename_patterns_to_exclude: tuple = ()):
    """Load raw flows plus their class labels and register class tokens.

    :param tokenizer: tokenizer whose packet quantizer defines the raw feature columns
    :param folder_path: directory containing the preprocessed modeling data
    :param filename_patterns_to_exclude: filename patterns to skip when loading
    """
    self.tokenizer = tokenizer
    flow_frame, target_series = load_modeling_data_with_classes(
        folder_path,
        filename_patterns_to_exclude=filename_patterns_to_exclude)
    # keep only the columns the quantizer knows how to encode
    feature_columns = tokenizer.packet_quantizer.raw_columns
    self.raw_flows: np.ndarray = flow_frame.loc[:, feature_columns].values
    self.targets: np.ndarray = target_series.values
    logger.info('initialized dataset')
    # NOTE(review): `self.target_classes` is presumably a property defined on
    # the enclosing class (not visible in this chunk) — confirm before reuse.
    tokenizer.add_class_tokens(self.target_classes)
    logger.info('added special tokens representing classes')
def test_tokenize_detokenize(quantizer_checkpoint, raw_dataset):
    """Encoding then decoding should almost exactly reproduce the raw features."""
    tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint)
    token_ids = tokenizer.batch_encode_packets(raw_dataset)['input_ids']
    # since the model limit 128 > 20 in raw_features, we do not expect truncating
    restored = tokenizer.batch_decode_packets(token_ids)
    round_trip_error = _estimate_normalized_packet_difference(raw_dataset.values, restored)
    assert round_trip_error < 0.0003
def test_flowlight_loader(raw_dataset_folder, quantizer_checkpoint):
    """Each collated batch must be (batch_size, flow_size + 2 special tokens)."""
    tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20)
    dataset = PretrainDataset(tokenizer, folder_path=raw_dataset_folder)
    batches = DataLoader(dataset,
                         batch_size=4,
                         collate_fn=PretrainCollator(tokenizer),
                         drop_last=True)
    for batch in batches:
        assert batch['input_ids'].shape == (4, 22)
def test_dataset_with_classes(raw_dataset_folder, quantizer_checkpoint):
    """Batches must have the expected shape and start with a flow-label token."""
    tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20)
    dataset = PretrainDatasetWithClasses(tokenizer, folder_path=raw_dataset_folder)
    batches = DataLoader(dataset,
                         batch_size=4,
                         collate_fn=PretrainCollator(tokenizer),
                         drop_last=True)
    for batch in batches:
        assert batch['input_ids'].shape == (4, 22)
        # 9905 is the last non-flow-label token ID
        assert (batch['input_ids'][:, 0] > 9905).all().tolist()
def main():
    """Generate synthetic traffic flows per protocol class and optionally evaluate them.

    Uses either a pretrained GPT-2 LM or a Markov-chain baseline (``--markov_model``)
    to sample token sequences, decodes them back into packet features, and saves
    one CSV per protocol class next to the pretrained checkpoint.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--source_dataset',
        help='path to preprocessed .csv dataset',
        default='/media/raid_store/pretrained_traffic/train_csv')
    parser.add_argument(
        '--pretrained_path',
        default=
        '/media/raid_store/pretrained_traffic/gpt2_model_4_6epochs_classes_home_iot'
    )
    parser.add_argument(
        '--flow_limit_per_app',
        default=20000,
        type=int,
    )
    parser.add_argument('--filename_patterns_to_exclude',
                        default='mawi',
                        help='see settings.py::FilePatterns for the options')
    parser.add_argument(
        '--evaluate',
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--markov_model',
        action='store_true',
        default=False,
    )
    args = parser.parse_args()

    # resolve the pattern name (e.g. 'mawi') into the actual pattern tuple
    filename_patterns_to_exclude = getattr(FilePatterns,
                                           args.filename_patterns_to_exclude)
    source_dataset_folder = pathlib.Path(args.source_dataset)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # source flows are needed both for per-class counts and for Markov fitting / evaluation
    all_source_flows, classes = load_modeling_data_with_classes(
        source_dataset_folder,
        filename_patterns_to_exclude=filename_patterns_to_exclude)
    source_class_counts = classes.value_counts()

    pretrained_path = pathlib.Path(args.pretrained_path)
    tokenizer = PacketTokenizer.from_pretrained(pretrained_path)
    # the GPT-2 model is only loaded when the Markov baseline is not requested
    if not args.markov_model:
        model = GPT2LMHeadModel.from_pretrained(pretrained_path).to(device)

    # output dir lives next to the checkpoint; suffixed for the Markov baseline
    generated_flows_path = pretrained_path.parent / ('generated_flows_' +
                                                     pretrained_path.stem)
    if args.markov_model:
        generated_flows_path = generated_flows_path.parent / (
            generated_flows_path.name + '_markov')
    generated_flows_path.mkdir(exist_ok=True)

    metrics = {}
    for proto in tokenizer.tokens_to_ids.keys():
        # skip special tokens
        if proto.startswith('['):
            continue
        try:
            source_class_count = source_class_counts[proto]
        except KeyError:
            logger.error(
                f'could not find target class "{proto}" in dataset, skipping')
            continue
        # generate as many flows as the source has, capped by --flow_limit_per_app
        n_flows_to_generate = source_class_count \
            if source_class_count < args.flow_limit_per_app \
            else args.flow_limit_per_app
        src_flows = all_source_flows[classes == proto]
        if args.markov_model:
            # fit a per-class Markov chain on the tokenized source flows and sample from it
            markov = MarkovGenerator()
            X = tokenizer.batch_encode_packets(
                src_flows.values.astype(np.float64),
                target_class=proto,
                add_special_tokens=True,
                return_attention_mask=False,
                return_tensors='np')['input_ids']
            markov.fit(X)
            gen_tokens = markov.sample(n_flows_to_generate)
            gen_flows = tokenizer.batch_decode_packets(gen_tokens)
        else:
            gen_flows = generate_packets(proto, n_flows_to_generate, model,
                                         tokenizer, device)
        # generated flows may be shorter than the full column set — slice to match
        gen_flows = pd.DataFrame(gen_flows,
                                 columns=tokenizer.packet_quantizer.
                                 raw_columns[:gen_flows.shape[1]])
        save_dataset(gen_flows, save_to=generated_flows_path / f'{proto}.csv')
        if args.evaluate:
            results = evaluate_generated_traffic(src_flows.values,
                                                 gen_flows.values)
            metrics[proto] = results
    if args.evaluate:
        save_metrics(
            metrics,
            REPORT_DIR / ('report_' + generated_flows_path.stem + '.csv'))
def main():
    """Train and evaluate an FS-Net flow classifier with PyTorch Lightning.

    Builds train/val/test dataloaders from either raw packet sizes
    (``--use_packet_size_only``) or a quantized-token dataset, then fits
    ``FSNETClassifier`` with early stopping and Neptune logging.
    """
    args = _parse_args()
    pprint(args)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    cpu_counter = os.cpu_count()

    # two dataset flavors share one constructor interface via functools.partial
    if args.use_packet_size_only:
        # vocabulary is the raw packet-size range, no tokenizer involved
        n_tokens = args.dynamic_ps_range
        ds_class = partial(ClassificationPacketSizeDataset,
                           max_size_range=n_tokens)
    else:
        tokenizer = PacketTokenizer.from_pretrained(args.tokenizer_path,
                                                    flow_size=args.packet_num)
        n_tokens = len(tokenizer)
        ds_class = partial(SimpleClassificationQuantizedDataset,
                           tokenizer=tokenizer)

    train_val_dataset = ds_class(dataset_path=args.train_dataset,
                                 target_column=args.target_column)
    # 90/10 train/validation split
    train_part_len = int(len(train_val_dataset) * 0.9)
    train_dataset, val_dataset = random_split(
        train_val_dataset,
        [train_part_len, len(train_val_dataset) - train_part_len])
    # reuse the label encoder fitted on the training data
    test_dataset = ds_class(dataset_path=args.test_dataset,
                            label_encoder=train_val_dataset.target_encoder,
                            target_column=args.target_column)
    # NOTE(review): shuffle=False for the train loader — presumably deliberate
    # (matches the sibling training script), but worth confirming
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  drop_last=False,
                                  shuffle=False,
                                  num_workers=cpu_counter)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                drop_last=False,
                                shuffle=False,
                                num_workers=cpu_counter)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 drop_last=False,
                                 num_workers=cpu_counter)
    class_labels = train_val_dataset.target_encoder.classes_
    nn_classifier = FSNETClassifier(args,
                                    class_labels=class_labels,
                                    n_tokens=n_tokens)
    early_stop_callback = EarlyStopping(monitor='val_loss',
                                        min_delta=1e-4,
                                        patience=args.es_patience,
                                        verbose=False,
                                        mode='min')
    # kept open after fit so the best checkpoint can be uploaded below
    exp_logger = NeptuneLogger(offline_mode=not args.log_neptune,
                               close_after_fit=False,
                               project_name=NEPTUNE_PROJECT,
                               experiment_name=args.neptune_experiment_name,
                               params=vars(args),
                               upload_source_files=[
                                   (BASE_DIR / 'fs_net/model.py').as_posix()
                               ])
    checkpoint_dir = f'{nn_classifier.__class__.__name__}_checkpoints'
    model_checkpoint = ModelCheckpoint(
        filepath=checkpoint_dir + '/{epoch}-{val_loss:.2f}-{other_metric:.2f}')
    trainer = Trainer(
        early_stop_callback=early_stop_callback,
        callbacks=[LearningRateLogger()],
        checkpoint_callback=model_checkpoint,
        auto_lr_find=False,
        logger=exp_logger,
        gpus=int(device == 'cuda'),
    )
    trainer.fit(nn_classifier, train_dataloader, val_dataloader)
    trainer.test(nn_classifier, test_dataloader)
    exp_logger.experiment.log_artifact(model_checkpoint.best_model_path)
    exp_logger.experiment.stop()
def main():
    """Fine-tune and evaluate a GPT-2-based flow classifier.

    Parses CLI options, builds train/val/test dataloaders from quantized
    flow datasets, trains a ``GPT2Classifier`` with early stopping, and
    logs the run (optionally to Neptune), uploading the best checkpoint.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--train_dataset',
        help='path to preprocessed .csv dataset',
    )
    parser.add_argument(
        '--test_dataset',
        help='path to preprocessed .csv dataset',
    )
    parser.add_argument(
        '--pretrained_path',
    )
    parser.add_argument(
        '--freeze_pretrained_model',
        action='store_true',
        default=False,
    )
    parser.add_argument(
        '--mask_first_token',
        action='store_true',
        default=False,
    )
    # BUG FIX: --batch_size, --learning_rate and --fc_dropout previously had no
    # type=..., so any value passed on the command line arrived as a string and
    # broke DataLoader(batch_size=...) / the optimizer / dropout. Defaults unchanged.
    parser.add_argument(
        '--batch_size',
        default=256,
        type=int,
    )
    parser.add_argument(
        '--es_patience',
        default=5,
        type=int,
    )
    parser.add_argument(
        '--learning_rate',
        default=None,
        type=float,
    )
    parser.add_argument(
        '--fc_dropout',
        default=0.0,
        type=float,
    )
    parser.add_argument(
        '--reinitialize',
        action='store_true',
        default=False
    )
    parser.add_argument(
        '--n_layers',
        default=6,
        type=int,
        help='number of transformer layers to use, only in use when --reinitialize is provided'
    )
    parser.add_argument(
        '--log_neptune',
        dest='log_neptune',
        action='store_true',
        default=False
    )
    parser.add_argument(
        '--neptune_experiment_name',
        dest='neptune_experiment_name',
        default='gpt2_class_pretrained'
    )
    args = parser.parse_args()

    if args.learning_rate is None:
        # larger LR when only the classification head is trainable
        args.learning_rate = 0.0005 if args.freeze_pretrained_model else 0.00002
    print(args)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PacketTokenizer.from_pretrained(args.pretrained_path,
                                                flow_size=DEFAULT_PACKET_LIMIT_PER_FLOW)
    train_val_dataset = ClassificationQuantizedDataset(tokenizer, dataset_path=args.train_dataset)
    # 90/10 train/validation split
    train_part_len = int(len(train_val_dataset) * 0.9)
    train_dataset, val_dataset = random_split(train_val_dataset,
                                              [train_part_len, len(train_val_dataset) - train_part_len])
    # reuse the label encoder fitted on the training data
    test_dataset = ClassificationQuantizedDataset(tokenizer,
                                                  dataset_path=args.test_dataset,
                                                  label_encoder=train_val_dataset.target_encoder)
    collator = ClassificationQuantizedDataset.get_collator(mask_first_token=args.mask_first_token)
    cpu_counter = os.cpu_count()
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  drop_last=False,
                                  shuffle=False,
                                  collate_fn=collator,
                                  num_workers=cpu_counter)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                drop_last=False,
                                shuffle=False,
                                collate_fn=collator,
                                num_workers=cpu_counter)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 drop_last=False,
                                 collate_fn=collator,
                                 num_workers=cpu_counter)
    class_labels = train_val_dataset.target_encoder.classes_
    nn_classifier = GPT2Classifier(
        args,
        class_labels,
        pretrained_model_path=args.pretrained_path,
        dropout=args.fc_dropout,
        freeze_pretrained_part=args.freeze_pretrained_model,
        reinitialize=args.reinitialize,
        n_layers=args.n_layers
    )
    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        min_delta=1e-4,
        patience=args.es_patience,
        verbose=False,
        mode='min'
    )
    # kept open after fit so the best checkpoint can be uploaded below
    logger = NeptuneLogger(
        offline_mode=not args.log_neptune,
        close_after_fit=False,
        project_name=NEPTUNE_PROJECT,
        experiment_name=args.neptune_experiment_name,
        params=vars(args),
        upload_source_files=[(BASE_DIR / 'gpt_model/classifier/model.py').as_posix()]
    )
    checkpoint_dir = f'{nn_classifier.__class__.__name__}_checkpoints'
    model_checkpoint = ModelCheckpoint(
        filepath=checkpoint_dir + '/{epoch}-{val_loss:.2f}-{other_metric:.2f}'
    )
    trainer = Trainer(
        early_stop_callback=early_stop_callback,
        callbacks=[LearningRateLogger()],
        checkpoint_callback=model_checkpoint,
        auto_lr_find=False,
        logger=logger,
        gpus=int(device == 'cuda'),
    )
    trainer.fit(nn_classifier, train_dataloader, val_dataloader)
    trainer.test(nn_classifier, test_dataloader)
    logger.experiment.log_artifact(model_checkpoint.best_model_path)
    logger.experiment.stop()
def tokenizer(quantizer_checkpoint):
    """Fixture: a PacketTokenizer restored from the quantizer checkpoint (flow_size=20)."""
    restored = PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20)
    return restored
def test_saving_tokenizer(quantizer_checkpoint):
    """save_pretrained must write both the cluster and the vocab JSON files."""
    tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint)
    tokenizer.save_pretrained('/tmp/')
    save_dir = pathlib.Path('/tmp')
    assert (save_dir / 'clusters.json').is_file()
    assert (save_dir / 'ids_to_tokens.json').is_file()
def main():
    """Pretrain (or evaluate) a GPT-2 language model over tokenized packet flows.

    HF-style entry point: parses dataclass arguments, loads or initializes the
    model/config/tokenizer, builds datasets, trains via ``Trainer`` and, when
    requested, reports perplexity on the eval set.

    :returns: dict with evaluation results (``perplexity``) — empty if --do_eval not set
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # --- argument sanity checks ---
    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            f"Use --overwrite_output_dir to overcome.")
    if data_args.finetune_on_class and data_args.train_with_targets:
        raise ValueError(
            "Pretraining with flow labels and fine-tuning on the class simultaneously not supported."
        )
    if not model_args.model_name_or_path and not model_args.quantizer_path:
        raise ValueError(
            "Either model or quantizer checkpoint path must be specified")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    # config/tokenizer/model come from the checkpoint when resuming,
    # otherwise from scratch (config file + quantizer checkpoint)
    if model_args.model_name_or_path:
        config = GPT2Config.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = GPT2Config.from_json_file(model_args.config_name)
        logger.warning(
            "You are instantiating a new config instance from scratch.")
    if model_args.model_name_or_path:
        tokenizer = PacketTokenizer.from_pretrained(
            model_args.model_name_or_path)
    else:
        tokenizer = PacketTokenizer.from_pretrained(model_args.quantizer_path)
    if model_args.model_name_or_path:
        model = GPT2LMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForCausalLM.from_config(config)

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None

    # embedding table must match the tokenizer vocab (class tokens may have been added)
    model.resize_token_embeddings(len(tokenizer))
    print(model)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=PretrainCollator(tokenizer),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        # resume from the checkpoint dir only when it actually exists on disk
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        # perplexity = exp(mean cross-entropy loss)
        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
        results.update(result)
    return results