def batch_dataset(args):
    # read data from the configuration file
    config = Parameters.from_config(args.path)

    # load the entire dataset
    train_data = DataLoader.from_files(
        config.data.src_train,
        config.data.tgt_train,
        config.model.max_length,
        config.training.batch_size
    )

    outputfile = Path(args.output)
    with open(outputfile, "w", encoding="utf-8") as ofile:
        for i, batch in enumerate(train_data):
            for src, tgt in zip(*batch):
                s_sen = " ".join(src)
                t_sen = " ".join(tgt)
                ofile.write(f"{s_sen}\t{t_sen}\n")
            # print progress
            print(f"Batching dataset: {i}/{len(train_data)}", end="\r")
    print(" " * 50, end="\r")
    print("Batching dataset: complete")
def train(args):
    # extract arguments
    resume = args.resume
    batched = args.batched
    params = Parameters.from_config(args.path)

    # initialize the trainer and run the full pipeline
    trainer = Trainer(resume, batched, params)
    trainer.read_data()
    trainer.create_model()
    trainer.train_loop()
    trainer.save_model()
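# A minimal sketch of how these two entry points could be wired up as argparse
# subcommands. Only --path, --output, --resume, and --batched are implied by
# the functions above; the subcommand names and everything else here are
# assumptions for illustration.
import argparse

def main():
    parser = argparse.ArgumentParser(description="dataset batching / training driver")
    subparsers = parser.add_subparsers(dest="command", required=True)

    batch_cmd = subparsers.add_parser("batch", help="write the batched dataset to disk")
    batch_cmd.add_argument("--path", required=True, help="configuration file")
    batch_cmd.add_argument("--output", required=True, help="output TSV file")
    batch_cmd.set_defaults(func=batch_dataset)

    train_cmd = subparsers.add_parser("train", help="run the training pipeline")
    train_cmd.add_argument("--path", required=True, help="configuration file")
    train_cmd.add_argument("--resume", action="store_true", help="resume from a checkpoint")
    train_cmd.add_argument("--batched", action="store_true", help="read the pre-batched dataset")
    train_cmd.set_defaults(func=train)

    args = parser.parse_args()
    args.func(args)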
def get_params(test_config):
    """Get params and save them to the root dir."""
    prm = Parameters()

    # get file paths
    prm.override(test_config)
    test_parameter_file = os.path.join(prm.train.train_control.ROOT_DIR,
                                       'test_parameters.ini')
    log_file = os.path.join(prm.train.train_control.ROOT_DIR, 'test.log')

    ret = True
    if os.path.isfile(test_parameter_file):
        warnings.warn('Test parameter file {} already exists'.format(
            test_parameter_file))
        ret = query_yes_no('Overwrite parameter file?')
    if ret:
        dir_name = os.path.dirname(test_parameter_file)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        prm.save(test_parameter_file)

    logging = logging_config(log_file)
    logging.disable(logging.DEBUG)

    return prm
def get_params(test_config):
    """Get params and save them to the root dir."""
    prm = Parameters()

    # get file paths
    prm.override(test_config)

    # just to get the LOG_DIR_LIST[0]
    train_log_dir = prm.test.ensemble.LOG_DIR_LIST[0]
    parameter_file = os.path.join(train_log_dir, 'parameters.ini')
    test_parameter_file = os.path.join(prm.train.train_control.ROOT_DIR,
                                       'test_parameters.ini')
    all_parameter_file = os.path.join(prm.train.train_control.ROOT_DIR,
                                      'all_parameters.ini')
    log_file = os.path.join(prm.train.train_control.ROOT_DIR, 'test.log')

    if not os.path.isfile(parameter_file):
        raise AssertionError('Can not find file: {}'.format(parameter_file))

    ret = True
    if os.path.isfile(test_parameter_file):
        warnings.warn('Test parameter file {} already exists'.format(
            test_parameter_file))
        ret = query_yes_no('Overwrite parameter file?')
    if ret:
        dir_name = os.path.dirname(test_parameter_file)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        prm.save(test_parameter_file)

    logging = logging_config(log_file)
    logging.disable(logging.DEBUG)

    # Done saving test parameters. Now doing the integration:
    prm = Parameters()
    prm.override(parameter_file)
    prm.override(test_parameter_file)

    ret = True
    if os.path.isfile(all_parameter_file):
        warnings.warn(
            'All parameter file {} already exists'.format(all_parameter_file))
        ret = query_yes_no('Overwrite parameter file?')
    if ret:
        dir_name = os.path.dirname(all_parameter_file)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        prm.save(all_parameter_file)

    return prm
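# Both get_params variants above call a query_yes_no helper that is not shown
# here. A minimal sketch of such a prompt, assuming the conventional
# yes/no-recipe semantics (a default answer plus y/n shorthands); the real
# project may implement it differently.
def query_yes_no(question, default='no'):
    """Ask a yes/no question on stdin and return the answer as a bool."""
    valid = {'yes': True, 'y': True, 'no': False, 'n': False}
    prompt = ' [Y/n] ' if default == 'yes' else ' [y/N] '
    while True:
        choice = input(question + prompt).strip().lower()
        if choice == '' and default is not None:
            return valid[default]
        if choice in valid:
            return valid[choice]
        print("Please answer 'yes' or 'no' (or 'y'/'n').")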
tensor_files = [
    [
        path + f"data/super/train_word_tensor_{data_name}_2.npy",
        path + f"data/super/valid_word_tensor_{data_name}_2.npy",
    ],
    [
        path + f"data/super/train_character_tensor_{data_name}_2.npy",
        path + f"data/super/valid_character_tensor_{data_name}_2.npy",
    ],
]

batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files, path)
# batch_loader_2 = BatchLoader('')
params = Parameters(batch_loader_2.max_word_len,
                    batch_loader_2.max_seq_len,
                    batch_loader_2.words_vocab_size,
                    batch_loader_2.chars_vocab_size,
                    data_name, False, False, False)

# NEG_loss is defined over two embedding matrices with shape
# [params.word_vocab_size, params.word_embed_size]
neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size)
if args.use_cuda:
    neg_loss = neg_loss.cuda()

optimizer = SGD(neg_loss.parameters(), 0.1)

for iteration in range(args.num_iterations):
    input_idx, target_idx = batch_loader_2.next_embedding_seq(
        args.batch_size)
    validate()

    # save model
    if not os.path.exists("./checkpoints"):
        os.makedirs("./checkpoints")
    save_path = saver.save(
        sess, "./checkpoints/{}.ckpt".format(params.checkpoint))
    print("Model saved in file: %s" % save_path)
    # builder.add_meta_graph_and_variables(sess, ["main_model"])
    if params.use_hdf5 and params.fine_tune:
        batch_gen.h5f.close()

    # run inference
    if params.mode == "inference":
        inference.inference(params, decoder, val_gen, test_gen,
                            image_f_inputs, saver, sess)


if __name__ == '__main__':
    params = Parameters()
    params.parse_args()
    coco_dir = params.coco_dir

    # save parameters for later use
    if params.save_params:
        import pickle
        param_fn = "./pickles/params_{}_{}_{}_{}.pickle".format(
            params.prior, params.no_encoder, params.checkpoint,
            params.use_c_v)
        print("Saving params to: ", param_fn)
        with open(param_fn, 'wb') as wf:
            pickle.dump(file=wf, obj=params)

    # train model, generate captions for val-test sets
    main(params)
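    # Note: pickle stores the Parameters instance by reference to its class,
    # so the module defining Parameters must be importable when the pickle is
    # read back. A minimal round-trip, reusing param_fn from above:
    #
    #     with open(param_fn, 'rb') as rf:
    #         params = pickle.load(rf)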
    path + 'super/words_vocab_2.pkl',
    path + 'super/characters_vocab_2.pkl'
]
tensor_files = [[
    path + 'super/train_word_tensor_2.npy',
    path + 'super/valid_word_tensor_2.npy'
], [
    path + 'super/train_character_tensor_2.npy',
    path + 'super/valid_character_tensor_2.npy'
]]

batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files, path)
# batch_loader_2 = BatchLoader('')
params = Parameters(batch_loader_2.max_word_len, batch_loader_2.max_seq_len,
                    batch_loader_2.words_vocab_size,
                    batch_loader_2.chars_vocab_size, path)

# NEG_loss is defined over two embedding matrices with shape
# [params.word_vocab_size, params.word_embed_size]
neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size)
if args.use_cuda:
    neg_loss = neg_loss.cuda()

optimizer = SGD(neg_loss.parameters(), 0.1)

for iteration in range(args.num_iterations):
    input_idx, target_idx = batch_loader_2.next_embedding_seq(
        args.batch_size)

    input = Variable(t.from_numpy(input_idx).long())
    'batch_size': 20,
    'num_epochs': 20,
    'embed_size': 464,
    'num_hidden': 337,
    'num_layers': 1,
    'learning_rate': 0.001,
    'mode_train': True,
    'sent_max_size': 228,
    'gen_length': 20,
    'temperature': 0.5,
    'keep_rate': 0.66,
    'input': ['GOT', 'PTB'][1],
    'vocab_drop': 3
}

# for backward compatibility
params_c = Parameters()
params_c.batch_size = params['batch_size']
params_c.num_epochs = params['num_epochs']
params_c.embed_size = params['embed_size']
params_c.learning_rate = params['learning_rate']
params_c.pre_trained_embed = False
params_c.beam_search = False
params_c.vocab_drop = params['vocab_drop']


def online_inference(sess, data_dict, sample, seq,
                     in_state=None, out_state=None, seed='<BOS>'):
    """Generate a sequence one character at a time, based on the
    previous character."""
    sentence = [seed]
    state = None
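    # --- hedged continuation sketch: the loop below shows one plausible way
    # --- the step-by-step decoding proceeds from the initialization above.
    # --- data_dict.word2idx / data_dict.idx2word, the [[token_id]] feed
    # --- shape, the '<EOS>' stop token, and numpy as np are all assumptions,
    # --- not the source's confirmed code.
    for _ in range(params['gen_length']):
        feed = {seq: [[data_dict.word2idx[sentence[-1]]]]}
        if in_state is not None and state is not None:
            feed[in_state] = state
        if out_state is not None:
            probs, state = sess.run([sample, out_state], feed)
        else:
            probs = sess.run(sample, feed)
        # greedy choice; temperature sampling would draw from probs instead
        next_word = data_dict.idx2word[int(np.argmax(probs))]
        if next_word == '<EOS>':
            break
        sentence.append(next_word)
    return ' '.join(sentence)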
from utils.factories import Factories
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from lib.active_kmean import KMeansWrapper
from sklearn.datasets import make_blobs

logging = logging_config()
logging.disable(logging.DEBUG)
log = logger.get_logger('main')

prm_file = '/data/gilad/logs/log_2210_220817_wrn-fc2_kmeans_SGD_init_200_clusters_4_cap_204/parameters.ini'
prm = Parameters()
prm.override(prm_file)
dev = prm.network.DEVICE

factories = Factories(prm)

model = factories.get_model()
model.print_stats()  # debug

preprocessor = factories.get_preprocessor()
preprocessor.print_stats()  # debug

train_dataset = factories.get_train_dataset(preprocessor)
validation_dataset = factories.get_validation_dataset(preprocessor)
                    metavar='BS',
                    help='batch size (default: 20)')
parser.add_argument('--num-sample', type=int, default=14, metavar='NS',
                    help='num sample (default: 14)')
parser.add_argument('--use-cuda', type=bool, default=False, metavar='CUDA',
                    help='use cuda (default: False)')
args = parser.parse_args()

batch_loader = BatchLoader('')
params = Parameters(batch_loader.max_seq_len, batch_loader.vocab_size)

# NEG_loss is defined over two embedding matrices with shape
# [params.vocab_size, params.word_embed_size]
neg_loss = NEG_loss(
    params.vocab_size,
    params.word_embed_size,
    weights=[1 - sqrt(5e-5 / i) for i in batch_loader.words_freq])
if args.use_cuda:
    neg_loss = neg_loss.cuda()

optimizer = SGD(neg_loss.parameters(), 0.1)

for iteration in range(args.num_iterations):
    input_idx, target_idx = batch_loader.next_embedding_seq(
        args.batch_size)
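    # --- hedged continuation sketch: the step below follows the common
    # --- negative-sampling recipe seen in the sibling snippets (`t` is the
    # --- torch alias they use); NEG_loss's forward signature
    # --- (input, target, num_sample) is an assumption.
    input = Variable(t.from_numpy(input_idx).long())
    target = Variable(t.from_numpy(target_idx).long())
    if args.use_cuda:
        input, target = input.cuda(), target.cuda()

    loss = neg_loss(input, target, args.num_sample).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()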
''' ================================= BatchLoader loading =============================================== '''
data_files = [args.path + args.test_file]
idx_files = [
    args.path + 'words_vocab.pkl',
    args.path + 'characters_vocab.pkl'
]
tensor_files = [[args.path + 'test_word_tensor.npy'],
                [args.path + 'test_character_tensor.npy']]

preprocess_data(data_files, idx_files, tensor_files, args.use_file, str)

batch_loader = BatchLoader(data_files, idx_files, tensor_files)
parameters = Parameters(batch_loader.max_word_len, batch_loader.max_seq_len,
                        batch_loader.words_vocab_size,
                        batch_loader.chars_vocab_size, args.path)

''' ============================ BatchLoader for Question-2 =============================================== '''
data_files = [args.path + 'super/train_2.txt']
idx_files = [
    args.path + 'super/words_vocab_2.pkl',
    args.path + 'super/characters_vocab_2.pkl'
]
tensor_files = [[args.path + 'super/train_word_tensor_2.npy'],
                [args.path + 'super/train_character_tensor_2.npy']]

batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files)
parameters_2 = Parameters(batch_loader_2.max_word_len,
                          batch_loader_2.max_seq_len,
if __name__ == '__main__':
    # paths = ['/var/www/lildbibio_scielo_org/proc/xml_path/new', '/var/www/lildbibio_scielo_org/proc/xml_path/inproc', '/var/www/lildbibio_scielo_org/proc/xml_path/archive']
    # paths = ['/var/www/lildbibio_scielo_org/proc/teste/new', 'i', 't']
    # python3 bhl_lilacs.py /var/www/lildbibio_scielo_org/bases/cisis1660 /var/www/lildbibio_scielo_org/bases/bhl/bhl /var/www/lildbibio_scielo_org/proc/bhl_lilacs /var/www/lildbibio_scielo_org/bases/bhl/bhl_xml
    from utils.parameters import Parameters
    from utils.report import Report
    from configuration import Configuration

    configuration = Configuration('configuration.ini')
    if configuration.check_parameters(
            ['CISIS_PATH', 'REPORT_PATH', 'INBOX_PATH', 'ARCHIVE_PATH', 'DB_FILENAME']):
        cisis_path, report_path, inbox_path, archive_path, db_filename = \
            configuration.return_parameters(
                ['CISIS_PATH', 'REPORT_PATH', 'INBOX_PATH', 'ARCHIVE_PATH', 'DB_FILENAME'])

        parameter_list = ['', 'source of xml files: new|archive']
        parameters = Parameters(parameter_list)
        if parameters.check_parameters(sys.argv):
            script, xml_source = sys.argv
            # default to 'new' for any unrecognized source
            if xml_source not in ('new', 'archive'):
                xml_source = 'new'

            cisis = CISIS(cisis_path)
            files_set = BHL_Files_Set(inbox_path, archive_path)
            report = Report(report_path + '/_bhl_db.log',
                            report_path + '/_bhl_db.err',
                            report_path + '/_bhl_db.txt')

            proc = BHL_LILACS(cisis, files_set, report)
            proc.generate_db_files(xml_source)
            proc.generate_db(db_filename)
from utils.evaluation import evaluate_model
from utils.sampler import WarpSampler
from utils.checkpoint import save_model
from utils import utils

warnings.filterwarnings("ignore", category=DeprecationWarning)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if __name__ == '__main__':
    args = parse_args()
    print('Loading dataset...', end="")
    args.device = device
    args.date_time = datetime.datetime.now()
    dataset = EmbedDataset(args)
    params = Parameters(args, dataset)
    print("\rDataset Statistics:")
    print(f"  Users: {params.num_user} | Lists: {params.num_list} | Items: {params.num_item}")
    print(f"  Train: {params.num_train_instances} | Valid: {params.num_valid_instances} | Test: {params.num_test_instances}")
    print(f"  Density: {100 * params.num_train_instances / (params.num_list * params.num_item):.4f} %")
    print("=" * 60)
    args.args_str = params.get_args_to_string()

    t1 = time()
    models = Models(params, device=device)
    model = models.get_model()
    model.to(device)
    save_model_path = os.path.join("./saved_models", params.dataset + ".pth.tar")

    criterion_li = torch.nn.BCELoss()
    optimizer_gnn = torch.optim.Adam(model.parameters(), lr=params.lr)
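    # --- hedged continuation sketch: only criterion_li and optimizer_gnn come
    # --- from the setup above; params.num_epochs, the batch layout, and the
    # --- model's forward signature are hypothetical placeholders.
    for epoch in range(params.num_epochs):
        model.train()
        for users, lists, items, labels in train_batches:  # hypothetical iterable
            preds = model(users.to(device), lists.to(device), items.to(device))
            loss = criterion_li(preds, labels.float().to(device))
            optimizer_gnn.zero_grad()
            loss.backward()
            optimizer_gnn.step()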
from utils.arguments import parse_args
from utils.parameters import Parameters
from utils.checkpoint import load_model
from utils.valid_test_error_seq import ValidTestErrorSEQ

warnings.filterwarnings("ignore", category=DeprecationWarning)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if __name__ == '__main__':
    args = parse_args()
    print('Loading dataset...', end="")
    args.device = device
    args.date_time = datetime.datetime.now()
    dataset = EmbedDataset(args)
    params = Parameters(args, dataset)
    print("\rDataset Statistics:")
    print(f"  Users: {params.num_user} | Lists: {params.num_list} | Items: {params.num_item}")
    print(f"  Train: {params.num_train_instances} | Valid: {params.num_valid_instances} | Test: {params.num_test_instances}")
    print(f"  Density: {100 * params.num_train_instances / (params.num_list * params.num_item):.4f} %")

    save_model_path = os.path.join("./saved_models", params.dataset + ".pth.tar")
"-c", "--config", dest='CONFIG_FILE', default=os.path.join(os.path.curdir, os.path.pardir, "config", "train_config.yaml"), # default=os.path.join(os.path.curdir, os.path.pardir, "config", "debug_config.yaml"), help="config file", ) parser.print_help() args = parser.parse_args() ############################################################################## #%% Read config file # Read config file params = Parameters(args.CONFIG_FILE) # Save parameters params.write_parameters() ############################################################################### #%% Build model # Init model model = models.models_dict[params.getp("MODEL_NAME")]( nb_channels=params.getp("NB_CHANNELS"), nb_classes=params.getp("NB_CLASSES"), nb_scales=params.getp("NB_SCALES")) # Puts the model on device (GPU) model = model.cuda(device=params.getp("DEVICE_ID"))
def get_params(test_config, parser_args=None):
    """Get params and save them to the root dir."""
    # Just to get the ROOT_DIR and save prm test_config
    prm = Parameters()
    prm.override(test_config)

    # get manual test parameters from config:
    if parser_args is not None:
        # overriding some parameters manually from the parser:
        prm.train.train_control.ROOT_DIR = parser_args.ROOT_DIR
        prm.train.train_control.TEST_DIR = parser_args.ROOT_DIR + '/test'
        prm.train.train_control.PREDICTION_DIR = parser_args.ROOT_DIR + '/prediction'
        prm.train.train_control.CHECKPOINT_DIR = parser_args.ROOT_DIR + '/checkpoint'
        prm.test.test_control.KNN_WEIGHTS = parser_args.KNN_WEIGHTS
        prm.test.test_control.KNN_NORM = parser_args.KNN_NORM
        prm.train.train_control.PCA_REDUCTION = (
            parser_args.PCA_REDUCTION == 'True')
        prm.train.train_control.PCA_EMBEDDING_DIMS = int(
            parser_args.PCA_EMBEDDING_DIMS)
        prm.test.test_control.KNN_NEIGHBORS = int(parser_args.KNN_NEIGHBORS)
        prm.test.test_control.DUMP_NET = (parser_args.DUMP_NET == 'True')
        prm.test.test_control.LOAD_FROM_DISK = (
            parser_args.LOAD_FROM_DISK == 'True')

    ROOT_DIR = prm.train.train_control.ROOT_DIR

    # get time stamp
    ts = get_timestamp()

    # get file paths
    parameter_file = os.path.join(ROOT_DIR, 'parameters.ini')
    test_parameter_file = os.path.join(ROOT_DIR, 'test_parameters_' + ts + '.ini')
    all_parameter_file = os.path.join(ROOT_DIR, 'all_parameters_' + ts + '.ini')
    log_file = os.path.join(ROOT_DIR, 'test_' + ts + '.log')

    logging = logging_config(log_file)
    logging.disable(logging.DEBUG)

    if not os.path.isfile(parameter_file):
        raise AssertionError('Can not find file: {}'.format(parameter_file))

    dir_name = os.path.dirname(test_parameter_file)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    prm.save(test_parameter_file)

    # Done saving test parameters. Now doing the integration:
    prm = Parameters()
    prm.override(parameter_file)
    prm.override(test_parameter_file)

    if parser_args is not None:
        # overriding some parameters manually from the parser:
        prm.train.train_control.ROOT_DIR = parser_args.ROOT_DIR
        prm.train.train_control.TEST_DIR = parser_args.ROOT_DIR + '/test'
        prm.train.train_control.PREDICTION_DIR = parser_args.ROOT_DIR + '/prediction'
        prm.train.train_control.CHECKPOINT_DIR = parser_args.ROOT_DIR + '/checkpoint'
        prm.test.test_control.KNN_WEIGHTS = parser_args.KNN_WEIGHTS
        prm.test.test_control.KNN_NORM = parser_args.KNN_NORM
        prm.train.train_control.PCA_REDUCTION = (
            parser_args.PCA_REDUCTION == 'True')
        prm.train.train_control.PCA_EMBEDDING_DIMS = int(
            parser_args.PCA_EMBEDDING_DIMS)
        prm.test.test_control.KNN_NEIGHBORS = int(parser_args.KNN_NEIGHBORS)
        prm.test.test_control.DUMP_NET = (parser_args.DUMP_NET == 'True')
        prm.test.test_control.LOAD_FROM_DISK = (
            parser_args.LOAD_FROM_DISK == 'True')

    dir_name = os.path.dirname(all_parameter_file)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    prm.save(all_parameter_file)

    return prm
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "-l",
    "--log_dir",
    dest='LOG_DIR',
    help="log directory of trained model (should contain config.yaml and subdir models/)",
)
parser.print_help()
args = parser.parse_args()

###############################################################################
#%% Read config file

# Read config file (loads config file of trained model)
training_params = Parameters(os.path.join(args.LOG_DIR, "config.yaml"))

###############################################################################

ABSCISSE_COORDINATE = 0  # epoch
# ABSCISSE_COORDINATE = 1  # cpt_backward_pass
# ABSCISSE_COORDINATE = 5  # time

###############################################################################

nb_classes = training_params.getp("NB_CLASSES")
LOG_DIR = training_params.getp("LOG_DIR")
log_files = glob.glob(os.path.join(LOG_DIR, "*.txt"))

# Training and Validation Loss and Accuracy
print("\n 1st figure: Loss and Accuracy")
help="Indices to words dictionary") parser.add_argument('--gpu', default='', help="Specify GPU number if use GPU") parser.add_argument( '--c_v_generator', default=None, help="If use cluster vectors, specify tensorflow api model" "For more information look README") parser.add_argument('--gen_method', default='greedy', help='greedy, beam_search or sample') parser.add_argument('--params_path', default=None, help="specify params pickle file") parser.add_argument('--beam_size', default=2, help="If using beam_search, specify beam_size") args = parser.parse_args() # CUDA settings os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu # parameter of the model params = Parameters() generator = Generator(checkpoint_path=args.checkpoint, params_path=args.params_path, vocab_path=args.vocab_path, gen_method=args.gen_method) caption = generator.generate_caption(args.img_path, args.beam_size) print(caption[0]['caption'])
)
parser.add_argument(
    "-l",
    "--log_dir",
    dest='LOG_DIR',
    help="log directory of trained model (should contain config.yaml and subdir models/)",
)
parser.print_help()
args = parser.parse_args()

##############################################################################
#%% Read config file

# Read config file (loads config file of trained model)
params = Parameters(os.path.join(args.LOG_DIR, "config.yaml"))
params.update_parameters(args.CONFIG_FILE)

# Save parameters
params.write_parameters()

###############################################################################
#%% Load model

print()

# Init model
model = torch.load(
    os.path.join(params.getp("MODEL_DIR"),
                 "best_train_model_checkpoint_fold_00_sample_000.tar"))
[ path + f"data/train_word_tensor_{args.embeddings_name}.npy", path + f"data/valid_word_tensor_{args.embeddings_name}.npy", ], [ path + f"data/train_character_tensor_{args.embeddings_name}.npy", path + f"data/valid_character_tensor_{args.embeddings_name}.npy", ], ] batch_loader = BatchLoader(data_files, idx_files, tensor_files, path) parameters = Parameters( batch_loader.max_word_len, batch_loader.max_seq_len, batch_loader.words_vocab_size, batch_loader.chars_vocab_size, args.embeddings_name, args.res_model, args.hrvae, args.wae, ) """ =================== Doing the same for encoder-2 =============================================== """ data_files = [path + f"data/super/train_{args.data_name}_2.txt", path + f"data/super/test_{args.data_name}_2.txt"] idx_files = [ path + f"data/super/words_vocab_{args.embeddings_name}_2.pkl", path + f"data/super/characters_vocab_{args.embeddings_name}_2.pkl", ] tensor_files = [
from utils.split_preprocess_data import SplitPreprocessData

datasets = [
    "paysim", "paysim-custom", "ccfraud", "ieee", "nslkdd",
    "saperp-ek", "saperp-vk", "mnist", "cifar10"
]
methods = ["all", "ocan", "ocan-ae", "ae", "rbm", "vae", "dae", "cnn"]
baselines = ["both", "usv", "sv", "none"]

parser = Parser(datasets, methods, baselines)
dataset_string, verbosity, seed, method, baseline, iteration_count, \
    use_oversampling, cross_validation_count = parser.get_args()

# Set parameters
parameter_class = Parameters(dataset_string)
usv_train, sv_train, sv_train_fraud, test_benign, test_fraud = \
    parameter_class.get_main_parameters(cross_validation_count)
x_ben, x_fraud, preprocess_class = \
    LoadData(dataset_string, parameter_class.get_path(), seed,
             parameter_class, verbosity).get_data()

# Initialize collections for evaluation results
prec_coll = list()
reca_coll = list()
f1_coll = list()
acc_coll = list()
pr_auc_coll = list()
roc_auc_coll = list()
method_list = list()
parser = argparse.ArgumentParser(description='Sampler')
parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA',
                    help='use cuda (default: True)')
parser.add_argument('--num-sample', type=int, default=10, metavar='NS',
                    help='num samplings (default: 10)')
args = parser.parse_args()

batch_loader = BatchLoader('')
parameters = Parameters(batch_loader.max_word_len, batch_loader.max_seq_len,
                        batch_loader.words_vocab_size,
                        batch_loader.chars_vocab_size)

rvae = RVAE_dilated(parameters)
rvae.load_state_dict(t.load('trained_RVAE'))
if args.use_cuda:
    rvae = rvae.cuda()

for iteration in range(args.num_sample):
    # draw a latent code from the standard normal prior and decode it
    seed = np.random.normal(size=[1, parameters.latent_variable_size])
    result = rvae.sample(batch_loader, 50, seed, args.use_cuda)

    print(result)
    print()
                    help='dropout (default: 0.12)')
parser.add_argument('--aux', type=float, default=0.4, metavar='AUX',
                    help='aux loss coef (default: 0.4)')
parser.add_argument('--use-trained', type=bool, default=False, metavar='UT',
                    help='load pretrained model (default: False)')
args = parser.parse_args()

batch_loader = BatchLoader()
parameters = Parameters(batch_loader.vocab_size)

vae = VAE(parameters.vocab_size, parameters.embed_size,
          parameters.latent_size, parameters.decoder_rnn_size,
          parameters.decoder_rnn_num_layers)
if args.use_trained:
    vae.load_state_dict(t.load('trained_VAE'))
if args.use_cuda:
    vae = vae.cuda()

optimizer = Adam(vae.parameters(), args.learning_rate)

for iteration in range(args.num_iterations):
    # train step
    input, decoder_input, target = batch_loader.next_batch(
        args.batch_size, 'train', args.use_cuda)