def fit(self, data):
    """Train the model on ``data`` and then prepare it for sampling.

    Args:
        data(pandas.DataFrame): dataset to fit the model.

    Returns:
        None

    Raises:
        ValueError: if ``self.trainer`` is neither ``'GANTrainer'`` nor
            ``'SeparateGANTrainer'``.

    """
    # Fit the preprocessor and keep both it and its metadata on the instance.
    self.preprocessor = Preprocessor(continuous_columns=self.continuous_columns)
    data = self.preprocessor.fit_transform(data)
    self.metadata = self.preprocessor.metadata

    # Transformed data -> dataflow -> batches -> input queue.
    input_queue = QueueInput(
        BatchData(TGANDataFlow(data, self.metadata), self.batch_size))

    self.model = self.get_model(training=True)

    # Reject unknown trainer names before picking the trainer class.
    if self.trainer not in ('GANTrainer', 'SeparateGANTrainer'):
        raise ValueError(
            'Incorrect trainer name. Use GANTrainer or SeparateGANTrainer')
    if self.trainer == 'GANTrainer':
        trainer_cls = GANTrainer
    else:
        trainer_cls = SeparateGANTrainer
    trainer = trainer_cls(model=self.model, input_queue=input_queue)

    # Start from scratch unless a checkpoint file exists and restoring is
    # enabled; in that case continue from the epoch after the last one
    # recorded in stats.json.
    self.restore_path = os.path.join(self.model_dir, 'checkpoint')
    session_init = None
    starting_epoch = 1
    if os.path.isfile(self.restore_path) and self.restore_session:
        session_init = SaverRestore(self.restore_path)
        stats_path = os.path.join(self.log_dir, 'stats.json')
        with open(stats_path) as stats_file:
            starting_epoch = json.load(stats_file)[-1]['epoch_num'] + 1

    logger.set_logger_dir(
        self.log_dir, action='k' if self.restore_session else None)

    callbacks = []
    if self.save_checkpoints:
        callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

    trainer.train_with_defaults(
        callbacks=callbacks,
        steps_per_epoch=self.steps_per_epoch,
        max_epoch=self.max_epoch,
        session_init=session_init,
        starting_epoch=starting_epoch,
    )

    self.prepare_sampling()
def main(gpu, args):
    """Run training in one process of a distributed job.

    Joins the NCCL process group, loads the JSON config named by
    ``args.config``, seeds the Python, NumPy and PyTorch RNGs, builds the
    data loaders and the model, and hands everything to ``train``.

    Args:
        gpu: Index of the CUDA device this process uses; also the offset
            added to ``args.nr * args.gpus`` to obtain the process rank.
        args: Namespace providing ``nr``, ``gpus``, ``nodes``,
            ``world_size`` and ``config``.
    """
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=args.world_size,
        rank=rank,
    )

    with open(args.config) as config_file:
        params = json.load(config_file)

    # Merge the launch topology into the loaded configuration.
    params.update(
        nodes=args.nodes,
        gpus=args.gpus,
        nr=args.nr,
        world_size=args.world_size,
        rank=rank,
    )

    # Seed every RNG with the same configured value.
    seed = params['seed']
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    preproc = Preprocessor(sp_model=params['sp_model'])
    params['vocab_size'] = preproc.vocab_size

    logger = Logger(params['logfile'])

    logger.write('Loading data...')
    loaders = prepare_dataset(
        dataset_dir=params['dataset_dir'],
        train_sets=params['train_sets'],
        eval_sets=params['eval_sets'],
        batch_size=params['batch_size'],
        preproc=preproc,
        world_size=args.world_size,
        rank=rank,
    )
    train_dataloader, eval_dataloader = loaders

    logger.write('Model initialization...')
    model, num_params = make_model(params)
    logger.write(f'Total number of parameters: {num_params}')

    # Bind this process to its device before moving the model onto it.
    torch.cuda.set_device(gpu)
    model.cuda()

    optimizer = torch.optim.Adadelta(
        model.parameters(),
        lr=params['lr'],
        eps=1e-8,
        rho=0.95,
        weight_decay=params['weight_decay'],
    )

    # Mixed-precision setup first, then the distributed wrapper.
    model, optimizer = amp.initialize(
        min_loss_scale=1.0,
        models=model,
        optimizers=optimizer,
        opt_level=params['opt_level'],
    )
    model = DDP(model)

    train(
        data=[train_dataloader, eval_dataloader, preproc],
        model=model,
        optimizer=optimizer,
        logger=logger,
        params=params,
    )
def single_experiment(args, i, logger):
    """Run experiment number ``i`` and return the server's global model.

    Seeds the NumPy, Python and PyTorch RNGs with ``int(args.seed + i)``,
    switches cuDNN to deterministic mode, then increments ``args.seed`` in
    place before building the server and the data.

    Args:
        args: Experiment configuration; must provide ``dataset`` and
            ``seed``. ``args.seed`` is mutated (incremented by one).
        i: Index of the experiment, used for seeding, for the banner and as
            ``exp_id`` for ``server.train``.
        logger: Object passed to ``Server``; its ``save_data()`` is called
            once the experiment has finished.

    Returns:
        Whatever ``server.get_global_model()`` returns.
    """
    banner = f'\033[91m======================{args.dataset} exp: {i}====================\033[00m'
    print(banner)

    # Seed everything identically, for reproducibility.
    exp_seed = int(args.seed + i)
    np.random.seed(exp_seed)
    random.seed(exp_seed)
    torch.manual_seed(exp_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    args.seed += 1

    server = Server(args, logger)  # the server and the local clients get created
    data = Preprocessor(args)  # where the data is loaded
    data.distribute_data(server)  # hand the data over to the server
    server.train(exp_id=i)  # run training on the server

    print("\033[91m=\033[00m" * 50 + "\n")
    ret_model = server.get_global_model()

    # Drop the large objects and release cached GPU memory before returning.
    del data
    del server
    gc.collect()
    torch.cuda.empty_cache()

    logger.save_data()
    return ret_model
def train(args):
    """Build the preprocessor, data loaders, model and trainer, then fit.

    Args:
        args: Namespace providing ``seed``, ``cuda``, ``train_file``,
            ``eval_file``, ``embed_file``, ``cache_dir``, ``batch_size``,
            ``learning_rate``, ``max_steps``, ``eval_interval`` and
            ``save_dir``.
    """
    if args.seed is not None:
        utils.random.seed_everything(args.seed)

    device = torch.device("cuda" if args.cuda else "cpu")

    # Vocabulary (and optional pretrained embeddings) come from the train file.
    preprocessor = Preprocessor()
    preprocessor.build_vocab(args.train_file, cache_dir=args.cache_dir)
    if args.embed_file:
        preprocessor.load_embeddings(args.embed_file, cache_dir=args.cache_dir)

    def _make_loader(path, shuffle):
        # Both loaders share every setting except the file and the shuffling.
        return create_dataloader(
            path,
            preprocessor=preprocessor,
            batch_size=args.batch_size,
            device=device,
            cache_dir=args.cache_dir,
            shuffle=shuffle,
        )

    train_dataloader = _make_loader(args.train_file, True)
    eval_dataloader = _make_loader(args.eval_file, False) if args.eval_file else None

    vocabs = preprocessor.vocabs
    model = build_model(
        word_vocab_size=len(vocabs["word"]),
        pretrained_word_vocab_size=len(vocabs["pretrained_word"]),
        postag_vocab_size=len(vocabs["postag"]),
        pretrained_word_embeddings=preprocessor.pretrained_word_embeddings,
        n_deprels=len(vocabs["deprel"]),
    )
    model.to(device)

    trainer = create_trainer(
        model,
        lr=args.learning_rate,
        max_steps=args.max_steps,
        eval_interval=args.eval_interval,
    )
    trainer.add_callback(utils.training.PrintCallback(printer=logger.info))

    if eval_dataloader:
        # Inverted deprel vocabulary mapping, handed to the evaluation callback.
        deprel_map = dict(
            (index, label)
            for label, index in preprocessor.vocabs["deprel"].mapping.items()
        )
        trainer.add_callback(EvaluateCallback(args.eval_file, deprel_map), priority=0)

    if args.save_dir:
        preprocessor_path = os.path.join(args.save_dir, "preprocessor.pt")
        torch.save(preprocessor, preprocessor_path)
        save_callback = utils.training.SaveCallback(
            args.save_dir, monitor="eval/UAS", mode="max"
        )
        trainer.add_callback(save_callback)

    with logging_redirect_tqdm(loggers=[logger]):
        trainer.fit(train_dataloader, eval_dataloader)
class TGANModel:
    """Main model from TGAN.

    Args:
        continuous_columns (list[int]): 0-index list of column indices to be considered continuous.
        sensitive_column: Value forwarded unchanged to ``GraphBuilder`` as ``sensitive_column``.
        output (str, optional): Path to store the model and its artifacts. Defaults to :attr:`output`.
        gpu (str, optional): Comma separated list of GPU(s) to use; when truthy it is written to
            the ``CUDA_VISIBLE_DEVICES`` environment variable. Defaults to :attr:`None`.
        max_epoch (int, optional): Number of epochs to use during training. Defaults to :attr:`5`.
        steps_per_epoch (int, optional): Number of steps to run on each epoch. Defaults to :attr:`10000`.
        save_checkpoints(bool, optional): Whether or not to store checkpoints of the model after each
            training epoch. Defaults to :attr:`True`
        restore_session(bool, optional): Whether or not continue training from the last checkpoint.
            Defaults to :attr:`True`.
        batch_size (int, optional): Size of the batch to feed the model at each step. Defaults to
            :attr:`200`.
        z_dim (int, optional): Number of dimensions in the noise input for the generator.
            Defaults to :attr:`200`.
        noise (float, optional): Upper bound to the gaussian noise added to categorical columns.
            Defaults to :attr:`0.2`.
        l2norm (float, optional): L2 regularization coefficient when computing losses.
            Defaults to :attr:`0.00001`.
        discrim_learning_rate (float, optional): Forwarded to ``GraphBuilder`` as
            ``discrim_learning_rate``. Defaults to :attr:`0.001`.
        fair_learning_rate (float, optional): Forwarded to ``GraphBuilder`` as
            ``fair_learning_rate``. Defaults to :attr:`0.0002`.
        num_gen_rnn (int, optional): Defaults to :attr:`100`.
        num_gen_feature (int, optional): Number of features of in the generator.
            Defaults to :attr:`100`
        num_dis_layers (int, optional): Defaults to :attr:`1`.
        num_dis_hidden (int, optional): Defaults to :attr:`100`.
        optimizer (str, optional): Name of the optimizer to use during `fit`,possible values are:
            [`GradientDescentOptimizer`, `AdamOptimizer`, `AdadeltaOptimizer`].
            Defaults to :attr:`AdamOptimizer`.
        trainer (str, optional): Name of the trainer used by `fit`, either `GANTrainer` or
            `SeparateGANTrainer`. Defaults to :attr:`GANTrainer`.

    """

    def __init__(self, continuous_columns, sensitive_column, output='output', gpu=None,
                 max_epoch=5, steps_per_epoch=10000, save_checkpoints=True,
                 restore_session=True, batch_size=200, z_dim=200, noise=0.2,
                 l2norm=0.00001, discrim_learning_rate=0.001, fair_learning_rate=0.0002,
                 num_gen_rnn=100, num_gen_feature=100, num_dis_layers=1,
                 num_dis_hidden=100, optimizer='AdamOptimizer', trainer='GANTrainer'):
        """Initialize object."""
        # Output
        self.continuous_columns = continuous_columns
        self.sensitive_column = sensitive_column
        self.log_dir = os.path.join(output, 'logs')
        self.model_dir = os.path.join(output, 'model')
        self.output = output

        # Training params
        self.max_epoch = max_epoch
        self.steps_per_epoch = steps_per_epoch
        self.save_checkpoints = save_checkpoints
        self.restore_session = restore_session

        # Model params
        self.model = None
        self.batch_size = batch_size
        self.z_dim = z_dim
        self.noise = noise
        self.l2norm = l2norm
        self.discrim_learning_rate = discrim_learning_rate
        self.fair_learning_rate = fair_learning_rate
        self.num_gen_rnn = num_gen_rnn
        self.num_gen_feature = num_gen_feature
        self.num_dis_layers = num_dis_layers
        self.num_dis_hidden = num_dis_hidden
        self.optimizer = optimizer
        self.trainer = trainer

        if gpu:
            os.environ['CUDA_VISIBLE_DEVICES'] = gpu

        self.gpu = gpu

    def get_model(self, training=True):
        """Return a new instance of the model.

        Reads ``self.metadata``, which is only assigned by :meth:`fit`.
        """
        return GraphBuilder(
            metadata=self.metadata,
            sensitive_column=self.sensitive_column,
            batch_size=self.batch_size,
            z_dim=self.z_dim,
            noise=self.noise,
            l2norm=self.l2norm,
            discrim_learning_rate=self.discrim_learning_rate,
            fair_learning_rate=self.fair_learning_rate,
            num_gen_rnn=self.num_gen_rnn,
            num_gen_feature=self.num_gen_feature,
            num_dis_layers=self.num_dis_layers,
            num_dis_hidden=self.num_dis_hidden,
            optimizer=self.optimizer,
            training=training
        )

    def prepare_sampling(self):
        """Prepare model for generate samples.

        Builds ``self.simple_dataset_predictor`` from the checkpoint at
        ``self.restore_path`` (assigned by :meth:`fit`).
        """
        if self.model is None:
            self.model = self.get_model(training=False)

        else:
            self.model.training = False

        predict_config = PredictConfig(
            session_init=SaverRestore(self.restore_path),
            model=self.model,
            input_names=['z'],
            output_names=['gen/gen', 'z'],
        )

        self.simple_dataset_predictor = SimpleDatasetPredictor(
            predict_config,
            RandomZData((self.batch_size, self.z_dim))
        )

    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data(pandas.DataFrame): dataset to fit the model.

        Returns:
            None

        Raises:
            ValueError: if ``self.trainer`` is not a supported trainer name.

        """
        self.preprocessor = Preprocessor(
            continuous_columns=self.continuous_columns)
        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata
        dataflow = TGANDataFlow(data, self.metadata)
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        if self.trainer == 'GANTrainer':
            trainer = GANTrainer(model=self.model, input_queue=input_queue)
        elif self.trainer == 'SeparateGANTrainer':
            trainer = SeparateGANTrainer(model=self.model, input_queue=input_queue)
        else:
            raise ValueError(
                'Incorrect trainer name. Use GANTrainer or SeparateGANTrainer')

        self.restore_path = os.path.join(self.model_dir, 'checkpoint')

        if os.path.isfile(self.restore_path) and self.restore_session:
            # Resume: continue from the epoch after the last one logged.
            session_init = SaverRestore(self.restore_path)
            with open(os.path.join(self.log_dir, 'stats.json')) as f:
                starting_epoch = json.load(f)[-1]['epoch_num'] + 1

        else:
            session_init = None
            starting_epoch = 1

        action = 'k' if self.restore_session else None
        logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

        trainer.train_with_defaults(
            callbacks=callbacks,
            steps_per_epoch=self.steps_per_epoch,
            max_epoch=self.max_epoch,
            session_init=session_init,
            starting_epoch=starting_epoch
        )

        self.prepare_sampling()

    def sample(self, num_samples):
        """Generate samples from model.

        Args:
            num_samples(int): Number of rows to generate.

        Returns:
            The first ``num_samples`` rows of
            ``self.preprocessor.reverse_transform(...)``, as a copy.

        Raises:
            ValueError: if ``num_samples`` is negative, or if a column in
                ``self.metadata['details']`` has an unknown ``type``.

        """
        if num_samples < 0:
            raise ValueError(
                'num_samples must be non-negative. Instead it was {}.'.format(num_samples))

        # Round the batch count UP so that a request which is not a multiple
        # of the batch size still yields enough rows, and always pull at least
        # one batch so the loop below is guaranteed to hit its break.
        # Assumes every predictor batch holds ``batch_size`` rows - TODO confirm.
        max_iters = max(1, -(-num_samples // self.batch_size))

        results = []
        for idx, o in enumerate(self.simple_dataset_predictor.get_result()):
            results.append(o[0])
            if idx + 1 == max_iters:
                break

        results = np.concatenate(results, axis=0)

        # Walk the generated matrix column by column: a category column takes
        # one slot, a value column takes one slot plus ``n`` component slots.
        ptr = 0
        features = {}
        for col_id, col_info in enumerate(self.metadata['details']):
            if col_info['type'] == 'category':
                features['f%02d' % col_id] = results[:, ptr:ptr + 1]
                ptr += 1

            elif col_info['type'] == 'value':
                gaussian_components = col_info['n']
                val = results[:, ptr:ptr + 1]
                ptr += 1
                pro = results[:, ptr:ptr + gaussian_components]
                ptr += gaussian_components
                features['f%02d' % col_id] = np.concatenate([val, pro], axis=1)

            else:
                raise ValueError(
                    "self.metadata['details'][{}]['type'] must be either `category` or "
                    "`value`. Instead it was {}.".format(col_id, col_info['type'])
                )

        return self.preprocessor.reverse_transform(features)[:num_samples].copy()

    def tar_folder(self, tar_name):
        """Generate a tar of :self.output:."""
        # The context manager closes the archive; no explicit close() needed.
        with tarfile.open(tar_name, 'w:gz') as tar_handle:
            for root, dirs, files in os.walk(self.output):
                for file_ in files:
                    tar_handle.add(os.path.join(root, file_))

    @classmethod
    def load(cls, path):
        """Load a pretrained model from a given path.

        The archive is extracted into the current working directory.
        """
        # NOTE(review): extractall() and pickle.load() are both unsafe on
        # untrusted archives (path traversal / arbitrary code execution).
        # Only load model archives from a trusted source.
        with tarfile.open(path, 'r:gz') as tar_handle:
            destination_dir = os.path.dirname(tar_handle.getmembers()[0].name)
            tar_handle.extractall()

        with open('{}/TGANModel'.format(destination_dir), 'rb') as f:
            instance = pickle.load(f)

        instance.prepare_sampling()
        return instance

    def save(self, path, force=False):
        """Save the fitted model in the given path.

        Args:
            path (str): Destination of the ``.tar.gz`` archive.
            force (bool, optional): Overwrite ``path`` if it already exists.
                Defaults to :attr:`False`.

        """
        if os.path.exists(path) and not force:
            logger.info('The indicated path already exists. Use `force=True` to overwrite.')
            return

        # ``path`` may be a bare file name, in which case dirname is '' and
        # there is no directory to create.
        base_path = os.path.dirname(path)
        if base_path and not os.path.exists(base_path):
            os.makedirs(base_path)

        # The model and the predictor are detached while pickling and put
        # back afterwards, even if pickling fails.
        model = self.model
        dataset_predictor = self.simple_dataset_predictor

        self.model = None
        self.simple_dataset_predictor = None

        try:
            with open('{}/TGANModel'.format(self.output), 'wb') as f:
                pickle.dump(self, f)
        finally:
            self.model = model
            self.simple_dataset_predictor = dataset_predictor

        self.tar_folder(path)

        logger.info('Model saved successfully.')
def preprocess_SNLI_data(inputdir,
                         embeddings_file,
                         targetdir,
                         lowercase=False,
                         ignore_punctuation=False,
                         num_words=None,
                         stopwords=None,
                         labeldict=None,
                         bos=None,
                         eos=None):
    """
    Preprocess the data from the SNLI corpus so it can be used by the
    ESIM model.
    Compute a worddict from the train set, and transform the words in
    the sentences of the corpus to their indices, as well as the labels.
    Build an embedding matrix from pretrained word vectors.
    The preprocessed data is saved in pickled form in some target directory.

    Args:
        inputdir: The path to the directory containing the NLI corpus.
        embeddings_file: The path to the file containing the pretrained
            word vectors that must be used to build the embedding matrix.
        targetdir: The path to the directory where the preprocessed data
            must be saved.
        lowercase: Boolean value indicating whether to lowercase the premises
            and hypotheses in the input data. Defaults to False.
        ignore_punctuation: Boolean value indicating whether to remove
            punctuation from the input data. Defaults to False.
        num_words: Integer value indicating the size of the vocabulary to use
            for the word embeddings. If set to None, all words are kept.
            Defaults to None.
        stopwords: A list of words that must be ignored when preprocessing
            the data. Defaults to None, which means an empty list.
        labeldict: Forwarded to the Preprocessor as `labeldict`
            (presumably a mapping from label names to indices; verify
            against Preprocessor). Defaults to None, which means an empty
            dict.
        bos: A string indicating the symbol to use for beginning of sentence
            tokens. If set to None, bos tokens aren't used. Defaults to None.
        eos: A string indicating the symbol to use for end of sentence tokens.
            If set to None, eos tokens aren't used. Defaults to None.
    """
    # Fresh containers per call: a mutable default object would be shared
    # between calls and could leak state from one run into the next.
    stopwords = [] if stopwords is None else stopwords
    labeldict = {} if labeldict is None else labeldict

    os.makedirs(targetdir, exist_ok=True)

    # Retrieve the train, dev and test data files from the dataset directory.
    train_file = ""
    dev_file = ""
    test_file = ""
    for filename in os.listdir(inputdir):
        if fnmatch.fnmatch(filename, "*_train.txt"):
            train_file = filename
        elif fnmatch.fnmatch(filename, "*_dev.txt"):
            dev_file = filename
        elif fnmatch.fnmatch(filename, "*_test.txt"):
            test_file = filename

    # -------------------- Train data preprocessing -------------------- #
    preprocessor = Preprocessor(lowercase=lowercase,
                                ignore_punctuation=ignore_punctuation,
                                num_words=num_words,
                                stopwords=stopwords,
                                labeldict=labeldict,
                                bos=bos,
                                eos=eos)

    print(20*"=", " Preprocessing train set ", 20*"=")
    print("\t* Reading data...")
    data = preprocessor.read_data(os.path.join(inputdir, train_file))

    print("\t* Computing worddict and saving it...")
    preprocessor.build_worddict(data)
    with open(os.path.join(targetdir, "worddict.pkl"), "wb") as pkl_file:
        pickle.dump(preprocessor.worddict, pkl_file)

    print("\t* Transforming words in premises and hypotheses to indices...")
    transformed_data = preprocessor.transform_to_indices(data)
    print("\t* Saving result...")
    with open(os.path.join(targetdir, "train_data.pkl"), "wb") as pkl_file:
        pickle.dump(transformed_data, pkl_file)

    # ---------------- Validation and test data preprocessing ---------------- #
    # Both splits reuse the worddict built on the train set and go through
    # exactly the same read / transform / pickle steps.
    for split_name, split_file in (("dev", dev_file), ("test", test_file)):
        print(20*"=", " Preprocessing {} set ".format(split_name), 20*"=")
        print("\t* Reading data...")
        data = preprocessor.read_data(os.path.join(inputdir, split_file))

        print("\t* Transforming words in premises and hypotheses to indices...")
        transformed_data = preprocessor.transform_to_indices(data)
        print("\t* Saving result...")
        target_path = os.path.join(targetdir, "{}_data.pkl".format(split_name))
        with open(target_path, "wb") as pkl_file:
            pickle.dump(transformed_data, pkl_file)

    # -------------------- Embeddings preprocessing -------------------- #
    print(20*"=", " Preprocessing embeddings ", 20*"=")
    print("\t* Building embedding matrix and saving it...")
    embed_matrix = preprocessor.build_embedding_matrix(embeddings_file)
    with open(os.path.join(targetdir, "embeddings.pkl"), "wb") as pkl_file:
        pickle.dump(embed_matrix, pkl_file)
from data import Data, Preprocessor
import sys

input_file = './data/chat-200w.txt'
output_file = './data/combine.txt'
stop_words_file = './data/stop_words.txt'
num = None

if __name__ == '__main__':
    # Load the raw chat log, split it into conversations and preprocess them.
    data_loader = Data(input_file, num)
    data_loader.seperate_conversation()
    preprocessor = Preprocessor(stop_words_file)
    data_loader.preprocess(preprocessor, format=3)
    data = data_loader.conversation_QAQAQ_preprocessed

    # Write directly to the output file instead of rebinding sys.stdout:
    # the rebinding was never undone and left stdout pointing at a closed
    # file. The context manager also closes the file if writing fails.
    with open(output_file, 'w') as f:
        for i, conv in enumerate(data):
            print('Conversation %d:' % i, file=f)
            for sen in conv:
                # A sentence with no content left is written as 'EMOJI'.
                if not sen[1]:
                    sen[1] = ['EMOJI']
                print(str(sen[0]) + ' ' + ''.join(sen[1]), file=f)
            # Blank line between conversations.
            print(file=f)