def main(_):
    """
    Start either train or eval. Note hardcoded parts of path for training and eval data
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "vocabulary.txt"))

    if FLAGS.mode == "train":
        # hps.batch_size = 256
        dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "train.txt"))
        run_train(dataset, hps, os.path.join(FLAGS.logdir, "train"),
                  ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval"):
        data_dir = os.path.join(FLAGS.datadir, "eval.txt")
        # predict_model = prediction.Model('/dir/ckpt', os.path.join(FLAGS.datadir, "vocabulary.txt"), hps)
        dataset = Dataset(vocab, data_dir, deterministic=True)
        prefix_words = "<brk>".split()
        predict_model = predict.Model(hps, FLAGS.logdir, FLAGS.datadir)
        print('start input')
        out = predict_model.predictnextkwords(prefix_words, FLAGS.num_sen)
        for row in out:
            print(' '.join(row) + "\n")
        print("len_out: " + str(len(out)))
def experiment_harness(args):
    experimentLog = {}
    experimentCount = 0

    # TODO: add ranges to argparser
    paramRanges = {
        0: ['max_depth', 3, 20, 'int'],
        1: ['learning_rate', .001, 1, 'float'],
        2: ['gamma', 0, 2, 'float']
    }

    client = cluster = None

    # create logfile and write headers
    logFilename = 'results.csv'
    if not os.path.isfile(logFilename):
        with open(logFilename, mode='w+') as outputCSV:
            outputCSV.write("elapsedTime,nSamples,asyncMode,nGPUs,nParticles,nEpochs,globalBestAccuracy,globalBest_max_depth,globalBest_learning_rate,globalBest_gamma,globalBest_nTrees,datasetName\n")

    for iDataSamples in args.num_rows:
        # generate or load data directly to the GPU
        if args.dataset == 'synthetic':
            dataset = Dataset('synthetic', iDataSamples)
        if args.dataset == 'airline':
            dataset = Dataset('airline', iDataSamples)
        if args.dataset == 'fashion-mnist':
            dataset = Dataset('fashion-mnist', iDataSamples)

        for iGPUs in args.num_gpus:
            for iParticles in args.num_particles:
                for iEpochs in args.num_epochs:
                    client, cluster = launch_dask(iGPUs)

                    if args.async_flag:
                        s = swarm.AsyncSwarm(client, dataset, paramRanges=paramRanges,
                                             nParticles=iParticles, nEpochs=iEpochs)
                    else:
                        s = swarm.SyncSwarm(client, dataset, paramRanges=paramRanges,
                                            nParticles=iParticles, nEpochs=iEpochs)

                    startTime = time.time()
                    s.run_search()
                    elapsedTime = time.time() - startTime

                    # TODO: remove fake nTrees
                    s.globalBest['nTrees'] = 9999

                    stringToOutput = f"{elapsedTime},{iDataSamples},{args.async_flag},{iGPUs},{iParticles},{iEpochs},"
                    stringToOutput += f"{s.globalBest['accuracy']},{s.globalBest['params'][0]},{s.globalBest['params'][1]},{s.globalBest['params'][2]},"
                    stringToOutput += f"{s.globalBest['nTrees']},{args.dataset}\n"
                    print(stringToOutput)

                    with open(logFilename, mode='a') as outputCSV:
                        outputCSV.write(stringToOutput)

                    print('closing dask cluster in between experiment runs [ sleeping for 5 seconds ]')
                    client.close()
                    cluster.close()
                    time.sleep(5)
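# A minimal follow-up sketch (not part of the harness above) showing how the
# results.csv it writes could be inspected after a set of runs. Assumptions:
# pandas is available and the file uses exactly the header written by
# experiment_harness; the grouping choice here is only illustrative.
import pandas as pd

results = pd.read_csv('results.csv')
# compare average runtime and best accuracy across sync/async runs and GPU counts
summary = results.groupby(['asyncMode', 'nGPUs'])[['elapsedTime', 'globalBestAccuracy']].mean()
print(summary)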
def main():
    # configuration
    config = Config()
    config.parse_arg(FLAGS)
    config.setup_path()
    config.print_arg()

    # dataset
    if config.dataset == 'wikibio':
        dset = DatasetTable2text(config)
        dset.load()
        config.key_size = len(dset.key2id)
    else:
        dset = Dataset(config)
        dset.build()
    config.vocab_size = len(dset.word2id)
    config.dec_start_id = dset.word2id["_GOO"]
    config.dec_end_id = dset.word2id["_EOS"]
    config.pad_id = dset.pad_id
    config.stop_words = dset.stop_words
    config.id2wordemb = dset.id2wordemb

    # model
    if config.model_name == "transformer_bow":
        Model = TransformerBow
    elif config.model_name == "seq2seq":
        if config.dataset == 'wikibio':
            Model = Seq2seqData2text
        else:
            Model = Seq2seq
    elif config.model_name == "bow_seq2seq":
        Model = BowSeq2seq
    elif config.model_name == "vae":
        Model = Vae
    elif config.model_name == "hierarchical_vae":
        Model = Hierarchical_Vae
    elif config.model_name == "latent_bow":
        if config.dataset == 'wikibio':
            Model = LatentBowData2text
        else:
            Model = LatentBow
    elif config.model_name == "lm":
        Model = LM
    else:
        msg = ("the model name should be in ['transformer_bow', 'seq2seq', 'bow_seq2seq', "
               "'vae', 'hierarchical_vae', 'latent_bow', 'lm'], ")
        msg += "current name: %s" % config.model_name
        raise Exception(msg)

    model = Model(config)
    with tf.variable_scope(config.model_name):
        model.build()

    # controller
    controller = Controller(config)
    if config.model_name != "lm":
        if "lm" in controller.eval_metrics_list:
            controller.build_lm(LM, config)
    controller.train(model, dset)
    return
def main(_):
    """
    Start either train or eval. Note hardcoded parts of path for training and eval data
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')
    print_debug('our training DataSetDir=%s , LogDir=%s' % (FLAGS.datadir, FLAGS.logdir))

    # vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
    vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "vocabulary.txt"))

    FLAGS.mode = "train"
    for i in range(10):
        print("Iteration ", i, " phase: ", FLAGS.mode)
        if FLAGS.mode == "train":
            # hps.batch_size = 256
            # dataset = Dataset(vocab, os.path.join(FLAGS.datadir,
            #                   "training-monolingual.tokenized.shuffled/*"))
            dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "ptb.train.txt"))
            trainlogdir = FLAGS.logdir + str("/") + "train"
            # (FLAGS.logdir + str("\\") + "train")  # os.path.join(FLAGS.logdir, "train")
            print_debug('train log dir=%s' % (trainlogdir))
            run_train(dataset, hps, trainlogdir, ps_device="/gpu:0")
            print_debug('Finished run_train !!!!!!!!!!!')
        elif FLAGS.mode.startswith("eval"):
            print_debug('eval mode')
            # if FLAGS.mode.startswith("eval_train"):
            #     data_dir = os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")
            # elif FLAGS.mode.startswith("eval_full"):
            #     data_dir = os.path.join(FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/*")
            # else:
            #     data_dir = os.path.join(FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
            dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "ptb.test.txt"),
                              deterministic=True)
            run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
            print_debug('Finished run_eval !!!!!!!!!!!')

        if FLAGS.mode == "train":
            FLAGS.mode = "eval_full"
        else:
            FLAGS.mode = "train"
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # clean data
    train_filepath, dev_filepath_a = write_clear_data(
        config.train_filename,
        build_dev=config.build_dev_from_trainset,
        dev_ratio=config.dev_ratio)
    test_filepath, dev_filepath_b = write_clear_data(
        config.test_filename,
        build_dev=config.build_dev_from_testset,
        dev_ratio=config.dev_ratio)
    dev_filepath = dev_filepath_a or dev_filepath_b

    # Generators
    dev = Dataset(dev_filepath, processing_word)
    test = Dataset(test_filepath, processing_word)
    train = Dataset(train_filepath, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = Dataset(train_filepath)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def main(_):
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file("1b_word_vocab.txt")

    if FLAGS.mode == "train":
        hps.batch_size = 256
        dataset = Dataset(
            vocab, FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*")
        run_train(dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        data_dir = FLAGS.datadir
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps,
                 FLAGS.ckptpath)
def main(_):
    """
    Start either train or eval. Note hardcoded parts of path for training and eval data
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))

    if FLAGS.mode == "train":
        # hps.batch_size = 256
        dataset = Dataset(
            vocab,
            os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*"))
        run_train(dataset, hps, os.path.join(FLAGS.logdir, "train"),
                  ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        if FLAGS.mode.startswith("eval_train"):
            data_dir = os.path.join(
                FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")
        elif FLAGS.mode.startswith("eval_full"):
            data_dir = os.path.join(
                FLAGS.datadir,
                "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
        else:
            data_dir = os.path.join(
                FLAGS.datadir,
                "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
    elif FLAGS.mode.startswith("infer"):
        data_dir = os.path.join(
            FLAGS.datadir,
            "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_infer(dataset, hps, FLAGS.logdir, FLAGS.mode, vocab)
def test_dataset(self):
    vocab = Vocabulary.from_file("testdata/test_vocab.txt")
    dataset = Dataset(vocab, "testdata/*")

    def generator():
        for i in range(1, 10):
            yield [0] + list(range(1, i + 1)) + [0]

    counts = [0] * 10
    for seq in generator():
        for v in seq:
            counts[v] += 1

    counts2 = [0] * 10
    for x, y in dataset._iterate(generator(), 2, 4):
        for v in x.ravel():
            counts2[v] += 1

    for i in range(1, 10):
        self.assertEqual(
            counts[i], counts2[i],
            "Mismatch at i=%d. counts[i]=%s, counts2[i]=%s" %
            (i, counts[i], counts2[i]))
def main():
    config = Config()
    args = add_arguments(config)
    config.parse_arg(args)

    dset = Dataset(config)
    dset.build()
    # print('debug:')
    # print(dset.id2word[1])
    config.vocab_size = len(dset.word2id)

    # read the transferred sentences
    transfer_analysis = PivotTransferAnalysis(config)
    if config.model == 'cmu':
        transfer_analysis.pipeline_w_cmu(dset)
    else:
        transfer_analysis.pipeline(dset)
    return
def main(config, local):
    n_gpu = int(GPU_NUM)
    n_gpu = 1 if n_gpu == 0 else n_gpu
    np.random.seed(config.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.random_seed)

    # Create data instances
    vocab = Vocabulary(config.vocab_path)

    if config.mode == 'train':
        # Prepare train data loader
        train_dataset, val_dataset = Dataset(vocab), Dataset(vocab)
        train_path = os.path.join(config.data_dir, 'train_data/train_data')
        val_path = os.path.join(config.data_dir, 'train_data/val_data')
        train_dataset.create_instances(train_path, config.max_seq_length, type='train')
        val_dataset.create_instances(val_path, config.max_seq_length, type='val')
        train_loader = DataLoader(train_dataset,
                                  batch_size=config.batch_size * n_gpu,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=config.batch_size * n_gpu)
    else:
        train_loader, val_loader = None, None

    trainer = Trainer(config, n_gpu, vocab, train_loader, val_loader)

    if nsml.IS_ON_NSML:
        bind_model(trainer.model, vocab, config)
        if config.pause:
            nsml.paused(scope=local)

    if config.mode == 'train':
        trainer.train()
def __init__(self, opt):
    self.opt = opt

    if opt.dataset_file['val'] is None:
        fnames = [opt.dataset_file['train'], opt.dataset_file['test']]
    else:
        fnames = [
            opt.dataset_file['train'], opt.dataset_file['val'],
            opt.dataset_file['test']
        ]
    tokenizer = build_tokenizer(
        fnames,
        max_seq_len=opt.max_seq_len,
        dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=opt.embed_dim,
        dat_fname='{0}_{1}_embedding_matrix.dat'.format(
            str(opt.embed_dim), opt.dataset))
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

    self.trainset = Dataset(opt.dataset_file['train'], tokenizer,
                            dat_fname='{0}_train.dat'.format(opt.dataset))
    # self.weight_classes = torch.tensor(
    #     compute_class_weight('balanced',
    #                          np.unique([i['polarity'] for i in self.trainset.data]),
    #                          self.trainset[4]),
    #     dtype=torch.float).to(self.opt.device)
    # self.valset = ABSADataset(opt.dataset_file['val'], tokenizer)
    self.testset = Dataset(opt.dataset_file['test'], tokenizer,
                           dat_fname='{0}_test.dat'.format(opt.dataset))

    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = Dataset(opt.dataset_file['val'], tokenizer,
                              dat_fname='{0}_val.dat'.format(opt.dataset))

    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def inference(path, model, vocab, config, **kwargs):
    model.eval()

    test_dataset = Dataset(vocab)
    test_path = os.path.join(path, 'test_data')
    test_dataset.create_instances(test_path, config.max_seq_length, type='test')
    test_loader = DataLoader(test_dataset, batch_size=1)

    pred_results = []
    for step, batch in enumerate(test_loader):
        batch = tuple(t.to(device) for t in batch)
        batch = sort_batch(batch)
        input_ids, input_lengths, labels = batch
        outputs = model(input_ids)
        top_1_result = outputs['predicted_intents'][0].item()
        pred_results.append([step, top_1_result])

    return pred_results
def main(_):
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file(FLAGS.datadir + "/lm_vocab.txt", hps.vocab_size)

    if FLAGS.mode == "train":
        hps.batch_size = 256  # reset batch size
        dataset = Dataset(vocab, FLAGS.datadir + "/train/*")
        run_train(dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        if FLAGS.mode.startswith("eval_train"):
            data_dir = FLAGS.datadir + "/train/*"
        elif FLAGS.mode.startswith("eval_test"):
            data_dir = FLAGS.datadir + "/heldout/*"
        print("data_dir:", data_dir)
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
    elif FLAGS.mode.startswith("predict_next"):
        data_dir = "data/news.en.heldout-00001-of-00050"
        dataset = Dataset(vocab, data_dir)
        predict_next(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps, vocab)
def run_hpo(args):
    client, cluster = launch_dask(args.num_gpus, args.min_gpus, args.k8s,
                                  args.adapt, args.spec)

    # generate or load data directly to the GPU
    if args.dataset == 'synthetic':
        dataset = Dataset('synthetic', args.num_rows)
    if args.dataset == 'airline':
        dataset = Dataset('airline', args.num_rows)
    if args.dataset == 'fashion-mnist':
        dataset = Dataset('fashion-mnist', args.num_rows)

    # TODO: add ranges to argparser
    paramRanges = {
        0: ['max_depth', 3, 20, 'int'],
        1: ['learning_rate', .001, 1, 'float'],
        2: ['gamma', 0, 2, 'float']
    }

    if args.async_flag:
        s = swarm.AsyncSwarm(client, dataset, paramRanges=paramRanges,
                             nParticles=args.num_particles,
                             nEpochs=args.num_epochs)
    else:
        s = swarm.SyncSwarm(client, dataset, paramRanges=paramRanges,
                            nParticles=args.num_particles,
                            nEpochs=args.num_epochs)

    s.run_search()

    # Shut down K8S workers
    close_dask(cluster, args.k8s)
def main():
    assert os.path.exists(model_dir)
    assert os.path.exists(conf_path)
    assert os.path.exists(summary_dir)
    assert os.path.exists(FLAGS.data_prefix + '.train.txt') and \
        os.path.exists(FLAGS.data_prefix + '.valid.txt') and \
        os.path.exists(FLAGS.data_prefix + '.test.txt')
    assert FLAGS.mode in ['train', 'test']

    logger = logging.getLogger("lm_zh")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if log_path:
        file_handler = logging.FileHandler(log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'

    logger.info('Parse config file ...')
    config = default_config.parse(conf_path)
    logger.info('Running with config: {}'.format(config.items))
    if FLAGS.mode == 'test':
        config.batch_size *= 2

    logger.info('Build vocab and dataset ...')
    dataset = Dataset(FLAGS.data_prefix, config.num_steps, config.batch_size,
                      train=(FLAGS.mode == 'train'))

    print('Use algo:', config.algo)
    if FLAGS.mode == 'train':
        train(config, dataset, model_dir, summary_dir)
    elif FLAGS.mode == 'test':
        test(config, dataset, model_dir, summary_dir)
def main(_):
    hvd.init()

    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file(FLAGS.vocab)
    hps.vocab_size = vocab.num_tokens

    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

    if FLAGS.logdir is None:
        FLAGS.logdir = os.path.join('/tmp', 'lm-run-{}'.format(int(time.time())))
    print('logdir: {}'.format(FLAGS.logdir))

    hps.batch_size = 256
    dataset = Dataset(vocab, FLAGS.datadir)

    run_train(dataset, hps, FLAGS.logdir + '/train',
              ps_device='/gpu:' + str(hvd.local_rank()))
def build_data(Config):
    """
    Procedure to build data

    Args:
        Config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
    """
    # Generators
    train = Dataset(words_filename=Config.source_path,
                    tags_filename=Config.source_tgt_path)
    # test = Dataset(words_filename=Config.test_path,
    #                tags_filename=Config.test_tgt_path)

    # Build Word and Tag vocab
    # vocab_words, vocab_tags = get_vocabs([train, test])
    # vocab_words.add(UNK)

    # Save vocab
    # write_vocab(vocab_words, Config.words_vocab)
    # write_vocab(vocab_tags, Config.tags_vocab)
    vocab_build(train, Config.min_count, Config.words_vocab)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    ## General experimental parameters
    parser.add_argument('-exp', type=str, default='')  # which experiment to run
    parser.add_argument('-reddit_path', type=str,
                        default='data/posts.npy')  # path to reddit post data
    parser.add_argument('-val_interval', type=int,
                        default=1000)  # how often to evaluate models during training
    parser.add_argument('-size', type=str, default='med')  # maximum post history length
    parser.add_argument('-n_user', type=int, default=8000)  # number of users for experiment
    parser.add_argument('-no_try', action='store_true')  # whether or not to run code in a try-except

    ## arguments for training HBERT models
    parser.add_argument('-max_tokens_batch', type=int,
                        default=10000)  # how big batches we allow BERT to run (depends on GPU)
    parser.add_argument('-lr', type=float,
                        default=0.00001)  # learning rate for HBERT classification layers
    parser.add_argument('-bs', type=int, default=10)  # batch size for training
    parser.add_argument('-n_it', type=int, default=8000)  # number of iterations to train for
    parser.add_argument('-seq', action='store_true')
    parser.add_argument('-temp_file_path', type=str,
                        default='')  # path to directory for temp files. if '' this is not used
    parser.add_argument('-preembed_size', type=int, default=10)  # internal hidden size

    opt = parser.parse_args()

    # # # ################################### ################################### # # #
    """
    The first section loads the reddit user data, does some preprocessing,
    and carries out the train/val/test split
    """
    exp_name = 'experiment_' + str(opt.exp) + '_' + opt.size
    if opt.n_user != 8000:
        exp_name += '_nuser{}'.format(opt.n_user)
    exp_classes = experiment_dict[opt.exp]
    print(exp_classes)

    # create data if not done already
    if not os.path.isdir(exp_name):
        # '/projects/bdata/datasets_peter/dataset_3/posts.npy'
        Reddit_posts = np.load(opt.reddit_path, allow_pickle=True)[0]
        Reddit_posts = order_users(Reddit_posts)[:opt.n_user]

        try:
            opt.size = int(opt.size)
            Reddit_posts = [user[:opt.size] for user in Reddit_posts]
            opt.size = 'size' + str(opt.size)
        except:
            if opt.size == 'xsmall':
                Reddit_posts = [user[:50] for user in Reddit_posts]
            elif opt.size == 'test':
                Reddit_posts = [user[:2] for user in Reddit_posts]
            elif opt.size == 'min':
                Reddit_posts = [user[:10] for user in Reddit_posts]
            elif opt.size == 'small':
                Reddit_posts = [user[:100] for user in Reddit_posts]
            elif opt.size == 'med':
                Reddit_posts = [user[:200] for user in Reddit_posts]
            elif opt.size == 'big':
                pass
            else:
                assert False

        print(exp_classes)
        Users, Users_full_posts, T, Y, classes = process_users_synth(
            Reddit_posts,  # user_list,  # order_users(MH2SW_posts) + order_users(MH_posts),
            exp_classes,
            keep_class=True)

        os.mkdir(exp_name)
        np.save('{}/data.npy'.format(exp_name),
                [Users, Users_full_posts, T, Y, classes])

    Users, Users_full_posts, T, Y, classes = tuple(
        np.load('{}/data.npy'.format(exp_name), allow_pickle=True))

    # # # ################################### ################################### # # #
    """
    This section produces feature sets for the different models.
    Feature sets represent some featurization of user histories, e.g.
    X_chi and X_chi_counts use bag of words, with and without counts,
    X_HBERT largely leaves user histories as text,
    X_LDA processes X_chi_counts using LDA.
    Refer to paper for further details.
    """
    print('starting data loading...')
    s_time = time.time()
    X_chi = get_features_chi(Users_full_posts)
    X_chi_counts = get_features_chi(Users_full_posts, counts=True)
    print('time = {}'.format(time.time() - s_time))
    X_chi_uni = get_features_chi(Users_full_posts, include_bigrams=False)
    X_HBERT = get_features_HBERT(Users, tokenizer, pretokenize=True)

    print('fitting LDA...')
    n_topics = 20
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    X_LDA = lda.fit_transform(X_chi_counts)
    print('fit LDA')

    X_inds = list(range(len(Users)))
    dataset = Dataset(X_inds, T, Y, train_frac=0.4, val_frac=0.1)
    inds_train = [data[0] for data in dataset.train_epoch(true_set=True)]
    inds_val = [data[0] for data in dataset.valid_epoch()]
    inds_test = [data[0] for data in dataset.test_epoch()]
    np.save('{}/{}.npy'.format(exp_name, 'inds_dict'), {
        'inds_train': inds_train,
        'inds_val': inds_val,
        'inds_test': inds_test
    })
    print('{} train examples, {} val examples, {} test examples'.format(
        len(inds_train), len(inds_val), len(inds_test)))
    time.sleep(3)
    print('done data loading')

    # # # ################################### ################################### # # #
    """
    The next section defines model_dict data structures, which are used to
    organize training and evaluation of the models.
    First, model dicts are defined, then added to a list of models to run,
    model_dicts
    """
    ## instantiate model dicts
    model_dict_0 = {
        'X': X_chi,
        'model': LogReg_PT_propensity_model(input_dim=len(X_chi[0]), lr=0.0001,
                                            experiment_name=exp_name + '/LR_12_' + exp_name),
        'model_name': 'Logistic_Regression'
    }
    model_dict_1 = {
        'X': X_chi,
        'model': NN_PT_propensity_model(input_dim=len(X_chi[0]), lr=0.0001,
                                        experiment_name=exp_name + '/NN_12_' + exp_name),
        'model_name': 'Simple_NN'
    }
    model_dict_2 = {
        'X': X_chi_uni,
        'model': LogReg_PT_propensity_model(input_dim=len(X_chi_uni[0]), lr=0.0001,
                                            experiment_name=exp_name + '/LR_1_' + exp_name),
        'model_name': 'Logistic_Regression_(1gram)'
    }
    model_dict_3 = {
        'X': X_chi_uni,
        'model': NN_PT_propensity_model(input_dim=len(X_chi_uni[0]), lr=0.0001,
                                        experiment_name=exp_name + '/NN_1_' + exp_name),
        'model_name': 'Simple_NN_(1gram)'
    }

    # A temporary file can be added to do some precalculation, making HBERT more efficient
    # '/projects/bdata/datasets_peter/precalc/'
    d_input = None
    if len(opt.temp_file_path) > 0:
        d = tempfile.TemporaryDirectory(prefix=opt.temp_file_path)
        d_input = d.name + '/' + exp_name

    model_dict_4 = {
        'X': X_HBERT,
        'model': Hierarchical_BERT_propensity_model(
            n_it=opt.n_it, val_interval=opt.val_interval, lr=opt.lr,
            batch_size=opt.bs, h_size_sent=1000, h_size_user=1000,
            tokenize=False, precalc_path=d_input,
            experiment_name=exp_name + '/hbert' + exp_name, seq=opt.seq,
            max_tokens_batch=opt.max_tokens_batch,
            preembed_size=opt.preembed_size),
        'model_name': 'HBERT'
    }
    model_dict_5 = {
        'X': X_chi_counts,
        'model': LogReg_PT_propensity_model(input_dim=len(X_chi[0]), lr=0.0001,
                                            experiment_name='LR_12_' + exp_name),
        'model_name': 'Logistic_Regression_counts'
    }
    model_dict_6 = {
        'X': X_chi_counts,
        'model': NN_PT_propensity_model(input_dim=len(X_chi[0]), lr=0.0001,
                                        experiment_name='NN_12_' + exp_name),
        'model_name': 'Simple_NN_counts'
    }
    model_dict_8 = {
        'X': X_LDA,
        'model': LogReg_PT_propensity_model(input_dim=n_topics, lr=0.0001,
                                            experiment_name='LR_12_' + exp_name),
        'model_name': 'Logistic_Regression_LDA'
    }
    model_dict_9 = {
        'X': X_LDA,
        'model': NN_PT_propensity_model(input_dim=n_topics, lr=0.0001,
                                        experiment_name='NN_12_' + exp_name),
        'model_name': 'Simple_NN_LDA'
    }

    # A temporary file can be added to do some precalculation, making HBERT more efficient
    d_input = None
    if len(opt.temp_file_path) > 0:
        d = tempfile.TemporaryDirectory(prefix=opt.temp_file_path)
        d_input = d.name + '/' + exp_name

    model_dict_7 = {
        'X': X_HBERT,
        'model': Average_BERT_propensity_model(
            n_it=opt.n_it, val_interval=opt.val_interval, lr=opt.lr,
            batch_size=opt.bs, h_size_sent=1000, h_size_user=768,
            tokenize=False, precalc_path=d_input,
            experiment_name='avgbert' + exp_name, seq=opt.seq,
            max_tokens_batch=opt.max_tokens_batch),
            # preembed_size=opt.preembed_size),
        'model_name': 'avgBERT'
    }

    # a list of dictionaries to keep track of all models to run
    model_dicts = [
        model_dict_8, model_dict_9, model_dict_5, model_dict_6, model_dict_0,
        model_dict_1, model_dict_2, model_dict_3, model_dict_4
    ]

    # # # ################################### ################################### # # #
    """
    This last section runs each model for the given experiment
    """
    ## loop over the models
    stat_dicts = []
    if opt.no_try:
        for i, model_dict in enumerate(model_dicts):
            # only run the model if you haven't yet
            if not os.path.isfile('{}/{}.npy'.format(exp_name, model_dict['model_name'])):
                dataset.update_X(model_dict['X'])
                # fit model
                model = model_dict['model']
                _, stat_dict = train_propensity_model(model, dataset, data_test=True)
                stat_dicts += [stat_dict]
                np.save('{}/{}.npy'.format(exp_name, model_dict['model_name']), stat_dict)
            stat_dict = np.load('{}/{}.npy'.format(exp_name, model_dict['model_name']),
                                allow_pickle=True).item()
            print(stat_dict)
            print(type(stat_dict))
            stat_dict_print = {
                key: stat_dict[key]
                for key in [k for k in stat_dict.keys()
                            if 'P_' not in k and 'Z_' not in k and 'Y_' not in k]
            }
            print('model {}, statdict {}'.format(model_dict['model_name'], stat_dict_print))
        return

    for i, model_dict in enumerate(model_dicts):
        try:
            # only run the model if you haven't yet
            if not os.path.isfile('{}/{}.npy'.format(exp_name, model_dict['model_name'])):
                dataset.update_X(model_dict['X'])
                # fit model
                model = model_dict['model']
                _, stat_dict = train_propensity_model(model, dataset, data_test=True)
                stat_dicts += [stat_dict]
                np.save('{}/{}.npy'.format(exp_name, model_dict['model_name']), stat_dicts)
            stat_dict = np.load('{}/{}.npy'.format(exp_name, model_dict['model_name']),
                                allow_pickle=True).item()
            stat_dict_print = {
                key: stat_dict[key]
                for key in [k for k in stat_dict.keys() if 'P_' not in k]
            }
            print('model {}, statdict {}'.format(model_dict['model_name'], stat_dict_print))
        except:
            print('model {} FAILED'.format(model_dict['model_name']))
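# A standalone sketch (not the project's get_features_chi/get_features_HBERT code;
# assumptions: plain scikit-learn and toy documents) of the featurization idea the
# docstring above describes: bag-of-words with and without counts, plus an LDA
# topic representation fit on the count features.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["user history post one", "another user post", "post about something else"]

count_vec = CountVectorizer(ngram_range=(1, 2))  # unigrams + bigrams, with counts
X_counts = count_vec.fit_transform(docs)
X_binary = (X_counts > 0).astype(int)            # same vocabulary, presence only

lda = LatentDirichletAllocation(n_components=2, random_state=0)
X_topics = lda.fit_transform(X_counts)           # low-dimensional topic features
print(X_counts.shape, X_binary.shape, X_topics.shape)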
os.makedirs(directoryOut)
os.makedirs(directoryData)
os.makedirs(directoryCkpt)
os.makedirs(directoryOutLogs)

num_words = None
seq_len = 25
batch_size = 16
valid_batch_size = 16  ## Needs to be smaller due to memory issues
embed_size = 64
num_epochs = 20
hidden_size = 64
num_layers = 1

dataset = Dataset(data_dir, num_words)
dataset.set_batch_size(batch_size)
dataset.set_seq_len(seq_len)
dataset.save(dataset_specific_info)

params = {}
# take account of the 0 token for padding
params['vocab_size'] = dataset.vocab_size + 1
params['num_classes'] = dataset.vocab_size
params['batch_size'] = batch_size
params['valid_batch_size'] = valid_batch_size
params['seq_len'] = seq_len
params['hidden_dim'] = hidden_size
params['num_layers'] = num_layers
params['embed_size'] = embed_size
def main():
    path_embedding_glove = './glove.6B.100d.txt'
    path_dataset_train = './datasets/restaurants_train.json'
    # path_dataset_train = './datasets/mini.json'
    path_dataset_trial = './datasets/restaurants_trial.json'
    path_study_cases = './datasets/study_cases.json'
    path_saved = './saved_at/'
    path_log = './log_at.txt'

    embedding = EmbeddingGlove(path_embedding_glove)
    dataset_train = Dataset(path_dataset_train, embedding)
    dataset_trial = Dataset(path_dataset_trial, embedding)
    study_cases = Dataset(path_study_cases, embedding)

    max_sentence_len_train = dataset_train.metadata.max_sentence_len
    max_sentence_len_trial = dataset_trial.metadata.max_sentence_len
    max_aspect_len_train = dataset_train.metadata.max_aspect_len
    max_aspect_len_trial = dataset_trial.metadata.max_aspect_len

    # ======================================================================
    embedding_matrix = torch.tensor(embedding.matrix, dtype=torch.float)
    embedding_dim = embedding.embedding_dim
    hidden_dim = 150
    polarity_dim = 3
    batch_size = 40
    max_sentence_len = max(max_sentence_len_train, max_sentence_len_trial)
    max_aspect_len = max(max_aspect_len_train, max_aspect_len_trial)
    epochs = 40
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print('embedding_dim: ' + str(embedding_dim))
    print('hidden_dim: ' + str(hidden_dim))
    print('polarity_dim: ' + str(polarity_dim))
    print('batch_size: ' + str(batch_size))
    print('max_sentence_len: ' + str(max_sentence_len))
    print('max_aspect_len: ' + str(max_aspect_len))
    print('epochs: ' + str(epochs))
    print('device: ' + str(device))

    # ======================================================================
    batches_train_sentences, batches_train_aspects, batches_train_polarities = \
        dataset_train.GenerateBatches(batch_size, max_sentence_len, max_aspect_len)
    batches_trial_sentences, batches_trial_aspects, batches_trial_polarities = \
        dataset_trial.GenerateBatches(batch_size, max_sentence_len, max_aspect_len)
    study_cases_sentences, study_cases_aspects, study_cases_polarities = \
        study_cases.GenerateBatches(batch_size, max_sentence_len, max_aspect_len)
    num_batches = len(batches_train_sentences)

    # ======================================================================
    model = AT(embedding_matrix, embedding_dim, hidden_dim, polarity_dim,
               max_sentence_len, max_aspect_len)
    model.to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # ======================================================================
    train = False
    file_name_saved = 'aoa_epoch38_acuracy0.951239224137931'

    if train:
        file_log = open(path_log, 'w')
        max_acuracy = 0.0
        for epoch in range(epochs):
            print('\n========== Epoch ' + str(epoch) + ' ==========')
            model.train()
            for i in range(num_batches):
                optimizer.zero_grad()
                batch_sentences = batches_train_sentences[i]
                batch_sentences = torch.tensor(batch_sentences, dtype=torch.long).to(device)
                batch_aspects = batches_train_aspects[i]
                batch_aspects = torch.tensor(batch_aspects, dtype=torch.long).to(device)
                batch_polarities = batches_train_polarities[i]
                batch_polarities = torch.tensor(batch_polarities, dtype=torch.long).to(device)
                prediction, _ = model(batch_sentences, batch_aspects)
                loss = loss_function(prediction, batch_polarities)
                loss.backward()
                optimizer.step()

            acuracy, f1, _ = CalculateAcuracyF1(model, device,
                                                batches_train_sentences,
                                                batches_train_aspects,
                                                batches_train_polarities)
            print('acuracy train: ' + str(acuracy))
            print('f1 train: ' + str(f1))
            file_log.write('epoch: ' + str(epoch) + '\n')
            file_log.write('acuracy_train: ' + str(acuracy) + ' f1_train: ' + str(f1) + '\n')

            if acuracy >= max_acuracy:
                max_acuracy = acuracy
                file_name_saved = 'at_epoch' + str(epoch) + '_acuracy' + str(acuracy)
                torch.save(model.state_dict(), path_saved + file_name_saved)
                print('saved: ' + path_saved + file_name_saved)
        file_log.close()
    else:
        print('\n========== Load saved ==========')
        model.load_state_dict(torch.load(path_saved + file_name_saved))
        print('load: ' + path_saved + file_name_saved)
        acuracy, f1, _ = CalculateAcuracyF1(model, device,
                                            batches_train_sentences,
                                            batches_train_aspects,
                                            batches_train_polarities)
        print('acuracy train: ' + str(acuracy))
        print('f1 train: ' + str(f1))

    # ======================================================================
    print('\n********** Trial dataset **********')
    acuracy, f1, indices_failures = CalculateAcuracyF1(model, device,
                                                       batches_trial_sentences,
                                                       batches_trial_aspects,
                                                       batches_trial_polarities)
    print('acuracy trial: ' + str(acuracy))
    print('f1 trial: ' + str(f1))
    print('indices failures:')
    print(indices_failures)
    for index in indices_failures:
        print(dataset_trial.opinions[index])

    # ======================================================================
    print('\n********** Study cases **********')
    with torch.no_grad():
        for i in range(len(study_cases_sentences)):
            batch_sentences = study_cases_sentences[i]
            batch_sentences = torch.tensor(batch_sentences, dtype=torch.long).to(device)
            batch_aspects = study_cases_aspects[i]
            batch_aspects = torch.tensor(batch_aspects, dtype=torch.long).to(device)
            batch_polarities = study_cases_polarities[i]
            prediction, attention = model(batch_sentences, batch_aspects)
            print('Sentences: ')
            print(batch_sentences)
            print('Aspects: ')
            print(batch_aspects)
            print('Polarities: ')
            print(batch_polarities)
            print('Prediction: ')
            print(prediction)
            print('Attention: ')
            print(attention.squeeze(-1))
        yield (s, sl, q, a, al)
        del (batch_s[:], batch_sl[:], batch_q[:], batch_a[:], batch_al[:])
        # batch = [(s, sl, q, a, al) for s, sl, q, a, al in zip(batch_s, batch_sl, batch_q, batch_a, batch_al)]
        # batch = sorted(batch, key=lambda tup: len(tup[0]), reverse=True)
        # s, sl, q, a, al = zip(*batch)
        # if len(batch_s) == batch_size:
        #     yield (s, sl, q, a, al)


if __name__ == '__main__':
    from data_utils import Dataset

    for i in np.arange(14, 20):
        print('start')
        data = Dataset(i + 1)
        data.preprocess('train')
        data.preprocess('valid')
        data.preprocess('test')
        pickle.dump(data, open('data/qa' + str(i + 1) + '.pickle', 'wb'))
        print(i)

    # dataset = pickle.load(open('data/qa2.pickle', 'rb'))
    # for idx, (s, sl, q, a, al) in enumerate(dataset.data_loader('train')):
    #     print(s[0].shape)
    #     print(sl[0])
    #     print(q[1])
    #     print(dataset.idx2word(a))
def train_pos(args):
    src_embedding = None
    target_embedding = None
    logger = get_logger(args.log)
    logger.info('Model Type: {}'.format(args.type))

    if os.path.exists(args.config) and (not args.config == 'debug.json'):
        logger.info('Loading config from {}'.format(args.config))
        config = json.load(open(args.config, 'r'))
        try:
            vocab_word = pickle.load(open(config['word'], 'rb'))
            vocab_tag = pickle.load(open(config['tag'], 'rb'))
            target_vocab_word = pickle.load(open(config['target_word'], 'rb'))
            assert len(vocab_word) == config['nword']
            assert len(vocab_tag) == config['ntag']
            assert len(target_vocab_word) == config['ntarword']
            if args.use_pretrain_src:
                _, src_embedding = load_pre_train(args.src_embedding)
            if args.use_pretrain_target:
                _, target_embedding = load_pre_train(args.target_embedding)
        except Exception as e:
            logger.error(e)
            exit(0)
    else:
        if args.use_pretrain_src:
            pre_dictionary, src_embedding = load_pre_train(args.src_embedding)
            vocab_word, vocab_tag = load_vocab(args.train_file, pre_dictionary)
        else:
            vocab_word, vocab_tag = load_vocab(args.train_file)
        if args.use_pretrain_target:
            pre_dictionary, target_embedding = load_pre_train(args.target_embedding)
            target_vocab_word, _ = load_vocab(args.train_file, pre_dictionary)
        else:
            target_vocab_word, _ = load_vocab(args.target_train_file)

        i = 0
        while os.path.exists('./.cache/vocab_{}.pickle'.format(str(i))) or \
                os.path.exists('./.cache/tag_{}.pickle'.format(str(i))):
            i += 1
        if not os.path.exists('./.cache'):
            os.makedirs('./.cache')
        with open('./.cache/vocab_{}.pickle'.format(str(i)), 'wb') as vocab, \
                open('./.cache/tag_{}.pickle'.format(str(i)), 'wb') as tag, \
                open('./.cache/target_vocab_{}.pickle'.format(str(i)), 'wb') as tar_vocab:
            pickle.dump(vocab_word, vocab)
            pickle.dump(vocab_tag, tag)
            pickle.dump(target_vocab_word, tar_vocab)
        with open(args.config, 'w+') as config:
            json.dump(
                {
                    'word': './.cache/vocab_{}.pickle'.format(str(i)),
                    'tag': './.cache/tag_{}.pickle'.format(str(i)),
                    'target_word': './.cache/target_vocab_{}.pickle'.format(str(i)),
                    'nword': len(vocab_word),
                    'ntag': len(vocab_tag),
                    'ntarword': len(target_vocab_word)
                },
                config,
                indent='\t')

    nword = len(vocab_word)
    ntag = len(vocab_tag)
    ntarword = len(target_vocab_word)
    logger.info("Src: {} {}".format(nword, ntag))
    logger.info("Target: {}".format(ntarword))
    logger.info("Flag: {}".format(args.flag))
    logger.info("Src embed trainable: {}".format(not args.disable_src_embed_training))
    logger.info("\ntrain:{}\ndev :{}\ntest :{}\n\n".format(
        args.train_file, args.dev_file, args.test_file))
    logger.info("\nTarget: \ntrain:{}\ndev :{}\ntest :{}\n".format(
        args.target_train_file, args.target_dev_file, args.target_test_file))
    logger.info("MSG: {}\n".format(args.msg))
    logger.info("lr_ratio: {}\n".format(str(args.lr_ratio)))
    logger.info("penalty_ratio: {}\n".format(str(args.penalty_ratio)))
    logger.info("penalty: {}\n".format(str(args.penalty)))

    processing_word = get_processing(vocab_word)
    processing_tag = get_processing(vocab_tag)
    processing_target_word = get_processing(target_vocab_word)

    src_train = Dataset(args.train_file, processing_word, processing_tag, None)
    src_dev = Dataset(args.dev_file, processing_word, processing_tag, None)
    src_test = Dataset(args.test_file, processing_word, processing_tag, None)
    target_train = Dataset(args.target_train_file, processing_target_word, processing_tag)
    target_dev = Dataset(args.target_dev_file, processing_target_word, processing_tag)
    target_test = Dataset(args.target_test_file, processing_target_word, processing_tag)

    src_len = len(src_train)
    target_len = len(target_train)
    ratio = target_len / (src_len + target_len)
    logger.info("\nsrc: {}\ntarget: {}\n".format(src_len, target_len))
    # ratio = 0.1 if ratio < 0.1 else ratio
    target_batch_size = int(ratio * args.batch_size)
    target_batch_size = 1 if target_batch_size < 1 else target_batch_size
    src_batch_size = args.batch_size - target_batch_size
    logger.info("\nsrc_batch_size: {}\ntarget_batch_size: {}".format(
        src_batch_size, target_batch_size))
    assert target_batch_size >= 0

    model = Model(args, ntag, nword, ntarwords=ntarword,
                  src_embedding=src_embedding, target_embedding=target_embedding,
                  logger=logger, src_batch_size=src_batch_size)
    model.build()

    try:
        print("========If !!! it's debugging!==========")
        print(args.debug)
        if args.debug:
            print("========it's debugging!==========")
            model.train(src_dev, src_dev, vocab_tag, target_dev, target_dev,
                        target_test, src_batch_size, target_batch_size)
        else:
            # model.train(src_train, src_dev, vocab_tag, target_train, target_dev, src_batch_size, target_batch_size)
            model.train(src_train, src_dev, vocab_tag, target_train, target_dev,
                        target_test, src_batch_size, target_batch_size)
    except KeyboardInterrupt:
        model.evaluate(target_dev, vocab_tag, target='target')
def main():
    # Read datasets
    data = Dataset(args.DATA_DIR)
    sents, tags = data.get_all_data()

    # Construct the model
    MyModel = BiLSTMModel(args.MAX_SEQ_LEN, args.EMBEDDING,
                          args.LSTM_HIDDEN_UNITS, args.LSTM_DENSE_DIM,
                          data.get_nwords(), data.get_ntags())
    model = MyModel.define_model()

    num_train_sents = len(data.train_sents)
    num_val_sents = len(data.val_sents)
    num_test_sents = len(data.test_sents)
    print("# train sents = {0} \n # of val sents = {1} \n # of test sents = {2}".format(
        num_train_sents, num_val_sents, num_test_sents), flush=True)

    # indexes to train, val and test data
    partition = {
        "train": list(range(num_train_sents)),
        "val": list(range(num_val_sents)),
        "test": list(range(num_test_sents))
    }

    # Parameters
    params = {
        'dim': args.MAX_SEQ_LEN,
        'batch_size': args.BATCH_SIZE,
        'n_classes': data.get_ntags(),
        'shuffle': True,
        'word2idx': data.get_word2idx(),
        'tag2idx': data.get_tag2idx()
    }

    # Generators
    training_generator = DG.DataGenerator(partition['train'], data.train_sents,
                                          data.train_tags, **params)
    validation_generator = DG.DataGenerator(partition['val'], data.val_sents,
                                            data.val_tags, **params)

    # Train model on dataset
    history = model.fit_generator(generator=training_generator,
                                  validation_data=validation_generator,
                                  use_multiprocessing=True,
                                  epochs=args.NUM_EPOCHS,
                                  verbose=1)

    # Parameters
    params_test = {
        'dim': args.MAX_SEQ_LEN,
        'batch_size': 1,
        'n_classes': data.get_ntags(),
        'shuffle': False,
        'word2idx': data.get_word2idx(),
        'tag2idx': data.get_tag2idx()
    }

    # Make predictions
    testing_generator = DG.DataGenerator(partition['test'], data.test_sents,
                                         data.train_tags, **params_test)
    pred_test = model.predict_generator(generator=testing_generator,
                                        steps=num_test_sents)
    pred_test = np.argmax(pred_test, axis=-1)
    # print(pred_test.shape)

    def pad(x):
        x1 = [tgs + ([data.get_tag2idx()["PAD"]] * (args.MAX_SEQ_LEN - len(tgs)))
              for tgs in x]
        x2 = [tgs[:args.MAX_SEQ_LEN] for tgs in x1]
        return np.array(x2)

    test_tags_padded = pad(data.test_tags)
    # print(test_tags_padded.shape)

    def get_measures(yTrue, yPred):
        y1 = yTrue.reshape(1, -1).squeeze()
        y2 = yPred.reshape(1, -1).squeeze()
        P = precision_score(y1, y2, average=None)
        R = recall_score(y1, y2, average=None)
        F1 = f1_score(y1, y2, average=None)
        print("Precision=", flush=True)
        print(P, flush=True)
        print("Recall=", flush=True)
        print(R, flush=True)
        print("F1 score=", flush=True)
        print(F1, flush=True)

    print("Test...", flush=True)
    get_measures(test_tags_padded, pred_test)
        f.write('\n')
    avg_loss = dev_loss / len(data)
    print(avg_loss)
    f.close()
    return invalid


data = load_data(data_dir + '/msmarco/' + args.data + '_v2.1.json')

params = {'batch_size': 256, 'shuffle': False, 'num_workers': 16}
dev_set = Dataset(data, max_plen, max_qlen, glove_vec_size, data_dir)
dev_generator = utils.data.DataLoader(dev_set, **params)

device = torch.device('cpu')
cuda = torch.cuda.is_available()
if cuda:
    device = torch.device('cuda')

config = Config(glove_vec_size, hidden_size, max_plen, max_qlen, num_para, device)
model = Model(config)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
if cuda:
}, 'checkpoints/saved_model.pth')

train_data = load_data('preprocessed_data/train_data.json', thres, max_plen)[0:100000]
print('Done loading Training data.')

train_params = {'batch_size': 32, 'shuffle': True, 'num_workers': 32, 'pin_memory': True}
training_set = Dataset(train_data, max_plen, max_qlen, data_dir, glove_vec_size)
training_generator = DataLoader(training_set, **train_params)

cuda = torch.cuda.is_available()
device = torch.device('cpu')
if cuda:
    device = torch.device('cuda')

config = Config(glove_vec_size, elmo_options, elmo_weights, elmo_emb_size,
                hidden_size, max_plen, max_qlen, num_para, device)
model = Model(config)
if cuda:
    model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
plt.axis('off')

netG = Generator(name="dcgan_g_html")
netD = Discriminator(name="dcgan_d_html")
loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()

real_label = nd.ones((batch_size, ), ctx=ctx)
fake_label = nd.zeros((batch_size, ), ctx=ctx)

img_list = [
    os.path.join(data_path, x) for x in os.listdir(data_path)
    if x.endswith('png')
]
train_data = Dataset(img_list, img_dims, batch_size=batch_size)


def init_params():
    netG.initialize(mx.init.Normal(0.02), ctx=ctx)
    netD.initialize(mx.init.Normal(0.02), ctx=ctx)


def load_weights():
    netG.load_params(ctx=ctx)
    netD.load_params(ctx=ctx)


def init_optimizers():
    trainerG = mx.gluon.Trainer(netG.collect_params(), 'adam', {
        'learning_rate': lr,
from data_utils import read_dictionary, Dataset, vocab_tags
from general_utils import get_logger
from model import Model
from config import Config
import os
import sys

if not os.path.exists(Config.output_path):
    os.makedirs(Config.output_path)

# vocab_words = load_vocab(Config.words_vocab)
# vocab_tags = load_vocab(Config.tags_vocab)
vocab_words = read_dictionary(Config.words_vocab)
# print(vocab_words)
# print(vocab_tags)
# sys.exit(0)

test = Dataset(Config.test_path, Config.test_tgt_path, Config.max_iter)
train = Dataset(Config.source_path, Config.source_tgt_path, Config.max_iter)

logger = get_logger(Config.log_path)
model = Model(Config, ntags=len(vocab_tags), n_words=len(vocab_words), logger=logger)
model.build()
model.train(train, test, vocab_tags, vocab_words)
def main(_):
    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
    dataset = Dataset(
        vocab,
        os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*"))

    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():
        with tf.variable_scope("model"):
            model = language_model_graph.build_model()

    def run(sess, num_workers, worker_id, num_replicas_per_worker):
        state_c = []
        state_h = []
        if len(state_c) == 0:
            state_c.extend([
                np.zeros([FLAGS.batch_size, model.state_size], dtype=np.float32)
                for _ in range(num_replicas_per_worker)
            ])
            state_h.extend([
                np.zeros([FLAGS.batch_size, model.projected_size], dtype=np.float32)
                for _ in range(num_replicas_per_worker)
            ])

        prev_global_step = sess.run(model.global_step)[0]
        prev_time = time.time()
        data_iterator = dataset.iterate_forever(
            FLAGS.batch_size * num_replicas_per_worker, FLAGS.num_steps,
            num_workers, worker_id)
        fetches = {
            'global_step': model.global_step,
            'loss': model.loss,
            'train_op': model.train_op,
            'final_state_c': model.final_state_c,
            'final_state_h': model.final_state_h
        }

        for local_step in range(FLAGS.max_steps):
            if FLAGS.use_synthetic:
                x = np.random.randint(
                    low=0, high=model.vocab_size,
                    size=(FLAGS.batch_size * num_replicas_per_worker, FLAGS.num_steps))
                y = np.random.randint(
                    low=0, high=model.vocab_size,
                    size=(FLAGS.batch_size * num_replicas_per_worker, FLAGS.num_steps))
                w = np.ones((FLAGS.batch_size * num_replicas_per_worker, FLAGS.num_steps))
            else:
                x, y, w = next(data_iterator)
            feeds = {}
            feeds[model.x] = np.split(x, num_replicas_per_worker)
            feeds[model.y] = np.split(y, num_replicas_per_worker)
            feeds[model.w] = np.split(w, num_replicas_per_worker)
            feeds[model.initial_state_c] = state_c
            feeds[model.initial_state_h] = state_h
            fetched = sess.run(fetches, feeds)
            state_c = fetched['final_state_c']
            state_h = fetched['final_state_h']

            if local_step % FLAGS.log_frequency == 0:
                cur_time = time.time()
                elapsed_time = cur_time - prev_time
                num_words = FLAGS.batch_size * FLAGS.num_steps
                wps = (fetched['global_step'][0] - prev_global_step) * num_words / elapsed_time
                prev_global_step = fetched['global_step'][0]
                parallax.log.info(
                    "Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f" %
                    (fetched['global_step'][0], cur_time - prev_time, wps, fetched['loss'][0]))
                prev_time = cur_time

    sess, num_workers, worker_id, num_replicas_per_worker = \
        parallax.parallel_run(single_gpu_graph,
                              FLAGS.resource_info_file,
                              sync=FLAGS.sync,
                              parallax_config=parallax_config.build_config())
    run(sess, num_workers, worker_id, num_replicas_per_worker)
    return results_sel_para, results_pred_start, results_pred_end


dev_data = load_data('preprocessed_data/dev_data.json', thres, max_plen)[0:25000]
print('Done loading dev data.')

params = {
    'batch_size': 32,
    'shuffle': False,
    'num_workers': 32,
    'pin_memory': True
}
dev_set = Dataset(dev_data, max_plen, max_qlen, data_dir, glove_vec_size)
dev_generator = DataLoader(dev_set, **params)

config = Config(glove_vec_size, elmo_options, elmo_weights, elmo_emb_size,
                hidden_size, max_plen, max_qlen, num_para, device)
model = Model(config)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
if cuda:
    model = model.to(device)

checkpoint = torch.load('checkpoints/saved_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
num_cates = max(ent2idx.values()) + 1
sent_len = 64
vocab_size = 2320
emb_size = 256
sent_pad = 10
seq_len = sent_len + 2 * sent_pad

test_data_dir = '../data/chusai_xuanshou'
test_docs = Documents(data_dir=test_data_dir)

sent_extrator = SentenceExtractor(window_size=sent_len, pad_size=sent_pad)
test_sents = sent_extrator(test_docs)

with open('word2idx.json', 'r') as f:
    word2idx = eval(f.read())

test_data = Dataset(test_sents, word2idx=word2idx, cate2idx=ent2idx)
test_X, _ = test_data[:]
print(len(test_docs))

w2v_embeddings = np.load('w2v_embeddings.npy')
model = build_lstm_crf_model(num_cates, seq_len=seq_len, vocab_size=vocab_size,
                             model_opts={
                                 'emb_matrix': w2v_embeddings,
                                 'emb_size': emb_size,
                                 'emb_trainable': False
                             })
model.load_weights(