def test_train(self):
    seterr(divide='raise', over='raise', invalid='raise')

    gsm = GSM(1, 10)
    gsm.initialize(method='cauchy')

    samples = gsm.sample(5000)

    mog = MoGaussian(num_components=10)
    mog.initialize(method='laplace')
    mog.train(samples, 100)
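# The seterr call above makes NumPy raise FloatingPointError on division by
# zero, overflow, and invalid operations instead of just warning, so numerical
# problems inside training fail the test loudly. A standalone illustration
# (a sketch, not part of the test suite):
from numpy import seterr, array

seterr(divide='raise')
try:
    array([1.]) / 0.
except FloatingPointError:
    print('caught division by zero')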
def test_loglikelihood(self):
    """
    Tests whether 1-dimensional GSMs are normalized. Tests the
    log-likelihood of several instantiations of the GSM.
    """

    # check whether the log-likelihood of 1D GSMs is normalized
    for num_scales in [1, 2, 3, 4, 5]:
        model = GSM(1, num_scales=num_scales)

        # implied probability density of model
        pdf = lambda x: exp(model.loglikelihood(array(x).reshape(1, -1)))

        # compute normalization constant and upper bound on error
        partf, err = integrate.quad(pdf, -inf, inf)

        self.assertTrue(partf - err <= 1.)
        self.assertTrue(partf + err >= 1.)

    # test the log-likelihood of a couple of GSMs
    for dim in [1, 2, 3, 4, 5]:
        for num_scales in [1, 2, 3, 4, 5]:
            # create Gaussian scale mixture
            model = GSM(dim, num_scales=num_scales)
            scales = model.scales.reshape(-1, 1)

            # create random data
            data = randn(model.dim, 100)

            # evaluate likelihood
            ll = logmeanexp(
                -0.5 * sum(square(data), 0) / square(scales)
                - model.dim * log(scales)
                - model.dim / 2. * log(2. * pi), 0)
            self.assertTrue(all(abs(ll - model.loglikelihood(data)) < 1E-6))

            # random scales
            scales = rand(num_scales, 1) + 0.5
            model.scales[:] = scales.flatten()

            # sample data from model
            data = model.sample(100)

            # evaluate likelihood
            ll = logmeanexp(
                -0.5 * sum(square(data), 0) / square(scales)
                - model.dim * log(scales)
                - model.dim / 2. * log(2. * pi), 0)
            self.assertTrue(all(abs(ll - model.loglikelihood(data)) < 1E-6))
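# The test above relies on a `logmeanexp` helper that averages mixture
# components in log space. A minimal sketch of such a helper, assuming it
# behaves like logsumexp minus the log of the number of averaged elements
# (an illustration, not the tested library's actual implementation):
from numpy import log, asarray
from scipy.special import logsumexp

def logmeanexp_sketch(x, axis=None):
    """Numerically stable log of the mean of exp(x) along `axis`."""
    x = asarray(x)
    n = x.size if axis is None else x.shape[axis]
    return logsumexp(x, axis=axis) - log(n)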
def test_logjacobian(self):
    ica = ICA(4)

    # standard normal distribution
    gauss = GSM(4, 1)
    gauss.scales[0] = 1.

    # generate test data
    samples = ica.sample(100)

    mg = MarginalGaussianization(ica)

    # after Gaussianization, samples should be Gaussian distributed
    loglik_ica = ica.loglikelihood(samples)
    loglik_gauss = gauss.loglikelihood(mg(samples)) + mg.logjacobian(samples)

    dist = abs(loglik_ica - loglik_gauss)

    self.assertTrue(all(dist < 1E-6))
def main():
    global args
    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    num_epochs = args.num_epochs
    n_topic = args.n_topic
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue
    use_tfidf = args.use_tfidf
    rebuild = args.rebuild
    batch_size = args.batch_size
    criterion = args.criterion
    auto_adj = args.auto_adj
    show_topics = args.show_topics
    device = torch.device('cpu')

    docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                        rebuild=rebuild, use_tfidf=False)
    if auto_adj:
        no_above = docSet.topk_dfs(topk=20)
        docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                            rebuild=rebuild, use_tfidf=False)

    voc_size = docSet.vocabsize
    print('voc size:', voc_size)

    model = GSM(bow_dim=voc_size, n_topic=n_topic, taskname=taskname, device=device)
    if bkpt_continue:
        path = os.listdir('./ckpt')[0]
        checkpoint = torch.load(os.path.join('./ckpt', path))
        model.vae.load_state_dict(checkpoint)

    model.train(train_data=docSet, batch_size=batch_size, test_data=docSet,
                num_epochs=num_epochs, log_every=10, beta=1.0, criterion=criterion)
    model.evaluate(test_data=docSet)

    if show_topics:
        with open(f'./result/{taskname}_ep{num_epochs}.txt', 'w') as f:
            for topic in model.show_topic_words():
                print(topic, file=f)

    save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    torch.save(model.vae.state_dict(), save_name)
def test_sample(self):
    """
    Compares model density with histogram obtained from samples.
    """
    model = GSM(1, 3)
    model.scales = array([1., 3., 8.])

    data = model.sample(50000)

    try:
        hist, x = histogram(data, 100, density=True)
    except TypeError:
        # fall back to the deprecated keyword with older versions of NumPy
        hist, x = histogram(data, 100, normed=True)
    x = (x[1:] + x[:-1]) / 2.

    pdf = exp(model.loglikelihood(x.reshape(1, -1)))

    self.assertTrue(all(abs(pdf - hist) < 1E-1))
def test_train(self):
    """
    Tests whether training can recover parameters.
    """
    for dim in [1, 2, 3]:
        gsm1 = GSM(dim, 2)
        gsm1.scales = array([0.5, 4.])

        data = gsm1.sample(20000)

        gsm2 = GSM(dim, 2)
        gsm2.gamma = 0.
        gsm2.train(data, max_iter=100)

        self.assertTrue(any(abs(gsm1.scales[0] - gsm2.scales) < 1E-1))
        self.assertTrue(any(abs(gsm1.scales[1] - gsm2.scales) < 1E-1))
def test_inverse(self):
    """
    Make sure inverse Gaussianization is inverse to Gaussianization.
    """
    gsm = GSM(3, 10)
    gsm.initialize('cauchy')

    # generate test data
    samples = gsm.sample(100)

    rg = RadialGaussianization(gsm)

    # reconstructed samples
    samples_ = rg.inverse(rg(samples))

    # distance between norm and reconstructed norm
    dist = abs(sqrt(sum(square(samples_))) - sqrt(sum(square(samples))))

    self.assertTrue(all(dist < 1E-6))

    ###
    # test one-dimensional GSM
    gsm = GSM(1, 7)
    gsm.initialize('cauchy')

    # generate test data
    samples = gsm.sample(100)

    rg = RadialGaussianization(gsm)

    # reconstructed samples
    samples_rg = rg.inverse(rg(samples))

    # distance between norm and reconstructed norm
    dist = abs(sqrt(sum(square(samples_rg))) - sqrt(sum(square(samples))))

    self.assertTrue(all(dist < 1E-6))
def main():
    global args
    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    num_epochs = args.num_epochs
    n_topic = args.n_topic
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue
    use_tfidf = args.use_tfidf
    rebuild = args.rebuild
    batch_size = args.batch_size
    criterion = args.criterion
    auto_adj = args.auto_adj
    device = torch.device('cuda')

    docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                        rebuild=rebuild, use_tfidf=False)
    if auto_adj:
        no_above = docSet.topk_dfs(topk=20)
        docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                            rebuild=rebuild, use_tfidf=False)

    voc_size = docSet.vocabsize
    print('voc size:', voc_size)

    model = GSM(bow_dim=voc_size, n_topic=n_topic, taskname=taskname, device=device)
    model.train(train_data=docSet, batch_size=batch_size, test_data=docSet,
                num_epochs=num_epochs, log_every=10, beta=1.0, criterion=criterion)
    model.evaluate(test_data=docSet)

    save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    torch.save(model.vae.state_dict(), save_name)

    txt_lst, embeds = model.get_embed(train_data=docSet, num=1000)
    with open('topic_dist_gsm.txt', 'w', encoding='utf-8') as wfp:
        for t, e in zip(txt_lst, embeds):
            wfp.write(f'{e}:{t}\n')
    pickle.dump({'txts': txt_lst, 'embeds': embeds}, open('gsm_embeds.pkl', 'wb'))
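# The pickled embeddings written above can be loaded back for later analysis
# (a usage sketch; the file name matches the dump above):
import pickle

with open('gsm_embeds.pkl', 'rb') as rfp:
    saved = pickle.load(rfp)
txt_lst, embeds = saved['txts'], saved['embeds']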
def test_logjacobian(self):
    isa = ISA(4, 4, 2)

    # standard normal distribution
    gauss = GSM(4, 1)
    gauss.scales[0] = 1.

    # generate test data
    samples = isa.sample(100)

    sg = SubspaceGaussianization(isa)

    # after Gaussianization, samples should be Gaussian distributed
    loglik_isa = isa.loglikelihood(samples)
    loglik_gauss = gauss.loglikelihood(sg(samples)) + sg.logjacobian(samples)

    dist = abs(loglik_isa - loglik_gauss)

    self.assertTrue(all(dist < 1E-6))

    ###
    # test ICA
    isa = ISA(3, 3, 1)

    # standard normal distribution
    gauss = GSM(3, 1)
    gauss.scales[0] = 1.

    # generate test data
    samples = isa.sample(100)

    sg = SubspaceGaussianization(isa)

    # after Gaussianization, samples should be Gaussian distributed
    loglik_isa = isa.loglikelihood(samples)
    loglik_gauss = gauss.loglikelihood(sg(samples)) + sg.logjacobian(samples)

    dist = abs(loglik_isa - loglik_gauss)

    self.assertTrue(all(dist < 1E-6))
def main():
    global args
    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    num_epochs = args.num_epochs
    n_topic = args.n_topic
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue
    use_tfidf = args.use_tfidf
    rebuild = args.rebuild
    batch_size = args.batch_size
    criterion = args.criterion
    use_fc1 = args.use_fc1  # TBD_fc1
    auto_adj = args.auto_adj
    device = torch.device('cuda')

    docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                        rebuild=rebuild, use_tfidf=False)
    if auto_adj:
        no_above = docSet.topk_dfs(topk=20)
        docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                            rebuild=rebuild, use_tfidf=False)

    voc_size = docSet.vocabsize
    print('voc size:', voc_size)

    model = GSM(bow_dim=voc_size, n_topic=n_topic, taskname=taskname,
                device=device, use_fc1=use_fc1)  # TBD_fc1
    model.train(train_data=docSet, batch_size=batch_size, test_data=docSet,
                num_epochs=num_epochs, log_every=10, beta=1.0, criterion=criterion)
    model.evaluate(test_data=docSet)

    save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    torch.save(model.vae.state_dict(), save_name)
def test_energy_gradient(self):
    """
    Tests whether the energy gradient is similar to a numerical gradient.
    """
    step_size = 1E-5

    model = GSM(3, num_scales=7)
    model.initialize('laplace')

    # samples and true gradient
    X = model.sample(100)
    G = model.energy_gradient(X)

    # numerical gradient
    N = zeros(G.shape)
    for i in range(N.shape[0]):
        d = zeros(X.shape)
        d[i] = step_size
        N[i] = (model.energy(X + d) - model.energy(X - d)) / (2. * step_size)

    # test consistency of energy and gradient
    self.assertTrue(all(abs(G - N) < 1E-5))
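# The central-difference scheme used above generalizes to any scalar energy
# function. A self-contained sketch; the `energy` callable and its
# (dim, num_samples) -> (num_samples,) signature are illustrative assumptions,
# not part of the tested model:
import numpy as np

def numerical_gradient(energy, X, step_size=1e-5):
    """Central-difference approximation of the gradient of `energy` at `X`."""
    N = np.zeros_like(X)
    for i in range(X.shape[0]):
        d = np.zeros_like(X)
        d[i] = step_size
        # perturb one input dimension at a time, across all samples
        N[i] = (energy(X + d) - energy(X - d)) / (2. * step_size)
    return N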
def test_logjacobian(self):
    """
    Test log-Jacobian.
    """
    gsm = GSM(3, 10)
    gsm.initialize('cauchy')

    # standard normal distribution
    gauss = GSM(3, 1)
    gauss.scales[0] = 1.

    # generate test data
    samples = gsm.sample(100)

    rg = RadialGaussianization(gsm)

    # after Gaussianization, samples should be Gaussian distributed
    loglik_gsm = gsm.loglikelihood(samples)
    loglik_gauss = gauss.loglikelihood(rg(samples)) + rg.logjacobian(samples)

    dist = abs(loglik_gsm - loglik_gauss)

    self.assertTrue(all(dist < 1E-6))

    ###
    # test one-dimensional Gaussian
    gsm = GSM(1, 10)
    gsm.initialize('cauchy')

    # standard normal distribution
    gauss = GSM(1, 1)
    gauss.scales[0] = 1.

    # generate test data
    samples = gsm.sample(100)

    rg = RadialGaussianization(gsm)

    # after Gaussianization, samples should be Gaussian distributed
    loglik_gsm = gsm.loglikelihood(samples)
    loglik_gauss = gauss.loglikelihood(rg(samples)) + rg.logjacobian(samples)

    dist = abs(loglik_gsm - loglik_gauss)

    self.assertTrue(all(dist < 1E-6))
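# The test_logjacobian variants above all check the same change-of-variables
# identity: if z = f(x) maps the model's density to a standard normal, then
# log p_x(x) = log p_z(f(x)) + log|det df/dx|. A toy sketch with a scalar
# affine map, where everything is known in closed form (illustrative only):
import numpy as np
from scipy.stats import norm

x = np.random.randn(100) * 3. + 2.   # x ~ N(2, 3^2)
f = lambda x: (x - 2.) / 3.          # maps x to a standard normal
logjacobian = np.log(1. / 3.)        # log|df/dx|, constant for an affine map

loglik_x = norm.logpdf(x, loc=2., scale=3.)
loglik_z = norm.logpdf(f(x)) + logjacobian

assert np.allclose(loglik_x, loglik_z)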
def fetch_report_data():
    C = config['fetch_report_data']
    logger.info('start fetching report data')
    res = sshexec(C['host'], C['username'], C['cmd'])
    if not res:
        logger.error(res)
        logger.error(
            'no output returned from {0}@{1}, {2}, possible communication '
            'error with remote host'.format(C['username'], C['host'], C['cmd']))
        return

    data = json.loads(res[0])
    # logger.debug(data)  # too verbose

    changed = False  # flags whether anything has changed
    gsm_objs = []
    for path in sorted(data.keys()):
        for gse in sorted(data[path].keys()):
            # _: ignore the `created` flag
            gse_obj, _ = GSE.objects.get_or_create(name=gse)
            for species in sorted(data[path][gse].keys()):
                # homo_sapiens => Homo sapiens
                species_name = species.replace('_', ' ').capitalize()
                species_obj, _ = Species.objects.get_or_create(name=species_name)
                for gsm in sorted(data[path][gse][species].keys()):
                    status = data[path][gse][species][gsm]['status']
                    kwargs = dict(name=gsm, gse=gse_obj, species=species_obj,
                                  path=os.path.join(path, gse, species, gsm),
                                  status=status)
                    try:
                        gsm_obj = GSM.objects.get(gse=gse_obj, name=gsm)
                        if gsm_obj.status != status:
                            # need to do some update
                            logger.info('Updating {0}-{1}: {2} => {3}'.format(
                                gse, gsm, gsm_obj.status, status))
                            for key, value in kwargs.items():
                                setattr(gsm_obj, key, value)
                            gsm_obj.save()
                            changed = True
                    except GSM.DoesNotExist:
                        logger.info('Creating {0}-{1}'.format(gse, gsm))
                        gsm_obj = GSM(**kwargs)
                        gsm_objs.append(gsm_obj)

    if gsm_objs:
        GSM.objects.bulk_create(gsm_objs)
        changed = True

    # update each GSE's passed flag based on the status of its GSMs
    gses = GSE.objects.all()
    for gse in gses:
        # passed_gsms = gse.gsm_set.filter(status='passed')
        running_gsms = gse.gsm_set.filter(status='running')
        queued_gsms = gse.gsm_set.filter(status='queued')
        failed_gsms = gse.gsm_set.filter(status='failed')
        none_gsms = gse.gsm_set.filter(status='none')
        if (running_gsms.count() == 0
                and queued_gsms.count() == 0
                and failed_gsms.count() == 0
                and none_gsms.count() == 0):
            if not gse.passed:
                gse.passed = True
                gse.save()
                changed = True
        else:
            # adapt to manual changes, e.g. adding or removing GSMs after
            # the GSE has once passed
            if gse.passed:
                gse.passed = False
                gse.save()
                changed = True

    if changed:
        logger.info('updating memcaches for all GSEs')
        update_cache_all_gses()
        logger.info('updating memcaches for passed GSEs')
        update_cache_passed_gses()
        logger.info('updating memcaches for not passed GSEs')
        update_cache_not_passed_gses()
        logger.info('updating memcaches for stats')
        update_cache_stats()
    else:
        logger.info('nothing changed')
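# The four count() == 0 checks above could be collapsed into one queryset.
# A sketch of an equivalent condition, assuming 'passed', 'running', 'queued',
# 'failed' and 'none' are the only possible statuses:
all_passed = not gse.gsm_set.exclude(status='passed').exists()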
def main():
    global args
    train_data_file = args.train_data_file
    test_data_file = args.test_data_file
    no_below = args.no_below
    no_above = args.no_above
    num_epochs = args.num_epochs
    n_topic = args.n_topic
    hidden_size = args.hidden_size
    learning_rate = args.learning_rate
    log_every = args.log_every
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue
    use_tfidf = args.use_tfidf
    rebuild = args.rebuild
    batch_size = args.batch_size
    auto_adj = args.auto_adj
    ckpt = args.ckpt
    device = torch.device('cuda')

    trainSet = DocDataset(train_data_file, no_below=no_below, no_above=no_above,
                          rebuild=rebuild, use_tfidf=False)
    testSet = DocDataset(test_data_file, no_below=no_below, no_above=no_above,
                         rebuild=rebuild, use_tfidf=False)
    # if auto_adj:
    #     no_above = docSet.topk_dfs(topk=20)
    #     docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
    #                         rebuild=rebuild, use_tfidf=False)

    voc_size = trainSet.vocabsize
    print('train voc size:', voc_size)
    print("train:", type(trainSet), len(trainSet))
    print("test:", type(testSet), len(testSet))

    if ckpt:
        checkpoint = torch.load(ckpt)
        # `param` is expected to hold the saved model hyperparameters
        # (defined elsewhere in the original script)
        param.update({"device": device})
        model = GSM(**param)
        model.train(train_data=trainSet, test_data=testSet, batch_size=batch_size,
                    learning_rate=learning_rate, num_epochs=num_epochs,
                    log_every=log_every, ckpt=checkpoint)
    else:
        model = GSM(bow_dim=voc_size, n_topic=n_topic,
                    hidden_size=hidden_size, device=device)
        model.train(train_data=trainSet, test_data=testSet, batch_size=batch_size,
                    learning_rate=learning_rate, num_epochs=num_epochs,
                    log_every=log_every)
    # model.evaluate(test_data=docSet)

    # NOTE: the original used an undefined `taskname` here; deriving it from
    # the training file name is an assumption
    taskname = os.path.splitext(os.path.basename(train_data_file))[0]
    save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    torch.save(model.vae.state_dict(), save_name)

    txt_lst, embeds = model.get_embed(train_data=trainSet, num=1000)
    with open('topic_dist_gsm.txt', 'w', encoding='utf-8') as wfp:
        for t, e in zip(txt_lst, embeds):
            wfp.write(f'{e}:{t}\n')
    pickle.dump({'txts': txt_lst, 'embeds': embeds}, open('gsm_embeds.pkl', 'wb'))
def main():
    global args
    taskname = args.taskname            # name of the dataset
    no_below = args.no_below            # words with document frequency below this threshold are filtered out
    no_above = args.no_above            # words with document frequency above this threshold are filtered out
    num_epochs = args.num_epochs        # number of training epochs
    n_topic = args.n_topic              # number of topics
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue  # whether to resume training from a previous checkpoint
    use_tfidf = args.use_tfidf          # whether to use TF-IDF as the BOW input
    rebuild = args.rebuild              # whether to rebuild the corpus; off by default
    batch_size = args.batch_size        # batch size
    criterion = args.criterion          # type of loss
    auto_adj = args.auto_adj            # whether to adjust frequencies automatically, e.g. drop the top-20 words
    ckpt = args.ckpt                    # path to a checkpoint
    device = torch.device('cpu')

    # load the dataset and tokenize it
    docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                        rebuild=rebuild, use_tfidf=False)
    if auto_adj:
        no_above = docSet.topk_dfs(topk=20)
        docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                            rebuild=rebuild, use_tfidf=False)

    voc_size = docSet.vocabsize
    print('voc size:', voc_size)

    if ckpt:  # load the checkpoint and resume training
        checkpoint = torch.load(ckpt)
        # `param` is expected to hold the saved model hyperparameters
        # (defined elsewhere in the original script)
        param.update({"device": device})
        model = GSM(**param)
        model.train(train_data=docSet, batch_size=batch_size, test_data=docSet,
                    num_epochs=num_epochs, log_every=10, beta=1.0,
                    criterion=criterion, ckpt=checkpoint)
    else:  # initialize the model and start training
        model = GSM(bow_dim=voc_size, n_topic=n_topic, taskname=taskname, device=device)
        model.train(train_data=docSet, batch_size=batch_size, test_data=docSet,
                    num_epochs=num_epochs, log_every=10, beta=1.0, criterion=criterion)
    model.evaluate(test_data=docSet)  # evaluate with the trained model

    # save the model, features, statistics, and other results
    save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    torch.save(model.vae.state_dict(), save_name)
    txt_lst, embeds = model.get_embed(train_data=docSet, num=1000)
    with open('topic_dist_gsm.txt', 'w', encoding='utf-8') as wfp:
        for t, e in zip(txt_lst, embeds):
            wfp.write(f'{e}:{t}\n')
    pickle.dump({'txts': txt_lst, 'embeds': embeds}, open('gsm_embeds.pkl', 'wb'))
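# All main() variants above read their settings from a module-level `args`.
# A minimal sketch of the argparse setup they imply; the flag names mirror
# the `args.*` attributes used above, while the defaults are assumptions:
import argparse

parser = argparse.ArgumentParser('GSM topic model')
parser.add_argument('--taskname', type=str, required=True, help='name of the dataset')
parser.add_argument('--no_below', type=int, default=5, help='filter words with document frequency below this count')
parser.add_argument('--no_above', type=float, default=0.3, help='filter words with document frequency above this ratio')
parser.add_argument('--num_epochs', type=int, default=100, help='number of training epochs')
parser.add_argument('--n_topic', type=int, default=20, help='number of topics')
parser.add_argument('--bkpt_continue', action='store_true', help='resume from a previous checkpoint')
parser.add_argument('--use_tfidf', action='store_true', help='use TF-IDF as the BOW input')
parser.add_argument('--rebuild', action='store_true', help='rebuild the corpus')
parser.add_argument('--batch_size', type=int, default=512, help='batch size')
parser.add_argument('--criterion', type=str, default='cross_entropy', help='type of loss')
parser.add_argument('--auto_adj', action='store_true', help='drop the top-20 most frequent words')
parser.add_argument('--ckpt', type=str, default=None, help='path to a checkpoint')
args = parser.parse_args()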