def test_train(self):
    seterr(divide='raise', over='raise', invalid='raise')

    gsm = GSM(1, 10)
    gsm.initialize(method='cauchy')

    samples = gsm.sample(5000)

    mog = MoGaussian(num_components=10)
    mog.initialize(method='laplace')
    mog.train(samples, 100)
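# The seterr call above makes NumPy raise FloatingPointError on division by
# zero, overflow, and invalid operations instead of just warning, so numerical
# problems inside training fail the test loudly. A standalone illustration
# (a sketch, not part of the test suite):
from numpy import seterr, array

seterr(divide='raise')
try:
    array([1.]) / 0.
except FloatingPointError:
    print('caught division by zero')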
def test_loglikelihood(self):
    """
    Tests whether 1-dimensional GSMs are normalized. Tests the
    log-likelihood of several instantiations of the GSM.
    """

    # check whether the log-likelihood of 1D GSMs is normalized
    for num_scales in [1, 2, 3, 4, 5]:
        model = GSM(1, num_scales=num_scales)

        # implied probability density of model
        pdf = lambda x: exp(model.loglikelihood(array(x).reshape(1, -1)))

        # compute normalization constant and upper bound on error
        partf, err = integrate.quad(pdf, -inf, inf)

        self.assertTrue(partf - err <= 1.)
        self.assertTrue(partf + err >= 1.)

    # test the log-likelihood of a couple of GSMs
    for dim in [1, 2, 3, 4, 5]:
        for num_scales in [1, 2, 3, 4, 5]:
            # create Gaussian scale mixture
            model = GSM(dim, num_scales=num_scales)
            scales = model.scales.reshape(-1, 1)

            # create random data
            data = randn(model.dim, 100)

            # evaluate likelihood
            ll = logmeanexp(
                -0.5 * sum(square(data), 0) / square(scales)
                - model.dim * log(scales)
                - model.dim / 2. * log(2. * pi), 0)
            self.assertTrue(all(abs(ll - model.loglikelihood(data)) < 1E-6))

            # random scales
            scales = rand(num_scales, 1) + 0.5
            model.scales[:] = scales.flatten()

            # sample data from model
            data = model.sample(100)

            # evaluate likelihood
            ll = logmeanexp(
                -0.5 * sum(square(data), 0) / square(scales)
                - model.dim * log(scales)
                - model.dim / 2. * log(2. * pi), 0)
            self.assertTrue(all(abs(ll - model.loglikelihood(data)) < 1E-6))
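# The test above relies on a `logmeanexp` helper that averages mixture
# components in log space. A minimal sketch of such a helper, assuming it
# behaves like logsumexp minus the log of the number of averaged elements
# (an illustration, not the tested library's actual implementation):
from numpy import log, asarray
from scipy.special import logsumexp

def logmeanexp_sketch(x, axis=None):
    """Numerically stable log of the mean of exp(x) along `axis`."""
    x = asarray(x)
    n = x.size if axis is None else x.shape[axis]
    return logsumexp(x, axis=axis) - log(n)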
def test_logjacobian(self):
    ica = ICA(4)

    # standard normal distribution
    gauss = GSM(4, 1)
    gauss.scales[0] = 1.

    # generate test data
    samples = ica.sample(100)

    mg = MarginalGaussianization(ica)

    # after Gaussianization, samples should be Gaussian distributed
    loglik_ica = ica.loglikelihood(samples)
    loglik_gauss = gauss.loglikelihood(mg(samples)) + mg.logjacobian(samples)

    dist = abs(loglik_ica - loglik_gauss)

    self.assertTrue(all(dist < 1E-6))
def main():
    global args
    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    num_epochs = args.num_epochs
    n_topic = args.n_topic
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue
    use_tfidf = args.use_tfidf
    rebuild = args.rebuild
    batch_size = args.batch_size
    criterion = args.criterion
    auto_adj = args.auto_adj
    show_topics = args.show_topics
    device = torch.device('cpu')

    docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                        rebuild=rebuild, use_tfidf=False)
    if auto_adj:
        no_above = docSet.topk_dfs(topk=20)
        docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                            rebuild=rebuild, use_tfidf=False)

    voc_size = docSet.vocabsize
    print('voc size:', voc_size)

    model = GSM(bow_dim=voc_size, n_topic=n_topic, taskname=taskname, device=device)
    if bkpt_continue:
        path = os.listdir('./ckpt')[0]
        checkpoint = torch.load(os.path.join('./ckpt', path))
        model.vae.load_state_dict(checkpoint)

    model.train(train_data=docSet, batch_size=batch_size, test_data=docSet,
                num_epochs=num_epochs, log_every=10, beta=1.0, criterion=criterion)
    model.evaluate(test_data=docSet)

    if show_topics:
        with open(f'./result/{taskname}_ep{num_epochs}.txt', 'w') as f:
            for topic in model.show_topic_words():
                print(topic, file=f)

    save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    torch.save(model.vae.state_dict(), save_name)
def test_sample(self):
    """
    Compares model density with histogram obtained from samples.
    """
    model = GSM(1, 3)
    model.scales = array([1., 3., 8.])

    data = model.sample(50000)

    try:
        hist, x = histogram(data, 100, density=True)
    except TypeError:
        # fall back to the deprecated keyword with older versions of NumPy
        hist, x = histogram(data, 100, normed=True)
    x = (x[1:] + x[:-1]) / 2.

    pdf = exp(model.loglikelihood(x.reshape(1, -1)))

    self.assertTrue(all(abs(pdf - hist) < 1E-1))
def test_train(self):
    """
    Tests whether training can recover parameters.
    """
    for dim in [1, 2, 3]:
        gsm1 = GSM(dim, 2)
        gsm1.scales = array([0.5, 4.])

        data = gsm1.sample(20000)

        gsm2 = GSM(dim, 2)
        gsm2.gamma = 0.
        gsm2.train(data, max_iter=100)

        self.assertTrue(any(abs(gsm1.scales[0] - gsm2.scales) < 1E-1))
        self.assertTrue(any(abs(gsm1.scales[1] - gsm2.scales) < 1E-1))
def test_inverse(self):
    """
    Make sure inverse Gaussianization is inverse to Gaussianization.
    """
    gsm = GSM(3, 10)
    gsm.initialize('cauchy')

    # generate test data
    samples = gsm.sample(100)

    rg = RadialGaussianization(gsm)

    # reconstructed samples
    samples_ = rg.inverse(rg(samples))

    # distance between norm and reconstructed norm
    dist = abs(sqrt(sum(square(samples_))) - sqrt(sum(square(samples))))

    self.assertTrue(all(dist < 1E-6))

    ###
    # test one-dimensional GSM
    gsm = GSM(1, 7)
    gsm.initialize('cauchy')

    # generate test data
    samples = gsm.sample(100)

    rg = RadialGaussianization(gsm)

    # reconstructed samples
    samples_rg = rg.inverse(rg(samples))

    # distance between norm and reconstructed norm
    dist = abs(sqrt(sum(square(samples_rg))) - sqrt(sum(square(samples))))

    self.assertTrue(all(dist < 1E-6))
def main():
    global args
    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    num_epochs = args.num_epochs
    n_topic = args.n_topic
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue
    use_tfidf = args.use_tfidf
    rebuild = args.rebuild
    batch_size = args.batch_size
    criterion = args.criterion
    auto_adj = args.auto_adj
    device = torch.device('cuda')

    docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                        rebuild=rebuild, use_tfidf=False)
    if auto_adj:
        no_above = docSet.topk_dfs(topk=20)
        docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                            rebuild=rebuild, use_tfidf=False)

    voc_size = docSet.vocabsize
    print('voc size:', voc_size)

    model = GSM(bow_dim=voc_size, n_topic=n_topic, taskname=taskname, device=device)
    model.train(train_data=docSet, batch_size=batch_size, test_data=docSet,
                num_epochs=num_epochs, log_every=10, beta=1.0, criterion=criterion)
    model.evaluate(test_data=docSet)

    save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    torch.save(model.vae.state_dict(), save_name)

    txt_lst, embeds = model.get_embed(train_data=docSet, num=1000)
    with open('topic_dist_gsm.txt', 'w', encoding='utf-8') as wfp:
        for t, e in zip(txt_lst, embeds):
            wfp.write(f'{e}:{t}\n')
    pickle.dump({'txts': txt_lst, 'embeds': embeds}, open('gsm_embeds.pkl', 'wb'))
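# The pickled embeddings written above can be loaded back for later analysis
# (a usage sketch; the file name matches the dump above):
import pickle

with open('gsm_embeds.pkl', 'rb') as rfp:
    saved = pickle.load(rfp)
txt_lst, embeds = saved['txts'], saved['embeds']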
def test_logjacobian(self):
    isa = ISA(4, 4, 2)

    # standard normal distribution
    gauss = GSM(4, 1)
    gauss.scales[0] = 1.

    # generate test data
    samples = isa.sample(100)

    sg = SubspaceGaussianization(isa)

    # after Gaussianization, samples should be Gaussian distributed
    loglik_isa = isa.loglikelihood(samples)
    loglik_gauss = gauss.loglikelihood(sg(samples)) + sg.logjacobian(samples)

    dist = abs(loglik_isa - loglik_gauss)

    self.assertTrue(all(dist < 1E-6))

    ###
    # test ICA
    isa = ISA(3, 3, 1)

    # standard normal distribution
    gauss = GSM(3, 1)
    gauss.scales[0] = 1.

    # generate test data
    samples = isa.sample(100)

    sg = SubspaceGaussianization(isa)

    # after Gaussianization, samples should be Gaussian distributed
    loglik_isa = isa.loglikelihood(samples)
    loglik_gauss = gauss.loglikelihood(sg(samples)) + sg.logjacobian(samples)

    dist = abs(loglik_isa - loglik_gauss)

    self.assertTrue(all(dist < 1E-6))
def main():
    global args
    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    num_epochs = args.num_epochs
    n_topic = args.n_topic
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue
    use_tfidf = args.use_tfidf
    rebuild = args.rebuild
    batch_size = args.batch_size
    criterion = args.criterion
    use_fc1 = args.use_fc1  # TBD_fc1
    auto_adj = args.auto_adj
    device = torch.device('cuda')

    docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                        rebuild=rebuild, use_tfidf=False)
    if auto_adj:
        no_above = docSet.topk_dfs(topk=20)
        docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                            rebuild=rebuild, use_tfidf=False)

    voc_size = docSet.vocabsize
    print('voc size:', voc_size)

    model = GSM(bow_dim=voc_size, n_topic=n_topic, taskname=taskname,
                device=device, use_fc1=use_fc1)  # TBD_fc1
    model.train(train_data=docSet, batch_size=batch_size, test_data=docSet,
                num_epochs=num_epochs, log_every=10, beta=1.0, criterion=criterion)
    model.evaluate(test_data=docSet)

    save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    torch.save(model.vae.state_dict(), save_name)
def test_energy_gradient(self):
    """
    Tests whether the energy gradient is similar to a numerical gradient.
    """
    step_size = 1E-5

    model = GSM(3, num_scales=7)
    model.initialize('laplace')

    # samples and true gradient
    X = model.sample(100)
    G = model.energy_gradient(X)

    # numerical gradient
    N = zeros(G.shape)
    for i in range(N.shape[0]):
        d = zeros(X.shape)
        d[i] = step_size
        N[i] = (model.energy(X + d) - model.energy(X - d)) / (2. * step_size)

    # test consistency of energy and gradient
    self.assertTrue(all(abs(G - N) < 1E-5))
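# The central-difference scheme used above generalizes to any scalar energy
# function. A self-contained sketch; the `energy` callable and its
# (dim, num_samples) -> (num_samples,) signature are illustrative assumptions,
# not part of the tested model:
import numpy as np

def numerical_gradient(energy, X, step_size=1e-5):
    """Central-difference approximation of the gradient of `energy` at `X`."""
    N = np.zeros_like(X)
    for i in range(X.shape[0]):
        d = np.zeros_like(X)
        d[i] = step_size
        # perturb one input dimension at a time, across all samples
        N[i] = (energy(X + d) - energy(X - d)) / (2. * step_size)
    return N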
def test_logjacobian(self):
    """
    Test log-Jacobian.
    """
    gsm = GSM(3, 10)
    gsm.initialize('cauchy')

    # standard normal distribution
    gauss = GSM(3, 1)
    gauss.scales[0] = 1.

    # generate test data
    samples = gsm.sample(100)

    rg = RadialGaussianization(gsm)

    # after Gaussianization, samples should be Gaussian distributed
    loglik_gsm = gsm.loglikelihood(samples)
    loglik_gauss = gauss.loglikelihood(rg(samples)) + rg.logjacobian(samples)

    dist = abs(loglik_gsm - loglik_gauss)

    self.assertTrue(all(dist < 1E-6))

    ###
    # test one-dimensional Gaussian
    gsm = GSM(1, 10)
    gsm.initialize('cauchy')

    # standard normal distribution
    gauss = GSM(1, 1)
    gauss.scales[0] = 1.

    # generate test data
    samples = gsm.sample(100)

    rg = RadialGaussianization(gsm)

    # after Gaussianization, samples should be Gaussian distributed
    loglik_gsm = gsm.loglikelihood(samples)
    loglik_gauss = gauss.loglikelihood(rg(samples)) + rg.logjacobian(samples)

    dist = abs(loglik_gsm - loglik_gauss)

    self.assertTrue(all(dist < 1E-6))
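# The test_logjacobian variants above all check the same change-of-variables
# identity: if z = f(x) maps the model's density to a standard normal, then
# log p_x(x) = log p_z(f(x)) + log|det df/dx|. A toy sketch with a scalar
# affine map, where everything is known in closed form (illustrative only):
import numpy as np
from scipy.stats import norm

x = np.random.randn(100) * 3. + 2.   # x ~ N(2, 3^2)
f = lambda x: (x - 2.) / 3.          # maps x to a standard normal
logjacobian = np.log(1. / 3.)        # log|df/dx|, constant for an affine map

loglik_x = norm.logpdf(x, loc=2., scale=3.)
loglik_z = norm.logpdf(f(x)) + logjacobian

assert np.allclose(loglik_x, loglik_z)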
def fetch_report_data():
    C = config['fetch_report_data']
    logger.info('start fetching report data')
    res = sshexec(C['host'], C['username'], C['cmd'])
    if not res:
        logger.error(res)
        logger.error(
            'no output returned from {0}@{1}, {2}, possible communication '
            'error with remote host'.format(C['username'], C['host'], C['cmd']))
        return

    data = json.loads(res[0])
    # logger.debug(data)  # too verbose

    changed = False  # flags whether anything has changed
    gsm_objs = []
    for path in sorted(data.keys()):
        for gse in sorted(data[path].keys()):
            # _: ignore the `created` flag
            gse_obj, _ = GSE.objects.get_or_create(name=gse)
            for species in sorted(data[path][gse].keys()):
                # homo_sapiens => Homo sapiens
                species_name = species.replace('_', ' ').capitalize()
                species_obj, _ = Species.objects.get_or_create(name=species_name)
                for gsm in sorted(data[path][gse][species].keys()):
                    status = data[path][gse][species][gsm]['status']
                    kwargs = dict(name=gsm, gse=gse_obj, species=species_obj,
                                  path=os.path.join(path, gse, species, gsm),
                                  status=status)
                    try:
                        gsm_obj = GSM.objects.get(gse=gse_obj, name=gsm)
                        if gsm_obj.status != status:
                            # need to do some update
                            logger.info('Updating {0}-{1}: {2} => {3}'.format(
                                gse, gsm, gsm_obj.status, status))
                            for key, value in kwargs.items():
                                setattr(gsm_obj, key, value)
                            gsm_obj.save()
                            changed = True
                    except GSM.DoesNotExist:
                        logger.info('Creating {0}-{1}'.format(gse, gsm))
                        gsm_obj = GSM(**kwargs)
                        gsm_objs.append(gsm_obj)

    if gsm_objs:
        GSM.objects.bulk_create(gsm_objs)
        changed = True

    # update each GSE's passed flag based on the status of its GSMs
    gses = GSE.objects.all()
    for gse in gses:
        # passed_gsms = gse.gsm_set.filter(status='passed')
        running_gsms = gse.gsm_set.filter(status='running')
        queued_gsms = gse.gsm_set.filter(status='queued')
        failed_gsms = gse.gsm_set.filter(status='failed')
        none_gsms = gse.gsm_set.filter(status='none')
        if (running_gsms.count() == 0
                and queued_gsms.count() == 0
                and failed_gsms.count() == 0
                and none_gsms.count() == 0):
            if not gse.passed:
                gse.passed = True
                gse.save()
                changed = True
        else:
            # adapt to manual changes, e.g. adding or removing GSMs after
            # the GSE has once passed
            if gse.passed:
                gse.passed = False
                gse.save()
                changed = True

    if changed:
        logger.info('updating memcaches for all GSEs')
        update_cache_all_gses()
        logger.info('updating memcaches for passed GSEs')
        update_cache_passed_gses()
        logger.info('updating memcaches for not passed GSEs')
        update_cache_not_passed_gses()
        logger.info('updating memcaches for stats')
        update_cache_stats()
    else:
        logger.info('nothing changed')
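# The four count() == 0 checks above could be collapsed into one queryset.
# A sketch of an equivalent condition, assuming 'passed', 'running', 'queued',
# 'failed' and 'none' are the only possible statuses:
all_passed = not gse.gsm_set.exclude(status='passed').exists()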
def main():
    global args
    train_data_file = args.train_data_file
    test_data_file = args.test_data_file
    no_below = args.no_below
    no_above = args.no_above
    num_epochs = args.num_epochs
    n_topic = args.n_topic
    hidden_size = args.hidden_size
    learning_rate = args.learning_rate
    log_every = args.log_every
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue
    use_tfidf = args.use_tfidf
    rebuild = args.rebuild
    batch_size = args.batch_size
    auto_adj = args.auto_adj
    ckpt = args.ckpt
    device = torch.device('cuda')

    trainSet = DocDataset(train_data_file, no_below=no_below, no_above=no_above,
                          rebuild=rebuild, use_tfidf=False)
    testSet = DocDataset(test_data_file, no_below=no_below, no_above=no_above,
                         rebuild=rebuild, use_tfidf=False)
    # if auto_adj:
    #     no_above = docSet.topk_dfs(topk=20)
    #     docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
    #                         rebuild=rebuild, use_tfidf=False)

    voc_size = trainSet.vocabsize
    print('train voc size:', voc_size)
    print("train:", type(trainSet), len(trainSet))
    print("test:", type(testSet), len(testSet))

    if ckpt:
        checkpoint = torch.load(ckpt)
        # `param` is expected to hold the saved model hyperparameters
        # (defined elsewhere in the original script)
        param.update({"device": device})
        model = GSM(**param)
        model.train(train_data=trainSet, test_data=testSet, batch_size=batch_size,
                    learning_rate=learning_rate, num_epochs=num_epochs,
                    log_every=log_every, ckpt=checkpoint)
    else:
        model = GSM(bow_dim=voc_size, n_topic=n_topic,
                    hidden_size=hidden_size, device=device)
        model.train(train_data=trainSet, test_data=testSet, batch_size=batch_size,
                    learning_rate=learning_rate, num_epochs=num_epochs,
                    log_every=log_every)
    # model.evaluate(test_data=docSet)

    # NOTE: the original used an undefined `taskname` here; deriving it from
    # the training file name is an assumption
    taskname = os.path.splitext(os.path.basename(train_data_file))[0]
    save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    torch.save(model.vae.state_dict(), save_name)

    txt_lst, embeds = model.get_embed(train_data=trainSet, num=1000)
    with open('topic_dist_gsm.txt', 'w', encoding='utf-8') as wfp:
        for t, e in zip(txt_lst, embeds):
            wfp.write(f'{e}:{t}\n')
    pickle.dump({'txts': txt_lst, 'embeds': embeds}, open('gsm_embeds.pkl', 'wb'))
def main():
    global args
    taskname = args.taskname            # name of the dataset
    no_below = args.no_below            # words with document frequency below this threshold are filtered out
    no_above = args.no_above            # words with document frequency above this threshold are filtered out
    num_epochs = args.num_epochs        # number of training epochs
    n_topic = args.n_topic              # number of topics
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue  # whether to resume training from a previous checkpoint
    use_tfidf = args.use_tfidf          # whether to use TF-IDF as the BOW input
    rebuild = args.rebuild              # whether to rebuild the corpus; off by default
    batch_size = args.batch_size        # batch size
    criterion = args.criterion          # type of loss
    auto_adj = args.auto_adj            # whether to adjust frequencies automatically, e.g. drop the top-20 words
    ckpt = args.ckpt                    # path to a checkpoint
    device = torch.device('cpu')

    # load the dataset and tokenize it
    docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                        rebuild=rebuild, use_tfidf=False)
    if auto_adj:
        no_above = docSet.topk_dfs(topk=20)
        docSet = DocDataset(taskname, no_below=no_below, no_above=no_above,
                            rebuild=rebuild, use_tfidf=False)

    voc_size = docSet.vocabsize
    print('voc size:', voc_size)

    if ckpt:  # load the checkpoint and resume training
        checkpoint = torch.load(ckpt)
        # `param` is expected to hold the saved model hyperparameters
        # (defined elsewhere in the original script)
        param.update({"device": device})
        model = GSM(**param)
        model.train(train_data=docSet, batch_size=batch_size, test_data=docSet,
                    num_epochs=num_epochs, log_every=10, beta=1.0,
                    criterion=criterion, ckpt=checkpoint)
    else:  # initialize the model and start training
        model = GSM(bow_dim=voc_size, n_topic=n_topic, taskname=taskname, device=device)
        model.train(train_data=docSet, batch_size=batch_size, test_data=docSet,
                    num_epochs=num_epochs, log_every=10, beta=1.0, criterion=criterion)
    model.evaluate(test_data=docSet)  # evaluate with the trained model

    # save the model, features, statistics, and other results
    save_name = f'./ckpt/GSM_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    torch.save(model.vae.state_dict(), save_name)
    txt_lst, embeds = model.get_embed(train_data=docSet, num=1000)
    with open('topic_dist_gsm.txt', 'w', encoding='utf-8') as wfp:
        for t, e in zip(txt_lst, embeds):
            wfp.write(f'{e}:{t}\n')
    pickle.dump({'txts': txt_lst, 'embeds': embeds}, open('gsm_embeds.pkl', 'wb'))
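# All main() variants above read their settings from a module-level `args`.
# A minimal sketch of the argparse setup they imply; the flag names mirror
# the `args.*` attributes used above, while the defaults are assumptions:
import argparse

parser = argparse.ArgumentParser('GSM topic model')
parser.add_argument('--taskname', type=str, required=True, help='name of the dataset')
parser.add_argument('--no_below', type=int, default=5, help='filter words with document frequency below this count')
parser.add_argument('--no_above', type=float, default=0.3, help='filter words with document frequency above this ratio')
parser.add_argument('--num_epochs', type=int, default=100, help='number of training epochs')
parser.add_argument('--n_topic', type=int, default=20, help='number of topics')
parser.add_argument('--bkpt_continue', action='store_true', help='resume from a previous checkpoint')
parser.add_argument('--use_tfidf', action='store_true', help='use TF-IDF as the BOW input')
parser.add_argument('--rebuild', action='store_true', help='rebuild the corpus')
parser.add_argument('--batch_size', type=int, default=512, help='batch size')
parser.add_argument('--criterion', type=str, default='cross_entropy', help='type of loss')
parser.add_argument('--auto_adj', action='store_true', help='drop the top-20 most frequent words')
parser.add_argument('--ckpt', type=str, default=None, help='path to a checkpoint')
args = parser.parse_args()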