Example #1
def generate_ngram_naive_bayes_model(training_iter, alpha):
    labelCounts = ntorch.zeros(len(LABEL.vocab), names=("class",)).cuda()
    vocabCounts = ntorch.tensor(
        [alpha[f[0]]
         for f in NGRAMS.vocab.itos], names=("vocab", )).cuda() * ntorch.ones(
             len(LABEL.vocab), names=("class", )).cuda()
    classes = ntorch.tensor(torch.eye(len(LABEL.vocab)),
                            names=("class", "classIndex")).cuda()
    encoding = ntorch.tensor(torch.eye(len(NGRAMS.vocab)),
                             names=("vocab", "vocabIndex")).cuda()
    for batch in training_iter:
        oneHot = encoding.index_select("vocabIndex", batch.text)
        setofwords, _ = oneHot.max("ngramlen")
        classRep = classes.index_select("classIndex", batch.label.long())
        labelCounts += classRep.sum("batch")
        vocabCounts += setofwords.dot("batch", classRep)

    p = vocabCounts.get("class", 1)
    q = vocabCounts.get("class", 0)
    r = ((p * q.sum()) / (q * p.sum())).log()
    # r= (p/q).log()
    weight = r
    b = (labelCounts.get("class", 1) / labelCounts.get("class", 0)).log()

    def naive_bayes(test_batch):
        oneHotTest = encoding.index_select("vocabIndex", test_batch.cuda())
        setofwords, _ = oneHotTest.max("seqlen")
        y = (weight.dot("vocab", setofwords) + b).sigmoid()
        return (y - 0.5) * (ntorch.tensor([-1., 1.],
                                          names=("class")).cuda()) + 0.5

    return naive_bayes
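A minimal, self-contained sketch of the named-tensor pattern this example is built on: a one-hot lookup with index_select, a set-of-words indicator via max, and a named dot product to accumulate per-class counts. The sizes, tokens, and dimension names below are illustrative only; CUDA transfer and smoothing are omitted.

import torch
from namedtensor import ntorch

vocab_size, n_classes = 5, 2
# lookup tables: rows are one-hot encodings indexed by "vocabIndex" / "classIndex"
encoding = ntorch.tensor(torch.eye(vocab_size), names=("vocab", "vocabIndex"))
classes = ntorch.tensor(torch.eye(n_classes), names=("class", "classIndex"))

tokens = ntorch.tensor(torch.tensor([[0, 2, 2], [1, 4, 3]]), names=("batch", "seqlen"))
labels = ntorch.tensor(torch.tensor([0, 1]), names=("batch",))

one_hot = encoding.index_select("vocabIndex", tokens)   # dims ("vocab", "batch", "seqlen")
set_of_words, _ = one_hot.max("seqlen")                 # 1 if the word occurs in the example
class_rep = classes.index_select("classIndex", labels)  # dims ("class", "batch")
counts = set_of_words.dot("batch", class_rep)           # per-class word counts, dims ("vocab", "class")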
Example #2
	def forward(self, seq): 
		seq_len = seq.shape["seqlen"]
		batch_size = seq.shape["batch"]
		  
		pad_token = self.text.vocab.stoi["<pad>"]
		additional_padding = ntorch.ones(batch_size, self.longest_n, 
										names=("batch", "seqlen")).long().to(self.device)
		additional_padding *= pad_token
		
		seq = ntorch.cat([additional_padding, seq, additional_padding],
						dim="seqlen")
		
		
		amino_acids = self.codon_to_aa[seq.values]
		
		return_ar = ntorch.zeros(seq_len, batch_size, self.out_vocab,
								 names=("seqlen", "batch", "vocablen"))
		
		# convert to numpy to leave GPU 
		amino_acids = amino_acids.detach().cpu().numpy()
		for batch_item in range(batch_size): 
		  # start at n, end at seq_len - n
			for seq_item in range(self.longest_n, seq_len - self.longest_n):
				# Must iterate over all dictionaries
				for weight, n, ngram_dict in zip(self.weight_list,
												self.n_list, self.dict_list):
					# N gram is a 2d numpy array containing an amino acid embedding in each row
					n_gram = amino_acids[batch_item,seq_item - n : seq_item + n + 1]

					# note: return_ar is indexed in pre-padding coordinates, hence the - self.longest_n shift
					return_ar[{"seqlen" : seq_item - self.longest_n, 
							 "batch" : batch_item}] += weight * ngram_dict[str(n_gram)].float()

		return return_ar.to(self.device)
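A self-contained sketch of the padding step used above (and again in Example #3): constant "<pad>" columns are concatenated on both sides of the "seqlen" dimension. The pad index and sizes here are assumptions for illustration.

import torch
from namedtensor import ntorch

pad_token = 1   # stand-in for self.text.vocab.stoi["<pad>"]
seq = ntorch.tensor(torch.randint(0, 10, (4, 7)), names=("batch", "seqlen"))
pad = ntorch.ones(4, 2, names=("batch", "seqlen")).long() * pad_token
padded = ntorch.cat([pad, seq, pad], dim="seqlen")   # "seqlen" grows from 7 to 7 + 2 * 2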
Example #3
def make_n_gram_dict(train_iter, n, amino_acid_conversion, TEXT, AA_LABEL):
	''' Helper function to create a frequency default dictionary
	
	Args: 
		train_iter: Training bucket iterator
		n: Number of amino acids to each side of AA (e.g. 0 is unigram, 1 is trigram)
		amino_acid_conversion: index_table converting the codon index to AA index
		TEXT: torchtext field for the vocab of nucleotides
		AA_LABEL: torchtext field for amino acids

	Returns: 
		default_dict: dictionary mapping a sequence of amino acids to probability over codons
	TODO: 
		Make this faster
	'''

	default_obj = lambda : torch.tensor(np.zeros(len(TEXT.vocab.stoi)))
	default_dict = defaultdict(default_obj)

	with torch.no_grad():
		ident_mat = np.eye(len(TEXT.vocab.stoi))
		ident_mat_aa = np.eye(len(AA_LABEL.vocab))
		for i, batch in enumerate(train_iter):
			# Select for all non zero tensors
			# Use this to find all indices that aren't padding
			seq_len = batch.sequence.shape["seqlen"]
			batch_size = batch.sequence.shape["batch"]

			# Pad amino acids and seq with <pad> token 
			pad_token = TEXT.vocab.stoi["<pad>"]
			additional_padding = ntorch.ones(batch_size, n, 
											names=("batch", "seqlen")).long()
			additional_padding *= pad_token

			seq = ntorch.cat([additional_padding, batch.sequence, additional_padding],
							dim="seqlen")

			# Convert codon indices to amino-acid encodings
			amino_acids = amino_acid_conversion[seq.values].detach().cpu().numpy()
			# Note: we should assert that start and pad are treated the same
			# This is because at test time, presumably we narrow the start for the AA.. 
			if i == 0:
				assert((amino_acids[0,n] == amino_acids[0,0]).all())

			seq = seq.detach().cpu().numpy()
			# Count the middle codon for every amino-acid n-gram context
			for batch_item in range(batch_size): 
				# start at n, end at seq_len - n
				for seq_item in range(n, seq_len - n):
					# Middle token is a discrete number representing the codon (0 to 66)
					middle_token = seq[batch_item, seq_item]
					# N gram is a 2d numpy array containing an amino acid embedding in each row
					n_gram = amino_acids[batch_item,seq_item - n : seq_item + n + 1]

					default_dict[str(n_gram)][middle_token] += 1

	for key in default_dict: 
		default_dict[key] /= (default_dict[key]).sum()
			
	return default_dict
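A minimal, self-contained illustration of the frequency table this helper builds; the key and the vocabulary size are hypothetical. Each str(n_gram) key maps to a vector of counts over the codon vocabulary, normalized into a probability distribution at the end.

import numpy as np
import torch
from collections import defaultdict

codon_vocab_size = 4   # assumed size of TEXT.vocab
counts = defaultdict(lambda: torch.tensor(np.zeros(codon_vocab_size)))

key = "[[0 1 0]]"      # stand-in for str(n_gram)
counts[key][2] += 1    # saw codon 2 twice in this amino-acid context
counts[key][2] += 1
counts[key][3] += 1    # saw codon 3 once

for k in counts:
    counts[k] /= counts[k].sum()   # counts[key] is now ~[0, 0, 2/3, 1/3]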
Example #4
    def reinforce(self, premise, hypothesis, label):
        # REINFORCE
        q = self.q(premise, hypothesis, label).rename('label', 'latent')
        latent_dist = nds.Categorical(logits=q, dim_logit='latent')
        # Sample to approximate E[]
        samples = latent_dist.sample([self.num_samples], names=('samples', ))

        # Batch premises and hypotheses
        batches = defaultdict(list)
        premise_n = premise.unbind('batch')
        hypothesis_n = hypothesis.unbind('batch')

        # Get some samples
        samples_n = samples.transpose('batch', 'samples').tolist()

        # The idea is to group data points by which model each sample selected
        for i, batch in enumerate(samples_n):
            p = premise_n[i]
            h = hypothesis_n[i]
            for sample in batch:
                batches[sample].append((i, p, h))

        # Can now evaluate sampled models with batching
        batch_size = premise.shape['batch']
        counts = [0] * batch_size
        res = [None] * (self.num_samples * batch_size)

        correct = label.tolist()
        for c, items in batches.items():
            batch_p = ntorch.stack([p for _, p, _ in items], 'batch')
            batch_h = ntorch.stack([h for _, _, h in items], 'batch')
            batch_i = [i for i, _, _ in items]

            # Evaluate model per batch, then update
            preds = self.models[c](batch_p, batch_h)
            for i, log_probs in zip(batch_i, preds.unbind('batch')):
                res[self.num_samples * i + counts[i]] = log_probs.values[correct[i]]
                counts[i] += 1

        # Finally average results for sample
        res = torch.stack(res, dim=0).reshape(batch_size, self.num_samples)
        res = ntorch.tensor(res, names=(
            'batch',
            'sample',
        ))

        # Onward to estimating gradient + calculating loss
        surrogate = (latent_dist.log_prob(samples) * res.detach() +
                     res).mean('sample')
        prior = ntorch.ones(self.K, names='latent').log_softmax(dim='latent')
        prior = nds.Categorical(logits=prior, dim_logit='latent')
        KLD = nds.kl_divergence(latent_dist, prior) * self.kl_weight

        loss = (KLD - surrogate._tensor).mean()  # = -(surrogate - KLD).mean()
        elbo = (KLD.detach() - res.detach().mean('sample')._tensor).mean()
        return loss, elbo
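A minimal, self-contained sketch of the named Categorical machinery this estimator relies on: construction over a "latent" dimension, named sampling, and per-sample log-probabilities. Logits, sizes, and dimension names are illustrative, and the distribution is reached through ntorch.distributions (as in Example #11) rather than the nds alias used above.

import torch
from namedtensor import ntorch

logits = ntorch.tensor(torch.randn(5, 3), names=("batch", "latent"))
latent_dist = ntorch.distributions.Categorical(logits=logits, dim_logit="latent")

samples = latent_dist.sample([4], names=("samples",))   # dims ("samples", "batch")
log_q = latent_dist.log_prob(samples)                   # log q(c) for every drawn sample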
Example #5
    def elbo_reinforce(self, premise, hypothesis, label):
        # computing the q distribution: p(c | a, b, y)
        q = self.q(premise, hypothesis, label).rename('label', 'latent')
        latent_dist = ds.Categorical(logits=q, dim_logit='latent')

        # generating some samples
        samples = latent_dist.sample([self.sample_size], names=('samples', ))

        # bucketing samples by the sampled model to maximize efficiency
        buckets = defaultdict(list)
        premise_lst = premise.unbind('batch')
        hypothesis_lst = hypothesis.unbind('batch')

        samples_list = samples.transpose('batch', 'samples').tolist()
        for i, batch in enumerate(samples_list):
            p, h = premise_lst[i], hypothesis_lst[i]
            for sample in batch:
                buckets[sample].append((i, p, h))

        # evaluating the sampled models efficiently using batching
        orig_batch_size = premise.shape['batch']
        counts = [0] * orig_batch_size
        res = [None] * (self.sample_size * orig_batch_size)

        correct = label.tolist()
        for c, items in buckets.items():
            # stacking data points into batches
            batch_premise = ntorch.stack([p for _, p, _ in items], 'batch')
            batch_hypothesis = ntorch.stack([h for _, _, h in items], 'batch')
            ids = [i for i, _, _ in items]

            # evaluating the model on that batch
            predictions = self.models[c](batch_premise, batch_hypothesis)

            # updating the result at the appropriate index
            for i, log_probs in zip(ids, predictions.unbind('batch')):
                res[self.sample_size * i +
                    counts[i]] = log_probs.values[correct[i]]
                counts[i] += 1

        # reforming and averaging the results for each sample
        res = torch.stack(res, dim=0).reshape(orig_batch_size,
                                              self.sample_size)
        res = ntorch.tensor(res, names=('batch', 'sample'))

        # computing a surrogate objective for REINFORCE
        # https://pyro.ai/examples/svi_part_iii.html
        q_log_prob = latent_dist.log_prob(samples)
        surrogate_objective = (q_log_prob * res.detach() + res).mean('sample')

        # adding on the KL regularizing term
        ones = ntorch.ones(self.K, names='latent').log_softmax(dim='latent')
        uniform_dist = ds.Categorical(logits=ones, dim_logit='latent')
        kl = ds.kl_divergence(latent_dist, uniform_dist) * self.kl_importance

        # reporting the surrogate objective as well as the actual elbo
        loss = -(surrogate_objective - kl).mean()
        elbo = -(res.detach().mean('sample') - kl.detach()).mean()
        return loss, elbo
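For reference, the surrogate above is the standard score-function (REINFORCE) construction described at the linked Pyro tutorial: with $f(c) = \log p(y \mid a, b, c)$ and $\bar{f}(c)$ its detached copy, differentiating the per-sample term $\log q_\phi(c)\,\bar{f}(c) + f(c)$ yields $\bar{f}(c)\,\nabla_\phi \log q_\phi(c)$ for the inference network and $\nabla_\theta f(c)$ for the sampled model, so the sample mean is a Monte Carlo estimate of $\nabla\,\mathbb{E}_{c \sim q_\phi}[f(c)]$.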
Example #6
 def decode(self, premise, hypothesis):
     label = ntorch.ones(premise.shape['batch'], names=('batch', ))
     preds = 0
     for i in range(1, self.num_samples + 1):
         q = self.q(premise, hypothesis,
                    label * i).rename('label', 'latent').exp()
         for c in range(len(self.models)):
             log_probs = self.models[c](premise, hypothesis)
             preds += log_probs * q.get('latent', c) / len(self.models)
     return preds / self.num_samples, q
Example #7
def loss_function(recon_x, x, var_posterior):
    BCE = recon_x.reduce2(
        x.stack(h=("ch", "height", "width")),
        lambda x, y: F.binary_cross_entropy(x, y, reduction="sum"),
        ("batch", "x"),
    )
    prior = ndistributions.Normal(ntorch.zeros(dict(batch=1, z=1)),
                                  ntorch.ones(dict(batch=1, z=1)))
    KLD = ndistributions.kl_divergence(var_posterior, prior).sum()
    return BCE + KLD
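For reference, assuming the standard VAE setup this snippet comes from, the returned value is the negative ELBO: $\mathcal{L}(x) = -\mathbb{E}_{q(z \mid x)}[\log p(x \mid z)] + D_{\mathrm{KL}}\bigl(q(z \mid x)\,\|\,\mathcal{N}(0, I)\bigr)$, where the first term is the summed binary cross-entropy (BCE, a single-sample estimate) and the second is KLD against the standard-normal prior.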
Example #8
 def infer(self, premise, hypothesis):
     label = ntorch.ones(premise.shape['batch'], names=('batch', )).long()
     predictions = 0
     for i in range(1, 4):
         q = self.q(premise, hypothesis,
                    label * i).rename('label', 'latent').exp()
         for c in range(len(self.models)):
             log_probs = self.models[c](premise, hypothesis)
             predictions += log_probs * q.get('latent', c) / len(
                 self.models)
     return predictions / 3
Example #9
    def elbo_exact(self, premise, hypothesis, label):
        # computing the q distribution: p(c | a, b, y)
        q = self.q(premise, hypothesis, label).rename('label', 'latent')
        latent_dist = ds.Categorical(logits=q, dim_logit='latent')

        one_hot_label = torch.eye(4).index_select(0, label.values)
        one_hot_label = ntorch.tensor(one_hot_label, names=('batch', 'label'))

        # computing p(y | a, b, c) for every c
        objective = 0
        q = q.exp()
        for c in range(len(self.models)):
            log_probs = self.models[c](premise, hypothesis)
            model_probs = q.get('latent', c)
            objective += (log_probs * one_hot_label).sum('label') * model_probs

        # adding on the KL regularizing term
        ones = ntorch.ones(self.K, names='latent').log_softmax(dim='latent')
        uniform_dist = ds.Categorical(logits=ones, dim_logit='latent')

        kl = ds.kl_divergence(latent_dist, uniform_dist) * self.kl_importance
        loss = -(objective.mean() - kl.mean())
        return loss, loss.detach()
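For reference, because the latent $c$ ranges over only $K$ models, this variant marginalizes it exactly instead of sampling: $\mathrm{ELBO} = \sum_{c} q(c \mid a, b, y)\,\log p(y \mid a, b, c) - \beta\,D_{\mathrm{KL}}\bigl(q(c \mid a, b, y)\,\|\,\mathrm{Uniform}(K)\bigr)$, with $\beta$ the kl_importance weight; the returned loss is its negation averaged over the batch.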
Example #10
    def exact(self, premise, hypothesis, label):
        q = self.q(premise, hypothesis, label).rename('label', 'latent')
        latent_dist = nds.Categorical(logits=q, dim_logit='latent')

        one_hot = torch.eye(4, out=torch.cuda.FloatTensor()).index_select(
            0, label.values)
        one_hot = ntorch.tensor(one_hot, names=('batch', 'label'))

        # Calculate p(y | a, b, c) across all models K
        surrogate = 0
        q = q.exp()
        for c in range(len(self.models)):
            log_probs = self.models[c](premise, hypothesis)
            model_probs = q.get('latent', c)
            surrogate += (log_probs * one_hot).sum('label') * model_probs

        # KL regularization
        ones = ntorch.ones(self.K, names='latent').log_softmax(dim='latent')
        prior = nds.Categorical(logits=ones, dim_logit='latent')

        KLD = nds.kl_divergence(latent_dist, prior) * self.kl_weight
        loss = KLD.mean() - surrogate._tensor.mean()  # -(surrogate.mean() - kl.mean())
        return loss, loss.detach()
Example #11
    def forward(self,
                source,
                target=None,
                teacher_forcing=1.,
                max_length=20,
                encode_only=False):
        if target is not None:
            max_length = target.shape["trgSeqlen"]
        x = self.in_embedding(source)
        out, (h, c) = self.encoder(x)
        h = ntorch.cat((h[{"layers": slice(0, 1)}], h[{"layers": slice(1, 2)}]),
                       dim="rnnOutput")
        c = ntorch.cat((c[{"layers": slice(0, 1)}], c[{"layers": slice(1, 2)}]),
                       dim="rnnOutput")

        if self.attention:

            def attend(x_t):
                alpha = out.dot("rnnOutput", x_t).softmax("srcSeqlen")
                context = alpha.dot("srcSeqlen", out)
                return context

        batch_size = source.shape["batch"]
        output_dists = ntorch.zeros(
            (batch_size, max_length, self.out_vocab_size),
            names=("batch", "trgSeqlen", "outVocab"),
            device=device)
        output_seq = ntorch.zeros((batch_size, max_length),
                                  names=("batch", "trgSeqlen"),
                                  device=device)
        # for the above, should set the zeroth index to SOS

        score = ntorch.zeros((batch_size, max_length),
                             names=("batch", "trgSeqlen"),
                             device=device)

        if encode_only:
            return score, out, (h, c), output_seq

        for t in range(max_length - 1):  #Oh god
            if t == 0:
                # always start with SOS token
                next_input = ntorch.ones((batch_size, 1),
                                         names=("batch", "trgSeqlen"),
                                         device=device).long()
                next_input *= EN.vocab.stoi["<s>"]
            elif target is not None and np.random.random() < teacher_forcing:  # we will force
                next_input = target[{"trgSeqlen": slice(t, t + 1)}]
            else:
                next_input = sample

            x_t, (h, c) = self.decoder(self.out_embedding(next_input), (h, c))

            if t == 0:
                syntax_out, (s_h, s_c) = self.syntax_decoder(
                    self.out_embedding(next_input))
            else:
                syntax_out, (s_h, s_c) = self.syntax_decoder(
                    self.out_embedding(next_input), (s_h, s_c))

            if self.attention:
                fc = self.fc(ntorch.cat([attend(x_t), x_t], dim="rnnOutput"))
            else:
                fc = self.fc(x_t)

            s_fc = self.syntax_fc(syntax_out).sum("trgSeqlen")
            s_fc = s_fc.log_softmax("outVocab")

            dist = ntorch.distributions.Categorical(logits=fc,
                                                    dim_logit="outVocab")
            sample = dist.sample()

            fc = fc.sum("trgSeqlen")

            next_token = sample if target is None else target[{
                "trgSeqlen": slice(t + 1, t + 2)
            }]  #TODO

            # combine the main decoder's distribution with the syntax decoder's output
            fc = fc.log_softmax("outVocab") + s_fc

            indices = next_token.sum("trgSeqlen").rename("batch", "indices")
            batch_indices = ntorch.tensor(
                torch.tensor(np.arange(fc.shape["batch"]), device=device),
                ("batchIndices"))

            newsc = fc.index_select("outVocab", indices).index_select(
                "indices", batch_indices).get("batchIndices", 0)

            score[{"trgSeqlen": t + 1}] = newsc

            output_seq[{
                "trgSeqlen": t + 1
            }] = next_token.sum("trgSeqlen")  #todo
            output_dists[{"trgSeqlen": t + 1}] = fc  #Todo

        return output_seq, output_dists, score
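A minimal, self-contained sketch of the named dot-product attention implemented by attend() above: a named contraction gives scores over "srcSeqlen", a softmax over that dimension gives the weights, and a second contraction forms the context vector. Shapes and names are illustrative, and the query here is a single decoder state rather than a "trgSeqlen" slice.

import torch
from namedtensor import ntorch

out = ntorch.tensor(torch.randn(3, 6, 8), names=("batch", "srcSeqlen", "rnnOutput"))  # encoder states
x_t = ntorch.tensor(torch.randn(3, 8), names=("batch", "rnnOutput"))                  # decoder state

alpha = out.dot("rnnOutput", x_t).softmax("srcSeqlen")   # attention weights, dims ("batch", "srcSeqlen")
context = alpha.dot("srcSeqlen", out)                    # context vector, dims ("batch", "rnnOutput")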