# Imports assumed by all of the snippets below; `move`, `utils`, and
# `dirichlet_likelihood` are project-local helpers (sketched further down).
import numpy as np

import chainer
import chainer.functions as F
from chainer import Variable
from tqdm import tqdm


def fit_partial(self, rsty_ids, raut_ids, rwrd_ids, window=5):
    sty_ids, aut_ids, wrd_ids = move(self.xp, rsty_ids, raut_ids, rwrd_ids)
    pivot_idx = next(move(self.xp, rwrd_ids[window: -window]))
    pivot = F.embed_id(pivot_idx, self.sampler.W)
    sty_at_pivot = rsty_ids[window: -window]
    aut_at_pivot = raut_ids[window: -window]
    sty = self.mixture_sty(next(move(self.xp, sty_at_pivot)))
    aut = self.mixture_aut(next(move(self.xp, aut_at_pivot)))
    loss = 0.0
    start, end = window, rwrd_ids.shape[0] - window
    context = sty + aut + F.dropout(pivot, self.dropout_ratio)
    for frame in range(-window, window + 1):
        # Skip predicting the current pivot
        if frame == 0:
            continue
        # Predict word given context and pivot word
        # The target starts before the pivot
        targetidx = rwrd_ids[start + frame: end + frame]
        sty_at_target = rsty_ids[start + frame: end + frame]
        aut_at_target = raut_ids[start + frame: end + frame]
        sty_is_same = sty_at_target == sty_at_pivot
        aut_is_same = aut_at_target == aut_at_pivot
        # Randomly drop out words (default is to never do this)
        rand = np.random.uniform(0, 1, sty_is_same.shape[0])
        mask = (rand > self.word_dropout_ratio).astype('bool')
        sty_and_aut_are_same = np.logical_and(sty_is_same, aut_is_same)
        weight = np.logical_and(sty_and_aut_are_same, mask).astype('int32')
        # If weight is 1, keep targetidx; if weight is 0, use -1
        # (the ignore label for negative sampling)
        targetidx = targetidx * weight + -1 * (1 - weight)
        target, = move(self.xp, targetidx)
        loss = self.sampler(context, target)
        loss.backward()
    return loss.data

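# Every snippet here pulls inputs through a `move` helper that ships raw
# numpy arrays to the current device (numpy or cupy via self.xp) and wraps
# them as Chainer Variables, yielding one Variable per argument -- hence the
# `next(move(...))` and `target, = move(...)` call patterns. A minimal
# sketch, assuming integer arrays are ids and everything else is float data;
# the real helper's dtype handling may differ:
def move(xp, *args):
    for arg in args:
        if 'int' in str(arg.dtype):
            # ids fed to embed_id / negative sampling must stay integral
            yield Variable(xp.asarray(arg, dtype='int32'))
        else:
            yield Variable(xp.asarray(arg, dtype='float32'))
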
def fit_partial(self, rdoc_ids, rword_indices, window=5):
    doc_ids, word_indices = move(self.xp, rdoc_ids, rword_indices)
    pivot_idx = next(move(self.xp, rword_indices[window:-window]))
    pivot = F.embed_id(pivot_idx, self.sampler.W)
    doc_at_pivot = rdoc_ids[window:-window]
    doc = self.mixture(next(move(self.xp, doc_at_pivot)))
    loss = 0.0
    start, end = window, rword_indices.shape[0] - window
    context = (F.dropout(doc, self.dropout_ratio) +
               F.dropout(pivot, self.dropout_ratio))
    for frame in range(-window, window + 1):
        # Skip predicting the current pivot
        if frame == 0:
            continue
        # Predict word given context and pivot word
        # The target starts before the pivot
        targetidx = rword_indices[start + frame:end + frame]
        doc_at_target = rdoc_ids[start + frame:end + frame]
        doc_is_same = doc_at_target == doc_at_pivot
        rand = np.random.uniform(0, 1, doc_is_same.shape[0])
        mask = (rand > self.word_dropout_ratio).astype('bool')
        weight = np.logical_and(doc_is_same, mask).astype('int32')
        # If weight is 1.0 then targetidx
        # If weight is 0.0 then -1
        targetidx = targetidx * weight + -1 * (1 - weight)
        target, = move(self.xp, targetidx)
        loss = self.sampler(context, target)
        loss.backward()
    return loss.data

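# Tiny worked example of the masking trick above, with illustrative values:
#   targetidx = [28, 9, 2094]    candidate context words
#   weight    = [ 1, 0,    1]    0 where the target crosses a document
#                                boundary or was randomly dropped
#   targetidx * weight + -1 * (1 - weight) = [28, -1, 2094]
# Chainer's negative sampling treats -1 as its ignore label, so the masked
# positions contribute neither loss nor gradient.
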
def fit_partial(self, rsty_ids, raut_ids, rwrd_ids, window=5):
    sty_ids, aut_ids, wrd_ids = move(self.xp, rsty_ids, raut_ids, rwrd_ids)
    pivot_idx = next(move(self.xp, rwrd_ids[window: -window]))
    pivot = F.embed_id(pivot_idx, self.sampler.W)
    sty_at_pivot = rsty_ids[window: -window]
    aut_at_pivot = raut_ids[window: -window]
    sty = self.mixture_sty(next(move(self.xp, sty_at_pivot)))
    # aut = self.mixture_aut(next(move(self.xp, aut_at_pivot)))
    loss = 0.0
    start, end = window, rwrd_ids.shape[0] - window
    context = F.dropout(pivot, self.dropout_ratio)  # + aut + sty
    for frame in range(-window, window + 1):
        # Skip predicting the current pivot
        if frame == 0:
            continue
        # Predict word given context and pivot word
        # The target starts before the pivot
        targetidx = rwrd_ids[start + frame: end + frame]
        sty_at_target = rsty_ids[start + frame: end + frame]
        # aut_at_target = raut_ids[start + frame: end + frame]
        sty_is_same = sty_at_target == sty_at_pivot
        # aut_is_same = aut_at_target == aut_at_pivot
        # Randomly drop out words (default is to never do this)
        rand = np.random.uniform(0, 1, sty_is_same.shape[0])
        mask = (rand > self.word_dropout_ratio).astype('bool')
        # sty_and_aut_are_same = np.logical_and(sty_is_same, aut_is_same)
        # weight = np.logical_and(sty_and_aut_are_same, mask).astype('int32')
        # If weight is 1.0 then targetidx
        # If weight is 0.0 then -1
        targetidx = targetidx  # * weight + -1 * (1 - weight)
        target, = move(self.xp, targetidx)
        loss = self.sampler(context, target)
        loss.backward()
    return loss.data

def forward(self, doc, wrd, window=5):
    doc, wrd = utils.move(self.xp, doc, wrd)
    proportions = self.proportions(doc)
    ld = dirichlet_likelihood(self.proportions.W)
    context = F.matmul(F.softmax(proportions), self.factors())
    loss = self.loss_func(context, wrd)
    return loss, ld

def forward(self, ids, bow):
    bow, ids = utils.move(self.xp, bow, ids)
    proportions = self.proportions(ids)
    ld = dirichlet_likelihood(proportions)
    doc = F.matmul(F.softmax(proportions), self.factors())
    logp = F.dropout(self.embedding(doc))
    # loss = -F.sum(bow * F.log_softmax(logp))
    sources, targets, counts = [], [], []
    lpi = F.sum(bow * F.log_softmax(logp), axis=1)
    loss = -F.sum(lpi)
    return loss, ld

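# Both forward passes above regularize the (unnormalized) document-topic
# loadings with dirichlet_likelihood. A minimal sketch of that helper,
# assuming a symmetric Dirichlet prior whose concentration alpha defaults
# to 1 / n_topics; the sign convention and exact signature are assumptions:
def dirichlet_likelihood(weights, alpha=None):
    # weights: (n_documents, n_topics) document-topic loadings
    n_topics = weights.data.shape[1]
    if alpha is None:
        alpha = 1.0 / n_topics
    log_proportions = F.log_softmax(weights)
    # log Dirichlet density up to its normalizing constant:
    # sum over documents and topics of (alpha - 1) * log(proportion)
    return F.sum((alpha - 1.0) * log_proportions)
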
def fit_partial(self, rdoc_ids, rword_indices, window=5,
                update_words=False, update_topics=True):
    """Function where all the training happens: word vector training,
    topic vector training, and updates to the topic distribution.
    """
    doc_ids, word_indices = move(self.xp, rdoc_ids, rword_indices)
    pivot_idx = next(move(self.xp, rword_indices[window: -window]))
    pivot = F.embed_id(pivot_idx, self.sampler.W)
    if not update_words:
        pivot.unchain_backward()
    doc_at_pivot = rdoc_ids[window: -window]
    doc = self.mixture(next(move(self.xp, doc_at_pivot)),
                       update_only_docs=not update_topics)
    loss = 0.0
    start, end = window, rword_indices.shape[0] - window
    context = (F.dropout(doc, self.dropout_ratio) +
               F.dropout(pivot, self.dropout_ratio))
    for frame in range(-window, window + 1):
        # Skip predicting the current pivot
        if frame == 0:
            continue
        # Predict word given context and pivot word
        # The target starts before the pivot
        targetidx = rword_indices[start + frame: end + frame]
        doc_at_target = rdoc_ids[start + frame: end + frame]
        doc_is_same = doc_at_target == doc_at_pivot
        rand = np.random.uniform(0, 1, doc_is_same.shape[0])
        mask = (rand > self.word_dropout_ratio).astype('bool')
        weight = np.logical_and(doc_is_same, mask).astype('int32')
        # If weight is 1.0 then targetidx
        # If weight is 0.0 then -1
        targetidx = targetidx * weight + -1 * (1 - weight)
        target, = move(self.xp, targetidx)
        loss = self.sampler(context, target)
        loss.backward()
    if not update_words:
        # Wipe out any gradient accumulation on word vectors
        self.sampler.W.grad *= 0.0
    return loss.data

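# A minimal sketch of how these fit_partial variants are typically driven.
# fit_partial calls loss.backward() itself, so the caller only clears
# gradients and applies the update; `utils.chunks`, `flattened`, and the
# other names here are assumptions:
from chainer import optimizers

optimizer = optimizers.Adam()
optimizer.setup(model)
for epoch in range(n_epochs):
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        model.cleargrads()
        l = model.fit_partial(d.copy(), f.copy())
        optimizer.update()
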
def fit_partial(self, rsty_ids, raut_ids, rwrd_ids, window=5):
    doc_idx, usr_idx, wrd_idx = move(self.xp, rsty_ids, raut_ids, rwrd_ids)
    pivot = self.embed(next(move(self.xp, rwrd_ids[window: -window])))
    sty_at_pivot = rsty_ids[window: -window]
    aut_at_pivot = raut_ids[window: -window]
    sty = self.mixture_stories(next(move(self.xp, sty_at_pivot)))
    aut = self.mixture_authors(next(move(self.xp, aut_at_pivot)))
    start, end = window, rwrd_ids.shape[0] - window
    context = (F.dropout(sty, self.dropout_ratio) +
               F.dropout(aut, self.dropout_ratio) +
               F.dropout(pivot, self.dropout_ratio))
    n_frame = 2 * window
    # Precompute all neg samples since they're indep of frame
    size = context.data.shape[0]
    samples = self.sampler.sampler.sample((self.n_samples * n_frame, size))
    samples = chainer.cuda.cupy.split(samples.ravel(), n_frame)
    sources = []
    targets = []
    weights = []
    for frame in range(-window, window + 1):
        # Predict word given context and pivot word
        # The target starts before the pivot
        # Skip predicting the current pivot
        if frame == 0:
            continue
        # Here we're creating a weight mask. We don't want to
        # predict tokens that are outside this document or user
        # scope.
        wrd_at_target = rwrd_ids[start + frame: end + frame]
        sty_at_target = rsty_ids[start + frame: end + frame]
        aut_at_target = raut_ids[start + frame: end + frame]
        sty_is_same = sty_at_target == sty_at_pivot
        usr_is_same = aut_at_target == aut_at_pivot
        is_same = sty_is_same & usr_is_same
        weight, = move(self.xp, is_same.astype('float32'))
        target, = move(self.xp, wrd_at_target)
        sources.append(context)
        targets.append(target)
        weights.append(weight)
        sample, = move(self.xp, samples.pop())
        targets.append(sample)
        for _ in range(self.n_samples):
            # Note that the context is now negative
            sources.append(-context)
            weights.append(weight)
    sources = F.concat(sources, axis=0)
    targets = F.concat(targets, axis=0)
    weights = F.concat(weights, axis=0)
    loss = self.loss(sources, targets, weights)
    return loss

def fit_partial(self, rdoc_ids, rword_indices, window=5):
    doc_ids, word_indices = move(self.xp, rdoc_ids, rword_indices)
    pivot = self.embed(next(move(self.xp, rword_indices[window:-window])))
    doc_at_pivot = rdoc_ids[window:-window]
    doc = self.mixture(next(move(self.xp, doc_at_pivot)))
    loss = 0.0
    start, end = window, rword_indices.shape[0] - window
    context = (F.dropout(doc, self.dropout_ratio) +
               F.dropout(pivot, self.dropout_ratio))
    n_frame = 2 * window
    # Precompute all neg samples since they're indep of frame
    size = context.data.shape[0]
    samples = self.sampler.sampler.sample((self.n_samples * n_frame, size))
    samples = chainer.cuda.cupy.split(samples.ravel(), n_frame)
    sources = []
    targets = []
    weights = []
    for frame in range(-window, window + 1):
        # Skip predicting the current pivot
        if frame == 0:
            continue
        # Predict word given context and pivot word
        # The target starts before the pivot
        targetidx = rword_indices[start + frame:end + frame]
        doc_at_target = rdoc_ids[start + frame:end + frame]
        doc_is_same = doc_at_target == doc_at_pivot
        rand = np.random.uniform(0, 1, doc_is_same.shape[0])
        mask = (rand > self.word_dropout_ratio).astype('bool')
        weight = np.logical_and(doc_is_same, mask)
        weight, = move(self.xp, weight.astype('float32'))
        target, = move(self.xp, targetidx)
        sources.append(context)
        targets.append(target)
        weights.append(weight)
        sample, = move(self.xp, samples.pop())
        targets.append(sample)
        for _ in range(self.n_samples):
            # Note that the context is now negative
            sources.append(-context)
            weights.append(weight)
    sources = F.concat(sources, axis=0)
    targets = F.concat(targets, axis=0)
    weights = F.concat(weights, axis=0)
    loss = self.loss(sources, targets, weights)
    return loss

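# The negative-sampling variants above stack positives and precomputed
# negatives into one call to self.loss. A plausible sketch of that loss
# (`self.vocab` is a hypothetical embedding holding target word vectors):
# positives score <context, word>, and because the context sign was flipped
# for negatives, a single objective softplus(-score) = -log sigmoid(score)
# also charges negatives with -log sigmoid(-<context, word>).
def loss(self, sources, targets, weights):
    vectors = F.embed_id(targets, self.vocab.W)  # target word vectors
    score = F.sum(sources * vectors, axis=1)     # signed dot products
    # weight of 0 masks out-of-document / dropped-out pairs
    return F.sum(F.softplus(-score) * weights)
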
def fit_partial(self, rdoc_ids, rword_indices, window=5,
                update_only_docs=False, word2vec_only=False,
                update_only_docs_topics=False):
    """
    Compact indices of chunk words, from `flattened`:
      (Pdb) len(rword_indices)  -> 4096, batch size
      (Pdb) rword_indices.max() -> 4874, max word compact # in this chunk

    Document ids the chunk words belong to, from `doc_ids`:
      (Pdb) len(rdoc_ids)  -> 4096, batch size
      (Pdb) rdoc_ids.max() -> 1660, max doc id in this chunk
    """
    if update_only_docs_topics:
        update_only_docs = False
    # Note that self.xp is the numpy module. Function move uses the
    # following statement to convert both rdoc_ids and rword_indices
    # into Chainer Variables:
    #   ---> yield Variable(xp.asarray(arg, dtype='float32'))
    #
    # so doc_ids and word_indices are just Variable wrappers of rdoc_ids
    # and rword_indices.
    # (Pdb) len(doc_ids.data)      -> 4096
    # (Pdb) len(word_indices.data) -> 4096
    #
    # Note that doc_ids is NOT IN USE
    doc_ids, word_indices = move(self.xp, rdoc_ids, rword_indices)
    # pivot_idx is a Variable wrapper of rword_indices[window: -window]
    # (Pdb) len(pivot_idx.data) -> 4086, note that window is 5
    pivot_idx = next(move(self.xp, rword_indices[window:-window]))
    # (Pdb) pivot.data.shape -> (4086, 300)
    # Again, batch size is 4096 while the window spans (5, -5)
    pivot = F.embed_id(pivot_idx, self.sampler.W)
    # max word compact hash# < compacted vocabulary size (4891)
    assert pivot_idx.data.max() < self.sampler.W.shape[0]
    # Note that we need to fine-tune the GoogleNews word2vec vectors: we
    # never trained word2vec on twenty_newsgroups, so context word
    # prediction does not work well at the beginning
    if update_only_docs or update_only_docs_topics:
        pivot.unchain_backward()
    # (Pdb) window -> 5
    # (Pdb) len(doc_at_pivot)  -> 4086, 10 less than rdoc_ids
    # (Pdb) doc_at_pivot.max() -> 1660
    doc_at_pivot = rdoc_ids[window:-window]
    doc = self.mixture(next(move(self.xp, doc_at_pivot)),
                       update_only_docs=update_only_docs)
    if word2vec_only:
        doc.unchain_backward()
    loss = 0.0
    # (Pdb) start -> 5
    # (Pdb) rword_indices.shape[0] -> 4096
    # (Pdb) end -> 4091
    start, end = window, rword_indices.shape[0] - window
    # (Pdb) context.data.shape -> (4086, 300)
    if not update_only_docs_topics:
        context = (F.dropout(doc, self.dropout_ratio) +
                   F.dropout(pivot, self.dropout_ratio))
    else:
        context = F.dropout(doc, self.dropout_ratio)
    # From -5 to 5: given the context vector (pivot word vector +
    # doc-topic vector), predict each target word in the window frame.
    # Note that we do this for all words in the whole batch.
    for frame in tqdm(range(-window, window + 1)):
        # Skip predicting the current pivot
        if frame == 0 and not update_only_docs_topics:
            continue
        # Predict word given context and pivot word.
        # The target starts before the pivot.
        #
        # Initial round:
        # (Pdb) start + frame -> 5 + -5 -> 0
        # (Pdb) end + frame   -> 4091 + 5 -> 4086
        #
        # Word compact indices
        targetidx = rword_indices[start + frame:end + frame]
        # Words' document IDs
        doc_at_target = rdoc_ids[start + frame:end + frame]
        # Since we flatten everything (all words from all different
        # documents now live in one array), we need to make sure we only
        # predict words in the same document.
        #
        # Note that doc_at_pivot is rdoc_ids[window/5: -window/4091],
        # and doc_at_target is rdoc_ids[0: 4086] in the starting round
        #
        # (Pdb) doc_is_same -> array([ True,  True,  True, ...,  True])
        # (Pdb) len(doc_is_same) -> 4086
        doc_is_same = doc_at_target == doc_at_pivot
        # Generate <SKIP> and OOV masks
        mask_SKIP = targetidx != np.array([0])
        mask_OOV = targetidx != np.array([1])
        assert True in mask_SKIP and True in mask_OOV
        # Generate the dropout mask
        # (Pdb) rand -> array([0.7982769 , 0.12706805, ..., 0.69266078])
        rand = np.random.uniform(0, 1, doc_is_same.shape[0])
        # (Pdb) mask -> array([ True,  True,  True, ...,  True])
        mask = (rand > self.word_dropout_ratio).astype('bool')
        # (Pdb) weight -> array([1, 1, 1, ..., 1, 1, 1], dtype=int32)
        weight = np.logical_and(doc_is_same, mask)
        weight = np.logical_and(weight, mask_SKIP)
        weight = np.logical_and(weight, mask_OOV).astype('int32')
        # targetidx = target word indices
        # If weight is 1, keep targetidx;
        # if weight is 0, use -1 (<SKIP>? => compact index 0)
        # (Pdb) targetidx -> array([  28,    9, 2094, ...,   16, 1357,   16])
        #
        # Note that this is skip-gram: pivot word -> target context words.
        # See NegativeSampling below for the ignore label -1.
        chainer_nce_ignore_label = -1
        targetidx = (targetidx * weight +
                     chainer_nce_ignore_label * (1 - weight))
        target, = move(self.xp, targetidx)
        # context (word_vec + doc-topic_vec) -> target words in context
        #
        # (Pdb) context.shape   -> (4086, 300), dtype('float32')
        # (Pdb) weight.shape    -> (4086,),     dtype('int32')
        # (Pdb) targetidx.shape -> (4086,),     dtype('int64')
        # (Pdb) target.shape    -> (4086,),     dtype('int32')
        # (Pdb) pivot_idx.shape -> (4086,),     dtype('int32')
        # (Pdb) pivot.shape     -> (4086, 300), dtype('float32')
        #
        # REF:
        # self.sampler.__call__ =
        #     negative_sampling.negative_sampling(
        #         x, t, self.W, self.sampler.sample, self.sample_size,
        #         reduce='sum')
        # here:
        #   context -> x (~chainer.Variable): input of the weight matrix
        #                                     multiplication
        #   target  -> t (~chainer.Variable): batch of ground truth labels
        #   GoogleNews embedding -> self.sampler.W.data
        #   L.NegativeSampling   -> sampler
        #
        # Returns the loss value, summed over the whole batch.
        #
        # Source (https://github.com/chainer/chainer/blob/v3.4.0/chainer/functions/loss/negative_sampling.py#L315):
        # NegativeSamplingFunction(function_node.FunctionNode):
        #     ignore_label = -1
        # target as t  -- self.sampler.W -->  w
        # context as x  OP  w  -->  loss
        # Note that (Pdb) self.sampler.W.data.shape -> (4891, 300)
        #
        # DEBUG:
        # b chainer/functions/loss/negative_sampling.py:48
        loss = self.sampler(context, target)
        loss.backward()
    if update_only_docs or update_only_docs_topics:
        # Wipe out any gradient accumulation on word vectors
        # self.sampler.W.grad *= 0.0
        self.sampler.W.cleargrad()
    if word2vec_only and self.mixture.weights.W.grad is not None:
        assert self.mixture.weights.W.grad.min() == 0.0
        assert self.mixture.weights.W.grad.max() == 0.0
    if word2vec_only and self.mixture.factors.W.grad is not None:
        assert self.mixture.factors.W.grad.min() == 0.0
        assert self.mixture.factors.W.grad.max() == 0.0
    return loss.data

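# Sketch of how the sampler referenced in the comments above is typically
# constructed: a chainer.links.NegativeSampling link whose weight matrix is
# seeded with pretrained vectors. The `counts` array and the loading code
# are assumptions:
import chainer.links as L

# inside the model's __init__:
# counts[i] = corpus frequency of compact word id i; it defines the unigram
# noise distribution (raised to the 3/4 power inside the link)
self.sampler = L.NegativeSampling(n_units, counts, sample_size=15)
# seed with pretrained embeddings, e.g. the GoogleNews vectors noted above
self.sampler.W.data[:, :] = pretrained_word_vectors
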
def observe(self, bow):
    bow, = utils.move(self.xp, bow * 1.0)
    sample, kl = self.encode(bow)
    rec = self.decode(sample, bow)
    return rec, kl

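# Minimal sketch of a training step driving observe, assuming rec is the
# reconstruction term (a negative log likelihood) and kl the KL divergence
# of the variational posterior, so rec + kl is the negative ELBO:
rec, kl = model.observe(bow_batch)
loss = rec + kl
model.cleargrads()
loss.backward()
optimizer.update()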