def get_negatives(all_contexts, corpus, K):
    """Sample noise ("negative") entries for every list of context entries.

    Args:
        all_contexts: Iterable of context collections; each one must support
            ``len()`` and ``in``.
        corpus: Passed to ``d2l.count_corpus`` to obtain per-entry counts.
            NOTE(review): the counter is indexed by ``0 .. len(counter) - 1``,
            so the corpus presumably holds integer token indices -- confirm
            against the caller.
        K: Number of noise samples to draw per context entry.

    Returns:
        A list with one list of noise samples per element of
        ``all_contexts``, each of length ``len(contexts) * K`` (for an
        integer ``K``).
    """
    freq = d2l.count_corpus(corpus)
    # Sampling weight of index i is its count raised to the power 0.75.
    weights = [freq[idx]**0.75 for idx in range(len(freq))]
    sampler = RandomGenerator(weights)

    def sample_for(contexts):
        # Keep drawing until K noise samples per context entry are collected.
        wanted = len(contexts) * K
        picked = []
        while len(picked) < wanted:
            candidate = sampler.draw()
            # A noise sample must not be one of the context entries.
            if candidate not in contexts:
                picked.append(candidate)
        return picked

    return [sample_for(contexts) for contexts in all_contexts]
def subsampling(sentences, vocab):
    """Randomly drop tokens, discarding frequent ones with higher probability.

    Each token is kept when a uniform draw from [0, 1] is below
    ``sqrt(1e-4 / count(token) * num_tokens)``, so tokens whose relative
    frequency is at most 1e-4 are always kept.

    Args:
        sentences: Iterable of token sequences.
        vocab: Object supporting ``vocab[token]`` (token to index) and
            ``vocab.idx_to_token[index]`` (index to token).

    Returns:
        A list of token lists, one per input sentence, containing only the
        tokens that survived subsampling (after the vocabulary round trip).
    """
    # Round-trip every token through the vocabulary (token -> index -> token).
    # Per the original note this maps low frequency words into <unk>;
    # that relies on how ``vocab`` resolves such tokens.
    normalized = []
    for line in sentences:
        normalized.append([vocab.idx_to_token[vocab[tk]] for tk in line])

    # Count the frequency of each (normalized) token.
    freq = d2l.count_corpus(normalized)
    total = sum(freq.values())

    subsampled = []
    for line in normalized:
        survivors = []
        for tk in line:
            # Draw first, then compute the keep threshold for this token.
            draw = random.uniform(0, 1)
            if draw < math.sqrt(1e-4 / freq[tk] * total):
                survivors.append(tk)
        subsampled.append(survivors)
    return subsampled