def create_ngram(sentences, n):
    """Build a normalized n-gram distribution from a batch of sentences.

    Args:
        sentences: array-like of integer-coded sentences (cast to int64);
            each row is one sentence of symbol ids.
        n: order of the n-gram.

    Returns:
        The Ngram instance after calling ``norm()`` on it.
    """
    counts = Ngram(n)
    for sentence in sentences.astype('int64'):
        # Slide a window of width n over the sentence and count each window.
        num_windows = len(sentence) - n + 1
        for start in range(num_windows):
            counts[tuple(sentence[start:start + n])] += 1
    return counts.norm()
def sequence_ngram(n, entries, out_dim=10):
    """Create a sequence-based n-gram with random weights.

    Generates keys by sliding a window over one continuous random symbol
    stream (consecutive keys overlap by n-1 symbols), assigning each a
    random weight, until `entries` distinct keys exist.

    Args:
        n: order of the n-gram.
        entries: number of distinct entries to generate.
        out_dim: symbol alphabet size; keys are drawn from [0, out_dim).

    Returns:
        The Ngram instance after calling ``norm()`` on it.

    Fix: the original built the n-gram but never returned it (the function
    implicitly returned None); every sibling constructor in this file ends
    with ``return ngram.norm()``, so the same is done here.
    """
    ngram = Ngram(n)
    idx = np.random.randint(0, out_dim, n)
    while ngram.size() < entries:
        ngram[tuple(idx)] = np.random.random()
        # Shift the window: drop the oldest symbol, append a fresh one.
        idx = np.append(idx[1:], np.random.randint(0, out_dim))
    return ngram.norm()
def retrieve_ngram(sequence_loader, n):
    """Accumulate a normalized n-gram from the labels of a data loader.

    Args:
        sequence_loader: iterable yielding (inputs, labels) batches; each
            label is a tensor-like sequence with ``.to('cpu').numpy()``.
        n: order of the n-gram.

    Returns:
        The Ngram instance after calling ``norm()`` on it.
    """
    ngram = Ngram(n)
    for _, labels in sequence_loader:
        for seq in labels:
            # Move to host memory before converting to a hashable key.
            key = tuple(seq.to('cpu').numpy())
            ngram[key] += 1
    return ngram.norm()
def get_brown_ngram(n=3, dim=6):
    """Build a character n-gram over the Brown corpus, restricted to the
    `dim` most frequent English letters ('etaoinsrhl' prefix).

    Args:
        n: order of the n-gram (default 3).
        dim: how many letters of 'etaoinsrhl' to keep (default 6).

    Returns:
        The Ngram instance after calling ``norm()`` on it.
    """
    # NOTE(review): words are joined with no separator, so n-grams can
    # span word boundaries — presumably intentional; confirm.
    text = ''.join(brown.words()).lower()
    alphabet = 'etaoinsrhl'[:dim]
    pattern = re.compile('[^' + alphabet + ']+')
    # Strip every character outside the kept alphabet.
    filtered = pattern.sub('', text)
    ngram = Ngram(n)
    for start in range(len(filtered) - n + 1):
        ngram[strtotuple(filtered[start:start + n])] += 1
    return ngram.norm()
def randomized_ngram(n, entries, out_dim=10):
    """Create a randomized n-gram whose keys jointly cover all symbols.

    Draws random keys with random weights until `entries` distinct keys
    exist; if the keys do not use every symbol in [0, out_dim), retries
    from scratch via recursion.

    NOTE(review): this name is redefined twice later in the file; the last
    definition is the one callers get.

    Returns:
        The Ngram instance after calling ``norm()`` on it.
    """
    ngram = Ngram(n)
    while ngram.size() < entries:
        # RHS evaluated first, matching the original RNG call order.
        weight = np.random.random()
        ngram[tuple(np.random.randint(0, out_dim, n))] = weight
    # Collect every symbol appearing in any key.
    seen = {symbol for key in ngram for symbol in key}
    if len(seen) != out_dim:
        # Some symbol never appeared — resample the whole n-gram.
        return randomized_ngram(n, entries, out_dim)
    return ngram.norm()
def randomized_ngram(n, size, out_dim=10, min_var=0):
    """Create a randomized n-gram with full symbol coverage and a minimum
    variance over its (normalized) weights.

    Retries from scratch (recursively) when either the keys fail to cover
    every symbol in [0, out_dim) or the weight variance falls below
    `min_var` (the default 0 disables the variance filter).

    NOTE(review): this name is redefined again below; the last definition
    is the one callers get.

    Returns:
        The Ngram instance (norm() has been called on it beforehand).
    """
    ngram = Ngram(n)
    while ngram.size() < size:
        # RHS evaluated first, matching the original RNG call order.
        weight = np.random.random()
        ngram[tuple(np.random.randint(0, out_dim, n))] = weight
    symbols = {s for key in ngram for s in key}
    if len(symbols) != out_dim:
        # Incomplete symbol coverage — resample the whole n-gram.
        return randomized_ngram(n, size, out_dim, min_var)
    # Presumably norm() normalizes in place (its return value is
    # discarded here) — TODO confirm against the Ngram class.
    ngram.norm()
    mean = sum(ngram.values()) / size
    variance = sum([(v - mean) ** 2 for v in ngram.values()]) / size
    if variance < min_var:
        return randomized_ngram(n, size, out_dim, min_var)
    return ngram
def randomized_ngram(n, entries, out_dim=10):
    """Create a randomized n-gram: `entries` distinct random keys over
    symbols [0, out_dim), each with a random weight.

    NOTE(review): third definition of this name in the file — this is the
    one that wins at import time.

    Returns:
        The Ngram instance after calling ``norm()`` on it.
    """
    ngram = Ngram(n)
    while ngram.size() < entries:
        # RHS evaluated first, matching the original RNG call order.
        weight = np.random.random()
        ngram[tuple(np.random.randint(0, out_dim, n))] = weight
    return ngram.norm()
# ---- Experiment configuration (notebook-style script; '# %%' are cell
# markers). Names suggest the following roles — confirm against the
# training loop further down the file. ----
save_every = 100    # presumably: checkpoint every N iterations
log_every = 100     # presumably: log metrics every N iterations
test_every = 1      # presumably: evaluate on test data every N epochs
primal_lr = 1e-6    # learning rate for the primal update
dual_lr = 1e-4      # learning rate for the dual update
show_dual = False   # toggle for visualizing dual quantities
predictions_on_sequences = True   # evaluate predictions on sequence data
predictions_on_data = False       # evaluate predictions on raw data
ngram_data_stats = True           # collect n-gram stats on training data
ngram_test_stats = True           # collect n-gram stats on test data
loss_on_test = False              # compute the loss on the test set

# %% CREATING NGRAM
# ngram = randomized_ngram(3, 20, out_dim=4, min_var=-1e-2)
# Hand-built 3-gram with two entries; NOTE(review): norm() is commented
# out below, so the weights 6./4. stay unnormalized — confirm intended.
ngram = Ngram(3)
ngram[(0, 1, 2)] = 6.
ngram[(1, 2, 3)] = 4.
# ngram.norm()
# ngram_gen = get_brown_ngram(dim=3)
ngram_gen = ngram   # the n-gram used to generate the sequence datasets
# ngram_gen.show()

# %% GENERATING DATASET
data_loader = train_loader_MNIST()
test_loader = test_loader_MNIST()
sequence_loader = sequence_loader_MNIST(ngram_gen, num_samples=40000)
sequence_test_loader = sequence_loader_MNIST(ngram_gen, num_samples=10000, train=False)
t = time.time()   # wall-clock start for timing the section that follows