예제 #1
0
def create_ngram(sentences, n):
    """Count every length-``n`` window of every sentence and normalize.

    Args:
        sentences: Array-like exposing ``astype``; it is cast to ``int64``
            and iterated row by row, each row being one sentence.
        n: Window length, also passed to the ``Ngram`` constructor.

    Returns:
        The result of ``Ngram.norm()`` on the accumulated counts.
    """
    counts = Ngram(n)
    for row in sentences.astype('int64'):
        # Number of full windows in this row; non-positive when the row is
        # shorter than n, in which case the inner loop does not run.
        window_count = len(row) - n + 1
        for start in range(window_count):
            window = row[start:start + n]
            counts[tuple(window)] += 1
    return counts.norm()
예제 #2
0
def sequence_ngram(n, entries, out_dim=10):
    """Create a sequence-based n-gram with random weights.

    Starts from a random window of ``n`` symbols drawn from ``[0, out_dim)``
    and repeatedly slides it by one position: the oldest symbol is dropped
    and a fresh random symbol is appended. Each visited window is stored as a
    key with a random weight, so consecutive keys overlap in ``n - 1``
    symbols.

    Args:
        n: Length of each key, also passed to the ``Ngram`` constructor.
        entries: The loop runs until ``Ngram.size()`` reaches this value.
        out_dim: Exclusive upper bound for the generated symbols.

    Returns:
        The result of ``Ngram.norm()`` on the generated n-gram.
    """
    ngram = Ngram(n)
    idx = np.random.randint(0, out_dim, n)
    while ngram.size() < entries:
        ngram[tuple(idx)] = np.random.random()
        # Slide the window: drop the first symbol, append a new random one.
        idx = np.append(idx[1:], np.random.randint(0, out_dim))
    # The original fell off the end and returned None, discarding the n-gram.
    return ngram.norm()
예제 #3
0
def retrieve_ngram(sequence_loader, n):
    """Count the label sequences yielded by a data loader and normalize.

    Args:
        sequence_loader: Iterable of 2-element batches; only the second
            element is used. Each of its items must support
            ``.to('cpu').numpy()``.
        n: Passed to the ``Ngram`` constructor.

    Returns:
        The result of ``Ngram.norm()`` on the accumulated counts.
    """
    counts = Ngram(n)
    for batch in sequence_loader:
        _, targets = batch
        for target in targets:
            # Move to host memory so the sample can become a tuple key.
            key = tuple(target.to('cpu').numpy())
            counts[key] += 1
    return counts.norm()
예제 #4
0
파일: brown.py 프로젝트: gcie/licencjat
def get_brown_ngram(n=3, dim=6):
    """Build a normalized character n-gram from the Brown corpus.

    All corpus words are concatenated without separators and lower-cased, so
    windows may span word boundaries. Every character outside the first
    ``dim`` letters of ``'etaoinsrhl'`` is then removed before counting.

    Args:
        n: Window length, also passed to the ``Ngram`` constructor.
        dim: How many leading letters of ``'etaoinsrhl'`` to keep.

    Returns:
        The result of ``Ngram.norm()`` on the accumulated counts.
    """
    corpus = ''.join(brown.words()).lower()
    alphabet = 'etaoinsrhl'[:dim]
    # Strip every run of characters that is not in the kept alphabet.
    filtered = re.sub('[^' + alphabet + ']+', '', corpus)
    counts = Ngram(n)
    for start in range(len(filtered) - n + 1):
        # strtotuple is a project helper; presumably maps a string to a key tuple.
        counts[strtotuple(filtered[start:start + n])] += 1
    return counts.norm()
예제 #5
0
def randomized_ngram(n, entries, out_dim=10):
    """Create a randomized n-gram whose keys use every symbol in ``[0, out_dim)``.

    Random keys of length ``n`` with random weights are drawn until the
    n-gram holds ``entries`` of them. The whole n-gram is redrawn until each
    of the ``out_dim`` symbols occurs in at least one key.

    Args:
        n: Length of each key, also passed to the ``Ngram`` constructor.
        entries: Target value for ``Ngram.size()``.
        out_dim: Exclusive upper bound for the generated symbols.

    Returns:
        The result of ``Ngram.norm()`` on the accepted n-gram.

    Raises:
        ValueError: If the parameters can never produce a valid n-gram.
    """
    # Both checks assume Ngram.size() counts distinct stored keys, as the
    # fill loop below implies.
    if n >= 0 and entries > out_dim ** n:
        # Fewer distinct keys exist than requested: the fill loop would spin forever.
        raise ValueError(
            "cannot draw %r distinct %r-grams over %r symbols" % (entries, n, out_dim))
    if entries * n < out_dim:
        # Too few symbol slots to mention every symbol: no draw can be accepted.
        raise ValueError(
            "%r keys of length %r cannot cover %r symbols" % (entries, n, out_dim))

    # Rejection sampling as a loop: the original retried through
    # self-recursion, which could hit the interpreter's recursion limit.
    while True:
        ngram = Ngram(n)
        while ngram.size() < entries:
            ngram[tuple(np.random.randint(0, out_dim, n))] = np.random.random()
        unique = {symbol for key in ngram for symbol in key}
        if len(unique) == out_dim:
            return ngram.norm()
예제 #6
0
def randomized_ngram(n, size, out_dim=10, min_var=0):
    """Create a randomized n-gram covering all symbols with a minimum weight variance.

    Random keys of length ``n`` with random weights are drawn until the
    n-gram holds ``size`` of them. The n-gram is redrawn until (a) every
    symbol in ``[0, out_dim)`` occurs in at least one key and (b) the
    variance of the weights after ``norm()`` is not below ``min_var``.

    Args:
        n: Length of each key, also passed to the ``Ngram`` constructor.
        size: Target value for ``Ngram.size()``; also the divisor used for
            the mean and the variance.
        out_dim: Exclusive upper bound for the generated symbols.
        min_var: Lower bound on the weight variance. A bound that the
            weights cannot reach makes this function loop forever.

    Returns:
        The accepted ``Ngram`` after ``norm()`` has been called on it.

    Raises:
        ValueError: If the key/symbol counts can never produce a valid n-gram.
    """
    # Both checks assume Ngram.size() counts distinct stored keys, as the
    # fill loop below implies.
    if n >= 0 and size > out_dim ** n:
        # Fewer distinct keys exist than requested: the fill loop would spin forever.
        raise ValueError(
            "cannot draw %r distinct %r-grams over %r symbols" % (size, n, out_dim))
    if size * n < out_dim:
        # Too few symbol slots to mention every symbol: no draw can be accepted.
        raise ValueError(
            "%r keys of length %r cannot cover %r symbols" % (size, n, out_dim))

    # Rejection sampling as a loop: the original retried through
    # self-recursion, which could hit the interpreter's recursion limit.
    while True:
        ngram = Ngram(n)
        while ngram.size() < size:
            ngram[tuple(np.random.randint(0, out_dim, n))] = np.random.random()
        unique = {symbol for key in ngram for symbol in key}
        if len(unique) != out_dim:
            continue
        ngram.norm()
        mu = sum(ngram.values()) / size
        var = sum((x - mu) ** 2 for x in ngram.values()) / size
        if var < min_var:
            continue
        return ngram
예제 #7
0
def randomized_ngram(n, entries, out_dim=10):
    """Create an n-gram of random keys with random weights.

    Args:
        n: Length of each key, also passed to the ``Ngram`` constructor.
        entries: Keys are added until ``Ngram.size()`` reaches this value.
        out_dim: Exclusive upper bound for the generated symbols.

    Returns:
        The result of ``Ngram.norm()`` on the generated n-gram.
    """
    table = Ngram(n)
    while table.size() < entries:
        # Draw the weight before the key: this mirrors the evaluation order
        # of a subscript assignment, keeping seeded runs reproducible.
        weight = np.random.random()
        key = tuple(np.random.randint(0, out_dim, n))
        table[key] = weight
    return table.norm()
예제 #8
0
파일: main.py 프로젝트: gcie/licencjat
# Run configuration. These names are not used in the visible part of the
# script; presumably the first three are step intervals for saving, logging
# and testing, and the last two are learning rates -- confirm against the
# training loop further down.
save_every = 100
log_every = 100
test_every = 1
primal_lr = 1e-6
dual_lr = 1e-4

# Boolean switches, presumably selecting which diagnostics are reported
# later in the script -- confirm where they are read.
show_dual = False
predictions_on_sequences = True
predictions_on_data = False
ngram_data_stats = True
ngram_test_stats = True
loss_on_test = False

# %% CREATING NGRAM
# Alternative n-gram sources are kept below as commented-out experiments.
# ngram = randomized_ngram(3, 20, out_dim=4, min_var=-1e-2)
# Active choice: a hand-built trigram with two keys. The weights are left
# unnormalized because the norm() call below is commented out.
ngram = Ngram(3)
ngram[(0, 1, 2)] = 6.
ngram[(1, 2, 3)] = 4.
# ngram.norm()
# ngram_gen = get_brown_ngram(dim=3)
# The n-gram handed to the sequence loaders is the same object as `ngram`.
ngram_gen = ngram
# ngram_gen.show()


# %% GENERATING DATASET
# Project loader helpers (defined elsewhere). The sequence loaders take the
# n-gram plus a sample count; the test variant also passes train=False.
data_loader = train_loader_MNIST()
test_loader = test_loader_MNIST()
sequence_loader = sequence_loader_MNIST(ngram_gen, num_samples=40000)
sequence_test_loader = sequence_loader_MNIST(ngram_gen, num_samples=10000, train=False)

# Wall-clock reference taken after the loaders are built.
t = time.time()