def read_data(filename, words=False, sep='_', max_length=None):
    """Load a whitespace-delimited text file and index it via ``bvmm.create_index``.

    The file content is split on whitespace into tokens, which are turned
    into a symbol sequence in one of two ways:

    * ``words`` truthy: one symbol per token, each prefixed with ``sep``.
    * ``words`` falsy: the tokens are joined with ``sep`` and every single
      character of the joined string (including the characters of ``sep``)
      becomes one symbol.

    Args:
        filename: Path of the text file to read.
        words: Select word-level (truthy) or character-level (falsy) symbols.
        sep: Prefix for each word, or joiner between tokens in character mode.
        max_length: If not ``None``, the symbol sequence is sliced to
            ``[:max_length]`` before indexing.

    Returns:
        The ``(data, alphabet)`` pair produced by ``bvmm.create_index``.
    """
    with open(filename, 'r') as f:
        tokens = f.read().split()

    if words:
        symbols = [sep + token for token in tokens]
    else:
        # Splitting the joined string into a list yields one entry per character.
        symbols = list(sep.join(tokens))

    if max_length is not None:
        symbols = symbols[:max_length]

    data, alphabet = bvmm.create_index(symbols)
    print('Sequence characters:', len(data))
    print('Distinct characters:', len(alphabet))
    return data, alphabet
def read_data(filename, sep='-', max_length=None):
    """Load an edge list and index it via ``bvmm.create_index(kind='network')``.

    Every non-blank line must contain at least two whitespace-separated
    fields. The first two are taken as the edge's endpoints and any further
    fields on the line are ignored. Each endpoint label is prefixed with
    ``sep``. Blank and whitespace-only lines are skipped.

    Args:
        filename: Path of the edge-list file to read.
        sep: Prefix prepended to both node labels of every edge.
        max_length: If not ``None``, the edge list is sliced to
            ``[:max_length]`` before indexing.

    Returns:
        The ``(data, alphabet)`` pair produced by ``bvmm.create_index``.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    edges = []
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank lines carry no edge; skipping them avoids an opaque
                # unpacking error on otherwise valid files.
                continue
            if len(fields) < 2:
                raise ValueError(
                    '{}:{}: expected two node labels, got {!r}'.format(
                        filename, lineno, line.rstrip('\n')))
            v, w = fields[:2]
            edges.append((sep + v, sep + w))

    if max_length is not None:
        edges = edges[:max_length]

    data, alphabet = bvmm.create_index(edges, kind='network')
    print('Sequence characters:', len(data))
    print('Distinct characters:', len(alphabet))
    return data, alphabet
# Symbol labels '-0' .. '-24' used to display the stored tree.
alphabet = ['-{}'.format(x) for x in range(25)]
# Load the previously saved tree object.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open('dat/synthetic_3.pkl', 'rb') as f:
    root = pickle.load(f)
bvmm.print_tree(root, alphabet, max_counts=4)
# Disabled alternative below: presumably how the tree in
# 'dat/synthetic_3.pkl' was originally generated and saved -- TODO confirm.
# root = bvmm.rand_tree(50, alphabet)
# bvmm.print_tree(root, alphabet, max_counts=4)
#
# # with open('dat/synthetic_3.pkl', 'wb') as f:
# #     pickle.dump(root, f)

# %%
# n selects which data file is read ('dat/synthetic_3_<n>.txt') and is reused
# further down to name the output '.net' file.
n = 10_000
with open('dat/synthetic_3_{}.txt'.format(n), 'r') as f:
    # Split the text on '-' and re-attach the '-' prefix to every non-empty
    # piece, so each symbol looks like the '-<x>' labels built above.
    data = ['-' + x for x in f.read().split('-') if x]
# Note: this rebinds `alphabet` to whatever create_index returns, replacing
# the hand-built list from the first cell.
data, alphabet = bvmm.create_index(data)
# Disabled alternative below: presumably how the data file was originally
# sampled from `root` and written out -- TODO confirm.
# data = bvmm.rand_data(root, n)
#
# # with open('dat/synthetic_3_{}.txt'.format(n), 'w') as f:
# #     f.write(''.join(bvmm.apply_alphabet(data, alphabet)))

# %%
%%time
# `%%time` above is an IPython cell magic; this cell is not valid plain Python
# and must be run in an IPython/Jupyter-style environment.
# NOTE(review): the meaning of the positional arguments 100_000 and 10 is
# defined by bvmm.mcmc and is not visible here -- confirm against bvmm.
mcmc, counts = bvmm.mcmc(data, alphabet, 100_000, 10)
bvmm.print_tree(mcmc, alphabet, min_samples=.01, max_counts=4)
print(counts)
bvmm.write_tree(mcmc, alphabet, 'dat/synthetic_3_{}.net'.format(n), min_samples=.01)

# %%
def activate_same(v1, v2):