def simulate(prob, alphabet, N=1000):
    """Sample a text of N tokens from a fitted Markov transition tensor.

    Starts from the module-level ``seed`` token list (its length must equal
    the order of ``prob``), repeatedly draws the next token conditioned on
    the current window, and returns the seed plus the N drawn tokens as one
    string.

    Parameters
    ----------
    prob : np.ndarray
        Transition tensor of shape [len(alphabet)] * (order + 1); indexing
        by the current window of token indices yields the next-token
        distribution passed to ``draw_token``.
    alphabet : sequence of str
        The token set; order must match ``get_char2idx(alphabet)``.
    N : int
        Number of tokens to generate.

    Returns
    -------
    str
        The simulated text, including the seed prefix.
    """
    # need a mapping from token to index in the transition matrix
    char2idx = get_char2idx(alphabet)
    # Copy the module-level seed: the original aliased it, so the
    # append/pop below mutated the global and corrupted later calls.
    last_tokens = list(seed)
    # Collect pieces and join once at the end instead of quadratic
    # string concatenation inside the loop.
    pieces = list(last_tokens)
    for _ in range(N):
        idxs = tuple(char2idx[x] for x in last_tokens)
        next_tok = draw_token(prob[idxs], alphabet)
        pieces.append(next_tok)
        # slide the conditioning window forward by one token
        last_tokens.append(next_tok)
        last_tokens.pop(0)
    return ''.join(pieces)
def build_markov_chain_corpus(corpus, alphabet, order=1):
    """Count (order+1)-gram transitions over ``corpus``.

    Parameters
    ----------
    corpus : sequence of tokens
        Tokens indexable by position; each must appear in ``alphabet``.
    alphabet : sequence of str
        The token set; positions define the tensor axes via
        ``get_char2idx``.
    order : int
        Markov order; the count tensor has ``order + 1`` axes.

    Returns
    -------
    np.ndarray
        float64 tensor of shape [len(alphabet)] * (order + 1) holding the
        transition counts, with machine epsilon added so downstream
        normalization / log operations never see an exact zero.
    """
    # need a mapping from token to index in the transition matrix
    char2idx = get_char2idx(alphabet)
    # store the transition matrix (or tensor)
    transition_counts = np.zeros([len(alphabet)] * (order + 1),
                                 dtype='float64')
    # Stop `order` positions early: the original ranged over the whole
    # corpus, so tail slices shorter than order+1 produced partial index
    # tuples that incremented an entire sub-array of the tensor.
    for ii in range(len(corpus) - order):
        kmer = corpus[ii:ii + order + 1]
        idxs = tuple(char2idx[x] for x in kmer)
        transition_counts[idxs] += 1
    return transition_counts + np.finfo(float).eps
def build_markov_chain(kmers_stream, alphabet, order=1):
    """Count (order+1)-gram token transitions from a stream of kmers.

    Tokenizes the stream with ``next_token`` (which may consume several
    kmer positions at once, signalled via its returned ``skip``), keeps a
    sliding window of the last ``order + 1`` tokens, and increments the
    corresponding cell of the transition tensor for each full window.

    Parameters
    ----------
    kmers_stream : iterable of str
        Successive kmers; each must be at least as long as the longest
        token in ``alphabet`` so ``next_token`` can always match.
    alphabet : sequence of str
        The token set; positions define the tensor axes via
        ``get_char2idx``.
    order : int
        Markov order; the count tensor has ``order + 1`` axes.

    Returns
    -------
    np.ndarray
        int64 tensor of shape [len(alphabet)] * (order + 1) with raw
        transition counts.

    Raises
    ------
    Exception
        If any kmer is shorter than the longest token.
    """
    # need a mapping from token to index in the transition matrix
    char2idx = get_char2idx(alphabet)
    # need this to make sure that our kmers are long enough to always
    # contain the longest possible token
    max_token_length = max(len(x) for x in alphabet)
    # store the transition matrix (or tensor)
    transition_counts = np.zeros([len(alphabet)] * (order + 1),
                                 dtype='int64')
    # number of upcoming kmers to discard because next_token already
    # consumed them
    skip = 0
    # hold on to the last n tokens, where n is the order of the model + 1
    last_tokens = []
    for kmer in kmers_stream:
        # (the original had this guard duplicated after the length check;
        # the second copy was unreachable and has been removed)
        if skip > 0:
            skip -= 1
            continue
        if len(kmer) < max_token_length:
            # message fixed to match the actual check (>=, not >)
            raise Exception(
                "The kmers must be at least as long as the longest token")
        curr_token, skip = next_token(kmer, alphabet)
        last_tokens.append(curr_token)
        # wait until a full window of order+1 tokens is available
        if len(last_tokens) < order + 1:
            continue
        idxs = tuple(char2idx[x] for x in last_tokens)
        transition_counts[idxs] += 1
        last_tokens.pop(0)
    return transition_counts
# NOTE(review): this script section appears truncated in this view — the
# min_* accumulators initialized below are never updated in the visible
# inner loop; confirm the remainder of the loop body elsewhere in the file.
words = {}

if __name__ == '__main__':
    # CLI: <test_file> <alphabet_file> <train_data_pickle>
    test_file = sys.argv[1]
    alphabet_file = sys.argv[2]
    train_data = sys.argv[3]
    # the training filename encodes the max kmer length after the first
    # '__' separator, e.g. "...__K12__..." -> 12; presumably 'K' is the
    # single character stripped by [1:] — TODO confirm naming scheme
    maxK = int(os.path.basename(train_data).split('__')[1][1:])
    counts, total_kmers = pickle.load(open(train_data, 'rb'))
    results = score_kmers([test_file], maxK, counts, total_kmers)
    # candidate segmentation points, using half the max kmer length as
    # the window parameter
    breaks = candidate_breaks(results, maxK // 2)
    alphabet = load_alphabet(alphabet_file)
    char2idx = get_char2idx(alphabet)
    text = ''.join(yield_all_text([test_file]))
    # examine each interior break against its neighbors
    for ii in range(len(breaks) - 1):
        # skip the first break: it has no previous neighbor to compare to
        if ii < 1:
            continue
        # trackers for the best (lowest) log-probability ratio seen while
        # shifting the current break
        min_log_p_ratio = 1000000
        min_b_curr = None
        min_chunk1 = None
        min_chunk2 = None
        # w is the offset applied to the current break; only 0 is tried
        # here (shift search apparently disabled)
        for w in [0]:
            b_prev = breaks[ii - 1]
            b_curr = breaks[ii] + w
            b_next = breaks[ii + 1]
# lists to hold the descriptive stats at each round stds = [] means = [] medians = [] cvs = [] frac_lt_zero = [] eigvals = [] ii = -1 while True: # get ready for this round plt.close() ii += 1 char2idx = get_char2idx(alphabet2strings(alphabet)) counts, total_kmers = count_kmers_corpus(corpus, maxK, alphabet2strings(alphabet)) results = score_kmers_corpus(corpus, maxK, counts, total_kmers) transition_counts = build_markov_chain_corpus( corpus, alphabet2strings(alphabet)) trans_mat = norm_transition_counts(transition_counts) w, _ = LA.eig(trans_mat) realw = sorted([x.real for x in w], reverse=True) breaks = candidate_breaks(results, maxK // 2) new_token, n, betas = guess_next_token(corpus, breaks, alphabet2strings(alphabet), counts, total_kmers) corpus = replace_token_in_corpus(corpus, new_token) alphabet.append(new_token) print(ii, new_token, n)