Example #1
0
def simulate(prob, alphabet, N=1000, seed=None):
    """Sample N tokens from a fitted Markov chain and return the text.

    Parameters
    ----------
    prob : np.ndarray
        Normalized transition tensor; indexing with the current window of
        tokens yields a probability distribution over the alphabet.
    alphabet : sequence
        The tokens of the model; order must match the axes of ``prob``.
    N : int
        Number of tokens to generate after the seed.
    seed : sequence of tokens
        Initial window of tokens; its length must equal the model order.
        (The original version read an undefined global ``seed``.)

    Returns
    -------
    str
        The seed text followed by N sampled tokens.
    """
    # need a mapping from token to index in the transition matrix
    char2idx = get_char2idx(alphabet)

    if seed is None:
        raise ValueError("simulate() requires a seed sequence of tokens")

    # copy so the caller's seed list is not mutated by append/pop below
    last_tokens = list(seed)
    parts = list(seed)
    for _ in range(N):
        idxs = tuple(char2idx[x] for x in last_tokens)
        next_token = draw_token(prob[idxs], alphabet)
        parts.append(next_token)
        # slide the window forward by one token
        last_tokens.append(next_token)
        last_tokens.pop(0)
    # join once at the end instead of quadratic string concatenation
    return ''.join(parts)
def build_markov_chain_corpus(corpus, alphabet, order=1):
    """Count order-``order`` token transitions over a tokenized corpus.

    Parameters
    ----------
    corpus : sequence
        Sequence of tokens, each present in ``alphabet``.
    alphabet : sequence
        The token vocabulary; defines the size of each tensor axis.
    order : int
        Markov order; the result has ``order + 1`` axes.

    Returns
    -------
    np.ndarray
        float64 count tensor with eps added everywhere so later
        normalization / log operations never divide by or take log of 0.
    """
    # need a mapping from token to index in the transition matrix
    char2idx = get_char2idx(alphabet)

    # store the transition matrix (or tensor)
    transition_counts = np.zeros([len(alphabet)] * (order + 1),
                                 dtype='float64')

    # stop `order` short of the end so every kmer has exactly order+1
    # tokens; the original range(len(corpus)) produced short tail slices
    # whose index tuple incremented a whole sub-array of counts
    for ii in range(len(corpus) - order):
        kmer = corpus[ii:ii + order + 1]
        idxs = tuple(char2idx[x] for x in kmer)
        transition_counts[idxs] += 1

    return transition_counts + np.finfo(float).eps
def build_markov_chain(kmers_stream, alphabet, order=1):
    """Count order-``order`` token transitions from a stream of kmers.

    Each kmer is tokenized greedily via ``next_token``; when a match
    consumes more than one kmer position, ``skip`` tells us how many
    subsequent kmers to drop so we do not double-count.

    Parameters
    ----------
    kmers_stream : iterable of str
        Overlapping kmers; each must be at least as long as the longest
        token in ``alphabet``.
    alphabet : sequence of str
        Token vocabulary.
    order : int
        Markov order; the result has ``order + 1`` axes.

    Returns
    -------
    np.ndarray
        int64 transition-count tensor.

    Raises
    ------
    Exception
        If a kmer is shorter than the longest token.
    """
    # need a mapping from token to index in the transition matrix
    char2idx = get_char2idx(alphabet)

    # need this to make sure that our kmers are longer than the tokens
    max_token_length = max(len(x) for x in alphabet)

    # store the transition matrix (or tensor)
    transition_counts = np.zeros([len(alphabet)] * (order + 1), dtype='int64')

    # kmers left to skip after matching a multi-character token
    skip = 0
    # sliding window of the last order+1 tokens
    last_tokens = []
    for kmer in kmers_stream:
        # NOTE: the original had a second, identical skip check after the
        # length validation; it was unreachable (this one already
        # `continue`s whenever skip > 0) and has been removed, along with
        # an unused `sorted_alphabet` local.
        if skip > 0:
            skip -= 1
            continue
        if len(kmer) < max_token_length:
            raise Exception("The kmers must be longer than the longest token")

        curr_token, skip = next_token(kmer, alphabet)
        last_tokens.append(curr_token)
        # wait until the window is full before counting a transition
        if len(last_tokens) < order + 1:
            continue
        idxs = tuple(char2idx[x] for x in last_tokens)
        transition_counts[idxs] += 1
        last_tokens.pop(0)

    return transition_counts

# Module-level token/word cache — no reads or writes are visible in this
# chunk; presumably populated by code elsewhere in the file (TODO confirm).
words = {}

if __name__ == '__main__':
    test_file = sys.argv[1]
    alphabet_file = sys.argv[2]
    train_data = sys.argv[3]

    maxK = int(os.path.basename(train_data).split('__')[1][1:])
    counts, total_kmers = pickle.load(open(train_data, 'rb'))
    results = score_kmers([test_file], maxK, counts, total_kmers)
    breaks = candidate_breaks(results, maxK // 2)

    alphabet = load_alphabet(alphabet_file)
    char2idx = get_char2idx(alphabet)

    text = ''.join(yield_all_text([test_file]))
    for ii in range(len(breaks) - 1):
        if ii < 1:
            continue

        min_log_p_ratio = 1000000
        min_b_curr = None
        min_chunk1 = None
        min_chunk2 = None
        for w in [0]:

            b_prev = breaks[ii - 1]
            b_curr = breaks[ii] + w
            b_next = breaks[ii + 1]
Example #5
0
    # lists to hold the descriptive stats at each round
    stds = []
    means = []
    medians = []
    cvs = []
    frac_lt_zero = []
    eigvals = []

    ii = -1
    while True:

        # get ready for this round
        plt.close()
        ii += 1

        char2idx = get_char2idx(alphabet2strings(alphabet))
        counts, total_kmers = count_kmers_corpus(corpus, maxK,
                                                 alphabet2strings(alphabet))
        results = score_kmers_corpus(corpus, maxK, counts, total_kmers)
        transition_counts = build_markov_chain_corpus(
            corpus, alphabet2strings(alphabet))
        trans_mat = norm_transition_counts(transition_counts)
        w, _ = LA.eig(trans_mat)
        realw = sorted([x.real for x in w], reverse=True)
        breaks = candidate_breaks(results, maxK // 2)
        new_token, n, betas = guess_next_token(corpus, breaks,
                                               alphabet2strings(alphabet),
                                               counts, total_kmers)
        corpus = replace_token_in_corpus(corpus, new_token)
        alphabet.append(new_token)
        print(ii, new_token, n)