def unaligned_error_list(length, error_p):
    """Build a random error profile for a read of ``length`` bases.

    Starting at position 0, one event is drawn per iteration with fixed
    probabilities (match 0.40, mismatch 0.30, insertion 0.15, deletion 0.15)
    until the position reaches ``length``.

    Args:
        length: Target number of bases to walk over.
        error_p: Mapping with ``"mis"``, ``"ins"`` and ``"del"`` entries.
            Indices 0, 2, 3 of ``error_p["mis"]`` are passed to
            ``mm.pois_geom``; indices 0-3 of ``error_p["ins"]`` and
            ``error_p["del"]`` are passed to ``mm.wei_geom``.

    Returns:
        A ``(length, events)`` tuple. ``length`` is the input length, or the
        final position if the last event overshot it. ``events`` maps a
        position to ``[kind, size]``; mismatches and deletions are keyed by
        their integer start position, insertions by ``position + 0.1``.
    """
    # Exclusive upper edge of each event's slice of [0, 1), in ascending order.
    thresholds = ((0.4, "match"), (0.7, "mis"), (0.85, "ins"), (1, "del"))

    events = {}
    cursor = 0
    extending_insertion = False

    while cursor < length:
        draw = random.random()
        kind = next(label for upper, label in thresholds if draw < upper)

        if kind == "ins":
            weights = error_p["ins"]
            size = mm.wei_geom(weights[0], weights[1], weights[2], weights[3])
            slot = cursor + 0.1
            if extending_insertion:
                # Back-to-back insertions at one position merge into a
                # single, longer insertion.
                events[slot][1] += size
            else:
                events[slot] = ["ins", size]
                extending_insertion = True
            # An insertion does not advance the position.
            continue

        if kind == "match":
            size = 1
        elif kind == "mis":
            weights = error_p["mis"]
            size = mm.pois_geom(weights[0], weights[2], weights[3])
            events[cursor] = ["mis", size]
        else:
            weights = error_p["del"]
            size = mm.wei_geom(weights[0], weights[1], weights[2], weights[3])
            events[cursor] = ["del", size]

        cursor += size
        extending_insertion = False

    # The final event may run past the requested length; report the larger.
    return max(length, cursor), events
def _draw_positive_probability():
    """Return a uniform random draw, re-drawing if it is exactly 0.0."""
    p = random.random()
    while p == 0.0:
        p = random.random()
    return p


def _interpolate_length(p, p_bin, len_bin):
    """Map ``p`` linearly from ``p_bin`` onto ``len_bin`` and floor to int."""
    fraction = (p - p_bin[0]) / (p_bin[1] - p_bin[0])
    return int(np.floor(fraction * (len_bin[1] - len_bin[0]) + len_bin[0]))


def error_list(m_ref, m_model, m_ht_list, error_p, trans_p):
    """Generate an alternating match/error profile for an aligned region.

    The walk starts with a match whose length is drawn from the first table
    in ``m_ht_list`` (at least 2), then alternates error, match, error, ...
    until the position reaches the end of the region.

    Args:
        m_ref: Initial length of the region.
        m_model: Maps a ``(low, high)`` range of the previous match length to
            a table ``{(p_low, p_high): (len_low, len_high)}`` used to draw
            the next match length.
        m_ht_list: Mapping whose first value is a table of the same shape,
            used for the very first match.
        error_p: Maps an error name to its parameter sequence. Indices
            0, 2, 3 are passed to ``mm.pois_geom`` for ``"mis"``; indices 0-3
            are passed to ``mm.wei_geom`` for every other error.
        trans_p: Maps the previous state to ``{(p_low, p_high): error}``.
            The state is ``"start"``, or the last error name with one ``"0"``
            appended when the match that followed it had length 0, so those
            suffixed keys must exist too.

    Returns:
        ``(l_new, middle_ref, e_dict)``. ``middle_ref`` is ``m_ref`` extended
        by any overshoot of the final error or match. ``l_new`` starts at
        ``m_ref``, grows by every insertion and overshoot, and shrinks by
        every error that is neither ``"mis"`` nor ``"ins"``. ``e_dict`` maps
        a position to ``[error, step]``; insertions are keyed by
        ``position - 0.5``, everything else by its integer start position.
    """
    # l_new tracks the length after errors; middle_ref the length walked.
    l_new = m_ref
    middle_ref = m_ref
    pos = 0
    e_dict = {}
    prev_error = "start"

    # The first match comes from m_ht_list. Its bins are open at the low end
    # (low < p <= high), so an exact 0.0 draw would fit no bin and leave
    # prev_match unbound; hence the re-draw helper.
    p = _draw_positive_probability()
    k1 = list(m_ht_list.keys())[0]
    # No early exit on purpose: with overlapping bins the last match wins,
    # exactly as before.
    for k2, v2 in m_ht_list[k1].items():
        if k2[0] < p <= k2[1]:
            prev_match = max(2, _interpolate_length(p, k2, v2))
    pos += prev_match

    # Select an error, then its size, then a match, and so on.
    while pos < middle_ref:
        # Pick the error from the Markov chain. These bins are closed at the
        # low end (low <= p < high), so a plain draw is correct here.
        p = random.random()
        for k, candidate in trans_p[prev_error].items():
            if k[0] <= p < k[1]:
                error = candidate
                break

        params = error_p[error]
        if error == "mis":
            step = mm.pois_geom(params[0], params[2], params[3])
        else:
            step = mm.wei_geom(params[0], params[1], params[2], params[3])
            if error == "ins":
                l_new += step
            else:
                l_new -= step

        if error == "ins":
            # Insertions do not advance the position.
            e_dict[pos - 0.5] = [error, step]
        else:
            e_dict[pos] = [error, step]
            pos += step
            if pos >= middle_ref:
                l_new += pos - middle_ref
                middle_ref = pos

        prev_error = error

        # Pick the table for the previous match length. If no range
        # contains it, k1 keeps the last key iterated.
        for k1 in m_model.keys():
            if k1[0] <= prev_match < k1[1]:
                break

        # Draw the next match length. The bins are again open at the low
        # end; without the re-draw a 0.0 would silently reuse the error size
        # above as the match length. A table that does not cover p still
        # falls through to that value, as before.
        p = _draw_positive_probability()
        for k2, v2 in m_model[k1].items():
            if k2[0] < p <= k2[1]:
                step = _interpolate_length(p, k2, v2)
                break

        # Two zero-length matches never occur in a row.
        if prev_match == 0 and step == 0:
            step = 1

        prev_match = step
        if pos + prev_match > middle_ref:
            l_new += pos + prev_match - middle_ref
            middle_ref = pos + prev_match

        pos += prev_match
        if prev_match == 0:
            # A zero-length match switches the next lookup to the "<error>0"
            # state.
            prev_error += "0"

    return l_new, middle_ref, e_dict