def test_path_with_affine():
    """Thread test instance 1 with ploidy 3 and compare the path against the expected blocks.

    The computed path is read column-wise (one string of cluster ids per column
    0..2) and checked in three consecutive ranges: [0, 9), [9, 20) and [20, end).
    Within each range only the set of column strings is compared, so the order
    of the three columns does not matter.
    """
    ploidy = 3
    readset, var_pos, clustering, genotypes = create_testinstance1()

    # Build the inputs required by the threading step.
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)
    positions = get_cluster_start_end_positions(readset, clustering, index)
    coverage = get_coverage(readset, clustering, index)
    cov_map = get_pos_to_clusters_map(coverage, ploidy)
    consensus = get_local_cluster_consensus(readset, clustering, cov_map, positions)

    path = compute_threading_path(
        readset, clustering, num_vars, coverage, cov_map, consensus, ploidy, genotypes
    )

    # One string per path column: the cluster id chosen in every row, concatenated.
    cluster_paths = ["".join(str(row[column]) for row in path) for column in range(3)]

    def block(start, stop):
        """Return the set of column substrings within [start, stop)."""
        return {column_path[start:stop] for column_path in cluster_paths}

    first_block = block(0, 9)
    second_block = block(9, 20)
    third_block = block(20, None)

    print(cluster_paths)

    assert first_block == {"000000000", "111111111", "044444444"}
    assert second_block == {"33333333333", "22222222222", "44444555555"}
    assert third_block == {"66", "77", "55"}
def find_inconsistencies(readset, clustering, ploidy):
    """
    Find cluster positions whose allele consensus is ambiguous.

    For every position and every cluster that has both a coverage entry and a
    consensus entry at that position, a one-sided ("greater") binomial test
    checks whether the number of reads deviating from the consensus is larger
    than an expected error rate of 0.05 would explain. If the p-value is below
    0.02, the cluster is considered inconsistent at that position.

    NOTE(review): the deviation count assumes that ``consensus[pos][c_id][1]``
    is the fraction of the cluster's reads supporting the consensus allele at
    ``pos`` -- confirm against ``get_local_cluster_consensus_withfrac``.

    Args:
        readset: Indexable collection of reads; each read iterates over
            variants exposing ``position`` and ``allele``.
        clustering: Indexable collection of clusters, each an iterable of
            read indices into ``readset``.
        ploidy: Passed on to ``get_pos_to_clusters_map``.

    Returns:
        A tuple ``(num_inconsistent_positions, separated_pairs)``.
        ``num_inconsistent_positions`` counts (position, cluster) combinations
        that failed the test, i.e. a position is counted once per inconsistent
        cluster. ``separated_pairs`` lists tuples ``(r0, r1)`` of reads from
        the same inconsistent cluster that need to be separated, where ``r0``
        carries allele 0 and ``r1`` any other allele at the inconsistent
        position.
    """
    exp_error = 0.05
    p_val_threshold = 0.02

    num_inconsistent_positions = 0
    separated_pairs = []

    # Compute consensus and coverage
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)

    coverage = get_coverage(readset, clustering, index)
    cov_map = get_pos_to_clusters_map(coverage, ploidy)
    positions = get_cluster_start_end_positions(readset, clustering, index)
    abs_coverage = get_coverage_absolute(readset, clustering, index)
    consensus = get_local_cluster_consensus_withfrac(readset, clustering, cov_map, positions)

    # Search for positions in clusters with ambivalent consensus
    for pos in range(num_vars):
        for c_id in coverage[pos]:
            if c_id not in consensus[pos]:
                continue

            # Binomial hypothesis test: is the deviation from the majority
            # allele significant enough to justify splitting the cluster?
            abs_count = abs_coverage[pos][c_id]
            # Round to the nearest integer instead of truncating: the product
            # is a float and may end up marginally below the intended count
            # (e.g. 10 * (1 - 0.9) == 0.9999999999999998), which int() alone
            # would turn into one deviating read too few.
            abs_deviations = int(round(abs_count * (1 - consensus[pos][c_id][1])))
            p_val = binom_test(abs_deviations, abs_count, exp_error, alternative="greater")

            if p_val < p_val_threshold:
                num_inconsistent_positions += 1

                # Partition the cluster's reads covering this position by allele.
                zero_reads = []
                one_reads = []
                for read in clustering[c_id]:
                    for var in readset[read]:
                        if index[var.position] == pos:
                            if var.allele == 0:
                                zero_reads.append(read)
                            else:
                                one_reads.append(read)

                # Every read with allele 0 must be separated from every read
                # with a different allele.
                separated_pairs.extend((r0, r1) for r0 in zero_reads for r1 in one_reads)

    return num_inconsistent_positions, separated_pairs