def matrix_to_readset(lines):
    rs = ReadSet()
    index_tracker = 0
    for line in lines:
        s = line.split()
        assert len(s) % 2 == 1, "Not in matrix format."
        index = int(s[0])
        index_tracker += 1
        assert index == index_tracker, "Not in matrix format."
        read = Read("Read {}".format(index), 50)
        for i in range(int(len(s) / 2)):
            offset = int(s[2 * i + 1])
            for pos, c in enumerate(s[2 * i + 2]):
                read.add_variant(position=(offset + pos) * 10, allele=int(c), quality=1)
        rs.add(read)
    print(rs)
    return rs

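
# A minimal, hedged usage sketch (not part of the original module): it assumes the matrix
# format enforced by the assertions above, i.e. a 1-based read index followed by
# (offset, allele string) pairs on each line. The test name and example lines are illustrative.
def test_matrix_to_readset_sketch():
    lines = [
        "1 0 0110",  # Read 1: four variants starting at offset 0
        "2 2 1011",  # Read 2: four variants starting at offset 2
    ]
    rs = matrix_to_readset(lines)
    assert len(rs) == 2
    assert rs[0].name == "Read 1"
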
def test_merge_many_reads():
    reads = [
        Read("Name1"),
        Read("Name2"),
        Read("Name3"),
    ]
    reads[0].add_variant(100, 0, 31)
    reads[0].add_variant(200, 1, 32)
    reads[0].add_variant(300, 0, 33)
    reads[1].add_variant(200, 1, 41)
    reads[1].add_variant(400, 0, 42)
    reads[1].add_variant(500, 0, 43)
    reads[2].add_variant(200, 0, 51)
    reads[2].add_variant(500, 0, 52)
    reads[2].add_variant(600, 0, 53)
    expected = [
        Variant(100, 0, 31),
        Variant(200, 1, 73),  # see note: this depends on order of reads
        Variant(300, 0, 33),
        Variant(400, 0, 42),
        Variant(500, 0, 43 + 52),
        Variant(600, 0, 53),
    ]
    assert expected == list(merge_reads(*reads))

    # TODO merging should not depend on the order of reads
    expected[1] = Variant(200, 0, 51)
    assert expected == list(merge_reads(*reads[::-1]))

def test_non_existing_read_name():
    rs = ReadSet()
    r = Read("Read A", 56)
    r.add_variant(100, 1, 37)
    r.add_variant(101, 0, 18)
    rs.add(r)
    with raises(KeyError):
        _ = rs[(0, "foo")]

def test_readset2():
    rs = ReadSet()
    rs.add(Read("Read A", 1, 23))
    rs.add(Read("Read A", 2, 70))
    rs.add(Read("Read B", 3, 23))
    assert rs[(23, "Read A")].mapqs == (1, )
    assert rs[(70, "Read A")].mapqs == (2, )
    assert rs[(23, "Read B")].mapqs == (3, )

def test_non_existing_read_name2():
    rs = ReadSet()
    r = Read('Read A', 56, 1)
    r.add_variant(100, 1, 37)
    r.add_variant(101, 0, 18)
    rs.add(r)
    with raises(KeyError):
        _ = rs[(2, 'Read A')]

def verify_mec_score_and_partitioning(dp_table, reads):
    """Confirm that the results reported by dp_table are consistent:
    check whether the reported partitioning leads to the reported MEC score."""
    superreads, transmission_vector = dp_table.get_super_reads()
    assert len(superreads) == 1
    superreads = superreads[0]
    assert len(superreads) == 2

    # create new superreads that don't contain 3s (EQUAL COST)
    new_superreads = [Read("superread0", 0), Read("superread1", 0)]
    assert len(superreads[0]) == len(superreads[1])
    for i in range(len(superreads[0])):
        for j in range(2):
            v = superreads[j][i]
            allele = v.allele
            if allele == 3:
                allele = j
            new_superreads[j].add_variant(v.position, allele, v.quality)

    partitioning = dp_table.get_optimal_partitioning()
    position_to_index = {
        variant.position: index for index, variant in enumerate(new_superreads[0])
    }
    swapped = False
    mec_score = 0
    n = 0
    for read_index, read in enumerate(reads):
        cost0 = 0
        cost1 = 0
        for variant in read:
            if variant.position in position_to_index:
                if new_superreads[0][position_to_index[variant.position]].allele != variant.allele:
                    cost0 = cost0 + variant.quality
                if new_superreads[1][position_to_index[variant.position]].allele != variant.allele:
                    cost1 = cost1 + variant.quality
        mec_score += min(cost0, cost1)
        if cost0 == cost1:
            continue
        haplotype = 0 if (cost0 < cost1) != swapped else 1
        if partitioning[read_index] != haplotype:
            if n == 0:
                swapped = True
            else:
                assert False
        n += 1
    print("Expected MEC score: {}, obtained MEC score: {}".format(
        mec_score, dp_table.get_optimal_cost()))
    assert mec_score == dp_table.get_optimal_cost()

def run_hapcut2vcf(hapcut, vcf, output=sys.stdout):
    command_line = "(whatshap {}) {}".format(__version__, " ".join(sys.argv[1:]))
    with ExitStack() as stack:
        if isinstance(output, str):
            output = stack.enter_context(open(output, "w"))
        writer = PhasedVcfWriter(vcf, command_line, out_file=output)
        if len(writer.samples) > 1:
            # This would be easy to support with a --sample command-line parameter,
            # but hapCUT does not seem to support multi-sample VCFs, so something
            # must be wrong anyway.
            raise CommandLineError("There is more than one sample in this VCF")
        sample = writer.samples[0]

        f = stack.enter_context(open(hapcut))
        parser = HapCutParser(f)
        for chromosome, blocks in parser:
            logger.info("Read %d phased blocks for chromosome %s", len(blocks), chromosome)

            # Build one read for each haplotype and the connected components
            haplotypes = [Read(str(i)) for i in (1, 2)]
            components = dict()
            for block in blocks:
                for variant in block:
                    haplotypes[0].add_variant(variant.position, variant.haplotype1, 0)
                    haplotypes[1].add_variant(variant.position, variant.haplotype2, 0)
                    components[variant.position] = variant.component_id

            sample_superreads = {sample: haplotypes}
            sample_components = {sample: components}
            writer.write(chromosome, sample_superreads, sample_components)

def string_to_readset(s, n_alleles, w=None, sample_ids=None):
    s = textwrap.dedent(s).strip()
    if w is not None:
        w = textwrap.dedent(w).strip().split('\n')
    rs = ReadSet()
    for index, line in enumerate(s.split('\n')):
        if len(line) == 0:
            continue
        if sample_ids is None:
            read = Read('Read {}'.format(index + 1), 50)
        else:
            read = Read('Read {}'.format(index + 1), 50, 0, sample_ids[index])
        for pos, c in enumerate(line):
            if c == ' ':
                continue
            q = 1
            if w is not None:
                q = int(w[index][pos])
            quality = [q] * n_alleles
            quality[int(c)] = 0
            read.add_variant(position=(pos + 1) * 10, allele=int(c), quality=quality)
        assert len(read) > 1, 'Reads covering less than two variants are not allowed'
        rs.add(read)
    print(rs)
    return rs

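
# A minimal, hedged usage sketch (not part of the original source): string_to_readset above
# turns an ASCII allele matrix into a ReadSet, one read per non-empty line, with blanks for
# uncovered variants and column k mapped to position (k + 1) * 10. The test name and input
# are illustrative only.
def test_string_to_readset_sketch():
    reads = """
      0101
       110
    """
    rs = string_to_readset(reads, n_alleles=2)
    assert len(rs) == 2
    assert rs[0].name == 'Read 1'
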
def test_read_iteration():
    r = Read("name", 15)
    r.add_variant(100, 1, 37)
    r.add_variant(23, 0, 99)
    v1 = Variant(position=100, allele=1, quality=37)
    v2 = Variant(position=23, allele=0, quality=99)
    variants = list(r)
    assert variants == [v1, v2]
    # negative indices
    assert r[-1] == v2
    assert r[-2] == v1

def string_to_readset(s, w=None, sample_ids=None, source_id=0, scale_quality=None):
    s = textwrap.dedent(s).strip()
    if w is not None:
        w = textwrap.dedent(w).strip().split("\n")
    rs = ReadSet()
    for index, line in enumerate(s.split("\n")):
        if len(line) == 0:
            continue
        if sample_ids is None:
            read = Read("Read {}".format(index + 1), 50, source_id)
        else:
            read = Read("Read {}".format(index + 1), 50, source_id, sample_ids[index])
        for pos, c in enumerate(line):
            if c == " ":
                continue
            q = 1
            if w is not None:
                q = int(w[index][pos])
            if scale_quality is not None:
                read.add_variant(position=(pos + 1) * 10, allele=int(c), quality=q * scale_quality)
            else:
                read.add_variant(position=(pos + 1) * 10, allele=int(c), quality=q)
        assert len(read) > 1, "Reads covering less than two variants are not allowed"
        rs.add(read)
    print(rs)
    return rs

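
# A minimal, hedged usage sketch (not part of the original source) for the variant above:
# the optional weight matrix w supplies per-variant qualities and scale_quality multiplies
# them. Test name and inputs are illustrative only.
def test_string_to_readset_weights_sketch():
    reads = """
      01
      10
    """
    weights = """
      23
      45
    """
    rs = string_to_readset(reads, w=weights, scale_quality=10)
    assert len(rs) == 2
    assert [v.quality for v in rs[0]] == [20, 30]
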
def split_readset(readset, ext_block_starts, index):
    """
    Creates one sub-readset for every block. Reads which cross block borders are also split,
    so parts of them appear in multiple blocks. Reads inside a sub-readset are trimmed, such
    that they do not contain variants outside of their associated blocks.
    """
    var_to_block = [0 for _ in range(ext_block_starts[-1])]
    for i in range(len(ext_block_starts) - 1):
        for var in range(ext_block_starts[i], ext_block_starts[i + 1]):
            var_to_block[var] = i
    block_readsets = [ReadSet() for i in range(len(ext_block_starts) - 1)]
    for i, read in enumerate(readset):
        if not read.is_sorted():
            read.sort()
        start = var_to_block[index[read[0].position]]
        end = var_to_block[index[read[-1].position]]
        if start == end:
            # if the read lies entirely in one block, copy it into the according readset
            block_readsets[start].add(read)
        else:
            # split the read by creating one new read for each covered block
            current_block = start
            read_slice = Read(
                name=read.name,
                source_id=read.source_id,
                sample_id=read.sample_id,
                reference_start=read.reference_start,
                BX_tag=read.BX_tag,
            )
            for variant in read:
                if var_to_block[index[variant.position]] != current_block:
                    block_readsets[current_block].add(read_slice)
                    current_block = var_to_block[index[variant.position]]
                    read_slice = Read(
                        name=str(current_block) + "_" + read.name,
                        source_id=read.source_id,
                        sample_id=read.sample_id,
                        reference_start=read.reference_start,
                        BX_tag=read.BX_tag,
                    )
                read_slice.add_variant(variant.position, variant.allele, variant.quality)
            block_readsets[current_block].add(read_slice)
    return block_readsets

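
# A minimal, hedged usage sketch (not part of the original source): it assumes the same Read
# constructor keywords and properties that split_readset itself uses, and an "index" dict
# mapping variant positions to their rank (as get_position_map would produce). A single read
# spanning two blocks is expected to be split into one slice per block.
def test_split_readset_sketch():
    rs = ReadSet()
    r = Read("R1", 50)
    for k, allele in enumerate([0, 1, 1, 0]):
        r.add_variant((k + 1) * 10, allele, 1)
    rs.add(r)
    index = {10: 0, 20: 1, 30: 2, 40: 3}
    blocks = split_readset(rs, ext_block_starts=[0, 2, 4], index=index)
    assert len(blocks) == 2
    assert [len(b) for b in blocks] == [1, 1]
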
def phase_single_individual(readset, phasable_variant_table, sample, phasing_param, output, timers):
    # Compute the genotypes that belong to the variant table and create a list of all genotypes
    genotype_list = create_genotype_list(phasable_variant_table, sample)

    # Select reads, only for debug
    # selected_reads = select_reads(readset, 120, preferred_source_ids = vcf_source_ids)
    # readset = selected_reads

    # Precompute block borders based on read coverage and linkage between variants
    logger.info("Detecting connected components with weak interconnect ..")
    timers.start("detecting_blocks")
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)
    if phasing_param.block_cut_sensitivity == 0:
        block_starts = [0]
    elif phasing_param.block_cut_sensitivity == 1:
        block_starts = compute_linkage_based_block_starts(
            readset, index, phasing_param.ploidy, single_linkage=True
        )
    else:
        block_starts = compute_linkage_based_block_starts(
            readset, index, phasing_param.ploidy, single_linkage=False
        )

    # Set block borders and split readset
    ext_block_starts = block_starts + [num_vars]
    num_non_singleton_blocks = len(
        [i for i in range(len(block_starts)) if ext_block_starts[i] < ext_block_starts[i + 1] - 1]
    )
    logger.info(
        "Split heterozygous variants into {} blocks (and {} singleton blocks).".format(
            num_non_singleton_blocks, len(block_starts) - num_non_singleton_blocks
        )
    )
    block_readsets = split_readset(readset, ext_block_starts, index)
    timers.stop("detecting_blocks")

    # Process blocks independently
    (
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
    ) = ([], [], [], [], [])
    processed_non_singleton_blocks = 0
    for block_id, block_readset in enumerate(block_readsets):
        block_start = ext_block_starts[block_id]
        block_end = ext_block_starts[block_id + 1]
        block_num_vars = block_end - block_start
        assert len(block_readset.get_positions()) == block_num_vars
        if block_num_vars > 1:
            # Only print for non-singleton block
            processed_non_singleton_blocks += 1
            logger.info(
                "Processing block {} of {} with {} reads and {} variants.".format(
                    processed_non_singleton_blocks,
                    num_non_singleton_blocks,
                    len(block_readset),
                    block_num_vars,
                )
            )
        genotype_slice = genotype_list[block_start:block_end]
        clustering, path, haplotypes, cut_positions, haploid_cuts = phase_single_block(
            block_readset, genotype_slice, phasing_param, timers
        )
        blockwise_clustering.append(clustering)
        blockwise_paths.append(path)
        blockwise_haplotypes.append(haplotypes)
        blockwise_cut_positions.append(cut_positions)
        blockwise_haploid_cuts.append(haploid_cuts)

    # Aggregate blockwise results
    clustering, threading, haplotypes, cut_positions, haploid_cuts = aggregate_phasing_blocks(
        block_starts,
        block_readsets,
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
        phasing_param,
    )

    # Summarize data for VCF file
    accessible_positions = sorted(readset.get_positions())
    components = {}
    haploid_components = {}
    ext_cuts = cut_positions + [num_vars]
    for i, cut_pos in enumerate(cut_positions):
        for pos in range(ext_cuts[i], ext_cuts[i + 1]):
            components[accessible_positions[pos]] = accessible_positions[ext_cuts[i]]
            components[accessible_positions[pos] + 1] = accessible_positions[ext_cuts[i]]
            haploid_components[accessible_positions[pos]] = [0] * phasing_param.ploidy
            haploid_components[accessible_positions[pos] + 1] = [0] * phasing_param.ploidy
    for j in range(phasing_param.ploidy):
        ext_cuts = haploid_cuts[j] + [num_vars]
        for i, cut_pos in enumerate(haploid_cuts[j]):
            for pos in range(ext_cuts[i], ext_cuts[i + 1]):
                haploid_components[accessible_positions[pos]][j] = accessible_positions[ext_cuts[i]]
                haploid_components[accessible_positions[pos] + 1][j] = accessible_positions[ext_cuts[i]]

    superreads = ReadSet()
    for i in range(phasing_param.ploidy):
        read = Read("superread {}".format(i + 1), 0, 0)
        # insert alleles
        for j, allele in enumerate(haplotypes[i]):
            if allele == "n":
                continue
            allele = int(allele)
            # TODO: Needs changes for multi-allelic and we might give an actual quality value
            read.add_variant(accessible_positions[j], allele, 0)
        superreads.add(read)

    # Plot option
    if phasing_param.plot_clusters or phasing_param.plot_threading:
        timers.start("create_plots")
        draw_plots(
            block_readsets,
            clustering,
            threading,
            haplotypes,
            cut_positions,
            genotype_list,
            phasable_variant_table,
            phasing_param,
            output,
        )
        timers.stop("create_plots")

    # Return results
    return components, haploid_components, superreads

def test_read():
    r = Read("name", 15)
    assert r.name == "name"
    assert r.mapqs[0] == 15
    assert r.is_sorted()
    r.add_variant(100, 1, 37)
    r.add_variant(23, 0, 99)
    assert not r.is_sorted()
    r.sort()
    assert r.is_sorted()
    assert 100 in r
    assert 23 in r
    assert 22 not in r
    assert 24 not in r
    assert 1000 not in r
    assert -1000 not in r

def test_readset():
    rs = ReadSet()
    r = Read('Read A', 56)
    r.add_variant(100, 1, 37)
    r.add_variant(101, 0, 18)
    rs.add(r)
    r = Read('Read B', 0)
    r.add_variant(101, 0, 23)
    rs.add(r)
    r = Read('Read C', 17)
    r.add_variant(99, 1, 27)
    r.add_variant(80, 1, 17)
    r[1] = Variant(position=105, allele=0, quality=14)
    rs.add(r)
    assert rs[0].name == 'Read A'
    assert rs[1].name == 'Read B'
    assert rs[2].name == 'Read C'
    rs.sort()
    # should be sorted after finalization
    assert rs[0].name == 'Read C'
    assert rs[1].name == 'Read A'
    assert rs[2].name == 'Read B'
    assert len(rs) == 3
    assert rs.get_positions() == [99, 100, 101, 105]
    r = rs[(0, 'Read A')]
    assert r.name == 'Read A'
    assert r.mapqs == (56, ), str(r.mapqs)
    r = rs[(0, 'Read B')]
    assert r.name == 'Read B'
    assert r.mapqs == (0, )
    r = rs[(0, 'Read C')]
    assert r.name == 'Read C'
    assert r.mapqs == (17, )
    assert len(r) == 2
    assert r[0] == Variant(position=99, allele=1, quality=27)
    assert r[1] == Variant(position=105, allele=0, quality=14)

def test_read_indexerror2():
    r = Read("name", 15)
    r.add_variant(100, 1, 37)
    r.add_variant(23, 0, 99)
    with raises(IndexError):
        _ = r[-3]

def phase_single_individual(readset, phasable_variant_table, sample, phasing_param, output, timers):
    # Compute the genotypes that belong to the variant table and create a list of all genotypes
    genotype_list = create_genotype_list(phasable_variant_table, sample)

    # Select reads, only for debug
    # selected_reads = select_reads(readset, 120, preferred_source_ids = vcf_source_ids)
    # readset = selected_reads

    # Precompute block borders based on read coverage and linkage between variants
    logger.info("Detecting connected components with weak interconnect ..")
    timers.start("detecting_blocks")
    index, rev_index = get_position_map(readset)
    num_vars = len(rev_index)
    if phasing_param.block_cut_sensitivity == 0:
        block_starts = [0]
    elif phasing_param.block_cut_sensitivity == 1:
        block_starts = compute_linkage_based_block_starts(
            readset, index, phasing_param.ploidy, single_linkage=True
        )
    else:
        block_starts = compute_linkage_based_block_starts(
            readset, index, phasing_param.ploidy, single_linkage=False
        )

    # Set block borders and split readset
    ext_block_starts = block_starts + [num_vars]
    num_non_singleton_blocks = len(
        [i for i in range(len(block_starts)) if ext_block_starts[i] < ext_block_starts[i + 1] - 1]
    )
    logger.info(
        "Split heterozygous variants into {} blocks (and {} singleton blocks).".format(
            num_non_singleton_blocks, len(block_starts) - num_non_singleton_blocks
        )
    )
    block_readsets = split_readset(readset, ext_block_starts, index)
    timers.stop("detecting_blocks")

    # Process blocks independently
    (
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
    ) = ([], [], [], [], [])

    # Create genotype slices for blocks
    genotype_slices = []
    for block_id, block_readset in enumerate(block_readsets):
        block_start = ext_block_starts[block_id]
        block_end = ext_block_starts[block_id + 1]
        block_num_vars = block_end - block_start
        assert len(block_readset.get_positions()) == block_num_vars
        genotype_slices.append(genotype_list[block_start:block_end])

    processed_non_singleton_blocks = 0
    # use process pool for multiple threads
    if phasing_param.threads == 1:
        # for single-threading, process everything individually to minimize memory footprint
        for block_id, block_readset in enumerate(block_readsets):
            block_num_vars = ext_block_starts[block_id + 1] - ext_block_starts[block_id]
            if block_num_vars > 1:
                # Only print for non-singleton block
                processed_non_singleton_blocks += 1
                logger.info(
                    "Processing block {} of {} with {} reads and {} variants.".format(
                        processed_non_singleton_blocks,
                        num_non_singleton_blocks,
                        len(block_readset),
                        block_num_vars,
                    )
                )
            clustering, path, haplotypes, cut_positions, haploid_cuts = phase_single_block(
                block_readset, genotype_slices[block_id], phasing_param, timers
            )
            blockwise_clustering.append(clustering)
            blockwise_paths.append(path)
            blockwise_haplotypes.append(haplotypes)
            blockwise_cut_positions.append(cut_positions)
            blockwise_haploid_cuts.append(haploid_cuts)
    else:
        # sort block readsets in descending order by number of reads
        joblist = [(i, len(block_readsets[i])) for i in range(len(block_readsets))]
        joblist.sort(key=lambda x: -x[1])
        timers.start("phase_blocks")

        # process large jobs first, 4/3-approximation for scheduling problem
        with Pool(processes=phasing_param.threads) as pool:
            """
            TODO: Python's multiprocessing makes hard copies of the passed arguments, which is
            not trivial for cython objects, especially when they contain pointers to other
            cython objects. Any passed object must be (de)serializable (in Python: pickle).
            All other objects created in the main thread are also accessible by the workers,
            but they are handled via the copy-on-write policy. This means that e.g. the large
            main readset is not hard-copied for every thread, as long as it is not modified
            there. Since this would cause a massive waste of memory, this must not be done and
            the main readset must also never be passed as an argument to the workers.
            """
            process_results = [
                pool.apply_async(
                    phase_single_block_mt,
                    (
                        block_readsets[block_id],
                        genotype_slices[block_id],
                        phasing_param,
                        timers,
                        block_id,
                        job_id,
                        num_non_singleton_blocks,
                    ),
                )
                for job_id, (block_id, block_readset) in enumerate(joblist)
            ]
            blockwise_results = [res.get() for res in process_results]

            # reorder results again
            blockwise_results.sort(key=lambda x: x[-1])

            # collect all blockwise results
            for (
                clustering,
                path,
                haplotypes,
                cut_positions,
                haploid_cuts,
                block_id,
            ) in blockwise_results:
                blockwise_clustering.append(clustering)
                blockwise_paths.append(path)
                blockwise_haplotypes.append(haplotypes)
                blockwise_cut_positions.append(cut_positions)
                blockwise_haploid_cuts.append(haploid_cuts)

        timers.stop("phase_blocks")

    # Aggregate blockwise results
    clustering, threading, haplotypes, cut_positions, haploid_cuts = aggregate_phasing_blocks(
        block_starts,
        block_readsets,
        blockwise_clustering,
        blockwise_paths,
        blockwise_haplotypes,
        blockwise_cut_positions,
        blockwise_haploid_cuts,
        phasing_param,
    )

    # Summarize data for VCF file
    accessible_positions = sorted(readset.get_positions())
    components = {}
    haploid_components = {}
    ext_cuts = cut_positions + [num_vars]
    for i, cut_pos in enumerate(cut_positions):
        for pos in range(ext_cuts[i], ext_cuts[i + 1]):
            components[accessible_positions[pos]] = accessible_positions[ext_cuts[i]]
            components[accessible_positions[pos] + 1] = accessible_positions[ext_cuts[i]]
            haploid_components[accessible_positions[pos]] = [0] * phasing_param.ploidy
            haploid_components[accessible_positions[pos] + 1] = [0] * phasing_param.ploidy
    for j in range(phasing_param.ploidy):
        ext_cuts = haploid_cuts[j] + [num_vars]
        for i, cut_pos in enumerate(haploid_cuts[j]):
            for pos in range(ext_cuts[i], ext_cuts[i + 1]):
                haploid_components[accessible_positions[pos]][j] = accessible_positions[ext_cuts[i]]
                haploid_components[accessible_positions[pos] + 1][j] = accessible_positions[
                    ext_cuts[i]
                ]

    superreads = ReadSet()
    for i in range(phasing_param.ploidy):
        read = Read("superread {}".format(i + 1), 0, 0)
        # insert alleles
        for j, allele in enumerate(haplotypes[i]):
            if allele == "n":
                continue
            allele = int(allele)
            # TODO: Needs changes for multi-allelic and we might give an actual quality value
            read.add_variant(accessible_positions[j], allele, 0)
        superreads.add(read)

    # Plot option
    if phasing_param.plot_clusters or phasing_param.plot_threading:
        timers.start("create_plots")
        draw_plots(
            block_readsets,
            clustering,
            threading,
            haplotypes,
            cut_positions,
            genotype_list,
            phasable_variant_table,
            phasing_param.plot_clusters,
            phasing_param.plot_threading,
            output,
        )
        timers.stop("create_plots")

    # Return results
    return components, haploid_components, superreads

def test_readscoring_toy():
    readset = ReadSet()
    # Each read covers four consecutive positions; tuples are (start position, alleles)
    read_data = [
        (0, [0, 0, 0, 1]),  # name1
        (1, [1, 0, 0, 1]),  # name2
        (2, [0, 1, 0, 1]),  # name3
        (3, [0, 1, 0, 0]),  # name4
        (4, [0, 1, 1, 0]),  # name5
        (5, [0, 0, 0, 1]),  # name6
        (6, [1, 0, 0, 1]),  # name7
    ]
    for i, (start, alleles) in enumerate(read_data):
        read = Read("name{}".format(i + 1), 15)
        for offset, allele in enumerate(alleles):
            read.add_variant(start + offset, allele, 1)
        readset.add(read)

    sim = scoreReadsetGlobal(readset, 2, 2)
    assert sim.get(0, 1) < 0.0
    assert sim.get(0, 2) > 0.0
    assert sim.get(0, 3) <= 0.0
    assert sim.get(0, 4) >= 0.0
    assert sim.get(0, 5) <= 0.0
    assert sim.get(0, 6) >= 0.0
    assert sim.get(1, 2) < 0.0
    assert sim.get(1, 3) > 0.0
    assert sim.get(1, 4) <= 0.0
    assert sim.get(1, 5) >= 0.0
    assert sim.get(1, 6) <= 0.0
    assert sim.get(2, 3) < 0.0
    assert sim.get(2, 4) > 0.0
    assert sim.get(2, 5) <= 0.0
    assert sim.get(2, 6) >= 0.0
    assert sim.get(3, 4) < 0.0
    assert sim.get(3, 5) > 0.0
    assert sim.get(3, 6) <= 0.0
    assert sim.get(4, 5) < 0.0
    assert sim.get(4, 6) > 0.0
    assert sim.get(5, 6) < 0.0

def create_testinstance1():
    var_pos = [
        24, 56, 89, 113, 162, 166, 187, 205, 211, 248, 273,
        299, 307, 324, 351, 370, 378, 400, 441, 455, 478, 492,
    ]
    readset = ReadSet()
    matrix = [
        "0011000",
        "11010100",
        " 101011010",
        " 0001011000",
        " 11001001",
        " 0010100000",
        " 100010001",
        " 0100000101",
        " 101110001",
        " 0001110011",
        " 1010001010",
        " 011100011",
        " 0010100111",
        " 1010101011",
        " 0101001110",
        " 01000001",
        " 01010001",
        " 101100",
        " 111010",
    ]
    for i in range(len(matrix)):
        read = Read(name="read" + str(i), mapq=15)
        for j in range(len(matrix[i])):
            if matrix[i][j] != " ":
                read.add_variant(var_pos[j], int(matrix[i][j]), 0)
        readset.add(read)
    clustering = [
        [0, 4, 6],
        [1, 2],
        [7, 10, 13],
        [9, 12, 14],
        [3, 5, 8, 11],
        [15, 16],
        [17],
        [18],
    ]
    genotypes = [
        {0: 2, 1: 1},
        {0: 2, 1: 1},
        {0: 2, 1: 1},
        {0: 1, 1: 2},
        {0: 2, 1: 1},
        {0: 2, 1: 1},
        {0: 2, 1: 1},
        {0: 2, 1: 1},
        {0: 2, 1: 1},
        {0: 3, 1: 0},
        {0: 2, 1: 1},
        {0: 2, 1: 1},
        {0: 2, 1: 1},
        {0: 1, 1: 2},
        {0: 2, 1: 1},
        {0: 2, 1: 1},
        {0: 1, 1: 2},
        {0: 2, 1: 1},
        {0: 1, 1: 2},
        {0: 2, 1: 1},
        {0: 2, 1: 1},
        {0: 2, 1: 1},
    ]
    return readset, var_pos, clustering, genotypes

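
# A hedged sanity-check sketch (not part of the original source): the helper above returns
# the read matrix together with a reference clustering and one genotype dict per variant,
# so the shapes should line up. The test name is illustrative only.
def test_create_testinstance1_shapes_sketch():
    readset, var_pos, clustering, genotypes = create_testinstance1()
    assert len(var_pos) == len(genotypes) == 22
    assert sorted(r for cluster in clustering for r in cluster) == list(range(len(readset)))
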
def test_merge_pair_with_shared_positions(merge):
    left = Read("Name1")
    left.add_variant(100, 0, 31)
    left.add_variant(200, 0, 32)
    left.add_variant(300, 0, 33)

    right = Read("Name2")
    right.add_variant(200, 0, 41)  # alleles agree
    right.add_variant(300, 1, 42)  # alleles disagree
    right.add_variant(400, 1, 43)

    expected = [
        Variant(100, 0, 31),
        Variant(200, 0, 32 + 41),
        Variant(300, 1, 42),
        Variant(400, 1, 43),
    ]
    assert expected == list(merge(left, right))
    assert expected == list(merge(right, left))

def merge(self, readset):
    """
    Return a set of reads after merging together subsets of reads (into super reads)
    from an input readset according to a probabilistic model of how likely sets of
    reads are to appear together on one haplotype and on opposite haplotypes.

    readset -- the input .core.ReadSet object
    error_rate -- the probability that a nucleotide is wrong
    max_error_rate -- the maximum error rate of any edge of the read merging graph
                      allowed before we discard it
    threshold -- the threshold of the ratio between the probabilities that a pair
                 of reads come from the same haplotype and different haplotypes
    neg_threshold -- the threshold of the ratio between the probabilities that a pair
                     of reads come from the same haplotype and different haplotypes
    """
    logger.info(
        "Merging %d reads with error rate %.2f, maximum error rate %.2f, "
        "positive threshold %d and negative threshold %d ...",
        len(readset),
        self._error_rate,
        self._max_error_rate,
        self._positive_threshold,
        self._negative_threshold,
    )
    logger.debug("Merging started.")
    gblue = Graph()
    gred = Graph()
    gnotblue = Graph()
    gnotred = Graph()

    # Probability that any nucleotide is wrong
    error_rate = self._error_rate
    logger.debug("Error Rate: %s", error_rate)

    # If an edge has too many errors, we discard it since it is not reliable
    max_error_rate = self._max_error_rate
    logger.debug("Max Error Rate: %s", max_error_rate)

    # Threshold of the ratio between the probabilities that the two reads come from
    # the same side or from different sides
    thr = self._positive_threshold
    logger.debug("Positive Threshold: %s", thr)

    # Threshold_neg is a more conservative threshold for the evidence
    # that two reads should not be clustered together.
    thr_neg = self._negative_threshold
    logger.debug("Negative Threshold: %s", thr_neg)

    thr_diff = 1 + int(log(thr, (1 - error_rate) / (error_rate / 3)))
    thr_neg_diff = 1 + int(log(thr_neg, (1 - error_rate) / (error_rate / 3)))
    logger.debug("Thr. Diff.: %s - Thr. Neg. Diff.: %s", thr_diff, thr_neg_diff)

    logger.debug("Start reading the reads...")
    id = 0
    orig_reads = {}
    queue = {}
    reads = {}
    for read in readset:
        id += 1
        begin_str = read[0][0]
        snps = []
        orgn = []
        for variant in read:
            site = variant[0]
            zyg = variant[1]
            qual = variant[2]
            orgn.append([str(site), str(zyg), str(qual)])
            if int(zyg) == 0:
                snps.append("G")
            else:
                snps.append("C")
        begin = int(begin_str)
        end = begin + len(snps)
        orig_reads[id] = orgn
        gblue.add_node(id, begin=begin, end=end, sites="".join(snps))
        gnotblue.add_node(id, begin=begin, end=end, sites="".join(snps))
        gred.add_node(id, begin=begin, end=end, sites="".join(snps))
        gnotred.add_node(id, begin=begin, end=end, sites="".join(snps))
        queue[id] = {"begin": begin, "end": end, "sites": snps}
        reads[id] = {"begin": begin, "end": end, "sites": snps}
        for x in [id for id in queue.keys() if queue[id]["end"] <= begin]:
            del queue[x]
        for id1 in queue.keys():
            if id == id1:
                continue
            match, mismatch = eval_overlap(queue[id1], queue[id])
            if (match + mismatch >= thr_neg_diff
                    and min(match, mismatch) / (match + mismatch) <= max_error_rate
                    and match - mismatch >= thr_diff):
                gblue.add_edge(id1, id, match=match, mismatch=mismatch)
            if mismatch - match >= thr_diff:
                gred.add_edge(id1, id, match=match, mismatch=mismatch)
            if match - mismatch >= thr_neg_diff:
                gnotred.add_edge(id1, id, match=match, mismatch=mismatch)
            if mismatch - match >= thr_neg_diff:
                gnotblue.add_edge(id1, id, match=match, mismatch=mismatch)

    logger.debug("Finished reading the reads.")
    logger.debug("Number of reads: %s", id)
    logger.debug("Blue Graph")
    logger.debug(
        "Nodes: %s - Edges: %s - ConnComp: %s",
        number_of_nodes(gblue),
        number_of_edges(gblue),
        len(list(connected_components(gblue))),
    )
    logger.debug("Non-Blue Graph")
    logger.debug(
        "Nodes: %s - Edges: %s - ConnComp: %s",
        number_of_nodes(gnotblue),
        number_of_edges(gnotblue),
        len(list(connected_components(gnotblue))),
    )
    logger.debug("Red Graph")
    logger.debug(
        "Nodes: %s - Edges: %s - ConnComp: %s",
        number_of_nodes(gred),
        number_of_edges(gred),
        len(list(connected_components(gred))),
    )
    logger.debug("Non-Red Graph")
    logger.debug(
        "Nodes: %s - Edges: %s - ConnComp: %s",
        number_of_nodes(gnotred),
        number_of_edges(gnotred),
        len(list(connected_components(gnotred))),
    )

    # We consider the notblue edges as an evidence that two reads
    # should not be merged together.
    # Since we want to merge each blue connected component into
    # a single superread, we check each notblue edge (r1, r2) and
    # we remove some blue edges so that r1 and r2 are not in the
    # same blue connected component.
    blue_component = {}
    current_component = 0
    for conncomp in connected_components(gblue):
        for v in conncomp:
            blue_component[v] = current_component
        current_component += 1

    # Keep only the notblue edges that are inside a blue connected component
    good_notblue_edges = [
        (v, w) for (v, w) in gnotblue.edges() if blue_component[v] == blue_component[w]
    ]

    for (u, v) in good_notblue_edges:
        while v in node_connected_component(gblue, u):
            path = shortest_path(gblue, source=u, target=v)
            # Remove the edge with the smallest support.
            # A better strategy is to weight each edge with -log p
            # and remove the minimum (u,v)-cut.
            w, x = min(
                zip(path[:-1], path[1:]),
                key=lambda p: gblue[p[0]][p[1]]["match"] - gblue[p[0]][p[1]]["mismatch"],
            )
            gblue.remove_edge(w, x)

    # Merge blue components (somehow)
    logger.debug("Started Merging Reads...")
    superreads = {}  # superreads given by the clusters (if clustering)
    rep = {}  # cluster representative of a read in a cluster
    for cc in connected_components(gblue):
        if len(cc) > 1:
            r = min(cc)
            superreads[r] = {}
            for id in cc:
                rep[id] = r

    for id in orig_reads:
        if id in rep:
            for tok in orig_reads[id]:
                site = int(tok[0])
                zyg = int(tok[1])
                qual = int(tok[2])
                r = rep[id]
                if site not in superreads[r]:
                    superreads[r][site] = [0, 0]
                superreads[r][site][zyg] += qual

    merged_reads = ReadSet()
    readn = 0
    for id in orig_reads:
        read = Read("read" + str(readn))
        readn += 1
        if id in rep:
            if id == rep[id]:
                for site in sorted(superreads[id]):
                    z = superreads[id][site]
                    if z[0] >= z[1]:
                        read.add_variant(site, 0, z[0] - z[1])
                    elif z[1] > z[0]:
                        read.add_variant(site, 1, z[1] - z[0])
                merged_reads.add(read)
        else:
            for tok in orig_reads[id]:
                read.add_variant(int(tok[0]), int(tok[1]), int(tok[2]))
            merged_reads.add(read)

    logger.debug("Finished merging reads.")
    logger.info(
        "... after merging: merged %d reads into %d reads",
        len(readset),
        len(merged_reads),
    )
    return merged_reads

def test_merge_pair_without_shared_positions(merge):
    empty1 = Read("Name1")
    empty2 = Read("Name2")
    assert merge(empty1, empty2).name == "Name1"
    assert merge(empty2, empty1).name == "Name2"

    # add_variant parameters are: (position, allele, quality)
    left = Read("Name1")
    left.add_variant(100, 0, 31)
    left.add_variant(200, 0, 32)
    right = Read("Name2")
    right.add_variant(300, 1, 41)
    right.add_variant(400, 1, 42)
    expected = [
        Variant(100, 0, 31),
        Variant(200, 0, 32),
        Variant(300, 1, 41),
        Variant(400, 1, 42),
    ]
    assert expected == list(merge(left, right))
    assert expected == list(merge(right, left))

    outer = Read("Name1")
    outer.add_variant(100, 0, 31)
    outer.add_variant(400, 1, 42)
    inner = Read("Name2")
    inner.add_variant(200, 0, 32)
    inner.add_variant(300, 1, 41)
    assert expected == list(merge(inner, outer))
    assert expected == list(merge(outer, inner))

def gfa_to_readset(gfa_filename, split_gap=100, w=None, sample_ids=None, source_id=0, scale_quality=None):
    rs = ReadSet()
    node_length = {}
    node_coverage = {}
    with open(gfa_filename) as gfa_file:
        for line in gfa_file:
            fields = line.strip().split("\t")
            if fields[0] != "S":
                continue
            node_length[int(fields[1])] = len(fields[2])
    with open(gfa_filename) as gfa_file:
        for line in gfa_file:
            fields = line.strip().split("\t")
            if fields[0] != "P":
                continue
            path_name = fields[1]
            path_str = fields[2]
            for i in [int(s[:-1]) for s in path_str.split(",")]:
                if i in node_coverage:
                    node_coverage[i] += 1
                else:
                    node_coverage[i] = 1
    with open(gfa_filename) as gfa_file:
        for line in gfa_file:
            fields = line.strip().split("\t")
            if fields[0] != "P":
                continue
            path_name = fields[1]
            path_str = fields[2]
            # order it
            path = sorted(set([int(s[:-1]) for s in path_str.split(",")]))
            # break each path into pieces separated by > x nodes
            # (todo: use actual distance in the graph)
            # for each, add it to the ReadSet
            path_length = len(path)
            segment_idx = 0
            i = 0
            # how do we find segments?
            longest_read = None
            while i < path_length:
                # read = Read("{}\t{}".format(path_name, segment_idx), 50, source_id)
                read = Read("{}".format(path_name), 50, source_id)
                segment_idx += 1
                q = 1
                # while the distance to the next node is less than our split_gap threshold
                curr = path[i]
                read.add_variant(
                    position=curr,
                    allele=1,
                    quality=-10 * math.log10(1 - 1.0 / node_coverage[curr] + 0.001),
                )
                last = curr
                i += 1
                while i < path_length:
                    curr = path[i]
                    dist = 0
                    for node_id in range(last + 1, curr):
                        dist += node_length[node_id]
                    # eprint("for", path_name, "dist is", dist)
                    if dist > split_gap:
                        break
                    else:
                        for node_id in range(last + 1, curr):
                            # eprint(node_coverage[node_id])
                            read.add_variant(
                                position=node_id, allele=0, quality=1
                            )  # -10*math.log10(1-1.0/node_coverage[node_id]+0.001))
                        read.add_variant(
                            position=curr,
                            allele=1,
                            quality=-10 * math.log10(1 - 1.0 / node_coverage[curr] + 0.001),
                        )
                    i += 1
                    last = curr
                # read.sort()  # not sure if needed
                # if len(read) > min_read_length:
                if longest_read is None or len(read) > len(longest_read):
                    longest_read = read
                # rs.add(read)
            rs.add(longest_read)
    rs.sort()
    # print(rs)
    return rs
