Example #1
0
def test_not_conflicting_hedges__without_common_positions():
    """Hedges over disjoint position ranges are never ambiguous.

    ATG
       TGA
    """
    first = HEdge(positions=[1, 2, 3], nucls=['A', 'T', 'G'])
    second = HEdge(positions=[4, 5, 6], nucls=['T', 'G', 'A'])
    assert not first.is_ambiguous_with(second)
Example #2
0
def test_not_conflicting_hedges__distinct_endings_of_common_positions():
    """No ambiguity when the overlap's boundary nucleotides disagree.

    ATAG
     *|*
     GATA
    """
    first = HEdge(positions=[1, 2, 3, 4], nucls=['A', 'T', 'A', 'G'])
    second = HEdge(positions=[2, 3, 4, 5], nucls=['G', 'A', 'T', 'A'])
    assert not first.is_ambiguous_with(second)
Example #3
0
def test_not_conflicting_hedges__all_common_positions_are_distinct():
    """No ambiguity when every shared position carries a different nucl.

    ATG
     **
     GTA
    """
    first = HEdge(positions=[1, 2, 3], nucls=['A', 'T', 'G'])
    second = HEdge(positions=[2, 3, 4], nucls=['G', 'T', 'A'])
    assert not first.is_ambiguous_with(second)
Example #4
0
def test_conflicting_hedges__distinct_rightmost_common_nucl():
    """A mismatch at the rightmost shared position is a conflict.

    AAG
     |*
     ATA
    """
    first = HEdge(positions=[1, 2, 3], nucls=['A', 'A', 'G'])
    second = HEdge(positions=[2, 3, 4], nucls=['A', 'T', 'A'])
    assert first.is_ambiguous_with(
        second), 'Rightmost common nucl must raise conflict'
Example #5
0
def test_merge_hedges__inclusion():
    """Merging with a fully contained hedge yields the outer hedge.

    ATGA
     ||
     TG
    """
    outer = HEdge(positions=[1, 2, 3, 4], nucls=['A', 'T', 'G', 'A'])
    inner = HEdge(positions=[2, 3], nucls=['T', 'G'])
    merged = outer.merge_with(inner)
    assert merged.positions == [1, 2, 3, 4]
    assert merged.nucls == ['A', 'T', 'G', 'A']
    # Merge order must not matter.
    assert outer.merge_with(inner) == inner.merge_with(outer)
Example #6
0
def get_pairs(hedges: Dict[frozenset, Dict[str, HEdge]],
              ever_created_hedges: Dict[frozenset, Dict[str, HEdge]]):
    """Collect candidate hedge pairs whose union would be a new hedge.

    Two hedges form a candidate pair when their position ranges overlap or
    are adjacent, neither position set contains the other, their shared
    positions agree on nucleotides, and the union has not been created
    before.

    Args:
        hedges: Mapping position-set -> (nucl string -> hedge) of the
            currently active hedges.
        ever_created_hedges: Same mapping accumulated over all iterations;
            used to skip unions that were already produced.

    Returns:
        List of (hedge1, hedge2) candidate pairs, or an empty list when no
        pair of key sets would yield a genuinely new (non-subset)
        combination.
    """
    # NOTE(fix): the original ranged over len(interesting_snp_sets) while
    # indexing int_sets — correct only because both collections happened to
    # have the same length. Build the key list once and iterate it directly.
    int_sets = [key for key, group in hedges.items() if group]
    found_any_new_conditions = False
    pairs = []
    for i, set1 in enumerate(int_sets):
        for set2 in int_sets[i + 1:]:
            # Skip pairs whose position ranges neither overlap nor touch.
            if min(max(set1), max(set2)) + 1 < max(min(set1), min(set2)):
                continue
            if set1 == set2:
                continue
            if not set1.issubset(set2) and not set2.issubset(set1):
                found_any_new_conditions = True
            for hedge1 in hedges[set1].values():
                for hedge2 in hedges[set2].values():
                    pos1 = set(hedge1.positions)
                    pos2 = set(hedge2.positions)
                    # Inclusion merges add nothing new — check this cheap
                    # condition before comparing nucleotides.
                    if pos1.issubset(pos2) or pos2.issubset(pos1):
                        continue
                    # Shared positions must carry identical nucleotides.
                    if any(hedge1.snp_to_nucl[pos] != hedge2.snp_to_nucl[pos]
                           for pos in hedge1.positions
                           if pos in hedge2.positions):
                        continue
                    new_hedge = HEdge.union(hedge1, hedge2)
                    new_h_nucls = new_hedge.nucls
                    frozen_positions = frozenset(new_hedge.positions)
                    if (frozen_positions not in ever_created_hedges
                            or new_h_nucls
                            not in ever_created_hedges[frozen_positions]):
                        hedge1.used = False
                        hedge2.used = False
                        pairs.append((hedge1, hedge2))
    return pairs if found_any_new_conditions else []
Example #7
0
File: core.py  Project: VEK239/HyperHaplo
def create_hedges(reads: List[List[pysam.AlignedSegment]],
                  target_snps: List[SNP],
                  region_start: int,
                  verbose=True) -> List[HEdge]:
    """Build hyperedges from reads, keeping only nucleotides at target SNPs.

    Args:
        reads: Each read is a list of one (single) or two (paired) aligned
            segments.
        target_snps: SNPs of interest; positions are reference coordinates.
        region_start: Start offset of the analysed region in the reference.
        verbose: When True, show a progress bar and print per-hedge stats.

    Returns:
        Unique hedges, each initialized with its accumulated weight and the
        ids of the reads that produced it.

    Raises:
        ValueError: If a read has more than two parts, or indels in a read
            prevent hedge construction.
    """
    all_snp_positions = set(snp.position
                            for snp in target_snps)  # ref positions
    snp2genome, genome2snp = SNP.reindex_snp_and_genome_mapping(target_snps)

    non_snp_reads_count = 0
    holed_reads_count = 0
    chimera_paired_read_count = 0  # TODO: actually detect chimera pairs

    hedges: List[HEdge] = []
    hedges_weight = defaultdict(int)
    hedges_ids = defaultdict(set)
    for read_id, read in enumerate(
            tqdm(reads, desc='Create hedges from reads', disable=not verbose)):
        # Positions below are still reference positions.
        if len(read) == 1:
            positions, nucls = SNP.select_snps_from_single_read(
                read[0], all_snp_positions, region_start)
            positions, nucls = [positions], [nucls]
            start_pos = [read[0].pos]
        elif len(read) == 2:
            positions1, nucls1 = SNP.select_snps_from_single_read(
                read[0], all_snp_positions, region_start)
            positions2, nucls2 = SNP.select_snps_from_single_read(
                read[1], all_snp_positions, region_start)
            positions = []
            nucls = []
            start_pos = []
            # Keep only the mate(s) that actually cover a SNP.
            if len(positions1) > 0:
                positions.append(positions1)
                nucls.append(nucls1)
                start_pos.append(read[0].pos)
            if len(positions2) > 0:
                positions.append(positions2)
                nucls.append(nucls2)
                start_pos.append(read[1].pos)
            # TODO: detect and skip chimera paired reads.
        else:
            raise ValueError(
                f'Read must be single or paired, but given read with {len(read)} parts'
            )

        if any(map(len, positions)):  # at least one part covers a SNP
            try:
                created_hedges = []
                for pos, nucl in zip(positions, nucls):
                    created_hedges.append(
                        HEdge.build([pos], [nucl], snp2genome, genome2snp,
                                    start_pos))

                for hedge in created_hedges:
                    if hedge is not None:
                        # Deduplicate by hash; accumulate weight and remember
                        # which reads contributed to each hedge.
                        if hash(hedge) not in hedges_weight:
                            hedges.append(hedge)
                        hedges_weight[hash(hedge)] += 1
                        hedges_ids[hash(hedge)].add(read_id)
                    else:
                        # TODO handle holes in reads
                        holed_reads_count += 1
            except ValueError as err:
                # TODO fix indels in read; chain the original cause.
                raise ValueError('Fix indels in read') from err

        else:
            non_snp_reads_count += 1

    # Fix: weight/id initialization must run regardless of verbosity.
    # Previously this loop was gated behind `verbose`, so quiet runs
    # returned hedges with uninitialized weights and edge ids.
    for he in hedges:
        he.init_weight(hedges_weight[hash(he)])
        he.edge_ids = hedges_ids[hash(he)]

    if verbose:
        print(f'Skipped reads without SNP   : {non_snp_reads_count}')
        print(f'Skipped reads with holes    : {holed_reads_count}')
        print(f'Skipped chimera paired reads: {chimera_paired_read_count}')
        for he in hedges:
            print(he.start_pos, he.weight, he.positions, he.nucls)

    return hedges
Example #8
0
def algo_merge_hedge_contigs(
        hedges: Dict[frozenset, Dict[str, HEdge]],
        target_snp_count: int,
        error_probability: float = 0,
        verbose: bool = False,
        debug: bool = False) -> Tuple[List[HEdge], Mapping[str, List]]:
    """Greedily merge hedges until they span all target SNPs.

    Repeatedly selects the best-scoring candidate pair, merges it,
    redistributes the parents' leftover weight/frequency, and collects
    full-length hedges whose frequency clears the error threshold.

    Args:
        hedges: Mapping position-set -> (nucl string -> hedge).
        target_snp_count: Number of SNPs a full haplotype must cover.
        error_probability: Frequency threshold (fraction); full-length
            hedges at or below ``error_probability * 100`` are discarded.
        verbose: Print intermediate state when True.
        debug: Unused; kept for interface compatibility.

    Returns:
        Tuple of (full-length haplotype hedges with frequencies normalized
        to percentages, metrics mapping).
    """
    ever_created_hedges = deepcopy(hedges)
    metrics = defaultdict(list)
    if verbose:
        print('----Algo started----')

    remove_leftovers(hedges, error_probability)

    # Drop every position-set key fully contained in another key.
    keys_to_delete = set()
    hedges_keys = list(hedges.keys())
    for key1_i, key1 in enumerate(hedges_keys):
        for key2 in hedges_keys[key1_i + 1:]:
            if key1.issubset(key2):
                keys_to_delete.add(key1)
            elif key2.issubset(key1):
                keys_to_delete.add(key2)
    print(keys_to_delete)
    for key in keys_to_delete:
        del hedges[key]

    pairs = get_pairs(hedges, ever_created_hedges)
    haplo_hedges = []
    while len(pairs) > 0:
        print('Iteration started, pairs count: ', len(pairs))
        if verbose:
            print('Current hedges')
            for s, h in hedges.items():
                print(s)
                print(h)

            print('-------------printing pairs---------')
            for pair in pairs:
                print(pair)

        # Fix: track the index together with the pair. `pairs.index(pair)`
        # returned the first pair *equal* to the best one, which may be a
        # different object when equal-comparing pairs coexist.
        index, pair = max(
            enumerate(pairs),
            key=lambda ip: (
                get_intersection_snp_length(ip[1]),
                get_union_snp_length(ip[1]),
                get_union_pairs_percent(ip[1]),
                min(ip[1][0].frequency, ip[1][1].frequency),
                -abs(ip[1][0].frequency - ip[1][1].frequency),
                min(ip[1][0].positions[0], ip[1][1].positions[0]),
            ))
        he1, he2 = pair
        if verbose:
            print(f'Merging {pair[0]} and {pair[1]}')
        new_hedge = HEdge.union(he1, he2)
        freq1, freq2, freq_new = check_leftovers_distribution(
            he1.weight, he1.frequency, he2.weight, he2.frequency)
        new_hedge.frequency = freq_new * 100
        new_hedge.weight = (1 - freq1) * he1.weight + (1 - freq2) * he2.weight
        if len(new_hedge.positions) == target_snp_count:
            # Full-length hedge: keep it only above the error threshold.
            if new_hedge.frequency > error_probability * 100:
                haplo_hedges.append(new_hedge)
        else:
            new_h_nucls = new_hedge.nucls
            frozen_positions = frozenset(new_hedge.positions)
            if (frozen_positions not in ever_created_hedges
                    or new_h_nucls
                    not in ever_created_hedges[frozen_positions]):
                hedges[frozen_positions][new_h_nucls] = new_hedge
                ever_created_hedges[frozen_positions][new_h_nucls] = new_hedge
            else:
                # Duplicate of an earlier hedge: refresh state and retry
                # without updating the parents.
                hedges = remove_leftovers(hedges, error_probability)
                pairs = get_pairs(hedges, ever_created_hedges)
                continue
        # Redistribute the leftover weight/frequency of the merged parents.
        pairs[index][0].weight *= freq1 * 100 / pairs[index][0].frequency
        pairs[index][1].weight *= freq2 * 100 / pairs[index][1].frequency
        pairs[index][0].frequency = freq1 * 100
        pairs[index][1].frequency = freq2 * 100
        hedges[frozenset(
            pairs[index][0].positions)][he1.nucls] = pairs[index][0]
        hedges[frozenset(
            pairs[index][1].positions)][he2.nucls] = pairs[index][1]
        hedges = remove_leftovers(hedges, error_probability)
        pairs = get_pairs(hedges, ever_created_hedges)
    print('----Finished algo----')
    print(haplo_hedges)
    print('----Recounting frequencies----')
    freq_sum = sum(hedge.frequency for hedge in haplo_hedges)
    print(freq_sum)
    # Fix: normalize only when there is something to normalize — the
    # original divided by freq_sum unconditionally and raised
    # ZeroDivisionError when no haplotype survived.
    if freq_sum > 0:
        for i in range(len(haplo_hedges)):
            haplo_hedges[i].frequency = (
                haplo_hedges[i].frequency / freq_sum * 100)
    for haplo in haplo_hedges:
        print(haplo)
    return haplo_hedges, metrics