예제 #1
0
def _merge_scaffolds(big_scaffolds, small_scaffolds):
    """
    Performs the final merging step

    For every pair of neighbouring non-repeat contigs of a big scaffold,
    the same two contigs are looked up (by ``perm``) in the small scaffolds.
    If both are found in the same small scaffold, their relative
    orientation agrees, and none of the contigs lying between them in the
    small scaffold is a unique contig of the big scaffolds, those
    in-between contigs are inserted between the pair. Otherwise the contigs
    the big scaffold already had between the pair are kept.

    A ``perm`` that occurs more than once within the big scaffolds, or more
    than once within the small scaffolds, is treated as a repeat and is
    never used as an anchor.

    Args:
        big_scaffolds: re-iterable collection of scaffolds (objects with
            ``name`` and ``contigs``) forming the backbone of the result.
        small_scaffolds: re-iterable collection of scaffolds supplying the
            contigs to insert.

    Returns:
        A list with one entry per big scaffold: a new ``Scaffold`` carrying
        the merged contig list, or the original scaffold object when the
        rebuilt list holds at most one contig before its final anchor is
        appended.

    Side effects:
        When contigs are inserted, ``link`` of the left anchor contig (an
        object owned by the big scaffold) is overwritten in place. Summary
        statistics are written to ``logger`` at debug level.

    Raises:
        AssertionError: if a non-repeat ``perm`` is indexed twice while
            building the small-scaffold index.
    """
    count_diff_scaf = 0
    count_diff_orient = 0
    count_inconsistent = 0

    total_success = 0
    total_fail = 0
    total_inserted = 0
    not_found = 0

    big_count = defaultdict(int)
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_count[c.perm] += 1

    small_count = defaultdict(int)
    for scf in small_scaffolds:
        for c in scf.contigs:
            small_count[c.perm] += 1

    # The two counters are chained, not summed: a perm is a repeat only if
    # it occurs more than once inside one of the two scaffold sets.
    repeats = {
        seq
        for seq, count in chain(big_count.items(), small_count.items())
        if count > 1
    }
    big_unique = {seq for seq, count in big_count.items() if count == 1}

    # perm -> (small scaffold, position inside it), non-repeats only
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            if contig.perm not in repeats:
                assert contig.perm not in small_index
                small_index[contig.perm] = (scf, pos)

    new_scaffolds = []
    for big_scf in big_scaffolds:
        big_contigs = big_scf.contigs
        # Positions of the contigs that may serve as anchors
        non_repeats = [
            i for i, c in enumerate(big_contigs) if c.perm not in repeats
        ]

        new_contigs = []
        for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]):
            left_cnt = big_contigs[left_idx]
            right_cnt = big_contigs[right_idx]

            consistent = False
            weak_contigs = None
            link_to_change = None
            if (left_cnt.perm in small_index
                    and right_cnt.perm in small_index):
                consistent = True
                left_scf, left_pos = small_index[left_cnt.perm]
                right_scf, right_pos = small_index[right_cnt.perm]

                big_sign = left_cnt.sign == right_cnt.sign
                small_sign = (left_scf.contigs[left_pos].sign ==
                              right_scf.contigs[right_pos].sign)

                if left_scf != right_scf:
                    count_diff_scaf += 1
                    consistent = False
                elif big_sign != small_sign:
                    count_diff_orient += 1
                    consistent = False
                else:
                    # Normalise so that left_pos < right_pos; same_dir
                    # remembers whether the small scaffold runs in the same
                    # direction as the big one.
                    same_dir = left_pos < right_pos
                    if not same_dir:
                        left_pos, right_pos = right_pos, left_pos

                    small_contigs = left_scf.contigs
                    weak_contigs = small_contigs[left_pos + 1:right_pos]
                    if any(c.perm in big_unique for c in weak_contigs):
                        # A contig that already has a unique place in the
                        # big scaffolds must not be inserted a second time.
                        count_inconsistent += 1
                        consistent = False
                    elif weak_contigs:
                        if same_dir:
                            link_to_change = copy(
                                small_contigs[left_pos].link)
                        else:
                            # Reverse complement. Each link is stored on
                            # the contig preceding it, so after reversing
                            # the order every link moves one contig over.
                            link_to_change = copy(
                                small_contigs[right_pos - 1].link)
                            weak_contigs = [
                                c.reverse_copy()
                                for c in reversed(weak_contigs)
                            ]
                            for pw, nw in zip(weak_contigs[:-1],
                                              weak_contigs[1:]):
                                pw.link = copy(nw.link)
                            weak_contigs[-1].link = copy(
                                small_contigs[left_pos].link)
            else:
                not_found += 1

            new_contigs.append(left_cnt)
            if consistent and weak_contigs:
                left_cnt.link = link_to_change
                new_contigs.extend(weak_contigs)
                total_success += 1
                total_inserted += len(weak_contigs)
            else:
                # Nothing to insert: keep what the big scaffold had here
                new_contigs.extend(big_contigs[left_idx + 1:right_idx])
                total_fail += 1

        if len(new_contigs) > 1:
            # The loop only ever appends the left anchor of each pair, so
            # the last anchor is added here.
            # NOTE(review): contigs located before the first or after the
            # last non-repeat anchor are not carried over to the rebuilt
            # scaffold - confirm this trimming is intended.
            new_contigs.append(big_contigs[non_repeats[-1]])
            s = Scaffold(big_scf.name)
            s.contigs = new_contigs
            new_scaffolds.append(s)
        else:  #because of repeats
            new_scaffolds.append(big_scf)

    logger.debug("Fail: not found: %d", not_found)
    logger.debug("Fail: different scaffolds: %d", count_diff_scaf)
    logger.debug("Fail: different orientatilns: %d", count_diff_orient)
    logger.debug("Fail: inconsistent: %d", count_inconsistent)
    logger.debug("Total success: %d", total_success)
    logger.debug("Total fail: %d", total_fail)
    logger.debug("Total inserted: %d", total_inserted)

    num_contigs = sum(len(scf.contigs) for scf in new_scaffolds)
    logger.debug("Result: %d contigs in %d scaffolds", num_contigs,
                 len(new_scaffolds))

    return new_scaffolds
예제 #2
0
def _merge_scaffolds(big_scaffolds, small_scaffolds):
    """
    Performs the final merging step

    For every pair of neighbouring non-repeat contigs of a big scaffold,
    the same two contigs are looked up (by ``perm``) in the small scaffolds.
    If both are found in the same small scaffold, their relative
    orientation agrees, and none of the contigs lying between them in the
    small scaffold is a unique contig of the big scaffolds, those
    in-between contigs are inserted between the pair. Otherwise the contigs
    the big scaffold already had between the pair are kept.

    A ``perm`` that occurs more than once within the big scaffolds, or more
    than once within the small scaffolds, is treated as a repeat and is
    never used as an anchor.

    Args:
        big_scaffolds: re-iterable collection of scaffolds (objects with
            ``name`` and ``contigs``) forming the backbone of the result.
        small_scaffolds: re-iterable collection of scaffolds supplying the
            contigs to insert.

    Returns:
        A list with one entry per big scaffold: a new ``Scaffold`` carrying
        the merged contig list, or the original scaffold object when the
        rebuilt list holds at most one contig before its final anchor is
        appended.

    Side effects:
        When contigs are inserted, ``link`` of the left anchor contig (an
        object owned by the big scaffold) is overwritten in place. Summary
        statistics are written to ``logger`` at debug level.

    Raises:
        AssertionError: if a non-repeat ``perm`` is indexed twice while
            building the small-scaffold index.
    """
    # Local import: links are copied so that the merged scaffold does not
    # share link objects with the small scaffolds.
    from copy import copy

    count_diff_scaf = 0
    count_diff_orient = 0
    count_inconsistent = 0

    total_success = 0
    total_fail = 0
    total_inserted = 0
    not_found = 0

    big_count = defaultdict(int)
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_count[c.perm] += 1

    small_count = defaultdict(int)
    for scf in small_scaffolds:
        for c in scf.contigs:
            small_count[c.perm] += 1

    # The two counters are chained, not summed: a perm is a repeat only if
    # it occurs more than once inside one of the two scaffold sets.
    repeats = set(seq for (seq, count) in
                  chain(big_count.items(), small_count.items()) if count > 1)
    big_unique = set(seq for (seq, count) in big_count.items() if count == 1)

    # perm -> (small scaffold, position inside it), non-repeats only
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            if contig.perm not in repeats:
                assert contig.perm not in small_index
                small_index[contig.perm] = (scf, pos)

    new_scaffolds = []
    for big_scf in big_scaffolds:
        big_contigs = big_scf.contigs
        # Positions of the contigs that may serve as anchors. enumerate()
        # replaces the Python-2-only xrange() and works on 2 and 3 alike.
        non_repeats = [i for i, c in enumerate(big_contigs)
                       if c.perm not in repeats]

        new_contigs = []
        for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]):
            left_cnt = big_contigs[left_idx]
            right_cnt = big_contigs[right_idx]

            consistent = False
            weak_contigs = None
            link_to_change = None
            if (left_cnt.perm in small_index and
                right_cnt.perm in small_index):
                consistent = True
                left_scf, left_pos = small_index[left_cnt.perm]
                right_scf, right_pos = small_index[right_cnt.perm]

                big_sign = left_cnt.sign == right_cnt.sign
                small_sign = (left_scf.contigs[left_pos].sign ==
                              right_scf.contigs[right_pos].sign)

                if left_scf != right_scf:
                    count_diff_scaf += 1
                    consistent = False
                elif big_sign != small_sign:
                    count_diff_orient += 1
                    consistent = False
                else:
                    # Normalise so that left_pos < right_pos; same_dir
                    # remembers whether the small scaffold runs in the same
                    # direction as the big one.
                    same_dir = left_pos < right_pos
                    if not same_dir:
                        left_pos, right_pos = right_pos, left_pos

                    small_contigs = left_scf.contigs
                    weak_contigs = small_contigs[left_pos + 1 : right_pos]
                    if any(c.perm in big_unique for c in weak_contigs):
                        # A contig that already has a unique place in the
                        # big scaffolds must not be inserted a second time.
                        count_inconsistent += 1
                        consistent = False
                    elif weak_contigs:
                        if same_dir:
                            link_to_change = copy(
                                        small_contigs[left_pos].link)
                        else:
                            # Reverse complement. Each link is stored on
                            # the contig preceding it, so after reversing
                            # the order every link moves one contig over:
                            # the left anchor takes the link that led into
                            # the right-hand small anchor, and the last
                            # inserted contig takes the link that left the
                            # left-hand small anchor.
                            link_to_change = copy(
                                        small_contigs[right_pos - 1].link)
                            weak_contigs = [c.reverse_copy()
                                            for c in weak_contigs[::-1]]
                            for pw, nw in zip(weak_contigs[:-1],
                                              weak_contigs[1:]):
                                pw.link = copy(nw.link)
                            weak_contigs[-1].link = copy(
                                        small_contigs[left_pos].link)
            else:
                not_found += 1

            new_contigs.append(left_cnt)
            # Require something to insert: when the anchors are adjacent in
            # the small scaffold, the contigs the big scaffold has between
            # them must be kept rather than silently dropped.
            if consistent and weak_contigs:
                left_cnt.link = link_to_change
                new_contigs.extend(weak_contigs)
                total_success += 1
                total_inserted += len(weak_contigs)
            else:
                new_contigs.extend(big_contigs[left_idx+1:right_idx])
                total_fail += 1

        if len(new_contigs) > 1:
            # The loop only ever appends the left anchor of each pair, so
            # the last anchor is added here.
            # NOTE(review): contigs located before the first or after the
            # last non-repeat anchor are not carried over to the rebuilt
            # scaffold - confirm this trimming is intended.
            new_contigs.append(big_contigs[non_repeats[-1]])
            s = Scaffold(big_scf.name)
            s.contigs = new_contigs
            new_scaffolds.append(s)
        else:   #because of repeats
            new_scaffolds.append(big_scf)

    logger.debug("Fail: not found: {0}".format(not_found))
    logger.debug("Fail: different scaffolds: {0}".format(count_diff_scaf))
    logger.debug("Fail: different orientatilns: {0}".format(count_diff_orient))
    logger.debug("Fail: inconsistent: {0}".format(count_inconsistent))
    logger.debug("Total success: {0}".format(total_success))
    logger.debug("Total fail: {0}".format(total_fail))
    logger.debug("Total inserted: {0}".format(total_inserted))

    num_contigs = 0
    for scf in new_scaffolds:
        num_contigs += len(scf.contigs)
    logger.debug("Result: {0} contigs in {1} scaffolds"
                                    .format(num_contigs, len(new_scaffolds)))

    return new_scaffolds
예제 #3
0
파일: merge_iters.py 프로젝트: wimag/Ragout
def merge(big_scaffolds, small_scaffolds):
    """
    Merges the scaffolds of two iterations.

    For every pair of neighbouring contigs of a big scaffold, the same two
    contigs are looked up (by ``name``) in the small scaffolds. If both lie
    in the same small scaffold, their relative orientation agrees, and none
    of the contigs between them there already belongs to a big scaffold,
    those in-between contigs are inserted between the pair.

    Args:
        big_scaffolds: re-iterable collection of scaffolds (objects with
            ``name`` and ``contigs``) forming the backbone of the result.
        small_scaffolds: collection of scaffolds supplying the contigs to
            insert; contig names must be unique across all of them.

    Returns:
        A list with one new ``Scaffold`` per big scaffold, in input order.

    Side effects:
        When contigs are inserted, ``link`` of the preceding big-scaffold
        contig is overwritten in place.

    Raises:
        AssertionError: if a contig name occurs twice in the small
            scaffolds, or if two neighbouring big-scaffold contigs resolve
            to the same small-scaffold position.
    """
    logger.info("Merging two iterations")
    # Names of all contigs that already have a place in the big scaffolds
    big_index = set()
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_index.add(c.name)

    # contig name -> (small scaffold, position inside it)
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            assert contig.name not in small_index
            small_index[contig.name] = (scf, pos)

    new_scaffolds = []
    for scf in big_scaffolds:
        result = []
        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            result.append(prev_cont)

            try:
                scf_prev, begin = small_index[prev_cont.name]
                scf_new, end = small_index[new_cont.name]
            except KeyError:
                continue
            if scf_prev.name != scf_new.name:
                continue

            assert end != begin
            # Normalise so that begin < end; same_dir remembers whether the
            # small scaffold runs in the same direction as the big one.
            same_dir = True
            if end < begin:
                same_dir = False
                end, begin = begin, end

            contigs = scf_prev.contigs[begin + 1 : end]
            # Skip when the anchors are adjacent (nothing to insert) or when
            # an in-between contig is already placed in a big scaffold.
            if not contigs or any(c.name in big_index for c in contigs):
                continue

            if ((prev_cont.sign == new_cont.sign) !=
                (scf_prev.contigs[begin].sign == scf_prev.contigs[end].sign)):
                continue

            if not same_dir:
                # NOTE(review): relies on reverse() returning the reversed
                # contig rather than None - confirm against Contig.
                contigs = [c.reverse() for c in contigs[::-1]]
            #keeping gap from new contigs
            # NOTE(review): the link is read from the lower-positioned
            # anchor of the small scaffold for both directions; for a
            # reversed insertion that may not be the gap next to prev_cont -
            # confirm.
            result[-1].link = scf_prev.contigs[begin].link
            result.extend(contigs)

        # The loop appends only the first contig of each pair, so the last
        # contig is added here. It is taken from the scaffold itself rather
        # than from the loop variable, which is unbound (or left over from
        # the previous scaffold) when there are fewer than two contigs.
        if scf.contigs:
            result.append(scf.contigs[-1])
        s = Scaffold(scf.name)
        s.contigs = result
        new_scaffolds.append(s)

    return new_scaffolds
예제 #4
0
def merge(big_scaffolds, small_scaffolds):
    """
    Merges the scaffolds of two iterations.

    For every pair of neighbouring contigs of a big scaffold, the same two
    contigs are looked up (by ``name``) in the small scaffolds. If both lie
    in the same small scaffold, their relative orientation agrees, and none
    of the contigs between them there already belongs to a big scaffold,
    those in-between contigs are inserted between the pair.

    Args:
        big_scaffolds: re-iterable collection of scaffolds (objects with
            ``name`` and ``contigs``) forming the backbone of the result.
        small_scaffolds: collection of scaffolds supplying the contigs to
            insert; contig names must be unique across all of them.

    Returns:
        A list with one new ``Scaffold`` per big scaffold, in input order.
        Contigs inserted in the forward direction are the small scaffold's
        own objects; contigs inserted in the reverse direction are newly
        built ``Contig`` objects with the sign negated.

    Raises:
        AssertionError: if a contig name occurs twice in the small
            scaffolds, or if two neighbouring big-scaffold contigs resolve
            to the same small-scaffold position.
    """
    logger.info("Merging two iterations")
    # Names of all contigs that already have a place in the big scaffolds
    big_index = set()
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_index.add(c.name)

    # contig name -> (small scaffold, position inside it)
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            assert contig.name not in small_index
            small_index[contig.name] = (scf, pos)

    new_scaffolds = []
    for scf in big_scaffolds:
        result = []
        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            result.append(prev_cont)

            try:
                scf_prev, begin = small_index[prev_cont.name]
                scf_new, end = small_index[new_cont.name]
            except KeyError:
                continue
            if scf_prev.name != scf_new.name:
                continue

            assert end != begin
            # Normalise so that begin < end; same_dir remembers whether the
            # small scaffold runs in the same direction as the big one.
            same_dir = True
            if end < begin:
                same_dir = False
                end, begin = begin, end

            contigs = scf_prev.contigs[begin + 1:end]
            # Skip when the anchors are adjacent (nothing to insert) or when
            # an in-between contig is already placed in a big scaffold.
            if not contigs or any(c.name in big_index for c in contigs):
                continue

            if ((prev_cont.sign == new_cont.sign) !=
                (scf_prev.contigs[begin].sign == scf_prev.contigs[end].sign)):
                continue

            if not same_dir:
                # Reverse the order and flip every sign.
                # NOTE(review): the meaning of the third Contig argument
                # (0) is not visible here - confirm against Contig.
                contigs = [Contig(c.name, -c.sign, 0) for c in contigs[::-1]]
            result.extend(contigs)

        # The loop appends only the first contig of each pair, so the last
        # contig is added here. It is taken from the scaffold itself rather
        # than from the loop variable, which is unbound (or left over from
        # the previous scaffold) when there are fewer than two contigs.
        if scf.contigs:
            result.append(scf.contigs[-1])
        s = Scaffold(scf.name)
        s.contigs = result
        new_scaffolds.append(s)

    return new_scaffolds