Пример #1
0
def _insert_from_graph_experement(graph_file, scaffolds_in, max_path_len):
    new_scaffolds = []
    graph = _load_dot(graph_file)

    ordered_contigs = set()
    for scf in scaffolds_in:
        ordered_contigs |= set(map(lambda s: s.name, scf.contigs))

    for scf in scaffolds_in:
        new_scaffolds.append(Scaffold(scf.name))

        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            new_scaffolds[-1].contigs.append(prev_cont)

            #find unique path
            path_nodes = _get_unigue_path_experiment(graph, prev_cont, new_cont,
                                                     max_path_len, ordered_contigs)

            if not path_nodes:
                continue

            #insert contigs along the path
            for node in path_nodes:
                new_scaffolds[-1].contigs.append(Contig.from_sting(node))

        new_scaffolds[-1].contigs.append(new_cont)
    return new_scaffolds
Пример #2
0
def _insert_from_graph(graph, scaffolds_in, max_path_len, contigs_fasta):
    """
    Inserts contigs from the assembly graph into scaffolds
    """
    new_scaffolds = []
    ordered_contigs = set()
    for scf in scaffolds_in:
        ordered_contigs |= set([c.name() for c in scf.contigs])
    reverse_graph = graph.reverse()

    for scf in scaffolds_in:
        new_scaffolds.append(Scaffold(scf.name))

        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            new_scaffolds[-1].contigs.append(prev_cont)

            #find contigs to insert
            path_nodes = _get_cut_vertices(graph, reverse_graph, prev_cont,
                                           new_cont, max_path_len,
                                           ordered_contigs)

            if not path_nodes:
                continue

            #insert contigs along the path
            supp_genomes = prev_cont.link.supporting_genomes
            prev_cont.link.supporting_assembly = True
            prev_cont.link.gap = config.vals["min_scaffold_gap"]
            for node in path_nodes:
                sign = 1 if node[0] == "+" else -1
                name = node[1:]

                new_contig = Contig.with_sequence(name,
                                                  len(contigs_fasta[name]),
                                                  sign)
                new_contig.link.supporting_assembly = True
                new_contig.link.gap = config.vals["min_scaffold_gap"]
                new_contig.link.supporting_genomes = supp_genomes
                new_scaffolds[-1].contigs.append(new_contig)

        new_scaffolds[-1].contigs.append(scf.contigs[-1])

    return new_scaffolds
Пример #3
0
def _insert_from_graph(graph_file, scaffolds_in, max_path_len):
    new_scaffolds = []
    graph = _load_dot(graph_file)
    logger.debug("Loaded overlap graph with {0} nodes".format(len(graph)))

    ordered_contigs = set()
    for scf in scaffolds_in:
        ordered_contigs |= set(map(lambda s: s.name, scf.contigs))

    for scf in scaffolds_in:
        new_scaffolds.append(Scaffold(scf.name))

        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            new_scaffolds[-1].contigs.append(prev_cont)

            #find unique path
            path_nodes = _get_unique_path(graph, prev_cont,
                                          new_cont, max_path_len)

            if not path_nodes:
                continue

            #check path consistency
            consistent = True
            for node in path_nodes:
                if node[1:] in ordered_contigs:
                    logger.debug("Path inconsistency {0} -- {1}: {2}"
                                 .format(prev_cont, new_cont, node))
                    consistent = False
                    break
            if not consistent:
                continue

            #insert contigs along the path
            for node in path_nodes:
                new_scaffolds[-1].contigs.append(Contig.from_sting(node))

        new_scaffolds[-1].contigs.append(new_cont)
    return new_scaffolds
Пример #4
0
def _merge_scaffolds(big_scaffolds, small_scaffolds):
    """
    Performs the final merging step
    """
    count_diff_scaf = 0
    count_diff_orient = 0
    count_inconsistent = 0

    total_success = 0
    total_fail = 0
    total_inserted = 0
    not_found = 0

    big_count = defaultdict(int)
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_count[c.perm] += 1

    small_count = defaultdict(int)
    for scf in small_scaffolds:
        for c in scf.contigs:
            small_count[c.perm] += 1

    repeats = set(seq for (
        seq,
        count) in chain(list(big_count.items()), list(small_count.items()))
                  if count > 1)
    big_unique = set(seq for (seq, count) in big_count.items() if count == 1)

    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            if contig.perm not in repeats:
                assert contig.perm not in small_index
                small_index[contig.perm] = (scf, pos)

    new_scafflods = []
    for big_scf in big_scaffolds:
        new_contigs = []
        #non_repeats = list(filter(lambda i: big_scf.contigs[i].perm
        #                                not in repeats,
        #                          xrange(len(big_scf.contigs))))
        non_repeats = [
            i for i in range(len(big_scf.contigs))
            if big_scf.contigs[i].perm not in repeats
        ]
        for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]):
            left_cnt = big_scf.contigs[left_idx]
            right_cnt = big_scf.contigs[right_idx]

            consistent = False
            weak_contigs = None
            link_to_change = None
            if (left_cnt.perm in small_index
                    and right_cnt.perm in small_index):
                consistent = True
                left_scf, left_pos = small_index[left_cnt.perm]
                right_scf, right_pos = small_index[right_cnt.perm]

                big_sign = left_cnt.sign == right_cnt.sign
                small_sign = (left_scf.contigs[left_pos].sign ==
                              right_scf.contigs[right_pos].sign)

                if left_scf != right_scf:
                    count_diff_scaf += 1
                    consistent = False
                elif big_sign != small_sign:
                    count_diff_orient += 1
                    consistent = False
                else:
                    same_dir = left_pos < right_pos
                    if not same_dir:
                        left_pos, right_pos = right_pos, left_pos

                    weak_contigs = left_scf.contigs[left_pos + 1:right_pos]
                    if any(c.perm in big_unique for c in weak_contigs):
                        count_inconsistent += 1
                        consistent = False

                    link_to_change = copy(left_scf.contigs[left_pos].link)
                    #reverse complement
                    if weak_contigs and not same_dir:
                        link_to_change = copy(left_scf.contigs[right_pos -
                                                               1].link)
                        weak_contigs = [
                            c.reverse_copy() for c in weak_contigs[::-1]
                        ]
                        for pw, nw in zip(weak_contigs[:-1], weak_contigs[1:]):
                            pw.link = copy(nw.link)
                        weak_contigs[-1].link = copy(
                            left_scf.contigs[left_pos].link)

            else:
                not_found += 1

            new_contigs.append(left_cnt)
            if consistent and weak_contigs:
                new_contigs[-1].link = link_to_change
                new_contigs.extend(weak_contigs)
                total_success += 1
                total_inserted += len(weak_contigs)
                #logger.debug("Inserting '{0}' between {1} and {2}"
                #             .format(map(lambda c: c.perm, weak_contigs),
                #                     left_cnt, right_cnt))
            else:
                new_contigs.extend(big_scf.contigs[left_idx + 1:right_idx])
                total_fail += 1

        if len(new_contigs) > 1:
            new_contigs.append(right_cnt)
            s = Scaffold(big_scf.name)
            s.contigs = new_contigs
            new_scafflods.append(s)
        else:  #because of repeats
            new_scafflods.append(big_scf)

    logger.debug("Fail: not found: %d", not_found)
    logger.debug("Fail: different scaffolds: %d", count_diff_scaf)
    logger.debug("Fail: different orientatilns: %d", count_diff_orient)
    logger.debug("Fail: inconsistent: %d", count_inconsistent)
    logger.debug("Total success: %d", total_success)
    logger.debug("Total fail: %d", total_fail)
    logger.debug("Total inserted: %d", total_inserted)

    num_contigs = 0
    for scf in new_scafflods:
        num_contigs += len(scf.contigs)
    logger.debug("Result: %d contigs in %d scaffolds", num_contigs,
                 len(new_scafflods))

    return new_scafflods
Пример #5
0
def merge(big_scaffolds, small_scaffolds):
    logger.info("Merging two iterations")
    big_index = set()
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_index.add(c.name)

    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            assert contig.name not in small_index
            small_index[contig.name] = (scf, pos)

    count = 0
    new_scafflods = []
    for scf in big_scaffolds:
        result = []
        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            result.append(prev_cont)

            try:
                scf_prev, begin = small_index[prev_cont.name]
                scf_new, end = small_index[new_cont.name]
            except KeyError:
                continue
            if scf_prev.name != scf_new.name:
                continue

            assert end != begin
            same_dir = True
            if end < begin:
                same_dir = False
                end, begin = begin, end

            consistent = True
            for c in scf_prev.contigs[begin + 1:end]:
                if c.name in big_index:
                    consistent = False
                    break

            if not consistent or end - begin == 1:
                continue

            if ((prev_cont.sign == new_cont.sign) !=
                (scf_prev.contigs[begin].sign == scf_prev.contigs[end].sign)):
                continue

            count += end - begin - 1
            contigs = scf_prev.contigs[begin + 1:end]
            if not same_dir:
                contigs = contigs[::-1]
                contigs = list(
                    map(lambda c: Contig(c.name, -c.sign, 0), contigs))
            result.extend(contigs)

        result.append(new_cont)
        s = Scaffold(scf.name)
        s.contigs = result
        new_scafflods.append(s)

    return new_scafflods