Exemplo n.º 1
0
def parse_assembler_output(assembler_name, input_dirpath, input_fpath,
                           output_dirpath, input_fasta_fpath, min_edge_len):
    """Parse an assembly graph from a graph file or an assembler output folder.

    When ``is_empty_file(input_fpath)`` is false, the graph is read from
    ``input_fpath``: FASTG files are first converted with ``fastg_to_gfa``,
    GFA/GFA2 files go through ``parse_gfa``, and DOT/GV files go through
    ``parse_abyss_dot`` (for ABySS) with ``parse_flye_dot`` as the fallback.
    Otherwise ``input_dirpath`` is parsed with the Canu, Flye or SPAdes
    folder parser selected by ``assembler_name``.

    Args:
        assembler_name: Name of the assembler; selects the DOT flavour and
            the folder parser.
        input_dirpath: Assembler output folder, used only when no usable
            graph file is given.
        input_fpath: Path to the graph file (FASTG, GFA, GFA2, DOT or GV).
        output_dirpath: Folder passed to the helpers that write files.
        input_fasta_fpath: File with edge sequences, used for DOT/GV input.
        min_edge_len: Minimum edge length, forwarded to the parsers.

    Returns:
        A ``(dict_edges, contig_edges, edges_fpath)`` tuple. Every edge in
        ``dict_edges`` has its ``start`` and ``end`` attributes converted to
        ``str``. ``contig_edges`` is an empty list when a graph file is parsed.

    Raises:
        SystemExit: If the FASTG conversion fails, the graph file has an
            unsupported extension, the DOT file cannot be parsed, or the
            assembler output folder is not supported.
    """
    edges_fpath = None
    if not is_empty_file(input_fpath):
        contig_edges = []
        # Keep the user-supplied path for error messages: the converter may
        # return a falsy value, which must not be concatenated into a string.
        graph_fpath = input_fpath
        if graph_fpath.endswith("fastg"):
            graph_fpath = fastg_to_gfa(graph_fpath, output_dirpath,
                                       assembler_name)
        if not graph_fpath:
            sys.exit("ERROR! Failed parsing " + str(input_fpath) + " file.")
        if graph_fpath.endswith(("gfa", "gfa2")):
            dict_edges = parse_gfa(graph_fpath, min_edge_len)
            edges_fpath = get_edges_from_gfa(graph_fpath, output_dirpath,
                                             min_edge_len)
        elif graph_fpath.endswith(("dot", "gv")):
            edges_fpath = format_edges_file(input_fasta_fpath, output_dirpath)
            dict_edges = dict()
            if is_abyss(assembler_name):
                dict_edges = parse_abyss_dot(graph_fpath, min_edge_len)
            if not dict_edges:
                # Fall back to the Flye DOT parser when the assembler is not
                # ABySS or the ABySS parser produced no edges.
                try:
                    dict_edges = parse_flye_dot(graph_fpath, min_edge_len)
                except Exception as e:
                    sys.exit(
                        "ERROR! Failed parsing " + graph_fpath + " file.\n"
                        "During parsing the following error has occured: " +
                        str(e) +
                        "\nPlease make sure that you correctly specified the assembler name using -a option. "
                        "DOT files produced by different assemblers can have very different formats.\n"
                        "Examples of input data can be found here https://github.com/almiheenko/AGB/tree/master/test_data"
                    )
        else:
            # Without this branch dict_edges would stay unbound and the
            # function would crash with UnboundLocalError further down.
            sys.exit(
                "ERROR! Unsupported format of " + graph_fpath + " file. "
                "Supported assembly graph formats: GFA, FASTG, GraphViz (dot/gv).")
    else:
        if is_canu(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_canu_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_flye(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_flye_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_spades(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_spades_output(
                input_dirpath, output_dirpath, min_edge_len)
        else:
            sys.exit(
                "Output folder of %s assembler can not be parsed! Supported assemblers: %s. "
                "More assemblers will be added in the next release.\n"
                "To visualize the assembly graph produced by this assembler, "
                "you should manually specify the assembly graph file in GFA/FASTG/GraphViz formats using --graph option "
                "and (optionally) file with edge sequences using --fasta option"
                % (assembler_name, ', '.join(SUPPORTED_ASSEMBLERS)))
    # Normalise node identifiers to strings regardless of which parser ran.
    for edge in dict_edges.values():
        edge.start, edge.end = str(edge.start), str(edge.end)
    return dict_edges, contig_edges, edges_fpath
Exemplo n.º 2
0
def parse_flye_output(input_dirpath, output_dirpath, min_edge_len):
    """Parse the assembly graph from a Flye output folder.

    Looks for ``assembly_graph.gv`` (falling back to ``assembly_graph.dot``)
    in ``input_dirpath`` and parses it with ``parse_flye_dot``. Contig
    information comes from ``parse_flye_assembly_info`` and the edges file
    from ``get_edges_from_gfa`` applied to ``assembly_graph.gfa``.

    Args:
        input_dirpath: Flye output folder to search.
        output_dirpath: Folder forwarded to ``get_edges_from_gfa``.
        min_edge_len: Minimum edge length, forwarded to the parsers.

    Returns:
        A ``(dict_edges, contig_edges, edges_fpath)`` tuple.

    Raises:
        SystemExit: With exit status 1 if neither graph file is found.
    """
    dot_fpath = find_file_by_pattern(
        input_dirpath, "assembly_graph.gv") or find_file_by_pattern(
            input_dirpath, "assembly_graph.dot")
    if not dot_fpath:
        # dot_fpath is falsy here, so report the names that were searched
        # for instead of interpolating the empty lookup result.
        print("ERROR! File %s is not found in %s! Please check the options" %
              ("assembly_graph.gv (or assembly_graph.dot)",
               abspath(input_dirpath)))
        sys.exit(1)
    dict_edges = parse_flye_dot(dot_fpath, min_edge_len)
    contig_edges = parse_flye_assembly_info(input_dirpath, dict_edges)
    # NOTE(review): gfa_fpath may be falsy if the GFA file is missing;
    # presumably get_edges_from_gfa tolerates that - confirm.
    gfa_fpath = find_file_by_pattern(input_dirpath, "assembly_graph.gfa")
    edges_fpath = get_edges_from_gfa(gfa_fpath, output_dirpath, min_edge_len)
    return dict_edges, contig_edges, edges_fpath