Пример #1
0
def get_edges_from_gfa(gfa_fpath, output_dirpath, min_edge_len):
    """Write the edge sequences of an assembly graph to a FASTA file.

    Sequences are taken from the ``S`` lines of ``gfa_fpath``. If that yields
    nothing, the FASTA file that sits next to the GFA (same directory, name
    built from ``get_filename(gfa_fpath) + ".fasta"``) is copied instead, with
    its headers rewritten.

    Args:
        gfa_fpath: Path to the GFA file. A falsy value short-circuits to None.
        output_dirpath: Directory that receives the resulting FASTA file.
        min_edge_len: Sequences shorter than this are not written when
            extracting from the GFA (the FASTA fallback is not filtered).

    Returns:
        Path of the FASTA file inside ``output_dirpath``, or None when
        ``gfa_fpath`` is falsy.
    """
    if not gfa_fpath:
        return None

    input_edges_fpath = join(dirname(gfa_fpath), get_filename(gfa_fpath) + ".fasta")
    edges_fpath = join(output_dirpath, basename(input_edges_fpath))
    if not is_empty_file(gfa_fpath) and not can_reuse(edges_fpath, files_to_check=[gfa_fpath]):
        print("Extracting edge sequences from " + gfa_fpath + "...")
        with open(edges_fpath, "w") as out, open(gfa_fpath) as f:
            for line in f:
                if not line.startswith('S'):
                    continue
                fs = line.strip().split()
                # A usable record needs the tag, a name and at least one more
                # column; skip truncated lines instead of failing with
                # IndexError halfway through the extraction.
                if len(fs) < 3:
                    continue
                seq_name = fs[1]
                # The sequence is either the third or the fourth column,
                # whichever passes is_acgt_seq (third column wins).
                seq = None
                if is_acgt_seq(fs[2]):
                    seq = fs[2]
                elif len(fs) >= 4 and is_acgt_seq(fs[3]):
                    seq = fs[3]
                if seq and len(seq) >= min_edge_len:
                    out.write(">%s\n" % get_edge_agv_id(get_edge_num(seq_name)))
                    out.write(seq)
                    out.write("\n")
    # Fallback: nothing was extracted from the GFA, but a FASTA file with the
    # same base name exists next to it. Copy it, renaming only the headers.
    if is_empty_file(edges_fpath) and not is_empty_file(input_edges_fpath):
        with open(edges_fpath, "w") as out, open(input_edges_fpath) as f:
            for line in f:
                if line.startswith('>'):
                    # Keep only the first whitespace-delimited token, minus '>'.
                    seq_name = line.strip().split()[0][1:]
                    out.write(">%s\n" % get_edge_agv_id(get_edge_num(seq_name)))
                else:
                    out.write(line)
    return edges_fpath
Пример #2
0
def map_edges_to_ref(input_fpath, output_dirpath, reference_fpath, threads):
    """Align edge sequences to a reference genome with minimap2.

    The alignment is written to ``mapping.paf`` and minimap2's stderr to
    ``minimap.log``, both inside ``output_dirpath``. Nothing is run when no
    reference is given, when ``can_reuse`` accepts the existing mapping, or
    when ``input_fpath`` is reported empty by ``is_empty_file``.

    Args:
        input_fpath: FASTA file with the edge sequences to align.
        output_dirpath: Directory for ``mapping.paf`` and ``minimap.log``.
        reference_fpath: Reference FASTA; a falsy value disables alignment.
        threads: Thread count passed to minimap2 via ``-t``.

    Returns:
        Path to ``mapping.paf``. The path is returned even if alignment was
        skipped or failed, so the file may be missing or empty.
    """
    mapping_fpath = join(output_dirpath, "mapping.paf")
    if not reference_fpath:
        return mapping_fpath
    if can_reuse(mapping_fpath, files_to_check=[input_fpath, reference_fpath]):
        return mapping_fpath
    if is_empty_file(input_fpath):
        print(
            "Warning! File with edge sequences was not found, failed aligning edges to the reference"
        )
        return mapping_fpath

    print("Aligning graph edges to the reference...")
    cmdline = [
        "minimap2", "-x", "asm20", "--score-N", "0", "-E", "1,0",
        "-N", "200", "-p", "0.5", "-f", "200", "-t",
        str(threads), reference_fpath, input_fpath
    ]
    log_fpath = join(output_dirpath, "minimap.log")
    # Context managers guarantee both handles are closed once minimap2 exits;
    # previously they were left open until garbage collection.
    with open(mapping_fpath, "w") as mapping_f, open(log_fpath, "w") as log_f:
        return_code = subprocess.call(cmdline, stdout=mapping_f, stderr=log_f)
    if return_code != 0 or is_empty_file(mapping_fpath):
        print(
            "Warning! Minimap2 failed aligning edges to the reference"
        )
    return mapping_fpath
Пример #3
0
def run(input_fpath, reference_fpath, out_fpath, output_dirpath, threads,
        is_meta):
    """Run QUAST on ``input_fpath`` against a reference, unless results can be reused.

    Args:
        input_fpath: Sequences to evaluate.
        reference_fpath: Reference passed to QUAST via ``-r``.
        out_fpath: The QUAST result file the caller is interested in.
        output_dirpath: QUAST output directory (``-o``); created if missing.
        threads: Thread count passed via ``-t``.
        is_meta: When truthy, adds ``--large`` and ``--min-identity 90``.

    Returns:
        ``out_fpath`` if it is non-empty and accepted by ``can_reuse`` after
        the run; None otherwise, including when quast.py cannot be located.
    """
    if not exists(output_dirpath):
        os.makedirs(output_dirpath)
    if not can_reuse(out_fpath, files_to_check=[input_fpath, reference_fpath]):
        quast_exec_path = get_path_to_program("quast.py")
        if not quast_exec_path:
            print("QUAST is not found!")
            return None
        cmdline = [quast_exec_path, "--fast", "--agb", input_fpath, "-r", reference_fpath,
                   "-t", str(threads), "-o", output_dirpath, "--min-contig", "0"]
        # Inputs above 10 MiB, and all metagenomic runs, get --large.
        if getsize(input_fpath) > 10 * 1024 * 1024 or is_meta:
            cmdline.append("--large")
        if is_meta:
            cmdline += ["--min-identity", "90"]
        # QUAST's console output is discarded. A single handle on the
        # platform's null device is shared by both streams and closed as soon
        # as the process exits.
        with open(os.devnull, "w") as devnull:
            subprocess.call(cmdline, stdout=devnull, stderr=devnull)
    # The exit code is not inspected; success is judged by the result file.
    if is_empty_file(out_fpath) or not can_reuse(
            out_fpath, files_to_check=[input_fpath, reference_fpath]):
        return None
    return out_fpath
Пример #4
0
def format_edges_file(input_fpath, output_dirpath):
    """Copy a FASTA file to ``edges.fasta`` with rewritten header lines.

    Each line starting with ``>`` is replaced by ``>`` followed by
    ``get_edge_agv_id(get_edge_num(...))`` of the header text; every other
    line is copied verbatim.

    Args:
        input_fpath: FASTA file to reformat.
        output_dirpath: Directory that receives ``edges.fasta``.

    Returns:
        Path to ``edges.fasta``, or None when ``is_empty_file`` reports the
        input as empty.
    """
    if is_empty_file(input_fpath):
        return None

    edges_fpath = join(output_dirpath, "edges.fasta")
    # An up-to-date copy from an earlier run is returned untouched.
    if can_reuse(edges_fpath, files_to_check=[input_fpath]):
        return edges_fpath

    with open(input_fpath) as src, open(edges_fpath, "w") as dst:
        for record_line in src:
            if not record_line.startswith('>'):
                dst.write(record_line)
                continue
            # Everything after '>' (trailing newline included) is handed to
            # get_edge_num.
            header_text = record_line[1:]
            dst.write(">%s\n" % get_edge_agv_id(get_edge_num(header_text)))
    return edges_fpath
Пример #5
0
def parse_canu_output(input_dirpath, output_dirpath, min_edge_len):
    """Load the assembly graph from a Canu output directory.

    Locates the ``.unitigs.gfa`` file, extracts its edge sequences, writes a
    patched copy of the GFA into ``output_dirpath`` and parses that copy.

    Args:
        input_dirpath: Canu output directory.
        output_dirpath: Directory for the edge FASTA and the patched GFA.
        min_edge_len: Minimum edge length, forwarded to the extraction and
            parsing helpers.

    Returns:
        Tuple ``(dict_edges, contig_edges, edges_fpath)``.

    Exits the process with status 1 if no GFA file is found.
    """
    raw_gfa_fpath = find_file_by_pattern(input_dirpath, ".unitigs.gfa")
    if not raw_gfa_fpath:
        print("ERROR! GFA file is not found in %s! Please check the options" %
              abspath(input_dirpath))
        sys.exit(1)
    edges_fpath = get_edges_from_gfa(raw_gfa_fpath, output_dirpath,
                                     min_edge_len)
    gfa_fpath = join(output_dirpath, basename(raw_gfa_fpath))
    if is_empty_file(gfa_fpath) or not can_reuse(
            gfa_fpath, files_to_check=[raw_gfa_fpath]):
        # Replace "bogart.edges" with "1.0" on the first line only.
        # NOTE(review): presumably this normalizes the GFA header version for
        # parse_gfa - confirm.
        # The argument list is passed directly so that a path containing
        # spaces or quotes reaches sed as a single argument, and the output
        # handle is closed as soon as sed finishes.
        sed_cmdline = ["sed", "1s/bogart.edges/1.0/", raw_gfa_fpath]
        with open(gfa_fpath, 'w') as patched_gfa:
            subprocess.call(sed_cmdline, stdout=patched_gfa)
    dict_edges = parse_gfa(gfa_fpath,
                           min_edge_len,
                           input_dirpath,
                           assembler="canu")
    contig_edges = parse_canu_assembly_info(input_dirpath, dict_edges)
    return dict_edges, contig_edges, edges_fpath