Пример #1
0
def get_edges_from_gfa(gfa_fpath, output_dirpath, min_edge_len):
    """Produce a FASTA file with the edge sequences of a GFA graph.

    The output file is placed in ``output_dirpath`` and named after the GFA
    file with a ``.fasta`` extension. Sequences are taken from the 'S'
    records of the GFA file; if that leaves the output empty, a FASTA file
    lying next to the GFA file (same name, ``.fasta`` extension) is copied
    instead with its headers rewritten.

    Args:
        gfa_fpath: path to the GFA file; a falsy value makes the function
            return None immediately.
        output_dirpath: directory the resulting FASTA file is written to.
        min_edge_len: sequences shorter than this are not extracted from
            the GFA file (the fallback FASTA copy is not filtered).

    Returns:
        Path of the output FASTA file, or None when ``gfa_fpath`` is falsy.
        The path is returned even if nothing could be written to it.
    """
    if not gfa_fpath:
        return None

    input_edges_fpath = join(dirname(gfa_fpath), get_filename(gfa_fpath) + ".fasta")
    edges_fpath = join(output_dirpath, basename(input_edges_fpath))

    needs_extraction = (not is_empty_file(gfa_fpath)
                        and not can_reuse(edges_fpath, files_to_check=[gfa_fpath]))
    if needs_extraction:
        print("Extracting edge sequences from " + gfa_fpath + "...")
        with open(edges_fpath, "w") as out, open(gfa_fpath) as gfa:
            for record in gfa:
                if not record.startswith('S'):
                    continue
                fields = record.strip().split()
                seq_name = fields[1]
                # The sequence is looked for in the third column first and in
                # the fourth one otherwise; is_acgt_seq decides what counts.
                if is_acgt_seq(fields[2]):
                    seq = fields[2]
                elif len(fields) >= 4 and is_acgt_seq(fields[3]):
                    seq = fields[3]
                else:
                    seq = None
                if not seq or len(seq) < min_edge_len:
                    continue
                header = ">%s\n" % get_edge_agv_id(get_edge_num(seq_name))
                out.write(header + seq + "\n")

    # Fallback: nothing was extracted, but a ready FASTA file sits next to the GFA.
    if is_empty_file(edges_fpath) and not is_empty_file(input_edges_fpath):
        with open(edges_fpath, "w") as out, open(input_edges_fpath) as fasta:
            for record in fasta:
                if record.startswith('>'):
                    seq_name = record.strip().split()[0][1:]
                    record = ">%s\n" % get_edge_agv_id(get_edge_num(seq_name))
                out.write(record)
    return edges_fpath
Пример #2
0
def parse_canu_unitigs_info(input_dirpath, dict_edges):
    """Annotate edges with coverage and repeat flags from a Canu tigInfo table.

    The table is the file matching ".unitigs.layout.tigInfo" in
    ``input_dirpath``. Its header must contain the "sugRept" and "coverage"
    columns; otherwise (or when the file is empty) ``dict_edges`` is
    returned unchanged.

    Args:
        input_dirpath: directory searched for the tigInfo file.
        dict_edges: mapping edge id -> edge object; objects get ``cov`` set
            and, for rows whose "sugRept" value is "yes", ``repetitive`` set
            to True. The reverse-complement entry is updated as well and is
            expected to be present whenever the forward one is.

    Returns:
        The same ``dict_edges`` object.
    """
    tiginfo_fpath = find_file_by_pattern(input_dirpath, ".unitigs.layout.tigInfo")
    if is_empty_file(tiginfo_fpath):
        return dict_edges

    with open(tiginfo_fpath) as tiginfo:
        header_line = next(tiginfo, None)
        if header_line is None:
            return dict_edges
        columns = header_line.strip().split()
        if "sugRept" not in columns or "coverage" not in columns:
            return dict_edges
        repeat_col = columns.index("sugRept")
        cov_col = columns.index("coverage")

        for row in tiginfo:
            fields = row.strip().split()
            edge_num = get_edge_num(fields[0])
            edge_id = get_edge_agv_id(edge_num)
            rc_edge_id = get_edge_agv_id(-edge_num)
            if edge_id not in dict_edges:
                # Unknown edges are silently ignored.
                continue
            coverage = int(float(fields[cov_col]))
            is_repeat = fields[repeat_col] == "yes"
            for target_id in (edge_id, rc_edge_id):
                dict_edges[target_id].cov = coverage
                if is_repeat:
                    dict_edges[target_id].repetitive = True
    return dict_edges
Пример #3
0
def parse_canu_contigs_info(input_dirpath):
    """Collect length, coverage and edge composition of Canu contigs.

    Two files are looked up in ``input_dirpath``: one matching "unitigs.bed"
    (which unitigs make up each contig) and one matching
    "contigs.layout.tigInfo" (a table with "tigLen" and "coverage" columns).

    Args:
        input_dirpath: Canu output directory; a falsy value yields an empty
            result.

    Returns:
        dict mapping contig id -> {'length', 'cov', 'mult', 'edges'}.
        Only contigs present in BOTH files are reported: a contig listed in
        the BED file but absent from the tigInfo table (or when the table is
        missing or lacks the required columns) is left out instead of raising
        KeyError.
    """
    contig_info = dict()
    edges_by_contig = defaultdict(list)

    unitigs_info_fpath = find_file_by_pattern(input_dirpath, "unitigs.bed")
    if input_dirpath and not is_empty_file(unitigs_info_fpath):
        with open(unitigs_info_fpath) as f:
            for line in f:
                fs = line.strip().split()
                if not fs:  # tolerate blank lines
                    continue
                contig, _, _, unitig = fs[:4]
                strand = fs[-1]
                # Edge numbers are signed: a negative number denotes the
                # reverse strand of the unitig.
                edge_num = get_edge_num(unitig)
                if strand != "+":
                    edge_num = -edge_num
                edges_by_contig[get_canu_id(contig)].append(str(edge_num))

    contigs_info_fpath = find_file_by_pattern(input_dirpath,
                                              "contigs.layout.tigInfo")
    if input_dirpath and not is_empty_file(contigs_info_fpath):
        len_col = None
        cov_col = None
        with open(contigs_info_fpath) as f:
            for i, line in enumerate(f):
                if i == 0:
                    header = line.strip().split()
                    len_col = header.index("tigLen") if "tigLen" in header else None
                    cov_col = header.index("coverage") if "coverage" in header else None
                    if len_col is None or cov_col is None:
                        break
                    continue
                fs = line.strip().split()
                if not fs:  # tolerate blank lines
                    continue
                length = int(float(fs[len_col]))
                coverage = int(float(fs[cov_col]))
                contig_id = get_canu_id(fs[0])
                if contig_id in edges_by_contig:
                    contig_info[contig_id] = {
                        'length': length,
                        'cov': coverage,
                        'mult': 1,
                    }

    # Attach edge lists only to contigs that actually received an info record;
    # indexing contig_info unconditionally raised KeyError for the others.
    for contig_id, edges in edges_by_contig.items():
        if contig_id in contig_info:
            contig_info[contig_id]['edges'] = edges
    return contig_info
Пример #4
0
def format_edges_file(input_fpath, output_dirpath):
    """Copy a FASTA file to ``<output_dirpath>/edges.fasta`` with rewritten headers.

    Every header line (starting with '>') is replaced by the id produced by
    ``get_edge_agv_id(get_edge_num(...))`` from the header text; all other
    lines are copied verbatim. The copy is skipped when ``can_reuse``
    accepts the existing output file.

    Args:
        input_fpath: source FASTA file.
        output_dirpath: directory for the resulting ``edges.fasta``.

    Returns:
        Path of the output file, or None if the input file is empty.
    """
    if is_empty_file(input_fpath):
        return None

    edges_fpath = join(output_dirpath, "edges.fasta")
    if can_reuse(edges_fpath, files_to_check=[input_fpath]):
        return edges_fpath

    with open(input_fpath) as fasta, open(edges_fpath, "w") as out_f:
        for record in fasta:
            if record.startswith('>'):
                # NOTE(review): the whole header (description and newline
                # included) is handed to get_edge_num - confirm it copes.
                record = ">%s\n" % get_edge_agv_id(get_edge_num(record[1:]))
            out_f.write(record)
    return edges_fpath
Пример #5
0
def parse_canu_assembly_info(input_dirpath, dict_edges):
    """Map Canu contigs to the known edges (unitigs) they are built from.

    Reads the file matching ".unitigs.bed" in ``input_dirpath``; the first
    four columns of each line are taken as contig, start, end and unitig.

    Args:
        input_dirpath: Canu output directory.
        dict_edges: mapping of known edge ids; unitigs whose id is not a key
            of it are ignored.

    Returns:
        defaultdict(list) mapping contig id -> list of
        ``(start, end, edge_id)`` tuples (start/end kept as strings). It is
        empty when the BED file is missing or empty - previously the function
        printed the warning and then still tried to open the file.
    """
    contig_edges = defaultdict(list)
    unitigs_fpath = find_file_by_pattern(input_dirpath, ".unitigs.bed")
    if is_empty_file(unitigs_fpath):
        print(
            "Warning! Unitigs.bed is not found, information about contigs will not be provided"
        )
        return contig_edges

    with open(unitigs_fpath) as f:
        for line in f:
            fs = line.strip().split()
            if not fs:  # tolerate blank lines
                continue
            contig, start, end, unitig = fs[:4]
            edge_id = get_edge_agv_id(get_edge_num(unitig))
            if edge_id in dict_edges:
                contig_id = get_canu_id(contig)
                contig_edges[contig_id].append((start, end, edge_id))
    return contig_edges
Пример #6
0
def _parse_gfa_segment(fields):
    """Return (name, length, coverage) for the split fields of a GFA 'S' record."""
    name = fields[1]
    if fields[2].isdigit():
        # GFA2 layout "S <sid> <slen> <sequence> ...": the third column is the
        # length itself (a GFA1 sequence can never be purely numeric).
        seq_len = int(fields[2])
    elif fields[2] == '*':
        seq_len = None
    else:
        seq_len = len(fields[2])

    # Optional tags look like "LN:i:1000"; key is lower-cased, value is the
    # last colon-separated part. Fields without ':' (e.g. a sequence) are not tags.
    tags = dict((tag.split(':')[0].lower(), tag.split(':')[-1])
                for tag in fields[3:] if ':' in tag)
    if "ln" in tags:
        seq_len = int(tags["ln"])  ## sequence length

    cov = 1
    if "dp" in tags:
        cov = float(tags["dp"])  ## coverage depth
    elif "kc" in tags and seq_len:
        # LN is applied first so that a '*' sequence does not break the division.
        cov = max(1, int(tags["kc"]) / seq_len)  ## k-mer count / edge length
    return name, seq_len, cov


def _gfa_overlap_len(cigar):
    """Sum the lengths of the M and I operations of a CIGAR string (0 if none)."""
    return sum(int(length)
               for length, op in re.findall(r'(\d+)(\D+)', cigar)
               if op in ('M', 'I'))


def parse_gfa(gfa_fpath, min_edge_len, input_dirpath=None, assembler=None):
    """Build the edge dictionary of an assembly graph from a GFA file.

    'S' records become a pair of Edge objects (forward and reverse
    complement); 'L' and 'E' records become links between edges and, when
    their CIGAR contains M/I operations, overlaps stored on the edges.

    Fixes compared to the previous version:
      * overlaps are attached after the whole file is read, so links listed
        after their segments are no longer lost;
      * overlaps of the reverse-complement edge go to that edge instead of
        the forward one;
      * the overlap CIGAR is read from its own column (6th for 'L', 9th for
        'E') rather than from the last field, which may be an optional tag;
      * GFA2 'S' records with a numeric length column are understood;
      * a "KC" tag on a segment without inline sequence no longer raises.

    Args:
        gfa_fpath: path to the GFA file.
        min_edge_len: segments shorter than this (or of unknown length) are
            skipped.
        input_dirpath: assembler output directory; used only for Canu to
            read extra unitig information.
        assembler: assembler name; for SPAdes/ABySS (as decided by
            ``is_spades`` / ``is_abyss``) the reverse-complement counterpart
            of every link is added as well.

    Returns:
        dict mapping edge id -> Edge, as returned by ``construct_graph``.
    """
    dict_edges = dict()
    predecessors = defaultdict(list)
    successors = defaultdict(list)
    links = []
    edge_overlaps = defaultdict(dict)

    print("Parsing " + gfa_fpath + "...")
    with open(gfa_fpath) as f:
        for line in f:
            record_type = line[0]
            if record_type not in ('S', 'L', 'E'):
                continue
            fields = line.split()

            if record_type == 'S':
                name, seq_len, cov = _parse_gfa_segment(fields)
                if seq_len and seq_len >= min_edge_len:
                    edge_num = get_edge_num(name)
                    for num in (edge_num, -edge_num):
                        edge_id = get_edge_agv_id(num)
                        dict_edges[edge_id] = Edge(edge_id, num, seq_len, cov,
                                                   element_id=edge_id)
                continue

            if record_type == 'L':
                # L  <from> <from_orient> <to> <to_orient> <overlap> [tags]
                _, from_name, from_orient, to_name, to_orient = fields[:5]
                cigar = fields[5] if len(fields) > 5 else '*'
            else:
                # E       *       2+      65397+  21      68$     0       47      47M
                from_name, to_name = fields[2], fields[3]
                from_orient, to_orient = from_name[-1], to_name[-1]
                from_name, to_name = from_name[:-1], to_name[:-1]
                cigar = fields[8] if len(fields) > 8 else fields[-1]

            edge1 = get_edge_agv_id(get_edge_num(from_name))
            edge2 = get_edge_agv_id(get_edge_num(to_name))
            if from_orient == '-':
                edge1 = get_match_edge_id(edge1)
            if to_orient == '-':
                edge2 = get_match_edge_id(edge2)
            links.append((edge1, edge2))

            overlap = _gfa_overlap_len(cigar)
            if overlap:
                edge_overlaps[edge1][edge2] = overlap
                edge_overlaps[edge2][edge1] = overlap

    # Attach overlaps only now: segments usually precede links in a GFA file,
    # so doing it while reading the segment would miss them.
    for edge_id, edge in dict_edges.items():
        for overlapped_edge, overlap in edge_overlaps.get(edge_id, {}).items():
            edge.overlaps.append(
                (edge_id_to_name(overlapped_edge), overlapped_edge, overlap))

    ### gfa retains only canonical links
    for edge1, edge2 in links:
        if edge1 != edge2:
            predecessors[edge2].append(edge1)
            successors[edge1].append(edge2)
        if is_spades(assembler) or is_abyss(assembler):
            # Add the mirrored link between the reverse-complement edges.
            rc_from, rc_to = get_match_edge_id(edge2), get_match_edge_id(edge1)
            if rc_from != rc_to:
                predecessors[rc_to].append(rc_from)
                successors[rc_from].append(rc_to)

    if assembler == "canu" and input_dirpath:
        dict_edges = parse_canu_unitigs_info(input_dirpath, dict_edges)
    dict_edges = construct_graph(dict_edges, predecessors, successors)
    print("Finish parsing.")
    return dict_edges
Пример #7
0
def run_quast_analysis(input_fpath,
                       reference_fpath,
                       output_dirpath,
                       json_output_dirpath,
                       threads,
                       contig_edges,
                       dict_edges=None,
                       is_meta=False):
    """Run QUAST against a reference and record misassemblies and mappings.

    With a non-empty ``dict_edges`` the input is treated as edge sequences:
    misassemblies are appended to ``dict_edges[...].errors`` and the edges
    are mapped to the reference. Otherwise the input is treated as contigs
    and the misassemblies are written to ``errors.json``.

    Args:
        input_fpath: FASTA file to evaluate.
        reference_fpath: reference genome FASTA file.
        output_dirpath: directory in which the QUAST output folder is created.
        json_output_dirpath: directory for ``reference.json`` / ``errors.json``.
        threads: passed through to ``run`` and ``map_edges_to_ref``.
        contig_edges: not used by this function.
        dict_edges: optional mapping edge id -> edge object.
        is_meta: passed through to ``run``.

    Returns:
        ``(mapping_info, chrom_names, edge_by_chrom, dict_edges)``; the first
        three items are None when QUAST produced no report or when the input
        is handled as contigs.
    """
    if dict_edges:
        quast_dirname = "quast_edge_output"
    else:
        quast_dirname = "quast_output"
    quast_output_dir = join(output_dirpath, quast_dirname)

    ms_out_fpath = None
    if not (is_empty_file(input_fpath) or is_empty_file(reference_fpath)):
        report_fpath = get_mis_report_fpath(quast_output_dir, input_fpath)
        ms_out_fpath = run(input_fpath, reference_fpath, report_fpath,
                           quast_output_dir, threads, is_meta)

    if not ms_out_fpath:
        # Inputs were fine, so the missing report means QUAST itself failed.
        if not (is_empty_file(input_fpath) or is_empty_file(reference_fpath)):
            print(
                "QUAST failed! Make sure you are using the latest version of QUAST"
            )
        seq_kind = "edge" if dict_edges else "contig"
        print("No information about %s mappings to the reference genome" %
              seq_kind)
        # Emit empty placeholders for every variable expected in the JSON files.
        empty_list = json.dumps([])
        with open(join(json_output_dirpath, "reference.json"), 'w') as handle:
            for prefix in ("chrom_lengths=", "edgeMappingInfo=",
                           "chromGaps=", "chromAligns="):
                handle.write(prefix + empty_list + ";\n")
        with open(join(json_output_dirpath, 'errors.json'), 'w') as handle:
            handle.write("misassembledContigs=[];\n")
        return None, None, None, dict_edges

    # Walk the report: any line that is not a misassembly record is taken as
    # the name of the sequence the following records belong to.
    misassembled_seqs = defaultdict(list)
    with open(ms_out_fpath) as report:
        seq_id = ''
        for line in report:
            if not line.startswith("Extensive misassembly"):
                seq_id = line.strip()
                continue
            match = re.search(align_pattern, line)
            if not match or len(match.groups()) < 4:
                continue
            coords = match.group('start1', 'end1', 'start2', 'end2')
            if dict_edges:
                edge_id = get_edge_agv_id(get_edge_num(seq_id))
                dict_edges[edge_id].errors.append(coords)
            else:
                misassembled_seqs[seq_id].append(coords)

    if dict_edges:
        parse_alignments(get_alignments_fpath(quast_output_dir, input_fpath),
                         json_output_dirpath)
        mapping_fpath = map_edges_to_ref(input_fpath, output_dirpath,
                                         reference_fpath, threads)
        mapping_info, chrom_names, edge_by_chrom = parse_mapping_info(
            mapping_fpath, json_output_dirpath, dict_edges)
        return mapping_info, chrom_names, edge_by_chrom, dict_edges

    with open(join(json_output_dirpath, 'errors.json'), 'w') as handle:
        handle.write("misassembledContigs='" +
                     json.dumps(misassembled_seqs) + "';\n")
    return None, None, None, dict_edges
0
def parse_mapping_info(mapping_fpath, json_output_dir, dict_edges):
    """Assign edges to reference chromosomes and color them accordingly.

    Each line of ``mapping_fpath`` is split on whitespace and read as:
    column 0 sequence name, 1 sequence length, 2-3 start/end on the sequence,
    5 chromosome name, 6 chromosome length, 7-8 start/end on the chromosome
    (see the sample record inside the function).

    An edge is assigned to a chromosome when its mappings to that chromosome
    cover at least 90% of the edge. For such edges the function stores a
    textual summary of up to three longest alignment blocks in
    ``dict_edges[...].aligns[chrom]`` and a color string in
    ``dict_edges[...].chrom``. Edges missing from ``dict_edges`` are skipped
    (previously the first of these updates raised KeyError for them).

    Args:
        mapping_fpath: path to the mapping file.
        json_output_dir: directory containing ``reference.json``; the results
            are APPENDED to that file.
        dict_edges: mapping edge id -> edge object, updated in place.

    Returns:
        ``(mapping_info, non_alt_chroms, edge_by_chrom)`` where
        ``mapping_info`` maps edge id -> list of chromosomes,
        ``non_alt_chroms`` lists assigned chromosomes whose names contain
        none of 'alt', 'random', 'chrUn', and ``edge_by_chrom`` maps
        chromosome -> set of edge ids.
    """
    mapping_info = defaultdict(set)

    edge_mappings = defaultdict(lambda: defaultdict(list))
    edge_lengths = dict()
    chrom_lengths = dict()
    with open(mapping_fpath) as f:
        for line in f:
            # contig_1        257261  14      160143  -       chr13   924431  196490  356991  147365  161095  60      tp:A:P  cm:i:14049      s1:i:147260     s2:i:4375       dv:f:0.0066
            fs = line.split()
            if not fs:  # tolerate blank lines
                continue
            edge_id = get_edge_agv_id(get_edge_num(fs[0]))
            start, end = int(fs[2]), int(fs[3])
            edge_lengths[edge_id] = int(fs[1])
            chrom, chrom_len = fs[5], int(fs[6])
            ref_start, ref_end = int(fs[7]), int(fs[8])
            chrom_lengths[chrom] = chrom_len
            edge_mappings[edge_id][chrom].append(
                (start, end, ref_start, ref_end))

    chroms_by_edge = defaultdict(set)
    edge_by_chrom = defaultdict(set)
    chrom_names = set()
    # edge id -> {chrom: reference start of the longest alignment block}
    best_aligns = defaultdict(dict)
    for edge_id in edge_mappings:
        # assign an edge to a chromosome if more than 90% of edge aligned to the chromosome
        len_threshold = 0.9 * edge_lengths[edge_id]
        gap_threshold = min(5000, 0.05 * edge_lengths[edge_id])
        for chrom, mappings in edge_mappings[edge_id].items():
            mappings.sort(key=lambda x: (x[0], -x[1]), reverse=False)
            aligns = []
            covered_len = 0
            last_pos = 0
            last_ref_pos = 0
            align_s, align_e = 0, 0
            # calculate covered length (do not count overlaps)
            for (start, end, ref_start, ref_end) in mappings:
                start = max(start, last_pos)
                covered_len += max(0, end - start + 1)
                last_pos = max(last_pos, end + 1)

            if covered_len < len_threshold:
                continue

            chroms_by_edge[edge_id].add(chrom)
            chrom_names.add(chrom)
            edge_by_chrom[chrom].add(edge_id)
            # Merge mappings into blocks along the reference.
            mappings.sort(key=lambda x: (x[2], -x[3]), reverse=False)
            for (start, end, ref_start, ref_end) in mappings:
                ref_start = max(ref_start, last_ref_pos)
                last_ref_pos = max(last_ref_pos, ref_end + 1)
                if not align_s:
                    align_s = ref_start
                if align_e and ref_start - align_e >= gap_threshold:
                    # A gap of at least gap_threshold closes the current block;
                    # the block is kept only if it spans at least 500 bp.
                    if align_e - align_s >= 500:
                        aligns.append((chrom, align_s, align_e))
                    align_s = ref_start
                align_e = ref_end - 1
            if align_e and align_e - align_s >= 500:
                aligns.append((chrom, align_s, align_e))
            aligns.sort(reverse=True, key=lambda x: x[2] - x[1])
            edge_alignment = chrom + ":"
            if aligns:
                best_aligns[edge_id][chrom] = aligns[0][1]
            for align in aligns[:3]:  # store top 3 alignments for each edge
                edge_alignment += " %s-%s," % (format_pos(align[1]),
                                               format_pos(align[2]))
            # [:-1] drops the trailing ',' (or the ':' when there are no blocks).
            alignment_str = edge_alignment[:-1]
            for target_id in (edge_id, get_match_edge_id(edge_id)):
                if target_id in dict_edges:
                    dict_edges[target_id].aligns[chrom] = alignment_str

    chrom_len_dict = OrderedDict(
        (chrom, chrom_lengths[chrom]) for chrom in natural_sort(chrom_names))
    non_alt_chroms = [
        c for c in chrom_names
        if 'alt' not in c and 'random' not in c and 'chrUn' not in c
    ]
    chrom_order = OrderedDict(
        (chrom, i)
        for i, chrom in enumerate(list(natural_sort(non_alt_chroms))))
    color_list = [
        '#e6194b', '#3cb44b', '#ffe119', '#1792d4', '#f58231', '#911eb4',
        '#46f0f0', '#f032e6', '#d2f53c', '#fabebe', '#00dbb1', '#dba2ff',
        '#aa6e28', '#83360e', '#800000', '#003bff', '#808000', '#8d73d4',
        '#000080', '#806680', '#51205a', '#558859', '#d1a187', '#87a1d1',
        '#87a1d1', '#afd187'
    ]

    # Propagate the chromosome assignment to the complementary edge as well.
    edge_chroms = defaultdict(set)
    for edge_id, chroms in chroms_by_edge.items():
        match_edge_id = edge_id.replace(
            'rc', 'e') if edge_id.startswith('rc') else edge_id.replace(
                'e', 'rc')
        for chrom in chroms:
            edge_chroms[edge_id].add(chrom)
            edge_chroms[match_edge_id].add(chrom)
            if match_edge_id in dict_edges:
                edge_by_chrom[chrom].add(match_edge_id)

    is_single_chrom = len(chrom_order.keys()) == 1
    for edge_id, chroms in edge_chroms.items():
        if edge_id not in dict_edges:
            continue
        mapping_info[edge_id] = list(chroms)
        colors = set()
        for chrom in chroms:
            if chrom in chrom_order:
                if is_single_chrom:  # color an edge according to its position in reference
                    pos = best_aligns[edge_id].get(chrom)
                    if pos is None:
                        pos = best_aligns[get_match_edge_id(edge_id)].get(chrom)
                    if pos is None:
                        # No alignment block of >= 500 bp was recorded for the
                        # edge or its complement: fall back to gray instead of
                        # raising KeyError.
                        color = '#808080'
                    else:
                        color = get_rainbow_color(pos, chrom_len_dict[chrom])
                else:
                    color = color_list[chrom_order[chrom] % len(color_list)]
            else:
                color = '#808080'
            colors.add(color)
        if len(colors) <= 5:
            dict_edges[edge_id].chrom = ':'.join(list(colors))
        else:
            dict_edges[edge_id].chrom = 'white:red:black:red:black:white'
    with open(join(json_output_dir, "reference.json"), 'a') as handle:
        handle.write("chrom_lengths=" + json.dumps(chrom_len_dict) + ";\n")
        handle.write("edgeMappingInfo=" + json.dumps(mapping_info) + ";\n")
    return mapping_info, non_alt_chroms, edge_by_chrom