Example no. 1
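Reads a graph in DOT format and prints its vertex ids (quoted and pipe-separated) followed by its edge ids.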
def extractSubgraph(gf):
#    graph = SimpleGraph().ReadDot(os.path.join(flye_dir, "20-repeat", "graph_after_rr.gv"))
    graph1 = SimpleGraph().ReadDot(gf)
    vertex_ids = graph1.v.keys()
    # print "{|}|" + "|".join(["id " + r + "\\\\" for r in edge_ids])
    print "{|}|" + "|".join(["\"" + str(r) + "\"" for r in vertex_ids])
    print " ".join(graph1.e.keys())
Example no. 2
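Maps the given contigs to assembly-graph edges via parseUPaths, then prints the vertices of those edges, stopping once roughly 10 kbp of edge length has been collected.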
def extractSubgraph(dir, flye_dir, contigs):
    basic.ensure_dir_existance(dir)
    d = parseUPaths(flye_dir)
    edge_ids = []
    for contig in contigs:
        for s in d[contig]:
            edge_ids.append(s)
    graph = SimpleGraph().ReadDot(
        os.path.join(flye_dir, "20-repeat", "graph_after_rr.gv"))
    vertex_ids = set()
    total_len = 0  # accumulated edge length; "len" would shadow the builtin
    for eid in edge_ids:
        total_len += graph.e[eid].len
        vertex_ids.add(graph.e[eid].start)
        vertex_ids.add(graph.e[eid].end)
        if total_len > 10000:
            break
    # print "{|}|" + "|".join(["id " + r + "\\\\" for r in edge_ids])
    print "{|}|" + "|".join(["\"" + str(r) + "\"" for r in vertex_ids])
Example no. 3
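A standalone script that reads a DOT graph, optionally merges it, splits it into connected components, and draws each component (batching small ones together) into its own numbered .dot file. A usage sketch follows the code.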
import sys

import os

sys.path.append("py")
from common.SimpleGraph import SimpleGraph
from common import basic

g = SimpleGraph()
g.ReadDot(sys.argv[1])
basic.ensure_dir_existance(sys.argv[2])
args = sys.argv[3:]
if "merge" in args:
    g = g.Merge()
cnt = 0
oppa = []
for comp in g.Split(1000000000):
    if len(comp) < 3:
        if len(g.v[comp[0]].inc) + len(g.v[comp[0]].out) + len(
                g.v[comp[-1]].inc) + len(g.v[comp[-1]].out) <= 2:
            pass
        else:
            oppa.extend(comp)
        if len(oppa) > 30:
            comp = list(oppa)
            oppa = []
        else:
            continue
    print cnt, len(comp)
    f = open(os.path.join(sys.argv[2], str(cnt) + ".dot"), "w")
    g.Draw(comp, f)
    f.close()
    cnt += 1  # advance the counter so each component gets its own file
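
A typical invocation (assuming the script is saved as split_components.py, a hypothetical name):

    python2 split_components.py graph.dot out_dir merge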
Example no. 4
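Extracts the neighbourhood of the given edges from a GFA assembly graph: the flanks of adjacent edges go to contigs.fasta, and the reads that Flye aligned to those edges go to reads.fasta.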
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    # For each target edge, collect the sequences flanking it: short neighbouring
    # edges are taken whole, long ones are trimmed to a 5000 bp flank.
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(
                        basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:],
                                                       e.id + "r")
        # Symmetric handling for edges leaving the end vertex.
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(
                        basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000],
                                                       e.id + "l")

    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    # Map the current edge ids back to the pre-repeat-resolution ids used in the dump.
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"),
                  "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
Example no. 5
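Builds the initial collection of unique contigs: estimates average coverage from the longest edges, removes edges whose coverage or alignment to alternative edges suggests they are not unique, and polishes the survivors unless a manual unique set was supplied.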
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        # Half of the total edge length (the graph stores both strands).
        # The pairs in covs are (len, cov), so the length is the first element.
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        # Length-weighted average coverage over the longest edges spanning half the total length.
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):#type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)

    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
Example no. 6
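A standalone script that prints every canonical edge of a GFA graph to stdout in FASTA format. A usage sketch follows the code.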
import sys

sys.path.append("py")
from common import basic, SeqIO
from common.SimpleGraph import SimpleGraph

graph = SimpleGraph().ReadGFA(sys.argv[1])
for e_id in graph.e:
    if basic.isCanonocal(e_id):
        SeqIO.write(graph.e[e_id], sys.stdout, "fasta")
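
A typical invocation (assuming the script is saved as gfa_edges_to_fasta.py, a hypothetical name):

    python2 gfa_edges_to_fasta.py assembly_graph.gfa > edges.fasta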
Example no. 7
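A standalone script that converts a GFA graph into a .dot and a .fasta file, optionally merging it first. A usage sketch follows the code.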
import os
import sys
sys.path.append("py")

from common import basic, SeqIO
from common.SimpleGraph import SimpleGraph

if __name__ == "__main__":
    g = SimpleGraph()
    g.ReadGFA(sys.argv[1])
    fasta = open(sys.argv[2] + ".fasta", "w")
    dot = open(sys.argv[2] + ".dot", "w")
    if "merge" in sys.argv:
        g = g.Merge()
    g.Print(dot)
    g.PrintFasta(fasta)
    fasta.close()
    dot.close()
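
A typical invocation (assuming the script is saved as gfa_convert.py, a hypothetical name; this writes out.dot and out.fasta):

    python2 gfa_convert.py assembly_graph.gfa out merge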
Example no. 8
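Splits a graph into components using a coverage-based uniqueness condition. Note that the snippet begins mid-file: its first two lines are the tail of the helper (evidently SplitGraphByCondition) that precedes SplitGraph.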
        if len(comp) > 1:
            yield graph.Component(comp)

def SplitGraph(graph, calculator):
    max_cov = graph.covPerc(0.5)
    for comp in SplitGraphByCondition(graph, lambda edge: edge.len >= calculator.edge_length and edge.cov < max_cov * 1.8):
        cov = calculator.calculateComponentCoverage(comp, max_cov)
        if cov == 0:
            print "Zero component"
        for comp1 in SplitGraphByCondition(comp, calculator.uniqueCondition(cov)):
            print len(comp1.v), len(comp1.e)
            yield comp1, cov


if __name__ == "__main__":
    g = SimpleGraph()
    g.ReadDot(sys.argv[1])
    basic.ensure_dir_existance(sys.argv[2])
    args = sys.argv[3:]
    if "merge" in args:
        g = g.Merge()
    diploid = "--diploid" in args
    cnt = 0
    oppa = []
    simple = 0
    complex_cnt = 0  # "complex" would shadow the builtin
    max_cov = g.covPerc(0.5)
    if diploid:
        calculator = DipolidCalculator(150000)
    else:
        calculator = HaploidCalculator(150000)
Example no. 9
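Resolves repeat edges: gathers the reads aligned to the given edges, splits each repeat into the requested number of copies with splitRepeat, and writes the resulting contigs, their read lists, and the unique neighbour edges to disk.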
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print " ".join(sys.argv)
    print "Reading graph"
    graph = SimpleGraph().ReadDot(
        os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta"),
                  True)
    print "Extracting relevant graph component"
    edge_ids = edge_id.split(",")
    to_resolve = to_resolve.split(",")
    to_resolve = [(a, int(b))
                  for a, b in zip(to_resolve[0::2], to_resolve[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)

    if rf == "none":
        return
    print "Finding reads that align to", edge_ids
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []
    relevant_read_ids = set()
    for rid, eid in parseReadDump(
            os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if eid in edge_ids:
            relevant_read_ids.add(rid)
            print rid, eid
    for rid, eid in parseReadDump(
            os.path.join(flye_dir, "20-repeat", "read_alignment_dump")):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))
    print "Reading reads"
    res_reads = ContigStorage()
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_by_name(rf):
        if read.id in relevant_read_ids:
            res_reads.add(Contig(read.seq, read.id))
            SeqIO.write(read, res, "fasta")
    res.close()
    random_down = open(os.path.join(dir, "random_down.fasta"), "w")
    cnt = 0
    for read in res_reads:
        if cnt % 5 == 0:
            SeqIO.write(read, random_down, "fasta")
        cnt += 1
    random_down.close()
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    lcf = open(os.path.join(dir, "contigs.lc"), "w")
    for eid, mult in to_resolve:
        repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
        print reads_to_resolve[eid]
        print map(str, repeat_reads)
        split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult,
                                    repeat_reads, min_contig_length)
        if split_contigs is None:
            print "Failed to resolve edge", eid, "Aborting"
            return  # abort here; iterating over None below would raise a TypeError
        print "Edge", eid, "was split into", mult, "copies"
        for contig, contig_reads in split_contigs:
            print contig.id
            SeqIO.write(contig, res, "fasta")
            lcf.write(contig.id + "\n")
            lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
    res = open(os.path.join(dir, "contigs.fasta"), "a")  # append: mode "w" would truncate the split contigs written above
    for unique_edge, initial in unique:
        print unique_edge.id
        SeqIO.write(unique_edge, res, "fasta")
        lcf.write(unique_edge.id + "\n")
        lcf.write(" ".join(reads_to_resolve[initial]) + "\n")
    res.close()
    lcf.close()
Example no. 10
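The top-level driver for repeat-graph analysis: splits the assembly graph into component records, assigns reads to components from Flye's alignment dump, incorporates Flye's repeat-resolution results, and writes per-component subdatasets, pictures, and a summary table.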
def main(flye_dir, output_dir, diploid):
    basic.ensure_dir_existance(output_dir)
    CreateLog(output_dir)
    print "Version:", subprocess.check_output(["git", "rev-parse", "HEAD"])
    print "Modifications:"
    print subprocess.check_output(["git", "diff"])
    graph_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv")
    edge_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta")
    dump_file = os.path.join(flye_dir, "20-repeat", "read_alignment_dump")
    if diploid:
        calculator = DipolidCalculator(150000)
    else:
        calculator = HaploidCalculator(150000)
    print "Reading graph from", graph_file
    graph = SimpleGraph()
    graph.ReadDot(graph_file)
    print "Reading sequences from", edge_file
    graph.FillSeq(edge_file, True)
    print "Splitting graph", edge_file
    componentRecords, edgecomp = constructComponentRecords(graph, calculator)
    print "Reading alignment dump from", dump_file
    rcnt = 0
    for rid, eids in AlignmentDumpParser(dump_file).parse():
        compids = set()
        eids = map(basic.Normalize, eids)
        for eid in eids:
            for compid in edgecomp[eid]:
                compids.add(compid)
        for compid in compids:
            comp_eids = [
                eid for eid in eids
                if eid in componentRecords[compid].component.e
            ]
            if len(comp_eids) == 0:
                print "GOPA", compid, compids, rid, eids
            componentRecords[compid].addRead(rid, eids)
        rcnt += 1
        if rcnt % 100000 == 0:
            print "Processed", rcnt, "reads"
    print "Filling flye repeat resolution results"
    flye_next = FillFlyeNext(componentRecords,
                             os.path.join(flye_dir, "flye.log"))
    for compRec in componentRecords:
        half = compRec.half()
        for norm_eid in compRec.unique:
            for eid in [norm_eid, basic.Reverse(norm_eid)]:
                if eid not in compRec.component.e:
                    assert not basic.isCanonocal(eid)
                    assert basic.Reverse(eid) in compRec.component.e
                    continue
                if compRec.component.e[eid].end in half:
                    if compRec.component.isBorder(
                            compRec.component.e[eid].end):
                        compRec.out += 1
                    if compRec.component.isBorder(
                            compRec.component.e[eid].start):
                        compRec.inc += 1
                if not compRec.component.isBorder(
                        compRec.component.e[eid].end):
                    if flye_next[eid] is None:
                        compRec.unresolved_connections += 1
                    else:
                        compRec.resolved_connections.append(
                            (eid, flye_next[eid]))
                        if flye_next[eid] not in compRec.component.e:
                            compRec.outside_connections += 1

    basic.ensure_dir_existance(output_dir)
    print "Printing components to disk"
    subdataset_dir = os.path.join(output_dir, "subdatasets")
    basic.ensure_dir_existance(subdataset_dir)
    order = range(len(componentRecords))
    order = sorted(order, key=lambda i: componentRecords[i].score())
    componentRecords = [componentRecords[i] for i in order]
    basic.ensure_dir_existance(os.path.join(output_dir, "pics"))
    for i, component in enumerate(componentRecords):
        comp_dir = os.path.join(subdataset_dir, str(i))
        component.dump(comp_dir)
        fig_name = os.path.join(comp_dir, "graph.dot")
        component.draw(fig_name, calculator)
        if len(component.component) <= 100:
            fig_file = os.path.join(output_dir, "pics", str(i) + ".dot")
            component.draw(fig_file, calculator)

    table_file = os.path.join(output_dir, "table.txt")
    print "Printing table to file", table_file
    f = open(table_file, "w")
    f.write(
        "Id v e unique inc out repeats unresolved resolved outside zero hub badborder overcovered score\n"
    )
    for i, compRec in enumerate(componentRecords):
        comp = compRec.component
        f.write(" ".join([
            str(i),
            str(len(comp.v)),
            str(len(comp.e)),
            str(len(compRec.unique) * 2),
            str(compRec.inc),
            str(compRec.out),
            str(compRec.repeat_edges),
            str(compRec.unresolved_connections),
            str(len(compRec.resolved_connections)),
            str(compRec.outside_connections),
            str(compRec.zero),
            str(compRec.red),
            str(compRec.bad_border),
            str(compRec.overcovered_edges),
            str(compRec.score())
        ]) + "\n")
    f.close()
    table_file = os.path.join(output_dir, "list.txt")
    f = open(table_file, "w")
    for a in range(len(componentRecords)):
        f.write(str(a) + "\n")
    f.close()