Example #1: split long contigs into tied flank lines
 def splitFromContigs(self, contigs, max_contig=50000, cut_size=20000):
     # type: (ContigStorage, int, int) -> None
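     # Contigs longer than max_contig are represented by two new lines built
     # from their cut_size-long flanks, tied together with the skipped middle
     # sequence; shorter contigs become a single line.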
     for contig in contigs.unique():
         if not basic.isCanonocal(contig.id):
             contig = contig.rc
         if len(contig) > max_contig:
             line1 = self.addNew(contig.seq[:cut_size],
                                 "L" + contig.id + "l")
             line2 = self.addNew(contig.seq[-cut_size:],
                                 "L" + contig.id + "r")
             line1.initial.add(
                 AlignmentPiece.Identical(
                     contig.asSegment().prefix(length=cut_size),
                     line1.asSegment()))
             line2.initial.add(
                 AlignmentPiece.Identical(
                     contig.asSegment().suffix(length=cut_size),
                     line2.asSegment()))
             line1.tie(line2,
                       len(contig) - 2 * cut_size,
                       contig.seq[cut_size:-cut_size])
         else:
             line = self.addNew(contig.seq, "L" + contig.id)
             line.initial.add(
                 AlignmentPiece.Identical(contig.asSegment(),
                                          line.asSegment()))
Example #2: coverage histogram analysis of aligned segments
 def analyseSegments(self, segs):
     # type: (List[Segment]) -> None
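     # Align reads to the long segments, then build one histogram per window
     # size k = 500 + i * 100: covs[i][c] accumulates the total length of
     # windows whose read coverage is c (capped at the last bin).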
     contigs = ContigStorage()
     contigs.addAll([seg.asContig() for seg in segs if len(seg) > 5000])
     res = [] # type: List[Segment]
     for al in self.aligner.overlapAlign(self.reads, contigs):
         if basic.isCanonocal(al.seg_to.contig.id):
             res.append(al.seg_to)
         else:
             res.append(al.seg_to.RC())
     res = sorted(res, key=lambda seg: (seg.contig.id, seg.left))
     covs = [[0] * params.maxCoverageThreshold for i in range(100)]
     for contig, it in itertools.groupby(res, key=lambda seg: seg.contig):
         segs = list(it)
         shrink = contig.asSegment().shrink(1000)
         bad_seg = False
         for cov, slen in self.covSegments(shrink, segs, 1):
             if cov < 3:
                 bad_seg = True
         if bad_seg:
             continue
         for i in range(len(covs)):
             k = 500 + i * 100
             for cov, slen in self.covSegments(shrink, segs, k):
                 covs[i][min(cov, len(covs[i]) - 1)] += slen
     # keep only window sizes that accumulated enough aligned length; the
     # original tested covs[i] > 1000, which compares a list to an int and
     # is always True in Python 2, so sum() is presumably intended
     self.recs = [CoverageAnalyser.CoverageRecord(500 + i * 100, covs[i])
                  for i in range(len(covs)) if sum(covs[i]) > 1000]
Example #3: iterate over canonical items only
 def unique(self):
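     # With add_rc the storage holds both strands, so yielding canonical ids
     # gives one item per sequence; otherwise fall back to UniqueList.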
     if self.add_rc:
         for item in self.items.values():
             if basic.isCanonocal(item.id):
                 yield item
     else:
         for item in UniqueList(self).__iter__():
             yield item
Example #4: look up the reverse complement of an edge
 def rcEdge(self, edge):
     # type: (Edge) -> Edge
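     # A canonical edge whose reverse id is absent is its own reverse
     # complement; sanity-check the vertex degrees before returning it.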
     if basic.isCanonocal(edge.id) and basic.Reverse(edge.id) not in self.e:
         start = self.v[edge.start]
         end = self.v[edge.end]
         assert len(start.inc) == len(end.out) and len(start.out) == len(
             end.inc)
         return edge
     return self.e[basic.Reverse(edge.id)]
Example #5: add a new line under a canonical name
 def addNew(self, seq, name=None):
     # type: (str, Optional[str]) -> NewLine
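     # Auto-generated names use the running counter; explicit names are
     # stored in canonical form.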
     if name is None:
         name = "L" + str(self.cnt)
         self.cnt += 1
     else:
         if not basic.isCanonocal(name):
             # canonicalize the name; the original applied basic.Reverse
             # twice, which undoes itself
             name = basic.Reverse(name)
     new_line = NewLine(seq, str(name),
                        ExtensionHandler(self.disjointigs, self.aligner))
     self.add(new_line)
     new_line.name_printer = self.name_printer
     new_line.rc.name_printer = self.name_printer
     return new_line
Example #6: serialize line alignments
 def save(self, handler):
     # type: (TokenWriter) -> None
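     # Layout: canonical line ids, then each pairwise alignment storage
     # written once per pair, a "0" "0" sentinel, then the rc and auto
     # alignment storages for every canonical line.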
     keys = [
         key for key in self.lines.items.keys() if basic.isCanonocal(key)
     ]
     handler.writeTokens(keys)
     for l1, d1 in self.alignmentsToFrom.items():
         if not basic.isCanonocal(l1):
             continue
         for l2, als in d1.items():
             if l1 < basic.Normalize(l2):
                 handler.writeToken(l1)
                 handler.writeToken(l2)
                 handler.newLine()
                 als.save(handler)
     handler.writeToken("0")
     handler.writeToken("0")
     handler.newLine()
     for lid in keys:
         storage = self.rc_alignments[lid]
         storage.save(handler)
     for lid in keys:
         storage = self.auto_alignments[lid]
         storage.save(handler)
Example #7: build the dot-plot alignment storage
 def construct(self, aligner):
     # type: (Aligner) -> None
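     # Every alignment appears twice in a symmetric dot-plot; the checks
     # below keep exactly one representative of each pair.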
     for al in aligner.dotplotAlign(self.lines.unique(), self.lines):
         if len(al) > params.k and al.percentIdentity() > 0.8:
             if al.seg_from.contig.id == al.seg_to.contig.id:
                 ok = al.seg_from <= al.seg_to
             elif al.seg_from.contig == al.seg_to.contig.rc:
                 if basic.isCanonocal(al.seg_from.contig.id):
                     ok = al.seg_from < al.seg_to.RC()
                 else:
                     ok = al.seg_from.RC() < al.seg_to
             else:
                 ok = basic.canonical(
                     al.seg_from.contig.id) < basic.canonical(
                         al.seg_to.contig.id)
             if ok:
                 self.addAlignment(al)
Example #8: evaluate percent identity against a reference
def evaluatePI(dir, contigs_file, initial_file, ref_file):
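    # Map the initial contigs onto the final contigs to find the segments
    # they cover, then score both sets against the reference.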
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False)
    ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False)
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print "Interesting segments:"
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print list(csegs)
    print "Analysis of contigs"
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print al
        for seg in interesting[al.seg_from.contig.id]:
            if al.seg_from.expand(500).contains(
                    seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                # assumption: polyshMatching returns the event list printed
                # below; the original discarded the return value and used an
                # undefined name `events`
                events = scorer.polyshMatching(tmp_al.matchingSequence(),
                                               params.score_counting_radius)
                print tmp_al.seg_from, tmp_al.seg_to, str(events)
    print ""
    print "Analysis of initial"
    for al in aligner.overlapAlign(initial, ref):
        # same assumption as above: capture the events returned by
        # polyshMatching
        events = scorer.polyshMatching(al.matchingSequence(),
                                       params.score_counting_radius)
        print al.seg_from, al.seg_to, str(events)
Example #9: collect reads with multiple long alignments
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
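    # After sorting by length, forward and reverse-complement copies are
    # assumed to be adjacent, so each pair is collapsed to one contig (see
    # the commented-out c1.seq == c2.rc.seq check below).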
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
Example #10: realign a line against all lines
 def realignLine(self, line):
     # type: (NewLine) -> None
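     # Drop every stored alignment involving this line, then realign it
     # against all lines, deduplicating symmetric hits as in construct().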
     for storage in self.alignmentsToFrom[line.id].values():
         line_from = storage.line_from  # type: NewLine
         self.alignmentsToFrom[line_from.rc.id][line.rc.id].content.clean()
         self.alignmentsToFrom[line.rc.id][line_from.rc.id].content.clean()
     self.rc_alignments[line.id].content.clean()
     self.rc_alignments[line.rc.id].content.clean()
     self.auto_alignments[line.id].content.clean()
     self.auto_alignments[line.rc.id].content.clean()
     for al in self.aligner.dotplotAlign([line], self.lines):
         if len(al) > params.k and al.percentIdentity() > 0.8:
             if al.seg_from.contig.id == al.seg_to.contig.id:
                 ok = al.seg_from <= al.seg_to
             elif al.seg_from.contig == al.seg_to.contig.rc:
                 if basic.isCanonocal(al.seg_from.contig.id):
                     ok = al.seg_from < al.seg_to.RC()
                 else:
                     ok = al.seg_from.RC() < al.seg_to
             else:
                 ok = True
             if ok:
                 self.addAlignment(al)
Example #11: polish a consensus with reads
    dir = sys.argv[1]
    extra_params = sys.argv[4:]
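    # reads_file and consensus_file are used below but never defined in this
    # excerpt; presumably they are the two skipped command-line arguments
    reads_file = sys.argv[2]
    consensus_file = sys.argv[3]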
    CreateLog(dir)
    dd = DirDistributor(dir)
    aligner = Aligner(dd)
    polisher = Polisher(aligner, dd)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"),
                                          num_names=False)
    ref = ContigStorage().loadFromFasta(open(consensus_file, "r"),
                                        num_names=False)
    if "accurate" in extra_params:
        res = []
        als = sorted(aligner.overlapAlign(reads, ref),
                     key=lambda al: al.seg_to.contig.id)
        for rid, rals in itertools.groupby(als,
                                           key=lambda al: al.seg_to.contig.id):
            if basic.isCanonocal(rid):
                contig = ref[rid]
                corrected_seq = polisher.polishSegment(
                    contig.asSegment(), list(rals)).seg_from.Seq()
                res.append(Contig(corrected_seq, rid))
    else:
        res = polisher.polishMany(reads, list(ref.unique()))
    res_file = os.path.join(dir, "res.fasta")
    rf = open(res_file, "w")
    for c in res:
        SeqIO.write(c, rf, "fasta")
    rf.close()
    aligner.align_files(res_file, [reads_file], 16, "pacbio", "overlap",
                        os.path.join(dir, "res.sam"))
Example #12: select unique contigs from the assembly graph
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
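        # Estimate the expected unique-edge coverage as the length-weighted
        # mean coverage of the longest edges spanning half the total length.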
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        # covs holds (length, coverage) pairs; the original unpacked them as
        # (c, l) here, summing coverage instead of length
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):#type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)

    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
Example #13: dump canonical graph edges to FASTA
import sys

sys.path.append("py")
from common import basic, SeqIO
from common.SimpleGraph import SimpleGraph

graph = SimpleGraph().ReadGFA(sys.argv[1])
for e_id in graph.e:
    if basic.isCanonocal(e_id):
        SeqIO.write(graph.e[e_id], sys.stdout, "fasta")
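
The examples above and below all hinge on the id convention behind
basic.isCanonocal, basic.Reverse, and basic.Normalize. Below is a minimal
sketch of what that convention appears to be, assuming reverse-complement
ids carry a "-" prefix; the real common.basic module may implement it
differently:

def isCanonocal(eid):
    # an id is canonical when it does not name a reverse complement
    return not eid.startswith("-")

def Reverse(eid):
    # toggle the reverse-complement marker
    return eid[1:] if eid.startswith("-") else "-" + eid

def Normalize(eid):
    # strip the marker so both strands map to the canonical id
    return eid[1:] if eid.startswith("-") else eid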
Example #14: analyze repeat-graph components
def main(flye_dir, output_dir, diploid):
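    # Split the repeat graph into components, assign reads to components via
    # the alignment dump, pull in Flye's repeat resolution results, and dump
    # per-component subdatasets plus a summary table.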
    basic.ensure_dir_existance(output_dir)
    CreateLog(output_dir)
    print("Version:", subprocess.check_output(["git", "rev-parse", "HEAD"]))
    print("Modifications:")
    print subprocess.check_output(["git", "diff"])
    graph_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv")
    edge_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta")
    dump_file = os.path.join(flye_dir, "20-repeat", "read_alignment_dump")
    if diploid:
        calculator = DipolidCalculator(150000)
    else:
        calculator = HaploidCalculator(150000)
    print "Reading graph from", graph_file
    graph = SimpleGraph()
    graph.ReadDot(graph_file)
    print "Reading sequences from", edge_file
    graph.FillSeq(edge_file, True)
    print "Splitting graph", edge_file
    componentRecords, edgecomp = constructComponentRecords(graph, calculator)
    print "Reading alignment dump from", dump_file
    rcnt = 0
    for rid, eids in AlignmentDumpParser(dump_file).parse():
        compids = set()
        eids = map(basic.Normalize, eids)
        for eid in eids:
            for compid in edgecomp[eid]:
                compids.add(compid)
        for compid in compids:
            comp_eids = [
                eid for eid in eids
                if eid in componentRecords[compid].component.e
            ]
            if len(comp_eids) == 0:
                print "GOPA", compid, compids, rid, eids
            componentRecords[compid].addRead(rid, eids)
        rcnt += 1
        if rcnt % 100000 == 0:
            print "Processed", rcnt, "reads"
    print "Filling flye repeat resolution results"
    flye_next = FillFlyeNext(componentRecords,
                             os.path.join(flye_dir, "flye.log"))
    for compRec in componentRecords:
        half = compRec.half()
        for norm_eid in compRec.unique:
            for eid in [norm_eid, basic.Reverse(norm_eid)]:
                if eid not in compRec.component.e:
                    assert not basic.isCanonocal(eid)
                    assert basic.Reverse(eid) in compRec.component.e
                    continue
                if compRec.component.e[eid].end in half:
                    if compRec.component.isBorder(
                            compRec.component.e[eid].end):
                        compRec.out += 1
                    if compRec.component.isBorder(
                            compRec.component.e[eid].start):
                        compRec.inc += 1
                if not compRec.component.isBorder(
                        compRec.component.e[eid].end):
                    if flye_next[eid] is None:
                        compRec.unresolved_connections += 1
                    else:
                        compRec.resolved_connections.append(
                            (eid, flye_next[eid]))
                        if flye_next[eid] not in compRec.component.e:
                            compRec.outside_connections += 1

    basic.ensure_dir_existance(output_dir)
    print "Printing components to disk"
    subdataset_dir = os.path.join(output_dir, "subdatasets")
    basic.ensure_dir_existance(subdataset_dir)
    order = sorted(range(len(componentRecords)),
                   key=lambda i: componentRecords[i].score())
    componentRecords = [componentRecords[i] for i in order]
    basic.ensure_dir_existance(os.path.join(output_dir, "pics"))
    for i, component in enumerate(componentRecords):
        comp_dir = os.path.join(subdataset_dir, str(i))
        component.dump(comp_dir)
        fig_name = os.path.join(comp_dir, "graph.dot")
        component.draw(fig_name, calculator)
        if len(component.component) <= 100:
            fig_file = os.path.join(output_dir, "pics", str(i) + ".dot")
            component.draw(fig_file, calculator)

    table_file = os.path.join(output_dir, "table.txt")
    print "Printing table to file", table_file
    f = open(table_file, "w")
    f.write(
        "Id v e unique inc out repeats unresolved resolved outside zero hub badborder overcovered score\n"
    )
    for i, compRec in enumerate(componentRecords):
        comp = compRec.component
        f.write(" ".join([
            str(i),
            str(len(comp.v)),
            str(len(comp.e)),
            str(len(compRec.unique) * 2),
            str(compRec.inc),
            str(compRec.out),
            str(compRec.repeat_edges),
            str(compRec.unresolved_connections),
            str(len(compRec.resolved_connections)),
            str(compRec.outside_connections),
            str(compRec.zero),
            str(compRec.red),
            str(compRec.bad_border),
            str(compRec.overcovered_edges),
            str(compRec.score())
        ]) + "\n")
    f.close()
    table_file = os.path.join(output_dir, "list.txt")
    f = open(table_file, "w")
    for a in range(len(componentRecords)):
        f.write(str(a) + "\n")
    f.close()
Example #15: score alternative read alignments
def main(model_file, k, dir, contigs_file, reads_file):
    # type: (str, int, str, str, str) -> None
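    # For reads with exactly one non-contradicting alignment, compare its
    # score against each contradicting alternative, overall and per 1 kb
    # window, and between pairs of alternatives.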
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.scores = ComplexScores()
    params.scores.load(open(model_file, "r"))
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        cnt = 0
        al0 = None
        others = []
        for al in read.alignments:
            if not al.contradictingRTC():
                cnt += 1
                al0 = al
            else:
                others.append(al)
        if cnt != 1 or len(others) == 0:
            continue
        print al0
        print others
        seg = al0.seg_from
        for al in others:
            if al.seg_from.interSize(seg) < k:
                seg = None
                break
            else:
                seg = al.seg_from.cap(seg)
        print seg
        if seg is None:
            continue
        al0 = al0.reduce(query=seg)
        others = [al.reduce(query=seg) for al in others]
        scorer = Scorer(params.scores)
        for al in others:
            a, b, c = scorer.scoreCommon(al0, al)
            print "win", a, b, c, len(seg)
        if len(seg) > 1000:
            for i in range(len(seg) / 1000):
                seg1 = seg.prefix(length=i * 1000 + 1000).suffix(length=1000)
                for al in others:
                    a, b, c = scorer.scoreCommon(al0.reduce(query=seg1),
                                                 al.reduce(query=seg1))
                    print "win1000", a, b, c, len(seg1)
        for al1 in others:
            for al2 in others:
                if al1 == al2:
                    continue
                a, b, c = scorer.scoreCommon(al1, al2)
                print "draw", a, b, c, len(seg)