示例#1
0
 def ReadGFA(self, f):
     """Build the graph from the GFA file at path ``f`` and return ``self``.

     ``S`` records supply edge sequences and ``L`` records say which edge
     ends are merged into one vertex.  Edge ends are kept in a DisjointSet
     as triples ``(orientation is "+", segment name, flag)``.  Each
     component of the set becomes one vertex (numbered from 1), and each
     segment produces two edges: ``name`` carrying the stored sequence and
     ``"-" + name`` carrying its reverse complement.

     Blank lines are skipped; records other than ``S`` and ``L`` are
     ignored.
     """
     seqs = dict()
     v = DisjointSet()
     # The context manager closes the file even when parsing raises.
     with open(f, "r") as gfa:
         for line in gfa:
             s = line.split()
             if not s:
                 # Blank line: indexing s[0] would raise IndexError.
                 continue
             if s[0] == "S":
                 seqs[s[1]] = s[2]
                 # Register both ends of the segment in both orientations.
                 for forward in (True, False):
                     for flag in (True, False):
                         v.add((forward, s[1], flag))
             elif s[0] == "L":
                 v1 = (s[2] == "+", s[1], True)
                 v2 = (s[4] == "+", s[3], False)
                 v.union(v1, v2)
                 # The same link seen from the opposite strand: swap the
                 # two ends and negate orientation and end flag of each.
                 v1, v2 = ((not v2[0], v2[1], not v2[2]),
                           (not v1[0], v1[1], not v1[2]))
                 v.union(v1, v2)
     ids = dict()
     cnt = 1
     for vid, vl in v.listComponenets():
         self.AddVertex(str(cnt), str(vid))
         ids[vid] = str(cnt)
         cnt += 1
     for eid, seq in seqs.items():
         # presumably AddEdge takes (id, start vertex, end vertex, ...), so
         # the flag is False for the end an edge leaves from -- TODO confirm
         self.AddEdge(eid, ids[v.get((True, eid, False))],
                      ids[v.get((True, eid, True))], len(seq), eid, seq)
         self.AddEdge("-" + eid, ids[v.get((False, eid, False))],
                      ids[v.get((False, eid, True))], len(seq), "-" + eid,
                      basic.RC(seq))
     return self
示例#2
0
 def __init__(self, seq, id, extension_handler, rc = None):
     # type: (str, str, ExtensionHandler, Optional[NewLine]) -> None
     """Create a line together with its reverse-complement counterpart.

     When ``rc`` is None this object creates fresh storages and then
     constructs the reverse-complement NewLine itself, passing ``self`` as
     that object's ``rc``.  When ``rc`` is given, this object is the
     reverse-complement side and takes the ``.rc`` view of every storage
     and listener of ``rc`` instead of creating its own.
     """
     self.extensionHandler = extension_handler
     self.seq = seq
     self.id = id # type: str
     self.circular = False
     self.name_printer = None
     self.max_extension = False
     if rc is None:
         # Primary side: own the storages.
         self.initial = AlignmentStorage()
         self.correct_segments = SegmentStorage()
         self.completely_resolved = SegmentStorage()
         self.disjointig_alignments = AlignmentStorage()
         self.read_alignments = ReadAlignmentStorage()
         self.listeners = [self.initial, self.correct_segments, self.completely_resolved, self.disjointig_alignments, self.read_alignments, extension_handler] # type: List[LineListener]
         # The attributes above must be set before this call: the nested
         # constructor takes the else branch and reads them through its
         # own rc argument (which is this object).
         rc = NewLine(basic.RC(seq), basic.Reverse(self.id), extension_handler.rc, self) #type: NewLine
         self.rc = rc
         # Registered only after the rc line exists, so it is not part of
         # the listener list the rc line copied during its construction.
         self.addListener(ReadAlignmentListener(self))
         # self.initial.add(AlignmentPiece.Identical(self.asSegment().asContig().asSegment(), self.asSegment()))
     else:
         # Reverse-complement side: reuse the rc views of the primary
         # line's storages and listeners.
         self.initial = rc.initial.rc # type: AlignmentStorage
         self.correct_segments = rc.correct_segments.rc # type: SegmentStorage
         self.completely_resolved = rc.completely_resolved.rc # type: SegmentStorage
         self.disjointig_alignments = rc.disjointig_alignments.rc # type: AlignmentStorage
         self.read_alignments = rc.read_alignments.rc # type: ReadAlignmentStorage
         self.listeners = [listener.rc for listener in rc.listeners] # type: List[LineListener]
     # rc is never None here, so the base constructor does not need to
     # build a reverse complement of its own.
     Contig.__init__(self, seq, id, rc)
     self.rc = rc #type: NewLine
     self.knot = None # type: Knot
示例#3
0
def uniqueNeighbours(edge_ids, graph, min_contig_length):
    unique = []
    for eid in edge_ids:
        print "Finding neighbours of", eid
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            id = basic.Normalize(e.id)
            if len(e.seq) < min_contig_length + params.bad_end_length:
                seq = uniquePathForward(
                    graph, e, min_contig_length + params.bad_end_length)
                id = id + "p"
                # seq = e.seq
            else:
                seq = e.seq[-min_contig_length - params.bad_end_length:]
                if e.id.startswith("-"):
                    id = id + "l"
                else:
                    id = id + "r"
            if e.id.startswith("-"):
                seq = basic.RC(seq)
            print "Right neighbour", eid, id
            unique.append((NamedSequence(seq, id), basic.Normalize(e.id)))
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            id = basic.Normalize(e.id)
            if len(e.seq) < min_contig_length + params.bad_end_length:
                seq = uniquePathBackward(
                    graph, e, min_contig_length + params.bad_end_length)
                id = id + "p"
                # seq = e.seq
            else:
                seq = e.seq[:min_contig_length + params.bad_end_length]
                if e.id.startswith("-"):
                    id = id + "r"
                else:
                    id = id + "l"
            if e.id.startswith("-"):
                seq = basic.RC(seq)
            print "Left neighbour", eid, id
            unique.append((NamedSequence(seq, id), basic.Normalize(e.id)))
    return unique
示例#4
0
 def __init__(self, line_left, line_right, gap, gap_seq = "", rc = None):
     # type: (NewLine, NewLine, int, str, Knot) -> None
     """Record a junction between ``line_left`` and ``line_right``.

     A positive ``gap`` requires a non-empty ``gap_seq``.  When ``rc`` is
     not supplied, the mirrored Knot is created here: it swaps the two
     lines for their ``.rc`` counterparts, reverse-complements the gap
     sequence and points back at this object.
     """
     assert gap <= 0 or len(gap_seq) > 0
     self.line_left, self.line_right = line_left, line_right
     self.gap, self.gap_seq = gap, gap_seq
     if rc is not None:
         self.rc = rc
     else:
         self.rc = Knot(line_right.rc, line_left.rc, gap,
                        basic.RC(gap_seq), self)
示例#5
0
 def FillSeq(self, f, numeric=True):
     """Assign sequences from the FASTA file at path ``f`` to known edges.

     With ``numeric`` set, each record id is first replaced by
     ``str(basic.parseNumber(id))``.  A record whose id is a key of
     ``self.e`` sets that edge's ``seq`` and ``len``; if ``"-" + id`` is
     also a key, that edge receives the reverse complement.  Records that
     match no edge are ignored.  Returns ``self``.
     """
     # The context manager closes the file even when parsing raises.
     with open(f, "r") as handle:
         for s in SeqIO.parse_fasta(handle):
             if numeric:
                 s.id = str(basic.parseNumber(s.id))
             if s.id in self.e:
                 self.e[s.id].seq = s.seq
                 self.e[s.id].len = len(s.seq)
             rc_id = "-" + s.id
             if rc_id in self.e:
                 self.e[rc_id].seq = basic.RC(s.seq)
                 self.e[rc_id].len = len(s.seq)
     return self
示例#6
0
 def extendRight(self, seq, relevant_als=None):
     # type: (str, List[AlignmentPiece]) -> None
     """Append ``seq`` to the right end of this line.

     The line must not have a knot.  Listeners are notified before the
     change with a temporary Contig holding the extended sequence, and
     after it with ``relevant_als`` (an empty list when omitted).  The
     reverse-complement line is kept in sync by prepending
     ``basic.RC(seq)`` to its sequence.
     """
     # Traced before defaulting, so a missing argument is logged as None.
     sys.stdout.trace("Line operation Extend:", self, len(seq),
                      relevant_als)
     assert self.knot is None
     relevant_als = [] if relevant_als is None else relevant_als
     extended = Contig(self.seq + seq, "TMP2_" + self.id)
     self.notifyBeforeExtendRight(extended, seq)
     self.seq += seq
     self.rc.seq = basic.RC(seq) + self.rc.seq
     self.notifyAfterExtendRight(seq, relevant_als)
示例#7
0
 def FillSeq(self, f, numeric=True):
     """Assign sequences from the FASTA file at path ``f`` to known edges.

     With ``numeric`` set, each record id is first replaced by
     ``str(basic.parseNumber(id))``.  A record whose id is a key of
     ``self.e`` sets that edge's ``seq`` and ``len``; if
     ``basic.Reverse(id)`` is also a key, that edge receives the reverse
     complement.  After loading, every edge is asserted to have a
     sequence.  Returns ``self``.
     """
     # The context manager closes the file even when parsing raises.
     with open(f, "r") as handle:
         for s in SeqIO.parse_fasta(handle):
             if numeric:
                 s.id = str(basic.parseNumber(s.id))
             if s.id in self.e:
                 self.e[s.id].seq = s.seq
                 self.e[s.id].len = len(s.seq)
             rc_id = basic.Reverse(s.id)
             if rc_id in self.e:
                 self.e[rc_id].seq = basic.RC(s.seq)
                 self.e[rc_id].len = len(s.seq)
     for edge in self.e.values():
         assert (edge.seq is not None)
     return self
示例#8
0
 def __init__(self, seq, id, rc=None):
     # type: (str, str, Optional[Disjointig]) -> None
     """Create a disjointig paired with its reverse complement.

     Without ``rc`` a fresh AlignmentStorage is created and the
     reverse-complement Disjointig is built with ``self`` as its ``rc``.
     With ``rc`` given, ``read_alignments`` is the ``.rc`` view of the
     storage owned by ``rc``.
     """
     self.seq = seq
     self.id = id
     if rc is not None:
         self.rc = rc
         self.read_alignments = self.rc.read_alignments.rc  # type: AlignmentStorage
     else:
         # The storage must exist before the rc object is built, because
         # its constructor reads it through self.
         self.read_alignments = AlignmentStorage()  # type: AlignmentStorage
         rc = Disjointig(basic.RC(seq), basic.Reverse(id), self)  # type: Disjointig
         self.rc = rc
     Contig.__init__(self, seq, id, rc)
     self.rc = rc  # type:Disjointig
示例#9
0
 def correctSequence(self, alignments):
     # type: (Iterable[AlignmentPiece]) -> None
     """Replace the line sequence using the given correcting alignments.

     Alignments whose two segments spell the same sequence are dropped and
     the rest are trimmed with ``cutIdenticalEnds``.  If nothing remains
     the call only logs and returns.  Otherwise a Correction is built,
     listeners are notified before and after, and both this line and its
     reverse complement receive the corrected sequence.
     """
     sys.stdout.trace("Line operation Correct:", alignments)
     nontrivial = []
     for al in alignments:
         if al.seg_from.Seq() != al.seg_to.Seq():
             nontrivial.append(al.cutIdenticalEnds())
     if not nontrivial:
         sys.stdout.trace("Skipping trivial correction operation")
         return
     assert len(nontrivial) > 0
     correction = Correction.constructCorrection(nontrivial)
     self.notifyBeforeCorrect(correction)
     # Snapshot of the sequence as it was before the correction.
     previous = Contig(self.seq, "old")
     self.seq = correction.seq_from.seq
     self.rc.seq = basic.RC(self.seq)
     correction.changeQT(self, previous)
     self.notifyAfterCorrect(correction)
def main(ref_file, segment, dir):
    """Mask a segment of chr1 with N and report how its reads align back.

    ``segment`` is a pair of coordinates on the "chr1" contig of the FASTA
    file ``ref_file``; a negative first coordinate selects the
    reverse-complement strand.  The segment is cut into 500-step reads,
    replaced by "N" in the reference, and the reads are aligned against
    the masked reference using ``dir`` as working directory.  For every
    read one line is written: the number of alignments and the best
    percent identity (0 when there are none).
    """
    # Close the reference file once loaded.  loadFromFasta is assumed to
    # consume the handle before returning (chr1 is looked up right after).
    with open(ref_file, "r") as ref_handle:
        ref = ContigCollection().loadFromFasta(ref_handle, False)
    chr1 = ref["chr1"]
    if segment[0] < 0:
        segment = (-segment[0], -segment[1])
        chr1 = chr1.rc
    reads = ReadCollection()
    reads_list = []
    for i in range(segment[0], segment[1], 500):
        read = reads.addNewRead(Segment(chr1, i, i + 500).asNamedSequence())
        reads_list.append(read)
    # Mask the segment only after the reads have been cut from it.
    chr1.seq = chr1.seq[:segment[0]] + "N" * (segment[1] - segment[0]) + chr1.seq[segment[1]:]
    chr1.rc.seq = basic.RC(chr1.seq)
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, ref)
    out = sys.stdout
    for read in reads_list:
        # A list comprehension instead of map(): "[0] + map(...)" only
        # works where map returns a list.
        identities = [al.percentIdentity() for al in read.alignments]
        out.write(str(len(read.alignments)) + " " + str(max([0] + identities)) + "\n")
    # NOTE(review): this closes sys.stdout itself -- confirm it is intended.
    out.close()
示例#11
0
 def correctSequence(self, alignments):
     # type: (Iterable[AlignmentPiece]) -> None
     """Replace the line sequence using the given correcting alignments.

     Alignments whose two segments spell the same sequence are logged as
     trivial.  If every alignment is trivial the call only logs and
     returns.  Otherwise a Correction is built, listeners are notified
     before and after, and both this line and its reverse complement
     receive the corrected sequence.
     """
     sys.stdout.trace("Line operation Correct:", alignments)
     alignments = list(alignments)
     nontrivial = []
     for al in alignments:
         if al.seg_from.Seq() != al.seg_to.Seq():
             nontrivial.append(al)
             continue
         sys.stdout.trace("Skipping trivial correction alignment", al)
     if not nontrivial:
         sys.stdout.trace("Skipping trivial correction operation")
         return
     assert len(alignments) > 0
     # NOTE(review): the full list, trivial alignments included, is passed
     # on; the filtered list only decides whether to proceed -- confirm
     # this is intended.
     correction = Correction.constructCorrection(alignments)
     self.notifyBeforeCorrect(correction)
     # Snapshot of the sequence as it was before the correction.
     previous = Contig(self.seq, "old")
     self.seq = correction.seq_from.seq
     self.rc.seq = basic.RC(self.seq)
     correction.changeQT(self, previous)
     self.notifyAfterCorrect(correction)
示例#12
0
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(
                        basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:],
                                                       e.id + "r")
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(
                        basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000],
                                                       e.id + "l")

    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"),
                  "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
示例#13
0
 def RC(self):
     # type: () -> NamedSequence
     """Return a new NamedSequence with reverse-complemented sequence and reversed id."""
     rc_seq = basic.RC(self.seq)
     rc_id = basic.Reverse(self.id)
     return NamedSequence(rc_seq, rc_id)
示例#14
0
 def RC(self):
     """Return a new Consensus with reverse-complemented sequence and reversed ``cov``."""
     rc_seq = basic.RC(self.seq)
     # Reverse cov as well, so each entry keeps its place relative to seq.
     reversed_cov = self.cov[::-1]
     return Consensus(rc_seq, reversed_cov)
示例#15
0
 def __init__(self, seq, id, rc=None):
     # type: (str, str, Optional[Contig]) -> None
     """Create a contig linked to its reverse complement.

     When ``rc`` is omitted, the reverse-complement Contig is created here
     with ``self`` as its own ``rc``, so the two objects point at each
     other.
     """
     NamedSequence.__init__(self, seq, id)
     if rc is not None:
         self.rc = rc
         return
     self.rc = Contig(basic.RC(seq), basic.Reverse(id), self)
示例#16
0
def main(args):
    flye_dir = sys.argv[1]
    repeats, starts, ends = parse(sys.argv[2])
    graph_file, unique_file, disjointigs_file, rep_dir, tmp, their_file = cl_params.parseFlyeDir(
        flye_dir)
    dump = os.path.join(rep_dir, "read_alignment_dump")
    reads_file = sys.argv[3]
    dir = sys.argv[4]
    CreateLog(dir)
    print " ".join(args)
    print "Printing contigs"
    edges_file = os.path.join(rep_dir, "graph_before_rr.fasta")
    edges = ContigStorage().loadFromFasta(open(edges_file, "r"))
    unique = open(os.path.join(dir, "contigs"), "w")
    for l in starts:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[-15000:]
        SeqIO.write(NamedSequence(seq, "(" + "_".join(l) + ")"), unique,
                    "fasta")
    for l in ends:
        seq = "".join(map(lambda eid: edges[eid].seq, l))
        if len(seq) > 15000:
            seq = seq[:15000]
        SeqIO.write(NamedSequence(basic.RC(seq), "(" + "_".join(l) + ")"),
                    unique, "fasta")
    unique.close()
    print "Selecting reads"
    reads = set()
    cur_read = None
    als = []
    for s in open(dump).readlines():
        if s.startswith("Chain"):
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print als
            for al in als:
                if al in repeats:
                    if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                        print "oppa"
                    reads.add(cur_read)
                    break
            als = []

        else:
            s = s.split()
            cur_read = s[2][1:]
            eid = s[6].split("_")[1]
            if s[6][0] == "-":
                eid = "-" + eid
            if cur_read == "06dfd536-e258-446f-a019-8d8340ef831e":
                print eid
            als.append(eid)
    print "Selected", len(reads), "reads"
    print "\n".join(reads)
    print "Reading and printing reads"
    freads = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    for read in SeqIO.parse_by_name(reads_file):
        cnt += 1
        if cnt % 10000 == 0:
            print cnt
        if read.id in reads:
            SeqIO.write(read, freads, "fasta")
    freads.close()
示例#17
0
 def RC(self):
     # type: () -> SeqRecord
     """Return the reverse-complement record.

     The sequence is reverse-complemented, the id is reversed, and the
     quality values, when present, are reversed so they keep matching
     their bases.
     """
     qual = self.qual
     rc_qual = qual if qual is None else qual[::-1]
     return SeqRecord(basic.RC(self.seq), basic.Reverse(self.id), rc_qual)