示例#1
0
def printSegs(f, segs):
    """Write the requested contig segments of a FASTA file to stdout.

    f is the path of a FASTA file that is loaded into a ContigStorage.
    segs is an iterable of mutable [contig_id, left, right] triples. A right
    value of 0 is replaced in place by the length of the contig, so the
    caller's triples are modified.
    """
    storage = ContigStorage().loadFromFasta(open(f, "r"), False)
    for seg in segs:
        contig = storage[seg[0]]
        if seg[2] == 0:
            # 0 stands for "up to the end of the contig".
            seg[2] = len(contig)
        fragment = contig.segment(seg[1], seg[2]).asContig()
        SeqIO.write(fragment, sys.stdout, "fasta")
示例#2
0
 def printKnottedToFasta(self, handler):
     # type: (BinaryIO) -> None
     """Write every chain returned by self.chains() as one FASTA record.

     A chain is skipped when the id of its first line's ``rc`` counterpart
     has already been emitted (presumably the reverse-complement copy --
     confirm against the line class). Records are named ``contig_<n>`` with
     n counting from 1, and a description of each chain is sent to
     sys.stdout.trace.
     """
     emitted_ids = set()
     index = 1
     for chain in self.chains():
         if chain[0].rc.id in emitted_ids:
             continue
         emitted_ids.update(line.id for line in chain)
         # A knot on the last line marks the chain as circular.
         labels = ["Circular"] if chain[-1].knot is not None else []
         pieces = []
         for line in chain:
             labels.append(line.id)
             knot = line.knot
             if knot is None:
                 pieces.append(line.seq)
                 continue
             labels.append(str(knot.gap))
             if knot.gap < 0:
                 # Negative gap: drop the trailing |gap| characters.
                 pieces.append(line.seq[:knot.gap])
             else:
                 # Non-negative gap: keep the line and add the gap sequence.
                 pieces.extend((line.seq, knot.gap_seq))
         sys.stdout.trace(index, ":", ";".join(labels))
         record = NamedSequence("".join(pieces), "contig_" + str(index))
         SeqIO.write(record, handler, "fasta")
         index += 1
示例#3
0
def simulate1(dir, mutation, genome):
    print "Simulating", genome
    ds = dataset_simulation.TestDataset(genome, 5000, mutation_rate=mutation)
    genome_seq = ds.mutate(ds.genome, mutation / 2)[0]
    f = open(os.path.join(dir, genome + str(mutation * 100) + ".fasta"), "w")
    SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta")
    f.close()
示例#4
0
文件: support.py 项目: nickp60/EToKi
def get_max_reads_length(reads_file, log, num_checked):
    """Return the longest read length among the first num_checked reads.

    The file type is derived from the name of reads_file; error() is called
    when it cannot be determined. The result is also reported via log.info.
    """
    file_type = SeqIO.get_read_file_type(reads_file)
    if not file_type:
        error('Incorrect extension of reads file: ' + reads_file, log)

    records = SeqIO.parse(SeqIO.Open(reads_file, "r"), file_type)
    # Only the leading num_checked records are inspected.
    lengths = [len(rec) for rec in itertools.islice(records, num_checked)]
    max_reads_length = max(lengths)
    log.info(reads_file + ': max reads length: ' + str(max_reads_length))
    return max_reads_length
示例#5
0
def prepare_disjointigs_file(disjointigs_file, disjointigs_file_list):
    """Concatenate several FASTA files into disjointigs_file.

    All records are read into memory before the output is opened, so the
    output path may safely be one of the inputs. Every file is closed as
    soon as it has been processed, including when an error occurs.
    """
    recs = []
    for fn in disjointigs_file_list:
        with open(fn, "r") as handle:
            # extend() drains the parser while the file is still open.
            recs.extend(SeqIO.parse_fasta(handle))
    with open(disjointigs_file, "w") as out:
        for rec in recs:
            SeqIO.write(rec, out, "fasta")
示例#6
0
def collect_contigs(dataset, barcodes_dir, output_base, format):
    """Merge the per-barcode contig files into ``<output_base>.<format>``.

    For each barcode in dataset, the file
    ``<barcodes_dir>/<barcode.id>/truseq_long_reads.<format>`` is read if it
    exists; every contig id is prefixed with ``<barcode.id>-`` before the
    contig is written to the combined output. Missing files are skipped.
    """
    with open(output_base + "." + format, "w") as output:
        for barcode in dataset:
            path = os.path.join(barcodes_dir, barcode.id,
                                "truseq_long_reads." + format)
            if not os.path.exists(path):
                continue
            # Close each input as soon as its contigs have been copied.
            with open(path) as handle:
                for contig in SeqIO.parse(handle, format):
                    contig.id = barcode.id + "-" + contig.id
                    SeqIO.write(contig, output, format)
示例#7
0
def PrintResults(recs, reference, references_file, coordinates_file):
    """Write alignment records and the reference fragments they cover.

    For every rec in recs, ``str(rec)`` is written as one line of
    coordinates_file, and the slice ``reference[rec.rname][rec.left:rec.right]``
    is written to references_file as a FASTA record named
    ``<rname>_(<left>-<right>)``. Both files are closed even if writing fails.
    """
    with open(coordinates_file, "w") as aln:
        with open(references_file, "w") as fasta:
            for rec in recs:
                aln.write(str(rec) + "\n")
                sequence = reference[rec.rname][rec.left:rec.right]
                rec_id = (str(rec.rname) + "_(" + str(rec.left) + "-" +
                          str(rec.right) + ")")
                SeqIO.write(SeqIO.SeqRecord(sequence, rec_id), fasta, "fasta")
示例#8
0
def get_read_file_type(input_filename, log=None):
    """Return the read file type of input_filename.

    If the file is registered in options_storage.dict_of_prefixes, the type
    is derived from the registered extension instead of the real file name.
    error() is called when no type can be determined.
    """
    prefixes = options_storage.dict_of_prefixes
    if input_filename in prefixes:
        # Build a dummy name that carries only the registered extension.
        name_for_detection = "filename" + prefixes[input_filename]
    else:
        name_for_detection = input_filename
    file_type = SeqIO.get_read_file_type(name_for_detection)

    if not file_type:
        error("incorrect extension of reads file: %s" % input_filename, log)
    return file_type
示例#9
0
def check_file_not_empty(input_filename, message="", log=None):
    """Report an error if a reads file contains no records.

    Files of type 'bam' are not inspected. Any exception raised while
    opening or parsing the file is reported through error() together with
    the formatted traceback.
    """
    filename = abspath(expanduser(input_filename))
    file_type = get_read_file_type(input_filename, log)
    if file_type == 'bam':
        return

    try:
        handle = SeqIO.Open(filename, "r")
        first_read = next(SeqIO.parse(handle, file_type), None)
        if first_read is None:
            error("file is empty: %s (%s)" % (filename, message), log=log)
    except Exception as inst:
        # The exception text and traceback are treated as templates with a
        # {FILE} placeholder.
        summary = inst.args[0].format(FILE=filename)
        details = traceback.format_exc().format(FILE=filename)
        error(summary + "\n\n" + details, log=log)
示例#10
0
 def loadFromFasta(self,
                   handler,
                   save_names=False,
                   int_ids=False,
                   filter=lambda rec: True):
     # type: (BinaryIO, bool, bool, Callable[[NamedSequence], bool]) -> DisjointigCollection
     """Add the FASTA records read from handler that pass ``filter``.

     With save_names set, every record id (and its basic.Reverse form) must
     be absent from self.items, and self.cnt is raised above the largest
     number parsed from a record id. The parsed number is passed to addNew
     as the name only when int_ids is also set; otherwise addNew receives
     just the sequence. Returns self.
     """
     records = list(SeqIO.parse_fasta(handler))
     if save_names:
         for record in records:
             assert (record.id not in self.items.keys()
                     and basic.Reverse(record.id) not in self.items.keys())
     for record in records:
         if not filter(record):
             continue
         if not save_names:
             self.addNew(record.seq)
             continue
         number = basic.parseNegativeNumber(record.id)
         if number is not None:
             # Keep the counter ahead of every number already in use.
             self.cnt = max(self.cnt, int(abs(number)) + 1)
         if int_ids:
             self.addNew(record.seq, str(number))
         else:
             self.addNew(record.seq)
     return self
示例#11
0
def main(contig_file, reads_file, sam_file, dir, contig_id):
    # type: (str, str, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    contigs = ContigCollection()
    contigs.loadFromFasta(open(contig_file, "r"))
    print "Contigs loaded"
    contig = contigs[contig_id]
    read_names = set()
    for rec in Samfile(open(sam_file, "r")):
        read_names.add(rec.query_name)
    reads = ReadCollection()
    cnt = 0
    for rec in SeqIO.parse_fasta(open(reads_file, "r")):
        if rec.id in read_names:
            rec.id = "Read" + str(cnt)
            reads.add(AlignedRead(rec))
            cnt += 1
    reads.print_fasta(open(os.path.join(dir, "reads.fasta"), "w"))
    print "Reads loaded", len(reads)
    reads.addAllRC()
    print "RC added", len(reads)

    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    aligner.alignReadCollection(reads, contigs)
    print "Reads aligned", len(reads)
    reads = reads.inter(contig.asSegment())
    print "Reads filtered", len(reads)
    sorted_reads = sorted(list(reads.reads.values()), key = lambda read: read.alignmentsTo(contig.asSegment()).next().seg_to.left)
    for read in sorted_reads:
        print read
        for al in read.alignmentsTo(contig.asSegment()):
            print "\n".join(al.asMatchingStrings())
示例#12
0
def get_max_reads_length(reads_file, log, num_checked):
    """Return the longest read length among the first num_checked reads.

    The file type is derived from the extension registered for reads_file in
    options_storage.dict_of_prefixes, or from the file name itself when none
    is registered. error() is called when no type can be determined, and the
    result is reported via log.info.
    """
    prefixes = options_storage.dict_of_prefixes
    if reads_file in prefixes:
        type_source = prefixes[reads_file]
    else:
        type_source = reads_file
    file_type = SeqIO.get_read_file_type(type_source)

    if not file_type:
        error("incorrect extension of reads file: %s" % reads_file, log)

    records = SeqIO.parse(SeqIO.Open(reads_file, "r"), file_type)
    lengths = [len(rec) for rec in itertools.islice(records, num_checked)]
    max_reads_length = max(lengths)
    log.info("%s: max reads length: %s" % (reads_file, str(max_reads_length)))
    return max_reads_length
示例#13
0
def get_max_reads_length(reads_file, log, num_checked):
    """Return the longest read length among the first num_checked reads.

    Any exception raised while reading is reported through error() together
    with the formatted traceback; if error() returns, the result is 0. On
    success the length is also reported via log.info.
    """
    file_type = get_read_file_type(reads_file, log)
    max_reads_length = 0
    try:
        records = SeqIO.parse(SeqIO.Open(reads_file, "r"), file_type)
        lengths = [len(rec) for rec in itertools.islice(records, num_checked)]
        max_reads_length = max(lengths)
    except Exception as inst:
        # The exception text and traceback are treated as templates with a
        # {FILE} placeholder.
        summary = inst.args[0].format(FILE=reads_file)
        details = traceback.format_exc().format(FILE=reads_file)
        error(summary + "\n\n" + details, log=log)
    else:
        log.info("%s: max reads length: %s" %
                 (reads_file, str(max_reads_length)))
    return max_reads_length
示例#14
0
def readsN50(dir):
    for fn in os.listdir(dir):
        tmp = []
        f = os.path.join(dir, fn, fn + ".fasta")
        for rec in SeqIO.parse_fasta(open(f, "r")):
            if len(tmp) >= 1000:
                break
            tmp.append(len(rec))
        print fn, sorted(tmp)[len(tmp) / 2]
示例#15
0
def simulate2(dir, mutation, error_rate, genome):
    print "Simulating", genome
    ds = dataset_simulation.TestDataset(genome, 4000, mutation_rate=mutation)
    genome_seq = ds.mutate(ds.genome, mutation / 2)[0]
    total = 0
    f = open(os.path.join(dir, genome + ".fasta"), "w")
    SeqIO.write(NamedSequence(genome_seq, genome), f, "fasta")
    f.close()
    f = open(os.path.join(dir, "reads.fasta"), "w")
    cnt = 0
    while total < len(genome_seq) * 30:
        l = random.randint(3000, 3500)
        pos = random.randint(0, len(genome_seq) - l)
        seq = ds.mutate(genome_seq[pos:pos + l], error_rate)[0]
        SeqIO.write(NamedSequence(seq, str(cnt)), f, "fasta")
        cnt += 1
        total += len(seq)
    f.close()
示例#16
0
 def loadFromFasta(self, handler, num_names=True):
     # type: (BinaryIO, bool) -> ContigCollection
     """Add one Contig per FASTA record read from handler and return self.

     With num_names set, the contig id is the string form of
     basic.parseNegativeNumberAndMod applied to the record id; otherwise the
     record id is used unchanged.
     """
     for record in SeqIO.parse_fasta(handler):
         if num_names:
             contig_id = str(basic.parseNegativeNumberAndMod(record.id))
         else:
             contig_id = record.id
         self.add(Contig(record.seq, contig_id))
     return self
示例#17
0
def main(ref_file, contig_size, rlen, cov, dir):
    basic.ensure_dir_existance(dir)
    all_contigs = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    contig_file_name = os.path.join(dir, "contigs.fasta")
    contig_file = open(contig_file_name, "w")
    reads_file_name = os.path.join(dir, "reads.fasta")
    reads_file = open(reads_file_name, "w")
    for ref in all_contigs.unique():
        if len(ref) < contig_size:
            continue
        SeqIO.write(ref, contig_file, "fasta")
        for i in range(0, len(ref), max(1, rlen / cov)):
            read = ref.segment(i, min(i + rlen, len(ref))).asNamedSequence()
            SeqIO.write(read, reads_file, "fasta")
    reads_file.close()
    contig_file.close()
    print "Done"
    print contig_file_name
    print reads_file_name
示例#18
0
 def loadFromFile(self, fname, num_names=True):
     # type: (str, bool) -> ContigCollection
     """Add one Contig per record of the file fname and return self.

     Records come from SeqIO.parse_by_name. With num_names set, the contig
     id is the string form of basic.parseNegativeNumberAndMod applied to the
     record id; otherwise the record id is used unchanged.
     """
     for record in SeqIO.parse_by_name(fname):
         contig = (Contig(record.seq,
                          str(basic.parseNegativeNumberAndMod(record.id)))
                   if num_names else Contig(record.seq, record.id))
         self.add(contig)
     return self
示例#19
0
 def CheckSequences(self, reads, reads_file):
     # type: (Iterable[NamedSequence], str) -> bool
     """Tell whether reads_file holds exactly the given reads, in order.

     Returns False when the file does not exist, when the number of records
     differs, when any id (compared as strings) or sequence differs, or when
     reading the file fails for any reason.
     """
     if not os.path.exists(reads_file):
         return False
     try:
         with open(reads_file, "r") as handle:
             pairs = itertools.izip_longest(SeqIO.parse_fasta(handle), reads)
             for rec, read in pairs:
                 # izip_longest pads the shorter side with None, so a None
                 # here means the two sides differ in length.
                 if rec is None or read is None:
                     return False
                 if str(rec.id) != str(read.id) or rec.seq != read.seq:
                     return False
         return True
     except Exception:
         # Best effort: an unreadable or malformed file counts as a mismatch.
         # KeyboardInterrupt and SystemExit are no longer swallowed.
         return False
示例#20
0
 def FillSeq(self, f, numeric=True):
     """Fill edge sequences and lengths from the FASTA file f; return self.

     With numeric set, each record id is first replaced by the string form
     of basic.parseNumber(id). If the id is a key of self.e, that entry gets
     the record's sequence; if ``"-" + id`` is a key, that entry gets
     basic.RC of the sequence. Both get the sequence length. The file is
     closed once all records have been processed.
     """
     with open(f, "r") as handle:
         for s in SeqIO.parse_fasta(handle):
             if numeric:
                 s.id = str(basic.parseNumber(s.id))
             if s.id in self.e:
                 self.e[s.id].seq = s.seq
                 self.e[s.id].len = len(s.seq)
             rc_id = "-" + s.id
             if rc_id in self.e:
                 self.e[rc_id].seq = basic.RC(s.seq)
                 self.e[rc_id].len = len(s.seq)
     return self
示例#21
0
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
示例#22
0
 def FillSeq(self, f, numeric=True):
     """Fill edge sequences and lengths from the FASTA file f; return self.

     With numeric set, each record id is first replaced by the string form
     of basic.parseNumber(id). If the id is a key of self.e, that entry gets
     the record's sequence; if basic.Reverse(id) is a key, that entry gets
     basic.RC of the sequence. Both get the sequence length. Afterwards
     every edge in self.e is asserted to have a sequence.
     """
     with open(f, "r") as handle:
         for s in SeqIO.parse_fasta(handle):
             if numeric:
                 s.id = str(basic.parseNumber(s.id))
             if s.id in self.e:
                 self.e[s.id].seq = s.seq
                 self.e[s.id].len = len(s.seq)
             rc_id = basic.Reverse(s.id)
             if rc_id in self.e:
                 self.e[rc_id].seq = basic.RC(s.seq)
                 self.e[rc_id].len = len(s.seq)
     for edge in self.e.values():
         assert (edge.seq is not None)
     return self
示例#23
0
def align(dir, contigs_file):
    CreateLog(dir)
    contigs = list(SeqIO.parse_fasta(open(contigs_file, "r")))
    assert len(contigs) == 2
    contigs = [
        Contig(contigs[0].seq, contigs[0].id),
        Contig(contigs[1].seq, contigs[1].id)
    ]
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    als = iter_align(aligner, contigs[0], contigs[1])
    printVar(os.path.join(dir, "diff.txt"), als)
    for al in als:
        print al
示例#24
0
def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference" + "\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))),
                 key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(
                Contig("".join(tmp),
                       ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))

    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments,
                        key=lambda al:
                        (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, iter in itertools.groupby(alignments,
                                          key=lambda al: al.seg_to.contig):
        iter = list(iter)
        sys.stderr.write(str(contig) + " " + str(len(iter)) + "\n")
        if len(iter) < 150:
            for al in iter:
                print scorer.accurateScore(al.matchingSequence(),
                                           params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
示例#25
0
 def polishMany(self, reads, sequences):
     # type: (Iterable[AlignedRead], List[Contig]) -> List[Contig]
     """Polish several sequences with the given reads; return the contigs.

     Inputs are written to a directory obtained from self.dir_distributor.
     The polishing job is skipped when that directory reports unchanged
     input, params.clean is false and the polished file already exists.
     Every record of the polished file is returned as a Contig.
     """
     work_dir, new_files, same = self.dir_distributor.fillNextDir([(list(sequences), "ref.fasta"), (reads, "reads.fasta")])
     consensus_file_name = new_files[0]
     reads_file_name = new_files[1]
     args = FakePolishingArgs()
     basic.ensure_dir_existance(os.path.join(work_dir, "work"))
     job = JobPolishing(args, os.path.join(work_dir, "work"), os.path.join(work_dir, "log.info"), [reads_file_name], consensus_file_name, "polish")
     polished_file = job.out_files["contigs"]
     if same and not params.clean and os.path.exists(polished_file):
         sys.stdout.trace("Polishing reused:", polished_file)
     else:
         sys.stdout.trace("Running polishing:", polished_file)
         job.run()
     # Build the list while the file is open, so the result is always a
     # real list and the handle is closed afterwards.
     with open(polished_file, "r") as handle:
         return [Contig(rec.seq, rec.id) for rec in SeqIO.parse_fasta(handle)]
示例#26
0
 def polish(self, reads, consensus):
     # type: (Iterable[NamedSequence], Contig) -> str
     """Polish one consensus with the given reads; return its sequence.

     Inputs are written to a directory obtained from self.dir_distributor.
     The polishing job is skipped when that directory reports unchanged
     input, params.clean is false and the polished file already exists.
     The sequence of the first record of the polished file is returned;
     a polished file without records raises IndexError.
     """
     work_dir, new_files, same = self.dir_distributor.fillNextDir([([consensus], "ref.fasta"), (reads, "reads.fasta")])
     consensus_file_name = new_files[0]
     reads_file_name = new_files[1]
     args = FakePolishingArgs()
     basic.ensure_dir_existance(os.path.join(work_dir, "work"))
     job = JobPolishing(args, os.path.join(work_dir, "work"), os.path.join(work_dir, "log.info"), [reads_file_name], consensus_file_name, "polish")
     polished_file = job.out_files["contigs"]
     if same and not params.clean and os.path.exists(polished_file):
         sys.stdout.trace("Polishing reused:", polished_file)
     else:
         sys.stdout.trace("Running polishing:", polished_file)
         job.run()
     with open(polished_file, "r") as handle:
         return list(SeqIO.parse_fasta(handle))[0].seq
示例#27
0
def moleculo_postprocessing(contigs_file, output_file, sam_files, log):
    """Break and filter scaffolds using read alignments, then save them.

    The contigs of contigs_file are loaded and GenerateQuality is run on them
    with the chained SAM files. They are then passed through SplitAndFilter
    with coverage, pattern and N breakers plus length and pattern filters.
    The result is written via OutputResults in both "fasta" and "fastq"
    formats under output_file. ``pattern`` and ``rc_pattern`` are read from
    the enclosing module.
    """
    log.info("===== Starting postprocessing based on read alignment")
    log.info("Processing scaffolds from " + contigs_file)
    log.info("Using read alignments to break and filter scaffolds")
    # Load all contigs while the file is open, then close it right away.
    with open(contigs_file, "rU") as handle:
        contigs = list(SeqIO.parse(handle, "fasta"))
    sam = sam_parser.SamChain([sam_parser.Samfile(sam_file) for sam_file in sam_files])
    generate_quality.GenerateQuality(contigs, sam)
    pattern_filter = moleculo_filter_contigs.PatternContigFilter(contigs, sam, pattern, rc_pattern)
    length_filter = moleculo_filter_contigs.ContigLengthFilter(1500)
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    pattern_breaker = break_by_coverage.PatternBreaker(pattern, rc_pattern, 150)
    n_breaker = break_by_coverage.NBreaker(3)
    result = SplitAndFilter(contigs, coverage_breaker, length_filter, n_breaker, pattern_breaker, pattern_filter)
    OutputResults(output_file, "fasta", result)
    OutputResults(output_file, "fastq", result)
    log.info("===== Postprocessing finished. Results can be found in " + output_file + ".fastq")
def draw(contigs_file, output_dir, k):
    aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments")))
    CreateLog(output_dir)
    print "Reading contigs"
    tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")),
                 key=lambda contig: len(contig))
    lens = map(len, tmp)[::-1]
    print lens
    contigs = ContigStorage()
    if lens[1::2] == lens[0::2]:
        tmp = tmp[0::2]
        print "Removed extra contigs"
    for i, contig in enumerate(tmp):
        print i, contig
        contigs.add(Contig(contig.seq, str(i)))
    print "Constructing components"
    componenets = ExtractRepeatComponents(contigs, aligner, k)
    print "Components:"
    for comp in componenets:
        print comp.segments
        print comp.alignments
    for cnt, comp in enumerate(componenets):
        print "Processing component", cnt
        print comp.segments
        # print comp.alignments
        print "Forming blocks"
        Block.id_cnt = 0
        blocks = CreateBlocks(comp)
        if len(blocks) == 1:
            print "Skipping trivial repeat"
            continue
        for block in blocks:
            print "Block", block.id, ":", block.segs
        for block in blocks:
            for other in block.out:
                print block.id, "->", other.id
        print "Placing blocks on X axis"
        code = placeX(blocks)
        if code == 1:
            print "WARNING: component", cnt, "contains cycle. Aborting visualization."
            continue
        print "Placing blocks on Y axis"
        placeY(blocks, comp.segments)
        print "Printing figure"
        SimplePrinter().printBlocks(blocks, sys.stdout)
        print "Finished printing figure"
示例#29
0
    # NOTE(review): the enclosing definition and the assignments of
    # reads_file / consensus_file are outside this excerpt.
    # Working directory is the first command-line argument; arguments from
    # the fifth onwards are optional mode flags.
    dir = sys.argv[1]
    extra_params = sys.argv[4:]
    CreateLog(dir)
    dd = DirDistributor(dir)
    aligner = Aligner(dd)
    polisher = Polisher(aligner, dd)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"),
                                          num_names=False)
    ref = ContigStorage().loadFromFasta(open(consensus_file, "r"),
                                        num_names=False)
    if "accurate" in extra_params:
        # "accurate" mode: polish each reference contig separately, using
        # only the overlap alignments that target it.
        res = []
        # groupby only merges adjacent items, so sort by target id first.
        als = sorted(aligner.overlapAlign(reads, ref),
                     key=lambda al: al.seg_to.contig.id)
        for rid, rals in itertools.groupby(als,
                                           key=lambda al: al.seg_to.contig.id):
            # Only contigs whose id passes basic.isCanonocal are polished.
            if basic.isCanonocal(rid):
                contig = ref[rid]
                corrected_seq = polisher.polishSegment(
                    contig.asSegment(), list(rals)).seg_from.Seq()
                res.append(Contig(corrected_seq, rid))
    else:
        # Default mode: polish all unique reference contigs in one call.
        res = polisher.polishMany(reads, list(ref.unique()))
    # Save the polished contigs, then align the reads back to them.
    res_file = os.path.join(dir, "res.fasta")
    rf = open(res_file, "w")
    for c in res:
        SeqIO.write(c, rf, "fasta")
    rf.close()
    aligner.align_files(res_file, [reads_file], 16, "pacbio", "overlap",
                        os.path.join(dir, "res.sam"))
示例#30
0
# Script body: select FASTA records (sys.argv[1]) that are mentioned in a
# dump file (sys.argv[2]) and print them to stdout with annotated names.
# NOTE(review): `repeat` is only assigned by a "#Repeat ..." header line and
# `interest` is not defined in this excerpt -- both presumably come from
# earlier in the file; confirm.
cur = None
dump = open(sys.argv[2]).readlines()
# Maps an entry name to a list of (repeat, section, sign) tuples.
d = dict()
for s in dump:
    s = s.strip()
    if s == "":
        continue
    if s.startswith("#"):
        # Header line: from here on s is the list of its tokens.
        s = s[1:].split()
        if s[0] == "Repeat":
            repeat = s[1]
        if s[0] in ["All", "Input", "Output"]:
            # The second token of a section header becomes the current section.
            cur = s[1]
    else:
        # Entry line: only entries of repeats listed in `interest` are kept.
        if repeat not in interest:
            continue
        # The first character is the sign; the rest is the entry name.
        sign = s[0]
        s = s[1:]
        if s not in d:
            d[s] = []
        d[s].append((repeat, cur, sign))
for rec in SeqIO.parse_fasta(open(sys.argv[1])):
    # Only the first whitespace-separated token of the record id is matched.
    id = rec.id.split()[0]
    if id in d:
        tmp = d[id]
        # Apply RC to the sequence when any entry for this id was recorded
        # in section "reads" with sign "-".
        if ("reads", "-") in [(a[1], a[2]) for a in tmp]:
            rec.seq = RC(rec.seq)
        SeqIO.write(
            common.seq_records.SeqRecord(rec.seq, id + "_" + str(d[id])),
            sys.stdout, "fasta")
        # Echo the generated record name on stderr.
        sys.stderr.write(id + "_" + str(d[id]) + "\n")