Пример #1
0
def _run_polisher_only(args):
    """
    Run the standalone polishing pipeline.

    Validates every input read file (must exist and be fasta/fastq/bam,
    optionally gzipped), then invokes the polisher on the target sequence.

    Raises:
        ResumeException: if a read file is missing, has an unsupported
            extension, or more than one bam file is supplied.
    """
    logger.info("Running Flye polisher")
    logger.debug("Cmd: %s", " ".join(sys.argv))

    supported_exts = ("fasta", "fa", "fastq", "fq", "bam")
    bam_input = False
    for read_file in args.reads:
        if not os.path.exists(read_file):
            raise ResumeException("Can't open " + read_file)

        # BUG FIX: str.rstrip(".gz") strips any trailing '.', 'g', 'z'
        # *characters*, not the ".gz" suffix (e.g. "reads.gz" -> "reads"
        # but also mangles names whose stem ends in those characters).
        # Remove the exact suffix instead.
        if read_file.endswith(".gz"):
            without_gz = read_file[:-len(".gz")]
        else:
            without_gz = read_file
        if not without_gz.endswith(supported_exts):
            raise ResumeException(
                "Unsupported input. Supported types: fasta/fastq/bam")
        if without_gz.endswith("bam"):
            bam_input = True

    if bam_input and len(args.reads) > 1:
        raise ResumeException("Only single bam input supported")

    pol.polish(args.polish_target,
               args.reads,
               args.out_dir,
               args.num_iters,
               args.threads,
               args.platform,
               args.read_type,
               output_progress=True)
    logger.info("Done!")
Пример #2
0
def _run_polisher_only(args):
    """
    Run the standalone polisher on the given target sequence.
    """
    logger.info("Running Flye polisher")
    # Use lazy %-style logging (formatting is deferred until the record is
    # actually emitted) -- consistent with the rest of the module; the
    # emitted message text is unchanged.
    logger.debug("Cmd: %s", " ".join(sys.argv))

    pol.polish(args.polish_target, args.reads, args.out_dir,
               args.num_iters, args.threads, args.platform,
               output_progress=True)
Пример #3
0
    def run(self):
        """Polish the input contigs, then build polished graph edges."""
        super(JobPolishing, self).run()

        # make sure the working directory for polishing exists
        if not os.path.isdir(self.polishing_dir):
            os.mkdir(self.polishing_dir)

        pol.polish(self.in_contigs, self.args.reads, self.polishing_dir,
                   self.args.num_iters, self.args.threads,
                   self.args.platform, output_progress=True)

        # the polisher writes one fasta per iteration; pick the last one
        final_fasta = os.path.join(
            self.polishing_dir,
            "polished_{0}.fasta".format(self.args.num_iters))
        pol.generate_polished_edges(self.in_graph_edges, self.in_graph_gfa,
                                    final_fasta, self.polishing_dir,
                                    self.args.platform, self.args.threads)
Пример #4
0
    def run(self):
        """
        Polish contigs, filter them by coverage, and build polished graph
        edges; the unfiltered intermediate fasta is removed at the end.
        """
        super(JobPolishing, self).run()

        if not os.path.isdir(self.polishing_dir):
            os.mkdir(self.polishing_dir)

        raw_contigs, stats_file = pol.polish(
            self.in_contigs, self.args.reads, self.polishing_dir,
            self.args.num_iters, self.args.threads, self.args.platform,
            output_progress=True)

        pol.filter_by_coverage(self.args, stats_file, raw_contigs,
                               self.out_files["stats"],
                               self.out_files["contigs"])
        pol.generate_polished_edges(self.in_graph_edges, self.in_graph_gfa,
                                    self.out_files["contigs"],
                                    self.polishing_dir, self.args.platform,
                                    self.args.threads)

        # the unfiltered polished contigs are no longer needed
        os.remove(raw_contigs)
Пример #5
0
    def run(self):
        """
        Polish contigs, filter them by coverage, and build polished graph
        edges. Fails the pipeline if the filtered contig set is empty.
        """
        super(JobPolishing, self).run()

        if not os.path.isdir(self.polishing_dir):
            os.mkdir(self.polishing_dir)

        raw_contigs, stats_file = pol.polish(
            self.in_contigs, self.args.reads, self.polishing_dir,
            self.args.num_iters, self.args.threads, self.args.platform,
            self.args.read_type, output_progress=True)

        pol.filter_by_coverage(self.args, stats_file, raw_contigs,
                               self.out_files["stats"],
                               self.out_files["contigs"])
        pol.generate_polished_edges(self.in_graph_edges, self.in_graph_gfa,
                                    self.out_files["contigs"],
                                    self.polishing_dir, self.args.platform,
                                    stats_file, self.args.threads)

        # the unfiltered polished contigs are no longer needed
        os.remove(raw_contigs)

        # nothing survived coverage filtering -- abort the pipeline
        if os.path.getsize(self.out_files["contigs"]) == 0:
            raise asm.AssembleException("No contigs were assembled - "
                                        "pipeline stopped")
Пример #6
0
def assemble_short_plasmids(args, work_dir, contigs_path):
    """
    Recover short circular sequences (plasmids) from reads that did not
    map to the main assembly.

    Pipeline: map reads to the existing contigs, extract the unmapped
    reads, self-map them to find circular reads and circular read pairs,
    trim and cluster those into unique plasmid candidates, polish them
    once, and read per-plasmid coverage back from the polisher stats.
    Intermediate files in work_dir are removed before returning.

    Returns a dict: sequence id -> (sequence, coverage), keeping only
    entries with coverage > 0.
    """
    logger.debug("Extracting unmapped reads")
    reads2contigs_mapping = os.path.join(work_dir, "reads2contigs.paf")
    make_alignment(contigs_path, args.reads, args.threads,
                   work_dir, args.platform, reads2contigs_mapping,
                   reference_mode=True, sam_output=False)

    # reads mapping to contigs below this rate are considered unmapped
    unmapped_reads_path = os.path.join(work_dir, "unmapped_reads.fasta")
    unmapped.extract_unmapped_reads(args, reads2contigs_mapping,
                                    unmapped_reads_path,
                                    mapping_rate_threshold=0.5)

    logger.debug("Finding self-mappings for unmapped reads")
    unmapped_reads_mapping = os.path.join(work_dir, "unmapped_ava.paf")
    make_alignment(unmapped_reads_path, [unmapped_reads_path], args.threads,
                   work_dir, args.platform, unmapped_reads_mapping,
                   reference_mode=False, sam_output=False)

    logger.debug("Extracting circular reads")
    circular_reads = circular.extract_circular_reads(unmapped_reads_mapping)
    logger.debug("Extracted %d circular reads", len(circular_reads))

    logger.debug("Extracing circular pairs")
    circular_pairs = circular.extract_circular_pairs(unmapped_reads_mapping)
    logger.debug("Extracted %d circular pairs", len(circular_pairs))

    # Extract only the necessary subset of read sequences (the entire
    # file could be pretty big): first record the ids of interest with a
    # None placeholder, then fill in sequences in a single streaming pass.
    interesting_reads = {}
    for read in circular_reads:
        interesting_reads[read] = None
    for pair in circular_pairs:
        interesting_reads[pair[0].query] = None
        interesting_reads[pair[0].target] = None
    for hdr, seq in fp.stream_sequence(unmapped_reads_path):
        if hdr in interesting_reads:
            interesting_reads[hdr] = seq

    # trim circular sequences down to a single repeat unit and merge
    # both sources (single reads and read pairs) into one fasta
    trimmed_circular_reads = \
        circular.trim_circular_reads(circular_reads, interesting_reads)
    trimmed_circular_pairs = \
        circular.trim_circular_pairs(circular_pairs, interesting_reads)
    trimmed_sequences_path = os.path.join(work_dir, "trimmed_sequences.fasta")
    fp.write_fasta_dict(dict(list(trimmed_circular_reads.items()) +
                             list(trimmed_circular_pairs.items())),
                        trimmed_sequences_path)

    logger.debug("Clustering circular sequences")
    trimmed_sequences_mapping = os.path.join(work_dir, "trimmed.paf")
    make_alignment(trimmed_sequences_path, [trimmed_sequences_path], args.threads,
                   work_dir, args.platform, trimmed_sequences_mapping,
                   reference_mode=False, sam_output=False)

    # collapse clusters of near-identical circles into unique plasmids
    plasmids = \
        circular.extract_unique_plasmids(trimmed_sequences_mapping,
                                         trimmed_sequences_path)

    # a single polishing iteration; we only need the stats file here
    plasmids_raw = os.path.join(work_dir, "plasmids_raw.fasta")
    fp.write_fasta_dict(plasmids, plasmids_raw)
    _, polished_stats = \
        pol.polish(plasmids_raw, [unmapped_reads_path], work_dir, 1,
                   args.threads, args.platform, output_progress=False)

    # extract coverage from the polisher's stats table
    # (columns: seq_id, length, coverage; '#' lines are headers)
    plasmids_with_coverage = {}
    if os.path.isfile(polished_stats):
        with open(polished_stats, "r") as f:
            for line in f:
                if line.startswith("#"): continue
                tokens = line.strip().split()
                seq_id, coverage = tokens[0], int(tokens[2])
                if coverage > 0:
                    plasmids_with_coverage[seq_id] = plasmids[seq_id], coverage

    logger.info("Added %d extra contigs", len(plasmids_with_coverage))

    # remove all unnecessary intermediate files
    os.remove(reads2contigs_mapping)
    os.remove(unmapped_reads_path)
    os.remove(unmapped_reads_mapping)
    os.remove(trimmed_sequences_path)
    os.remove(trimmed_sequences_mapping)

    return plasmids_with_coverage
Пример #7
0
def assemble_short_plasmids(args, work_dir, contigs_path):
    """
    Recover short circular sequences (plasmids) from reads that did not
    map to the main assembly.

    Pipeline: map reads to the existing contigs, extract the unmapped
    reads, self-map them to find circular reads and circular read pairs,
    trim and cluster those into unique plasmid candidates, polish them
    once, and read per-plasmid coverage back from the polisher stats.
    Intermediate files in work_dir are removed before returning.

    Returns a dict: sequence id -> (sequence, coverage), keeping only
    entries with coverage > 0.
    """
    logger.debug("Assembling short plasmids")

    reads2contigs_mapping = os.path.join(work_dir, "reads2contigs.paf")
    make_alignment(contigs_path,
                   args.reads,
                   args.threads,
                   work_dir,
                   args.platform,
                   reads2contigs_mapping,
                   reference_mode=True,
                   sam_output=False)

    logger.debug("Extracting unmapped reads")
    unmapped_reads, n_processed_reads = \
        unmapped.extract_unmapped_reads(args, reads2contigs_mapping,
                                        mapping_rate_threshold=0.5)

    n_unmapped_reads = len(unmapped_reads)
    # BUG FIX: guard against ZeroDivisionError when no reads were processed
    if n_processed_reads:
        unmapped_reads_ratio = round(
            100 * float(n_unmapped_reads) / n_processed_reads, 1)
    else:
        unmapped_reads_ratio = 0.0
    # lazy %-style logging; the rendered message text is unchanged
    logger.debug("Extracted %s unmapped reads (%s %%)",
                 n_unmapped_reads, unmapped_reads_ratio)

    unmapped_reads_path = os.path.join(work_dir, "unmapped_reads.fasta")
    fp.write_fasta_dict(unmapped_reads, unmapped_reads_path)

    unmapped_reads_mapping = os.path.join(work_dir, "unmapped_ava.paf")

    logger.debug("Finding self-mappings for unmapped reads")
    make_alignment(unmapped_reads_path, [unmapped_reads_path],
                   args.threads,
                   work_dir,
                   args.platform,
                   unmapped_reads_mapping,
                   reference_mode=False,
                   sam_output=False)

    logger.debug("Extracting circular reads")
    circular_reads = circular.extract_circular_reads(unmapped_reads_mapping)
    logger.debug("Extracted %s circular reads", len(circular_reads))

    # (typo "Extracing" fixed in the log message)
    logger.debug("Extracting circular pairs")
    circular_pairs = circular.extract_circular_pairs(unmapped_reads_mapping)
    logger.debug("Extracted %s circular pairs", len(circular_pairs))

    logger.debug("Extracting unique plasmids from circular sequences")
    trimmed_circular_reads = \
        circular.trim_circular_reads(circular_reads, unmapped_reads)
    trimmed_circular_pairs = \
        circular.trim_circular_pairs(circular_pairs, unmapped_reads)
    trimmed_sequences_path = os.path.join(work_dir, "trimmed_sequences.fasta")

    # BUG FIX: in Python 3, dict.items() returns a view and views cannot be
    # concatenated with "+"; materialize both item lists before merging.
    fp.write_fasta_dict(dict(list(trimmed_circular_reads.items()) +
                             list(trimmed_circular_pairs.items())),
                        trimmed_sequences_path)

    trimmed_sequences_mapping = os.path.join(work_dir, "trimmed.paf")

    make_alignment(trimmed_sequences_path, [trimmed_sequences_path],
                   args.threads,
                   work_dir,
                   args.platform,
                   trimmed_sequences_mapping,
                   reference_mode=False,
                   sam_output=False)

    # collapse clusters of near-identical circles into unique plasmids
    plasmids = \
        circular.extract_unique_plasmids(trimmed_sequences_mapping,
                                         trimmed_sequences_path)

    # a single polishing iteration over the raw plasmid sequences
    plasmids_raw = os.path.join(work_dir, "plasmids_raw.fasta")
    fp.write_fasta_dict(plasmids, plasmids_raw)
    pol.polish(plasmids_raw, [unmapped_reads_path],
               work_dir,
               1,
               args.threads,
               args.platform,
               output_progress=False)

    # extract coverage from the polisher's stats table
    # (columns: seq_id, length, coverage; header line starts with "seq")
    plasmids_with_coverage = {}
    stats_path = os.path.join(work_dir, "contigs_stats.txt")
    if os.path.isfile(stats_path):
        with open(stats_path, "r") as f:
            for line in f:
                if line.startswith("seq"): continue
                tokens = line.strip().split()
                seq_id, coverage = tokens[0], int(tokens[2])
                if coverage > 0:
                    plasmids_with_coverage[seq_id] = plasmids[seq_id], coverage

    logger.info("Added %s extra contigs", len(plasmids_with_coverage))

    # remove all unnecessary intermediate files
    os.remove(reads2contigs_mapping)
    os.remove(unmapped_reads_path)
    os.remove(unmapped_reads_mapping)
    os.remove(trimmed_sequences_path)
    os.remove(trimmed_sequences_mapping)

    return plasmids_with_coverage