Example #1
    def run(self):
        """
          1. extract contigs.fasta and read-contig.sam
          2. run pile up
        """
        contigs, _scaffolds, read_contig_sam, _stats = self.input_files_local[0]
        coverage_json, coverage_summary_csv = self.output_files_local()

        # Not enough contig data: write placeholder outputs and stop early.
        if os.path.getsize(contigs) < MIN_CONTIG_FILE_SIZE:
            command.write_text_to_file('{}', coverage_json)
            command.write_text_to_file('No Contigs', coverage_summary_csv)
            return

        # generate bam files
        bam_file = read_contig_sam.replace(".sam", ".bam")
        command.execute(
            command_patterns.ShellScriptCommand(
                script=r'''samtools view -S -b "${read_contig_sam}" | samtools sort - -o "${bam_file}";''',
                named_args={
                    'read_contig_sam': read_contig_sam,
                    'bam_file': bam_file
                }))
        command.execute(
            command_patterns.SingleCommand(cmd="samtools",
                                           args=["index", bam_file]))
        # run coverage info
        output_csv, output_json = self.calc_contig2coverage(bam_file)
        os.rename(output_csv, coverage_summary_csv)
        os.rename(output_json, coverage_json)
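The calc_contig2coverage helper is not shown in this example. As a rough, hedged illustration of the kind of pileup-based coverage computation it performs (the helper name below is hypothetical, pysam is an assumed dependency, and the real method also writes the CSV/JSON outputs):

import pysam  # assumed dependency, not imported by the step above

def _mean_coverage_per_contig(bam_file):
    # Average pileup depth per contig; positions with no coverage are not
    # yielded by pileup(), so they effectively count as depth 0.
    coverage = {}
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        for contig, length in zip(bam.references, bam.lengths):
            depth_sum = sum(col.nsegments for col in bam.pileup(contig))
            coverage[contig] = depth_sum / length if length else 0.0
    return coverage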
Example #2
 def validate(self):
     ''' Make sure all the output files are generated. '''
     for f in self.output_files_local():
         if not os.path.exists(f):
             raise RuntimeError("output file %s should be generated after run" % f)
         # Tag the done files
         done_file = self.done_file(f)
         fmt_now = datetime.datetime.now(tz=pytz.UTC).strftime("%a %b %e %H:%M:%S %Z %Y")
         command.write_text_to_file(fmt_now, done_file)
     self.count_reads()
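The done_file() helper used above is not shown in this example. A minimal sketch of the convention it suggests (an assumption, not necessarily the actual idseq_dag implementation) is a sibling marker path derived from the output file name:

@staticmethod
def done_file(filename):
    # Marker written next to an output file once it has been fully generated.
    return "%s.done" % filename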
Example #3
    def run(self):
        '''
            Dummy implementation. just copy the files over.
            Real thing to be implemented later.
        '''
        input_files = self.input_files_local[0]
        output_files = self.output_files_local()
        for i in range(len(input_files)):
            command.copy_file(input_files[i], output_files[i])

        command.write_text_to_file('1234', output_files[4])
Example #4
 def fetch_input_files_from_s3(input_files, input_dir_s3, result_dir_local):
     for f in input_files:
         s3_file = os.path.join(input_dir_s3, f)
         local_file = os.path.join(result_dir_local, f)
         local_dir = os.path.dirname(local_file)
         command.make_dirs(local_dir)
         # copy the file over
         output_file = idseq_dag.util.s3.fetch_from_s3(s3_file, local_dir, allow_s3mi=True)
         if output_file:
             # write the done_file
             done_file = PipelineStep.done_file(local_file)
             fmt_now = datetime.datetime.now(tz=pytz.UTC).strftime("%a %b %e %H:%M:%S %Z %Y")
             command.write_text_to_file(fmt_now, done_file)
         else:
             raise RuntimeError(f"{s3_file} likely doesn't exist")
Example #5
    def make_star_index(fasta_file, gtf_file, output_star_genome_path,
                        max_star_part_size):
        # Drop the trailing ".tar" extension to get the genome directory name.
        star_genome_dir_name = output_star_genome_path[:-4]

        # star genome organization
        # STAR_genome/part-${i}, parts.txt
        fasta_file_list = []
        if max_star_part_size and os.path.getsize(fasta_file) > max_star_part_size:
            fasta_file_list = PipelineStepGenerateHostGenome.split_fasta(
                fasta_file, max_star_part_size)
        else:
            fasta_file_list.append(fasta_file)

        for i in range(len(fasta_file_list)):
            log.write("start making STAR index part %d" % i)
            gtf_command_part = []
            if i == 0 and gtf_file:
                gtf_command_part = ["--sjdbGTFfile", gtf_file]

            star_genome_part_dir = f"{star_genome_dir_name}/part-{i}"

            command.make_dirs(star_genome_part_dir)
            star_command_params = [
                '--runThreadN', str(multiprocessing.cpu_count()),
                '--runMode', 'genomeGenerate',
                *gtf_command_part,
                '--genomeDir', star_genome_part_dir,
                '--genomeFastaFiles', fasta_file_list[i],
                '--limitGenomeGenerateRAM', virtual_memory().available
            ]
            command.execute(
                command_patterns.SingleCommand(cmd='STAR',
                                               args=star_command_params))
            log.write(f"finished making STAR index part {i}")
        # record the number of parts into parts.txt
        command.write_text_to_file(
            len(fasta_file_list),
            os.path.join(star_genome_dir_name, "parts.txt"))
        star_genome = os.path.basename(star_genome_dir_name)
        star_work_dir = os.path.dirname(star_genome_dir_name)
        command.execute(
            command_patterns.SingleCommand(cmd="tar",
                                           args=[
                                               "cvf", output_star_genome_path,
                                               "-C", star_work_dir, star_genome
                                           ]))
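For context, a downstream consumer could recover the index layout from parts.txt; a hedged sketch (it assumes the archive produced above has been extracted so that star_genome_dir_name exists again, and the variable names are illustrative):

with open(os.path.join(star_genome_dir_name, "parts.txt")) as f:
    num_parts = int(f.read().strip())
part_dirs = [os.path.join(star_genome_dir_name, f"part-{i}") for i in range(num_parts)]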
Example #6
    def assemble(
            input_fasta,
            input_fasta2,
            bowtie_fasta,  # fasta file for running bowtie against contigs
            duplicate_cluster_sizes_path,
            assembled_contig,
            assembled_scaffold,
            bowtie_sam,
            contig_stats,
            read2contig,
            memory=100):
        basedir = os.path.dirname(assembled_contig)
        assembled_dir = os.path.join(basedir, 'spades')
        command.make_dirs(assembled_dir)
        assembled_contig_tmp = os.path.join(assembled_dir, 'contigs.fasta')
        assembled_scaffold_tmp = os.path.join(assembled_dir, 'scaffolds.fasta')

        try:
            if input_fasta2:
                command.execute(
                    command_patterns.SingleCommand(
                        cmd="spades.py",
                        args=[
                            "-1", input_fasta,
                            "-2", input_fasta2,
                            "-o", assembled_dir,
                            "-m", memory,
                            "-t", 32,
                            "--only-assembler"
                        ]))
            else:
                command.execute(
                    command_patterns.SingleCommand(
                        cmd="spades.py",
                        args=[
                            "-s", input_fasta,
                            "-o", assembled_dir,
                            "-m", memory,
                            "-t", 32,
                            "--only-assembler"
                        ]))
            command.move_file(assembled_contig_tmp, assembled_contig)
            command.move_file(assembled_scaffold_tmp, assembled_scaffold)

            PipelineStepRunAssembly.generate_read_to_contig_mapping(
                assembled_contig, bowtie_fasta, read2contig,
                duplicate_cluster_sizes_path, bowtie_sam, contig_stats)
        except Exception:
            # Assembly failed: create dummy output files and log the traceback.
            command.write_text_to_file(';ASSEMBLY FAILED', assembled_contig)
            command.write_text_to_file(';ASSEMBLY FAILED', assembled_scaffold)
            command.write_text_to_file('@NO INFO', bowtie_sam)
            command.write_text_to_file('{}', contig_stats)
            traceback.print_exc()
        command.remove_rf(assembled_dir)
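The ';ASSEMBLY FAILED' placeholders written above are only a few bytes long; downstream steps (see the run() in the next example) detect the failure with a size guard along these lines:

# MIN_ASSEMBLED_CONTIG_SIZE is a small byte threshold defined elsewhere
if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE:
    # treat as "no assembled results" and emit placeholder outputs
    ...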
Example #7
    def run(self):
        '''
            1. summarize hits
            2. build blast index
            3. blast assembled contigs against the index
            4. update the summary
        '''
        _align_m8, deduped_m8, hit_summary, orig_counts_with_dcr = self.input_files_local[0]
        assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
        reference_fasta, = self.input_files_local[2]
        duplicate_cluster_sizes_path, = self.input_files_local[3]

        blast_m8, refined_m8, refined_hit_summary, refined_counts_with_dcr, contig_summary_json, blast_top_m8 = self.output_files_local()

        assert refined_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
        assert orig_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()

        db_type = self.additional_attributes["db_type"]
        no_assembled_results = (
            os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or
            os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE)

        if no_assembled_results:
            # No assembled results or refseq fasta available.
            # Create empty output files.
            command.write_text_to_file(' ', blast_m8)
            command.write_text_to_file(' ', blast_top_m8)
            command.copy_file(deduped_m8, refined_m8)
            command.copy_file(hit_summary, refined_hit_summary)
            command.copy_file(orig_counts_with_dcr, refined_counts_with_dcr)
            command.write_text_to_file('[]', contig_summary_json)
            return  # early exit: no assembled contigs or reference fasta to blast

        (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
        PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig, reference_fasta, blast_top_m8)
        read2contig = {}
        PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig, duplicate_cluster_sizes_path)

        (updated_read_dict, read2blastm8, contig2lineage, added_reads) = self.update_read_dict(
            read2contig, blast_top_m8, read_dict, accession_dict, db_type)
        self.generate_m8_and_hit_summary(updated_read_dict, added_reads, read2blastm8,
                                         hit_summary, deduped_m8,
                                         refined_hit_summary, refined_m8)

        # Generating taxon counts based on updated results
        lineage_db = s3.fetch_reference(
            self.additional_files["lineage_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small to waste s3mi

        deuterostome_db = None
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = s3.fetch_reference(self.additional_files["deuterostome_db"],
                                                 self.ref_dir_local, allow_s3mi=False)  # Too small for s3mi

        blacklist_s3_file = self.additional_files.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
        taxon_blacklist = s3.fetch_reference(blacklist_s3_file, self.ref_dir_local)

        taxon_whitelist = None
        if self.additional_attributes.get("use_taxon_whitelist"):
            taxon_whitelist = s3.fetch_reference(self.additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3),
                                                 self.ref_dir_local)

        with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
            with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_count_json_from_m8", "db_type": db_type, "refined_counts": refined_counts_with_dcr}):
                m8.generate_taxon_count_json_from_m8(refined_m8, refined_hit_summary, db_type.upper(),
                                                     lineage_db, deuterostome_db, taxon_whitelist, taxon_blacklist,
                                                     duplicate_cluster_sizes_path, refined_counts_with_dcr)

        # generate contig stats at genus/species level
        with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary"}):
            contig_taxon_summary = self.generate_taxon_summary(
                read2contig,
                contig2lineage,
                updated_read_dict,
                added_reads,
                db_type,
                duplicate_cluster_sizes_path,
                # same filter as applied in generate_taxon_count_json_from_m8
                m8.build_should_keep_filter(deuterostome_db, taxon_whitelist, taxon_blacklist)
            )

        with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary_json", "contig_summary_json": contig_summary_json}):
            with open(contig_summary_json, 'w') as contig_outf:
                json.dump(contig_taxon_summary, contig_outf)

        # Upload additional file
        contig2lineage_json = os.path.join(os.path.dirname(contig_summary_json), f"contig2lineage.{db_type}.json")
        with log.log_context("PipelineStepBlastContigs", {"substep": "contig2lineage_json", "contig2lineage_json": contig2lineage_json}):
            with open(contig2lineage_json, 'w') as c2lf:
                json.dump(contig2lineage, c2lf)

        self.additional_output_files_hidden.append(contig2lineage_json)
Example #8
    def test_write_text_to_file(self):
        '''WHEN write_text_to_file is invoked, THEN write to the file the provided string'''
        command.write_text_to_file("done", TMP_FILE)

        self.assertEqual(file_contents(TMP_FILE), "done\n")
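The trailing "\n" asserted above implies write_text_to_file appends a newline. A minimal sketch consistent with that behavior (an assumption about the implementation, reusing the ShellScriptCommand pattern from Example #1, not necessarily the real idseq_dag code):

def write_text_to_file(text, file_path):
    # echo supplies the trailing newline that the test above asserts on
    command.execute(
        command_patterns.ShellScriptCommand(
            script=r'''echo "${text}" > "${file_path}";''',
            named_args={'text': text, 'file_path': file_path}))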
Example #9
    def run(self):
        '''
            1. summarize hits
            2. build blast index
            3. blast assembled contigs against the index
            4. update the summary
        '''
        (_align_m8, deduped_m8, hit_summary,
         orig_counts) = self.input_files_local[0]
        assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
        reference_fasta = self.input_files_local[2][0]

        (blast_m8, refined_m8, refined_hit_summary, refined_counts,
         contig_summary_json, blast_top_m8) = self.output_files_local()
        db_type = self.additional_attributes["db_type"]
        if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or \
           os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
            # No assembled results or refseq fasta available.
            # Create empty output files.
            command.write_text_to_file(' ', blast_m8)
            command.write_text_to_file(' ', blast_top_m8)
            command.copy_file(deduped_m8, refined_m8)
            command.copy_file(hit_summary, refined_hit_summary)
            command.copy_file(orig_counts, refined_counts)
            command.write_text_to_file('[]', contig_summary_json)
            return  # early exit: no assembled contigs or reference fasta to blast

        (read_dict, accession_dict,
         _selected_genera) = m8.summarize_hits(hit_summary)
        PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig,
                                           reference_fasta, blast_top_m8)
        read2contig = {}
        contig_stats = defaultdict(int)
        PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig,
                                                       contig_stats)

        (updated_read_dict, read2blastm8, contig2lineage,
         added_reads) = self.update_read_dict(read2contig, blast_top_m8,
                                              read_dict, accession_dict,
                                              db_type)
        self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                         read2blastm8, hit_summary, deduped_m8,
                                         refined_hit_summary, refined_m8)

        # Generating taxon counts based on updated results
        lineage_db = s3.fetch_reference(
            self.additional_files["lineage_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small to waste s3mi
        deuterostome_db = None
        evalue_type = 'raw'
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = s3.fetch_reference(
                self.additional_files["deuterostome_db"],
                self.ref_dir_local,
                allow_s3mi=False)  # Too small for s3mi
        with TraceLock("PipelineStepBlastContigs-CYA",
                       PipelineStepBlastContigs.cya_lock,
                       debug=False):
            with log.log_context(
                    "PipelineStepBlastContigs", {
                        "substep": "generate_taxon_count_json_from_m8",
                        "db_type": db_type,
                        "refined_counts": refined_counts
                    }):
                m8.generate_taxon_count_json_from_m8(
                    refined_m8, refined_hit_summary, evalue_type,
                    db_type.upper(), lineage_db, deuterostome_db,
                    refined_counts)

        # generate contig stats at genus/species level
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_summary"}):
            contig_taxon_summary = self.generate_taxon_summary(
                read2contig, contig2lineage, updated_read_dict, added_reads,
                db_type)

        with log.log_context(
                "PipelineStepBlastContigs", {
                    "substep": "generate_taxon_summary_json",
                    "contig_summary_json": contig_summary_json
                }):
            with open(contig_summary_json, 'w') as contig_outf:
                json.dump(contig_taxon_summary, contig_outf)

        # Upload additional file
        contig2lineage_json = os.path.join(
            os.path.dirname(contig_summary_json),
            f"contig2lineage.{db_type}.json")
        with log.log_context(
                "PipelineStepBlastContigs", {
                    "substep": "contig2lineage_json",
                    "contig2lineage_json": contig2lineage_json
                }):
            with open(contig2lineage_json, 'w') as c2lf:
                json.dump(contig2lineage, c2lf)

        self.additional_output_files_hidden.append(contig2lineage_json)