Example #1
    def workflow(self):

        # Input files are either located in SRA or AWS S3
        assert self.input_location in ["SRA", "S3"]

        # Read in the metadata sheet
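        # (an illustrative metadata sheet is sketched just after this example)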
        metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)

        for col_name in [self.input_column_name, self.sample_column_name]:
            assert col_name in metadata.columns, "{} not found in {}".format(
                col_name, self.metadata_fp)
            # Make sure that all samples and files are unique
            assert metadata[col_name].unique().shape[0] == metadata.shape[0]

        # Make tasks that will make sure the reference databases exist
        ref_fasta = self.new_task("load_ref_fasta",
                                  LoadFile,
                                  path=self.ref_fasta)

        # Keep track of all of the jobs for getting the input files
        tasks_load_inputs = {}

        # Keep track of all of the jobs for aligning against the FASTA
        tasks_align_bwa = {}

        # Iterate over all of the rows of samples
        for ix, r in metadata.iterrows():

            # Get the sample name and the file location
            sample_name = r[self.sample_column_name]
            input_path = r[self.input_column_name]

            # If the inputs are on SRA, execute jobs that will download them
            if self.input_location == "SRA":

                tasks_load_inputs[sample_name] = self.new_task(
                    "download_from_sra_{}".format(sample_name),
                    ImportSRAFastq,
                    sra_accession=input_path,
                    base_s3_folder=self.base_s3_folder,
                    containerinfo=sl.ContainerInfo(
                        vcpu=1,
                        mem=4096,
                        engine=self.engine,
                        aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                        aws_jobRoleArn=self.aws_job_role_arn,
                        aws_batch_job_queue=self.aws_batch_job_queue,
                        mounts={
                            "/docker_scratch": {
                                "bind": self.temp_folder,
                                "mode": "rw"
                            }
                        }))
            else:
                # Make sure the file exists on S3
                assert self.input_location == "S3"
                tasks_load_inputs[sample_name] = self.new_task(
                    "load_from_s3_{}".format(sample_name),
                    LoadFile,
                    path=input_path)

            # Make a task to align the reads, wherever they came from
            tasks_align_bwa[sample_name] = self.new_task(
                "align_bwa_{}".format(sample_name),
                AlignFastqTask,
                ref_name=self.ref_name,
                base_s3_folder=self.base_s3_folder,
                sample_name=sample_name,
                threads=self.align_threads,
                temp_folder=self.temp_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=int(self.align_threads),
                    mem=int(self.align_mem),
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
        # Assign the output from tasks_load_inputs to the input to tasks_align_bwa
        for sample_name in tasks_load_inputs:
            assert sample_name in tasks_align_bwa

            # Assign the input for the reference database
            tasks_align_bwa[sample_name].in_ref_fasta = ref_fasta.out_file
            tasks_align_bwa[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_fastq

        return tasks_align_bwa
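
The metadata sheet read at the top of this example (and in Examples #2, #4, and #9 below) is a delimited table with one row per sample, in which both the sample column and the input column must be unique. A minimal sketch of building such a sheet follows; the column names "sample" and "fastq" are placeholders for whatever is passed as sample_column_name and input_column_name.

import pandas as pd

# Illustrative only: two samples, with the input column holding S3 paths.
# When input_location == "SRA" the same column would hold SRA accessions instead.
metadata = pd.DataFrame({
    "sample": ["sample_A", "sample_B"],
    "fastq": [
        "s3://example-bucket/inputs/sample_A.fastq.gz",
        "s3://example-bucket/inputs/sample_B.fastq.gz",
    ],
})
metadata.to_csv("metadata.tsv", sep="\t", index=False)

# The same uniqueness checks the workflow applies after reading the sheet:
for col_name in ["sample", "fastq"]:
    assert metadata[col_name].unique().shape[0] == metadata.shape[0]
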
Example #2
    def workflow(self):

        # Read in the metadata sheet
        metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)

        for col_name in [self.input_column_name, self.sample_column_name]:
            assert col_name in metadata.columns, "{} not found in {}".format(
                col_name, self.metadata_fp)
            # Make sure that all samples and files are unique
            assert metadata[col_name].unique().shape[0] == metadata.shape[0]

        # Keep track of the jobs for each step, for each sample
        tasks_load_inputs = {}
        tasks_fastqp = {}
        tasks_humann = {}

        # Iterate over all of the rows of samples
        for ix, r in metadata.iterrows():

            # Get the sample name and the file location
            sample_name = r[self.sample_column_name]
            input_path = r[self.input_column_name]

            # Make a UUID to isolate temp files for this task from any others
            task_uuid = str(uuid.uuid4())[:8]

            # 1. LOAD THE INPUT FILES

            tasks_load_inputs[sample_name] = self.new_task(
                "load_from_s3_{}".format(sample_name),
                LoadFile,
                path=input_path)

            # 2. CALCULATE FASTQ QUALITY METRICS
            tasks_fastqp[sample_name] = self.new_task(
                "fastqp_{}".format(sample_name),
                FastqpTask,
                summary_path=os.path.join(self.base_s3_folder, "fastqp",
                                          sample_name + ".fastqp.tsv"),
                input_mount_point="/scratch/{}_fastqp/input/".format(
                    task_uuid),
                output_mount_point="/scratch/{}_fastqp/output/".format(
                    task_uuid),
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=10000,
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "fastqp_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))

            # 3. ANALYZE WITH HUMAnN2
            tasks_humann[sample_name] = self.new_task(
                "humann2_{}".format(sample_name),
                HUMAnN2Task,
                sample_name=sample_name,
                output_folder=os.path.join(self.base_s3_folder, "humann2"),
                threads=self.humann2_threads,
                ref_db=self.humann2_ref_db,
                temp_folder=self.temp_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=int(self.humann2_threads),
                    mem=int(self.humann2_mem),
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "humann2_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        },
                        "/refdbs": {
                            "bind": "/refdbs",
                            "mode": "ro"
                        }
                    }))

        # Assign the output from tasks_load_inputs to the inputs of tasks_fastqp and tasks_humann
        for sample_name in tasks_load_inputs:
            assert sample_name in tasks_fastqp
            tasks_fastqp[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file

            assert sample_name in tasks_humann
            tasks_humann[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file

        return tasks_fastqp, tasks_humann
Example #3
    def workflow(self):
        light_containerinfo = sl.ContainerInfo()
        light_containerinfo.from_config(section='light')
        highmem_containerinfo = sl.ContainerInfo()
        highmem_containerinfo.from_config(section='highmem')
        heavy_containerinfo = sl.ContainerInfo()
        heavy_containerinfo.from_config(section='heavy')
        midcpu_containerinfo = sl.ContainerInfo()
        midcpu_containerinfo.from_config(section='midcpu')

        #
        #  Load the manifest of files
        #
        manifest = self.new_task(
            'load_manifest',
            LoadManifest,
            path=self.manifest,
        )

        # For each specimen....
        specimen_tasks = defaultdict(dict)
        specimens = manifest.get_specimens()
        for specimen in specimens:
            # Load the specimen reads.
            specimen_tasks[specimen]['reads'] = self.new_task(
                'specimen_load_{}'.format(specimen),
                LoadSpecimenReads,
                specimen=specimen)
            specimen_tasks[specimen]['reads'].in_manifest = manifest.out_file
            if self.barcodecop and "I1" in specimen_tasks[specimen][
                    'reads'].out_reads() and manifest.is_paired():
                specimen_tasks[specimen]['verified_reads'] = self.new_task(
                    'specimen_bcc_{}'.format(specimen),
                    BCCSpecimenReads,
                    containerinfo=light_containerinfo,
                    specimen=specimen,
                    path=os.path.join(self.working_dir, 'sv', 'bcc'))
                specimen_tasks[specimen][
                    'verified_reads'].in_reads = specimen_tasks[specimen][
                        'reads'].out_reads
            else:
                specimen_tasks[specimen]['verified_reads'] = specimen_tasks[
                    specimen]['reads']

            # DADA2 filter and trim
            specimen_tasks[specimen]['dada2_ft'] = self.new_task(
                'dada2_ft_{}'.format(specimen),
                DADA2_FilterAndTrim,
                containerinfo=light_containerinfo,
                specimen=specimen,
                f_trunc=self.truncLenF,
                r_trunc=self.truncLenR,
                trim_left=self.trimLeft,
                maxN=self.maxN,
                maxEE=self.maxEE,
                path=os.path.join(self.working_dir, 'sv', 'dada2', 'ft'))
            specimen_tasks[specimen]['dada2_ft'].in_reads = specimen_tasks[
                specimen]['verified_reads'].out_reads

            specimen_tasks[specimen]['dada2_derep'] = self.new_task(
                'dada2_derep_{}'.format(specimen),
                DADA2_Dereplicate,
                containerinfo=light_containerinfo,
                specimen=specimen,
                path=os.path.join(self.working_dir, 'sv', 'dada2', 'derep'))
            specimen_tasks[specimen]['dada2_derep'].in_reads = specimen_tasks[
                specimen]['dada2_ft'].out_reads

        # Now we need the specimens grouped by batch to create error models.
        batch_errModels = {}
        for batch, batched_specimens in manifest.batched_specimens():
            batch_errModels[batch] = self.new_task(
                'dada2_learn_error_batch_{}'.format(batch),
                DADA2_LearnError,
                containerinfo=midcpu_containerinfo,
                batch=batch,
                tar_reads=False,
                path=os.path.join(self.working_dir, 'sv', 'dada2', 'errM'))
            batch_errModels[batch].in_reads = [
                specimen_tasks[s]['dada2_ft'].out_reads for s in specimen_tasks
                if s in batched_specimens
            ]
            for specimen in batched_specimens:
                specimen_tasks[specimen]['dada2_errM'] = batch_errModels[batch]

        # Back to for each specimen...
        for specimen in specimens:
            # DADA
            specimen_tasks[specimen]['dada2_dada'] = self.new_task(
                'dada2_dada_{}'.format(specimen),
                DADA2_DADA,
                containerinfo=midcpu_containerinfo,
                specimen=specimen,
                path=os.path.join(self.working_dir, 'sv', 'dada2', 'dada'))
            specimen_tasks[specimen]['dada2_dada'].in_derep = specimen_tasks[
                specimen]['dada2_derep'].out_rds
            specimen_tasks[specimen]['dada2_dada'].in_errM = specimen_tasks[
                specimen]['dada2_errM'].out_rds

            # MERGE
            specimen_tasks[specimen]['dada2_merge'] = self.new_task(
                'dada2_merge_{}'.format(specimen),
                DADA2_Merge,
                containerinfo=light_containerinfo,
                specimen=specimen,
                path=os.path.join(self.working_dir, 'sv', 'dada2', 'merged'))
            specimen_tasks[specimen]['dada2_merge'].in_dada = specimen_tasks[
                specimen]['dada2_dada'].out_rds
            specimen_tasks[specimen]['dada2_merge'].in_derep = specimen_tasks[
                specimen]['dada2_derep'].out_rds

            # Seqtab
            specimen_tasks[specimen]['dada2_seqtab'] = self.new_task(
                'dada2_seqtab_{}'.format(specimen),
                DADA2_Specimen_Seqtab,
                containerinfo=light_containerinfo,
                specimen=specimen,
                path=os.path.join(self.working_dir, 'sv', 'dada2', 'seqtab'))
            specimen_tasks[specimen]['dada2_seqtab'].in_merge = specimen_tasks[
                specimen]['dada2_merge'].out_rds

        # Combine seqtabs by batch
        batch_seqtab = {}
        for batch, batched_specimens in manifest.batched_specimens():
            batch_seqtab[batch] = self.new_task(
                'dada2_combine_seqtabs_{}'.format(batch),
                DADA2_Combine_Seqtabs,
                containerinfo=light_containerinfo,
                fn=os.path.join(
                    self.working_dir,
                    'sv',
                    'dada2',
                    'seqtab',
                    'batches',
                    'seqtab.{}.combined.rds'.format(batch),
                ))
            batch_seqtab[batch].in_seqtabs = [
                specimen_tasks[s]['dada2_seqtab'].out_rds
                for s in specimen_tasks if s in batched_specimens
            ]
        # Now combine all the batch_seqtabs into one master seqtab
        combined_seqtab = self.new_task('dada2_combine_seqtabs',
                                        DADA2_Combine_Seqtabs,
                                        containerinfo=highmem_containerinfo,
                                        fn=os.path.join(
                                            self.working_dir, 'sv', 'dada2',
                                            'seqtab.combined.rds'))
        combined_seqtab.in_seqtabs = [
            st.out_rds for st in batch_seqtab.values()
        ]

        combined_seqtab_nochim = self.new_task(
            'dada2_remove_chimera',
            DADA2_Remove_Chimera,
            containerinfo=heavy_containerinfo,
            fn_rds=os.path.join(self.working_dir, 'sv', 'dada2',
                                'seqtab.combined.nochim.rds'),
            fn_csv=os.path.join(self.destination_dir,
                                'seqtab.combined.nochim.csv'))
        combined_seqtab_nochim.in_seqtab = combined_seqtab.out_rds

        dada2_sv_to_pplacer = self.new_task('dada2_sv_to_pplacer',
                                            DADA2_SV_to_PPlacer,
                                            containerinfo=light_containerinfo,
                                            fasta_fn=os.path.join(
                                                self.destination_dir,
                                                'dada2.sv.fasta',
                                            ),
                                            weights_fn=os.path.join(
                                                self.destination_dir,
                                                'dada2.sv.weights.csv',
                                            ),
                                            map_fn=os.path.join(
                                                self.destination_dir,
                                                'dada2.sv.map.csv',
                                            ))
        dada2_sv_to_pplacer.in_seqtab_csv = combined_seqtab_nochim.out_csv

        return (dada2_sv_to_pplacer)
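
Examples #3, #6, and #8 fill their ContainerInfo objects from named config sections via from_config(section=...) rather than constructing them inline. For comparison, here is a sketch of an equivalent inline construction using the same keyword arguments that appear explicitly in Example #7 below; the resource tier name, AWS identifiers, and paths are placeholders, and the config file that from_config reads is defined by this sciluigi fork rather than shown here.

import os
import sciluigi as sl

# Roughly what a 'light' resource tier might look like when written out by hand.
# All values below are illustrative, not taken from a real config file.
light_containerinfo = sl.ContainerInfo(
    vcpu=2,
    mem=4096,
    container_cache=os.path.abspath(os.path.join('../working', 'containers/')),
    engine='aws_batch',  # 'docker' is the other engine value used in these examples
    aws_s3_scratch_loc='s3://example-bucket/sl_temp/',
    aws_jobRoleArn='arn:aws:iam::123456789012:role/example-batch-task-role',
    aws_batch_job_queue='optimal',
    slurm_partition='boneyard')
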
Example #4
    def workflow(self):

        # Make sure the project name is alphanumeric (underscores are allowed)
        assert all([s.isalnum() or s == "_" for s in self.project_name
                    ]), "Project name must be alphanumeric (underscores allowed)"

        # Data can come from either SRA or S3
        assert self.input_location in ["SRA", "S3"]

        # Read in the metadata sheet
        metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)

        for col_name in [self.input_column_name, self.sample_column_name]:
            assert col_name in metadata.columns, "{} not found in {}".format(
                col_name, self.metadata_fp)
            # Make sure that all samples and files are unique
            assert metadata[col_name].unique().shape[0] == metadata.shape[0]

        # Keep track of the jobs for each step, for each sample
        tasks_load_inputs = {}
        tasks_fastqp = {}
        tasks_metaspades = {}
        tasks_prokka = {}
        tasks_famli = {}

        # Iterate over all of the rows of samples
        for _, r in metadata.iterrows():

            # Get the sample name and the file location
            sample_name = r[self.sample_column_name]
            input_path = r[self.input_column_name]

            # Make a UUID to isolate temp files for this task from any others
            task_uuid = str(uuid.uuid4())[:8]

            # 1. LOAD THE INPUT FILES

            if self.input_location == "S3":
                tasks_load_inputs[sample_name] = self.new_task(
                    "load_from_s3_{}".format(sample_name),
                    LoadFile,
                    path=input_path)
            elif self.input_location == "SRA":
                assert input_path.startswith("SRR"), input_path

                tasks_load_inputs[sample_name] = self.new_task(
                    "download_from_SRA_{}".format(sample_name),
                    ImportSRAFastq,
                    sra_accession=input_path,
                    base_s3_folder=self.base_s3_folder,
                    input_mount_point="/scratch/{}_get_sra/input/".format(
                        task_uuid),
                    output_mount_point="/scratch/{}_get_sra/output/".format(
                        task_uuid),
                    containerinfo=sl.ContainerInfo(
                        vcpu=1,
                        mem=32000,
                        engine=self.engine,
                        aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                        aws_batch_job_poll_sec=120,
                        aws_jobRoleArn=self.aws_job_role_arn,
                        aws_batch_job_queue=self.aws_batch_job_queue,
                        aws_batch_job_prefix=re.sub(
                            '[^a-zA-Z0-9-_]', '_',
                            "get_sra_{}".format(sample_name)),
                        mounts={
                            "/docker_scratch": {
                                "bind": self.temp_folder,
                                "mode": "rw"
                            }
                        }))
            else:
                raise Exception("Data must be from S3 or SRA")

            # 2. CALCULATE FASTQ QUALITY METRICS
            tasks_fastqp[sample_name] = self.new_task(
                "fastqp_{}".format(sample_name),
                FastqpTask,
                summary_path=os.path.join(self.base_s3_folder, "fastqp",
                                          sample_name + ".fastqp.tsv"),
                input_mount_point="/scratch/{}_fastqp/input/".format(
                    task_uuid),
                output_mount_point="/scratch/{}_fastqp/output/".format(
                    task_uuid),
                containerinfo=sl.ContainerInfo(
                    vcpu=1,
                    mem=32000,
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "fastqp_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))

            # 3. ASSEMBLE WITH METASPADES
            tasks_metaspades[sample_name] = self.new_task(
                "metaspades_{}".format(sample_name),
                AssembleMetaSPAdes,
                sample_name=sample_name,
                output_folder=os.path.join(self.base_s3_folder, "metaspades"),
                threads=self.assemble_threads,
                max_mem=int(int(self.assemble_mem) / 1000),
                temp_folder=self.temp_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=int(self.assemble_threads),
                    mem=int(self.assemble_mem),
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "metaspades_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))

            # 4. ANNOTATE ASSEMBLIES WITH PROKKA
            tasks_prokka[sample_name] = self.new_task(
                "prokka_{}".format(sample_name),
                AnnotateProkka,
                sample_name=sample_name,
                output_folder=os.path.join(self.base_s3_folder, "prokka"),
                threads=self.assemble_threads,
                temp_folder=self.temp_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=int(self.assemble_threads),
                    mem=int(self.assemble_mem),
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "prokka_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))

        # Wire the outputs from tasks_load_inputs into tasks_fastqp and tasks_metaspades, and tasks_metaspades into tasks_prokka
        for sample_name in tasks_load_inputs:
            assert sample_name in tasks_fastqp

            if self.input_location == "S3":
                tasks_fastqp[sample_name].in_fastq = tasks_load_inputs[
                    sample_name].out_file
            elif self.input_location == "SRA":
                tasks_fastqp[sample_name].in_fastq = tasks_load_inputs[
                    sample_name].out_fastq

            assert sample_name in tasks_metaspades
            if self.input_location == "S3":
                tasks_metaspades[sample_name].in_fastq = tasks_load_inputs[
                    sample_name].out_file
            elif self.input_location == "SRA":
                tasks_metaspades[sample_name].in_fastq = tasks_load_inputs[
                    sample_name].out_fastq

            assert sample_name in tasks_prokka
            tasks_prokka[sample_name].in_fasta = tasks_metaspades[
                sample_name].out_fasta

        # 5. COMBINE ASSEMBLIES
        task_integrate_assemblies = self.new_task(
            "integrate_assemblies-{}".format(self.project_name),
            IntegrateAssembliesTask,
            output_prefix=self.project_name,
            output_folder=os.path.join(self.base_s3_folder,
                                       "integrated_assembly"),
            gff_folder=os.path.join(self.base_s3_folder, "prokka"),
            fastp_folder=os.path.join(self.base_s3_folder, "prokka"),
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=8,
                mem=120000,
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix="integrate_assemblies_{}".format(
                    self.project_name),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        task_integrate_assemblies.in_fastp_list = [
            t.out_faa for t in tasks_prokka.values()
        ]
        task_integrate_assemblies.in_gff_list = [
            t.out_gff for t in tasks_prokka.values()
        ]

        # 6. ALIGN AGAINST THE ASSEMBLY USING FAMLI
        tasks_famli = {}
        # Iterate over all of the rows of samples
        for _, r in metadata.iterrows():

            # Get the sample name and the file location
            sample_name = r[self.sample_column_name]
            input_path = r[self.input_column_name]

            tasks_famli[sample_name] = self.new_task(
                "famli_{}".format(sample_name),
                FAMLITask,
                sample_name=sample_name,
                output_folder=os.path.join(self.base_s3_folder, "famli"),
                threads=self.famli_threads,
                temp_folder=self.temp_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=int(self.famli_threads),
                    mem=int(self.famli_mem),
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix="famli_{}".format(sample_name),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
            # Connect the raw FASTQ input
            if self.input_location == "S3":
                tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                    sample_name].out_file
            elif self.input_location == "SRA":
                tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                    sample_name].out_fastq

            # Connect the reference database
            tasks_famli[
                sample_name].in_ref_dmnd = task_integrate_assemblies.out_daa

        return tasks_famli, tasks_fastqp
Example #5
    def workflow(self):

        # Make sure that the S3 folder is formatted with the proper prefix
        assert self.s3_folder.startswith("s3://")

        # Parse the bucket and key for the s3 folder for all results
        s3_bucket, s3_prefix = self.s3_folder[5:].split("/", 1)

        # Connect to S3
        s3 = boto3.resource('s3')

        # 1. Get the summary of all genomes
        genome_metadata_fp = os.path.join(s3_prefix, "patric_genome_metadata.tsv")
        
        print("Writing PATRIC genome metadata to s3://{}/{}".format(
            s3_bucket,
            genome_metadata_fp
        ))

        with urlopen("ftp://ftp.patricbrc.org/RELEASE_NOTES/genome_metadata") as fi:
            s3.Bucket(
                s3_bucket
            ).put_object(
                Key=genome_metadata_fp, 
                Body=fi.read()
            )

        # Now read in all of that information as a table
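        # (a sketch of this helper is given just after this example)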
        genome_metadata = read_tsv_from_s3_as_dataframe(s3_bucket, genome_metadata_fp, sep="\t")

        # 2. Fetch the transcripts and annotation files for every genome
        fetch_transcripts_tasks = {}
        fetch_annotation_tasks = {}

        for genome_accession in map(str, genome_metadata.index.values):
            
            fetch_annotation_tasks[genome_accession] = [
                self.new_task(
                    "fetch_patric_annotations_{}".format(genome_accession),
                    TransferFTPtoS3,
                    ftp_url="ftp://ftp.patricbrc.org/genomes/{}/{}.{}".format(
                        genome_accession,
                        genome_accession,
                        suffix
                    ),
                    s3_url=os.path.join(
                        self.s3_folder,
                        genome_accession,
                        "annotation.tsv"
                    ),
                    containerinfo=sl.ContainerInfo(
                        vcpu=1,
                        mem=1000,
                        engine=self.engine,
                        aws_batch_job_poll_sec=120,
                        aws_jobRoleArn=self.aws_job_role_arn,
                        aws_batch_job_queue=self.aws_batch_job_queue,
                        aws_batch_job_prefix=re.sub(
                            '[^a-zA-Z0-9-_]', '_',
                            "fetch_patric_annotations_{}".format(genome_accession)
                        )
                    )
                )
                for suffix in ["PATRIC.pathway.tab", "RefSeq.pathway.tab"]
            ]

            fetch_transcripts_tasks[genome_accession] = [
                self.new_task(
                    "fetch_patric_transcripts_{}".format(genome_accession),
                    TransferFTPtoS3,
                    ftp_url="ftp://ftp.patricbrc.org/genomes/{}/{}.{}".format(
                        genome_accession,
                        genome_accession,
                        suffix
                    ),
                    s3_url=os.path.join(
                        self.s3_folder,
                        genome_accession,
                        "transcripts.frn"
                    ),
                    containerinfo=sl.ContainerInfo(
                        vcpu=1,
                        mem=1000,
                        engine=self.engine,
                        aws_batch_job_poll_sec=120,
                        aws_jobRoleArn=self.aws_job_role_arn,
                        aws_batch_job_queue=self.aws_batch_job_queue,
                        aws_batch_job_prefix=re.sub(
                            '[^a-zA-Z0-9-_]', '_',
                            "fetch_patric_transcripts_{}".format(
                                genome_accession)
                        )
                    )
                )
                for suffix in ["PATRIC.frn", "RefSeq.frn"]
            ]

        # 3. Make a flat file for the 16S records

        extract_all_16S = self.new_task(
            "extract_all_16S",
            Extract16S,
            s3_parent_folder=self.s3_folder,
            s3_url=os.path.join(self.s3_folder, "transcripts.fasta"),
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=1,
                mem=1000,
                engine=self.engine,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix="extract_all_16s"
            )
        )

        extract_all_16S.in_fastas = [
            genome_transcript[0].out_file for genome_transcript in fetch_transcripts_tasks.values()
        ]

        # 4. Make a flat file for the annotations
        extract_all_annotations = self.new_task(
            "extract_all_annotations",
            ExtractAnnotations,
            s3_parent_folder=self.s3_folder,
            s3_url=os.path.join(self.s3_folder, "annotations.tsv"),
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=1,
                mem=1000,
                engine=self.engine,
                aws_batch_job_poll_sec=120,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_prefix="extract_all_annotations"
            )
        )

        extract_all_annotations.in_fastas = {
            genome_id: genome_transcript[0].out_file
            for genome_id, genome_transcript in fetch_transcripts_tasks.items()
        }
        extract_all_annotations.in_annotations = {
            genome_id: genome_annotation[0].out_file
            for genome_id, genome_annotation in fetch_annotation_tasks.items()
        }

        return extract_all_16S, extract_all_annotations
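
The helper read_tsv_from_s3_as_dataframe used near the top of this example is not shown in the listing. Below is a minimal sketch of what it might do, assuming it simply streams the object from S3 into pandas; since the workflow iterates over genome_metadata.index.values, the real helper presumably also sets the genome accession column as the index.

import io

import boto3
import pandas as pd


def read_tsv_from_s3_as_dataframe(bucket, key, sep="\t"):
    # Hypothetical reimplementation: download the object and parse it with pandas.
    body = boto3.resource("s3").Object(bucket, key).get()["Body"].read()
    df = pd.read_csv(io.BytesIO(body), sep=sep)
    # The real helper likely also does something like
    #     df = df.set_index("genome_id")
    # given that the workflow treats the index as the genome accession.
    return df
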
Example #6
    def workflow(self):
        # Initialize our container info
        light_containerinfo = sl.ContainerInfo()
        light_containerinfo.from_config(section='light')
        long_containerinfo = light_containerinfo
        midcpu_containerinfo = sl.ContainerInfo()
        midcpu_containerinfo.from_config(section='midcpu')
        heavy_containerinfo = sl.ContainerInfo()
        heavy_containerinfo.from_config(section='heavy')
        highmem_containerinfo = sl.ContainerInfo()
        highmem_containerinfo.from_config(section='highmem')

        #
        #  Load the refpkg (in tgz format)
        #
        refpkg_tgz = self.new_task(
            'load_refpkg_tgz',
            LoadRefpkgTGZ,
            path=self.refpkg_tgz,
            file_format='gzip',
        )

        jplace = self.new_task(
            'load_jplace',
            LoadFile,
            path=self.jplace,
        )

        # Load the seq map
        seq_map = self.new_task('load_seq_map',
                                LoadFile,
                                path=self.seq_map_csv)

        # Load the weights if provided
        if self.sv_weights_csv:
            sv_weights = self.new_task('load_sv_weight',
                                       LoadFile,
                                       path=self.sv_weights_csv)
        else:
            sv_weights = None

        if self.labels:
            labels = self.new_task('load_labels', LoadFile, path=self.labels)
        else:
            labels = None

        #  And unpack the refpkg to the relevant bits
        refpkg_alignments = self.new_task(
            'refpkg_alignments',
            ExtractRefpkgAlignment,
            aln_fasta_fn=os.path.join(self.working_dir, 'placement',
                                      'refpkg.aln.fasta'),
            aln_sto_fn=os.path.join(self.working_dir, 'placement',
                                    'refpkg.aln.sto'),
        )
        refpkg_alignments.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz

        #
        #  Load the sequence variants (fasta format)
        #
        sv_fasta = self.new_task('load_sv',
                                 LoadFastaSeqs,
                                 fasta_seq_path=self.sv_fasta)

        #
        #  Align the sequence variants
        #
        sv_aligned = self.new_task(
            'align_sv',
            CMAlignSeqs,
            containerinfo=heavy_containerinfo,
            alignment_sto_fn=os.path.join(self.working_dir, 'placement',
                                          'sv.aln.sto'),
            alignment_score_fn=os.path.join(self.working_dir, 'placement',
                                            'sv.aln.scores'),
        )
        sv_aligned.in_seqs = sv_fasta.out_seqs

        sv_aligned_fasta = self.new_task(
            'align_sv_to_fasta',
            AlignmentStoToFasta,
            align_fasta_fn=os.path.join(self.working_dir, 'placement',
                                        'sv.aln.fasta'),
        )
        sv_aligned_fasta.in_align_sto = sv_aligned.out_align_sto

        #
        #  Combine the refpkg alignment with the sequence variant alignment
        #

        sv_refpkg_aln_sto = self.new_task('combine_sv_refpkg_aln_sto',
                                          CombineAlignmentsSTO,
                                          containerinfo=light_containerinfo,
                                          combined_aln_sto_fn=os.path.join(
                                              self.working_dir, 'placement',
                                              'sv_refpkg_aln.sto'))
        sv_refpkg_aln_sto.in_aln_sto_1 = refpkg_alignments.out_aln_sto
        sv_refpkg_aln_sto.in_aln_sto_2 = sv_aligned.out_align_sto

        #
        #  Prep the placements.db using the refpkg
        #

        prepped_placementdb = self.new_task(
            'prep_placementdb',
            PlacementDB_Prep,
            containerinfo=light_containerinfo,
            placement_db_fn=os.path.join(self.destination_dir,
                                         'classification', 'placement.db'))
        prepped_placementdb.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz

        #
        #  Insert the seq_info / map of sv -> specimens
        #

        placement_db_w_si = self.new_task(
            'placement_db_add_si',
            PlacementDB_AddSI,
            containerinfo=light_containerinfo,
        )
        placement_db_w_si.in_placement_db = prepped_placementdb.out_placement_db
        placement_db_w_si.in_seq_map = seq_map.out_file

        #
        #  Classify the sequence variants
        #

        placement_db_classified = self.new_task(
            'classify_into_placement_db',
            PlacementDB_Classify_SV,
            containerinfo=midcpu_containerinfo,
        )
        placement_db_classified.in_placement_db = placement_db_w_si.out_placement_db
        placement_db_classified.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
        placement_db_classified.in_sv_refpkg_aln_sto = sv_refpkg_aln_sto.out_aln_sto
        placement_db_classified.in_jplace = jplace.out_file

        #
        #  Multiclass concat names
        #

        placement_db_mcc = self.new_task(
            'placement_db_multiclass_concat',
            PlacementDB_MCC,
            containerinfo=long_containerinfo,
        )
        placement_db_mcc.in_placement_db = placement_db_classified.out_placement_db
        placement_db_mcc.in_weights = sv_weights.out_file
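        # NOTE: assumes sv_weights_csv was provided; sv_weights is None otherwise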

        #
        #  Tabular CSV outputs
        #
        tables_for_rank = {}
        for rank in ['phylum', 'class', 'order', 'family', 'genus', 'species']:
            tables_for_rank[rank] = self.new_task(
                'by_specimen_{}'.format(rank),
                GenerateTables,
                containerinfo=light_containerinfo,
                tables_path=os.path.join(
                    self.destination_dir,
                    'classification',
                    'tables',
                ),
                rank=rank)
            tables_for_rank[
                rank].in_placement_db = placement_db_mcc.out_placement_db
            tables_for_rank[rank].in_seq_map = seq_map.out_file
            if labels:
                tables_for_rank[rank].in_labels = labels.out_file

        return (placement_db_mcc, tables_for_rank)
Example #7
class Workflow_NCBI_16s(sl.WorkflowTask):
    #
    # Take a set of sequence variants in FASTA format and at least one repository
    # of reference sequences.
    # Search the repository / repositories for matches above a specified threshold
    # for the sequence variants.
    #  Use those recruited full length repo sequences to build a refpkg.
    #
    working_dir = sl.Parameter()
    ncbi_email = sl.Parameter()
    repo_url = sl.Parameter()
    example_seqs = sl.Parameter()

    heavy_containerinfo = sl.ContainerInfo(
        vcpu=36,
        mem=70000,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine='aws_batch',
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn=
        'arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')

    light_containerinfo = sl.ContainerInfo(
        vcpu=2,
        mem=2024,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine='aws_batch',
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn=
        'arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')

    test_containerinfo = sl.ContainerInfo(
        vcpu=2,
        mem=4096,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine=ENGINE,
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn=
        'arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')

    local_containerinfo = sl.ContainerInfo(
        vcpu=2,
        mem=4096,
        container_cache=os.path.abspath(
            os.path.join('../working', 'containers/')),
        engine='docker',
        aws_s3_scratch_loc='s3://fh-pi-fredricks-d/lab/golob/sl_temp/',
        aws_jobRoleArn=
        'arn:aws:iam::064561331775:role/fh-pi-fredricks-d-batchtask',
        aws_batch_job_queue='optimal',
        slurm_partition='boneyard')

    def workflow(self):
        #
        # Load current accessions with 16s in a genome
        #

        repo_url = self.new_task(
            'load_repo_url',
            LoadFile,
            path=self.repo_url,
        )

        example_seqs = self.new_task('load_example_seqs',
                                     LoadFile,
                                     path=self.example_seqs)

        acc_genome_16s = self.new_task(
            'genome_16s_accessions',
            NT_AccessionsForQuery,
            containerinfo=self.test_containerinfo,
            email=self.ncbi_email,
            accessions_fn=os.path.join(self.working_dir, 'ncbi_16s',
                                       'accession', 'genome_16s.csv'),
            query=("16s[All Fields] AND rRNA[Feature Key]"
                   " AND Bacteria[Organism]"
                   " AND 500000 : 99999999999[Sequence Length]"
                   " AND genome[All Fields]"),
        )

        repo_genome_update = self.new_task(
            'repo_genome_update',
            NT_Repo_Update_Accessions,
            extra_values={'is_genome': True},
        )
        repo_genome_update.in_repo_url = repo_url.out_file
        repo_genome_update.in_accessions = acc_genome_16s.out_accessions

        repo_filled = self.new_task(
            'repo_fill',
            NT_Repo_Fill,
            containerinfo=self.test_containerinfo,
            email=self.ncbi_email,
            working_dir=os.path.join(
                self.working_dir,
                'ncbi_16s',
            ),
        )
        repo_filled.in_repo = repo_genome_update.out_repo

        #  Now dump out 16S / seq_info from the genomes.
        repo_dumped = self.new_task(
            'repo_dump',
            NT_Repo_Output_FastaSeqInfo,
            fn_fasta_gz=os.path.join(self.working_dir, 'ncbi_16s',
                                     'genomes.16s.fasta.gz'),
            fn_seq_info=os.path.join(self.working_dir, 'ncbi_16s',
                                     'genomes.16s.seq_info.csv'),
        )
        repo_dumped.in_repo = repo_filled.out_repo

        # Find genomes missing peptide / rRNA annotations
        prokka_annotation = self.new_task(
            'prokka_annotation',
            NT_Repo_Prokka,
            containerinfo=self.light_containerinfo,
            num_concurrent=100,
            workdir=os.path.join(self.working_dir, 'ncbi_16s', 'prokka'))
        prokka_annotation.in_repo = repo_filled.out_repo

        return (prokka_annotation)
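
        # NOTE: because of the early return above, the cmsearch verification and
        # filtering steps below are never reached as written.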
        # Use cmsearch to be sure these are vaguely like rRNA
        cmsearch_verify = self.new_task(
            'cmsearch_verify',
            CMSearchVerify,
            containerinfo=self.heavy_containerinfo,
            results_fn=os.path.join(self.working_dir, 'ncbi_16s',
                                    'genomes.16s.cmsearch.tsv'),
        )
        cmsearch_verify.in_seqs = repo_dumped.out_seqs

        #  And filter to rRNA.
        verified_seqs = self.new_task(
            'verify_repo',
            VerifyRepo,
            containerinfo=self.heavy_containerinfo,
            uc_fn=os.path.join(self.working_dir, 'ncbi_16s',
                               'genomes.16s.verified.uc'),
            verified_seqs_fn=os.path.join(self.working_dir, 'ncbi_16s',
                                          'genomes.16s.verified.fasta.gz'),
            unverified_seqs_fn=os.path.join(self.working_dir, 'ncbi_16s',
                                            'genomes.16s.unverified.fasta.gz'),
        )
        verified_seqs.in_repo_seqs = repo_dumped.out_seqs
        verified_seqs.in_expected_seqs = example_seqs.out_file

        return (repo_dumped)
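
Example #7 is the only listing that shows the full sl.WorkflowTask class together with its sl.Parameter() declarations. Below is a minimal sketch of invoking such a class, assuming this fork keeps the upstream sciluigi entry point sl.run_local; the parameter values are placeholders, and the flag spelling follows luigi's usual dash-for-underscore convention.

import sciluigi as sl

if __name__ == '__main__':
    # Placeholders only: point these at a real working directory, contact e-mail,
    # repository URL, and example 16S FASTA before running.
    sl.run_local(
        main_task_cls=Workflow_NCBI_16s,
        cmdline_args=[
            '--working-dir=../working',
            '--ncbi-email=user@example.org',
            '--repo-url=<repository-url>',
            '--example-seqs=../working/example_16s.fasta',
        ],
    )
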
Example #8
    def workflow(self):
        # Initialize our container info
        light_containerinfo = sl.ContainerInfo()
        light_containerinfo.from_config(
            section='light'
        )
        long_containerinfo = light_containerinfo
        midcpu_containerinfo = sl.ContainerInfo()
        midcpu_containerinfo.from_config(
            section='midcpu'
        )
        heavy_containerinfo = sl.ContainerInfo()
        heavy_containerinfo.from_config(
            section='heavy'
        )
        highmem_containerinfo = sl.ContainerInfo()
        highmem_containerinfo.from_config(
            section='highmem'
        )

        #
        #  Load the refpkg (in tgz format)
        #
        refpkg_tgz = self.new_task(
            'load_refpkg_tgz',
            LoadRefpkgTGZ,
            path=self.refpkg_tgz,
            file_format='gzip',
        )

        # Load the seq map
        seq_map = self.new_task(
            'load_seq_map',
            LoadFile,
            path=self.seq_map_csv
        )

        # Load the weights if provided
        if self.sv_weights_csv:
            sv_weights = self.new_task(
                'load_sv_weight',
                LoadFile,
                path=self.sv_weights_csv
            )
        else:
            sv_weights = None

        #  And unpack the refpkg to the relevant bits
        refpkg_alignments = self.new_task(
            'refpkg_alignments',
            ExtractRefpkgAlignment,
            aln_fasta_fn=os.path.join(
                self.working_dir,
                'placement',
                'refpkg.aln.fasta'
            ),
            aln_sto_fn=os.path.join(
                self.working_dir,
                'placement',
                'refpkg.aln.sto'
            ),
        )
        refpkg_alignments.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz

        #
        #  Load the sequence variants (fasta format)
        #
        sv_fasta = self.new_task(
            'load_sv',
            LoadFastaSeqs,
            fasta_seq_path=self.sv_fasta
        )

        #
        #  Align the sequence variants
        #
        sv_aligned = self.new_task(
            'align_sv',
            CMAlignSeqs,
            containerinfo=heavy_containerinfo,
            alignment_sto_fn=os.path.join(
                self.working_dir,
                'placement',
                'sv.aln.sto'
            ),
            alignment_score_fn=os.path.join(
                self.working_dir,
                'placement',
                'sv.aln.scores'
            ),
        )
        sv_aligned.in_seqs = sv_fasta.out_seqs

        sv_aligned_fasta = self.new_task(
            'align_sv_to_fasta',
            AlignmentStoToFasta,
            align_fasta_fn=os.path.join(
                self.working_dir,
                'placement',
                'sv.aln.fasta'
            ),
        )
        sv_aligned_fasta.in_align_sto = sv_aligned.out_align_sto

        #
        #  Combine the refpkg alignment with the sequence variant alignment
        #

        sv_refpkg_aln_sto = self.new_task(
            'combine_sv_refpkg_aln_sto',
            CombineAlignmentsSTO,
            containerinfo=heavy_containerinfo,
            combined_aln_sto_fn=os.path.join(
                self.working_dir,
                'placement',
                'sv_refpkg_aln.sto'
            )
        )
        sv_refpkg_aln_sto.in_aln_sto_1 = refpkg_alignments.out_aln_sto
        sv_refpkg_aln_sto.in_aln_sto_2 = sv_aligned.out_align_sto

        #
        #  Place the sequence variants using this combined alignment
        #
        dedup_jplace = self.new_task(
            'make_dedup_jplace',
            PPLACER_PlaceAlignment,
            containerinfo=heavy_containerinfo,
            jplace_fn=os.path.join(
                self.destination_dir,
                'placement',
                'dedup.jplace'
            )
        )
        dedup_jplace.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
        dedup_jplace.in_merged_aln_sto = sv_refpkg_aln_sto.out_aln_sto

        #
        #  Reduplicate
        #

        if not sv_weights:
            redup_jplace = dedup_jplace
        else:
            redup_jplace = self.new_task(
                'reduplicate_jplace',
                Jplace_Reduplicate,
                containerinfo=light_containerinfo,
                jplace_fn=os.path.join(
                    self.destination_dir,
                    'placement',
                    'redup.jplace.gz'
                )
            )
            redup_jplace.in_jplace = dedup_jplace.out_jplace
            redup_jplace.in_weights = sv_weights.out_file

        #
        #  ADCL
        #
        adcl = self.new_task(
            'create_adcl',
            Jplace_ADCL,
            containerinfo=light_containerinfo,
            adcl_fn=os.path.join(
                self.destination_dir,
                'placement',
                'adcl.gz'
            )
        )
        adcl.in_jplace = redup_jplace.out_jplace

        #
        #  EDPL
        #

        edpl = self.new_task(
            'calculate_edpl',
            Jplace_EDPL,
            containerinfo=highmem_containerinfo,
            edpl_fn=os.path.join(
                self.destination_dir,
                'placement',
                'edpl.gz'
            )
        )
        edpl.in_jplace = redup_jplace.out_jplace

        #
        #  EPCA
        #
        epca = self.new_task(
            'calculate_epca',
            Jplace_PCA,
            containerinfo=long_containerinfo,
            path=os.path.join(
                self.destination_dir,
                'placement',
                'pca'
            ),
            prefix='epca',
            pca='epca'
        )
        epca.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
        epca.in_seq_map = seq_map.out_file
        epca.in_jplace = redup_jplace.out_jplace

        #
        #  LPCA
        #

        lpca = self.new_task(
            'calculate_lpca',
            Jplace_PCA,
            containerinfo=highmem_containerinfo,
            path=os.path.join(
                self.destination_dir,
                'placement',
                'pca'
            ),
            prefix='lpca',
            pca='lpca'
        )
        lpca.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
        lpca.in_seq_map = seq_map.out_file
        lpca.in_jplace = redup_jplace.out_jplace

        #
        #  KR-distance
        #

        kr_distance = self.new_task(
            'calculate_kr_distance',
            Jplace_KR_Distance,
            containerinfo=long_containerinfo,
            kr_fn=os.path.join(
                self.destination_dir,
                'placement',
                'kr_distance.csv'
            ),
        )
        kr_distance.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
        kr_distance.in_seq_map = seq_map.out_file
        kr_distance.in_jplace = redup_jplace.out_jplace

        # 
        #  Alpha-Diversity
        #

        alpha_diversity = self.new_task(
            'calculate_alpha_diversity',
            Jplace_Alpha_Diversity,
            containerinfo=light_containerinfo,
            alpha_diversity_fn=os.path.join(
                self.destination_dir,
                'placement',
                'alpha_diversity.csv'
            ),
        )
        alpha_diversity.in_refpkg_tgz = refpkg_tgz.out_refpkg_tgz
        alpha_diversity.in_seq_map = seq_map.out_file
        alpha_diversity.in_jplace = redup_jplace.out_jplace

        return(epca, lpca, adcl, edpl, kr_distance, alpha_diversity)
Example #9
    def workflow(self):

        # Input files are either located in SRA or AWS S3
        assert self.input_location in ["SRA", "S3"]

        # Read in the metadata sheet
        metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)
        for col_name in [self.input_column_name, self.sample_column_name]:
            assert col_name in metadata.columns, "{} not found in {}".format(
                col_name, self.metadata_fp)
            # Make sure that all samples and files are unique
            assert metadata[col_name].unique().shape[0] == metadata.shape[0]

        # Make tasks that will make sure the reference databases exist
        ref_db_dmnd = self.new_task("load_ref_db_dmnd",
                                    LoadFile,
                                    path=self.ref_db_dmnd)
        ref_db_metadata = self.new_task("load_ref_db_metadata",
                                        LoadFile,
                                        path=self.ref_db_metadata)

        # Keep track of all of the jobs for getting the input files
        tasks_load_inputs = {}

        # Keep track of all of the jobs for aligning against the viral database
        tasks_map_viruses = {}

        # Assembling datasets de novo
        tasks_metaspades = {}

        # Running VirFinder on assembled contigs
        tasks_virfinder = {}

        # Iterate over all of the rows of samples
        for ix, r in metadata.iterrows():

            # Get the sample name and the file location
            sample_name = r[self.sample_column_name]
            input_path = r[self.input_column_name]

            # If the inputs are on SRA, execute jobs that will download them
            if self.input_location == "SRA":

                tasks_load_inputs[sample_name] = self.new_task(
                    "download_from_sra_{}".format(sample_name),
                    ImportSRAFastq,
                    sra_accession=input_path,
                    base_s3_folder=self.base_s3_folder,
                    containerinfo=sl.ContainerInfo(
                        vcpu=1,
                        mem=4096,
                        engine=self.engine,
                        aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                        aws_jobRoleArn=self.aws_job_role_arn,
                        aws_batch_job_queue=self.aws_batch_job_queue,
                        aws_batch_job_prefix=re.sub(
                            '[^a-zA-Z0-9-_]', '_',
                            "download_from_sra_{}".format(sample_name)),
                        mounts={
                            "/docker_scratch": {
                                "bind": self.temp_folder,
                                "mode": "rw"
                            }
                        }))
            else:
                # Make sure the file exists on S3
                assert self.input_location == "S3"
                tasks_load_inputs[sample_name] = self.new_task(
                    "load_from_s3_{}".format(sample_name),
                    LoadFile,
                    path=input_path)

            # Make a task to align the reads, wherever they came from
            tasks_map_viruses[sample_name] = self.new_task(
                "map_viruses_{}".format(sample_name),
                MapVirusesTask,
                output_folder=os.path.join(self.base_s3_folder,
                                           self.mapping_output_folder),
                sample_name=sample_name,
                threads=self.align_threads,
                temp_folder=self.temp_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=int(self.align_threads),
                    mem=int(self.align_mem),
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "map_viruses_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))

            # De novo assembly with metaSPAdes
            tasks_metaspades[sample_name] = self.new_task(
                "metaspades_{}".format(sample_name),
                AssembleMetaSPAdes,
                sample_name=sample_name,
                output_folder=os.path.join(self.base_s3_folder, "metaspades"),
                threads=self.assemble_threads,
                max_mem=int(int(self.assemble_mem) / 1000),
                temp_folder=self.temp_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=int(self.assemble_threads),
                    mem=int(self.assemble_mem),
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "metaspades_{}".format(sample_name)),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))

            # Run VirFinder on the assembled contigs
            tasks_virfinder[sample_name] = self.new_task(
                "virfinder_{}".format(sample_name),
                VirFinderTask,
                base_s3_folder=self.base_s3_folder,
                sample_name=sample_name,
                containerinfo=sl.ContainerInfo(
                    vcpu=int(self.align_threads),
                    mem=int(self.align_mem),
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix=re.sub(
                        '[^a-zA-Z0-9-_]', '_',
                        "virfinder_{}".format(sample_name)),
                ))

        # Connect the outputs of each upstream task to the inputs of the downstream tasks
        for sample_name in tasks_load_inputs:
            assert sample_name in tasks_map_viruses

            # Assign the input for the reference database
            tasks_map_viruses[
                sample_name].in_ref_db_dmnd = ref_db_dmnd.out_file
            tasks_map_viruses[
                sample_name].in_ref_db_metadata = ref_db_metadata.out_file
            tasks_map_viruses[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file

            # metaSPAdes assembles the raw FASTQ, and VirFinder runs on the assembled contigs
            tasks_metaspades[sample_name].in_fastq = tasks_load_inputs[
                sample_name].out_file
            tasks_virfinder[sample_name].in_fasta = tasks_metaspades[
                sample_name].out_fasta

        return tasks_map_viruses, tasks_virfinder
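
The workflow() methods shown in these examples are defined on sl.WorkflowTask subclasses, and attributes such as self.metadata_fp or self.input_location are declared as workflow parameters. As a rough illustration of how a workflow like the one above is launched in upstream sciluigi, here is a minimal, hedged sketch; the class name MapVirusesWorkflow and the parameter defaults are invented for illustration, and the container-enabled sciluigi fork used in these examples may accept additional options.

import sciluigi as sl


class MapVirusesWorkflow(sl.WorkflowTask):
    # Hypothetical parameter declarations mirroring the attributes
    # referenced in the workflow body above
    metadata_fp = sl.Parameter()
    metadata_fp_sep = sl.Parameter(default="\t")
    input_location = sl.Parameter(default="S3")

    def workflow(self):
        # Build and wire tasks as in the example above, then return
        # the terminal task(s) so luigi knows what to require.
        ...


if __name__ == "__main__":
    # Run the workflow with luigi's local scheduler
    sl.run_local(main_task_cls=MapVirusesWorkflow)
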
Example #10
    def workflow(self):

        # Make sure the project name contains only alphanumeric characters and underscores
        assert all(s.isalnum() or s == "_" for s in self.project_name), \
            "Project name must contain only alphanumeric characters or underscores"

        # Data can come from either SRA or S3
        assert self.input_location in ["SRA", "S3"]

        # Read in the metadata sheet
        metadata = pd.read_table(self.metadata_fp, sep=self.metadata_fp_sep)

        for col_name in [self.input_column_name, self.sample_column_name]:
            assert col_name in metadata.columns, "{} not found in {}".format(
                col_name, self.metadata_fp)
            # Make sure that all samples and files are unique
            assert metadata[col_name].unique().shape[0] == metadata.shape[0]

        # Keep track of the jobs for each step, for each sample
        tasks_load_inputs = {}
        tasks_famli = {}

        # Iterate over all of the rows of samples
        for _, r in metadata.iterrows():

            # Get the sample name and the file location
            sample_name = r[self.sample_column_name]
            input_path = r[self.input_column_name]

            # Make a UUID to isolate temp files for this task from any others
            task_uuid = str(uuid.uuid4())[:8]

            # 0. LOAD THE DATABASE
            # (Note: this task is re-created with the same name on every pass
            # through the sample loop; it does not depend on the sample and
            # could equally be created once above the loop.)
            tasks_load_db = self.new_task("load_db_from_s3",
                                          LoadFile,
                                          path=self.famli_db_location)

            # 1. LOAD THE INPUT FILES

            if self.input_location == "S3":
                tasks_load_inputs[sample_name] = self.new_task(
                    "load_from_s3_{}".format(sample_name),
                    LoadFile,
                    path=input_path)
            elif self.input_location == "SRA":
                assert input_path.startswith("SRR"), input_path

                tasks_load_inputs[sample_name] = self.new_task(
                    "download_from_SRA_{}".format(sample_name),
                    ImportSRAFastq,
                    sra_accession=input_path,
                    base_s3_folder=self.base_s3_folder,
                    input_mount_point="/scratch/{}_get_sra/input/".format(
                        task_uuid),
                    output_mount_point="/scratch/{}_get_sra/output/".format(
                        task_uuid),
                    containerinfo=sl.ContainerInfo(
                        vcpu=1,
                        mem=32000,
                        engine=self.engine,
                        aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                        aws_batch_job_poll_sec=120,
                        aws_jobRoleArn=self.aws_job_role_arn,
                        aws_batch_job_queue=self.aws_batch_job_queue,
                        aws_batch_job_prefix=re.sub(
                            '[^a-zA-Z0-9-_]', '_',
                            "get_sra_{}".format(sample_name)),
                        mounts={
                            "/docker_scratch": {
                                "bind": self.temp_folder,
                                "mode": "rw"
                            }
                        }))
            else:
                raise Exception("Data must be from S3 or SRA")

            # 2. ALIGN AGAINST THE DATABASE USING FAMLI

            tasks_famli[sample_name] = self.new_task(
                "famli_{}".format(sample_name),
                FAMLITask,
                sample_name=sample_name,
                output_folder=os.path.join(self.base_s3_folder,
                                           self.output_folder),
                threads=self.famli_threads,
                temp_folder=self.temp_folder,
                containerinfo=sl.ContainerInfo(
                    vcpu=int(self.famli_threads),
                    mem=int(self.famli_mem),
                    engine=self.engine,
                    aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                    aws_batch_job_poll_sec=120,
                    aws_jobRoleArn=self.aws_job_role_arn,
                    aws_batch_job_queue=self.aws_batch_job_queue,
                    aws_batch_job_prefix="famli_{}".format(sample_name),
                    mounts={
                        "/docker_scratch": {
                            "bind": self.temp_folder,
                            "mode": "rw"
                        }
                    }))
            # Connect the raw FASTQ input
            if self.input_location == "S3":
                tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                    sample_name].out_file
            elif self.input_location == "SRA":
                tasks_famli[sample_name].in_fastq = tasks_load_inputs[
                    sample_name].out_fastq

            # Connect the reference database
            tasks_famli[sample_name].in_ref_dmnd = tasks_load_db.out_file

        return tasks_famli
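
Each of these workflows applies the same metadata-sheet validation before scheduling any tasks: one row per sample, with both the sample column and the input column unique. A minimal sketch of a sheet that passes those checks is shown below, assuming tab-separated input and placeholder column names (the real names come from self.sample_column_name and self.input_column_name).

import io

import pandas as pd

# Hypothetical tab-separated metadata sheet in the shape these workflows expect
sheet = io.StringIO(
    "sample_name\tinput_path\n"
    "sampleA\tSRR0000001\n"
    "sampleB\tSRR0000002\n"
)
metadata = pd.read_table(sheet, sep="\t")

# The same checks the workflows run before creating any tasks
for col_name in ["input_path", "sample_name"]:
    assert col_name in metadata.columns, "{} not found in metadata".format(col_name)
    # One row per unique value in each column
    assert metadata[col_name].unique().shape[0] == metadata.shape[0]
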
Example #11
    def workflow(self):
        # Initialize our container info
        light_containerinfo = sl.ContainerInfo()
        light_containerinfo.from_config(section='light')
        midcpu_containerinfo = sl.ContainerInfo()
        midcpu_containerinfo.from_config(section='midcpu')
        heavy_containerinfo = sl.ContainerInfo()
        heavy_containerinfo.from_config(section='heavy')
        highmem_containerinfo = sl.ContainerInfo()
        highmem_containerinfo.from_config(section='highmem')

        #
        # Build our taxonomy db
        #
        taxonomy_db = self.new_task('taxonomy_db',
                                    BuildTaxtasticDB,
                                    containerinfo=light_containerinfo,
                                    tax_db_path=os.path.join(
                                        self.working_dir, 'refpkg',
                                        'taxonomy.db'))

        #
        # Load the sequence variants
        #

        sequence_variants = self.new_task(
            'load_sequence_variants',
            LoadFastaSeqs,
            fasta_seq_path=self.sequence_variants_path)
        log.info("Loaded sequence variants")

        # Load the sequence information
        seq_info_files = [
            self.new_task('load_si_{}'.format(si_i), LoadFile, path=si_path)
            for si_i, si_path in enumerate(self.repo_seq_info.split(','))
        ]
        log.info("Loaded %d sequence information files", len(seq_info_files))

        #
        # Load the annotated repositories
        #
        repo_annotated = [
            self.new_task('load_annotated_repo_{}'.format(r_i),
                          LoadFastaSeqs,
                          fasta_seq_path=r_path)
            for r_i, r_path in enumerate(self.repo_annotated_fasta.split(','))
        ]
        log.info("Loaded %d Annotated Repositories", len(repo_annotated))

        #
        # Search the sequence variants in the annotated repository
        #
        search_sv_annotated = []
        for ra_i, r_annotated in enumerate(repo_annotated):

            r_a_task = self.new_task(
                'search_sv_annotated_{}'.format(ra_i),
                SearchRepoForMatches,
                containerinfo=midcpu_containerinfo,
                matches_uc_path=os.path.join(
                    self.working_dir, 'refpkg',
                    'repo.annotated__{}.matches.uc'.format(ra_i)),
                unmatched_exp_seqs_path=os.path.join(
                    self.working_dir, 'refpkg',
                    'repo.annotated__{}.annotated.exp_seqs_unmatched.fasta'.
                    format(ra_i)),
                matched_repo_seqs_path=os.path.join(
                    self.working_dir, 'refpkg',
                    'repo.annotated__{}.recruited_repo_seqs.fasta'.format(
                        ra_i)),
                min_id=self.min_id_annotated,
                maxaccepts=10,  # By default take the top 10 (roughly corresponding to a 95% id for most)
            )
            r_a_task.in_exp_seqs = sequence_variants.out_seqs
            r_a_task.in_repo_seqs = r_annotated.out_seqs
            search_sv_annotated.append(r_a_task)
        #
        # Combine Recruits into one file
        #

        combined_repo_matches = self.new_task(
            'combine_repo_matches',
            CombineRepoMatches,
            seqs_fn=os.path.join(self.working_dir, 'refpkg',
                                 'combined.repo.matches.fasta'),
            seq_info_fn=os.path.join(self.working_dir, 'refpkg',
                                     'combined.repo.matches.seq_info.csv'),
        )
        combined_repo_matches.in_seqs = [
            ssv.out_matched_repo_seqs for ssv in search_sv_annotated
        ]
        combined_repo_matches.in_seq_info = [
            sif.out_file for sif in seq_info_files
        ]

        refpkg_seqs = combined_repo_matches.out_seqs
        refpkg_seqinfo = combined_repo_matches.out_seq_info

        #
        # Verify the taxonomy for the refpkg seqinfo file.
        #

        verified_refpkg_seqinfo = self.new_task(
            'verify_refpkg_seqinfo_taxonomy',
            ConfirmSeqInfoTaxonomy,
            email=self.entrez_email,
            containerinfo=light_containerinfo,
            confirmed_seqinfo_path=os.path.join(
                self.working_dir, 'refpkg',
                'seq_info.refpkg.verified_tax.csv'))
        verified_refpkg_seqinfo.in_seq_info = refpkg_seqinfo
        verified_refpkg_seqinfo.in_tax_db = taxonomy_db.out_tax_db

        #
        # TODO: Parse the UC file to determine whether we achieved our
        # minimum-best goal for each SV (no code implements this step yet).
        #

        #
        # Align recruited repo seqs
        #

        align_recruits = self.new_task(
            'align_recruits',
            CMAlignSeqs,
            containerinfo=highmem_containerinfo,
            alignment_sto_fn=os.path.join(self.working_dir, 'refpkg',
                                          'recruit.aln.sto'),
            alignment_score_fn=os.path.join(self.working_dir, 'refpkg',
                                            'recruit.aln.scores'),
        )
        align_recruits.in_seqs = refpkg_seqs

        #
        # Make a fasta version of the alignment
        #

        align_fasta = self.new_task(
            'align_fasta',
            AlignmentStoToFasta,
            align_fasta_fn=os.path.join(self.working_dir, 'refpkg',
                                        'recruit.aln.fasta'),
        )
        align_fasta.in_align_sto = align_recruits.out_align_sto

        #
        #  Make a tree of the reference package sequences
        #

        raxml_tree = self.new_task(
            'raxml_tree',
            RAxMLTree,
            containerinfo=heavy_containerinfo,
            tree_path=os.path.join(self.working_dir, 'refpkg', 'refpkg.tre'),
            tree_stats_path=os.path.join(self.working_dir, 'refpkg',
                                         'refpkg.tre.info'),
        )
        raxml_tree.in_align_fasta = align_fasta.out_align_fasta

        #
        # Cleanup the tree info to remove cruft
        #

        tree_info_cleanup = self.new_task(
            'tree_info_cleanup',
            CleanupTreeInfo,
            tree_info_path=os.path.join(self.working_dir, 'refpkg',
                                        'refpkg.tre.cleaned.info'),
        )
        tree_info_cleanup.in_tree_info = raxml_tree.out_tree_stats

        #
        #  Start to assemble the reference package at this point
        #

        # Taxtable
        refpkg_taxtable = self.new_task('refpkg_taxtable',
                                        TaxTableForSeqInfo,
                                        containerinfo=light_containerinfo,
                                        taxtable_path=os.path.join(
                                            self.working_dir, 'refpkg',
                                            'taxtable.csv'))
        refpkg_taxtable.in_seq_info = verified_refpkg_seqinfo.out_seq_info
        refpkg_taxtable.in_tax_db = taxonomy_db.out_tax_db

        # Covariance Matrix
        obtain_cm = self.new_task('obtain_cm',
                                  ObtainCM,
                                  containerinfo=light_containerinfo,
                                  cm_destination=os.path.join(
                                      self.working_dir, 'refpkg',
                                      'rRNA_16S_SSU.cm'))

        # And the actual combination step
        combine_refpgk = self.new_task(
            'combine_refpkg',
            CombineRefpkg,
            containerinfo=light_containerinfo,
            refpkg_path=os.path.join(
                self.new_refpkg_path,
                'refpkg',
            ),
            refpkg_name=self.new_refpkg_name,
        )
        combine_refpgk.in_aln_fasta = align_fasta.out_align_fasta
        combine_refpgk.in_aln_sto = align_recruits.out_align_sto
        combine_refpgk.in_tree = raxml_tree.out_tree
        combine_refpgk.in_tree_stats = tree_info_cleanup.out_tree_info
        combine_refpgk.in_taxtable = refpkg_taxtable.out_taxtable
        combine_refpgk.in_seq_info = verified_refpkg_seqinfo.out_seq_info
        combine_refpgk.in_cm = obtain_cm.out_cm

        return (combine_refpgk)

        # NOTE: everything below this point is unreachable, since the workflow
        # has already returned above, and it references tasks that are never
        # defined in this method (search_sv_genomes, search_sv_filtered,
        # repo_genomes_seq_info, repo_filtered_seq_info). It appears to be
        # left over from an earlier version of the workflow.
        #
        # Combine the sequences, avoiding duplicate sequences
        #

        combined_recruits = self.new_task(
            'combine_repo_recruits',
            CombineRepoMatches,
            seqs_fn=os.path.join(self.working_dir, 'refpkg',
                                 'recruits.combined.fasta'),
            seq_info_fn=os.path.join(self.working_dir, 'refpkg',
                                     'recruits.combined.seq_info.csv'))
        combined_recruits.in_seqs = [
            search_sv_genomes.out_matched_repo_seqs,
            search_sv_filtered.out_matched_repo_seqs,
        ]
        combined_recruits.in_seq_info = [
            repo_genomes_seq_info.out_file,
            repo_filtered_seq_info.out_file,
        ]

        refpkg_taxtable = self.new_task('refpkg_taxtable',
                                        TaxTableForSeqInfo,
                                        containerinfo=self.local_containerinfo,
                                        taxtable_path=os.path.join(
                                            self.working_dir, 'refpkg',
                                            'taxtable.csv'))
        refpkg_taxtable.in_seq_info = combined_recruits.out_seq_info
        refpkg_taxtable.in_tax_db = taxonomy_db.out_tax_db

        obtain_cm = self.new_task('obtain_cm',
                                  ObtainCM,
                                  containerinfo=self.local_containerinfo,
                                  cm_destination=os.path.join(
                                      self.working_dir, 'refpkg',
                                      'rRNA_16S_SSU.cm'))

        combine_refpgk = self.new_task(
            'combine_refpkg',
            CombineRefpkg,
            containerinfo=self.local_containerinfo,
            refpkg_path=os.path.join(
                self.working_dir,
                'refpkg',
            ),
            refpkg_name='test',
        )
        combine_refpgk.in_aln_fasta = align_fasta.out_align_fasta
        combine_refpgk.in_aln_sto = align_recruits.out_align_sto
        combine_refpgk.in_tree = raxml_tree.out_tree
        combine_refpgk.in_tree_stats = raxml_tree.out_tree_stats
        combine_refpgk.in_taxtable = refpkg_taxtable.out_taxtable
        combine_refpgk.in_seq_info = combined_recruits.out_seq_info
        combine_refpgk.in_cm = obtain_cm.out_cm

        return (combine_refpgk)
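
Example #11 builds its ContainerInfo tiers from named sections of a config file via from_config, rather than constructing them inline as the other examples do. For comparison, the sketch below shows a rough inline equivalent of one such tier; every value is a placeholder for illustration, and the keyword arguments are simply the same ones that appear in the inline ContainerInfo constructions elsewhere in these examples.

import sciluigi as sl

# Rough inline equivalent of the 'light' tier loaded with from_config above;
# all values are placeholders, not taken from any real configuration.
light_containerinfo = sl.ContainerInfo(
    vcpu=1,
    mem=4096,
    engine="aws_batch",                                 # assumed engine name
    aws_s3_scratch_loc="s3://example-bucket/scratch/",  # hypothetical bucket
    aws_jobRoleArn="arn:aws:iam::123456789012:role/example-batch-role",  # hypothetical role
    aws_batch_job_queue="example-job-queue",            # hypothetical queue
)
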
    def workflow(self):

        # Load the input file
        genome_fasta = self.new_task("load_genome_fasta",
                                     LoadFile,
                                     path=self.genome_fasta)

        # Run Prokka
        annotate_prokka = self.new_task(
            "annotate_prokka_{}".format(self.genome_name),
            AnnotateProkka,
            sample_name=self.genome_name,
            output_folder=os.path.join(self.base_s3_folder, "prokka"),
            threads=self.checkm_threads,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=int(self.checkm_threads),
                mem=int(self.checkm_memory),
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_name="annotate_prokka_{}".format(
                    self.genome_name),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # Link the file for prokka annotation
        annotate_prokka.in_fasta = genome_fasta.out_file

        # Run CheckM
        checkm = self.new_task(
            "checkm_{}".format(self.genome_name),
            CheckM,
            sample_name=self.genome_name,
            output_folder=os.path.join(self.base_s3_folder, "checkm"),
            threads=8,
            temp_folder=self.temp_folder,
            containerinfo=sl.ContainerInfo(
                vcpu=8,
                mem=64000,
                engine=self.engine,
                aws_s3_scratch_loc=self.aws_s3_scratch_loc,
                aws_jobRoleArn=self.aws_job_role_arn,
                aws_batch_job_queue=self.aws_batch_job_queue,
                aws_batch_job_name="checkm_{}".format(self.genome_name),
                mounts={
                    "/docker_scratch": {
                        "bind": self.temp_folder,
                        "mode": "rw"
                    }
                }))

        # Link the protein coding sequences from prokka into the inputs for checkm
        checkm.in_faa = annotate_prokka.out_faa

        return checkm
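
All of the tasks above are connected through sciluigi's in_*/out_* convention: a downstream task declares an in_* attribute, the upstream task defines an out_* method returning a target, and the workflow wires them together with an assignment such as checkm.in_faa = annotate_prokka.out_faa. The sketch below shows that idiom with two plain sciluigi tasks; the class names and file paths are invented for illustration and are far simpler than the ContainerTask subclasses used in these workflows. Such a workflow could be launched as shown in the sketch after Example #9.

import sciluigi as sl


class WriteGreeting(sl.Task):
    # No inputs; a single output target
    def out_greeting(self):
        return sl.TargetInfo(self, "greeting.txt")

    def run(self):
        with self.out_greeting().open("w") as handle:
            handle.write("hello\n")


class CountChars(sl.Task):
    # Declared as None here and assigned in the workflow,
    # e.g. counter.in_text = writer.out_greeting
    in_text = None

    def out_counts(self):
        return sl.TargetInfo(self, self.in_text().path + ".count")

    def run(self):
        with self.in_text().open() as handle:
            n_chars = len(handle.read())
        with self.out_counts().open("w") as handle:
            handle.write("{}\n".format(n_chars))


class GreetingWorkflow(sl.WorkflowTask):
    def workflow(self):
        writer = self.new_task("write_greeting", WriteGreeting)
        counter = self.new_task("count_chars", CountChars)
        # Same wiring idiom as checkm.in_faa = annotate_prokka.out_faa above
        counter.in_text = writer.out_greeting
        return counter
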