Example #1
class FastqpTask(sl.ContainerTask):

    # Input: FASTQ
    in_fastq = None

    # Output is a summary of the FASTQ quality
    summary_path = sl.Parameter()

    input_mount_point = sl.Parameter(default="/mnt/input/")
    output_mount_point = sl.Parameter(default="/mnt/output/")

    # URL of the container
    container = "quay.io/fhcrc-microbiome/fastqp:fastqp-v0.2"

    def out_summary(self):
        return sl.ContainerTargetInfo(self, self.summary_path)

    def run(self):

        input_targets = {"fastq": self.in_fastq()}

        output_targets = {"summary_file": self.out_summary()}

        self.ex(
            command="fastqp " + "-e $summary_file " + "$fastq",
            input_targets=input_targets,
            output_targets=output_targets,
            input_mount_point=self.input_mount_point,
            output_mount_point=self.output_mount_point,
        )
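For orientation, here is a minimal sketch of how a task like FastqpTask can be wired into a sciluigi workflow. The LoadFastq helper and both S3 paths are hypothetical; dependencies are declared by assigning an upstream task's out_* method to a downstream task's in_* attribute.

import sciluigi as sl

class LoadFastq(sl.ExternalTask):
    # Hypothetical external task exposing an existing FASTQ file
    path = sl.Parameter()

    def out_fastq(self):
        return sl.ContainerTargetInfo(self, self.path)

class QCWorkflow(sl.WorkflowTask):
    def workflow(self):
        fastq = self.new_task(
            'load_fastq', LoadFastq,
            path='s3://example-bucket/reads/sample1.fastq.gz')  # hypothetical path
        fastqp = self.new_task(
            'fastqp', FastqpTask,
            summary_path='s3://example-bucket/qc/sample1.summary.txt')  # hypothetical path
        # FastqpTask consumes the FASTQ exposed by the upstream task
        fastqp.in_fastq = fastq.out_fastq
        return fastqp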
Example #2
class FastQC(sl.Task):

    in_data = None
    tool_path = sl.Parameter(default=None)
    outp_path = sl.Parameter()

    def out_fastq(self):
        print(self.in_data().path)

        print(self.outp_path)
        return sl.TargetInfo(self, self.outp_path + 'fastq_res')

    def run(self):
        print(self.in_data)
        fastqs = utils.deserialize(self.in_data().path, load)
        # #print(self.deps())
        utils.mkdir_if_not_exist(self.outp_path)
        args_dict = OrderedDict()
        args_dict['-o'] = self.outp_path
        cmd = '{tool} {args} {inp1} {inp2}'.format(
            tool=self.tool_path,
            args=utils.make_args(args_dict),
            inp1=fastqs[0],
            inp2=fastqs[1])
        print(cmd)
        try:
            run(cmd, shell=True, check=True)
        except CalledProcessError as e:
            raise RuntimeError('FastQC command failed: {}'.format(cmd)) from e

        utils.serialize(glob(self.outp_path), self.out_fastq().path, dump)
Example #3
class MarkDuplicates(sl.Task):

    tool_path = sl.Parameter()
    outp_path = sl.Parameter()
    working_dir = sl.Parameter()
    in_data = None

    def out_data(self):
        return sl.TargetInfo(self, self.outp_path + 'mark_dupl_res')

    def run(self):
        with utils.cd(self.working_dir):
            utils.mkdir_if_not_exist(self.outp_path)
            res = utils.deserialize(self.in_data().path, load)
            cmds = ['java -jar {tool} MarkDuplicates INPUT=sorted.bam OUTPUT=dedup_reads.bam ' \
                  'METRICS_FILE=metrics.txt'.format(tool=self.tool_path),
            # Add readgroups info
            'java -jar {tool} AddOrReplaceReadGroups I=dedup_reads.bam ' \
                   'O=dedup_reads_w_readgroups.bam RGID=4 RGLB=lib1 ' \
                   'RGPL=illumina RGPU=unit1 RGSM=20'.format(tool=self.tool_path)]
            for cmd in cmds:
                print('Command', cmd)
                try:
                    run(cmd, shell=True, check=True)
                except CalledProcessError:
                    print('Command failed:', cmd)
                    raise
                else:
                    utils.serialize(glob('*'), self.out_data().path, dump)
Example #4
class Samtools(sl.Task):

    in_data = None
    tool_path = sl.Parameter()
    outp_path = sl.Parameter()
    working_dir = sl.Parameter()
    ref_fasta = sl.Parameter()

    def out_data(self):
        return sl.TargetInfo(self, self.outp_path + 'samtools_res')

    def run(self):
        sam_file = utils.deserialize(self.in_data().path, load)[0]
        utils.mkdir_if_not_exist(self.outp_path)
        with utils.cd(self.working_dir):
            cmds = ['{tool} view -bS {sam_file} > res.bam'.format(tool=self.tool_path, sam_file=sam_file),
                    '{tool} sort res.bam -o sorted.bam -O BAM'.format(tool=self.tool_path),
                    '{tool} index sorted.bam'.format(tool=self.tool_path),
                    # Make an index for the reference
                    '{tool} faidx {ref_fasta}'.format(tool=self.tool_path, ref_fasta=self.ref_fasta)]
            for cmd in cmds:
                try:
                    run(cmd, shell=True, check=True)
                except CalledProcessError as e:
                    raise RuntimeError('Command failed: {}'.format(cmd)) from e
            utils.serialize(glob('*'), self.out_data().path, dump)
Example #5
class TransferFTPtoS3(sl.ContainerTask):
    """Transfer a file from an FTP server to an AWS S3 bucket."""
    # FTP path
    ftp_url = sl.Parameter()
    # S3 path
    s3_url = sl.Parameter()

    # Container with wget
    container = "quay.io/fhcrc-microbiome/python:python-v0.1"

    def out_file(self):
        # File on S3
        return sl.ContainerTargetInfo(
            self,
            self.s3_url
        )

    def run(self):

        output_targets = {
            "out_file": self.out_file()
        }

        self.ex(
            command="wget -O $out_file {}".format(self.ftp_url),
            output_targets=output_targets
        )
Example #6
class ImportSRAFastq(sl.ContainerTask):
    # Parameter: SRA accession to download data from
    sra_accession = sl.Parameter()

    # Parameter: AWS S3 folder for this project
    base_s3_folder = sl.Parameter()

    # Scratch directory
    scratch_directory = sl.Parameter(default="/scratch")

    # URL of the container
    container = "quay.io/fhcrc-microbiome/get_sra:v0.3"

    def out_fastq(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.base_s3_folder, "reads",
                         self.sra_accession + ".fastq.gz"))

    def run(self):

        self.ex(command=" ".join([
            "get_sra.py", "--accession", self.sra_accession, "--output-path",
            self.out_fastq().path, "--temp-folder", self.scratch_directory
        ]))
Example #7
class ExtractAnnotations(sl.Task):
    """Extract all of the annotations and make a TSV keyed by the 16S accession names."""
    # Input files
    in_fastas = None
    in_annotations = None

    # Single flat file with all of the annotations
    s3_url = sl.Parameter()

    # Temporary folder to use for downloading data inside the Docker container
    temp_folder = sl.Parameter()

    def out_file(self):
        # File on S3
        return sl.ContainerTargetInfo(
            self,
            self.s3_url
        )

    def run(self):

        # The final output is a DataFrame with 16S accessions as rows, annotations as columns, and copy numbers as values
        output = {}

        for genome_id, genome_transcripts in self.in_fastas.items():
            # Make sure that we also have an annotation for this genome
            assert genome_id in self.in_annotations

            # Get the transcript names from the FASTA
            bucket, transcript_key = genome_transcripts().path[5:].split("/", 1)
            transcript_ids = [
                header.split(" ", 1)[0]
                for header, seq in read_fasta_from_s3(bucket, transcript_key)
                if " 16S " in header or " SSU " in header
            ]

            # Get the annotations for this genome
            bucket, annotation_key = self.in_annotations[genome_id]().path[5:].split("/", 1)
            genome_annotations = read_tsv_from_s3_as_dataframe(bucket, annotation_key)

            # Make a copy number vector
            functional_copy_numbers = genome_annotations["product"].value_counts().to_dict()
            
            # Add the annotations to the list
            for transcript in transcript_ids:
                assert transcript not in output, "Transcript found twice, stopping ({})".format(transcript)
                output[transcript] = functional_copy_numbers

        # Make a single DataFrame
        output = pd.DataFrame(output).fillna(0).T

        # Now write it to S3
        output_bucket, output_key = self.out_file().path[5:].split("/", 1)
        tsv_buffer = io.StringIO()
        output.to_csv(tsv_buffer, sep='\t')
        s3 = boto3.resource('s3')
        s3.Object(output_bucket, output_key).put(Body=tsv_buffer.getvalue())
Example #8
class CheckM(sl.ContainerTask):
    # Input FASTP file of protein sequences
    in_faa = None

    # Sample name
    sample_name = sl.Parameter()
    # Output folder
    output_folder = sl.Parameter()
    # Number of threads to use
    threads = sl.Parameter(default=4)
    # Scratch directory
    temp_folder = sl.Parameter(default="/scratch")

    # URL of the container
    container = "quay.io/fhcrc-microbiome/checkm:checkm-v1.0.11"

    def out_tsv(self):
        # Output is a TSV file with the CheckM results
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".checkm.tsv"))

    def run(self):

        if not self.output_folder.endswith("/"):
            self.output_folder = self.output_folder + "/"

        input_targets = {"faa": self.in_faa()}

        output_targets = {"tsv": self.out_tsv()}

        temp_dir = os.path.join(self.temp_folder, str(uuid.uuid4())[:8])

        self.ex(
            command="echo 'Checking to see if temp directory ({}) exists' && ".
            format(temp_dir) + "[ ! -d '{}' ] && ".format(temp_dir) +
            "echo Making temp directory {} && ".format(temp_dir) +
            "mkdir {} && ".format(temp_dir) +
            "echo Making temp directory for input files {}/checkm_input && ".
            format(temp_dir) + "mkdir {}/checkm_input && ".format(temp_dir) +
            "echo Making temp directory for output files {}/checkm_output && ".
            format(temp_dir) + "mkdir {}/checkm_output && ".format(temp_dir) +
            "echo Moving gene FAA file into input directory && " + "mv $faa " +
            "{}/checkm_input/ && ".format(temp_dir) +
            "echo Decompressing input file && " +
            "gunzip {}/checkm_input/* && ".format(temp_dir) +
            "ls -lhtr {}/checkm_input/ && ".format(temp_dir) +
            "echo Running checkm && " +
            "checkm lineage_wf --genes -x fastp -t {} --file {}/checkm.tsv {}/checkm_input/ {}/checkm_output/ && "
            .format(self.threads, temp_dir, temp_dir, temp_dir) +
            "echo Finished running checkm && " +
            "echo Copying results out of the container && " +
            "mv {}/checkm.tsv ".format(temp_dir) + "$tsv && " +
            "echo Deleting temporary folders && " +
            "rm -r {}".format(temp_dir),
            input_targets=input_targets,
            output_targets=output_targets)
Example #9
class TrimmoTaskWParameters(sl.Task):
    """
    Execute trimmomatic on fastq files
    """
    # Parameter
    in_data = None
    trimmo_parameters = None  # OrderedDict
    adapters_path = sl.Parameter()
    tool_path = sl.Parameter()
    adapter = sl.Parameter()
    outp_path = sl.Parameter()

    def out_trimmo(self):
        outp_name = 'trimmed_seq'
        adapter_full_path = self.adapters_path + self.adapter
        self.default_trimmo_args = OrderedDict()
        args = [
            'SE', 'PE', '-basein', '-baseout', 'ILLUMINACLIP:', 'MAXINFO:',
            'SLIDINGWINDOW:', 'LEADING:', 'CROP:', 'TRAILING:', 'HEADCROP:',
            'MINLEN:'
        ]
        vals = [
            None, True, None, None, adapter_full_path + ':2:30:10', None, None,
            30, None, 30, 15, 20
        ]
        # "leading": head_qual, "trailing": 30, "headcrop": 15}
        for arg, val in zip_longest(args, vals):
            self.default_trimmo_args[arg] = val

        # TODO: move these constants to a config file
        # print(self.default_trimmo_args)
        # print(self.tool_path)
        print(self.outp_path + 'trimmo_res.txt')
        return sl.TargetInfo(self, self.outp_path + 'trimmo_res.txt')

    def run(self):
        outp_full_path = self.outp_path + 'trimmo'
        print('!!!', self.in_data().path)
        fastqs = utils.deserialize(self.in_data().path, load)
        utils.mkdir_if_not_exist(self.outp_path)
        if not self.trimmo_parameters:
            self.trimmo_parameters = self.default_trimmo_args
        self.trimmo_parameters['-basein'] = fastqs[0]
        self.trimmo_parameters['-baseout'] = outp_full_path
        cmd = 'java -jar {tool_path} {args}'.format(
            tool_path=self.tool_path,
            args=utils.make_args(self.trimmo_parameters))
        print('Command', cmd)
        try:
            run(cmd, shell=True, check=True)
        except CalledProcessError:
            print('Command failed:', cmd)
            raise
        else:
            utils.serialize(glob(self.outp_path + '*'),
                            self.out_trimmo().path, dump)
Example #10
class MapVirusesTask(sl.ContainerTask):

    # Inputs: FASTQ and reference database
    in_fastq = None
    in_ref_db_metadata = None
    in_ref_db_dmnd = None

    # Parameter: AWS S3 folder for output files
    output_folder = sl.Parameter()

    # Parameter: Name for output file(s)
    sample_name = sl.Parameter()

    # Parameter: Number of threads for alignment
    threads = sl.Parameter()

    # Parameter: Temporary folder to use on the device
    temp_folder = sl.Parameter()

    # URL of the container
    container = "quay.io/fhcrc-microbiome/map_viruses:v0.7"

    def out_json(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".json.gz"))

    def out_sam(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self, os.path.join(self.output_folder,
                               self.sample_name + ".sam.gz"))

    def run(self):

        self.ex(command=" ".join([
            "map_viruses.py",
            "--input",
            self.in_fastq().path,
            "--metadata",
            self.in_ref_db_metadata().path,
            "--ref-db",
            self.in_ref_db_dmnd().path,
            "--output-path",
            self.out_json().path,
            "--threads",
            str(self.threads),
            "--temp-folder",
            self.temp_folder,
            "--keep-alignments",
        ]))
class TrimReads(sl.Task, AutoPairedFastq):
    in_fastq1 = None
    in_fastq2 = None
    method = sl.Parameter(default="sickle")

    def run(self):
        if self.method == "sickle":
            self.ex(
                'sickle pe -g -f "{reads1}" -r "{reads2}" -o "{out1}" -p "{out2}" -t sanger -s /dev/null'
                .format(reads1=self.in_fastq1().path,
                        reads2=self.in_fastq2().path,
                        out1=self.out_fastq1().path,
                        out2=self.out_fastq2().path))
        elif self.method == "fastq-mcf":
            pass  # TODO: Add fastq-mcf
        elif self.method == "bbduk":
            self.ex(
                'bbduk.sh in="{reads1}" in2="{reads2}" '
                'out="{out1}" out2="{out2}" ref=adapters,phix stats="{outstats}'
                .format(reads1=self.in_fastq1().path,
                        reads2=self.in_fastq2().path,
                        out1=self.out_fastq1().path,
                        out2=self.out_fastq2().path,
                        outstats=os.path.join(
                            os.path.dirname(self.out_fastq2().path),
                            "stats.txt")))
        else:
            raise_(ValueError, "Unimplemented trimming method chosen",
                   traceback)
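TrimReads mixes in AutoPairedFastq, which is not shown in these examples but evidently supplies the out_fastq1()/out_fastq2() targets used above. A plausible sketch of such a mixin, assuming the workflow task carries workdir and prefix parameters as in Example #18:

import os
import sciluigi as sl

class AutoPairedFastq(object):
    # Hypothetical mixin: derive paired FASTQ output targets from the
    # workflow's workdir, this task's instance name, and the sample prefix.
    def out_fastq1(self):
        return sl.TargetInfo(self, os.path.join(
            str(self.workflow_task.workdir), self.instance_name,
            str(self.workflow_task.prefix) + "_1.fq.gz"))

    def out_fastq2(self):
        return sl.TargetInfo(self, os.path.join(
            str(self.workflow_task.workdir), self.instance_name,
            str(self.workflow_task.prefix) + "_2.fq.gz"))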
Example #12
class Extract16S(sl.ContainerTask):
    """Extract all of the 16S transcripts from a list of S3 FASTA files."""
    # Input files
    in_fastas = None
    # Folder with all of the transcripts in subfolders
    s3_parent_folder = sl.Parameter()
    # Single flat file with all of the transcripts
    s3_url = sl.Parameter()
    # Temporary folder to use for downloading data inside the Docker container
    temp_folder = sl.Parameter()

    # Container with wget
    container = "quay.io/fhcrc-microbiome/python:python-v0.1"

    def out_file(self):
        # File on S3
        return sl.ContainerTargetInfo(
            self,
            self.s3_url
        )

    def run(self):

        # Save all of the transcripts with " 16S " or " SSU " in the header
        output = {}

        for genome_transcripts in self.in_fastas:

            # Get the transcript names from the FASTA
            bucket, transcript_key = genome_transcripts().path[5:].split("/", 1)
            
            for header, seq in read_fasta_from_s3(bucket, transcript_key):
                if " 16S " in header or " SSU " in header:
                    header = header.split(" ", 1)[0]
                    assert header not in output, "Duplicated transcript ID, stopping ({})".format(header)

                    output[header] = seq

        # Now write it to S3
        output_bucket, output_key = self.out_file().path[5:].split("/", 1)
        fasta_buffer = io.StringIO()
        for header, seq in output.items():
            fasta_buffer.write(">{}\n{}\n".format(header, seq))

        s3 = boto3.resource('s3')
        s3.Object(output_bucket, output_key).put(Body=fasta_buffer.getvalue())
Example #13
class FAMLITask(sl.ContainerTask):

    # Inputs: FASTQ and reference database
    in_fastq = None
    in_ref_dmnd = None

    # Parameter: Prefix for output file
    sample_name = sl.Parameter()

    # Parameter: Output folder
    output_folder = sl.Parameter()

    # Parameter: Number of threads for alignment
    threads = sl.Parameter()

    # Parameter: Number of blocks for alignment (each block takes ~6Gb)
    blocks = sl.Parameter(default=5)

    # Parameter: Temporary folder to use on the device
    temp_folder = sl.Parameter()

    # URL of the container
    container = "quay.io/fhcrc-microbiome/famli:v1.1"

    def out_json(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder,
                         "{}.json.gz".format(self.sample_name)))

    def run(self):

        if self.output_folder[-1] != '/':
            self.output_folder += '/'

        self.ex(command=" ".join([
            "famli", "align", "--input",
            self.in_fastq().path, "--sample-name", self.sample_name,
            "--ref-db",
            self.in_ref_dmnd().path, "--output-folder", self.output_folder,
            "--threads",
            str(self.threads), "--blocks",
            str(self.blocks), "--temp-folder", self.temp_folder
        ]))
Example #14
class BatchJobRunner(sciluigi.Task):
    "common runner class"
    job_id = None
    job_def_name = None
    job_def_revision = None
    script_url = None
    queue = sciluigi.Parameter()
    bucket_name = sciluigi.Parameter()
    pipeline_name = sciluigi.Parameter()
    sample_list_file = sciluigi.Parameter()
    reference = sciluigi.Parameter()

    job_name = None
    submit_args = None

    def run(self):
        self.job_def_revision = get_latest_jobdef_revision(self.job_def_name)
        jobdef = self.job_def_name + ":" + str(self.job_def_revision)
        bytebuf = io.BytesIO()
        s3client = boto3.client("s3")
        url = urlparse(self.sample_list_file)
        bucket = url.netloc
        path = url.path.lstrip("/")
        s3client.download_fileobj(bucket, path, bytebuf)
        raw_sample = bytebuf.getvalue().decode("utf-8")
        samples = raw_sample.splitlines()
        samples = [x.replace(".bam", "") for x in samples]

        array_size = len(samples)
        if array_size < 2:
            LOG.error(
                "You must specify at least two samples to run an array job!")
            sys.exit(1)
        env = [
            dict(name="BUCKET_NAME", value=self.bucket_name),
            dict(name="LIST_OF_SAMPLES", value=self.sample_list_file),
            dict(name="BATCH_FILE_S3_URL", value=self.script_url),
            dict(name="REFERENCE", value=self.reference)
        ]

        self.submit_args = dict(jobQueue=self.queue,
                                arrayProperties=dict(size=array_size),
                                jobDefinition=jobdef,
                                containerOverrides=dict(environment=env))
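Note that run() only assembles self.submit_args and never submits anything; the submission presumably lives in the step-specific subclasses (StepOneJobRunner and friends). A hedged sketch of what that might look like with the AWS Batch API; the method name is an assumption, and boto3 is already imported in this module:

    def submit_array_job(self):
        # Hypothetical helper: submit the prepared array job to AWS Batch.
        # submit_args carries jobQueue, jobDefinition, arrayProperties,
        # and containerOverrides as built in run() above.
        batch = boto3.client("batch")
        response = batch.submit_job(jobName=self.job_name, **self.submit_args)
        self.job_id = response["jobId"]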
Example #15
class IntegrateAssembliesTask(sl.ContainerTask):
    # Input FASTP files
    in_fastp_list = None
    # Input GFF files
    in_gff_list = None

    # Folder with GFF files
    gff_folder = sl.Parameter()
    # Folder with FASTP files
    fastp_folder = sl.Parameter()
    # Output prefix
    output_prefix = sl.Parameter()
    # Output folder
    output_folder = sl.Parameter()
    # Scratch directory
    temp_folder = sl.Parameter(default="/scratch")

    # URL of the container
    container = "quay.io/fhcrc-microbiome/integrate-metagenomic-assemblies:v0.4"

    def out_daa(self):
        # DIAMOND database
        return sl.ContainerTargetInfo(
            self, os.path.join(self.output_folder,
                               self.output_prefix + ".dmnd"))

    def out_json(self):
        # JSON summary of all data
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.output_prefix + ".json.gz"))

    def run(self):

        if not self.output_folder.endswith("/"):
            self.output_folder = self.output_folder + "/"

        self.ex(command=" ".join([
            "integrate_assemblies.py", "--gff-folder", self.gff_folder,
            "--prot-folder", self.fastp_folder, "--output-name",
            self.output_prefix, "--output-folder", self.output_folder,
            "--temp-folder", self.temp_folder
        ]))
Example #16
class AlignFastqTask(sl.ContainerTask):

    # Inputs: FASTQ and reference database
    in_fastq = None
    in_ref_fasta = None

    # Parameter: Short name for reference
    ref_name = sl.Parameter()

    # Parameter: AWS S3 folder for this project
    base_s3_folder = sl.Parameter()

    # Parameter: Name for output file(s)
    sample_name = sl.Parameter()

    # Parameter: Number of threads for alignment
    threads = sl.Parameter()

    # Parameter: Temporary folder to use on the device
    temp_folder = sl.Parameter()

    # URL of the container
    container = "quay.io/fhcrc-microbiome/bwa:v0.7.17--3"

    def out_bam(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.base_s3_folder,
                         "align_bwa_{}".format(self.ref_name),
                         "{}.{}.bam".format(self.ref_name, self.sample_name)))

    def run(self):

        self.ex(command=" ".join([
            "run.py", "--input",
            self.in_fastq().path, "--ref-db",
            self.in_ref_fasta().path, "--sample-name", "{}.{}".format(
                self.ref_name, self.sample_name), "--output-folder",
            os.path.join(self.base_s3_folder, "align_bwa_{}".format(
                self.ref_name)), "--threads",
            str(self.threads), "--temp-folder", self.temp_folder
        ]))
Example #17
class WF(sciluigi.WorkflowTask):
    "workflow class"
    queue = sciluigi.Parameter()
    bucket_name = sciluigi.Parameter()
    pipeline_name = sciluigi.Parameter()
    sample_list_file = sciluigi.Parameter()
    reference = sciluigi.Parameter()

    def workflow(self):
        now = datetime.datetime.now()
        self.pipeline_name = self.pipeline_name + "-" + now.strftime(
            "%Y%m%d%H%M%S")
        LOG.info("Pipeline name is %s.", self.pipeline_name)

        step1 = self.new_task('step1',
                              StepOneJobRunner,
                              queue=self.queue,
                              bucket_name=self.bucket_name,
                              pipeline_name=self.pipeline_name,
                              sample_list_file=self.sample_list_file,
                              reference=self.reference)

        step2 = self.new_task('step2',
                              StepTwoJobRunner,
                              queue=self.queue,
                              bucket_name=self.bucket_name,
                              pipeline_name=self.pipeline_name,
                              sample_list_file=self.sample_list_file,
                              reference=self.reference)

        step2.in_step1 = step1.out_jobid

        step3 = self.new_task('step3',
                              StepThreeJobRunner,
                              queue=self.queue,
                              bucket_name=self.bucket_name,
                              pipeline_name=self.pipeline_name,
                              sample_list_file=self.sample_list_file,
                              reference=self.reference)

        step3.in_step2 = step2.out_jobid
        return step3
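A workflow like WF is typically launched through sciluigi's local runner, with the parameters supplied on the command line:

if __name__ == '__main__':
    # Parameters such as --queue and --bucket-name come from the CLI
    sciluigi.run_local(main_task_cls=WF)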
Example #18
class Input(sl.Task):
    in_fastq1 = sl.Parameter()
    in_fastq2 = sl.Parameter()

    def out_fastq1(self):
        return sl.TargetInfo(
            self,
            os.path.join(str(self.workflow_task.workdir), "input_reads",
                         str(self.workflow_task.prefix) + "_1.fq.gz"))

    def out_fastq2(self):
        return sl.TargetInfo(
            self,
            os.path.join(str(self.workflow_task.workdir), "input_reads",
                         str(self.workflow_task.prefix) + "_2.fq.gz"))

    def run(self):
        self.out_fastq1().target.fs.mkdir(
            os.path.dirname(self.out_fastq1().path))
        os.symlink(os.path.abspath(self.in_fastq1), self.out_fastq1().path)
        os.symlink(os.path.abspath(self.in_fastq2), self.out_fastq2().path)
Example #19
class MultiSampleWorkflow(sl.WorkflowTask):
    """
    This workflow is meant to take an entire dataset description and run the SingleSampleWorkflow on each sample.
    """
    midas_db = sl.Parameter()
    dataset_description = sl.Parameter()
    workdir = sl.Parameter()
    contaminant_removal_method = sl.Parameter(default="bbsplit")
    filter_genomes = luigi.ListParameter()
    ref_info_dir = sl.Parameter()
    ref_combo_hash = sl.Parameter()

    def workflow(self):
        with open(self.dataset_description) as f:
            dataset_spec = json.load(f)
        tasks = []
        if len(self.filter_genomes) > 0:
            index_task = self.new_task("ref_index",
                                       CreateIndexForContamRemoval)
            tasks.append(index_task)
        # Samples are in an array in the json. Each sample has a prefix and two read files
        for sample in dataset_spec["samples"]:
            wf = self.new_task('SampleWorkflow_' + sample["prefix"],
                               SingleSampleWorkflow,
                               workdir=self.workdir,
                               prefix=sample["prefix"],
                               in_fastq1=sample["in_fastq1"],
                               in_fastq2=sample["in_fastq2"],
                               midas_db=self.midas_db,
                               filter_genomes=self.filter_genomes,
                               ref_info_dir=self.ref_info_dir,
                               ref_combo_hash=self.ref_combo_hash)
            tasks.append(wf)
        return tasks
Example #20
class PreprocessNematus(sciluigi.Task):
    src_lang = sciluigi.Parameter()
    trg_lang = sciluigi.Parameter()

    in_parallel = None

    def out_processed(self):
        return [
            TargetInfo(self, 'data/translate/preprocess/source.tok'),
            TargetInfo(self, 'data/translate/preprocess/target.tok'),
            TargetInfo(self, 'data/translate/preprocess/source.tok.json'),
            TargetInfo(self, 'data/translate/preprocess/target.tok.json')
        ]

    def run(self):
        self.ex('mkdir -p data/translate/preprocess/')

        # This tokenization step is disabled; target.tok.ul is assumed to
        # already exist from a previous run.
        if False:
            logging.info('Lowercasing and tokenizing source and target data.')
            self.ex('perl nematus/data/lowercase.perl < %s | \
                    perl nematus/data/tokenizer.perl -threads 5 -l %s > data/translate/preprocess/source.tok' \
                    % (self.in_parallel[0].path, self.src_lang))
            self.ex('perl nematus/data/lowercase.perl < %s | \
                    perl nematus/data/tokenizer.perl -threads 5 -l %s > data/translate/preprocess/target.tok.ul' \
                    % (self.in_parallel[1].path, self.trg_lang))

        # Underscore fix: re-join tokens that were split around underscores
        self.ex(
            "cat data/translate/preprocess/target.tok.ul | sed 's/ _ /_/g' > \
                data/translate/preprocess/target.tok")

        logging.info('Building vocabularies.')
        self.ex(
            '(. /home/jwei/miniconda3/etc/profile.d/conda.sh && conda activate nematus \
                && python nematus/data/build_dictionary.py \
                data/translate/preprocess/source.tok)')
        self.ex(
            '(. /home/jwei/miniconda3/etc/profile.d/conda.sh && conda activate nematus \
                && python nematus/data/build_dictionary.py \
                data/translate/preprocess/target.tok)')
Example #21
class VirFinderTask(sl.ContainerTask):
    """Run the VirFinder tool on a set of contigs."""

    # Inputs: FASTA
    in_fasta = None

    # Parameter: AWS S3 folder for this project
    base_s3_folder = sl.Parameter()

    # Parameter: Name for output file(s)
    sample_name = sl.Parameter()

    # Use this to specify a different mount point for temporary files, if needed
    input_mount_point = sl.Parameter(default="/mnt/input/")
    output_mount_point = sl.Parameter(default="/mnt/output/")

    # URL of the container
    container = "quay.io/fhcrc-microbiome/virfinder:v1.1--0"

    def out_tsv(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.base_s3_folder, "virfinder",
                         self.sample_name + ".tsv"))

    def run(self):

        input_targets = {"input_fasta": self.in_fasta()}

        output_targets = {"output_tsv": self.out_tsv()}

        self.ex(
            command="run_virfinder.Rscript $input_fasta $output_tsv",
            input_targets=input_targets,
            output_targets=output_targets,
            input_mount_point=self.input_mount_point,
            output_mount_point=self.output_mount_point,
        )
Example #22
class PrepareFastq(sl.Task):

    #def __init__(self):
    #    print('Init!!')
    #    super(sl.Task, self).__init__()

    data_path = sl.Parameter()
    outp_path = sl.Parameter()

    # Write parameters in the config without quotes, or the value will be
    # interpreted as a directory name. The path must point to a FILE.

    def out_data(self):
        return sl.TargetInfo(self, self.outp_path)

    def run(self):
        print('run')
        s = sorted(glob(self.data_path + '*.fastq.gz'))
        paired_fastq = list(zip(s[::2], s[1::2]))[0]

        with self.out_data().open('w') as outfile:
            dump(paired_fastq, outfile)
Example #23
class AnnotateProkka(sl.ContainerTask):
    # Input FASTA file
    in_fasta = None

    # Sample name
    sample_name = sl.Parameter()
    # Output folder
    output_folder = sl.Parameter()
    # Number of threads to use
    threads = sl.Parameter(default=4)
    # Scratch directory
    temp_folder = sl.Parameter(default="/scratch")

    # URL of the container
    container = "quay.io/fhcrc-microbiome/metaspades:v3.11.1--8"

    def out_gff(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self, os.path.join(self.output_folder,
                               self.sample_name + ".gff.gz"))

    def out_faa(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".fastp.gz"))

    def run(self):

        if not self.output_folder.endswith("/"):
            self.output_folder = self.output_folder + "/"

        self.ex(command=" ".join([
            "run_prokka.py", "--input",
            self.in_fasta().path, "--sample-name", self.sample_name,
            "--output-folder", self.output_folder, "--threads",
            str(int(self.threads)), "--temp-folder", self.temp_folder
        ]))
Example #24
class RunMIDAS(sl.Task, AutoSentinel):
    midas_subtask = sl.Parameter()
    in_fastq1 = None
    in_fastq2 = None
    nproc = sl.Parameter(default=1)

    def run(self):
        output = self.out_put()
        outdir = os.path.dirname(output.path)
        self.ex('''
        run_midas.py {subtask} \
        {outdir} \
        -d \
        -1 {in1} \
        -2 {in2} \
        -t {nproc} --remove_temp
        '''.format(subtask=self.midas_subtask,
                   outdir=outdir,
                   in1=self.in_fastq1().path,
                   in2=self.in_fastq2().path,
                   nproc=self.nproc))
        with output.open("w") as fp:
            self.log_info(fp)
Example #25
class BQSR(sl.Task):

    tool_path = sl.Parameter()
    outp_path = sl.Parameter()
    working_dir = sl.Parameter()
    ref_fasta = sl.Parameter()
    dbsnp = sl.Parameter()
    indels = sl.Parameter()
    in_data = None

    def out_data(self):
        return sl.TargetInfo(self, self.outp_path + 'BQSR_res')

    def run(self):
        with utils.cd(self.working_dir):
            utils.mkdir_if_not_exist(self.outp_path)
            res = utils.deserialize(self.in_data().path, load)

            # Analyze patterns of covariation in the sequence dataset
            # TODO: add indexing of the dedup_reads_w... file
            cmds = ['java -jar {tool_path} -T BaseRecalibrator ' \
                   '-R {ref_fasta} -I dedup_reads_w_readgroups.bam ' \
                   '-knownSites {dbsnp} ' \
                   '-knownSites {indels} ' \
                   '-o recal_data.table'.format(tool_path=self.tool_path,
                                                ref_fasta=self.ref_fasta,
                                                dbsnp=self.dbsnp,
                                                indels=self.indels),
            # Do a second pass to analyze covariation remaining after recalibration
            'java -jar {tool_path} -T BaseRecalibrator ' \
                   '-R {ref_fasta} -I dedup_reads_w_readgroups.bam ' \
                   '-knownSites {dbsnp} ' \
                   '-knownSites {indels} ' \
                   '-BQSR recal_data.table -o post_recal_data.table'.format(tool_path=self.tool_path,
                                                                            ref_fasta=self.ref_fasta,
                                                                            dbsnp=self.dbsnp,
                                                                            indels=self.indels),
            # Generate before/after plots
            'java -jar {tool_path} -T AnalyzeCovariates -R {ref_fasta} ' \
                   '-L chr20 -before recal_data.table ' \
                   '-after post_recal_data.table ' \
                   '-plots recalibration_plots.pdf'.format(tool_path=self.tool_path,
                                                           ref_fasta=self.ref_fasta),
            # Apply the recalibration to your sequence data
            'java -jar {tool_path} -T PrintReads -R {ref_fasta} ' \
                   '-I dedup_reads_w_readgroups.bam ' \
                   '-BQSR recal_data.table -o recal_reads.bam'.format(tool_path=self.tool_path,
                                                                      ref_fasta=self.ref_fasta)]
            for cmd in cmds:
                print('Command', cmd)
                try:
                    run(cmd, shell=True, check=True)
                except CalledProcessError:
                    print('Command failed:', cmd)
                    raise
                else:
                    utils.serialize(glob('*'), self.out_data().path, dump)
Example #26
class MyFooReplacer(sciluigi.Task):
    # Here, we take as a parameter what to replace foo with:
    replacement = sciluigi.Parameter()
    # Here we have one input, a "foo file":
    in_foo = None
    # ... and an output, a "bar file":
    def out_replaced(self):
        # As the path to the returned target(info), we
        # use the path of the foo file:
        return sciluigi.TargetInfo(self, self.in_foo().path + '.bar.txt')
    def run(self):
        with self.in_foo().open() as in_f:
            with self.out_replaced().open('w') as out_f:
                # Here we see that we use the parameter self.replacement:
                out_f.write(in_f.read().replace('foo', self.replacement))
Example #27
class AssembleMetaSPAdes(sl.ContainerTask):
    # Input FASTQ file
    in_fastq = None

    # Sample name
    sample_name = sl.Parameter()
    # Output folder
    output_folder = sl.Parameter()
    # Number of threads to use
    threads = sl.Parameter(default=4)
    # Maximum amount of memory to use (gigabytes)
    max_mem = sl.Parameter(default=10)
    # Scratch directory
    temp_folder = sl.Parameter(default="/scratch")

    # URL of the container
    container = "quay.io/fhcrc-microbiome/metaspades:v3.11.1--8"

    def out_fasta(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".fasta.gz"))

    def run(self):

        if not self.output_folder.endswith("/"):
            self.output_folder = self.output_folder + "/"

        self.ex(command=" ".join([
            "run_metaspades.py", "--input",
            self.in_fastq().path, "--sample-name", self.sample_name,
            "--output-folder", self.output_folder, "--threads",
            str(int(self.threads)), "--max-mem",
            str(int(self.max_mem)), "--temp-folder", self.temp_folder
        ]))
Example #28
class RunMetaphlan2(sl.Task, AutoTxt):
    in_fastq1 = None
    in_fastq2 = None
    nproc = sl.Parameter(default=1)

    def run(self):
        self.ex('''
        metaphlan2.py \
        {reads1},{reads2} \
        --no_map -t rel_ab_w_read_stats \
        --sample_id {prefix} \
        --nproc {nproc} --input_type fastq -o {out}
        '''.format(reads1=self.in_fastq1().path,
                   reads2=self.in_fastq2().path,
                   out=self.out_put().path,
                   nproc=self.nproc,
                   prefix=self.workflow_task.prefix))
Example #29
class HUMAnN2Task(sl.ContainerTask):
    # Input FASTQ file
    in_fastq = None

    # Reference database
    ref_db = sl.Parameter(default="")
    
    # Sample name
    sample_name = sl.Parameter()
    # Output folder
    output_folder = sl.Parameter()
    # Number of threads to use
    threads = sl.Parameter(default=4)
    # Maximum amount of memory to use (gigabytes)
    max_mem = sl.Parameter(default=10)
    # Scratch directory
    temp_folder = sl.Parameter(default="/scratch")

    # URL of the container
    container = "quay.io/fhcrc-microbiome/humann2:v0.11.1--7"

    def out_json(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(
                self.output_folder,
                self.sample_name + ".json.gz"
            )
        )

    def run(self):

        if not self.output_folder.endswith("/"):
            self.output_folder = self.output_folder + "/"

        self.ex(
            command=" ".join([
                "run.py",
                "--input",
                self.in_fastq().path,
                "--sample-name",
                self.sample_name,
                "--output-folder",
                self.output_folder,
                "--ref-db",
                self.ref_db,
                "--threads",
                str(int(self.threads)),
                "--temp-folder",
                self.temp_folder
            ])
        )
Example #30
class LoadFile(sl.ExternalTask):
    path = sl.Parameter()

    def out_file(self):
        return sl.ContainerTargetInfo(self, self.path)
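LoadFile is the usual entry point for feeding pre-existing files into a pipeline. As a closing sketch (the input file name is hypothetical), it can be paired with MyFooReplacer from Example #26:

class FooBarWorkflow(sl.WorkflowTask):
    def workflow(self):
        # Expose an existing file, then replace 'foo' with 'bar' in it
        foo = self.new_task('load_foo', LoadFile, path='data/foo.txt')
        replacer = self.new_task('replace', MyFooReplacer, replacement='bar')
        replacer.in_foo = foo.out_file
        return replacer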