Example #1
def multiple_fastq_dowloading(job, config, sample_disk):
    """
    Convenience function for handling the downloading of multiple fastq files

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Expando config: Dict-like object containing workflow options as attributes
    :param int sample_disk: Amount of disk space to allocate to download jobs
    :return: FileStoreIDs for all fastqs downloaded
    :rtype: list(str,)
    """
    # Spawn download job per fastq file
    fastq_ids = []
    urls = config.url.split(',')
    if config.paired:
        require(
            len(urls) % 2 == 0,
            'Paired fastqs require an even number of comma-separated URLs')
    for url in urls:
        fastq_ids.append(
            job.addChildJobFn(download_url_job,
                              url,
                              s3_key_path=config.ssec,
                              disk=sample_disk).rv())

    return fastq_ids
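A minimal sketch of wiring this helper into a parent job. The Expando stand-in, the URLs, the disk figure, and the consume_fastqs follow-on are illustrative assumptions, not part of the source workflow:

class Expando(dict):
    """Minimal dict-with-attribute-access stand-in for the workflow config."""
    def __getattr__(self, name):
        return self[name]

def root_job(job):
    config = Expando(url='s3://bucket/sample_R1.fq.gz,s3://bucket/sample_R2.fq.gz',
                     paired=True,
                     ssec=None)
    # One child download job is spawned per URL; the list holds promises
    fastq_ids = multiple_fastq_dowloading(job, config, sample_disk=10 * 1024 ** 3)
    # Promises resolve to FileStoreIDs once handed to a follow-on job
    job.addFollowOnJobFn(consume_fastqs, fastq_ids)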
Example #2
def s3am_upload(fpath, s3_dir, num_cores=1, s3_key_path=None):
    """
    Uploads a file to s3 via S3AM
    S3AM binary must be on the PATH to use this function
    For SSE-C encryption: provide a path to a 32-byte file

    :param str fpath: Path to file to upload
    :param str s3_dir: Output S3 path. Format: s3://bucket/[directory]
    :param int num_cores: Number of cores to use for up/download with S3AM
    :param str s3_key_path: (OPTIONAL) Path to 32-byte key to be used for SSE-C encryption
    """
    require(s3_dir.startswith('s3://'),
            "s3_dir must start with 's3://': {}".format(s3_dir))
    s3_dir = os.path.join(s3_dir, os.path.basename(fpath))
    _s3am_with_retry(num_cores,
                     file_path=fpath,
                     s3_url=s3_dir,
                     mode='upload',
                     s3_key_path=s3_key_path)
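An illustrative call, assuming the s3am binary is installed and on the PATH; the bucket, file, and key paths here are hypothetical:

s3am_upload(fpath='/mnt/output/sample.R1.fastq',
            s3_dir='s3://my-results-bucket/trimmed',
            num_cores=4,
            s3_key_path='/etc/keys/sse-c.key')  # omit for an unencrypted upload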
Example #3
def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
    """
    Adapter trimming for RNA-seq data

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param str r1_id: FileStoreID of fastq read 1
    :param str r2_id: FileStoreID of fastq read 2 (if paired data)
    :param str fwd_3pr_adapter: Adapter sequence for the forward 3' adapter
    :param str rev_3pr_adapter: Adapter sequence for the reverse 3' adapter (required for paired data)
    :return: R1 and R2 FileStoreIDs
    :rtype: tuple(str, str)
    """
    # Retrieve files and define parameters
    # Note: dockerCall mounts workDir at /data inside the container, so the
    # '/data/...' paths below resolve to files in job.tempDir
    job.fileStore.readGlobalFile(r1_id, os.path.join(job.tempDir, 'R1.fastq'))
    parameters = ['-a', fwd_3pr_adapter, '-m', '35']

    # If R2 fastq is present...
    if r2_id:
        require(rev_3pr_adapter,
                "Paired end data requires a reverse 3' adapter sequence.")
        job.fileStore.readGlobalFile(r2_id,
                                     os.path.join(job.tempDir, 'R2.fastq'))
        parameters.extend([
            '-A', rev_3pr_adapter, '-o', '/data/R1_cutadapt.fastq', '-p',
            '/data/R2_cutadapt.fastq', '/data/R1.fastq', '/data/R2.fastq'
        ])
    else:
        parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])

    # Call: CutAdapt
    dockerCall(job=job,
               tool=cutadapt_version,
               workDir=job.tempDir,
               parameters=parameters)

    # Write to fileStore
    r1_cut_id = job.fileStore.writeGlobalFile(
        os.path.join(job.tempDir, 'R1_cutadapt.fastq'))
    r2_cut_id = job.fileStore.writeGlobalFile(
        os.path.join(job.tempDir, 'R2_cutadapt.fastq')) if r2_id else None

    return r1_cut_id, r2_cut_id
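The same job function also covers single-end data when scheduled with r2_id=None, in which case only the forward adapter is applied. A hedged sketch (the adapter sequence shown is the standard Illumina TruSeq adapter, used here for illustration only):

# Inside a parent Toil job function:
trimmed_ids = job.addChildJobFn(run_cutadapt,
                                r1_id,
                                None,  # single-end: no R2
                                'AGATCGGAAGAGCACACGTCTGAACTCCAGTCA',  # illustrative adapter
                                None,  # reverse adapter not needed
                                disk='2G').rv()  # resolves to (r1_cut_id, None)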
Example #4
def move_or_upload(config, files, enforce_ssec=False):
    """
    Move or upload file based on configuration settings

    :param Expando config: Dict-like object containing workflow options as attributes
    :param list(str,) files: List of files to be moved or uploaded
    :param bool enforce_ssec: If True, enforces SSEC be set in config or else fails
    """
    if urlparse(config.output_dir).scheme == 's3':
        if enforce_ssec:
            require(
                config.ssec,
                'SSEC encryption required to upload sensitive read data to S3.'
            )
        for f in files:
            s3am_upload(fpath=f,
                        s3_dir=config.output_dir,
                        s3_key_path=config.ssec)
    else:
        copy_files(file_paths=files, output_dir=config.output_dir)
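A sketch of both branches, assuming a config object like the one in Example #1 (all paths are illustrative):

# S3 destination: files are routed through s3am_upload, with SSEC enforced
config.output_dir = 's3://my-results-bucket/sample-uuid/'
move_or_upload(config, files=['/tmp/R1.fastq', '/tmp/R2.fastq'], enforce_ssec=True)

# Any other scheme (e.g. a local path): files are simply copied
config.output_dir = '/mnt/results/sample-uuid/'
move_or_upload(config, files=['/tmp/R1.fastq', '/tmp/R2.fastq'])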
Example #5
def process_sample(job, config, input_tar=None, fastq_ids=None):
    """
    Converts sample.tar(.gz) or a collection of fastqs into a fastq pair (or a single fastq for single-end data).
    WARNING: Here be dragons. I may or may not ever get the time to clean this up.

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Expando config: Dict-like object containing workflow options as attributes
    :param str input_tar: fileStoreID of the tarball (if applicable)
    :param list(str,) fastq_ids: FileStoreIDs of fastq files
    :return: FileStoreIDs from Cutadapt, or the input fastq IDs directly if the workflow was run without the Cutadapt option
    :rtype: tuple(str, str)
    """
    job.fileStore.logToMaster('Processing sample: {}'.format(config.uuid))
    delete_fastqs = True
    processed_r1, processed_r2 = None, None
    # I/O
    if input_tar:
        tar_path = os.path.join(job.tempDir, 'sample.tar')
        job.fileStore.readGlobalFile(input_tar, tar_path, mutable=True)
        # Untar sample; send tar's verbose output to /dev/null rather than
        # PIPE, which can deadlock once the unread pipe buffer fills
        with open(os.devnull, 'w') as devnull:
            subprocess.check_call(['tar', '-xvf', tar_path, '-C', job.tempDir],
                                  stdout=devnull,
                                  stderr=devnull)
        os.remove(tar_path)
    else:
        ext = '.fq.gz' if config.gz else '.fq'
        for i, fastq_id in enumerate(fastq_ids):
            if i % 2 == 0:
                job.fileStore.readGlobalFile(
                    fastq_id,
                    os.path.join(job.tempDir, 'Fastq_{}_R1{}'.format(i, ext)))
            else:
                job.fileStore.readGlobalFile(
                    fastq_id,
                    os.path.join(job.tempDir, 'Fastq_{}_R2{}'.format(i, ext)))
    fastqs = []
    for root, subdir, files in os.walk(job.tempDir):
        fastqs.extend([os.path.join(root, x) for x in files])
    if config.paired:
        r1, r2 = [], []
        # Pattern convention: look for 'R1' / 'R2' in the filename, or a
        # bare '1' / '2' immediately before the '.f*' extension
        pattern = re.compile(r'(?:^|[._-])(R[12]|[12]\.f)')
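        # For illustration (not from the source): this accepts names such as
        # 'sample_R1.fastq', 'sample.R2.fq.gz' and 'reads_1.fq', while a name
        # like 'reads1.fq' (no separator before the 1) raises the UserError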
        for fastq in sorted(fastqs):
            match = pattern.search(os.path.basename(fastq))
            if not match:
                raise UserError(
                    'FASTQ file name fails to meet required convention for paired reads '
                    '(see documentation). ' + fastq)
            elif '1' in match.group():
                r1.append(fastq)
            elif '2' in match.group():
                r2.append(fastq)
            else:
                assert False, match.group()
        require(len(r1) == len(r2),
                'Check fastq names, uneven number of pairs found.\n'
                'r1: {}\nr2: {}'.format(r1, r2))
        # Concatenate fastqs
        command = 'zcat' if r1[0].endswith('.gz') and r2[0].endswith('.gz') else 'cat'

        # If sample is already a single uncompressed R1 / R2 fastq pair
        # (fastq_ids guard: samples unpacked from a tarball have no
        # individual FileStoreIDs to reuse)
        if command == 'cat' and len(fastqs) == 2 and fastq_ids:
            processed_r1 = fastq_ids[0]
            processed_r2 = fastq_ids[1]
            delete_fastqs = False
        else:
            # Popen inherits the file handle, so closing f1/f2 at the end of
            # each with-block is safe while the child keeps writing
            with open(os.path.join(job.tempDir, 'R1.fastq'), 'w') as f1:
                p1 = subprocess.Popen([command] + r1, stdout=f1)
            with open(os.path.join(job.tempDir, 'R2.fastq'), 'w') as f2:
                p2 = subprocess.Popen([command] + r2, stdout=f2)
            rc1, rc2 = p1.wait(), p2.wait()
            require(rc1 == 0 and rc2 == 0,
                    'Fastq concatenation failed: {} / {}'.format(rc1, rc2))
            processed_r1 = job.fileStore.writeGlobalFile(
                os.path.join(job.tempDir, 'R1.fastq'))
            processed_r2 = job.fileStore.writeGlobalFile(
                os.path.join(job.tempDir, 'R2.fastq'))
        disk = 2 * (processed_r1.size + processed_r2.size)
    else:
        command = 'zcat' if fastqs[0].endswith('.gz') else 'cat'
        if command == 'cat' and len(fastqs) == 1 and fastq_ids:
            processed_r1 = fastq_ids[0]
            delete_fastqs = False
        else:
            with open(os.path.join(job.tempDir, 'R1.fastq'), 'w') as f:
                subprocess.check_call([command] + fastqs, stdout=f)
            processed_r1 = job.fileStore.writeGlobalFile(
                os.path.join(job.tempDir, 'R1.fastq'))
        disk = 2 * processed_r1.size

    # Cleanup Intermediates
    if delete_fastqs and fastq_ids:
        ids_to_delete = [input_tar] + fastq_ids
    else:
        ids_to_delete = [input_tar]
    job.addFollowOnJobFn(cleanup_ids, ids_to_delete)

    # Start cutadapt step
    if config.cutadapt:
        return job.addChildJobFn(run_cutadapt,
                                 processed_r1,
                                 processed_r2,
                                 config.fwd_3pr_adapter,
                                 config.rev_3pr_adapter,
                                 disk=disk).rv()
    else:
        return processed_r1, processed_r2
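One plausible way to consume the return value, shown as a hypothetical follow-on that is not part of the source: read the resolved FileStoreIDs back to disk and hand them to move_or_upload from Example #4.

def upload_outputs(job, config, processed_ids):
    # processed_ids is the (r1, r2) pair returned by process_sample;
    # r2 is None for single-end data
    paths = []
    for i, file_id in enumerate(processed_ids):
        if file_id:
            path = os.path.join(job.tempDir, 'processed_R{}.fastq'.format(i + 1))
            job.fileStore.readGlobalFile(file_id, path)
            paths.append(path)
    move_or_upload(config, files=paths, enforce_ssec=True)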