def main(argv):

    def _add_input(parser):
        parser.add_option("--data-dir", default=".")
        parser.add_option("--force", default=False, action="store_true")
        parser.add_option("--min-depth", default=0, type="int")
        parser.add_option("--follow-links", default=False, action="store_true")
        parser.add_option("--limit-metrics", default=0, type="int")
        parser.add_option("--output-filename-metrics")
        parser.add_option("--input-filename-metrics")

    P.initialize(argv, callback=_add_input)
    options = E.get_args()

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options))

    if os.path.exists("results.commit"):
        if not options.force:
            raise ValueError(
                "a results.commit file already exists. Please remove "
                "before uploading.")

    data_dir = os.path.abspath(options.data_dir)
    if options.input_filename_metrics:
        with IOTools.open_file(options.input_filename_metrics) as inf:
            infiles = [x.strip() for x in inf.readlines() if x.strip()]
        if options.limit_metrics:
            infiles = infiles[:options.limit_metrics]
    else:
        E.info(f"collecting files to upload starting in {data_dir}")
        infiles = []
        for root, dirs, files in os.walk(data_dir, followlinks=options.follow_links):
            E.debug(f"working on {root}: dirs={len(dirs)}, files={len(files)}")
            # ignore first level (tools) (needs better check)
            depth = root[len(data_dir):].count(os.sep)
            if "benchmark.info" in files:
                if depth <= options.min_depth:
                    E.info(f"skipping - depth not high enough: {depth}")
                else:
                    infiles.append(os.path.join(root, "benchmark.info"))

            if options.limit_metrics and len(infiles) > options.limit_metrics:
                E.info(f"stopping collection as {len(infiles)} reached")
                break

    E.info("found a potential {} benchmark.info files to upload".format(len(infiles)))
    if options.output_filename_metrics:
        with IOTools.open_file(options.output_filename_metrics, "w") as outf:
            outf.write("\n".join(infiles) + "\n")

    # find all files of interest
    oldwd = os.getcwd()
    os.chdir(data_dir)
    upload_result(infiles, "results.commit", PARAMS)
    os.chdir(oldwd)

    E.stop()
    def test_job_should_fail_if_cancelled(self):

        if not P.will_run_on_cluster(P.get_parameters()):
            return

        if QUEUE_MANAGER == "slurm":
            self.assertRaises(
                OSError,
                P.run,
                "scancel $SLURM_JOB_ID",
                to_cluster=self.to_cluster)
        elif QUEUE_MANAGER == "sge":
            self.assertRaises(
                OSError,
                P.run,
                "qdel $SGE_TASK_ID",
                to_cluster=self.to_cluster)
    def test_job_should_fail_if_too_little_memory_required(self):

        outfile = os.path.join(self.work_dir, "out")

        if P.get_parameters()['os'] == 'Linux':
            self.assertRaises(
                OSError,
                P.run,
                "python -c 'import numpy; "
                "a = numpy.array(numpy.arange(0, {memory}), numpy.int8); "
                "out = open(\"{outfile}\", \"w\"); "
                "out.write(str(len(a)) + \"\\n\"); "
                "out.close()'".format(
                    memory=self.test_memory_size,
                    outfile=outfile),
                to_cluster=self.to_cluster,
                cluster_memory_ulimit=True,
                job_memory="{}G".format(
                    0.5 * self.test_memory_size / 10**9))
        else:
            pass
    def test_job_should_fail_if_too_little_memory_required_in_second_statement(self):

        outfile = os.path.join(self.work_dir, "out")
        infile = "arv=by_id/glon1-4zz18-3cbje7tmr0nitut/study_list.txt"

        if P.get_parameters()['os'] == 'Linux':
            self.assertRaises(
                OSError,
                P.run,
                "hostname > {outfile}; "
                "python -c 'import numpy; "
                "a = numpy.array(numpy.arange(0, {memory}), numpy.int8); "
                "out = open(\"{outfile}\", \"w\"); "
                "out.write(str(len(a)) + \"\\n\"); "
                "out.close()'".format(
                    memory=self.test_memory_size,
                    infile=infile,
                    outfile=outfile),
                to_cluster=self.to_cluster,
                cluster_memory_ulimit=True,
                job_memory="{}G".format(
                    0.5 * self.test_memory_size / 10**9))
        else:
            pass
def count_genes(infile, outfile):
    statement = '''wc -l %(infile)s > %(outfile)s''' #counts no. of lines; the %(...)s placeholders are string-substituted. Each line = transcript
    P.run(statement) #if on cbrg have to specify a job queue. Will run and save a single count file per chromosome
    #or count lines in Python instead, e.g. len(open('chr1.gtf').readlines())


@merge(count_genes, 'all.average')
def average(infiles, outfile): #each count file has 2 items e.g. "100 chr1.gtf" (no. of transcripts, original file name)
    total_counts = {} #create dictionary of chromosome -> count
    for infile in infiles:
        with open(infile) as inf:
            count, chrom = inf.read().strip().split() #.read = read the line, .strip = remove whitespace, .split = set the 2 variables
            total_counts[chrom] = int(count) #key = chrom, value = count (as integer)
    median = statistics.median(total_counts.values()) #calculate median from the integer counts
    with open(outfile, 'w') as outf:
        for key, value in total_counts.items():
            outf.write(f'{key}\t{value}\n') #write dictionary
        outf.write(f'Median\t{median}\n') #write median

if __name__ == "__main__":
    sys.exit(P.main(sys.argv)) #with this at the bottom the pipeline can be run from the command line, e.g. $python workflow.py make for the full pipeline

#If you want to set the input file using a .yml config, make a .yml file (saved in the same directory) containing:
#   gtf: /ifs/obds-training/lingf/obds/devel/OBDS_Training_Sep_2019/genes.gtf.gz

#Then under the import statements
P.get_parameters('workflow.yml')

#And have to change the decorators e.g.
@split(P.PARAMS['gtf'], 'chr*.gtf') #P.PARAMS is a dictionary: 'gtf' is the key and the value is the file path
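# A hedged sketch of the decorated task itself (the function name and the awk
# one-liner are illustrative, not from the original notes):
def split_gtf(infile, outfiles):
    '''split the gtf into one file per chromosome'''
    statement = '''zcat %(infile)s | awk '{print > $1".gtf"}' '''
    P.run(statement)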
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--restrict-regex",
        dest="restrict_regex",
        action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory",
        dest="data_directory",
        help="directory with sample data sets. This will override the default "
        "datadir in the configuration file and the environment variable "
        "DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory",
        dest="library_directories",
        action="append",
        help="directory with task library functions. Will be added to the "
        "built-in library and the one specified in the DAISY_TASKLIBRARY "
        "environment variable [%default]")

    parser.add_option("--always-mount",
                      dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.add_option("--keep-failed-temp",
                      dest="keep_failed_temp",
                      action="store_true",
                      help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)), "tasks",
                     "test_task_library.yml")
    ]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.load(raw_txt, Loader=yaml.FullLoader)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get("DAISY_TEST_DATADIR",
                                            test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.load(re.sub("DATADIR", data_directory, raw_txt),
                                    Loader=yaml.FullLoader)
            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
        ("tool", TestTool, map_tool_to_runner),
        ("metric", TestMetric, map_metric_to_runner)
    ]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()
    return failed
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 30 15:37:44 2019

@author: asmith
"""
import os
import sys
import numpy as np
import pandas as pd
from ruffus import *
from cgatcore import pipeline as P

# Put parameter YAML here
P.get_parameters('RNA_seq.yml')


@follows(mkdir('fastqc'))
@transform('*.fastq.gz', regex(r'(.*_.*).fastq.gz'), r'fastqc/\1_fastqc.zip')
def run_fastqc(infile, outfile):
    cmd = 'fastqc -q -t %(threads)s --nogroup %(infile)s --outdir fastqc/'
    P.run(cmd, job_queue=P.PARAMS['queue'], job_threads=P.PARAMS['threads'])


@follows(mkdir('bam'))
@collate('*.fastq.gz', regex(r'(.*)_[12].fastq.gz'), r'bam/\1.bam')
def align_fastq_paired(infiles, outfile):

    if not P.PARAMS['hisat_options']:
        P.PARAMS['hisat_options'] = ''
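    # The snippet is truncated here; a hedged sketch of how the body might
    # continue (the hisat_index parameter name and the samtools sort pipe are
    # assumptions, not from the original):
    fastq1, fastq2 = infiles
    statement = '''hisat2 -p %(threads)s %(hisat_options)s -x %(hisat_index)s
                   -1 %(fastq1)s -2 %(fastq2)s | samtools sort -o %(outfile)s -'''
    P.run(statement, job_queue=P.PARAMS['queue'], job_threads=P.PARAMS['threads'])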
Example #8
def main(argv=None):

    # Parse the options
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-p",
        "--params",
        "--args",
        dest="params",
        type="string",
        help="comma separated list of addtional parameter strings")

    parser.add_option("-m",
                      "--module",
                      dest="module",
                      type="string",
                      help="the full path to the module file",
                      default=None)

    parser.add_option("-i",
                      "--input",
                      dest="input_filenames",
                      type="string",
                      action="append",
                      help="input filename")

    parser.add_option("-o",
                      "--output-section",
                      dest="output_filenames",
                      type="string",
                      action="append",
                      help="output filename")

    parser.add_option("-f",
                      "--function",
                      dest="function",
                      type="string",
                      help="the module function",
                      default=None)

    parser.set_defaults(input_filenames=[], output_filenames=[], params=None)

    (options, args) = E.start(parser)

    # Check a module and function have been specified
    if not options.module or not options.function:
        raise ValueError("Both a function and Module must be specified")

    # initialize defaults
    P.get_parameters()

    # If a full path was given, add this path to the system path
    location = os.path.dirname(options.module)
    if location != "":
        sys.path.append(location)

    # Establish the module name, accommodating cases where the
    # .py extension has been included in the module name
    module_name = os.path.basename(options.module)
    if module_name.endswith(".py"):
        module_base_name = module_name[:-3]
    else:
        module_base_name = module_name

    # Import the specified module and map the specified function
    E.info("importing module '%s' " % module_base_name)
    E.debug("sys.path is: %s" % sys.path)

    module = importlib.import_module(module_base_name)
    try:
        function = getattr(module, options.function)
    except AttributeError as msg:
        raise AttributeError(
            "{}: unknown function, available functions are: {}".format(
                msg, ",".join([x for x in dir(module) if not x.startswith("_")])))

    if options.input_filenames and not options.input_filenames == ["None"]:
        infiles = options.input_filenames
    else:
        infiles = False

    if options.output_filenames and not options.output_filenames == ["None"]:
        outfiles = options.output_filenames
    else:
        outfiles = False

    # Parse the parameters into an array
    if options.params:
        params = [param.strip() for param in options.params.split(",")]
    else:
        params = False

    # deal with single file case
    if infiles and len(infiles) == 1:
        infiles = infiles[0]
    if outfiles and len(outfiles) == 1:
        outfiles = outfiles[0]

    # Make the function call
    if infiles and outfiles and params:
        function(infiles, outfiles, params)
    elif infiles and outfiles and not params:
        function(infiles, outfiles)
    elif params:
        function(params)
    else:
        raise ValueError(
            "Expecting infile+outfile+params or infile+outfile or params")

    E.stop()
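# A hypothetical invocation of this runner (script, module, function and file
# names below are illustrative only):
#
#   python run_function.py --module=/path/to/my_tasks.py --function=filter_bed \
#       --input=in.bed --output-section=out.bed --params=min_score,10
#
# which imports my_tasks.py and ends up calling
# filter_bed("in.bed", "out.bed", ["min_score", "10"]).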
Example #9
from cgatcore import pipeline as P
import os
import sqlite3
import re
import pandas as pd
import gzip

# Pipeline configuration
P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml",
    "pipeline.yml"
])

PARAMS = P.PARAMS

db = PARAMS['database']['url'].split('./')[1]

#####################################################
####              Helper functions               ####
#####################################################


def isPaired(files):
    '''Check whether input files are single or paired end
       Note: this is dependent on files having correct suffix'''

    paired = []

    for fastq in files:
        Fpair = re.findall(r".*\.fastq\.1\.gz", fastq)
        paired = paired + Fpair
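    # The snippet is truncated here; a hedged sketch of the remainder, treating
    # any *.fastq.1.gz match as evidence that the data are paired-end:
    if len(paired) > 0:
        return True
    return False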
#load modules
from ruffus import *
import os
import sys, re
import subprocess

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
# load options from the config file
import cgatcore.pipeline as P
P.get_parameters([
    "%s/pipeline.yml" % __file__[:-len(".py")], "../pipeline.yml",
    "pipeline.yml"
])
PARAMS = P.PARAMS

from pipeline_assembly import PipelineAssembly
from pipeline_annotate import PipelineAnnotate

#get all files within the directory to process
SEQUENCEFILES = ("*.fasta", "*.fasta.gz", "*.fasta.1.gz", "*.fasta.1", "*.fna",
                 "*.fna.gz", "*.fna.1.gz", "*.fna.1", "*.fa", "*.fa.gz",
                 "*.fa.1.gz", "*.fa.1", "*.fastq", "*.fastq.gz",
                 "*.fastq.1.gz", "*.fastq.1")

SEQUENCEFILES_REGEX = regex(
    r"(\S+).(fasta$|fasta.gz|fasta.1.gz|fasta.1|fna$|fna.gz|fna.1.gz|fna.1|fa$|fa.gz|fa.1.gz|fa.1|fastq$|fastq.gz|fastq.1.gz|fastq.1)"
)
Example #11
"""
#Capture-C pipeline exercise - files required: read1.fastq and read2.fastq, fragment.txt Hba-1 mouse globin locus

#fastqc code as before

#trimming using trim-galore - need to use collate decorator as need to put in reads as pair for trimming
#FLASH (Fast Length Adjustment of SHort reads) is a very fast and accurate software tool to merge paired-end reads from 
#next-generation sequencing experiments. FLASH is designed to merge pairs of reads when the original DNA fragments are shorter than 
#twice the length of reads. The resulting longer reads can significantly improve genome assemblies. 
#@collate is for collating pairs or groups; @merge is to merge all samples, irrespective of grouping

import sys
from cgatcore import pipeline as P
from ruffus import *

P.get_parameters('capturec_pipeline.yml')

@follows(mkdir('fastqc'))
@transform('*.fastq.gz', regex(r'(.*).fastq.gz'),r'fastqc/\1_fastqc.html')
def qc_reads(infile, outfile):
    statement = 'fastqc -q -t %(threads)s --nogroup %(infile)s --outdir fastqc'
    P.run(statement,
          job_queue=P.PARAMS['queue'],
          job_memory=P.PARAMS['memory'],
          job_threads=P.PARAMS['threads'])

@follows(mkdir('trim'))
@collate('*.fastq.gz', regex(r'(.*)_[1-2].fastq.gz'), r'trim/\1_1_val_1.fq.gz')
def trim(infiles, outfile):
    ''' Trim fastq files'''
    fq1, fq2 = infiles
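    # The snippet is truncated here; a hedged sketch of the trim_galore call
    # described in the notes above (output naming matches the *_1_val_1.fq.gz
    # pattern trim_galore uses for paired reads):
    statement = '''trim_galore --paired --gzip --output_dir trim %(fq1)s %(fq2)s'''
    P.run(statement,
          job_queue=P.PARAMS['queue'],
          job_threads=P.PARAMS['threads'])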
'''
run cellranger on 10X fastq files

'''

import gzip
import re
import pandas as pd
from ruffus import *
from cgatcore import pipeline as P
import sys

params = P.get_parameters("pipeline_cellranger.yml")  # define params here
samples = pd.read_csv("cellranger_samples.csv")
samples.set_index('name', inplace=True)
print(samples)

@follows(mkdir("count"))
@transform('data/*/.sample', regex(r'data/(.+)/.sample'), r'count/\1/outs/filtered_feature_bc_matrix.h5')
def cellranger_count(infile, outfile):
    # python module looking for regular expression, group(1) is equiv to '\1'
    sampleid = re.search('data/(.+)/.sample', infile).group(1)
    print(sampleid)
    fastqs = samples['fastqs'][sampleid]
    cellnumber = samples['cells'][sampleid]
    chemistry = samples['chemistry'][sampleid]

    statement = '''cellranger count
    --id=%(sampleid)s
    --transcriptome=%(cellrangercount_transcriptome)s
    --fastqs=%(fastqs)s
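    --expect-cells=%(cellnumber)s
    --chemistry=%(chemistry)s'''
    # The two flags above and the P.run call are a hedged completion of the
    # truncated statement; the flag names follow cellranger's CLI, and any
    # job_* resource options used in the original are unknown here.
    P.run(statement)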
Example #13
import cgatcore.experiment as E
import cgatcore.iotools as IOTools
import cgatcore.database as Database
import cgat.FastaIterator as FastaIterator
import numpy as np
from PipelinePrimerDesign import PrimerSet

###################################################
###################################################
###################################################
## Pipeline configuration
###################################################

# load options from the config file
import cgatcore.pipeline as P
P.get_parameters(["pipeline.yml"])

PARAMS = P.PARAMS


###################################################
###################################################
###################################################

def readIdentifiers(identifiers):
    '''
    return list of identifiers from file
    '''
    ids = [x.strip("\n") for x in IOTools.open_file(identifiers).readlines()]
    return ids
Example #14
                        'pipeline_docs', 'themes')
logopath = os.path.join(themedir, "cgat_logo.png")

################################################################
# Import pipeline configuration from pipeline.yml in the current
# directory and the common one.

# Path where code for pipelines is stored
pipelinesdir = os.path.dirname(cgatpipelines.__file__)

# The default configuration file - 'inifile' is read by
# sphinx-report.
inifile = os.path.join(os.path.dirname(cgatpipelines.__file__),
                       'configuration', 'pipeline.yml')

PARAMS = P.get_parameters([inifile, "pipeline.yml"])

# Definition now part of cgatReport
# def setup(app):
#     app.add_config_value('PARAMS', {}, True)

################################################################
################################################################
################################################################
# The pipeline assumes that sphinxreport is called within the
# working directory. If the report is in a separate build directory,
# change the paths below.
#
# directory with export directory from pipeline
# This should be a directory in the build directory - you can
# link from here to a directory outside the build tree, though.
Example #15
    merge,
    originate,
    collate,
    regex,
    add_inputs,
    active_if,
)
from cgatcore.iotools import zap_file, touch_file
from utils import is_none, is_on

##################
# Pipeline setup #
##################

# Read in parameter file
P.get_parameters("config_rna.yml")


# Small edits to config to enable cluster usage
P.PARAMS["cluster_queue_manager"] = P.PARAMS.get("pipeline_cluster_queue_manager")
P.PARAMS["conda_env"] = os.path.basename(os.environ["CONDA_PREFIX"])

# Make sure that params dict is typed correctly
for key in P.PARAMS:
    if is_none(P.PARAMS[key]):
        P.PARAMS[key] = None
    elif is_on(P.PARAMS[key]):
        P.PARAMS[key] = True

# Global variables
CREATE_BIGWIGS = P.PARAMS.get("run_options_bigwigs")
Example #16
File: farm.py  Project: pythseq/cgat-core
def main(argv=None):

    parser = get_option_parser()

    (options, args) = E.start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:

        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd,
                                  shell=True,
                                  stdin=sys.stdin,
                                  stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    P.get_parameters()
    P.start_session()

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info(" working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1, options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex), 0, options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex), 0, options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        statements = [build_command(x) for x in data]
        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.stop()
            sys.exit(0)

        P.run(statements)
    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests), tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:

            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                    break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests, options.stdout,
                                              options)

        # deal with logfiles : combine them into a single file
        rr = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = iotools.open_file(rr.groups()[0], "a")
            ResultBuilderLog()([(x[0], "%s.log" % x[0])
                                for x in started_requests], logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".png"):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(filenames), filename))

                outfile = iotools.open_file(options.output_pattern % filename,
                                            "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests), len(started_requests) -
            len(failed_requests), len(failed_requests), niterations))

    E.stop()
Example #17
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 12 10:16:38 2020

@author: sumeet
"""

from ruffus import *
from cgatcore import pipeline as P
import sys

#Import parameters
Params = P.get_parameters("pipeline_rna_seq.yml")


#This part is the fastq to generate fastqc html files
@transform('*.fastq.gz', suffix('.fastq.gz'), '_fastqc.html')
def fastqc(infile, outfile):
    statement = '''fastqc %(infile)s > %(outfile)s.log'''
    P.run(statement)


#Next want to do multiqc to make a nice report containing all the fastqc from each fastq file
#We want to merge the input from fastqc files into multiqc report
#First use decorator @follows to make a directory for output
#use the input from fastqc in the merge function, output will be multiqc.html report
#define multiqc function, need multiple infiles
#run this like  . - to look in cd
#Name the output file usin -n and specify output directory using -o
@follows(mkdir('multiqc_reports'))
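# A hedged sketch of the merge step described in the comments above (the task
# name and exact multiqc options are illustrative, not from the original):
@merge(fastqc, 'multiqc_reports/multiqc_report.html')
def multiqc(infiles, outfile):
    statement = '''multiqc . -n multiqc_report.html -o multiqc_reports'''
    P.run(statement)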
Example #18
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ChIP-seq pipeline
"""
#copy files from shared to my directory. They are already symbolic links so have to copy
#cp -d /ifs/obds-training/apr20/shared/week3/chipseq/* .
#will be rsyncing script
#rsync -a /Users/rhodgson/GitHub/OBDS_Training_Apr_2020/chipseq_pipeline.* [email protected]:/ifs/obds-training/apr20/rose/pipelines/chipseqpipeline

import sys
from ruffus import *
from cgatcore import pipeline as P

params = P.get_parameters("chipseq_pipeline.yml")

#fastq input
#Do fastqc of the files and put them in a new directory. Name the file using the first part of the regex
#--nogroup is an option for fastqc - where you have plots, it doesn't average over bases; gives you every base
#params["q"] in the param file - put the queue name there so the pipeline can be moved from one queue to another


@follows(mkdir("fastqc"))
@transform("*.fastq.gz", regex(r"(.*).fastq.gz"), r"fastqc/\1_fastqc.html")
def fastqc(infile, outfile):
    statement = "fastqc --nogroup -o fastqc %(infile)s > %(outfile)s.log"
    P.run(statement, job_queue=params["q"])


#Next bit -multiqc
Example #19
.. glossary::


Code
====

"""
import sys
import os
import glob
from pathlib import Path
from ruffus import *
from cgatcore import pipeline as P

# load options from the config file
PARAMS = P.get_parameters(
    ["%s/pipeline.yml" % os.path.splitext(__file__)[0], "pipeline.yml"])

#get all files within the directory to process
SEQUENCEFILES = ("*fastq.gz")

SEQUENCEFILES_REGEX = regex(r"(\S+).(fastq.gz)")

scriptsdir = os.path.dirname(os.path.abspath(__file__))
scriptsdir = P.snip(scriptsdir, "pipelines") + "scripts"
PARAMS["scriptsdir"] = scriptsdir

reportdir = os.path.dirname(os.path.abspath(__file__))
reportdir = os.path.join(reportdir, "pipeline_docs", "Rmd")
PARAMS["reportdir"] = reportdir

########################################################
Example #20
##############################################
# Simple pipeline to run prokka on multiple
# assemblies
##############################################
##############################################
##############################################

from ruffus import *
import cgatcore.pipeline as P
import cgatcore.iotools as IOTools
import os
import sys
import collections
import cgatcore.experiment as E

PARAMS = P.get_parameters(filenames=["pipeline.yml"])

##############################################
##############################################
##############################################


@follows(mkdir("annotations.dir"))
@transform("*.fna.gz", regex("(\S+).fna.gz"), r"annotations.dir/\1/\1.tsv")
def runProkka(infile, outfile):
    '''
    run prokka annotations - 
    very basic with no parameterisation
    '''
    newdirname = os.path.join("annotations.dir", P.snip(infile, ".fna.gz"))
    job_memory = PARAMS["prokka_memory"]
    def setUp(self):
        BaseTest.setUp(self)
        P.get_parameters()
"""Salmon alevin"""

import re
import pandas as pd
from ruffus import *
from cgatcore import pipeline as P
import sys
import glob
import os

params = P.get_parameters("project_alevin.yml")  # define params here
samples = pd.read_csv("cellranger_samples.csv")
samples.set_index('name', inplace=True)
print(samples)


def get_gex_fastq(fastq_dir):
    '''Return the paired GEX fastq1/fastq2 files found in fastq_dir.'''
    fastq1_pattern = params["pattern"]["fastq1"]
    fastq1_glob = f"{fastq_dir}/*{fastq1_pattern}*"
    fastq1 = glob.glob(fastq1_glob)
    if len(fastq1) == 0:
        raise OSError(f"No file matched pattern: {fastq1_glob}")
    fastq2 = [
        file.replace(params["pattern"]["fastq1"], params["pattern"]["fastq2"])
        for file in fastq1
    ]
    for file in fastq2:
        if not os.path.exists(file):
            raise OSError(f"Paired file not found: {file}")
    return {'fastq1': fastq1, 'fastq2': fastq2}
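# A hedged sketch of how a task might consume get_gex_fastq (the data/*/.sample
# sentinel layout, the salmon_index/t2g_map/threads parameter names and the
# alevin flags are assumptions, not from the original):
@follows(mkdir('alevin'))
@transform('data/*/.sample', regex(r'data/(.+)/.sample'),
           r'alevin/\1/alevin/quants_mat.gz')
def salmon_alevin(infile, outfile):
    sample_dir = os.path.dirname(infile)
    sampleid = os.path.basename(sample_dir)
    fastqs = get_gex_fastq(sample_dir)
    fastq1 = ' '.join(fastqs['fastq1'])
    fastq2 = ' '.join(fastqs['fastq2'])
    statement = '''salmon alevin -l ISR --chromiumV3
                   -i %(salmon_index)s --tgMap %(t2g_map)s -p %(threads)s
                   -1 %(fastq1)s -2 %(fastq2)s
                   -o alevin/%(sampleid)s'''
    P.run(statement)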
Example #23
def connect():
    '''connect to database.
    Use this method to connect to additional databases.
    Returns a database connection.
    '''
    dbh = sqlite3.connect(PARAMS["database_name"])

    return dbh


#########################################################################
P.get_parameters(
    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "../pipeline.yml",
     "pipeline.yml"],
    defaults={
        'paired_end': False},
    only_import=__name__ != "__main__")

PARAMS = P.PARAMS

mapping.PARAMS = PARAMS
mappingqc.PARAMS = PARAMS
exome.PARAMS = PARAMS
#########################################################################


#########################################################################
# Load manual annotations
#########################################################################
'''
RNAseq pipeline
process fastq file into count files/matrices
'''

from ruffus import *
from cgatcore import pipeline as P
import sys

params = P.get_parameters("rnaseq_pipeline.yml")


@follows(mkdir("fastqc"))
@transform("*.fastq.gz", regex(r'(.*).fastq.gz'), r'fastqc/\1_fastqc.html')
def fastqc(infile, outfile):
    statement = "fastqc --nogroup -o fastqc %(infile)s "
    P.run(statement,
          job_queue='all.q',
          job_threads=1,
          job_memory='2G',
          job_condaenv='obds-py3')


@merge(fastqc, r'fastqc/multiqc_report.html')
def multiqc(infiles, outfile):
    statement = "multiqc -f -n %(outfile)s fastqc"
    P.run(statement,
          job_queue='all.q',
          job_threads=1,
          job_memory='2G',
          job_condaenv='obds-py3')
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n",
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option("-l",
                      "--link",
                      dest="link",
                      action="store_true",
                      help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = workflow.build_tool_functions(map_tool_to_runner, config)

    config_files = workflow.expand_globs(config["input"])
    input_combos = workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions, input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        result_dir = os.path.basename(os.path.join(taskf.__name__ + ".dir"))
        new_data.append((result_dir, taskf))
        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                pass

        counts = collections.Counter(targets)
        max_count = max(counts.values())
        max_count_items = [
            x for x, y in list(counts.items()) if y == max_count
        ]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write("\t".join(map(str, (old_dir, new_dir,
                                                 max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()
Example #26
queue: all.q
threads: 12
memory: 8G
bowtie2:
    options:
    ref: /ifs/mirror/genomes/bowtie/mm10
picard:
    ref: /ifs/mirror/genomes/plain/mm10.fasta
"""
import sys
import gzip
from cgatcore import pipeline as P
from ruffus import *

P.get_parameters('chipseq_pipeline.yml')


@follows(mkdir('fastqc'))  #same as before, can run independently of other processes
@transform('*.fastq.gz', regex(r'(.*).fastq.gz'), r'fastqc/\1_fastqc.html')
def qc_reads(infile, outfile):
    statement = 'fastqc -q -t %(threads)s --nogroup %(infile)s --outdir fastqc'
    P.run(statement,
          job_queue=P.PARAMS['queue'],
          job_memory=P.PARAMS['memory'])


@follows(mkdir('sam'))
@collate('*.fastq.gz', regex(r'(.*)_[1-2].fastq.gz'), r'sam/\1.sam')
def align_reads(infiles, outfile):
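    # The snippet is truncated here; a hedged sketch of the bowtie2 call using
    # the reference from the yaml above (cgat-core flattens nested yaml keys,
    # so the index is assumed to be available as bowtie2_ref):
    fq1, fq2 = infiles
    statement = '''bowtie2 -p %(threads)s -x %(bowtie2_ref)s
    -1 %(fq1)s -2 %(fq2)s -S %(outfile)s'''
    P.run(statement,
          job_queue=P.PARAMS['queue'],
          job_threads=P.PARAMS['threads'],
          job_memory=P.PARAMS['memory'])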
Example #27
import cgatcore.experiment as E
import cgatcore.iotools as iotools

import cgatpipelines.tasks.motifs as motifs
import cgatpipelines.tasks.tracks as tracks

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
from cgatcore import pipeline as P
P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml",
    "pipeline.yml"
], defaults={'annotations_dir': ""})

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peek_parameters(PARAMS["annotations_dir"], "genesets")

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = tracks.Sample
@author: rhodgson
"""

#This pipeline is to run the rnaseq pipeline starting from the fastq files
#I think I will create my github repository here

#rsync -a /Users/rhodgson/GitHub/OBDS_Training_Apr_2020/rnaseq_pipeline.py [email protected]:/ifs/obds-training/apr20/rose/pipelines/rnaseqpipeline
#Now import section
import sys
from ruffus import *
from cgatcore import pipeline as P
import gzip

#write parameters
Params = P.get_parameters("pipeline.yml")

#First going to do the fastqc on the fastq files
#This will create fastqc.html files and fastqc.zip
#It also feeds everything into a fastqc.zip file


@transform('*.fastq.gz', suffix('.fastq.gz'), '_fastqc.html')
def fastqqc(infile, outfile):
    statement = '''fastqc %(infile)s > %(outfile)s.log'''
    P.run(statement)


#main - will allow to run from cgat core - always goes to the end of the file

if __name__ == "__main__":
Example #29
#Import everything we need
#Working directory: /ifs/obds-training/apr20/rose/pipelines/pseudoalignment
#Make a directory where we want the files - made a symbolic link to fastq files from /ifs/obds-training/apr20/exercises/rnaseq/
#will be rsyncing script:
#rsync -a /Users/rhodgson/GitHub/obdsRNAseqpipeline/pseudoalignmentRH* [email protected]:/ifs/obds-training/apr20/rose/pipelines/pseudoalignment
#We will be using kallisto

from ruffus import *
from cgatcore import pipeline as P
import sys

#So the first thing we still want to do is the QC of the fastq files (including multiQC)
#see file pipeline_rna_seq.py for notes on these functions

#Import parameters - sure this will change
Params = P.get_parameters("pseudoalignmentRH.yml")

#Fastqc - just added an output folder here
#as this is a transform thing, we needed to use a regular expression to put these reports into a new folder


@follows(mkdir('fastqc_reports'))
@transform('*.fastq.gz', regex(r'(.*).fastq.gz'),
           r'fastqc_reports/\1_fastqc.html')
def fastqc(infile, outfile):
    statement = '''fastqc --outdir fastqc_reports  %(infile)s > %(outfile)s.log'''
    P.run(statement)


#Multiqc -hmm
@follows(mkdir('multiqc_reports'))
Example #30
fastq matches the characters fastq literally (case sensitive)
. matches any character (except for line terminators)
gz matches the characters gz literally (case sensitive)
Global pattern flags
g modifier: global. All matches (don't return after first match)
m modifier: multi line. Causes ^ and $ to match the begin/end of each line (not only begin/end of string)

"""

import gzip
from ruffus import *
from cgatcore import pipeline as P
import sys
import statistics

P.get_parameters('rnaseq_pipeline.yaml') #.yml for cbrg, .yaml for cgat

@follows(mkdir('fastqc')) #make fastqc folder before running code below
@transform('*.fastq.gz', regex(r'(.*).fastq.gz'),r'fastqc/\1_fastqc.html') #find all fastq.gz files, save to fastqc folder and use name_fastqc.html
def fastqc(infile, outfile): #next time call functions run_fastqc etc so they don't get confused with other names!
    statement = '''fastqc -q -t %(threads)s --nogroup %(infile)s --outdir fastqc''' #need to direct output
    P.run(statement, job_queue=P.PARAMS['queue'], job_threads=P.PARAMS['threads']) #only for setting run parameters eg memory, threads, queue

@follows(mkdir('sam')) #make sam folder before running code below
@collate('*.fastq.gz', regex(r'(.*)_[1-2].fastq.gz'), r'sam/\1.sam') #look for fastq.gz files with same name ending in _1 or _2, output as sam file
def hisat2(infiles, outfile):
    read1, read2 = infiles #2 infiles
    #hisat options and reference are set in the yaml file; keep comments outside the statement string
    statement = '''hisat2 -p %(threads)s %(hisat_option)s -x %(hisat_ref)s
    -1 %(read1)s -2 %(read2)s -S %(outfile)s'''
    P.run(statement, job_queue=P.PARAMS['queue'], job_threads=P.PARAMS['threads'], job_memory='8G')