Exemplo n.º 1
0
def main():
    """Parse arguments, build the pipeline config and submit the pipeline.

    On top of the arguments inherited from the default parser, this
    pipeline requires ``--sample-bam-map``: a YAML file mapping sample
    names (keys) to BAM files (values). The usual sample config is not
    supported here. Relative BAM paths are interpreted relative to the
    directory containing the map file and are stored as absolute paths.

    Exits with status 1 if the sample config is given, if the output
    directory already exists or if a listed BAM file is missing.
    """

    default_parser = default_argparser(CFG_DIR)
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                     parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('--sample-bam-map', required=True,
                        help="Yaml file listing BAM file input (value)"
                        " per sample (key; reused for output filenames here)")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)
    aux_logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    # FIXME how to remove the arguments from argparser in the first place?
    # Explicit check instead of assert: asserts vanish under "python -O".
    if args.sample_cfg:
        logger.fatal("Usual sample config not supported."
                     " Replaced in this pipeline with --sample-bam-map")
        sys.exit(1)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # turn arguments into cfg_dict (gets merged with other configs late)
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = dict()
    cfg_dict['samples'] = dict()

    with open(args.sample_bam_map) as fh:
        sample_bam_map = dict(yaml.safe_load(fh))

    map_dir = os.path.dirname(args.sample_bam_map)
    for sample, bam in sample_bam_map.items():
        # Relative paths are relative to the map file, so they have to be
        # resolved *before* testing for existence; otherwise the test
        # would run against the current working directory.
        if not os.path.isabs(bam):
            bam = os.path.abspath(os.path.join(map_dir, bam))
            # only the value changes, so updating while iterating is safe
            sample_bam_map[sample] = bam
        if not os.path.exists(bam):
            logger.fatal("BAM file %s for sample %s does not exist",
                         bam, sample)
            sys.exit(1)
    cfg_dict['sample_bam_map'] = sample_bam_map

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args, cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Exemplo n.º 2
0
def LDRefine(args):
	"""Prepare the LD-based genotype refinement pipeline.

	Requires that a ``varCall`` directory exists in the current working
	directory. Creates the ``LDRefine`` directory if needed, loads the
	cluster config that lives next to the invoked script and hands
	everything over to a PipelineHandler, which sets up the environment
	and is submitted with ``no_run=True``.

	Args:
		args: parsed arguments; the attributes read here are
			``num_record_per_file``, ``num_overlap_record`` and
			``userCfg``.

	Raises:
		AssertionError: if ``varCall`` is missing or the record counts
			are inconsistent.
	"""
	logger.info("Preparing LD-based genotype refinement pipeline...")
	print_parameters_given(args)

	assert os.path.exists("varCall"), "Cannot detect the directory of varaiant detection.\nWEScall varCall has to be run before LD-based genotype refinement."

	# The messages below state the bound that is actually being tested.
	assert args.num_record_per_file > 0, "Number of records per file has to be larger than 0!"

	assert args.num_overlap_record >= 0, "Number of overlapping records must not be negative!"

	assert args.num_record_per_file>args.num_overlap_record, "Number of records per file has to be larger than the number of overlapping records."

	# exist_ok avoids the race between an existence test and mkdir
	os.makedirs("LDRefine", exist_ok=True)

	LDRefine_cfg = {
		"num_record_per_file": args.num_record_per_file,
		"num_overlap_record": args.num_overlap_record,
	}

	# Resources are located relative to the invoked script.
	PIPELINE_BASEDIR = os.path.dirname(sys.argv[0])

	path_cluster_cfg=os.path.join(PIPELINE_BASEDIR,"cfg","cluster.LDRefine.yaml")

	# has to merge cluster
	with open(path_cluster_cfg, 'r') as fh:
		cluster_cfg = yaml.safe_load(fh)

	# turn arguments into user_data that gets merged into pipeline config
	#
	# generic data first
	user_data = dict()
	user_data['cluster'] = cluster_cfg
	user_data['LDRefine'] = LDRefine_cfg

	pipeline_handler = PipelineHandler(
		"WEScall_LDRefine", PIPELINE_BASEDIR,
		"LDRefine",user_data,
		Snakefile="pipelines/LDRefine/Snakefile.beagle."+get_seq_type_from_user_cfg(args.userCfg),
		cluster_cfgfile=path_cluster_cfg,
		user_cfgfile=args.userCfg)

	pipeline_handler.setup_env()
	pipeline_handler.submit(no_run=True)
Exemplo n.º 3
0
def varCall(args):
	"""Prepare the variant calling pipeline in ./varCall.

	Runs the validation helpers, creates ``./varCall/data``, copies the
	sample list there as ``samples.index`` and derives a placeholder
	pedigree file ``samples.ped`` from its first column. The pipeline
	environment is then set up and submitted with ``no_run=True``.

	Args:
		args: parsed arguments; the attributes read here are
			``sample_list`` and ``userCfg``.

	Raises:
		OSError: if the data directory cannot be created or the sample
			list cannot be copied or read.
	"""
	logger.info("Preparing varCall pipeline...")
	print_parameters_given(args)

	logger.info("Validating sample index ...")
	validate_sample_list_file(args)

	logger.info("Validating user config file ...")
	validate_user_cfg(args)

	logger.info("Checking existence of essenstial resource files...")
	check_resource_files_for_varCall()

	logger.info("Checking dependencies...")
	check_dependencies()

	pipeline_handler = PipelineHandler(
		"WEScall_varCall",
		PIPELINE_BASEDIR,
		Snakefile="pipelines/varCall/Snakefile."+get_seq_type_from_user_cfg(args.userCfg),
		outdir="./varCall",
		user_data="",
		user_cfgfile=args.userCfg,
		cluster_cfgfile=os.path.join(CFG_DIR, "cluster.varCall.yaml")
		)

	# os.makedirs reports failures, whereas a shelled-out "mkdir -p"
	# would fail silently.
	os.makedirs("./varCall/data", exist_ok=True)
	shutil.copy2(args.sample_list,"./varCall/data/samples.index")

	# automatically generate the pedigree file for the user
	# Since WEScall does not utilize pedigree information, the pedigree file
	# is just a formality so as to let the pipeline run
	with open(args.sample_list) as f_in, open("./varCall/data/samples.ped","w") as f_out:
		for line in f_in:
			record = line.strip().split("\t")
			if not record[0]:
				# blank line: would yield a pedigree row with empty IDs
				continue
			f_out.write("{smp}\t{smp}\t{smp}\t0\t0\n".format(smp=record[0]))

	pipeline_handler.setup_env()
	pipeline_handler.submit(no_run=True)
Exemplo n.º 4
0
def main():
    """Parse arguments, validate the input and submit the pipeline.

    Input is either a sample config file, or a sample name combined with
    fastq files or a pre-processed BAM (``--proc-bam``). WES and targeted
    runs additionally require an existing bed file. An injected BAM is
    symlinked into the output directory once the environment is set up.

    Exits with status 1 on invalid input or if the output directory
    already exists.
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('-t',
                        "--seqtype",
                        required=True,
                        choices=['WGS', 'WES', 'targeted'],
                        help="Sequencing type")
    parser.add_argument('-l',
                        "--bed",
                        help="Bed file listing regions of interest."
                        " Required for WES and targeted sequencing.")
    parser.add_argument('-D',
                        '--dont-mark-dups',
                        action='store_true',
                        help="Don't mark duplicate reads")
    # raw bam not possible because the pipeline splits on the fly into chromosomes
    parser.add_argument(
        '--proc-bam',
        help="Advanced: Injects processed BAM (overwrites fq options)."
        " WARNING: reference and pre-processing need to match pipeline requirements"
    )
    parser.add_argument('--bam-only',
                        action='store_true',
                        help="Don't call variants, just process BAM file")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample, args.proc_bam]):
            logger.fatal("Config file overrides fastq and sample arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)

    else:  # no sample config, so input is either fastq or existing bam
        samples = dict()

        if not args.sample:
            logger.fatal("Need sample name if not using config file")
            sys.exit(1)

        if args.proc_bam:
            # Explicit checks rather than asserts, which are stripped
            # when Python runs with -O.
            if args.fq1:
                logger.fatal("BAM injection overwrites fastq arguments")
                sys.exit(1)
            if not os.path.exists(args.proc_bam):
                logger.fatal("BAM file %s does not exist", args.proc_bam)
                sys.exit(1)

            # an injected BAM has no readunits of its own
            readunits = dict()
            samples[args.sample] = []

        elif args.fq1:

            readunits = get_readunits_from_args(args.fq1, args.fq2)
            # all readunits go into this one sample specified on the command-line
            samples[args.sample] = list(readunits.keys())

        else:
            logger.fatal(
                "Need at least one fastq files as argument if not using config file"
            )
            sys.exit(1)

    if args.seqtype in ['WES', 'targeted']:
        if not args.bed:
            logger.fatal(
                "Analysis of exome and targeted sequence runs requires a bed file"
            )
            sys.exit(1)
        if not os.path.exists(args.bed):
            # report the bed file itself, not the sample config
            logger.fatal("Bed file %s does not exist", args.bed)
            sys.exit(1)

    # turn arguments into cfg_dict (gets merged with other configs late)
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    cfg_dict['seqtype'] = args.seqtype
    cfg_dict['intervals'] = os.path.abspath(
        args.bed
    ) if args.bed else None  # always safe, might be used for WGS as well
    cfg_dict['mark_dups'] = not args.dont_mark_dups
    cfg_dict['bam_only'] = args.bam_only

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()

    # Inject existing BAM by symlinking (everything upstream is temporary anyway)
    # WARNING: filename has to match definition in Snakefile!
    if args.proc_bam:
        target = os.path.join(args.outdir, "out", args.sample,
                              "{}.bwamem.lofreq".format(args.sample))
        if cfg_dict['mark_dups']:
            target += ".dedup"
        target += ".lacer.bam"
        os.makedirs(os.path.dirname(target))
        os.symlink(os.path.abspath(args.proc_bam), target)

    pipeline_handler.submit(args.no_run)
Exemplo n.º 5
0
def main():
    """Parse arguments, validate the input and submit the pipeline.

    Input is either a sample config file, or a sample name plus fastq
    files given on the command line. Every readunit must be paired-end
    and the reference genome (``--reffa``) must exist.

    Exits with status 1 on invalid input or if the output directory
    already exists.
    """

    default_parser = default_argparser(CFG_DIR, allow_missing_cfgfile=True)
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                     parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument(
        '-1',
        "--fq1",
        nargs="+",
        help="FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '-2',
        "--fq2",
        nargs="+",
        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s',
                        "--sample",
                        help="Sample name. Collides with --sample-cfg.")
    parser.add_argument(
        '-r', "--reffa", required=True,
        help="Reference genome")  # FIXME create local copy for indexing?
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)

    else:  # no sample config, so input has to be fastq
        samples = dict()

        if not args.sample:
            logger.fatal("Need sample name if not using config file")
            sys.exit(1)

        if args.fq1:
            readunits = get_readunits_from_args(args.fq1, args.fq2)
            # all readunits go into this one sample specified on the command-line
            samples[args.sample] = list(readunits.keys())

        else:
            logger.fatal(
                "Need at least one fastq files as argument if not using config file"
            )
            sys.exit(1)

    # Explicit checks rather than asserts (stripped under "python -O");
    # .get() also covers readunits without any 'fq2' key.
    for ru in readunits.values():
        if not ru.get('fq2'):
            logger.fatal(
                "FastQ R2 missing, but assemblers assume paired-end reads")
            sys.exit(1)

    if not os.path.exists(args.reffa):
        logger.fatal("Reference genome %s does not exist", args.reffa)
        sys.exit(1)

    # turn arguments into cfg_dict (gets merged with other configs late)
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    # FIXME only works because yaml missing and thus not overwritten
    cfg_dict['references'] = {'genome': os.path.abspath(args.reffa)}

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Exemplo n.º 6
0
def main():
    """Parse arguments, validate the input and submit the pipeline.

    The pipeline works on exactly two samples, "control" and "treatment".
    They come either from a sample config file, or from per-sample fastq
    (or BAM) arguments given on the command line. Readunits have to be
    either all paired-end or all single-end.

    Exits with status 1 on invalid input or if the output directory
    already exists.

    Raises:
        NotImplementedError: if a control or treatment BAM was given;
            BAM injection is not implemented.
    """

    default_parser = default_argparser(CFG_DIR)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument(
        "--control-fq1",
        nargs="+",
        help="Control FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '--control-fq2',
        nargs="+",
        help=
        "Control FastQ file/s (if paired) (gzip only). See also --control-fq1")
    parser.add_argument(
        "--treatment-fq1",
        nargs="+",
        help="Treatment FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '--treatment-fq2',
        nargs="+",
        help=
        "Treatment FastQ file/s (if paired) (gzip only). See also --treatment-fq1"
    )
    parser.add_argument(
        '--control-bam',
        help="Advanced: Injects control BAM (overwrites control-fq options)."
        " WARNING: reference and postprocessing need to match pipeline requirements"
    )
    parser.add_argument(
        '--treatment-bam',
        help="Advanced: Injects treatment BAM (overwrites treatment-fq options)."
        " WARNING: reference and postprocessing need to match pipeline requirements"
    )
    choices = ['bwa-aln', 'bwa-mem']
    default = choices[0]
    parser.add_argument('--mapper',
                        default=default,
                        choices=choices,
                        help="Mapper to use. One of {}. Default {}".format(
                            ",".join(choices), default))

    choices = ['TF', 'histone-narrow', 'histone-broad']  #, 'open-chromatin']
    parser.add_argument('-t',
                        '--peak-type',
                        required=True,
                        choices=choices,
                        help="Peak type. One of {}".format(",".join(choices)))
    parser.add_argument('--skip-macs2',
                        action='store_true',
                        help="Don't run MACS2")
    parser.add_argument('--skip-dfilter',
                        action='store_true',
                        help="Don't run DFilter")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([
                args.control_fq1, args.control_fq2, args.treatment_fq1,
                args.treatment_fq2, args.control_bam, args.treatment_bam
        ]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        # This check has to sit at this level: nested under the branch
        # above it would come after sys.exit() and never run.
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        samples = dict()

        # Each sample is validated on its own: it needs either a BAM or
        # at least fq1, independently of what the other sample provides.
        if args.control_bam:
            if not os.path.exists(args.control_bam):
                logger.fatal("Control BAM %s does not exist",
                             args.control_bam)
                sys.exit(1)
            control_readunits = dict()
            samples["control"] = []
        elif args.control_fq1:
            control_readunits = get_readunits_from_args(
                args.control_fq1, args.control_fq2)
            samples["control"] = list(control_readunits.keys())
        else:
            logger.fatal("Need at least fq1 or a BAM file for control"
                         " without config file")
            sys.exit(1)

        if args.treatment_bam:
            if not os.path.exists(args.treatment_bam):
                logger.fatal("Treatment BAM %s does not exist",
                             args.treatment_bam)
                sys.exit(1)
            treatment_readunits = dict()
            samples["treatment"] = []
        elif args.treatment_fq1:
            treatment_readunits = get_readunits_from_args(
                args.treatment_fq1, args.treatment_fq2)
            samples["treatment"] = list(treatment_readunits.keys())
        else:
            logger.fatal("Need at least fq1 or a BAM file for treatment"
                         " without config file")
            sys.exit(1)

        readunits = dict(control_readunits)
        readunits.update(treatment_readunits)

    # Relevant for the config file route; checked explicitly because
    # asserts are stripped under "python -O".
    if sorted(samples) != sorted(["control", "treatment"]):
        logger.fatal("Expected exactly the samples 'control' and"
                     " 'treatment' but got: %s", ", ".join(sorted(samples)))
        sys.exit(1)

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    # either paired end or not, but no mix allowed
    has_fq2 = [bool(ru.get('fq2')) for ru in readunits.values()]
    if all(has_fq2):
        cfg_dict['paired_end'] = True
    elif not any(has_fq2):
        cfg_dict['paired_end'] = False
    else:
        logger.fatal("Mixed paired-end and single-end not allowed")
        sys.exit(1)
    cfg_dict['peak_type'] = args.peak_type
    cfg_dict['mapper'] = args.mapper
    cfg_dict['skip_macs2'] = args.skip_macs2
    cfg_dict['skip_dfilter'] = args.skip_dfilter

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()

    # NOTE(review): this is only raised after setup_env() has run;
    # consider failing before the environment is set up.
    if args.control_bam or args.treatment_bam:
        raise NotImplementedError("BAM injection not implemented yet")

    pipeline_handler.submit(args.no_run)
Exemplo n.º 7
0
def main():
    """Parse arguments, validate the input and submit the pipeline.

    The pipeline works on exactly two samples, "normal" and "tumor".
    They come either from a sample config file, or from per-sample fastq
    or BAM arguments given on the command line. WES and targeted runs
    additionally require an existing bed file. Injected BAMs are
    symlinked into the output directory once the environment is set up.

    Exits with status 1 on invalid input or if the output directory
    already exists.
    """

    default_parser = default_argparser(CFG_DIR)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument(
        "--normal-fq1",
        nargs="+",
        help="Normal FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '--normal-fq2',
        nargs="+",
        help=
        "Normal FastQ file/s (if paired) (gzip only). See also --normal-fq1")
    parser.add_argument(
        "--tumor-fq1",
        nargs="+",
        help="Tumor FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '--tumor-fq2',
        nargs="+",
        help="Tumor FastQ file/s (if paired) (gzip only). See also --tumor-fq1"
    )
    parser.add_argument('-t',
                        "--seqtype",
                        required=True,
                        choices=['WGS', 'WES', 'targeted'],
                        help="Sequencing type")
    parser.add_argument('-l',
                        "--bed",
                        help="Bed file listing regions of interest."
                        " Required for WES and targeted sequencing.")
    parser.add_argument('-D',
                        '--dont-mark-dups',
                        action='store_true',
                        help="Don't mark duplicate reads")
    parser.add_argument(
        '--normal-bam',
        help="Advanced: Injects normal BAM (overwrites normal-fq options)."
        " WARNING: reference and postprocessing need to match pipeline requirements"
    )
    parser.add_argument(
        '--tumor-bam',
        help="Advanced: Injects tumor BAM (overwrites tumor-fq options)."
        " WARNING: reference and postprocessing need to match pipeline requirements"
    )

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([
                args.normal_fq1, args.normal_fq2, args.tumor_fq1,
                args.tumor_fq2, args.normal_bam, args.tumor_bam
        ]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        samples = dict()

        # Each sample is validated on its own: it needs either a BAM or
        # at least fq1, independently of what the other sample provides.
        if args.normal_bam:
            if not os.path.exists(args.normal_bam):
                logger.fatal("Normal BAM %s does not exist", args.normal_bam)
                sys.exit(1)
            normal_readunits = dict()
            samples["normal"] = []
        elif args.normal_fq1:
            normal_readunits = get_readunits_from_args(args.normal_fq1,
                                                       args.normal_fq2)
            samples["normal"] = list(normal_readunits.keys())
        else:
            logger.fatal("Need at least fq1 or a BAM file for normal"
                         " without config file")
            sys.exit(1)

        if args.tumor_bam:
            if not os.path.exists(args.tumor_bam):
                logger.fatal("Tumor BAM %s does not exist", args.tumor_bam)
                sys.exit(1)
            tumor_readunits = dict()
            samples["tumor"] = []
        elif args.tumor_fq1:
            tumor_readunits = get_readunits_from_args(args.tumor_fq1,
                                                      args.tumor_fq2)
            samples["tumor"] = list(tumor_readunits.keys())
        else:
            logger.fatal("Need at least fq1 or a BAM file for tumor"
                         " without config file")
            sys.exit(1)

        readunits = dict(normal_readunits)
        readunits.update(tumor_readunits)

    # Relevant for the config file route; checked explicitly because
    # asserts are stripped under "python -O".
    if sorted(samples) != sorted(["normal", "tumor"]):
        logger.fatal("Expected exactly the samples 'normal' and 'tumor'"
                     " but got: %s", ", ".join(sorted(samples)))
        sys.exit(1)

    # FIXME how to
    # if not os.path.exists(reffa):
    #    logger.fatal("Reference '%s' doesn't exist", reffa)
    #    sys.exit(1)
    #
    #for p in ['bwa', 'samtools']:
    #    if not ref_is_indexed(reffa, p):
    #        logger.fatal("Reference '%s' doesn't appear to be indexed with %s", reffa, p)
    #        sys.exit(1)

    if args.seqtype in ['WES', 'targeted']:
        if not args.bed:
            logger.fatal(
                "Analysis of exome and targeted sequence runs requires a bed file"
            )
            sys.exit(1)
        if not os.path.exists(args.bed):
            # report the bed file itself, not the sample config
            logger.fatal("Bed file %s does not exist", args.bed)
            sys.exit(1)
        logger.warning("Compatilibity between bed file and"
                       " reference not checked")  # FIXME

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    cfg_dict['seqtype'] = args.seqtype
    cfg_dict['intervals'] = os.path.abspath(args.bed) if args.bed else None
    cfg_dict['mark_dups'] = not args.dont_mark_dups

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))

    pipeline_handler.setup_env()

    # inject existing BAM by symlinking (everything upstream is temporary anyway)
    # NOTE(review): the target name always contains ".dedup", even with
    # --dont-mark-dups; confirm this matches the Snakefile.
    for sample, bam in [("normal", args.normal_bam),
                        ("tumor", args.tumor_bam)]:
        if bam:
            # target as defined in Snakefile!
            target = os.path.join(
                args.outdir, "out", sample,
                "{}.bwamem.lofreq.dedup.lacer.bam".format(sample))
            os.makedirs(os.path.dirname(target))
            os.symlink(os.path.abspath(bam), target)

    pipeline_handler.submit(args.no_run)
def main():
    """Command-line entry point.

    Reads the arguments, derives samples and readunits either from the
    sample config file or from the fastq/sample arguments, and passes
    them to a PipelineHandler which sets up the environment and submits
    the run. Exits with status 1 on invalid input or if the output
    directory already exists.
    """

    base_parser = default_argparser(CFG_DIR, with_readunits=True)
    description = __doc__.format(PIPELINE_NAME=PIPELINE_NAME,
                                 PIPELINE_VERSION=get_pipeline_version())
    parser = configargparse.ArgumentParser(description=description,
                                           parents=[base_parser])
    parser._optionals.title = "Arguments"

    # this pipeline adds no arguments of its own
    args = parser.parse_args()

    # -v and -q may be repeated; each one shifts the level by one step:
    #   -vv DEBUG, -v INFO, (none) WARNING, -q ERROR, -qq CRITICAL,
    #   -qqq no logging at all
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    for log in (logger, aux_logger):
        log.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples maps sample names (mostly just one) to lists of readunit
    # keys; readunits maps those keys to readunits (think: fastq pairs
    # with attributes)
    if not args.sample_cfg:
        # command-line route: fq1 and a sample name are both mandatory
        if not all((args.fq1, args.sample)):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # every readunit belongs to the single sample named on the command line
        samples = {args.sample: list(readunits)}
    else:
        # config file route: must not be mixed with command-line input
        if any((args.fq1, args.fq2, args.sample)):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)

    # FIXME implement checks on reffa index (currently not exposed via args)

    # arguments become a cfg_dict that gets merged into the pipeline config
    cfg_dict = {'readunits': readunits, 'samples': samples}

    handler = PipelineHandler(PIPELINE_NAME,
                              PIPELINE_BASEDIR,
                              args,
                              cfg_dict,
                              cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    handler.setup_env()
    handler.submit(args.no_run)
Exemplo n.º 9
0
def main():
    """Parse arguments, validate the input and submit the pipeline.

    Input comes either from a sample config file (``args.sample_cfg``) or
    from FastQ files plus a sample name given on the command line; using
    both at once is rejected. Invalid input, a pre-existing output
    directory or a mix of paired-end and single-end read units terminates
    the program with exit status 1.
    """

    default_parser = default_argparser(CFG_DIR)
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                     parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument(
        '-1',
        "--fq1",
        nargs="+",
        help="FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '-2',
        "--fq2",
        nargs="+",
        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument('-s',
                        "--sample",
                        help="Sample name. Collides with --sample-cfg.")
    parser.add_argument('-C',
                        "--cuffdiff",
                        action='store_true',
                        dest="run_cuffdiff",
                        help="Also run cuffdiff")
    parser.add_argument('-S',
                        '--stranded',
                        action='store_true',
                        help="Stranded library prep (default is unstranded)")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # FIXME checks on reffa index (currently not exposed via args)

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    cfg_dict['stranded'] = args.stranded
    cfg_dict['run_cuffdiff'] = args.run_cuffdiff

    # Paired-end if any read unit has a second FastQ. A mix of paired-end
    # and single-end units is rejected explicitly (not via assert, which
    # is stripped when Python runs with -O).
    has_fq2 = [bool(ru.get('fq2')) for ru in readunits.values()]
    cfg_dict['paired_end'] = any(has_fq2)
    if cfg_dict['paired_end'] and not all(has_fq2):
        logger.fatal("Can't handle mix of paired-end and single-end")
        sys.exit(1)

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Exemplo n.º 10
0
def main():
    """Parse arguments, validate the input and submit the pipeline.

    Input comes either from a sample config file (``args.sample_cfg``) or
    from FastQ files plus a sample name (``args.fq1``/``args.fq2``/
    ``args.sample``, supplied by the default parser); using both at once
    is rejected. Invalid input, a pre-existing output directory or a mix
    of paired-end and single-end read units terminates the program with
    exit status 1.
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument("--cuffdiff",
                        action='store_true',
                        dest="run_cuffdiff",
                        help="Also run cuffdiff")
    choices = ["none", "forward", "reverse"]
    default = "none"
    parser.add_argument(
        '--stranded',
        choices=choices,
        default=default,
        help=
        "Stranded library prep (default is {}; Following RSEM definition but see also"
        " http://chipster.csc.fi/manual/library-type-summary.html)".format(
            default))
    parser.add_argument(
        '--rsem-estimate-rspd',
        action='store_true',
        help="Estimate read start position distribution in RSEM")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # FIXME add checks on reffa index (currently not exposed via args)

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    cfg_dict['rsem_extra_args'] = ''
    if args.rsem_estimate_rspd:
        cfg_dict['rsem_extra_args'] += ' --estimate-rspd'
    cfg_dict['stranded'] = args.stranded
    cfg_dict['run_cuffdiff'] = args.run_cuffdiff

    # Paired-end if any read unit has a second FastQ. A mix of paired-end
    # and single-end units is rejected explicitly (not via assert, which
    # is stripped when Python runs with -O).
    has_fq2 = [bool(ru.get('fq2')) for ru in readunits.values()]
    cfg_dict['paired_end'] = any(has_fq2)
    if cfg_dict['paired_end'] and not all(has_fq2):
        logger.fatal("Can't handle mix of paired-end and single-end")
        sys.exit(1)

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Exemplo n.º 11
0
def main():
    """Prepare and submit a bcl2fastq run.

    Steps, in order:

    1. Parse and validate arguments (mismatches 0-2, lanes 1-8, exactly
       one of run id / run directory).
    2. Determine and create the output directory.
    3. Call ``generate_bcl2fastq_cfg.py`` (expected next to this script)
       to create the mux info and status files in the output directory.
    4. If the mux info file is missing, report the status read from the
       status file via ``update_run_status`` and exit with status 0.
    5. Otherwise build the config dictionary and submit the pipeline.

    Invalid input or a failing helper script terminates the program with
    exit status 1.
    """

    # FIXME ugly and code duplication in bcl2fastq_dbupdate.py
    mongo_status_script = os.path.abspath(
        os.path.join(os.path.dirname(sys.argv[0]), "mongo_status.py"))
    assert os.path.exists(mongo_status_script)

    default_parser = default_argparser(CFG_DIR,
                                       allow_missing_cfgfile=True,
                                       allow_missing_outdir=True,
                                       default_db_logging=True)
    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                     parents=[default_parser])
    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('-r',
                        "--runid",
                        help="Run ID plus flowcell ID (clashes with -d)")
    parser.add_argument(
        '-d',
        "--rundir",
        help=
        "BCL input directory (clashes with -r; you also probably want to disable logging)"
    )
    parser.add_argument('-t',
                        "--testing",
                        action='store_true',
                        help="Use MongoDB test server")
    parser.add_argument('--no-archive',
                        action='store_true',
                        help="Don't archieve this analysis")
    parser.add_argument(
        '-l',
        '--lanes',
        type=int,
        nargs="*",
        help="Limit run to given lane/s (multiples separated by space")
    parser.add_argument(
        '-i',
        '--mismatches',
        type=int,
        help="Max. number of allowed barcode mismatches (0>=x<=2)"
        " setting a value here overrides the default settings read from ELM)")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if args.mismatches is not None:
        if args.mismatches > 2 or args.mismatches < 0:
            logger.fatal("Number of mismatches must be between 0-2")
            sys.exit(1)

    # lane_info is the bcl2fastq tiles argument (e.g. "--tiles s_1,s_2"),
    # lane_nos the plain list of requested lanes; both stay empty if no
    # lane restriction was requested.
    lane_info = ''
    lane_nos = []
    if args.lanes:
        for lane in args.lanes:
            if lane > 8 or lane < 1:
                logger.fatal("Lane number must be between 1-8")
                sys.exit(1)
        lane_info = '--tiles ' + ','.join(
            's_{}'.format(lane) for lane in args.lanes)
        lane_nos = list(args.lanes)

    if args.runid and args.rundir:
        logger.fatal(
            "Cannot use run-id and input directory arguments simultaneously")
        sys.exit(1)
    elif args.runid:
        rundir = run_folder_for_run_id(args.runid)
    elif args.rundir:
        rundir = os.path.abspath(args.rundir)
    else:
        logger.fatal("Need either run-id or input directory")
        sys.exit(1)
    if not os.path.exists(rundir):
        logger.fatal("Expected run directory %s does not exist", rundir)
        # nothing useful can be done without the input directory
        sys.exit(1)
    logger.info("Rundir is %s", rundir)

    if not args.outdir:
        # NOTE(review): args.runid is None when only --rundir was given;
        # confirm get_bcl2fastq_outdir() copes with that.
        outdir = get_bcl2fastq_outdir(args.runid)
        args.outdir = outdir
    else:
        outdir = args.outdir
    if os.path.exists(outdir):
        logger.fatal("Output directory %s already exists", outdir)
        sys.exit(1)
    # create now so that generate_bcl2fastq_cfg.py can run
    os.makedirs(outdir)

    # catch cases where rundir was user provided and looks weird
    try:
        _, runid, flowcellid = get_machine_run_flowcell_id(rundir)
        run_num = runid + "_" + flowcellid
    except Exception:
        # deliberately broad (any parsing problem falls back to a
        # placeholder), but no longer swallows KeyboardInterrupt/SystemExit
        run_num = "UNKNOWN-" + rundir.split("/")[-1]

    # call generate_bcl2fastq_cfg
    #
    # FIXME ugly assumes same directory (just like import above). better to import and run main()?
    generate_bcl2fastq = os.path.join(os.path.dirname(sys.argv[0]),
                                      "generate_bcl2fastq_cfg.py")
    assert os.path.exists(generate_bcl2fastq)
    cmd = [generate_bcl2fastq, '-r', rundir, '-o', outdir]
    if args.testing:
        cmd.append("-t")
    logger.debug("Executing %s", ' '.join(cmd))
    try:
        res = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logger.fatal("The following command failed with return code %s: %s",
                     e.returncode, ' '.join(cmd))
        logger.fatal("Output: %s", e.output.decode())
        logger.fatal("Exiting")
        os.rmdir(outdir)
        sys.exit(1)
    # generate_bcl2fastq is normally quiet. if there's output, make caller aware of it
    # use sys instead of logger to avoid double logging
    if res:
        sys.stderr.write(res.decode())

    # just created files
    muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG)
    status_cfg = os.path.join(outdir, STATUS_CFG)

    # NOTE: signal for failed runs is exit 0 from generate_bcl2fastq and missing output files
    #
    if any([not os.path.exists(x) for x in [muxinfo_cfg]]):
        # one missing means all should be missing
        assert all([not os.path.exists(x) for x in [muxinfo_cfg]])
        # Check status as seqrunfailed or non-bcl run
        with open(status_cfg, 'r') as fh:
            status = fh.read().strip()
        update_run_status(mongo_status_script, run_num, outdir, status,
                          args.testing)
        sys.exit(0)

    # turn arguments into cfg_dict that gets merged into pipeline config
    cfg_dict = {
        'rundir': rundir,
        'lanes_arg': lane_info,
        'no_archive': args.no_archive,
        'run_num': run_num
    }

    mux_units = get_mux_units_from_cfgfile(muxinfo_cfg, lane_nos)
    if args.mismatches is not None:
        # command-line value overrides whatever the mux info file provided
        mux_units = [
            mu._replace(barcode_mismatches=args.mismatches) for mu in mux_units
        ]
    # content is now held in mux_units; the file itself is removed
    os.unlink(muxinfo_cfg)

    # one config entry per mux unit, keyed by its mux_dir
    cfg_dict['units'] = dict()
    for mu in mux_units:
        k = mu.mux_dir
        mu_dict = dict(mu._asdict())
        cfg_dict['units'][k] = mu_dict

    # create mongodb update command, used later, after submission
    mongo_update_cmd = "{} -r {} -s STARTED".format(mongo_status_script,
                                                    cfg_dict['run_num'])
    mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format(
        outdir)  # set in run.sh
    if args.testing:
        mongo_update_cmd += " -t"

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        logger_cmd=mongo_update_cmd,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))

    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Exemplo n.º 12
0
def main():
    """Parse arguments, validate the input and submit the pipeline.

    Input is one of:

    * a sample config file (``args.sample_cfg``),
    * FastQ files plus a sample name, or
    * an existing BAM file (raw or processed) plus a sample name, which
      is injected into the output directory by symlinking.

    WES and targeted sequencing additionally require an existing bed
    file; joint calling requires at least two samples. Any invalid input
    or a pre-existing output directory terminates the program with exit
    status 1.
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('-t',
                        "--seqtype",
                        required=True,
                        choices=['WGS', 'WES', 'targeted'],
                        help="Sequencing type")
    parser.add_argument('-l',
                        "--bed",
                        help="Bed file listing regions of interest."
                        " Required for WES and targeted sequencing.")
    default = 4
    parser.add_argument(
        "-c",
        "--hc-nct",
        default=default,
        type=int,
        help="Number of Haplotype Caller threads (per region cluster)."
        " Values>1 reported to make Haplotype Caller unstable (default={})".
        format(default))
    default = 100
    # type=int keeps a user-supplied value consistent with the int default
    parser.add_argument(
        '-i',
        "--interval-padding",
        default=default,
        type=int,
        help="Interval padding (for non-WGS only; default = {})".format(
            default))
    parser.add_argument(
        '-j',
        "--joint-calls",
        action='store_true',
        help="Perform joint/cohort calling (requires multisample input)")
    parser.add_argument(
        '--raw-bam',
        help=
        "Advanced: Injects raw (pre-dedup, pre-BQSR etc.) BAM (overwrites fq options)."
        " WARNING: reference needs to match pipeline requirements")
    parser.add_argument(
        '--proc-bam',
        help=
        "Advanced: Injects processed (post-dedup, post-BQSR etc.) BAM (overwrites fq options)."
        " WARNING: reference and pre-processing need to match pipeline requirements"
    )
    # FIXME can be achieved with --until rule as well
    parser.add_argument('--bam-only',
                        action='store_true',
                        help="Only process up until BAM file")
    parser.add_argument('--gvcf-only',
                        action='store_true',
                        help="Only process up until GVCF file")

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample, args.raw_bam, args.proc_bam]):
            logger.fatal(
                "Config file overrides fastq, sample and BAM arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)

    else:  # no sample config, so input is either fastq or existing bam
        samples = dict()

        if not args.sample:
            logger.fatal("Need sample name if not using config file")
            sys.exit(1)

        if args.raw_bam or args.proc_bam:
            # Explicit checks instead of asserts: asserts are stripped
            # when Python runs with -O and give no clean exit status.
            if args.fq1:
                logger.fatal("BAM injection overwrites fastq arguments")
                sys.exit(1)
            if args.raw_bam and args.proc_bam:
                logger.fatal("Cannot inject raw and processed BAM")
                sys.exit(1)
            injected_bam = args.raw_bam or args.proc_bam
            if not os.path.exists(injected_bam):
                logger.fatal("BAM file %s does not exist", injected_bam)
                sys.exit(1)

            # no read units: the sample is fed from the injected BAM
            readunits = dict()
            samples[args.sample] = []

        elif args.fq1:

            readunits = get_readunits_from_args(args.fq1, args.fq2)
            # all readunits go into this one sample specified on the command-line
            samples[args.sample] = list(readunits.keys())

        else:
            logger.fatal(
                "Need at least one fastq files as argument if not using config file"
            )
            sys.exit(1)

    if args.seqtype in ['WES', 'targeted']:
        if not args.bed:
            logger.fatal(
                "Analysis of exome and targeted sequence runs requires a bed file"
            )
            sys.exit(1)
        else:
            if not os.path.exists(args.bed):
                # report the bed file (not the sample config) as missing
                logger.fatal("Bed file %s does not exist", args.bed)
                sys.exit(1)

    if args.joint_calls:
        if len(samples) < 2:
            logger.fatal("Need at least two samples for joint calling")
            sys.exit(1)

    # turn arguments into cfg_dict (gets merged with other configs late)
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    cfg_dict['seqtype'] = args.seqtype
    cfg_dict['intervals'] = os.path.abspath(
        args.bed
    ) if args.bed else None  # always safe, might be used for WGS as well
    cfg_dict['mark_dups'] = MARK_DUPS
    cfg_dict['bam_only'] = args.bam_only
    cfg_dict['gvcf_only'] = args.gvcf_only
    cfg_dict['hc_nct'] = args.hc_nct
    cfg_dict['joint_calls'] = args.joint_calls
    cfg_dict['interval_padding'] = args.interval_padding
    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()

    # Inject existing BAM by symlinking (everything upstream is temporary anyway)
    # WARNING: filename has to match definition in Snakefile!
    if args.raw_bam:
        target = os.path.join(args.outdir, "out", args.sample,
                              "{}.bwamem.bam".format(args.sample))
        os.makedirs(os.path.dirname(target))
        os.symlink(os.path.abspath(args.raw_bam), target)

        # link the index as well, if there is one next to the BAM
        src_bai = os.path.abspath(args.raw_bam) + ".bai"
        if os.path.exists(src_bai):
            os.symlink(src_bai, target + ".bai")

    elif args.proc_bam:
        # target name encodes the processing steps the BAM is assumed
        # to have gone through
        target = os.path.join(args.outdir, "out", args.sample,
                              "{}.bwamem".format(args.sample))
        if cfg_dict['mark_dups']:
            target += ".dedup"
        if cfg_dict['seqtype'] != 'targeted':
            target += ".bqsr"
        target += ".bam"
        os.makedirs(os.path.dirname(target))
        os.symlink(os.path.abspath(args.proc_bam), target)

        # link the index as well, if there is one next to the BAM
        src_bai = os.path.abspath(args.proc_bam) + ".bai"
        if os.path.exists(src_bai):
            os.symlink(src_bai, target + ".bai")

    pipeline_handler.submit(args.no_run)
Exemplo n.º 13
0
def main():
    """Parse arguments, validate the input and submit the pipeline.

    Input comes either from a sample config file (``args.sample_cfg``) or
    from FastQ files plus a sample name (``args.fq1``/``args.fq2``/
    ``args.sample``, supplied by the default parser); using both at once
    is rejected. Invalid input, a pre-existing output directory or a mix
    of paired-end and single-end read units terminates the program with
    exit status 1.
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    default = 2000
    parser.add_argument(
        "--fragment-length",
        type=int,
        default=default,
        help="Fragment length argument for Bowtie (default {})".format(
            default))
    default = 200
    parser.add_argument(
        "--extsize",
        type=int,
        default=default,
        help=
        "extsize argument for MACS2; only used for single-end reads (default {})"
        .format(default))
    default = -100
    parser.add_argument(
        "--shift",
        type=int,
        default=default,
        help=
        "shift argument for MACS2; only used for single-end reads (default {})"
        .format(default))
    default = 250
    parser.add_argument(
        "--peak-ext-bp",
        type=int,
        default=default,
        help="Extension around peaks for bed creation (default {})".format(
            default))
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    # Paired-end if any read unit has a second FastQ. A mix of paired-end
    # and single-end units is rejected explicitly (not via assert, which
    # is stripped when Python runs with -O).
    has_fq2 = [bool(ru.get('fq2')) for ru in readunits.values()]
    cfg_dict['paired_end'] = any(has_fq2)
    if cfg_dict['paired_end'] and not all(has_fq2):
        logger.fatal("Can't handle mix of paired-end and single-end")
        sys.exit(1)
    cfg_dict['mapper'] = 'bowtie2'  # FIXME fixed for now
    # cfg_dict["bowtie2_custom_args"]
    # cfg_dict['platform']
    # cfg_dict['center']
    # cfg_dict["macs2_custom_args"]

    cfg_dict['fragment_length'] = args.fragment_length
    cfg_dict['shift'] = args.shift
    cfg_dict['extsize'] = args.extsize
    cfg_dict["peak_ext_bp"] = args.peak_ext_bp

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Exemplo n.º 14
0
def main():
    """Build the command line, check the input and launch the pipeline.

    Read units and samples are taken either from a sample config file or
    from FastQ files plus a sample name given as arguments; combining the
    two is an error. On bad input or an already existing output directory
    a fatal message is logged and the program exits with status 1.
    """

    base_parser = default_argparser(CFG_DIR)
    cli = argparse.ArgumentParser(
        description=__doc__.format(PIPELINE_NAME=PIPELINE_NAME,
                                   PIPELINE_VERSION=get_pipeline_version()),
        parents=[base_parser])
    cli._optionals.title = "Arguments"

    # pipeline specific options, as (flags, keyword arguments) pairs
    option_specs = [
        (('-1', "--fq1"),
         dict(nargs="+",
              help="FastQ file/s (gzip only)."
              " Multiple input files supported (auto-sorted)."
              " Note: each file (or pair) gets a unique read-group id."
              " Collides with --sample-cfg.")),
        (('-2', "--fq2"),
         dict(nargs="+",
              help="FastQ file/s (if paired) (gzip only). See also --fq1")),
        (('-s', "--sample"),
         dict(help="Sample name. Collides with --sample-cfg.")),
    ]
    for flags, kwargs in option_specs:
        cli.add_argument(*flags, **kwargs)

    args = cli.parse_args()

    # Each -v lowers and each -q raises the threshold by one level,
    # starting from WARNING:
    #   -vv DEBUG, -v INFO, (none) WARNING, -q ERROR, -qq CRITICAL,
    #   -qqq no logging at all
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    level = logging.WARN + 10 * args.quiet - 10 * args.verbose
    for log in (logger, aux_logger):
        log.setLevel(level)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples maps sample name (usually just one) to a list of readunit
    # keys; readunits maps those keys to the readunits themselves
    # (think: fastq pairs with attributes)
    if not args.sample_cfg:
        if not (args.fq1 and args.sample):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # the single command-line sample owns every readunit
        samples = {args.sample: list(readunits)}
    else:
        if args.fq1 or args.fq2 or args.sample:
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)

    # FIXME the reference is now exported to ref.cfg, so its existence and
    # its bwa/samtools indices are no longer checked here. how to auto
    # check there?

    # settings derived from the arguments; merged into the pipeline config
    cfg_dict = {
        'readunits': readunits,
        'samples': samples,
        'mark_dups': MARK_DUPS,
    }

    handler = PipelineHandler(PIPELINE_NAME,
                              PIPELINE_BASEDIR,
                              args,
                              cfg_dict,
                              cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    handler.setup_env()
    handler.submit(args.no_run)
Exemplo n.º 15
0
def main():
    """Parse arguments, validate the input and submit the pipeline.

    Input comes either from a sample config file (``args.sample_cfg``) or
    from FastQ files plus a sample name (``args.fq1``/``args.fq2``/
    ``args.sample``, supplied by the default parser); using both at once
    is rejected. Every read unit must have a second FastQ, since this
    pipeline requires paired-end reads. Invalid input or a pre-existing
    output directory terminates the program with exit status 1.
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    default = ['kraken', 'metaphlan2']
    parser.add_argument("-p",
                        "--profilers",
                        nargs='+',
                        default=default,
                        help="Profilers to run (default = {}".format(
                            ", ".join(default)))

    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal("Config file overrides fastq and sample arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)

    else:  # no sample config, so input has to be fastq given as arguments
        samples = dict()

        if not args.sample:
            logger.fatal("Need sample name if not using config file")
            sys.exit(1)

        if args.fq1:
            readunits = get_readunits_from_args(args.fq1, args.fq2)
            # all readunits go into this one sample specified on the command-line
            samples[args.sample] = list(readunits.keys())

        else:
            logger.fatal(
                "Need at least one fastq files as argument if not using config file"
            )
            sys.exit(1)

    # Paired-end is mandatory. Checked explicitly (an assert would be
    # stripped under -O) and with .get() so that a read unit lacking the
    # 'fq2' key is reported the same way as one with an empty value.
    for ru in readunits.values():
        if not ru.get('fq2'):
            logger.fatal(
                "FastQ R2 missing, but pipelines requires paired-end reads")
            sys.exit(1)

    # turn arguments into cfg_dict (gets merged with other configs late)
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples
    cfg_dict['profilers'] = args.profilers

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
Exemplo n.º 16
0
def main():
    """Command-line entry point: gather inputs, build the config, submit.

    Input is described either by a sample config file (args.sample_cfg)
    or directly through --fq1/--fq2/--sample; combining both is rejected.
    The process exits with status 1 if the output directory already
    exists or if the input specification is incomplete or contradictory.
    """

    base_parser = default_argparser(CFG_DIR)
    description = __doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version())
    parser = argparse.ArgumentParser(
        description=description, parents=[base_parser])
    parser._optionals.title = "Arguments"

    # options specific to this pipeline
    parser.add_argument(
        '-1', "--fq1", nargs="+",
        help="FastQ file/s (gzip only)."
        " Multiple input files supported (auto-sorted)."
        " Note: each file (or pair) gets a unique read-group id."
        " Collides with --sample-cfg.")
    parser.add_argument(
        '-2', "--fq2", nargs="+",
        help="FastQ file/s (if paired) (gzip only). See also --fq1")
    parser.add_argument(
        '-s', "--sample",
        help="Sample name. Collides with --sample-cfg.")
    parser.add_argument(
        '-D', '--dont-mark-dups', action='store_true',
        help="Don't mark duplicate reads")

    args = parser.parse_args()

    # -v and -q are repeatable and shift the level in steps of 10
    # around WARNING:
    #   -vv -> DEBUG, -v -> INFO, (none) -> WARNING,
    #   -q -> ERROR, -qq -> CRITICAL, -qqq -> no logging at all
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    log_level = logging.WARN + 10*args.quiet - 10*args.verbose
    for lgr in (logger, aux_logger):
        lgr.setLevel(log_level)

    # never write into an existing output directory
    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples maps sample names (mostly just one) to lists of readunit
    # keys; readunits maps those keys to the readunits themselves
    # (think: fastq pairs with attributes)
    if not args.sample_cfg:
        # direct input: at least fq1 and a sample name are mandatory
        if not (args.fq1 and args.sample):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # every readunit belongs to the single sample named on the
        # command line
        samples = {args.sample: list(readunits.keys())}
    else:
        # config-file input: must not be mixed with direct input
        if args.fq1 or args.fq2 or args.sample:
            logger.fatal("Config file overrides fastq and sample input arguments."
                         " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file %s does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(args.sample_cfg)

    # user-derived settings, handed to the pipeline handler together
    # with the parsed arguments
    cfg_dict = {
        'readunits': readunits,
        'samples': samples,
        'mark_dups': not args.dont_mark_dups,
    }

    handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR,
        args, cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))

    handler.setup_env()
    handler.submit(args.no_run)
Exemplo n.º 17
0
def main():
    """Command-line entry point: gather inputs, build the config, submit.

    Input is described either by a sample config file (args.sample_cfg)
    or directly through the fq1/fq2/sample arguments; combining both is
    rejected. This pipeline requires paired-end data, so every readunit
    must provide an R2 FastQ.

    Every validation failure (existing output directory, contradictory
    or incomplete input, missing R2, missing cell barcodes file) is
    logged and terminates the process with exit status 1.
    """

    default_parser = default_argparser(CFG_DIR, with_readunits=True)
    parser = configargparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()),
                                           parents=[default_parser])

    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('-c',
                        "--cell-barcodes",
                        required=True,
                        help="File listing cell barcodes")
    default_frag_len = 200
    parser.add_argument(
        "--frag-len",
        default=default_frag_len,
        type=int,
        help="Estimated fragment length (default={})".format(
            default_frag_len))
    default_frag_len_sd = 20.0
    parser.add_argument(
        '--frag-len-sd',
        default=default_frag_len_sd,
        type=float,
        help="Estimated fragment length standard deviation (default={})".
        format(default_frag_len_sd))
    parser.add_argument(
        '--dedup',
        action="store_true",
        help="Run UMI-based deduplication (slow for large data-sets!)")
    args = parser.parse_args()

    # Repeatable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    # never write into an existing output directory
    if os.path.exists(args.outdir):
        logger.fatal("Output directory %s already exists", args.outdir)
        sys.exit(1)

    # samples is a dictionary with sample names as key (mostly just
    # one) and readunit keys as value. readunits is a dict with
    # readunits (think: fastq pairs with attributes) as value
    if args.sample_cfg:
        if any([args.fq1, args.fq2, args.sample]):
            logger.fatal(
                "Config file overrides fastq and sample input arguments."
                " Use one or the other")
            sys.exit(1)
        if not os.path.exists(args.sample_cfg):
            logger.fatal("Config file '%s' does not exist", args.sample_cfg)
            sys.exit(1)
        samples, readunits = get_samples_and_readunits_from_cfgfile(
            args.sample_cfg)
    else:
        if not all([args.fq1, args.sample]):
            logger.fatal("Need at least fq1 and sample without config file")
            sys.exit(1)

        readunits = get_readunits_from_args(args.fq1, args.fq2)
        # all readunits go into this one sample specified on the command-line
        samples = dict()
        samples[args.sample] = list(readunits.keys())

    # This is validation of user input, so it must not rely on assert
    # (asserts are stripped when Python runs with -O); report it the
    # same way as the other input errors in this function.
    for ru in readunits.values():
        if not ru['fq2']:
            logger.fatal(
                "FastQ R2 missing, but pipeline requires paired-end reads")
            sys.exit(1)

    # turn arguments into cfg_dict that gets merged into pipeline config
    #
    cfg_dict = dict()
    cfg_dict['readunits'] = readunits
    cfg_dict['samples'] = samples

    if not os.path.exists(args.cell_barcodes):
        logger.fatal("Cellular barcodes file '%s' does not exist",
                     args.cell_barcodes)
        sys.exit(1)

    cfg_dict['cell_barcodes'] = os.path.abspath(args.cell_barcodes)
    cfg_dict['frag_len'] = args.frag_len
    cfg_dict['frag_len_sd'] = args.frag_len_sd
    # the config key is the negation of the --dedup command-line flag
    cfg_dict['no_dedup'] = not args.dedup
    # auxiliary files are resolved relative to PIPELINE_BASEDIR
    cfg_dict['scrnapipe_transform'] = os.path.abspath(
        os.path.join(PIPELINE_BASEDIR, 'aux/transform.json'))
    cfg_dict['scrna_conf_template'] = os.path.abspath(
        os.path.join(PIPELINE_BASEDIR, 'aux/scrna.conf.template'))
    cfg_dict['adapters'] = os.path.abspath(
        os.path.join(PIPELINE_BASEDIR, 'aux/adapters.fa'))

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME,
        PIPELINE_BASEDIR,
        args,
        cfg_dict,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)