Exemplo n.º 1
0
def check_read_params(args, runinfo):
    """Validate the barcode, sample-index and UMI read settings for a run.

    The read layout is loaded with ``tk_bcl.load_run_info(runinfo)`` and
    indexed by each entry's ``read_name``. The first violated rule is
    reported through ``martian.exit``.

    Checks performed, in order:
      * a barcode read type is given and exists in the run;
      * the barcode window (start + length) fits inside that read, when both
        ``bc_start_index`` and ``bc_length`` are given;
      * ``lanes`` is non-empty when the sample-index read type is not part of
        the run;
      * when a UMI read type is given, it exists in the run and the UMI
        window (start + length) fits inside that read.

    Args:
        args: stage arguments; reads ``bc_read_type``, ``bc_start_index``,
            ``bc_length``, ``si_read_type``, ``lanes``, ``umi_read_type``,
            ``umi_start_index`` and ``umi_length``.
        runinfo: passed unchanged to ``tk_bcl.load_run_info`` (presumably the
            RunInfo.xml path -- confirm against callers).
    """
    read_info, _flowcell = tk_bcl.load_run_info(runinfo)
    read_info_by_read_type = {r['read_name']: r for r in read_info}

    # verify barcode
    if args.bc_read_type is None:
        martian.exit("Barcode read must be specified.")
    if args.bc_read_type not in read_info_by_read_type:
        martian.exit("Barcode read not found in run folder: %s" %
                     args.bc_read_type)

    if args.bc_start_index is not None and args.bc_length is not None:
        bc_end = args.bc_start_index + args.bc_length
        bc_read_length = read_info_by_read_type[
            args.bc_read_type]['read_length']
        if bc_end > bc_read_length:
            martian.exit("Barcode out of bounds (%s:%d-%d)" %
                         (args.bc_read_type, args.bc_start_index, bc_end))

    # if sample index reads not generated, must specify lanes to demux
    if args.si_read_type not in read_info_by_read_type and not args.lanes:
        martian.exit(
            "Lanes must be specified if no sample index reads were generated"
        )

    # if UMI present, do bounds check.  Everything UMI-related is skipped
    # when no UMI read type is set: there is no read to bounds-check against,
    # and looking up a None read type would raise KeyError.
    if args.umi_read_type is not None:
        if args.umi_read_type not in read_info_by_read_type:
            martian.exit("UMI read type not found in run folder: %s" %
                         args.umi_read_type)
        if args.umi_start_index is not None and args.umi_length is not None:
            umi_end = args.umi_start_index + args.umi_length
            umi_read_length = read_info_by_read_type[
                args.umi_read_type]['read_length']
            if umi_end > umi_read_length:
                martian.exit("UMI out of bounds (%s:%d-%d)" %
                             (args.umi_read_type, args.umi_start_index,
                              umi_end))
Exemplo n.º 2
0
def main(args, outs):
    """Prepare the sample sheet that will be handed to bcl2fastq.

    The input CSV is the ``csv`` entry of the first spec that has one (it is
    also copied to ``outs.input_samplesheet``); when no spec carries a CSV,
    one is produced with ``make_csv_from_specs`` in the directory of
    ``outs.samplesheet`` and ``outs.input_samplesheet`` is set to None.

    ``tk_sheet.transform_samplesheet`` is then called with that CSV,
    ``outs.samplesheet``, the flowcell lane count, the R1/R2 read lengths
    (0 when the read is absent from the run), the sample-index
    reverse-complement flag and the project name.  Its ``'dual_indexed'``
    result is stored in ``outs.dual_indexed_samplesheet``.
    """
    all_specs = args.specs
    runinfo_xml = tk_preflight.check_runinfo_xml(args.run_path)
    sheet_dir = os.path.dirname(outs.samplesheet)

    specs_with_csv = [s for s in all_specs if s.get('csv')]
    if specs_with_csv:
        source_csv = specs_with_csv[0]['csv']
        shutil.copy(source_csv, outs.input_samplesheet)
    else:
        source_csv = make_csv_from_specs(all_specs, runinfo_xml, sheet_dir)
        outs.input_samplesheet = None

    reads, _flowcell = tk_bcl.load_run_info(runinfo_xml)
    _rta_version, i2_is_rc, _bcl_params = tk_bcl.get_rta_version(
        args.run_path)

    reads_by_name = {}
    for read in reads:
        reads_by_name[read['read_name']] = read

    def length_of(read_name):
        # A read missing from the run counts as length 0.
        if read_name in reads_by_name:
            return reads_by_name[read_name]['read_length']
        return 0

    r1_len = length_of('R1')
    r2_len = length_of('R2')

    # Only an I2 sample index can need reverse-complementing.
    si_needs_rc = (args.si_read_type == 'I2' and i2_is_rc)
    n_lanes = tk_lane.get_flowcell_lane_count(runinfo_xml)

    transform_result = tk_sheet.transform_samplesheet(
        source_csv,
        outs.samplesheet,
        flowcell_lane_count=n_lanes,
        r1_read_length=r1_len,
        r2_read_length=r2_len,
        rc_sample_index=si_needs_rc,
        project_name=args.project)

    outs.dual_indexed_samplesheet = transform_result['dual_indexed']
Exemplo n.º 3
0
def check_dual_index(args, runinfo):
    """
    Stop the stage when a dual-index flowcell is combined with a sample sheet
    that carries no second index, unless ``args.ignore_dual_index`` is set.

    This assumes that the spreadsheet and runinfo have been validated
    already.

    Args:
        args: stage arguments; reads ``ignore_dual_index`` and ``specs``
            (an iterable of dict-like specs, optionally with a ``csv`` key).
        runinfo: passed unchanged to ``tk_bcl.load_run_info``.

    Reports through ``martian.exit`` when the flowcell is dual-indexed and
    either no spec has a CSV, the CSV is a simple samplesheet, or it is an
    IEM samplesheet without a dual index.
    """
    read_info, _flowcell = tk_bcl.load_run_info(runinfo)

    # if the ignore_dual_index flag is set, then just proceed; we'll
    # ignore downstream
    if args.ignore_dual_index:
        return

    if not tk_bcl.is_real_dual_index_flowcell(read_info):
        return

    # Shared by the "no samplesheet" and "simple samplesheet" cases.
    need_iem_sheet_msg = (
        "Dual-index flowcell detected.  Please add the --ignore-dual-index "
        "option to proceed, or use an Illumina Experiment Manager-formatted "
        "sample sheet with an index2 column for the second index.")

    # check input samplesheet for index2.  If not present, complain
    csv_specs = [spec for spec in args.specs if spec.get('csv')]
    if not csv_specs:
        martian.exit(need_iem_sheet_msg)

    csv_path = csv_specs[0]['csv']
    is_iem = tk_sheet.file_is_iem_samplesheet(csv_path)
    is_csv = tk_sheet.file_is_simple_samplesheet(csv_path)
    if is_csv:
        martian.exit(need_iem_sheet_msg)
    if is_iem and not tk_sheet.iem_has_dual_index(csv_path):
        martian.exit(
            "Dual-index flowcell detected.  Please add the --ignore-dual-index "
            "option to proceed, or add an index2 column to the supplied samplesheet."
        )
Exemplo n.º 4
0
def main(args, outs):
    """
    run_path must be the top-level Illumina flowcell directory

    Inspects the run folder and fills in three outputs:
    ``outs.si_read_type`` (from ``get_si_read_type`` on the RunInfo.xml read
    layout), ``outs.rc_i2_read`` (from ``tk_bcl.get_rta_version``) and
    ``outs.split_by_tile`` (from ``_split_by_tile``).  Calls
    ``martian.throw`` when the run directory does not exist.
    """
    if not os.path.exists(args.run_path):
        martian.throw("Run directory does not exist: %s" % args.run_path)

    reads, _flowcell = tk_bcl.load_run_info(
        os.path.join(args.run_path, "RunInfo.xml"))
    outs.si_read_type = get_si_read_type(reads)

    rta_ver, i2_needs_rc, params = tk_bcl.get_rta_version(args.run_path)
    for template, value in (
            ("BCL folder RTA Version: %s", rta_ver),
            ("BCL params: %s", str(params)),
            ("RC'ing i2 read: %s", str(i2_needs_rc))):
        martian.log_info(template % value)
    outs.rc_i2_read = i2_needs_rc

    by_tile = _split_by_tile(args)
    martian.log_info("Splitting by tile: %s" % str(by_tile))
    outs.split_by_tile = by_tile
Exemplo n.º 5
0
def run_bcl2fastq(args, outs):
    """Run bcl2fastq2 on an Illumina run folder using a sample sheet.

    Reads from ``args``: ``run_path``, ``output_path``,
    ``interop_output_path``, ``bases_mask``, ``si_read_type``,
    ``dual_indexed_samplesheet``, ``ignore_dual_index``,
    ``samplesheet_path`` and ``bcl2fastq2_args``.

    Writes to ``outs``: ``fastq_path`` / ``interop_path`` (only when the
    matching override is set in ``args``), ``file_read_types_map``,
    ``rc_i2_read``, ``bcl2fastq_version`` and ``bcl2fastq_args``.

    Failure reporting:
      * ``martian.exit`` when only bcl2fastq v1 is available, when bcl2fastq
        returns a positive exit code, or when it is killed by a signal;
      * ``martian.throw`` when the ``bcl2fastq`` executable cannot be
        launched (``OSError``).

    Raises:
        KeyError: if ``_TENX_LD_LIBRARY_PATH`` is not in the environment.
    """
    if args.output_path:
        outs.fastq_path = args.output_path

    output_dir = outs.fastq_path

    if args.interop_output_path:
        outs.interop_path = args.interop_output_path

    interop_dir = outs.interop_path

    martian.log_info("Running bcl2fastq on run: %s" % args.run_path)
    martian.log_info("FASTQ output dir: %s" % output_dir)

    run_info_xml = os.path.join(args.run_path, "RunInfo.xml")
    read_info, _flowcell = tk_bcl.load_run_info(run_info_xml)

    # An explicit bases mask from the caller wins over the computed one.
    if not args.bases_mask:
        use_bases_mask_val = tk_bcl.make_bases_mask_val(
            read_info,
            sample_index_read=args.si_read_type,
            dual_indexed=args.dual_indexed_samplesheet,
            ignore_dual_index=args.ignore_dual_index)
    else:
        use_bases_mask_val = args.bases_mask

    outs.file_read_types_map = tk_bcl.get_bcl2fastq_read_type_map(
        read_info,
        sample_index_read=args.si_read_type,
        dual_indexed=args.dual_indexed_samplesheet,
        ignore_dual_index=args.ignore_dual_index)

    # Determine the RTA version of the run and whether this instrument
    # requires i2 to be RC'd
    (rta_version, rc_i2_read,
     bcl_params) = tk_bcl.get_rta_version(args.run_path)
    outs.rc_i2_read = rc_i2_read
    martian.log_info("BCL folder RTA Version: %s" % rta_version)
    martian.log_info("BCL params: %s" % str(bcl_params))

    # Determine the best available bcl2fastq version to use
    # Will call martian.exit() with an error message if there isn't
    # a compatible version available
    hostname = socket.gethostname()
    (major_ver, full_ver) = tk_bcl.check_bcl2fastq(hostname, rta_version)
    outs.bcl2fastq_version = full_ver

    martian.log_info("Using bcl2fastq version: %s" % full_ver)
    martian.log_info("RC'ing i2 read: %s" % str(rc_i2_read))

    # Restore the LD_LIBRARY_PATH set aside by sourceme.bash/shell10x.
    # Only do this for the environment in which BCL2FASTQ will run.
    new_environ = dict(os.environ)
    new_environ['LD_LIBRARY_PATH'] = os.environ['_TENX_LD_LIBRARY_PATH']

    if major_ver == tk_bcl.BCL2FASTQ_V1:
        # bcl2fastq v1 is rejected outright; nothing is configured or run.
        martian.exit(
            "bcl2fastq 1.8.4 is not currently supported. Please install bcl2fastq2, or use the 10x 'demux' pipeline instead."
        )

    elif major_ver == tk_bcl.BCL2FASTQ_V2:
        if not os.path.exists(outs.interop_path):
            os.makedirs(outs.interop_path)
        if not os.path.exists(outs.fastq_path):
            os.makedirs(outs.fastq_path)

        # minimum-trimmed-read-length and mask-short-adapter-reads must be our call (SIs, UMIs)
        # Capped at 8 so the minimum stays at sample-index length even if an
        # extra base was grabbed for QC purposes (I8n, for example).
        min_read_length = min(8, min(x["read_length"] for x in read_info))

        cmd = [
            "bcl2fastq",
            "--minimum-trimmed-read-length",
            str(min_read_length),
            "--mask-short-adapter-reads",
            str(min_read_length),
            "--create-fastq-for-index-reads",
            "--ignore-missing-positions",
            "--ignore-missing-filter",
            "--ignore-missing-bcls",
            "--use-bases-mask=" + use_bases_mask_val,
            "-R",
            args.run_path,
            "--output-dir=" + output_dir,
            "--interop-dir=" + interop_dir,
            "--sample-sheet=" + args.samplesheet_path
        ]
        cmd += remove_deprecated_args(args.bcl2fastq2_args, major_ver,
                                      full_ver)
        outs.bcl2fastq_args = " ".join(cmd)

        martian.log_info("Running bcl2fastq2: %s" % (" ".join(cmd)))

        try:
            ret = tk_proc.call(cmd, env=new_environ)
        except OSError:
            martian.throw(
                "bcl2fastq not found on PATH -- make sure you've added it to your environment"
            )

        if ret > 0:
            # Point the user at the _stderr file two directory levels above
            # the path martian.make_path() returns for '_stderr'.
            files_path = os.path.abspath(martian.make_path('_stderr'))
            enclosing_path = os.path.dirname(os.path.dirname(files_path))
            stderr_path = os.path.join(enclosing_path, '_stderr')
            martian.exit(
                "bcl2fastq exited with an error. You may have specified an invalid command-line option. See the full error here:\n%s"
                % stderr_path)
        elif ret < 0:
            # subprocess.call returns negative code (on UNIX): bcl2fastq killed by external signal.
            # The return code is -N for signal N, so negate it for the message.
            martian.exit("bcl2fastq was killed with signal %d." % -ret)
Exemplo n.º 6
0
def process_raw_ilmn_data(args, outs):
    """
    run_path must be the top-level Illumina run directory

    Converts the run's BCL data to FASTQ files under ``outs.raw_fastq_path``
    with whichever bcl2fastq major version ``tk_bcl.check_bcl2fastq`` selects,
    then passes the produced FASTQ files to ``rename_fastq_files``.

    Reads from ``args``: ``run_path``, ``tile_suffix`` (``'*'`` means "do not
    split by tile"), ``num_threads`` (v1 ``make -j``) and ``__threads``
    (bcl2fastq2 ``-r``/``-w``/``-p``).

    Failure reporting:
      * ``martian.throw`` when the run directory is missing, when tile
        splitting is requested with bcl2fastq v1, when the tile name length
        cannot be determined, when an executable cannot be launched, or when
        the v1 configure/make steps fail;
      * ``martian.exit`` when bcl2fastq2 returns non-zero or is killed by a
        signal.

    Raises:
        KeyError: if ``_TENX_LD_LIBRARY_PATH`` is not in the environment.
    """
    input_dir = os.path.join(args.run_path, "Data", "Intensities", "BaseCalls")
    output_dir = outs.raw_fastq_path

    martian.log_info("Running bcl2fastq on run: %s" % args.run_path)
    martian.log_info("FASTQ output dir: %s" % output_dir)

    if not os.path.exists(args.run_path):
        martian.throw("Run directory does not exist: %s" % args.run_path)

    run_info_xml = os.path.join(args.run_path, "RunInfo.xml")
    read_info, flowcell = tk_bcl.load_run_info(run_info_xml)
    use_bases_mask_val = tk_bcl.make_bases_mask_val(read_info)

    # Determine the RTA version of the run and whether this instrument
    # requires i2 to RC'd
    (rta_version, _rc_i2_read,
     bcl_params) = tk_bcl.get_rta_version(args.run_path)
    martian.log_info("BCL folder RTA Version: %s" % rta_version)
    martian.log_info("BCL params: %s" % str(bcl_params))

    # Determine the best available bcl2fastq version to use
    # Will call martian.exit() with an error message if there isn't
    # a compatible version available
    hostname = socket.gethostname()
    (major_ver, full_ver) = tk_bcl.check_bcl2fastq(hostname, rta_version)

    martian.log_info("Using bcl2fastq version: %s" % full_ver)

    tile_split = args.tile_suffix != '*'

    # Internal use only. Move aside Illumina sample sheet so
    # bcl2fastq doesn't use it. For customers, there is a pre-flight
    # check to make sure there is no sample sheet in the places
    # bcl2fastq looks for it.  Whether `kitten` can be imported is what
    # decides if the rename happens at all.
    try:
        import kitten  # noqa: F401
    except ImportError:
        pass
    else:
        # Older RTA put sheet into Data/Intensities/BaseCalls while
        # newer RTA put sheet at top of the BCL folder. Check both.
        for ss_dir in [args.run_path, input_dir]:
            ilmn_sample_sheet = os.path.join(ss_dir, "SampleSheet.csv")
            mv_sample_sheet = os.path.join(ss_dir, "IlluminaSampleSheet.csv")
            if os.path.exists(ilmn_sample_sheet):
                martian.log_info("Renaming the Illumina sample sheet")
                os.rename(ilmn_sample_sheet, mv_sample_sheet)

    # Restore the LD_LIBRARY_PATH set aside by sourceme.bash/shell10x.
    # Only do this for the environment in which BCL2FASTQ will run.
    new_environ = dict(os.environ)
    new_environ['LD_LIBRARY_PATH'] = os.environ['_TENX_LD_LIBRARY_PATH']

    if major_ver == tk_bcl.BCL2FASTQ_V1:
        if tile_split:
            martian.throw(
                "Cannot support NovaSeq demux scheme on bcl2fastq v1.  Exiting."
            )

        # configure
        # write bigger fastq chunks to avoid blow-up of chunks
        cmd = [
            "configureBclToFastq.pl", "--fastq-cluster-count", "20000000",
            "--no-eamss", "--use-bases-mask=" + use_bases_mask_val,
            "--input-dir=" + input_dir, "--output-dir=" + output_dir
        ]

        martian.log_info("Running bcl2fastq setup command:")
        martian.log_info(" ".join(cmd))

        try:
            ret = tenkit.log_subprocess.call(cmd, env=new_environ)
        except OSError:
            martian.throw(
                "configureBclToFastq.pl not found on path -- make sure you've added it to your environment"
            )

        if ret != 0:
            martian.throw("configureBclToFastq.pl failed. Exiting.")

        # Run the actual makefiles
        makefile = os.path.join(output_dir, "Makefile")
        if not os.path.exists(makefile):
            martian.throw("BclToFastq Makefile not found where expected: %s" %
                          makefile)

        martian.log_info("Running Makefile...")
        mk_cmd = ["make", "-C", output_dir, "-j", str(args.num_threads)]
        martian.log_info(" ".join(mk_cmd))
        ret = tenkit.log_subprocess.call(mk_cmd, env=new_environ)

        if ret > 0:
            martian.throw(
                "running the BclToFastq Makefile failed with code: %d. Exiting"
                % ret)
        elif ret < 0:
            # A negative return code -N means "killed by signal N".
            martian.throw("Bcl2Fastq was killed with signal %d." % -ret)

    elif major_ver == tk_bcl.BCL2FASTQ_V2:
        if tile_split:
            proj_output_dir = os.path.join(output_dir,
                                           "Tile%s" % args.tile_suffix,
                                           "Project_%s" % flowcell)
        else:
            proj_output_dir = os.path.join(output_dir, "Project_%s" % flowcell)

        fastq_output_dir = os.path.join(proj_output_dir, "fastq")
        interop_output_dir = os.path.join(proj_output_dir, "interop")

        if not os.path.exists(fastq_output_dir):
            os.makedirs(fastq_output_dir)

        if not os.path.exists(interop_output_dir):
            os.makedirs(interop_output_dir)

        min_read_length = min(x["read_length"] for x in read_info)

        # When splitting by tile, restrict bcl2fastq to tiles whose name ends
        # with the requested suffix; the leading positions match any digit.
        tiles_arg = None
        if tile_split:
            flowcell_info = tk_lane.get_flowcell_layout(run_info_xml)
            if flowcell_info.tile_length is None:
                martian.throw(
                    "Cannot determine tile name length from RunInfo.xml")

            tiles_regex_prefix = "[0-9]" * (flowcell_info.tile_length - 1)
            tiles_regex = "%s%s" % (tiles_regex_prefix, args.tile_suffix)
            tiles_arg = "--tiles=" + tiles_regex

        threads = str(args.__threads)
        cmd = [
            "bcl2fastq",
            "--minimum-trimmed-read-length",
            str(min_read_length),
            # PIPELINES-1140 - required in bcl2fastq 2.17 to generate correct index read fastqs
            "--mask-short-adapter-reads",
            str(min_read_length),
            # LONGRANGER-121 - ignore missing bcl data
            "--ignore-missing-bcls",
            "--ignore-missing-filter",
            "--ignore-missing-positions",
            "--ignore-missing-controls",
            '-r',
            threads,
            '-w',
            threads,
            # TENKIT-72 avoid CPU oversubscription
            '-p',
            threads,
            "--use-bases-mask=" + use_bases_mask_val,
            "-R",
            args.run_path,
            "--output-dir=" + fastq_output_dir,
            "--interop-dir=" + interop_output_dir
        ]
        if tiles_arg is not None:
            cmd.append(tiles_arg)

        martian.log_info("Running bcl2fastq 2: %s" % (" ".join(cmd)))

        try:
            ret = tenkit.log_subprocess.call(cmd, env=new_environ)
        except OSError:
            martian.throw(
                "bcl2fastq not found on PATH -- make sure you've added it to your environment"
            )

        if ret > 0:
            martian.exit("bcl2fastq failed. Exiting.")
        elif ret < 0:
            # A negative return code -N means "killed by signal N".
            martian.exit("bcl2fastq was killed with signal %d." % -ret)

    # Glob over all lanes - demultiplex handles whether to collapse them
    if tile_split:
        fastq_glob = os.path.join(output_dir, "Tile*", "Project_" + flowcell,
                                  "*", "*.fastq*")
    else:
        fastq_glob = os.path.join(output_dir, "Project_" + flowcell, "*",
                                  "*.fastq*")
    start_fastq_files = glob.glob(fastq_glob)

    # File renaming -- bcl2fastq names the reads R1, R2, R3, R4
    # Use our conventions to make them R1, I1, I2, R2, as the case may be.
    rename_fastq_files(read_info, start_fastq_files)