示例#1
0
def split(args):
    """Build the chunk definitions for this stage.

    When a barcode whitelist exists for ``args.chemistry_def`` the data are
    treated as barcoded: one chunk is emitted per
    (read1 FASTQ, read2 FASTQ, barcodes JSON) triple, with the barcode list
    loaded from the JSON file. Otherwise every input FASTQ pair is
    concatenated into a single pair of output files and exactly one chunk is
    emitted, with ``[""]`` as its barcode list.

    Args:
        args: Stage arguments. Reads ``read1s``, ``read2s``,
            ``chunk_barcodes`` and ``chemistry_def``.

    Returns:
        ``{'chunks': [...]}``, or the result of ``get_dummy_chunk()`` when no
        chunk was produced.
    """
    assert args.read1s is not None and args.read2s is not None

    chunks = []

    if cr_chem.get_barcode_whitelist(args.chemistry_def) is not None:

        # Data are barcoded
        for read1_fq, read2_fq, barcodes_json in zip(args.read1s, args.read2s,
                                                     args.chunk_barcodes):
            with open(barcodes_json) as f:
                chunk_barcodes = json.load(f)

            chunks.append({
                'read1_chunk': read1_fq,
                'read2_chunk': read2_fq,
                'barcodes_chunk': chunk_barcodes,
                '__mem_gb': 3.0,
            })

    else:
        # Most stages assume that each chunk has a single barcode.
        # So unfortunately we have to put all reads in the same chunk, otherwise
        # metric computation will break.
        read1_out_filename = martian.make_path('chunk0_1.fastq')
        read2_out_filename = martian.make_path('chunk0_2.fastq')
        with open(read1_out_filename,
                  'w') as read1_out, open(read2_out_filename,
                                          'w') as read2_out:
            for read1_file, read2_file in zip(args.read1s, args.read2s):
                with open(read1_file) as in1, open(read2_file) as in2:
                    fastq1_iter = tk_fasta.read_generator_fastq(
                        in1, paired_end=False)
                    fastq2_iter = tk_fasta.read_generator_fastq(
                        in2, paired_end=False)

                    # The read1 iterator drives the loop: read2 records
                    # beyond the last read1 record are never consumed.
                    for read1_tuple in fastq1_iter:
                        # Use the next() builtin rather than the Python 2-only
                        # .next() method so this works on Python 2.6+ and 3.
                        read2_tuple = next(fastq2_iter)
                        tk_fasta.write_read_fastq(read1_out, *read1_tuple)
                        tk_fasta.write_read_fastq(read2_out, *read2_tuple)

        chunks.append({
            'read1_chunk': read1_out_filename,
            'read2_chunk': read2_out_filename,
            'barcodes_chunk': [""],
        })

    # Martian doesn't like empty chunk lists so create a chunk w/ empty data
    if not chunks:
        return get_dummy_chunk()

    return {'chunks': chunks}
示例#2
0
def main(args, outs):
    """Validate the sample defs, build the FASTQ chunks and publish the chemistry.

    Steps, in order:
      1. Run the gem-group preflight check on ``args.sample_def``; exit with
         its message on failure.
      2. Exit if ``args.chemistry_name`` is None.
      3. Pick the chemistry: the custom definition when the name equals
         ``cr_chem.CUSTOM_CHEMISTRY_NAME``, otherwise a lookup by name.
      4. Call ``setup_chunks`` once per sample def and accumulate the results
         in ``outs.chunks``; exit if any sample def (or the whole run) yields
         no chunks.
      5. Run the FASTQ and chemistry checks over all chunks.
      6. Set ``outs.chemistry_def`` from the first chunk and
         ``outs.barcode_whitelist`` from that chemistry.

    Args:
        args: Stage arguments; reads ``sample_def``, ``chemistry_name``,
            ``custom_chemistry_def`` and ``sample_id``.
        outs: Stage outputs; ``chunks``, ``chemistry_def`` and
            ``barcode_whitelist`` are written.
    """
    gem_groups_ok, gem_groups_msg = tk_preflight.check_gem_groups(
        args.sample_def)
    if not gem_groups_ok:
        martian.exit(gem_groups_msg)

    if args.chemistry_name is None:
        martian.exit(
            "The chemistry was unable to be automatically determined. "
            "This can happen if not enough reads originate from the given reference. "
            "Please verify your choice of reference or explicitly specify the chemistry via the --chemistry argument."
        )

    # A custom chemistry carries its own definition; anything else is
    # resolved by name.
    chemistry_def = (args.custom_chemistry_def
                     if args.chemistry_name == cr_chem.CUSTOM_CHEMISTRY_NAME
                     else cr_chem.get_chemistry(args.chemistry_name))

    # Build the chunk dicts, one batch per sample def.
    outs.chunks = []

    for sample_def in args.sample_def:
        fastq_spec = cr_fastq.FastqSpec.from_sample_def(sample_def)
        group = sample_def['gem_group']
        lib_id = sample_def.get('library_id', 'MissingLibrary')

        sample_chunks = setup_chunks(args.sample_id, fastq_spec, group,
                                     lib_id, chemistry_def)

        # A sample def that yields no FASTQs is fatal.
        if len(sample_chunks) == 0:
            martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

        outs.chunks += sample_chunks

    # Covers the case where there were no sample defs at all.
    if len(outs.chunks) == 0:
        martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

    # Check the FASTQ files, then the chemistry specifications.
    check_chunk_fastqs(outs.chunks)
    check_chunk_chemistries(outs.chunks)

    # Publish the chemistry of the first chunk and its barcode whitelist.
    first_chunk = outs.chunks[0]
    outs.chemistry_def = first_chunk['chemistry']
    outs.barcode_whitelist = cr_chem.get_barcode_whitelist(outs.chemistry_def)
示例#3
0
def join(args, outs, chunk_defs, chunk_outs):
    """Collect per-chunk outputs into the stage outputs and write the summary.

    Merges the per-chunk reporters, gathers the per-chunk read/BAM output
    lists (FASTQs when ``args.output_fastqs`` is truthy, BAMs otherwise),
    records the barcodes assigned to each chunk, writes the UMI info when the
    chemistry has a barcode whitelist, and finally writes the summary JSON.

    Args:
        args: Stage arguments; reads ``output_fastqs``, ``chemistry_def`` and
            ``vdj_reference_path``.
        outs: Stage outputs, populated in place.
        chunk_defs: Per-chunk definitions; ``barcodes_chunk`` is read from each.
        chunk_outs: Per-chunk outputs.
    """
    outs.chunked_reporter = None
    merged_reporter = cr_report.merge_reporters(
        [out.chunked_reporter for out in chunk_outs])

    outs.reads_per_bc = [out.reads_per_bc for out in chunk_outs]

    if not args.output_fastqs:
        # BAM mode: no FASTQ lists, one BAM entry per chunk.
        outs.barcode_chunked_read1 = []
        outs.barcode_chunked_read2 = []
        outs.barcode_chunked_bams = [
            out.barcode_chunked_bams for out in chunk_outs
        ]
    else:
        # FASTQ mode: one read1/read2 entry per chunk, no BAMs.
        outs.barcode_chunked_read1 = [
            out.barcode_chunked_read1 for out in chunk_outs
        ]
        outs.barcode_chunked_read2 = [
            out.barcode_chunked_read2 for out in chunk_outs
        ]
        outs.barcode_chunked_bams = []

    # Record which barcodes went into each chunk.
    outs.barcodes_in_chunks = [
        definition.barcodes_chunk for definition in chunk_defs
    ]

    # A lone chunk whose first barcode is the empty string means the data
    # were not barcoded; report null chunk info in that case.
    per_chunk_barcodes = outs.barcodes_in_chunks
    if len(per_chunk_barcodes) == 1 and per_chunk_barcodes[0][0] == '':
        outs.barcodes_in_chunks = None

    # UMI info is only written for barcoded data.
    whitelist = cr_chem.get_barcode_whitelist(args.chemistry_def)
    if whitelist is not None:
        write_umi_info([out.chunked_gene_umi_counts for out in chunk_outs],
                       outs.umi_info)

    merged_reporter.store_reference_metadata(
        args.vdj_reference_path,
        vdj_constants.REFERENCE_TYPE,
        vdj_constants.REFERENCE_METRIC_PREFIX)

    # Write the output JSON summary.
    merged_reporter.report_summary_json(outs.summary)
示例#4
0
def split(args):
    """Emit one chunk per (read1 FASTQ, read2 FASTQ, barcodes JSON) triple.

    The data must be barcoded: the function asserts that a barcode whitelist
    exists for ``args.chemistry_def``. Each chunk carries the barcodes JSON
    entry itself (not its loaded contents) under ``barcodes_chunk``.

    Args:
        args: Stage arguments; reads ``read1s``, ``read2s``,
            ``chunk_barcodes`` and ``chemistry_def``.

    Returns:
        ``{'chunks': [...]}``, or the result of ``get_dummy_chunk()`` when the
        inputs produce no chunks.
    """
    assert args.read1s is not None
    assert args.read2s is not None

    # Ensure that data are barcoded
    assert cr_chem.get_barcode_whitelist(args.chemistry_def) is not None

    triples = zip(args.read1s, args.read2s, args.chunk_barcodes)
    chunks = [{
        'read1_chunk': r1_path,
        'read2_chunk': r2_path,
        'barcodes_chunk': bc_json,
        '__mem_gb': 3,
    } for r1_path, r2_path, bc_json in triples]

    if chunks:
        return {'chunks': chunks}

    # Martian doesn't like empty chunk lists so create a chunk w/ empty data
    return get_dummy_chunk()
示例#5
0
def main(args, outs):
    """Build FASTQ chunks for every sample def, dispatching on its fastq_mode.

    After the gem-group preflight check, each sample def is handed to the
    chunk builder matching its ``fastq_mode`` (BCL processor or Illumina
    bcl2fastq); an unrecognized mode is reported through ``martian.throw``.
    The stage exits if any sample def, or the run as a whole, yields no
    chunks. All chunks are then checked, and the chemistry of the first chunk
    plus its barcode whitelist are written to ``outs``.

    Args:
        args: Stage arguments; reads ``sample_def``, ``sample_id``,
            ``chemistry_name`` and ``custom_chemistry_def``.
        outs: Stage outputs; ``chunks``, ``chemistry_def`` and
            ``barcode_whitelist`` are written.
    """
    preflight_ok, preflight_msg = tk_preflight.check_gem_groups(
        args.sample_def)
    if not preflight_ok:
        martian.exit(preflight_msg)

    outs.chunks = []
    for sample_def in args.sample_def:
        mode = sample_def['fastq_mode']
        sample_chunks = []

        # Select the chunk builder for this FASTQ layout.
        if mode == tk_constants.BCL_PROCESSOR_FASTQ_MODE:
            build_chunks = main_bcl_processor
        elif mode == tk_constants.ILMN_BCL2FASTQ_FASTQ_MODE:
            build_chunks = main_ilmn_bcl2fastq
        else:
            build_chunks = None
            martian.throw("Unrecognized fastq_mode: %s" % mode)

        if build_chunks is not None:
            sample_chunks = build_chunks(args.sample_id, sample_def,
                                         args.chemistry_name,
                                         args.custom_chemistry_def)

        # A sample def without any FASTQs is fatal.
        if len(sample_chunks) == 0:
            martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

        outs.chunks += sample_chunks

    # Covers the case where there were no sample defs at all.
    if len(outs.chunks) == 0:
        martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

    check_chunk_fastqs(outs.chunks)
    check_chunk_chemistries(outs.chunks)

    # Output chemistry and barcode whitelist, taken from the first chunk.
    first_chunk = outs.chunks[0]
    outs.chemistry_def = first_chunk['chemistry']
    outs.barcode_whitelist = cr_chem.get_barcode_whitelist(outs.chemistry_def)
示例#6
0
def main(args, outs):
    """Build FASTQ chunks per library and publish chemistry and library info.

    Steps, in order:
      1. Run the gem-group preflight check; exit with its message on failure.
      2. Exit if ``args.chemistry_name`` is None.
      3. Pick the chemistry: the custom definition when the name equals
         ``cr_chem.CUSTOM_CHEMISTRY_NAME``, otherwise a lookup by name.
      4. Assign a library id to every sample def, then call ``setup_chunks``
         per sample def and accumulate the results in ``outs.chunks``; exit if
         any sample def (or the whole run) yields no chunks.
      5. Run the FASTQ and chemistry checks over all chunks.
      6. Set ``outs.chemistry_def`` / ``outs.barcode_whitelist`` from the
         first chunk, and ``outs.library_info`` to the sorted, de-duplicated
         (gem_group, library_id, library_type) combinations of the chunks.

    Args:
        args: Stage arguments; reads ``sample_def``, ``sample_id``,
            ``chemistry_name``, ``custom_chemistry_def`` and
            ``default_library_type``.
        outs: Stage outputs; ``chunks``, ``chemistry_def``,
            ``barcode_whitelist`` and ``library_info`` are written.
    """
    preflight_ok, preflight_msg = tk_preflight.check_gem_groups(
        args.sample_def)
    if not preflight_ok:
        martian.exit(preflight_msg)

    if args.chemistry_name is None:
        martian.exit(
            "The chemistry was unable to be automatically determined. "
            "This can happen if not enough reads originate from the given reference. "
            "Please verify your choice of reference or explicitly specify the chemistry via the --chemistry argument."
        )

    # A custom chemistry carries its own definition; anything else is
    # resolved by name.
    chemistry_def = (args.custom_chemistry_def
                     if args.chemistry_name == cr_chem.CUSTOM_CHEMISTRY_NAME
                     else cr_chem.get_chemistry(args.chemistry_name))

    # Build chunk dicts
    outs.chunks = []

    # Assign library ids; a falsy default library type falls back to the
    # package-wide default.
    all_sample_defs = args.sample_def
    fallback_lib_type = (args.default_library_type
                         or lib_constants.DEFAULT_LIBRARY_TYPE)
    assigned_lib_ids = cr_sample_def.assign_library_ids(all_sample_defs,
                                                        fallback_lib_type)

    for sample_def, lib_id in zip(all_sample_defs, assigned_lib_ids):
        fastq_spec = cr_fastq.FastqSpec.from_sample_def(sample_def)
        group = cr_sample_def.get_gem_group(sample_def)
        lib_type = (cr_sample_def.get_library_type(sample_def)
                    or fallback_lib_type)
        rate = cr_sample_def.get_subsample_rate(sample_def)

        sample_chunks = setup_chunks(args.sample_id, fastq_spec, group,
                                     lib_id, chemistry_def, lib_type, rate)

        # A sample def that yields no FASTQs is fatal.
        if len(sample_chunks) == 0:
            martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

        outs.chunks += sample_chunks

    # Covers the case where there were no sample defs at all.
    if len(outs.chunks) == 0:
        martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

    # Check the FASTQ files, then the chemistry specifications.
    check_chunk_fastqs(outs.chunks)
    check_chunk_chemistries(outs.chunks)

    # Output chemistry and barcode whitelist, taken from the first chunk.
    first_chunk = outs.chunks[0]
    outs.chemistry_def = first_chunk['chemistry']
    outs.barcode_whitelist = cr_chem.get_barcode_whitelist(outs.chemistry_def)

    # Output library info: one entry per distinct library, in sorted order.
    distinct_libs = {(chunk['gem_group'], chunk['library_id'],
                      chunk['library_type'])
                     for chunk in outs.chunks}
    outs.library_info = [{
        'gem_group': group,
        'library_id': lib_id,
        'library_type': lib_type,
    } for group, lib_id, lib_type in sorted(distinct_libs)]