Example #1
def main(args, outs):
    outs.coerce_strings()

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    outs.read1s = martian.make_path('reads_1.fastq' + h5_constants.LZ4_SUFFIX)
    r1_fq_out = cr_io.open_maybe_gzip(outs.read1s, 'w')

    if paired_end:
        outs.read2s = martian.make_path('reads_2.fastq' +
                                        h5_constants.LZ4_SUFFIX)
        r2_fq_out = cr_io.open_maybe_gzip(outs.read2s, 'w')
    else:
        outs.read2s = None
        r2_fq_out = None

    barcodes_out = cr_io.open_maybe_gzip(outs.chunk_barcodes, 'w')

    merge_by_barcode(args.fastqs, r1_fq_out, r2_fq_out, barcodes_out,
                     paired_end)

    r1_fq_out.close()
    if r2_fq_out is not None:
        r2_fq_out.close()
    barcodes_out.close()
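
Note: cr_io.open_maybe_gzip is not shown in this excerpt. A minimal sketch of the idea, assuming it simply dispatches on the filename suffix (the lz4 branch assumes the lz4.frame module):

import gzip

def open_maybe_gzip(filename, mode='r'):
    # Hypothetical sketch: pick an opener based on the compression suffix.
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    elif filename.endswith('.lz4'):
        import lz4.frame
        return lz4.frame.open(filename, mode)
    return open(filename, mode)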
Example #2
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end
    chunk_index = args.chunk_index

    prefixes = get_seqs(args.nbases)
    bam_in = tk_bam.create_bam_infile(args.input)
    template = BamTemplateShim(bam_in, keep_comments=(chunk_index==0))

    bams_out = {}
    for prefix in prefixes:
        filename = martian.make_path("bc_{}.bam".format(prefix))
        bams_out[prefix], _ = tk_bam.create_bam_outfile(filename, None, None, template=template)

    non_bc_bam = martian.make_path("bc_{}.bam".format(None))
    non_bc_bam_out, _ = tk_bam.create_bam_outfile(non_bc_bam, None, None, template=template)
    for read in tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)):
        barcode = crdna_io.get_read_barcode(read)
        if barcode is None:
            non_bc_bam_out.write(read)
        else:
            prefix = barcode[:args.nbases]
            bams_out[prefix].write(read)
    bam_in.close()

    non_bc_bam_out.close()
    sort_bam(non_bc_bam)
    outs.non_bc_bams = [non_bc_bam]

    outs.buckets = {}
    for prefix in prefixes:
        filename = bams_out[prefix].filename
        bams_out[prefix].close()
        sort_bam(filename)
        outs.buckets[prefix] = filename
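
Note: get_seqs is not shown above. A minimal sketch of what such a prefix generator could look like, assuming it enumerates every DNA string of the given length:

import itertools

def get_seqs(nbases):
    # All 4^nbases DNA strings of length nbases, e.g. 16 prefixes for nbases=2.
    return [''.join(t) for t in itertools.product('ACGT', repeat=nbases)]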
Example #3
def main(args, outs):
    """
    Wrapper for GATK
    """
    tmp = martian.make_path('tmp.vcf')
    ref = os.path.join(args.reference_path, 'fasta', 'genome.fa')
    cmd = [
        'java', '-jar', args.gatk_path, 'HaplotypeCaller', '-R', ref, '-L',
        args.targets_file, '-I', args.subset_bam, '-O', tmp,
        '--native-pair-hmm-threads', '1'
    ]
    subprocess.check_call(cmd)

    # rewrite the sample name in the VCF header line to the node ID
    tmp2 = martian.make_path('{}.vcf'.format(
        args.node_id))  # node ID in original tree
    with open(tmp2, 'w') as outf:
        for l in open(tmp):
            if l.startswith('#CHROM'):
                l = l.split()
                l[-1] = args.node_id
                l = '\t'.join(l) + '\n'
            outf.write(l)

    subprocess.check_call(['bgzip', tmp2])
    subprocess.check_call(['tabix', '-p', 'vcf', tmp2 + '.gz'])
    outs.subset_variants = tmp2 + '.gz'
Example #4
def main(args, outs):
    args.coerce_strings()
    tmp_bam = martian.make_path(str(args.cluster_id) + '.unsorted.bam')
    tk_bam.concatenate(tmp_bam, args.cluster_bams)
    outs.merged_bam = martian.make_path('{}.bam'.format(args.cluster_id))
    subprocess.check_call([
        'sambamba', 'sort', '-t',
        str(args.__threads), '-o', outs.merged_bam, tmp_bam
    ])
    os.remove(tmp_bam)
Example #5
def get_dummy_chunk():
    """Make one chunk with empty FASTQ files; Martian rejects empty chunk lists."""
    read1_out_filename = martian.make_path('chunk0_1.fastq')
    read2_out_filename = martian.make_path('chunk0_2.fastq')
    with open(read1_out_filename, 'w'), open(read2_out_filename, 'w'):
        pass
    chunks = [{
        'read1_chunk': read1_out_filename,
        'read2_chunk': read2_out_filename,
        'barcodes_chunk': None,
    }]
    return {'chunks': chunks}
Example #6
def main(args, outs):
    outs.coerce_strings()

    # Note: This naming scheme is required by FILTER_VDJ_READS / vdj_asm
    outs.read1s = martian.make_path('reads_1.fastq')
    outs.read2s = martian.make_path('reads_2.fastq')

    with open(outs.read1s, 'w') as r1_fq_out, \
         open(outs.read2s, 'w') as r2_fq_out, \
         open(outs.chunk_barcodes, 'w') as barcodes_out:
        merge_by_barcode(args.fastqs, r1_fq_out, r2_fq_out, barcodes_out)
Example #7
def main(args, outs):
    # run_assembly writes its outputs into the chunk directory (make_path(''))
    run_assembly(args.chunked_bam, martian.make_path(''), args)

    out_pref = os.path.splitext(os.path.basename(args.chunked_bam))[0]
    out_pref = martian.make_path(out_pref)
    cr_io.move(out_pref + '.fasta', outs.contig_fasta)
    cr_io.move(out_pref + '.fastq', outs.contig_fastq)
    cr_io.move(out_pref + '_summary.tsv', outs.summary_tsv)
    cr_io.move(out_pref + '_umi_summary.tsv', outs.umi_summary_tsv)
    cr_io.move(out_pref + '_sorted.bam', outs.contig_bam)
    cr_io.move(out_pref + '_sorted.bam.bai', outs.contig_bam_bai)
    cr_io.move(out_pref + '_metrics_summary.json', outs.metrics_summary_json)
Example #8
def prepare_transcriptome_indexes(reference_path, vdj_reference_path):
    """ Build kmer indexes of the transcriptome and (optionally) the VDJ reference.
        Returns (kmer_idx_path, vdj_idx_path) """

    ## Index the reference fasta
    fa_path = os.path.join(reference_path, cr_constants.REFERENCE_FASTA_PATH)
    new_fa_path = martian.make_path('ref.fa')

    need_index = True

    # Look for an existing .fai file (won't exist for our standard ref packages)
    if os.path.exists(fa_path + '.fai'):
        martian.update_progress('Found genome FASTA index...')
        new_fa_path = fa_path
        need_index = False

    else:
        # Note: this will fail if user's fs doesn't support symlinks
        martian.update_progress('Symlinking genome FASTA...')
        os.symlink(fa_path, new_fa_path)

    if need_index:
        martian.update_progress('Indexing genome...')
        run(['samtools', 'faidx', new_fa_path])

    ## Generate a transcriptome reference from a genome ref
    martian.update_progress('Building transcriptome...')
    gtf_path = os.path.join(reference_path,
                            cr_constants.REFERENCE_GENES_GTF_PATH)
    out_fa_path = martian.make_path('transcriptome.fa')
    # Only index the 1st encountered transcript per gene
    run([
        'detect_chemistry', 'get-transcripts', new_fa_path, gtf_path,
        out_fa_path
    ])

    ## Build kmer index
    martian.update_progress('Building kmer index...')
    kmer_idx_path = martian.make_path('kmers.idx')
    run(['detect_chemistry', 'index-transcripts', out_fa_path, kmer_idx_path])

    # Build VDJ kmer index (optional)
    vdj_idx_path = None
    if vdj_reference_path is not None:
        vdj_fa_path = vdj_ref.get_vdj_reference_fasta(vdj_reference_path)
        vdj_idx_path = martian.make_path('vdj_kmers.idx')
        run([
            'detect_chemistry', 'index-transcripts', vdj_fa_path, vdj_idx_path
        ])

    return (kmer_idx_path, vdj_idx_path)
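
Note: the run helper used above is not defined in this excerpt. A plausible stand-in, assuming it only wraps subprocess with logging:

import subprocess
import martian

def run(cmd):
    # Log and execute an external command, raising on a non-zero exit code.
    martian.log_info('Running: %s' % ' '.join(cmd))
    subprocess.check_call(cmd)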
Example #9
def split(args):
    assert args.read1s is not None and args.read2s is not None

    chunks = []

    if cr_chem.get_barcode_whitelist(args.chemistry_def) is not None:

        # Data are barcoded
        for read1_fq, read2_fq, barcodes_json in zip(args.read1s, args.read2s,
                                                     args.chunk_barcodes):
            with open(barcodes_json) as f:
                chunk_barcodes = json.load(f)

            chunks.append({
                'read1_chunk': read1_fq,
                'read2_chunk': read2_fq,
                'barcodes_chunk': chunk_barcodes,
                '__mem_gb': 3.0,
            })

    else:
        # Most stages assume that each chunk has a single barcode.
        # So unfortunately we have to put all reads in the same chunk, otherwise
        # metric computation will break.
        read1_out_filename = martian.make_path('chunk0_1.fastq')
        read2_out_filename = martian.make_path('chunk0_2.fastq')
        with open(read1_out_filename, 'w') as read1_out, \
             open(read2_out_filename, 'w') as read2_out:
            for read1_file, read2_file in zip(args.read1s, args.read2s):
                with open(read1_file) as in1, open(read2_file) as in2:
                    fastq1_iter = tk_fasta.read_generator_fastq(
                        in1, paired_end=False)
                    fastq2_iter = tk_fasta.read_generator_fastq(
                        in2, paired_end=False)

                    for read1_tuple in fastq1_iter:
                        read2_tuple = fastq2_iter.next()
                        tk_fasta.write_read_fastq(read1_out, *read1_tuple)
                        tk_fasta.write_read_fastq(read2_out, *read2_tuple)

        chunks.append({
            'read1_chunk': read1_out_filename,
            'read2_chunk': read2_out_filename,
            'barcodes_chunk': [""],
        })

    # Martian doesn't like empty chunk lists so create a chunk w/ empty data
    if len(chunks) == 0:
        return get_dummy_chunk()

    return {'chunks': chunks}
Example #10
def main(args, outs):
    """Create files, some of which are returned in a structure."""
    outs.bar = {
        'bar': args.foo + 3,
        'file1': martian.make_path('file1'),
        'file2': martian.make_path('file2'),
    }
    with open(outs.bar['file1'], 'w') as file1:
        file1.write(str(args.foo))
    with open(outs.bar['file2'], 'w') as file2:
        file2.write(str(args.foo + 1))
    with open(outs.file3, 'w') as file3:
        file3.write(str(args.foo + 2))
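
Note: a hypothetical companion join for this stage (illustrative only; the 'joined' out and its contents are assumptions) shows how a structured out like outs.bar comes back attached to each chunk:

import martian

def join(args, outs, chunk_defs, chunk_outs):
    # Each chunk out carries the 'bar' struct built in main.
    outs.joined = martian.make_path('joined.txt')
    with open(outs.joined, 'w') as out:
        for chunk in chunk_outs:
            with open(chunk.bar['file1']) as f:
                out.write(f.read())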
Example #11
def main(args, outs):
    np.random.seed(0)

    subsample_rate = args.subsample_info.get('subsample_rate')
    if subsample_rate is None:
        return

    mol_counter = MoleculeCounter.open(args.molecule_info,
                                       'r',
                                       start=int(args.chunk_start),
                                       length=int(args.chunk_len))

    # Subsample the matrices
    subsample_result = {}
    subsampled_raw_mats = cr_matrix.GeneBCMatrices.build_from_mol_counter(
        mol_counter,
        subsample_rate=subsample_rate,
        subsample_result=subsample_result)

    # Filter the subsampled matrices
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    subsampled_filt_mats = subsampled_raw_mats.filter_barcodes(
        filtered_bcs_per_genome)

    # Calculations for subsampled duplication rate
    reporter = cr_report.Reporter(
        genomes=map(str, mol_counter.get_ref_column('genome_ids')),
        subsample_types=cr_constants.ALL_SUBSAMPLE_TYPES,
        subsample_depths=args.subsample_info['all_target_rpc'])

    reporter.subsampled_duplication_frac_cb(
        subsampled_raw_mats,
        mol_counter,
        args.subsample_info['subsample_rate'],
        args.subsample_info['subsample_type'],
        args.subsample_info['target_rpc'],
        subsample_result['mapped_reads'],
    )

    mol_counter.close()

    reporter.save(outs.chunked_reporter)

    outs.subsampled_matrices = {}
    outs.subsampled_matrices['raw_matrices'] = martian.make_path(
        'raw_matrices.h5')
    outs.subsampled_matrices['filtered_matrices'] = martian.make_path(
        'filtered_matrices.h5')

    subsampled_raw_mats.save_h5(outs.subsampled_matrices['raw_matrices'])
    subsampled_filt_mats.save_h5(outs.subsampled_matrices['filtered_matrices'])
Example #12
def split(args):
    """Compute base background in split and use it in each chunk."""

    ref_mgr = ReferenceManager(args.reference_path)
    npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0
    if len(ref_mgr.list_species()) > 1 or npeaks == 0 or ref_mgr.motifs is None:
        chunk_def = [{'skip': True}]
        return {'chunks': chunk_def}

    with open(args.globalGCdict, 'r') as f:
        GCdict = pickle.load(f)

    GCdict_paths = {}
    GCbins = sorted(GCdict.keys())
    for gc in GCbins:
        GCdict_paths[gc] = martian.make_path('GCdict_{}_{}'.format(
            gc[0], gc[1]))
        with open(GCdict_paths[gc], 'w') as dump:
            pickle.dump(GCdict[gc], dump)

    # define one chunk per GC bin, each pointing at its pickled GC dictionary
    mem_in_gb = 8
    chunk_def = [{
        '__mem_gb': mem_in_gb,
        '__vmem_gb': mem_in_gb + int(np.ceil(ref_mgr.get_vmem_est())) + 1,
        'skip': False,
        'GCdict': GCdict_paths[chunk],
    } for chunk in GCbins]
    return {'chunks': chunk_def}
Example #13
def join(args, outs, chunk_defs, chunk_outs):
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    reference = ReferenceManager(args.reference_path)

    contig_info_fn = martian.make_path("contig_info.json")
    with open(contig_info_fn, 'w') as outfile:
        contig_info = get_contig_info(args.reference_path)
        json.dump(contig_info, outfile)

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = [
        "crconverter",
        args.sample_id,
        args.pipestance_type,
        "--matrix",
        args.feature_barcode_matrix,
        "--analysis",
        args.analysis,
        "--output",
        outs.output_for_cloupe,
        "--description",
        '"' + args.sample_desc + '"',
        "--peaks",
        args.peaks,
        "--fragmentsindex",
        args.fragments_index,
        "--geneannotations",
        reference.genes,
        "--contiginfo",
        contig_info_fn,
    ]

    if args.metrics_json is not None:
        call.extend(["--metrics", args.metrics_json])
    if args.aggregation_csv is not None:
        call.extend(["--aggregation", args.aggregation_csv])
    if gem_group_index_json is not None:
        call.extend(["--gemgroups", gem_group_index_json])
    transcript_gene_types = get_annotation_gene_types(args)
    if transcript_gene_types is not None:
        call.extend(["--geneannotationtypes", ",".join(transcript_gene_types)])

    # The sample desc may be unicode, so encode the whole arg list as UTF-8
    # before handing it to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    # but log the original 'call' list, since log_info does its own
    # encoding of the message (TODO: should log_info figure out the
    # encoding of the input string?)
    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
Example #14
def main(args, outs):

    # Write read_chunk for consumption by Rust
    with open("chunk_args.json", "w") as f:
        json.dump(args.read_chunk, f)

    output_path = martian.make_path("")
    prefix = "fastq_chunk"
    chunk_reads_args = [
        'chunk_reads', '--reads-per-fastq',
        str(args.reads_per_file), output_path, prefix, "--martian-args",
        "chunk_args.json", '--compress', 'lz4'
    ]
    print "running chunk reads: [%s]" % str(chunk_reads_args)
    tk_subproc.check_call(chunk_reads_args)

    with open(os.path.join(output_path, "read_chunks.json")) as f:
        chunk_results = json.load(f)

    outs.out_chunks = []

    # Write out a new chunk entry for each resulting chunk
    for chunk in chunk_results:
        chunk_copy = args.read_chunk.copy()
        chunk_copy['read_chunks'] = chunk
        outs.out_chunks.append(chunk_copy)
Example #15
def main_demultiplex_go(args, outs):
    data = {
        'common_sample_indices': args.common_bcs,
        'file_groups': [],
    }
    file_info = [IlmnFastqFile(x) for x in args.input_files]
    file_groups = groupby(lambda x: (x.s, x.lane, x.group), file_info).items()
    for (_, lane, _), input_files in file_groups:
        files = {read_type: [f for f in input_files if f.read == read_type][0].filename
                 for read_type in args.read_types}
        data['file_groups'].append({
            'lane': lane,
            'files': files,
        })

    input_json_path = martian.make_path('godemux_input.json')
    with open(input_json_path, 'w') as f:
        json.dump(data, f)

    output_dir = outs.demultiplexed_fastq_path
    if args.split_by_tile:
        output_dir = os.path.join(output_dir, args.tile_folder)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    subproc_args = ['godemux', input_json_path, output_dir,
                    outs.demultiplex_summary, '--demult-read', args.si_read_type,
                    '--chunk', str(args.chunk_number)]
    if args.rc_i2_read:
        subproc_args += ['--rci2read']
    martian.check_call(subproc_args)
Example #16
def main(args, outs):
    in_bam = pysam.Samfile(args.possorted_bam)
    bcs = {x.rstrip() for x in open(args.cell_barcodes)}
    txs = [GenePredTranscript(x) for x in args.tx_subset]
    results = [[tx.name, find_recs(tx, in_bam, bcs)] for tx in txs]
    outs.pickle = martian.make_path('subset_results.pickle')
    with open(outs.pickle, 'w') as outf:
        pickle.dump(results, outf)
Example #17
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    gg_id_to_batch_id, batch_id_to_name = {}, {}

    for lib in args.library_info:
        gg_id_to_batch_id[lib['gem_group']] = lib['batch_id']
        batch_id_to_name[lib['batch_id']] = lib['batch_name']

    matrix = cr_matrix.CountMatrix.load_h5_file(args.matrix_h5)
    matrix = matrix.select_features_by_type(GENE_EXPRESSION_LIBRARY_TYPE)

    batch_ids = np.array([gg_id_to_batch_id[cr_util.split_barcode_seq(bc)[1]] for bc in matrix.bcs])

    # select intersect of non-zero feature in each batch
    feature_mask = np.ones(matrix.features_dim)
    for b_id in batch_id_to_name:
        batch_bc_indices = np.where(batch_ids == b_id)[0]
        matrix_view = cr_matrix.CountMatrixView(matrix, bc_indices=batch_bc_indices)
        feature_mask = np.logical_and(feature_mask, matrix_view.sum(axis=1))

    matrix = matrix.select_features(np.flatnonzero(feature_mask))

    # filter barcodes with zero count
    bc_indices = np.flatnonzero(matrix.get_counts_per_bc())
    matrix = matrix.select_barcodes(bc_indices)

    # L2-normalize each barcode (column) of the matrix
    matrix.m = matrix.m.astype('float64')
    cr_matrix.inplace_csc_column_normalize_l2(matrix.m)

    n_pcs = args.num_pcs if args.num_pcs is not None else analysis_constants.CBC_N_COMPONENTS_DEFAULT
    dimred_matrix = fbpca_reduce_dimension(matrix, n_pcs)

    outs.dimred_matrix = martian.make_path('dimred_matrix.pickle')
    with open(outs.dimred_matrix, 'wb') as fp:
        cPickle.dump(dimred_matrix, fp, cPickle.HIGHEST_PROTOCOL)

    bc_feature_info = {
        'barcodes' : matrix.bcs,
        'features' : matrix.feature_ref.feature_defs,
    }
    outs.matrix_barcode_feature_info = martian.make_path('matrix_barcode_feature_info.pickle')
    with open(outs.matrix_barcode_feature_info, 'wb') as fp:
        cPickle.dump(bc_feature_info, fp, cPickle.HIGHEST_PROTOCOL)
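
Note: fbpca_reduce_dimension is not shown. One way it could look, assuming the fbpca package and a features-x-barcodes matrix.m whose barcode scores are wanted row-wise (a sketch, not the pipeline's implementation):

import fbpca

def fbpca_reduce_dimension(matrix, n_pcs):
    # Randomized PCA; returns a (num_barcodes x n_pcs) score matrix.
    U, s, Vt = fbpca.pca(matrix.m.T, k=n_pcs)
    return U * s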
Example #18
def split(args):
    # Write BAM comments to json file
    bam_comment_fn = martian.make_path('bam_comments.json')
    with open(bam_comment_fn, 'w') as f:
        json.dump(args.bam_comments, f)

    # Write library info to a file
    libraries_fn = martian.make_path('libraries.json')
    with open(libraries_fn, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(args.library_info),
                  f,
                  indent=4,
                  sort_keys=True)

    chunks = []
    for chunk_genome_input, tags, gem_group, library_type, library_id in itertools.izip_longest(
            args.genome_inputs, args.tags, args.gem_groups, args.library_types,
            args.library_ids):

        gem_group_str = str(gem_group)
        if (gem_group_str in args.skip_translate
                and library_type in args.skip_translate[gem_group_str]):
            this_skip_translate = args.skip_translate[gem_group_str][library_type]
        else:
            this_skip_translate = True

        chunks.append({
            'chunk_genome_input': chunk_genome_input,
            'chunk_tags': tags,
            'gem_group': gem_group,
            'library_type': library_type,
            'library_id': library_id,
            'library_info_json': libraries_fn,
            'bam_comments_json': bam_comment_fn,
            'skip_translate': this_skip_translate,
            '__mem_gb': 4,
        })
    join = {
        '__mem_gb': 12,
    }
    return {'chunks': chunks, 'join': join}
Example #19
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()
    results = flatten_list_of_lists(
        [pickle.load(open(chunk.subset_results)) for chunk in chunk_outs])
    results = flatten_list_of_lists(results)
    tx_dict = get_gene_pred_dict(args.transcripts)
    results = parse_results(results, tx_dict, args.kit_type)
    outs.results = martian.make_path('results.csv')
    df = pd.DataFrame(
        results, columns=['tx_id', 'tx_position', 'read_molecule_fraction'])
    df.to_csv(outs.results)
Example #20
def split(args):
    out_json_file = martian.make_path('snps.json')
    min_snp_call_qual = args.min_snp_call_qual if args.min_snp_call_qual is not None \
                        else snp_constants.DEFAULT_MIN_SNP_CALL_QUAL
    save_snps(out_json_file, args.variants, min_snp_call_qual)

    chunks = [{
        'chunk_variants': chunk_variants,
        'snps': out_json_file
    } for chunk_variants in args.variants]
    return {'chunks': chunks}
Example #21
def main(args, outs):
    genome_fasta_path = cr_utils.get_reference_genome_fasta(
        args.reference_path)

    chrom, start, stop = args.locus
    bed_path = martian.make_path('region.bed')
    with open(bed_path, 'w') as f:
        f.write(chrom + "\t" + str(start) + "\t" + str(stop) + "\n")

    # GATK SplitNCigarReads rewrites STAR's MAPQ 255 to 60 and splits reads
    # spanning Ns in their CIGAR strings
    output_bam = martian.make_path('output.bam')
    gatk_args = [
        'gatk-launch', 'SplitNCigarReads', '-R', genome_fasta_path, '-I',
        args.input, '-L', bed_path, '-O', output_bam,
        '--skip-mapping-quality-transform', 'false',
        '--create-output-bam-index', 'false', '--TMP_DIR',
        os.getcwd()
    ]

    subprocess.check_call(gatk_args)
Example #22
def main(args, outs):
    genome_fasta_path = cr_utils.get_reference_genome_fasta(args.reference_path)

    chrom, start, stop = tk_io.get_locus_info(args.locus)
    bed_path = martian.make_path('region.bed')
    with open(bed_path, 'w') as f:
        f.write(chrom + "\t" + str(start) + "\t" + str(stop) + "\n")

    freebayes_args = ['freebayes', '-f', genome_fasta_path, '-b', args.input, '-0', '-t', bed_path]

    with open(outs.output, 'w') as f:
        subprocess.check_call(freebayes_args, stdout=f)
Example #23
def main(args, outs):
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {
        int(k): v
        for k, v in args.chunks_per_gem_group.iteritems()
    }

    with open(args.read1s_chunk) as f1:
        read1s = [read for read in tk_fasta.read_generator_fastq(f1)]

    with open(args.read2s_chunk) as f2:
        read2s = [read for read in tk_fasta.read_generator_fastq(f2)]

    assert len(read1s) == len(read2s)

    fastqs_out = {}
    buckets = {}

    outs.buckets = {}

    for gem_group, bucket_name in enumerate_bucket_names(
            args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        fastqs_out[bucket_name] = open(filename, 'w')
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2 in itertools.izip(read1s, read2s):
        barcode = vdj_utils.get_fastq_read_barcode(read1)

        # Exclude unbarcoded reads
        if barcode is None:
            continue

        assert barcode == vdj_utils.get_fastq_read_barcode(read2)

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq,
                                      args.chunks_per_gem_group[gem_group])

        buckets[bucket_name].append(read1)
        buckets[bucket_name].append(read2)

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        fastq_out = fastqs_out[bucket_name]
        for read in bucket:
            tk_fasta.write_read_fastq(fastq_out, *read)

        fastq_out.close()
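
Note: neither enumerate_bucket_names nor get_bucket_name appears in this excerpt. A self-consistent illustrative pair (the naming scheme is an assumption, not the pipeline's actual one):

def enumerate_bucket_names(chunks_per_gem_group):
    # Yield (gem_group, bucket_name) for every bucket of every gem group.
    for gem_group, num_buckets in sorted(chunks_per_gem_group.items()):
        for i in xrange(num_buckets):
            yield gem_group, '%d-%d' % (gem_group, i)

def get_bucket_name(gem_group, barcode_seq, num_buckets):
    # Deterministically assign a barcode to one of its gem group's buckets.
    return '%d-%d' % (gem_group, hash(barcode_seq) % num_buckets)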
Example #24
def main(args, outs):
    """For each slice produce a fasta file sampling reads from that slice. 
    We split our section of the genome into a bunch of 20kb chunks. For each
    chunk we sample an identical number of paired end reads. The name of each
    read encodes the true position that it was sampled from."""

    # Grab basic stats for the read lengths and quality scores
    stats_fp = open(args.basic_stats)
    stats = json.load(stats_fp)

    # Fix the random seed
    np.random.seed(0)

    # Info is a map we use everywhere to track the sampling parameters.
    # r1_len: the length of read1
    # r2_len: the length of read2
    # insert_size_map: a map of insert-size (as a string) to frequency
    # q_score_map a map of quality score (as a string) to frequency

    info = {'r1_len': stats['r1_len'], 'r2_len': stats['r2_len']}

    info['q_score_map'] = {
        '30': stats['bc_q30_bases'],
        '20': stats['bc_q20_bases'] - stats['bc_q30_bases'],
        '0': stats['bc_tot_bases'] - stats['bc_q20_bases']
    }

    stats_is_fp = open(args.insert_sizes)
    info['insert_size_map'] = json.load(stats_is_fp)['60']

    # How many samples will we make from each window?
    samples = int(
        round(2.0 * args.target_coverage *
              (float(args.window_size) / (stats['r1_len'] + stats['r2_len']))))

    martian.log_info("Using %i samples per %i bin" %
                     (samples, args.window_size))
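    # Worked example with assumed numbers: target_coverage = 30, window_size =
    # 20000, and 150 bp + 150 bp reads give
    # samples = round(2.0 * 30 * 20000 / 300) = 4000 read pairs per window.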
    output_path = martian.make_path("chnk.fasta")
    output = open(output_path, "w")

    ref = reference.open_reference(args.reference_path)
    # Loop over every window in every locus
    for (chrom, start, end) in (tk_io.get_locus_info(l) for l in args.loci):
        cur = start
        while cur < end:
            # Sample |samples| reads from chrom:cur-chrom:cur+window_size and put
            # the results in the output file
            perbin(chrom, cur, ref, output, info, args.window_size, samples)
            cur += args.window_size
    outs.tmp = output_path
    outs.samples_per_bin = samples
    output.close()
Example #25
def join(args, outs, chunk_defs, chunk_outs):
    # mapping of cluster ID -> VCFs
    to_merge = collections.defaultdict(list)
    for o, d in zip(chunk_outs, chunk_defs):
        to_merge[d.cluster_id].append(o.variant_subset)

    # merge each VCF subset for a cluster
    merged_vcfs = []
    for cluster_id, vcf_list in to_merge.iteritems():
        merged_vcf = martian.make_path('{}.vcf'.format(cluster_id))
        tk_io.combine_vcfs(merged_vcf, vcf_list)
        merged_vcfs.append(merged_vcf + '.gz')

    # final merge to make one combined VCF
    tmp = martian.make_path('tmp.vcf')
    cmd = ['vcf-merge'] + merged_vcfs
    with open(tmp, 'w') as outf:
        subprocess.check_call(cmd, stdout=outf)
    # Sort and index the files
    tk_tabix.sort_vcf(tmp, outs.variants.replace('.gz', ''))
    tk_tabix.index_vcf(outs.variants.replace('.gz', ''))
    os.remove(tmp)
Example #26
def main(args, outs):
    bam_in = tk_bam.create_bam_infile(args.chunk_input)

    # Get gem groups
    library_info = rna_library.get_bam_library_info(bam_in)
    gem_groups = sorted(list(set(lib['gem_group'] for lib in library_info)))

    # Define buckets
    bucket_names = []
    prefixes = cr_utils.get_seqs(args.nbases)
    for gg in gem_groups:
        for prefix in prefixes:
            bucket_names.append('%s-%d' % (prefix, gg))
    bucket_names.append('')

    # Read all records
    reads = [read for read in bam_in]

    # Bucket the records
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for bucket_name in bucket_names:
        filename = martian.make_path("bc-%s.bam" % bucket_name)
        bam_out, _ = tk_bam.create_bam_outfile(filename,
                                               None,
                                               None,
                                               template=bam_in,
                                               rgs=args.read_groups,
                                               replace_rg=True)

        bams_out[bucket_name] = bam_out
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for r in reads:
        barcode = cr_utils.get_read_barcode(r)
        if barcode is None:
            bucket_name = ''
        else:
            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            prefix = barcode_seq[:args.nbases]
            bucket_name = '%s-%d' % (prefix, gem_group)
        buckets[bucket_name].append(r)

    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=cr_utils.barcode_sort_key)
        bam_out = bams_out[bucket_name]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
Example #27
def split(args):
    vc_mode, variant_caller, precalled_filename, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)
    precalled_file = None
    if vc_mode == "precalled" or vc_mode == "precalled_plus":
        mem_gb = 8
        threads = 1
        precalled_file = martian.make_path("precalled_vcf.vcf")
        tenkit.log_subprocess.check_call(
            ['cp', precalled_filename, precalled_file])
        tk_tabix.index_vcf(precalled_file)
        precalled_file = precalled_file + ".gz"
    if vc_mode != "precalled":
        if variant_caller == 'freebayes':
            mem_gb = 5
            threads = 1
        elif variant_caller == "gatk":
            mem_gb = 8
            threads = 2
            # make sure the gatk jar file exists
            if gatk_path is None:
                martian.throw(
                    "variant_caller 'gatk' selected, must supply path to gatk jar file -- e.g. \"gatk:/path/to/GenomeAnalysisTK.jar\""
                )

            gatk_loc = gatk_path
            if not os.path.exists(gatk_loc):
                martian.throw(
                    "variant_caller 'gatk' selected, gatk jar file does not exist: %s"
                    % gatk_loc)
        else:
            raise NotSupportedException('Variant caller not supported: ' +
                                        variant_caller)

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    bam_chunk_size_gb = 3.0

    if args.restrict_locus is None:
        loci = tk_chunks.get_sized_bam_chunks(args.input,
                                              bam_chunk_size_gb,
                                              contig_whitelist=primary_contigs,
                                              extra_args={
                                                  '__mem_gb': mem_gb,
                                                  '__threads': threads,
                                                  'split_input': precalled_file
                                              })
    else:
        loci = [{'locus': args.restrict_locus}]

    return {'chunks': loci}
Example #28
def join(args, outs, chunk_defs, chunk_outs):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        outs.enrichment_analysis = None
        outs.enrichment_analysis_summary = {}
        return

    peak_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_peak_bc_matrix)
    tf_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_tf_bc_matrix) if args.filtered_tf_bc_matrix is not None else None
    outs.enrichment_analysis_summary = {'h5': {}, 'csv': {}}
    # for each method, we merge h5 files and copy csv directories to one place
    cr_io.mkdir(outs.enrichment_analysis, allow_existing=True)
    for method in args.factorization:
        method_dir = os.path.join(outs.enrichment_analysis, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, '{}_enrichment_h5.h5'.format(method))
        outs.enrichment_analysis_summary['h5'][method] = _h5
        chunk_h5s = []

        _csv = os.path.join(method_dir, '{}_enrichment_csv'.format(method))
        outs.enrichment_analysis_summary['csv'][method] = _csv
        diffexp_prefixes = [(fr.id, fr.name) for fr in peak_matrix_features.feature_defs]
        if args.filtered_tf_bc_matrix is not None:
            diffexp_prefixes += [(fr.id, fr.name) for fr in tf_matrix_features.feature_defs]

        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):

            chunk_outs_def_method_clustering = sorted(
                [[chunk_out, chunk_def]
                 for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                 if chunk_def.clustering_key == key],
                key=lambda x: x[1].cluster)
            chunk_outs_method_clustering = [c[0] for c in chunk_outs_def_method_clustering]

            # load 1-vs-rest tests in sorted chunk order and combine into one output per clustering
            diffexp = cr_diffexp.DIFFERENTIAL_EXPRESSION(
                np.hstack([np.loadtxt(com.tmp_diffexp, delimiter=',')[:, 0:3]
                           for com in chunk_outs_method_clustering]))

            # write out h5
            chunk_h5 = martian.make_path('{}_enrichment_h5.h5'.format(key))
            with analysis_io.open_h5_for_writing(chunk_h5) as f:
                cr_diffexp.save_differential_expression_h5(f, key, diffexp)
            chunk_h5s += [chunk_h5]

            # write out csv
            cr_diffexp.save_differential_expression_csv_from_features(key, diffexp, diffexp_prefixes, _csv)

        analysis_io.combine_h5_files(chunk_h5s, _h5, [analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
                                                      analysis_constants.ANALYSIS_H5_MAP_DE[method]])
Example #29
def main(args, outs):
    outs.updated_sample_def = args.sample_def.copy()

    if args.mol_h5_version == 2:
        v2_mole_info_h5 = args.sample_def[cr_constants.AGG_H5_FIELD]
        v2_file_basename = os.path.basename(v2_mole_info_h5)
        # Build '<basename>_v3_<timestamp><extension>' from the v2 filename
        v3_filename = '{x[0]}_v3_{x[2]}{x[1]}'.format(
            x=list(os.path.splitext(v2_file_basename)) +
            [datetime.datetime.now().isoformat()])
        out_v3_mole_info_h5 = martian.make_path(v3_filename)

        cr_mol_counter.MoleculeCounter.convert_v2_to_v3(
            v2_mole_info_h5, out_v3_mole_info_h5)
        outs.updated_sample_def[
            cr_constants.AGG_H5_FIELD] = out_v3_mole_info_h5
Example #30
def main(args, outs):
    if args.skip:
        return

    dimred_matrix_file = args.dimred_matrix if args.ordered_dimred_matrix is None else args.ordered_dimred_matrix
    with open(dimred_matrix_file) as fp:
        dimred_matrix = cPickle.load(fp)

    cbc_knn = option(args.cbc_knn, analysis_constants.CBC_KNN)

    batch_to_bc_indices = args.batch_to_bc_indices
    batch_start_idx = batch_to_bc_indices[args.batch_id][0]
    batch_end_idx = batch_to_bc_indices[args.batch_id][1]
    cur_matrix = dimred_matrix[batch_start_idx:batch_end_idx, :]

    # Nearest-neighbor pairs from batch_i to batch_j.
    # key = (batch_i, batch_j); value = set((idx_i, idx_j), ...) of global
    # indices into dimred_matrix
    batch_nearest_neighbor = defaultdict(set)

    # Batch-balanced KNN: query this batch against every other batch
    for batch in xrange(len(args.batch_to_bc_indices)):
        if batch == args.batch_id:
            continue

        ref_matrix = dimred_matrix[
            batch_to_bc_indices[batch][0]:batch_to_bc_indices[batch][1], :]
        nn_idx_right = find_knn(cur_matrix, ref_matrix, cbc_knn)

        # convert index (in cur_matrix and ref_matrix) to global index (in dimred_matrix)
        nn_idx_left = np.repeat(
            np.arange(cur_matrix.shape[0]) + batch_start_idx, cbc_knn)
        nn_idx_right += batch_to_bc_indices[batch][0]

        # Record this batch pair's neighbor pairs. (Iterating over an
        # accumulated from_idx/to_idx here would re-add earlier batches'
        # pairs under the wrong key.)
        for i, j in izip(nn_idx_left, nn_idx_right):
            batch_nearest_neighbor[(args.batch_id, batch)].add((i, j))

    outs.batch_nearest_neighbor = martian.make_path(
        'batch_nearest_neighbor.binary')
    with open(outs.batch_nearest_neighbor, 'wb') as fp:
        serialize_batch_nearest_neighbor(fp, batch_nearest_neighbor)

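
Note: find_knn is not shown. One behavior consistent with how its output is paired with np.repeat above, sketched with scikit-learn (an assumption, not the pipeline's implementation):

from sklearn.neighbors import NearestNeighbors

def find_knn(query, ref, k):
    # Flat array of ref-row indices: the k nearest neighbors (Euclidean)
    # of each query row, in query order.
    nn = NearestNeighbors(n_neighbors=k).fit(ref)
    _, idx = nn.kneighbors(query)
    return idx.ravel()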