Exemplo n.º 1
0
def split(args):
    """Chunk the input BAM (~0.75 GB per chunk) and annotate each chunk
    with an estimated mean coverage and the flowcell lane map.

    Returns a Martian split dict: {'chunks': [...]}.
    """
    bam_in = tk_bam.create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(bam_in,
                                          chunk_bound_func,
                                          chunk_size_gb=0.75)

    # args.targets_file is passed through directly; None means no target set.
    targets_path = args.targets_file

    estimated_coverage = tenkit.coverage.estimate_mean_coverage(
        targets_path, bam_in, lambda x: stringent_read_filter(x, False))

    # Reopen the BAM so tile-extent estimation scans from the beginning.
    lane_coord_sys = tk_lane.LaneCoordinateSystem()
    lane_coord_sys.estimate_tile_extents(tk_bam.create_bam_infile(args.input))
    lane_map = lane_coord_sys.to_dict()

    for chunk in chunk_defs:
        chunk['estimated_coverage'] = estimated_coverage
        chunk['lane_map'] = lane_map

    return {'chunks': chunk_defs}
Exemplo n.º 2
0
def split(args):
    """Split the barcode-sorted BAM into chunks for parallel processing.

    Degenerates to a single empty chunk when inputs are missing or the
    barcode whitelist is too small to be worth processing.
    """
    empty_split = [{'chunk_start': "0", 'chunk_end': "0"}]

    if args.bcsorted_bam is None or args.barcode_whitelist is None:
        return {'chunks': empty_split}

    # Some R&D bc sets have very small diversity -- don't run on them
    barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 100:
        return {'chunks': empty_split}

    # Larger whitelists get more parallelism.
    min_chunks = 8 if len(barcode_whitelist) > 1e6 else 4

    bam_in = tk_bam.create_bam_infile(args.bcsorted_bam)
    chunks = tk_bam.chunk_bam_records(bam_in,
                                      chunk_split_func,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)
    for chunk in chunks:
        chunk['__mem_gb'] = 12

    return {'chunks': chunks, 'join': {'__mem_gb': 32}}
Exemplo n.º 3
0
def split(args):
    """Split the input BAM into ~0.5 GB chunks, each requesting 8 GB RAM."""
    bam_handle = create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(bam_handle,
                                          chunk_bound_key=None,
                                          chunk_size_gb=0.5)
    for chunk_def in chunk_defs:
        chunk_def["__mem_gb"] = 8.0
    return {'chunks': chunk_defs}
Exemplo n.º 4
0
def split(args):
    """Split the BAM into at most 120 chunks, numbering each one.

    Each chunk asks for 6 GB / 4 threads; the join step needs 1 GB.
    """
    bam_handle = tk_bam.create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(bam_handle, chunk_bound_key=None,
                                          max_chunks=120)
    for index, chunk in enumerate(chunk_defs):
        chunk['chunk_index'] = index
        chunk['__mem_gb'] = 6
        chunk['__threads'] = 4
    return {'chunks': chunk_defs, 'join': {'__mem_gb': 1}}
Exemplo n.º 5
0
def split(args):
    """Chunk the barcode-sorted BAM, honoring a caller-supplied memory hint.

    The per-chunk memory is raised to args.mem_gb only when it exceeds the
    pipeline minimum.
    """
    with tk_bam.create_bam_infile(args.input) as bam:
        chunks = tk_bam.chunk_bam_records(
            bam,
            chunk_bound_key=cr_utils.barcode_sort_key,
            chunk_size_gb=cr_constants.BAM_CHUNK_SIZE_GB,
            max_chunks=cr_constants.MAX_BAM_CHUNKS)

    mem_gb = args.mem_gb
    if mem_gb is not None and mem_gb > cr_constants.MIN_MEM_GB:
        for chunk in chunks:
            chunk['__mem_gb'] = mem_gb

    return {'chunks': chunks}
Exemplo n.º 6
0
def split(args):
    """Chunk the barcode-sorted BAM with unbarcoded reads squashed together.

    Squashing makes the sort key return None for all unbarcoded reads, so
    chunk_bam_records never has to linearly scan over those reads to find a
    chunk boundary (which could take a long time for large datasets).
    """
    def bc_sort_key(read):
        # None for every unbarcoded read (see docstring).
        return cr_utils.barcode_sort_key(read, squash_unbarcoded=True)

    with tk_bam.create_bam_infile(args.input) as bam:
        chunks = tk_bam.chunk_bam_records(bam, chunk_bound_key=bc_sort_key,
                                          chunk_size_gb=cr_constants.BAM_CHUNK_SIZE_GB,
                                          max_chunks=cr_constants.MAX_BAM_CHUNKS)

    mem_gb = args.mem_gb
    if mem_gb is not None and mem_gb > cr_constants.MIN_MEM_GB:
        for chunk in chunks:
            chunk['__mem_gb'] = mem_gb

    return {'chunks': chunks}
Exemplo n.º 7
0
def split(args):
    """Build the cross product of BAM chunks and barcode clusters.

    Reads a barcode-to-cluster CSV (columns at least 'Cluster' and
    'Barcode'), chunks the position-sorted BAM, and emits one chunk def per
    (cluster, BAM chunk) pair carrying that cluster's barcode list.
    """
    df = pd.read_csv(args.barcode_clusters)
    # construct BAM chunks
    with tk_bam.create_bam_infile(args.possorted_bam) as in_bam:
        chunks = tk_bam.chunk_bam_records(in_bam, chunk_bound_key=cr_utils.pos_sort_key,
                                          chunk_size_gb=cr_constants.BAM_CHUNK_SIZE_GB,
                                          max_chunks=cr_constants.MAX_BAM_CHUNKS)
    # nest BAM chunks with clusters
    bc_chunks = []
    for cluster_id, d in df.groupby('Cluster'):
        # Hoisted out of the inner loop: the barcode list is invariant per
        # cluster, and materializing it once per BAM chunk was O(chunks)
        # redundant work per cluster.
        cluster_bcs = d.Barcode.tolist()
        for c in chunks:
            bc_chunks.append({'chunk_start': c['chunk_start'], 'chunk_end': c['chunk_end'],
                              'cluster_bcs': cluster_bcs, 'cluster_id': cluster_id,
                              '__mem_gb': 8})
    return {'chunks': bc_chunks}
Exemplo n.º 8
0
def split(args):
    """Split the BAM by read name so mate pairs always land in one chunk.

    Large barcode whitelists (>1e6) force at least 4 chunks.
    """
    bam = pysam.Samfile(args.input, check_sq=False)

    min_chunks = 1
    if args.barcode_whitelist is not None:
        whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
        if len(whitelist) > 1e6:
            min_chunks = 4

    # Split to ensure read pairs always go together
    chunks = tk_bam.chunk_bam_records(bam,
                                      lambda x: x.qname,
                                      min_chunks=min_chunks)
    n_chunks = len(chunks)
    for chunk in chunks:
        chunk['n_chunks'] = n_chunks
        chunk['__mem_gb'] = 3
    return {'chunks': chunks, 'join': {'__mem_gb': 8}}
Exemplo n.º 9
0
def split(args):
    """Split the BAM for cell calling, sizing join memory by chunk count.

    Exits the pipeline early (martian.exit) if force_cells is combined with
    a multi-species (barnyard) reference.
    """
    if args.input is None or args.barcode_whitelist is None:
        null_chunks = [{'chunk_start': "0", 'chunk_end': "0", '__mem_gb': 1}]
        return {'chunks': null_chunks, 'join': {'__mem_gb': 1}}

    ref = contig_manager.contig_manager(args.reference_path)
    species_list = ref.list_species()
    force_cells_set = args.force_cells is not None and args.force_cells > 0
    if force_cells_set and len(species_list) > 1:
        martian.exit(
            "force_cells can only be used for single species reference.")

    bam_in = tk_bam.create_bam_infile(args.input)
    chunks = tk_bam.chunk_bam_records(bam_in,
                                      chunk_split_func,
                                      chunk_size_gb=8.0,
                                      min_chunks=10)

    # 0.03 =~ 26meg = 1M bcs * (sizeof(int64) + 18)
    join_mem_gb = int(np.ceil(0.03 * (len(chunks) + 1) + 1))
    return {'chunks': chunks, 'join': {'__mem_gb': join_mem_gb}}
Exemplo n.º 10
0
def split(args):
    """Split the BAM grouped by barcode, scaling parallelism to whitelist size.

    Returns a single empty chunk when inputs are missing or the whitelist
    diversity is too low to be meaningful.
    """
    degenerate = [{'chunk_start': "0", 'chunk_end': "0"}]

    if args.input is None or args.barcode_whitelist is None:
        return {'chunks': degenerate}

    # Some R&D bc sets have very small diversity -- don't run on them
    barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 100:
        return {'chunks': degenerate}

    # Big whitelists (>1M barcodes) warrant much wider fan-out.
    min_chunks = 100 if len(barcode_whitelist) > 1e6 else 20

    bam_in = tk_bam.create_bam_infile(args.input)
    chunks = tk_bam.chunk_bam_records(bam_in, groupbybarcode,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)
    for chunk in chunks:
        chunk['__mem_gb'] = 3

    return {'chunks': chunks, 'join': {'__mem_gb': 6}}
Exemplo n.º 11
0
def split(args):
    """Chunk the BAM (~0.75 GB/chunk) and attach lane map plus the diffusion
    duplicate-marking threshold read from the summary JSON."""
    bam_in = tk_bam.create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(bam_in, chunk_bound_func,
                                          chunk_size_gb=0.75)

    # Reopen the BAM so tile-extent estimation scans from the start.
    lane_coord_sys = tk_lane.LaneCoordinateSystem()
    lane_coord_sys.estimate_tile_extents(tk_bam.create_bam_infile(args.input))
    lane_map = lane_coord_sys.to_dict()

    with open(args.diffusion_dup_summary) as f:
        threshold = json.load(f)['diffusion']['threshold']

    for index, chunk in enumerate(chunk_defs):
        chunk['chunk_index'] = index
        chunk['__mem_gb'] = 3
        chunk['lane_map'] = lane_map
        chunk['diffusion_threshold'] = threshold

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 1, '__threads': 6}}
Exemplo n.º 12
0
def split(args):
    """Chunk the BAM (~0.75 GB/chunk), sizing virtual memory by whitelist
    footprint, and attach the lane map and a chunk number to each chunk."""
    bam_in = create_bam_infile(args.input)
    bam_chunk_size_disk = 0.75
    chunk_defs = tk_bam.chunk_bam_records(bam_in,
                                          chunk_bound_func,
                                          chunk_size_gb=bam_chunk_size_disk)

    # vmem: base 5 GB + twice the whitelist footprint + 10x the on-disk
    # chunk size (same for every chunk, so compute it once).
    vmem_gb = 5 + int(
        np.ceil(2 * whitelist_mem_gb(args.barcode_whitelist) +
                bam_chunk_size_disk * 10))

    # Reopen the BAM so tile-extent estimation scans from the start.
    lane_coord_sys = tk_lane.LaneCoordinateSystem()
    lane_coord_sys.estimate_tile_extents(create_bam_infile(args.input))
    lane_map = lane_coord_sys.to_dict()

    for cnum, chunk in enumerate(chunk_defs):
        chunk['__mem_gb'] = 4
        chunk['__vmem_gb'] = vmem_gb
        chunk['lane_map'] = lane_map
        chunk['chunk_num'] = cnum

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 8, '__threads': 4}}
Exemplo n.º 13
0
def split(args):

    # Chunk bam to get 1GB per chunk
    bam_in = tk_bam.create_bam_infile(args.input)
    lane_coord_sys = tk_lane.LaneCoordinateSystem()

    bam_in.reset()
    lane_coord_sys.estimate_tile_extents(bam_in)
    flowcell_geometry = estimate_flowcell_geometry(bam_in, lane_coord_sys)

    print "Flowcell Geometry: ", flowcell_geometry

    if flowcell_geometry is None:
        return {
            'chunks': [{
                'seed': None,
                'lane_map': None,
                'flowcell_geometry': None,
                'chunk_start': None,
                'chunk_end': None
            }]
        }

    chunk_defs = tk_bam.chunk_bam_records(bam_in,
                                          chunk_bound_func,
                                          chunk_size_gb=0.75)

    for i, chunk in enumerate(chunk_defs):
        chunk['seed'] = i
        chunk['__mem_gb'] = 3

    for chunk in chunk_defs:
        chunk['lane_map'] = lane_coord_sys.to_dict()
        chunk['flowcell_geometry'] = flowcell_geometry

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 1}}
Exemplo n.º 14
0
def split(args):
    """Split the input BAM into ~0.5 GB chunks with default resources."""
    bam_handle = tk_bam.create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(bam_handle,
                                          chunk_bound_key=None,
                                          chunk_size_gb=0.5)
    return {'chunks': chunk_defs}
def split(args):
    """Chunk the position-sorted BAM using the standard pipeline limits."""
    with tk_bam.create_bam_infile(args.input) as bam:
        chunks = tk_bam.chunk_bam_records(
            bam,
            chunk_bound_key=cr_utils.pos_sort_key,
            chunk_size_gb=cr_constants.BAM_CHUNK_SIZE_GB,
            max_chunks=cr_constants.MAX_BAM_CHUNKS)
    return {'chunks': chunks}
Exemplo n.º 16
0
def split(args):
    """Split the BAM with default chunk sizing; each chunk requests 4 GB."""
    bam_handle = tk_bam.create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(bam_handle, chunk_bound_key=None)
    for chunk_def in chunk_defs:
        chunk_def["__mem_gb"] = 4
    return {'chunks': chunk_defs}