Example #1
import pickle

import martian
import numpy as np

# ReferenceManager and utils come from the surrounding pipeline package.


def split(args):
    """Compute base background in split and use it in each chunk."""

    ref_mgr = ReferenceManager(args.reference_path)
    npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0
    if len(ref_mgr.list_species()) > 1 or npeaks == 0 or ref_mgr.motifs is None:
        chunk_def = [{'skip': True}]
        return {'chunks': chunk_def}

    with open(args.globalGCdict, 'rb') as f:
        GCdict = pickle.load(f)

    GCdict_paths = {}
    GCbins = sorted(GCdict.keys())
    for gc in GCbins:
        GCdict_paths[gc] = martian.make_path('GCdict_{}_{}'.format(
            gc[0], gc[1]))
        with open(GCdict_paths[gc], 'wb') as dump:
            pickle.dump(GCdict[gc], dump)

    # define one chunk per GC bin, sizing memory from the reference
    mem_in_gb = 8
    chunk_def = [{
        '__mem_gb': mem_in_gb,
        '__vmem_gb': mem_in_gb + int(np.ceil(ref_mgr.get_vmem_est())) + 1,
        'skip': False,
        'GCdict': GCdict_paths[chunk],
    } for chunk in GCbins]
    return {'chunks': chunk_def}
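For context, each dict in the returned 'chunks' list becomes the args of one main() invocation in the Martian stage. A minimal sketch, assuming the standard convention that chunk_def keys surface as attributes on args, of how a chunk might consume the per-bin pickle written above (outs handling omitted):

import pickle

def main(args, outs):
    # Chunk flagged by split() when there is nothing to compute.
    if args.skip:
        return
    # Load the GC-bin dictionary that split() serialized for this chunk.
    with open(args.GCdict, 'rb') as f:
        gc_dict = pickle.load(f)
    # ... compute the background model for this GC bin from gc_dict ...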
Example #2
import numpy as np

# ReferenceManager, h5_constants, cr_matrix, SingleGenomeAnalysis, and
# ALLOWED_FACTORIZATIONS come from the surrounding pipeline package.


def split(args):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    chunks = []
    matrix_mem_gb = 0.
    if args.filtered_tf_bc_matrix is not None:
        matrix_mem_gb = cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_tf_bc_matrix) * 1.5
    matrix_mem_gb += cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_peak_bc_matrix)
    chunk_mem_gb = int(np.ceil(max(matrix_mem_gb, h5_constants.MIN_MEM_GB)))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError('Invalid factorization provided')

    # create a chunk for each method x clustering combo
    for method in args.factorization:
        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, key)
            for cluster in set(clustering.clusters):
                chunks.append({
                    'method': method,
                    'clustering_key': key,
                    'cluster': cluster,
                    '__mem_gb': chunk_mem_gb,
                    '__vmem_gb': chunk_mem_gb + int(np.ceil(ctg_mgr.get_vmem_est())) + 1,
                    '__threads': 1,
                })

    return {'chunks': chunks, 'join': {'__mem_gb': 3}}
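The sizing logic above follows a reusable pattern: estimate memory from each HDF5 matrix, add 1.5x headroom for the optional TF matrix, and floor the result at a minimum. A minimal standalone sketch under those assumptions (size_chunk_mem_gb and floor_gb are hypothetical; floor_gb stands in for h5_constants.MIN_MEM_GB, and the inputs are plain gigabyte floats rather than h5 paths):

import numpy as np

def size_chunk_mem_gb(peak_mem_gb, tf_mem_gb=None, floor_gb=6.0):
    """Combine matrix memory estimates with 1.5x headroom on the optional TF matrix."""
    total = peak_mem_gb + (1.5 * tf_mem_gb if tf_mem_gb is not None else 0.0)
    return int(np.ceil(max(total, floor_gb)))

# e.g. size_chunk_mem_gb(2.4, tf_mem_gb=1.2) == int(np.ceil(max(4.2, 6.0))) == 6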
Example #3
import numpy as np

# ReferenceManager comes from the surrounding pipeline package.


def split(args):
    ref_mgr = ReferenceManager(args.reference_path)
    return {
        'chunks': [],
        'join': {
            '__mem_gb': 4,
            '__vmem_gb': int(np.ceil(ref_mgr.get_vmem_est())) + 3
        }
    }
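Returning an empty 'chunks' list is the Martian idiom for doing all of the work in the join phase; the resource keys under 'join' budget that single invocation. A minimal sketch of the matching join, assuming the standard Martian stage signature:

def join(args, outs, chunk_defs, chunk_outs):
    # With 'chunks': [], chunk_defs and chunk_outs arrive empty and all
    # work runs here, within the __mem_gb/__vmem_gb requested by split().
    pass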
Example #4
import itertools
import math

import martian
import tenkit.fasta as tk_fasta

# ReferenceManager comes from the surrounding pipeline package; MIN_READ_LENGTH
# mirrors the 25bp bound named in the alarm below.
MIN_READ_LENGTH = 25


def split(args):
    """We just align each chunk independently -- joining will happen in the join step of SORT_READS."""

    # Peek at the first reads of each fastq file -- alarm if any are shorter than 25bp
    fastq_tests = [x['read1'] for x in args.chunks]

    for fastq_test in fastq_tests:
        with open(fastq_test) as in_file:
            reader = tk_fasta.read_generator_fastq(in_file)
            for name, read, qual in itertools.islice(reader, 10):
                if len(read) < MIN_READ_LENGTH:
                    martian.alarm("BWA-MEM can't handle reads <25bp -- reads will be unmapped.")
                    break  # one alarm per file is enough; the reads still get aligned

    # estimated amount of memory needed to process genome is 2x(num gigabases)+4GB
    ctg_mgr = ReferenceManager(args.reference_path)
    base_mem_in_gb = int(math.ceil(2 * ctg_mgr.get_vmem_est()))

    mem_in_gb = base_mem_in_gb + 4
    chunks = [{'chunk': x, '__threads': args.num_threads, '__mem_gb': mem_in_gb} for x in args.chunks]
    return {'chunks': chunks}
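To make the sizing comment concrete, assuming get_vmem_est() returns the genome size in gigabases (as the "2x(num gigabases)+4GB" comment implies), a worked example for a roughly human-sized reference:

import math

genome_gigabases = 3.1                                 # hypothetical GRCh38-sized reference
base_mem_in_gb = int(math.ceil(2 * genome_gigabases))  # 2 x gigabases -> 7
mem_in_gb = base_mem_in_gb + 4                         # + 4GB overhead -> 11 GB per chunk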