def split(args):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    chunks = []
    matrix_mem_gb = 0.
    if args.filtered_tf_bc_matrix is not None:
        matrix_mem_gb = cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_tf_bc_matrix) * 1.5
    matrix_mem_gb += cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_peak_bc_matrix)
    chunk_mem_gb = int(np.ceil(max(matrix_mem_gb, h5_constants.MIN_MEM_GB)))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError('Invalid factorization provided')

    # create a chunk for each method x clustering combo
    for method in args.factorization:
        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, key)
            for cluster in set(clustering.clusters):
                chunks.append({
                    'method': method,
                    'clustering_key': key,
                    'cluster': cluster,
                    '__mem_gb': chunk_mem_gb,
                    '__vmem_gb': chunk_mem_gb + int(np.ceil(ctg_mgr.get_vmem_est())) + 1,
                    '__threads': 1,
                })

    return {'chunks': chunks, 'join': {'__mem_gb': 3}}
def check_reference_format(reference_path):
    """Check file formats for files present in the reference"""
    try:
        contig_manager = ReferenceManager(reference_path)
    except Exception as e:
        martian.exit("Contig manager could not be initialized, Error:\n%s" % str(e))

    # formatting
    error_msg = contig_manager.verify_contig_defs()
    if error_msg is not None:
        martian.exit(error_msg)

    # filecheck
    contig_manager.genes

    # check if motif file is in right format (naming convention)
    if len(contig_manager.list_species()) == 1:
        motif_format_checker(contig_manager.motifs)

    # checks for valid bed file formats in regions/
    faidx_file = os.path.join(reference_path, 'fasta', 'genome.fa.fai')
    bed_format_checker(contig_manager.tss_track, faidx_file)
    bed_format_checker(contig_manager.transcripts_track, faidx_file)
    bed_format_checker(contig_manager.ctcf_track, faidx_file)
    bed_format_checker(contig_manager.blacklist_track, faidx_file)
    bed_format_checker(contig_manager.dnase_track, faidx_file)
    bed_format_checker(contig_manager.enhancer_track, faidx_file)
    bed_format_checker(contig_manager.promoter_track, faidx_file)
def main(args, outs):
    '''Find cut sites on a per chromosome basis and write out a bedgraph'''
    if args.fragments is None:
        outs.count_dict = None
        outs.cut_sites = None
        return

    ctg_mgr = ReferenceManager(args.reference_path)
    contig_len = ctg_mgr.get_contig_lengths()
    chrom_len = contig_len[args.contig]
    half_window = WINDOW_SIZE // 2

    Cuts = np.zeros(chrom_len, dtype='int32')

    # find windowed cut sites
    for _, start, stop, _, _ in parsed_fragments_from_contig(contig=args.contig, filename=args.fragments, index=args.fragments_index):
        Cuts[max(0, start - half_window): min(start + half_window + 1, chrom_len)] += 1
        Cuts[max(0, stop - half_window): min(stop + half_window + 1, chrom_len)] += 1

    # get count dict
    count_dict = Counter(v for v in Cuts if v > 0)
    with open(outs.count_dict, 'w') as count_dict_out:
        pickle.dump(count_dict, count_dict_out)

    # write bedgraph of *windowed cutsites*
    if len(count_dict):
        write_chrom_bedgraph(args.contig, chrom_len, Cuts, outs.cut_sites)
    else:
        outs.cut_sites = None
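
# Illustrative-only sketch of the windowing done in main() above: WINDOW_SIZE and
# the fragment coordinates here are invented for the example, but the slicing
# mirrors the loop over fragments.
import numpy as np

WINDOW_SIZE = 5               # assumed small value for the example
half_window = WINDOW_SIZE // 2
chrom_len = 30
cuts = np.zeros(chrom_len, dtype='int32')
start, stop = 10, 20          # a single hypothetical fragment
cuts[max(0, start - half_window): min(start + half_window + 1, chrom_len)] += 1
cuts[max(0, stop - half_window): min(stop + half_window + 1, chrom_len)] += 1
# positions 8..12 and 18..22 now each hold a count of 1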
def main(args, outs):
    metrics = {}
    for fname in args.metrics:
        if fname is not None:
            with open(fname, 'r') as f:
                metrics.update(json.load(f))

    # Normalize "NaN" values
    for key in metrics:
        value = metrics[key]
        if str(value) == 'NaN' or (isinstance(value, float) and np.isnan(value)):
            metrics[key] = None

    # add version info
    metrics['cellranger-atac_version'] = martian.get_pipelines_version()

    if len(metrics) > 0:
        martian.log_info('Writing out summary_metrics')
        with open(outs.metrics, 'w') as outfile:
            outfile.write(tenkit.safe_json.safe_jsonify(metrics, pretty=True))

    # compile summary.csv metrics
    # load library info and fake libraries as species
    metric_registry = MetricAnnotations()
    metrics_csv_dict = {}
    if args.library_info is not None:
        with open(args.library_info, 'r') as f:
            library_info = pickle.load(f)
        library_list = [library_info[n]['library_id'] for n in library_info.keys()]
        metrics_csv_dict.update(metric_registry.compile_summary_metrics(metrics, species_list=library_list))

    # load species level metrics
    ctg_mgr = ReferenceManager(args.reference_path)
    metrics_csv_dict.update(metric_registry.compile_summary_metrics(metrics, species_list=ctg_mgr.list_species()))
    write_dict_to_csv(outs.metrics_csv, metrics_csv_dict, sort=True)
def split(args): """Compute base background in split and use it in each chunk.""" ref_mgr = ReferenceManager(args.reference_path) npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0 if len(ref_mgr.list_species() ) > 1 or npeaks == 0 or ref_mgr.motifs is None: chunk_def = [{'skip': True}] return {'chunks': chunk_def} with open(args.globalGCdict, 'r') as f: GCdict = pickle.load(f) GCdict_paths = {} GCbins = sorted(GCdict.keys()) for gc in GCbins: GCdict_paths[gc] = martian.make_path('GCdict_{}_{}'.format( gc[0], gc[1])) with open(GCdict_paths[gc], 'w') as dump: pickle.dump(GCdict[gc], dump) # write rows of each chunk to a new peak file mem_in_gb = 8 chunk_def = [{ '__mem_gb': mem_in_gb, '__vmem_gb': mem_in_gb + int(np.ceil(ref_mgr.get_vmem_est())) + 1, 'skip': False, 'GCdict': GCdict_paths[chunk] } for chunk in GCbins] return {'chunks': chunk_def}
def join(args, outs, chunk_defs, chunk_outs):
    ref_mgr = ReferenceManager(args.reference_path)
    if args.filtered_matrix is None or args.peak_motif_hits is None or len(ref_mgr.list_species()) > 1:
        outs.filtered_tf_bc_matrix = None
        outs.filtered_tf_bc_matrix_mex = None
        outs.tf_propZ_matrix = None
        return

    # motif scan is completed in ANNOTATE_PEAKS
    peaks = BedTool(args.peaks)
    motifs = Motifs(args.reference_path)
    peak_motif_hits = BedTool(args.peak_motif_hits)

    # extract peak coordinate to numerical index map
    peak_idx, n_peaks = _get_peak_indexes(peaks)

    # extract motif names to numerical index map
    motif_idx, n_motifs = _get_motif_indexes(motifs)

    # extract 3 lists: peak indexes, motif indexes and counts; each entry corresponds to a peak-motif pair
    peak_coor, motif_coor, values = motifscan_bed_to_sparse_matrix(peak_motif_hits, peak_idx, motif_idx, format='binary')

    # convert to a sparse matrix; default is binary format, motifs are rows and peaks are columns
    tf_peak_matrix = sp.csr_matrix((values, (motif_coor, peak_coor)), shape=(n_motifs, n_peaks), dtype='int32')

    # compute the motif-BC matrix via pooling
    # the current method simply counts the number of hits for a motif inside the peaks in a barcode
    # cast as a CountMatrix
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_matrix)
    motif_names = motif_idx.keys()
    barcodes = peak_matrix.bcs
    genomes = utils.generate_genome_tag(args.reference_path)
    motifs_def = atac_feature_ref.from_motif_list(motif_names, genomes)
    tf_matrix = cr_matrix.CountMatrix(motifs_def, barcodes, tf_peak_matrix * peak_matrix.m)

    # perform MAD-zscoring of proportion values
    propZ_matrix = np.array(tf_matrix.m / peak_matrix.m.sum(axis=0))
    propZ_matrix = MADzscore(propZ_matrix)

    outs.coerce_strings()

    # save to h5 and csv
    tf_matrix.save_h5_file(outs.filtered_tf_bc_matrix, sw_version=martian.get_pipelines_version())
    if not os.path.exists(outs.filtered_tf_bc_matrix_mex):
        os.mkdir(outs.filtered_tf_bc_matrix_mex)
    atac_matrix.save_mex(tf_matrix,
                         outs.filtered_tf_bc_matrix_mex,
                         feature_type=cr_lib_constants.ATACSEQ_LIBRARY_DERIVED_TYPE,
                         sw_version=martian.get_pipelines_version())

    # save propZ matrix as gz
    np.savetxt(outs.tf_propZ_matrix, propZ_matrix)
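
# MADzscore is defined elsewhere in the codebase; the sketch below shows one
# common way a MAD-based z-score (the robust analogue of a standard z-score) is
# computed, offered as an assumption about its behaviour rather than the actual
# implementation.
import numpy as np

def mad_zscore_sketch(matrix, axis=1):
    """Robustly z-score along `axis` using the median absolute deviation (MAD).

    The 0.6745 factor makes the MAD a consistent estimator of the standard
    deviation for normally distributed data.
    """
    med = np.median(matrix, axis=axis, keepdims=True)
    mad = np.median(np.abs(matrix - med), axis=axis, keepdims=True)
    return 0.6745 * (matrix - med) / np.maximum(mad, 1e-12)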
def generate_genome_tag(ref_path):
    """Replace empty genome name for single genomes with valid genome name"""
    # For a single species reference, use contents of <reference_path>/genome
    ref_contig_manager = ReferenceManager(ref_path)
    genomes = ref_contig_manager.list_species()
    if (len(genomes) == 1 and genomes[0] == '') or len(genomes) == 0:
        genomes = [ref_contig_manager.genome]
    return genomes
def main(args, outs):
    reference = ReferenceManager(args.reference_path)
    species_list = reference.list_species()
    is_barnyard = len(species_list) > 1 and args.singlecell is not None

    summary_data = None
    if args.summary_results:
        with open(args.summary_results, 'r') as infile:
            summary_data = json.load(infile)

    # Pull up the correct template information
    template_path = os.path.dirname(os.path.abspath(__file__))
    template_file = os.path.join(template_path,
                                 '{}{}.html'.format('barnyard' if is_barnyard else 'single',
                                                    '_debug' if args.debug else ''))
    with open(template_file, 'r') as infile:
        template = infile.read()

    metadata = MetricAnnotations()
    websummary_data = {
        'alarms': {'alarms': []},
        'sample': {
            'id': args.sample_id,
            'description': args.sample_desc,
            'pipeline': "Cell Ranger ATAC Reanalyzer"
        }
    }

    singlecell_df = pd.read_csv(args.singlecell) if args.singlecell is not None else None

    add_data(websummary_data, get_hero_metric_data(metadata, summary_data, species_list, args.debug))
    add_data(websummary_data, get_pipeline_info(args, reference, args.debug))
    add_data(websummary_data,
             get_clustering_plots(metadata, summary_data, args.analysis, args.filtered_peak_bc_matrix,
                                  species_list, singlecell_df, is_barnyard))

    # Modify the plot titles to add consistent styling with the sample ID/description
    for key, subdata in websummary_data.iteritems():
        if "layout" in subdata:
            subdata["layout"]["title"] += '<br><sup>Sample {} - {}</sup>'.format(args.sample_id, args.sample_desc)
            subdata["layout"]["hovermode"] = "closest"
            subdata["config"] = PLOT_CONFIG_KWARGS

    with open(outs.web_summary, 'w') as outfile:
        summarize.generate_html_summary(websummary_data, template, template_path, outfile)
def split(args):
    ref_mgr = ReferenceManager(args.reference_path)
    return {
        'chunks': [],
        'join': {
            '__mem_gb': 4,
            '__vmem_gb': int(np.ceil(ref_mgr.get_vmem_est())) + 3
        }
    }
def main(args, outs):
    if args.singlecell_mapping is None or args.singlecell_targets is None or args.singlecell_cells is None:
        outs.singlecell = None
        outs.summary = None
        return

    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()

    # Merge the input singlecell data into a single dataframe and write it out
    mapping = pd.read_csv(args.singlecell_mapping)
    cells = pd.read_csv(args.singlecell_cells)
    targeting = pd.read_csv(args.singlecell_targets)

    merged = mapping.merge(cells, how="left", on="barcode", sort=False, validate="one_to_one")
    merged["cell_id"] = merged["cell_id"].fillna("None")
    for column in merged.columns:
        if column.endswith("_cell_barcode") or column.startswith("passed_filters_") or column.startswith("peak_region_fragments_"):
            merged[column] = merged[column].fillna(0).astype(int)

    merged = merged.merge(targeting, how="left", on="barcode", sort=False, validate="one_to_one")
    keys = ["{}_fragments".format(region)
            for region in ["TSS", "DNase_sensitive_region", "enhancer_region", "promoter_region",
                           "on_target", "blacklist_region", "peak_region"]] + ["peak_region_cutsites"]
    for column in keys:
        merged[column] = merged[column].fillna(0).astype(int)
    merged.to_csv(outs.singlecell, index=None)

    summary_info = {}
    summary_info = add_bulk_targeting_metrics(summary_info, merged, species_list)
    summary_info = add_doublet_rate_metrics(summary_info, merged, species_list)
    summary_info = add_purity_metrics(summary_info, merged, species_list)
    summary_info = add_bulk_mapping_metrics(summary_info, merged, species_list)
    summary_info = add_singlecell_sensitivity_metrics(summary_info, merged, species_list)

    with open(outs.summary, 'w') as summary_file:
        summary_file.write(json.dumps(summary_info, indent=4))
def split(args): if args.fragments is None: return {"chunks": [], "join": {}} ctg_mgr = ReferenceManager(args.reference_path) all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True) chunks = [] for contig in all_contigs: chunks.append({"contig": contig, "__mem_gb": 5}) return {"chunks": chunks, "join": {"__mem_gb": 5}}
def split(args):
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)

    chunks = []
    for contig in all_contigs:
        chunks.append({'contig': contig, '__mem_gb': 5})

    return {'chunks': chunks, 'join': {'__mem_gb': 5}}
def join(args, outs, chunk_defs, chunk_outs):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        outs.enrichment_analysis = None
        outs.enrichment_analysis_summary = {}
        return

    peak_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_peak_bc_matrix)
    tf_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_tf_bc_matrix) if args.filtered_tf_bc_matrix is not None else None
    outs.enrichment_analysis_summary = {'h5': {}, 'csv': {}}

    # for each method, we merge h5 files and copy csv directories to one place
    cr_io.mkdir(outs.enrichment_analysis, allow_existing=True)
    for method in args.factorization:
        method_dir = os.path.join(outs.enrichment_analysis, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, '{}_enrichment_h5.h5'.format(method))
        outs.enrichment_analysis_summary['h5'][method] = _h5
        chunk_h5s = []

        _csv = os.path.join(method_dir, '{}_enrichment_csv'.format(method))
        outs.enrichment_analysis_summary['csv'][method] = _csv

        diffexp_prefixes = [(fr.id, fr.name) for fr in peak_matrix_features.feature_defs]
        if args.filtered_tf_bc_matrix is not None:
            diffexp_prefixes += [(fr.id, fr.name) for fr in tf_matrix_features.feature_defs]

        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            chunk_outs_def_method_clustering = sorted([[chunk_out, chunk_def]
                                                       for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                                                       if chunk_def.clustering_key == key],
                                                      key=lambda x: x[1].cluster)
            chunk_outs_method_clustering = [c[0] for c in chunk_outs_def_method_clustering]

            # load 1 vs rest tests in sorted order of chunks and combine into one output per clustering
            diffexp = cr_diffexp.DIFFERENTIAL_EXPRESSION(np.hstack([np.loadtxt(com.tmp_diffexp, delimiter=',')[:, 0:3]
                                                                    for com in chunk_outs_method_clustering]))

            # write out h5
            chunk_h5 = martian.make_path('{}_enrichment_h5.h5'.format(key))
            with analysis_io.open_h5_for_writing(chunk_h5) as f:
                cr_diffexp.save_differential_expression_h5(f, key, diffexp)
            chunk_h5s += [chunk_h5]

            # write out csv
            cr_diffexp.save_differential_expression_csv_from_features(key, diffexp, diffexp_prefixes, _csv)

        analysis_io.combine_h5_files(chunk_h5s, _h5,
                                     [analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
                                      analysis_constants.ANALYSIS_H5_MAP_DE[method]])
def get_cell_barcodes(filename, ref, with_species=False):
    """Read singlecell.csv and emit barcodes"""
    scdf = pd.read_csv(filename, sep=',')
    ctg_mgr = ReferenceManager(ref)
    if not with_species:
        cell_barcodes = set()
        for species in ctg_mgr.list_species():
            species_cell_mask = scdf['is_{}_cell_barcode'.format(species)] == 1
            cell_barcodes.update(scdf[species_cell_mask]['barcode'].values.tolist())
    else:
        cell_barcodes = {}
        for species in ctg_mgr.list_species():
            species_cell_mask = scdf['is_{}_cell_barcode'.format(species)] == 1
            cell_barcodes[species] = set(scdf[species_cell_mask]['barcode'].values.tolist())
    return cell_barcodes
def split(args):
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)
    contig_len = ctg_mgr.get_contig_lengths()
    BYTES_PER_INT32_WITH_SAFETY = 5

    chunks = []
    for contig in all_contigs:
        chunks.append({'contig': contig,
                       '__mem_gb': int(np.ceil(BYTES_PER_INT32_WITH_SAFETY * contig_len[contig] / 1024 / 1024 / 1024))})

    return {'chunks': chunks, 'join': {'__mem_gb': 5}}
def split(args): if args.fragments is None: return {"chunks": [], "join": {}} ctg_mgr = ReferenceManager(args.reference_path) all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True) with open(args.barcode_counts, "r") as infile: barcode_counts = Counter(json.load(infile)) barcode_array = np.array([bc for bc in barcode_counts]) gem_group_array = np.array( [get_barcode_gem_group(bc) for bc in barcode_counts]) gem_groups = set(gem_group_array) frag_count_array = np.array([barcode_counts[bc] for bc in barcode_array]) valid_barcodes = list() for gem_group in gem_groups: count_mask = (frag_count_array > MINIMUM_COUNTS) & (gem_group_array == gem_group) # find at most top N barcodes topN_indices = barcode_array[count_mask].argsort( )[-min(MAXIMUM_BARCODES, len(count_mask)):] valid_barcodes.extend(list(barcode_array[count_mask][topN_indices])) # mem allocs JOIN_LOAD_FACTOR = 2 BUFFER_GB = 2 BYTES_PER_ENTRY = 4 # this depends on the dtype chunk_mem_gb = BUFFER_GB + np.ceil( BYTES_PER_ENTRY * len(gem_groups) * MAXIMUM_BARCODES**2 / 1024**3).astype('int32') join_mem_gb = BUFFER_GB + np.ceil( JOIN_LOAD_FACTOR * BYTES_PER_ENTRY * len(gem_groups) * MAXIMUM_BARCODES**2 / 1024**3).astype('int32') valid_barcodes_path = martian.make_path("valid_barcodes.txt") with open(valid_barcodes_path, 'w') as f: f.write(",".join(valid_barcodes)) chunks = [] for contig in all_contigs: chunks.append({ "contig": contig, "valid_barcodes": valid_barcodes_path, "__mem_gb": chunk_mem_gb, }) return {"chunks": chunks, "join": {"__mem_gb": join_mem_gb}}
def main(args, outs): """Downsample each fragments file to produce a sorted file, while computing the pre and post complexity metrics""" with open(args.library_info, 'r') as f: library_info = pickle.load(f)[args.n] # read cells cell_barcodes = get_cell_barcodes(library_info['cells'], args.reference_path) # get chrom key from fasta index chrom_order = {} ctg_mgr = ReferenceManager(args.reference_path) with open(ctg_mgr.fasta_index, 'r') as f: for en, line in enumerate(f): chrom = line.split('\t')[0] chrom_order[chrom] = en downsampling_metrics = subsample_fragments( infile=library_info['fragments'], rate=library_info['rate'], outfile=outs.fragments, group=args.n, cells=cell_barcodes, kind=library_info['kind'], key=chrom_order) with open(outs.normalization_metrics, 'w') as f: json.dump(downsampling_metrics, f, indent=4)
def split(args):
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    if args.peaks is None:
        martian.throw("peaks BED file expected")
    if args.cell_barcodes is None:
        martian.throw("cell barcodes CSV file expected")

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)

    chunks = []
    for contig in all_contigs:
        chunks.append({'contig': contig, '__mem_gb': 4})

    return {'chunks': chunks, 'join': {'__mem_gb': 8}}
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    with open(args.barcodes, 'r') as barcode_file:
        barcodes_dict = OrderedDict((bc.strip('\n'), num) for num, bc in enumerate(barcode_file))
    outs.insert_summary = None

    if args.fragments is None or len(barcodes_dict) == 0:
        outs.insert_sizes = None
        outs.total = None
        return

    ref_contig_manager = ReferenceManager(args.reference_path)

    # iterate over fragments and count fragment sizes for each barcode
    insert_sizes = {bc: Counter() for bc in barcodes_dict.iterkeys()}
    primary_contigs = set(ref_contig_manager.primary_contigs(allow_sex_chromosomes=True))
    for contig, start, stop, barcode, _ in open_fragment_file(args.fragments):
        if barcode not in barcodes_dict:
            continue
        if args.exclude_non_nuclear and contig not in primary_contigs:
            continue
        size = stop - start
        insert_sizes[barcode][str(size) if size <= MAX_INSERT_SIZE else GT_MAX_INSERT_SIZE] += 1

    # compute total and write out csv
    total = np.zeros(MAX_INSERT_SIZE)
    with open(outs.insert_sizes, 'w') as outfile:
        outfile.write(','.join(['Barcode'] +
                               [str(n) for n in range(1, MAX_INSERT_SIZE + 1)] +
                               ['>{}'.format(MAX_INSERT_SIZE)]) + '\n')
        for barcode in insert_sizes:
            outfile.write(','.join([barcode] +
                                   [str(insert_sizes[barcode][str(n)]) for n in range(1, MAX_INSERT_SIZE + 1)] +
                                   [str(insert_sizes[barcode][GT_MAX_INSERT_SIZE])]) + '\n')
            for n in range(1, MAX_INSERT_SIZE + 1):
                total[n - 1] += insert_sizes[barcode][str(n)]

    # write out totals for reduce in join
    np.savetxt(outs.total, total, delimiter=',')
def get_barcode_gc(ref_f, peaks_f, matrix):
    """Get mean GC% of peaks in a barcode"""
    ref_mgr = ReferenceManager(ref_f)
    genome_fa = pyfasta.Fasta(ref_mgr.fasta, key_fn=lambda x: x.split()[0])
    peak_GC = np.array([get_peak_GC_counts(peak, genome_fa, counts=False)
                        for peak in peak_reader(peaks_f)])
    barcode_GC = ((peak_GC * matrix.m) / np.array(matrix.m.sum(axis=0))).squeeze()
    return barcode_GC
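
# get_peak_GC_counts(peak, genome_fa, counts=False) is assumed to return the GC
# fraction of a peak's sequence; the helper below is a minimal illustration of
# that behaviour under this assumption, not the pipeline's implementation.
def peak_gc_fraction_sketch(peak, genome_fa):
    """GC fraction of a 0-indexed [start, end) peak interval."""
    seq = str(genome_fa[peak.chrom][peak.start:peak.end]).upper()
    if not seq:
        return 0.0
    return float(sum(1 for base in seq if base in 'GC')) / len(seq)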
def split(args): """Compute base background in split and use it in each chunk """ n_peaks = utils.quick_line_count(args.peaks) if args.peaks else 0 ref_mgr = ReferenceManager(args.reference_path) if len(ref_mgr.list_species()) > 1 or n_peaks == 0 or ref_mgr.tss_track is None: chunk_def = [{'skip': True}] return {'chunks': chunk_def} # write rows of each chunk to a new peak file mem_in_gb = 4.0 chunk_def = [{'__mem_gb': mem_in_gb, 'skip': False, 'chunk_start': chunk[0], 'chunk_end': chunk[1]} for chunk in utils.get_chunks(n_peaks, chunks=20)] return {'chunks': chunk_def}
def count_bases_in_peaks(reference_path, peaks_file):
    """Count the total number of bases in peak regions (0-indexed)"""
    bases_in_peaks = 0
    ctg_mgr = ReferenceManager(reference_path)
    genome_fa = pyfasta.Fasta(ctg_mgr.fasta, key_fn=lambda x: x.split()[0])
    for peak in peak_reader(peaks_file):
        bases_in_peaks += len(genome_fa[peak.chrom][peak.start:peak.end])
    return bases_in_peaks
def main(args, outs): """Run this for each method x clustering key combination from split""" ctg_mgr = ReferenceManager(args.reference_path) species = ctg_mgr.list_species() if args.filtered_peak_bc_matrix is None or len(species) > 1: return # Load the peak-BC matrix and a clustering and perform DE peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix) clustering_h5 = args.clustering_summary['h5'][args.method] clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, args.clustering_key) mask = clustering.clusters == args.cluster clustering.clusters[mask] = 1 clustering.clusters[np.logical_not(mask)] = 2 # find depth using peak matrix and normalize scale = np.array(peak_matrix.m.sum(axis=0)).squeeze() depth = (scale + 1) / np.median(scale) cov_peak = [np.log(depth)] diffexp_peak = nb2_diffexp.run_differential_expression(peak_matrix.m, clustering.clusters, model='poisson', impute_rest=True, test_params={'cov': cov_peak}, verbose=True) # find empirical estimates of alpha tf_matrix = None diffexp_tf = None # do DE on tf-BC matrix if args.filtered_tf_bc_matrix is not None: tf_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_tf_bc_matrix) ntfmatrix = normalize_matrix(tf_matrix.m, scale) alpha_tf = nb2_diffexp.empirical_dispersion(ntfmatrix) barcode_GC = get_barcode_gc(args.reference_path, args.peaks, peak_matrix) cov_tf = [barcode_GC, np.log(depth)] diffexp_tf = nb2_diffexp.run_differential_expression(tf_matrix.m, clustering.clusters, model='nb', impute_rest=True, test_params={'cov': cov_tf, 'alpha': alpha_tf}, verbose=True) # vstack diffexp = diffexp_peak if tf_matrix is None else cr_diffexp.DIFFERENTIAL_EXPRESSION(np.vstack([diffexp_peak.data, diffexp_tf.data])) # write out temp file np.savetxt(outs.tmp_diffexp, diffexp.data, delimiter=',') outs.enrichment_analysis = None outs.enrichment_analysis_summary = None
def split(args):
    ref_mgr = ReferenceManager(args.reference_path)
    if args.filtered_matrix is None or args.peak_motif_hits is None or len(ref_mgr.list_species()) > 1:
        return {'chunks': []}

    matrix_mem_gb = cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_matrix)
    npeaks, nbcs, nnz = cr_matrix.CountMatrix.load_dims_from_h5(args.filtered_matrix)

    # assume we will never test more than 1000 TFs and
    # the relative hit-rate of a TF is a generous 1 out of every 10 peaks
    MAX_TF_COUNT = 1000
    MAX_TF_PEAK_SPARSITY = 0.1
    BYTES_PER_INT = np.dtype(int).itemsize
    BYTES_PER_FLOAT = np.dtype(float).itemsize
    BYTES_PER_GB = 1024**3
    ENTRIES_PER_VAL = 3
    predicted_tf_peak_matrix_mem_gb = ENTRIES_PER_VAL * MAX_TF_PEAK_SPARSITY * npeaks * MAX_TF_COUNT * BYTES_PER_INT / BYTES_PER_GB
    predicted_tf_matrix_mem_gb = ENTRIES_PER_VAL * nbcs * MAX_TF_COUNT * BYTES_PER_INT / BYTES_PER_GB
    predicted_tf_propZ_matrix_mem_gb = ENTRIES_PER_VAL * nbcs * MAX_TF_COUNT * BYTES_PER_FLOAT / BYTES_PER_GB

    chunk_mem_gb = int(np.ceil(max(matrix_mem_gb +
                                   predicted_tf_peak_matrix_mem_gb * 2 +
                                   predicted_tf_matrix_mem_gb * 2 +
                                   predicted_tf_propZ_matrix_mem_gb * 2,
                                   h5_constants.MIN_MEM_GB)))
    vmem_peak_motif_hits = int(np.ceil(predicted_tf_peak_matrix_mem_gb) * 3 + predicted_tf_peak_matrix_mem_gb)

    # HACK - give big jobs more threads in order to avoid overloading a node
    threads = cr_io.get_thread_request_from_mem_gb(chunk_mem_gb)

    return {
        'chunks': [],
        'join': {
            '__mem_gb': chunk_mem_gb,
            '__vmem_gb': chunk_mem_gb + vmem_peak_motif_hits + 1,
            '__threads': threads
        }
    }
def __init__(self, ref_path, bg=None):
    ref_manager = ReferenceManager(ref_path)

    self.all_motifs = []
    if ref_manager.motifs is not None:
        with open(ref_manager.motifs, "r") as infile:
            self.all_motifs = list(motifs.parse(infile, "jaspar"))

    # for large sequence header, only keep the text before the first space
    self.genome_seq = pyfasta.Fasta(ref_manager.fasta, key_fn=lambda x: x.split()[0])
    self.bg = bg
def split(args): """We just align each chunk independently -- joining will happen in the join step of SORT_READS""" # Pull some reads from fastq files -- bail out if it's less than 25bp fastq_tests = [x['read1'] for x in args.chunks] for fastq_test in fastq_tests: with open(fastq_test) as in_file: reader = tk_fasta.read_generator_fastq(in_file) for name, read, qual in itertools.islice(reader, 10): if len(read) < MIN_READ_LENGTH: martian.alarm("BWA-MEM can't handle reads <25bp -- reads will be unmapped.") continue # estimated amount of memory needed to process genome is 2x(num gigabases)+4GB ctg_mgr = ReferenceManager(args.reference_path) base_mem_in_gb = int(math.ceil(2 * ctg_mgr.get_vmem_est())) mem_in_gb = base_mem_in_gb + 4 chunks = [{'chunk': x, '__threads': args.num_threads, '__mem_gb': mem_in_gb} for x in args.chunks] return {'chunks': chunks}
def annotate_peaks(peaks, ref_path):
    """Peak-to-gene annotation strategy:
        1. if a peak overlaps the promoter region (-1kb, +100bp) of any TSS, call it a promoter peak
        2. if a peak is within 200kb of the closest TSS, and it is not a promoter peak, call it a distal peak
        3. if a peak overlaps a transcript, and it is neither a promoter nor a distal peak of that gene,
           call it a distal peak. This step is optional
        4. otherwise, call it an intergenic peak
    """
    ref_mgr = ReferenceManager(ref_path)
    tss = BedTool(ref_mgr.tss_track)

    # if tss.bed contains a 7th column (gene type), apply the filter; otherwise use all TSS sites
    if tss.field_count() == 7:
        tss_filtered = tss.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
    else:
        df_tss = tss.to_dataframe()
        df_tss['gene_type'] = '.'
        tss_filtered = BedTool.from_dataframe(df_tss).saveas()

    # including transcripts.bed is optional
    if ref_mgr.transcripts_track is None:
        transcripts_filtered = BedTool([])
    else:
        transcripts = BedTool(ref_mgr.transcripts_track)
        if transcripts.field_count() == 7:
            transcripts_filtered = transcripts.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
        else:
            df_tx = transcripts.to_dataframe()
            df_tx['gene_type'] = '.'
            transcripts_filtered = BedTool.from_dataframe(df_tx).saveas()

    # run bedtools closest for peaks against filtered tss, group by peaks and summarize annotations from select columns
    peaks_nearby_tss = peaks.closest(tss_filtered, D='b', g=ref_mgr.fasta_index).groupby(g=[1, 2, 3], c=[7, 11], o=['collapse']).saveas()

    results = []
    peaks_nearby_tss_butno_tx = peaks_nearby_tss.intersect(transcripts_filtered, v=True).saveas()

    # avoid error when no peaks overlap with any transcripts
    if len(peaks_nearby_tss_butno_tx) < len(peaks_nearby_tss):
        peaks_nearby_tss_and_tx = peaks_nearby_tss \
            .intersect(transcripts_filtered, wa=True, wb=True) \
            .groupby(g=[1, 2, 3, 4, 5], c=[9], o=['distinct'])

        for peak in peaks_nearby_tss_and_tx:
            results.append(get_peak_nearby_genes(peak))

    for peak in peaks_nearby_tss_butno_tx:
        results.append(get_peak_nearby_genes(peak))

    return results
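
# A minimal, standalone sketch of the classification rules in the docstring above,
# applied to a single peak; the function name and boolean inputs are illustrative
# only and do not mirror the pipeline's get_peak_nearby_genes logic.
def classify_peak_sketch(overlaps_promoter, distance_to_closest_tss, overlaps_transcript):
    """Return 'promoter', 'distal', or 'intergenic' per the strategy above."""
    if overlaps_promoter:
        return 'promoter'
    if abs(distance_to_closest_tss) <= 200000:
        return 'distal'
    if overlaps_transcript:
        return 'distal'
    return 'intergenic'

# e.g. a peak 50kb from the nearest TSS with no promoter overlap is 'distal'
assert classify_peak_sketch(False, 50000, False) == 'distal'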
def split(args): """split into a chunk for each library in aggr csv, and define a unique gem group""" aggr_df = pd.read_csv(args.aggr_csv, sep=',') nchunks = len(aggr_df) ctg_mgr = ReferenceManager(args.reference_path) max_contig_len = max(ctg_mgr.get_contig_lengths().values()) BYTES_PER_INT32_WITH_SAFETY = 5 mem_gb = 2 * int( np.ceil( BYTES_PER_INT32_WITH_SAFETY * max_contig_len / 1024 / 1024 / 1024)) return { 'chunks': [{ 'n': group, '__mem_gb': mem_gb, '__vmem_gb': mem_gb + 6 } for group in range(nchunks)], 'join': { '__mem_gb': 12 } }
def join(args, outs, chunk_defs, chunk_outs): """Compute base background in each peak.""" ref_mgr = ReferenceManager(args.reference_path) npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0 if len(ref_mgr.list_species() ) > 1 or npeaks == 0 or ref_mgr.motifs is None: outs.GCdist = None return # get peak-GC distribution genome_fa = pyfasta.Fasta(ref_mgr.fasta, key_fn=lambda x: x.split()[0]) GCdist = [ utils.get_peak_GC_counts(peak, genome_fa, counts=False) for peak in peak_reader(args.peaks) ] # compute base background from peaks in bins # merge extreme GC bins with adjoining ones if they're too narrow for motif scanner to work correctly GCbounds = [] nbins = NBINS for n, gc in enumerate( np.percentile(GCdist, np.linspace(0, 100, nbins + 1, endpoint=True), interpolation='lower')): if n == 0 or n == nbins: GCbounds += [gc] continue if gc >= LOW_GC and gc < HIGH_GC: GCbounds += [gc] GCbins = sorted(list(set(zip(GCbounds, GCbounds[1:])))) # uniqify peaks = peak_reader(args.peaks) GCdict = get_GCbinned_peaks_and_bg(peaks, genome_fa, GCbins) # dump with open(outs.GCdict, 'w') as f: pickle.dump(GCdict, f)
def split(args): if args.fragments is None: return {"chunks": [], "join": {}} with open(args.barcode_counts, "r") as infile: barcode_counts = Counter(json.load(infile)) valid_barcodes = barcode_counts.keys() part_a_seqs, part_c_seqs, part_b_seqs, gem_group_seqs = query_barcode_subsequences( valid_barcodes) ctg_mgr = ReferenceManager(args.reference_path) all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True) chunks = [] for gem_group in gem_group_seqs: for contig in all_contigs: chunks.append({ "contig": contig, "gem_group": gem_group, "__mem_gb": 4, }) return {"chunks": chunks, "join": {"__mem_gb": 16}}