def transfer_fields(segments, cnarr, ignore=params.IGNORE_GENE_NAMES):
    """Map gene names, weights, depths from `cnarr` bins to `segments`.

    Segment gene name is the comma-separated list of bin gene names. Segment
    weight is the sum of bin weights, and depth is the (weighted) mean of bin
    depths.

    Also: Post-process segmentation output.

    1. Ensure every chromosome has at least one segment.
    2. Ensure first and last segment ends match 1st/last bin ends
       (but keep log2 as-is).

    Parameters
    ----------
    segments
        Segment container exposing `.data` (a DataFrame) plus `chromosome`,
        `start` and `end` columns. Its first start / last end are overwritten
        and its `.data` is replaced with a copy carrying the new
        `gene`, `weight` and `depth` columns.
    cnarr
        Bin-level container with `chromosome`, `start`, `end`, `gene`, `log2`
        and optionally `weight` / `depth` columns.
    ignore
        Gene names to leave out of the per-segment gene list. The names in
        `params.ANTITARGET_ALIASES` are always ignored as well. The object
        passed in is never modified.

    Returns
    -------
    The updated `segments` object. If `cnarr` is empty, `segments` is returned
    untouched after logging a warning.

    NOTE(review): when `segments` is empty, a plain tuple of row values
    (ordered like `segments.data.columns`) is returned instead of a segments
    object. This is kept as-is for compatibility -- confirm callers expect it.
    """

    def make_null_segment(chrom, orig_start, orig_end):
        """Build a placeholder row; closes over 'segments' for column order."""
        vals = {
            'chromosome': chrom,
            'start': orig_start,
            'end': orig_end,
            'gene': '-',
            'depth': 0.0,
            'log2': 0.0,
            'probes': 0.0,
            'weight': 0.0,
        }
        return tuple(vals[c] for c in segments.data.columns)

    # Explicit len(): truthiness of these project containers is not assumed.
    if not len(cnarr):
        # This Should Never Happen (TM)
        logging.warning("No bins for:\n%s", segments.data)
        return segments

    # Adjust segment endpoints to cover the chromosome arm's original bins
    # (Stretch first and last segment endpoints to match first/last bins)
    bins_chrom = cnarr.chromosome.iat[0]
    bins_start = cnarr.start.iat[0]
    bins_end = cnarr.end.iat[-1]
    if not len(segments):
        # All bins in this chromosome arm were dropped: make a dummy segment
        return make_null_segment(bins_chrom, bins_start, bins_end)
    segments.start.iat[0] = bins_start
    segments.end.iat[-1] = bins_end

    # Aggregate segment depths, weights, gene names
    # ENH refactor so that np/CNA.data access is encapsulated in skgenome
    # Build a fresh set rather than `ignore += ...`, which would extend a
    # list default/argument in place and leak across calls.
    ignored_names = frozenset(ignore).union(params.ANTITARGET_ALIASES)
    assert bins_chrom == segments.chromosome.iat[0]

    # Reset the index so row labels coincide with positions in the plain
    # NumPy arrays extracted below.
    cdata = cnarr.data.reset_index()
    if 'depth' not in cdata.columns:
        # No measured depth available: derive one from log2 as 2**log2.
        cdata['depth'] = np.exp2(cnarr['log2'].values)
    bin_genes = cdata['gene'].values
    bin_weights = cdata['weight'].values if 'weight' in cdata.columns else None
    bin_depths = cdata['depth'].values

    # Defaults; a segment keeps them if no bin selection is yielded for it.
    n_segments = len(segments)
    seg_genes = ['-'] * n_segments
    seg_weights = np.zeros(n_segments)
    seg_depths = np.zeros(n_segments)

    # presumably iter_slices yields one bin-index selection per segment, in
    # segment order -- verify against its definition.
    for i, bin_idx in enumerate(
            iter_slices(cdata, segments.data, 'outer', False)):
        idx_depths = bin_depths[bin_idx]
        if bin_weights is not None:
            idx_weights = bin_weights[bin_idx]
            seg_wt = idx_weights.sum()
            if seg_wt > 0:
                seg_dp = np.average(idx_depths, weights=idx_weights)
            else:
                # All-zero weights: a weighted average is undefined.
                seg_dp = 0.0
        else:
            # Unweighted bins: each bin counts as weight 1.
            seg_wt = float(len(idx_depths))
            seg_dp = idx_depths.mean()

        # pd.unique keeps first-appearance order of the gene names.
        subgenes = [g for g in pd.unique(bin_genes[bin_idx])
                    if g not in ignored_names]
        seg_genes[i] = ",".join(subgenes) if subgenes else '-'
        seg_weights[i] = seg_wt
        seg_depths[i] = seg_dp

    segments.data = segments.data.assign(
        gene=seg_genes,
        weight=seg_weights,
        depth=seg_depths)
    return segments