def add_snv_mu(bins, fasta, snv_mus, maxfloat):
    """
    Extract SNV mutation rate (as list) for all bins from a reference fasta
    """

    # Extend all bins by 1bp at start and end (need trinucleotide context for mu)
    bins.saveas()
    def _increment_bin(feat, dist=1, start=True, end=True):
        if start:
            feat.start = max([0, feat.start - dist])
        if end:
            feat.end = feat.end + dist
        return feat
    buffbins = pbt.BedTool(bins).each(_increment_bin).saveas()

    snv_mu_dict = load_snv_mus(snv_mus)

    if 'compressed' in determine_filetype(fasta):
        fasta = GzipFile(fasta)

    values = []
    for seq in get_seqs_from_bt(buffbins, fasta):
        mu = snv_mu_from_seq(seq.rstrip(), snv_mu_dict)
        values.append(mu)

    return values
def _parse_remote_inputs_tsv(urls_tsv, local_suffix='local_slice'):
    """
    Parse a tsv of remote files to localize
    """

    urls_dict = {}

    with open(urls_tsv) as f_in:
        for k, line in enumerate(f_in):
            url, mdata = line.rstrip().split('\t', 1)
            basename_orig = os.path.basename(url)
            ftype, fext = determine_filetype(basename_orig, return_extension=True)
            sliced_ext = '{}.{}'.format(local_suffix, fext)
            basename_sliced = basename_orig[:-len(fext)] + sliced_ext
            index_path = _find_remote_index(url, ftype)

            # Confirm input format is tabix- or samtools-compliant
            if ftype not in 'compressed-bed compressed-vcf bam cram'.split():
                err = 'INPUT ERROR: format not recognized as tabix or samtools ' + \
                      'compliant for input file {}'
                exit(err.format(url))

            urls_dict[k] = {'url': url,
                            'ftype': ftype,
                            'metadata': mdata,
                            'basename_orig': basename_orig,
                            'basename_sliced': basename_sliced,
                            'index_path': index_path}

    return urls_dict
def add_bedtool_track(bins, track, action):
    """
    Extract feature values (as list) for all bins vs. a single BedTool (or BAM/CRAM)
    """

    if isinstance(track, str):
        ftype = determine_filetype(track)
    else:
        ftype = None

    if action == 'count':
        if ftype in 'bam cram'.split():
            values = [int(f[-4]) for f in bins.coverage(track, sorted=True)]
        else:
            values = [int(f[-1]) for f in bins.intersect(track, c=True, wa=True)]

    elif action == 'count-unique':
        gfile = bedtool_to_genome_file(bins)
        bedtool = pbt.BedTool(track).sort(g=gfile).merge()
        values = [int(f[-1]) for f in bins.intersect(bedtool, c=True, wa=True)]

    elif action == 'coverage':
        values = [float(f[-1]) for f in bins.coverage(track)]

    elif action == 'any-overlap':
        values = [min([1, int(f[-1])]) for f \
                  in bins.intersect(track, c=True, wa=True)]

    else:
        from sys import exit
        exit('INPUT ERROR: --action {0} not recognized.'.format(action))

    return values
def add_bedtool_track(bins, track, action, header_compliance='loose'):
    """
    Extract feature values (as list) for all bins vs. a single BedTool (or BAM/CRAM)
    """

    if isinstance(track, str):
        ftype = determine_filetype(track)
    else:
        ftype = None

    # Check for header inconsistencies for indexed tracks
    if ftype in 'bam cram compressed-vcf'.split():
        query_bins = copy.deepcopy(bins)
        query_bins = check_header_compliance(track, query_bins, header_compliance)
    else:
        query_bins = bins

    if action == 'count':
        if ftype in 'bam cram'.split():
            values = [int(f[-4]) for f in query_bins.coverage(track, sorted=True)]
        else:
            values = [int(f[-1]) for f in query_bins.intersect(track, c=True, wa=True)]

    elif action == 'count-unique':
        gfile = bedtool_to_genome_file(query_bins)
        chroms = set([f.split('\t')[0] for f in open(gfile).readlines()])
        bedtool = pbt.BedTool(track).filter(lambda f: f.chrom in chroms).\
                      sort(g=gfile).merge()
        values = [int(f[-1]) for f in query_bins.intersect(bedtool, c=True, wa=True)]

    elif action == 'coverage':
        values = [float(f[-1]) for f in query_bins.coverage(track)]

    elif action == 'any-overlap':
        values = [min([1, int(f[-1])]) for f \
                  in query_bins.intersect(track, c=True, wa=True)]

    else:
        from sys import exit
        exit('INPUT ERROR: --action {0} not recognized.'.format(action))

    if ftype in 'bam cram compressed-vcf'.split():
        header_compliance_cleanup(track)

    return values
def add_nuc_content(bins, fasta, maxfloat):
    """
    Extract GC content (as list) for all bins from a reference fasta
    """

    if 'compressed' in determine_filetype(fasta):
        fasta = GzipFile(fasta)

    # Field index 4 of `bedtools nuc` output on a three-column BED is the GC fraction
    pct_gc = [float(f[4]) for f in bins.cut(range(3)).nucleotide_content(fi=fasta)]

    return pct_gc
def load_intervals(bed_in, min_size, cols_to_keep):
    """
    Load and expand intervals
    """

    if 'compressed' in determine_filetype(bed_in):
        fin = gzip.open(bed_in, 'rt')
    else:
        fin = open(bed_in)
    header = fin.readline().rstrip().split('\t')[:cols_to_keep]
    fin.close()

    bt = pbt.BedTool(bed_in).each(expand_interval, min_size=min_size).\
             cut(range(cols_to_keep)).\
             saveas('resized_intervals.bed', trackline='\t'.join(header))

    return bt, header
def get_seqs_from_bt(bt, fasta, return_headers=False):
    """
    Extract a list of nucleotide sequences corresponding to all records in a pbt.BedTool
    """

    if 'compressed' in determine_filetype(fasta):
        fasta = GzipFile(fasta)

    fseqs = bt.sequence(fasta).seqfn

    seqs = []
    headers = []
    with open(fseqs) as fin:
        # FASTA output alternates header and sequence lines; read them in pairs
        for seqheader, seq in itertools.zip_longest(*[fin] * 2):
            headers.append(seqheader.rstrip().replace('>', ''))
            seqs.append(seq.rstrip().upper())

    if return_headers:
        return seqs, headers
    else:
        return seqs
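# Illustrative usage sketch (not part of athena): a minimal call to
# get_seqs_from_bt() on an in-memory BedTool. The interval coordinates and the
# default 'reference.fa' path below are hypothetical placeholders; they assume
# an uncompressed FASTA whose contig names match the BED records.
def _demo_get_seqs_from_bt(fasta='reference.fa'):
    import pybedtools as pbt
    demo_bt = pbt.BedTool('chr1\t10000\t10100\nchr1\t20000\t20100\n',
                          from_string=True)
    # Headers come back as "chrom:start-end" strings; sequences are uppercased
    seqs, headers = get_seqs_from_bt(demo_bt, fasta, return_headers=True)
    return dict(zip(headers, seqs))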
def add_pairwise_local_track(pairs_bedpe_bt, track, action, query_regions,
                             binsize, quiet):
    """
    Wrapper function to extract values for a single local track
    """

    ftype = determine_filetype(track)

    if quiet is False:
        status_msg = '[{0}] athena annotate-pairs: Adding track "{1}" ' + \
                     'with action "{2}"'
        print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                track, action))

    # Convert BAM/CRAM records to BEDPE, if necessary
    if ftype in 'bam cram'.split():
        track = _bam_to_bedpe(track, query_regions)

    if action in 'count-pairs pairwise-coverage any-pairwise-overlap'.split():
        values = add_pairwise_bedtool_track(pairs_bedpe_bt, track, action, binsize)

    return values
def mu_predict(pairs, model_pkl, outfile, raw_mu, keep_features, maxfloat, bgzip):
    """
    Apply a trained mutation rate model to new bin-pairs
    """

    # Load pairs and split coordinates from features
    coords = pd.read_csv(pairs, sep='\t', usecols=range(3))
    feats, labels = load_bed(pairs)
    if keep_features:
        feats_df = dfutils.load_feature_df(pairs)

    # Load model from .pkl and switch to evaluation mode
    model = torch.load(model_pkl)
    model.eval()

    # Predict mutation rates for all bins
    with torch.no_grad():
        preds = model(feats).numpy()
    if not raw_mu:
        preds = log10(preds)
    preds_df = pd.DataFrame(preds, columns=['mu'])

    # Format output dataframe
    out_df = pd.concat([coords, preds_df], axis=1)
    if keep_features:
        out_df = pd.concat([out_df, feats_df], axis=1)
    out_df = dfutils.float_cleanup(out_df, maxfloat=maxfloat, start_idx=3)

    # Save pairs with predicted mutation rates
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
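# Minimal sketch (not part of athena) of the prediction pattern used in
# mu_predict(): switch the model to evaluation mode, disable gradient tracking,
# and convert predictions to log10-scaled mutation rates. The single-layer
# nn.Sequential model here is a stand-in for whatever architecture torch.load()
# would return from the trained .pkl, and the random features are toy data.
def _demo_mu_predict_pattern(n_features=10, n_bins=4):
    import numpy as np
    import torch
    from torch import nn

    model = nn.Sequential(nn.Linear(n_features, 1), nn.Sigmoid())  # stand-in model
    feats = torch.rand(n_bins, n_features)

    model.eval()
    with torch.no_grad():
        preds = model(feats).numpy()

    # Clip purely so this toy example never takes log10(0)
    return np.log10(np.clip(preds, 1e-12, None))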
def add_local_track(bins, track, action, quiet):
    """
    Wrapper function to add a single local track
    """

    ftype = determine_filetype(track)

    if quiet is False:
        status_msg = '[{0}] athena annotate-bins: Adding track "{1}" ' + \
                     'with action "{2}"'
        print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                track, action))

    if action in 'count count-unique coverage any-overlap'.split():
        values = add_bedtool_track(bins, track, action)
    elif 'map-' in action:
        if ftype == 'bigwig':
            values = add_bigwig_track(bins, track, action)
        else:
            values = add_bedgraph_track(bins, track, action)

    return values
def decompose_bins(bins, bins_outfile=None, parameters_outfile=None,
                   precomp_model=None, components=10, minvar=None,
                   trans_dict=None, whiten=False, fill_missing=0,
                   first_column=3, maxfloat=5, max_pcs=100, pca_stats=None,
                   eigen_prefix='eigenfeature', bgzip=False):
    """
    Master function for Eigendecomposition of bin annotations
    """

    # Set certain defaults prior to loading precomputed model
    whitener = None

    # Load precomputed model, if optioned
    if precomp_model is not None:
        df_fills, trans_dict, scaler, pca, components, whitener = \
            _load_precomp_model(precomp_model)
        fill_missing = df_fills

    # Expand feature transformation dictionary
    log_transform = trans_dict.get('log', [])
    sqrt_transform = trans_dict.get('sqrt', [])
    exp_transform = trans_dict.get('exp', [])
    square_transform = trans_dict.get('square', [])
    boxcox_transform = trans_dict.get('boxcox', [])

    # Read bins, then sanitize and transform annotations
    df_bins = pd.read_csv(bins, sep='\t', usecols=range(first_column))
    df_annos, df_fills = \
        dfutils.load_feature_df(bins, first_column, log_transform, sqrt_transform,
                                exp_transform, square_transform, boxcox_transform,
                                fill_missing, return_fills=True)
    feature_names = df_annos.columns.tolist()

    # Scale all columns
    if precomp_model is None:
        scaler = StandardScaler().fit(df_annos)
    df_annos = scaler.transform(df_annos)

    # Learn covariance matrix & determine number of components to keep
    if precomp_model is None:
        pcs_to_calc = min([df_annos.shape[1], max_pcs])
        pca = PCA(n_components=pcs_to_calc).fit(df_annos)
        if minvar is None:
            components = pcs_to_calc
        else:
            components = len([i for i in np.cumsum(pca.explained_variance_ratio_)
                              if i < minvar])

    # Decompose annotations
    pcs = pca.transform(df_annos)
    eigen_names = ['_'.join([eigen_prefix, str(i+1)]) for i in range(components)]
    df_pcs = pd.DataFrame(pcs[:, :components], columns=eigen_names)

    # "Whiten" eigenfeatures, if optioned
    if whiten:
        if precomp_model is None:
            whitener = StandardScaler().fit(df_pcs)
    if whitener is not None:
        df_pcs = pd.DataFrame(whitener.transform(df_pcs), columns=eigen_names)

    # Write output bins with PCs
    if bins_outfile is not None:
        if 'compressed' in determine_filetype(bins_outfile):
            bins_outfile = path.splitext(bins_outfile)[0]
        out_df = dfutils.float_cleanup(pd.concat([df_bins, df_pcs], axis=1),
                                       maxfloat, first_column)
        out_df.to_csv(bins_outfile, sep='\t', index=False)
        if bgzip:
            bgz(bins_outfile)

    # Save model for future use, if optioned
    if parameters_outfile is not None:
        _save_model_params(df_fills, trans_dict, scaler, pca, components,
                           whitener, parameters_outfile)

    # Perform extra assessments of PCA & feature fits, if optioned
    if pca_stats is not None:
        get_feature_stats(df_annos, feature_names, pca, pcs, pca_stats,
                          eigen_prefix, components)
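# Standalone sketch (not part of athena) of the component-selection rule used
# in decompose_bins(): keep the principal components whose cumulative explained
# variance stays below a target fraction (minvar). The random input matrix is
# purely for illustration.
def _demo_select_n_components(minvar=0.95, max_pcs=100):
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    X = np.random.rand(500, 20)
    X = StandardScaler().fit_transform(X)

    pca = PCA(n_components=min(X.shape[1], max_pcs)).fit(X)
    cumvar = np.cumsum(pca.explained_variance_ratio_)

    # Same rule as decompose_bins(): count components whose cumulative
    # explained variance is below minvar
    return len([v for v in cumvar if v < minvar])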
def main():
    """
    Main block
    """

    # Parse arguments
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('bed', help='BED file of intervals to be evaluated')
    parser.add_argument('probes_tsv', help='.tsv of path to probeset BED and ' +
                        'array name')
    parser.add_argument('samples_tsv', help='.tsv matrix of sample counts for ' +
                        'arrays (rows) X cohorts (columns)')
    parser.add_argument('cohorts', help='.tsv of metacohort assignments')
    parser.add_argument('-o', '--outfile', help='Output BED file annotated with ' +
                        'exclusion criteria [default: stdout]', default='stdout')
    parser.add_argument('--probecounts-outfile', help='Output BED file annotated ' +
                        'with number of probes per interval [optional]')
    parser.add_argument('--control-mean-counts-outfile', help='Output BED file ' +
                        'annotated with mean number of probes in controls per ' +
                        'cohort, computed as the weighted average per platform ' +
                        '[optional]')
    parser.add_argument('--frac-pass-outfile', help='Output BED file annotated ' +
                        'with fraction of passing samples per cohort per ' +
                        'interval [optional]')
    parser.add_argument('--min-interval-size', dest='min_size', type=int,
                        default=100000, help='Uniformly expand small intervals ' +
                        'to be at least this size [default: 100,000]')
    parser.add_argument('--min-probes', type=int, default=10, help='Minimum ' +
                        'number of probes required per interval to pass ' +
                        '[default: 10]')
    parser.add_argument('--min-frac-samples', type=float, default=0.9,
                        help='Minimum fraction of samples required per interval ' +
                        'to pass [default: 0.9]')
    parser.add_argument('-k', '--keep-n-columns', type=int, default=3,
                        help='Number of columns from input BED to keep in ' +
                        '--outfile [default: 3]')
    parser.add_argument('-z', '--bgzip', action='store_true', default=False,
                        help='compress --outfile and --probecounts-outfile with ' +
                        'bgzip')
    args = parser.parse_args()

    # Step 1. Load intervals, and expand (as necessary)
    intervals, header = load_intervals(args.bed, args.min_size, args.keep_n_columns)

    # Step 2. Annotate intervals with probe counts using athena
    tracks, tnames = parse_probesets(args.probes_tsv)
    intervals = annotate_bins(bins=intervals.fn, chroms=None, ranges=None,
                              tracks=tracks, ucsc_tracks=[], ucsc_ref=None,
                              actions=['count' for i in range(len(tracks))],
                              fasta=None, snv_mus=None, maxfloat=8,
                              ucsc_chromsplit=False, quiet=False)
    intervals = replace_coords(intervals, args.bed, args.keep_n_columns)
    counts_outfile = args.probecounts_outfile
    if counts_outfile is not None:
        if 'compressed' in determine_filetype(counts_outfile):
            counts_outfile = path.splitext(counts_outfile)[0]
        intervals.saveas(counts_outfile, trackline='\t'.join(header + tnames))
        if args.bgzip:
            bgzip(counts_outfile)

    # Step 3. Determine pass/fail labels per interval per array
    array_labels_df = label_array_fails(intervals, args.min_probes, header, tnames)

    # Step 4. Compute fraction of passing samples per interval per cohort
    array_counts = load_array_counts(args.samples_tsv, args.cohorts, tnames)
    cohort_fracs_df = get_passing_fracs(array_counts, array_labels_df,
                                        args.keep_n_columns)
    fracs_outfile = args.frac_pass_outfile
    if fracs_outfile is not None:
        if 'compressed' in determine_filetype(fracs_outfile):
            fracs_outfile = path.splitext(fracs_outfile)[0]
        cohort_fracs_df.to_csv(fracs_outfile, sep='\t', index=False)
        if args.bgzip:
            bgzip(fracs_outfile)

    # Step 5. Label each interval with cohorts to be excluded
    cohort_labels_df = label_cohort_fails(cohort_fracs_df, args.min_frac_samples,
                                          args.keep_n_columns)

    # Step 6. Format output file and write out
    if args.outfile in '- stdout /dev/stdout'.split():
        cohort_labels_df.to_csv(stdout, sep='\t', index=False)
    else:
        outfile = args.outfile
        if 'compressed' in determine_filetype(outfile):
            outfile = path.splitext(outfile)[0]
        cohort_labels_df.to_csv(outfile, sep='\t', index=False)
        if args.bgzip:
            bgzip(outfile)

    # [Optional] Step 7. Compute average number of probes per cohort per interval
    means_outfile = args.control_mean_counts_outfile
    if means_outfile is not None:
        cohort_means_df = get_cohort_means(array_counts, intervals, tnames, header,
                                           args.keep_n_columns)
        if 'compressed' in determine_filetype(means_outfile):
            means_outfile = path.splitext(means_outfile)[0]
        cohort_means_df.to_csv(means_outfile, sep='\t', index=False)
        if args.bgzip:
            bgzip(means_outfile)
def annotate_pairs(pairs, chroms, ranges, tracks, ucsc_tracks, actions,
                   track_names, ucsc_ref, fasta, binsize, homology_cutoffs,
                   ucsc_chromsplit, maxfloat, quiet):
    """
    Master pair annotation function
    """

    # Infer binsize and filetype
    if binsize is None:
        binsize = calc_binsize(pairs)
    ftype = determine_filetype(pairs)

    # Load pairs. Note: must read contents from file due to odd utf-8 decoding
    # behavior for bgzipped BED files with pybedtools
    if 'compressed' in ftype:
        pairs = ''.join(s.decode('utf-8') for s in GzipFile(pairs).readlines())
    else:
        pairs = ''.join(open(pairs, 'r').readlines())
    firstline = pairs.split('\n')[0].split('\t')
    if firstline[0].startswith('#'):
        colnames = firstline
    else:
        colnames = None
    n_cols_old = len(firstline)
    pairs = pbt.BedTool(pairs, from_string=True)

    # Subset pairs to specific chromosomes/ranges, if optioned
    if chroms is not None:
        chrlist = chroms.split(',')
        pairs = pairs.filter(lambda x: x.chrom in chrlist).saveas()
    if ranges is not None:
        pairs = pairs.intersect(ranges, wa=True).saveas()

    # Note: more efficient (and stable) when adding many annotations to hold
    # pd.DataFrame of pairs with annotations in memory and convert entire
    # pd.DataFrame back to pbt.BedTool after adding all annotations as columns
    # This appears to be due to peculiarities in pyBedTools handling of wide BED files
    pairs_bt = pairs.cut(range(3)).saveas()
    pairs_df = pairs.to_dataframe(names=colnames, comment='#')
    pairs_bedpe_bt = _pairs_bed_to_bedpe(pairs_bt, binsize)

    # Make master pbt.BedTool of all bins from all pairs
    split_pair_bts = [_split_pairs(p, binsize) for p in pairs_bt]
    allbins_bt = split_pair_bts[0].cat(*split_pair_bts[1:],
                                       postmerge=False).sort().merge(d=-1)
    query_regions = ucsc.collapse_query_regions(allbins_bt).saveas()

    # Annotate bins with all local tracks
    track_counter = 0
    if len(tracks) > 0:
        for track in tracks:
            action = actions[track_counter]
            pairs_df['newtrack_{}'.format(track_counter)] = \
                add_pairwise_local_track(pairs_bedpe_bt, track, action,
                                         query_regions, binsize, quiet)
            track_counter += 1

    # Annotate bins with all UCSC tracks
    if len(ucsc_tracks) > 0:
        if quiet is False:
            status_msg = '[{0}] athena annotate-pairs: Connecting to UCSC ' + \
                         'Genome Browser database'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                    fasta))
        db = ucsc.ucsc_connect(ucsc_ref)

        # Iterate over tracks
        for track in ucsc_tracks:
            action = actions[track_counter]
            pairs_df['newtrack_{}'.format(track_counter)] = \
                add_pairwise_ucsc_track(pairs_bedpe_bt, db, track, action,
                                        query_regions, binsize, ucsc_ref,
                                        ucsc_chromsplit, quiet)
            track_counter += 1

        # Close UCSC connection
        db.close()

    # Annotate pairs based on nucleotide content, if optioned
    if fasta is not None:
        if quiet is False:
            status_msg = '[{0}] athena annotate-pairs: Adding sequence homology ' + \
                         'features from reference fasta "{1}".'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                    fasta))
        for identity in homology_cutoffs:
            for rev in True, False:
                pairs_df['newtrack_{}'.format(track_counter)] = \
                    add_homology(pairs_bt, fasta, binsize, identity, rev)
                track_counter += 1

    # Clean up long floats
    pairs_df = float_cleanup(pairs_df, maxfloat, start_idx=3)

    # Return bins as pbt.BedTool
    return pbt.BedTool.from_dataframe(pairs_df)
def annotatepairs(pairs, outfile, chroms, ranges, track, ucsc_track, actions,
                  track_names, track_list, ucsc_list, ucsc_ref, fasta, binsize,
                  homology_cutoffs, no_ucsc_chromsplit, maxfloat, bgzip, quiet):
    """
    Annotate pairs
    """

    # Sanitize & format inputs
    ucsc_chromsplit = not no_ucsc_chromsplit
    tracks = list(track)
    ucsc_tracks = list(ucsc_track)
    actions = tuple([a.lower() for a in actions])
    if len(homology_cutoffs) > 0:
        homology_cutoffs = list(homology_cutoffs)
    else:
        homology_cutoffs = [1.0]

    # Parse file with list of local tracks (if provided) and add to track lists
    if track_list is not None:
        supp_tracks, supp_actions, supp_names = mutrate.parse_track_file(track_list)
        tracks = tracks + supp_tracks
        n_ucsc_tracks = len(ucsc_tracks)
        if n_ucsc_tracks > 0:
            actions = tuple(list(actions[:n_ucsc_tracks]) + supp_actions \
                            + list(actions[n_ucsc_tracks:]))
            track_names = tuple(list(track_names[:n_ucsc_tracks]) + supp_names \
                                + list(track_names[n_ucsc_tracks:]))
        else:
            actions = tuple(list(actions) + supp_actions)
            track_names = tuple(list(track_names) + supp_names)

    # Parse file with list of UCSC tracks (if provided) and add to track lists
    if ucsc_list is not None:
        supp_ucsc_tracks, supp_ucsc_actions, supp_ucsc_names = \
            mutrate.parse_track_file(ucsc_list)
        ucsc_tracks = ucsc_tracks + supp_ucsc_tracks
        actions = tuple(list(actions) + supp_ucsc_actions)
        track_names = tuple(list(track_names) + supp_ucsc_names)

    # Handle header reformatting
    if 'compressed' in determine_filetype(pairs):
        header = GzipFile(pairs).readline().decode('utf-8').rstrip()
    else:
        header = open(pairs, 'r').readline().rstrip()
    if not header.startswith('#'):
        status_msg = '[{0}] athena annotate-pairs: No header line detected. ' + \
                     'Adding default header.'
        print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        n_extra_cols = len(header.split('\t')) - 3
        header = make_default_bed_header(n_extra_cols)
    if len(track_names) > 0:
        newheader = header + '\t' + '\t'.join(list(track_names))
    else:
        newheader = header
    if fasta is not None:
        for k in homology_cutoffs:
            for direction in 'fwd rev'.split():
                newheader += '\t' + 'longest_{}_kmer_{}pct_identity'.format(
                    direction, int(round(100 * k)))

    # Annotate pairs
    newpairs = mutrate.annotate_pairs(pairs, chroms, ranges, tracks, ucsc_tracks,
                                      actions, track_names, ucsc_ref, fasta,
                                      binsize, homology_cutoffs, ucsc_chromsplit,
                                      maxfloat, quiet)

    # Save annotated bins
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    newpairs.saveas(outfile, trackline=newheader)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
def annotatebins(bins, outfile, include_chroms, ranges, track, ucsc_track, actions,
                 track_names, track_list, ucsc_list, ucsc_ref, fasta, snv_mus,
                 no_ucsc_chromsplit, maxfloat, bgzip, quiet):
    """
    Annotate bins
    """

    # Sanitize & format inputs
    ucsc_chromsplit = not no_ucsc_chromsplit
    track = list(track)
    ucsc_track = list(ucsc_track)
    actions = tuple([a.lower() for a in actions])

    # Parse file with list of local tracks (if provided) and add to track lists
    if track_list is not None:
        supp_tracks, supp_actions, supp_names = mutrate.parse_track_file(track_list)
        track = track + supp_tracks
        n_ucsc_tracks = len(ucsc_track)
        if n_ucsc_tracks > 0:
            actions = tuple(list(actions[:n_ucsc_tracks]) + supp_actions \
                            + list(actions[n_ucsc_tracks:]))
            track_names = tuple(list(track_names[:n_ucsc_tracks]) + supp_names \
                                + list(track_names[n_ucsc_tracks:]))
        else:
            actions = tuple(list(actions) + supp_actions)
            track_names = tuple(list(track_names) + supp_names)

    # Parse file with list of UCSC tracks (if provided) and add to track lists
    if ucsc_list is not None:
        supp_ucsc_tracks, supp_ucsc_actions, supp_ucsc_names = \
            mutrate.parse_track_file(ucsc_list)
        ucsc_track = ucsc_track + supp_ucsc_tracks
        actions = tuple(list(actions) + supp_ucsc_actions)
        track_names = tuple(list(track_names) + supp_ucsc_names)

    # Handle header reformatting
    n_tracks = len(track) + len(ucsc_track)
    if n_tracks != len(track_names):
        err = 'INPUT ERROR: Number of supplied track names ({0}) does not ' + \
              'match number of tracks ({1}).'
        exit(err.format(len(track_names), n_tracks))

    if 'compressed' in determine_filetype(bins):
        header = GzipFile(bins).readline().decode('utf-8').rstrip()
    else:
        header = open(bins, 'r').readline().rstrip()
    if not header.startswith('#'):
        status_msg = '[{0}] athena annotate-bins: No header line detected. ' + \
                     'Adding default header.'
        print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        n_extra_cols = len(header.split('\t')) - 3
        header = make_default_bed_header(n_extra_cols)
    newheader = header + '\t' + '\t'.join(list(track_names))
    if fasta is not None:
        newheader = '\t'.join([newheader, 'pct_gc'])
    if snv_mus is not None:
        newheader = '\t'.join([newheader, 'snv_mu'])

    # Annotate bins
    newbins = mutrate.annotate_bins(bins, include_chroms, ranges, track, ucsc_track,
                                    ucsc_ref, actions, fasta, snv_mus, maxfloat,
                                    ucsc_chromsplit, quiet)

    # Save annotated bins
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    newbins.saveas(outfile, trackline=newheader)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
def annotate_bins(bins, chroms, ranges, tracks, ucsc_tracks, ucsc_ref, actions,
                  fasta, snv_mus, maxfloat, ucsc_chromsplit, quiet):
    """
    Master bin annotation function
    """

    # Parse & sanity check all track inputs
    n_all_tracks = len(tracks) + len(ucsc_tracks)
    if len(actions) != n_all_tracks:
        from sys import exit
        err = 'INPUT ERROR: Number of actions ({0}) does not match number ' + \
              'of tracks ({1}).'
        exit(err.format(len(actions), n_all_tracks))
    if len(ucsc_tracks) > 0:
        if ucsc_ref is None:
            from sys import exit
            exit('INPUT ERROR: --ucsc-ref must be specified if any UCSC ' +
                 'tracks are requested.')

    # Load bins. Note: must read contents from file due to odd utf-8 decoding
    # behavior for bgzipped BED files
    ftype = determine_filetype(bins)
    if ftype is None:
        ftype = 'unknown'
    if 'compressed' in ftype:
        bins = ''.join(s.decode('utf-8') for s in GzipFile(bins).readlines())
    else:
        bins = ''.join(open(bins, 'r').readlines())
    firstline = bins.split('\n')[0].split('\t')
    if firstline[0].startswith('#'):
        colnames = firstline
    else:
        colnames = None
    n_cols_old = len(firstline)
    bins = pbt.BedTool(bins, from_string=True)

    # Subset bins to specific chromosomes/ranges, if optioned
    if chroms is not None:
        chrlist = chroms.split(',')
        bins = bins.filter(lambda x: x.chrom in chrlist).saveas()
    if ranges is not None:
        bins = bins.intersect(ranges, wa=True).saveas()

    # Note: more efficient (and stable) when adding many annotations to hold
    # pd.DataFrame of bins with annotations in memory and convert entire
    # pd.DataFrame back to pbt.BedTool after adding all annotations as columns
    # This appears to be due to peculiarities in pyBedTools handling of wide BED files
    bins_bt = bins.cut(range(3)).saveas()
    bins_df = bins.to_dataframe(names=colnames, comment='#')

    # Annotate bins with all local tracks
    track_counter = 0
    if len(tracks) > 0:
        for track in tracks:
            action = actions[track_counter]
            bins_df['newtrack_{}'.format(track_counter)] = \
                add_local_track(bins_bt, track, action, quiet)
            track_counter += 1

    # Annotate bins with all UCSC tracks
    if len(ucsc_tracks) > 0:
        if quiet is False:
            status_msg = '[{0}] athena annotate-bins: Connecting to UCSC ' + \
                         'Genome Browser database'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                    fasta))
        db = ucsc.ucsc_connect(ucsc_ref)
        query_regions = ucsc.collapse_query_regions(bins).saveas()

        # Iterate over tracks
        for track in ucsc_tracks:
            # Ping to check db connection is still active (UCSC may time out over
            # sequential long queries); if it has timed out, reopen a new connection
            try:
                db.ping(True)
            except:
                try:
                    db.close()
                except:
                    pass
                db = ucsc.ucsc_connect(ucsc_ref)

            # Submit UCSC query
            action = actions[track_counter]
            bins_df['newtrack_{}'.format(track_counter)] = \
                add_ucsc_track(bins_bt, db, track, action, query_regions, ucsc_ref,
                               ucsc_chromsplit, quiet)
            track_counter += 1

        # Close UCSC connection
        db.close()

    # Annotate bins with nucleotide content, if optioned
    if fasta is not None:
        if quiet is False:
            status_msg = '[{0}] athena annotate-bins: Adding nucleotide ' + \
                         'content from reference fasta "{1}".'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                    fasta))
        bins_df['pct_gc'] = add_nuc_content(bins, fasta, maxfloat)

    # Annotate bins with SNV mutation rates, if optioned
    if snv_mus is not None:
        if quiet is False:
            status_msg = '[{0}] athena annotate-bins: Adding SNV mutation ' + \
                         'rates from reference fasta "{1}".'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                    fasta))
        bins_df['snv_mu'] = add_snv_mu(bins, fasta, snv_mus, maxfloat)

    # Clean up long floats
    bins_df = float_cleanup(bins_df, maxfloat, start_idx=n_cols_old)

    # Return bins as pbt.BedTool
    return pbt.BedTool.from_dataframe(bins_df)
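# Illustrative sketch (not part of athena) of the pattern noted in
# annotate_bins() and annotate_pairs(): accumulate per-bin annotations as
# columns of a pandas DataFrame, then convert back to a pbt.BedTool once at the
# end, rather than repeatedly widening a BedTool. The intervals and annotation
# values here are toy placeholders.
def _demo_dataframe_roundtrip():
    import pandas as pd
    import pybedtools as pbt

    bins = pbt.BedTool('chr1\t0\t1000\nchr1\t1000\t2000\n', from_string=True)
    df = bins.to_dataframe(names=['chrom', 'start', 'end'])

    # Add annotation columns in memory (toy values)
    df['anno_1'] = [0.5, 0.7]
    df['anno_2'] = [3, 1]

    # Single conversion back to a BedTool after all columns are added
    return pbt.BedTool.from_dataframe(df)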
def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs, sv_ci,
             maxfloat, bgzip):
    """
    Master function to annotate bins_in with count (or probability) of SVs
    """

    # Load bins, split bin coordinates from annotations, and retain header
    if 'compressed' in determine_filetype(bins_in):
        bins_header = gzip.open(bins_in, 'r').readline().decode('utf-8').\
            rstrip().split('\t')
    else:
        bins_header = open(bins_in, 'r').readline().rstrip().split('\t')
    bins_bt = pbt.BedTool(bins_in).cut(range(3)).saveas()
    bins_df = bins_bt.to_dataframe()
    feats_df = dfutils.load_feature_df(bins_in)
    if binsize is None:
        binsize = calc_binsize(bins_in)

    # Parse input SV file depending on format
    # If breakpoints == False, will return simple four-column BED with variant ID
    # in fourth column
    # If breakpoints == True, will return two rows per record where each record
    # is one breakpoint with columns 4 = variant ID, 5 = POS or END, 6 = original
    # POS or END coordinate, 7 = std dev of left side of breakpoint, 8 = std dev
    # of right side of breakpoint, and 9 = number of std deviations extended
    # left & right (i.e., z_extend)
    sv_format = determine_filetype(sv_in)
    if 'vcf' in sv_format:
        vcf = pysam.VariantFile(sv_in)
        sv = vcf2bed(vcf, breakpoints=breakpoints, add_ci_to_bkpts=probs, ci=sv_ci)
    elif 'bed' in sv_format:
        sv = _load_sv_from_bed(sv_in, breakpoints=breakpoints)

    # Perform intersection with bins depending on input parameters
    if breakpoints:
        bins_bt = add_names_to_bed(bins_bt)
        bin_ids = [b.name for b in bins_bt]

        # Split pairs if necessary
        if paired:
            bins_bt = _split_pairs(bins_bt, binsize=binsize, add_name=True,
                                   add_side=True)

        # Intersect breakpoints with bins
        hits = bins_bt.intersect(sv, wa=True, wb=True)
        bkpt_res = parse_breakpoint_hits(hits, paired, probs)
        sv_column = pd.Series([bkpt_res.get(b_id, 0) for b_id in bin_ids])

    # --comparison "overlap" (i.e., breakpoints == False) is the same for both
    # 1D and 2D bins
    else:
        if probs:
            sv_column = pd.Series([min([1, int(x[-1])])
                                   for x in bins_bt.intersect(sv, c=True)])
        else:
            sv_column = pd.Series([int(x[-1])
                                   for x in bins_bt.intersect(sv, c=True)])

    # Paste bin coordinates, SV counts, and original features into single dataframe
    out_df = dfutils.float_cleanup(pd.concat([bins_df, sv_column, feats_df], axis=1),
                                   maxfloat, 3)
    out_df.columns = bins_header[:3] + ['sv'] + bins_header[3:]

    # Save bins with SV counts
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', header=True, index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)