def mu_predict(pairs, model_pkl, outfile, raw_mu, keep_features, maxfloat, bgzip):
    """
    Apply a trained mutation rate model to new bin-pairs
    """

    # Load pairs and split coordinates from features
    coords = pd.read_csv(pairs, sep='\t', usecols=range(3))
    feats, labels = load_bed(pairs)
    if keep_features:
        feats_df = dfutils.load_feature_df(pairs)

    # Load model from .pkl and switch to evaluation mode
    model = torch.load(model_pkl)
    model.eval()

    # Predict mutation rates for all bins
    with torch.no_grad():
        preds = model(feats).numpy()
    if not raw_mu:
        preds = log10(preds)
    preds_df = pd.DataFrame(preds, columns=['mu'])

    # Format output dataframe
    out_df = pd.concat([coords, preds_df], axis=1)
    if keep_features:
        out_df = pd.concat([out_df, feats_df], axis=1)
    out_df = dfutils.float_cleanup(out_df, maxfloat=maxfloat, start_idx=3)

    # Save pairs with predicted mutation rates
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', index=False)

    # Bgzip pairs, if optioned
    if bgzip:
        bgz(outfile)

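# Illustrative sketch (not part of the athena CLI): the same eval()/no_grad()
# inference pattern used in mu_predict above, shown on a toy model and random
# features. The function name, layer sizes, and data below are hypothetical
# and exist only for demonstration.
def _example_torch_inference():
    import numpy as np
    import torch

    # Stand-in for the trained model that mu_predict loads from .pkl
    toy_model = torch.nn.Sequential(torch.nn.Linear(4, 1), torch.nn.Sigmoid())
    toy_model.eval()  # switch to evaluation mode before prediction

    # Random features standing in for the bin-pair feature matrix
    feats = torch.rand(10, 4)

    # no_grad() skips autograd bookkeeping since only predictions are needed
    with torch.no_grad():
        preds = toy_model(feats).numpy().flatten()

    # Optionally convert to log10-scaled rates, as mu_predict does by default
    return np.log10(preds)
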
def filter_vcf(vcf, out, chroms, xchroms, svtypes, exclusion_list, minAF,
               maxAF, minAC, maxAC, minAN, filters, minQUAL, maxQUAL, HWE,
               af_field, keep_infos, bgzip):

    # Open connection to input VCF
    if vcf in '- stdin'.split():
        invcf = pysam.VariantFile(stdin)
    else:
        invcf = pysam.VariantFile(vcf)
    header = invcf.header

    # Clean undesired INFO fields from header
    if keep_infos != 'ALL':
        if keep_infos is None:
            keep_infos = []
        else:
            keep_infos = keep_infos.split(',')
        for key in 'END CHR2 SVTYPE SVLEN'.split():
            keep_infos.append(key)
        for key in header.info.keys():
            if key not in keep_infos:
                header.info.remove_header(key)

    # Open connection to output VCF
    if out in '- stdout'.split():
        outvcf = pysam.VariantFile(stdout, 'w', header=header)
    else:
        if '.gz' in out:
            out = path.splitext(out)[0]
        outvcf = pysam.VariantFile(out, 'w', header=header)

    # Parse filtering options
    if chroms is not None:
        chroms = chroms.split(',')
    else:
        chroms = header.contigs.keys()
    if xchroms is not None:
        xchroms = xchroms.split(',')
        chroms = [c for c in chroms if c not in xchroms]
    if svtypes is not None:
        if 'SVTYPE' not in header.info.keys():
            sys.exit('SVTYPE filtering was specified, but input VCF ' +
                     'does not have SVTYPE entry in INFO.')
        else:
            svtypes = svtypes.split(',')
    if filters is not None:
        filters = filters.split(',')
    if exclusion_list is not None:
        bl = pybedtools.BedTool(exclusion_list)

    # Raise warning if AF or AC are missing from VCF
    for key in [af_field, 'AC']:
        if key not in header.info.keys():
            import warnings
            warning_message = '{0} not found in VCF INFO, so {0}-based filtering ' + \
                              'will be ignored'
            warning_message = warning_message.format(key)
            warnings.warn(warning_message, RuntimeWarning)

    # Raise exception if HWE enabled but any necessary fields missing
    if HWE is not None:
        for key in 'N_HOMREF N_HET N_HOMALT'.split():
            if key not in header.info.keys():
                error_message = 'Hardy-Weinberg filtering not possible due to ' + \
                                'missing {0} in VCF INFO'
                sys.exit(error_message.format(key))

    # Iterate over vcf & filter records
    for record in invcf.fetch():

        # Filter by chromosome
        if chroms is not None \
        and record.chrom not in chroms:
            continue

        # Filter by svtype
        if svtypes is not None \
        and record.info['SVTYPE'] not in svtypes:
            continue

        # Exclude records where end < start
        if record.stop < record.start:
            continue

        # Filter by AF/AC
        if af_field in record.info.keys():
            if minAF is not None:
                if np.nansum(record.info[af_field]) < minAF:
                    continue
            if maxAF is not None:
                if np.nansum(record.info[af_field]) > maxAF:
                    continue
        if 'AC' in record.info.keys():
            if minAC is not None:
                if np.nansum(record.info['AC']) < minAC:
                    continue
            if maxAC is not None:
                if np.nansum(record.info['AC']) > maxAC:
                    continue

        # Filter by AN
        if 'AN' in record.info.keys():
            if minAN is not None:
                if record.info['AN'] < minAN:
                    continue

        # Filter by VCF FILTER
        if filters is not None:
            if len([f for f in record.filter if f not in filters]) > 0:
                continue

        # Filter by QUAL score
        if minQUAL is not None \
        and record.qual is not None:
            if record.qual < minQUAL:
                continue
        if maxQUAL is not None \
        and record.qual is not None:
            if record.qual > maxQUAL:
                continue

        # Filter by Hardy-Weinberg equilibrium
        if HWE is not None and len(record.alts) < 3:
            if np.nansum(record.info[af_field]) < 1:
                if hwe_chisq(record) < HWE:
                    continue

        # Clean record
        if keep_infos != 'ALL':
            for key in record.info.keys():
                if key not in keep_infos:
                    record.info.pop(key)

        # Write filter-passing records to output VCF
        outvcf.write(record)

    outvcf.close()

    # Filter remaining records against exclusion_list
    if exclusion_list is not None:
        prebl_vcf = pybedtools.BedTool(out)
        prebl_vcf.intersect(bl, header=True, v=True).saveas(out)

    # Bgzip output VCF, if optioned
    if bgzip:
        bgz(out)

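# hwe_chisq() referenced in filter_vcf is defined elsewhere in athena. For
# reference, this is a minimal sketch of one common formulation of the
# Hardy-Weinberg chi-square test on genotype counts; the function name,
# signature, and return value (a p-value) are illustrative assumptions, not
# the packaged implementation.
def _example_hwe_chisq(n_homref, n_het, n_homalt):
    from scipy.stats import chi2

    # Observed genotype counts and reference allele frequency
    n = n_homref + n_het + n_homalt
    p = (2 * n_homref + n_het) / (2 * n)
    q = 1 - p

    # Expected genotype counts under Hardy-Weinberg equilibrium
    observed = [n_homref, n_het, n_homalt]
    expected = [n * p ** 2, 2 * n * p * q, n * q ** 2]

    # One-degree-of-freedom chi-square p-value; small values suggest
    # departure from equilibrium
    stat = sum((o - e) ** 2 / e for o, e in zip(observed, expected) if e > 0)
    return chi2.sf(stat, df=1)
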
def pair_bins(query_bins, all_bins, outfile, max_dist, exclusion_list,
              excl_buffer, annotate_dist, sort_features, annotate_absdiff,
              maxfloat, bgzip, input_has_header=True):
    """
    Create pairs of bins from input BED
    """

    # Open connection to infiles & outfile
    if determine_filetype(query_bins) == 'compressed-bed':
        fin = gzip.open(query_bins, 'rt')
    else:
        fin = open(query_bins)
    if input_has_header:
        colnames = [
            k.replace('#', '') for k in fin.readline().rstrip().split('\t')
        ]
    if all_bins is None:
        bins_tabix = TabixFile(query_bins)
    else:
        bins_tabix = TabixFile(all_bins)
    xbt = load_exclusion_bts(exclusion_list, excl_buffer)

    # Open connection to output file
    out_ftype, out_ext = determine_filetype(outfile, return_extension=True)
    if 'compressed' in out_ftype:
        outpath = outfile.replace(out_ext, 'bed')
    else:
        outpath = outfile
    fout = open(outpath, 'w')

    # Format header and write to outfile
    header = '#chr start end'.split()
    if annotate_dist:
        header.append('distance')
    for fname in colnames[3:]:
        if sort_features:
            fname_suffixes = ['min', 'max']
        else:
            fname_suffixes = ['left', 'right']
        if annotate_absdiff:
            fname_suffixes.append('absdiff')
        header += ['_'.join([fname, v]) for v in fname_suffixes]
    fout.write('\t'.join(header) + '\n')

    # Identify and curate all pairs for each bin in fin
    for query_line in fin.readlines():
        query_vals = query_line.rstrip().split('\t')
        new_pairs = _get_pairs(fout, query_vals, bins_tabix, max_dist, xbt,
                               annotate_dist, sort_features, annotate_absdiff,
                               maxfloat)

    # Clean up
    fout.close()
    if bgzip:
        bgz(outpath)

def decompose_bins(bins, bins_outfile=None, parameters_outfile=None,
                   precomp_model=None, components=10, minvar=None,
                   trans_dict=None, whiten=False, fill_missing=0,
                   first_column=3, maxfloat=5, max_pcs=100, pca_stats=None,
                   eigen_prefix='eigenfeature', bgzip=False):
    """
    Master function for Eigendecomposition of bin annotations
    """

    # Set certain defaults prior to loading precomputed model
    whitener = None

    # Load precomputed model, if optioned
    if precomp_model is not None:
        df_fills, trans_dict, scaler, pca, components, whitener = \
            _load_precomp_model(precomp_model)
        fill_missing = df_fills

    # Expand feature transformation dictionary
    log_transform = trans_dict.get('log', [])
    sqrt_transform = trans_dict.get('sqrt', [])
    exp_transform = trans_dict.get('exp', [])
    square_transform = trans_dict.get('square', [])
    boxcox_transform = trans_dict.get('boxcox', [])

    # Read bins, then sanitize and transform annotations
    df_bins = pd.read_csv(bins, sep='\t', usecols=range(first_column))
    df_annos, df_fills = \
        dfutils.load_feature_df(bins, first_column, log_transform,
                                sqrt_transform, exp_transform,
                                square_transform, boxcox_transform,
                                fill_missing, return_fills=True)
    feature_names = df_annos.columns.tolist()

    # Scale all columns
    if precomp_model is None:
        scaler = StandardScaler().fit(df_annos)
    df_annos = scaler.transform(df_annos)

    # Learn covariance matrix & determine number of components to keep
    if precomp_model is None:
        pcs_to_calc = min([df_annos.shape[1], max_pcs])
        pca = PCA(n_components=pcs_to_calc).fit(df_annos)
        if minvar is None:
            components = pcs_to_calc
        else:
            components = len([i for i in np.cumsum(pca.explained_variance_ratio_)
                              if i < minvar])

    # Decompose annotations
    pcs = pca.transform(df_annos)
    eigen_names = ['_'.join([eigen_prefix, str(i+1)]) for i in range(components)]
    df_pcs = pd.DataFrame(pcs[:, :components], columns=eigen_names)

    # "Whiten" eigenfeatures, if optioned
    if whiten:
        if precomp_model is None:
            whitener = StandardScaler().fit(df_pcs)
    if whitener is not None:
        df_pcs = pd.DataFrame(whitener.transform(df_pcs), columns=eigen_names)

    # Write output bins with PCs
    if bins_outfile is not None:
        if 'compressed' in determine_filetype(bins_outfile):
            bins_outfile = path.splitext(bins_outfile)[0]
        out_df = dfutils.float_cleanup(pd.concat([df_bins, df_pcs], axis=1),
                                       maxfloat, first_column)
        out_df.to_csv(bins_outfile, sep='\t', index=False)
        if bgzip:
            bgz(bins_outfile)

    # Save model for future use, if optioned
    if parameters_outfile is not None:
        _save_model_params(df_fills, trans_dict, scaler, pca, components,
                           whitener, parameters_outfile)

    # Perform extra assessments of PCA & feature fits, if optioned
    if pca_stats is not None:
        get_feature_stats(df_annos, feature_names, pca, pcs, pca_stats,
                          eigen_prefix, components)

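# Illustrative sketch (not part of the athena CLI) of the component-selection
# rule used in decompose_bins when minvar is provided: keep the number of
# principal components whose cumulative explained variance stays below the
# requested threshold. Uses random data; all names here are hypothetical.
def _example_minvar_selection(minvar=0.95):
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    # Random feature matrix standing in for scaled bin annotations
    X = StandardScaler().fit_transform(np.random.rand(500, 20))

    # Fit PCA on all features, then count components below the variance cutoff,
    # mirroring the list comprehension in decompose_bins
    pca = PCA(n_components=X.shape[1]).fit(X)
    cumvar = np.cumsum(pca.explained_variance_ratio_)
    components = len([v for v in cumvar if v < minvar])
    return components, cumvar
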
def annotatepairs(pairs, outfile, chroms, ranges, track, ucsc_track, actions,
                  track_names, track_list, ucsc_list, ucsc_ref, fasta, binsize,
                  homology_cutoffs, no_ucsc_chromsplit, maxfloat, bgzip, quiet):
    """
    Annotate pairs
    """

    # Sanitize & format inputs
    ucsc_chromsplit = not no_ucsc_chromsplit
    tracks = list(track)
    ucsc_tracks = list(ucsc_track)
    actions = tuple([a.lower() for a in actions])
    if len(homology_cutoffs) > 0:
        homology_cutoffs = list(homology_cutoffs)
    else:
        homology_cutoffs = [1.0]

    # Parse file with lists of tracks (if provided) and add to track lists
    if track_list is not None:
        supp_tracks, supp_actions, supp_names = mutrate.parse_track_file(track_list)
        tracks = tracks + supp_tracks
        n_ucsc_tracks = len(ucsc_tracks)
        if n_ucsc_tracks > 0:
            actions = tuple(list(actions[:n_ucsc_tracks]) + supp_actions \
                            + list(actions[n_ucsc_tracks:]))
            track_names = tuple(list(track_names[:n_ucsc_tracks]) + supp_names \
                                + list(track_names[n_ucsc_tracks:]))
        else:
            actions = tuple(list(actions) + supp_actions)
            track_names = tuple(list(track_names) + supp_names)

    # Parse file with list of UCSC tracks (if provided) and add to track lists
    if ucsc_list is not None:
        supp_ucsc_tracks, supp_ucsc_actions, supp_ucsc_names = \
            mutrate.parse_track_file(ucsc_list)
        ucsc_tracks = ucsc_tracks + supp_ucsc_tracks
        actions = tuple(list(actions) + supp_ucsc_actions)
        track_names = tuple(list(track_names) + supp_ucsc_names)

    # Handle header reformatting
    if 'compressed' in determine_filetype(pairs):
        header = GzipFile(pairs).readline().decode('utf-8').rstrip()
    else:
        header = open(pairs, 'r').readline().rstrip()
    if not header.startswith('#'):
        status_msg = '[{0}] athena annotate-pairs: No header line detected. ' + \
                     'Adding default header.'
        print(status_msg.format(
            datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        n_extra_cols = len(header.split('\t')) - 3
        header = make_default_bed_header(n_extra_cols)
    if len(track_names) > 0:
        newheader = header + '\t' + '\t'.join(list(track_names))
    else:
        newheader = header
    if fasta is not None:
        for k in homology_cutoffs:
            for direction in 'fwd rev'.split():
                newheader += '\t' + 'longest_{}_kmer_{}pct_identity'.format(
                    direction, int(round(100 * k)))

    # Annotate pairs
    newpairs = mutrate.annotate_pairs(pairs, chroms, ranges, tracks,
                                      ucsc_tracks, actions, track_names,
                                      ucsc_ref, fasta, binsize,
                                      homology_cutoffs, ucsc_chromsplit,
                                      maxfloat, quiet)

    # Save annotated pairs
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    newpairs.saveas(outfile, trackline=newheader)

    # Bgzip pairs, if optioned
    if bgzip:
        bgz(outfile)

def annotatebins(bins, outfile, include_chroms, ranges, track, ucsc_track,
                 actions, track_names, track_list, ucsc_list, ucsc_ref, fasta,
                 snv_mus, no_ucsc_chromsplit, maxfloat, bgzip, quiet):
    """
    Annotate bins
    """

    # Sanitize & format inputs
    ucsc_chromsplit = not no_ucsc_chromsplit
    track = list(track)
    ucsc_track = list(ucsc_track)
    actions = tuple([a.lower() for a in actions])

    # Parse file with lists of tracks (if provided) and add to track lists
    if track_list is not None:
        supp_tracks, supp_actions, supp_names = mutrate.parse_track_file(track_list)
        track = track + supp_tracks
        n_ucsc_tracks = len(ucsc_track)
        if n_ucsc_tracks > 0:
            actions = tuple(list(actions[:n_ucsc_tracks]) + supp_actions \
                            + list(actions[n_ucsc_tracks:]))
            track_names = tuple(list(track_names[:n_ucsc_tracks]) + supp_names \
                                + list(track_names[n_ucsc_tracks:]))
        else:
            actions = tuple(list(actions) + supp_actions)
            track_names = tuple(list(track_names) + supp_names)

    # Parse file with list of UCSC tracks (if provided) and add to track lists
    if ucsc_list is not None:
        supp_ucsc_tracks, supp_ucsc_actions, supp_ucsc_names = \
            mutrate.parse_track_file(ucsc_list)
        ucsc_track = ucsc_track + supp_ucsc_tracks
        actions = tuple(list(actions) + supp_ucsc_actions)
        track_names = tuple(list(track_names) + supp_ucsc_names)

    # Handle header reformatting
    n_tracks = len(track) + len(ucsc_track)
    if n_tracks != len(track_names):
        err = 'INPUT ERROR: Number of supplied track names ({0}) does not ' + \
              'match number of tracks ({1}).'
        exit(err.format(len(track_names), n_tracks))
    if 'compressed' in determine_filetype(bins):
        header = GzipFile(bins).readline().decode('utf-8').rstrip()
    else:
        header = open(bins, 'r').readline().rstrip()
    if not header.startswith('#'):
        status_msg = '[{0}] athena annotate-bins: No header line detected. ' + \
                     'Adding default header.'
        print(status_msg.format(
            datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        n_extra_cols = len(header.split('\t')) - 3
        header = make_default_bed_header(n_extra_cols)
    newheader = header + '\t' + '\t'.join(list(track_names))
    if fasta is not None:
        newheader = '\t'.join([newheader, 'pct_gc'])
    if snv_mus is not None:
        newheader = '\t'.join([newheader, 'snv_mu'])

    # Annotate bins
    newbins = mutrate.annotate_bins(bins, include_chroms, ranges, track,
                                    ucsc_track, ucsc_ref, actions, fasta,
                                    snv_mus, maxfloat, ucsc_chromsplit, quiet)

    # Save annotated bins
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    newbins.saveas(outfile, trackline=newheader)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)

def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs,
             sv_ci, maxfloat, bgzip):
    """
    Master function to annotate bins_in with count (or probability) of SVs
    """

    # Load bins, split bin coordinates from annotations, and retain header
    if 'compressed' in determine_filetype(bins_in):
        bins_header = gzip.open(bins_in, 'r').readline().decode('utf-8').rstrip().split('\t')
    else:
        bins_header = open(bins_in, 'r').readline().rstrip().split('\t')
    bins_bt = pbt.BedTool(bins_in).cut(range(3)).saveas()
    bins_df = bins_bt.to_dataframe()
    feats_df = dfutils.load_feature_df(bins_in)
    if binsize is None:
        binsize = calc_binsize(bins_in)

    # Parse input SV file depending on format
    # If breakpoints == False, will return simple four-column BED with variant ID in fourth column
    # If breakpoints == True, will return two rows per record where each record
    # is one breakpoint with columns 4 = variant ID, 5 = POS or END, 6 = original
    # POS or END coordinate, 7 = std dev of left side of breakpoint, 8 = std dev of
    # right side of breakpoint, and 9 = number of std deviations extended left & right (i.e., z_extend)
    sv_format = determine_filetype(sv_in)
    if 'vcf' in sv_format:
        vcf = pysam.VariantFile(sv_in)
        sv = vcf2bed(vcf, breakpoints=breakpoints, add_ci_to_bkpts=probs, ci=sv_ci)
    elif 'bed' in sv_format:
        sv = _load_sv_from_bed(sv_in, breakpoints=breakpoints)

    # Perform intersection with bins depending on input parameters
    if breakpoints:
        bins_bt = add_names_to_bed(bins_bt)
        bin_ids = [b.name for b in bins_bt]

        # Split pairs if necessary
        if paired:
            bins_bt = _split_pairs(bins_bt, binsize=binsize, add_name=True,
                                   add_side=True)

        # Intersect breakpoints with bins
        hits = bins_bt.intersect(sv, wa=True, wb=True)
        bkpt_res = parse_breakpoint_hits(hits, paired, probs)
        sv_column = pd.Series([bkpt_res.get(b_id, 0) for b_id in bin_ids])

    # --comparison "overlap" (i.e., breakpoints == False) is the same for both 1D and 2D bins
    else:
        if probs:
            sv_column = pd.Series(
                [min([1, int(x[-1])]) for x in bins_bt.intersect(sv, c=True)])
        else:
            sv_column = pd.Series(
                [int(x[-1]) for x in bins_bt.intersect(sv, c=True)])

    # Paste bin coordinates, SV counts, and original features into single dataframe
    out_df = dfutils.float_cleanup(
        pd.concat([bins_df, sv_column, feats_df], axis=1), maxfloat, 3)
    out_df.columns = bins_header[:3] + ['sv'] + bins_header[3:]

    # Save bins with SV counts
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', header=True, index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)

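# Illustrative sketch (not part of the athena CLI) of the counting
# intersection used in count_sv for the "overlap" comparison: bedtools
# intersect with c=True appends, per bin, the number of SV intervals
# overlapping it, which count_sv stores as the 'sv' column. The in-line BED
# strings are hypothetical toy data; pybedtools requires the bedtools binary
# to be installed.
def _example_overlap_counts():
    import pybedtools as pbt

    bins = pbt.BedTool('chr1\t0\t1000\nchr1\t1000\t2000\n', from_string=True)
    svs = pbt.BedTool('chr1\t100\t300\nchr1\t150\t1200\n', from_string=True)

    # c=True appends the overlap count as the last field of each bin
    counts = [int(x[-1]) for x in bins.intersect(svs, c=True)]
    return counts  # [2, 1] for the toy intervals above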