def mnll(true_counts, logits=None, probs=None):
    """
        Compute the multinomial negative log-likelihood between true
        counts and predicted values of a BPNet-like profile model

        One of `logits` or `probs` must be given. If both are given
        `logits` takes precedence.

        Args:
            true_counts (numpy.array): observed counts values
            logits (numpy.array): predicted logits values
            probs (numpy.array): predicted values as probabilities

        Returns:
            float: multinomial negative log-likelihood
    """

    if logits is not None:
        # check for length mismatch
        if len(logits) != len(true_counts):
            raise quietexception.QuietException(
                "Length of logits does not match length of true_counts")

        # convert logits to softmax probabilities
        probs = logits - logsumexp(logits)
        probs = np.exp(probs)

    elif probs is not None:
        # check for length mismatch
        if len(probs) != len(true_counts):
            raise quietexception.QuietException(
                "Length of probs does not match length of true_counts")

        # check if probs sums to 1
        if abs(1.0 - np.sum(probs)) > 1e-3:
            raise quietexception.QuietException(
                "'probs' array does not sum to 1")

    else:
        # both 'probs' and 'logits' are None
        raise quietexception.QuietException(
            "At least one of probs or logits must be provided. "
            "Both are None.")

    # compute the multinomial distribution
    mnom = multinomial(np.sum(true_counts), probs)
    return -(mnom.logpmf(true_counts) / len(true_counts))
def profile_cross_entropy(true_counts, logits=None, probs=None):
    """
        Compute the cross entropy between true counts and predicted
        values of a BPNet-like profile model

        One of `logits` or `probs` must be given. If both are given
        `logits` takes precedence.

        Args:
            true_counts (numpy.array): observed counts values
            logits (numpy.array): predicted logits values
            probs (numpy.array): predicted values as probabilities

        Returns:
            float: cross entropy
    """

    if logits is not None:
        # check for length mismatch
        if len(logits) != len(true_counts):
            raise quietexception.QuietException(
                "Length of logits does not match length of true_counts")

        # convert logits to softmax probabilities
        probs = logits - logsumexp(logits)
        probs = np.exp(probs)

    elif probs is not None:
        # check for length mismatch
        if len(probs) != len(true_counts):
            raise quietexception.QuietException(
                "Length of probs does not match length of true_counts")

        # check if probs sums to 1
        if abs(1.0 - np.sum(probs)) > 1e-3:
            raise quietexception.QuietException(
                "'probs' array does not sum to 1")

    else:
        # both 'probs' and 'logits' are None
        raise quietexception.QuietException(
            "At least one of probs or logits must be provided. "
            "Both are None.")

    # convert true_counts to probabilities
    true_counts_prob = true_counts / np.sum(true_counts)

    return -np.sum(np.multiply(true_counts_prob, np.log(probs + 1e-7)))
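# A minimal usage sketch for the two metrics above, on a made-up toy
# profile. It assumes the module-level imports used by mnll and
# profile_cross_entropy (numpy as np, scipy.special.logsumexp,
# scipy.stats.multinomial) are in place; the counts and logits below are
# purely illustrative.
def _example_profile_metrics():
    # toy observed counts over a 6-bp window and arbitrary predicted logits
    true_counts = np.array([2.0, 0.0, 5.0, 9.0, 3.0, 1.0])
    logits = np.array([0.1, -1.2, 0.8, 1.5, 0.3, -0.4])

    # softmax via logsumexp, exactly as the functions do internally
    probs = np.exp(logits - logsumexp(logits))

    print(mnll(true_counts, logits=logits))
    print(profile_cross_entropy(true_counts, probs=probs))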
def predict_main():
    # parse the command line arguments
    parser = argparsers.predict_argsparser()
    args = parser.parse_args()

    # check if the output directory exists
    if not os.path.exists(args.output_dir):
        logging.error("Directory {} does not exist".format(args.output_dir))
        return

    if args.automate_filenames:
        # create a new directory using current date/time to store the
        # predictions and logs
        date_time_str = local_datetime_str(args.time_zone)
        pred_dir = '{}/{}'.format(args.output_dir, date_time_str)
        os.mkdir(pred_dir)
    elif os.path.isdir(args.output_dir):
        pred_dir = args.output_dir
    else:
        logging.error("{} is not a directory".format(args.output_dir))
        return

    # filename to write debug logs
    logfname = "{}/predict.log".format(pred_dir)

    # set up the loggers
    logger.init_logger(logfname)

    # make sure the input_data json file exists
    if not os.path.isfile(args.input_data):
        raise quietexception.QuietException(
            "File not found: {} OR you may have accidentally "
            "specified a directory path.".format(args.input_data))

    # load the json file
    with open(args.input_data, 'r') as inp_json:
        try:
            #: dictionary of tasks for training
            input_data = json.loads(inp_json.read())
        except json.decoder.JSONDecodeError:
            raise quietexception.QuietException(
                "Unable to load json file {}. Valid json expected. "
                "Check the file for syntax errors.".format(args.input_data))

    logging.info("INPUT DATA -\n{}".format(input_data))

    # predict
    logging.info("Loading {}".format(args.model))
    with CustomObjectScope(
            {'MultichannelMultinomialNLL': MultichannelMultinomialNLL}):
        predict(args, input_data, pred_dir)
def get_average_profile(input_bigWig, peaks_df, peak_width):
    """
        Function to compute the average profile across all peaks

        Args:
            input_bigWig (str): path to bigWig file
            peaks_df (pandas.DataFrame): dataframe containing peaks
                information. The dataframe should have 'chrom', 'start',
                and 'end' as the first 3 columns. Each peak should have
                the same width (equal to peak_width), i.e. 'end' - 'start'
                is the same for all rows in the dataframe.
            peak_width (int): width of each peak

        Returns:
            numpy.array: average profile of length peak_width
    """

    # open the bigWig file for reading
    bw = pyBigWig.open(input_bigWig)

    # initialize numpy array for average profile
    average_profile = np.zeros(peak_width)

    # iterate through all peaks and compute the average
    for idx, row in peaks_df.iterrows():
        # raise exception if 'end' - 'start' is not equal to peak_width
        if (row['end'] - row['start']) != peak_width:
            raise quietexception.QuietException(
                "Inconsistent peak width found at: {}:{}-{}".format(
                    row['chrom'], row['start'], row['end']))

        # read values from bigWig
        average_profile += np.nan_to_num(
            bw.values(row['chrom'], row['start'], row['end']))

    # average profile
    average_profile /= peaks_df.shape[0]

    # close bigWig file
    bw.close()

    return average_profile
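# A small usage sketch for get_average_profile. The bigWig path and peak
# coordinates are placeholders; any bigWig with coverage over these
# intervals would do, and pandas is assumed to be imported as pd.
def _example_average_profile():
    # three 1000-bp peaks on chr1
    peaks_df = pd.DataFrame({
        'chrom': ['chr1', 'chr1', 'chr1'],
        'start': [10000, 25000, 40000],
        'end': [11000, 26000, 41000]})

    avg = get_average_profile('/path/to/signal.bw', peaks_df,
                              peak_width=1000)
    print(avg.shape)  # (1000,)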
def motif_discovery_main():
    parser = motif_discovery_argsparser()
    args = parser.parse_args()

    if not os.path.exists(args.scores_path):
        raise quietexception.QuietException(
            "Score file {} does not exist".format(args.scores_path))

    if not os.path.exists(args.output_directory):
        raise quietexception.QuietException(
            "Output directory {} does not exist".format(
                args.output_directory))

    # load the scores
    scores = h5py.File(args.scores_path, 'r')

    # window start and end based on modisco_window_size
    center = scores['hyp_scores'].shape[1] // 2
    start = center - args.modisco_window_size // 2
    end = center + args.modisco_window_size // 2

    print("Shap scores shape - {}".format(scores['hyp_scores'].shape))

    shap_scores = scores['hyp_scores'][:, start:end, :]
    one_hot_seqs = scores['input_seqs'][:, start:end, :]
    print("Done slicing shap scores and one hot seqs")

    proj_shap_scores = np.multiply(one_hot_seqs, shap_scores)
    print("Done computing projected shap scores")

    scores.close()

    tasks = ['task0']
    task_to_scores = OrderedDict()
    task_to_hyp_scores = OrderedDict()
    task_to_scores['task0'] = proj_shap_scores
    task_to_hyp_scores['task0'] = shap_scores

    tfmodisco_workflow = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
        sliding_window_size=21,
        flank_size=10,
        target_seqlet_fdr=0.05,
        seqlets_to_patterns_factory=(
            modisco.tfmodisco_workflow.seqlets_to_patterns
            .TfModiscoSeqletsToPatternsFactory(
                n_cores=10,
                embedder_factory=(
                    modisco.seqlet_embedding.advanced_gapped_kmer
                    .AdvancedGappedKmerEmbedderFactory()),
                trim_to_window_size=30,
                initial_flank_to_add=10,
                final_min_cluster_size=30)))

    tfmodisco_results = tfmodisco_workflow(
        task_names=["task0"],
        contrib_scores=task_to_scores,
        hypothetical_contribs=task_to_hyp_scores,
        one_hot=one_hot_seqs)

    modisco_results_path = '{}/modisco_results.h5'.format(
        args.output_directory)
    tfmodisco_results.save_hdf5(h5py.File(modisco_results_path, 'w'))
    print("Saved modisco results to file {}".format(
        str(modisco_results_path)))

    seqlet_path = '{}/seqlets.txt'.format(args.output_directory)
    print("Saving seqlets to %s" % seqlet_path)
    seqlets = \
        tfmodisco_results.metacluster_idx_to_submetacluster_results[0].seqlets
    bases = np.array(["A", "C", "G", "T"])
    with open(seqlet_path, "w") as f:
        for seqlet in seqlets:
            sequence = "".join(
                bases[np.argmax(seqlet["sequence"].fwd, axis=-1)])
            example_index = seqlet.coor.example_idx
            start, end = seqlet.coor.start, seqlet.coor.end
            f.write(">example%d:%d-%d\n" % (example_index, start, end))
            f.write(sequence + "\n")

    print("Saving pattern visualizations")
    patterns = (tfmodisco_results.metacluster_idx_to_submetacluster_results[0]
                .seqlets_to_patterns_result.patterns)

    # generate .pngs of each motif and write motif seqlets to
    # individual files
    for idx, pattern in enumerate(patterns):
        print(pattern)
        print("pattern idx", idx)
        print(len(pattern.seqlets))

        pattern_seqlet_path = os.path.join(
            args.output_directory, 'pattern{}_seqlets.txt'.format(idx))
        with open(pattern_seqlet_path, "w") as f:
            for seqlet in pattern.seqlets:
                sequence = "".join(
                    bases[np.argmax(seqlet["sequence"].fwd, axis=-1)])
                example_index = seqlet.coor.example_idx
                start, end = seqlet.coor.start, seqlet.coor.end
                f.write(">example%d:%d-%d\n" % (example_index, start, end))
                f.write(sequence + "\n")

        save_plot(pattern["task0_contrib_scores"].fwd,
                  '{}/contrib_{}.png'.format(args.output_directory, idx))
        save_plot(pattern["sequence"].fwd,
                  '{}/sequence_{}.png'.format(args.output_directory, idx))
def metrics_main():
    # parse the command line arguments
    parser = metrics_argsparser()
    args = parser.parse_args()

    # check if the output directory exists
    if not os.path.exists(args.output_dir):
        raise quietexception.QuietException(
            "Directory {} does not exist".format(args.output_dir))

    # check if the peaks file exists
    if args.peaks is not None and not os.path.exists(args.peaks):
        raise quietexception.QuietException(
            "File {} does not exist".format(args.peaks))

    # check if the bounds file exists
    if args.bounds_csv is not None and not os.path.exists(args.bounds_csv):
        raise quietexception.QuietException(
            "File {} does not exist".format(args.bounds_csv))

    # check if profile A exists
    if not os.path.exists(args.profileA):
        raise quietexception.QuietException(
            "File {} does not exist".format(args.profileA))

    # check if profile B exists
    if not os.path.exists(args.profileB):
        raise quietexception.QuietException(
            "File {} does not exist".format(args.profileB))

    # check if counts A exists
    if args.countsA is not None and not os.path.exists(args.countsA):
        raise quietexception.QuietException(
            "File {} does not exist".format(args.countsA))

    # check if counts B exists
    if args.countsB is not None and not os.path.exists(args.countsB):
        raise quietexception.QuietException(
            "File {} does not exist".format(args.countsB))

    # check if we need to auto generate the output directory
    if args.automate_filenames:
        # create a new directory using current date/time to store the
        # metrics outputs & logs
        date_time_str = local_datetime_str(args.time_zone)
        metrics_dir = '{}/{}'.format(args.output_dir, date_time_str)
        os.mkdir(metrics_dir)
    elif os.path.isdir(args.output_dir):
        metrics_dir = args.output_dir
    else:
        raise quietexception.QuietException(
            "{} is not a directory".format(args.output_dir))

    # filename to write debug logs
    logfname = "{}/metrics.log".format(metrics_dir)

    # set up the loggers
    init_logger(logfname)

    # read the bounds csv into a pandas DataFrame
    if args.bounds_csv is not None:
        logging.info("Loading lower and upper bounds ...")
        bounds_df = pd.read_csv(args.bounds_csv, header=0)
    else:
        bounds_df = None

    # check if peaks file has been supplied
    if args.peaks is not None:
        peaks_df = pd.read_csv(
            args.peaks, sep='\t', header=None,
            names=['chrom', 'st', 'end', 'name', 'score', 'strand',
                   'signal', 'p', 'q', 'summit'])

        # keep only those rows corresponding to the required chromosomes
        peaks_df = peaks_df[peaks_df['chrom'].isin(args.chroms)]

        # create new column for peak pos
        peaks_df['summit_pos'] = peaks_df['st'] + peaks_df['summit']

        # create new column for start pos
        peaks_df['start_pos'] = peaks_df['summit_pos'] - \
            args.metrics_seq_len // 2

        # create new column for end pos
        peaks_df['end_pos'] = peaks_df['summit_pos'] + \
            args.metrics_seq_len // 2

        # select only the chrom, start and end position columns
        allPositions = peaks_df[['chrom', 'start_pos', 'end_pos']]
        allPositions = allPositions.reset_index(drop=True)

    # else generate genome-wide positions
    else:
        allPositions = getChromPositions(
            args.chroms, args.chrom_sizes, args.metrics_seq_len // 2,
            args.step_size, mode='sequential', num_positions=-1)

    # check that there are exactly the same number of rows in the
    # bounds dataframe as in allPositions
    if bounds_df is not None and \
            (bounds_df.shape[0] != allPositions.shape[0]):
        raise quietexception.QuietException(
            "Bounds row count does not match chrom positions row count")

    # open the two bigWig files
    try:
        bigWigProfileA = pyBigWig.open(args.profileA)
        bigWigProfileB = pyBigWig.open(args.profileB)
        if args.countsA:
            bigWigCountsA = pyBigWig.open(args.countsA)
        if args.countsB:
            bigWigCountsB = pyBigWig.open(args.countsB)
    except Exception as e:
        logging.error("Problems occurred when opening one of the input "
                      "files: {}".format(str(e)))
        return

    # for pearson on counts
    countsA = []
    countsB = []

    # initialize arrays to hold metrics values
    array_len = len(allPositions.index)
    multinomial_nll = np.zeros(array_len, dtype=np.float64)
    ce = np.zeros(array_len, dtype=np.float64)
    jsd = np.zeros(array_len, dtype=np.float64)
    pearson = np.zeros(array_len, dtype=np.float64)
    spearman = np.zeros(array_len, dtype=np.float64)
    mse = np.zeros(array_len, dtype=np.float64)

    for idx, row in tqdm(allPositions.iterrows(),
                         total=allPositions.shape[0]):
        chrom = row['chrom']
        start = row['start_pos']
        end = row['end_pos']

        # get all the bounds values
        if bounds_df is not None:
            mnll_min = bounds_df.loc[idx, 'mnll_self']
            mnll_max = bounds_df.loc[idx, 'mnll_uniform']
            ce_min = bounds_df.loc[idx, 'ce_self']
            ce_max = bounds_df.loc[idx, 'ce_uniform']
            jsd_min = bounds_df.loc[idx, 'jsd_self']
            jsd_max = bounds_df.loc[idx, 'jsd_uniform']
            pearson_min = bounds_df.loc[idx, 'pearson_uniform']
            pearson_max = bounds_df.loc[idx, 'pearson_self']
            spearman_min = bounds_df.loc[idx, 'spearman_uniform']
            spearman_max = bounds_df.loc[idx, 'spearman_self']

        try:
            profileA = np.nan_to_num(
                np.array(bigWigProfileA.values(chrom, start, end)))
            profileB = np.nan_to_num(
                np.array(bigWigProfileB.values(chrom, start, end)))
        except Exception as e:
            raise quietexception.QuietException(
                "Error retrieving values {}, {}, {}".format(
                    chrom, start, end))

        if args.countsA:
            # since every base is assigned the total counts in the
            # region we have to take the mean
            valsCountsA = np.mean(np.nan_to_num(
                np.array(bigWigCountsA.values(chrom, start, end))))
        else:
            valsCountsA = np.sum(profileA)

        if args.countsB:
            # since every base is assigned the total counts in the
            # region we have to take the mean
            valsCountsB = np.mean(np.nan_to_num(
                np.array(bigWigCountsB.values(chrom, start, end))))
        else:
            valsCountsB = np.sum(profileB)

        # check to see if we fetched the correct number of values;
        # if the two array lengths don't match we can't compute the
        # metrics
        if len(profileA) != (end - start) or \
                len(profileB) != (end - start):
            logging.warning("Unable to fetch {} values on chrom {} from "
                            "{} to {}. Skipping.".format(
                                end - start, chrom, start, end))
            continue

        if sum(profileA) != 0:
            if args.apply_softmax_to_profileA:
                # we use log softmax to circumvent numerical instability
                # and then exponentiate
                probProfileA = profileA - logsumexp(profileA)
                probProfileA = np.exp(probProfileA)

                # we need actual counts to compute mse
                valsProfileA = np.multiply(valsCountsA, probProfileA)

                if len(args.smooth_profileA) > 0:
                    sigma = float(args.smooth_profileA[0])
                    width = float(args.smooth_profileA[1])
                    truncate = (((width - 1) / 2) - 0.5) / sigma

                    valsProfileA = gaussian_filter1d(
                        valsProfileA, sigma=sigma, truncate=truncate)

                    # recompute probabilities
                    probProfileA = valsProfileA / sum(valsProfileA)
            else:
                if args.smooth_profileA:
                    sigma = float(args.smooth_profileA[0])
                    width = float(args.smooth_profileA[1])
                    truncate = (((width - 1) / 2) - 0.5) / sigma

                    profileA = gaussian_filter1d(
                        profileA, sigma=sigma, truncate=truncate)

                # convert to probabilities by dividing by the sum
                probProfileA = profileA / sum(profileA)

                # if we are in the else block it implies profileA has
                # actual counts
                valsProfileA = profileA

        elif args.exclude_zero_profiles:
            continue

        else:
            # uniform distribution
            probProfileA = 1.0 / len(profileA) * np.ones(len(profileA),
                                                         dtype=np.float32)

        if sum(profileB) != 0:
            if args.apply_softmax_to_profileB:
                # we use log softmax to circumvent numerical instability
                # and then exponentiate
                probProfileB = profileB - logsumexp(profileB)
                probProfileB = np.exp(probProfileB)

                # we need actual counts to compute mse
                valsProfileB = np.multiply(valsCountsB, probProfileB)

                if len(args.smooth_profileB) > 0:
                    sigma = float(args.smooth_profileB[0])
                    width = float(args.smooth_profileB[1])
                    truncate = (((width - 1) / 2) - 0.5) / sigma

                    valsProfileB = gaussian_filter1d(
                        valsProfileB, sigma=sigma, truncate=truncate)

                    # recompute probabilities
                    probProfileB = valsProfileB / sum(valsProfileB)
            else:
                if args.smooth_profileB:
                    sigma = float(args.smooth_profileB[0])
                    width = float(args.smooth_profileB[1])
                    truncate = (((width - 1) / 2) - 0.5) / sigma

                    profileB = gaussian_filter1d(
                        profileB, sigma=sigma, truncate=truncate)

                # convert to probabilities by dividing by the sum
                probProfileB = profileB / sum(profileB)

                # if we are in the else block it implies profileB has
                # actual counts
                valsProfileB = profileB

        elif args.exclude_zero_profiles:
            continue

        else:
            # uniform distribution
            probProfileB = 1.0 / len(profileB) * np.ones(len(profileB),
                                                         dtype=np.float32)

        # pearson & spearman
        # with pearson we need to check if either of the arrays has zero
        # standard deviation (i.e. all elements are the same, whether zero
        # or any other value). Unfortunately np.std returns a very small
        # non-zero value in that case, so we use np.unique to check
        # whether the array holds a single value. If so, the pearson
        # correlation is undefined
        if np.unique(probProfileA).size == 1 or \
                np.unique(probProfileB).size == 1:
            pearson[idx] = 0
            spearman[idx] = 0
        else:
            pearson[idx] = pearsonr(valsProfileA, valsProfileB)[0]
            spearman[idx] = spearmanr(valsProfileA, valsProfileB)[0]

        # mnll
        multinomial_nll[idx] = mnll(valsProfileA, probs=probProfileB)

        # cross entropy
        ce[idx] = profile_cross_entropy(valsProfileA, probs=probProfileB)

        # jsd
        jsd[idx] = jensenshannon(probProfileA, probProfileB)

        # apply min max normalization
        if bounds_df is not None:
            multinomial_nll[idx] = get_min_max_normalized_value(
                multinomial_nll[idx], mnll_min, mnll_max)
            ce[idx] = get_min_max_normalized_value(ce[idx], ce_min, ce_max)
            jsd[idx] = get_min_max_normalized_value(jsd[idx], jsd_min,
                                                    jsd_max)
            pearson[idx] = get_min_max_normalized_value(
                pearson[idx], pearson_min, pearson_max)
            spearman[idx] = get_min_max_normalized_value(
                spearman[idx], spearman_min, spearman_max)

        # mse
        mse[idx] = np.square(np.subtract(valsProfileA, valsProfileB)).mean()

        # add to the counts lists
        countsA.append(np.sum(valsProfileA))
        countsB.append(np.sum(valsProfileB))

    counts_pearson = pearsonr(countsA, countsB)[0]
    counts_spearman = spearmanr(countsA, countsB)[0]

    logging.info("\t\tmin\t\tmax\t\tmedian")
    logging.info("mnll\t\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(multinomial_nll), np.max(multinomial_nll),
        np.median(multinomial_nll)))
    logging.info("cross_entropy\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(ce), np.max(ce), np.median(ce)))
    logging.info("jsd\t\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(jsd), np.max(jsd), np.median(jsd)))
    logging.info("pearson\t\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(pearson), np.max(pearson), np.median(pearson)))
    logging.info("spearman\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(spearman), np.max(spearman), np.median(spearman)))
    logging.info("mse\t\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(mse), np.max(mse), np.median(mse)))
    logging.info("==============================================")
    logging.info("counts pearson: {}".format(counts_pearson))
    logging.info("counts spearman: {}".format(counts_spearman))

    np.savez_compressed('{}/mnll'.format(metrics_dir), mnll=multinomial_nll)
    np.savez_compressed('{}/cross_entropy'.format(metrics_dir),
                        cross_entropy=ce)
    np.savez_compressed('{}/mse'.format(metrics_dir), mse=mse)
    np.savez_compressed('{}/pearson'.format(metrics_dir), pearson=pearson)
    np.savez_compressed('{}/spearman'.format(metrics_dir), spearman=spearman)
    np.savez_compressed('{}/jsd'.format(metrics_dir), jsd=jsd)
    np.savez_compressed('{}/counts_pearson'.format(metrics_dir),
                        counts_pearson=counts_pearson)
    np.savez_compressed('{}/counts_spearman'.format(metrics_dir),
                        counts_spearman=counts_spearman)

    # write all the command line arguments to a json file
    config_file = '{}/config.json'.format(metrics_dir)
    with open(config_file, 'w') as fp:
        json.dump(vars(args), fp)
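# metrics_main above rescales each metric with get_min_max_normalized_value
# using the per-region lower/upper bounds from the bounds csv. That helper
# is defined elsewhere in the repo; a plausible minimal version of the same
# idea (plain min-max scaling clipped to [0, 1]) is sketched below for
# illustration and is not necessarily the exact implementation used.
def _min_max_normalize_sketch(value, min_value, max_value):
    """Map value into [0, 1] given lower/upper bounds for the region"""
    if max_value == min_value:
        return 0.0
    scaled = (value - min_value) / (max_value - min_value)
    return float(np.clip(scaled, 0.0, 1.0))
# e.g. a raw jsd of 0.42 with bounds jsd_self=0.05 and jsd_uniform=0.80
# normalizes to (0.42 - 0.05) / (0.80 - 0.05) ~= 0.49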
def interpret_main():
    # parse the command line arguments
    parser = interpret_argsparser()
    args = parser.parse_args()

    # check if the output directory exists
    if not os.path.exists(args.output_directory):
        raise quietexception.QuietException(
            "Directory {} does not exist".format(args.output_directory))

    # check if the output directory is a directory path
    if not os.path.isdir(args.output_directory):
        raise quietexception.QuietException(
            "{} is not a directory".format(args.output_directory))

    # check if the reference genome file exists
    if not os.path.exists(args.reference_genome):
        raise quietexception.QuietException(
            "File {} does not exist".format(args.reference_genome))

    # check if the model file exists
    if not os.path.exists(args.model):
        raise quietexception.QuietException(
            "File {} does not exist".format(args.model))

    # check if the bed file exists
    if not os.path.exists(args.bed_file):
        raise quietexception.QuietException(
            "File {} does not exist".format(args.bed_file))

    # if controls are specified check if the control_info json exists
    if args.control_info is not None:
        if not os.path.exists(args.control_info):
            raise quietexception.QuietException(
                "Input data file {} does not exist".format(
                    args.control_info))

    # check if both args.chroms and args.sample are specified, only
    # one of the two is allowed
    if args.chroms is not None and args.sample is not None:
        raise quietexception.QuietException(
            "Only one of [--chroms, --sample] is allowed")

    if args.automate_filenames:
        # create a new directory using current date/time to store the
        # interpretation scores
        date_time_str = local_datetime_str(args.time_zone)
        interpret_dir = '{}/{}'.format(args.output_directory, date_time_str)
        os.mkdir(interpret_dir)
    else:
        interpret_dir = args.output_directory

    # filename to write debug logs
    logfname = "{}/interpret.log".format(interpret_dir)

    # set up the loggers
    init_logger(logfname)

    # interpret
    logging.info("Loading {}".format(args.model))
    with CustomObjectScope(
            {'MultichannelMultinomialNLL': MultichannelMultinomialNLL}):
        interpret(args, interpret_dir)
def interpret(args, interpret_dir):
    # load the model
    model = load_model(args.model)

    # read all the peaks into a pandas dataframe
    peaks_df = pd.read_csv(
        args.bed_file, sep='\t', header=None,
        names=['chrom', 'st', 'end', 'name', 'score', 'strand',
               'signalValue', 'p', 'q', 'summit'])

    if args.chroms is not None:
        # keep only those rows corresponding to the required chromosomes
        peaks_df = peaks_df[peaks_df['chrom'].isin(args.chroms)]

    if args.sample is not None:
        # randomly sample rows
        logging.info("Sampling {} rows from {}".format(
            args.sample, args.bed_file))
        peaks_df = peaks_df.sample(n=args.sample, random_state=args.seed)

    if args.presort_bed_file:
        # sort the bed file in descending order of peak strength
        peaks_df = peaks_df.sort_values(['signalValue'], ascending=False)

    # reset index (if any of the above 3 filters have been applied,
    # no harm if they haven't)
    peaks_df = peaks_df.reset_index(drop=True)

    # get final number of peaks
    num_peaks = peaks_df.shape[0]

    # reference file to fetch sequences
    logging.info("Opening reference file ...")
    fasta_ref = pysam.FastaFile(args.reference_genome)

    # if controls have been specified we need to open the control
    # files for reading
    control_bigWigs = []
    if args.control_info is not None:
        # load the control info json file
        with open(args.control_info, 'r') as inp_json:
            try:
                input_data = json.loads(inp_json.read())
            except Exception as e:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                raise quietexception.QuietException(
                    exc_type.__name__ + ' ' + str(exc_value))

        logging.info("Opening control bigWigs ...")
        # get the control bigWig for each task
        for task in input_data:
            if input_data[task]['task_id'] == args.task_id:
                if 'control' in input_data[task].keys():
                    control_bigWig_path = input_data[task]['control']

                    # check if the file exists
                    if not os.path.exists(control_bigWig_path):
                        raise quietexception.QuietException(
                            "File {} does not exist".format(
                                control_bigWig_path))

                    logging.info(control_bigWig_path)

                    # open the bigWig and add the file object to the list
                    control_bigWigs.append(
                        pyBigWig.open(control_bigWig_path))

    # log of sum of counts of the control track
    # if multiple control files are specified this would be
    # log(sum(position_wise_sum_from_all_files))
    bias_counts_input = np.zeros((num_peaks, 1))

    # the control profile and the smoothed version of the control
    # profile (1 + 1 = 2, always :) )
    # if multiple control files are specified, the control profile for
    # each sample would be position_wise_sum_from_all_files
    bias_profile_input = np.zeros((num_peaks, args.control_len, 2))

    ## IF NO CONTROL BIGWIGS ARE SPECIFIED THEN THE TWO NUMPY ARRAYS
    ## bias_counts_input AND bias_profile_input WILL REMAIN ZEROS

    # list to hold all the sequences for the peaks
    sequences = []

    # list of valid rows, recording only the peaks on which the
    # contribution scores are computed and excluding those that raised
    # exceptions; later we'll convert these rows to a dataframe and
    # write it out to a new file
    rows = []

    # iterate through all the peaks
    for idx, row in peaks_df.iterrows():

        # peak interval based on 'summit' position
        start = row['st'] + row['summit'] - (args.input_seq_len // 2)
        end = row['st'] + row['summit'] + (args.input_seq_len // 2)

        # fetch the reference sequence at the peak location
        try:
            seq = fasta_ref.fetch(row['chrom'], start, end).upper()
        except ValueError:  # start/end out of range
            logging.warning("Unable to fetch reference sequence at peak: "
                            "{} {}-{}. Skipped.".format(
                                row['chrom'], start, end))
            continue

        # check if we have the required length
        if len(seq) != args.input_seq_len:
            logging.warning("Reference genome doesn't have required "
                            "sequence length ({}) at peak: {} {}-{}. "
                            "Returned length {}. Skipped.".format(
                                args.input_seq_len, row['chrom'], start,
                                end, len(seq)))
            continue

        # fetch control values
        if len(control_bigWigs) > 0:
            # a different start and end for controls since control_len
            # is usually not the same as input_seq_len
            start = row['st'] + row['summit'] - (args.control_len // 2)
            end = row['st'] + row['summit'] + (args.control_len // 2)

            # read the values from the control bigWigs
            for i in range(len(control_bigWigs)):
                vals = np.nan_to_num(
                    control_bigWigs[i].values(row['chrom'], start, end))
                bias_counts_input[idx, 0] += np.sum(vals)
                bias_profile_input[idx, :, 0] += vals

            # we need to take the log of the sum of counts
            # we add 1 to avoid taking log of 0
            # same as mseqgen does while generating batches
            bias_counts_input[idx, 0] = np.log(bias_counts_input[idx, 0] + 1)

            # compute the smoothed control profile
            sigma = float(args.control_smoothing[0])
            window_width = int(args.control_smoothing[1])
            bias_profile_input[idx, :, 1] = gaussian1D_smoothing(
                bias_profile_input[idx, :, 0], sigma, window_width)

        sequences.append(seq)

        # row passes all exception handling
        rows.append(dict(row))

    # if null distribution is requested
    null_sequences = []
    if args.gen_null_dist:
        logging.info("generating null sequences ...")
        rng = np.random.RandomState(args.seed)

        # iterate over sequences and get the dinucleotide shuffled
        # sequence for each of them
        for seq in sequences:
            # get a list of shuffled seqs. Since we are setting
            # num_shufs to 1, the returned list will be of size 1
            shuffled_seqs = dinuc_shuffle(seq, 1, rng)
            null_sequences.append(shuffled_seqs[0])

        # null sequences are now our actual sequences
        sequences = null_sequences[:]

    # one hot encode all the sequences
    X = one_hot_encode(sequences)
    print(X.shape)

    # inline function to handle dinucleotide shuffling
    def data_func(model_inputs):
        rng = np.random.RandomState(args.seed)
        return [dinuc_shuffle(model_inputs[0], args.num_shuffles, rng)] + \
            [
                np.tile(
                    np.zeros_like(model_inputs[i]),
                    (args.num_shuffles,) + (len(model_inputs[i].shape) * (1,))
                ) for i in range(1, len(model_inputs))
            ]

    # shap explainer for the counts head
    profile_model_counts_explainer = shap.explainers.deep.TFDeepExplainer(
        ([model.input[0], model.input[1]],
         tf.reduce_sum(model.outputs[1], axis=-1)),
        data_func,
        combine_mult_and_diffref=combine_mult_and_diffref)

    # explainer for the profile head
    weightedsum_meannormed_logits = get_weightedsum_meannormed_logits(
        model, task_id=args.task_id, stranded=True)
    profile_model_profile_explainer = shap.explainers.deep.TFDeepExplainer(
        ([model.input[0], model.input[2]], weightedsum_meannormed_logits),
        data_func,
        combine_mult_and_diffref=combine_mult_and_diffref)

    logging.info("Generating 'counts' shap scores")
    counts_shap_scores = profile_model_counts_explainer.shap_values(
        [X, bias_counts_input], progress_message=100)

    # construct a dictionary for the 'counts' shap scores & the
    # projected 'counts' shap scores
    # MODISCO workflow expects one hot sequences with shape (?, 4, 1000)
    projected_shap_scores = np.multiply(X, counts_shap_scores[0])
    counts_scores = {
        'raw': {'seq': np.transpose(X, (0, 2, 1))},
        'shap': {'seq': np.transpose(counts_shap_scores[0], (0, 2, 1))},
        'projected_shap': {'seq': np.transpose(projected_shap_scores,
                                               (0, 2, 1))}
    }

    # save the dictionary in HDF5 format
    logging.info("Saving 'counts' scores")
    dd.io.save('{}/counts_scores.h5'.format(interpret_dir), counts_scores)

    logging.info("Generating 'profile' shap scores")
    profile_shap_scores = profile_model_profile_explainer.shap_values(
        [X, bias_profile_input], progress_message=100)

    # construct a dictionary for the 'profile' shap scores & the
    # projected 'profile' shap scores
    projected_shap_scores = np.multiply(X, profile_shap_scores[0])
    profile_scores = {
        'raw': {'seq': np.transpose(X, (0, 2, 1))},
        'shap': {'seq': np.transpose(profile_shap_scores[0], (0, 2, 1))},
        'projected_shap': {'seq': np.transpose(projected_shap_scores,
                                               (0, 2, 1))}
    }

    # save the dictionary in HDF5 format
    logging.info("Saving 'profile' scores")
    dd.io.save('{}/profile_scores.h5'.format(interpret_dir), profile_scores)

    # create dataframe from all rows that were successfully processed
    df_valid_scores = pd.DataFrame(rows)

    # save the dataframe as a new .bed file
    df_valid_scores.to_csv(
        '{}/peaks_valid_scores.bed'.format(interpret_dir),
        sep='\t', header=False, index=False)

    # write all the command line arguments to a json file
    config_file = '{}/config.json'.format(interpret_dir)
    with open(config_file, 'w') as fp:
        config = vars(args)
        json.dump(config, fp)
def __init__(self, input_params, batch_gen_params, reference_genome,
             chrom_sizes, chroms, num_threads, epochs, batch_size):

    # sampling mode to get chromosome positions
    self.sampling_mode = batch_gen_params['sampling_mode']

    # ML task mode "train", "val" or "test"
    self.mode = batch_gen_params['mode']

    # check if at least one of the two input modes is present
    if not os.path.isdir(input_params['data']) and \
            os.path.splitext(input_params['data'])[1] != '.json':
        raise quietexception.QuietException(
            "Either input directory or input json must be specified. "
            "None found.")

    # load the input tasks either from the input dir or from
    # the input json
    if os.path.isdir(input_params['data']):
        self.tasks = sequtils.getInputTasks(
            input_params['data'],
            stranded=input_params['stranded'],
            has_control=input_params['has_control'],
            require_peaks=(self.sampling_mode == 'peaks'),
            mode=self.mode)
    else:
        # make sure the input_data json file exists
        if not os.path.isfile(input_params['data']):
            raise quietexception.QuietException(
                "File not found: {}".format(input_params['data']))

        with open(input_params['data'], 'r') as inp_json:
            self.tasks = json.loads(inp_json.read())

    # check if the reference genome file exists
    if not os.path.isfile(reference_genome):
        raise quietexception.QuietException(
            "File not found: {}".format(reference_genome))

    # check if the chrom_sizes file exists
    if not os.path.isfile(chrom_sizes):
        raise quietexception.QuietException(
            "File not found: {}".format(chrom_sizes))

    self.num_tasks = len(list(self.tasks.keys()))
    self.reference = reference_genome

    # read the chrom sizes into a dataframe
    self.chrom_sizes_df = pd.read_csv(chrom_sizes, sep='\t', header=None,
                                      names=['chrom', 'size'])

    # chromosome list
    self.chroms = chroms

    # keep only those chrom_sizes rows corresponding to the
    # required chromosomes
    self.chrom_sizes_df = self.chrom_sizes_df[
        self.chrom_sizes_df['chrom'].isin(self.chroms)]

    # generate a new column for sampling weights of the chromosomes
    self.chrom_sizes_df['weights'] = (self.chrom_sizes_df['size'] /
                                      self.chrom_sizes_df['size'].sum())

    self.num_threads = num_threads
    self.epochs = epochs
    self.batch_size = batch_size

    # rest of batch generation parameters
    self.input_flank = batch_gen_params['input_seq_len'] // 2
    self.output_flank = batch_gen_params['output_len'] // 2
    self.max_jitter = batch_gen_params['max_jitter']
    self.negative_sampling_rate = batch_gen_params['negative_sampling_rate']
    self.rev_comp_aug = batch_gen_params['rev_comp_aug']
    self.shuffle = batch_gen_params['shuffle']

    # control batch generation for the next epoch
    # if the value is not set to True, batches are not generated
    # Use an external controller to set the value to True/False
    self.ready_for_next_epoch = False

    # (early) stopping flag
    self.stop = False

    if self.sampling_mode == 'peaks':
        # get a pandas dataframe for the peak positions
        # Note - we need the 'tasks' dictionary so we can access
        # the peaks.bed files from the paths available in the dictionary
        self.data = sequtils.getPeakPositions(
            self.tasks, self.chroms,
            self.chrom_sizes_df[['chrom', 'size']], self.input_flank)

    elif self.sampling_mode == 'sequential':
        if 'num_positions' not in batch_gen_params:
            raise quietexception.QuietException(
                "Key not found in batch_gen_params_json: 'num_positions'. "
                "Required for sequential sampling mode")

        if 'step_size' not in batch_gen_params:
            raise quietexception.QuietException(
                "Key not found in batch_gen_params_json: 'step_size'. "
                "Required for sequential sampling mode")

        # get a pandas dataframe with sequential positions at
        # regular intervals
        self.data = sequtils.getChromPositions(
            self.chroms, self.chrom_sizes_df[['chrom', 'size']],
            self.input_flank, mode=self.sampling_mode,
            num_positions=batch_gen_params['num_positions'],
            step=batch_gen_params['step_size'])

        self.max_jitter = 0

    elif self.sampling_mode == 'random':
        if 'num_positions' not in batch_gen_params:
            raise quietexception.QuietException(
                "Key not found in batch_gen_params_json: 'num_positions'. "
                "Required for random sampling mode")

        # get a pandas dataframe with random positions
        self.data = sequtils.getChromPositions(
            self.chroms, self.chrom_sizes_df[['chrom', 'size']],
            self.input_flank, mode=self.sampling_mode,
            num_positions=batch_gen_params['num_positions'])

        self.max_jitter = 0
def modisco_main():
    parser = modisco_argsparser()
    args = parser.parse_args()

    if not os.path.exists(args.scores_path):
        raise quietexception.QuietException(
            "Score file {} does not exist".format(args.scores_path))

    # if not os.path.exists(args.scores_locations):
    #     raise quietexception.QuietException(
    #         "Scores locations file {} does not exist".format(
    #             args.scores_locations))

    if not os.path.exists(args.output_directory):
        raise quietexception.QuietException(
            "Output directory {} does not exist".format(
                args.output_directory))

    # load the scores
    scores = deepdish.io.load(args.scores_path)

    shap_scores_seq = []
    proj_shap_scores_seq = []
    one_hot_seqs = []

    center = int(scores['shap']['seq'].shape[-1] / 2)
    start = center - 200
    end = center + 200

    for i in scores['shap']['seq']:
        shap_scores_seq.append(i[:, start:end].transpose())

    for i in scores['projected_shap']['seq']:
        proj_shap_scores_seq.append(i[:, start:end].transpose())

    for i in scores['raw']['seq']:
        one_hot_seqs.append(i[:, start:end].transpose())

    tasks = ['task0']
    task_to_scores = OrderedDict()
    task_to_hyp_scores = OrderedDict()

    onehot_data = one_hot_seqs
    task_to_scores['task0'] = proj_shap_scores_seq
    task_to_hyp_scores['task0'] = shap_scores_seq

    # track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
    #     task_names=["task0"],
    #     contrib_scores=task_to_scores,
    #     hypothetical_contribs=task_to_hyp_scores,
    #     one_hot=onehot_data)

    tfmodisco_workflow = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
        sliding_window_size=21,
        flank_size=10,
        target_seqlet_fdr=0.05,
        seqlets_to_patterns_factory=(
            modisco.tfmodisco_workflow.seqlets_to_patterns
            .TfModiscoSeqletsToPatternsFactory(
                embedder_factory=(
                    modisco.seqlet_embedding.advanced_gapped_kmer
                    .AdvancedGappedKmerEmbedderFactory()),
                trim_to_window_size=30,
                initial_flank_to_add=10,
                final_min_cluster_size=30)))

    tfmodisco_results = tfmodisco_workflow(
        task_names=["task0"],
        contrib_scores=task_to_scores,
        hypothetical_contribs=task_to_hyp_scores,
        one_hot=onehot_data)

    modisco_results_path = '{}/modisco_results.h5'.format(
        args.output_directory)
    tfmodisco_results.save_hdf5(h5py.File(modisco_results_path, 'w'))
    print("Saved modisco results to file {}".format(
        str(modisco_results_path)))

    seqlet_path = '{}/seqlets.txt'.format(args.output_directory)
    print("Saving seqlets to %s" % seqlet_path)
    seqlets = \
        tfmodisco_results.metacluster_idx_to_submetacluster_results[0].seqlets
    bases = np.array(["A", "C", "G", "T"])
    with open(seqlet_path, "w") as f:
        for seqlet in seqlets:
            sequence = "".join(
                bases[np.argmax(seqlet["sequence"].fwd, axis=-1)])
            example_index = seqlet.coor.example_idx
            start, end = seqlet.coor.start, seqlet.coor.end
            f.write(">example%d:%d-%d\n" % (example_index, start, end))
            f.write(sequence + "\n")

    print("Saving pattern visualizations")
    patterns = (tfmodisco_results.metacluster_idx_to_submetacluster_results[0]
                .seqlets_to_patterns_result.patterns)

    # generate .pngs of each motif and write motif seqlets to
    # individual files
    for idx, pattern in enumerate(patterns):
        print(pattern)
        print("pattern idx", idx)
        print(len(pattern.seqlets))

        pattern_seqlet_path = os.path.join(
            args.output_directory, 'pattern{}_seqlets.txt'.format(idx))
        with open(pattern_seqlet_path, "w") as f:
            for seqlet in pattern.seqlets:
                sequence = "".join(
                    bases[np.argmax(seqlet["sequence"].fwd, axis=-1)])
                example_index = seqlet.coor.example_idx
                start, end = seqlet.coor.start, seqlet.coor.end
                f.write(">example%d:%d-%d\n" % (example_index, start, end))
                f.write(sequence + "\n")

        save_plot(pattern["task0_contrib_scores"].fwd,
                  '{}/contrib_{}.png'.format(args.output_directory, idx))
        save_plot(pattern["sequence"].fwd,
                  '{}/sequence_{}.png'.format(args.output_directory, idx))
def logits2profile_main():
    # parse the command line arguments
    parser = logits2profile_argsparser()
    args = parser.parse_args()

    # check if the output directory exists
    if not os.path.exists(args.output_directory):
        raise quietexception.QuietException(
            "Directory {} does not exist".format(args.output_directory))

    # check if the logits file exists
    if not os.path.exists(args.logits_file):
        raise quietexception.QuietException(
            "Logits file {} does not exist".format(args.logits_file))

    # check if the counts file exists
    if not os.path.exists(args.counts_file):
        raise quietexception.QuietException(
            "Counts file {} does not exist".format(args.counts_file))

    # check if the peaks file exists
    if not os.path.exists(args.peaks):
        raise quietexception.QuietException(
            "Peaks file {} does not exist".format(args.peaks))

    # check if the chrom sizes file exists
    if not os.path.exists(args.chrom_sizes):
        raise quietexception.QuietException(
            "Chromosome sizes file {} does not exist".format(
                args.chrom_sizes))

    # construct header for the output bigWig file
    header = []

    # dataframe with chromosome sizes
    chrom_sizes_df = pd.read_csv(args.chrom_sizes, sep='\t', header=None,
                                 names=['chrom', 'size'])
    chrom_sizes_df = chrom_sizes_df.set_index('chrom')

    # sort chromosomes, to be consistent with how pandas sorts
    # chromosomes ... e.g. chrom21 is < chrom8
    chroms = args.chroms[:]
    chroms.sort()
    for chrom in chroms:
        size = chrom_sizes_df.at[chrom, 'size']
        header.append((chrom, int(size)))

    logging.debug("bigWig HEADER - {}".format(header))

    # open logits bigWig for reading
    logits_bigWig = pyBigWig.open(args.logits_file)

    # open counts bigWig for reading
    counts_bigWig = pyBigWig.open(args.counts_file)

    # open output bigWig for writing
    output_bigWig_fname = '{}/{}.bw'.format(args.output_directory,
                                            args.output_filename)
    output_bigWig = pyBigWig.open(output_bigWig_fname, 'w')

    # add the header to the output bigWig file
    output_bigWig.addHeader(header, maxZooms=0)

    # read the peaks file into a dataframe
    peaks_df = pd.read_csv(args.peaks, usecols=[0, 1, 2],
                           names=['chrom', 'start', 'end'], header=None,
                           sep='\t')
    peaks_df = peaks_df[peaks_df['chrom'].isin(args.chroms)]
    peaks_df['_start'] = peaks_df['start'] + \
        (peaks_df['end'] - peaks_df['start']) // 2 - \
        args.window_size // 2
    peaks_df['_end'] = peaks_df['_start'] + args.window_size
    peaks_df = peaks_df.sort_values(by=['chrom', '_start'])
    print(peaks_df)

    # maintain a dictionary to record chrom coordinates that have been
    # written to the output bigWig; this makes overlapping coordinates
    # easy to handle. pyBigWig's addEntries function will complain if you
    # write to a position to which an entry was already added previously.
    # Note: since chromosomes are sorted we can delete the previous
    # chromosome's entries to save memory
    write_log = {}
    prev_chrom = ''

    for index, row in tqdm(peaks_df.iterrows(), total=peaks_df.shape[0]):
        chrom = row['chrom']
        start = row['_start']
        end = row['_end']

        # delete write log entries of the previous chromosome
        if chrom != prev_chrom:
            write_log.pop(prev_chrom, None)
            # new dict for new chrom
            write_log[chrom] = {}
            prev_chrom = chrom

        try:
            logits_vals = np.nan_to_num(
                logits_bigWig.values(chrom, start, end))
        except RuntimeError as e:
            # get current system exception
            ex_type, ex_value, ex_traceback = sys.exc_info()
            print("Skipping peak ({}, {}, {}) in logits bigWig. No data "
                  "found. Make sure to use the same peaks and "
                  "output-window-size that were used in the predict "
                  "step".format(chrom, start, end))
            continue

        try:
            counts_vals = np.nan_to_num(
                counts_bigWig.values(chrom, start, end))
        except RuntimeError as e:
            # get current system exception
            ex_type, ex_value, ex_traceback = sys.exc_info()
            print("Skipping peak ({}, {}, {}) in counts bigWig. No data "
                  "found. Make sure to use the same peaks and "
                  "output-window-size that were used in the predict "
                  "step".format(chrom, start, end))
            continue

        chroms = [chrom] * args.window_size
        starts = list(range(start, end, 1))
        ends = list(range(start + 1, end + 1, 1))

        # scale logits: first softmax, then multiply by counts
        probVals = logits_vals - logsumexp(logits_vals)
        probVals = np.exp(probVals)
        profile = np.multiply(counts_vals, probVals)

        for i in range(len(chroms)):
            try:
                _ = write_log[chroms[i]][starts[i]]
            except KeyError as e:
                # write to bigWig only if the location was not
                # written to before
                output_bigWig.addEntries(
                    [chroms[i]], [starts[i]], ends=[ends[i]],
                    values=[profile[i]])

                # add entry into write log
                write_log[chroms[i]][starts[i]] = 0
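# The logits-to-profile conversion above is plain softmax followed by
# scaling with the predicted total counts. A toy, self-contained version of
# that step (the values are made up for illustration; numpy and
# scipy.special.logsumexp are assumed imported as above):
def _example_logits_to_profile():
    # per-base logits and a counts track that repeats the window total
    logits_vals = np.array([0.2, 1.1, -0.5, 0.7])
    counts_vals = np.full(4, 120.0)

    prob_vals = np.exp(logits_vals - logsumexp(logits_vals))
    profile = np.multiply(counts_vals, prob_vals)

    # per-base predicted counts; sums back to ~120
    print(profile, profile.sum())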
def main():
    # change the way processes are started, default = 'fork'
    # had to do this to prevent Keras multi gpu from deadlocking
    mp.set_start_method('forkserver')

    # inform user of the keras stderr log file
    logging.warning("For all keras related error logs refer to "
                    "keras.stderr in your local directory")

    # parse the command line arguments
    parser = argparsers.training_argsparser()
    args = parser.parse_args()

    # input params
    input_params = {}
    input_params['data'] = args.input_data
    input_params['stranded'] = args.stranded
    input_params['has_control'] = args.has_control

    # output params
    output_params = {}
    output_params['automate_filenames'] = args.automate_filenames
    output_params['time_zone'] = args.time_zone
    output_params['tag_length'] = args.tag_length
    output_params['output_dir'] = args.output_dir
    output_params['model_output_filename'] = args.model_output_filename

    # genome params
    genome_params = {}
    genome_params['reference_genome'] = args.reference_genome
    genome_params['chrom_sizes'] = args.chrom_sizes
    genome_params['chroms'] = args.chroms
    genome_params['exclude_chroms'] = args.exclude_chroms

    # batch generation parameters
    batch_gen_params = {}
    batch_gen_params['sequence_generator_name'] = \
        args.sequence_generator_name
    batch_gen_params['input_seq_len'] = args.input_seq_len
    batch_gen_params['output_len'] = args.output_len
    batch_gen_params['sampling_mode'] = args.sampling_mode
    batch_gen_params['rev_comp_aug'] = args.reverse_complement_augmentation
    batch_gen_params['negative_sampling_rate'] = args.negative_sampling_rate
    batch_gen_params['max_jitter'] = args.max_jitter
    batch_gen_params['shuffle'] = args.shuffle

    # hyper parameters
    hyper_params = {}
    hyper_params['epochs'] = args.epochs
    hyper_params['batch_size'] = args.batch_size
    hyper_params['learning_rate'] = args.learning_rate
    hyper_params['min_learning_rate'] = args.min_learning_rate
    hyper_params['early_stopping_patience'] = args.early_stopping_patience
    hyper_params['early_stopping_min_delta'] = args.early_stopping_min_delta
    hyper_params['reduce_lr_on_plateau_patience'] = \
        args.reduce_lr_on_plateau_patience

    # parallelization params
    parallelization_params = {}
    parallelization_params['threads'] = args.threads
    parallelization_params['gpus'] = args.gpus

    # network params
    network_params = {}
    network_params['name'] = args.model_arch_name
    network_params['filters'] = args.filters
    network_params['counts_loss_weight'] = args.counts_loss_weight
    network_params['control_smoothing'] = args.control_smoothing

    if not os.path.exists(output_params['output_dir']):
        raise quietexception.QuietException(
            "Directory {} does not exist".format(
                output_params['output_dir']))

    if not output_params['automate_filenames'] and \
            output_params['model_output_filename'] is None:
        raise quietexception.QuietException(
            "Model output filename not specified")

    if not os.path.exists(genome_params['reference_genome']):
        raise quietexception.QuietException(
            "Reference genome file {} does not exist".format(
                genome_params['reference_genome']))

    if not os.path.exists(genome_params['chrom_sizes']):
        raise quietexception.QuietException(
            "Chromosome sizes file {} does not exist".format(
                genome_params['chrom_sizes']))

    try:
        get_model = getattr(model_archs, network_params['name'])
    except AttributeError:
        raise quietexception.QuietException(
            "Network {} not found in model definitions".format(
                network_params['name']))

    if not os.path.isfile(args.splits):
        raise quietexception.QuietException(
            "File not found: {}".format(args.splits))

    # load splits from json file
    with open(args.splits, "r") as splits_json:
        splits = json.loads(splits_json.read())

    # training and validation
    training.train_and_validate_ksplits(
        input_params, output_params, genome_params, batch_gen_params,
        hyper_params, parallelization_params, network_params, splits)
def getChromPositions(chroms, chrom_sizes, flank, mode='sequential',
                      num_positions=-1, step=50):
    """
        Chromosome positions spanning the entire chromosome at
        a) regular intervals or b) random locations

        Args:
            chroms (list): the list of required chromosomes
            chrom_sizes (pandas.DataFrame): dataframe of chromosome
                sizes with 'chrom' and 'size' columns
            flank (int): buffer size before & after the position to
                ensure we don't fetch values at index < 0 & > chrom size
            mode (str): mode of returned positions, 'sequential' (from
                the beginning) or 'random'
            num_positions (int): number of chromosome positions to
                return on each chromosome, use -1 to return positions
                across the entire chromosome for all given chromosomes
                in `chroms`. mode='random' cannot be used with
                num_positions=-1
            step (int): the interval between consecutive chromosome
                positions

        Returns:
            pandas.DataFrame: two column dataframe of chromosome
                positions (chrom, pos)
    """

    if mode == 'random' and num_positions == -1:
        raise quietexception.QuietException(
            "Incompatible parameter pairing: 'mode' = random, "
            "'num_positions' = -1")

    # check if chrom_sizes has a column called 'chrom'
    if 'chrom' not in chrom_sizes.columns:
        logging.error("Expected column 'chrom' not found in chrom_sizes")
        return None

    chrom_sizes = chrom_sizes.set_index('chrom')

    # initialize an empty dataframe with 'chrom' and 'pos' columns
    positions = pd.DataFrame(columns=['chrom', 'pos'])

    # for each chromosome in the list
    for i in range(len(chroms)):
        chrom_size = chrom_sizes.at[chroms[i], 'size']

        # keep start & end within bounds
        start = flank
        end = chrom_size - flank + 1

        if mode == 'random':
            # randomly sample positions
            pos_array = np.random.randint(start, end, num_positions)

        if mode == 'sequential':
            _end = end
            if num_positions != -1:
                # change the last position based on the number of
                # required positions
                _end = start + step * num_positions

                # if the newly computed 'end' goes beyond the
                # chromosome end (we could throw an error here)
                if _end > end:
                    _end = end

            # positions at regular intervals
            pos_array = list(range(start, _end, step))

        # construct a dataframe for this chromosome
        chrom_df = pd.DataFrame({'chrom': [chroms[i]] * len(pos_array),
                                 'pos': pos_array})

        # concatenate to the existing dataframe
        positions = pd.concat([positions, chrom_df])

    return positions
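# A short usage sketch for getChromPositions; the chromosome sizes below
# are the GRCh38 lengths of chr20/chr21, used here only as an example.
def _example_chrom_positions():
    chrom_sizes = pd.DataFrame({'chrom': ['chr20', 'chr21'],
                                'size': [64444167, 46709983]})

    # 1000 random positions per chromosome, keeping a 1 kb flank
    positions = getChromPositions(['chr20', 'chr21'], chrom_sizes,
                                  flank=1000, mode='random',
                                  num_positions=1000)
    print(positions.head())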
def bounds_main():
    """ The main entry point for the bounds computation script """

    # parse the command line arguments
    parser = bounds_argsparser()
    args = parser.parse_args()

    # check if the output directory exists
    if not os.path.exists(args.output_directory):
        raise quietexception.QuietException(
            "Directory {} does not exist".format(args.output_directory))

    # check to make sure at least one input profile was provided
    if len(args.input_profiles) == 0:
        raise quietexception.QuietException(
            "At least one input file is required to compute upper and "
            "lower bounds")

    # check to see if the number of output names is equal to the number
    # of input profiles that were provided
    if len(args.output_names) != len(args.input_profiles):
        raise quietexception.QuietException(
            "There should be the same number of output names as the "
            "number of input files")

    # check if each input profile bigWig file exists
    for fname in args.input_profiles:
        if not os.path.exists(fname):
            raise quietexception.QuietException(
                "File not found! {}".format(fname))

    # check if the peaks file exists
    if not os.path.exists(args.peaks):
        raise quietexception.QuietException(
            "Peaks file {} does not exist".format(args.peaks))

    # read the peaks bed file into a pandas dataframe
    peaks_df = pd.read_csv(
        args.peaks, sep='\t', header=None,
        names=['chrom', 'st', 'en', 'name', 'score', 'strand',
               'signalValue', 'p', 'q', 'summit'])

    # if the --chroms parameter is provided, filter the dataframe rows
    if args.chroms is not None:
        peaks_df = peaks_df[peaks_df['chrom'].isin(args.chroms)]

    # modified start and end based on summit & specified peak_width
    peaks_df['start'] = peaks_df['st'] + peaks_df['summit'] - \
        (args.peak_width // 2)
    peaks_df['end'] = peaks_df['st'] + peaks_df['summit'] + \
        (args.peak_width // 2)

    print("Peaks shape", peaks_df.shape[0])

    # reset index in case rows have been filtered
    peaks_df = peaks_df.reset_index()

    # iterate through each input profile
    for i in range(len(args.input_profiles)):
        # path to input profile bigWig
        input_profile_bigWig = args.input_profiles[i]

        print("Processing ... ", input_profile_bigWig)

        # compute upper & lower bounds, and avg profile performance
        average_profile, bounds_df = bounds(
            input_profile_bigWig, peaks_df, args.peak_width,
            args.smoothing_params)

        # path to output average profile file
        average_profile_filename = "{}/{}_average_profile.csv".format(
            args.output_directory, args.output_names[i])

        # write average profile to csv file
        print("Saving average profile ...")
        np.savetxt(average_profile_filename, average_profile, delimiter=",")

        # path to the output bounds file
        output_fname = "{}/{}.bds".format(args.output_directory,
                                          args.output_names[i])

        # write the dataframe to a csv file
        print("Saving bounds file ...")
        bounds_df.to_csv(output_fname, index=False)
def bounds(input_bigWig, peaks_df, peak_width, smoothing_params=[7, 81]):
    """
        Function to compute lower & upper bounds, and average profile
        performance for cross entropy and jsd metrics

        Args:
            input_bigWig (str): path to bigWig file
            peaks_df (pandas.DataFrame): dataframe containing peaks
                information. The dataframe should have 'chrom', 'start',
                and 'end' as the first 3 columns. Each peak should have
                the same width (equal to peak_width), i.e. 'end' - 'start'
                is the same for all rows in the dataframe.
            peak_width (int): width of each peak
            smoothing_params (list): list of length 2, containing sigma
                and window_size values for 1D gaussian smoothing of
                profiles

        Returns:
            tuple: (numpy array of average profile, pandas dataframe
                with bounds values in columns)
    """

    # compute the average profile
    print("Computing average profile ...")
    avg_profile = get_average_profile(input_bigWig, peaks_df, peak_width)

    # get average profile as probabilities
    avg_profile_prob = avg_profile / np.sum(avg_profile)

    # open the bigWig file for reading
    bw = pyBigWig.open(input_bigWig)

    # arrays to hold metrics values for mnll, cross entropy, jsd,
    # pearson and spearman correlation of the peak profile computed
    # against uniform, average and self (observed peak) profiles

    # mnll
    mnll_uniform = np.zeros(peaks_df.shape[0])
    mnll_average = np.zeros(peaks_df.shape[0])
    mnll_self = np.zeros(peaks_df.shape[0])

    # cross entropy
    ce_uniform = np.zeros(peaks_df.shape[0])
    ce_average = np.zeros(peaks_df.shape[0])
    ce_self = np.zeros(peaks_df.shape[0])

    # jsd
    jsd_uniform = np.zeros(peaks_df.shape[0])
    jsd_average = np.zeros(peaks_df.shape[0])
    jsd_self = np.zeros(peaks_df.shape[0])

    # pearson
    pearson_uniform = np.zeros(peaks_df.shape[0])
    pearson_average = np.zeros(peaks_df.shape[0])
    pearson_self = np.zeros(peaks_df.shape[0])

    # spearman
    spearman_uniform = np.zeros(peaks_df.shape[0])
    spearman_average = np.zeros(peaks_df.shape[0])
    spearman_self = np.zeros(peaks_df.shape[0])

    print("Computing bounds ...")

    # iterate through all peaks
    for idx, row in tqdm(peaks_df.iterrows(), desc='peak',
                         total=peaks_df.shape[0]):

        # raise exception if 'end' - 'start' is not equal to peak_width
        if (row['end'] - row['start']) != peak_width:
            raise quietexception.QuietException(
                "Inconsistent peak width found at: {}:{}-{}".format(
                    row['chrom'], row['start'], row['end']))

        # get bigWig profile
        profile = np.nan_to_num(
            bw.values(row['chrom'], row['start'], row['end']))

        # if we find that the profile at this peak is all zeros
        if sum(profile) == 0:
            print("Found 'zero' profile at {}: ({}, {})".format(
                row['chrom'], row['start'], row['end']))

            # assign nans to all
            mnll_uniform[idx] = np.nan
            mnll_average[idx] = np.nan
            mnll_self[idx] = np.nan

            ce_uniform[idx] = np.nan
            ce_average[idx] = np.nan
            ce_self[idx] = np.nan

            jsd_uniform[idx] = np.nan
            jsd_average[idx] = np.nan
            jsd_self[idx] = np.nan

            pearson_uniform[idx] = np.nan
            pearson_average[idx] = np.nan
            pearson_self[idx] = np.nan

            spearman_uniform[idx] = np.nan
            spearman_average[idx] = np.nan
            spearman_self[idx] = np.nan
            continue

        # uniform distribution profile
        uniform_profile = np.ones(peak_width) * (1.0 / peak_width)

        # smoothed profile
        profile_smooth = gaussian1D_smoothing(profile, smoothing_params[0],
                                              smoothing_params[1])

        # smoothed profile as probabilities
        profile_smooth_prob = profile_smooth / np.sum(profile_smooth)

        # profile as probabilities
        profile_prob = profile / np.sum(profile)

        # mnll of profile with uniform profile
        mnll_uniform[idx] = mnll(profile, probs=uniform_profile)

        # mnll of profile with average profile
        mnll_average[idx] = mnll(profile, probs=avg_profile_prob)

        # mnll of profile with itself
        mnll_self[idx] = mnll(profile, probs=profile_prob)

        # cross entropy of profile with uniform profile
        ce_uniform[idx] = profile_cross_entropy(profile,
                                                probs=uniform_profile)

        # cross entropy of profile with average profile
        ce_average[idx] = profile_cross_entropy(profile,
                                                probs=avg_profile_prob)

        # cross entropy of profile with itself
        ce_self[idx] = profile_cross_entropy(profile, probs=profile_prob)

        # jsd of profile with uniform profile
        jsd_uniform[idx] = jensenshannon(profile_prob, uniform_profile)

        # jsd of profile with average profile
        jsd_average[idx] = jensenshannon(profile_prob, avg_profile_prob)

        # jsd of profile with itself (upper bound)
        jsd_self[idx] = 0.0

        # pearson of profile with uniform profile
        ### nothing to do ... leave it as zeros

        # pearson of profile with average profile
        pearson_average[idx] = pearsonr(profile, avg_profile_prob)[0]

        # pearson of profile with itself
        pearson_self[idx] = pearsonr(profile, profile)[0]

        # spearman of profile with uniform profile
        ### nothing to do ... leave it as zeros

        # spearman of profile with average profile
        spearman_average[idx] = spearmanr(profile, avg_profile_prob)[0]

        # spearman of profile with itself
        spearman_self[idx] = spearmanr(profile, profile)[0]

    # column names for the dataframe holding the upper & lower bound,
    # and avg profile performance values
    column_names = ['mnll_uniform', 'mnll_average', 'mnll_self',
                    'ce_uniform', 'ce_average', 'ce_self',
                    'jsd_uniform', 'jsd_average', 'jsd_self',
                    'pearson_uniform', 'pearson_average', 'pearson_self',
                    'spearman_uniform', 'spearman_average', 'spearman_self']

    # create a pandas dataframe to store all the bounds values
    bounds_df = pd.DataFrame(columns=column_names)

    # assign values to the dataframe columns
    bounds_df['mnll_uniform'] = np.nan_to_num(mnll_uniform)
    bounds_df['mnll_average'] = np.nan_to_num(mnll_average)
    bounds_df['mnll_self'] = np.nan_to_num(mnll_self)
    bounds_df['ce_uniform'] = np.nan_to_num(ce_uniform)
    bounds_df['ce_average'] = np.nan_to_num(ce_average)
    bounds_df['ce_self'] = np.nan_to_num(ce_self)
    bounds_df['jsd_uniform'] = np.nan_to_num(jsd_uniform)
    bounds_df['jsd_average'] = np.nan_to_num(jsd_average)
    bounds_df['jsd_self'] = np.nan_to_num(jsd_self)
    bounds_df['pearson_uniform'] = np.nan_to_num(pearson_uniform)
    bounds_df['pearson_average'] = np.nan_to_num(pearson_average)
    bounds_df['pearson_self'] = np.nan_to_num(pearson_self)
    bounds_df['spearman_uniform'] = np.nan_to_num(spearman_uniform)
    bounds_df['spearman_average'] = np.nan_to_num(spearman_average)
    bounds_df['spearman_self'] = np.nan_to_num(spearman_self)

    return avg_profile, bounds_df
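# bounds() smooths each peak profile with gaussian1D_smoothing, which is
# defined elsewhere in the repo. A minimal equivalent built on
# scipy.ndimage.gaussian_filter1d, with the truncate value derived from the
# window width the same way metrics_main does above, might look like this
# (a sketch, not necessarily the exact implementation used):
def _gaussian1D_smoothing_sketch(values, sigma=7.0, window_size=81):
    from scipy.ndimage import gaussian_filter1d
    truncate = (((window_size - 1) / 2) - 0.5) / sigma
    return gaussian_filter1d(values, sigma=sigma, truncate=truncate)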
def counts_loss_weight_main():
    """ main function for counts loss weight computation """

    # parse the command line arguments
    parser = counts_loss_weight_argsparser()
    args = parser.parse_args()

    # check if the input data file exists
    if not os.path.exists(args.input_data):
        # output the default value to stdout
        print(args.default)
        raise quietexception.QuietException(
            "Input data file {} does not exist. Using default weight "
            "{}".format(args.input_data, args.default))

    with open(args.input_data, 'r') as inp_json:
        try:
            input_data = json.loads(inp_json.read())
        except Exception as e:
            # output the default value to stdout
            print(args.default)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            raise quietexception.QuietException(
                "{} {}. Using default weight {}".format(
                    exc_type.__name__, str(exc_value), args.default))

    # get all the bigWigs and peaks from the input_data
    bigWigs = []
    peaks = []
    for task in input_data:
        if 'signal' in input_data[task].keys():
            bigWigs.append(input_data[task]['signal'])
        if 'peaks' in input_data[task].keys():
            peaks.append(input_data[task]['peaks'])

    # if no bigWigs found
    if len(bigWigs) == 0:
        # output the default value to stdout
        print(args.default)
        raise quietexception.QuietException(
            "No 'signal' bigWigs found. Using default weight {}".format(
                args.default))
    else:
        # check to see if all are valid paths
        for bigWig in bigWigs:
            if not os.path.exists(bigWig):
                # output the default value to stdout
                print(args.default)
                raise quietexception.QuietException(
                    "File {} does not exist. Using default weight "
                    "{}".format(bigWig, args.default))

    # if no peaks found
    if len(peaks) == 0:
        # output the default value to stdout
        print(args.default)
        raise quietexception.QuietException(
            "No 'peaks' files found. Using default weight {}".format(
                args.default))
    else:
        # check to see if all are valid paths
        for peak_file in peaks:
            if not os.path.exists(peak_file):
                # output the default value to stdout
                print(args.default)
                raise quietexception.QuietException(
                    "File {} does not exist. Using default weight "
                    "{}".format(peak_file, args.default))

    # list of all peaks dataframes to be passed to the stats function
    peaks_dfs = []

    # load each peak file and compute the correct 'start' and 'end'
    # intervals
    for peak_file in peaks:
        peaks_df = pd.read_csv(
            peak_file, sep='\t', header=None,
            names=['chrom', 'st', 'e', 'name', 'score', 'strand',
                   'signal', 'p', 'q', 'summit'])

        # create new column for peak start
        peaks_df['start'] = peaks_df['st'] + peaks_df['summit'] - \
            args.peak_width // 2

        # create new column for peak end
        peaks_df['end'] = peaks_df['st'] + peaks_df['summit'] + \
            args.peak_width // 2

        # append to the list of peaks dataframes
        peaks_dfs.append(peaks_df[['chrom', 'start', 'end']])

    # compute the counts loss weight using the stats module function
    clw = stats.get_recommended_counts_loss_weight(bigWigs, peaks_dfs,
                                                   args.alpha)
    print(clw)