# dict-returning variant; renamed so the list-returning get_locus_counts below
# does not shadow it (both definitions originally shared the same name)
def get_locus_counts_dict(bamfile, loci_list, num_bins):
    """
    Brief: Given a single bamfile, loci of interest and max length bin, produce the entire set of features
    Args: bamfile, str - path to file of interest
          loci_list, lst - loci of interest
          num_bins, int - the maximum length that will be binned
    Return: features, dict - contains all of the features in a keyed dict OR 'N/A' if any locus has fewer than 5 reads
    """
    features = {}
    for locus in loci_list:
        reads = count_reads.count(bamfile, locus)
        # if any of the loci have too low a read count, return N/A
        if len(reads) < 5:
            return 'N/A'
        lengths = [len(e) for e in reads]
        np_reads = np.asarray(lengths)
        counts = np.bincount(np_reads)
        # pad the count array with zeros out to num_bins + 1 entries
        x = [0]
        while counts.size < (num_bins + 1):
            counts = np.append(counts, x)
        # drop the length-0 bin
        idx = [0]
        counts = np.delete(counts, idx)
        # normalize each bin by the total read count and key it by locus and length
        for i in range(len(counts)):
            idx = i + 1
            normed = float(counts[i]) / len(reads)
            feature_name = '%s_%d' % (locus, idx)
            features[feature_name] = normed
    return features
def get_locus_counts(bamfile, loci_list, num_bins):
    """
    Brief: Produce a list of the length-based features for a certain bam file
    Args: bamfile, str - path to bamfile to be considered
          loci_list, lst - the loci to be used as features
          num_bins, int - the max length to be considered
    Returns: features, lst - ordered set of features to be used in write_lengths_df
    """
    features = []
    for locus in loci_list:
        reads = count_reads.count(bamfile, locus)
        # if any of the loci have too low a read count, return N/A
        if len(reads) < 5:
            return 'N/A'
        lengths = [len(e) for e in reads]
        np_reads = np.asarray(lengths)
        counts = np.bincount(np_reads)
        # pad the count array with zeros out to num_bins + 1 entries
        x = [0]
        while counts.size < (num_bins + 1):
            counts = np.append(counts, x)
        # drop the length-0 bin
        idx = [0]
        counts = np.delete(counts, idx)
        # normalize each bin by the total read count at this locus
        norm_counts = []
        for element in counts:
            norm_counts.append(float(element) / len(reads))
        features.extend(norm_counts)
    return features
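# A minimal usage sketch for get_locus_counts. The directory path and the two
# locus names are taken from elsewhere in this module; num_bins=50 is an
# illustrative assumption, and this helper is not part of the original source.
def _example_length_feature_rows():
    rows = []
    for bam in bamprocess.scan_files('/home/upload/msi_project/tcga_bam/tumor_bams/annotated/subset'):
        feats = get_locus_counts(bam, ['BAT-25', 'MSI-01'], num_bins=50)
        # skip files where any locus had fewer than 5 accepted reads
        if feats != 'N/A':
            rows.append(feats)
    return rows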
def get_avg_length(bamfile, locus, mismatch=2, length=7):
    """
    Brief: Calculates the average length of the accepted reads at the given locus
    Args: str, str, int, int
    Return: str (if no accepted reads), float
    """
    accepted_reads = count_reads.count(bamfile, locus, flank_length=length, flank_mismatch=mismatch)
    if len(accepted_reads) == 0:
        return 'error'
    # the original body was truncated here; the mean computation below is a
    # minimal completion, following the pattern in make_synthetic_feature_dataframe
    lengths = [len(e) for e in accepted_reads]
    return np.mean(lengths)
def report_mode_length(directory):
    bamfiles = bamprocess.scan_files(directory)
    with open('/home/upload/msi_project/ML/modes.txt', 'w') as f:
        f.write('locus\t')
        for locus in _MSI_LOCI:
            f.write(locus + '\t')
        f.write('\nmode\t')
        for locus in _MSI_LOCI:
            # pool the reads from every bam file at this locus, then take the mode
            all_reads = []
            for bam in bamfiles:
                all_reads.extend(count_reads.count(bam, locus))
            mode = lstproc.mode_length(all_reads)
            # str() guards against mode_length returning a numeric type
            f.write(str(mode) + '\t')
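# Sketch (an assumption, not in the original source): reading modes.txt back
# into a locus -> mode dict like the _ML_MODES used below. The two-row layout
# ('locus' row, then 'mode' row) follows report_mode_length above.
def _load_modes_sketch(path='/home/upload/msi_project/ML/modes.txt'):
    with open(path) as f:
        loci = [t for t in f.readline().strip().split('\t') if t][1:]   # drop the 'locus' label
        modes = [t for t in f.readline().strip().split('\t') if t][1:]  # drop the 'mode' label
    return dict(zip(loci, modes))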
def make_synthetic_feature_dataframe(directory, loci_list):
    """
    Brief: Produce and save to a .txt a dataframe containing synthetic features to be used in ML training and validation
    Args: directory, str - the path to the directory to access BAM files and turn into dataframe
          loci_list, lst - loci for which to produce features in the dataframe
    Returns: None, prints to outfile indicated
    """
    setname = directory.split('/')[-1]
    bamfiles = bamprocess.scan_files(directory)
    outfile = '/home/upload/msi_project/ML/%s_full.txt' % setname
    with open(outfile, 'w') as f:
        #f.write('#Full %s dataset\nbam_name\t' % setname)
        for locus in loci_list:
            f.write('%s_avg_len\t%s_num_lens\t%s_stdev\t%s_dist_mode\t' % (locus, locus, locus, locus))
        f.write('msi_status\n')
        for bam in bamfiles:
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            # store msi status as 1/0, skip files annotated as neither MSI nor MSS
            if _ANNOTATIONS[bam_name] == 'MSI':
                msi_status = 1
            elif _ANNOTATIONS[bam_name] == 'MSS':
                msi_status = 0
            else:
                continue
            f.write(bam_name + '\t')
            for locus in loci_list:
                reads = count_reads.count(bam, locus)
                if len(reads) == 0:
                    avg_length = num_lengths = stdev = dist_mode = 'NaN'
                else:
                    lengths = [len(e) for e in reads]
                    avg_length = np.mean(lengths)
                    num_lengths = len(set(lengths))
                    stdev = np.std(lengths)
                    # average absolute distance from the locus mode length
                    distance = count = 0
                    for e in lengths:
                        distance += abs(e - float(_ML_MODES[locus]))
                        count += 1
                    dist_mode = distance / count
                f.write(str(avg_length) + '\t' + str(num_lengths) + '\t' + str(stdev) + '\t' + str(dist_mode) + '\t')
            f.write(str(msi_status) + '\n')
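# A minimal consumption sketch (an assumption: pandas is not imported by this
# module, and this helper is not part of the original source). Note that the
# header row written above has one fewer field than the data rows (the leading
# bam_name field is unnamed), which read_csv resolves by treating the first
# column as the index.
def _load_feature_table_sketch(path):
    import pandas as pd
    df = pd.read_csv(path, sep='\t')
    X = df.drop(columns=['msi_status'])  # feature matrix
    y = df['msi_status']                 # 1 = MSI, 0 = MSS
    return X, y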
def get_num_lengths(bamfile, locus, mismatch=2, length=7):
    """
    Brief: Calculates the z-score of the number of different lengths present in the file at the given locus as compared to the mss sample
    Args: str, str, int, int
    Return: str (if no accepted reads), float
    """
    accepted_reads = count_reads.count(bamfile, locus, flank_length=length, flank_mismatch=mismatch)
    if len(accepted_reads) == 0:
        return 'error'
    else:
        lengths = [len(e) for e in accepted_reads]
        # count distinct lengths, not distinct reads (bug fix: was set(accepted_reads))
        num_lengths = len(set(lengths))
        z_score = ((float(_MSS_LOCUS_DATA[locus][3]) - num_lengths) / float(_MSS_LOCUS_DATA[locus][4]))
        return z_score
def mss_num_lengths(bamfiles):
    outfile = '/home/upload/msi_project/diag_analysis/method_5/mss_training_data.txt'
    with open(outfile, 'w') as f:
        f.write('BAM\t')
        for locus in _MSI_LOCI:
            f.write(locus + '\t')
        f.write('\n')
        for bam in bamfiles:
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            f.write(bam_name + '\t')
            for locus in _MSI_LOCI:
                accepted_reads = count_reads.count(bam, locus)
                lengths = [len(e) for e in accepted_reads]
                num_lengths = len(set(lengths))
                if num_lengths == 0:
                    f.write('n/a\t')
                else:
                    f.write(str(num_lengths) + '\t')
            f.write('\n')
def get_dist_mode(bamfile, locus, mismatch=2, length=7):
    """
    Brief: For a given bam file and locus, calculates the average distance of the accepted polynucleotide runs from the MSS sample mode length
    Args: str, str, int, int
    Return: str (if no accepted reads at that locus), float
    """
    accepted_reads = count_reads.count(bamfile, locus, flank_length=length, flank_mismatch=mismatch)
    if len(accepted_reads) == 0:
        return 'error'
    lengths = [len(e) for e in accepted_reads]
    distance = 0
    count = 0
    for read in lengths:
        distance += abs(read - float(_MSS_LOCUS_DATA[locus][2]))
        count += 1
    return distance / count
def get_stdev_score(bamfile, locus, mismatch=2, length=7):
    """
    Brief: Calculates the standard deviation z-score for the given bam file and locus using the mean stdev and stdev from MSS sample
    Args: str, str, int, int
    Return: str (if no locus data, no accepted reads), or float
    """
    if float(_MSS_LOCUS_DATA[locus][1]) == 0:
        return 'error'
    accepted_reads = count_reads.count(bamfile, locus, flank_length=length, flank_mismatch=mismatch)
    if len(accepted_reads) == 0:
        return 'error'
    else:
        lengths = [len(e) for e in accepted_reads]
        std_dev = np.std(lengths)
        z_score = ((float(_MSS_LOCUS_DATA[locus][0]) - float(std_dev)) / float(_MSS_LOCUS_DATA[locus][1]))
        return z_score
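# Layout of _MSS_LOCUS_DATA as inferred from the index usage in the three
# z-score helpers above (this is a reading of the code, not documented in the
# source):
#   [0] mean of the per-sample read-length stdevs across MSS samples
#   [1] stdev of those stdevs           (get_stdev_score)
#   [2] mode read length                (get_dist_mode)
#   [3] mean number of distinct lengths
#   [4] stdev of that count             (get_num_lengths)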
    plt.savefig(saveloc)
    plt.clf()


# ----------- Main --------------
_MSI_LOCI = msi_loci.get_msi_loci('/home/upload/msi_project/loci/msi_loci_edited.txt')
_QUALITY_THRESHOLDS = {
    'MSI-11': .25,
    'MSI-12': .25,
    'MSI-01': .5,
    'BAT-25': .18
}
_ANNOTATIONS = get_msi_annotations()
_MSS_LOCUS_DATA = get_mss_locus_data()

# store bamfiles in a list
directory = '/home/upload/msi_project/tcga_bam/tumor_bams/annotated/subset'
bamfiles = scan_files(directory)

print(avg_length(count_reads.count(bamfiles[1], 'H-09')))
#locus_histogram(bamfiles)
'''
i = .7
stdevs = report_std_dev(bamfiles)
while i <= 1:
    confusion_matrix(bamfiles, stdevs, reporting_threshold = i)
    i += .05
'''
def calling_function(directory, loci):
    """
    Brief: archived locus-based calling
    Args: str, lst
    Returns: none, prints relevant info
    """
    tp = tn = fp = fn = 0
    upper_threshold = 0.6
    lower_threshold = 0.4
    min_loci = 3
    bamfiles = bamprocess.scan_files(directory)
    correct_guesses = 0
    total_files = 0
    # initialized once here (bug fix: was reset inside the per-bam loop)
    indeterminate_files = 0
    scores = {}
    msi_scores = []
    mss_scores = []
    for bam in bamfiles:
        features = {}
        prob_sum = 0
        agree = False
        num_loci = 0
        msicall = 0
        bam_name = bam.split('/')[-1].replace('A.bam', '')
        # store msi status as a boolean, skip if not MSI or MSS
        if _ANNOTATIONS[bam_name] == 'MSI':
            msi_status = 1
        elif _ANNOTATIONS[bam_name] == 'MSS':
            msi_status = 0
        else:
            continue
        print('Bam file: %s' % bam_name)
        for locus in loci:
            reads = count_reads.count(bam, locus)
            # if there are NO reads at this locus in the file, skip and go to the next locus
            if len(reads) == 0:
                continue
            num_loci += 1
            # compute the lengths of the reads, store in a list
            bam_lengths = [len(e) for e in reads]
            # average read length
            avg_length = np.mean(bam_lengths)
            features['%s_avg_len' % locus] = avg_length
            # number of different read lengths
            num_lengths = len(set(bam_lengths))
            features['%s_num_lens' % locus] = num_lengths
            # standard deviation of the read lengths
            stdev = np.std(bam_lengths)
            features['%s_stdev' % locus] = stdev
            # average distance from the mode length
            distance = 0
            count = 0
            for e in bam_lengths:
                distance += abs(e - float(_ML_MODES[locus]))
                count += 1
            dist_mode = distance / count
            features['%s_dist_mode' % locus] = dist_mode
            #prob = calc_prob(locus, avg_length, dist_mode, num_lengths, stdev)
            # NOTE: 'weights' is not defined in this archived function; it
            # presumably came from module scope when this code was live
            prob = calc_prob(weights, features)
            scores[bam_name] = prob
            if msi_status:
                msi_scores.append(prob)
            else:
                mss_scores.append(prob)
            print('Prob: %f' % prob)
            #prob_sum += prob
            #if prob >= 0.5:
            #    msicall += 1
            #    print('Locus-level call: MSI')
            #else:
            #    print('Locus-level call: MSS')
        #no reads at any locus
        #if num_loci < min_loci:
        #    continue
        # count how many files were called
        total_files += 1
        # make a prediction
        guessed_status = 0
        #print('\nNum loci examined: %d' % num_loci)
        #print('Num MSI calls: %d' % msicall)
        #perc_msi = float(msicall) / num_loci
        #print('%%MSI: %f' % perc_msi)
        #if perc_msi >= threshold:
        #    guessed_status = 1
        #avg_prob = prob_sum / num_loci
        if prob > upper_threshold:
            guessed_status = 1
            print('Predicted status: MSI')
        elif prob < lower_threshold:
            print('Predicted status: MSS')
        else:
            guessed_status = -1
            total_files -= 1
            indeterminate_files += 1
            print('Predicted status: Indeterminate')
        print('Known status: %s' % _ANNOTATIONS[bam_name])
        # decide whether prediction is correct
        if guessed_status != -1:
            if guessed_status == msi_status:
                correct_guesses += 1
                print('Agree: YES')
            else:
                print('Agree: NO')
        print('\n')
        # tally tp, fp, tn, fn
        if guessed_status == 1 and msi_status == 1:
            tp += 1
        elif guessed_status == 1 and msi_status == 0:
            fp += 1
        elif guessed_status == 0 and msi_status == 1:
            fn += 1
        elif guessed_status == 0 and msi_status == 0:
            tn += 1
    print('Summary:')
    print('Loci examined: ' + ('\n'.join(loci)))
    print('Upper threshold: %f' % upper_threshold)
    print('Lower threshold: %f' % lower_threshold)
    #print('Min no. loci: %d' % min_loci)
    print('Number of predictions: %s' % total_files)
    print('Correct predictions: %s' % correct_guesses)
    print('Indeterminate files: %s' % indeterminate_files)
    print('Accuracy: %f' % (float(correct_guesses) / total_files))
    print('True pos: %d' % tp)
    print('True neg: %d' % tn)
    print('False pos: %d' % fp)
    print('False neg: %d' % fn)
    print('Sensitivity: %f' % (float(tp) / (tp + fn)))
    print('Specificity: %f' % (float(tn) / (tn + fp)))
    bins = []
    i = 0.0
    while i < 1.05:
        bins.append(i)
        i += 0.05
    '''
    plt.hist([msi_scores, mss_scores], bins = bins, color = ['red', 'blue'], label = ['MSI', 'MSS'])
    plt.title('Model-Derived Probabilities: p(MSI)')
    plt.legend(loc = 'best')
    plt.xlabel('p(MSI)')
    plt.ylabel('Number of BAM files')
    saveloc = '/home/upload/msi_project/ML/probability_distribution'
    plt.savefig(saveloc)
    plt.clf()
    '''
    return scores
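# calc_prob(weights, features) is called in calling_function above but is not
# defined in this section. The sketch below is a hypothetical logistic model
# matching that call signature, NOT the project's actual implementation; the
# name _calc_prob_sketch and the 'bias' key are illustrative assumptions.
import math

def _calc_prob_sketch(weights, features):
    # weighted sum of the per-locus features, passed through a sigmoid
    # so the result can be read as p(MSI) in [0, 1]
    z = weights.get('bias', 0.0)
    for name, value in features.items():
        z += weights.get(name, 0.0) * value
    return 1.0 / (1.0 + math.exp(-z))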