Пример #1
0
def get_locus_counts(bamfile, loci_list, num_bins):
    """
    Brief: Build the normalized read-length histogram features of one bam file, keyed by locus and length
    Args: bamfile, str - path to the bam file handed to count_reads.count
          loci_list, lst - loci for which features are produced
          num_bins, int - histograms are zero-padded so lengths 1..num_bins always get a feature
                          (longer observed lengths are kept, not truncated)
    Return: features, dict - maps '<locus>_<length>' to the fraction of that locus' reads with that length,
            OR the string 'N/A' as soon as any locus yields fewer than 5 reads
    """
    features = {}
    for locus in loci_list:
        reads = count_reads.count(bamfile, locus)
        total = len(reads)
        # one under-covered locus invalidates the whole file
        if total < 5:
            return 'N/A'
        histogram = np.bincount(np.asarray([len(read) for read in reads]))
        # zero-pad once so that index num_bins is guaranteed to exist
        shortfall = (num_bins + 1) - histogram.size
        if shortfall > 0:
            histogram = np.append(histogram, [0] * shortfall)
        # bin 0 (zero-length reads) is dropped; feature names use 1-based lengths
        for length, tally in enumerate(histogram[1:], 1):
            features['%s_%d' % (locus, length)] = float(tally) / total
    return features
Пример #2
0
def get_locus_counts(bamfile, loci_list, num_bins):
    """
    Brief: Build the flat, ordered list of normalized read-length features of one bam file
    Args: bamfile, str - path to the bam file handed to count_reads.count
          loci_list, lst - loci to be used as features, processed in the given order
          num_bins, int - histograms are zero-padded so lengths 1..num_bins always get an entry
                          (longer observed lengths are kept, not truncated)
    Returns: features, lst - for every locus in turn, the fraction of its reads at length 1, 2, ...,
             OR the string 'N/A' as soon as any locus yields fewer than 5 reads
    """
    features = []
    for locus in loci_list:
        reads = count_reads.count(bamfile, locus)
        total = len(reads)
        # one under-covered locus invalidates the whole file
        if total < 5:
            return 'N/A'
        histogram = np.bincount(np.asarray([len(read) for read in reads]))
        # zero-pad once so that index num_bins is guaranteed to exist
        shortfall = (num_bins + 1) - histogram.size
        if shortfall > 0:
            histogram = np.append(histogram, [0] * shortfall)
        # bin 0 (zero-length reads) is not a feature
        features.extend(float(tally) / total for tally in histogram[1:])
    return features
Пример #3
0
def get_avg_length(bamfile, locus, mismatch=2, length=7):
    """
    Brief: Fetch the accepted reads of a bam file at a locus and flag the case where there are none
    Args: bamfile, str - passed to count_reads.count
          locus, str - passed to count_reads.count
          mismatch, int - forwarded as flank_mismatch
          length, int - forwarded as flank_length
    Return: str 'error' if no reads were accepted; as visible here, the function
            otherwise falls through and returns None
    """
    reads = count_reads.count(
        bamfile, locus, flank_length=length, flank_mismatch=mismatch)
    if not len(reads):
        return 'error'
Пример #4
0
def report_mode_length(directory):
    """
    Brief: For every locus in _MSI_LOCI, pool the reads of all bam files found in a directory and write
           the value of lstproc.mode_length for that pool to modes.txt
    Args: directory, str - directory handed to bamprocess.scan_files
    Returns: None, writes a tab-separated 'locus' row followed by a 'mode' row to the hard-coded path
    """
    bamfiles = bamprocess.scan_files(directory)
    with open('/home/upload/msi_project/ML/modes.txt', 'w') as out:
        # first row: every locus name, each terminated by a tab
        out.write('locus\t' + ''.join(name + '\t' for name in _MSI_LOCI))
        out.write('\nmode\t')
        # second row: one mode per locus, in the same order as the header
        for name in _MSI_LOCI:
            pooled = []
            for path in bamfiles:
                pooled += count_reads.count(path, name)
            out.write(lstproc.mode_length(pooled) + '\t')
Пример #5
0
def make_synthetic_feature_dataframe(directory, loci_list):
    """
        Brief: Write a tab-separated table of per-locus summary features (one row per annotated bam file)
               to be used in ML training and validation
        Args: directory, str - directory scanned for BAM files; its last path component names the output file
              loci_list, lst - loci for which the four features are produced
        Returns: None, writes to /home/upload/msi_project/ML/<setname>_full.txt
        """
    setname = directory.split('/')[-1]
    bamfiles = bamprocess.scan_files(directory)

    with open('/home/upload/msi_project/ML/%s_full.txt' % setname, 'w') as out:
        # header: four feature columns per locus, then the label column;
        # note there is no header cell for the leading bam name of each row
        header = ''.join(
            '%s_avg_len\t%s_num_lens\t%s_stdev\t%s_dist_mode\t' % ((locus,) * 4)
            for locus in loci_list)
        out.write(header + 'msi_status\n')

        for bam in bamfiles:
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            annotation = _ANNOTATIONS[bam_name]
            # only files annotated as MSI or MSS get a row
            if annotation not in ('MSI', 'MSS'):
                continue
            msi_status = 1 if annotation == 'MSI' else 0

            out.write(bam_name + '\t')
            for locus in loci_list:
                reads = count_reads.count(bam, locus)
                if len(reads) == 0:
                    # no reads at this locus: every feature is missing
                    stats = ('NaN',) * 4
                else:
                    lengths = [len(read) for read in reads]
                    mode = float(_ML_MODES[locus])
                    stats = (
                        np.mean(lengths),
                        len(set(lengths)),
                        np.std(lengths),
                        # mean absolute distance from the locus' mode length
                        sum(abs(n - mode) for n in lengths) / len(lengths),
                    )
                out.write(''.join(str(value) + '\t' for value in stats))
            out.write(str(msi_status) + '\n')
Пример #6
0
def get_num_lengths(bamfile, locus, mismatch=2, length=7):
    """
    Brief: Calculates the z-score of the number of different read lengths present in the file at the
           given locus as compared to the mss sample
    Args: bamfile, str - passed to count_reads.count
          locus, str - locus to score; also the key into _MSS_LOCUS_DATA
          mismatch, int - forwarded as flank_mismatch
          length, int - forwarded as flank_length
    Return: str 'error' (if there are no accepted reads, or the MSS reference spread is zero),
            otherwise float z-score computed as (reference - observed) / spread
    """
    accepted_reads = count_reads.count(bamfile,
                                       locus,
                                       flank_length=length,
                                       flank_mismatch=mismatch)
    if len(accepted_reads) == 0:
        return 'error'

    # presumably entries 3 and 4 hold the MSS mean and standard deviation of
    # the number of distinct lengths -- TODO confirm against get_mss_locus_data
    mss_reference = float(_MSS_LOCUS_DATA[locus][3])
    mss_spread = float(_MSS_LOCUS_DATA[locus][4])
    # same convention as get_stdev_score: no usable reference spread -> 'error'
    if mss_spread == 0:
        return 'error'

    # count distinct LENGTHS; counting distinct read strings (as before) would
    # inflate the number whenever two reads of equal length differ in sequence
    num_lengths = len(set(len(read) for read in accepted_reads))
    return (mss_reference - num_lengths) / mss_spread
Пример #7
0
def mss_num_lengths(bamfiles):
    """
    Brief: Write, for every bam file and every locus in _MSI_LOCI, the number of distinct read lengths
    Args: bamfiles, lst - paths of the bam files to tabulate
    Returns: None, writes a tab-separated table (header row, then one row per bam file) to the
             hard-coded mss_training_data.txt path; loci without reads are written as 'n/a'
    """
    outfile = '/home/upload/msi_project/diag_analysis/method_5/mss_training_data.txt'
    with open(outfile, 'w') as out:
        # header: 'BAM' followed by every locus name, each cell tab-terminated
        out.write('BAM\t' + ''.join(locus + '\t' for locus in _MSI_LOCI) + '\n')
        for bam in bamfiles:
            out.write(bam.split('/')[-1].replace('A.bam', '') + '\t')
            for locus in _MSI_LOCI:
                distinct = set(len(read) for read in count_reads.count(bam, locus))
                cell = str(len(distinct)) if distinct else 'n/a'
                out.write(cell + '\t')
            out.write('\n')
Пример #8
0
def get_dist_mode(bamfile, locus, mismatch=2, length=7):
    """
        Brief: For a given bam file and locus, computes the mean absolute difference between the length
               of each accepted read and the reference value stored at _MSS_LOCUS_DATA[locus][2]
        Args: bamfile, str - passed to count_reads.count
              locus, str - locus to evaluate; also the key into _MSS_LOCUS_DATA
              mismatch, int - forwarded as flank_mismatch
              length, int - forwarded as flank_length
        Return: str 'error' (if no accepted reads at that locus), otherwise float
        """
    reads = count_reads.count(
        bamfile, locus, flank_length=length, flank_mismatch=mismatch)
    if not len(reads):
        return 'error'

    reference = float(_MSS_LOCUS_DATA[locus][2])
    deviations = [abs(len(read) - reference) for read in reads]
    return sum(deviations) / len(deviations)
Пример #9
0
def get_stdev_score(bamfile, locus, mismatch=2, length=7):
    """
        Brief: Scores the standard deviation of the accepted read lengths of a bam file at a locus
               against the two reference values stored at _MSS_LOCUS_DATA[locus][0] and [1]
        Args: bamfile, str - passed to count_reads.count
              locus, str - locus to evaluate; also the key into _MSS_LOCUS_DATA
              mismatch, int - forwarded as flank_mismatch
              length, int - forwarded as flank_length
        Return: str 'error' (if the reference divisor is zero or there are no accepted reads),
                otherwise float computed as (reference[0] - observed stdev) / reference[1]
        """
    divisor = float(_MSS_LOCUS_DATA[locus][1])
    # checked before touching the bam file: a zero divisor makes the score undefined
    if divisor == 0:
        return 'error'

    reads = count_reads.count(
        bamfile, locus, flank_length=length, flank_mismatch=mismatch)
    if not len(reads):
        return 'error'

    observed = np.std([len(read) for read in reads])
    return (float(_MSS_LOCUS_DATA[locus][0]) - float(observed)) / divisor
            plt.savefig(saveloc)
            plt.clf()


# ----------- Main --------------
# Module-level setup: everything below runs at import time and reads from
# hard-coded paths under /home/upload/msi_project.

# Loci of interest, loaded from the edited loci file.
_MSI_LOCI = msi_loci.get_msi_loci(
    '/home/upload/msi_project/loci/msi_loci_edited.txt')
# Per-locus numeric thresholds keyed by locus name.
# NOTE(review): how these are applied is not visible in this part of the file.
_QUALITY_THRESHOLDS = {
    'MSI-11': .25,
    'MSI-12': .25,
    'MSI-01': .5,
    'BAT-25': .18
}
# Lookup tables built once by helpers defined elsewhere in the file.
_ANNOTATIONS = get_msi_annotations()
_MSS_LOCUS_DATA = get_mss_locus_data()

# Collect the bam files found in the annotated tumor subset directory.
directory = '/home/upload/msi_project/tcga_bam/tumor_bams/annotated/subset'
bamfiles = scan_files(directory)

# Ad-hoc check: print avg_length of the reads counted at locus 'H-09' in the
# second bam file (index 1), so at least two files are presumably expected.
print(avg_length(count_reads.count(bamfiles[1], 'H-09')))

#locus_histogram(bamfiles)
'''
i = .7
stdevs = report_std_dev(bamfiles)
while i <= 1:
	confusion_matrix(bamfiles, stdevs, reporting_threshold = i)
	i += .05
'''
Пример #11
0
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is 0."""
    if denominator == 0:
        return float('nan')
    return float(numerator) / denominator


def calling_function(directory, loci):
    """
    Brief: archived locus-based calling; scores every MSI/MSS-annotated bam file in a directory with
           calc_prob, turns the score into an MSI / MSS / indeterminate call and prints a report
    Args: directory, str - directory handed to bamprocess.scan_files
          loci, lst - loci whose read-length features are collected for calc_prob
    Returns: scores, dict - maps bam name to the value returned by calc_prob;
             per-file details and a summary are printed as a side effect
    """
    # scores above the upper threshold are called MSI, below the lower one MSS,
    # anything in between is reported as indeterminate
    upper_threshold = 0.6
    lower_threshold = 0.4

    tp = tn = fp = fn = 0
    correct_guesses = 0
    total_files = 0
    # kept outside the per-file loop so indeterminate calls accumulate across
    # files (and the name exists even when no file is processed)
    indeterminate_files = 0
    scores = {}

    for bam in bamprocess.scan_files(directory):
        bam_name = bam.split('/')[-1].replace('A.bam', '')
        #store msi status as a boolean, skip if not MSI or MSS
        if _ANNOTATIONS[bam_name] == 'MSI':
            msi_status = 1
        elif _ANNOTATIONS[bam_name] == 'MSS':
            msi_status = 0
        else:
            continue
        print('Bam file: %s' % bam_name)

        features = {}
        for locus in loci:
            reads = count_reads.count(bam, locus)
            #if there are NO reads at this locus in the file, skip and go to the next locus
            if len(reads) == 0:
                continue

            bam_lengths = [len(e) for e in reads]
            mode = float(_ML_MODES[locus])
            features['%s_avg_len' % locus] = np.mean(bam_lengths)
            features['%s_num_lens' % locus] = len(set(bam_lengths))
            features['%s_stdev' % locus] = np.std(bam_lengths)
            #average distance of the read lengths from the mode length
            features['%s_dist_mode' % locus] = (
                sum(abs(e - mode) for e in bam_lengths) / len(bam_lengths))

        prob = calc_prob(weights, features)
        scores[bam_name] = prob
        print('Prob: %f' % prob)

        #make a prediction; only determinate calls count as predictions
        if prob > upper_threshold:
            guessed_status = 1
            total_files += 1
            print('Predicted status: MSI')
        elif prob < lower_threshold:
            guessed_status = 0
            total_files += 1
            print('Predicted status: MSS')
        else:
            guessed_status = -1
            indeterminate_files += 1
            print('Predicted status: Indeterminate')

        print('Known status: %s' % _ANNOTATIONS[bam_name])
        #decide whether prediction is correct
        if guessed_status != -1:
            if guessed_status == msi_status:
                correct_guesses += 1
                print('Agree: YES')
            else:
                print('Agree: NO')

        print('\n')

        #calculate tp, fp, tn, fn (indeterminate calls fall through untouched)
        if guessed_status == 1 and msi_status == 1:
            tp += 1
        elif guessed_status == 1 and msi_status == 0:
            fp += 1
        elif guessed_status == 0 and msi_status == 1:
            fn += 1
        elif guessed_status == 0 and msi_status == 0:
            tn += 1

    print('Summary:')
    print('Loci examined: ' + ('\n'.join(loci)))
    print('Upper threshold: %f' % upper_threshold)
    print('Lower threshold: %f' % lower_threshold)
    print('Number of predictions: %s' % total_files)
    print('Correct predictions: %s' % correct_guesses)
    print('Indeterminate files: %s' % indeterminate_files)
    # ratios print as nan instead of raising when a denominator is empty
    print('Accuracy: %f' % _safe_ratio(correct_guesses, total_files))
    print('True pos: %d' % tp)
    print('True neg: %d' % tn)
    print('False pos: %d' % fp)
    print('False neg: %d' % fn)
    print('Sensitivity: %f' % _safe_ratio(tp, tp + fn))
    print('Specificity: %f' % _safe_ratio(tn, tn + fp))

    # NOTE: the original carried a disabled matplotlib histogram of the MSI vs
    # MSS probabilities here; it never ran and has been dropped with its inputs.
    return scores