Пример #1
0
def report_mode_length(directory):
    """
        Brief: Write the mode read length of every locus in _MSI_LOCI, pooled over all BAM files found in a directory, to modes.txt
        Args: directory, str - path to the directory scanned for BAM files
        Returns: None, writes a two-row tab-separated table (locus names, then modes) to the hard-coded modes.txt path
        """
    bamfiles = bamprocess.scan_files(directory)
    with open('/home/upload/msi_project/ML/modes.txt', 'w') as f:
        #first row: the locus names
        f.write('locus\t')
        for locus in _MSI_LOCI:
            f.write(locus + '\t')
        #second row: one mode per locus, in the same order as the first row
        f.write('\nmode\t')
        for locus in _MSI_LOCI:
            #pool the reads of every BAM file before taking the mode
            all_reads = []
            for bam in bamfiles:
                all_reads.extend(count_reads.count(bam, locus))
            mode = lstproc.mode_length(all_reads)
            #mode_length's return type is not visible here; str() keeps the
            #write valid whether it hands back a number or a string
            f.write(str(mode) + '\t')
Пример #2
0
def make_synthetic_feature_dataframe(directory, loci_list):
    """
        Brief: Produce and save to a .txt a dataframe containing synthetic features to be used in ML training and validation
        Args: directory, str - the path to the directory to access BAM files and turn into dataframe
              loci_list, lst - loci for which to produce features in the dataframe
        Returns: None, writes the table to /home/upload/msi_project/ML/<setname>_full.txt
        """
    #strip trailing slashes first: 'path/to/set/' would otherwise yield an
    #empty set name and the output would land in '_full.txt'
    setname = directory.rstrip('/').split('/')[-1]

    bamfiles = bamprocess.scan_files(directory)
    outfile = '/home/upload/msi_project/ML/%s_full.txt' % setname
    with open(outfile, 'w') as f:
        #NOTE(review): the header has no column for the bam name although every
        #data row starts with one; left as-is because a downstream reader may
        #rely on it (e.g. to treat the first column as an index) - confirm
        for locus in loci_list:
            f.write('%s_avg_len\t%s_num_lens\t%s_stdev\t%s_dist_mode\t' %
                    (locus, locus, locus, locus))
        f.write('msi_status\n')
        for bam in bamfiles:
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            #only MSI / MSS annotated files are labelled; anything else is skipped
            status = _ANNOTATIONS[bam_name]
            if status == 'MSI':
                msi_status = 1
            elif status == 'MSS':
                msi_status = 0
            else:
                continue

            #collect the whole row first so a file never gets a half-written line
            row = [bam_name]
            for locus in loci_list:
                reads = count_reads.count(bam, locus)
                if len(reads) == 0:
                    #no reads at this locus: all four features are missing
                    row.extend(['NaN'] * 4)
                    continue
                lengths = [len(e) for e in reads]
                #mean absolute distance of the read lengths from the locus mode;
                #the mode is looked up and converted once rather than per read
                mode = float(_ML_MODES[locus])
                dist_mode = sum(abs(e - mode) for e in lengths) / len(lengths)
                row.extend([
                    str(np.mean(lengths)),
                    str(len(set(lengths))),
                    str(np.std(lengths)),
                    str(dist_mode),
                ])
            row.append(str(msi_status))
            f.write('\t'.join(row) + '\n')
Пример #3
0
def write_lengths_df(directory, loci_list, num_bins):
    """
        Brief: Writes a dataframe of the fraction of reads occurring at each potential length for each locus
        Args: directory, str - path to directory containing bam files to use
              loci_list, lst - a list of all loci to use in the production of this DF
              num_bins, int - the number of different lengths to consider (usually 50)
        Returns: None, prints relevant info to file
        """
    #strip trailing slashes first: 'path/to/set/' would otherwise yield an
    #empty set name in the output filename
    setname = directory.rstrip('/').split('/')[-1]

    bamfiles = bamprocess.scan_files(directory)
    outfile = '/home/upload/msi_project/ML/%s_top_9_lengths_full.txt' % setname
    with open(outfile, 'w') as f:
        #header row: bam name, one column per (locus, 1-based bin), then the label
        f.write('bam name\t')
        for locus in loci_list:
            for idx in range(1, num_bins + 1):
                f.write('%s_%d\t' % (locus, idx))
        f.write('msi_status\n')

        for bam in bamfiles:
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            #only MSI / MSS annotated files are labelled; anything else is skipped
            status = _ANNOTATIONS[bam_name]
            if status == 'MSI':
                msi_status = 1
            elif status == 'MSS':
                msi_status = 0
            else:
                continue
            features = get_locus_counts(bam, loci_list, num_bins)
            #get_locus_counts signals an unusable file with the string 'N/A'
            if features == 'N/A':
                continue
            f.write(bam_name + '\t')
            for feature in features:
                f.write(str(feature) + '\t')
            f.write(str(msi_status) + '\n')
Пример #4
0
def histogram_features_calling_function(directory, loci):
    """
        Brief: call msi status based on the model for every bam file in a directory, report general statistics for the set of bamfiles to the console, and save a histogram of the scores
        Args: directory, str - path to the directory to examine
              loci, lst - the loci of interest
        Returns: scores, dict - the model-produced scores indexed by bam name
        """

    def ratio(numerator, denominator):
        #nan instead of ZeroDivisionError when nothing was scored or a class is absent
        if denominator == 0:
            return float('nan')
        return float(numerator) / denominator

    #print is always called with one parenthesised argument so the statements
    #behave the same under the Python 2 print statement and a print function
    tp = tn = fp = fn = 0
    threshold = 0.5
    bamfiles = bamprocess.scan_files(directory)
    correct_guesses = 0
    total_files = 0
    scores = {}
    msi_scores = []
    mss_scores = []
    for bam in bamfiles:
        bam_name = bam.split('/')[-1].replace('A.bam', '')

        #only MSI / MSS annotated files are evaluated; anything else is skipped
        known_status = _ANNOTATIONS[bam_name]
        if known_status == 'MSI':
            msi_status = 1
        elif known_status == 'MSS':
            msi_status = 0
        else:
            continue

        features = get_locus_counts(bam, loci, 50)
        #get_locus_counts signals an unusable file with the string 'N/A'
        if features == 'N/A':
            continue
        print('Bam file: ' + bam_name)
        prob = calc_prob(weights, features)
        scores[bam_name] = prob

        #keep the scores split by known status for the histogram
        if msi_status:
            msi_scores.append(prob)
        else:
            mss_scores.append(prob)
        print('Prob: %f' % prob)

        total_files += 1

        #make a prediction
        if prob > threshold:
            guessed_status = 1
            print('Predicted status: MSI')
        else:
            guessed_status = 0
            print('Predicted status: MSS')

        print('Known status: %s' % known_status)
        #decide whether prediction is correct
        if guessed_status == msi_status:
            correct_guesses += 1
            print('Agree: YES')
        else:
            print('Agree: NO')

        print('\n')

        #calculate tp, fp, tn, fn
        if guessed_status == 1 and msi_status == 1:
            tp += 1
        elif guessed_status == 1 and msi_status == 0:
            fp += 1
        elif guessed_status == 0 and msi_status == 1:
            fn += 1
        else:
            tn += 1

    print('Summary:')
    print('Loci examined: ' + ('\n'.join(loci)))
    print('Threshold: %f' % threshold)
    print('Number of predictions: %s' % total_files)
    print('Correct predictions: %s' % correct_guesses)
    print('Accuracy: %f' % ratio(correct_guesses, total_files))
    print('True pos: %d' % tp)
    print('True neg: %d' % tn)
    print('False pos: %d' % fp)
    print('False neg: %d' % fn)
    print('Sensitivity: %f' % ratio(tp, tp + fn))
    print('Specificity: %f' % ratio(tn, tn + fp))

    #21 edges 0.00, 0.05, ..., 1.00; derived from integers so repeated float
    #addition cannot drift the edges or change how many there are
    bins = [k / 20.0 for k in range(21)]

    plt.hist([msi_scores, mss_scores],
             bins=bins,
             color=['red', 'blue'],
             label=['MSI', 'MSS'])
    plt.title('Model-Derrived Probabilities: p(MSI)')
    plt.legend(loc='best')
    #call xlabel; assigning to it would replace the pyplot function with a
    #string and leave the axis unlabelled
    plt.xlabel('p(MSI)')
    plt.ylabel('Number of BAM files')
    saveloc = '/home/upload/msi_project/ML/histogram_features/top_9/mss_adjusted_probability_distribution'
    plt.savefig(saveloc)
    plt.clf()
    return scores
Пример #5
0
def calling_function(directory, loci):
    """
        Brief: archived locus-based calling
        Args: directory, str - path to the directory whose bam files are scored
              loci, lst - the loci from which the per-locus features are computed
        Returns: scores, dict - the model-produced scores indexed by bam name; per-file and summary info is printed to the console
        """

    def ratio(numerator, denominator):
        #nan instead of ZeroDivisionError when nothing was called or a class is absent
        if denominator == 0:
            return float('nan')
        return float(numerator) / denominator

    #print is always called with one parenthesised argument so the statements
    #behave the same under the Python 2 print statement and a print function
    tp = tn = fp = fn = 0
    upper_threshold = 0.6
    lower_threshold = 0.4
    bamfiles = bamprocess.scan_files(directory)
    correct_guesses = 0
    #files that received a definite MSI / MSS call
    total_files = 0
    #initialised once, before the loop: resetting it per file would cap the
    #reported count at 1 and leave it undefined when no file is scored
    indeterminate_files = 0
    scores = {}
    for bam in bamfiles:
        bam_name = bam.split('/')[-1].replace('A.bam', '')
        #store msi status as a boolean, skip if not MSI or MSS
        known_status = _ANNOTATIONS[bam_name]
        if known_status == 'MSI':
            msi_status = 1
        elif known_status == 'MSS':
            msi_status = 0
        else:
            continue
        print('Bam file: %s' % bam_name)

        features = {}
        for locus in loci:
            reads = count_reads.count(bam, locus)

            #if there are NO reads at this locus in the file, skip and go to the next locus
            if len(reads) == 0:
                continue

            bam_lengths = [len(e) for e in reads]
            mode = float(_ML_MODES[locus])
            #average length, number of distinct lengths, standard deviation and
            #mean absolute distance from the locus mode length
            features['%s_avg_len' % locus] = np.mean(bam_lengths)
            features['%s_num_lens' % locus] = len(set(bam_lengths))
            features['%s_stdev' % locus] = np.std(bam_lengths)
            features['%s_dist_mode' % locus] = (
                sum(abs(e - mode) for e in bam_lengths) / len(bam_lengths))

        prob = calc_prob(weights, features)
        scores[bam_name] = prob
        print('Prob: %f' % prob)

        #make a prediction; scores between the two thresholds are not called
        if prob > upper_threshold:
            guessed_status = 1
            print('Predicted status: MSI')
        elif prob < lower_threshold:
            guessed_status = 0
            print('Predicted status: MSS')
        else:
            guessed_status = -1
            indeterminate_files += 1
            print('Predicted status: Indeterminate')

        print('Known status: %s' % known_status)
        #decide whether prediction is correct (indeterminate files are not counted)
        if guessed_status != -1:
            total_files += 1
            if guessed_status == msi_status:
                correct_guesses += 1
                print('Agree: YES')
            else:
                print('Agree: NO')

        print('\n')

        #calculate tp, fp, tn, fn
        if guessed_status == 1 and msi_status == 1:
            tp += 1
        elif guessed_status == 1 and msi_status == 0:
            fp += 1
        elif guessed_status == 0 and msi_status == 1:
            fn += 1
        elif guessed_status == 0 and msi_status == 0:
            tn += 1

    print('Summary:')
    print('Loci examined: ' + ('\n'.join(loci)))
    print('Upper threshold: %f' % upper_threshold)
    print('Lower threshold: %f' % lower_threshold)
    print('Number of predictions: %s' % total_files)
    print('Correct predictions: %s' % correct_guesses)
    print('Indeterminate files: %s' % indeterminate_files)
    print('Accuracy: %f' % ratio(correct_guesses, total_files))
    print('True pos: %d' % tp)
    print('True neg: %d' % tn)
    print('False pos: %d' % fp)
    print('False neg: %d' % fn)
    print('Sensitivity: %f' % ratio(tp, tp + fn))
    print('Specificity: %f' % ratio(tn, tn + fp))

    #NOTE: the score histogram that used to be drawn here was already disabled
    #(kept only inside a string literal), so no plot is produced by this function
    return scores
Пример #6
0
    plt.savefig(saveloc)
    plt.clf()


#------------------------------------------------------------ main ---------------------------------------------------------------#
#some paths to useful directories
# Shared path prefixes; every directory below is built from one of these.
_TCGA_BAM_DIR = '/home/upload/msi_project/tcga_bam'
_ML_SET_DIR = _TCGA_BAM_DIR + '/tumor_bams/ml_set'

# Machine-learning data set directories.
mss_msi_fullset = _ML_SET_DIR + '/mss_msi_fullset'
mode_train = _ML_SET_DIR + '/mode_train'
training_set = _ML_SET_DIR + '/training_set'
validation_set = _ML_SET_DIR + '/validation_set'
test_set = _ML_SET_DIR + '/test_set'

# Per-cohort BAM directories.
cr_dir = _TCGA_BAM_DIR + '/COAD-READ'
u_dir = _TCGA_BAM_DIR + '/UCEC'

# BAM lists for the two cohorts, built by make_bam_list from a directory scan.
_CR_BAMS = make_bam_list(bamprocess.scan_files(cr_dir))
_U_BAMS = make_bam_list(bamprocess.scan_files(u_dir))

# The test set directory (same path as test_set) and its 'edited' subdirectory.
test_directory = _ML_SET_DIR + '/test_set'
edited_test = test_directory + '/edited'

# Full locus panel.
loci = [
    'MSI-11',
    'MSI-14',
    'H-10',
    'HSPH1-T17',
    'BAT-26',
    'BAT-25',
    'MSI-04',
    'MSI-06',
    'MSI-07',
    'MSI-01',
    'MSI-03',
    'MSI-09',
    'H-09',
    'H-08',
    'H-01',
    'H-03',
    'H-02',
    'H-04',
    'H-07',
    'H-06',
    'H-05',
]

# Seven-locus subset of the panel.
top_7 = [
    'BAT-26',
    'MSI-07',
    'MSI-09',
    'H-06',
    'MSI-06',
    'MSI-04',
    'HSPH1-T17',
]
top_9 = [
    'BAT-26', 'MSI-07', 'MSI-09', 'H-06', 'MSI-06', 'MSI-04', 'HSPH1-T17',