Exemplo n.º 1
0
def main():
    """Run one demo prediction, plot it, then stop in the debugger.

    Uses the default model (``DEFAULT_MODEL``) and a hard-coded target
    sequence. The predicted profile is handed to ``plotProfiles`` and the
    function then drops into ``pdb`` so the locals can be inspected.
    """
    theta_file = DEFAULT_MODEL
    target_seq = 'CTGAGTAGCTATGCGGCCAGCAGCGAGACGCTCAGCGTGAAGCGGCAGTATCCCTCTTTCCTGCGCACCATCCCCAATC'
    # In this sequence the two bases following index 42 are 'GG'.
    pam_idx = 42
    profile, rep_reads, in_frame = predictMutations(theta_file, target_seq,
                                                    pam_idx)
    plotProfiles([profile], [rep_reads], [pam_idx], [False], ['Predicted'])

    # NOTE(review): interactive breakpoint left in on purpose or by accident?
    # It blocks any non-interactive caller of main() -- confirm it is wanted.
    import pdb
    pdb.set_trace()
Exemplo n.º 2
0
def build_plot_by_profile(filename, profile, oligo_id):
    """Plot a single predicted profile for ``oligo_id`` as a PNG figure.

    The representative reads for the oligo are loaded from ``filename`` via
    ``fetchReads``; the figure title reports the value stored under the
    ``FRAME_SHIFT`` key of those reads.

    Returns whatever ``plotProfiles`` returns.
    """
    reads_for_oligo = dict()
    fetchReads(filename, reads_for_oligo, oligo_id)
    setFigType('png')
    heading = 'In Frame: %.1f%%' % reads_for_oligo[FRAME_SHIFT]
    # 43 is the fixed PAM position used for these plots.
    return plotProfiles([profile], [reads_for_oligo], [43], [False],
                        ['Predicted'],
                        title=heading)
Exemplo n.º 3
0
def plot_predictions(theta_file, target_seq, pam_idx, out_filename=None):
    """Predict the mutation profile for a target, save it, and plot it.

    The target is validated first; the prediction is then obtained from
    ``predictMutations``, written to a summary file and plotted.

    Args:
        theta_file: model file handed to ``predictMutations``.
        target_seq: target sequence made up only of ``A``, ``T``, ``G``
            and ``C``.
        pam_idx: index of the PAM within ``target_seq``; the two bases
            after it (``target_seq[pam_idx + 1:pam_idx + 3]``) must be
            ``'GG'``.
        out_filename: where to write the predicted profile summary. When
            falsy, ``'<target_seq>_<pam_idx>.txt'`` is used.

    Returns:
        The figure returned by ``plotProfiles``.

    Raises:
        ValueError: if the PAM index is out of range, the sequence holds
            characters other than A/T/G/C, the sequence is too short or the
            PAM too close to an edge, or the PAM is not NGG. (``ValueError``
            is a subclass of ``Exception``, which earlier versions raised.)
    """
    seq_len = len(target_seq)

    if pam_idx < 0 or pam_idx >= (seq_len - 3):
        raise ValueError('PAM idx out of range')

    if not all(nt in ('A', 'T', 'G', 'C') for nt in target_seq):
        raise ValueError('Sequence must be composed of A,T,G,or C only')

    if seq_len < 20 or pam_idx < 13 or pam_idx > seq_len - 7:
        raise ValueError(
            'Sequence too short or PAM too close to edge of sequence (must have at least 10nt either side of cut site)'
        )

    if target_seq[pam_idx + 1:pam_idx + 3] != 'GG':
        raise ValueError('Non NGG PAM (check correct index of PAM)')

    profile, rep_reads, in_frame = predictMutations(theta_file, target_seq,
                                                    pam_idx)
    if not out_filename:
        out_filename = '%s_%d.txt' % (target_seq, pam_idx)

    # The context manager closes the file even if writing the profile fails.
    with io.open(out_filename, 'w') as fout:
        fout.write(u'@@@%.1f\n' % in_frame)
        writePredictedProfileToSummary(profile, fout)

    setFigType('png')
    return plotProfiles([profile], [rep_reads], [pam_idx], [False],
                        ['Predicted'],
                        title='In Frame: %.1f%%' % in_frame)
Exemplo n.º 4
0
def computeAndComparePredicted(theta_file,
                               selected_id=None,
                               out_dir='.',
                               start_count=0,
                               end_count=10000):
    """Compare measured and predicted profiles for the validation pairs.

    For every (old, new) oligo pair from ``loadValidationPairs`` the old,
    new, combined, per-sample ("separate") and predicted profiles are
    obtained, and one tab-separated row of mutated-read counts, in-frame
    percentages and symmetric KL divergences is appended to
    ``<out_dir>/old_new_kl_predicted_summaries.txt``.

    "Old" refers to the conventional scaffold library, "new" to the
    improved scaffold library.

    Args:
        theta_file: model file read with ``readTheta``.
        selected_id: when given, only the pair whose old id equals this
            value is processed, and its profiles are also plotted.
        out_dir: directory that receives the summary file.
        start_count: accepted for backward compatibility; not used.
        end_count: accepted for backward compatibility; not used.

    Raises:
        Exception: if any validation pair also appears in the training set
            recorded in ``theta_file``.
    """
    theta, train_set, feature_columns = readTheta(theta_file)

    new_sep_labels = 'New 2x800x', 'New 1600x'
    old_sep_labels = 'Old 2x800x', 'Old 1600x'
    sep_labels = new_sep_labels + old_sep_labels

    # The file name has no format placeholders: applying
    # ``% (start_count, end_count)`` to it raised TypeError on every call.
    out_path = out_dir + '/old_new_kl_predicted_summaries.txt'

    # ``with`` guarantees the file is closed even when the training-data
    # check below raises.
    with io.open(out_path, 'w') as fout:
        fout.write(
            u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\t'
        )
        fout.write(u'\t'.join('%s Mut Reads' % x.split('/')[-1]
                              for x in sep_labels))
        fout.write(
            u'\tOld In Frame Perc\tNew In Frame Perc\tCombined in Frame Perc\tPredicted In Frame Per\t'
        )
        fout.write(u'\t'.join('%s In Frame Perc' % x.split('/')[-1]
                              for x in sep_labels))
        fout.write(
            u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\t'
        )
        fout.write(u'\t'.join('%s vs Predicted KL' % x.split('/')[-1]
                              for x in sep_labels) + '\t')
        fout.write(u'\t'.join([
            '%s vs %s KL' % (x.split('/')[-1], y.split('/')[-1])
            for x, y in (getCombs(new_sep_labels) + getCombs(old_sep_labels))
        ]) + '\n')

        id_pairs = loadValidationPairs()
        for (old_id, new_id) in id_pairs:
            if old_id in train_set or new_id in train_set:
                raise Exception('Bad!!! Testing on Training data: %s %s' %
                                (old_id, new_id))

            if selected_id is not None and selected_id != old_id:
                continue  # Only the selected guide pair is wanted

            # Load old and new profiles, and combine the two
            p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(
                old_id, new_id)
            p_comb, mut_reads_comb = combineProfiles(p_old, p_new,
                                                     mut_reads_old,
                                                     mut_reads_new)

            # Predict the profile (old and new will be the same so just do
            # one)
            feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
            p_predict, _ = computePredictedProfile(feature_data, theta,
                                                   feature_columns)

            # Load the per-sample profiles too
            p_old_sep, p_new_sep, old_sep_mr, new_sep_mr = loadProfilesSeparately(
                old_id, new_id)

            # In-frame percentages; -1 marks a per-sample profile with at
            # most one entry.
            old_if_perc = getInFramePerc(p_old)
            new_if_perc = getInFramePerc(p_new)
            comb_if_perc = getInFramePerc(p_comb)
            pred_if_perc = getInFramePerc(p_predict)
            new_sep_if_percs = [
                getInFramePerc(profile) if len(profile) > 1 else -1
                for profile in p_new_sep
            ]
            old_sep_if_percs = [
                getInFramePerc(profile) if len(profile) > 1 else -1
                for profile in p_old_sep
            ]

            # Plot the comparison for the selected pair
            if selected_id is not None:
                rrds = loadRepReads(new_id)
                plotProfiles([p_new_sep[0], p_new_sep[1], p_predict],
                             [rrds, rrds, rrds], [56, 56, 56],
                             [False, False, False],
                             ['Replicate 1', 'Replicate 2', 'Predicted'],
                             title='%s (KL=%.2f, KL=%.2f)' %
                             (new_id, symmetricKL(p_new_sep[0], p_new_sep[1]),
                              symmetricKL(p_new, p_predict)))

            str_args = (symmetricKL(p_old, p_new),
                        symmetricKL(p_old, p_predict),
                        symmetricKL(p_new, p_predict),
                        symmetricKL(p_comb, p_predict))
            kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f\t' % str_args
            kl_str += u'\t'.join([
                '%.5f' % symmetricKL(p_predict, x)
                for x in p_new_sep + p_old_sep
            ])
            kl_str += u'\t' + u'\t'.join([
                '%.5f' % symmetricKL(x, y)
                for (x, y) in (getCombs(p_new_sep) + getCombs(p_old_sep))
            ])
            if_str = u'\t'.join(
                ['%.3f' % x for x in new_sep_if_percs + old_sep_if_percs])
            mut_str = u'\t'.join(['%d' % x for x in new_sep_mr + old_sep_mr])
            fout.write(
                u'%s\t%s\t%d\t%d\t%d\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%s%s\n' %
                (old_id, new_id, mut_reads_old, mut_reads_new,
                 mut_reads_comb, mut_str, old_if_perc, new_if_perc,
                 comb_if_perc, pred_if_perc, if_str, kl_str))
            # Flush per row so partial results are on disk during long runs.
            fout.flush()
Exemplo n.º 5
0
def build_plot_by_profile(filename, profile, oligo_id):
    """Plot a single predicted profile for ``oligo_id`` as a PNG figure.

    The representative reads for the oligo are loaded from ``filename`` via
    ``fetchReads`` and passed to ``plotProfiles`` alongside ``profile``.

    Returns whatever ``plotProfiles`` returns.
    """
    reads_for_oligo = dict()
    fetchReads(filename, reads_for_oligo, oligo_id)
    setFigType('png')
    # 43 is the fixed PAM position used for these plots.
    return plotProfiles([profile], [reads_for_oligo], [43], [False],
                        ['Predicted'])
Exemplo n.º 6
0
    print('Predicting mutations...')
    p_predict, rep_reads, in_frame_perc = predictMutations(
        theta_file, target_seq, pam_idx)
    print('Writing to file...')
    writeProfilesToFile(out_prefix,
                        [('Test Guide', p_predict, rep_reads, in_frame_perc)],
                        write_rr=True)
    print('Done!')


def predictMutationsBulk(target_file, out_prefix, theta_file=DEFAULT_MODEL):
    """Predict profiles for every target in ``target_file`` and save them.

    ``target_file`` is a tab-delimited file with the columns
    ID, Target, PAM Index. Results from ``predictProfilesBulk`` are written
    with ``writeProfilesToFile`` under ``out_prefix``, representative reads
    included (``write_rr=True``). Progress is reported on stdout.
    """
    print('Predicting mutations...')
    bulk_results = predictProfilesBulk(theta_file, target_file)

    print('Writing to file...')
    writeProfilesToFile(out_prefix, bulk_results, write_rr=True)

    print('Done!')


# Script entry point: run one demo prediction with the default model, plot
# it, then stop in the debugger.
if __name__ == '__main__':

    theta_file = DEFAULT_MODEL
    target_seq = 'CTGAGTAGCTATGCGGCCAGCAGCGAGACGCTCAGCGTGAAGCGGCAGTATCCCTCTTTCCTGCGCACCATCCCCAATC'
    # In this sequence the two bases following index 42 are 'GG'.
    pam_idx = 42
    profile, rep_reads, in_frame = predictMutations(theta_file, target_seq,
                                                    pam_idx)
    plotProfiles([profile], [rep_reads], [pam_idx], [False], ['Predicted'])

    # NOTE(review): interactive breakpoint left in on purpose or by accident?
    # It blocks non-interactive runs of this script -- confirm it is wanted.
    import pdb
    pdb.set_trace()
Exemplo n.º 7
0
def compareOverbeekProfiles(
        selected_overbeek_id=None,
        pred_results_dir='../indel_prediction/model_testing'):
    """Compare endogenous (Overbeek) indel profiles with our measured ones.

    For each id ``Overbeek1`` .. ``Overbeek96`` present in ``loadMappings()``
    the endogenous profile is read, our profiles for the mapped oligos are
    accumulated over the old- and new-library sample directories, and the
    symmetric KL divergence between them is recorded.

    Args:
        selected_overbeek_id: when given, only that id is processed and its
            replicate and endogenous profiles are plotted with
            ``plotProfiles``. When ``None``, all ids are processed and the
            summary figures (in-frame plot, KL scatter, KL heat map) are
            produced instead.
        pred_results_dir: passed through to ``plotInFrame``.
    """
    new_dirs = [
        'ST_June_2017/data/K562_800x_LV7A_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7A_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_LV7B_DPI5/mapped_reads/Oligos_71',
        'ST_Feb_2018/data/CAS9_12NA_1600X_DPI7/mapped_reads/Oligos_71'
    ]

    # Old samples
    old_dirs = [
        'ST_June_2017/data/K562_1600x_6OA_DPI5/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_6OA_DPI7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI3_Old7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI7_Old8/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI10_Old9/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI3_Old10/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI7_Old11/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI10_Old12/mapped_reads/Oligos_71'
    ]
    remove_long_indels = False
    remove_wt, wt_thresh = True, 3.0
    mappings = loadMappings()

    # Profiles and ids kept for the summary figures
    all_overbeek_profiles, all_new_profiles = [], []
    all_old_profiles, all_our_profiles = [], []
    sel_overbeek_ids, oldnew_overbeek_ids, old_ids, new_ids = [], [], [], []

    overbeek_inframes, ours_inframes, oof_sel_overbeek_ids = [], [], []

    # Per-id statistics, all appended in lock step
    kls, kls_old, kls_new = [], [], []
    log_reads, log_reads_new, log_reads_old, min_log_reads = [], [], [], []
    overbeek_ids, above30_percentages = [], []

    for idx in range(1, 97):

        overbeek_id = 'Overbeek%d' % idx
        if selected_overbeek_id is not None and selected_overbeek_id != overbeek_id:
            continue
        if overbeek_id not in mappings:
            continue

        overbeek_filename = getHighDataDir(
        ) + '/overbeek_fastq_files/' + overbeek_id + '_mappedindelsummary.txt'

        p1, p1_new, p1_old, o1, rep_reads1, rep_reads2 = {}, {}, {}, {}, {}, {}
        nreads2, nreads1, nreads_old, nreads_new = 0, 0, 0, 0
        nnull_old, nnull_new, nnull1, nnull2 = 0, 0, 0, 0

        # Read the Overbeek profile
        numread2, perc_accept2, num_null2 = readSummaryToProfile(
            overbeek_filename,
            o1,
            oligoid=overbeek_id,
            remove_long_indels=remove_long_indels,
            remove_wt=False)
        if selected_overbeek_id is not None:
            fetchRepresentativeCleanReads(
                getHighDataDir() + '/overbeek_fastq_files/' + overbeek_id +
                '_mappedindelprofiles.txt',
                rep_reads2,
                oligoid=overbeek_id)
            pam_loc2, pam_dir2 = getNullTargetPamDetails(
                getHighDataDir() + '/overbeek_control_fastq_files/' +
                overbeek_id + '_exptargets.txt',
                oligoid=overbeek_id)
        nreads2 += numread2
        nnull2 += num_null2

        if numread2 == 0:
            continue

        # Index 0 collects the 800x DPI7 samples, index 1 the other DPI7
        # samples (see rep_idx below).
        p1_new_reps, p1_old_reps = [{}, {}], [{}, {}]
        rr_new_reps, rr_old_reps = [{}, {}], [{}, {}]
        # Read all the new and old profiles
        pam_loc1, pam_dir1 = None, None
        for oligo_id, is_old in mappings[overbeek_id]:

            # Read all reads for all our K562 profiles.
            # The numeric suffix of the oligo id is parsed with int() rather
            # than eval(), which would execute arbitrary text from the
            # mapping.
            oligo_idx = int(oligo_id[5:])
            _, oligo_fileprefix = getFileForOligoIdx(oligo_idx, ext='')
            oligo_filename = oligo_fileprefix + '_mappedindelsummary.txt'
            read_filename = oligo_fileprefix + '_mappedindelprofiles.txt'
            exptarget_filename = oligo_fileprefix + '_exptargets.txt'
            if is_old:
                oligo_dirs, p1_old_new, null_oligo_dir = old_dirs, p1_old, 'ST_April_2017/data/NULL_Old/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_old_reps, rr_old_reps
            else:
                oligo_dirs, p1_old_new, null_oligo_dir = new_dirs, p1_new, 'ST_April_2017/data/NULL_New/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_new_reps, rr_new_reps

            for oligo_dir in [getHighDataDir() + '/' + x for x in oligo_dirs]:
                # The same summary is read twice: once into the old-only or
                # new-only profile, once into the overall profile p1.
                nr1, pa1, nn1 = readSummaryToProfile(
                    oligo_dir + '/' + oligo_filename,
                    p1_old_new,
                    oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt,
                    wt_thresh=wt_thresh)
                numread1, perc_accept1, num_null1 = readSummaryToProfile(
                    oligo_dir + '/' + oligo_filename,
                    p1,
                    oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt,
                    wt_thresh=wt_thresh)
                if 'DPI7' in oligo_dir:
                    rep_idx = 0 if '800x' in oligo_dir else 1
                    nr_rep, pa_rep, nn_rep = readSummaryToProfile(
                        oligo_dir + '/' + oligo_filename,
                        p1_reps[rep_idx],
                        oligoid=oligo_id,
                        remove_long_indels=remove_long_indels,
                        remove_wt=remove_wt,
                        wt_thresh=wt_thresh)
                if selected_overbeek_id is not None:
                    fetchRepresentativeCleanReads(oligo_dir + '/' +
                                                  read_filename,
                                                  rep_reads1,
                                                  oligoid=oligo_id)
                    if 'DPI7' in oligo_dir:
                        fetchRepresentativeCleanReads(oligo_dir + '/' +
                                                      read_filename,
                                                      rr_reps[rep_idx],
                                                      oligoid=oligo_id)
                    if pam_loc1 is None:
                        pam_loc1, pam_dir1 = getNullTargetPamDetails(
                            getHighDataDir() + '/' + null_oligo_dir + '/' +
                            exptarget_filename,
                            oligoid=oligo_id)
                if is_old:
                    nreads_old += numread1
                    nnull_old += num_null1
                else:
                    nreads_new += numread1
                    nnull_new += num_null1
                nreads1 += numread1
                nnull1 += num_null1

        kls.append(symmetricKL(p1, o1, True))
        kls_old.append(symmetricKL(p1_old, o1, True))
        kls_new.append(symmetricKL(p1_new, o1, True))

        # +0.5 keeps log10 finite when there are no non-null reads
        log_reads.append(np.log10(nreads1 - nnull1 + 0.5))
        log_reads_old.append(np.log10(nreads_old - nnull_old + 0.5))
        log_reads_new.append(np.log10(nreads_new - nnull_new + 0.5))
        min_log_reads.append(min(log_reads_old[-1], log_reads_new[-1]))
        above30_percentages.append(computePercAbove30(o1))
        overbeek_ids.append(overbeek_id)

        if log_reads[-1] > 2.0:
            all_overbeek_profiles.append(o1)
            all_our_profiles.append(p1)
            # Drop the 'Overbeek' prefix to keep tick labels short
            sel_overbeek_ids.append(overbeek_id[8:])
            if above30_percentages[-1] < 50.0:
                oif, oof, _ = fetchIndelSizeCounts(o1)
                pif, pof, _ = fetchIndelSizeCounts(p1)
                overbeek_inframes.append(oif * 100.0 / (oif + oof))
                ours_inframes.append(pif * 100.0 / (pif + pof))
                oof_sel_overbeek_ids.append(overbeek_id)

        if min_log_reads[-1] > 2.0:
            all_new_profiles.append(p1_new)
            all_old_profiles.append(p1_old)
            oldnew_overbeek_ids.append(overbeek_id)
            old_ids.append([
                mapped_id for mapped_id, mapped_is_old in mappings[overbeek_id]
                if mapped_is_old
            ][0])
            new_ids.append([
                mapped_id for mapped_id, mapped_is_old in mappings[overbeek_id]
                if not mapped_is_old
            ][0])

        # overbeek_id is known to be in mappings here (checked above), so
        # no KeyError handling is needed.
        print(overbeek_id, [x for (x, y) in mappings[overbeek_id]],
              kls[-1], nreads2, nreads1)

        if selected_overbeek_id is not None:
            title = '%s (KL=%.1f)' % (overbeek_id, kls[-1])
            labels = [
                'Conventional scaffold Rep A', 'Conventional scaffold  Rep B',
                'Improved scaffold Rep A', 'Improved scaffold  Rep B',
                'Endogenous Profile'
            ]
            # The fourth profile must be replicate B (index 1) so that it
            # matches its label and its reads (rr_new_reps[1]).
            plotProfiles([
                p1_old_reps[0], p1_old_reps[1], p1_new_reps[0], p1_new_reps[1],
                o1
            ], [
                rr_old_reps[0], rr_old_reps[1], rr_new_reps[0], rr_new_reps[1],
                rep_reads2
            ], [pam_loc1, pam_loc1, pam_loc1, pam_loc1, pam_loc2], [
                x == 'REVERSE'
                for x in [pam_dir1, pam_dir1, pam_dir1, pam_dir1, pam_dir2]
            ],
                         labels,
                         title=title)

    if selected_overbeek_id is None:

        plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids,
                    pred_results_dir)

        # Scatter of KL against read depth, one series per band of
        # "percentage of deletions > 30".
        PL.figure(figsize=(5.5, 5))
        for thr_l, thr_h in [(0.0, 10.0), (10.0, 20.0), (20.0, 50.0),
                             (50.0, 90.0), (90.0, 100.0)]:
            in_band = [(reads, kl, ob_id)
                       for (kl, a30, ob_id, reads) in zip(
                           kls, above30_percentages, overbeek_ids, log_reads)
                       if a30 > thr_l and a30 <= thr_h]
            xdata = [reads for (reads, kl, ob_id) in in_band]
            ydata = [kl for (reads, kl, ob_id) in in_band]
            sel_ids = [ob_id for (reads, kl, ob_id) in in_band]
            PL.plot(xdata,
                    ydata,
                    'o',
                    label='%d-%d%% Deletions > 30' % (thr_l, thr_h))
            # Label the outliers only
            for x, y, ob_id in zip(xdata, ydata, sel_ids):
                if y > 3 and x > 2:
                    PL.text(x, y, ob_id)
        PL.legend()
        PL.plot([0, 6], [0.77, 0.77], '--', color='grey')
        PL.text(0.1, 0.5, 'Median between our replicates', color='grey')
        PL.ylabel('Symmetric KL Divergence', fontsize=12)
        PL.xlabel('Log10 Mutated Reads', fontsize=12)
        PL.xlim((0, 5.5))
        PL.ylim((0, 8))
        PL.show(block=False)
        saveFig('scatter_KL')

        print('Median=', np.median(kls), 'Mean KL=', np.mean(kls))
        print(len(overbeek_ids))

        # Compute pairwise KL between overbeek and ours
        N = len(sel_overbeek_ids)
        kl_mat = np.zeros((N, N))
        for i, o1 in enumerate(all_overbeek_profiles):
            for j, p1 in enumerate(all_our_profiles):
                kl_mat[i, j] = symmetricKL(o1, p1)
        PL.figure(figsize=(8, 6))
        PL.imshow(kl_mat,
                  cmap='hot_r',
                  vmin=0.0,
                  vmax=3.0,
                  interpolation='nearest')
        PL.xticks(range(N), sel_overbeek_ids, rotation='vertical', fontsize=6)
        PL.yticks(range(N),
                  sel_overbeek_ids,
                  rotation='horizontal',
                  fontsize=6)
        PL.xlabel('Synthetic Measurement', fontsize=12)
        PL.ylabel('Endogenous Measurement', fontsize=12)
        PL.title('KL', fontsize=12)
        PL.colorbar()
        PL.show(block=False)
        saveFig('heatmap_KL')
Exemplo n.º 8
0
def computeAndComparePredicted(theta_file, selected_id=None, out_dir='.'):
    """Compare measured and predicted profiles for the validation pairs.

    For every (old, new) oligo pair from ``loadValidationPairs`` the old,
    new, combined and predicted profiles are obtained, and one
    tab-separated row of mutated-read counts, in-frame percentages and
    symmetric KL divergences is appended to
    ``<out_dir>/old_new_kl_predicted_summaries.txt``.

    "Old" refers to the conventional scaffold library, "new" to the
    improved scaffold library.

    Args:
        theta_file: model file read with ``readTheta``.
        selected_id: when given, only the pair whose old id equals this
            value is processed, and its profiles are also plotted.
        out_dir: directory that receives the summary file.

    Raises:
        Exception: if any validation pair also appears in the training set
            recorded in ``theta_file``.
    """
    theta, train_set, feature_columns = readTheta(theta_file)

    # ``with`` guarantees the file is closed even when the training-data
    # check below raises.
    with io.open(out_dir + '/old_new_kl_predicted_summaries.txt',
                 'w') as fout:
        fout.write(
            u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\tOld In Frame Perc\tNew In Frame Perc\tCombined in Frame Perc\tPredicted In Frame Per'
        )
        fout.write(
            u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\n'
        )

        id_pairs = loadValidationPairs()
        for (old_id, new_id) in id_pairs:
            if old_id in train_set or new_id in train_set:
                raise Exception('Bad!!! Testing on Training data: %s %s' %
                                (old_id, new_id))

            if selected_id is not None and selected_id != old_id:
                continue  # Only the selected guide pair is wanted

            # Load old and new profiles, and combine the two
            p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(
                old_id, new_id)
            p_comb, mut_reads_comb = combineProfiles(p_old, p_new,
                                                     mut_reads_old,
                                                     mut_reads_new)

            # Predict the profile (old and new will be the same so just do
            # one)
            feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
            p_predict, _ = computePredictedProfile(feature_data, theta,
                                                   feature_columns)

            # In-frame percentage = in-frame / (in-frame + out-of-frame)
            old_if, old_of, _ = fetchIndelSizeCounts(p_old)
            new_if, new_of, _ = fetchIndelSizeCounts(p_new)
            comb_if, comb_of, _ = fetchIndelSizeCounts(p_comb)
            pred_if, pred_of, _ = fetchIndelSizeCounts(p_predict)
            old_if_perc = old_if * 100.0 / (old_if + old_of)
            new_if_perc = new_if * 100.0 / (new_if + new_of)
            comb_if_perc = comb_if * 100.0 / (comb_if + comb_of)
            pred_if_perc = pred_if * 100.0 / (pred_if + pred_of)

            # Plot the comparison for the selected pair
            if selected_id is not None:
                rrds = loadRepReads(new_id)
                plotProfiles([p_old, p_new, p_predict], [rrds, rrds, rrds],
                             [42, 42, 42], [False, False, False],
                             ['Replicate 1', 'Replicate 2', 'Predicted'],
                             title='%s (KL=%.2f, KL=%.2f)' %
                             (new_id, symmetricKL(p_old, p_new),
                              symmetricKL(p_comb, p_predict)))

            str_args = (symmetricKL(p_old, p_new),
                        symmetricKL(p_old, p_predict),
                        symmetricKL(p_new, p_predict),
                        symmetricKL(p_comb, p_predict))
            kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f' % str_args
            fout.write(
                u'%s\t%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f%s\n' %
                (old_id, new_id, mut_reads_old, mut_reads_new, mut_reads_comb,
                 old_if_perc, new_if_perc, comb_if_perc, pred_if_perc, kl_str))
            # Flush per row so partial results are on disk during long runs.
            fout.flush()