Exemplo n.º 1
0
def MI_get_pvalue_and_zscore(active_profile, discr_exp_profile, nbins,
                             current_MI, n_permutations):
    """Run a permutation test for an observed mutual-information value.

    The expression profile is shuffled ``n_permutations`` times with
    ``np.random.permutation``; for each shuffle the MI against
    ``active_profile`` is recomputed via ``MI.mut_info`` (called with
    ``x_bins=2`` and ``y_bins=nbins``).

    Args:
        active_profile: profile passed as the first argument of
            ``MI.mut_info``.
        discr_exp_profile: profile that gets shuffled on every iteration.
        nbins: forwarded to ``MI.mut_info`` as ``y_bins``.
        current_MI: the observed MI value being tested.
        n_permutations: number of shuffles to perform.

    Returns:
        ``(pvalue, z_score)`` where ``pvalue`` is the fraction of permuted
        MI values that are greater than or equal to ``current_MI`` and
        ``z_score`` is ``(current_MI - mean) / std`` of the permuted values.

    Note:
        The z-score divides by the raw standard deviation without any guard,
        so it is not finite when all permuted values are identical.
    """
    null_MIs = np.zeros(n_permutations, dtype=np.float64)
    for perm_idx in range(n_permutations):
        null_MIs[perm_idx] = MI.mut_info(
            active_profile,
            np.random.permutation(discr_exp_profile),
            x_bins=2,
            y_bins=nbins)

    # Ascending order lets us count the upper tail by walking from the end.
    null_MIs.sort()

    if current_MI < null_MIs[0]:
        # Every permuted value beats the observed one.
        n_at_least_as_high = n_permutations
    else:
        n_at_least_as_high = 0
        for permuted_MI in null_MIs[::-1]:
            if not (current_MI <= permuted_MI):
                break
            n_at_least_as_high += 1

    pvalue = n_at_least_as_high / float(n_permutations)

    null_mean = np.mean(null_MIs)
    null_std = np.std(null_MIs)
    z_score = (current_MI - null_mean) / null_std

    return pvalue, z_score
Exemplo n.º 2
0
def min_CI_normalized_test(counter,
                           accepted_seeds_list,
                           profiles_passed,
                           discr_exp_profile,
                           nbins,
                           index_array,
                           min_ratio,
                           do_print=False):
    """Check a seed against already accepted seeds using a normalized CI ratio.

    For every accepted seed the function computes
    ``MI.cond_mut_info(current, expression, accepted)`` divided by
    ``MI.mut_info(current, accepted)`` and rejects the current seed as soon
    as that ratio falls below ``min_ratio``.

    Args:
        counter: index into ``profiles_passed`` of the seed being tested.
        accepted_seeds_list: indices into ``profiles_passed`` of the seeds
            accepted so far.
        profiles_passed: indexable collection of full profiles.
        discr_exp_profile: profile passed to ``MI.cond_mut_info`` as the
            second variable (``y_bins=nbins``).
        nbins: number of bins forwarded as ``y_bins``.
        index_array: index applied to each full profile before use.
        min_ratio: threshold below which the seed is considered redundant.
        do_print: when true, print the ratio for every comparison.

    Returns:
        ``(False, i)`` where ``i`` is the position in ``accepted_seeds_list``
        of the first accepted seed whose ratio is below ``min_ratio``;
        ``(True, 0)`` if no such seed exists.
    """
    profile_being_analyzed = profiles_passed[counter][index_array]

    for i, accepted_seed in enumerate(accepted_seeds_list):
        ith_accepted_profile = profiles_passed[accepted_seed][index_array]

        cond_inf = MI.cond_mut_info(profile_being_analyzed,
                                    discr_exp_profile,
                                    ith_accepted_profile,
                                    x_bins=2,
                                    y_bins=nbins,
                                    z_bins=2)
        mut_inf = MI.mut_info(profile_being_analyzed,
                              ith_accepted_profile,
                              x_bins=2,
                              y_bins=2)
        # Avoid dividing by (near) zero when the two profiles share no MI.
        if np.isclose(mut_inf, 0., atol=1e-16):
            mut_inf = 1e-16
        ratio = cond_inf / mut_inf

        # The diagnostic output honours do_print, like the rest of the module.
        if do_print:
            print("Comparing seed #%d to an existing seed #%d. The ratio is %.2f" %
                  (counter, i, ratio))

        if ratio < min_ratio:
            # Position (within accepted_seeds_list) of the similar seed.
            return False, i
    return True, 0
Exemplo n.º 3
0
def calculate_MI_for_seeds(decompressed_profiles_array,
                           index_array,
                           discr_exp_profile,
                           nbins,
                           min_occurences,
                           do_print=False):
    """Compute the MI of every seed profile against the expression profile.

    Each profile is first restricted with ``index_array``. Profiles whose
    restricted sum is not greater than ``min_occurences`` are not scored and
    receive ``MASK_OUT_SEED_VALUE`` instead.

    Args:
        decompressed_profiles_array: array whose first axis enumerates seeds.
        index_array: index applied to each profile before scoring.
        discr_exp_profile: second argument of ``MI.mut_info``.
        nbins: forwarded to ``MI.mut_info`` as ``y_bins``.
        min_occurences: occurrence threshold (inclusive) for masking a seed.
        do_print: when true, report progress for scored seeds whose index is
            a positive multiple of 1000.

    Returns:
        A float64 array with one entry per seed. Values are stored as float32
        during the computation and widened on return.
    """
    n_seeds = decompressed_profiles_array.shape[0]
    mi_single_precision = np.zeros(n_seeds, dtype=np.float32)

    for seed_idx, full_profile in enumerate(decompressed_profiles_array):
        restricted = full_profile[index_array]

        if restricted.sum() <= min_occurences:
            # Too few occurrences: mark the seed instead of scoring it.
            mi_single_precision[seed_idx] = MASK_OUT_SEED_VALUE
        else:
            mi_single_precision[seed_idx] = MI.mut_info(restricted,
                                                        discr_exp_profile,
                                                        x_bins=2,
                                                        y_bins=nbins)
            if do_print and seed_idx % 1000 == 0 and seed_idx > 0:
                print("Profile number %d has been calculated" % seed_idx)

    # Hand back a float64 copy so all elements have the same size.
    return mi_single_precision.astype(np.float64)
Exemplo n.º 4
0
def optimize_motifs(seeds_initial, profiles_initial,
                    discr_exp_profile, nbins, index_array, seqs_of_interest,
                    args, do_print = True):
    """Optimize, elongate, score and robustness-check every initial seed.

    For each seed the pipeline is: convert to the "n" representation,
    compute the starting MI, run ``optimize_motif_sequence``, run
    ``elongate_motif``, recompute the final profile and statistics with
    ``get_characteristics`` and finally call ``check_robustness``.

    Args:
        seeds_initial: sequence of seeds in the "w" representation.
        profiles_initial: indexable collection of profiles, parallel to
            ``seeds_initial``.
        discr_exp_profile: expression profile; its first dimension sets the
            width of the returned profile matrix.
        nbins: forwarded to ``MI.mut_info`` as ``y_bins`` and to the helpers.
        index_array: index applied to each initial profile.
        seqs_of_interest: forwarded to the optimization helpers.
        args: options object; ``args.random_noseed`` is read here, the rest
            is forwarded to the helpers.
        do_print: when true, print progress messages.

    Returns:
        ``(seeds_optimized, profiles_optimized, seed_charact_array,
        robustness_array)``. ``seed_charact_array`` has one row per seed
        holding MI, p-value and z-score; ``robustness_array`` holds the
        result of ``check_robustness`` per seed.
    """
    n_seeds = len(seeds_initial)
    seeds_optimized = copy.deepcopy(seeds_initial)
    profiles_optimized = np.zeros((n_seeds, discr_exp_profile.shape[0]), dtype=bool)
    # columns: MI value, p-value, z-score
    seed_charact_array = np.zeros((n_seeds, 3), dtype=np.float64)
    robustness_array = np.zeros(n_seeds, dtype=bool)

    for seed_idx, seed in enumerate(seeds_initial):
        seed_profile = profiles_initial[seed_idx][index_array]
        n_bestmotif = type_conversions.w_to_n_motif(seed)

        # Starting point for the optimization.
        starting_MI = MI.mut_info(seed_profile, discr_exp_profile, x_bins=2, y_bins=nbins)
        lastmyfreq = seed_profile.sum() / float(seed_profile.shape[0])

        if do_print:
            w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
            seed_sequence = w_bestmotif.print_sequence(return_string=True)
            print("Optimzing the sequence of motif %d (sequence is %s). Initial MI = %.5f" %
                  (seed_idx, seed_sequence, starting_MI))

        bestmi, lastmyfreq, n_bestmotif = optimize_motif_sequence(
            n_bestmotif, starting_MI, seqs_of_interest,
            discr_exp_profile, nbins, lastmyfreq, args,
            do_print = do_print,
            random_noseed = args.random_noseed)

        if do_print:
            print("Elongating motif %d" % seed_idx)

        bestmi, lastmyfreq, n_bestmotif = elongate_motif(
            n_bestmotif, bestmi, seqs_of_interest,
            discr_exp_profile, nbins, lastmyfreq, args,
            do_print = do_print)

        w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
        final_stats = get_characteristics(n_bestmotif, seqs_of_interest,
                                          discr_exp_profile, nbins, args,
                                          do_print=do_print)
        bestmotif_profile, bestmotif_mi, pvalue, z_score = final_stats

        if do_print:
            optimized_sequence = w_bestmotif.print_sequence(return_string=True)
            print("Checking robustness of the optimized motif %d (sequence %s)" %
                  (seed_idx, optimized_sequence))

        is_robust = check_robustness(bestmotif_profile,
                                     discr_exp_profile, nbins, args,
                                     do_print = do_print)

        seeds_optimized[seed_idx] = w_bestmotif
        profiles_optimized[seed_idx] = bestmotif_profile.values
        seed_charact_array[seed_idx, :] = np.array([bestmotif_mi, pvalue, z_score],
                                                   dtype=np.float64)
        robustness_array[seed_idx] = is_robust

    return (seeds_optimized, profiles_optimized,
            seed_charact_array, robustness_array)
Exemplo n.º 5
0
def get_characteristics(n_bestmotif, seqs_of_interest,
                        discr_exp_profile, nbins, args,
                        do_print = False):
    """Compute the profile, MI, p-value and z-score of a motif.

    Args:
        n_bestmotif: motif passed to
            ``matchmaker.calculate_profile_one_motif``.
        seqs_of_interest: sequences the motif is matched against.
        discr_exp_profile: second argument of ``MI.mut_info`` and of the
            permutation test.
        nbins: forwarded as ``y_bins`` / ``nbins``.
        args: options object; ``args.n_permutations`` is read.
        do_print: when true, print the final p-value and z-score.

    Returns:
        ``(bestmotif_profile, bestmotif_mi, pvalue, z_score)``.
    """
    bestmotif_profile, _elapsed = matchmaker.calculate_profile_one_motif(
        n_bestmotif, seqs_of_interest, is_degenerate = True)

    bestmotif_mi = MI.mut_info(bestmotif_profile.values,
                               discr_exp_profile,
                               x_bins=2,
                               y_bins=nbins)

    pvalue, z_score = statistic_tests.MI_get_pvalue_and_zscore(
        bestmotif_profile.values, discr_exp_profile, nbins,
        bestmotif_mi, args.n_permutations)

    if do_print:
        print("The final p-value is: %.4f, z-score is: %.3f" % (pvalue, z_score))

    return bestmotif_profile, bestmotif_mi, pvalue, z_score
Exemplo n.º 6
0
def calculate_MIs_all_seeds(profiles_passed, discr_exp_profile, index_array,
                            nbins):
    """Compute the MI of every profile against the expression profile.

    Args:
        profiles_passed: array whose first axis enumerates profiles.
        discr_exp_profile: second argument of ``MI.mut_info``.
        index_array: index applied to each profile before scoring.
        nbins: forwarded to ``MI.mut_info`` as ``y_bins``.

    Returns:
        A float32 array with one MI value per profile.
    """
    n_profiles = profiles_passed.shape[0]
    mi_per_seed = np.zeros(n_profiles, dtype=np.float32)

    for seed_idx, full_profile in enumerate(profiles_passed):
        restricted = full_profile[index_array]
        seed_mi = MI.mut_info(restricted,
                              discr_exp_profile,
                              x_bins=2,
                              y_bins=nbins)
        mi_per_seed[seed_idx] = seed_mi

    return mi_per_seed
Exemplo n.º 7
0
def get_current_statistics(index, MI_values_array, profiles_array, index_array,
                           discr_exp_profile, args):
    """Return the permutation-test p-value and z-score for one seed.

    Args:
        index: position of the seed in ``MI_values_array`` and
            ``profiles_array``.
        MI_values_array: precomputed MI value per seed.
        profiles_array: indexable collection of full profiles.
        index_array: index applied to the selected profile.
        discr_exp_profile: expression profile used for the MI computation.
        args: options object; ``max_pvalue``, ``min_zscore`` and
            ``n_permutations`` are read.

    Returns:
        ``(pvalue, z_score)``. When the stored MI equals ``-1`` the function
        returns ``(args.max_pvalue + 0.1, args.min_zscore - 0.1)`` without
        running the permutation test.
    """
    active_profile = profiles_array[index][index_array]
    current_MI = MI_values_array[index]

    # -1 is presumably the masked-out marker -- confirm against the producer
    # of MI_values_array. Such seeds get values just outside both thresholds.
    if current_MI == -1:
        failing_pvalue = args.max_pvalue + 0.1
        failing_zscore = args.min_zscore - 0.1
        return failing_pvalue, failing_zscore

    # Sanity check: the stored MI must match a fresh computation.
    # NOTE(review): mut_info is called here without explicit bin counts and
    # the permutation test below receives four positional arguments; confirm
    # both match the signatures of MI.mut_info and
    # statistic_tests.MI_get_pvalue_and_zscore in use.
    assert np.isclose(current_MI,
                      MI.mut_info(active_profile, discr_exp_profile),
                      rtol=1e-10)

    perm_pvalue, perm_zscore = statistic_tests.MI_get_pvalue_and_zscore(
        active_profile, discr_exp_profile, current_MI, args.n_permutations)
    return perm_pvalue, perm_zscore
Exemplo n.º 8
0
def jackknife_test(active_profile,
                   discr_exp_profile,
                   nbins,
                   n_permutations,
                   max_pvalue,
                   n_samples,
                   fraction_retain,
                   min_fraction_passed,
                   do_print=False):
    """Check how robust a profile's MI significance is to subsampling.

    ``n_samples`` times, a random subset of positions (without replacement)
    is drawn, the MI between the subsampled profile and the equally
    subsampled expression profile is computed, and a permutation test is run
    on that subsample. The profile passes when the fraction of subsamples
    with ``pvalue < max_pvalue`` reaches ``min_fraction_passed``.

    Args:
        active_profile: profile array; its first dimension defines the
            number of positions.
        discr_exp_profile: expression profile indexed with the same
            positions as ``active_profile``.
        nbins: forwarded to ``MI.mut_info`` as ``y_bins`` and to the
            permutation test.
        n_permutations: number of permutations per subsample.
        max_pvalue: p-value threshold (exclusive) for a subsample to pass.
        n_samples: number of subsamples to draw.
        fraction_retain: fraction of positions kept in each subsample
            (truncated to an integer count).
        min_fraction_passed: minimal fraction of passing subsamples.
        do_print: when true, print per-iteration and summary diagnostics.

    Returns:
        ``True`` if enough subsamples passed, ``False`` otherwise.
    """
    # Both quantities are identical for every subsample, so compute them once.
    n_positions = active_profile.shape[0]
    full_indices_array = np.arange(n_positions)
    how_many_keep = int(fraction_retain * n_positions)

    total_number_passed = 0

    for j in range(n_samples):
        subsampl_index_array = np.random.choice(full_indices_array,
                                                size=how_many_keep,
                                                replace=False)
        curr_profile = active_profile[subsampl_index_array]
        curr_exp_profile = discr_exp_profile[subsampl_index_array]
        curr_MI = MI.mut_info(curr_profile,
                              curr_exp_profile,
                              x_bins=2,
                              y_bins=nbins)
        # The null distribution must be built from the same subsample that
        # produced curr_MI; passing the full expression profile here would
        # pair vectors of different length and content.
        pvalue, z_score = MI_get_pvalue_and_zscore(curr_profile,
                                                   curr_exp_profile, nbins,
                                                   curr_MI, n_permutations)
        if do_print:
            print(
                "Iteration %d. p-value: %.5f; max_pvalue: %.5f, z-score: %.2f"
                % (j, pvalue, max_pvalue, z_score))
        if pvalue < max_pvalue:
            total_number_passed += 1

    fraction_passed = total_number_passed / float(n_samples)
    if do_print:
        print("%.2f subsamples passed the test; required fraction is %.2f" %
              (fraction_passed, min_fraction_passed))

    passed = bool(fraction_passed >= min_fraction_passed)
    if do_print:
        if passed:
            print("Passed robustness test")
        else:
            print("Did not pass robustness test")
    return passed
Exemplo n.º 9
0
def are_there_better_motifs(n_modified_motifs, seqs_of_interest, discr_exp_profile, nbins,
                            bestmi, n_bestmotif, lastmyfreq, args, do_print = True):
    """Scan candidate motifs and keep the best one found so far.

    A candidate replaces the current best when all of the following hold:
    its MI exceeds ``bestmi``; its profile sum exceeds
    ``args.min_occurences``; and its frequency is below ``args.maxfreq`` or
    below ``lastmyfreq``. The running best is updated as candidates are
    accepted, so later candidates are compared against the updated values.

    Args:
        n_modified_motifs: iterable of candidate motifs.
        seqs_of_interest: sequences the candidates are matched against; its
            length is the denominator of the frequency.
        discr_exp_profile: second argument of ``MI.mut_info``.
        nbins: forwarded to ``MI.mut_info`` as ``y_bins``.
        bestmi: MI of the current best motif.
        n_bestmotif: current best motif.
        lastmyfreq: frequency of the current best motif.
        args: options object; ``min_occurences`` and ``maxfreq`` are read.
        do_print: when true, print every accepted motif.

    Returns:
        ``(bestmi, lastmyfreq, n_bestmotif)`` after considering all
        candidates.
    """
    for candidate in n_modified_motifs:
        candidate_profile, _elapsed = matchmaker.calculate_profile_one_motif(
            candidate, seqs_of_interest, is_degenerate = True)
        candidate_freq = candidate_profile.values.sum() / float(len(seqs_of_interest))
        candidate_mi = MI.mut_info(candidate_profile.values,
                                   discr_exp_profile,
                                   x_bins=2,
                                   y_bins=nbins)

        # Acceptance criteria, checked in the same order as a chained `and`.
        if not candidate_mi > bestmi:
            continue
        if not candidate_profile.sum() > args.min_occurences:
            continue
        if not (candidate_freq < args.maxfreq or candidate_freq < lastmyfreq):
            continue

        n_bestmotif = structures.copy_n_motif(candidate)
        w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
        bestmi = candidate_mi
        lastmyfreq = candidate_freq
        if do_print:
            print("New motif (MI = %.4f): %s" % (bestmi, w_bestmotif.print_sequence(return_string=True)))

    return bestmi, lastmyfreq, n_bestmotif