예제 #1
0
def get_mutation_fixation_trajectories(population):
    """Compute mutation-frequency trajectories for one population.

    For every annotated mutation, masks unreliable timepoints, estimates
    allele frequencies, and interpolates them onto the shared timepoint grid.

    Returns
    -------
    (times, Ms, fixed_Ms)
        times    : shared timepoint grid,
        Ms       : sum of interpolated mutation frequencies at each timepoint,
        fixed_Ms : number of fixed mutations present at each timepoint.
    """
    mutations, depth_tuple = parse_annotated_timecourse(population)
    population_avg_depth_times, population_avg_depths, clone_avg_depth_times, clone_avg_depths = depth_tuple
    state_times, state_trajectories = parse_well_mixed_state_timecourse(population)

    # `times` is field 12 of the 18-field mutation record (see the unpacking
    # below). The old 15-field format kept it at index 9 -- which is now
    # cutoff_idx -- so the previous `mutations[0][9]` read the wrong field.
    times = mutations[0][12]
    Ms = numpy.zeros_like(times) * 1.0        # float accumulator, same shape as times
    fixed_Ms = numpy.zeros_like(times) * 1.0  # float accumulator, same shape as times

    for mutation_idx in range(0, len(mutations)):

        location, gene_name, allele, var_type, codon, position_in_codon, AAs_count,  test_statistic, pvalue, cutoff_idx, depth_fold_change, depth_change_pvalue, times, alts, depths, clone_times, clone_alts, clone_depths = mutations[mutation_idx]

        state_Ls = state_trajectories[mutation_idx]

        # Drop timepoints with unreliable depth/coverage for this mutation.
        good_idxs, filtered_alts, filtered_depths = timecourse_utils.mask_timepoints(times, alts, depths, var_type, cutoff_idx, depth_fold_change, depth_change_pvalue)

        freqs = timecourse_utils.estimate_frequencies(filtered_alts, filtered_depths)

        masked_times = times[good_idxs]
        masked_freqs = freqs[good_idxs]
        masked_state_Ls = state_Ls[good_idxs]

        t0, tf, transit_time = timecourse_utils.calculate_appearance_fixation_time_from_hmm(masked_times, masked_freqs, masked_state_Ls)
        # Skip mutations for which the HMM could not call appearance/fixation
        # times (use `is None`, not `== None` chained comparison).
        if t0 is None and tf is None and transit_time is None:
            continue

        interpolating_function = timecourse_utils.create_interpolation_function(masked_times, masked_freqs, tmax=100000)

        fs = interpolating_function(times)
        fs[fs < 0] = 0  # clip interpolation undershoot below zero

        # Record
        Ms += fs
        if masked_state_Ls[-1] in well_mixed_fixed_states:
            # Count this mutation as fixed from its fixation time onward.
            fixed_Ms += (times >= tf)

    return times, Ms, fixed_Ms
예제 #2
0
    # NOTE(review): fragment of a larger function -- its `def` header, the
    # preceding parse of `mutations`, and the code after the truncated `if`
    # at the end are not visible here.
    state_times, state_trajectories = parse_file.parse_well_mixed_state_timecourse(population)

    # Shared timepoint grid; presumably index 10 is `times` in this record
    # format -- TODO confirm (the 15-field unpack below puts times at 9).
    times = mutations[0][10]
    Ms = numpy.zeros_like(times)*1.0        # float accumulator, same shape as times
    fixed_Ms = numpy.zeros_like(times)*1.0  # float accumulator, same shape as times

    transit_times[population] = []

    # NOTE(review): xrange is Python 2 only.
    for mutation_idx in xrange(0,len(mutations)):

        location, gene_name, allele, var_type, test_statistic, pvalue, cutoff_idx, depth_fold_change, depth_change_pvalue, times, alts, depths, clone_times, clone_alts, clone_depths = mutations[mutation_idx]

        Ls = haplotype_trajectories[mutation_idx]
        state_Ls = state_trajectories[mutation_idx]

        # Drop timepoints with unreliable depth/coverage for this mutation.
        good_idxs, filtered_alts, filtered_depths = timecourse_utils.mask_timepoints(times, alts, depths, var_type, cutoff_idx, depth_fold_change, depth_change_pvalue)

        freqs = timecourse_utils.estimate_frequencies(filtered_alts, filtered_depths)

        masked_times = times[good_idxs]
        masked_freqs = freqs[good_idxs]
        masked_state_Ls = state_Ls[good_idxs]

        t0,tf,transit_time = timecourse_utils.calculate_appearance_fixation_time_from_hmm(masked_times, masked_freqs, masked_state_Ls)
        # NOTE(review): transit_time may be None (no fixation called) and is
        # appended without a check, unlike sibling code -- confirm intended.
        transit_times[population].append(transit_time)


        interpolating_function = timecourse_utils.create_interpolation_function(masked_times, masked_freqs, tmax=100000)

        fs = interpolating_function(times)
        fs[fs<0]=0  # clip interpolation undershoot below zero
        # if pvalue is lower than threshold and not a weird insertion gene
        passed_str = 'FAIL'

        # Candidate passes only if: significant pvalue, not a repeat region,
        # not one of the two known ancestral mutations, and initial frequency
        # below 0.2 (the `depths[0] == 0` term guards against divide-by-zero).
        if (pvalue <= threshold_pvalue) and gene_name != 'repeat' and (
                location != parse_file.ancestral_araA_mutation_location
        ) and (location != parse_file.ancestral_recD_mutation_location) and (
                alts[0] * 1.0 / (depths[0] + (depths[0] == 0)) < 0.2):
            # would otherwise pass
            # check if clone fail

            # determine whether clone data suggests that the mutation
            # is a duplication. do not pass these

            # first estimate frequencies at good timepoints
            # (re-masks with deletion-specific parameters, overwriting the
            # variables computed above)
            good_idxs, filtered_alts, filtered_depths = timecourse_utils.mask_timepoints(
                times, alts, depths, var_type, deletion_idx, fold_reduction,
                deletion_pvalue)
            freqs = timecourse_utils.estimate_frequencies(
                filtered_alts, filtered_depths)
            masked_times = times[good_idxs]
            masked_freqs = freqs[good_idxs]

            # Depth relative to the population average at the good timepoints.
            masked_depth_ratios = depths[good_idxs] / pop_avg_depths[good_idxs]

            interpolation_function = timecourse_utils.create_interpolation_function(
                masked_times, masked_freqs, tmax=100000)

            masked_clone_times, masked_clone_freqs = timecourse_utils.estimate_clone_frequencies(
                clone_times, clone_alts, clone_depths)

            # NOTE(review): snippet truncated -- the body of this `if` is missing.
            if len(masked_clone_times) == 0:
예제 #4
0
    # NOTE(review): fragment of a larger function -- the `def` header and the
    # definitions of `mutations`, `min_coverage`, `population` are not
    # visible here.
    good_mutations = []

    sys.stderr.write("Processing %s\n" % population)

    mutation_data, depth_tuple = parse_file.parse_annotated_timecourse(
        population)

    # Shared timepoint grid; presumably index 10 is `times` in this record
    # format -- TODO confirm (the 15-field unpack below puts times at 9).
    times = mutation_data[0][10]

    # NOTE(review): xrange is Python 2 only.
    for mutation_idx in xrange(0, len(mutation_data)):

        location, gene_name, allele, var_type, test_statistic, pvalue, cutoff_idx, depth_fold_change, depth_change_pvalue, times, alts, depths, clone_times, clone_alts, clone_depths = mutation_data[
            mutation_idx]

        # Drop timepoints with unreliable depth/coverage for this mutation.
        good_idxs, masked_alts, masked_depths = timecourse_utils.mask_timepoints(
            times, alts, depths, var_type, cutoff_idx, depth_fold_change,
            depth_change_pvalue, min_coverage)

        # Maximum observed allele frequency; `(masked_depths < 1)` guards
        # against divide-by-zero at masked (zero-depth) timepoints.
        # NOTE(review): max_freq is unused in this visible fragment.
        max_freq = (masked_alts * 1.0 / (masked_depths +
                                         (masked_depths < 1))).max()
        num_good_timepoints = (masked_depths > 0.5).sum()

        # Keep trajectories with enough usable timepoints; zero-fill the rest
        # so `mutations` stays index-aligned with `mutation_data`.
        if num_good_timepoints > 5:
            mutations.append((masked_alts * 1.0, masked_depths * 1.0))
            # SNPs only (no structural variants / indels) with nonzero final depth.
            if var_type != 'sv' and var_type != 'indel' and masked_depths[
                    -1] > 1:
                #if masked_depths[-1]>1:
                good_mutations.append((masked_alts * 1.0, masked_depths * 1.0))
        else:
            mutations.append(
                (numpy.zeros_like(times) * 1.0, numpy.zeros_like(times)))
def _polymorphic_delta_trajectory(mutation, state_Ls, times):
    """Extract per-step frequency changes of one mutation while polymorphic.

    Masks unreliable timepoints, estimates allele frequencies, restricts the
    trajectory to the span between the first and last timepoint whose HMM
    state is 3 (polymorphic), and differences consecutive frequencies.

    Parameters
    ----------
    mutation : tuple
        One 18-field record from parse_file.parse_annotated_timecourse
        (var_type at index 3; cutoff_idx / depth_fold_change /
        depth_change_pvalue at 9-11; times / alts / depths at 12-14).
    state_Ls : array
        HMM state sequence for this mutation, aligned with `times`.
    times : array
        Shared timepoint grid (mutations[0][12]).

    Returns
    -------
    (delta_times, delta_freqs, freqs) or None
        delta_freqs[k] is the frequency change whose step starts at
        delta_times[k]; freqs is the full frequency estimate over all
        timepoints. None when the polymorphic stretch has fewer than
        `min_trajectory_length` points.
    """
    var_type = mutation[3]
    cutoff_idx, depth_fold_change, depth_change_pvalue = mutation[9], mutation[10], mutation[11]
    mut_times, alts, depths = mutation[12], mutation[13], mutation[14]

    # Drop timepoints with unreliable depth/coverage.
    good_idx, filtered_alts, filtered_depths = timecourse_utils.mask_timepoints(
        mut_times, alts, depths, var_type, cutoff_idx, depth_fold_change,
        depth_change_pvalue)
    freqs = timecourse_utils.estimate_frequencies(filtered_alts,
                                                  filtered_depths)

    masked_times = times[good_idx]
    masked_freqs = freqs[good_idx]
    masked_state_Ls = state_Ls[good_idx]

    # Indices where the HMM labels the mutation polymorphic (state 3).
    P_idx = np.where(masked_state_Ls == 3)[0]
    if len(P_idx) < min_trajectory_length:
        return None
    first_P, last_P = P_idx[0], P_idx[-1]

    freqs_P = masked_freqs[first_P:last_P + 1]
    times_P = masked_times[first_P:last_P + 1]

    # delta_f = f_{t+1} - f_t, keyed by the left endpoint of each step.
    delta_freqs = freqs_P[1:] - freqs_P[:-1]
    delta_times = times_P[:-1]

    return delta_times, delta_freqs, freqs


def run_analyses():
    """Correlate frequency changes between co-occurring polymorphic mutations.

    For every treatment/taxon/replicate population, pools:
      * abs_delta_f / ratio_f: per-step |change| and ratio of consecutive
        nonzero frequencies over qualifying mutations;
      * r2: squared Pearson correlation of the frequency-change series of
        every mutation pair sharing >= 3 polymorphic step timepoints.

    Results are stored per treatment/taxon in a nested dict and pickled to
    data/mutation_dynamics.pickle.
    """
    r2s_obs_dict = {}
    for treatment in ['0', '1', '2']:
        r2s_obs_dict[treatment] = {}
        for taxon in taxa:
            r2s_all = []
            ratio_f_all = []
            abs_delta_f_all = []
            for replicate in replicates:

                population = treatment + taxon + replicate
                sys.stderr.write("Processing %s...\n" % population)

                mutations, depth_tuple = parse_file.parse_annotated_timecourse(
                    population)
                population_avg_depth_times, population_avg_depths, clone_avg_depth_times, clone_avg_depths = depth_tuple
                state_times, state_trajectories = parse_file.parse_well_mixed_state_timecourse(
                    population)

                # Shared timepoint grid: field 12 of the 18-field record.
                times = mutations[0][12]

                # Precompute each mutation's polymorphic delta trajectory once;
                # the original recomputed the j-side inside the pair loop for
                # every (i, j). Safe because the timecourse_utils calls have no
                # side effects visible here (they were re-run freely before).
                trajectories = [
                    _polymorphic_delta_trajectory(mutations[k],
                                                  state_trajectories[k], times)
                    for k in range(len(mutations))
                ]

                for i in range(len(mutations)):
                    if trajectories[i] is None:
                        continue
                    delta_times_i, delta_freqs_i, freqs_i = trajectories[i]

                    # Pool per-step statistics over consecutive nonzero
                    # frequencies of the full trajectory.
                    for f_next, f_prev in zip(freqs_i[1:], freqs_i[:-1]):
                        if (f_next == 0) or (f_prev == 0):
                            continue
                        abs_delta_f_all.append(np.absolute(f_next - f_prev))
                        ratio_f_all.append(f_next / f_prev)

                    for j in range(i + 1, len(mutations)):
                        if trajectories[j] is None:
                            continue
                        delta_times_j, delta_freqs_j, freqs_j = trajectories[j]

                        # Step timepoints where both trajectories have a delta.
                        intersect_times = np.intersect1d(delta_times_i,
                                                         delta_times_j)
                        if len(intersect_times) < 3:
                            continue

                        idx_i = [
                            np.where(delta_times_i == t)[0][0]
                            for t in intersect_times
                        ]
                        idx_j = [
                            np.where(delta_times_j == t)[0][0]
                            for t in intersect_times
                        ]
                        intersect_delta_i = delta_freqs_i[idx_i]
                        intersect_delta_j = delta_freqs_j[idx_j]

                        if len(intersect_delta_i) != len(intersect_delta_j):
                            # Bug fix: the original printed the j-length twice,
                            # hiding the actual mismatch.
                            print(len(intersect_delta_i),
                                  len(intersect_delta_j))

                        r2 = stats.pearsonr(intersect_delta_i,
                                            intersect_delta_j)[0] ** 2
                        r2s_all.append(r2)

            r2s_obs_dict[treatment][taxon] = {
                'r2': np.asarray(r2s_all),
                'ratio_f': np.asarray(ratio_f_all),
                'abs_delta_f': np.asarray(abs_delta_f_all),
            }

    with open(pt.get_path() + '/data/mutation_dynamics.pickle',
              'wb') as handle:
        pickle.dump(r2s_obs_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)