예제 #1
0
def centimorgans_to_probabilities(recomb_rate_info, test_functionality):
    """

    Purpose
    -------
    Turn a cumulative genetic map (in cM) into per-interval breakpoint sampling probabilities.
    For an interval of length d cM the unnormalized weight is (1 - exp(-d / 50)) / 2, which is
    Haldane's map function with the distance expressed in centiMorgans. The weights are then
    divided by their total, giving the probability that a recombination event lies in each
    interval given that exactly one event is placed on the chromosome.

    Parameters
    ----------
    recomb_rate_info: Output from the "reduce_recomb_rate_info" function. Only its "Map(cM)"
                      column (the cumulative map value at each interval boundary) is read.
    test_functionality: if equal to "test_units", the result is additionally handed to
                        "unit_tester" together with the name of a reference output file.

    Returns
    -------
    a numpy array of probabilities that sum to 1, with one entry per pair of consecutive rows
    in "recomb_rate_info". The ith probability is the probability of drawing the ith
    recombination interval to use as the location of a breakpoint.

    """

    cumulative_map = recomb_rate_info["Map(cM)"].to_numpy()

    # Interval lengths in cM are the gaps between consecutive cumulative map values.
    interval_lengths_cM = np.diff(cumulative_map)

    # Per-interval recombination fraction, then normalize so the fractions sum to 1.
    recombination_fractions = (1 - np.exp(-interval_lengths_cM / 50)) / 2
    interval_probabilities = recombination_fractions / np.sum(
        recombination_fractions)

    if test_functionality == "test_units":
        unit_tester(
            interval_probabilities,
            "correct_centimorgans_to_probabilities_output.txt",
            None,
        )

    return interval_probabilities
예제 #2
0
def choice_with_periodic_replacement(length, width, probabilities,
                                     test_functionality):
    """

    Purpose
    -------
    Draw, for each of N individuals, B distinct recombination interval indices from the same
    probability vector. Within one individual the draws are made without replacement; every
    individual starts again from the full probability vector. Equivalent procedure:
        1) Sample B indices from a probability vector without replacement.
        2) Replace the sampled indices.
        3) Repeat Steps 1 and 2 N times.

    Parameters
    ----------
    length: The (int) number of individuals to be simulated (N)
    width: the (int) number of breakpoints per chromosome (B)
    probabilities: A numpy array, where the ith element is the probability of sampling the ith
                   recombination interval from "reduced_rcmb_rate_info" to contain a breakpoint
    test_functionality: if equal to "test_units", the result is additionally handed to
                        "unit_tester" together with the name of a reference output file.

    Returns
    -------
    an NxB numpy array containing N sets of B recombination interval indices. The search is
    performed on a cumulative distribution that has a leading 0 prepended, so every raw search
    result is one higher than the interval index the rest of the code expects; 1 is therefore
    subtracted from all of them before returning.

    """

    draws = np.zeros((width, length), dtype=int)

    # Every individual gets a private, mutable copy of the cumulative distribution (one row
    # each) because the mass of an interval is removed from that row once it has been drawn.
    cumulative_rows = np.repeat(
        [np.cumsum(np.append([0], probabilities))], length, axis=0)

    for draw_index in range(width):
        # The last column is each individual's remaining total mass; scaling the uniform
        # sample by it keeps the draw inside the not-yet-removed part of the distribution.
        scaled_uniforms = cumulative_rows[:, -1] * np.random.rand(length)
        more_draws_follow = draw_index < (width - 1)

        for person, (row, uniform) in enumerate(
                zip(cumulative_rows, scaled_uniforms)):
            position = np.searchsorted(row, uniform)
            draws[draw_index, person] = position
            if more_draws_follow:
                # Remove the drawn interval's mass from this individual's distribution so
                # that it cannot be drawn again ("row" is a view into "cumulative_rows").
                row[position:] -= row[position] - row[position - 1]

    interval_indices = draws.T - 1

    if test_functionality == "test_units":
        unit_tester(interval_indices,
                    "correct_choice_with_periodic_replacement_output.txt",
                    None)

    return interval_indices
예제 #3
0
def reduce_recomb_rate_info(rcmb_rate_info, bim_SNP_positions,
                            test_functionality):
    """

    Purpose
    -------
    Drop rows of rcmb_rate_info so that every remaining genomic interval contains at least one SNP.

    Parameters
    ----------
    rcmb_rate_info: a pandas dataframe with two columns. "Position(bp)" is the genomic position of the ith interval's left
                    boundary. The ith row is both the left boundary of the ith interval and the right boundary of the (i-1)th
                    interval. "Map(cM)" is the cumulative recombination rate in centiMorgans, which is 0 in the first row.

    bim_SNP_positions: the genomic positions of every SNP directly from the input bim file. A copy is passed on, so this
                       argument is not modified here.

    test_functionality: forwarded to "SNP_positions_to_rcmb_intervals"; if equal to "test_units", the result of this
                        function is additionally handed to "unit_tester" together with the name of a reference output file.

    Returns
    -------
    reduced_rcmb_rate_info: "rcmb_rate_info" with rows removed to ensure that at least one
                            SNP position from "bim_SNP_positions" resides in every interval.

    """

    boundary_positions = rcmb_rate_info["Position(bp)"].to_numpy()

    # INDEXING NOTE: The map below is used as returned (nothing is subtracted from it), so each SNP is assigned the index
    #                of the closest boundary to its RIGHT. Keeping exactly those rows keeps the right edge of every
    #                occupied interval. The one row still missing is the boundary to the LEFT of the first SNP, which
    #                is the row just before the first kept row; it is switched on separately below.

    right_boundary_per_SNP = SNP_positions_to_rcmb_intervals(
        boundary_positions,
        COPY(bim_SNP_positions),
        test_functionality,
        context=1)

    row_numbers = np.arange(len(boundary_positions))
    keep_row = np.isin(row_numbers, right_boundary_per_SNP)

    first_kept_row = np.where(keep_row)[0].min()
    keep_row[first_kept_row - 1] = True

    reduced_rcmb_rate_info = rcmb_rate_info[keep_row]

    # NOTE: the reference file named below appears to have been produced by writing this dataframe with
    #       to_csv(sep="\t", header=True, index=False) - see the statement that used to be commented out here.
    if test_functionality == "test_units":
        unit_tester(reduced_rcmb_rate_info,
                    "correct_reduce_recomb_rate_info_output.txt", 0)

    return reduced_rcmb_rate_info
예제 #4
0
def SNP_positions_to_rcmb_intervals(rcmb_interval_boundaries,
                                    bim_SNP_positions, test_functionality,
                                    context):
    """

    Purpose
    -------
    To produce an array with indices that correspond to "bim_SNP_positions" indices
    and values that correspond to "rcmb_interval_boundaries" indices. This is an intermediate
    step in converting sampled breakpoint intervals to SNPs from the input dataset.

    For every SNP the value is the index of the first boundary whose genomic position is
    greater than or equal to the SNP's position (i.e. the closest boundary to its RIGHT).
    SNPs at or beyond the first/last boundary are first moved one base pair inside the
    range covered by the boundaries, and a warning is printed when that happens.

    Parameters
    ----------
    rcmb_interval_boundaries: The "Position(bp)" column from "rcmb_rate_info". The ith element is the genomic position of
                              both the left boundary of the ith interval and the right boundary of the (i-1)th interval.
                              Must be sorted in ascending order, because the lookup is a binary search.

    bim_SNP_positions: the genomic positions of every SNP directly from the input bim file.
                       The argument is NOT modified; clamping is done on an internal copy.
                       The positions do not have to be sorted.

    test_functionality: if equal to "test_units", the result is additionally handed to
                        "unit_tester" together with the name of a reference output file.

    context: value appended (via str) to the reference file name used in "test_units" mode, so that
             different call sites can be checked against different reference files.

    Returns
    -------
    SNP_rcmb_interval_indices: a numpy int64 array with indices that correspond to "bim_SNP_positions"
    indices and values that correspond to "rcmb_interval_boundaries" indices.

    """

    boundaries = np.asarray(rcmb_interval_boundaries)
    # Work on a private copy so that clamping never leaks into the caller's array.
    positions = np.array(bim_SNP_positions)

    first_boundary = np.min(boundaries)
    last_boundary = np.max(boundaries)

    # Both masks are computed before anything is clamped, exactly as the counts in the
    # warning below expect.
    too_large_indices = positions >= last_boundary
    too_small_indices = positions <= first_boundary
    positions[too_large_indices] = last_boundary - 1
    positions[too_small_indices] = first_boundary + 1

    num_outer_SNPs = np.sum(too_large_indices) + np.sum(too_small_indices)
    if num_outer_SNPs > 0:
        print(
            "\nCAUTION: " + str(num_outer_SNPs) +
            " SNP(s) in the bim file are outside of the positional range considered by the recombination rate info file.\n"
        )
        print(
            "This is unlikely to be an issue if the number of such SNPs is small. Otherwise, remove those SNPs from the input plink files\n"
        )
        print(
            "bim file SNPs earlier than the first recombination interval are assumed to be inside of the first recombination interval.\n"
        )
        print(
            "bim file SNPs farther than the last recombination interval are assumed to be inside of the last recombination interval.\n"
        )

    # side="left" returns the first index i with boundaries[i] >= position. For sorted SNP
    # positions this is what a forward scan over the boundaries finds, but it is vectorized
    # and stays correct when the SNP positions are not sorted.
    SNP_rcmb_interval_indices = np.searchsorted(
        boundaries, positions, side="left").astype(np.int64)

    if test_functionality == "test_units":
        unit_tester(
            SNP_rcmb_interval_indices,
            "correct_SNP_positions_to_rcmb_intervals_output" + str(context) +
            ".txt",
            None,
        )

    return SNP_rcmb_interval_indices
예제 #5
0
def write_bed_file(
    simulated_individuals,
    bim_SNP_names,
    output_name,
    bim_SNP_complete_pos,
    bim_SNP_nucleotides,
    population_ID,
    test_functionality,
):
    """

    Purpose
    -------
    Write the simulated genotypes to disk through "to_bed", together with the per-sample and per-SNP
    properties needed for a (bed, bim, fam) fileset. Phenotypes are not simulated here: every sample
    gets the placeholder phenotype -9.

    Parameters
    ----------
    simulated_individuals: the genotype array handed to "to_bed". Its length (first axis) is used as the number
                           of simulated samples. Presumably an NxS array of N individuals by S SNPs - confirm
                           against the caller.
    bim_SNP_names: a list of SNP's rsIDs from the input bed file. Written as the "sid" property.
    output_name: name of the output bed file, which annotates the chromosome that it belongs to. In "test_units"
                 mode its last four characters are replaced by ".bim" / ".fam" to locate the companion files.
    bim_SNP_complete_pos: Sx3 numpy array. Columns comprise the first, third, and fourth columns from the input bim file
                          and are written as "chromosome", "cm_position" and "bp_position" respectively.
    bim_SNP_nucleotides: Sx2 numpy array. Columns comprise the fifth and sixth columns from the input bim file (i.e. major and minor alleles).
                         CAUTION: minor alleles with frequencies near 50% may become the major allele after the simulation
                         because the simulated allele frequency always deviates from the real allele frequency by a small amount.
                         This makes plink flip the sign of r values for simulated SNP pairs relative to real SNP pairs
                         if plink's --keep-allele-order flag is not used when computing the r values with plink.
    population_ID: A string that is joined with "_" and each sample's 1-based row number to form both the family ID
                   and the individual ID of the output fam file.
                   If no input argument is selected, then it includes the population ID from the 1000 genomes input plink fileset. If
                   the input plink files are custom, then it includes an empty string as the population_ID.
    test_functionality: if equal to "test_units", the files that were just written are read back and handed to
                        "unit_tester" together with the names of reference output files.

    Returns
    -------
    It returns nothing. It only writes the simulated data into plink files.

    """

    num_individuals = len(simulated_individuals)

    # Sample IDs are "<population_ID>_1", "<population_ID>_2", ... and double as family IDs.
    sample_IDs = np.array(
        [population_ID + "_" + str(row) for row in range(1, num_individuals + 1)])

    SNP_position_columns = bim_SNP_complete_pos.T
    allele_columns = bim_SNP_nucleotides.T

    metadata = dict(
        fid=sample_IDs,
        iid=sample_IDs,
        sex=np.array([2] * len(sample_IDs)),
        pheno=np.array([-9] * len(sample_IDs)),
        chromosome=SNP_position_columns[0],
        sid=bim_SNP_names,
        cm_position=SNP_position_columns[1],
        bp_position=SNP_position_columns[2],
        allele_1=allele_columns[0],
        allele_2=allele_columns[1],
    )

    to_bed(output_name,
           simulated_individuals,
           properties=metadata,
           count_A1=True)

    if test_functionality != "test_units":
        return

    # Read all three written files back and compare each one against its reference file.
    written_bed = open_bed(output_name, count_A1=True,
                           num_threads=1).read(dtype="int8")

    file_prefix = output_name[:-4]
    written_bim = pd.read_csv(file_prefix + ".bim",
                              delimiter="\t",
                              header=None,
                              dtype=str)
    written_bim = written_bim.to_numpy().astype("str")
    written_fam = pd.read_csv(file_prefix + ".fam",
                              delimiter=" ",
                              header=None,
                              dtype=str)
    written_fam = written_fam.to_numpy().astype("str")

    unit_tester(written_bed, "correct_write_bed_file_output.bed", None)
    unit_tester(written_bim, "correct_write_bed_file_output.bim", None)
    unit_tester(written_fam, "correct_write_bed_file_output.fam", None)
예제 #6
0
def get_samples_fast(
    simulation_sample_size,
    bed_col_bounds,
    plink_file_name_prefix,
    num_breakpoints,
    test_functionality,
):
    """

    Purpose
    -------
    Reads real genomes from the input bed file to serve as donors for the simulated individuals. Every input
    genome is listed the same number of times (just enough copies to cover all N * (B+1) segments), the list is
    shuffled and truncated to N * (B+1) entries, and the listed rows are read and grouped into sets of (B+1).
    This keeps the donors in proportions as close as possible to that of the bed file's population.

    Parameters
    ----------
    simulation_sample_size: the (int) number of samples to be simulated (N)
    bed_col_bounds: first and last column indices of all SNP indices that comprise the chromosome being simulated.
                    They are used as the start and stop of a slice over the bed file's columns.
    plink_file_name_prefix: plink prefix of the (bed, bim, fam) fileset that contains real whole genomes.
    num_breakpoints: user-specified (int) number of breakpoints per chromosome (B).
    test_functionality: if equal to "test_units", three summaries of the result are additionally handed to
                        "unit_tester" together with the names of reference output files.

    Returns
    -------
    an Nx(B+1)xS numpy array (read as int8) containing N sets of (B+1) whole chromosomes, each of which have S snps.
    Each row in the ith (B+1)xS subset will contribute one genomic segment. Those (B+1) genomic
    segments will be concatenated to comprise the ith simulated individual.

    """

    reader = open_bed(plink_file_name_prefix + ".bed",
                      count_A1=True,
                      num_threads=1)

    segments_per_individual = num_breakpoints + 1
    total_segments = simulation_sample_size * segments_per_individual

    # Number of copies of each input genome needed so that the pool holds at least
    # "total_segments" rows before it is shuffled and truncated.
    copies_per_genome = int(total_segments / int(reader.iid_count) + 1)
    donor_rows = np.repeat(range(reader.iid_count), copies_per_genome)
    np.random.shuffle(donor_rows)
    donor_rows = donor_rows[:total_segments]

    column_slice = slice(bed_col_bounds[0], bed_col_bounds[1])
    donor_genotypes = reader.read((donor_rows, column_slice), dtype="int8")

    # Group consecutive donor rows into sets of (B+1): one set per simulated individual.
    output_shape = (
        int(len(donor_rows) / segments_per_individual),
        segments_per_individual,
        len(donor_genotypes[0]),
    )
    reshaped_output = donor_genotypes.reshape(output_shape)

    if test_functionality == "test_units":
        # Three different marginal sums of the drawn allele counts (per SNP, per simulated individual,
        # per segment slot). They're small and vanishingly unlikely to all be made correctly by chance.
        per_SNP_counts = reshaped_output.sum(axis=(0, 1))
        per_individual_counts = reshaped_output.sum(axis=(1, 2))
        per_segment_slot_counts = reshaped_output.sum(axis=(0, 2))
        unit_tester(
            per_SNP_counts,
            "correct_get_samples_fast_SNP_minor_allele_counts_output.txt",
            None,
        )
        unit_tester(
            per_individual_counts,
            "correct_get_samples_fast_simulated_individual_minor_allele_counts_output.txt",
            None,
        )
        unit_tester(
            per_segment_slot_counts,
            "correct_get_samples_fast_breakpoint_interval_minor_allele_counts_output.txt",
            None,
        )

    return reshaped_output
예제 #7
0
def draw_breakpoints(
    rcmb_rate_info,
    bim_SNP_positions,
    num_breakpoints,
    simulation_sample_size,
    test_functionality,
    chromosome_number,
    output_plink_filename_prefix,
):
    """

    Purpose
    -------
    Computes breakpoint sampling probabilities with "centimorgans_to_probabilities", draws breakpoints
    with "choice_with_periodic_replacement", and converts the breakpoints' corresponding recombination
    interval indices into the indices of input SNPs that reside inside of the recombination interval.
    When an interval contains several SNPs, one of them is chosen uniformly at random.

    Parameters
    ----------
    rcmb_rate_info:  Output from the "reduce_recomb_rate_info" function.
    bim_SNP_positions: the genomic positions of every SNP directly from the input bim file. A copy is passed
                       on to "SNP_positions_to_rcmb_intervals", so this argument is not modified here.
    num_breakpoints: user-specified (int) number of breakpoints per chromosome.
    simulation_sample_size: user-specified number of samples to be simulated.
    test_functionality: selects optional self-checks. "test_units" compares the result (and the results of the
                        helper functions it is forwarded to) against reference output files via "unit_tester".
                        "test_correctness" imports and runs the checks from "regens_testers". Any other value
                        disables both.
    chromosome_number: the chromosome that is currently being simulated. Only used by the "test_correctness" checks.
    output_plink_filename_prefix: plink prefix of the (bed, bim, fam) fileset that will contain simulated individuals.
                                  Only used by the "test_correctness" checks.

    Returns
    -------
    an NxB numpy array containing N sets of B breakpoints.
    Each entry is an input SNP's bim row index (also its bed column index).

    """

    if test_functionality == "test_correctness":
        from regens_testers import test_drawn_breakpoints
        from regens_testers import test_breakpoint_SNP_mapping

    probabilities = centimorgans_to_probabilities(rcmb_rate_info,
                                                  test_functionality)
    rcmb_rate_intervals = rcmb_rate_info["Position(bp)"].to_numpy()
    breakpoints = choice_with_periodic_replacement(simulation_sample_size,
                                                   num_breakpoints,
                                                   probabilities,
                                                   test_functionality)

    if test_functionality == "test_correctness":
        test_drawn_breakpoints(breakpoints, probabilities, chromosome_number,
                               output_plink_filename_prefix)
        old_breakpoints = COPY(breakpoints)

    # INDEXING NOTE: Subtracting one from "SNP_pos_rcmb_interval_map" returns the closest indices of "rcmb_rate_intervals"
    #                boundaries at genomic positions to the LEFT of each SNP's genomic position in bim_SNP_positions.
    #                This is because all SNPs up to the SNP immediately to the left of the ith breakpoint comprise the ith
    #                segment, noting that the (B+1)th includes all SNPs after the Bth breakpoint (there are B breakpoints).

    SNP_pos_rcmb_interval_map = (
        SNP_positions_to_rcmb_intervals(rcmb_rate_intervals,
                                        COPY(bim_SNP_positions),
                                        test_functionality,
                                        context=2) - 1)

    # For every occupied interval, the bim row indices of the SNPs that lie inside it.
    rcmb_interval_SNP_pos_map = {
        rcmb_interval: np.where(SNP_pos_rcmb_interval_map == rcmb_interval)[0]
        for rcmb_interval in np.unique(SNP_pos_rcmb_interval_map)
    }

    # Replace every sampled interval index, in place, by the index of one SNP inside that interval.
    for individual_breakpoints in breakpoints:
        for k in range(num_breakpoints):
            SNP_indices = rcmb_interval_SNP_pos_map[individual_breakpoints[k]]
            num_candidates = len(SNP_indices)
            if num_candidates == 1:
                # No random number is consumed when there is nothing to choose from.
                individual_breakpoints[k] = SNP_indices[0]
            else:
                # np.random.rand() lies in [0, 1), so int(n * u) is uniform on 0..n-1.
                # The previous form, int(n * u - 0.5), truncated toward zero and therefore
                # picked the first SNP with probability 1.5/n and the last with 0.5/n.
                # NOTE(review): a stored reference output for the "test_units" check of this
                # function was presumably generated with the old formula and may need to be
                # regenerated - confirm.
                individual_breakpoints[k] = SNP_indices[int(
                    num_candidates * np.random.rand())]

    if test_functionality == "test_correctness":
        test_breakpoint_SNP_mapping(old_breakpoints, rcmb_rate_intervals,
                                    breakpoints, bim_SNP_positions)
    if test_functionality == "test_units":
        unit_tester(breakpoints, "correct_draw_breakpoints_output.txt", None)

    return breakpoints