Пример #1
0
def cryptic_parameters(id_map, labeled_nodes, related_pairs,
                       recomb_dir=None):
    """
    Fit hurdle gamma parameters to the IBD shared by "cryptic" pairs:
    pairs of labeled nodes that do not appear in related_pairs.

    At most 1000 labeled nodes are used. They are chosen by sorting
    labeled_nodes and shuffling the result with a fixed seed, so the
    same nodes are picked on every call. The global random state is
    saved beforehand and restored afterwards.

    id_map: mapping from node id to a node object with a ``genome``
        attribute.
    labeled_nodes: iterable of sortable node ids.
    related_pairs: iterable of ``(id_a, id_b)`` tuples to exclude.
        Membership is tested on the exact tuple, so the order of the ids
        inside each pair matters.
    recomb_dir: directory passed to centimorgan_data_from_directory.
        Defaults to ``../data/recombination_rates/`` relative to this
        file.

    Returns whatever fit_hurdle_gamma returns for the collected lengths.

    Raises AssertionError if any of the fitted parameters is None.
    """
    if recomb_dir is None:
        recomb_dir = abspath(
            join(dirname(__file__), "../data/recombination_rates/"))
    cm_data = centimorgan_data_from_directory(recomb_dir)
    # NOTE(review): the meaning of the literal 0 and 5 is defined by
    # SharedSegmentDetector, which is not visible here -- confirm.
    ibd_detector = SharedSegmentDetector(0, 5, cm_data)

    # We try to only include the labeled nodes the analyst would have
    # access to. This only works if we use the deterministic random
    # argument when we evaluate the classifier.
    labeled_copy = sorted(labeled_nodes)
    rand_state = getstate()
    try:
        seed(42)
        shuffle(labeled_copy)
    finally:
        # Never leave the caller's random stream reseeded.
        setstate(rand_state)

    labeled_node_pairs = set(combinations(labeled_copy[:1000], 2))
    related_pairs = set(related_pairs)
    cryptic_pairs = {pair for pair in labeled_node_pairs
                     if pair not in related_pairs}

    lengths = [
        ibd_detector.shared_segment_length(id_map[node_a_id].genome,
                                           id_map[node_b_id].genome)
        for node_a_id, node_b_id in cryptic_pairs
    ]
    np_lengths = np.array(lengths, dtype=np.uint64)
    params = fit_hurdle_gamma(np_lengths)
    # Explicit raise instead of ``assert`` so the check survives
    # ``python -O``; the exception type is unchanged.
    if any(x is None for x in params):
        raise AssertionError(
            "fit_hurdle_gamma returned an incomplete fit: {}".format(params))
    return params
def distributions_from_directory(directory, id_mapping):
    """
    Calculate distributions from a directory created by
    calculate_shared_to_directory.

    Every file in directory must be named with the integer id of a
    labeled node. Each line of a file is expected to hold two
    tab-separated fields: an integer unlabeled node id and a shared
    length parseable as a float. Lines that cannot be parsed, or whose
    unlabeled id is not in id_mapping, are skipped with a warning.

    directory: path of the directory to read.
    id_mapping: container of known node ids; only membership is tested.

    Returns a dict mapping ``(unlabeled, labeled)`` to a
    HurdleGammaParams. Pairs for which fit_hurdle_gamma reports a shape
    of None are left out.

    Raises ValueError if a file name in directory is not an integer.
    """
    distributions = dict()
    for labeled_filename in tqdm(listdir(directory)):
        shared_by_unlabeled = defaultdict(list)
        labeled = int(labeled_filename)
        with open(join(directory, labeled_filename), "r") as labeled_file:
            for line in labeled_file:
                # If the program crashed, the output can be left in an
                # inconsistent state. A wrong field count and a
                # non-integer id both raise ValueError and are both
                # treated as a malformed line.
                try:
                    unlabeled_id, shared_str = line.split("\t")
                    unlabeled = int(unlabeled_id)
                except ValueError:
                    warn("Malformed line:\n{}".format(line), stacklevel=0)
                    continue
                if unlabeled not in id_mapping:
                    error_string = "No such unlabeled node with id {}."
                    warn(error_string.format(unlabeled_id), stacklevel=0)
                    continue
                try:
                    shared_float = float(shared_str)
                except ValueError:
                    error_string = "Error formatting value as float: {}."
                    warn(error_string.format(shared_str), stacklevel=0)
                    continue
                shared_by_unlabeled[unlabeled].append(shared_float)
        # One fit per (unlabeled, labeled) pair seen in this file.
        for unlabeled, shared_lengths in shared_by_unlabeled.items():
            shape, scale, zero_prob = fit_hurdle_gamma(
                np.array(shared_lengths, dtype=np.float64))
            if shape is None:
                continue
            shape, scale = adjust_shape_scale(shape, scale)
            params = HurdleGammaParams(shape, scale, zero_prob)
            distributions[unlabeled, labeled] = params
    return distributions
Пример #3
0
# Pick the IBD detector: centimorgan-based when a cM threshold was
# requested, otherwise the single-argument form of the detector.
if args.cm_ibd_threshold > 0:
    # The recombination rates live in data/recombination_rates under the
    # directory one level above this file's directory.
    cur_path = realpath(__file__)
    parent = split(split(cur_path)[0])[0]
    rates_dir = join(parent, "data", "recombination_rates")
    print("Loading recombination data for centimorgan cutoff.", flush=True)
    recomb_data = centimorgan_data_from_directory(rates_dir)
    # NOTE(review): the detector is built with a literal 5 rather than
    # args.cm_ibd_threshold -- confirm this is intended.
    ibd_detector = SharedSegmentDetector(0, 5, recomb_data)
else:
    ibd_detector = SharedSegmentDetector(5000000)

# "Cryptic" pairs are all pairs of labeled nodes for which the classifier
# holds no distribution. The pair set is built once and reused instead of
# generating the combinations a second time.
labeled_nodes = classifier._labeled_nodes
labeled_node_pairs = set(combinations(labeled_nodes, 2))
related_pairs = set(classifier._distributions.keys())
cryptic_pairs = labeled_node_pairs - related_pairs

print("Calculating IBD for pairs.")
lengths = []
id_map = population.id_mapping
for node_a_id, node_b_id in progressbar(cryptic_pairs):
    genome_a = id_map[node_a_id].genome
    genome_b = id_map[node_b_id].genome
    lengths.append(ibd_detector.shared_segment_length(genome_a, genome_b))

np_lengths = np.array(lengths, dtype=np.uint64)
print(fit_hurdle_gamma(np_lengths))