def _compare_genome_node(self, node, genome, cache):
    probabilities = []
    length_classifier = self._length_classifier
    for labeled_node in length_classifier._labeled_nodes:
        # Shared-segment lengths are cached so repeated calls for the
        # same genome do not recompute them.
        if labeled_node in cache:
            shared = cache[labeled_node]
        else:
            shared = shared_segment_length_genomes(genome,
                                                   labeled_node.genome,
                                                   0)
            cache[labeled_node] = shared
        if (node, labeled_node) not in length_classifier:
            # No fitted distribution for this pair: fall back to
            # stand-in probabilities based on whether sharing was seen.
            if shared == 0:
                prob = INF_REPLACE
            else:
                prob = ZERO_REPLACE
        else:
            prob = length_classifier.get_probability(shared, node,
                                                     labeled_node)
            # Clamp degenerate classifier output to the stand-ins.
            if prob > 1 or isnan(prob):
                prob = INF_REPLACE
            if prob == 0:
                prob = ZERO_REPLACE

        probabilities.append(prob)

    return np.array(probabilities)
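The method returns one probability per labeled node, so a caller can rank candidate nodes by the sum of log-probabilities, which is what the `identify` method further down does in batched form. A minimal sketch of that use (hypothetical names: `identifier` is the object holding the classifier, `candidates` the nodes to score):

# Hypothetical usage: score candidate nodes by log-likelihood.
# `identifier`, `candidates`, and `genome` are assumed to come from
# the surrounding project.
import numpy as np

cache = {}
scores = {node: np.sum(np.log(identifier._compare_genome_node(node, genome,
                                                              cache)))
          for node in candidates}
best_node = max(scores, key=scores.get)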
Example #3
def simulate_sharing(founders,
                     pair,
                     genome_generator,
                     recombinators,
                     iterations=10000):
    """Sample the IBD sharing between a pair of nodes by repeatedly
    regenerating genomes down from the founders."""
    sharing = []
    for _ in range(iterations):
        generate_genomes_ancestors(founders, genome_generator, recombinators)
        shared = shared_segment_length_genomes(pair[0].genome, pair[1].genome,
                                               0)
        sharing.append(shared)
    return sharing
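A hypothetical call site, assuming `founders`, `pair`, `genome_generator`, and `recombinators` are built by the project's population setup:

# Hypothetical usage: summarize the sampled sharing distribution.
import numpy as np

sharing = simulate_sharing(founders, pair, genome_generator, recombinators,
                           iterations=1000)
print("mean shared length:", np.mean(sharing))
print("std dev:", np.std(sharing))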
def calculate_to_list(pairs, lengths):
    # Worker for the (commented-out) threading experiment below: appends
    # the shared segment length of every pair onto a shared list.
    lengths.extend(shared_segment_length_genomes(node_a.genome, node_b.genome,
                                                 0)
                   for node_a, node_b in pairs)
# start = perf_counter()
# boundary = len(pairs) // 2
# thread_1 = threading.Thread(target=calculate_to_list,
#                             args = (pairs[:boundary], lengths))
# thread_2 = threading.Thread(target=calculate_to_list,
#                             args = (pairs[boundary:], lengths))
# thread_1.start()
# thread_2.start()
# thread_1.join()
# thread_2.join()
# stop = perf_counter()
# print(stop - start)
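The shared-length computation is CPU-bound, so the two threads above would largely serialize on the GIL, which may be why the attempt is commented out. A process-based variant (a hypothetical sketch, not from the original project, assuming genome objects are picklable) sidesteps the GIL:

# Hypothetical multiprocessing variant of the threading experiment.
from itertools import combinations
from multiprocessing import Pool


def pair_length(pair):
    node_a, node_b = pair
    return shared_segment_length_genomes(node_a.genome, node_b.genome, 0)


if __name__ == "__main__":
    pairs = list(combinations(nodes, 2))  # `nodes` as sampled below
    with Pool() as pool:
        lengths = pool.map(pair_length, pairs)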

print("Comparing pairs.")
nodes = population.generations[-1].members
nodes = sample(nodes, 1500)
start = perf_counter()
lengths = [shared_segment_length_genomes(node_a.genome, node_b.genome, 0)
           for node_a, node_b in combinations(nodes, 2)]
stop = perf_counter()
print(stop - start)
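For scale: sampling 1,500 nodes yields C(1500, 2) = 1500 * 1499 / 2 = 1,124,250 pairwise comparisons, which is why the loop above is timed with perf_counter.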

# import pdb
# pdb.set_trace()
# shared = [len(np.flatnonzero(np.unpackbits(a.genome._founder_bits & b.genome._founder_bits)))
#           for a, b in combinations(nodes, 2)]

# print(np.average(shared))
# print(np.std(shared))
# print(max(lengths))
    def identify(self, genome, actual_node, ibd_threshold=5000000):
        id_map = self._population.id_mapping
        length_classifier = self._length_classifier
        shared_list = []
        for labeled_node_id in length_classifier._labeled_nodes:
            labeled_node = id_map[labeled_node_id]
            s = shared_segment_length_genomes(genome, labeled_node.genome,
                                              ibd_threshold)
            shared_list.append((labeled_node_id, s))

        node_data = dict()
        batch_node_id = []
        batch_labeled_node_id = []
        batch_lengths = []
        batch_cryptic_lengths = []
        # This is done for performance reasons, as appending to this
        # list is the hottest part of the loop.
        append_cryptic = batch_cryptic_lengths.append
        distributions = length_classifier._distributions
        # Set membership testing is faster than dictionary key
        # membership testing, so we use a set.
        distribution_members = set(distributions.keys())
        nodes = self._to_search(shared_list)
        for node in nodes:
            node_start_i = len(batch_node_id)
            node_id = node._id
            cryptic_start_i = len(batch_cryptic_lengths)
            for labeled_node_id, shared in shared_list:
                if (node_id, labeled_node_id) not in distribution_members:
                    append_cryptic(shared)
                else:
                    batch_node_id.append(node_id)
                    batch_labeled_node_id.append(labeled_node_id)
                    batch_lengths.append(shared)
            cryptic_stop_i = len(batch_cryptic_lengths)
            node_stop_i = len(batch_node_id)
            node_data[node] = ProbabilityData(node_start_i, node_stop_i,
                                              cryptic_start_i, cryptic_stop_i)

        calc_prob = length_classifier.get_batch_probability(
            batch_lengths, batch_node_id, batch_labeled_node_id)
        cryptic_prob = length_classifier.get_batch_smoothing(
            batch_cryptic_lengths)

        # index_data = {node._id: tuple(indices)
        #               for node, indices in node_data.items()}
        # siblings = {node._id for node in get_sibling_group(actual_node)}
        # to_dump = {"actual_node_id": actual_node._id,
        #            "calc_prob": calc_prob,
        #            "cryptic_lengths": batch_cryptic_lengths,
        #            "siblings": siblings,
        #            "index_data": index_data}
        # output_filename = "/media/paul/Fast Storage/optimize_data/{}.pickle".format(actual_node._id)
        # with open(output_filename, "wb") as pickle_file:
        #     dump(to_dump, pickle_file)
        node_probabilities = dict()  # Log-probability that each node matches
        for node, prob_data in node_data.items():
            start_i, stop_i, cryptic_start_i, cryptic_stop_i = prob_data
            node_calc = calc_prob[start_i:stop_i]
            node_cryptic = cryptic_prob[cryptic_start_i:cryptic_stop_i]
            log_prob = (np.sum(np.log(node_calc)) +
                        np.sum(np.log(node_cryptic)))
            node_probabilities[node] = log_prob
        # potential_node = max(node_probabilities.items(),
        #                      key = lambda x: x[1])[0]
        write_log(
            "identify", {
                "node": actual_node._id,
                "probs":
                {node._id: prob
                 for node, prob in node_probabilities.items()}
            })
        potential_nodes = nlargest(8,
                                   node_probabilities.items(),
                                   key=lambda x: x[1])
        top, top_log_prob = potential_nodes[0]
        sibling_group = get_sibling_group(top)
        for node, log_prob in potential_nodes[1:]:
            if node in sibling_group:
                continue
            next_node = node
            next_log_prob = log_prob
            break
        else:
            next_node, next_log_prob = potential_nodes[1]

        log_ratio = top_log_prob - next_log_prob
        # log_data = {"actual_node_id": actual_node._id,
        #             "prob_indices": prob_data,
        #             "calc_prob": calc_prob,
        #             "cryptic_prob": cryptic_prob,
        #             "sibling_group": [node._id for node in sibling_group]}
        # write_log("run_data", log_data)
        return (sibling_group, log_ratio)
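A hypothetical call site for identify (the identifier object, the query genome, and the true node are assumed to come from the rest of the project):

# Hypothetical usage of identify(); higher log_ratio means the top
# candidate sibling group is more clearly separated from the runner-up.
sibling_group, log_ratio = identifier.identify(query_genome, true_node)
print("candidate ids:", sorted(node._id for node in sibling_group))
print("log ratio:", log_ratio)
print("correct:", true_node in sibling_group)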