def create_using_shared_memory(variant_interval):
    """Fill the shared genotype-frequency arrays for one variant interval.

    Worker function: reads the genotype matrix and the (pre-allocated)
    frequency arrays from shared memory and writes frequencies for variants
    in [from_variant, to_variant) in place. Returns nothing; results are
    communicated through shared memory.
    """
    from_variant, to_variant = variant_interval
    logging.info("Creating on interval %d-%d" % (from_variant, to_variant))
    genotype_matrix = from_shared_memory(
        GenotypeMatrix, "genotype_matrix_shared_for_frequencies")
    genotype_frequencies = from_shared_memory(
        GenotypeFrequencies, "genotype_frequencies_shared")
    n_variants = genotype_matrix.matrix.shape[1]
    # An individual is any row with a non-zero genotype at the first variant
    # (rows can be zero-padding for non-individuals). The original
    # len(np.where(...)[0] != 0) compared *indices* to 0 and only gave the
    # right answer by accident; count_nonzero states the intent directly.
    n_individuals = np.count_nonzero(genotype_matrix.matrix[:, 0])
    # Less memory hungry than a whole-matrix np.where, but slower
    for numeric_genotype, array in zip(
            [1, 2, 3],
            [genotype_frequencies.homo_ref,
             genotype_frequencies.homo_alt,
             genotype_frequencies.hetero]):
        logging.info("Finding for genotype %d" % numeric_genotype)
        prev_time = time.time()
        for variant_id in range(from_variant, to_variant):
            if variant_id % 100000 == 0:
                logging.info(
                    "%d/%d variants processed (genotype now is %d). Prev 100k processed in %.3f s"
                    % (variant_id - from_variant, to_variant - from_variant,
                       numeric_genotype, time.time() - prev_time))
                prev_time = time.time()
            # Fraction of individuals with this genotype at this variant
            array[variant_id] = np.count_nonzero(
                genotype_matrix.matrix[:, variant_id] == numeric_genotype
            ) / n_individuals
def get_numeric_node_sequence_single_thread(interval):
    """Convert one slice of the graph's node sequences to numeric form.

    Worker function: reads the shared graph and the shared output array,
    converts letters in [from_pos, to_pos) and writes them into the shared
    array. Returns the interval so the caller can log progress.
    """
    from_pos, to_pos = interval
    t0 = time.time()
    graph = from_shared_memory(Graph, "graph_shared")
    shared_output = from_shared_memory(SingleSharedArray,
                                       "numeric_node_sequences")
    shared_output.array[from_pos:to_pos] = np_letter_sequence_to_numeric(
        graph.node_sequences[from_pos:to_pos])
    logging.info("Spent %.3f s on interval" % (time.time() - t0))
    return from_pos, to_pos
def from_variants(cls, variants, n_individuals, n_variants, n_threads=10,
                  chunk_size=10000):
    """Build a genotype matrix from variants, filled in parallel.

    Allocates an (n_individuals x n_variants) uint8 matrix in shared memory
    and lets a pool of workers fill it chunk by chunk via
    fill_shared_memory_matrix_with_variants.

    :param variants: variant source providing get_chunks(chunk_size=...)
    :param n_threads: number of worker processes
    :param chunk_size: variants per work unit
    :return: a new cls wrapping the filled matrix
    """
    matrix = cls(np.zeros((n_individuals, n_variants), dtype=np.uint8))
    logging.info("Putting genotype matrix in shared memory")
    to_shared_memory(matrix, "genotype_matrix")
    logging.info("Getting variant chunks")
    variant_chunks = variants.get_chunks(chunk_size=chunk_size)
    pool = Pool(n_threads)
    try:
        # imap result values are ignored; workers write into shared memory
        for i, _ in enumerate(
                pool.imap(
                    GenotypeMatrix.fill_shared_memory_matrix_with_variants,
                    variant_chunks),
                start=1):
            logging.info("Done with %d variant chunks" % i)
    finally:
        # The original never closed the pool, leaking worker processes
        pool.close()
        pool.join()
    logging.info("Done with all variant chunks")
    matrix = from_shared_memory(GenotypeMatrix, "genotype_matrix")
    return cls(matrix.matrix)
def analyse(self, n_threads=10):
    """Find, for each variant, its most similar previous variant in parallel.

    Puts the genotype matrix and an empty MostSimilarVariantLookup in shared
    memory, splits the variant range into intervals, and runs
    analyse_variants_on_shared_memody (sic — that is the method's actual
    name) over them in a process pool.

    :return: the filled MostSimilarVariantLookup read back from shared memory
    """
    n_variants = self.matrix.matrix.shape[1]
    most_similar_lookup = np.zeros(n_variants, dtype=np.uint32)
    # np.float was deprecated and removed in NumPy 1.24; plain float is the
    # equivalent (float64) and works on all NumPy versions.
    prob_same_genotype = np.zeros(n_variants, dtype=float)
    lookup = MostSimilarVariantLookup(most_similar_lookup, prob_same_genotype)
    to_shared_memory(self.matrix, "genotype_matrix")
    to_shared_memory(lookup, "most_similar_variant_lookup")
    intervals = [int(i) for i in np.linspace(0, n_variants, n_threads)]
    variant_intervals = list(zip(intervals[:-1], intervals[1:]))
    logging.info("Will analyse intervals: %s" % variant_intervals)
    pool = Pool(n_threads)
    try:
        for _ in pool.imap(
                GenotypeMatrixAnalyser.analyse_variants_on_shared_memody,
                variant_intervals):
            logging.info("Done with one job")
    finally:
        # The original never closed the pool, leaking worker processes
        pool.close()
        pool.join()
    return from_shared_memory(MostSimilarVariantLookup,
                              "most_similar_variant_lookup")
def set_numeric_node_sequences(args):
    """Compute numeric node sequences for a graph in parallel and save it.

    Loads the graph, shares it plus an empty uint8 output array, converts
    the letter sequences interval-by-interval in a process pool (workers
    write directly into the shared array), then writes the updated graph
    back to the same file.

    :param args: namespace with .graph (file path) and .n_threads
    """
    graph = Graph.from_file(args.graph)
    to_shared_memory(graph, "graph_shared")
    numeric_node_sequences = SingleSharedArray(
        np.zeros(len(graph.node_sequences), dtype=np.uint8))
    to_shared_memory(numeric_node_sequences, "numeric_node_sequences")
    # n_threads + 1 boundary points -> n_threads intervals covering the range
    boundaries = [int(i) for i in
                  np.linspace(0, len(graph.node_sequences),
                              args.n_threads + 1)]
    intervals = list(zip(boundaries[:-1], boundaries[1:]))
    logging.info("Intervals: %s" % intervals)
    pool = Pool(args.n_threads)
    try:
        for from_pos, to_pos in pool.imap(
                get_numeric_node_sequence_single_thread, intervals):
            logging.info(
                "Done processing interval %d-%d. Inserting into full array"
                % (from_pos, to_pos))
    finally:
        # The original never closed the pool, leaking worker processes
        pool.close()
        pool.join()
    logging.info("Done with all intervals. Saving new graph")
    numeric_node_sequences = from_shared_memory(SingleSharedArray,
                                                "numeric_node_sequences")
    graph.numeric_node_sequences = numeric_node_sequences.array
    graph.to_file(args.graph)
    logging.info("Saved to the same file %s" % args.graph)
def make_unique_variant_kmers_single_thread(variants, args):
    """Find unique kmers for a set of variants (single worker).

    Pulls the variant-to-nodes mapping, the kmer index and the graph from
    shared memory, then delegates to UniqueVariantKmersFinder.

    :return: FlatKmers with the unique kmers found for these variants
    """
    variant_to_nodes = from_shared_memory(VariantToNodes,
                                          "variant_to_nodes_shared")
    kmer_index = from_shared_memory(CollisionFreeKmerIndex,
                                    "kmer_index_shared")
    graph = from_shared_memory(Graph, "graph_shared")
    logging.info("Reading all variants")
    finder = UniqueVariantKmersFinder(
        graph,
        variant_to_nodes,
        variants,
        args.kmer_size,
        args.max_variant_nodes,
        kmer_index_with_frequencies=kmer_index)
    return finder.find_unique_kmers()
def fill_shared_memory_matrix_with_variants(variants):
    """Write genotypes for a chunk of variants into the shared matrix.

    Worker function: reads the shared GenotypeMatrix and, for each variant,
    stores every individual's numeric genotype in the column given by the
    variant's VCF line number. Returns nothing; results go to shared memory.
    """
    matrix = from_shared_memory(GenotypeMatrix, "genotype_matrix")
    for variant in variants:
        variant_number = variant.vcf_line_number
        if variant_number % 10000 == 0:
            # fixed typo in original log message ("processeed")
            logging.info("%d variants processed" % variant_number)
        for individual_id, genotype in \
                variant.get_individuals_and_numeric_genotypes():
            matrix.matrix[individual_id, variant_number] = genotype
def from_genotype_matrix(cls, genotype_matrix, n_threads=10):
    """Compute per-variant genotype frequencies from a genotype matrix.

    Shares the matrix and three empty frequency arrays (homo ref / homo alt /
    hetero, genotype codes 1/2/3), then fills them in parallel with
    create_using_shared_memory workers.

    :return: the filled GenotypeFrequencies read back from shared memory
    """
    to_shared_memory(genotype_matrix,
                     "genotype_matrix_shared_for_frequencies")
    n_variants = genotype_matrix.matrix.shape[1]
    # An individual is any row with a non-zero genotype at the first variant
    # (rows can be zero-padding for non-individuals). The original
    # len(np.where(...)[0] != 0) compared *indices* to 0 and only gave the
    # right answer by accident; count_nonzero states the intent directly.
    n_individuals = np.count_nonzero(genotype_matrix.matrix[:, 0])
    logging.info("Assumes there are %d individuals and %d variants" %
                 (n_individuals, n_variants))
    data = {
        1: np.zeros(n_variants, dtype=float),
        2: np.zeros(n_variants, dtype=float),
        3: np.zeros(n_variants, dtype=float)
    }
    genotype_frequences = cls(data[1], data[2], data[3])
    to_shared_memory(genotype_frequences, "genotype_frequencies_shared")
    intervals = [int(i) for i in np.linspace(0, n_variants, n_threads)]
    variant_intervals = list(zip(intervals[:-1], intervals[1:]))
    logging.info("Will analyse intervals: %s" % variant_intervals)
    pool = Pool(n_threads)
    try:
        # Workers write into the shared arrays; results are ignored
        for _ in pool.imap(GenotypeFrequencies.create_using_shared_memory,
                           variant_intervals):
            logging.info("Done with one job")
    finally:
        # The original never closed the pool, leaking worker processes
        pool.close()
        pool.join()
    return from_shared_memory(GenotypeFrequencies,
                              "genotype_frequencies_shared")
def analyse_variants_on_shared_memody(variant_interval):
    """Fill the most-similar-variant lookup for one variant interval.

    Worker function: for each variant in [from_id, to_id) (variant 0 has no
    previous variant, so the range is clamped to start at 1), finds the most
    similar previous variant and stores its id and the probability of the
    same genotype into the shared lookup arrays.
    """
    from_id, to_id = variant_interval
    # Variant 0 cannot be compared against a previous variant
    from_id = max(from_id, 1)
    logging.info("Analysing variant %d to %d in one job" % (from_id, to_id))
    matrix = from_shared_memory(GenotypeMatrix, "genotype_matrix")
    lookup = from_shared_memory(MostSimilarVariantLookup,
                                "most_similar_variant_lookup")
    n_individuals = matrix.matrix.shape[0]
    prev_time = time.time()
    for i, variant_id in enumerate(range(from_id, to_id)):
        if i % 5000 == 0 and i > 0:
            logging.info(
                "%d/%d variants analysed (last 5k analyse in %.3f s)"
                % (i, to_id - from_id, time.time() - prev_time))
            prev_time = time.time()
        most_similar, score = matrix.get_most_similar_previous_variant(
            variant_id)
        lookup.lookup_array[variant_id] = most_similar
        lookup.prob_same_genotype[variant_id] = score / n_individuals
def create_index_single_thread(args, interval=None):
    """Run the kmer finder over one genome interval (or the whole genome).

    Uses either the shared-memory graph (when args.graph_file_name is set)
    or a reference FASTA sequence as input, builds a SnpKmerFinder from the
    command-line options, and returns the kmers it finds.

    :param interval: optional (start, end) positions; None means everything
    :return: the result of SnpKmerFinder.find_kmers()
    """
    start_position, end_position = (None, None) if interval is None else interval

    logging.info("Loading data")
    if args.graph_file_name is not None:
        # Graph mode: graph comes from shared memory, no reference needed
        graph = from_shared_memory(Graph, "graph_shared")
        reference = None
    else:
        # Reference mode: require a FASTA file and a sequence name
        graph = None
        assert args.reference_fasta is not None
        assert args.reference_name is not None, "Reference name must be specified"
        reference = Fasta(args.reference_fasta)[args.reference_name]

    logging.info("Running kmerfinder")
    whitelist = None
    if args.whitelist is not None:
        whitelist = set(FlatKmers.from_file(args.whitelist)._hashes)

    skip_kmers_with_nodes = None
    if args.skip_kmers_with_nodes is not None:
        skip_kmers_with_nodes = set(
            FlatKmers.from_file(args.skip_kmers_with_nodes)._nodes)

    finder = SnpKmerFinder(
        graph,
        k=args.kmer_size,
        spacing=args.spacing,
        include_reverse_complements=args.include_reverse_complement,
        pruning=args.pruning,
        max_kmers_same_position=args.max_kmers_same_position,
        max_frequency=args.max_frequency,
        max_variant_nodes=args.max_variant_nodes,
        only_add_variant_kmers=args.only_add_variant_kmers,
        whitelist=whitelist,
        only_save_variant_nodes=args.only_save_variant_nodes,
        start_position=start_position,
        end_position=end_position,
        skip_kmers_with_nodes=skip_kmers_with_nodes,
        only_save_one_node_per_kmer=args.only_save_one_node_per_kmer,
        reference=reference)
    return finder.find_kmers()
def _multiprocess_wrapper(shared_memory_graph_name, variants,
                          limit_to_n_haplotypes=10):
    """Worker entry point: load the shared graph, then delegate.

    Thin wrapper so worker processes only need the shared-memory name of the
    graph rather than the graph object itself.
    """
    shared_graph = from_shared_memory(Graph, shared_memory_graph_name)
    return HaplotypeToNodes.get_flat_haplotypes_and_nodes_from_graph_and_variants(
        shared_graph, variants, limit_to_n_haplotypes)
from graph_kmer_index.shared_mem import to_shared_memory, from_shared_memory
from graph_kmer_index import KmerIndex

# Smoke test: a KmerIndex should return the same lookup result after a
# round trip through shared memory.
index = KmerIndex.from_file("testdata2_index.npz")
print(index.get(852840309094508953))

to_shared_memory(index, "testindex")
new_index = from_shared_memory(KmerIndex, "testindex")
print(new_index.get(852840309094508953))