def __next__(self):
    # Iterate over batches; a fresh chunk of `load_batch` batches is read
    # from disk whenever the in-memory chunk is exhausted.
    if self.i < self.n_batches:
        if self.i % self.load_batch == 0:
            self.load()

        idx_inf = (self.i % self.load_batch) * self.batch_size
        idx_sup = idx_inf + self.batch_size

        self.i += 1

        return (
            format_array(self.x1[idx_inf:idx_sup, :, :]),
            format_array(self.x2[idx_inf:idx_sup, :, :])
        )
    raise StopIteration()
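# Illustrative sketch (not part of the original module) of the chunked-loading
# iterator pattern implemented by __next__ above. The class name, data source,
# and shapes are hypothetical: a real generator reads from disk in load();
# here a numpy array stands in for the file.
import numpy as np

class ToyChunkedIterator:
    """Hypothetical miniature of the chunked batch iterator."""

    def __init__(self, data, batch_size=4, load_batch=2):
        self.data = data                    # stand-in for the on-disk dataset
        self.batch_size = batch_size
        self.load_batch = load_batch        # number of batches held in memory
        self.n_batches = len(data) // batch_size
        self.i = 0
        self.buffer = None

    def __iter__(self):
        return self

    def load(self):
        # Read the next `load_batch` batches in a single contiguous slice
        start = self.i * self.batch_size
        self.buffer = self.data[start:start + self.load_batch*self.batch_size]

    def __next__(self):
        if self.i < self.n_batches:
            if self.i % self.load_batch == 0:
                self.load()
            idx_inf = (self.i % self.load_batch) * self.batch_size
            self.i += 1
            return self.buffer[idx_inf:idx_inf + self.batch_size]
        raise StopIteration()

# e.g. [b.shape for b in ToyChunkedIterator(np.zeros((16, 8)))] -> four (4, 8) batches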
def __next__(self):
    # Slice out the fragment pairs for this batch, compute their k-mer
    # frequencies in parallel, and return the two formatted feature matrices.
    if self.i < self.n_batches:
        pairs_batch = self.pairs[self.i*self.batch_size:(self.i+1)*self.batch_size]

        get_kmer_frequency_with_args = partial(
            get_kmer_frequency, kmer=self.kmer, rc=self.rc
        )

        fragments_a = [self.genomes[spA][startA:endA]
                       for (spA, startA, endA), _ in pairs_batch]
        fragments_b = [self.genomes[spB][startB:endB]
                       for _, (spB, startB, endB) in pairs_batch]

        x1 = self.pool.map(get_kmer_frequency_with_args, fragments_a)
        x2 = self.pool.map(get_kmer_frequency_with_args, fragments_b)

        self.i += 1

        if self.i >= self.n_batches:
            self.pool.close()

        return (format_array(np.array(x1, dtype='float32')),
                format_array(np.array(x2, dtype='float32')))
    raise StopIteration()
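# Standalone sketch (hypothetical function and sequences) of the
# partial + Pool.map pattern used above: the fixed keyword arguments are bound
# once with functools.partial, so the pool workers only receive the sequences.
from functools import partial
from itertools import product
from multiprocessing import Pool

def toy_kmer_frequency(seq, kmer=2):
    # Non-overlapping counts in lexicographic k-mer order; no reverse
    # complements, purely for illustration
    kmers = [''.join(p) for p in product('ACGT', repeat=kmer)]
    return [seq.count(k) for k in kmers]

if __name__ == '__main__':
    count_2mers = partial(toy_kmer_frequency, kmer=2)
    with Pool(2) as pool:
        frequencies = pool.map(count_2mers, ['ACGTACGT', 'AAAACCCC'])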
def get_labels(pairs_file):
    """
    Extract labels from a pairs file:
    1 if both species are identical, 0 otherwise

    Args:
        pairs_file (str): npy file with structured numpy array
    Returns:
        torch.Tensor: Binary tensor of size (n_pairs, 1)
    """

    ctg_names = np.load(pairs_file)['sp']
    labels = (ctg_names[:, 0] == ctg_names[:, 1]).astype('float32')[:, None]

    return format_array(labels)
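# Hypothetical usage sketch for get_labels: the pairs file is a structured
# numpy array whose 'sp' field has shape (n_pairs, 2). The file name, extra
# fields, and species labels below are made up for illustration.
import numpy as np

def _make_toy_pairs_file(path='toy_pairs.npy'):
    pairs = np.zeros((3, 2), dtype=[('sp', '<U10'), ('start', int), ('end', int)])
    pairs['sp'] = [['spA', 'spA'], ['spA', 'spB'], ['spB', 'spB']]
    np.save(path, pairs)
    return path

# get_labels(_make_toy_pairs_file())  ->  tensor of shape (3, 1): [[1.], [0.], [1.]]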
def compute_pairwise_comparisons(model, latent_vectors, pairs_generator,
                                 vote_threshold=None, buffer_size=500):
    """
    Computes all comparisons between contig pairs produced by `pairs_generator`
    using the provided `model`. A given contig-contig comparison involves
    comparing all n_frags x n_frags pairs of fragments between the two contigs.
    When set, `vote_threshold` imposes a hard threshold on each
    fragment-fragment comparison and converts it to a binary value:
    1 if P(frag, frag) > vote_threshold, and 0 otherwise.
    To save memory, all comparisons are done in batches and at most
    `buffer_size` contig pairs are compared at once.

    Args:
        model (CompositionModel, CoverageModel or CoCoNet): PyTorch deep learning model
        latent_vectors (list): items are (feature name, dict) where dict is
          the latent representations for all fragments in the contig
          (shape=(n_fragments, latent_dim))
        pairs_generator (tuple generator): contig pairs to compare
        vote_threshold (float or None): Voting scheme to compare fragments.
          (None means disabled)
        buffer_size (int): Number of contigs to load at once.
    Returns:
        dict: computed edges with corresponding probability values
    """

    # Get dimensions from any latent vectors
    (n_frags, latent_dim) = next(iter(latent_vectors[0][1].values())).shape
    n_frag_pairs = n_frags**2
    comb_indices = (np.repeat(np.arange(n_frags), n_frags),
                    np.tile(np.arange(n_frags), n_frags))

    edges = dict()

    # Initialize arrays to store inputs of the network
    inputs = [{
        feature: np.zeros((buffer_size * n_frag_pairs, latent_dim), dtype='float32')
        for feature, _ in latent_vectors
    } for _ in range(2)]

    # Process contig pairs in buffers of `buffer_size`
    for i, pairs_buffer in enumerate(chunk(*pairs_generator, size=buffer_size)):
        # Load data
        for (feature, data) in latent_vectors:
            for j, contig_pair in enumerate(pairs_buffer):
                pos = range(j * n_frag_pairs, (j + 1) * n_frag_pairs)
                for k, contig in enumerate(contig_pair):
                    inputs[k][feature][pos] = data[contig][comb_indices[k]]

        # Convert to pytorch
        inputs_torch = [{
            feature: format_array(matrix)
            for feature, matrix in input_j.items()
        } for input_j in inputs]

        if len(inputs_torch[0]) == 1:  # Only one feature type
            feature = next(iter(inputs_torch[0].keys()))
            inputs_torch = [x[feature] for x in inputs_torch]

        # Make prediction
        probs = model.combine_repr(*inputs_torch).detach().cpu().numpy()[:, 0]

        if vote_threshold is not None:
            probs = probs > vote_threshold

        # Save edge weight
        for j, contig_pair in enumerate(pairs_buffer):
            edges[contig_pair] = sum(probs[j * n_frag_pairs:(j + 1) * n_frag_pairs])

        if i % 100 == 0 and i > 0:
            logger.info(f'{i*buffer_size:,} contig pairs processed')

    return edges
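# Standalone illustration (toy values) of the fragment-pair indexing used in
# compute_pairwise_comparisons: comb_indices enumerates every
# (fragment_a, fragment_b) combination, so the predictions for contig pair j
# occupy one contiguous block of length n_frags**2 in the network output.
import numpy as np

n_frags, latent_dim = 3, 2
comb_indices = (np.repeat(np.arange(n_frags), n_frags),  # a-side: 0 0 0 1 1 1 2 2 2
                np.tile(np.arange(n_frags), n_frags))    # b-side: 0 1 2 0 1 2 0 1 2

latent = np.arange(n_frags * latent_dim, dtype='float32').reshape(n_frags, latent_dim)
assert latent[comb_indices[0]].shape == (n_frags**2, latent_dim)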
def save_repr_all(model, fasta=None, coverage=None, dtr=None, output=None,
                  n_frags=30, frag_len=1024, min_ctg_len=2048, rc=True,
                  kmer=4, wsize=64, wstep=32):
    """
    - Calculate intermediate representation for all fragments of all contigs
    - Save it in a .h5 file

    Args:
        model (CompositionModel, CoverageModel or CoCoNet)
        fasta (str): path to fasta file
        coverage (str): path to .h5 coverage file
        dtr (str): path to DTR contig list (to exclude)
        output (dict): filename to save latent representations for each feature
        n_frags (int): number of equal-size fragments to split contigs into
        frag_len (int): size of fragments
        min_ctg_len (int): minimum contig length to be processed
        rc (bool): whether to take the reverse complement of the kmer composition
        kmer (int): kmer size for the composition feature. Must be the same as the one used for the training.
        wsize (int): window size for coverage smoothing. Must be the same as the one used for the training.
        wstep (int): window step for coverage smoothing. Must be the same as the one used for the training.
    Returns:
        None
    """

    if 'coverage' in output:
        cov_h5 = h5py.File(coverage, 'r')

    dtr_contigs = set()
    if dtr is not None and dtr.is_file():
        dtr_contigs |= set(ctg.split('\t')[0].strip() for ctg in open(dtr))

    repr_h5 = {key: h5py.File(filename, 'w') for key, filename in output.items()}

    for contig in SeqIO.parse(fasta, "fasta"):
        if contig.id in dtr_contigs or len(contig.seq) < min_ctg_len:
            continue

        step = int((len(contig)-frag_len) / n_frags)

        fragment_boundaries = [(step*i, step*i+frag_len) for i in range(n_frags)]

        feature_arrays = []

        if 'composition' in repr_h5:
            x_composition = format_array(
                np.stack([
                    get_kmer_frequency(str(contig.seq)[start:stop], kmer=kmer, rc=rc)
                    for (start, stop) in fragment_boundaries
                ]).astype(np.float32)  # Shape = (n_frags, 4**k)
            )

            feature_arrays.append(x_composition)

        if 'coverage' in repr_h5:
            fragment_slices = np.array([np.arange(start, stop)
                                        for (start, stop) in fragment_boundaries])

            coverage_genome = np.array(cov_h5[contig.id][:]).astype('float32')[:, fragment_slices]
            coverage_genome = np.swapaxes(coverage_genome, 1, 0)

            x_coverage = format_array(
                avg_window(coverage_genome, wsize, wstep, axis=2).astype('float32')
            )

            feature_arrays.append(x_coverage)

        x_repr = model.compute_repr(*feature_arrays)

        for key, handle in repr_h5.items():
            handle.create_dataset(contig.id, data=x_repr[key].detach().cpu().numpy(),
                                  dtype='float32')

    for handle in repr_h5.values():
        handle.close()
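# Quick standalone check (hypothetical contig length) of the fragment boundary
# scheme in save_repr_all: n_frags windows of length frag_len whose starts are
# spaced by step = (contig_length - frag_len) // n_frags, so consecutive
# fragments overlap whenever the contig is shorter than n_frags * frag_len.
contig_length, frag_len, n_frags = 2048, 1024, 30

step = int((contig_length - frag_len) / n_frags)
fragment_boundaries = [(step*i, step*i + frag_len) for i in range(n_frags)]

assert all(stop - start == frag_len for (start, stop) in fragment_boundaries)
# First and last windows: (0, 1024) and (986, 2010)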