Пример #1
0
    def __next__(self):
        """
        Return the next pair of batches sliced from `self.x1` and `self.x2`.

        `self.load()` is called whenever `self.i` is a multiple of
        `self.load_batch`; the batch is then taken at the offset of `self.i`
        within the current group of `self.load_batch` batches.

        Returns:
            tuple: (`format_array` of the x1 slice, `format_array` of the x2 slice)
        Raises:
            StopIteration: once `self.i` has reached `self.n_batches`
        """
        if self.i >= self.n_batches:
            raise StopIteration()

        # Trigger a (re)load at the start of every group of `load_batch` batches
        if self.i % self.load_batch == 0:
            self.load()

        # Row range of this batch, relative to the current group
        first_row = (self.i % self.load_batch) * self.batch_size
        last_row = first_row + self.batch_size

        # The counter is advanced before conversion, as in the original flow
        self.i += 1

        batch_x1 = format_array(self.x1[first_row:last_row, :, :])
        batch_x2 = format_array(self.x2[first_row:last_row, :, :])

        return (batch_x1, batch_x2)
Пример #2
0
    def __next__(self):
        """
        Return the k-mer frequency features for the next batch of fragment pairs.

        Each entry of `self.pairs` is unpacked as two (species, start, end)
        triplets. The matching sub-sequences are cut out of `self.genomes` and
        their k-mer frequencies are computed through `self.pool.map`.
        The pool is closed right after the last batch has been computed.

        Returns:
            tuple: (`format_array` of the first fragments' features,
                    `format_array` of the second fragments' features),
                   both built from float32 arrays
        Raises:
            StopIteration: once `self.i` has reached `self.n_batches`
        """
        if self.i >= self.n_batches:
            raise StopIteration()

        lower = self.i * self.batch_size
        upper = (self.i + 1) * self.batch_size
        current_pairs = self.pairs[lower:upper]

        # Freeze the k-mer settings so that the pool only has to map over sequences
        kmer_counter = partial(get_kmer_frequency, kmer=self.kmer, rc=self.rc)

        # First member of each pair
        left_fragments = []
        for (species, begin, end), _ in current_pairs:
            left_fragments.append(self.genomes[species][begin:end])

        # Second member of each pair
        right_fragments = []
        for _, (species, begin, end) in current_pairs:
            right_fragments.append(self.genomes[species][begin:end])

        left_features = self.pool.map(kmer_counter, left_fragments)
        right_features = self.pool.map(kmer_counter, right_fragments)

        self.i += 1

        # No more work will be submitted after the final batch
        if self.i >= self.n_batches:
            self.pool.close()

        left_array = np.array(left_features, dtype='float32')
        right_array = np.array(right_features, dtype='float32')

        return (format_array(left_array), format_array(right_array))
Пример #3
0
def get_labels(pairs_file):
    """
    Build the binary labels of a pair file:
    1 when both members of a pair have the same 'sp' value, 0 otherwise.

    Args:
        pairs_file (str): npy file with a structured numpy array holding an
          'sp' field with two columns (one per pair member)
    Returns:
        float32 labels of shape (n_pairs, 1), passed through `format_array`
    """

    species = np.load(pairs_file)['sp']

    first_member = species[:, 0]
    second_member = species[:, 1]

    # Boolean match -> float32, then add a trailing axis to get a column vector
    is_same_species = first_member == second_member
    labels = is_same_species.astype('float32')[:, np.newaxis]

    return format_array(labels)
Пример #4
0
def compute_pairwise_comparisons(model,
                                 latent_vectors,
                                 pairs_generator,
                                 vote_threshold=None,
                                 buffer_size=500):
    """
    Computes all comparisons between contig pairs produced by `pairs_generator`
    using the provided `model`. A given contig-contig comparison involves
    comparing every fragment of the first contig with every fragment of the
    second one, i.e. n_frags**2 fragment pairs.
    When set, `vote_threshold` imposes a hard threshold on each fragment-fragment
    comparison and converts it to a binary value: 1 if P(frag, frag) > vote_threshold
    and 0 otherwise. To save some memory, all comparisons are done in batches
    and at most `buffer_size` contig pairs are compared at once.

    Args:
        model (CompositionModel, CoverageModel or CoCoNet): PyTorch deep learning model
          exposing `combine_repr`
        latent_vectors (list): items are (feature name, dict) where dict maps a contig
          to the latent representations of all its fragments
          (shape=(n_fragments, latent_dim))
        pairs_generator (tuple generator): contig pairs to compare
        vote_threshold (float or None): Voting scheme to compare fragments. (None means disabled)
        buffer_size (int): Number of contig pairs to load at once.
    Returns:
        dict: contig pair -> sum of the n_frags**2 fragment-level values
          (probabilities, or votes when `vote_threshold` is set)
    """

    # Get dimensions from any latent vectors
    (n_frags, latent_dim) = next(iter(latent_vectors[0][1].values())).shape

    n_frag_pairs = n_frags**2

    # Fragment indices enumerating the full n_frags x n_frags grid:
    # element 0 indexes the first contig (each index repeated n_frags times),
    # element 1 indexes the second contig (the whole range tiled n_frags times)
    comb_indices = (np.repeat(np.arange(n_frags), n_frags),
                    np.tile(np.arange(n_frags), n_frags))

    edges = dict()

    # Initialize arrays to store inputs of the network.
    # One dict per side of the comparison; the buffers are allocated once
    # and overwritten for every chunk.
    inputs = [{
        feature: np.zeros((buffer_size * n_frag_pairs, latent_dim),
                          dtype='float32')
        for feature, _ in latent_vectors
    } for _ in range(2)]

    # Smaller chunks
    for i, pairs_buffer in enumerate(chunk(*pairs_generator,
                                           size=buffer_size)):

        # Load data
        for (feature, data) in latent_vectors:
            for j, contig_pair in enumerate(pairs_buffer):
                # Rows of pair j form a contiguous block: a slice writes the
                # same rows as an explicit index list, without building it
                rows = slice(j * n_frag_pairs, (j + 1) * n_frag_pairs)

                for k, contig in enumerate(contig_pair):
                    inputs[k][feature][rows] = data[contig][comb_indices[k]]

        # Convert to pytorch
        inputs_torch = [{
            feature: format_array(matrix)
            for feature, matrix in input_j.items()
        } for input_j in inputs]

        if len(inputs_torch[0]) == 1:  # Only one feature type
            feature = next(iter(inputs_torch[0].keys()))
            inputs_torch = [x[feature] for x in inputs_torch]

        # make prediction
        # NOTE(review): the whole buffer is fed to the model; if a chunk holds
        # fewer than `buffer_size` pairs, the trailing rows still contain data
        # from a previous chunk. They are never read below, only computed.
        probs = model.combine_repr(*inputs_torch).detach().cpu().numpy()[:, 0]

        if vote_threshold is not None:
            probs = probs > vote_threshold

        # Save edge weight
        for j, contig_pair in enumerate(pairs_buffer):
            edges[contig_pair] = sum(probs[j * n_frag_pairs:(j + 1) * n_frag_pairs])

        if i % 100 == 0 and i > 0:
            logger.info(f'{i*buffer_size:,} contig pairs processed')

    return edges
Пример #5
0
def save_repr_all(model, fasta=None, coverage=None, dtr=None, output=None,
                  n_frags=30, frag_len=1024, min_ctg_len=2048,
                  rc=True, kmer=4, wsize=64, wstep=32):
    """
    - Calculate intermediate representation for all fragments of all contigs
    - Save it in a .h5 file (one file per feature, one dataset per contig)

    All HDF5 files opened here are closed on exit, including when an
    exception interrupts the processing.

    Args:
        model (CompositionModel, CoverageModel or CoCoNet): model exposing
          `compute_repr`
        fasta (str): path to fasta file
        coverage (str): path to .h5 coverage file (only opened when
          'coverage' is a key of `output`)
        dtr (str or pathlib.Path): path to DTR contig list (to exclude).
          Tab-separated file, contig name in the first column.
          Ignored when None or when the file does not exist.
        output (dict): filename to save latent representations for each feature.
          Required in practice: the None default is not supported by the body.
        n_frags (int): number of equal size fragments to split contigs
        frag_len (int): size of fragments
        min_ctg_len (int): contigs shorter than this are skipped
        rc (bool): whether to take the reverse complements of kmer composition
        kmer (int): kmer for composition feature. Must be the same as the one used
          for the training.
        wsize (int): window size for coverage smoothing. Must be the same as the
          one used for the training.
        wstep (int): window step for coverage smoothing. Must be the same as the
          one used for the training.
    Returns:
        None
    """
    from contextlib import ExitStack
    from pathlib import Path

    # The stack owns every HDF5 handle so that none is leaked on error
    with ExitStack() as stack:
        cov_h5 = None
        if 'coverage' in output:
            cov_h5 = stack.enter_context(h5py.File(coverage, 'r'))

        dtr_contigs = set()
        if dtr is not None and Path(dtr).is_file():
            with open(dtr) as dtr_handle:
                dtr_contigs.update(
                    line.split('\t')[0].strip() for line in dtr_handle
                )

        repr_h5 = {key: stack.enter_context(h5py.File(filename, 'w'))
                   for key, filename in output.items()}

        for contig in SeqIO.parse(fasta, "fasta"):
            if contig.id in dtr_contigs or len(contig.seq) < min_ctg_len:
                continue

            # Evenly spaced fragment starts; fragments overlap whenever
            # step < frag_len
            step = int((len(contig)-frag_len) / n_frags)

            fragment_boundaries = [(step*i, step*i+frag_len) for i in range(n_frags)]

            feature_arrays = []

            if 'composition' in repr_h5:
                # Convert the sequence once instead of once per fragment
                sequence = str(contig.seq)

                x_composition = format_array(
                    np.stack([
                        get_kmer_frequency(sequence[start:stop], kmer=kmer, rc=rc)
                        for (start, stop) in fragment_boundaries
                    ]).astype(np.float32)  # One row per fragment
                )

                feature_arrays.append(x_composition)

            if 'coverage' in repr_h5:
                # Position indices of every fragment, shape (n_frags, frag_len)
                fragment_slices = np.array([np.arange(start, stop)
                                            for (start, stop) in fragment_boundaries])

                # Indexing the second axis with a 2D index array inserts the
                # fragment axis after the first one; it is then moved to the front.
                # NOTE(review): the first axis of the stored coverage is
                # presumably the sample axis - confirm against the writer.
                coverage_genome = np.array(cov_h5[contig.id][:]).astype('float32')[:, fragment_slices]
                coverage_genome = np.swapaxes(coverage_genome, 1, 0)

                x_coverage = format_array(
                    avg_window(coverage_genome, wsize, wstep, axis=2).astype('float32')
                )

                feature_arrays.append(x_coverage)

            x_repr = model.compute_repr(*feature_arrays)

            for key, handle in repr_h5.items():
                handle.create_dataset(contig.id, data=x_repr[key].detach().cpu().numpy(),
                                      dtype='float32')