Example #1
    def to_chunks(self, chunk_path='chunk{:03d}.tsv.gz', chunk_size=int(1e07)):
        """Split MGnify sequences file into chunks
        Given a .fa[.gz] file, makes chunks of <chunk_size> and stores them
        into out_path directory named according to chunk index.
        Note that <chunk_size> refers to the number of entries, not the
        number of lines in the output chunk, hence chunk sizes (in lines)
        may be heterogeneous.

        Args
        chunk_path (str)    String containing the path of a generic chunk file,
                            e.g. `chunk{:d}.fa.gz` (must be formattable)
        chunk_size (int)    Maximum number of fasta entries to be stored in
                            each chunk

        Raise
        (FileNotFoundError) If given chunk path is not valid
        """
        # Get output directory
        chunks_dir = os.path.dirname(chunk_path)
        # Case given output directory is set but does not exist
        if chunks_dir and not os.path.exists(chunks_dir):
            # Attempt to make a new output directory
            os.mkdir(chunks_dir)
        # Initialize current chunk (batch of fasta sequences entries)
        seq_batch = list()
        # Initialize sequence index
        seq_index = 0
        # Initialize chunk index (guards against empty input file)
        chunk_index = -1
        # Open file for reading
        with open_file(self.path) as file:
            # Loop through every entry in input file
            for entry in fasta_iter(file):
                # Append current entry to batch
                seq_batch.append(entry)
                # Case index reached batch size
                if (seq_index + 1) % chunk_size == 0:
                    # Define chunk index
                    chunk_index = int(seq_index // chunk_size)
                    # Persist chunk to disk
                    self.write_chunk(chunk_path,
                                     chunk_index,
                                     seq_batch,
                                     sep='\n')
                    # Reinitialize chunk content
                    seq_batch = list()
                # Increase sequence counter
                seq_index += 1
            # Persist last chunk, if any
            if seq_batch:
                # Define chunk index
                chunk_index = int(seq_index // chunk_size)
                # Persist chunk to disk
                self.write_chunk(chunk_path, chunk_index, seq_batch, sep='\n')
        # Define number of chunks
        num_chunks = chunk_index + 1
        # Return number of chunks
        return num_chunks
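
All six examples rely on two helpers, open_file and fasta_iter, whose definitions are not shown. Below is a minimal sketch of plausible implementations, assuming fasta_iter yields each entry as a single '>header\nresidues' string, which is the format the other examples unpack with entry.split('\n'):

import gzip


def open_file(path):
    # Open either a gzip-compressed or a plain text file for reading
    if str(path).endswith('.gz'):
        return gzip.open(path, 'rt')
    return open(path, 'r')


def fasta_iter(file):
    # Header and residue lines of the entry currently being read
    header, residues = None, []
    for line in file:
        line = line.strip()
        # Case line opens a new fasta entry
        if line.startswith('>'):
            # Yield previously accumulated entry, if any
            if header is not None:
                yield header + '\n' + ''.join(residues)
            header, residues = line, []
        # Case line holds residues of the current entry
        elif line:
            residues.append(line)
    # Yield the last entry, if any
    if header is not None:
        yield header + '\n' + ''.join(residues)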
Example #2
 def get_length(self):
     # Initialize output length
     length = 0
     # Open underlying file
     with open_file(self.path) as file:
         # Loop through each entry in input fasta file
         for _ in fasta_iter(file):
             # Update dataset length
             length += 1
     # Return dataset length
     return length
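
The same scan can be written more compactly with a generator expression; a sketch of an equivalent, memory-constant version:

def get_length(self):
    # Count fasta entries without binding each one to a name
    with open_file(self.path) as file:
        return sum(1 for _ in fasta_iter(file))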
Example #3
    def search(self, sequences_acc, ret_length=False, verbose=False):
        """Retrieve sequences residues
        Takes a list of sequences accessions and search for the associated
        entry by scanning underlying fasta file headers.

        Args
        sequences_acc (list)    List of sequences accession numbers whose
                                residues must be found in given fasta file
        ret_length (bool)       Whether to return the length of the searched
                                target dataset (disables early stopping
                                criterion)
        verbose (bool)          Whether to print out verbose log

        Return
        (dict(str: str))        Dictionary containing sequences accession
                                numbers as keys and fasta entries as values
        (int)                   Dataset length (returned only if
                                <ret_length> is set)
        """
        # Cast sequence accessions to set
        sequences_acc = set(sequences_acc)
        # Initialize output dict(sequence acc: fasta entry) and length
        sequences, length = dict(), 0
        # Verbose out
        if verbose:
            print('Reading sequences file', self.path)
        # Open file with defined file handler
        with open_file(self.path) as file:
            # Define fasta entries iterator
            tqdm_iter = tqdm(
                fasta_iter(file),  # Input iterator
                disable=(not verbose),  # Set verbose
                file=sys.stdout  # Force printing to stdout
            )
            # Loop through each entry in input fasta file
            for entry in tqdm_iter:
                # Split entry in header and residues
                header, residues = entry.split('\n')
                # Get accession number from header
                acc = re.search(r'^>(\S+)', header).group(1)
                # Case accession is one of the searched ones
                if acc in sequences_acc:
                    # Store entry
                    sequences[acc] = entry
                # Case all sequences have been found
                if (not ret_length) and (len(sequences) == len(sequences_acc)):
                    break  # Early stopping
                # Case length must be returned
                elif ret_length:
                    length += 1
        # Case length must be returned
        if ret_length:
            return sequences, length
        # Case only sequences must be returned
        else:
            return sequences
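
A possible usage sketch, assuming the method belongs to a dataset class wrapping a fasta file path (the Fasta name and the accession numbers below are made up for illustration):

# Hypothetical dataset wrapper around a fasta file
dataset = Fasta(path='sequences.fa.gz')
# Retrieve the entries for two accessions, with a progress bar on stdout
found = dataset.search(['MGYP000001', 'MGYP000002'], verbose=True)
# Each value is a full '>header\nresidues' fasta entry
for acc, entry in found.items():
    header, residues = entry.split('\n')
    print(acc, 'has', len(residues), 'residues')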
Example #4
 def get_longest(self):
     # Initialize current longest entry and its length (number of residues)
     longest_seq, longest_len = '', 0
     # Initialize number of sequences
     num_sequences = 0
     # Open inner dataset file path
     with open_file(self.path) as file:
         # Loop through each file entry
         for entry in fasta_iter(file):
             # Split current entry in header and residues
             header, residues = tuple(entry.split('\n'))
             # Get current sequence and its number of residues
             curr_seq, curr_len = entry, len(residues)
             # Case current sequence is longer than longest
             if curr_len > longest_len:
                 # Update longest sequence and its length
                 longest_seq, longest_len = curr_seq, curr_len
             # Update number of sequences
             num_sequences += 1
     # Return longest sequence, its length and the total number of sequences
     return longest_seq, longest_len, num_sequences
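
A usage sketch under the same assumption of a hypothetical Fasta wrapper class:

dataset = Fasta(path='sequences.fa.gz')
# Single pass over the file, tracking the longest entry seen so far
longest_seq, longest_len, num_sequences = dataset.get_longest()
print('Longest of', num_sequences, 'sequences spans', longest_len, 'residues')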
Example #5
    @classmethod
    def from_fasta(cls, in_file, acc_regex=r'^>(.*)[\n\r]*$'):
        """ Load alignment from .fasta file

        Args
        in_file     (file)  Buffer for reading input fasta file
        acc_regex   (str)   Regex used to extract accession number (or a
                            generic id string) from fasta header lines
                            (by default all the line without '>' character)

        Return
        (MSA)               New Multiple Sequence Alignment object
        """
        # Initialize output MSA object
        msa = cls()
        # Initialize alignment matrix as list (of numpy arrays)
        aln = list()
        # Initialize accession number, begin and end positions as lists
        acc, beg, end = list(), list(), list()
        # Loop through fasta file lines
        for entry in fasta_iter(in_file):
            # Split entry in header and residues
            header, residues = tuple(entry.split('\n'))
            # Save accession number / id
            acc.append(re.search(acc_regex, header).group(1))
            # Save residues for current aligned sequence
            aln.append(list(residues))
        # Update attributes
        msa.aln = np.array(aln)
        msa.acc = np.array(acc)
        # Compute number of non-gap residues for each aligned sequence
        not_gap = np.isin(msa.aln, cls.gap, invert=True).sum(axis=1)
        # Define begin position of each aligned sequence
        msa.beg = (not_gap > 0).astype(int)
        # Define end position of each aligned sequence
        msa.end = not_gap.astype(int)
        # Return reference to current object (allows chaining)
        return msa
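
A usage sketch, assuming MSA is the defining class and that it declares a gap class attribute (e.g. the '-' character) referenced as cls.gap above; the file name is made up:

# Load an alignment from an uncompressed fasta file
with open('alignment.fasta') as in_file:
    msa = MSA.from_fasta(in_file)
# One row per aligned sequence, one column per alignment position
print('Alignment shape:', msa.aln.shape)
# Accession numbers extracted by the default header regex
print('First accessions:', msa.acc[:3])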
Example #6
    def compute_bias(self, fasta_path, threshold=1, inclusive=True):
        # Compute compositional bias
        # Initialize predictions dict(sequence acc: binary pred)
        pred = dict()
        # Open input fasta file
        with open(fasta_path, 'r') as file:
            # Loop through each fasta entry
            for entry in fasta_iter(file):
                # Split entry in header and residues
                head, resid = tuple(entry.split('\n'))
                # Get accession out of header
                acc = re.search(r'^>(\S+)', head).group(1)
                # Append a zeroes vector (no residue predicted disordered)
                pred[acc] = [0] * len(resid)
                # Loop through all disorder predictions
                for beg, end in self.regions.get(acc, []):
                    # Region bounds are taken as 1-based and inclusive
                    for i in range(beg - 1, end):
                        # Set 1 if residue is disordered
                        pred[acc][i] = 1
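
A usage sketch; the snippet is cut before the method returns, so this assumes it eventually returns the pred dictionary, and that self.regions maps accession numbers to 1-based, inclusive (begin, end) region bounds, as the range(beg - 1, end) indexing suggests. All names and values below are made up:

# Hypothetical predictor instance with one precomputed disorder region
predictor.regions = {'SEQ1': [(3, 5)]}
# Fasta file assumed to contain an entry '>SEQ1' with 8 residues
pred = predictor.compute_bias('sequences.fa')
# Per-residue binary vector, positions 3 to 5 flagged as disordered
print(pred['SEQ1'])  # [0, 0, 1, 1, 1, 0, 0, 0]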