def _apply_mask(self, gtdb_msa, user_msa, msa_mask, min_perc_aa):
    """Apply canonical mask to MSA file.

    Parameters
    ----------
    gtdb_msa : dict
        Mapping of sequence id -> aligned sequence. Combined with
        ``user_msa`` through ``merge_two_dicts``.
    user_msa : dict
        Mapping of sequence id -> aligned sequence. Only ids found in this
        mapping are eligible for pruning.
    msa_mask : str or file object
        Handed directly to ``np.fromfile``; every byte equal to ``b'1'``
        marks an alignment column to keep.
    min_perc_aa : float
        Multiplied directly by the masked alignment length, so it acts as
        the minimum fraction of non-gap columns a user sequence must have.

    Returns
    -------
    tuple of (dict, dict)
        ``(output_seqs, pruned_seqs)``, both mapping sequence id -> masked
        sequence. ``pruned_seqs`` holds the user sequences that fell below
        the threshold; every other sequence is in ``output_seqs``.

    Raises
    ------
    MSAMaskLengthMismatch
        If any sequence length differs from the mask length.
    """
    aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)

    # NOTE(review): every byte in the file counts toward the mask length,
    # including a trailing newline if present - confirm mask files have none.
    list_mask = np.fromfile(msa_mask, dtype='S1') == b'1'
    mask_len = list_mask.shape[0]

    output_seqs = {}
    pruned_seqs = {}
    bar_fmt = '==> Masked {n_fmt}/{total_fmt} ({percentage:.0f}%) ' \
              'alignments [{rate_fmt}, ETA {remaining}]'
    for seq_id, seq in tqdm(aligned_genomes.items(), bar_format=bar_fmt):
        list_seq = np.fromiter(seq, dtype='S1')
        if mask_len != list_seq.shape[0]:
            raise MSAMaskLengthMismatch(
                'Mask and alignment length do not match.')

        list_masked_seq = list_seq[list_mask]
        n_cols = list_masked_seq.shape[0]

        # tobytes() replaces the deprecated ndarray.tostring() alias.
        masked_seq = list_masked_seq.tobytes().decode('utf-8')

        # Only '.' and '-' are treated as gaps; count them in one pass
        # rather than tallying every distinct character.
        n_gaps = np.count_nonzero(
            (list_masked_seq == b'.') | (list_masked_seq == b'-'))
        valid_bases = n_cols - n_gaps

        # Sequences that are not in user_msa are never pruned.
        if seq_id in user_msa and valid_bases < n_cols * min_perc_aa:
            pruned_seqs[seq_id] = masked_seq
            continue

        output_seqs[seq_id] = masked_seq

    return output_seqs, pruned_seqs
def _apply_mask(self, gtdb_msa, user_msa, msa_mask, min_perc_aa):
    """Apply canonical mask to MSA file.

    Parameters
    ----------
    gtdb_msa : dict
        Mapping of sequence id -> aligned sequence. Combined with
        ``user_msa`` through ``merge_two_dicts``.
    user_msa : dict
        Mapping of sequence id -> aligned sequence. Only ids found in this
        mapping are eligible for pruning.
    msa_mask : str or file object
        Handed directly to ``np.fromfile``; every byte equal to ``b'1'``
        marks an alignment column to keep.
    min_perc_aa : float
        Multiplied directly by the masked alignment length, so it acts as
        the minimum fraction of non-gap columns a user sequence must have.

    Returns
    -------
    tuple of (dict, dict)
        ``(output_seqs, pruned_seqs)``, both mapping sequence id -> masked
        sequence. ``pruned_seqs`` holds the user sequences that fell below
        the threshold; every other sequence is in ``output_seqs``.

    Raises
    ------
    MSAMaskLengthMismatch
        If any sequence length differs from the mask length; the message
        reports both lengths.
    """
    aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)

    # NOTE(review): every byte in the file counts toward the mask length,
    # including a trailing newline if present - confirm mask files have none.
    list_mask = np.fromfile(msa_mask, dtype='S1') == b'1'
    mask_len = list_mask.shape[0]

    output_seqs, pruned_seqs = dict(), dict()
    for seq_id, seq in tqdm_log(aligned_genomes.items(), unit='sequence'):
        list_seq = np.fromiter(seq, dtype='S1')
        if mask_len != list_seq.shape[0]:
            raise MSAMaskLengthMismatch(
                f'Mask ({mask_len}) and alignment ({list_seq.shape[0]}) length do not match.'
            )

        list_masked_seq = list_seq[list_mask]
        n_cols = list_masked_seq.shape[0]

        # tobytes() replaces the deprecated ndarray.tostring() alias.
        masked_seq = list_masked_seq.tobytes().decode('utf-8')

        # Only '.' and '-' are treated as gaps; count them in one pass
        # rather than tallying every distinct character.
        n_gaps = np.count_nonzero(
            (list_masked_seq == b'.') | (list_masked_seq == b'-'))
        valid_bases = n_cols - n_gaps

        # Sequences that are not in user_msa are never pruned.
        if seq_id in user_msa and valid_bases < n_cols * min_perc_aa:
            pruned_seqs[seq_id] = masked_seq
            continue

        output_seqs[seq_id] = masked_seq

    return output_seqs, pruned_seqs
def _apply_mask(self, gtdb_msa, user_msa, msa_mask, min_perc_aa):
    """Apply canonical mask to MSA file.

    Parameters
    ----------
    gtdb_msa : dict
        Mapping of sequence id -> aligned sequence. Combined with
        ``user_msa`` through ``merge_two_dicts``.
    user_msa : dict
        Mapping of sequence id -> aligned sequence. Only ids found in this
        mapping are eligible for pruning.
    msa_mask : str
        Path opened in text mode; only the first line is read and
        surrounding whitespace is stripped. Each ``'1'`` character marks an
        alignment column to keep.
    min_perc_aa : float
        Multiplied directly by the masked alignment length, so it acts as
        the minimum fraction of non-gap columns a user sequence must have.

    Returns
    -------
    tuple of (dict, dict)
        ``(output_seqs, pruned_seqs)``, both mapping sequence id -> masked
        sequence. ``pruned_seqs`` holds the user sequences that fell below
        the threshold; every other sequence is in ``output_seqs``.

    Raises
    ------
    MSAMaskLengthMismatch
        If any sequence length differs from the mask length. The error is
        also written to ``self.logger`` before being raised.
    """
    aligned_genomes = merge_two_dicts(gtdb_msa, user_msa)

    with open(msa_mask, 'r') as f:
        mask = f.readline().strip()
    list_mask = np.array([c == '1' for c in mask], dtype=bool)

    output_seqs = {}
    pruned_seqs = {}
    # dict.items() is used because dict.iteritems() does not exist on
    # Python 3.
    for seq_id, seq in aligned_genomes.items():
        if len(mask) != len(seq):
            self.logger.error('Mask and alignment length do not match.')
            raise MSAMaskLengthMismatch(
                'Mask and alignment length do not match.')

        masked_seq = ''.join(np.array(list(seq), dtype=str)[list_mask])

        # Only '.' and '-' are treated as gaps.
        valid_bases = (len(masked_seq)
                       - masked_seq.count('.')
                       - masked_seq.count('-'))

        # Sequences that are not in user_msa are never pruned.
        if seq_id in user_msa and valid_bases < len(masked_seq) * min_perc_aa:
            pruned_seqs[seq_id] = masked_seq
            continue

        output_seqs[seq_id] = masked_seq

    return output_seqs, pruned_seqs