def _stockholm_to_tabular_msa(fh, constructor=None):
    """Parse Stockholm-formatted lines from `fh` into an alignment object.

    Parameters
    ----------
    fh : iterator of str
        Source of lines. The first line is pulled with ``next`` and the
        remainder is consumed by a ``for`` loop.
    constructor : type, optional
        Sequence class handed to ``_MSAData.build_tabular_msa``. It defaults
        to ``None`` only so that a descriptive error can be raised; callers
        must supply a subclass of ``GrammaredSequence``.

    Returns
    -------
    The value returned by ``_MSAData.build_tabular_msa(constructor)``.

    Raises
    ------
    ValueError
        If `constructor` is ``None``.
    TypeError
        If `constructor` is not a subclass of ``GrammaredSequence``.
    StockholmFormatError
        If `fh` yields no lines, the first line is not a Stockholm header,
        a line is not recognised, or the last line examined is not the
        footer.
    """
    # The sequence type cannot be inferred from the file, so demand it.
    if constructor is None:
        raise ValueError("Must provide `constructor` parameter indicating the "
                         "type of sequences in the alignment. `constructor` "
                         "must be a subclass of `GrammaredSequence` "
                         "(e.g., `DNA`, `RNA`, `Protein`).")
    if not issubclass(constructor, GrammaredSequence):
        raise TypeError("`constructor` must be a subclass of "
                        "`GrammaredSequence`.")

    # An exhausted iterator means there is nothing to parse at all.
    try:
        current = next(fh)
    except StopIteration:
        raise StockholmFormatError("File is empty.")

    # The very first line has to be the Stockholm header.
    if not _is_header(current):
        raise StockholmFormatError("File missing required Stockholm header "
                                   "line.")

    msa_data = _MSAData()
    # `current` deliberately outlives the loop: the footer check below
    # inspects whichever line was looked at last (the header when nothing
    # follows it, or a whitespace-only line if that is how the input ends).
    for current in fh:
        if current.isspace():
            continue

        current = current.rstrip('\n')

        if _is_sequence_line(current):
            name, residues = _parse_sequence_line(current)
            msa_data.add_sequence(name, residues)
            continue

        if current.startswith("#=GF"):
            tag, value = _parse_gf_line(current)
            msa_data.add_gf_metadata(tag, value)
            continue

        if current.startswith("#=GS"):
            name, tag, value = _parse_gs_line(current)
            msa_data.add_gs_metadata(name, tag, value)
            continue

        if current.startswith("#=GR"):
            name, tag, value = _parse_gr_line(current)
            msa_data.add_gr_metadata(name, tag, value)
            continue

        if current.startswith('#=GC'):
            tag, value = _parse_gc_line(current)
            msa_data.add_gc_metadata(tag, value)
            continue

        # Anything after the footer is left unread.
        if _is_footer(current):
            break

        raise StockholmFormatError("Unrecognized line: %r" % current)

    if not _is_footer(current):
        raise StockholmFormatError('Final line does not conform to Stockholm '
                                   'format. Must contain only "//".')

    return msa_data.build_tabular_msa(constructor)
def _format_positional_metadata(df, data_type):
    """Validate positional metadata and return it converted to strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Positional metadata; each column is one feature.
    data_type : str
        Human-readable description of the metadata, used as the prefix of
        any error message.

    Returns
    -------
    pandas.DataFrame
        ``df.astype(str)``.

    Raises
    ------
    StockholmFormatError
        If column labels are not unique, or if any value is not exactly one
        character long once converted to a string.
    """
    labels = df.columns

    # Feature names must not repeat.
    if not labels.is_unique:
        duplicate_count = len(labels) - len(set(labels))
        raise StockholmFormatError('%s feature names must be unique. '
                                   'Found %d duplicate names.'
                                   % (data_type, duplicate_count))

    as_text = df.astype(str)

    # Every value must be a single character after string conversion.
    for label in as_text.columns:
        value_lengths = as_text[label].str.len()
        if (value_lengths != 1).any():
            raise StockholmFormatError("%s must contain a single character for"
                                       " each position's value. Found value(s)"
                                       " in column %s of incorrect length."
                                       % (data_type, label))

    return as_text
def add_gf_metadata(self, feature_name, feature_data):
    """Store one ``#=GF`` feature in ``self._metadata``.

    How the value is stored depends on `feature_name`:

    * ``'TN'`` registers a new, empty tree under ``self._metadata['NH']``
      (an ``OrderedDict`` keyed by tree name).
    * ``'NH'``, when ``'NH'`` is already present, is appended to the most
      recently registered tree, or to the existing string when the stored
      value is not an ``OrderedDict``.
    * ``'RN'`` starts a new, empty reference ``OrderedDict`` in the list
      kept under ``self._metadata['RN']``.
    * A tag in ``_REFERENCE_TAGS`` is stored on the newest reference.
    * Any other tag is stored as-is the first time and extended on
      repeats, with ``_get_padding`` supplying the text placed between the
      old and new data.

    Parameters
    ----------
    feature_name : str
        The GF tag.
    feature_data : str
        The data that followed the tag.

    Raises
    ------
    StockholmFormatError
        If a tree name is registered twice, or if a reference tag is seen
        before any ``'RN'`` tag.
    """
    metadata = self._metadata

    # Tree name: start an (initially empty) labelled tree.
    if feature_name == 'TN':
        if 'NH' not in metadata:
            metadata['NH'] = OrderedDict()
        elif feature_data in metadata['NH']:
            raise StockholmFormatError("Tree name %r used multiple times "
                                       "in file." % feature_data)
        metadata['NH'][feature_data] = ''
        return

    # Further tree data: extend whichever tree was started last.
    if feature_name == 'NH' and 'NH' in metadata:
        trees = metadata['NH']
        if isinstance(trees, OrderedDict):
            newest_tree = next(reversed(trees))
            trees[newest_tree] = trees[newest_tree] + feature_data
        else:
            metadata['NH'] = trees + feature_data
        return

    # Reference number: open a fresh reference record.
    if feature_name == 'RN':
        if 'RN' in metadata:
            metadata['RN'].append(OrderedDict())
        else:
            metadata['RN'] = [OrderedDict()]
        return

    # Reference field: belongs to the reference opened most recently.
    if feature_name in _REFERENCE_TAGS:
        if 'RN' not in metadata:
            raise StockholmFormatError("Expected 'RN' tag to precede "
                                       "'%s' tag." % feature_name)
        current_reference = metadata['RN'][-1]
        if feature_name in current_reference:
            padding = _get_padding(current_reference[feature_name])
            current_reference[feature_name] += padding + feature_data
        else:
            current_reference[feature_name] = feature_data
        return

    # Everything else: first occurrence is stored, repeats are appended.
    if feature_name in metadata:
        padding = _get_padding(metadata[feature_name][-1])
        metadata[feature_name] = (metadata[feature_name] + padding +
                                  feature_data)
    else:
        metadata[feature_name] = feature_data
def _tabular_msa_to_stockholm(obj, fh):
    """Serialise the alignment `obj` to `fh` in Stockholm format.

    Output order is: header line, ``#=GF`` lines, ``#=GS`` lines (emitted
    sequence by sequence), the rows handed to ``_write_padded_data``
    (sequence data, ``#=GR`` rows and ``#=GC`` rows), and finally the ``//``
    footer.

    Parameters
    ----------
    obj
        Alignment to write. This function uses its ``index``, ``metadata``
        and ``positional_metadata`` attributes, its ``has_metadata()`` and
        ``has_positional_metadata()`` methods, and iterates over it to
        obtain the sequences.
    fh
        Writable object; only ``fh.write`` is called here.

    Raises
    ------
    StockholmFormatError
        If the index labels of `obj` are not unique. This is checked before
        anything is written.
    """
    if not obj.index.is_unique:
        raise StockholmFormatError("The TabularMSA's index labels must be"
                                   " unique.")

    # Header
    fh.write("# STOCKHOLM 1.0\n")

    # Per-file (GF) features
    if obj.has_metadata():
        for tag, value in viewitems(obj.metadata):
            if not (tag == 'NH' and isinstance(value, dict)):
                fh.write("#=GF %s %s\n" % (tag, value))
                continue
            # A dict under 'NH' holds labelled trees: name line, then tree.
            for tree_id, tree in viewitems(value):
                fh.write("#=GF TN %s\n" % tree_id)
                fh.write("#=GF NH %s\n" % tree)

    # (label, data) pairs whose padded output is deferred until the end.
    rows = []

    for seq, label in zip(obj, obj.index):
        label = str(label)

        # Per-sequence (GS) features are written immediately.
        if seq.has_metadata():
            for tag, value in viewitems(seq.metadata):
                fh.write("#=GS %s %s %s\n" % (label, tag, value))

        rows.append((label, str(seq)))

        # Per-residue (GR) features follow the sequence they annotate.
        if seq.has_positional_metadata():
            gr_frame = _format_positional_metadata(
                seq.positional_metadata,
                'Sequence-specific positional metadata (GR)')
            rows.extend(
                ("#=GR %s %s" % (label, column), ''.join(gr_frame[column]))
                for column in gr_frame.columns)

    # Per-column (GC) features come after all sequences.
    if obj.has_positional_metadata():
        gc_frame = _format_positional_metadata(
            obj.positional_metadata,
            'Multiple sequence alignment positional metadata (GC)')
        rows.extend(
            ("#=GC %s" % column, ''.join(gc_frame[column]))
            for column in gc_frame.columns)

    _write_padded_data(rows, fh)

    # Footer
    fh.write("//\n")
def add_gf_metadata(self, feature_name, feature_data):
    """Store one ``#=GF`` feature in ``self._metadata``.

    * ``'TN'`` registers a new, empty tree under ``self._metadata['NH']``
      (an ``OrderedDict`` keyed by tree name).
    * ``'NH'``, when ``'NH'`` is already present, is appended to the most
      recently registered labelled tree, or to the existing value when the
      tree is unlabelled (stored as a plain string because no ``'TN'`` line
      preceded it).
    * Any other tag is stored as-is the first time and concatenated onto the
      existing value on repeats.

    Parameters
    ----------
    feature_name : str
        The GF tag.
    feature_data : str
        The data that followed the tag.

    Raises
    ------
    StockholmFormatError
        If a tree name is registered more than once.
    """
    # Handles first instance of labelled tree
    if feature_name == 'TN' and 'NH' not in self._metadata:
        self._metadata['NH'] = OrderedDict()
        self._metadata['NH'][feature_data] = ''
    # Handles second instance of labelled tree
    elif feature_name == 'TN' and 'NH' in self._metadata:
        if feature_data in self._metadata['NH']:
            raise StockholmFormatError("Tree name %r used multiple times "
                                       "in file." % feature_data)
        self._metadata['NH'][feature_data] = ''
    # Handles extra line(s) of an already created tree
    elif feature_name == 'NH' and feature_name in self._metadata:
        trees = self._metadata[feature_name]
        if isinstance(trees, OrderedDict):
            # Labelled trees: extend the one registered last. `reversed`
            # yields the newest key without copying all keys into a list.
            tree_id = next(reversed(trees))
            trees[tree_id] = trees[tree_id] + feature_data
        else:
            # Unlabelled tree spread over several NH lines: the stored value
            # is the text so far, so there are no keys to look up.
            self._metadata[feature_name] = trees + feature_data
    elif feature_name in self._metadata:
        self._metadata[feature_name] = (self._metadata[feature_name] +
                                        feature_data)
    else:
        self._metadata[feature_name] = feature_data
def build_tabular_msa(self, constructor):
    """Assemble the collected data into a ``TabularMSA``.

    Parameters
    ----------
    constructor
        Passed unchanged to ``build_sequence`` of every stored sequence
        entry.

    Returns
    -------
    TabularMSA
        Built from the sequences in ``self._seq_order`` order, indexed by
        ``self._seq_order``. ``metadata`` and ``positional_metadata`` are
        passed as ``None`` when the stored values are falsy.

    Raises
    ------
    StockholmFormatError
        If ``self._seqs`` and ``self._seq_order`` differ in length; the
        message lists the names present in ``self._seqs`` but absent from
        ``self._seq_order``.
    """
    order = self._seq_order
    entries = self._seqs

    # A size mismatch means some entry never received sequence data.
    if len(entries) != len(order):
        unknown_names = set(entries) - set(order)
        raise StockholmFormatError('Found GS or GR metadata for '
                                   'nonexistent sequence(s): %r'
                                   % unknown_names)

    built = [entries[name].build_sequence(constructor) for name in order]

    # Falsy (e.g. empty) containers are handed over as None.
    positional_metadata = self._positional_metadata or None
    metadata = self._metadata or None

    return TabularMSA(built, metadata=metadata,
                      positional_metadata=positional_metadata,
                      index=order)
def _check_for_malformed_line(line, expected_len):
    """Ensure `line` holds exactly `expected_len` items.

    Parameters
    ----------
    line : sized collection
        The items of one line; only ``len(line)`` is inspected.
    expected_len : int
        Required number of items.

    Raises
    ------
    StockholmFormatError
        If the number of items differs from `expected_len`.
    """
    item_count = len(line)
    if item_count == expected_len:
        return

    raise StockholmFormatError('Line contains %d item(s). It must '
                               'contain exactly %d item(s).'
                               % (item_count, expected_len))
def _raise_duplicate_error(message):
    """Raise ``StockholmFormatError`` with an interleaved-format hint.

    Parameters
    ----------
    message : str
        Description of the duplicate that was found. A fixed note is
        appended, pointing out that interleaved Stockholm files are not
        supported by the reader.

    Raises
    ------
    StockholmFormatError
        Always.
    """
    interleaved_note = (' Note: If the file being used is in '
                        'Stockholm interleaved format, this '
                        'is not supported by the reader.')
    raise StockholmFormatError(message + interleaved_note)
def _tabular_msa_to_stockholm(obj, fh):
    """Serialise the alignment `obj` to `fh` in Stockholm format.

    Output order is: header line, ``#=GF`` lines (including labelled trees
    and numbered references), ``#=GS`` lines (emitted sequence by sequence),
    the rows handed to ``_write_padded_data`` (sequence data, ``#=GR`` rows
    and ``#=GC`` rows), and finally the ``//`` footer.

    Parameters
    ----------
    obj
        Alignment to write. This function uses its ``index``, ``metadata``
        and ``positional_metadata`` attributes, its ``has_metadata()`` and
        ``has_positional_metadata()`` methods, and iterates over it to
        obtain the sequences.
    fh
        Writable object; only ``fh.write`` is called here.

    Raises
    ------
    StockholmFormatError
        If the index labels of `obj` are not unique, if ``'RN'`` metadata is
        not a list, if an entry of that list is not a dict, or if a reference
        dict contains a key that is not in ``_REFERENCE_TAGS``. Apart from
        the index check, these are detected while writing, so `fh` may
        already hold partial output when they are raised.
    """
    if not obj.index.is_unique:
        raise StockholmFormatError("The TabularMSA's index labels must be"
                                   " unique.")

    # Writes header
    fh.write("# STOCKHOLM 1.0\n")

    # Writes GF data to file
    if obj.has_metadata():
        for gf_feature, gf_feature_data in obj.metadata.items():
            if gf_feature == 'NH' and isinstance(gf_feature_data, dict):
                # A dict under 'NH' holds labelled trees: name, then tree.
                for tree_id, tree in gf_feature_data.items():
                    fh.write("#=GF TN %s\n" % (tree_id,))
                    fh.write("#=GF NH %s\n" % (tree,))
            elif gf_feature == 'RN':
                if not isinstance(gf_feature_data, list):
                    # The value is wrapped in a 1-tuple on purpose: a bare
                    # tuple on the right of `%` is unpacked into separate
                    # format arguments and would raise TypeError here
                    # instead of the intended error.
                    raise StockholmFormatError(
                        "Expected 'RN' to contain a list of reference "
                        "dictionaries, got %r." % (gf_feature_data,))

                # References are numbered from 1 in list order.
                for ref_num, reference in enumerate(gf_feature_data,
                                                    start=1):
                    if not isinstance(reference, dict):
                        raise StockholmFormatError(
                            "Expected reference information to be stored as a "
                            "dictionary, found reference %d stored as %r." %
                            (ref_num, type(reference).__name__))

                    fh.write("#=GF RN [%d]\n" % ref_num)

                    for feature in reference:
                        if feature not in _REFERENCE_TAGS:
                            formatted_reference_tags = ', '.join(
                                _REFERENCE_TAGS)
                            raise StockholmFormatError(
                                "Invalid reference tag %r found in reference "
                                "dictionary %d. Valid reference tags are: %s."
                                % (feature, ref_num,
                                   formatted_reference_tags))

                        fh.write("#=GF %s %s\n" % (feature,
                                                   reference[feature]))
            else:
                fh.write("#=GF %s %s\n" % (gf_feature, gf_feature_data))

    # (label, data) pairs whose padded output is deferred until the end.
    unpadded_data = []

    # Writes GS data to file, retrieves GR data, and retrieves sequence data
    for seq, seq_name in zip(obj, obj.index):
        seq_name = str(seq_name)

        if seq.has_metadata():
            for gs_feature, gs_feature_data in seq.metadata.items():
                fh.write("#=GS %s %s %s\n" % (seq_name, gs_feature,
                                              gs_feature_data))

        unpadded_data.append((seq_name, str(seq)))

        if seq.has_positional_metadata():
            df = _format_positional_metadata(
                seq.positional_metadata,
                'Sequence-specific positional metadata (GR)')
            for gr_feature in df.columns:
                gr_feature_data = ''.join(df[gr_feature])
                gr_string = "#=GR %s %s" % (seq_name, gr_feature)
                unpadded_data.append((gr_string, gr_feature_data))

    # Retrieves GC data
    if obj.has_positional_metadata():
        df = _format_positional_metadata(
            obj.positional_metadata,
            'Multiple sequence alignment positional metadata (GC)')
        for gc_feature in df.columns:
            gc_feature_data = ''.join(df[gc_feature])
            gc_string = "#=GC %s" % (gc_feature,)
            unpadded_data.append((gc_string, gc_feature_data))

    # Writes GR, GC, and raw data to file with padding
    _write_padded_data(unpadded_data, fh)

    # Writes footer
    fh.write("//\n")