def __init__(self, **kwargs):
    """
    Set up the lineage-name registry state.

    Keyword Args:
        is_case_sensitive (bool): If False (default), name maps are
            ``OrderedCaselessDict`` so lookups ignore case.
        is_fail_on_extra_tree_lineages (bool): Treat lineages found only on
            the tree as an error (default True).
        is_fail_on_extra_configuration_lineages (bool): Treat lineages found
            only in the configuration as an error (default True).
        logger: Required logger object; a ``KeyError`` is raised if absent.
    """
    self.tree_lineage_names = None
    self.config_lineage_names = None
    self.is_case_sensitive = kwargs.pop("is_case_sensitive", False)
    self.is_fail_on_extra_tree_lineages = kwargs.pop(
        "is_fail_on_extra_tree_lineages", True)
    self.is_fail_on_extra_configuration_lineages = kwargs.pop(
        "is_fail_on_extra_configuration_lineages", True)
    self.logger = kwargs.pop("logger")
    self.original_to_normalized_lineage_name_map = {}
    self.config_name_normalization_report = {}
    self.preanalysis_constrained_species_lineages_map = {}
    if self.is_case_sensitive:
        self.normalized_tree_lineage_names = {}
        self.normalized_config_lineage_names = {}
        # BUG FIX: was ``self.species_names = {}`` — that typo left
        # ``normalized_species_names`` undefined in case-sensitive mode,
        # breaking every later lookup against it (e.g. in
        # read_configuration_table_species).
        self.normalized_species_names = {}
        self.preanalysis_constrained_lineage_species_map = {}
    else:
        self.normalized_tree_lineage_names = OrderedCaselessDict()
        self.normalized_config_lineage_names = OrderedCaselessDict()
        self.normalized_species_names = OrderedCaselessDict()
        self.preanalysis_constrained_lineage_species_map = OrderedCaselessDict()
    self.extra_tree_lineage_names = []
    self.extra_configuration_lineages = []
def read_configuration_table_species(self, conf_lineage_species_map, conf_constrained_lineages):
    """
    Register the constrained lineage-to-species assignments read from the
    configuration table.

    Only lineages listed in ``conf_constrained_lineages`` are registered.
    Species names are canonicalized to their first-seen spelling; lineage
    names are mapped through ``original_to_normalized_lineage_name_map``.
    Exits with an error for unknown lineages or duplicate assignments,
    and finishes by logging the constrained-species report.
    """
    # Membership lookup for the constrained lineages, honoring the
    # configured case sensitivity.
    constrained_lookup = {} if self.is_case_sensitive else OrderedCaselessDict()
    for constrained_name in conf_constrained_lineages:
        constrained_lookup[constrained_name] = True
    for raw_lineage, raw_species in conf_lineage_species_map.items():
        if raw_lineage not in constrained_lookup:
            continue
        # Canonicalize the species name to its first-registered spelling.
        if raw_species in self.normalized_species_names:
            species_name = self.normalized_species_names[raw_species]
        else:
            self.normalized_species_names[raw_species] = raw_species
            species_name = raw_species
        # Map the configuration's lineage spelling to the tree's canonical one.
        if raw_lineage in self.original_to_normalized_lineage_name_map:
            normalized_lineage_name = self.original_to_normalized_lineage_name_map[raw_lineage]
        else:
            utility.error_exit(
                msg="Lineage '{}' not defined (missing on tree?)".format(raw_lineage),
                logger=self.logger)
        if normalized_lineage_name in self.preanalysis_constrained_lineage_species_map:
            utility.error_exit(
                msg="Duplicate lineage species assignment: '{}'".format(normalized_lineage_name),
                logger=self.logger)
        self.preanalysis_constrained_lineage_species_map[normalized_lineage_name] = species_name
        self.preanalysis_constrained_species_lineages_map.setdefault(
            species_name, set()).add(normalized_lineage_name)
    self.preanalysis_constrained_species_report()
def __init__(self, citation=None):
    """
    Sets up internal dictionary of BibTeX fields, and initializes
    if argument is given.

    Args:
        citation: Optional seed value — another ``BibTexEntry`` (its fields
            are copied), a ``dict`` of field names to values, or a raw
            BibTeX string that is parsed. ``None`` creates an empty entry.
    """
    self.bibtype = None
    self.citekey = None
    if isinstance(citation, BibTexEntry):
        self._entry_dict = OrderedCaselessDict(citation._entry_dict)
    elif isinstance(citation, dict):
        self._entry_dict = OrderedCaselessDict()
        for k, v in citation.items():
            self._entry_dict[k.lower()] = v
        self.bibtype = self._entry_dict.get("bibtype", None)
        self.citekey = self._entry_dict.get("citekey", None)
    else:
        self._entry_dict = OrderedCaselessDict()
        # BUG FIX: parse_text was previously called unconditionally, so the
        # documented default ``citation=None`` crashed with an
        # AttributeError inside parse_text (None has no ``replace``).
        if citation is not None:
            self.parse_text(citation)
class Registry(object):
    """
    Reconciles lineage (taxon) names found on the population tree with those
    found in the configuration data: normalizes spellings, records
    constrained lineage-to-species assignments, and reports/validates
    mismatches between the two sources.
    """

    def __init__(self, **kwargs):
        """
        Keyword Args:
            is_case_sensitive (bool): If False (default), name maps are
                ``OrderedCaselessDict`` so lookups ignore case.
            is_fail_on_extra_tree_lineages (bool): Treat tree-only lineages
                as an error (default True).
            is_fail_on_extra_configuration_lineages (bool): Treat
                configuration-only lineages as an error (default True).
            logger: Required logger object; ``KeyError`` if absent.
        """
        self.tree_lineage_names = None
        self.config_lineage_names = None
        self.is_case_sensitive = kwargs.pop("is_case_sensitive", False)
        self.is_fail_on_extra_tree_lineages = kwargs.pop(
            "is_fail_on_extra_tree_lineages", True)
        self.is_fail_on_extra_configuration_lineages = kwargs.pop(
            "is_fail_on_extra_configuration_lineages", True)
        self.logger = kwargs.pop("logger")
        self.original_to_normalized_lineage_name_map = {}
        self.config_name_normalization_report = {}
        self.preanalysis_constrained_species_lineages_map = {}
        if self.is_case_sensitive:
            self.normalized_tree_lineage_names = {}
            self.normalized_config_lineage_names = {}
            # BUG FIX: was ``self.species_names = {}`` — that typo left
            # ``normalized_species_names`` undefined in case-sensitive mode,
            # breaking read_configuration_table_species and
            # compile_configuration_species_groupings.
            self.normalized_species_names = {}
            self.preanalysis_constrained_lineage_species_map = {}
        else:
            self.normalized_tree_lineage_names = OrderedCaselessDict()
            self.normalized_config_lineage_names = OrderedCaselessDict()
            self.normalized_species_names = OrderedCaselessDict()
            self.preanalysis_constrained_lineage_species_map = OrderedCaselessDict()
        self.extra_tree_lineage_names = []
        self.extra_configuration_lineages = []

    def normalize_lineage_names(self):
        """
        Normalize configuration lineage names against the tree's spellings
        (the tree gives the canonical orthography), then log a report.
        """
        # tree lineages give the canonical orthography
        for lineage in self.tree_lineage_names:
            self.normalized_tree_lineage_names[lineage] = lineage
            self.original_to_normalized_lineage_name_map[lineage] = lineage
        normalized_configuration_lineages = {}
        extra_configuration_lineages = set()
        for lineage in self.config_lineage_names:
            self.normalized_config_lineage_names[lineage] = lineage
            try:
                normalized_name = self.normalized_tree_lineage_names[lineage]
                self.original_to_normalized_lineage_name_map[lineage] = normalized_name
                if normalized_name != lineage:
                    self.config_name_normalization_report[lineage] = (
                        "(NORMALIZED TO: '{}')".format(normalized_name))
                    normalized_configuration_lineages[lineage] = normalized_name
                else:
                    self.config_name_normalization_report[lineage] = ""
            except KeyError:
                # This is a serious error: it means that the configuration file
                # has a taxon that is not on the tree. But we handle this issue
                # later so a full report can be shown
                self.config_name_normalization_report[lineage] = "(NOT FOUND ON TREE)"
                extra_configuration_lineages.add(lineage)
        self.normalization_report(
            normalized_configuration_lineages=normalized_configuration_lineages,
            extra_configuration_lineages=extra_configuration_lineages)

    def read_configuration_table_species(self, conf_lineage_species_map, conf_constrained_lineages):
        """
        Register the constrained lineage-to-species assignments from the
        configuration table. Exits with an error for unknown lineages or
        duplicate assignments; logs a report when done.
        """
        if self.is_case_sensitive:
            nccl = {}
        else:
            nccl = OrderedCaselessDict()
        for ln in conf_constrained_lineages:
            nccl[ln] = True
        for lineage_name in conf_lineage_species_map:
            if lineage_name not in nccl:
                continue
            species_name = conf_lineage_species_map[lineage_name]
            # Canonicalize species name to its first-seen spelling.
            if species_name not in self.normalized_species_names:
                self.normalized_species_names[species_name] = species_name
            else:
                species_name = self.normalized_species_names[species_name]
            try:
                normalized_lineage_name = self.original_to_normalized_lineage_name_map[lineage_name]
            except KeyError:
                utility.error_exit(
                    msg="Lineage '{}' not defined (missing on tree?)".format(lineage_name),
                    logger=self.logger)
            if normalized_lineage_name in self.preanalysis_constrained_lineage_species_map:
                utility.error_exit(
                    msg="Duplicate lineage species assignment: '{}'".format(normalized_lineage_name),
                    logger=self.logger)
            self.preanalysis_constrained_lineage_species_map[normalized_lineage_name] = species_name
            self.preanalysis_constrained_species_lineages_map.setdefault(
                species_name, set()).add(normalized_lineage_name)
        self.preanalysis_constrained_species_report()

    def compile_configuration_species_groupings(self, species_leafset_constraints):
        """
        Register species constraints given as leafset groupings, generating
        synthetic species names ("ConstrainedSp001", ...); logs a report.
        """
        for spi, sp in enumerate(species_leafset_constraints):
            species_name = "ConstrainedSp{:03d}".format(spi + 1)
            self.normalized_species_names[species_name] = species_name
            for lineage_name in sp:
                try:
                    normalized_lineage_name = self.original_to_normalized_lineage_name_map[lineage_name]
                except KeyError:
                    utility.error_exit(
                        msg="Lineage '{}' not defined (missing on tree?)".format(lineage_name),
                        logger=self.logger)
                self.preanalysis_constrained_lineage_species_map[normalized_lineage_name] = species_name
                self.preanalysis_constrained_species_lineages_map.setdefault(
                    species_name, set()).add(normalized_lineage_name)
        self.preanalysis_constrained_species_report()

    def preanalysis_constrained_species_report(self):
        """
        Log tables summarizing the constrained species, the lineages
        assigned to them, and the lineages left unconstrained.
        """
        species_names = sorted(self.preanalysis_constrained_species_lineages_map.keys())
        num_lineages = [
            "({} lineages)".format(len(self.preanalysis_constrained_species_lineages_map[n]))
            for n in species_names
        ]
        stbl = utility.compose_table(
            columns=[species_names, num_lineages],
            prefixes=["", ""],
            quoted=[True, False],
            is_indexed=True,
            indent=" ")
        self.logger.info(
            "{} species defined in configuration constraints, with {} lineages assigned:\n{}".format(
                len(species_names),
                len(self.preanalysis_constrained_lineage_species_map),
                stbl,
            ))
        # Sort lineages by (species, lineage) so groups appear together.
        constrained_lineages = sorted(
            self.preanalysis_constrained_lineage_species_map.keys(),
            key=lambda n: (self.preanalysis_constrained_lineage_species_map[n], n))
        species_assignments = [
            "(SPECIES: '{}')".format(self.preanalysis_constrained_lineage_species_map[n])
            for n in constrained_lineages
        ]
        lntbl = utility.compose_table(
            columns=[constrained_lineages, species_assignments],
            prefixes=["", ""],
            quoted=[True, False],
            is_indexed=True,
            indent=" ")
        self.logger.info(
            "{} out of {} lineages assigned by constraints to {} species:\n{}".format(
                len(constrained_lineages),
                len(self.tree_lineage_names),
                len(species_names),
                lntbl,
            ))
        unconstrained_lineages = sorted(
            n for n in self.tree_lineage_names
            if n not in self.preanalysis_constrained_lineage_species_map)
        lntbl = utility.compose_table(
            columns=[unconstrained_lineages],
            prefixes=[""],
            quoted=[True],
            is_indexed=True,
            indent=" ")
        self.logger.info(
            "{} out of {} lineages not constrained by species assignments:\n{}".format(
                len(unconstrained_lineages),
                len(self.tree_lineage_names),
                lntbl,
            ))
        # Sanity check: every tree lineage is either constrained or not.
        assert len(unconstrained_lineages) + len(constrained_lineages) == len(self.tree_lineage_names)

    def normalization_report(self, normalized_configuration_lineages, extra_configuration_lineages):
        """
        Log a summary of the tree lineages and how the configuration
        lineages compare to them (missing, normalized, or concordant).
        """
        treetbl = utility.compose_table(
            columns=[
                self.tree_lineage_names,
                ["(NOT FOUND IN CONFIGURATION)"
                 if lineage not in self.normalized_config_lineage_names else ""
                 for lineage in self.tree_lineage_names],
            ],
            prefixes=["", ""],
            quoted=[True, False],
            is_indexed=True,
            indent=" ")
        self.logger.info("{} lineages found on population tree:\n{}".format(
            len(self.tree_lineage_names),
            treetbl,
        ))
        if extra_configuration_lineages:
            cfntbl = utility.compose_table(
                columns=[
                    self.config_lineage_names,
                    [self.config_name_normalization_report[n]
                     for n in self.config_lineage_names],
                ],
                prefixes=["", ""],
                quoted=[True, False],
                is_indexed=True,
                indent=" ")
            self.logger.info(
                "{} lineages found in configuration file:\n{}".format(
                    len(self.config_lineage_names),
                    cfntbl,
                ))
        elif normalized_configuration_lineages:
            n1 = list(normalized_configuration_lineages.keys())
            n2 = [normalized_configuration_lineages[k] for k in n1]
            cfntbl = utility.compose_table(
                columns=[n1, n2],
                prefixes=["", "NORMALIZED TO: "],
                quoted=[True, True],
                is_indexed=True,
                indent=" ")
            self.logger.info(
                "{} lineages found in configuration file, with the following normalized for concordance with tree lineages:\n{}".format(
                    len(self.config_lineage_names),
                    cfntbl,
                ))
        else:
            self.logger.info(
                "{} lineages found in configuration file fully concordant with tree lineages".format(
                    len(self.config_lineage_names)))

    def validate_lineage_names(self):
        """
        Cross-check configuration lineages against tree lineages; log
        warnings or errors for mismatches, and exit if any mismatch class
        is configured as fatal.
        """
        for lineage in self.config_lineage_names:
            if lineage not in self.normalized_tree_lineage_names:
                self.extra_configuration_lineages.append(lineage)
        for lineage in self.tree_lineage_names:
            if lineage not in self.normalized_config_lineage_names:
                self.extra_tree_lineage_names.append(lineage)
        if self.extra_tree_lineage_names:
            s1_error_msg = [
                "{}: {} lineages found on tree but not in configuration data:".format(
                    "ERROR" if self.is_fail_on_extra_tree_lineages else "WARNING",
                    len(self.extra_tree_lineage_names))
            ]
            s1_error_msg.append(self.compose_name_list(self.extra_tree_lineage_names))
            s1_error_msg = "\n".join(s1_error_msg)
        else:
            s1_error_msg = ""
        if self.extra_configuration_lineages:
            s2_error_msg = [
                "{}: {} lineages found in configuration data but not on tree:".format(
                    "ERROR" if self.is_fail_on_extra_configuration_lineages else "WARNING",
                    len(self.extra_configuration_lineages))
            ]
            s2_error_msg.append(self.compose_name_list(self.extra_configuration_lineages))
            s2_error_msg = "\n".join(s2_error_msg)
        else:
            s2_error_msg = ""
        is_fail = []
        if self.extra_tree_lineage_names and self.is_fail_on_extra_tree_lineages:
            self.logger.error(s1_error_msg)
            is_fail.append("1")
        elif s1_error_msg:
            self.logger.warning(s1_error_msg)
        if self.extra_configuration_lineages and self.is_fail_on_extra_configuration_lineages:
            self.logger.error(s2_error_msg)
            is_fail.append("2")
        elif s2_error_msg:
            self.logger.warning(s2_error_msg)
        if is_fail:
            utility.error_exit(
                msg="Lineage identity errors found ({})".format(", ".join(is_fail)),
                logger=self.logger)

    def compose_name_list(self, names):
        """Return a quoted, indexed table of the given names."""
        s = utility.compose_table(
            columns=[names],
            prefixes=[""],
            quoted=[True],
            is_indexed=True,
            indent=" ")
        return s

    def compose_report(self):
        """Build and return summary report lines for tree and configuration."""
        msg = []
        msg.append("{} terminal lineages on population tree".format(
            len(self.tree_lineage_names)))
        msg.append("{} lineages described in configuration file".format(
            len(self.config_lineage_names)))
        # BUG FIX: the assembled report list was previously built and then
        # discarded; return it so callers can use it.
        # NOTE(review): the visible source may be truncated here — confirm
        # against the full file that no further report lines are appended.
        return msg
class BibTexEntry(object): """ Tracks a single BibTeX entry. """ decompose_pattern = re.compile(r'^@(\w*)\s*{\s*([\w|\:|\-]*),(.*)}') # works, but misses last field field_pattern = re.compile( r'\s*([\w|\-]*?)\s*=\s*(.*?),(?=\s*[\w|\-]*\s*\=)') # get the last field last_field_pattern = re.compile(r'\s*([\w|\-]*?)\s*=\s*(.*?)\s*[,]*\s*$') def __init__(self, citation=None): """ Sets up internal dictionary of BibTeX fields, and initializes if argument is given. """ self.bibtype = None self.citekey = None if isinstance(citation, BibTexEntry): self._entry_dict = OrderedCaselessDict(citation._entry_dict) elif isinstance(citation, dict): self._entry_dict = OrderedCaselessDict() for k, v in citation.items(): self._entry_dict[k.lower()] = v self.bibtype = self._entry_dict.get("bibtype", None) self.citekey = self._entry_dict.get("citekey", None) else: self._entry_dict = OrderedCaselessDict() self.parse_text(citation) def __getattr__(self, name): """ Allows bibtex fields (and any additional ones) to be treated like object attributes. """ entry_dict = self._get_entry_dict() if name == '_entry_dict' or name == '_BibTexEntry_entry_dict': return entry_dict elif name == '__dict__': return object.__getattribute__(self, '__dict__') elif name == 'bibtype' and hasattr(self, 'bibtype'): return object.__getattribute__(self, '__dict__')['bibtype'] elif name == 'citekey' and hasattr(self, 'citekey'): return object.__getattribute__(self, '__dict__')['citekey'] elif name in entry_dict: return entry_dict[name] elif name in BIBTEX_FIELDS: return "" else: raise AttributeError(name) def __setattr__(self, name, value): """ Allows bibtex fields (and any additional ones) to be treated like object attributes. 
""" entry_dict = self._get_entry_dict() if name == '_entry_dict' or name == '_BibTexEntry_entry_dict': entry_dict = value elif name == 'bibtype' or name == 'citekey': object.__setattr__(self, name, value) else: self._entry_dict[name] = value def __delattr__(self, name): """ Allows bibtex fields (and any additional ones) to be treated like object attributes. """ entry_dict = self._get_entry_dict() if name == '_entry_dict' or name == '_BibTexEntry_entry_dict': object.__delattr__(self, '_entry_dict') elif name in entry_dict: del (entry_dict[name]) elif name in BIBTEX_FIELDS: pass elif name in object.__getattribute__(self, '__dict__'): object.__delattr__(name) else: raise AttributeError(name) def __str__(self): """ String representation of self. """ return self.as_bibtex() def __repr__(self): """ Internal representation of self. """ repr_dict = {} repr_dict['bibtype'] = self.bibtype repr_dict['citekey'] = self.citekey repr_dict.update(self.fields_as_dict()) return repr_dict def _get_entry_dict(self): """ Returns the internal field dictionary, creating it first if neccessary. """ if not hasattr(self, '_entry_dict'): object.__setattr__(self, '_entry_dict', {}) return object.__getattribute__(self, '_entry_dict') def _get_fields(self): """ Returns list of populated fields in order (does not include bibtype and citekey). """ fields = [] for field in BIBTEX_FIELDS: if field in self._entry_dict: fields.append(field) for key in self._entry_dict: if key not in fields: fields.append(key) return fields fields = property(_get_fields) def parse_text(self, text): """ Parses a BibTeX text entry. 
""" text = text.replace("\n", "") self.bibtype = None self.citekey = None text = text.strip() decompose_match = self.decompose_pattern.match(text) try: self.bibtype = decompose_match.group(1) except AttributeError as exception: raise ValueError("Failed to parse bibtype: {}".format(text)) try: self.citekey = decompose_match.group(2) except AttributeError as exception: raise ValueError("Failed to parse citekey: {}".format(text)) remaining = decompose_match.group(3) field_match = self.field_pattern.match(remaining) while field_match: field_match = self.field_pattern.match(remaining) if field_match: field_name = field_match.group(1).lower() field_value = _clean_parsed_text(field_match.group(2)) self._entry_dict[field_name] = field_value remaining = remaining.replace(field_match.group(), '') if remaining: last_field_match = self.last_field_pattern.match(remaining) if last_field_match: field_name = last_field_match.group(1).lower() field_value = _clean_parsed_text(last_field_match.group(2)) self._entry_dict[field_name] = field_value def fields_as_dict(self): """ Returns the fields (i.e., all public attributes except for bibtype and citekey as a dictionary). """ return dict(self._entry_dict) def as_bibtex(self, wrap_width=78): """ Composes entry in BibTex format. """ entry = [] sep = " = " entry.append('@{}{{},'.format((self.bibtype, self.citekey))) fields = self.fields # maxlen = max([len(field) for field in fields]) maxlen = max([len(field) for field in BIBTEX_FIELDS]) for field in fields: if field != 'url': wrap = True else: wrap = False field_header = field.ljust(maxlen) field_value = _format_bibtex_value(self._entry_dict[field], wrap=wrap, width=wrap_width - maxlen - len(sep) + 2, col_start=maxlen + len(sep) + 2) entry.append(" {}{}{},".format((field_header, sep, field_value))) entry.append('}') return '\n'.join(entry) def as_compact_bibtex(self): """ Composes entry in BibTex format. 
""" entry = [] entry.append('@{}{{{},'.format((self.bibtype, self.citekey))) fields = self.fields for field in fields: field_value = _format_bibtex_value(self._entry_dict[field], wrap=False, width=None, col_start=1) entry.append("{}={},".format((field, field_value))) entry.append('}') return ''.join(entry)
class BibTexEntry(object): """ Tracks a single BibTeX entry. """ decompose_pattern = re.compile(r'^@(\w*)\s*{\s*([\w|\:|\-]*),(.*)}') # works, but misses last field field_pattern = re.compile(r'\s*([\w|\-]*?)\s*=\s*(.*?),(?=\s*[\w|\-]*\s*\=)') # get the last field last_field_pattern = re.compile(r'\s*([\w|\-]*?)\s*=\s*(.*?)\s*[,]*\s*$') def __init__(self, citation=None): """ Sets up internal dictionary of BibTeX fields, and initializes if argument is given. """ self.bibtype = None self.citekey = None if isinstance(citation, BibTexEntry): self._entry_dict = OrderedCaselessDict(citation._entry_dict) elif isinstance(citation, dict): self._entry_dict = OrderedCaselessDict() for k, v in citation.items(): self._entry_dict[k.lower()] = v self.bibtype = self._entry_dict.get("bibtype", None) self.citekey = self._entry_dict.get("citekey", None) else: self._entry_dict = OrderedCaselessDict() self.parse_text(citation) def __getattr__(self, name): """ Allows bibtex fields (and any additional ones) to be treated like object attributes. """ entry_dict = self._get_entry_dict() if name == '_entry_dict' or name == '_BibTexEntry_entry_dict': return entry_dict elif name == '__dict__': return object.__getattribute__(self, '__dict__') elif name == 'bibtype' and hasattr(self, 'bibtype'): return object.__getattribute__(self, '__dict__')['bibtype'] elif name == 'citekey' and hasattr(self, 'citekey'): return object.__getattribute__(self, '__dict__')['citekey'] elif name in entry_dict: return entry_dict[name] elif name in BIBTEX_FIELDS: return "" else: raise AttributeError(name) def __setattr__(self, name, value): """ Allows bibtex fields (and any additional ones) to be treated like object attributes. 
""" entry_dict = self._get_entry_dict() if name == '_entry_dict' or name == '_BibTexEntry_entry_dict': entry_dict = value elif name == 'bibtype' or name == 'citekey': object.__setattr__(self, name, value) else: self._entry_dict[name] = value def __delattr__(self, name): """ Allows bibtex fields (and any additional ones) to be treated like object attributes. """ entry_dict = self._get_entry_dict() if name == '_entry_dict' or name == '_BibTexEntry_entry_dict': object.__delattr__(self, '_entry_dict') elif name in entry_dict: del(entry_dict[name]) elif name in BIBTEX_FIELDS: pass elif name in object.__getattribute__(self, '__dict__'): object.__delattr__(name) else: raise AttributeError(name) def __str__(self): """ String representation of self. """ return self.as_bibtex() def __repr__(self): """ Internal representation of self. """ repr_dict = {} repr_dict['bibtype'] = self.bibtype repr_dict['citekey'] = self.citekey repr_dict.update(self.fields_as_dict()) return repr_dict def _get_entry_dict(self): """ Returns the internal field dictionary, creating it first if neccessary. """ if not hasattr(self, '_entry_dict'): object.__setattr__(self, '_entry_dict', {}) return object.__getattribute__(self, '_entry_dict') def _get_fields(self): """ Returns list of populated fields in order (does not include bibtype and citekey). """ fields = [] for field in BIBTEX_FIELDS: if field in self._entry_dict: fields.append(field) for key in self._entry_dict: if key not in fields: fields.append(key) return fields fields = property(_get_fields) def parse_text(self, text): """ Parses a BibTeX text entry. 
""" text = text.replace("\n", "") self.bibtype = None self.citekey = None text = text.strip() decompose_match = self.decompose_pattern.match(text) try: self.bibtype = decompose_match.group(1) except AttributeError as exception: raise ValueError("Failed to parse bibtype: {}".format(text)) try: self.citekey = decompose_match.group(2) except AttributeError as exception: raise ValueError("Failed to parse citekey: {}".format(text)) remaining = decompose_match.group(3) field_match = self.field_pattern.match(remaining) while field_match: field_match = self.field_pattern.match(remaining) if field_match: field_name = field_match.group(1).lower() field_value = _clean_parsed_text(field_match.group(2)) self._entry_dict[field_name] = field_value remaining = remaining.replace(field_match.group(), '') if remaining: last_field_match = self.last_field_pattern.match(remaining) if last_field_match: field_name = last_field_match.group(1).lower() field_value = _clean_parsed_text(last_field_match.group(2)) self._entry_dict[field_name] = field_value def fields_as_dict(self): """ Returns the fields (i.e., all public attributes except for bibtype and citekey as a dictionary). """ return dict(self._entry_dict) def as_bibtex(self, wrap_width=78): """ Composes entry in BibTex format. """ entry = [] sep = " = " entry.append('@{}{{},'.format((self.bibtype, self.citekey))) fields = self.fields # maxlen = max([len(field) for field in fields]) maxlen = max([len(field) for field in BIBTEX_FIELDS]) for field in fields: if field != 'url': wrap = True else: wrap = False field_header = field.ljust(maxlen) field_value = _format_bibtex_value(self._entry_dict[field], wrap=wrap, width = wrap_width - maxlen - len(sep) + 2, col_start = maxlen + len(sep) + 2 ) entry.append(" {}{}{},".format((field_header, sep, field_value))) entry.append('}') return '\n'.join(entry) def as_compact_bibtex(self): """ Composes entry in BibTex format. 
""" entry = [] entry.append('@{}{{{},'.format((self.bibtype, self.citekey))) fields = self.fields for field in fields: field_value = _format_bibtex_value(self._entry_dict[field], wrap=False, width=None, col_start=1) entry.append("{}={},".format((field, field_value))) entry.append('}') return ''.join(entry)