def __init__(self, dataset, assay_entry):
    """Build a single Sample entry starting from one Assay entry.

    Stores `dataset` on the instance, fills the "Id" sub-dictionary with the
    dataset accession, the assay name and the sample name, validates the
    accession and assay name, and then delegates to the
    `_INPLACE_extend_with_*` methods to attach assay, study and dataset
    file information.

    Raises GeneFabISAException if `self.accession` or `self.assay_name` is
    a string containing '$' or '/'.
    """
    identifiers = {"Accession": dataset.accession}
    self.dataset = dataset
    self["Id"] = identifiers
    # associate with assay name:
    assay_name = self._get_subkey_value(assay_entry, "Id", "Assay Name")
    self["Id"]["Assay Name"] = assay_name
    # associate with sample name:
    sample_name = self._get_unique_primary_value(assay_entry, "Sample Name")
    self["Id"]["Sample Name"] = sample_name
    # validate names (non-string values are not checked):
    forbidden_characters = {"$", "/"}
    for attr_name in ("accession", "assay_name"):
        attr_value = getattr(self, attr_name)
        if not isinstance(attr_value, str):
            continue
        if not forbidden_characters.isdisjoint(attr_value):
            msg = "Forbidden characters ('$', '/') in sample attribute"
            raise GeneFabISAException(msg, **{f"self.{attr_name}": attr_value})
    # associate with assay and study metadata:
    self._INPLACE_extend_with_assay_metadata(assay_entry)
    self._INPLACE_extend_with_study_metadata()
    self._INPLACE_extend_with_dataset_files()
def _ingest_raw_isa(self, data, status_kwargs):
    """Unpack a ZIP archive held in memory and hand its ISA tabs to parsers.

    data: bytes of the ZIP archive
    status_kwargs: forwarded to the tab reader; also used (minus the
        "collection" key) as context when raising

    Archive members whose file name looks like `i_*.txt`, `s_*.txt` or
    `a_*.txt` are read as the investigation, a study tab, or an assay tab
    respectively; all other members are ignored.

    Returns a namespace with `investigation`, `studies` and `assays`.
    Raises GeneFabISAException if any of the three ends up empty.
    """
    raw = SimpleNamespace(investigation=None, studies={}, assays={})
    # study and assay tabs share a reader and differ only in where they go
    tab_stores = {"s": raw.studies, "a": raw.assays}
    with ZipFile(BytesIO(data)) as archive:
        for member in archive.namelist():
            filename = path.split(member)[1]
            matcher = search(r'^([isa])_(.+)\.txt$', filename)
            if not matcher:
                continue
            kind, name = matcher.groups()
            with archive.open(member) as handle:
                if kind == "i":
                    raw.investigation = self._read_investigation(handle)
                else:
                    tab_stores[kind][name] = self._read_tab(
                        handle, status_kwargs,
                    )
    for tab, value in vars(raw).items():
        if value:
            continue
        msg = "Missing ISA tab"
        _kw = copy_except(status_kwargs, "collection")
        raise GeneFabISAException(msg, tab=tab, **_kw)
    return raw
def _get_unique_primary_value(self, entry, key):
    """Return the single distinct value stored under `key[*].''` in entry.

    Every element of `entry[key]` that has an empty-string key contributes
    its value; a missing `key` contributes nothing.

    Raises GeneFabISAException when no value is found, or when more than
    one distinct value is found.
    """
    primary_values = set()
    for branch in entry.get(key, {}):
        if "" in branch:
            primary_values.add(branch[""])
    # error context is assembled up front, as both failure modes share it
    context = dict(
        accession=self.dataset.accession, assay_name=self.assay_name,
    )
    if not primary_values:
        msg = "Could not retrieve any value of `key` from Assay entry"
        raise GeneFabISAException(msg, **context, key=key)
    if len(primary_values) > 1:
        msg = "Ambiguous values of `key` for one Assay entry"
        raise GeneFabISAException(
            msg, **context, key=key, values=primary_values,
        )
    (only_value,) = primary_values
    return only_value
def _INPLACE_extend_with_study_metadata(self):
    """Attach Study tab annotation for the entry matching this sample's name.

    Asks the dataset for the Study sample names that best match
    `self.name`. With exactly one match, the "Study Name" is recorded under
    "Id", the Study entry (minus its "Id") is copied into "Study", and the
    corresponding Investigation "Study" record is stored (an empty dict if
    the investigation has none for this study name). With no match, nothing
    is changed.

    Raises GeneFabISAException if more than one Study sample name matches.
    """
    studies_by_sample_name = self.dataset.isa.studies._by_sample_name
    candidates = set(
        self.dataset.best_sample_name_matches(
            self.name, studies_by_sample_name,
        )
    )
    if len(candidates) > 1:
        raise GeneFabISAException(
            "Multiple Study 'Sample Name' entries match Assay entry",
            accession=self.dataset.accession,
            assay_sample_name=self.name,
            matching_study_sample_names=candidates,
        )
    if not candidates:
        return
    (study_sample_name,) = candidates
    study_entry = studies_by_sample_name[study_sample_name]
    study_name = self._get_subkey_value(study_entry, "Id", "Study Name")
    self["Id"]["Study Name"] = study_name
    self["Study"] = deepcopy_except(study_entry, "Id")
    investigation_studies = self.dataset.isa.investigation["Study"]
    self["Investigation"]["Study"] = investigation_studies.get(
        self.study_name, {},
    )
def _get_subkey_value(self, entry, key, subkey):
    """Return `entry[key][subkey]`.

    Raises GeneFabISAException (carrying the dataset accession, `key` and
    `subkey`) when the lookup fails with a TypeError or KeyError.
    """
    try:
        found = entry[key][subkey]
    except (TypeError, KeyError):
        msg = "Could not retrieve value of `key.subkey` from Assay entry"
        context = dict(
            accession=self.dataset.accession, key=key, subkey=subkey,
        )
        raise GeneFabISAException(msg, **context)
    else:
        return found
def __init__(self, raw_tabs, status_kwargs):
    """Convert tables to nested JSONs, one appended entry per usable row.

    raw_tabs: mapping of tab name to a table providing `.iterrows()`
    status_kwargs: forwarded to status updates and row conversion; used
        without its "collection" key as context when raising

    Rows whose 'Sample Name' is null are reported through `update_status`
    and skipped. For the "Study" kind, every entry is also indexed by
    sample name in `self._by_sample_name`.

    Raises GeneFabISAException when a row has no 'Sample Name', has several
    differing 'Sample Name' values, or (Study only) repeats a sample name.
    """
    tab_kind = self._self_identifier
    is_study = (tab_kind == "Study")
    if is_study:
        self._by_sample_name = {}
    else:
        # lookup in classes like AssayEntries would be ambiguous
        self._by_sample_name = defaultdict(self._abort_lookup)
    for name, raw_tab in raw_tabs.items():
        for _, row in raw_tab.iterrows():
            if "Sample Name" not in row:
                msg = f"{tab_kind} entry missing 'Sample Name'"
                _kw = copy_except(status_kwargs, "collection")
                raise GeneFabISAException(msg, **_kw)
            sample_name = row["Sample Name"]
            if isinstance(sample_name, Series):
                # several 'Sample Name' cells in one row: they must agree
                if len(set(sample_name)) > 1:
                    msg = f"{tab_kind} entry has multiple 'Sample Name' values"
                    _kw = copy_except(status_kwargs, "collection")
                    raise GeneFabISAException(msg, **_kw)
                sample_name = sample_name.iloc[0]
            if isnull(sample_name):
                update_status(
                    **status_kwargs, status="warning",
                    warning="Null 'Sample Name'", tab=tab_kind,
                )
                continue
            row_kwargs = {**status_kwargs, "sample_name": sample_name}
            entry = self._row_to_json(row, name, row_kwargs)
            super().append(entry)
            if not is_study:
                continue
            if sample_name in self._by_sample_name:
                msg = "Duplicate 'Sample Name' in Study tab"
                _kw = copy_except(status_kwargs, "collection")
                _kkw = dict(sample_name=sample_name, **_kw)
                raise GeneFabISAException(msg, **_kkw)
            self._by_sample_name[sample_name] = entry
def _row_to_json(self, row, name, status_kwargs):
    """Convert single row of table to nested JSON.

    Columns are processed in order. A "Protocol REF" column updates the
    protocol reference handed to fields added after it. A column that is not
    a qualifier adds a new entry, which then becomes the target of any
    qualifier columns that follow it.

    row: row object whose `.items()` yields (column, value) pairs
    name: stored under "Id" as "<tab kind> Name"; also forwarded to
        `_INPLACE_qualify` inside its status keyword arguments
    status_kwargs: forwarded to status updates and helpers; used without
        its "collection" key as context when raising

    Returns the dictionary built for the row.
    Raises GeneFabISAException if a qualifier column appears before any
    field it could qualify.
    """
    json = {"Id": {f"{self._self_identifier} Name": name}}
    protocol_ref, qualifiable = nan, None
    for column, value in row.items():
        field, subfield, _ = self._parse_field(column)
        if field is None:
            update_status(
                **status_kwargs, status="warning",
                tab=self._self_identifier, field=repr(column),
                warning="ISA field is not a string",
            )
        elif field == "Protocol REF":
            protocol_ref = value
        elif self._is_not_qualifier(field):  # top-level field
            if not subfield:  # e.g. "Source Name"
                qualifiable = self._INPLACE_add_toplevel_field(
                    json, field, value, protocol_ref,
                )
            else:  # e.g. "Characteristics[Age]"
                qualifiable = self._INPLACE_add_metadatalike(
                    json, field, subfield, value,
                    protocol_ref, status_kwargs,
                )
        elif qualifiable is None:
            # qualifier with nothing to attach to; report the offending
            # column itself rather than the cell value it happens to hold
            msg = "Qualifier before main field"
            _kw = copy_except(status_kwargs, "collection")
            raise GeneFabISAException(msg, field=column, **_kw)
        else:  # qualify entry at pointer with second-level field
            self._INPLACE_qualify(
                qualifiable, field, subfield, value,
                status_kwargs={**status_kwargs, "name": name},
            )
    return json
def __init__(self, raw_investigation, status_kwargs):
    """Convert dataframes to JSONs and store them under their real names.

    Each `(real_name, isatools_name, target, pattern)` item of
    `self._key_dispatcher` whose `isatools_name` is present in
    `raw_investigation` is converted with `self._jsonify` and stored:
      - integer `target` and `pattern`: the converted value must have
        length `pattern`; the element at index `target` is stored;
      - otherwise, truthy `target` and `pattern`: entries are keyed by the
        first group of `pattern` matched against `entry[target]`;
      - otherwise the converted value is stored unchanged.

    Raises GeneFabISAException when the converted value does not have the
    expected structure or cannot be broken up by name.
    """
    for real_name, isatools_name, target, pattern in self._key_dispatcher:
        if isatools_name not in raw_investigation:
            continue
        content = raw_investigation[isatools_name]
        jsonify_kwargs = dict(coerce_comments=True, status_kwargs=status_kwargs)
        if isinstance(content, list):
            converted = []
            for df in content:
                converted.append(self._jsonify(df, **jsonify_kwargs))
        else:
            converted = self._jsonify(content, **jsonify_kwargs)
        # unwrap a list that contains nothing but a single list
        if (
            isinstance(converted, list) and (len(converted) == 1)
            and isinstance(converted[0], list)
        ):
            converted = converted[0]
        if isinstance(target, int) and isinstance(pattern, int):
            try:
                if len(converted) != pattern:
                    raise IndexError
                super().__setitem__(real_name, converted[target])
            except (TypeError, IndexError, KeyError):
                msg = "Unexpected structure of field"
                _kw = copy_except(status_kwargs, "collection")
                raise GeneFabISAException(msg, field=real_name, **_kw)
        elif target and pattern:
            try:
                entries_by_name = {}
                for entry in converted:
                    entry_name = search(pattern, entry[target]).group(1)
                    entries_by_name[entry_name] = entry
                super().__setitem__(real_name, entries_by_name)
            except (TypeError, AttributeError, IndexError, KeyError):
                msg = "Could not break up field by name"
                _kw = copy_except(status_kwargs, "collection")
                raise GeneFabISAException(msg, field=real_name, **_kw)
        else:
            super().__setitem__(real_name, converted)
def _INPLACE_add_metadatalike(self, json, field, subfield, value, protocol_ref, status_kwargs):
    """Add a metadata-like value to json (e.g. 'Characteristics' -> 'Age').

    Creates `json[field]` if needed and stores `{"": value}` under
    `json[field][subfield]`; for the "Parameter Value" field the entry is
    additionally tagged with `protocol_ref` under "Protocol REF".

    Returns the newly stored entry (the same object that sits in `json`), so
    that the caller can qualify it further.
    Raises GeneFabISAException if `json[field][subfield]` already exists.
    """
    subfields = json.setdefault(field, {})
    if subfield in subfields:
        msg = "Duplicate field[subfield]"
        _k = copy_except(status_kwargs, "collection")
        raise GeneFabISAException(msg, field=field, subfield=subfield, **_k)
    # make {"Characteristics": {"Age": {"": "36"}}}
    qualifiable = {"": value}
    subfields[subfield] = qualifiable
    if field == "Parameter Value":
        qualifiable["Protocol REF"] = protocol_ref
    return qualifiable