예제 #1
0
 def __init__(self, dataset, assay_entry):
     """Build a Sample entry from an Assay entry and extend it with related metadata.

     Stores `dataset` on the instance and fills `self["Id"]` with the
     dataset accession, the Assay name found at
     `assay_entry["Id"]["Assay Name"]`, and the unique 'Sample Name' value
     of `assay_entry`. The entry is then extended in place with Assay
     metadata, Study metadata and dataset files.

     Raises GeneFabISAException if the `accession` or `assay_name`
     attribute of the new entry is a string containing '$' or '/'.
     """
     identifiers = {"Accession": dataset.accession}
     self.dataset = dataset
     self["Id"] = identifiers
     # the assay name has to be in place before the sample name is resolved:
     # the sample name lookup reports `self.assay_name` in its errors
     assay_name = self._get_subkey_value(assay_entry, "Id", "Assay Name")
     self["Id"]["Assay Name"] = assay_name
     sample_name = self._get_unique_primary_value(assay_entry, "Sample Name")
     self["Id"]["Sample Name"] = sample_name
     # reject names containing forbidden characters
     # NOTE(review): `accession` and `assay_name` are presumably properties
     # defined elsewhere on the class -- confirm
     forbidden = {"$", "/"}
     for attr in ("accession", "assay_name"):
         value = getattr(self, attr)
         if isinstance(value, str) and forbidden.intersection(value):
             msg = "Forbidden characters ('$', '/') in sample attribute"
             raise GeneFabISAException(msg, **{f"self.{attr}": value})
     # pull in the remaining metadata, each step modifying the entry in place
     self._INPLACE_extend_with_assay_metadata(assay_entry)
     self._INPLACE_extend_with_study_metadata()
     self._INPLACE_extend_with_dataset_files()
예제 #2
0
 def _ingest_raw_isa(self, data, status_kwargs):
     """Unpack ZIP from URL and delegate to top-level parsers"""
     raw = SimpleNamespace(investigation=None, studies={}, assays={})
     with ZipFile(BytesIO(data)) as archive:
         for filepath in archive.namelist():
             _, filename = path.split(filepath)
             matcher = search(r'^([isa])_(.+)\.txt$', filename)
             if matcher:
                 kind, name = matcher.groups()
                 with archive.open(filepath) as handle:
                     if kind == "i":
                         reader = self._read_investigation
                         raw.investigation = reader(handle)
                     elif kind == "s":
                         reader = self._read_tab
                         raw.studies[name] = reader(handle, status_kwargs)
                     elif kind == "a":
                         reader = self._read_tab
                         raw.assays[name] = reader(handle, status_kwargs)
     for tab, value in raw.__dict__.items():
         if not value:
             msg = "Missing ISA tab"
             _kw = copy_except(status_kwargs, "collection")
             raise GeneFabISAException(msg, tab=tab, **_kw)
     return raw
예제 #3
0
 def _get_unique_primary_value(self, entry, key):
     """Check validity / uniqueness of `key[*].''` in entry and return its value"""
     values = {
         branch[""]
         for branch in entry.get(key, {}) if ("" in branch)
     }
     _kw = dict(accession=self.dataset.accession,
                assay_name=self.assay_name)
     if len(values) == 0:
         msg = "Could not retrieve any value of `key` from Assay entry"
         raise GeneFabISAException(msg, **_kw, key=key)
     elif len(values) > 1:
         msg = "Ambiguous values of `key` for one Assay entry"
         raise GeneFabISAException(msg, **_kw, key=key, values=values)
     else:
         return values.pop()
예제 #4
0
 def _INPLACE_extend_with_study_metadata(self):
     """Populate with Study tab annotation for entries matching current Sample Name"""
     matching_study_sample_names = set(
         self.dataset.best_sample_name_matches(
             self.name,
             self.dataset.isa.studies._by_sample_name,
         ))
     if len(matching_study_sample_names) == 1:
         study_entry = self.dataset.isa.studies._by_sample_name[
             matching_study_sample_names.pop()]
         self["Id"]["Study Name"] = self._get_subkey_value(
             study_entry,
             "Id",
             "Study Name",
         )
         self["Study"] = deepcopy_except(study_entry, "Id")
         self["Investigation"]["Study"] = (
             self.dataset.isa.investigation["Study"].get(
                 self.study_name, {}))
     elif len(matching_study_sample_names) > 1:
         raise GeneFabISAException(
             "Multiple Study 'Sample Name' entries match Assay entry",
             accession=self.dataset.accession,
             assay_sample_name=self.name,
             matching_study_sample_names=matching_study_sample_names,
         )
예제 #5
0
 def _get_subkey_value(self, entry, key, subkey):
     """Check existence of `key.subkey` in entry and return its value"""
     try:
         return entry[key][subkey]
     except (TypeError, KeyError):
         msg = "Could not retrieve value of `key.subkey` from Assay entry"
         _kw = dict(accession=self.dataset.accession,
                    key=key,
                    subkey=subkey)
         raise GeneFabISAException(msg, **_kw)
예제 #6
0
 def __init__(self, raw_tabs, status_kwargs):
     """Convert every row of every raw tab into a nested JSON and collect them.

     `raw_tabs` maps tab names to tables that are walked with
     `iterrows()`. Each row with a non-null 'Sample Name' is converted by
     `_row_to_json` and appended to this container; rows with a null
     'Sample Name' are skipped and reported through `update_status` as a
     warning.

     `_by_sample_name` maps sample names to their JSONs for Study tabs
     only. For any other kind of tab it is a defaultdict whose factory is
     `self._abort_lookup`, because such a lookup would be ambiguous.

     Raises GeneFabISAException if a row lacks 'Sample Name', carries
     several differing 'Sample Name' values, or (Study tabs only) repeats
     a 'Sample Name' that was already registered.
     """
     is_study = (self._self_identifier == "Study")
     if is_study:
         self._by_sample_name = {}
     else:  # lookup in classes like AssayEntries would be ambiguous
         self._by_sample_name = defaultdict(self._abort_lookup)
     for tab_name, table in raw_tabs.items():
         for _, row in table.iterrows():
             if "Sample Name" not in row:
                 msg = f"{self._self_identifier} entry missing 'Sample Name'"
                 context = copy_except(status_kwargs, "collection")
                 raise GeneFabISAException(msg, **context)
             sample_name = row["Sample Name"]
             if isinstance(sample_name, Series):
                 # several values under one label (presumably repeated
                 # 'Sample Name' columns) are tolerated only if they agree
                 if len(set(sample_name)) > 1:
                     detail = "entry has multiple 'Sample Name' values"
                     msg = f"{self._self_identifier} {detail}"
                     context = copy_except(status_kwargs, "collection")
                     raise GeneFabISAException(msg, **context)
                 sample_name = sample_name.iloc[0]
             if isnull(sample_name):
                 update_status(
                     **status_kwargs,
                     status="warning",
                     warning="Null 'Sample Name'",
                     tab=self._self_identifier,
                 )
                 continue
             row_status_kwargs = {**status_kwargs, "sample_name": sample_name}
             entry = self._row_to_json(row, tab_name, row_status_kwargs)
             super().append(entry)
             if not is_study:
                 continue
             if sample_name in self._by_sample_name:
                 msg = "Duplicate 'Sample Name' in Study tab"
                 context = copy_except(status_kwargs, "collection")
                 raise GeneFabISAException(
                     msg, **dict(sample_name=sample_name, **context),
                 )
             self._by_sample_name[sample_name] = entry
예제 #7
0
 def _row_to_json(self, row, name, status_kwargs):
     """Convert a single table row into a nested JSON.

     Columns are processed in order; each column name is split by
     `_parse_field` into (field, subfield, extra):
     - a "Protocol REF" column updates the protocol reference that is
       handed over with every top-level field added after it;
     - a non-qualifier field is added to the JSON (with or without a
       subfield) and becomes the current qualifiable entry;
     - a qualifier field is attached to the current qualifiable entry.
     Columns for which `_parse_field` yields no field are reported through
     `update_status` as a warning and otherwise ignored.

     Returns the JSON, which always contains an "Id" mapping holding
     `name` under "<identifier> Name".

     Raises GeneFabISAException if a qualifier column occurs before any
     top-level field; the exception reports the qualifier's field name.
     """
     json = {"Id": {f"{self._self_identifier} Name": name}}
     protocol_ref, qualifiable = nan, None
     for column, value in row.items():
         field, subfield, _ = self._parse_field(column)
         if field is None:
             update_status(
                 **status_kwargs,
                 status="warning",
                 tab=self._self_identifier,
                 field=repr(column),
                 warning="ISA field is not a string",
             )
             continue
         if field == "Protocol REF":
             protocol_ref = value
             continue
         if self._is_not_qualifier(field):  # top-level field
             if not subfield:  # e.g. "Source Name"
                 qualifiable = self._INPLACE_add_toplevel_field(
                     json,
                     field,
                     value,
                     protocol_ref,
                 )
             else:  # e.g. "Characteristics[Age]"
                 qualifiable = self._INPLACE_add_metadatalike(
                     json,
                     field,
                     subfield,
                     value,
                     protocol_ref,
                     status_kwargs,
                 )
             continue
         # qualify entry at pointer with second-level field
         if qualifiable is None:
             msg = "Qualifier before main field"
             _kw = copy_except(status_kwargs, "collection")
             # report the offending field name rather than the cell value,
             # in line with the other errors raised for ISA fields
             raise GeneFabISAException(msg, field=field, **_kw)
         self._INPLACE_qualify(
             qualifiable,
             field,
             subfield,
             value,
             status_kwargs={**status_kwargs, "name": name},
         )
     return json
예제 #8
0
 def __init__(self, raw_investigation, status_kwargs):
     """Convert the raw investigation tables into JSONs stored on this mapping.

     For every (real_name, isatools_name, target, pattern) in
     `self._key_dispatcher` whose `isatools_name` is present in
     `raw_investigation`, the content (one table or a list of tables) is
     converted with `_jsonify`; a result that is a list holding exactly
     one list is unwrapped. The converted value is then stored under
     `real_name` in one of three ways:
     - `target` and `pattern` both integers: the value must have length
       `pattern`, and its element at `target` is stored;
     - `target` and `pattern` both truthy otherwise: entries are keyed by
       group 1 of `pattern` searched in `entry[target]`;
     - anything else: the value is stored as is.

     Raises GeneFabISAException if the value does not have the expected
     structure or cannot be broken up by name.
     """
     for real_name, isatools_name, target, pattern in self._key_dispatcher:
         if isatools_name not in raw_investigation:
             continue
         content = raw_investigation[isatools_name]
         jsonify_kwargs = dict(coerce_comments=True, status_kwargs=status_kwargs)
         if isinstance(content, list):
             converted = [self._jsonify(df, **jsonify_kwargs) for df in content]
         else:
             converted = self._jsonify(content, **jsonify_kwargs)
         # a list wrapping a single list is flattened by one level
         wraps_single_list = (
             isinstance(converted, list)
             and (len(converted) == 1)
             and isinstance(converted[0], list)
         )
         if wraps_single_list:
             converted = converted[0]
         if isinstance(target, int) and isinstance(pattern, int):
             try:
                 if len(converted) != pattern:
                     raise IndexError
                 super().__setitem__(real_name, converted[target])
             except (TypeError, IndexError, KeyError):
                 msg = "Unexpected structure of field"
                 context = copy_except(status_kwargs, "collection")
                 raise GeneFabISAException(msg, field=real_name, **context)
         elif target and pattern:
             try:
                 by_name = {}
                 for entry in converted:
                     entry_name = search(pattern, entry[target]).group(1)
                     by_name[entry_name] = entry
                 super().__setitem__(real_name, by_name)
             except (TypeError, AttributeError, IndexError, KeyError):
                 msg = "Could not break up field by name"
                 context = copy_except(status_kwargs, "collection")
                 raise GeneFabISAException(msg, field=real_name, **context)
         else:
             super().__setitem__(real_name, converted)
예제 #9
0
 def _INPLACE_add_metadatalike(self, json, field, subfield, value,
                               protocol_ref, status_kwargs):
     """Add metadatalike to json (e.g. 'Characteristics' -> 'Age'), qualify with 'Protocol REF', point to resulting field"""
     if field not in json:
         json[field] = {}
     if subfield in json[field]:
         msg = "Duplicate field[subfield]"
         _k = copy_except(status_kwargs, "collection")
         raise GeneFabISAException(msg,
                                   field=field,
                                   subfield=subfield,
                                   **_k)
     else:  # make {"Characteristics": {"Age": {"": "36"}}}
         json[field][subfield] = {"": value}
         qualifiable = json[field][subfield]
         if field == "Parameter Value":
             qualifiable["Protocol REF"] = protocol_ref
         return qualifiable