def _filter(self, data):
    """
    Reduce ``data`` to the fields selected by ``self.select_expr``.

    :param data: Object to filter. It is also handed to ``JMESExtensions``
        when the custom JMESPath functions are built.
    :return: ``data`` itself when ``self.select_expr`` is ``None``,
        otherwise the result of evaluating ``self.select_expr`` on ``data``.
    """
    if self.select_expr is not None:
        extensions = JMESExtensions(data)
        return jmespath.search(
            self.select_expr, data,
            jmespath.Options(custom_functions=extensions))
    # No expression configured: pass the data through untouched.
    return data
def run(self, data, config=None, pipeline=None):
    """
    Read metadata fields from an mmCIF file into ``data["metadata"]``.

    The file is looked up with ``phyre_engine.tools.pdb.find_pdb`` below
    ``self.mmcif_dir`` (suffixes ``.cif`` and ``.cif.gz``), optionally passed
    through ``self._prefilter``, and parsed with ``MMCIF2Dict``. Every
    JMESPath expression in ``self.fields`` is then evaluated against the
    parsed dictionary and stored under its field name.

    :param data: Pipeline state; ``self.get_vals`` extracts the PDB
        identifier from it.
    :param config: Unused here.
    :param pipeline: Unused here.
    :return: ``data`` with the ``metadata`` entry filled in.
    :raises FileNotFoundError: If ``find_pdb`` returns ``None``.
    """
    pdb_id = self.get_vals(data)
    mmcif_file = phyre_engine.tools.pdb.find_pdb(
        pdb_id,
        suffix_list=(".cif", ".cif.gz"),
        base_dir=self.mmcif_dir)

    # Note that the "metadata" entry is created before the existence check,
    # so it is present even when the lookup below fails.
    data.setdefault("metadata", {})

    if mmcif_file is None:
        message = "Could not find mmCIF file {} in {}".format(
            pdb_id, self.mmcif_dir)
        raise FileNotFoundError(message)

    with phyre_engine.tools.pdb.open_pdb(mmcif_file) as mmcif_in:
        source = self._prefilter(mmcif_in) if self.prefilter else mmcif_in
        mmcif_dict = Bio.PDB.MMCIF2Dict.MMCIF2Dict(source)

    # The custom functions are rooted at the parsed mmCIF dictionary, not at
    # the pipeline state.
    extensions = JMESExtensions(mmcif_dict)
    search_opts = jmespath.Options(custom_functions=extensions)
    for field, jmespath_expr in self.fields.items():
        data["metadata"][field] = jmespath.search(
            jmespath_expr, mmcif_dict, search_opts)
    return data
def run(self, data, config=None, pipeline=None):
    """
    Run ``ffindex_modify`` on the ffindex files of an hhsuite database.

    The names of the templates selected by ``self.select_expr`` are written
    to a temporary file, one per line. That file is then passed as the
    ``file_list`` option to ``ffindex_modify`` (with the ``sort`` and
    ``unlink`` flags) for each of the ``a3m``, ``hhm`` and ``cs219`` index
    files derived from ``self.db_prefix``.

    :param data: Pipeline state searched with ``self.select_expr``; each
        selected element must provide a ``name`` key.
    :param config: Unused here.
    :param pipeline: Unused here.
    :return: ``data``.
    """
    extensions = JMESExtensions(data)
    templates = jmespath.search(
        self.select_expr, data,
        jmespath.Options(custom_functions=extensions))

    with tempfile.NamedTemporaryFile("w") as file_list:
        # List the identifiers that are to be removed, one per line.
        for template in templates:
            print(template["name"], file=file_list)
        # The external tool reads the file by name, so the contents must be
        # on disk before it is started.
        file_list.flush()

        db_prefix = Path(self.db_prefix)
        for file_type in ("a3m", "hhm", "cs219"):
            ffindex = Path("{}_{}.ffindex".format(db_prefix, file_type))
            cmd_line = tools.ffindex_modify(
                (self.bin_dir, "ffindex_modify"),
                options={"file_list": file_list.name},
                flags=["sort", "unlink"],
                positional=[ffindex])
            self.logger.debug("Running command %s", cmd_line)
            tools.run(cmd_line, check=True)
    return data
def key_fn(datum):
    """Return the sort key for `datum` by evaluating `jmespath_key` on it."""
    extensions = JMESExtensions(root)
    field_value = jmespath.search(
        jmespath_key, datum,
        jmespath.Options(custom_functions=extensions))
    if not allow_none:
        return field_value
    # Prefix the value with an "is None" flag: tuples starting with False
    # compare lower than those starting with True, so in ascending order
    # missing values end up after present ones.
    return (field_value is None, field_value)
def run(self, data, config=None, pipeline=None):
    """
    Replace the result of ``self.select_expr`` with that of
    ``self.value_expr``.

    ``self.value_expr`` is evaluated against the selected object. A selected
    mapping is cleared and refilled from the replacement mapping; a selected
    sequence has its whole contents swapped by slice assignment. A
    replacement whose kind does not match the selection is reported through
    ``self._type_error``, and any other selection through
    ``self._invalid_selection``.

    :param data: Pipeline state to search.
    :param config: Unused here.
    :param pipeline: Unused here.
    :return: ``data``.
    """
    extensions = JMESExtensions(data)
    opts = jmespath.Options(custom_functions=extensions)
    target = jmespath.search(self.select_expr, data, opts)
    replacement = jmespath.search(self.value_expr, target, opts)

    if isinstance(target, collections.abc.Mapping):
        if not isinstance(replacement, collections.abc.Mapping):
            self._type_error(target, replacement)
        # When the replacement *is* the selection, clearing it would wipe
        # the replacement as well, so leave the object alone.
        if target is not replacement:
            target.clear()
            target.update(replacement)
        return data

    if isinstance(target, collections.abc.Sequence):
        if not isinstance(replacement, collections.abc.Sequence):
            self._type_error(target, replacement)
        self.logger.info(
            ("Replacing result of '%s' (a list of length %d) "
             "with a %d-element list"),
            self.select_expr, len(target), len(replacement))
        # Slice assignment keeps the identity of the selected list.
        target[:] = replacement
        return data

    self._invalid_selection(target)
    return data
def generate_config(self, data, config):
    """
    Build the child pipeline configuration from the runtime configuration
    and the pipeline state.

    Each key of ``self.mapping`` is a JMESPath expression evaluated against
    ``data``; the result is written into ``config`` at the location named
    by the corresponding value, using ``apply_dotted_key``.

    :param data: Pipeline state to search.
    :param config: Configuration to write into; a new empty dictionary is
        used when this is ``None``.
    :return: The configuration that was written into.
    """
    if config is None:
        config = {}
    extensions = JMESExtensions(data)
    search_opts = jmespath.Options(custom_functions=extensions)
    for search_term, config_location in self.mapping.items():
        apply_dotted_key(
            config, config_location,
            jmespath.search(search_term, data, search_opts))
    return config
def run(self, data, config=None, pipeline=None):
    """
    Cluster the selected samples with the em4gmm tools.

    The samples chosen by ``self.select_expr`` are reduced to data points
    with ``self.dimensions_expr`` and written to a temporary file, which is
    fed to ``gmmtrain`` and then ``gmmclass``. The JSON written to
    ``self.gmmtrain_opts["model_details"]`` is stored in
    ``data["clusters"]``, and the per-sample results read from
    ``self.gmmclass_opts["sample_details"]`` are copied onto the samples as
    ``cluster`` and ``lprob``.

    :param data: Pipeline state to search.
    :param config: Unused here.
    :param pipeline: Unused here.
    :return: ``data``.
    """
    extensions = JMESExtensions(data)
    search_opts = jmespath.Options(custom_functions=extensions)

    # Pick the samples and project each one onto its coordinates.
    sample_list = jmespath.search(self.select_expr, data, search_opts)
    data_points = []
    for sample in sample_list:
        data_points.append(
            jmespath.search(self.dimensions_expr, sample, search_opts))

    # The header line holds the dimension count followed by the sample count.
    header = "{} {}".format(len(data_points[0]), len(data_points))

    with tempfile.NamedTemporaryFile("w") as sample_file:
        print(header, file=sample_file)
        for point in data_points:
            print(" ".join(map(str, point)), file=sample_file)
        # The tools open the file by name, so push the contents to disk.
        sample_file.flush()

        # Train the model. Configured options take precedence over the
        # default "samples" entry.
        train_opts = {"samples": sample_file.name, **self.gmmtrain_opts}
        gmmtrain = self.GMMTRAIN(
            (self.bin_dir, "gmmtrain"), options=train_opts)
        self.logger.debug("Running %s", gmmtrain)
        subprocess.run(gmmtrain, check=True)

        # Classify the same samples with the trained model.
        class_opts = {"samples": sample_file.name, **self.gmmclass_opts}
        gmmclass = self.GMMCLASS(
            (self.bin_dir, "gmmclass"), options=class_opts)
        self.logger.debug("Running %s", gmmclass)
        subprocess.run(gmmclass, check=True)

    # Cluster definitions come from the trainer's log file.
    with open(self.gmmtrain_opts["model_details"], "r") as model_in:
        data["clusters"] = json.load(model_in)

    # Per-sample assignments come from the classifier's log file.
    with open(self.gmmclass_opts["sample_details"], "r") as samples_in:
        sample_details = json.load(samples_in)["samples_results"]
    for details in sample_details:
        sample = sample_list[details["sample"]]
        sample["cluster"] = details["class"]
        sample["lprob"] = details["lprob"]
    return data
def run(self, data, config=None, pipeline=None):
    """
    Collect and index the files that form an hhsuite database.

    The templates selected by ``self.select_expr`` are sorted in place by
    sequence length. For each of the ``a3m``, ``hhm`` and ``cs219`` file
    types, the file paths stored on the templates are written to a
    temporary list and ``ffindex_build`` is run to add them to the
    ``<db_prefix>_<type>.ffdata`` / ``.ffindex`` pair. The index names are
    then trimmed with ``self._trim_index_names`` and the database prefix is
    stored in ``data["database"]``.

    :param data: Pipeline state to search.
    :param config: Unused here.
    :param pipeline: Unused here.
    :return: ``data``.
    """
    extensions = JMESExtensions(data)
    templates = jmespath.search(
        self.select_expr, data,
        jmespath.Options(custom_functions=extensions))

    # Shortest sequences first; this reorders the selected list itself.
    templates.sort(key=lambda t: len(t["sequence"]))

    # Ensure the directory that will hold the database exists.
    Path(self.db_prefix).parent.mkdir(parents=True, exist_ok=True)

    db_prefix = Path(self.db_prefix)
    built_dbs = []
    for file_type in ("a3m", "hhm", "cs219"):
        db_name = Path("{}_{}".format(str(db_prefix), file_type))
        ffindex = Path("{}.ffindex".format(str(db_name)))
        ffdata = Path("{}.ffdata".format(str(db_name)))

        if self.overwrite:
            # Start from scratch rather than appending to an old database.
            for stale in (ffindex, ffdata):
                if stale.exists():
                    stale.unlink()

        with tempfile.NamedTemporaryFile("w") as index:
            # One path per template for the current file type.
            for template in templates:
                print(template[file_type], file=index)
            index.flush()

            # ffindex_build takes the temporary file as its list of files
            # to include in the database.
            cmd_line = tools.ffindex_build(
                (self.bin_dir, "ffindex_build"),
                positional=[ffdata, ffindex],
                flags=["sort", "append"],
                options={"file_list": index.name})
            self.logger.debug("Running command %s", cmd_line)
            tools.run(cmd_line, check=True)
        built_dbs.append(db_name)

    # Strip superfluous information from the entry names of each index.
    for ff_db in built_dbs:
        self._trim_index_names(ff_db)

    data["database"] = str(db_prefix)
    return data
def run(self, data, config=None, pipeline=None):
    """
    Sort the list selected by ``self.field`` in the pipeline state.

    Each entry of ``self.keys`` is a mapping with a ``key`` JMESPath
    expression and optional ``reverse`` and ``allow_none`` flags (both
    default to ``False``). The sorted contents are written back into the
    selected list by slice assignment, so the list object itself is kept.

    :param data: Pipeline state to search; also passed to ``jmes_sort`` as
        the root for custom functions.
    :param config: Unused here.
    :param pipeline: Unused here.
    :return: ``data``.
    """
    jmespath_opts = jmespath.Options(custom_functions=JMESExtensions(data))

    # Evaluate the selection once and keep the reference: it is both the
    # input of the sort and the target of the final slice assignment.
    selected = jmespath.search(self.field, data, jmespath_opts)

    # Apply the keys from last to first. Python's sort is stable, so the
    # first key ends up being the most significant one.
    ordered = selected
    for sort_key in reversed(self.keys):
        ordered = jmes_sort(
            ordered, sort_key["key"], root=data,
            reverse=sort_key.get("reverse", False),
            allow_none=sort_key.get("allow_none", False))

    selected[:] = ordered
    return data
def run(self, data, config=None, pipeline=None):
    """
    Update the results of ``self.select_expr`` with ``self.value_expr``.

    When the selection is a mapping, ``self.value_expr`` is evaluated
    against it and the resulting mapping is merged in with ``update``. When
    the selection is a sequence, the expression is evaluated against each
    element separately and each element is updated with its own result.
    Any other selection is handed to ``self._invalid_selection``.

    In both cases a value that is not a mapping is reported through
    ``self._type_error`` before the update is attempted.

    :param data: Pipeline state to search.
    :param config: Unused here.
    :param pipeline: Unused here.
    :return: ``data``.
    """
    jmespath_opts = jmespath.Options(
        custom_functions=JMESExtensions(data))
    to_replace = jmespath.search(self.select_expr, data, jmespath_opts)

    # Normalise both supported shapes to "a series of objects to update" so
    # that the single-mapping and per-element paths share one type check.
    if isinstance(to_replace, collections.abc.Mapping):
        targets = [to_replace]
    elif isinstance(to_replace, collections.abc.Sequence):
        targets = to_replace
    else:
        self._invalid_selection(to_replace)
        return data

    for target in targets:
        value = jmespath.search(self.value_expr, target, jmespath_opts)
        if not isinstance(value, collections.abc.Mapping):
            self._type_error(target, value)
        target.update(value)
    return data
def run(self, data, config=None, pipeline=None):
    """
    Write the records selected by ``self.select_expr`` as CSV.

    The column names are the sorted keys of the first record. Fields
    missing from a record are written as ``self.null_placeholder``, a
    header row is written when ``self.header`` is true, and each record is
    passed through ``self._fill_placeholders`` before it is written. An
    empty selection produces the single line ``# No results``.

    :param data: Pipeline state to search.
    :param config: Unused here.
    :param pipeline: Unused here.
    :return: ``data``.
    """
    extensions = JMESExtensions(data)
    results = jmespath.search(
        self.select_expr, data,
        jmespath.Options(custom_functions=extensions))

    with Stream(self.output, "w") as csv_out:
        if results:
            fieldnames = sorted(results[0].keys())
            writer = csv.DictWriter(
                csv_out, fieldnames, restval=self.null_placeholder)
            if self.header:
                writer.writeheader()
            for record in results:
                self._fill_placeholders(record)
                writer.writerow(record)
        else:
            print("# No results", file=csv_out)
    return data
def _sets(self, root):
    """
    Evaluate ``self.jmespath_key`` on every item of every configured set.

    :param root: Object against which each expression in
        ``self.jmespath_sets`` is evaluated.
    :return: A pair ``(key_sets, key_map)``. ``key_sets`` is a list with one
        set of identifiers per expression in ``self.jmespath_sets``, in the
        same order. ``key_map`` maps each identifier to one of the items
        that produced it.
    """
    extensions = JMESExtensions(root)
    opts = jmespath.Options(custom_functions=extensions)

    key_map = {}
    reversed_sets = []

    # Walk the sets back to front so that entries written to key_map by an
    # earlier set overwrite those of a later one: items from the first set
    # are preferred.
    for set_expr in reversed(self.jmespath_sets):
        seen = set()
        for item in jmespath.search(set_expr, root, opts):
            ident = jmespath.search(self.jmespath_key, item, opts)
            # Sequences are converted to tuples so they can be hashed. Note
            # that ``str`` is a Sequence too, so a string identifier becomes
            # a tuple of its characters.
            if isinstance(ident, collections.abc.Sequence):
                ident = tuple(ident)
            # Within one set, the first item with a given identifier wins.
            if ident in seen:
                continue
            key_map[ident] = item
            seen.add(ident)
        reversed_sets.append(seen)

    # Undo the reversed iteration order for the returned list.
    return reversed_sets[::-1], key_map
def search(self, expr, data):
    """
    Evaluate ``expr`` on ``data`` via :py:func:`jmespath.search`, with the
    functions of ``JMESExtensions`` made available to the expression.

    :param expr: JMESPath expression to evaluate.
    :param data: Object to search; also given to ``JMESExtensions``.
    :return: Whatever :py:func:`jmespath.search` returns.
    """
    return jmespath.search(
        expr, data,
        jmespath.Options(custom_functions=JMESExtensions(data)))
def run(self, data, config=None, pipeline=None):
    """
    Shuffle the object selected by ``self.field`` in the pipeline state.

    :param data: Pipeline state to search.
    :param config: Unused here.
    :param pipeline: Unused here.
    :return: ``data``.
    """
    extensions = JMESExtensions(data)
    selected = jmespath.search(
        self.field, data,
        jmespath.Options(custom_functions=extensions))
    # ``self.random.shuffle`` is called for its side effect on the selected
    # object; its return value is not used.
    self.random.shuffle(selected)
    return data