def dump_dataset_headers(self):
    """Print out the matching parameters for the --datasets specified on
    the command line.
    """
    multi_context_headers = defaultdict(list)
    for context in self.contexts:
        if self.args.datasets:
            headers = api.get_dataset_headers_by_id(context, self.args.datasets)
        elif self.args.instrument:
            headers = api.get_dataset_headers_by_instrument(context, self.args.instrument)
        for dataset_id, header in headers.items():
            multi_context_headers[dataset_id].append((context, header))
    for dataset_id, context_headers in multi_context_headers.items():
        for (context, header) in context_headers:
            if self.args.condition_values:
                header = utils.condition_header(header)
            if self.args.minimize_headers:
                header = crds.get_cached_mapping(context).minimize_header(header)
            if len(self.contexts) == 1:
                print(dataset_id, ":", log.format_parameter_list(header))
            else:
                print(dataset_id, ":", context, ":", log.format_parameter_list(header))
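# Hedged usage sketch: the method above is driven by command line switches, but
# the underlying service call can be exercised directly.  The context and the
# dataset id below are illustrative placeholders, and a reachable CRDS server
# is assumed.
def _example_dump_headers():
    from crds.client import api
    headers = api.get_dataset_headers_by_id("hst.pmap", ["I9ZF01010"])
    for dataset_id, header in headers.items():
        print(dataset_id, ":", len(header), "matching parameters")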
def list_dataset_headers(self):
    """List dataset header info for self.args.dataset_headers with respect
    to self.args.context.
    """
    for context in self.contexts:
        with log.error_on_exception("Failed fetching dataset parameters with respect to", repr(context),
                                    "for", repr(self.args.dataset_headers)):
            pars = api.get_dataset_headers_by_id(context, self.args.dataset_headers)
            pmap = crds.get_cached_mapping(context)
            for requested_id in self.args.dataset_headers:
                for returned_id in sorted(pars.keys()):
                    if requested_id.upper() in returned_id.upper():
                        header = pars[returned_id]
                        if isinstance(header, python23.string_types):
                            log.error("No header for", repr(returned_id), ":", repr(header))  # header is reason
                            continue
                        if self.args.id_expansions_only:
                            print(returned_id, context if len(self.contexts) > 1 else "")
                        else:
                            if self.args.minimize_headers:
                                header2 = pmap.minimize_header(header)
                            else:
                                header2 = dict(header)
                            header2.pop("REFTYPE", None)
                            header2["dataset_id"] = returned_id
                            log.info("Dataset pars for", repr(returned_id), "with respect to",
                                     repr(context) + ":\n", log.PP(header2))
                        if self.args.first_id_expansion_only:
                            break
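# Sketch of the id expansion above: a partial command line id matches, case
# insensitively, as a substring of each full id returned by the server (for
# HST these are two-part "<product>:<exposure>" ids).  The ids in the usage
# comment are illustrative placeholders.
def _example_expand_ids(requested_id, returned_ids):
    return [returned_id for returned_id in sorted(returned_ids)
            if requested_id.upper() in returned_id.upper()]

# _example_expand_ids("ib6o23", ["IB6O23010:IB6O23Q1Q", "IB6O23010:IB6O23Q2Q"])
# --> ['IB6O23010:IB6O23Q1Q', 'IB6O23010:IB6O23Q2Q']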
def sync_datasets(self):
    """Sync mappings and references for datasets with respect to `self.contexts`."""
    if not self.contexts:
        log.error("Define --contexts under which references are fetched for --dataset-files or --dataset-ids.")
        sys.exit(-1)
    active_references = []
    for context in self.contexts:
        if self.args.dataset_ids:
            if len(self.args.dataset_ids) == 1 and self.args.dataset_ids[0].startswith("@"):
                with open(self.args.dataset_ids[0][1:]) as pfile:
                    self.args.dataset_ids = pfile.read().splitlines()
            with log.error_on_exception("Failed to get matching parameters for", self.args.dataset_ids):
                id_headers = api.get_dataset_headers_by_id(context, self.args.dataset_ids)
        for dataset in self.args.dataset_files or self.args.dataset_ids:
            log.info("Syncing context '%s' dataset '%s'." % (context, dataset))
            with log.error_on_exception("Failed to get matching parameters from", repr(dataset)):
                if self.args.dataset_files:
                    headers = { dataset : data_file.get_conditioned_header(dataset, observatory=self.observatory) }
                else:
                    headers = { dataset_id : header for (dataset_id, header) in id_headers.items()
                                if dataset.upper() in dataset_id }
                for assc_dataset, header in headers.items():
                    with log.error_on_exception("Failed syncing references for dataset", repr(assc_dataset),
                                                "under context", repr(context)):
                        bestrefs = crds.getrecommendations(header, context=context, observatory=self.observatory,
                                                           ignore_cache=self.args.ignore_cache)
                        log.verbose("Best references for", repr(assc_dataset), "are", bestrefs)
                        active_references.extend(bestrefs.values())
    active_references = [ ref for ref in active_references if not ref.startswith("NOT FOUND") ]
    log.verbose("Syncing references:", repr(active_references))
    return list(set(active_references))
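# Sketch of the "@file" convention handled above: when the single --dataset-ids
# value starts with "@", the remainder names a text file listing one dataset id
# per line.  The file name in the usage comment is an illustrative placeholder.
def _example_expand_at_file(dataset_ids):
    if len(dataset_ids) == 1 and dataset_ids[0].startswith("@"):
        with open(dataset_ids[0][1:]) as pfile:
            dataset_ids = pfile.read().splitlines()
    return dataset_ids

# _example_expand_at_file(["@ids.txt"]) --> contents of ids.txt, one id per line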
def __init__(self, context, datasets, datasets_since):
    """Contact the CRDS server and get headers for the list of `datasets` ids
    with respect to `context`.
    """
    super(DatasetHeaderGenerator, self).__init__(context, datasets, datasets_since)
    server = api.get_crds_server()
    log.info("Dumping dataset parameters from CRDS server at", repr(server), "for", repr(datasets))
    self.headers = api.get_dataset_headers_by_id(context, datasets)
    log.info("Dumped", len(self.headers), "of", len(datasets), "datasets from CRDS server at", repr(server))
    # Every command line id should correspond to 1 or more headers.
    for source in self.sources:
        if self.matching_two_part_id(source) not in self.headers.keys():
            log.warning("Dataset", repr(source), "isn't represented by downloaded parameters.")
    # Process according to downloaded 2-part ids, not command line ids.
    self.sources = sorted(self.headers.keys())
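# Hedged sketch of the cross-check above.  matching_two_part_id() is defined
# elsewhere; this simplified stand-in is an assumption, modeling HST-style ids
# where a bare exposure id maps onto a downloaded "<association>:<exposure>"
# two-part id, while an already two-part id passes through unchanged.
def _example_matching_two_part_id(source, downloaded_ids):
    if ":" in source:
        return source.upper()
    matches = [did for did in downloaded_ids
               if did.upper().endswith(":" + source.upper())]
    return matches[0] if matches else source.upper()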
def list_datasets(self):
    """List dataset header info for self.args.datasets with respect to self.args.context."""
    for context in self.contexts:
        with log.error_on_exception("Failed fetching dataset parameters with respect to", repr(context),
                                    "for", repr(self.args.datasets)):
            pars = api.get_dataset_headers_by_id(context, self.args.datasets)
            pmap = rmap.get_cached_mapping(context)
            for (dataset_id, header) in pars.items():
                if isinstance(header, python23.string_types):
                    log.error("No header for", repr(dataset_id), ":", repr(header))  # header is reason
                    continue
                header2 = pmap.minimize_header(header)
                header2.pop("REFTYPE", None)
                log.info("Dataset pars for", repr(dataset_id), "with respect to", repr(context) + ":\n",
                         log.PP(header2))
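# Sketch of the header minimization used above: the cached context mapping
# strips a full header down to just the parameters that context actually
# matches on.  Assumes "hst.pmap" is available in the local CRDS cache; the
# input header comes from the caller.
def _example_minimize(header):
    from crds import rmap
    pmap = rmap.get_cached_mapping("hst.pmap")
    return pmap.minimize_header(header)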
def fetch_source_segment(self, source):
    """Return the segment of dataset ids which surrounds id `source`."""
    try:
        index = self.sources.index(source) // self.segment_size
    except ValueError as exc:
        raise CrdsError("Unknown dataset id " + repr(source)) from exc
    lower = index * self.segment_size
    upper = (index + 1) * self.segment_size
    segment_ids = self.sources[lower:upper]
    log.verbose("Dumping", len(segment_ids), "datasets from indices", lower, "to",
                lower + len(segment_ids), verbosity=20)
    dumped_headers = api.get_dataset_headers_by_id(self.context, segment_ids)
    log.verbose("Dumped", len(dumped_headers), "datasets", verbosity=20)
    if self.save_pickles:
        # Keep all headers; causes memory problems with multiple instruments on ~8G RAM.
        self.headers.update(dumped_headers)
    else:
        # Conserve memory by keeping only the most recent segment of headers.
        self.headers = dumped_headers
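# Worked example of the segment arithmetic above: the id's position is
# integer-divided down to a segment index, which then bounds a fixed-size
# slice of ids containing `source`.
def _example_segment(sources, source, segment_size=3):
    index = sources.index(source) // segment_size
    lower, upper = index * segment_size, (index + 1) * segment_size
    return sources[lower:upper]

# _example_segment(list("abcdefg"), "e") --> ['d', 'e', 'f']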