def _build_query(self, query_dict, limit=None, offset=None, shards=None):
    """
    Assemble the full set of query-string parameters for a search request.

    :param query_dict: dictionary (or MultiDict) of search constraints to
        merge into the request.
    :param limit: maximum number of results to request, or None to omit.
    :param offset: result offset for paging, or None to omit.
    :param shards: optional list of shard hostnames to restrict the search
        to; each must appear in the server's shard list.
    :return: a MultiDict of parameters with all None-valued entries removed.
    :raises EsgfSearchException: if a requested shard is not available.
    """
    if shards is not None:
        # Lazily fetch the shard list the first time it is needed.
        if self._available_shards is None:
            self._load_available_shards()

        shard_specs = []
        for shard in shards:
            if shard not in self._available_shards:
                raise EsgfSearchException('Shard %s is not available'
                                          % shard)
            # A shard may be served on several ports; include each one.
            # The suffix should be omitted when querying.
            for port, _suffix in self._available_shards[shard]:
                shard_specs.append('%s:%s/solr' % (shard, port))
        shard_str = ','.join(shard_specs)
    else:
        shard_str = None

    full_query = MultiDict({
        'format': RESPONSE_FORMAT,
        'limit': limit,
        'distrib': 'true' if self.distrib else 'false',
        'offset': offset,
        'shards': shard_str,
    })
    full_query.extend(query_dict)

    # Remove all None valued items
    full_query = MultiDict(item for item in full_query.items()
                           if item[1] is not None)

    return full_query
def send_query(self, query_dict, limit=None, offset=None):
    """
    Send a search query to the server and return the decoded response.

    Generally not to be called directly by the user but via SearchContext
    instances.

    :param query_dict: dictionary of query string parameters to send.
    :param limit: maximum number of results, or None to omit.
    :param offset: result offset for paging, or None to omit.
    :return: the JSON response decoded into Python objects
        (TODO: think about this).
    """
    shard_str = ','.join(self.shards) if self.shards else None
    params = MultiDict({
        'format': RESPONSE_FORMAT,
        'limit': limit,
        'distrib': 'true' if self.distrib else 'false',
        'offset': offset,
        'shards': shard_str,
    })
    params.extend(query_dict)

    # Drop every parameter that was never given a value.
    params = MultiDict((key, value) for key, value in params.items()
                       if value is not None)

    query_url = '%s?%s' % (self.url, urllib.urlencode(params))
    log.debug('Query request is %s' % query_url)

    response = urllib2.urlopen(query_url)
    return json.load(response)
def build_constraint_dict(constraints):
    """
    Parse a comma-separated string of ``key:value`` pairs into a MultiDict.

    Entries without a ``:`` are silently ignored.  Whitespace around each
    entry, key and value is stripped, so ``"project: CMIP5 , model:X"``
    parses cleanly.  Repeated keys are preserved via MultiDict.add().

    :param constraints: string such as ``"project:CMIP5,variable:tas"``,
        or an empty/None value for no constraints.
    :return: a MultiDict of constraint key/value pairs.
    """
    c_dict = MultiDict()
    if constraints:
        for constraint in constraints.split(','):
            constraint = constraint.strip()
            if ':' in constraint:
                # Split on the first ':' only so values may contain colons.
                key, value = constraint.split(':', 1)
                c_dict.add(key.strip(), value.strip())
    return c_dict
def __init__(self, connection, constraints, search_type=TYPE_DATASET, latest=None, facets=None, fields=None, from_timestamp=None, to_timestamp=None, replica=None):
    """
    :param connection: The SearchConnection
    :param constraints: A dictionary of initial constraints
    :param search_type: One of TYPE_* constants defining the document
        type to search for
    :param facets: The list of facets for which counts will be retrieved
        and constraints be validated against.  Or None to represent all
        facets.
    :param fields: A list of field names to return in search responses
    :param from_timestamp: Date-time string for the start of the search
        range, stored in :attr:`timestamp_range`.
    :param to_timestamp: Date-time string for the end of the search range.
    :param replica: A boolean defining whether to return master records
        or replicas, or None to return both.
    :param latest: A boolean defining whether to return only latest
        versions or only non-latest versions, or None to return both.
    :raises EsgfSearchException: if search_type is not one of the
        recognised TYPE_* constants.
    """
    self.connection = connection
    # Cached facet counts / hit count; populated lazily by later searches.
    self.__facet_counts = None
    self.__hit_count = None

    # Constraints
    self.freetext_constraint = None
    self.facet_constraints = MultiDict()
    self.temporal_constraint = (None, None)
    # NOTE(review): "geosplatial" looks like a typo for "geospatial" but is
    # the attribute name other code presumably references — do not rename
    # without checking all usages.
    self.geosplatial_constraint = None

    self._update_constraints(constraints)

    # Search configuration parameters
    self.timestamp_range = (from_timestamp, to_timestamp)

    # Validate the requested document type before storing it.
    search_types = [TYPE_DATASET, TYPE_FILE, TYPE_AGGREGATION]
    if search_type not in search_types:
        raise EsgfSearchException('search_type must be one of %s'
                                  % ','.join(search_types))
    self.search_type = search_type

    self.latest = latest
    self.facets = facets
    self.fields = fields
    self.replica = replica
def convert_constraints(url):
    """
    Convert an esgf search query URL into a constraints parameter string.

    TODO: constraints parameter should have the same structure as the
    esgf query.

    Example::

        FROM: project=CMIP5&time_frequency=mon&variable=tas,tasmax,tasmin
        TO:   project:CMIP5,time_frequency:mon,variable:tas,...

    :param url: full URL whose query string holds the esgf search terms.
    :return: comma-separated ``key:value`` constraint string.
    """
    parsed_url = urlparse(url)
    constraints = MultiDict()
    for qpart in parsed_url.query.split('&'):
        # An empty query (or a stray '&&') yields empty fragments — skip
        # them rather than raising on the unpack below.
        if not qpart:
            continue
        # partition() tolerates values that themselves contain '=' and
        # never raises, unlike a bare split('=') unpack.
        key, _sep, value = qpart.partition('=')
        for val in value.split(','):
            constraints.add(key.strip(), val.strip())
    converted = ','.join(
        ["{0[0]}:{0[1]}".format(c) for c in constraints.iteritems()])
    return converted
def _split_constraints(self, constraints): """ Divide a constraint dictionary into 4 types of constraints: 1. Freetext query 2. Facet constraints 3. Temporal constraints 4. Geospatial constraints :return: A dictionary of the 4 types of constraint. """ # local import to prevent circular importing from .connection import query_keyword_type constraints_split = dict( (kw, MultiDict()) for kw in QUERY_KEYWORD_TYPES) for kw, val in constraints.items(): constraint_type = query_keyword_type(kw) constraints_split[constraint_type][kw] = val return constraints_split
def _build_query(self):
    """
    Assemble this context's search parameters into a MultiDict suitable
    for sending to the connection.
    """
    base_params = {
        "query": self.freetext_constraint,
        "type": self.search_type,
        "latest": self.latest,
        "facets": self.facets,
        "fields": self.fields,
        "replica": self.replica,
    }
    query_dict = MultiDict(base_params)

    # Facet constraints may repeat keys, hence extend() rather than update().
    query_dict.extend(self.facet_constraints)

    #!TODO: encode datetime
    start, end = self.temporal_constraint
    query_dict.update(start=start, end=end)

    return query_dict
def test_from_fieldstorage_without_filename(self):
    """from_fieldstorage() should yield the field's key/value pair."""
    from pyesgf.multidict import MultiDict
    storage = DummyFieldStorage('a', '1')
    expected = MultiDict({'a': '1'})
    self.assertEqual(MultiDict().from_fieldstorage(storage), expected)
def test_view_list(self):
    """view_list() should expose the given list's items unchanged."""
    from pyesgf.multidict import MultiDict
    source = [1, 2]
    view = MultiDict().view_list(source)
    self.assertEqual(view._items, [1, 2])
def test_view_list_not_list(self):
    """view_list() must reject a non-list argument with TypeError."""
    from pyesgf.multidict import MultiDict
    md = MultiDict()
    self.assertRaises(TypeError, md.view_list, 42)
def test_kwargs(self):
    """Keyword arguments to the constructor become stored items."""
    from pyesgf.multidict import MultiDict
    md = MultiDict(kw1='val1')
    expected = [('kw1', 'val1')]
    self.assertEqual(md._items, expected)
def test_no_args(self):
    """A MultiDict built with no arguments starts out empty."""
    from pyesgf.multidict import MultiDict
    self.assertEqual(MultiDict()._items, [])
def __init__(self, connection, constraints, search_type=None, latest=None, facets=None, fields=None, from_timestamp=None, to_timestamp=None, replica=None, shards=None):
    """
    :param connection: The SearchConnection
    :param constraints: A dictionary of initial constraints
    :param search_type: One of TYPE_* constants defining the document
        type to search for.  Overrides SearchContext.DEFAULT_SEARCH_TYPE
    :param facets: The list of facets for which counts will be retrieved
        and constraints be validated against.  Or None to represent all
        facets.
    :param fields: A list of field names to return in search responses
    :param replica: A boolean defining whether to return master records
        or replicas, or None to return both.
    :param latest: A boolean defining whether to return only latest
        versions or only non-latest versions, or None to return both.
    :param shards: list of shards to restrict searches to.  Should be from
        the list self.connection.get_shard_list()
    :param from_timestamp: Date-time string to specify start of search
        range (e.g. "2000-01-01T00:00:00Z").
    :param to_timestamp: Date-time string to specify end of search range
        (e.g. "2100-12-31T23:59:59Z").
    :raises EsgfSearchException: if the resolved search_type is not one
        of the recognised TYPE_* constants.
    """
    self.connection = connection
    # Cached facet counts / hit count; populated lazily by later searches.
    self.__facet_counts = None
    self.__hit_count = None

    # Fall back to the class-level default document type.
    if search_type is None:
        search_type = self.DEFAULT_SEARCH_TYPE

    # Constraints
    self.freetext_constraint = None
    self.facet_constraints = MultiDict()
    # Stored as a mutable list (unlike timestamp_range below) so it can be
    # updated in place later.
    self.temporal_constraint = [from_timestamp, to_timestamp]
    # NOTE(review): "geosplatial" looks like a typo for "geospatial" but is
    # the attribute name other code presumably references — do not rename
    # without checking all usages.
    self.geosplatial_constraint = None

    self._update_constraints(constraints)

    # Search configuration parameters
    self.timestamp_range = (from_timestamp, to_timestamp)

    # Validate the requested document type before storing it.
    search_types = [TYPE_DATASET, TYPE_FILE, TYPE_AGGREGATION]
    if search_type not in search_types:
        raise EsgfSearchException('search_type must be one of %s'
                                  % ','.join(search_types))
    self.search_type = search_type

    self.latest = latest
    self.facets = facets
    self.fields = fields
    self.replica = replica
    self.shards = shards
def search(self, constraints=(('project', 'CORDEX'),), query=None,
           start=None, end=None, limit=1, offset=0,
           search_type='Dataset', temporal=False):
    """
    Run an ESGF search and collect results plus a summary document.

    :param constraints: iterable of (key, value) facet constraint pairs.
        The default is a tuple (not a list) to avoid the shared
        mutable-default pitfall; the previous list default had the same
        contents.
    :param query: freetext query string; '' or '*' is treated as no query.
    :param start: optional datetime marking the start of a temporal search.
    :param end: optional datetime marking the end of a temporal search.
    :param limit: maximum number of datasets to collect.
    :param offset: index of the first dataset to collect.
    :param search_type: 'Dataset', 'File' or 'Aggregation'.
    :param temporal: if True, constrain the search by start/end timestamps.
    :return: tuple of (result list, summary dict, facet counts).
    :raises Exception: if search_type is not recognised.
    """
    self.show_status("Starting ...", 0)

    from pyesgf.multidict import MultiDict
    my_constraints = MultiDict()
    for key, value in constraints:
        my_constraints.add(key, value)
    LOGGER.debug('constraints=%s', my_constraints)

    # Treat an empty or wildcard freetext query as "no query".
    if not query or query == '*':
        query = None
    LOGGER.debug('query: %s', query)

    # TODO: check type of start, end
    LOGGER.debug('start=%s, end=%s', start, end)

    ctx = None
    if temporal is True:
        LOGGER.debug("using dataset search with time constraints")
        # TODO: handle timestamps in a better way
        timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
        from_timestamp = start.strftime(timestamp_format) if start else None
        to_timestamp = end.strftime(timestamp_format) if end else None
        LOGGER.debug("from=%s, to=%s", from_timestamp, to_timestamp)
        ctx = self.conn.new_context(fields=self.fields,
                                    replica=self.replica,
                                    latest=self.latest,
                                    query=query,
                                    from_timestamp=from_timestamp,
                                    to_timestamp=to_timestamp)
    else:
        ctx = self.conn.new_context(fields=self.fields,
                                    replica=self.replica,
                                    latest=self.latest,
                                    query=query)
    if len(my_constraints) > 0:
        ctx = ctx.constrain(**my_constraints.mixed())
    LOGGER.debug('ctx: facet_constraints=%s, replica=%s, latests=%s',
                 ctx.facet_constraints, ctx.replica, ctx.latest)

    self.show_status("Datasets found=%d" % ctx.hit_count, 0)

    self.summary = dict(total_number_of_datasets=ctx.hit_count,
                        number_of_datasets=0,
                        number_of_files=0,
                        number_of_aggregations=0,
                        size=0)
    self.result = []
    self.count = 0

    # search datasets
    # we always do this to get the summary document
    datasets = ctx.search(ignore_facet_check=True)
    (self.start_index, self.stop_index, self.max_count) = \
        self._index(datasets, limit, offset)
    self.summary['number_of_datasets'] = max(0, self.max_count)

    t0 = datetime.now()
    for i in range(self.start_index, self.stop_index):
        ds = datasets[i]
        # progress = self.count * 100.0 / self.max_count
        self.count = self.count + 1
        self.result.append(ds.json)
        # Aggregate per-dataset counters into the summary.
        for key in ['number_of_files', 'number_of_aggregations', 'size']:
            # LOGGER.debug(ds.json)
            self.summary[key] = self.summary[key] + ds.json.get(key, 0)
    self.summary['ds_search_duration_secs'] = (datetime.now() - t0).seconds
    self.summary['size_mb'] = self.summary.get('size', 0) / 1024 / 1024
    self.summary['size_gb'] = self.summary.get('size_mb', 0) / 1024

    LOGGER.debug('search_type = %s ', search_type)

    if search_type == 'Dataset':
        pass
    # search files (optional)
    elif search_type == 'File':
        self._file_search(datasets, my_constraints, start, end)
    # search aggregations (optional)
    elif search_type == 'Aggregation':
        self._aggregation_search(datasets, my_constraints)
    else:
        # Use % formatting: passing search_type as a second argument to
        # Exception would leave the message unformatted.
        raise Exception('unknown search type: %s' % search_type)

    LOGGER.debug('summary=%s', self.summary)

    self.show_status('Done', 100)

    return (self.result, self.summary, ctx.facet_counts)