예제 #1
0
    def _build_query(self, query_dict, limit=None, offset=None, shards=None):
        """
        Assemble the full query parameters as a MultiDict.

        :param query_dict: dictionary of facet constraints to merge in.
        :param limit: maximum number of results to request, or None.
        :param offset: result offset to request, or None.
        :param shards: iterable of shard names to restrict the search to,
            or None for no shard restriction.
        :raise EsgfSearchException: if a requested shard is not available.

        """
        shard_str = None
        if shards is not None:
            # Populate the shard catalogue lazily on first use.
            if self._available_shards is None:
                self._load_available_shards()

            specs = []
            for shard in shards:
                if shard not in self._available_shards:
                    raise EsgfSearchException('Shard %s is not available' %
                                              shard)
                # NOTE: the suffix is deliberately omitted when querying.
                for port, suffix in self._available_shards[shard]:
                    specs.append('%s:%s/solr' % (shard, port))

            shard_str = ','.join(specs)

        full_query = MultiDict({
            'format': RESPONSE_FORMAT,
            'limit': limit,
            'distrib': 'true' if self.distrib else 'false',
            'offset': offset,
            'shards': shard_str,
        })
        full_query.extend(query_dict)

        # Drop every parameter that was left unset.
        full_query = MultiDict((key, value) for key, value in full_query.items()
                               if value is not None)

        return full_query
예제 #2
0
    def send_query(self, query_dict, limit=None, offset=None):
        """
        Issue a query against the search service and return the decoded
        JSON response.  Generally not called directly by users but via
        SearchContext instances.

        :param query_dict: dictionary of query string parameters to send.
        :param limit: maximum number of results to request, or None.
        :param offset: result offset to request, or None.
        :return: the parsed JSON response document.

        """
        params = MultiDict({
            'format': RESPONSE_FORMAT,
            'limit': limit,
            'distrib': 'true' if self.distrib else 'false',
            'offset': offset,
            'shards': ','.join(self.shards) if self.shards else None,
        })
        params.extend(query_dict)

        # Strip out any parameters that were left unset.
        params = MultiDict((key, value) for key, value in params.items()
                           if value is not None)

        query_url = '%s?%s' % (self.url, urllib.urlencode(params))
        log.debug('Query request is %s' % query_url)

        response = urllib2.urlopen(query_url)
        return json.load(response)
예제 #3
0
def build_constraint_dict(constraints):
    """
    Parse a comma-separated string of ``key:value`` constraints into a
    MultiDict.

    Entries without a ``:`` separator are silently skipped.  Keys and
    values are stripped of surrounding whitespace (previously only the
    membership test stripped, so whitespace leaked into the stored keys
    and values).

    :param constraints: string such as ``"project:CORDEX,experiment:rcp26"``
        or None/empty for no constraints.
    :return: MultiDict mapping each key to one or more values.
    """
    c_dict = MultiDict()
    if constraints:
        for constrain in constraints.split(','):
            constrain = constrain.strip()
            if ':' in constrain:
                key, value = constrain.split(':', 1)
                # Strip so " key : value " parses to ('key', 'value').
                c_dict.add(key.strip(), value.strip())
    return c_dict
예제 #4
0
    def __init__(self,
                 connection,
                 constraints,
                 search_type=TYPE_DATASET,
                 latest=None,
                 facets=None,
                 fields=None,
                 from_timestamp=None,
                 to_timestamp=None,
                 replica=None):
        """
        :param connection: The SearchConnection
        :param constraints: A dictionary of initial constraints
        :param search_type: One of the TYPE_* constants defining the
            document type to search for
        :param facets: The list of facets for which counts will be
            retrieved and constraints be validated against, or None to
            represent all facets.
        :param fields: A list of field names to return in search responses
        :param replica: A boolean defining whether to return master records
            or replicas, or None to return both.
        :param latest: A boolean defining whether to return only the latest
            versions or only non-latest versions, or None to return both.

        """
        self.connection = connection

        # Lazily-computed result caches.
        self.__facet_counts = None
        self.__hit_count = None

        # Constraint state; populated below by _update_constraints().
        self.freetext_constraint = None
        self.facet_constraints = MultiDict()
        self.temporal_constraint = (None, None)
        self.geosplatial_constraint = None

        self._update_constraints(constraints)

        # Search configuration parameters.
        self.timestamp_range = (from_timestamp, to_timestamp)

        valid_types = [TYPE_DATASET, TYPE_FILE, TYPE_AGGREGATION]
        if search_type not in valid_types:
            raise EsgfSearchException('search_type must be one of %s' %
                                      ','.join(valid_types))
        self.search_type = search_type

        self.latest = latest
        self.facets = facets
        self.fields = fields
        self.replica = replica
예제 #5
0
def convert_constraints(url):
    """
    converts esgf search query to constraints parameter.
    TODO: constraints parameter should have the same structure as the esgf query.
    """
    # FROM: project=CMIP5&time_frequency=mon&variable=tas,tasmax,tasmin
    # TO: project:CORDEX,experiment:historical,experiment:rcp26
    parsed_url = urlparse(url)
    constraints = MultiDict()
    for qpart in parsed_url.query.split('&'):
        # Skip fragments without a separator (empty query string, '&&',
        # or bare flags) which previously raised ValueError on unpacking.
        if '=' not in qpart:
            continue
        # Split on the first '=' only so values containing '=' survive.
        key, value = qpart.split('=', 1)
        for val in value.split(','):
            constraints.add(key.strip(), val.strip())
    converted = ','.join(
        ["{0[0]}:{0[1]}".format(c) for c in constraints.iteritems()])
    return converted
예제 #6
0
    def _split_constraints(self, constraints):
        """
        Partition a constraint dictionary into the 4 constraint types:
        1. Freetext query
        2. Facet constraints
        3. Temporal constraints
        4. Geospatial constraints

        :param constraints: mapping of constraint keywords to values.
        :return: A dictionary keyed by constraint type, each value a
            MultiDict of the constraints of that type.

        """
        # local import to prevent circular importing
        from .connection import query_keyword_type

        split = dict((kw_type, MultiDict()) for kw_type in QUERY_KEYWORD_TYPES)
        for keyword, value in constraints.items():
            split[query_keyword_type(keyword)][keyword] = value

        return split
예제 #7
0
    def _build_query(self):
        """
        Return the query string parameters for this context as a MultiDict.

        """
        params = MultiDict({
            "query": self.freetext_constraint,
            "type": self.search_type,
            "latest": self.latest,
            "facets": self.facets,
            "fields": self.fields,
            "replica": self.replica,
        })

        # Facet constraints may hold repeated keys, hence extend().
        params.extend(self.facet_constraints)

        #!TODO: encode datetime
        start, end = self.temporal_constraint
        params.update(start=start, end=end)

        return params
예제 #8
0
 def test_from_fieldstorage_without_filename(self):
     """A FieldStorage entry with no filename maps to a plain key/value."""
     from pyesgf.multidict import MultiDict
     storage = DummyFieldStorage('a', '1')
     md = MultiDict()
     result = md.from_fieldstorage(storage)
     self.assertEqual(result, MultiDict({'a': '1'}))
예제 #9
0
 def test_view_list(self):
     """view_list exposes the given list through the view's _items."""
     from pyesgf.multidict import MultiDict
     md = MultiDict()
     view = md.view_list([1, 2])
     self.assertEqual(view._items, [1, 2])
예제 #10
0
 def test_view_list_not_list(self):
     """view_list rejects a non-list argument with TypeError."""
     from pyesgf.multidict import MultiDict
     md = MultiDict()
     self.assertRaises(TypeError, md.view_list, 42)
예제 #11
0
 def test_kwargs(self):
     """Keyword arguments passed to the constructor become items."""
     from pyesgf.multidict import MultiDict
     result = MultiDict(kw1='val1')
     self.assertEqual(result._items, [('kw1', 'val1')])
예제 #12
0
 def test_no_args(self):
     """A MultiDict constructed without arguments starts empty."""
     from pyesgf.multidict import MultiDict
     result = MultiDict()
     self.assertEqual(result._items, [])
예제 #13
0
    def __init__(self,
                 connection,
                 constraints,
                 search_type=None,
                 latest=None,
                 facets=None,
                 fields=None,
                 from_timestamp=None,
                 to_timestamp=None,
                 replica=None,
                 shards=None):
        """
        :param connection: The SearchConnection
        :param constraints: A dictionary of initial constraints
        :param search_type: One of the TYPE_* constants defining the
            document type to search for.  Overrides
            SearchContext.DEFAULT_SEARCH_TYPE
        :param facets: The list of facets for which counts will be
            retrieved and constraints be validated against, or None to
            represent all facets.
        :param fields: A list of field names to return in search responses
        :param replica: A boolean defining whether to return master records
            or replicas, or None to return both.
        :param latest: A boolean defining whether to return only the latest
            versions or only non-latest versions, or None to return both.
        :param shards: list of shards to restrict searches to.  Should be
            from the list self.connection.get_shard_list()
        :param from_timestamp: Date-time string giving the start of the
            search range (e.g. "2000-01-01T00:00:00Z").
        :param to_timestamp: Date-time string giving the end of the search
            range (e.g. "2100-12-31T23:59:59Z").

        """
        self.connection = connection

        # Lazily-computed result caches.
        self.__facet_counts = None
        self.__hit_count = None

        # Fall back to the class-level default document type.
        if search_type is None:
            search_type = self.DEFAULT_SEARCH_TYPE

        # Constraint state; populated below by _update_constraints().
        self.freetext_constraint = None
        self.facet_constraints = MultiDict()
        self.temporal_constraint = [from_timestamp, to_timestamp]
        self.geosplatial_constraint = None

        self._update_constraints(constraints)

        # Search configuration parameters.
        self.timestamp_range = (from_timestamp, to_timestamp)

        valid_types = [TYPE_DATASET, TYPE_FILE, TYPE_AGGREGATION]
        if search_type not in valid_types:
            raise EsgfSearchException('search_type must be one of %s' %
                                      ','.join(valid_types))
        self.search_type = search_type

        self.latest = latest
        self.facets = facets
        self.fields = fields
        self.replica = replica
        self.shards = shards
예제 #14
0
    def search(self,
               constraints=None,
               query=None,
               start=None,
               end=None,
               limit=1,
               offset=0,
               search_type='Dataset',
               temporal=False):
        """
        Run an ESGF search and collect results plus a summary document.

        :param constraints: sequence of (facet, value) pairs; defaults to
            [('project', 'CORDEX')] when None (the historical default).
        :param query: freetext query string; '*' or empty means no query.
        :param start: datetime giving the start of the temporal range
            (only used when ``temporal`` is True).
        :param end: datetime giving the end of the temporal range.
        :param limit: maximum number of datasets to process.
        :param offset: index of the first dataset to process.
        :param search_type: 'Dataset', 'File' or 'Aggregation'.
        :param temporal: if True constrain the search by start/end.
        :return: tuple of (result list, summary dict, facet counts).
        :raise Exception: if ``search_type`` is not recognised.
        """
        self.show_status("Starting ...", 0)

        # Avoid the mutable-default-argument pitfall: supply the
        # historical default constraint here instead of in the signature.
        if constraints is None:
            constraints = [('project', 'CORDEX')]

        from pyesgf.multidict import MultiDict
        my_constraints = MultiDict()
        for key, value in constraints:
            my_constraints.add(key, value)

        LOGGER.debug('constraints=%s', my_constraints)

        # Treat '*' (match everything) the same as no query at all.
        if not query or query == '*':
            query = None
        LOGGER.debug('query: %s', query)

        # TODO: check type of start, end
        LOGGER.debug('start=%s, end=%s', start, end)

        ctx = None
        if temporal is True:
            LOGGER.debug("using dataset search with time constraints")
            # TODO: handle timestamps in a better way
            timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
            if start:
                from_timestamp = start.strftime(timestamp_format)
            else:
                from_timestamp = None
            if end:
                to_timestamp = end.strftime(timestamp_format)
            else:
                to_timestamp = None
            LOGGER.debug("from=%s, to=%s", from_timestamp, to_timestamp)
            ctx = self.conn.new_context(fields=self.fields,
                                        replica=self.replica,
                                        latest=self.latest,
                                        query=query,
                                        from_timestamp=from_timestamp,
                                        to_timestamp=to_timestamp)
        else:
            ctx = self.conn.new_context(fields=self.fields,
                                        replica=self.replica,
                                        latest=self.latest,
                                        query=query)
        if len(my_constraints) > 0:
            ctx = ctx.constrain(**my_constraints.mixed())

        LOGGER.debug('ctx: facet_constraints=%s, replica=%s, latests=%s',
                     ctx.facet_constraints, ctx.replica, ctx.latest)

        self.show_status("Datasets found=%d" % ctx.hit_count, 0)

        self.summary = dict(total_number_of_datasets=ctx.hit_count,
                            number_of_datasets=0,
                            number_of_files=0,
                            number_of_aggregations=0,
                            size=0)

        self.result = []

        self.count = 0
        # search datasets
        # we always do this to get the summary document
        datasets = ctx.search(ignore_facet_check=True)

        (self.start_index, self.stop_index,
         self.max_count) = self._index(datasets, limit, offset)
        self.summary['number_of_datasets'] = max(0, self.max_count)

        t0 = datetime.now()
        for i in range(self.start_index, self.stop_index):
            ds = datasets[i]
            self.count = self.count + 1
            self.result.append(ds.json)
            # Accumulate per-dataset statistics into the summary.
            for key in ['number_of_files', 'number_of_aggregations', 'size']:
                self.summary[key] = self.summary[key] + ds.json.get(key, 0)

        self.summary['ds_search_duration_secs'] = (datetime.now() - t0).seconds
        self.summary['size_mb'] = self.summary.get('size', 0) / 1024 / 1024
        self.summary['size_gb'] = self.summary.get('size_mb', 0) / 1024

        LOGGER.debug('search_type = %s ', search_type)

        if search_type == 'Dataset':
            pass
        # search files (optional)
        elif search_type == 'File':
            self._file_search(datasets, my_constraints, start, end)
        # search aggregations (optional)
        elif search_type == 'Aggregation':
            self._aggregation_search(datasets, my_constraints)
        else:
            # BUG FIX: the message was previously passed as two exception
            # args ('...%s', search_type) so it was never formatted.
            raise Exception('unknown search type: %s' % search_type)

        LOGGER.debug('summary=%s', self.summary)
        self.show_status('Done', 100)

        return (self.result, self.summary, ctx.facet_counts)