Example #1
    def __init__(self,
                 name,
                 event,
                 client=None,
                 field=None,
                 metric_fields=None,
                 copy_fields=None,
                 query_modifiers=None,
                 interval='day',
                 index_interval='month',
                 batch_size=7):
        """Construct aggregator instance.

        :param name: name of the aggregation (also used for its bookmark).
        :param event: aggregated event.
        :param client: elasticsearch client.
        :param field: field on which the aggregation will be done.
        :param metric_fields: dictionary of fields on which a
            metric aggregation will be computed. The format of the dictionary
            is "destination field" ->
            tuple("metric type", "source field", "metric_options").
        :param copy_fields: dictionary of fields which are copied from the
            raw events into the aggregation. Values may also be callables
            taking ``(doc, aggregation_data)``.
        :param query_modifiers: list of functions modifying the raw events
            query. By default the query_modifiers are [filter_robots].
        :param interval: aggregation time window. default: day.
        :param index_interval: time window of the elasticsearch indices which
            will contain the resulting aggregations.
        :param batch_size: max number of hours/days/months for which raw
            events are fetched in one query. This number must be consistent
            with the chosen interval.
        """
        self.name = name
        self.event = event
        self.event_index = prefix_index('events-stats-{}'.format(event))
        self.client = client or current_search_client
        self.index = prefix_index('stats-{}'.format(event))
        self.field = field
        self.metric_fields = metric_fields or {}
        self.interval = interval
        self.batch_size = batch_size
        self.doc_id_suffix = SUPPORTED_INTERVALS[interval]
        self.index_interval = index_interval
        self.index_name_suffix = SUPPORTED_INTERVALS[index_interval]
        self.indices = set()
        self.copy_fields = copy_fields or {}
        self.query_modifiers = (query_modifiers if query_modifiers is not None
                                else [filter_robots])
        self.bookmark_api = BookmarkAPI(self.client, self.name, self.interval)

        if any(v not in ALLOWED_METRICS
               for k, (v, _, _) in self.metric_fields.items()):
            raise ValueError(
                'Metric aggregation type should be one of [{}]'.format(
                    ', '.join(ALLOWED_METRICS)))

        if list(SUPPORTED_INTERVALS.keys()).index(interval) \
                > list(SUPPORTED_INTERVALS.keys()).index(index_interval):
            raise ValueError('Aggregation interval should not be longer'
                             ' than the index interval')
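
A minimal construction sketch may help illustrate the ``metric_fields`` format from the docstring. The event name, field names, and the 'cardinality' metric are illustrative assumptions (in particular, 'cardinality' is assumed to be in ALLOWED_METRICS):

# Hypothetical usage sketch; event and field names are assumptions.
aggregator = Aggregator(
    name='file-download-agg',
    event='file-download',
    field='file_id',
    # "destination field" -> ("metric type", "source field", metric options)
    metric_fields={
        'unique_count': ('cardinality', 'unique_session_id',
                         {'precision_threshold': 1000}),
    },
    copy_fields={'file_key': 'file_key'},
    interval='day',
    index_interval='month',
)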
Example #2
def default_record_to_mapping(record):
    """Get mapping given a record.

    It tries to extract the index and doc_type from `record['$schema']`.
    If that fails, it uses the default values.

    :param record: The record object.
    :returns: The mapping for the index, or None.
    """
    index, doc = default_record_to_index(record)
    index = prefix_index(index)
    current_app.logger.debug('Using index {idx} and doc {doc}'.format(
        idx=index, doc=doc))

    mapping = current_search_client.indices.get_mapping([index])
    if mapping is not None:
        # The top-level key of the response is the (prefixed) index name.
        index_key = next(iter(mapping))
        current_app.logger.debug('Using mapping for {idx}'.format(idx=index))
        current_app.logger.debug('Mapping {mapping}'.format(mapping=mapping))

        # ES >= 7 mappings are typeless; older versions nest them under
        # the document type.
        if ES_VERSION[0] >= 7:
            return mapping[index_key]['mappings']

        return mapping[index_key]['mappings'][doc]

    return None
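
For reference, a sketch of the ``indices.get_mapping()`` response shapes this function unpacks; the index and type names are illustrative assumptions:

# Typed mapping (ES < 7): nested under the document type.
es6_response = {
    'records-record-v1.0.0': {            # top-level key: the index name
        'mappings': {
            'record-v1.0.0': {'properties': {}},
        },
    },
}

# Typeless mapping (ES >= 7).
es7_response = {
    'records-record-v1.0.0': {
        'mappings': {'properties': {}},
    },
}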
Example #3
def get_affected_records(spec=None, search_pattern=None):
    """Get list of affected records.

    :param spec: The record spec.
    :param search_pattern: The search pattern.
    :returns: An iterator to lazily find results.
    """
    # spec       pattern    query
    # ---------- ---------- -------
    # None       None       None
    # None       Y          Y
    # X          None       X
    # X          ''         X
    # X          Y          X OR Y

    if spec is None and search_pattern is None:
        # A bare ``raise StopIteration`` inside a generator raises
        # RuntimeError since PEP 479; ``return`` ends the iteration.
        return

    queries = []

    if spec is not None:
        queries.append(Q('match', **{'_oai.sets': spec}))

    if search_pattern:
        queries.append(query_string_parser(search_pattern=search_pattern))

    search = OAIServerSearch(
        index=prefix_index(current_app.config['OAISERVER_RECORD_INDEX']),
    ).query(Q('bool', should=queries))

    for result in search.scan():
        yield result.meta.id
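
A hypothetical call, matching the spec/pattern table above (the set spec and query string are illustrative assumptions):

# Records in set 'physics' OR matching the query (the "X OR Y" row).
affected_ids = list(get_affected_records(spec='physics',
                                         search_pattern='title:neutrino'))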
Example #4
    def __init__(self,
                 queue,
                 prefix='events',
                 suffix='%Y-%m-%d',
                 client=None,
                 preprocessors=None,
                 double_click_window=10):
        """Initialize indexer.

        :param queue: message queue providing the events; its routing key
            determines the target index and document type.
        :param prefix: prefix prepended to elasticsearch indices' name.
        :param suffix: suffix appended to elasticsearch indices' name.
        :param double_click_window: time window during which similar events
            are deduplicated (counted as one occurrence).
        :param client: elasticsearch client.
        :param preprocessors: a list of functions which are called on every
            event before it is indexed. Each function should return the
            processed event. If it returns None, the event is filtered and
            won't be indexed.
        """
        self.queue = queue
        self.client = client or current_search_client
        self.doctype = get_doctype(queue.routing_key)
        self.index = prefix_index('{0}-{1}'.format(prefix,
                                                   self.queue.routing_key))
        self.suffix = suffix
        # load the preprocessors
        self.preprocessors = [
            obj_or_import_string(preproc) for preproc in preprocessors
        ] if preprocessors is not None else self.default_preprocessors
        self.double_click_window = double_click_window
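
A sketch of the preprocessor contract described in the docstring: each callable receives an event and returns the processed event, or None to drop it. The 'is_robot' flag is an illustrative assumption:

def skip_robot_events(event):
    """Drop events flagged as robot traffic; pass others through."""
    if event.get('is_robot'):
        return None  # returning None filters the event out
    return event

# indexer = EventsIndexer(queue, preprocessors=[skip_robot_events])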
Example #5
    def agg_iter(self, lower_limit, upper_limit):
        """Aggregate and return dictionary to be indexed in ES."""
        self.agg_query = Search(using=self.client, index=self.event_index) \
            .filter('range', timestamp={
                'gte': format_range_dt(lower_limit, self.interval),
                'lte': format_range_dt(upper_limit, self.interval)
            })

        # apply query modifiers
        for modifier in self.query_modifiers:
            self.agg_query = modifier(self.agg_query)

        # TODO: remove histogram bucket
        histogram = self.agg_query.aggs.bucket('histogram',
                                               'date_histogram',
                                               field='timestamp',
                                               interval=self.interval)
        terms = histogram.bucket('terms',
                                 'terms',
                                 field=self.field,
                                 size=get_bucket_size(self.client,
                                                      self.event_index,
                                                      self.field))
        terms.metric('top_hit', 'top_hits', size=1, sort={'timestamp': 'desc'})
        for dst, (metric, src, opts) in self.metric_fields.items():
            terms.metric(dst, metric, field=src, **opts)

        results = self.agg_query.execute()
        for interval in results.aggregations['histogram'].buckets:
            interval_date = datetime.strptime(interval['key_as_string'],
                                              '%Y-%m-%dT%H:%M:%S')
            for aggregation in interval['terms'].buckets:
                # Build a fresh dict per bucket so documents yielded earlier
                # are not mutated by later iterations.
                aggregation_data = {}
                aggregation_data['timestamp'] = interval_date.isoformat()
                aggregation_data[self.field] = aggregation['key']
                aggregation_data['count'] = aggregation['doc_count']

                if self.metric_fields:
                    for f in self.metric_fields:
                        aggregation_data[f] = aggregation[f]['value']

                doc = aggregation.top_hit.hits.hits[0]['_source']
                for destination, source in self.copy_fields.items():
                    if isinstance(source, six.string_types):
                        aggregation_data[destination] = doc[source]
                    else:
                        aggregation_data[destination] = source(
                            doc, aggregation_data)

                index_name = 'stats-{0}-{1}'.format(
                    self.event, interval_date.strftime(self.index_name_suffix))
                self.indices.add(index_name)

                yield dict(
                    _id='{0}-{1}'.format(
                        aggregation['key'],
                        interval_date.strftime(self.doc_id_suffix)),
                    _index=prefix_index(index_name),
                    _type=self.doc_type,
                    _source=aggregation_data)
                self.has_events = True
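
For reference, a sketch of the two ``copy_fields`` forms the loop above handles (field names are illustrative assumptions):

copy_fields = {
    # a string copies the field verbatim from the bucket's top hit
    'country': 'country',
    # a callable computes the value from the top hit and the aggregation
    'label': lambda doc, agg: '{0}-{1}'.format(doc['file_id'], agg['count']),
}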
Example #6
    def __init__(self, client, agg_type, agg_interval):
        """Construct bookmark instance.

        :param client: elasticsearch client
        :param agg_type: aggregation type for the bookmark
        :param agg_interval: aggregation interval of the bookmark
        """
        # NOTE: doc_type is going to be deprecated with ES_7
        self.doc_type = get_doctype('aggregation-bookmark')
        self.bookmark_index = prefix_index(current_app, 'stats-bookmarks')
        self.client = client
        self.agg_type = agg_type
        self.agg_interval = agg_interval
Example #7
    def __init__(self, client, agg_type, event_index, agg_interval):
        """Construct bookmark instance.

        :param client: elasticsearch client
        :param agg_type: aggregation type for the bookmark
        :param event_index: index of the events being aggregated
        :param agg_interval: aggregation interval of the bookmark
        """
        # NOTE: doc_type is going to be deprecated with ES_7
        self.doc_type = get_doctype('aggregation-bookmark')
        self.bookmark_index = prefix_index('bookmark-index')
        self.client = client
        self.agg_type = agg_type
        self.event_index = event_index
        self.agg_interval = agg_interval
        self._create_bookmark()
Example #8
    def _build_index_mapping(self):
        """Build index mapping."""
        old_client = self.src_es_client['client']

        def get_src(name, prefix):
            index_name = None
            if old_client.indices.exists(name):
                index_name = name
            elif old_client.indices.exists_alias(name):
                indexes = list(old_client.indices.get_alias(name=name).keys())
                if not indexes:
                    raise Exception(
                        'no index found for alias: {}'.format(name))
                index_name = indexes[0]
            else:
                raise Exception(
                    "alias or index doesn't exist: {}".format(name))
            return dict(index=index_name, prefix=prefix)

        def get_dst(aliases, prefixed_name):
            def _search(tree):
                # Leaves of the alias tree are strings (mapping file paths).
                if isinstance(tree, str):
                    return None
                for key, values in tree.items():
                    if key == prefixed_name:
                        index, mapping = list(values.items())[0]
                        return dict(index=index, mapping=mapping)
                    found = _search(values)
                    if found is not None:
                        return found
                return None

            # Search every branch, not just the first one.
            result = _search(aliases)
            if result is None:
                raise Exception(
                    'failed to find index with name: {}'.format(prefixed_name))
            return result

        index_mapping = {}
        for pid_type, name in self.pid_mappings.items():
            mapping = dict(src=get_src(name, self.src_es_client['prefix']
                                       or ''),
                           dst=get_dst(current_search.aliases,
                                       prefix_index(current_app, name)))

            index_mapping[pid_type] = mapping
        return index_mapping
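
A sketch of the nested alias tree that get_dst() searches, assuming the usual current_search.aliases layout of alias -> sub-alias -> {index: mapping file path}; all names are illustrative:

aliases = {
    'records': {
        'records-record-v1.0.0': {
            'records-record-v1.0.0': '/mappings/record-v1.0.0.json',
        },
    },
}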
Example #9
def get_records(**kwargs):
    """Get records paginated."""
    page_ = kwargs.get('resumptionToken', {}).get('page', 1)
    size_ = current_app.config['OAISERVER_PAGE_SIZE']
    scroll = current_app.config['OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME']
    scroll_id = kwargs.get('resumptionToken', {}).get('scroll_id')

    if scroll_id is None:
        search = OAIServerSearch(
            index=prefix_index(current_app.config['OAISERVER_RECORD_INDEX']),
        ).params(
            scroll='{0}s'.format(scroll),
        ).extra(
            version=True,
        )[(page_ - 1) * size_:page_ * size_]

        if 'set' in kwargs:
            search = search.query('match', **{'_oai.sets': kwargs['set']})

        time_range = {}
        if 'from_' in kwargs:
            time_range['gte'] = kwargs['from_']
        if 'until' in kwargs:
            time_range['lte'] = kwargs['until']
        if time_range:
            search = search.filter('range', **{'_updated': time_range})

        response = search.execute().to_dict()
    else:
        response = current_search_client.scroll(
            scroll_id=scroll_id,
            scroll='{0}s'.format(scroll),
        )

    class Pagination(object):
        """Dummy pagination class."""

        page = page_
        per_page = size_

        def __init__(self, response):
            """Initilize pagination."""
            self.response = response
            self.total = response['hits']['total']
            self._scroll_id = response.get('_scroll_id')

            # clean descriptor on last page
            if not self.has_next:
                current_search_client.clear_scroll(scroll_id=self._scroll_id)
                self._scroll_id = None

        @cached_property
        def has_next(self):
            """Return True if there is next page."""
            total = self.total if ES_VERSION[0] < 7 else \
                self.total.get('value', 0)
            return self.page * self.per_page <= total

        @cached_property
        def next_num(self):
            """Return next page number."""
            return self.page + 1 if self.has_next else None

        @property
        def items(self):
            """Return iterator."""
            from datetime import datetime
            for result in self.response['hits']['hits']:
                if '_oai' in result['_source']:
                    yield {
                        'id': result['_id'],
                        'json': result,
                        'updated': datetime.strptime(
                            result['_source']['_updated'][:19],
                            '%Y-%m-%dT%H:%M:%S'),
                    }

    return Pagination(response)
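
A hypothetical paging flow: the first call opens an ES scroll, and follow-up calls resume it by passing the scroll id back through resumptionToken (the keys mirror those read at the top of get_records; reading _scroll_id directly is only for illustration):

page = get_records(set='physics')
ids = [item['id'] for item in page.items]
if page.has_next:
    page = get_records(resumptionToken={'page': page.next_num,
                                        'scroll_id': page._scroll_id})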
Example #10
def identify(**kwargs):
    """Create OAI-PMH response for verb Identify."""
    cfg = current_app.config

    e_tree, e_identify = verb(**kwargs)

    e_repositoryName = SubElement(e_identify,
                                  etree.QName(NS_OAIPMH, 'repositoryName'))
    e_repositoryName.text = cfg['OAISERVER_REPOSITORY_NAME']

    e_baseURL = SubElement(e_identify, etree.QName(NS_OAIPMH, 'baseURL'))
    e_baseURL.text = url_for('invenio_oaiserver.response', _external=True)

    e_protocolVersion = SubElement(e_identify,
                                   etree.QName(NS_OAIPMH, 'protocolVersion'))
    e_protocolVersion.text = cfg['OAISERVER_PROTOCOL_VERSION']

    for adminEmail in cfg['OAISERVER_ADMIN_EMAILS']:
        e = SubElement(e_identify, etree.QName(NS_OAIPMH, 'adminEmail'))
        e.text = adminEmail

    e_earliestDatestamp = SubElement(
        e_identify, etree.QName(NS_OAIPMH, 'earliestDatestamp'))
    earliest_date = datetime(MINYEAR, 1, 1)
    earliest_record = OAIServerSearch(
        index=prefix_index(current_app.config['OAISERVER_RECORD_INDEX'])).sort(
            {"_created": {
                "order": "asc"
            }})[0:1].execute()
    if len(earliest_record.hits.hits) > 0:
        hit = earliest_record.hits.hits[0]
        if ES_VERSION[0] >= 7:
            hit = hit.to_dict()
        created_date_str = hit.get("_source", {}).get('_created')
        if created_date_str:
            earliest_date = arrow.get(created_date_str).to(
                'utc').datetime.replace(tzinfo=None)

    e_earliestDatestamp.text = datetime_to_datestamp(earliest_date)

    e_deletedRecord = SubElement(e_identify,
                                 etree.QName(NS_OAIPMH, 'deletedRecord'))
    e_deletedRecord.text = 'no'

    e_granularity = SubElement(e_identify, etree.QName(NS_OAIPMH,
                                                       'granularity'))
    assert cfg['OAISERVER_GRANULARITY'] in DATETIME_FORMATS
    e_granularity.text = cfg['OAISERVER_GRANULARITY']

    compressions = cfg['OAISERVER_COMPRESSIONS']
    if compressions != ['identity']:
        for compression in compressions:
            e_compression = SubElement(e_identify,
                                       etree.QName(NS_OAIPMH, 'compression'))
            e_compression.text = compression

    for description in cfg.get('OAISERVER_DESCRIPTIONS', []):
        e_description = SubElement(e_identify,
                                   etree.QName(NS_OAIPMH, 'description'))
        e_description.append(etree.fromstring(description))

    return e_tree
Example #11
def match(record, config=None):
    """Given a record, yield the records in INSPIRE most similar to it.

    This method can be used to detect whether a record that we are ingesting
    as a submission or as a harvest is already present in the system, or to
    find out which record a reference should be pointing to.
    """
    if config is None:
        current_app.logger.debug(
            'No configuration provided. Falling back to the default configuration.'
        )
        config = current_app.config['MATCHER_DEFAULT_CONFIGURATION']

    try:
        index = prefix_index(config['index'])
        algorithm = config['algorithm']
        query_config = {'index': index}
    except KeyError as e:
        raise KeyError('Malformed configuration: %s.' % repr(e))

    source = config.get('source', [])
    if source:
        query_config['_source'] = source
    match_deleted = config.get('match_deleted', False)
    collections = config.get('collections')
    if not (collections is None or
            (isinstance(collections, (list, tuple)) and all(
                isinstance(collection, string_types)
                for collection in collections))):
        raise ValueError(
            'Malformed collections. Expected a list of strings but got: %s' %
            repr(collections))

    for i, step in enumerate(algorithm):
        try:
            queries = step['queries']
        except KeyError:
            raise KeyError('Malformed algorithm: step %d has no queries.' % i)

        validator = _get_validator(step.get('validator'))

        for j, query in enumerate(queries):
            try:
                body = compile(query,
                               record,
                               collections=collections,
                               match_deleted=match_deleted)
            except Exception as e:
                raise ValueError(
                    'Malformed query. Query %d of step %d does not compile: %s.'
                    % (j, i, repr(e)))

            if not body:
                continue
            query_config['body'] = body
            current_app.logger.debug('Sending ES query: %s' % repr(body))

            result = es.search(**query_config)

            for hit in result['hits']['hits']:
                if validator(record, hit):
                    yield hit
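
A sketch of the configuration shape match() reads, inferred from the keys accessed above ('index', 'algorithm', plus the optional 'source', 'collections' and 'match_deleted'); the inner query dicts depend on the query compiler and are purely illustrative assumptions:

config = {
    'index': 'records-hep',          # required
    'source': ['control_number'],    # optional: restrict returned fields
    'match_deleted': False,          # optional
    'collections': ['Literature'],   # optional: list of strings
    'algorithm': [                   # required: list of steps
        {
            'queries': [
                # illustrative; the actual shape is defined by compile()
                {'path': 'arxiv_eprints.value',
                 'search_path': 'arxiv_eprints.value.raw',
                 'type': 'exact'},
            ],
            'validator': 'my_package.validators:authors_validator',
        },
    ],
}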