def __init__(self, name, event, client=None, field=None,
             metric_fields=None, copy_fields=None, query_modifiers=None,
             interval='day', index_interval='month', batch_size=7):
    """Construct aggregator instance.

    :param event: aggregated event.
    :param client: elasticsearch client.
    :param field: field on which the aggregation will be done.
    :param metric_fields: dictionary of fields on which a metric
        aggregation will be computed. The format of the dictionary is
        "destination field" ->
        tuple("metric type", "source field", "metric_options").
    :param copy_fields: list of fields which are copied from the raw
        events into the aggregation.
    :param query_modifiers: list of functions modifying the raw events
        query. By default the query_modifiers are [filter_robots].
    :param interval: aggregation time window. default: day.
    :param index_interval: time window of the elasticsearch indices which
        will contain the resulting aggregations.
    :param batch_size: max number of hours/days/months for which raw
        events are being fetched in one query. This number has to be
        coherent with the interval.
    """
    self.name = name
    self.event = event
    self.event_index = prefix_index('events-stats-{}'.format(event))
    self.client = client or current_search_client
    self.index = prefix_index('stats-{}'.format(event))
    self.field = field
    self.metric_fields = metric_fields or {}
    self.interval = interval
    self.batch_size = batch_size
    self.doc_id_suffix = SUPPORTED_INTERVALS[interval]
    self.index_interval = index_interval
    self.index_name_suffix = SUPPORTED_INTERVALS[index_interval]
    self.indices = set()
    self.copy_fields = copy_fields or {}
    self.query_modifiers = (query_modifiers if query_modifiers is not None
                            else [filter_robots])
    self.bookmark_api = BookmarkAPI(self.client, self.name, self.interval)

    if any(v not in ALLOWED_METRICS
           for k, (v, _, _) in self.metric_fields.items()):
        raise ValueError(
            'Metric aggregation type should be one of [{}]'.format(
                ', '.join(ALLOWED_METRICS)))

    if list(SUPPORTED_INTERVALS.keys()).index(interval) \
            > list(SUPPORTED_INTERVALS.keys()).index(index_interval):
        raise ValueError('Aggregation interval should be'
                         ' shorter than index interval')
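
# A minimal usage sketch for the constructor above, assuming it belongs to a
# StatAggregator-style class (the class name, the 'file-download' event and
# the 'unique_count' metric are illustrative assumptions, not taken from the
# source):
aggregator = StatAggregator(
    name='file-download-agg',
    event='file-download',
    field='file_id',
    metric_fields={
        # "destination field" -> ("metric type", "source field", options)
        'unique_count': ('cardinality', 'unique_session_id',
                         {'precision_threshold': 1000}),
    },
    interval='day',          # must be shorter than or equal to ...
    index_interval='month',  # ... the index interval, else ValueError
)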
def default_record_to_mapping(record):
    """Get mapping given a record.

    It tries to extract from `record['$schema']` the index and doc_type.
    If it fails, uses the default values.

    :param record: The record object.
    :returns: mapping
    """
    index, doc = default_record_to_index(record)
    index = prefix_index(index)
    current_app.logger.debug('Using index {idx} and doc {doc}'.format(
        idx=index, doc=doc))

    mapping = current_search_client.indices.get_mapping([index])
    if mapping is not None:
        doc_type = next(iter(mapping))
        current_app.logger.debug('Using mapping for {idx}'.format(idx=index))
        current_app.logger.debug('Mapping {mapping}'.format(mapping=mapping))
        if ES_VERSION[0] >= 7:
            return mapping[doc_type]['mappings']
        return mapping[doc_type]['mappings'][doc]

    return None
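
# Usage sketch for default_record_to_mapping, assuming a record dict that
# carries a '$schema' URL (the schema path below is illustrative):
record = {
    '$schema': 'https://localhost/schemas/records/record-v1.0.0.json',
    'title': 'Some title',
}
mapping = default_record_to_mapping(record)
if mapping is not None:
    # e.g. inspect the declared properties of the resolved mapping
    print(sorted(mapping.get('properties', {}).keys()))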
def get_affected_records(spec=None, search_pattern=None):
    """Get list of affected records.

    :param spec: The record spec.
    :param search_pattern: The search pattern.
    :returns: An iterator to lazily find results.
    """
    # spec       pattern    query
    # ---------- ---------- -------
    # None       None       None
    # None       Y          Y
    # X          None       X
    # X          ''         X
    # X          Y          X OR Y
    if spec is None and search_pattern is None:
        # PEP 479: raising StopIteration inside a generator becomes a
        # RuntimeError on Python 3.7+, so end the generator with return.
        return

    queries = []
    if spec is not None:
        queries.append(Q('match', **{'_oai.sets': spec}))

    if search_pattern:
        queries.append(query_string_parser(search_pattern=search_pattern))

    search = OAIServerSearch(
        index=prefix_index(current_app.config['OAISERVER_RECORD_INDEX']),
    ).query(Q('bool', should=queries))

    for result in search.scan():
        yield result.meta.id
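
# Usage sketch: iterate lazily over the ids of affected records. Both
# arguments are optional; with neither given, the generator yields nothing
# (see the truth table in the comments above). Argument values are
# illustrative.
for recid in get_affected_records(spec='physics',
                                  search_pattern='title:neutrino'):
    print(recid)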
def __init__(self, queue, prefix='events', suffix='%Y-%m-%d', client=None,
             preprocessors=None, double_click_window=10):
    """Initialize indexer.

    :param prefix: prefix appended to elasticsearch indices' name.
    :param suffix: suffix appended to elasticsearch indices' name.
    :param double_click_window: time window during which similar events are
        deduplicated (counted as one occurrence).
    :param client: elasticsearch client.
    :param preprocessors: a list of functions which are called on every
        event before it is indexed. Each function should return the
        processed event. If it returns None, the event is filtered and
        won't be indexed.
    """
    self.queue = queue
    self.client = client or current_search_client
    self.doctype = get_doctype(queue.routing_key)
    self.index = prefix_index(
        '{0}-{1}'.format(prefix, self.queue.routing_key))
    self.suffix = suffix
    # load the preprocessors
    self.preprocessors = [
        obj_or_import_string(preproc) for preproc in preprocessors
    ] if preprocessors is not None else self.default_preprocessors
    self.double_click_window = double_click_window
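
# Usage sketch for the indexer constructor, assuming it belongs to an
# EventsIndexer-style class and that a 'stats-file-download' queue has been
# declared via invenio-queues (both names are assumptions here):
from invenio_queues.proxies import current_queues

indexer = EventsIndexer(
    current_queues.queues['stats-file-download'],
    preprocessors=['invenio_stats.processors:flag_robots'],  # import strings
    double_click_window=30,  # deduplicate similar events within 30 seconds
)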
def agg_iter(self, lower_limit, upper_limit):
    """Aggregate and return dictionary to be indexed in ES."""
    aggregation_data = {}
    self.agg_query = Search(using=self.client, index=self.event_index) \
        .filter('range', timestamp={
            'gte': format_range_dt(lower_limit, self.interval),
            'lte': format_range_dt(upper_limit, self.interval)})

    # apply query modifiers
    for modifier in self.query_modifiers:
        self.agg_query = modifier(self.agg_query)

    # TODO: remove histogram bucket
    histogram = self.agg_query.aggs.bucket(
        'histogram', 'date_histogram',
        field='timestamp', interval=self.interval)
    terms = histogram.bucket(
        'terms', 'terms', field=self.field,
        size=get_bucket_size(self.client, self.event_index, self.field))
    terms.metric('top_hit', 'top_hits', size=1,
                 sort={'timestamp': 'desc'})
    for dst, (metric, src, opts) in self.metric_fields.items():
        terms.metric(dst, metric, field=src, **opts)

    results = self.agg_query.execute()
    for interval in results.aggregations['histogram'].buckets:
        interval_date = datetime.strptime(
            interval['key_as_string'], '%Y-%m-%dT%H:%M:%S')
        for aggregation in interval['terms'].buckets:
            aggregation_data['timestamp'] = interval_date.isoformat()
            aggregation_data[self.field] = aggregation['key']
            aggregation_data['count'] = aggregation['doc_count']

            if self.metric_fields:
                for f in self.metric_fields:
                    aggregation_data[f] = aggregation[f]['value']

            doc = aggregation.top_hit.hits.hits[0]['_source']
            for destination, source in self.copy_fields.items():
                if isinstance(source, six.string_types):
                    aggregation_data[destination] = doc[source]
                else:
                    aggregation_data[destination] = source(
                        doc, aggregation_data)

            index_name = 'stats-{0}-{1}'.format(
                self.event,
                interval_date.strftime(self.index_name_suffix))
            self.indices.add(index_name)

            yield dict(
                _id='{0}-{1}'.format(
                    aggregation['key'],
                    interval_date.strftime(self.doc_id_suffix)),
                _index=prefix_index(index_name),
                _type=self.doc_type,
                _source=aggregation_data)
    self.has_events = True
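
# Consumption sketch: agg_iter yields action dicts ready for the standard
# Elasticsearch bulk helper, reusing the `aggregator` instance from the
# constructor sketch above. The datetime bounds are illustrative.
from datetime import datetime
from elasticsearch.helpers import bulk

bulk(aggregator.client,
     aggregator.agg_iter(datetime(2020, 1, 1), datetime(2020, 1, 8)),
     stats_only=True)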
def __init__(self, client, agg_type, agg_interval):
    """Construct bookmark instance.

    :param client: elasticsearch client.
    :param agg_type: aggregation type for the bookmark.
    :param agg_interval: aggregation time interval.
    """
    # NOTE: doc_type is going to be deprecated with ES_7
    self.doc_type = get_doctype('aggregation-bookmark')
    self.bookmark_index = prefix_index(current_app, 'stats-bookmarks')
    self.client = client
    self.agg_type = agg_type
    self.agg_interval = agg_interval
def __init__(self, client, agg_type, event_index, agg_interval):
    """Construct bookmark instance.

    :param client: elasticsearch client.
    :param agg_type: aggregation type for the bookmark.
    :param event_index: index holding the aggregated events.
    :param agg_interval: aggregation time interval.
    """
    # NOTE: doc_type is going to be deprecated with ES_7
    self.doc_type = get_doctype('aggregation-bookmark')
    self.bookmark_index = prefix_index('bookmark-index')
    self.client = client
    self.agg_type = agg_type
    self.event_index = event_index
    self.agg_interval = agg_interval
    self._create_bookmark()
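
# Usage sketches for the two bookmark constructor variants above, assuming
# both belong to a BookmarkAPI-style class (the aggregation name is
# illustrative). The first call mirrors how the aggregator constructor
# instantiates BookmarkAPI(client, name, interval); the second adds the
# event index required by the other variant.
bookmark_api = BookmarkAPI(current_search_client, 'file-download-agg', 'day')
bookmark_api_v2 = BookmarkAPI(
    current_search_client, 'file-download-agg',
    prefix_index('events-stats-file-download'), 'day')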
def _build_index_mapping(self):
    """Build index mapping."""
    old_client = self.src_es_client['client']

    def get_src(name, prefix):
        index_name = None
        if old_client.indices.exists(name):
            index_name = name
        elif old_client.indices.exists_alias(name):
            indexes = list(old_client.indices.get_alias(name=name).keys())
            if not indexes:
                raise Exception(
                    'no index found for alias: {}'.format(name))
            index_name = indexes[0]
        else:
            raise Exception(
                "alias or index doesn't exist: {}".format(name))
        return dict(index=index_name, prefix=prefix)

    def get_dst(aliases, prefixed_name):
        if isinstance(aliases, str):
            raise Exception(
                'failed to find index with name: {}'.format(prefixed_name))
        for key, values in aliases.items():
            if key == prefixed_name:
                index, mapping = list(values.items())[0]
                return dict(index=index, mapping=mapping)
            else:
                return get_dst(values, prefixed_name)

    index_mapping = {}
    for pid_type, name in self.pid_mappings.items():
        mapping = dict(
            src=get_src(name, self.src_es_client['prefix'] or ''),
            dst=get_dst(current_search.aliases,
                        prefix_index(current_app, name)))
        index_mapping[pid_type] = mapping
    return index_mapping
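
# Data-shape sketch for _build_index_mapping: `pid_mappings` maps PID types
# to old index/alias names, and `current_search.aliases` is a nested dict of
# aliases down to concrete index -> mapping-file pairs. All names below are
# illustrative assumptions.
pid_mappings = {'recid': 'records'}
aliases = {
    'my-site-records': {
        'my-site-records-record-v1.0.0': 'record-v1.0.0.json',
    },
}
# With an empty source prefix, the resulting index_mapping would look like:
# {'recid': {'src': {'index': 'records', 'prefix': ''},
#            'dst': {'index': 'my-site-records-record-v1.0.0',
#                    'mapping': 'record-v1.0.0.json'}}}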
def get_records(**kwargs):
    """Get records paginated."""
    page_ = kwargs.get('resumptionToken', {}).get('page', 1)
    size_ = current_app.config['OAISERVER_PAGE_SIZE']
    scroll = current_app.config['OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME']
    scroll_id = kwargs.get('resumptionToken', {}).get('scroll_id')

    if scroll_id is None:
        search = OAIServerSearch(
            index=prefix_index(
                current_app.config['OAISERVER_RECORD_INDEX']),
        ).params(
            scroll='{0}s'.format(scroll),
        ).extra(
            version=True,
        )[(page_ - 1) * size_:page_ * size_]

        if 'set' in kwargs:
            search = search.query('match', **{'_oai.sets': kwargs['set']})

        time_range = {}
        if 'from_' in kwargs:
            time_range['gte'] = kwargs['from_']
        if 'until' in kwargs:
            time_range['lte'] = kwargs['until']
        if time_range:
            search = search.filter('range', **{'_updated': time_range})

        response = search.execute().to_dict()
    else:
        response = current_search_client.scroll(
            scroll_id=scroll_id,
            scroll='{0}s'.format(scroll),
        )

    class Pagination(object):
        """Dummy pagination class."""

        page = page_
        per_page = size_

        def __init__(self, response):
            """Initialize pagination."""
            self.response = response
            self.total = response['hits']['total']
            self._scroll_id = response.get('_scroll_id')

            # clean descriptor on last page
            if not self.has_next:
                current_search_client.clear_scroll(
                    scroll_id=self._scroll_id)
                self._scroll_id = None

        @cached_property
        def has_next(self):
            """Return True if there is next page."""
            total = self.total if ES_VERSION[0] < 7 else \
                self.total.get('value', 0)
            return self.page * self.per_page <= total

        @cached_property
        def next_num(self):
            """Return next page number."""
            return self.page + 1 if self.has_next else None

        @property
        def items(self):
            """Return iterator."""
            from datetime import datetime
            for result in self.response['hits']['hits']:
                if '_oai' in result['_source']:
                    yield {
                        'id': result['_id'],
                        'json': result,
                        'updated': datetime.strptime(
                            result['_source']['_updated'][:19],
                            '%Y-%m-%dT%H:%M:%S'),
                    }

    return Pagination(response)
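
# Usage sketch: the first page is fetched via a query; subsequent pages go
# through the ES scroll API via the resumptionToken. Argument values are
# illustrative.
pagination = get_records(set='physics', from_='2020-01-01')
for item in pagination.items:
    print(item['id'], item['updated'])
if pagination.has_next:
    next_page = get_records(resumptionToken={
        'page': pagination.next_num,
        'scroll_id': pagination._scroll_id,
    })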
def identify(**kwargs):
    """Create OAI-PMH response for verb Identify."""
    cfg = current_app.config

    e_tree, e_identify = verb(**kwargs)

    e_repositoryName = SubElement(
        e_identify, etree.QName(NS_OAIPMH, 'repositoryName'))
    e_repositoryName.text = cfg['OAISERVER_REPOSITORY_NAME']

    e_baseURL = SubElement(e_identify, etree.QName(NS_OAIPMH, 'baseURL'))
    e_baseURL.text = url_for('invenio_oaiserver.response', _external=True)

    e_protocolVersion = SubElement(
        e_identify, etree.QName(NS_OAIPMH, 'protocolVersion'))
    e_protocolVersion.text = cfg['OAISERVER_PROTOCOL_VERSION']

    for adminEmail in cfg['OAISERVER_ADMIN_EMAILS']:
        e = SubElement(e_identify, etree.QName(NS_OAIPMH, 'adminEmail'))
        e.text = adminEmail

    e_earliestDatestamp = SubElement(
        e_identify, etree.QName(NS_OAIPMH, 'earliestDatestamp'))
    earliest_date = datetime(MINYEAR, 1, 1)
    earliest_record = OAIServerSearch(
        index=prefix_index(current_app.config['OAISERVER_RECORD_INDEX'])
    ).sort({"_created": {"order": "asc"}})[0:1].execute()

    if len(earliest_record.hits.hits) > 0:
        hit = earliest_record.hits.hits[0]
        if ES_VERSION[0] >= 7:
            hit = hit.to_dict()
        created_date_str = hit.get("_source", {}).get('_created')
        if created_date_str:
            earliest_date = arrow.get(
                created_date_str).to('utc').datetime.replace(tzinfo=None)

    e_earliestDatestamp.text = datetime_to_datestamp(earliest_date)

    e_deletedRecord = SubElement(
        e_identify, etree.QName(NS_OAIPMH, 'deletedRecord'))
    e_deletedRecord.text = 'no'

    e_granularity = SubElement(
        e_identify, etree.QName(NS_OAIPMH, 'granularity'))
    assert cfg['OAISERVER_GRANULARITY'] in DATETIME_FORMATS
    e_granularity.text = cfg['OAISERVER_GRANULARITY']

    compressions = cfg['OAISERVER_COMPRESSIONS']
    if compressions != ['identity']:
        for compression in compressions:
            e_compression = SubElement(
                e_identify, etree.QName(NS_OAIPMH, 'compression'))
            e_compression.text = compression

    for description in cfg.get('OAISERVER_DESCRIPTIONS', []):
        e_description = SubElement(
            e_identify, etree.QName(NS_OAIPMH, 'description'))
        e_description.append(etree.fromstring(description))

    return e_tree
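
# Serialization sketch: the returned element tree can be rendered into the
# final OAI-PMH XML response, assuming `etree` above is lxml.etree and that
# identify() accepts the parsed request arguments as keywords:
from lxml import etree

xml = etree.tostring(identify(verb='Identify'),
                     pretty_print=True, xml_declaration=True,
                     encoding='UTF-8')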
def match(record, config=None):
    """Given a record, yield the records in INSPIRE most similar to it.

    This method can be used to detect if a record that we are ingesting
    as a submission or as a harvest is already present in the system, or
    to find out which record a reference should be pointing to.
    """
    if config is None:
        current_app.logger.debug(
            'No configuration provided. '
            'Falling back to the default configuration.')
        config = current_app.config['MATCHER_DEFAULT_CONFIGURATION']

    try:
        index = prefix_index(config['index'])
        algorithm = config['algorithm']
        query_config = {'index': index}
    except KeyError as e:
        raise KeyError('Malformed configuration: %s.' % repr(e))

    source = config.get('source', [])
    if source:
        query_config['_source'] = source
    match_deleted = config.get('match_deleted', False)
    collections = config.get('collections')
    if not (collections is None or
            (isinstance(collections, (list, tuple)) and
             all(isinstance(collection, string_types)
                 for collection in collections))):
        raise ValueError(
            'Malformed collections. Expected a list of strings but got: %s'
            % repr(collections))

    for i, step in enumerate(algorithm):
        try:
            queries = step['queries']
        except KeyError:
            raise KeyError(
                'Malformed algorithm: step %d has no queries.' % i)

        validator = _get_validator(step.get('validator'))

        for j, query in enumerate(queries):
            try:
                body = compile(query, record, collections=collections,
                               match_deleted=match_deleted)
            except Exception as e:
                raise ValueError(
                    'Malformed query. Query %d of step %d does not '
                    'compile: %s.' % (j, i, repr(e)))

            if not body:
                continue

            query_config['body'] = body
            current_app.logger.debug('Sending ES query: %s' % repr(body))
            result = es.search(**query_config)

            for hit in result['hits']['hits']:
                if validator(record, hit):
                    yield hit
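
# Configuration sketch for match(): a minimal config with one step and one
# query. The query type, path/search_path values and index name are
# illustrative; when 'validator' is omitted, a default validator is used.
config = {
    'algorithm': [
        {
            'queries': [
                {
                    'type': 'exact',
                    'path': 'arxiv_eprints.value',
                    'search_path': 'arxiv_eprints.value.raw',
                },
            ],
        },
    ],
    'index': 'records-hep',
}
matches = list(match({'arxiv_eprints': [{'value': '1234.56789'}]}, config))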