def get_bucket(self, threshold, bucket_id, project=None, from_=None, size=None):
    """
    Return a Bucket populated with the reports assigned to it.

    :param threshold: similarity threshold (coerced to a Threshold)
    :param bucket_id: identifier of the bucket to look up
    :param project: optional project name stored on the returned Bucket
    :param from_: pagination offset into the bucket's reports
    :param size: maximum number of reports to return
    :raises BucketNotFoundError: if no reports belong to this bucket
    """
    # Coerce to a Threshold object.
    threshold = Threshold(threshold)

    # Match reports whose bucket field (at this threshold) equals
    # bucket_id, newest first.  constant_score: we only filter, we do
    # not need relevance scoring.
    query = {
        "query": {
            "constant_score": {
                "filter": {
                    "term": {
                        "buckets." + threshold.to_elasticsearch(): bucket_id
                    }
                }
            }
        },
        "sort": {"date": {"order": "desc"}},
    }

    # Apply pagination parameters independently.  Previously `size` was
    # only honoured when `from_` was also given (and a bare `from_`
    # sent "size": null to ElasticSearch).
    if from_ is not None:
        query["from"] = from_
    if size is not None:
        query["size"] = size

    response = self.es.search(body=query, index=self.es_index)

    # Debug dump of the raw response.  The file must be opened in TEXT
    # mode: print() writes str, which a 'wb' handle rejects with
    # TypeError.
    with open('bucket_response', 'w') as debug_file:
        print(json.dumps(response, indent=2), file=debug_file)

    reports_found = response['hits']['total']

    # Since no reports were found, assume the bucket does not exist (at
    # least for this project).
    if reports_found < 1:
        raise BucketNotFoundError(bucket_id)

    reports = get_reports_by_bucket(response, threshold).get(bucket_id)
    assert reports

    return Bucket(id=bucket_id, project=project, threshold=threshold,
                  total=reports_found, top_reports=reports,
                  first_seen=None)
def __init__(self, config_file=None):
    """
    Load configuration and wire together the bucketing machinery:
    thresholds, the ElasticSearch store/index, the tokenization and
    bucketing strategy classes, and UI search settings.
    """
    self.config = Config(config_file)
    self.thresholds = [Threshold(value)
                       for value in self.config.Bucketing.thresholds]
    self.es_store = ESStore(self.config.ElasticSearch)

    # Both the strategy and the tokenization are configured as dotted
    # paths resolved at runtime via locate().
    self.strategy_class = locate(self.config.Bucketing.Strategy.strategy)
    self.tokenization_class = locate(
        self.config.Bucketing.Tokenization.tokenization)
    self.tokenization = self.tokenization_class(
        self.config.Bucketing.Tokenization)

    self.index = ESIndex(self.es_store, self.config, self.tokenization,
                         self.thresholds)
    self.index.ensure_index_exists()

    self.strategy = self.strategy_class(
        config=self.config.Bucketing.Strategy,
        index=self.index,
    )

    # Pull configuration details needed for search and fix it up:
    # configured summary fields are searched on their ".whole" subfield,
    # and two built-in fields are always present.
    summary_fields = {
        name + ".whole": label
        for name, label in
        self.config.UserInterface.fixed_summary_fields.items()
    }
    summary_fields["project"] = "Project"
    summary_fields["type"] = "Type"
    self.fixed_summary_fields = summary_fields

    self.default_threshold = Threshold(
        self.config.Bucketing.default_threshold)
    self.search = self.index.search
    self.allow_delete_all = self.config.ElasticSearch.allow_delete_all
def bucket_search(self, threshold, from_=None, size=None, **kwargs):
    """Factory for groups of report buckets at a given threshold."""
    # Normalize to a Threshold instance before validating it against
    # the configured thresholds.
    wanted = Threshold(threshold)
    assert wanted in self.context.thresholds, wanted
    # NOTE(review): from_ and size are accepted but not forwarded here,
    # as in the original — presumably consumed elsewhere; confirm.
    return BucketSearch(context=self.context,
                        threshold=wanted,
                        **kwargs)
def default_threshold(self):
    """
    Default threshold to use if none are provided.
    """
    # TODO: determine from static/dynamic configuration
    configured = self.config.get('partycrasher.bucket',
                                 'default_threshold')
    return Threshold(configured)
def get_bucket_id(self, threshold):
    """
    Return the bucket id assigned to this report under the given
    threshold.

    Raises Exception if the report has no 'buckets' mapping at all, or
    if no bucket was assigned at this particular threshold.
    """
    # Thresholds are stored under their ElasticSearch field name.
    key = Threshold(threshold).to_elasticsearch()
    try:
        assigned = self['buckets']
    except KeyError:
        raise Exception('No assigned buckets for: {!r}'.format(self))
    try:
        return assigned[key]
    except KeyError:
        raise Exception('Buckets threshold {} not assigned for: '
                        '{!r}'.format(key, self))
def __setitem__(self, k, v):
    """
    Store a value, coercing it first: the special 'top_match' key holds
    a TopMatch (or None); every other key is a Threshold mapping to a
    Bucket (or None).
    """
    if k == 'top_match':
        # Coerce non-None values that are not already TopMatch.
        if v is not None and not isinstance(v, TopMatch):
            v = TopMatch(v)
    else:
        if not isinstance(k, Threshold):
            k = Threshold(k)
        if v is not None and not isinstance(v, Bucket):
            v = Bucket(v)
        if v is not None:
            # A bucket must be filed under its own threshold.
            assert v['threshold'] == k
    return self._od.__setitem__(k, v)
def compute_metrics(date_range_start, rest_service_url):
    """
    Fetch a batch of crashes from the REST service and compute metrics
    for every bucketing threshold found on the first crash.

    :param date_range_start: start of the date range to fetch crashes for
    :param rest_service_url: base URL of the REST service
    """
    client = RestClient(rest_service_url)
    crashes = client.get_a_bunch_of_crashes(date_range_start, 500)
    similaritys = get_similaritys(crashes, client)
    for i in sorted(crashes[0]['buckets']):
        # Skip keys that are not valid thresholds (e.g. 'top_match').
        # A bare `except:` here would also swallow KeyboardInterrupt and
        # SystemExit, so only Exception is caught.
        try:
            i = Threshold(i)
        except Exception:
            continue
        compute_metrics_threshold(crashes, i, similaritys)
def __init__(self, raw_buckets):
    """
    Build the bucket mapping from a raw ElasticSearch 'buckets' dict.

    The special 'top_match' key becomes a TopMatch (or None); every
    other key is a threshold whose value is a bucket id, wrapped in a
    Bucket.
    """
    super(ESBuckets, self).__init__()
    self.raw_buckets = raw_buckets
    for key, value in raw_buckets.items():
        if key == 'top_match':
            self[key] = TopMatch(value) if value is not None else None
        else:
            threshold = Threshold(key)
            self[threshold] = Bucket({'id': value,
                                      'threshold': threshold})
def get_bucket(self, threshold, bucket_id, project=None, from_=None, size=None):
    """
    Returns information for the given bucket.
    """
    # Coerce to a Threshold object.
    threshold = Threshold(threshold)

    # Match reports assigned to this bucket at this threshold, newest
    # first.
    bucket_field = "buckets." + threshold.to_elasticsearch()
    query = {
        "filter": {"term": {bucket_field: bucket_id}},
        "sort": {"date": {"order": "desc"}},
    }
    # NOTE(review): as written, `size` only takes effect when `from_`
    # is also given — preserved here from the original.
    if from_ is not None:
        query["from"] = from_
        query["size"] = size

    # NOTE(review): index name is hard-coded here, unlike the variants
    # that use self.es_index — confirm this is intentional.
    response = self.es.search(body=query, index='crashes')
    reports_found = response['hits']['total']

    # Since no reports were found, assume the bucket does not exist (at
    # least for this project).
    if reports_found < 1:
        raise BucketNotFoundError(bucket_id)

    reports = get_reports_by_bucket(response, threshold).get(bucket_id)
    assert reports

    return Bucket(id=bucket_id, project=project, threshold=threshold,
                  total=reports_found, top_reports=reports,
                  first_seen=None)
def top_buckets(self, lower_bound, threshold=None, project=None,
                from_=None, size=None):
    """
    Given a datetime lower_bound (from date), calculates the top buckets
    in the given timeframe for the given threshold (automatically
    determined if not given). The results can be tailed for a specific
    project if needed.

    Returns a list of {'doc_count': int, 'key': id} dictionaries.
    """
    if not isinstance(lower_bound, datetime):
        raise TypeError('The lower bound MUST be a datetime object.')

    # Get the default threshold.
    if threshold is None:
        threshold = self.default_threshold
    if not isinstance(threshold, Threshold):
        threshold = Threshold(threshold)

    # Filters by lower-bound by default;
    filters = [{
        "range": {
            "date": {
                "gt": lower_bound.isoformat()
            }
        }
    }]

    # May filter optionally by project name.
    if project is not None:
        filters.append({
            "term": {
                "project": project
            }
        })

    # Oh, ElasticSearch! You and your verbose query "syntax"!
    query = {
        # Read this inside out:
        "aggs": {
            "top_buckets_filtered": {
                # Filter the top buckets by date, and maybe by project.
                "filter": {
                    "bool": {
                        "must": filters
                    }
                },
                # Get the top buckets in descending order of size.
                "aggs": {
                    "top_buckets": {
                        "terms": {
                            "field": "buckets." + threshold.to_elasticsearch(),
                            "order": {
                                "_count": "desc"
                            },
                        },
                        # Get the earliest (first-seen) crash date per
                        # bucket: a `min` aggregation over the date field.
                        "aggs": {
                            "first_seen": {
                                "min": {
                                    "field": "date"
                                }
                            }
                        }
                    }
                }
            }
        },
        # Do not send any hits back!
        "size": 0
    }

    if size is None:
        size = 10
    actual_size = size
    if from_ is not None:
        assert from_ >= 0
        # ES aggregations have no offset parameter: over-fetch
        # from_ + size buckets and slice off the first from_ below.
        actual_size = actual_size + from_
    if size is not None:
        assert size >= 0

    # Patch the computed size into the nested terms aggregation.
    (query["aggs"]["top_buckets_filtered"]["aggs"]
        ["top_buckets"]["terms"]["size"]) = actual_size

    try:
        response = self.es.search(body=query, index='crashes')
    except RequestError as e:
        # Surface the ES error detail before re-raising.
        print(e.error, file=sys.stderr)
        raise e

    # Oh, ElasticSearch! You and your verbose responses!
    top_buckets = (response['aggregations']
                   ['top_buckets_filtered']
                   ['top_buckets']
                   ['buckets'])

    # Apply the pagination offset client-side (see over-fetch above).
    if from_ is not None:
        top_buckets = top_buckets[from_:]

    return [Bucket(id=bucket['key'], project=project, threshold=threshold,
                   total=bucket['doc_count'],
                   first_seen=bucket['first_seen']['value_as_string'],
                   top_reports=None)
            for bucket in top_buckets]
def top_buckets(self, lower_bound, threshold=None, project=None,
                from_=None, size=None, upper_bound=None,
                query_string=None):
    """
    Given a datetime lower_bound (from date), calculates the top buckets
    in the given timeframe for the given threshold (automatically
    determined if not given). The results can be tailed for a specific
    project if needed.

    :param lower_bound: datetime; only crashes strictly after this count
    :param threshold: similarity threshold (defaults to configured one)
    :param project: optional project name to restrict the search to
    :param from_: pagination offset into the ranked buckets
    :param size: number of buckets to return (defaults to 10)
    :param upper_bound: optional datetime upper limit on the date range
    :param query_string: optional ES query-string filter
    :raises TypeError: if lower_bound is not a datetime

    Returns a list of Bucket objects (top_reports is not populated).
    """
    if not isinstance(lower_bound, datetime):
        raise TypeError('The lower bound MUST be a datetime object.')

    # Get the default threshold.
    if threshold is None:
        threshold = self.default_threshold
    if not isinstance(threshold, Threshold):
        threshold = Threshold(threshold)

    # Filters by lower-bound by default;
    filters = [{"range": {"date": {"gt": lower_bound.isoformat()}}}]
    if upper_bound is not None:
        filters[0]["range"]["date"]["lt"] = upper_bound.isoformat()

    # May filter optionally by project name.
    if project is not None:
        filters.append({"term": {"project": project}})

    # this doesn't work on ES 2.3!
    if query_string is not None:
        print("Query string!", file=sys.stderr)
        filters.append({
            "query": {
                "query_string": {
                    "query": query_string,
                    "default_operator": "AND",
                }
            }
        })

    # Oh, ElasticSearch! You and your verbose query "syntax"!
    query = {
        # Read this inside out:
        "aggs": {
            "top_buckets_filtered": {
                # Filter the top buckets by date, and maybe by project.
                "filter": {
                    "bool": {
                        "must": filters
                    }
                },
                # Get the top buckets in descending order of size.
                "aggs": {
                    "top_buckets": {
                        "terms": {
                            "field": ("buckets." +
                                      threshold.to_elasticsearch()),
                            "order": {
                                "_count": "desc"
                            },
                        },
                        # Get the earliest (first-seen) crash date per
                        # bucket: a `min` aggregation over the date
                        # field.
                        "aggs": {
                            "first_seen": {
                                "min": {
                                    "field": "date"
                                }
                            }
                        }
                    }
                }
            }
        },
        # Do not send any hits back!
        "size": 0
    }

    if size is None:
        size = 10
    # size is always an int by this point, so the former
    # `if size is not None:` guard around this assert was dead code.
    assert size >= 0
    actual_size = size
    if from_ is not None:
        assert from_ >= 0
        # ES aggregations have no offset parameter: over-fetch
        # from_ + size buckets and slice off the first from_ below.
        actual_size = actual_size + from_

    # Patch the computed size into the nested terms aggregation.
    (query["aggs"]["top_buckets_filtered"]["aggs"]["top_buckets"]
        ["terms"]["size"]) = actual_size

    try:
        response = self.es.search(body=query, index=self.es_index)
    except RequestError as e:
        # Surface the ES error detail before re-raising.
        print(e.error, file=sys.stderr)
        raise e

    # Oh, ElasticSearch! You and your verbose responses!
    top_buckets = (response['aggregations']['top_buckets_filtered']
                   ['top_buckets']['buckets'])

    # Apply the pagination offset client-side (see over-fetch above).
    if from_ is not None:
        top_buckets = top_buckets[from_:]

    return [
        Bucket(id=bucket['key'],
               project=project,
               threshold=threshold,
               total=bucket['doc_count'],
               first_seen=bucket['first_seen']['value_as_string'],
               top_reports=None)
        for bucket in top_buckets
    ]
def get_bucket(self, threshold, bucket_id, project=None, from_=None, size=None):
    """
    Return a Bucket populated with the reports assigned to it.

    :param threshold: similarity threshold (coerced to a Threshold)
    :param bucket_id: identifier of the bucket to look up
    :param project: optional project name stored on the returned Bucket
    :param from_: pagination offset into the bucket's reports
    :param size: maximum number of reports to return
    :raises BucketNotFoundError: if no reports belong to this bucket
    """
    # Coerce to a Threshold object.
    threshold = Threshold(threshold)

    # Match reports whose bucket field (at this threshold) equals
    # bucket_id, newest first.  constant_score: pure filter, no
    # relevance scoring needed.
    query = {
        "query": {
            "constant_score": {
                "filter": {
                    "term": {
                        "buckets." + threshold.to_elasticsearch(): bucket_id
                    }
                }
            }
        },
        "sort": {
            "date": {
                "order": "desc"
            }
        },
    }

    # Apply pagination parameters independently.  Previously `size` was
    # only honoured when `from_` was also given (and a bare `from_`
    # sent "size": null to ElasticSearch).
    if from_ is not None:
        query["from"] = from_
    if size is not None:
        query["size"] = size

    response = self.es.search(body=query, index=self.es_index)

    # Debug dump of the raw response.  The file must be opened in TEXT
    # mode: print() writes str, which a 'wb' handle rejects with
    # TypeError.
    with open('bucket_response', 'w') as debug_file:
        print(json.dumps(response, indent=2), file=debug_file)

    reports_found = response['hits']['total']

    # Since no reports were found, assume the bucket does not exist (at
    # least for this project).
    if reports_found < 1:
        raise BucketNotFoundError(bucket_id)

    reports = get_reports_by_bucket(response, threshold).get(bucket_id)
    assert reports

    return Bucket(id=bucket_id, project=project, threshold=threshold,
                  total=reports_found, top_reports=reports,
                  first_seen=None)
def __init__(self, search, result, from_=None, size=None):
    """
    Wrap a search result and attach a paginated bucket search at the
    threshold this object represents.
    """
    super(ReportThreshold, self).__init__(result)
    # Narrow the incoming search to this object's threshold first...
    thresholded = BucketSearch(search=search, threshold=Threshold(self))
    # ...then wrap it again with pagination.
    # NOTE(review): the double BucketSearch wrapping is preserved from
    # the original; looks intentional but worth confirming.
    self.buckets = BucketSearch(search=thresholded, from_=from_, size=size)