示例#1
0
    def get_bucket(self, threshold, bucket_id,
                   project=None, from_=None, size=None):
        """
        Returns information for the given bucket.

        Searches ``self.es_index`` for reports whose
        ``buckets.<threshold>`` field equals ``bucket_id``, sorted by date
        in descending order.

        :param threshold: value coerced with ``Threshold(...)``.
        :param bucket_id: ID of the bucket to look up.
        :param project: stored on the returned Bucket; it is NOT used to
            filter the query.
        :param from_: optional offset of the first report to return.
        :param size: optional maximum number of reports to return.
        :returns: a Bucket whose ``total`` is the number of hits and whose
            ``top_reports`` are the reports found for ``bucket_id``.
        :raises BucketNotFoundError: when no report matches the bucket.
        """
        # Coerce to a Threshold object.
        threshold = Threshold(threshold)

        query = {
            "query": {"constant_score": {
                "filter": {
                    "term": {
                        "buckets." + threshold.to_elasticsearch(): bucket_id
                    }
                }
            }},
            "sort": {"date": {"order": "desc"}},
        }

        # Only send the paging parameters that were actually supplied, so a
        # missing size is never serialized as null and a size given without
        # an offset is still honoured.
        if from_ is not None:
            query["from"] = from_
        if size is not None:
            query["size"] = size

        response = self.es.search(body=query, index=self.es_index)
        # Debugging aid: dumps the raw response to 'bucket_response' in the
        # current working directory on every call. Opened in text mode
        # because print() writes text, not bytes.
        with open('bucket_response', 'w') as debug_file:
            print(json.dumps(response, indent=2), file=debug_file)

        reports_found = response['hits']['total']

        # Since no reports were found, assume the bucket does not exist.
        if reports_found < 1:
            raise BucketNotFoundError(bucket_id)

        reports = get_reports_by_bucket(response, threshold).get(bucket_id)
        assert reports

        return Bucket(id=bucket_id,
                      project=project,
                      threshold=threshold,
                      total=reports_found,
                      top_reports=reports,
                      first_seen=None)
示例#2
0
 def __init__(self, config_file=None):
     """
     Builds the configuration, the ElasticSearch store and index, the
     tokenization and the bucketing strategy, then caches the settings
     needed for searching.

     config_file is handed to Config() unchanged.
     """
     self.config = Config(config_file)
     cfg = self.config

     self.thresholds = [Threshold(raw) for raw in cfg.Bucketing.thresholds]
     self.es_store = ESStore(cfg.ElasticSearch)

     # The strategy and tokenization classes are looked up by the names
     # given in the configuration.
     self.strategy_class = locate(cfg.Bucketing.Strategy.strategy)
     self.tokenization_class = locate(
         cfg.Bucketing.Tokenization.tokenization)
     self.tokenization = self.tokenization_class(
         cfg.Bucketing.Tokenization)

     self.index = ESIndex(self.es_store, cfg, self.tokenization,
                          self.thresholds)
     self.index.ensure_index_exists()
     self.strategy = self.strategy_class(
         config=cfg.Bucketing.Strategy,
         index=self.index,
     )

     # Pull configuration details needed for search and fix them up:
     # configured summary fields get a ".whole" suffix, and the project
     # and type fields are always present.
     summary_fields = {
         name + ".whole": label
         for name, label in cfg.UserInterface.fixed_summary_fields.items()
     }
     summary_fields["project"] = "Project"
     summary_fields["type"] = "Type"
     self.fixed_summary_fields = summary_fields

     self.default_threshold = Threshold(cfg.Bucketing.default_threshold)
     self.search = self.index.search
     self.allow_delete_all = cfg.ElasticSearch.allow_delete_all
示例#3
0
 def bucket_search(self, threshold, from_=None, size=None, **kwargs):
     """
     Factory for groups of report buckets.

     The threshold is coerced to a Threshold and must be one of
     self.context.thresholds. Any extra keyword arguments are forwarded
     to BucketSearch.
     """
     # NOTE(review): from_ and size are accepted but not forwarded to
     # BucketSearch here -- confirm whether that is intentional.
     wanted = Threshold(threshold)
     assert wanted in self.context.thresholds, wanted
     return BucketSearch(context=self.context, threshold=wanted, **kwargs)
示例#4
0
 def default_threshold(self):
     """
     Default threshold to use if none is provided.

     Read from the 'default_threshold' option of the
     'partycrasher.bucket' configuration section.
     """
     # TODO: determine from static/dynamic configuration
     configured = self.config.get('partycrasher.bucket',
                                  'default_threshold')
     return Threshold(configured)
示例#5
0
 def get_bucket_id(self, threshold):
     """
     Returns the bucket assigned to this object at the given threshold.

     The threshold is coerced to a Threshold and converted with
     to_elasticsearch() to obtain the lookup key inside self['buckets'].

     Raises Exception when there is no 'buckets' entry at all, or when
     no bucket is stored under that key.
     """
     field = Threshold(threshold).to_elasticsearch()

     try:
         assigned = self['buckets']
     except KeyError:
         raise Exception('No assigned buckets for: {!r}'.format(self))

     try:
         bucket_id = assigned[field]
     except KeyError:
         message = ('Buckets threshold {} not assigned for: '
                    '{!r}').format(field, self)
         raise Exception(message)
     return bucket_id
示例#6
0
 def __setitem__(self, k, v):
     """
     Stores v under k in the backing self._od mapping, coercing both.

     The special key 'top_match' holds a TopMatch (or None). Every other
     key is coerced to a Threshold and its value to a Bucket (or None);
     a non-None bucket must carry that same threshold.
     """
     if k == 'top_match':
         # Wrap anything that is neither a TopMatch nor None.
         if not isinstance(v, TopMatch) and v is not None:
             v = TopMatch(v)
         return self._od.__setitem__(k, v)

     if not isinstance(k, Threshold):
         k = Threshold(k)
     if not isinstance(v, Bucket) and v is not None:
         v = Bucket(v)
     if v is not None:
         # A bucket may only be filed under its own threshold.
         assert v['threshold'] == k
     return self._od.__setitem__(k, v)
示例#7
0
def compute_metrics(date_range_start, rest_service_url):
    """
    Runs compute_metrics_threshold() for every threshold of the first crash.

    Crashes are fetched through a RestClient for rest_service_url by
    calling get_a_bunch_of_crashes(date_range_start, 500), and their
    similarities are computed once up front. The keys of the first
    crash's 'buckets' mapping are visited in sorted order; keys that
    Threshold() rejects are skipped.
    """
    client = RestClient(rest_service_url)
    crashes = client.get_a_bunch_of_crashes(date_range_start, 500)
    similaritys = get_similaritys(crashes, client)
    for raw_key in sorted(crashes[0]['buckets']):
        try:
            threshold = Threshold(raw_key)
        except Exception:
            # Not every key is a threshold; skip the ones that do not
            # parse. Catching Exception (rather than everything) lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
        compute_metrics_threshold(crashes, threshold, similaritys)
示例#8
0
 def __init__(self, raw_buckets):
     """
     Populates this mapping from a raw {key: value} buckets mapping.

     The raw mapping is kept on self.raw_buckets. The 'top_match' entry
     is stored as a TopMatch (or None when its value is None); every
     other key is turned into a Threshold whose value becomes the id of
     a new Bucket at that threshold.
     """
     super(ESBuckets, self).__init__()
     self.raw_buckets = raw_buckets
     for key, raw in raw_buckets.items():
         if key != 'top_match':
             threshold = Threshold(key)
             self[threshold] = Bucket({'id': raw, 'threshold': threshold})
             continue
         self[key] = None if raw is None else TopMatch(raw)
示例#9
0
    def get_bucket(self, threshold, bucket_id,
                   project=None, from_=None, size=None):
        """
        Returns information for the given bucket.

        Searches the 'crashes' index for reports whose
        ``buckets.<threshold>`` field equals ``bucket_id``, sorted by date
        in descending order.

        :param threshold: value coerced with ``Threshold(...)``.
        :param bucket_id: ID of the bucket to look up.
        :param project: stored on the returned Bucket; it is NOT used to
            filter the query.
        :param from_: optional offset of the first report to return.
        :param size: optional maximum number of reports to return.
        :returns: a Bucket whose ``total`` is the number of hits and whose
            ``top_reports`` are the reports found for ``bucket_id``.
        :raises BucketNotFoundError: when no report matches the bucket.
        """
        # Coerce to a Threshold object.
        threshold = Threshold(threshold)

        query = {
            "filter": {
                "term": {
                    "buckets." + threshold.to_elasticsearch(): bucket_id
                }
            },
            "sort": {"date": {"order": "desc"}}
        }

        # Only send the paging parameters that were actually supplied, so a
        # missing size is never serialized as null and a size given without
        # an offset is still honoured.
        if from_ is not None:
            query["from"] = from_
        if size is not None:
            query["size"] = size

        response = self.es.search(body=query, index='crashes')
        reports_found = response['hits']['total']

        # Since no reports were found, assume the bucket does not exist.
        if reports_found < 1:
            raise BucketNotFoundError(bucket_id)

        reports = get_reports_by_bucket(response, threshold).get(bucket_id)
        assert reports

        return Bucket(id=bucket_id,
                      project=project,
                      threshold=threshold,
                      total=reports_found,
                      top_reports=reports,
                      first_seen=None)
示例#10
0
    def top_buckets(self, lower_bound, threshold=None, project=None,
                    from_=None, size=None):
        """
        Calculates the largest buckets among the reports dated after the
        datetime lower_bound, for the given threshold
        (self.default_threshold when omitted). The results can be
        restricted to a specific project if needed.

        size is the number of buckets wanted (10 when omitted) and from_
        the number of leading buckets to skip.

        Returns a list of Bucket objects in descending order of size, with
        ``total`` set to the report count, ``first_seen`` to the earliest
        matching report date, and ``top_reports`` set to None.

        Raises TypeError when lower_bound is not a datetime.
        """
        if not isinstance(lower_bound, datetime):
            raise TypeError('The lower bound MUST be a datetime object.')

        # Fall back to the default threshold, then coerce.
        if threshold is None:
            threshold = self.default_threshold
        if not isinstance(threshold, Threshold):
            threshold = Threshold(threshold)

        # Always restrict by date; restrict by project only on request.
        must_clauses = [
            {"range": {"date": {"gt": lower_bound.isoformat()}}},
        ]
        if project is not None:
            must_clauses.append({"term": {"project": project}})

        # The query is assembled from the innermost part outwards.
        # Innermost: group reports by bucket, biggest groups first.
        terms = {
            "field": "buckets." + threshold.to_elasticsearch(),
            "order": {"_count": "desc"},
        }
        # Per bucket, also fetch the earliest report date.
        per_bucket = {
            "terms": terms,
            "aggs": {"first_seen": {"min": {"field": "date"}}},
        }
        query = {
            "aggs": {
                "top_buckets_filtered": {
                    "filter": {"bool": {"must": must_clauses}},
                    "aggs": {"top_buckets": per_bucket},
                }
            },
            # Do not send any hits back!
            "size": 0,
        }

        if size is None:
            size = 10

        # The aggregation has no offset of its own, so ask for enough
        # buckets to cover the skipped ones and slice them off afterwards.
        if from_ is None:
            wanted = size
        else:
            assert from_ >= 0
            wanted = size + from_
        assert size >= 0
        terms["size"] = wanted

        try:
            response = self.es.search(body=query, index='crashes')
        except RequestError as err:
            print(err.error, file=sys.stderr)
            raise err

        aggregations = response['aggregations']
        found = aggregations['top_buckets_filtered']['top_buckets']['buckets']
        if from_ is not None:
            found = found[from_:]

        results = []
        for entry in found:
            results.append(Bucket(id=entry['key'],
                                  project=project,
                                  threshold=threshold,
                                  total=entry['doc_count'],
                                  first_seen=entry['first_seen']['value_as_string'],
                                  top_reports=None))
        return results
示例#11
0
    def top_buckets(self,
                    lower_bound,
                    threshold=None,
                    project=None,
                    from_=None,
                    size=None,
                    upper_bound=None,
                    query_string=None):
        """
        Calculates the largest buckets among the reports dated after the
        datetime lower_bound (and before upper_bound, when given), for the
        given threshold (self.default_threshold when omitted). The results
        can be restricted to a specific project and/or to reports matching
        query_string.

        size is the number of buckets wanted (10 when omitted) and from_
        the number of leading buckets to skip.

        Returns a list of Bucket objects in descending order of size, with
        ``total`` set to the report count, ``first_seen`` to the earliest
        matching report date, and ``top_reports`` set to None.

        Raises TypeError when lower_bound is not a datetime.
        """
        if not isinstance(lower_bound, datetime):
            raise TypeError('The lower bound MUST be a datetime object.')

        # Fall back to the default threshold, then coerce.
        if threshold is None:
            threshold = self.default_threshold
        if not isinstance(threshold, Threshold):
            threshold = Threshold(threshold)

        # Always restrict by date; the upper bound is optional.
        date_range = {"gt": lower_bound.isoformat()}
        if upper_bound is not None:
            date_range["lt"] = upper_bound.isoformat()
        must_clauses = [{"range": {"date": date_range}}]

        # May filter optionally by project name.
        if project is not None:
            must_clauses.append({"term": {"project": project}})

        # this doesn't work on ES 2.3!
        if query_string is not None:
            print("Query string!", file=sys.stderr)
            text_query = {
                "query": query_string,
                "default_operator": "AND",
            }
            must_clauses.append({"query": {"query_string": text_query}})

        # The query is assembled from the innermost part outwards.
        # Innermost: group reports by bucket, biggest groups first.
        terms = {
            "field": "buckets." + threshold.to_elasticsearch(),
            "order": {"_count": "desc"},
        }
        # Per bucket, also fetch the earliest report date.
        per_bucket = {
            "terms": terms,
            "aggs": {"first_seen": {"min": {"field": "date"}}},
        }
        query = {
            "aggs": {
                "top_buckets_filtered": {
                    "filter": {"bool": {"must": must_clauses}},
                    "aggs": {"top_buckets": per_bucket},
                }
            },
            # Do not send any hits back!
            "size": 0,
        }

        if size is None:
            size = 10

        # The aggregation has no offset of its own, so ask for enough
        # buckets to cover the skipped ones and slice them off afterwards.
        if from_ is None:
            wanted = size
        else:
            assert from_ >= 0
            wanted = size + from_
        assert size >= 0
        terms["size"] = wanted

        try:
            response = self.es.search(body=query, index=self.es_index)
        except RequestError as err:
            print(err.error, file=sys.stderr)
            raise err

        aggregations = response['aggregations']
        found = aggregations['top_buckets_filtered']['top_buckets']['buckets']
        if from_ is not None:
            found = found[from_:]

        results = []
        for entry in found:
            results.append(Bucket(id=entry['key'],
                                  project=project,
                                  threshold=threshold,
                                  total=entry['doc_count'],
                                  first_seen=entry['first_seen']['value_as_string'],
                                  top_reports=None))
        return results
示例#12
0
    def get_bucket(self,
                   threshold,
                   bucket_id,
                   project=None,
                   from_=None,
                   size=None):
        """
        Returns information for the given bucket.

        Searches ``self.es_index`` for reports whose
        ``buckets.<threshold>`` field equals ``bucket_id``, sorted by date
        in descending order.

        :param threshold: value coerced with ``Threshold(...)``.
        :param bucket_id: ID of the bucket to look up.
        :param project: stored on the returned Bucket; it is NOT used to
            filter the query.
        :param from_: optional offset of the first report to return.
        :param size: optional maximum number of reports to return.
        :returns: a Bucket whose ``total`` is the number of hits and whose
            ``top_reports`` are the reports found for ``bucket_id``.
        :raises BucketNotFoundError: when no report matches the bucket.
        """
        # Coerce to a Threshold object.
        threshold = Threshold(threshold)

        query = {
            "query": {
                "constant_score": {
                    "filter": {
                        "term": {
                            "buckets." + threshold.to_elasticsearch():
                            bucket_id
                        }
                    }
                }
            },
            "sort": {
                "date": {
                    "order": "desc"
                }
            },
        }

        # Only send the paging parameters that were actually supplied, so a
        # missing size is never serialized as null and a size given without
        # an offset is still honoured.
        if from_ is not None:
            query["from"] = from_
        if size is not None:
            query["size"] = size

        response = self.es.search(body=query, index=self.es_index)
        # Debugging aid: dumps the raw response to 'bucket_response' in the
        # current working directory on every call. Opened in text mode
        # because print() writes text, not bytes.
        with open('bucket_response', 'w') as debug_file:
            print(json.dumps(response, indent=2), file=debug_file)

        reports_found = response['hits']['total']

        # Since no reports were found, assume the bucket does not exist.
        if reports_found < 1:
            raise BucketNotFoundError(bucket_id)

        reports = get_reports_by_bucket(response, threshold).get(bucket_id)
        assert reports

        return Bucket(id=bucket_id,
                      project=project,
                      threshold=threshold,
                      total=reports_found,
                      top_reports=reports,
                      first_seen=None)
 def __init__(self, search, result, from_=None, size=None):
     """
     Initializes the base class from result and attaches a paged bucket
     search to self.buckets.

     The given search is first narrowed to this object's own threshold
     (Threshold(self)); from_ and size are then applied on top of that
     narrowed search.
     """
     super(ReportThreshold, self).__init__(result)
     own_threshold = Threshold(self)
     narrowed = BucketSearch(search=search, threshold=own_threshold)
     self.buckets = BucketSearch(search=narrowed, from_=from_, size=size)