def get(self, **kwargs):
    """Return an empty result set unless the product is rejected.

    The keyword arguments are handed to ``self.parse_parameters``.

    Raises:
        BadArgumentError: if the parsed ``product`` equals "bad".
    """
    parsed = self.parse_parameters(kwargs)
    product_rejected = parsed["product"] == "bad"
    if product_rejected:
        raise BadArgumentError("Bad product")

    empty_result = {"hits": [], "total": 0}
    return empty_result
def get(self, **kwargs):
    """Return the data of a crash report, given its uuid.

    Keyword arguments ``uuid`` and ``datatype`` are required; ``name`` is
    only passed on when the datatype is 'raw'.

    Raises:
        MissingArgumentError: if ``uuid`` or ``datatype`` is not given.
        BadArgumentError: if ``datatype`` is not one of the known types.
        CrashIDNotFound: if the storage cannot find the crash.
    """
    filters = [
        ('uuid', None, str),
        ('datatype', None, str),
        ('name', None, str),  # only applicable if datatype == 'raw'
    ]
    params = external_common.parse_arguments(filters, kwargs, modern=True)

    if not params.uuid:
        raise MissingArgumentError('uuid')
    if not params.datatype:
        raise MissingArgumentError('datatype')

    # Maps each supported datatype to the method that fetches it.
    fetcher_names = {
        'raw': 'get_raw_dump',
        'meta': 'get_raw_crash',
        'processed': 'get_processed',
        'unredacted': 'get_unredacted_processed',
    }
    fetcher_name = fetcher_names.get(params.datatype)
    if fetcher_name is None:
        raise BadArgumentError(params.datatype)

    fetch = getattr(self, fetcher_name)
    try:
        if params.datatype != 'raw':
            return fetch(params.uuid)
        return fetch(params.uuid, name=params.name)
    except CrashIDNotFound:
        # The CrashIDNotFound exception that happens inside the
        # crashstorage is too revealing as exception message
        # contains information about buckets and prefix keys.
        # Re-wrap it here so the message is just the crash ID.
        raise CrashIDNotFound(params.uuid)
def test_mapping(self, mapping):
    """Verify that a mapping is correct.

    A temporary index is created in elasticsearch with the given mapping,
    then a sample of crash reports from the last week's indices is inserted
    into it. Any elasticsearch failure along the way means the mapping is
    either invalid or incompatible with the data currently stored, and is
    reported as a ``BadArgumentError`` on 'storage_mapping'. If nothing is
    raised, the mapping is likely correct.

    The temporary index is always deleted afterwards.

    This function is to be used in any place that can change the
    `storage_mapping` field in any Super Search Field.
    """
    scratch_index = 'socorro_mapping_test'

    connection = self.get_connection()
    creator = self.config.index_creator_class(self.config)

    try:
        settings = creator.get_socorro_index_settings(mapping)
        creator.create_index(scratch_index, settings)

        window_end = datetimeutil.utc_now()
        window_start = window_end - datetime.timedelta(days=7)
        recent_indices = self.generate_list_of_indexes(
            window_start, window_end
        )

        sample = connection.search(
            index=recent_indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
            size=self.config.elasticsearch.mapping_test_crash_number,
        )
        documents = [hit['_source'] for hit in sample['hits']['hits']]

        for document in documents:
            connection.index(
                index=scratch_index,
                doc_type=self.config.elasticsearch.elasticsearch_doctype,
                body=document,
            )
    except elasticsearch.exceptions.ElasticsearchException as e:
        reason = (
            'Indexing existing data in Elasticsearch failed with the '
            'new mapping. Error is: %s' % str(e)
        )
        raise BadArgumentError('storage_mapping', msg=reason)
    finally:
        try:
            creator.get_index_client().delete(scratch_index)
        except elasticsearch.exceptions.NotFoundError:
            # If the index does not exist (if the index creation failed
            # for example), we don't need to do anything.
            pass
def get(self, **kwargs):
    """Return the hits of the requested correlation report.

    The ``report_type`` argument selects which report method is run; the
    parsed parameters are passed to that method unchanged.

    Returns:
        A dict with the report's ``hits`` and their count as ``total``.

    Raises:
        BadArgumentError: if ``report_type`` is not a supported report.
    """
    filters = [
        ("report_date", None, "datetime"),
        ("report_type", None, "str"),
        ("product", None, "str"),
        ("version", None, "str"),
        ("signature", None, "str"),
        ("platform", None, "str"),
        ("min_crashes", 10, "int"),
        ("min_baseline_diff", 0.05, "float"),
    ]
    params = external_common.parse_arguments(filters, kwargs)

    # Supported report types and the method implementing each of them.
    report_methods = {
        'interesting-addons': 'interesting_addons',
        'interesting-modules': 'interesting_modules',
        'interesting-addons-with-version': 'interesting_addons_with_version',
        'interesting-modules-with-version':
            'interesting_modules_with_version',
        'core-counts': 'core_counts',
    }

    report_type = params['report_type']
    method_name = report_methods.get(report_type)
    if method_name is None:
        # The original referenced an undefined local here, which turned an
        # unknown report type into a NameError instead of a bad argument.
        raise BadArgumentError(
            'report_type',
            received=report_type
        )

    hits = getattr(self, method_name)(params)

    return {
        'hits': hits,
        'total': len(hits)
    }
def get(self, *args, **kwargs):
    """Return the signatures associated with the given bug ids.

    Returns:
        A dict with ``hits`` (one ``{'id', 'signature'}`` dict per
        association found) and ``total`` (the number of hits).

    Raises:
        BadArgumentError: if any of the ``bug_ids`` is not made of digits.
    """
    params = self.parse_parameters(kwargs)

    # Make sure bug_ids is a list of numbers and if not, raise
    # an error
    digit_checks = [bug_id.isdigit() for bug_id in params['bug_ids']]
    if not all(digit_checks):
        raise BadArgumentError('bug_ids')

    associations = BugAssociation.objects.filter(
        bug_id__in=params['bug_ids']
    )
    rows = list(associations.values('bug_id', 'signature'))

    hits = []
    for row in rows:
        hits.append({
            'id': int(row['bug_id']),
            'signature': row['signature']
        })

    return {
        'hits': hits,
        'total': len(hits)
    }
def get(self, **kwargs):
    """Return an empty result set, refusing the product called 'bad'.

    Raises:
        BadArgumentError: if the parsed ``product`` equals 'bad'.
    """
    params = self.parse_parameters(kwargs)
    if params['product'] != 'bad':
        return {'hits': [], 'total': 0}
    raise BadArgumentError('Bad product')
def get_signatures(self, **kwargs):
    """Return top crashers by signatures.

    See https://socorro.readthedocs.io/en/latest/middleware.html#tcbs

    Raises:
        BadArgumentError: if ``duration`` is longer than 30 days.
    """
    default_to_date = datetimeutil.utc_now()
    default_duration = datetime.timedelta(7)
    filters = [
        ("product", None, "str"),
        ("version", None, "str"),
        ("crash_type", "all", "str"),
        ("to_date", default_to_date, "datetime"),
        ("duration", default_duration, "timedelta"),
        ("os", None, "str"),
        ("limit", 100, "int"),
        ("date_range_type", None, "str")
    ]

    params = external_common.parse_arguments(filters, kwargs)
    params.logger = logger

    # twoPeriodTopCrasherComparison() derives its start date from
    # to_date - duration, so the duration is what bounds the query.
    longest_allowed = datetime.timedelta(30)
    if params.duration > longest_allowed:
        raise BadArgumentError('Duration too long. Max 30 days.')

    with self.get_connection() as connection:
        comparison = tcbs.twoPeriodTopCrasherComparison(connection, params)
    return comparison
def get(self, **kwargs):
    """Return the data of a crash report, given its uuid.

    Keyword arguments ``uuid`` and ``datatype`` are required; ``name`` is
    only passed on when the datatype is "raw".

    Raises:
        MissingArgumentError: if ``uuid`` or ``datatype`` is not given.
        BadArgumentError: if ``uuid`` is not a valid crash id or
            ``datatype`` is not one of the known types.
        CrashIDNotFound: if the storage cannot find the crash.
    """
    filters = [
        ("uuid", None, str),
        ("datatype", None, str),
        ("name", None, str),  # only applicable if datatype == 'raw'
    ]
    params = external_common.parse_arguments(filters, kwargs, modern=True)

    if not params.uuid:
        raise MissingArgumentError("uuid")
    if not ooid.is_crash_id_valid(params.uuid):
        raise BadArgumentError("uuid")
    if not params.datatype:
        raise MissingArgumentError("datatype")

    # Maps each supported datatype to the method that fetches it.
    fetcher_names = {
        "raw": "get_raw_dump",
        "meta": "get_raw_crash",
        "processed": "get_processed",
        "unredacted": "get_unredacted_processed",
    }
    fetcher_name = fetcher_names.get(params.datatype)
    if fetcher_name is None:
        raise BadArgumentError(params.datatype)

    fetch = getattr(self, fetcher_name)
    try:
        if params.datatype != "raw":
            return fetch(params.uuid)
        return fetch(params.uuid, name=params.name)
    except CrashIDNotFound as cidnf:
        log_context = {"datatype": params.datatype, "exception": cidnf}
        self.logger.warning(
            "%(datatype)s not found: %(exception)s", log_context
        )
        # The CrashIDNotFound exception that happens inside the
        # crashstorage is too revealing as exception message
        # contains information about buckets and prefix keys.
        # Re-wrap it here so the message is just the crash ID.
        raise CrashIDNotFound(params.uuid)
def mocked_get(**options):
    """Record the call in ``gets`` and return canned per-channel hits.

    Raises:
        BadArgumentError: if the ``product`` option is the string '400'.
    """
    gets.append(options)

    requested_product = options.get('product')
    if requested_product == '400':
        raise BadArgumentError('product')

    canned_hits = {
        'release': 0.1,
        'beta': 1.0,
    }
    return {'hits': canned_hits}
def get_field_name(self, value, full=True):
    """Return the full field name for the field known as ``value``.

    Args:
        value: key of the field in ``self.all_fields``.
        full: when true and the field has a full version, the ".full"
            variant of the name is returned.

    Raises:
        BadArgumentError: if the field is unknown or is not allowed to
            be returned.
    """
    try:
        field_info = self.all_fields[value]
    except KeyError:
        raise BadArgumentError(value, msg='Unknown field "%s"' % value)

    returnable = field_info["is_returned"]
    if not returnable:
        # Returning this field is not allowed.
        raise BadArgumentError(
            value, msg='Field "%s" is not allowed to be returned' % value)

    base_name = self.get_full_field_name(field_info)

    # If the param has a full version, that means what matters
    # is the full string, and not its individual terms.
    use_full_variant = full and field_info["has_full_version"]
    return base_name + ".full" if use_full_variant else base_name
def post(self, **data):
    """Publish the given crash ids to the "reprocessing" queue.

    ``crash_ids`` may be a single id or a list/tuple of ids.

    Raises:
        BadArgumentError: if any of the ids is not a valid crash id.
    """
    submitted = data["crash_ids"]
    crash_ids = submitted if isinstance(submitted, (list, tuple)) else [submitted]

    # If one of them isn't a crash id, raise a 400.
    for candidate in crash_ids:
        if is_crash_id_valid(candidate):
            continue
        raise BadArgumentError("Crash id '%s' is not valid." % candidate)

    publisher = self.get_implementation()
    return publisher.publish(queue="reprocessing", crash_ids=crash_ids)
def get(self, **kwargs):
    """Return a list of ADUs and crash counts by signature and ADU date.

    ``start_date`` defaults to one week ago and ``end_date`` to today.

    Returns:
        A dict with the matching rows as ``hits`` and their count as
        ``total``.

    Raises:
        MissingArgumentError: if ``start_date``, ``end_date``,
            ``signature`` or ``channel`` is empty.
        BadArgumentError: if the date range spans more than 365 days.
    """
    today = datetimeutil.utc_now().date()
    a_week_ago = today - datetime.timedelta(weeks=1)

    filters = [
        ("start_date", a_week_ago, "date"),
        ("end_date", today, "date"),
        ("signature", None, "str"),
        ("channel", None, "str"),
        ("product_name", None, "str"),
    ]
    params = external_common.parse_arguments(filters, kwargs)

    # NOTE(review): product_name is used in the WHERE clause below but is
    # not part of this mandatory list - confirm whether that is intended.
    mandatory = ("start_date", "end_date", "signature", "channel")
    for name in mandatory:
        if not params[name]:
            raise MissingArgumentError(name)

    requested_span = params.end_date - params.start_date
    if requested_span > datetime.timedelta(days=365):
        raise BadArgumentError('Duration too long. Max 365 days.')

    sql_query = (
        " SELECT product_name, signature, adu_date::TEXT, "
        "build_date::TEXT, buildid::TEXT, crash_count, adu_count, "
        "os_name, channel "
        "FROM crash_adu_by_build_signature "
        "WHERE adu_date BETWEEN %(start_date)s AND %(end_date)s "
        "AND product_name = %(product_name)s "
        "AND channel = %(channel)s "
        "AND signature = %(signature)s "
        "ORDER BY buildid "
    )
    error_message = (
        "Failed to retrieve crash ADU by build signature from PostgreSQL"
    )

    rows = self.query(sql_query, params, error_message=error_message)
    crashes = rows.zipped()

    return {"hits": crashes, "total": len(crashes)}
def get(self, **kwargs):
    """Fetch a crash through the parent ``get`` and check the result type.

    The ``format`` argument defaults to 'meta'; the legacy value
    'raw_crash' is rewritten to 'raw' before the parent is called.

    Raises:
        BadArgumentError: if a non-raw format did not produce a dict.
    """
    requested_format = kwargs.get('format', 'meta')
    if requested_format == 'raw_crash':
        # legacy
        requested_format = 'raw'
        kwargs['format'] = requested_format

    result = super(RawCrash, self).get(**kwargs)

    # This 'result', will either be a binary blob or a python dict.
    # Unless kwargs['format']==raw, this has to be a python dict.
    if requested_format == 'raw' or isinstance(result, dict):
        return result
    raise BadArgumentError('format')
def get(self, **kwargs):
    """Fetch a crash through the parent ``get`` and check the result type.

    The ``format`` argument defaults to "meta"; the legacy value
    "raw_crash" is rewritten to "raw" before the parent is called.

    Raises:
        BadArgumentError: if a non-raw format did not produce a dict.
    """
    wanted = kwargs.get("format", "meta")
    if wanted == "raw_crash":
        # legacy
        wanted = "raw"
        kwargs["format"] = wanted

    payload = super().get(**kwargs)

    # This payload will either be a binary blob or a python dict.
    # Unless kwargs['format']==raw, this has to be a python dict.
    must_be_dict = wanted != "raw"
    if must_be_dict and not isinstance(payload, dict):
        raise BadArgumentError("format")
    return payload
def get(self, *args, **kwargs):
    """Return the signatures associated with the given bug ids.

    Returns:
        A dict with ``hits`` (one ``{"id", "signature"}`` dict per
        association found) and ``total`` (the number of hits).

    Raises:
        BadArgumentError: if any of the ``bug_ids`` is not made of digits.
    """
    params = self.parse_parameters(kwargs)

    # Make sure bug_ids is a list of numbers and if not, raise
    # an error
    only_digits = all([bug_id.isdigit() for bug_id in params["bug_ids"]])
    if not only_digits:
        raise BadArgumentError("bug_ids")

    queryset = BugAssociation.objects.filter(bug_id__in=params["bug_ids"])
    associations = list(queryset.values("bug_id", "signature"))

    hits = [
        {"id": int(association["bug_id"]),
         "signature": association["signature"]}
        for association in associations
    ]
    total = len(hits)
    return {"hits": hits, "total": total}
def get(self, **kwargs):
    """Return a list of signatures-to-bug_ids or bug_ids-to-signatures
    associations.

    Exactly one of ``signatures`` and ``bug_ids`` must be given.

    Raises:
        MissingArgumentError: if neither argument is given.
        BadArgumentError: if both arguments are given.
    """
    params = external_common.parse_arguments(
        self.filters, kwargs, modern=True
    )

    by_signature = bool(params['signatures'])
    by_bug_id = bool(params['bug_ids'])

    if not by_signature and not by_bug_id:
        raise MissingArgumentError('specify one of signatures or bug_ids')
    if by_signature and by_bug_id:
        raise BadArgumentError('specify only one of signatures or bug_ids')

    # Past the checks above exactly one of the two lookups is requested.
    if by_signature:
        sql_params = [tuple(params.signatures)]
        sql = (
            "/* socorro.external.postgresql.bugs.Bugs.get */ "
            "SELECT ba.signature, bugs.id "
            "FROM bugs "
            "JOIN bug_associations AS ba ON bugs.id = ba.bug_id "
            "WHERE EXISTS( "
            "SELECT 1 FROM bug_associations "
            "WHERE bug_associations.bug_id = bugs.id "
            "AND signature IN %s "
            ") "
        )
    else:
        sql_params = [tuple(params.bug_ids)]
        sql = (
            "/* socorro.external.postgresql.bugs.Bugs.get */ "
            "SELECT ba.signature, bugs.id "
            "FROM bugs "
            "JOIN bug_associations AS ba ON bugs.id = ba.bug_id "
            "WHERE bugs.id IN %s "
        )

    error_message = "Failed to retrieve bug associations from PostgreSQL"
    rows = self.query(sql, sql_params, error_message=error_message)

    bugs = rows.zipped()
    return {"hits": bugs, "total": len(bugs)}
def post(self, **kwargs):
    '''Return the result of a custom query.

    The ``query`` argument is a JSON document passed to elasticsearch as
    the search body. ``indices`` selects where to search: when empty the
    indices covering the last seven days are used, the single value 'ALL'
    searches without an index restriction, and anything else is used as
    given.

    Returns:
        Whatever the elasticsearch connection's ``search`` call returns.

    Raises:
        MissingArgumentError: if ``query`` is empty.
        BadArgumentError: if ``query`` is not valid JSON.
        ResourceNotFound: if elasticsearch reports a missing index.
    '''
    params = external_common.parse_arguments(self.filters, kwargs)

    if not params.query:
        raise MissingArgumentError('query')

    try:
        query = json.loads(params.query)
    except ValueError:
        raise BadArgumentError(
            'query',
            msg="Invalid JSON value for parameter 'query'"
        )

    # Set indices.
    indices = []
    if not params.indices:
        # By default, use the last two indices.
        today = datetimeutil.utc_now()
        last_week = today - datetime.timedelta(days=7)
        indices = self.generate_list_of_indexes(last_week, today)
    elif len(params.indices) == 1 and params.indices[0] == 'ALL':
        # If we want all indices, just do nothing.
        pass
    else:
        indices = params.indices

    search_args = {}
    if indices:
        search_args['index'] = indices
        search_args['doc_type'] = (
            self.config.elasticsearch.elasticsearch_doctype
        )

    connection = self.get_connection()

    try:
        results = connection.search(body=query, **search_args)
    # "except X as e" is accepted by Python 2.6+ and Python 3, unlike the
    # comma form used before.
    except elasticsearch.exceptions.NotFoundError as e:
        missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
        raise ResourceNotFound(
            "elasticsearch index '%s' does not exist" % missing_index
        )

    # The result used to be computed and then dropped, so callers always
    # received None despite the documented contract.
    return results
def get(self, **kwargs):
    """Run the ``backfill_<backfill_type>`` SQL function and return its result.

    The parameters each backfill type needs are looked up in
    ``BACKFILL_PARAMETERS`` and passed to the SQL function as named
    placeholders.

    Raises:
        MissingArgumentError: if ``backfill_type`` is empty.
        BadArgumentError: if ``backfill_type`` is not a known backfill or
            one of its required parameters cannot be looked up.
    """
    filters = [
        ("backfill_type", None, "str"),
        ("reports_clean", True, "bool"),
        ("check_period", '01:00:00', "str"),
        ("table_name", None, "str"),
        ("update_day", None, "datetime"),
        ("start_date", None, "datetime"),
        ("end_date", None, "datetime"),
    ]
    params = external_common.parse_arguments(filters, kwargs)

    if not params.backfill_type:
        raise MissingArgumentError('backfill_type')

    # Date arguments that were explicitly passed are reduced to their
    # date part, as a string.
    for date_param in ('update_day', 'start_date', 'end_date'):
        if date_param in kwargs:
            params[date_param] = str(params[date_param].date())

    try:
        # The lookup doubles as a whitelist: only known backfill types
        # ever reach the string-built function name below.
        required_params = BACKFILL_PARAMETERS[params.backfill_type]
        query_params = [(name, params[name]) for name in required_params]
    except (KeyError, AttributeError):
        # Only lookup failures mean "bad argument"; a bare except here
        # used to also swallow KeyboardInterrupt and programming errors.
        raise BadArgumentError(kwargs['backfill_type'])

    query_params_str = ', '.join(
        '%(' + str(name) + ')s' for name, _ in query_params
    )
    query = 'SELECT backfill_%(backfill_type)s (%(params)s); ' % {
        'backfill_type': params.backfill_type,
        'params': query_params_str
    }

    error_message = "Failed to retrieve backfill %s from PostgreSQL"
    error_message = error_message % kwargs['backfill_type']
    results = self.query(query, params, error_message=error_message)
    return results
def get(self, **kwargs):
    """Log the name of the running class, then always fail.

    Raises:
        BadArgumentError: unconditionally, with the message 'bad arg'.
    """
    class_name = self.__class__.__name__
    log_line = 'Running %s' % class_name
    self.context.logger.info(log_line)

    raise BadArgumentError('bad arg')
def mocked_supersearch_get(**params):
    """Always fail with a bad-argument error whose message contains markup.

    Raises:
        BadArgumentError: unconditionally, with the text '<script>xss'.
    """
    markup_message = '<script>xss'
    raise BadArgumentError(markup_message)
def get_list(self, **kwargs):
    """
    List all crashes with a given signature and return them.

    Both `from_date` and `to_date` (and their aliases `from` and `to`)
    are required and can not be greater than 30 days apart.

    Optional arguments: see SearchCommon.get_parameters()

    Other arguments read here: `sort` (a column from the recognized list
    below), `reverse` (the string 'true' flips an explicit sort to
    descending) and `include_raw_crash`.

    Returns a dict with the crashes found as "hits" and the result of the
    count query as "total".

    Raises MissingArgumentError when `from_date`, `to_date` or `signature`
    is missing, and BadArgumentError when the date span exceeds 30 days or
    the sort key is not recognized.
    """
    # aliases
    if "from" in kwargs and "from_date" not in kwargs:
        kwargs["from_date"] = kwargs.get("from")
    if "to" in kwargs and "to_date" not in kwargs:
        kwargs["to_date"] = kwargs.get("to")

    if not kwargs.get('from_date'):
        raise MissingArgumentError('from_date')
    if not kwargs.get('to_date'):
        raise MissingArgumentError('to_date')

    from_date = datetimeutil.datetimeFromISOdateString(kwargs['from_date'])
    to_date = datetimeutil.datetimeFromISOdateString(kwargs['to_date'])
    span_days = (to_date - from_date).days
    if span_days > 30:
        raise BadArgumentError(
            'Span between from_date and to_date can not be more than 30'
        )

    # start with the default
    sort_order = {
        'key': 'date_processed',
        'direction': 'DESC'
    }
    if 'sort' in kwargs:
        sort_order['key'] = kwargs.pop('sort')
        # The sort key ends up interpolated into the ORDER BY clause, so it
        # is only accepted when it is one of these known column names.
        _recognized_sort_orders = (
            'date_processed',
            'uptime',
            'user_comments',
            'uuid',
            'uuid_text',
            'product',
            'version',
            'build',
            'signature',
            'url',
            'os_name',
            'os_version',
            'cpu_name',
            'cpu_info',
            'address',
            'reason',
            'last_crash',
            'install_age',
            'hangid',
            'process_type',
            'release_channel',
            'install_time',
            'duplicate_of',
        )
        if sort_order['key'] not in _recognized_sort_orders:
            raise BadArgumentError(
                '%s is not a recognized sort order key' % sort_order['key']
            )
        # An explicit sort is ascending unless `reverse` is the string
        # 'true' (case-insensitive).
        sort_order['direction'] = 'ASC'
        if str(kwargs.get('reverse', '')).lower() == 'true':
            if kwargs.pop('reverse'):
                sort_order['direction'] = 'DESC'

    include_raw_crash = kwargs.get('include_raw_crash') or False
    params = search_common.get_parameters(kwargs)

    if not params["signature"]:
        raise MissingArgumentError('signature')

    params["terms"] = params["signature"]
    params["search_mode"] = "is_exactly"

    # Default mode falls back to starts_with for postgres
    if params["plugin_search_mode"] == "default":
        params["plugin_search_mode"] = "starts_with"

    # Limiting to a signature
    if params["terms"]:
        params["terms"] = self.prepare_terms(params["terms"],
                                             params["search_mode"])

    # Searching for terms in plugins
    if params["report_process"] == "plugin" and params["plugin_terms"]:
        params["plugin_terms"] = " ".join(params["plugin_terms"])
        params["plugin_terms"] = self.prepare_terms(
            params["plugin_terms"],
            params["plugin_search_mode"]
        )

    # Get information about the versions
    util_service = Util(config=self.context)
    params["versions_info"] = util_service.versions_info(**params)

    # Parsing the versions
    params["versions_string"] = params["versions"]
    (params["versions"], params["products"]) = self.parse_versions(
        params["versions"],
        params["products"]
    )

    if hasattr(self.context, 'webapi'):
        context = self.context.webapi
    else:
        # old middleware
        context = self.context
    # Changing the OS ids to OS names
    # (a platform matches when the first three characters of its id equal
    # the first three characters of the requested value)
    for i, elem in enumerate(params["os"]):
        for platform in context.platforms:
            if platform["id"][:3] == elem[:3]:
                params["os"][i] = platform["name"]

    # Creating the parameters for the sql query
    sql_params = {
    }

    # Preparing the different parts of the sql query
    sql_select = """ SELECT r.date_processed, r.uptime, r.user_comments, r.uuid::uuid, r.uuid as uuid_text, r.product, r.version, r.build, r.signature, r.url, r.os_name, r.os_version, r.cpu_name, r.cpu_info, r.address, r.reason, r.last_crash, r.install_age, r.hangid, r.process_type, r.release_channel, (r.client_crash_date - (r.install_age * INTERVAL '1 second')) AS install_time """
    if include_raw_crash:
        # With raw crashes, duplicate_of is added by wrapped_select below.
        pass
    else:
        sql_select += """ , rd.duplicate_of """

    # Wraps the main query (substituted for %s) to join in duplicates and
    # raw crashes; the doubled %% keeps the date placeholders intact
    # through that substitution.
    wrapped_select = """ WITH report_slice AS ( %s ), dupes AS ( SELECT report_slice.uuid, rd.duplicate_of FROM reports_duplicates rd JOIN report_slice ON report_slice.uuid_text = rd.uuid WHERE rd.date_processed BETWEEN %%(from_date)s AND %%(to_date)s ) SELECT rs.*, dupes.duplicate_of, rc.raw_crash FROM report_slice rs LEFT OUTER JOIN dupes USING (uuid) LEFT OUTER JOIN raw_crashes rc ON rs.uuid = 
rc.uuid AND rc.date_processed BETWEEN %%(from_date)s AND %%(to_date)s """

    sql_from = self.build_reports_sql_from(params)

    if not include_raw_crash:
        sql_from = """%s LEFT OUTER JOIN reports_duplicates rd ON r.uuid = rd.uuid """ % sql_from

    sql_where, sql_params = self.build_reports_sql_where(
        params,
        sql_params,
        self.context
    )

    sql_order = """ ORDER BY %(key)s %(direction)s """ % sort_order

    sql_limit, sql_params = self.build_reports_sql_limit(
        params,
        sql_params
    )

    # Assembling the query
    # NOTE(review): both branches below build the same string.
    if include_raw_crash:
        sql_query = "\n".join((
            "/* socorro.external.postgresql.report.Report.list */",
            sql_select, sql_from, sql_where, sql_order, sql_limit)
        )
    else:
        sql_query = "\n".join((
            "/* socorro.external.postgresql.report.Report.list */",
            sql_select, sql_from, sql_where, sql_order, sql_limit)
        )

    # Query for counting the results
    sql_count_query = "\n".join((
        "/* socorro.external.postgresql.report.Report.list */",
        "SELECT count(*)", sql_from, sql_where)
    )

    # Querying the DB
    with self.get_connection() as connection:

        total = self.count(
            sql_count_query,
            sql_params,
            error_message="Failed to count crashes from reports.",
            connection=connection
        )

        # No need to call Postgres if we know there will be no results
        if total:

            if include_raw_crash:
                sql_query = wrapped_select % sql_query

            results = self.query(
                sql_query,
                sql_params,
                error_message="Failed to retrieve crashes from reports",
                connection=connection
            ).zipped()
        else:
            results = []

    crashes = []
    for crash in results:
        # uuid_text only exists to join and sanity-check against uuid; it
        # is not part of the returned crash.
        assert crash['uuid'] == crash['uuid_text']
        crash.pop('uuid_text')
        if not include_raw_crash and 'raw_crash' in crash:
            crash.pop('raw_crash')
        # Convert every value date_to_string accepts; values it rejects
        # with a TypeError are left untouched.
        for i in crash:
            try:
                crash[i] = datetimeutil.date_to_string(crash[i])
            except TypeError:
                pass
        crashes.append(crash)

    return {
        "hits": crashes,
        "total": total
    }
def create_field(self, **kwargs):
    """Create a new field in the database, to be used by supersearch and
    all Elasticsearch related services.

    The mapping resulting from the new field is tried out with
    ``test_mapping`` before anything is stored.

    Returns:
        True once the field has been stored.

    Raises:
        MissingArgumentError: if ``name`` or ``in_database_name`` is empty.
        BadArgumentError: if a field with that name already exists.
    """
    filters = [
        ('name', None, 'str'),
        ('data_validation_type', 'enum', 'str'),
        ('default_value', None, 'str'),
        ('description', None, 'str'),
        ('form_field_choices', None, ['list', 'str']),
        ('has_full_version', False, 'bool'),
        ('in_database_name', None, 'str'),
        ('is_exposed', False, 'bool'),
        ('is_returned', False, 'bool'),
        ('is_mandatory', False, 'bool'),
        ('query_type', 'enum', 'str'),
        ('namespace', None, 'str'),
        ('permissions_needed', None, ['list', 'str']),
        ('storage_mapping', None, 'json'),
    ]
    params = external_common.parse_arguments(filters, kwargs)

    for required in ('name', 'in_database_name'):
        if not params[required]:
            raise MissingArgumentError(required)

    # Before making the change, make sure it does not break indexing.
    # Trying the mapping raises if there is an error, in which case the
    # new mapping is rejected.
    candidate_mapping = self.get_mapping(overwrite_mapping=params)
    self.test_mapping(candidate_mapping)

    connection = self.get_connection()
    target_index = self.config.elasticsearch.elasticsearch_default_index

    try:
        connection.index(
            index=target_index,
            doc_type='supersearch_fields',
            body=params,
            id=params['name'],
            op_type='create',
            refresh=True,
        )
    except elasticsearch.exceptions.ConflictError:
        # This field exists in the database, it thus cannot be created!
        already_exists = (
            'The field "%s" already exists in the database, '
            'impossible to create it. ' % params['name']
        )
        raise BadArgumentError('name', msg=already_exists)

    storage_mapping = params.get('storage_mapping')
    if storage_mapping:
        # If we made a change to the storage_mapping, log that change.
        self.config.logger.info(
            'elasticsearch mapping changed for field "%s", '
            'added new mapping "%s"',
            params['name'],
            params['storage_mapping'],
        )

    return True
def fix_date_parameter(self, parameters):
    """Correct the date parameter, in place.

    Without a date parameter, a default range ending now is set.
    Otherwise the given values are reduced to exactly one upper bound
    (the smallest "<" value) and one lower bound (the largest ">" value),
    filling in whichever is missing.

    Raises:
        BadArgumentError: if a date has no operator, or if the resulting
            range is longer than MAXIMUM_DATE_RANGE days.
    """
    default_span = datetime.timedelta(days=DEFAULT_DATE_RANGE)
    longest_span = datetime.timedelta(days=MAXIMUM_DATE_RANGE)

    if not parameters.get("date"):
        range_end = datetimeutil.utc_now()
        range_start = range_end - default_span
        parameters["date"] = [
            SearchParam("date", range_start, ">=", "datetime"),
            SearchParam("date", range_end, "<=", "datetime"),
        ]
        return

    upper_bound = None
    lower_bound = None
    for candidate in parameters["date"]:
        operator = candidate.operator
        if not operator:
            # dates can't be a specific date
            raise BadArgumentError(
                "date", msg="date must have a prefix operator"
            )

        # Keep the tightest bound seen so far on each side.
        if "<" in operator:
            if not upper_bound or upper_bound.value > candidate.value:
                upper_bound = candidate
        if ">" in operator:
            if not lower_bound or lower_bound.value < candidate.value:
                lower_bound = candidate

    # Remove all the existing parameters so we have exactly
    # one lower value and one greater value
    parameters["date"] = []

    if not upper_bound:
        # no upper bound given: use now
        upper_bound = SearchParam(
            "date", datetimeutil.utc_now(), "<=", "datetime"
        )

    if not lower_bound:
        # no lower bound given: upper bound minus the default range
        lower_bound = SearchParam(
            "date", upper_bound.value - default_span, ">=", "datetime"
        )

    # Verify the date range is not too big.
    if upper_bound.value - lower_bound.value > longest_span:
        raise BadArgumentError(
            "date",
            msg="Date range is bigger than %s days" % MAXIMUM_DATE_RANGE
        )

    parameters["date"].extend([upper_bound, lower_bound])
def get(self, **kwargs):
    """Return a list of results and aggregations based on parameters.

    The list of accepted parameters (with types and default values) is in
    the database and can be accessed with the super_search_fields service.

    The `_fields` argument is required and is stored on the instance as
    `all_fields`.

    Returns a dict with "hits", "total", "facets" and "errors". When the
    `_return_query` parameter is true, the query is not run and a dict
    with the "query" and the "indices" it would use is returned instead.

    Raises MissingArgumentError when `_fields` is missing, and
    BadArgumentError when `_results_number` or `_facets_size` is out of
    bounds, or when a request error from elasticsearch can be traced back
    to one of the arguments.
    """
    # Require that the list of fields be passed.
    if not kwargs.get("_fields"):
        raise MissingArgumentError("_fields")
    self.all_fields = kwargs["_fields"]

    # Filter parameters and raise potential errors.
    params = self.get_parameters(**kwargs)

    # Find the indices to use to optimize the elasticsearch query.
    indices = self.get_indices(params["date"])

    if "%" in self.context.get_index_template():
        # If the index template is date-centric, remove indices before the retention
        # policy because they're not valid to search through and probably don't
        # exist
        policy = datetime.timedelta(
            weeks=self.context.get_retention_policy())
        template = self.context.get_index_template()
        indices = prune_invalid_indices(indices, policy, template)

    # Create and configure the search object.
    search = Search(
        using=self.get_connection(),
        index=indices,
        doc_type=self.context.get_doctype(),
    )

    # Create filters.
    filters = []
    histogram_intervals = {}

    # NOTE(review): results_from, results_number and facets_size are only
    # bound inside this loop; this assumes get_parameters always supplies
    # _results_offset, _results_number and _facets_size - TODO confirm.
    for field, sub_params in params.items():
        sub_filters = None
        for param in sub_params:
            if param.name.startswith("_"):
                # By default, all param values are turned into lists,
                # even when they have and can have only one value.
                # For those we know there can only be one value,
                # so we just extract it from the made-up list.
                if param.name == "_results_offset":
                    results_from = param.value[0]
                elif param.name == "_results_number":
                    results_number = param.value[0]
                    if results_number > 1000:
                        raise BadArgumentError(
                            "_results_number",
                            msg=("_results_number cannot be greater "
                                 "than 1,000"),
                        )
                    if results_number < 0:
                        raise BadArgumentError(
                            "_results_number",
                            msg="_results_number cannot be negative",
                        )
                elif param.name == "_facets_size":
                    facets_size = param.value[0]
                    # Why cap it?
                    # Because if the query is covering a lot of different
                    # things you can get a really really large query
                    # which can hog resources excessively.
                    # Downloading, as an example, 100k facets (and 0 hits)
                    # when there is plenty of data yields a 11MB JSON
                    # file.
                    if facets_size > 10000:
                        raise BadArgumentError(
                            "_facets_size greater than 10,000")

                for f in self.histogram_fields:
                    if param.name == "_histogram_interval.%s" % f:
                        histogram_intervals[f] = param.value[0]

                # Don't use meta parameters in the query.
                continue

            field_data = self.all_fields[param.name]
            name = self.get_full_field_name(field_data)

            # Normalize the value: dates become strings, enums and
            # operator-less strings are lowercased.
            if param.data_type in ("date", "datetime"):
                param.value = datetimeutil.date_to_string(param.value)
            elif param.data_type == "enum":
                param.value = [x.lower() for x in param.value]
            elif param.data_type == "str" and not param.operator:
                param.value = [x.lower() for x in param.value]

            # Operators needing wildcards, and the associated value
            # transformation with said wildcards.
            operator_wildcards = {
                "~": "*%s*",  # contains
                "^": "%s*",  # starts with
                "$": "*%s",  # ends with
            }
            # Operators needing ranges, and the associated Elasticsearch
            # comparison operator.
            operator_range = {
                ">": "gt", "<": "lt", ">=": "gte", "<=": "lte"
            }

            args = {}
            filter_type = "term"
            filter_value = None

            if not param.operator:
                # contains one of the terms
                if len(param.value) == 1:
                    val = param.value[0]

                    if not isinstance(val, str) or " " not in val:
                        # There's only one term and no white space, this
                        # is a simple term filter.
                        filter_value = val
                    else:
                        # If the term contains white spaces, we want to
                        # perform a phrase query.
                        filter_type = "query"
                        args = Q(
                            "simple_query_string",
                            query=param.value[0],
                            fields=[name],
                            default_operator="and",
                        ).to_dict()
                else:
                    # There are several terms, this is a terms filter.
                    filter_type = "terms"
                    filter_value = param.value
            elif param.operator == "=":
                # is exactly
                if field_data["has_full_version"]:
                    name = "%s.full" % name
                filter_value = param.value
            elif param.operator in operator_range:
                filter_type = "range"
                filter_value = {
                    operator_range[param.operator]: param.value
                }
            elif param.operator == "__null__":
                filter_type = "missing"
                args["field"] = name
            elif param.operator == "__true__":
                filter_type = "term"
                filter_value = True
            elif param.operator == "@":
                filter_type = "regexp"
                if field_data["has_full_version"]:
                    name = "%s.full" % name
                filter_value = param.value
            elif param.operator in operator_wildcards:
                filter_type = "query"

                # Wildcard operations are better applied to a non-analyzed
                # field (called "full") if there is one.
                if field_data["has_full_version"]:
                    name = "%s.full" % name

                q_args = {}
                q_args[name] = operator_wildcards[
                    param.operator] % param.value
                query = Q("wildcard", **q_args)
                args = query.to_dict()

            if filter_value is not None:
                args[name] = filter_value

            if args:
                new_filter = F(filter_type, **args)
                if param.operator_not:
                    new_filter = ~new_filter

                # Filters on the same field are OR-ed together, except
                # range filters which are AND-ed.
                if sub_filters is None:
                    sub_filters = new_filter
                elif filter_type == "range":
                    sub_filters &= new_filter
                else:
                    sub_filters |= new_filter

                continue

        if sub_filters is not None:
            filters.append(sub_filters)

    search = search.filter(F("bool", must=filters))

    # Restricting returned fields.
    fields = []

    # We keep track of the requested columns in order to make sure we
    # return those column names and not aliases for example.
    self.request_columns = []
    for param in params["_columns"]:
        for value in param.value:
            if not value:
                continue

            self.request_columns.append(value)
            field_name = self.get_field_name(value, full=False)
            fields.append(field_name)

    search = search.fields(fields)

    # Sorting.
    sort_fields = []
    for param in params["_sort"]:
        for value in param.value:
            if not value:
                continue

            # Values starting with a '-' are sorted in descending order.
            # In order to retrieve the database name of the field, we
            # must first remove the '-' part and add it back later.
            # Example: given ['product', '-version'], the results will be
            # sorted by ascending product then descending version.
            desc = False
            if value.startswith("-"):
                desc = True
                value = value[1:]

            field_name = self.get_field_name(value)

            if desc:
                # The underlying library understands that '-' means
                # sorting in descending order.
                field_name = "-" + field_name

            sort_fields.append(field_name)

    search = search.sort(*sort_fields)

    # Pagination.
    results_to = results_from + results_number
    search = search[results_from:results_to]

    # Create facets.
    if facets_size:
        self._create_aggregations(params, search, facets_size,
                                  histogram_intervals)

    # Query and compute results.
    hits = []

    if params["_return_query"][0].value[0]:
        # Return only the JSON query that would be sent to elasticsearch.
        return {"query": search.to_dict(), "indices": indices}

    errors = []

    # We call elasticsearch with a computed list of indices, based on
    # the date range. However, if that list contains indices that do not
    # exist in elasticsearch, an error will be raised. We thus want to
    # remove all failing indices until we either have a valid list, or
    # an empty list in which case we return no result.
    while True:
        try:
            results = search.execute()
            for hit in results:
                hits.append(self.format_fields(hit.to_dict()))

            total = search.count()

            aggregations = getattr(results, "aggregations", {})
            if aggregations:
                aggregations = self.format_aggregations(aggregations)

            shards = getattr(results, "_shards", {})

            break  # Yay! Results!
        except NotFoundError as e:
            missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
            if missing_index in indices:
                del indices[indices.index(missing_index)]
            else:
                # Wait what? An error caused by an index that was not
                # in the request? That should never happen, but in case
                # it does, better know it.
                raise

            errors.append({
                "type": "missing_index",
                "index": missing_index
            })

            if indices:
                # Update the list of indices and try again.
                # Note: we need to first empty the list of indices before
                # updating it, otherwise the removed indices never get
                # actually removed.
                search = search.index().index(*indices)
            else:
                # There is no index left in the list, return an empty
                # result.
                hits = []
                total = 0
                aggregations = {}
                shards = None
                break
        except RequestError as exception:
            # Try to handle it gracefully if we can find out what
            # input was bad and caused the exception.
            try:
                bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                    exception.error)[-1]
                # Loop over the original parameters to try to figure
                # out which *key* had the bad input.
                for key, value in kwargs.items():
                    if value == bad_input:
                        raise BadArgumentError(key)
            except IndexError:
                # Not an ElasticsearchParseException exception
                pass

            # Re-raise the original exception
            raise

    if shards and shards.failed:
        # Some shards failed. We want to explain what happened in the
        # results, so the client can decide what to do.
        failed_indices = defaultdict(int)
        for failure in shards.failures:
            failed_indices[failure.index] += 1

        for index, shards_count in failed_indices.items():
            errors.append({
                "type": "shards",
                "index": index,
                "shards_count": shards_count
            })

    return {
        "hits": hits,
        "total": total,
        "facets": aggregations,
        "errors": errors
    }
def get(self, **kwargs):
    """Return a list of results and aggregations based on parameters.

    The list of accepted parameters (with types and default values) is in
    the database and can be accessed with the super_search_fields service.

    The mandatory `_fields` keyword argument is stored on
    `self.all_fields` and is used to resolve every non-meta parameter to
    its field description.

    Returns a dict with the keys `hits`, `total`, `facets` and `errors`.
    When the `_return_query` parameter is set, returns instead a dict with
    the keys `query` and `indices`, without querying elasticsearch.

    Raises MissingArgumentError when `_fields` is missing or empty, and
    BadArgumentError when `_results_number` or `_facets_size` is out of
    range.
    """
    # Require that the list of fields be passed.
    if not kwargs.get('_fields'):
        raise MissingArgumentError('_fields')
    self.all_fields = kwargs['_fields']

    # Filter parameters and raise potential errors.
    params = self.get_parameters(**kwargs)

    # Find the indices to use to optimize the elasticsearch query.
    indices = self.get_indices(params['date'])

    # Create and configure the search object.
    search = Search(
        using=self.get_connection(),
        index=indices,
        doc_type=self.config.elasticsearch.elasticsearch_doctype,
    )

    # Operators needing wildcards, and the associated value
    # transformation with said wildcards.
    operator_wildcards = {
        '~': '*%s*',  # contains
        '^': '%s*',  # starts with
        '$': '*%s'  # ends with
    }
    # Operators needing ranges, and the associated Elasticsearch
    # comparison operator.
    operator_range = {
        '>': 'gt',
        '<': 'lt',
        '>=': 'gte',
        '<=': 'lte',
    }

    # Create filters.
    filters = []
    histogram_intervals = {}

    for sub_params in params.values():
        sub_filters = None
        for param in sub_params:
            if param.name.startswith('_'):
                # By default, all param values are turned into lists,
                # even when they have and can have only one value.
                # For those we know there can only be one value,
                # so we just extract it from the made-up list.
                if param.name == '_results_offset':
                    results_from = param.value[0]
                elif param.name == '_results_number':
                    results_number = param.value[0]
                    if results_number > 1000:
                        raise BadArgumentError(
                            '_results_number',
                            msg=(
                                '_results_number cannot be greater '
                                'than 1,000'
                            )
                        )
                    if results_number < 0:
                        raise BadArgumentError(
                            '_results_number',
                            msg='_results_number cannot be negative'
                        )
                elif param.name == '_facets_size':
                    facets_size = param.value[0]
                    # Why cap it?
                    # Because if the query is covering a lot of different
                    # things you can get a really really large query
                    # which can hog resources excessively.
                    # Downloading, as an example, 100k facets (and 0 hits)
                    # when there is plenty of data yields a 11MB JSON
                    # file.
                    if facets_size > 10000:
                        # Name the offending parameter first and put the
                        # explanation in `msg`, like the checks above.
                        raise BadArgumentError(
                            '_facets_size',
                            msg=(
                                '_facets_size cannot be greater '
                                'than 10,000'
                            )
                        )

                for f in self.histogram_fields:
                    if param.name == '_histogram_interval.%s' % f:
                        histogram_intervals[f] = param.value[0]

                # Don't use meta parameters in the query.
                continue

            field_data = self.all_fields[param.name]
            name = self.get_full_field_name(field_data)

            if param.data_type in ('date', 'datetime'):
                param.value = datetimeutil.date_to_string(param.value)
            elif param.data_type == 'enum':
                param.value = [x.lower() for x in param.value]
            elif param.data_type == 'str' and not param.operator:
                param.value = [x.lower() for x in param.value]

            args = {}
            filter_type = 'term'
            filter_value = None

            if not param.operator:
                # contains one of the terms
                if len(param.value) == 1:
                    val = param.value[0]

                    if not isinstance(val, basestring) or ' ' not in val:
                        # There's only one term and no white space, this
                        # is a simple term filter.
                        filter_value = val
                    else:
                        # If the term contains white spaces, we want to
                        # perform a phrase query.
                        filter_type = 'query'
                        args = Q(
                            'simple_query_string',
                            query=param.value[0],
                            fields=[name],
                            default_operator='and',
                        ).to_dict()
                else:
                    # There are several terms, this is a terms filter.
                    filter_type = 'terms'
                    filter_value = param.value
            elif param.operator == '=':
                # is exactly
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator in operator_range:
                filter_type = 'range'
                filter_value = {
                    operator_range[param.operator]: param.value
                }
            elif param.operator == '__null__':
                filter_type = 'missing'
                args['field'] = name
            elif param.operator == '__true__':
                filter_type = 'term'
                filter_value = True
            elif param.operator == '@':
                filter_type = 'regexp'
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator in operator_wildcards:
                filter_type = 'query'

                # Wildcard operations are better applied to a non-analyzed
                # field (called "full") if there is one.
                if field_data['has_full_version']:
                    name = '%s.full' % name

                q_args = {}
                q_args[name] = (
                    operator_wildcards[param.operator] % param.value
                )
                query = Q('wildcard', **q_args)
                args = query.to_dict()

            if filter_value is not None:
                args[name] = filter_value

            if args:
                new_filter = F(filter_type, **args)
                if param.operator_not:
                    new_filter = ~new_filter

                # Range conditions on the same field narrow each other
                # (AND); every other kind of condition widens (OR).
                if sub_filters is None:
                    sub_filters = new_filter
                elif filter_type == 'range':
                    sub_filters &= new_filter
                else:
                    sub_filters |= new_filter

        if sub_filters is not None:
            filters.append(sub_filters)

    search = search.filter(F('bool', must=filters))

    # Restricting returned fields.
    fields = []

    # We keep track of the requested columns in order to make sure we
    # return those column names and not aliases for example.
    self.request_columns = []
    for param in params['_columns']:
        for value in param.value:
            if not value:
                continue

            self.request_columns.append(value)
            field_name = self.get_field_name(value, full=False)
            fields.append(field_name)

    search = search.fields(fields)

    # Sorting.
    sort_fields = []
    for param in params['_sort']:
        for value in param.value:
            if not value:
                continue

            # Values starting with a '-' are sorted in descending order.
            # In order to retrieve the database name of the field, we
            # must first remove the '-' part and add it back later.
            # Example: given ['product', '-version'], the results will be
            # sorted by ascending product then descending version.
            desc = False
            if value.startswith('-'):
                desc = True
                value = value[1:]

            field_name = self.get_field_name(value)

            if desc:
                # The underlying library understands that '-' means
                # sorting in descending order.
                field_name = '-' + field_name

            sort_fields.append(field_name)

    search = search.sort(*sort_fields)

    # Pagination.
    results_to = results_from + results_number
    search = search[results_from:results_to]

    # Create facets.
    if facets_size:
        self._create_aggregations(
            params,
            search,
            facets_size,
            histogram_intervals
        )

    # Query and compute results.
    hits = []

    if params['_return_query'][0].value[0]:
        # Return only the JSON query that would be sent to elasticsearch.
        return {
            'query': search.to_dict(),
            'indices': indices,
        }

    errors = []

    # We call elasticsearch with a computed list of indices, based on
    # the date range. However, if that list contains indices that do not
    # exist in elasticsearch, an error will be raised. We thus want to
    # remove all failing indices until we either have a valid list, or
    # an empty list in which case we return no result.
    while True:
        try:
            results = search.execute()
            for hit in results:
                hits.append(self.format_fields(hit.to_dict()))

            total = search.count()

            aggregations = getattr(results, 'aggregations', {})
            if aggregations:
                aggregations = self.format_aggregations(aggregations)

            shards = getattr(results, '_shards', {})

            break  # Yay! Results!
        except NotFoundError as e:
            missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
            if missing_index in indices:
                indices.remove(missing_index)
            else:
                # Wait what? An error caused by an index that was not
                # in the request? That should never happen, but in case
                # it does, better know it.
                raise

            errors.append({
                'type': 'missing_index',
                'index': missing_index,
            })

            if indices:
                # Update the list of indices and try again.
                # Note: we need to first empty the list of indices before
                # updating it, otherwise the removed indices never get
                # actually removed.
                search = search.index().index(*indices)
            else:
                # There is no index left in the list, return an empty
                # result.
                hits = []
                total = 0
                aggregations = {}
                shards = None
                break

    if shards and shards.failed:
        # Some shards failed. We want to explain what happened in the
        # results, so the client can decide what to do.
        failed_indices = {}
        for failure in shards.failures:
            failed_indices[failure.index] = (
                failed_indices.get(failure.index, 0) + 1
            )

        for index, shards_count in failed_indices.items():
            errors.append({
                'type': 'shards',
                'index': index,
                'shards_count': shards_count,
            })

    return {
        'hits': hits,
        'total': total,
        'facets': aggregations,
        'errors': errors,
    }
def get(self, **kwargs):
    """Run the requested report queries and return their rows.

    Accepted keyword arguments: `report_types` (list of strings),
    `report_type` (single string, legacy form), `signature`,
    `start_date`, `end_date` and `versions`.

    Returns `{'reports': {report_type: rows}}` where each row is a dict
    mapping the report's column names to values. In the legacy case
    (only `report_type` given) the list of rows for that single report
    is returned directly.

    Raises BadArgumentError for an unrecognized report type, or when
    `start_date` / `end_date` did not resolve to a date.
    """
    filters = [
        ("report_types", None, ["list", "str"]),
        ("report_type", None, "str"),
        ("signature", None, "str"),
        ("start_date", None, "date"),
        ("end_date", None, "date"),
        ("versions", None, ["list", "str"]),
    ]

    params = external_common.parse_arguments(filters, kwargs)
    if not params.get('report_types') and params.get('report_type'):
        # to support the legacy case
        individual_report = True
        report_types = [params['report_type']]
    else:
        individual_report = False
        report_types = params['report_types']

    # check that all the report types are recognized
    known_report_types = (
        'products',
        'distinct_install',
        'exploitability',
        'devices',
        'graphics',
    )
    for report_type in report_types:
        query_params = report_type_sql.get(report_type, {})
        if (
            report_type not in known_report_types and
            'first_col' not in query_params
        ):
            raise BadArgumentError(report_type)

    products = []
    versions = []

    # Get information about the versions
    util_service = Util(config=self.context)
    versions_info = util_service.versions_info(**params)
    if versions_info:
        for key in versions_info:
            info = versions_info[key]
            products.append(info["product_name"])
            versions.append(str(info["version_string"]))

    # This MUST be a tuple otherwise it gets cast to an array
    params['product'] = tuple(products)
    params['version'] = tuple(versions)

    # Validate with a real exception rather than `assert`: asserts are
    # stripped under -O and would otherwise surface as AssertionError.
    for date_key in ('start_date', 'end_date'):
        if not isinstance(params[date_key], datetime.date):
            raise BadArgumentError(date_key)

    all_results = {}
    with self.get_connection() as connection:
        for report_type in report_types:
            result_cols, query_string, query_parameters = self._get_query(
                report_type,
                params
            )
            sql_results = self.query(
                query_string,
                params=query_parameters,
                connection=connection
            )
            all_results[report_type] = [
                dict(zip(result_cols, row)) for row in sql_results
            ]

    if individual_report:
        # Exactly one report was requested; look it up by key, because
        # `dict.values()` is not indexable on Python 3.
        return all_results[report_types[0]]
    return {'reports': all_results}
def get_parameters(self, **kwargs):
    """Parse raw keyword arguments into a dict of search parameters.

    `kwargs['_fields']` is mandatory and is handed to
    `self.build_filters`; the resulting `self.filters` drive the parsing.
    For each filter, the matching raw value(s) are stripped of their
    optional "not" prefix and operator prefix, converted to the filter's
    data type and wrapped in `SearchParam` objects.

    All values given without an operator are stacked into a single
    `SearchParam` (one for negated values, one for the others); each
    value with an operator gets its own `SearchParam`.

    Returns a dict mapping parameter names to lists of `SearchParam`,
    after the date, process type, hang type and version fix-ups ran.

    Raises MissingArgumentError when `_fields` or a mandatory parameter
    is missing, and BadArgumentError when a value cannot be converted to
    the expected data type.
    """
    # Validate explicitly instead of using `assert`, which is stripped
    # when Python runs with -O.
    fields = kwargs.get('_fields')
    if not fields:
        raise MissingArgumentError('_fields')
    self.build_filters(fields)

    parameters = {}

    for param in self.filters:
        values = kwargs.get(param.name, param.default)

        if values in ('', []):
            # Those values are equivalent to None here.
            # Note that we cannot use bool(), because 0 is not equivalent
            # to None in our case.
            values = None

        if values is None and param.mandatory:
            raise MissingArgumentError(param.name)
        if values is None and param.default is not None:
            values = param.default

        if values is None:
            continue

        # all values can be a list, so we make them all lists to simplify
        if not isinstance(values, (list, tuple)):
            values = [values]

        # There should only be one parameter with no operator, and
        # we want to stack all values into it. That's why we want
        # to keep track of it.
        # Actually, we want _two_ parameters with no operator: one
        # for each possible value of "operator_not".
        no_operator_param = {True: None, False: None}

        # The set of operators only depends on the data type.
        operators = OPERATORS_MAP.get(
            param.data_type,
            OPERATORS_MAP['default']
        )

        for value in values:
            operator = None
            operator_not = False

            if isinstance(value, basestring):
                if value.startswith(OPERATOR_NOT):
                    operator_not = True
                    value = value[len(OPERATOR_NOT):]

                for ope in operators:
                    if value.startswith(ope):
                        operator = ope
                        value = value[len(ope):]
                        break

            # ensure the right data type
            try:
                value = convert_to_type(value, param.data_type)
            except ValueError:
                raise BadArgumentError(
                    param.name,
                    msg='Bad value for parameter %s:'
                    ' "%s" is not a valid %s' %
                    (param.name, value, param.data_type)
                )

            parameters.setdefault(param.name, [])

            if operator:
                parameters[param.name].append(SearchParam(
                    param.name, value, operator, param.data_type,
                    operator_not
                ))
            elif not no_operator_param[operator_not]:
                no_operator_param[operator_not] = SearchParam(
                    param.name, [value], operator, param.data_type,
                    operator_not
                )
            else:
                no_operator_param[operator_not].value.append(value)

        for stacked in no_operator_param.values():
            if stacked:
                parameters[stacked.name].append(stacked)

    self.fix_date_parameter(parameters)
    self.fix_process_type_parameter(parameters)
    self.fix_hang_type_parameter(parameters)
    self.fix_version_parameter(parameters)

    return parameters
def fix_date_parameter(self, parameters):
    """Normalize the `date` entry of `parameters` in place.

    Without any date parameter, a default range is installed: from
    `search_default_date_range` days ago up to now. Otherwise the entry
    is reduced to exactly two parameters: the smallest "lower than"
    value and the biggest "greater than" value, any missing side being
    filled in from the default range.

    Raises BadArgumentError when a date has no operator, or when the
    resulting range exceeds `search_maximum_date_range` days.
    """
    default_span = datetime.timedelta(
        days=self.config.search_default_date_range
    )
    maximum_span = datetime.timedelta(
        days=self.config.search_maximum_date_range
    )

    if not parameters.get('date'):
        # Nothing was asked for: fall back to the default window.
        range_end = datetimeutil.utc_now()
        range_start = range_end - default_span

        defaults = parameters['date'] = []
        defaults.append(
            SearchParam('date', range_start, '>=', 'datetime')
        )
        defaults.append(
            SearchParam('date', range_end, '<=', 'datetime')
        )
        return

    upper_bound = None  # tightest "<" / "<=" parameter seen so far
    lower_bound = None  # tightest ">" / ">=" parameter seen so far

    for candidate in parameters['date']:
        if not candidate.operator:
            # dates can't be a specific date
            raise BadArgumentError(
                'date',
                msg='date must have a prefix operator'
            )

        if '<' in candidate.operator:
            if not upper_bound or upper_bound.value > candidate.value:
                upper_bound = candidate

        if '>' in candidate.operator:
            if not lower_bound or lower_bound.value < candidate.value:
                lower_bound = candidate

    # Start from a clean slate so that exactly one parameter per side
    # remains.
    kept = parameters['date'] = []

    if not upper_bound:
        # No upper limit given: the range ends now.
        upper_bound = SearchParam(
            'date', datetimeutil.utc_now(), '<=', 'datetime'
        )

    if not lower_bound:
        # No lower limit given: go back one default range from the end.
        lower_bound = SearchParam(
            'date', upper_bound.value - default_span, '>=', 'datetime'
        )

    # Verify the date range is not too big.
    if upper_bound.value - lower_bound.value > maximum_span:
        raise BadArgumentError(
            'date',
            msg='Date range is bigger than %s days' %
                self.config.search_maximum_date_range
        )

    kept.append(upper_bound)
    kept.append(lower_bound)
def mocked_supersearch_get(**params):
    """Stand-in for a super search `get` that always fails.

    Raises MissingArgumentError carrying the `product` value when a
    truthy `product` is passed, and BadArgumentError otherwise.
    """
    product = params.get('product')
    if not product:
        raise BadArgumentError('That was a bad thing to do')
    raise MissingArgumentError(product)
def check_type(param, datatype):
    """Make sure that param is of type datatype and return it.

    If param is None, return it.
    If param is an instance of datatype, return it.
    If param is not an instance of datatype and is not None, cast it as
    datatype and return it.

    `datatype` can be:
    - an object with a callable `clean` attribute, in which case
      `datatype.clean(param)` is returned and a ValueError raised by it
      is turned into a BadArgumentError;
    - one of the legacy names 'str', 'bool', 'float', 'date',
      'datetime', 'timedelta', 'json' or 'int';
    - one of the corresponding types.

    A failed cast does not raise: str, int and float fall back to their
    empty value (`''`, `0`, `0.0`), the other types fall back to None.
    """
    if param is None:
        return param

    if getattr(datatype, 'clean', None) and callable(datatype.clean):
        try:
            return datatype.clean(param)
        except ValueError:
            raise BadArgumentError(param)
    elif isinstance(datatype, str):
        # You've given it something like `'bool'` as a string.
        # This is the legacy way of doing it.
        datatype = {
            'str': str,
            'bool': bool,
            'float': float,
            'date': datetime.date,
            'datetime': datetime.datetime,
            'timedelta': datetime.timedelta,
            'json': 'json',  # exception
            'int': int,
        }[datatype]

    if datatype is str and not isinstance(param, basestring):
        try:
            param = str(param)
        except ValueError:
            param = str()

    elif datatype is int and not isinstance(param, int):
        try:
            param = int(param)
        except (TypeError, ValueError):
            # TypeError covers non-scalar input such as a list.
            param = int()

    elif datatype is float and not isinstance(param, float):
        # 'float' is an advertised datatype, so it must be cast like the
        # others instead of being passed through untouched.
        try:
            param = float(param)
        except (TypeError, ValueError):
            param = float()

    elif datatype is bool and not isinstance(param, bool):
        param = str(param).lower() in ("true", "t", "1", "y", "yes")

    elif (
        datatype is datetime.datetime and
        not isinstance(param, datetime.datetime)
    ):
        try:
            param = dtutil.string_to_datetime(param)
        except ValueError:
            param = None

    elif datatype is datetime.date and not isinstance(param, datetime.date):
        try:
            param = dtutil.string_to_datetime(param).date()
        except ValueError:
            param = None

    elif (
        datatype is datetime.timedelta and
        not isinstance(param, datetime.timedelta)
    ):
        try:
            param = dtutil.str_hours_to_time_delta(param)
        except ValueError:
            param = None

    elif datatype == "json" and isinstance(param, basestring):
        try:
            param = json.loads(param)
        except ValueError:
            param = None

    return param