Exemplo n.º 1
0
def get_scan_generic_format(client, now, last_fetch_timestamp=None):
    """Build a generic-format scan Search over the client's fetch index.

    When the client has a time field configured, the search is restricted to
    documents whose time field exists and falls inside the fetch window
    (last_fetch_timestamp, now]; otherwise the client's raw query is used.
    """
    index = client.fetch_index or '_all'

    if not client.time_field:
        # no time field configured: fall back to the client's own query
        return Search(using=client.es, index=index).query(
            QueryString(query=client.query))

    # window upper bound always applies; lower bound only after a first fetch
    window = {'lte': now}
    if last_fetch_timestamp:
        window['gt'] = last_fetch_timestamp
    time_window = {client.time_field: window}

    exists_query = QueryString(query=client.time_field + ':*')
    return Search(using=client.es, index=index).filter(
        {'range': time_window}).query(exists_query)
def make_query(query, filters, page, sort_by):
    """Run a paginated search against the configured index.

    Args:
        query: user query string; falsy means "match all documents".
        filters: list of filter clauses combined with bool/must.
        page: 1-based page number; each page holds 20 hits.
        sort_by: key into SORT_BY; default depends on whether a query is set.

    Returns:
        The elasticsearch-dsl response, or None when Elasticsearch is
        unreachable (connection failures are deliberately swallowed).
    """
    try:
        client = Elasticsearch()
        s = Search(client, index=app.config['INDEX'])

        if query:
            s = s.query(QueryString(query=escape_query(query)))
            # free-text searches default to relevance ordering
            if not sort_by:
                sort_by = "relevance"
        else:
            s = s.query(MatchAll())
            if not sort_by:
                sort_by = DEFAULT_SORT_BY

        s = s.sort(SORT_BY.get(sort_by, DEFAULT_SORT_BY)['value'])

        # 20 results per page
        start = (page - 1) * 20
        end = start + 20
        s = s[start:end]

        if filters:
            s = s.filter('bool', must=filters)

        return s.execute()
    # FIX: the original used the Python 2-only `except ConnectionError, ex:`
    # form, which is a SyntaxError on Python 3.
    except ConnectionError:
        # best-effort search: callers treat None as "no results available"
        return None
Exemplo n.º 3
0
def get_scan_insight_format(client, now, last_fetch_timestamp=None, feed_type=None):
    """Build an insight-format scan Search restricted to the fetch window."""
    time_field = client.time_field

    # window upper bound always applies; lower bound only after a first fetch
    window = {'lte': now}
    if last_fetch_timestamp:
        window['gt'] = last_fetch_timestamp
    range_field = {time_field: window}

    indices = client.fetch_index
    if feed_type == FEED_TYPE_CORTEX_MT:
        indices = '*-shared*'
        tenant_hash = demisto.getIndexHash()
        if tenant_hash:
            # all shared indexes minus this tenant shared
            indices += f',-*{tenant_hash}*-shared*'
    elif not indices:
        indices = '_all'

    time_query = QueryString(query=time_field + ":*")
    return Search(using=client.es, index=indices).filter(
        {'range': range_field}).query(time_query)
Exemplo n.º 4
0
 def get_entities_by_sport_and_query(self, sport, query):
     """Return up to 5 entities for the given sport matching the query.

     Each returned dict carries 'id', 'name', 'abstract' and 'type';
     a missing abstract or type falls back to the string 'None'.
     """
     search = Search(using=self.es)
     search = search[0:5]
     if sport == Sport.SOCCER:
         search = search.index('soccer-entity')
     elif sport == Sport.BASKETBALL:
         search = search.index('basketball-entity')
     if query:
         # wildcard match, boosting name over abstract
         query = '*{}*'.format(query)
         search = search.query(
             QueryString(query=query, fields=['name^5', 'abstract']))
     hits = []
     for hit in search.execute():
         id = hit.meta['id']
         hit = hit.to_dict()
         entity = {'id': id, 'name': hit['name']}
         entity['abstract'] = hit.get('abstract', 'None')
         # FIX: when 'type' was missing, the original overwrote
         # entity['abstract'] with 'None' instead of setting entity['type'].
         entity['type'] = hit.get('type', 'None')
         hits.append(entity)
     return hits
Exemplo n.º 5
0
def test_time_field_query(es):
    """Test executing query of fetch time field.

    Notes:
        if is_fetch is ticked, this function checks if the entered TIME_FIELD returns results.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.

    Returns:
        (dict).The results of the query if they are returned.
    """
    exists_query = QueryString(query=TIME_FIELD + ':*')
    response = Search(using=es,
                      index=FETCH_INDEX).query(exists_query)[0:1].execute().to_dict()
    _, total_results = get_total_results(response)

    if total_results != 0:
        return response

    # no documents carry the configured time field
    return_error(
        "Fetch incidents test failed.\nDate field value incorrect [{}].".
        format(TIME_FIELD))
Exemplo n.º 6
0
def test_fetch_query(es):
    """Test executing fetch query.

    Notes:
        if is_fetch is ticked, this function checks if the FETCH_QUERY returns results.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.

    Returns:
        (dict).The results of the query if they are returned.
    """
    combined = "{}:* AND {}".format(str(TIME_FIELD), FETCH_QUERY)
    search = Search(using=es,
                    index=FETCH_INDEX).query(QueryString(query=combined))[0:1]
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    # No hits is not necessarily an error: the FETCH_QUERY may simply have
    # no matching documents yet, so no error message is raised here.
    return response if total_results > 0 else None
Exemplo n.º 7
0
def fetch_incidents():
    """Fetch new incidents from Elasticsearch and hand them to Demisto.

    Queries FETCH_INDEX for documents whose TIME_FIELD is strictly newer
    than the last recorded fetch time, converts the hits into incidents and
    persists the new high-water mark via demisto.setLastRun.
    """
    last_fetch, last_fetch_timestamp = get_last_fetch_time()
    es = elasticsearch_builder()

    # require TIME_FIELD to exist so the range filter below is meaningful
    query = QueryString(query=FETCH_QUERY + " AND " + TIME_FIELD + ":*")
    # Elastic search can use epoch timestamps (in milliseconds) as date representation regardless of date format.
    search = Search(using=es, index=FETCH_INDEX).filter(
        {'range': {
            TIME_FIELD: {
                'gt': last_fetch_timestamp
            }
        }})
    # oldest-first so the saved last-run time advances monotonically
    search = search.sort({TIME_FIELD: {
        'order': 'asc'
    }})[0:FETCH_SIZE].query(query)
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    incidents = []  # type: List

    if total_results > 0:
        if 'Timestamp' in TIME_METHOD:
            # timestamp mode keeps last_fetch numeric
            incidents, last_fetch = results_to_incidents_timestamp(
                response, last_fetch)
            demisto.setLastRun({'time': last_fetch})

        else:
            incidents, last_fetch = results_to_incidents_datetime(
                response, last_fetch)
            demisto.setLastRun({'time': str(last_fetch)})

        demisto.info('extract {} incidents'.format(len(incidents)))

    demisto.incidents(incidents)
Exemplo n.º 8
0
def fetch_incidents(proxies):
    """Fetch new incidents from Elasticsearch and hand them to Demisto.

    Resolves the last fetch time (handling the first run and the configured
    TIME_METHOD), queries FETCH_INDEX for documents whose TIME_FIELD is
    strictly newer, converts the hits into incidents and persists the new
    high-water mark via demisto.setLastRun.

    Args:
        proxies: proxy settings forwarded to the Elasticsearch client builder.
    """
    last_run = demisto.getLastRun()
    last_fetch = last_run.get('time')

    # handle first time fetch
    if last_fetch is None:
        last_fetch, _ = parse_date_range(date_range=FETCH_TIME,
                                         date_format='%Y-%m-%dT%H:%M:%S.%f',
                                         utc=False,
                                         to_timestamp=False)
        last_fetch = parse(str(last_fetch))
        # ES range filters accept epoch milliseconds regardless of mapping format
        last_fetch_timestamp = int(last_fetch.timestamp() * 1000)

        # if timestamp: get the last fetch to the correct format of timestamp
        if 'Timestamp' in TIME_METHOD:
            last_fetch = get_timestamp_first_fetch(last_fetch)
            last_fetch_timestamp = last_fetch

    # if method is simple date - convert the date string to datetime
    elif 'Simple-Date' == TIME_METHOD:
        last_fetch = parse(str(last_fetch))
        last_fetch_timestamp = int(last_fetch.timestamp() * 1000)

    # if last_fetch is set and we are in a "Timestamp" method - than the last_fetch_timestamp is the last_fetch.
    else:
        last_fetch_timestamp = last_fetch

    es = elasticsearch_builder(proxies)

    # require TIME_FIELD to exist so the range filter below is meaningful
    query = QueryString(query=FETCH_QUERY + " AND " + TIME_FIELD + ":*")
    # Elastic search can use epoch timestamps (in milliseconds) as date representation regardless of date format.
    search = Search(using=es, index=FETCH_INDEX).filter(
        {'range': {
            TIME_FIELD: {
                'gt': last_fetch_timestamp
            }
        }})
    # oldest-first so the saved last-run time advances monotonically
    search = search.sort({TIME_FIELD: {
        'order': 'asc'
    }})[0:FETCH_SIZE].query(query)
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    incidents = []  # type: List

    if total_results > 0:
        if 'Timestamp' in TIME_METHOD:
            # timestamp mode keeps last_fetch numeric
            incidents, last_fetch = results_to_incidents_timestamp(
                response, last_fetch)
            demisto.setLastRun({'time': last_fetch})

        else:
            incidents, last_fetch = results_to_incidents_datetime(
                response, last_fetch)
            demisto.setLastRun({'time': str(last_fetch)})

        demisto.info('extract {} incidents'.format(len(incidents)))
    demisto.incidents(incidents)
Exemplo n.º 9
0
def test_general_query(es):
    """Sanity-check that the fetch index answers a match-all query."""
    try:
        match_all = QueryString(query='*')
        raw = Search(using=es, index=FETCH_INDEX).query(match_all)[0:1].execute()
        _, total_results = get_total_results(raw.to_dict())

    except NotFoundError as e:
        # surface only the human-readable part of the ES error reason
        return_error("Fetch incidents test failed.\nError message: {}.".format(
            str(e).split(',')[2][2:-1]))
Exemplo n.º 10
0
def index():
    """Query Elasticsearch using Invenio query syntax."""
    page = request.values.get('page', 1, type=int)
    size = request.values.get('size', 2, type=int)

    # slice the search to the requested page window
    lower = (page - 1) * size
    search = ExampleSearch()[lower:lower + size]

    if 'q' in request.values:
        search = search.query(QueryString(query=request.values['q']))

    search = search.sort(request.values.get('sort', 'title'))
    search = ExampleSearch.faceted_search(search=search)
    return jsonify(search.execute().to_dict())
Exemplo n.º 11
0
def test_cernopendata_query_parser():
    """Dataset paths get quoted; ondemand records are excluded unless requested."""
    hide_ondemand = [Match(distribution__availability__keyword='ondemand')]
    cases = [
        ('/Btau', '"/Btau"'),
        ('"/Btau"', '"/Btau"'),
        ('/btau AND CMS', '"/btau" AND CMS'),
        ('"/btau" AND CMS', '"/btau" AND CMS'),
        ('CMS AND /btau', 'CMS AND "/btau"'),
    ]
    for raw, parsed in cases:
        assert cernopendata_query_parser(raw) == Bool(
            must=[QueryString(query=parsed)], must_not=hide_ondemand)
    # with show_ondemand the availability exclusion is dropped entirely
    assert cernopendata_query_parser('CMS AND /btau', show_ondemand='true') == QueryString(query='CMS AND "/btau"')
Exemplo n.º 12
0
def fetch_incidents():
    """Fetch new incidents from Elasticsearch and hand them to Demisto.

    Resolves the last fetch time (handling the first run and the configured
    TIME_METHOD), queries FETCH_INDEX for documents whose TIME_FIELD is
    strictly newer, converts the hits into incidents and persists the new
    last-run time via demisto.setLastRun.
    """
    last_run = demisto.getLastRun()
    last_fetch = last_run.get('time')

    # handle first time fetch
    if last_fetch is None:
        last_fetch, _ = parse_date_range(date_range=FETCH_TIME,
                                         date_format=TIME_FORMAT,
                                         utc=False,
                                         to_timestamp=False)
        last_fetch = datetime.strptime(str(last_fetch), TIME_FORMAT)

        # if timestamp: get the last fetch to the correct format of timestamp
        if 'Timestamp' in TIME_METHOD:
            last_fetch = get_timestamp_first_fetch(last_fetch)

    # if method is simple date - convert the date string to datetime
    elif 'Simple-Date' == TIME_METHOD:
        last_fetch = datetime.strptime(last_fetch, TIME_FORMAT)

    es = elasticsearch_builder()

    # require TIME_FIELD to exist so the range filter below is meaningful
    query = QueryString(query=FETCH_QUERY + " AND " + TIME_FIELD + ":*")
    search = Search(using=es, index=FETCH_INDEX).filter(
        {'range': {
            TIME_FIELD: {
                'gt': last_fetch
            }
        }})
    # oldest-first so the saved last-run time advances monotonically
    search = search.sort({TIME_FIELD: {
        'order': 'asc'
    }})[0:FETCH_SIZE].query(query)
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    incidents = []  # type: List

    if total_results > 0:
        if 'Timestamp' in TIME_METHOD:
            # timestamp mode keeps last_fetch numeric
            incidents, last_fetch = results_to_incidents_timestamp(
                response, last_fetch)
            demisto.setLastRun({'time': last_fetch})

        else:
            incidents, last_fetch = results_to_incidents_datetime(
                response, last_fetch)
            demisto.setLastRun(
                {'time': datetime.strftime(last_fetch, TIME_FORMAT)})

        demisto.info('extract {} incidents'.format(len(incidents)))

    demisto.incidents(incidents)
Exemplo n.º 13
0
def test_fetch_query(es):
    """Test executing fetch query.

    Notes:
        if is_fetch is ticked, this function checks if the FETCH_QUERY returns results.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.

    Returns:
        (dict).The results of the query if they are returned, otherwise None.
    """
    query = QueryString(query=str(TIME_FIELD) + ":* AND " + FETCH_QUERY)
    search = Search(using=es, index=FETCH_INDEX).query(query)[0:1]
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    if total_results > 0:
        return response

    else:
        # failed to get the TIME_FIELD with the FETCH_QUERY
        # this can happen and not be an error if the FETCH_QUERY doesn't have results yet.
        # Thus this does not return an error message
        return None
Exemplo n.º 14
0
def test_time_field_query(es):
    """Test executing query of fetch time field.

    Notes:
        if is_fetch is ticked, this function checks if the entered TIME_FIELD returns results.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.

    Returns:
        (dict).The results of the query if they are returned.
    """
    query = QueryString(query=TIME_FIELD + ':*')
    search = Search(using=es, index=FETCH_INDEX).query(query)[0:1]
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    if total_results == 0:
        # failed in getting the TIME_FIELD
        return_error(
            "Fetch incidents test failed.\nDate field value incorrect [{}].".
            format(TIME_FIELD))

    else:
        return response
Exemplo n.º 15
0
def collection_records(collection=None):
    """Serialize the records belonging to *collection* and its subtree.

    Looks up the named collection, builds its drilldown tree, ORs together
    the per-collection query strings and returns the serialized search hits.
    """
    collections = Collection.query.filter(Collection.name.in_(
        [collection])).one().drilldown_tree()

    query_array = get_collections_queries(collections)
    query_string = ' or '.join(query_array)

    search = RecordsSearch().params(version=True).query(
        QueryString(query=query_string))
    response = search.execute().to_dict()
    # NOTE: the original also built an unused {'records': recs} wrapper dict
    # while returning the bare serialization; the dead code was removed.
    return json_v1.serialize_search(cap_record_fetcher, response)
Exemplo n.º 16
0
def search_command(proxies):
    """Performs a search in Elasticsearch.

    Reads index/query/pagination/sort options from the command arguments,
    executes the search and writes human-readable tables plus context to
    the war room via return_outputs.

    Args:
        proxies: proxy settings forwarded to the Elasticsearch client builder.
    """
    index = demisto.args().get('index')
    query = demisto.args().get('query')
    fields = demisto.args().get('fields')  # fields to display
    explain = 'true' == demisto.args().get('explain')
    base_page = int(demisto.args().get('page'))
    size = int(demisto.args().get('size'))
    sort_field = demisto.args().get('sort-field')
    sort_order = demisto.args().get('sort-order')

    es = elasticsearch_builder(proxies)

    que = QueryString(query=query)
    # paginate by slicing the search request itself
    search = Search(using=es,
                    index=index).query(que)[base_page:base_page + size]
    if explain:
        # if 'explain parameter is set to 'true' - adds explanation section to search results
        search = search.extra(explain=True)

    if fields is not None:
        # restrict the returned _source to the requested comma-separated fields
        fields = fields.split(',')
        search = search.source(fields)

    if sort_field is not None:
        search = search.sort({sort_field: {'order': sort_order}})

    response = search.execute().to_dict()

    total_dict, total_results = get_total_results(response)
    # build context entries and markdown tables for display
    search_context, meta_headers, hit_tables, hit_headers = results_to_context(
        index, query, base_page, size, total_dict, response)
    search_human_readable = tableToMarkdown('Search Metadata:',
                                            search_context,
                                            meta_headers,
                                            removeNull=True)
    hits_human_readable = tableToMarkdown('Hits:',
                                          hit_tables,
                                          hit_headers,
                                          removeNull=True)
    total_human_readable = search_human_readable + '\n' + hits_human_readable
    full_context = {
        'Elasticsearch.Search(val.Query == obj.Query && val.Index == obj.Index '
        '&& val.Server == obj.Server && val.Page == obj.Page && val.Size == obj.Size)':
        search_context
    }

    return_outputs(total_human_readable, full_context, response)
Exemplo n.º 17
0
def test_general_query(es):
    """Test executing query to all available indexes.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.
    """
    try:
        match_all = QueryString(query='*')
        first_hit = Search(using=es, index='*').query(match_all)[0:1]
        # only care that the cluster responds; the result itself is discarded
        get_total_results(first_hit.execute().to_dict())

    except NotFoundError as e:
        return_error(
            "Failed executing general search command - please check the Server URL and port number "
            "and the supplied credentials.\nError message: {}.".format(str(e)))
Exemplo n.º 18
0
def test_general_query(es):
    """Test executing query in fetch index.

    Notes:
        if is_fetch is ticked, this function runs a general query to Elasticsearch just to make sure we get a response
        from the FETCH_INDEX.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.
    """
    try:
        query = QueryString(query='*')
        search = Search(using=es, index=FETCH_INDEX).query(query)[0:1]
        response = search.execute().to_dict()
        # only care that the index responds; the hit count was previously
        # unpacked into unused locals — call bare, matching the sibling test
        get_total_results(response)

    except NotFoundError as e:
        # NOTE(review): this positional parsing of the error string is fragile
        # and can itself raise IndexError if the ES error format changes.
        return_error("Fetch incidents test failed.\nError message: {}.".format(str(e).split(',')[2][2:-1]))
Exemplo n.º 19
0
    def get(self, request):
        """Search drug documents by the user-supplied ``q`` query string.

        Returns HTTP 400 when ``q`` is missing; otherwise runs a wildcard
        QueryString search over the drug name fields and returns the
        serialized hits with HTTP 200.
        """
        q = request.GET.get('q', None)

        if not q:
            return Response("Search query not set",
                            status=status.HTTP_400_BAD_REQUEST)

        # NOTE(review): [:-1] drops the final character of the stripped query
        # before wrapping it in wildcards — presumably trimming a trailing
        # marker appended by the client; confirm against the frontend.
        q = f'*{q.strip()[:-1]}*'

        query = QueryString(query=q,
                            fields=[
                                'name', 'base_name', 'generic_names',
                                'active_ingredients'
                            ])
        drugs = DrugDocument.search().query(query)

        serializer = DrugDocumentSerializer(drugs, many=True)

        return Response(serializer.data, status=status.HTTP_200_OK)
Exemplo n.º 20
0
    def get(self, request):
        """Search doctor documents by the user-supplied ``q`` query string.

        Returns HTTP 400 when ``q`` is missing; otherwise runs a
        prefix-wildcard QueryString search over the doctor profile fields and
        returns the serialized hits with HTTP 200.
        """
        q = request.GET.get('q', None)

        if not q:
            return Response("Search query not set",
                            status=status.HTTP_400_BAD_REQUEST)

        # NOTE(review): [:-1] drops the final character of the stripped query
        # before appending the wildcard — presumably trimming a trailing
        # marker appended by the client; confirm against the frontend.
        q = f'{q.strip()[:-1]}*'

        query = QueryString(query=q,
                            fields=[
                                'name', 'specializations', 'degrees',
                                'associations', 'fellowships', 'diplomates',
                                'insurance_providers', 'medical_institutions',
                                'addresses'
                            ])
        doctors = DoctorDocument.search().query(query)

        serializer = DoctorDocumentSerializer(doctors, many=True)

        return Response(serializer.data, status=status.HTTP_200_OK)
Exemplo n.º 21
0
def get_indicators_search_scan():
    """Build a scan Search over cross-tenant shared indexes.

    Returns:
        (search, now_ts): the Search restricted to documents whose
        calculatedTime falls in (last run, now] — or everything up to now on
        the first run — and the current time as an epoch-seconds string.
    """
    now = datetime.now()
    time_field = "calculatedTime"
    last_fetch = demisto.getLastRun().get('time')

    # window upper bound always applies; lower bound only after a first run
    window = {'lte': now}
    if last_fetch:
        window['gt'] = datetime.fromtimestamp(float(last_fetch))
    range_field = {time_field: window}

    es = elasticsearch_builder()
    time_query = QueryString(query=time_field + ":*")
    tenant_hash = demisto.getIndexHash()
    # all shared indexes minus this tenant shared
    indexes = f'*-shared*,-*{tenant_hash}*-shared*'
    search = Search(using=es, index=indexes).filter(
        {'range': range_field}).query(time_query)
    return search, str(now.timestamp())
Exemplo n.º 22
0
def search(query,
           index=None,
           filters=list(),
           size=10,
           include="*",
           exclude="authors",
           offset=0,
           sort_field=None,
           sort_order='',
           post_filter=None):
    """ Perform a search query.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date)
    :param size: [int] max number of hits that should be returned
    :param include: [string] fields to include in the _source response
    :param exclude: [string] fields to exclude from the _source response
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                    "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                    (for a particular field) or reversed. Supported:
                    '' or 'rev'
    :param post_filter: optional post-filter applied after aggregations

    :return: [dict] dictionary with processed results and facets
    """
    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'

    query = HEPDataQueryParser.parse_query(query)
    # Create search with preference param to ensure consistency of results across shards
    search = RecordsSearch(using=es, index=index).with_preference_param()

    if query:
        # match the query fuzzily against the record itself, its nested
        # authors and its child datatables
        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
        search.query = fuzzy_query | \
                       Q('nested', query=fuzzy_query, path='authors') | \
                       Q('has_child', type="child_datatable", query=fuzzy_query)

    search = search.filter("term", doc_type=CFG_PUB_TYPE)
    search = QueryBuilder.add_filters(search, filters)

    mapped_sort_field = sort_fields_mapping(sort_field)
    search = search.sort({mapped_sort_field : {"order" : calculate_sort_order(sort_order, sort_field)}})
    search = add_default_aggregations(search, filters)

    if post_filter:
        search = search.post_filter(post_filter)

    search = search.source(includes=include, excludes=exclude)
    search = search[offset:offset+size]
    pub_result = search.execute().to_dict()

    # second pass: fetch the data tables whose parent publications matched
    parent_filter = {
        "terms": {
                    "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
        }
    }

    data_search = RecordsSearch(using=es, index=index)
    data_search = data_search.query('has_parent',
                                    parent_type="parent_publication",
                                    query=parent_filter)
    if query:
        data_search = data_search.query(QueryString(query=query))

    # generous cap: up to 50 datatables per publication page
    data_search = data_search[0:size*50]
    data_result = data_search.execute().to_dict()

    merged_results = merge_results(pub_result, data_result)
    return map_result(merged_results, filters)
Exemplo n.º 23
0
#!/usr/bin/env python
# Scan the 'fnhttp' index for pages whose content mentions "form action"
# and collect the names of the <input> fields found in each page's HTML.
import sys
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import QueryString
from elasticsearch import helpers, Elasticsearch
from bs4 import BeautifulSoup
from celery import group
import bipolar
from time import sleep
import json

query = sys.argv[1]
es = Elasticsearch()

s = Search(using=es, index="fnhttp")
qs = QueryString(query="form action")
s = s.query(qs)

# scan() streams every matching document, not just the first page
response = s.scan()

final_urls = []
for hit in response:
    url = hit.url
    soup = BeautifulSoup(hit.content, 'html.parser')
    inputs = soup.find_all('input')
    field_names = []
    for i in inputs:
        # FIX: Tag.has_key() is the removed dict-style API in bs4;
        # has_attr() is the supported attribute check.
        if i.has_attr('name'):
            field_names.append(i['name'])
    if field_names == []:
        pass
Exemplo n.º 24
0
    def get_collection_manifest(cls, api_root, **query_parameters):
        """Return the manifest entries of a collection, filtered and paginated.

        Builds two query strings — one over the objects index (collection,
        types, spec_versions) and one over the manifest index (collection,
        ids, added_after) — intersects them by id, then filters by version
        and pages the result.

        Args:
            api_root: feed root name; indexes are f'{api_root}-objects' and
                f'{api_root}-manifest'.
            **query_parameters: TAXII-style filters (collection_id, limit,
                added_after, types, ids, versions, spec_versions, next).

        Returns:
            dict with an 'objects' list, or an EXCEPTIONS entry on failure.
        """
        # NOTE(review): these four names are initialized to None twice (again
        # a few lines below); the duplication is harmless but redundant.
        objects_query = None
        manifest_query = None
        version_range = None
        added_after_range = None
        size = int(query_parameters.get('limit'))
        max_page_size = PAGE_SIZE
        added_after = query_parameters.get('added_after')
        sort_by = {'date_added': {'order': 'asc'}}
        types = query_parameters.get('types')
        ids = query_parameters.get('ids')
        versions = query_parameters.get('versions')
        spec_versions = query_parameters.get('spec_versions')
        base_page = 0
        next_id = 0
        objects_query = None
        manifest_query = None
        version_range = None
        added_after_range = None

        log_debug(
            f"Request to Get The objects Manifest of Collection: {query_parameters.get('collection_id')} "
            f"in the Feed Root: {api_root}")

        if query_parameters is None:
            query_parameters = {}

        try:
            # Create a Query to filter Objects by collection id, types and spec_versions
            objects_query = f"collection : {query_parameters.get('collection_id')}"
            if types:
                types = types.replace(",", " OR ")
                objects_query = objects_query + f" AND type : ('{types}')"
            if spec_versions:
                spec_versions = spec_versions.replace(",", " OR ")
                objects_query = objects_query + f" AND spec_version : ('{spec_versions}')"
            objects_query_string = QueryString(query=objects_query,
                                               default_operator="and")

            # Create a Query to filter Manifest by collection id, object id's, versions and added after dates
            manifest_query = f"collection : {query_parameters.get('collection_id')}"
            if ids:
                ids = ids.replace(",", " OR ")
                manifest_query = manifest_query + f" AND id : ('{ids}')"
            if added_after:
                added_after_range = Range(
                    **{'date_added': {
                        'gt': f'{added_after}'
                    }})
            manifests_query_string = QueryString(query=manifest_query,
                                                 default_operator="and")

            # Get the intersect of both Objects and Manifest Queries
            intersected_results = cls.es_client.manifest_intersect(
                intersect_by='id',
                objects_index=f'{api_root}-objects',
                objects_query_string=objects_query_string,
                manifests_index=f'{api_root}-manifest',
                manifests_query_string=manifests_query_string,
                added_after_range=added_after_range)

            # Version and Paginate The Results
            if intersected_results:
                manifest_ids = ",".join(intersected_results).replace(
                    ',', ' OR ')
                query_string = QueryString(query=f"id:('{manifest_ids}')",
                                           default_operator="AND")
                pre_versioning_results = cls.es_client.scan(
                    index=f'{api_root}-manifest', query_string=query_string)
                pre_pagination_results = Helper.fetch_objects_by_versions(
                    stix_objects=pre_versioning_results, versions=versions)
                if -1 < size < max_page_size:
                    results = cls.es_client.search(
                        index=f'{api_root}-manifest',
                        query_string=query_string,
                        search_from=base_page,
                        size=size,
                        sort_by=sort_by)
                else:
                    results = cls.es_client.search(
                        index=f'{api_root}-manifest',
                        query_string=query_string,
                        search_from=base_page,
                        size=max_page_size,
                        sort_by=sort_by)
                # NOTE(review): this assignment discards the paginated search
                # result computed just above and returns the versioned-but-
                # unpaginated objects instead — confirm whether the searches
                # are intentionally only executed for their side effects.
                results = {'objects': pre_pagination_results}
            else:
                results = {"objects": []}
            return results

        except Exception as e:
            log_error(e)
            # 'next' present means the caller was paging: report a missing page
            if query_parameters.get('next'):
                return EXCEPTIONS.get('NextNotFoundException', {})
            else:
                return EXCEPTIONS.get('CollectionNotFoundException', {})
Exemplo n.º 25
0
    def prepare(self,
                params={},
                params_whitelist=SEARCH_PARAM_WHITELIST,
                search_models=SEARCH_MODELS,
                fields=SEARCH_INCLUDE_FIELDS,
                fields_nested=SEARCH_NESTED_FIELDS,
                fields_agg=SEARCH_AGG_FIELDS):
        """Assemble elasticsearch_dsl.Search object
        
        @param params:           dict
        @param params_whitelist: list Accept only these (SEARCH_PARAM_WHITELIST)
        @param search_models:    list Limit to these ES doctypes (SEARCH_MODELS)
        @param fields:           list Retrieve these fields (SEARCH_INCLUDE_FIELDS)
        @param fields_nested:    list See SEARCH_NESTED_FIELDS
        @param fields_agg:       dict See SEARCH_AGG_FIELDS
        @returns: None; the assembled Search is stored on self.s

        NOTE: the mutable ``params={}`` default is kept for interface
        compatibility; it is safe here because params is copied (never
        mutated) before use.
        """

        # gather inputs ------------------------------

        # self.params is a copy of the params arg as it was passed
        # to the method.  It is used for informational purposes
        # and is passed to SearchResults.
        # Sanitize while copying.
        if params:
            self.params = {
                key: sanitize_input(val)
                for key, val in params.items()
            }
        params = deepcopy(self.params)

        # scrub fields not in whitelist
        bad_fields = [
            key for key in params.keys()
            if key not in params_whitelist + ['page']
        ]
        for key in bad_fields:
            params.pop(key)

        indices = search_models
        if params.get('models'):
            # FIX: the original iterated an undefined name `models`
            # (NameError at runtime); use the 'models' parameter values.
            indices = ','.join(
                [DOCSTORE.index_name(model) for model in params['models']])

        # field-specific searches embedded in fulltext
        if params.get('fulltext') and 'creators:' in params['fulltext']:
            params['creators'] = params.pop('fulltext').replace(
                'creators:', '')
        if params.get('fulltext') and 'persons:' in params['fulltext']:
            params['persons'] = params.pop('fulltext').replace('persons:', '')

        s = Search(using=self.conn, index=indices)

        # only return specified fields
        s = s.source(fields)

        # sorting
        if params.get('sort'):
            args = params.pop('sort')
            s = s.sort(*args)

        if params.get('match_all'):
            s = s.query('match_all')

        elif params.get('fulltext'):
            fulltext = params.pop('fulltext')
            # MultiMatch chokes on lists
            if isinstance(fulltext, list) and (len(fulltext) == 1):
                fulltext = fulltext[0]
            # fulltext search
            s = s.query(
                QueryString(
                    query=fulltext,
                    fields=fields,
                    analyze_wildcard=False,
                    allow_leading_wildcard=False,
                    default_operator='AND',
                ))

        elif params.get('creators'):
            # creators live in a nested mapping; match on the namepart
            q = Q('bool',
                  must=[
                      Q('nested',
                        path='creators',
                        query=Q('term',
                                creators__namepart=params.pop('creators')))
                  ])
            s = s.query(q)

        elif params.get('topics') or params.get('facility'):
            # SPECIAL CASE FOR DDRPUBLIC TOPICS, FACILITY BROWSE PAGES
            if params.get('topics'):
                q = Q('bool',
                      must=[
                          Q('nested',
                            path='topics',
                            query=Q('term', topics__id=params.pop('topics')))
                      ])
                s = s.query(q)
            elif params.get('facility'):
                q = Q('bool',
                      must=[
                          Q('nested',
                            path='facility',
                            query=Q('term',
                                    facility__id=params.pop('facility')))
                      ])
                s = s.query(q)

        if params.get('parent'):
            parent = params.pop('parent')
            if isinstance(parent, list) and (len(parent) == 1):
                parent = parent[0]
            if parent:
                # match the parent id plus any child suffix
                parent = '%s-*' % parent
            s = s.query("wildcard", id=parent)

        # filters
        for key, val in params.items():

            if key in fields_nested:
                # Instead of nested search on topics.id or facility.id
                # search on denormalized topics_id or facility_id fields.
                fieldname = '%s_id' % key
                s = s.filter('term', **{fieldname: val})

                ## search for *ALL* the topics (AND)
                #for term_id in val:
                #    s = s.filter(
                #        Q('bool',
                #          must=[
                #              Q('nested',
                #                path=key,
                #                query=Q('term', **{'%s.id' % key: term_id})
                #              )
                #          ]
                #        )
                #    )

                ## search for *ANY* of the topics (OR)
                #s = s.query(
                #    Q('bool',
                #      must=[
                #          Q('nested',
                #            path=key,
                #            query=Q('terms', **{'%s.id' % key: val})
                #          )
                #      ]
                #    )
                #)

            elif (key in params_whitelist) and val:
                s = s.filter('term', **{key: val})
                # 'term' search is for single choice, not multiple choice fields(?)

        # aggregations
        for fieldname, field in fields_agg.items():

            # nested aggregation (Elastic docs: https://goo.gl/xM8fPr)
            if fieldname == 'topics':
                s.aggs.bucket('topics', 'nested', path='topics') \
                      .bucket('topics_ids', 'terms', field='topics.id', size=1000)
            elif fieldname == 'facility':
                s.aggs.bucket('facility', 'nested', path='facility') \
                      .bucket('facility_ids', 'terms', field='facility.id', size=1000)
                # result:
                # results.aggregations['topics']['topic_ids']['buckets']
                #   {u'key': u'69', u'doc_count': 9}
                #   {u'key': u'68', u'doc_count': 2}
                #   {u'key': u'62', u'doc_count': 1}

            # simple aggregations
            else:
                s.aggs.bucket(fieldname, 'terms', field=field)

        self.s = s
Exemplo n.º 26
0
def search(query,
           index=None,
           filters=None,
           size=10,
           include="*",
           exclude="authors",
           offset=0,
           sort_field=None,
           sort_order='',
           post_filter=None):
    """ Perform a search query.

    :param query: [string] query string e.g. 'higgs boson'
    :param index: [string] name of the index. If None a default is used
    :param filters: [list of tuples] list of filters for the query.
                    Currently supported: ('author', author_fullname),
                    ('collaboration', collaboration_name), ('date', date).
                    Defaults to no filters.
    :param size: [int] max number of hits that should be returned
    :param include: [string or list] source fields to include in the hits
    :param exclude: [string or list] source fields to exclude from the hits
    :param offset: [int] offset for the results (used for pagination)
    :param sort_field: [string] sorting field. Currently supported fields:
                    "title", "collaboration", "date", "relevance"
    :param sort_order: [string] order of the sorting either original
                    (for a particular field) or reversed. Supported:
                    '' or 'rev'
    :param post_filter: optional filter applied after aggregations are
                    computed, so facet counts ignore it

    :return: [dict] dictionary with processed results and facets, or
             {'error': reason} on a transport error
    """
    # NOTE: default is None (not a shared mutable list); normalise here.
    if filters is None:
        filters = []

    # If empty query then sort by date
    if query == '' and not sort_field:
        sort_field = 'date'

    query = HEPDataQueryParser.parse_query(query)
    # Create search with preference param to ensure consistency of results across shards
    search = RecordsSearch(using=es, index=index).with_preference_param()

    if query:
        # Match the publication itself, nested author records, and child
        # data tables with the same fuzzy query.
        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
        search.query = fuzzy_query | \
                       Q('nested', query=fuzzy_query, path='authors') | \
                       Q('has_child', type="child_datatable", query=fuzzy_query)

    search = search.filter("term", doc_type=CFG_PUB_TYPE)
    search = QueryBuilder.add_filters(search, filters)

    mapped_sort_field = sort_fields_mapping(sort_field)
    search = search.sort({
        mapped_sort_field: {
            "order": calculate_sort_order(sort_order, sort_field)
        }
    })
    search = add_default_aggregations(search, filters)

    if post_filter:
        search = search.post_filter(post_filter)

    search = search.source(includes=include, excludes=exclude)
    search = search[offset:offset + size]

    try:
        pub_result = search.execute().to_dict()

        # Fetch the data tables belonging to the publications just found.
        parent_filter = {
            "terms": {
                "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
            }
        }

        data_search = RecordsSearch(using=es, index=index)
        data_search = data_search.query('has_parent',
                                        parent_type="parent_publication",
                                        query=parent_filter)
        if query:
            data_search = data_search.query(QueryString(query=query))

        # Cap the child-table window proportionally to the page size.
        data_search_size = size * ELASTICSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
        data_search = data_search[0:data_search_size]
        data_result = data_search.execute().to_dict()

        merged_results = merge_results(pub_result, data_result)
        return map_result(merged_results, filters)
    except TransportError as e:
        # For search phase execution exceptions we pass the reason as it's
        # likely to be user error (e.g. invalid search query)
        if e.error == 'search_phase_execution_exception' and e.info \
                and "error" in e.info and isinstance(e.info['error'], dict):
            reason = e.info['error']['root_cause'][0]['reason']
        # Otherwise we hide the details from the user
        else:
            log.error(f'An unexpected error occurred when searching: {e}')
            reason = f'An unexpected error occurred: {e.error}'
        return {'error': reason}