Example #1
    def __init__(self, elasticsearch, index_info, doc_prep_fn=None, doc_filter_fn=None, change_filter_fn=None):
        self.change_filter_fn = change_filter_fn or noop_filter
        self.doc_filter_fn = doc_filter_fn or noop_filter
        self.elasticsearch = elasticsearch
        self.es_interface = ElasticsearchInterface(self.elasticsearch)
        self.index_info = index_info
        self.doc_transform_fn = doc_prep_fn or identity
Example #2
def run_query(index_name,
              q,
              debug_host=None,
              es_instance_alias=ES_DEFAULT_INSTANCE):
    # the debug_host parameter allows you to query another env for testing purposes
    if debug_host:
        if not settings.DEBUG:
            raise Exception("You can only specify an ES env in DEBUG mode")
        es_host = settings.ELASTICSEARCH_DEBUG_HOSTS[debug_host]
        es_instance = Elasticsearch(
            [{'host': es_host, 'port': settings.ELASTICSEARCH_PORT}],
            timeout=3,
            max_retries=0,
        )
    else:
        es_instance = get_es_instance(es_instance_alias)

    es_interface = ElasticsearchInterface(es_instance)

    es_meta = ES_META[index_name]
    try:
        results = es_interface.search(es_meta.alias, es_meta.type, body=q)
        report_and_fail_on_shard_failures(results)
        return results
    except ElasticsearchException as e:
        raise ESError(e)
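
A hypothetical call site for run_query as defined above; the index name and query body are illustrative and assume ES_META contains a 'forms' entry:

query = {
    "query": {"match_all": {}},
    "size": 10,
}
results = run_query('forms', query)  # 'forms' is an assumed index name
for hit in results['hits']['hits']:
    print(hit['_id'])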
Example #3
    def process_bulk_docs(self, docs, progress_logger):
        if not docs:
            return True

        pillow_logging.info("Processing batch of %s docs", len(docs))
        changes = []
        for doc in docs:
            change = self._doc_to_change(doc)  # de-dupe the is_deletion check
            if self.process_deletes or not change.deleted:
                changes.append(change)
        error_collector = ErrorCollector()

        bulk_changes = build_bulk_payload(changes, self.doc_transform,
                                          error_collector)

        for change, exception in error_collector.errors:
            pillow_logging.error("Error processing doc %s: %s (%s)", change.id,
                                 type(exception), exception)

        es_interface = ElasticsearchInterface(self.es)
        try:
            es_interface.bulk_ops(self.index_info.alias, self.index_info.type,
                                  bulk_changes)
        except BulkIndexError as e:
            pillow_logging.error("Bulk index errors\n%s", e.errors)
        except Exception as exc:
            pillow_logging.exception(
                "Error sending bulk payload to Elasticsearch: %s", exc)
            return False

        return True
Example #4
def scan(client, query=None, scroll='5m', **kwargs):
    """
    This is a copy of elasticsearch.helpers.scan, except this function returns
    a ScanResult (which includes the total number of documents), and removes
    some options from scan that we aren't using.

    Simple abstraction on top of the
    :meth:`~elasticsearch.Elasticsearch.scroll` api - a simple iterator that
    yields all hits as returned by underlying scroll requests.

    :arg client: instance of :class:`~elasticsearch.Elasticsearch` to use
    :arg query: body for the :meth:`~elasticsearch.Elasticsearch.search` api
    :arg scroll: Specify how long a consistent view of the index should be
        maintained for scrolled search

    Any additional keyword arguments will be passed to the initial
    :meth:`~elasticsearch.Elasticsearch.search` call::

        scan(es,
            query={"match": {"title": "python"}},
            index="orders-*",
            doc_type="books"
        )

    """
    kwargs['search_type'] = 'scan'
    # initial search
    es_interface = ElasticsearchInterface(client)
    initial_resp = es_interface.search(body=query, scroll=scroll, **kwargs)

    def fetch_all(initial_response):

        resp = initial_response
        scroll_id = resp.get('_scroll_id')
        if scroll_id is None:
            return

        while True:
            resp = es_interface.scroll(scroll_id, scroll=scroll)
            for hit in resp['hits']['hits']:
                yield hit

            # check if we have any errors
            if resp["_shards"]["failed"]:
                logging.getLogger('elasticsearch.helpers').warning(
                    'Scroll request has failed on %d shards out of %d.',
                    resp['_shards']['failed'], resp['_shards']['total'])

            scroll_id = resp.get('_scroll_id')
            # end of scroll
            if scroll_id is None or not resp['hits']['hits']:
                break

    count = initial_resp.get("hits", {}).get("total", None)
    return ScanResult(count, fetch_all(initial_resp))
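
A minimal sketch of consuming the ScanResult returned above; it assumes the class exposes the total as a count attribute and iterates over the hit generator (neither is shown here):

result = scan(es_client, query={"query": {"match_all": {}}}, index="orders-*")
print("total hits:", result.count)  # assumed attribute name
for hit in result:                  # assumed: iterating yields hits from fetch_all
    print(hit['_source'])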
Example #5
def update_unknown_user_from_form_if_necessary(es, doc_dict):
    if doc_dict is None:
        return

    user_id, username, domain, xform_id = _get_user_fields_from_form_doc(
        doc_dict)

    if (not user_id or user_id in WEIRD_USER_IDS
            or _user_exists_in_couch(user_id)):
        return

    if not doc_exists_in_es(USER_INDEX_INFO, user_id):
        doc_type = "AdminUser" if username == "admin" else "UnknownUser"
        doc = {
            "_id": user_id,
            "domain": domain,
            "username": username,
            "first_form_found_in": xform_id,
            "doc_type": doc_type,
        }
        if domain:
            doc["domain_membership"] = {"domain": domain}
        ElasticsearchInterface(es).create_doc(USER_INDEX_INFO.alias,
                                              ES_META['users'].type,
                                              doc=doc,
                                              doc_id=user_id)
Example #6
def set_index_reindex_settings(es, index):
    """
    Set a more optimized setting setup for fast reindexing
    """
    from pillowtop.index_settings import INDEX_REINDEX_SETTINGS
    return ElasticsearchInterface(es).update_index_settings(
        index, INDEX_REINDEX_SETTINGS)
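
For context, fast-reindex settings conventionally pause refresh and drop replicas while documents are bulk-loaded; INDEX_REINDEX_SETTINGS is plausibly shaped along these lines (hypothetical values, the real constant lives in pillowtop.index_settings):

INDEX_REINDEX_SETTINGS = {
    "index": {
        "refresh_interval": "-1",   # hypothetical: suspend refresh during reindex
        "number_of_replicas": 0,    # hypothetical: restore replicas afterwards
    }
}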
Example #7
    def handle(self, **options):
        es = get_es_new()
        es_interface = ElasticsearchInterface(es)
        # call this before getting existing indices, because getting the pillow
        # apparently creates the index if it doesn't exist
        # FIXME: this can delete real indices if a reindex is in progress
        found_indices = set(es_interface.get_aliases().keys())
        expected_indices = {
            info.index
            for info in get_all_expected_es_indices()
        }
        print(expected_indices)

        if options['verbose']:
            if expected_indices - found_indices:
                print('the following indices were not found:\n{}\n'.format(
                    '\n'.join(expected_indices - found_indices)))
            print('expecting {} indices:\n{}\n'.format(
                len(expected_indices), '\n'.join(sorted(expected_indices))))

        unref_indices = found_indices - expected_indices
        if unref_indices:
            if options['delete']:
                _delete_indices(es, unref_indices)
            else:
                _close_indices(es, unref_indices, options['noinput'])
        else:
            print('no indices need pruning')
Example #8
    def setUp(self):
        self.index = TEST_INDEX_INFO.index
        self.es_alias = TEST_INDEX_INFO.alias
        self.es = get_es_new()
        self.es_interface = ElasticsearchInterface(self.es)
        with trap_extra_setup(ConnectionError):
            ensure_index_deleted(self.index)
Example #9
def set_index_normal_settings(es, index):
    """
    Normal indexing configuration
    """
    from pillowtop.index_settings import INDEX_STANDARD_SETTINGS
    return ElasticsearchInterface(es).update_index_settings(
        index, INDEX_STANDARD_SETTINGS)
Example #10
def _get_latest_doc_from_index(es_alias, sort_field):
    """
    Query elasticsearch index sort descending by the sort field
    and get the doc_id back so we can then do a rev-update check.

    This is because there's no direct way to know ahead of time what's inside
    the report* index, so just get it directly from the index and do the
    modify-check workflow.
    """
    recent_query = {
        "filter": {
            "match_all": {}
        },
        "sort": {
            sort_field: "desc"
        },
        "size": 1
    }
    es_interface = ElasticsearchInterface(get_es_new())

    try:
        res = es_interface.search(es_alias, body=recent_query)
        if 'hits' in res:
            if 'hits' in res['hits']:
                result = res['hits']['hits'][0]
                return result['_source']['_id']

    except Exception as ex:
        logging.error("Error querying get_latest_doc_from_index[%s]: %s" %
                      (es_alias, ex))
        return None
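
A hypothetical call, assuming the report index stores a received_on timestamp to sort by (both the alias and the field name are illustrative):

latest_id = _get_latest_doc_from_index('report_xforms', sort_field='received_on')
if latest_id is not None:
    print('most recently indexed doc:', latest_id)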
Example #11
    def process_bulk_docs(self, docs, progress_logger):
        if len(docs) == 0:
            return True

        pillow_logging.info("Processing batch of %s docs", len(docs))

        changes = [
            self._doc_to_change(doc) for doc in docs
            if self.process_deletes or not is_deletion(doc.get('doc_type'))
        ]
        error_collector = ErrorCollector()

        bulk_changes = build_bulk_payload(self.index_info, changes,
                                          self.doc_transform, error_collector)

        for change, exception in error_collector.errors:
            pillow_logging.error("Error procesing doc %s: %s (%s)", change.id,
                                 type(exception), exception)

        es_interface = ElasticsearchInterface(self.es)
        try:
            es_interface.bulk_ops(bulk_changes)
        except (ESBulkIndexError, ES2BulkIndexError, ES7BulkIndexError) as e:
            pillow_logging.error("Bulk index errors\n%s", e.errors)
        except Exception:
            pillow_logging.exception("\tException sending payload to ES")
            return False

        return True
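
The three exception names suggest BulkIndexError aliased from the client package pinned to each Elasticsearch major version; the imports are plausibly along these lines (module paths are an assumption, not taken from the source):

from elasticsearch.helpers import BulkIndexError as ESBulkIndexError    # assumed
from elasticsearch2.helpers import BulkIndexError as ES2BulkIndexError  # assumed
from elasticsearch7.helpers import BulkIndexError as ES7BulkIndexError  # assumed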
Example #12
    def setUp(self):
        self.es = get_es_new()
        self.es_interface = ElasticsearchInterface(self.es)
        self.index = TEST_INDEX_INFO.index

        with trap_extra_setup(ConnectionError):
            ensure_index_deleted(self.index)
            initialize_index_and_mapping(self.es, TEST_INDEX_INFO)
Example #13
    def tearDownClass(cls):
        interface = ElasticsearchInterface(cls.es)
        for form in cls.forms:
            interface.delete_doc(XFORM_INDEX_INFO.alias, XFORM_INDEX_INFO.type,
                                 form.wrapped_form.form_id)
        cls.es.indices.refresh(XFORM_INDEX_INFO.index)
        cls.forms = []
        super(XFormESTestCase, cls).tearDownClass()
Example #14
def send_to_elasticsearch(index, doc_type, doc_id, es_getter, name, data=None,
                          retries=MAX_RETRIES, propagate_failure=settings.UNIT_TESTING,
                          update=False, delete=False, es_merge_update=False):
    """
    More fault tolerant es.put method
    kwargs:
        es_merge_update: Set this to True to use Elasticsearch.update instead of Elasticsearch.index
            which merges existing ES doc and current update. If this is set to False, the doc will be replaced

    """
    data = data if data is not None else {}
    current_tries = 0
    es_interface = ElasticsearchInterface(es_getter())
    retries = 1 if settings.UNIT_TESTING else retries
    while current_tries < retries:
        try:
            if delete:
                es_interface.delete_doc(index, doc_type, doc_id)
            elif update:
                params = {'retry_on_conflict': 2}
                if es_merge_update:
                    es_interface.update_doc_fields(index, doc_type, doc_id, fields=data, params=params)
                else:
                    es_interface.update_doc(index, doc_type, doc_id, doc=data, params=params)
            else:
                es_interface.create_doc(index, doc_type, doc_id, doc=data)
            break
        except ConnectionError as ex:
            current_tries += 1
            pillow_logging.error("[{}] put_robust error {} attempt {}/{}".format(
                name, ex, current_tries, retries))

            if current_tries == retries:
                message = "[{}] Max retry error on {}/{}/{}:\n\n{}".format(
                    name, index, doc_type, doc_id, traceback.format_exc())
                if propagate_failure:
                    raise PillowtopIndexingError(message)
                else:
                    pillow_logging.error(message)

            time.sleep(math.pow(RETRY_INTERVAL, current_tries))
        except RequestError:
            error_message = (
                "Pillowtop put_robust error [{}]:\n\n{}\n\tpath: {}/{}/{}\n\t{}".format(
                    name, traceback.format_exc(), index, doc_type, doc_id, list(data))
            )

            if propagate_failure:
                raise PillowtopIndexingError(error_message)
            else:
                pillow_logging.error(error_message)
            break
        except ConflictError:
            break  # ignore the error if a doc already exists when trying to create it in the index
        except NotFoundError:
            break
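
A hypothetical merge-update call; every value below is illustrative. With es_merge_update=True the call routes to update_doc_fields, merging data into the stored doc rather than replacing it:

send_to_elasticsearch(
    index='users',                 # assumed index
    doc_type='user',               # assumed doc type
    doc_id='some-user-id',
    es_getter=get_es_new,
    name='example-pillow',
    data={'last_login': '2020-01-01T00:00:00Z'},
    update=True,
    es_merge_update=True,
)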
Example #15
def mget_query(index_name, ids):
    if not ids:
        return []

    es_interface = ElasticsearchInterface(get_es_new())
    es_meta = ES_META[index_name]
    try:
        return es_interface.get_bulk_docs(es_meta.alias, es_meta.type, ids)
    except ElasticsearchException as e:
        raise ESError(e)
Example #16
def form_ids_in_es(form_ids):
    query = {"filter": {"ids": {"values": list(form_ids)}}}
    es_interface = ElasticsearchInterface(get_es_new())
    es_meta = ES_META['forms']
    results = es_interface.search(es_meta.index, es_meta.type, query,
                                  params={'size': CHUNK_SIZE})
    if 'hits' in results:
        for hit in results['hits']['hits']:
            es_doc = hit['_source']
            yield es_doc['_id']
Example #17
def get_case_name(case_id):
    from corehq.pillows.mappings.case_mapping import CASE_INDEX_INFO
    try:
        result = ElasticsearchInterface(get_es_new()).get_doc(
            CASE_INDEX_INFO.alias,
            CASE_INDEX_INFO.type,
            case_id,
            source_includes=['name'])
    except ElasticsearchException:
        return None

    return result['name']
Example #18
    def test_assume_alias(self):
        initialize_index_and_mapping(self.es, TEST_INDEX_INFO)
        doc_id = uuid.uuid4().hex
        doc = {'_id': doc_id, 'doc_type': 'CommCareCase', 'type': 'mother'}
        ElasticsearchInterface(get_es_new()).index_doc(
            self.index, TEST_INDEX_INFO.type, doc_id, {'doc_type': 'CommCareCase', 'type': 'mother'},
            verify_alias=False)
        self.assertEqual(1, get_doc_count(self.es, self.index))
        assume_alias(self.es, self.index, TEST_INDEX_INFO.alias)
        es_doc = self.es_interface.get_doc(TEST_INDEX_INFO.alias, TEST_INDEX_INFO.type, doc_id)
        for prop in doc:
            self.assertEqual(doc[prop], es_doc[prop])
Example #19
    def _delete_docs_from_es(cls, doc_ids, index_info):
        es_interface = ElasticsearchInterface(cls.elasticsearch)
        refresh = False
        for doc_id in doc_ids:
            try:
                es_interface.delete_doc(index_info.alias, index_info.type,
                                        doc_id)
            except elasticsearch.NotFoundError:
                pass
            else:
                refresh = True
        if refresh:
            cls.elasticsearch.indices.refresh(index_info.index)
Example #20
def delete_case_search_cases(domain):
    if domain is None or isinstance(domain, dict):
        raise TypeError("Domain attribute is required")

    get_es_new().indices.refresh(CASE_SEARCH_INDEX)
    case_ids = CaseSearchES().domain(domain).values_list('_id', flat=True)

    ElasticsearchInterface(get_es_new()).bulk_ops([{
        "_op_type": "delete",
        "_index": CASE_SEARCH_INDEX,
        "_type": CASE_ES_TYPE,
        "_id": case_id,
    } for case_id in case_ids])
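
The dicts passed to bulk_ops follow the elasticsearch.helpers bulk-action format; for comparison, an index action with a hypothetical document looks like:

index_action = {
    "_op_type": "index",            # the default op type when omitted
    "_index": CASE_SEARCH_INDEX,
    "_type": CASE_ES_TYPE,
    "_id": "some-case-id",          # illustrative
    "_source": {"domain": "demo", "name": "example case"},  # illustrative
}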
Example #21
def _check_es_rev(es_alias, doc_id, couch_revs):
    """
    Specific docid and rev checker.

    es_alias: Elasticsearch alias to query
    doc_id: doc id to query in ES
    couch_revs: list of target couch revisions to match against
    """
    es_interface = ElasticsearchInterface(get_es_new())
    doc_id_query = {
        "filter": {
            "ids": {
                "values": [doc_id]
            }
        },
        "fields": ["_id", "_rev"]
    }

    try:
        res = es_interface.search(es_alias, body=doc_id_query)
        status = False
        message = "Not in sync"

        if 'hits' in res:
            if res['hits'].get('total', 0) == 0:
                status = False
                # if the doc doesn't exist, it's definitely not in sync
                message = "Not in sync %s" % es_alias
            elif 'hits' in res['hits']:
                fields = res['hits']['hits'][0]['fields']
                if fields['_rev'] in couch_revs:
                    status = True
                    message = "%s OK" % es_alias
                else:
                    status = False
                    # less likely: the doc is there but the rev is stale
                    message = "Not in sync - %s stale" % es_alias
        else:
            status = False
            message = "Not in sync - query failed"
            notify_error("%s: %s" % (message, str(res)))
    except Exception as ex:
        message = "ES Error: %s" % ex
        status = False
    return {
        es_alias: {
            "es_alias": es_alias,
            "status": status,
            "message": message
        }
    }
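
A hypothetical check comparing an indexed doc against its candidate Couch revisions (all values illustrative):

result = _check_es_rev('xforms', 'some-doc-id', ['3-current-rev', '2-older-rev'])
print(result['xforms']['status'], '-', result['xforms']['message'])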
Example #22
    def setUpClass(cls):
        super().setUpClass()
        cls.domain = uuid.uuid4().hex
        cls.case_ids = [uuid.uuid4().hex for _ in range(4)]
        with drop_connected_signals(case_post_save), drop_connected_signals(
                sql_case_post_save):
            for case_id in cls.case_ids:
                create_form_for_test(cls.domain, case_id)

        cls.es = get_es_new()
        cls.es_interface = ElasticsearchInterface(cls.es)
        cls.index = TEST_INDEX_INFO.index

        with trap_extra_setup(ConnectionError):
            ensure_index_deleted(cls.index)
            initialize_index_and_mapping(cls.es, TEST_INDEX_INFO)
Example #23
    def setUp(self):
        self.index = TEST_ES_INFO.alias
        self.type = TEST_ES_INFO.type
        self.es = get_es_new()
        # tweak mapping
        self.mapping = {"properties": {"message": {"type": "string"}}}
        meta = {"mapping": self.mapping}
        # setup index
        if self.es.indices.exists(self.index):
            self.es.indices.delete(self.index)
        self.es.indices.create(index=self.index, body=meta)

        # insert a doc so we get some mapping data
        interface = ElasticsearchInterface(self.es)
        ident = uuid.uuid4().hex
        doc = {"message": "hello"}
        interface.index_doc(self.index, self.type, ident, doc)
        self.es.indices.refresh(self.index)
Example #24
def delete_case_search_cases(domain):
    if domain is None or isinstance(domain, dict):
        raise TypeError("Domain attribute is required")

    get_es_new().indices.refresh(CASE_SEARCH_INDEX)
    case_ids = CaseSearchES().domain(domain).values_list('_id', flat=True)

    op_kwargs = {
        "_op_type": "delete",
        "_index": CASE_SEARCH_INDEX_INFO.alias,
        "_type": CASE_ES_TYPE,
    }
    if settings.ELASTICSEARCH_MAJOR_VERSION == 7:
        op_kwargs.pop('_type')

    ElasticsearchInterface(get_es_new()).bulk_ops([{
        **op_kwargs,
        "_id": case_id,
    } for case_id in case_ids])
Example #25
    def process_bulk_docs(self, docs):
        if len(docs) == 0:
            return True

        pillow_logging.info("Processing batch of %s docs", len(docs))

        changes = [self._doc_to_change(doc) for doc in docs]
        error_collector = ErrorCollector()

        bulk_changes = build_bulk_payload(self.index_info, changes,
                                          self.doc_transform, error_collector)

        for change, exception in error_collector.errors:
            pillow_logging.error("Error procesing doc %s: %s (%s)", change.id,
                                 type(exception), exception)

        es_interface = ElasticsearchInterface(self.es)
        try:
            es_interface.bulk_ops(bulk_changes)
        except Exception:
            pillow_logging.exception("\tException sending payload to ES")
            return False

        return True
Example #26
def doc_exists_in_es(index_info, doc_id):
    """
    Check if a document exists
    """
    return ElasticsearchInterface(get_es_new()).doc_exists(
        index_info.alias, doc_id, index_info.type)
Example #27
def scroll_query(index_name, q, es_instance_alias=ES_DEFAULT_INSTANCE):
    es_meta = ES_META[index_name]
    es_interface = ElasticsearchInterface(get_es_instance(es_instance_alias))
    return es_interface.scan(es_meta.alias, q, es_meta.type)
Example #28
def count_query(index_name, q):
    es_meta = ES_META[index_name]
    es_interface = ElasticsearchInterface(get_es_new())
    return es_interface.count(es_meta.alias, es_meta.type, q)
Example #29
def es_query(params=None,
             facets=None,
             terms=None,
             q=None,
             es_index=None,
             start_at=None,
             size=None,
             dict_only=False,
             fields=None,
             facet_size=None):
    if terms is None:
        terms = []
    if q is None:
        q = {}
    else:
        q = copy.deepcopy(q)
    if params is None:
        params = {}

    q["size"] = size if size is not None else q.get("size", SIZE_LIMIT)
    q["from"] = start_at or 0

    def get_or_init_anded_filter_from_query_dict(qdict):
        and_filter = qdict.get("filter", {}).pop("and", [])
        filter = qdict.pop("filter", None)
        if filter:
            and_filter.append(filter)
        return {"and": and_filter}

    filter = get_or_init_anded_filter_from_query_dict(q)

    def convert(param):
        # TODO: find a better way to handle bools; something that won't break fields that may be 'T' or 'F' but not bool
        if param == 'T' or param is True:
            return 1
        elif param == 'F' or param is False:
            return 0
        return param

    for attr in params:
        if attr not in terms:
            if isinstance(params[attr], list):
                attr_val = [convert(p) for p in params[attr]]
            else:
                attr_val = [convert(params[attr])]
            filter["and"].append({"terms": {attr: attr_val}})

    if facets:
        q["facets"] = q.get("facets", {})
        if isinstance(facets, list):
            for facet in facets:
                q["facets"][facet] = {
                    "terms": {
                        "field": facet,
                        "size": facet_size or SIZE_LIMIT
                    }
                }
        elif isinstance(facets, dict):
            q["facets"].update(facets)

    if filter["and"]:
        query = q.pop("query", {})
        q["query"] = {
            "filtered": {
                "filter": filter,
            }
        }
        q["query"]["filtered"]["query"] = query if query else {"match_all": {}}

    if fields is not None:
        q["fields"] = q.get("fields", [])
        q["fields"].extend(fields)

    if dict_only:
        return q

    es_index = es_index or 'domains'
    es_interface = ElasticsearchInterface(get_es_new())
    meta = ES_META[es_index]

    try:
        result = es_interface.search(meta.index, meta.type, body=q)
        report_and_fail_on_shard_failures(result)
    except ElasticsearchException as e:
        raise ESError(e)

    if fields is not None:
        for res in result['hits']['hits']:
            flatten_field_dict(res)

    return result
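
A hypothetical call showing how params, facets, and fields combine; every value is illustrative. Each params entry becomes an anded terms filter, and convert() maps True to 1:

result = es_query(
    params={'is_active': True},
    facets=['internal.area'],
    fields=['name', 'is_active'],
    size=25,
)
hits = result['hits']['hits']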
Example #30
    def __init__(self, domain):
        super(ESView, self).__init__()
        self.domain = domain.lower()
        self.es = get_es_new()
        self.es_interface = ElasticsearchInterface(self.es)