Example No. 1
class ES(object):
    def __init__(self):
        self.es = Elasticsearch()
        self.id = 0

    def insert_es(self, id, good, description):
        doc = {
            'id': id,
            'good': good,
            'description': description
            }
        res = self.es.index(index="test-index", doc_type='description_goods', id=self.id, body=doc)
        #print(res['created'])
        res = self.es.get(index="test-index", doc_type='description_goods', id=self.id)
        #print(res['_source'])
        self.es.indices.refresh(index="test-index")
        self.id += 1

    def search_es(self, what, query):
        res = self.es.search(index="test-index", body={"query": {"match": {what: query}}})  #"author": 'kimchy'
        print("Got %d Hits" % res['hits']['total'])
        documents = []
        for hit in res['hits']['hits']:
            #print hit
            documents.append(hit['_source'])
        return documents

    def del_by_query(self, query):
        # `query` is expected to be a {field: value} dict for the match clause
        res = self.es.delete_by_query(index="test-index", body={"query": {"match": query}})

    def del_all(self):
        res = self.es.delete_by_query(index="test-index", body={"query": {"match_all": {}}}) #{"match_all": {}}
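A minimal usage sketch for the class above, assuming a local Elasticsearch node on the default port and a pre-7.x client (the class relies on doc_type and an integer hits total); the goods and descriptions are made-up illustration values:

store = ES()
store.insert_es(1, 'laptop', '13-inch ultrabook, 16 GB RAM')
store.insert_es(2, 'mouse', 'wireless optical mouse')

# search the description field and print what came back
for doc in store.search_es('description', 'wireless'):
    print(doc['good'], '->', doc['description'])

store.del_all()  # wipe test-index via delete_by_query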
Example No. 2
class ES(object):
    def __init__(self):
        self.es = Elasticsearch()
        self.id = 0

    def insert_es(self, id, good, description):
        doc = {
            'id': id,
            'good': good,
            'description': description
            }
        res = self.es.index(index="test-index", doc_type='description_goods', id=self.id, body=doc)
        print(res['created'])
        res = self.es.get(index="test-index", doc_type='description_goods', id=self.id)
        print(res['_source'])
        self.es.indices.refresh(index="test-index")
        self.id += 1

    def search_es(self, what, query):
        res = self.es.search(index="test-index", body={"query": {"match": {what: query}}})  #"author": 'kimchy'
        print("Got %d Hits" % res['hits']['total'])
        documents = []
        for hit in res['hits']['hits']:
            print(hit)
            documents.append(hit['_source'])
        return documents

    def del_by_query(self, query):
        # `query` is expected to be a {field: value} dict for the match clause
        res = self.es.delete_by_query(index="test-index", body={"query": {"match": query}})

    def del_all(self):
        res = self.es.delete_by_query(index="test-index", body={"query": {"match_all": {}}}) #{"match_all": {}}
Example No. 3
def delete_duplicate_titles(index, duplicate_dict):
    es = Elasticsearch()
    for k, v in duplicate_dict.items():
        if len(v) > 1:
            for i_id in v[1:]:
                query_dict = {'query': {'match': {'_id': i_id}}}
                es.delete_by_query(index=index, body=query_dict)
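The duplicate_dict argument is not shown in this snippet; a plausible shape, assuming it maps a title to the list of document ids sharing that title (the first id in each list is kept, the rest are deleted), might be:

# Hypothetical input for delete_duplicate_titles; index name and ids are placeholders.
duplicate_dict = {
    'Attention Is All You Need': ['a1', 'b7', 'c3'],  # keep 'a1', delete 'b7' and 'c3'
    'A Unique Title': ['d9'],                         # single copy, nothing deleted
}
delete_duplicate_titles('papers', duplicate_dict)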
Example No. 4
class ElasticSilo:
    """ A silo is where we store stuff that has been harvested.
        Store features in elastic search"""
    def __init__(self, args):
        """ initialize, set endpoint & index name """
        self._es = Elasticsearch([args.elastic_search])
        self._index = args.elastic_index

    def __str__(self):
        return "ElasticSilo es:{} idx:{}".format(self._es, self._index)

    def delete_all(self):
        """delete index"""
        try:
            indices_client = IndicesClient(self._es)
            indices_client.delete(index=self._index)
        except Exception as e:
            _eprint("exception on delete_index {}".format(e))
            pass

    def delete_source(self, source):
        """ delete source from index """
        try:
            query = {
                "query": {
                    "query_string": {
                        "analyze_wildcard": True,
                        "query": "source:{}".format(source)
                    }
                }
            }
            self._es.delete_by_query(index=self._index, body=query)
        except Exception as e:
            _eprint(e, query)
            pass

    def _stringify_sources(self, feature_association):
        """ Maintaining the original document causes a 'field explosion'
        thousands on fields in a document. So, for now at least,
        maintain it as a string.
        """
        sources = ['cgi', 'jax', 'civic', 'oncokb', 'molecularmatch', 'pmkb']
        for source in sources:
            if source in feature_association:
                if not isinstance(feature_association[source], basestring):
                    feature_association[source] = json.dumps(
                        feature_association[source])  # NOQA
        return feature_association

    def save(self, feature_association):
        """ write to es """
        # prevent field explosion
        feature_association = self._stringify_sources(feature_association)
        result = self._es.index(index=self._index,
                                body=feature_association,
                                doc_type='association',
                                op_type='index')
        if result['_shards']['failed'] > 0:
            _eprint('failure updating association {}'.format(
                feature_association))
Example No. 5
def clean_elasticsearch_data(**context):
    elasticsearch_conf = extract_context_conf(ELASTICSEARCH_CONF_KEY_NAME,
                                              **context)

    # Skip if the "clean data" flag is turned off (the flag is turned on by default).
    if not elasticsearch_conf.get(CLEAN_DATA_KEY_NAME, True):
        return

    elasticsearch = Elasticsearch(hosts=[elasticsearch_host],
                                  port=int(elasticsearch_port))
    indices = elasticsearch.cat.indices(h="index").encode("utf-8").split("\n")

    for index in indices:
        # Skip system indices and empty entries.
        if not index.startswith(".") and not index == "":
            if index.startswith(
                ('presidio-monitoring', 'metricbeat', 'packetbeat')):
                elasticsearch.indices.delete(index=index,
                                             ignore=[404],
                                             request_timeout=360)
            else:
                elasticsearch.delete_by_query(
                    index=index,
                    body="{\"query\": {\"match_all\": {}}}",
                    request_timeout=360)
Example No. 6
def delete_item_from_date(index_name, str_date):
    print("delete_es")
    try:
        conn = Elasticsearch(hosts="168.1.1.195", port=9200)
        conn.delete_by_query(index=index_name, body={"query": {"match_phrase": {"DEAL_YMD": str_date}}})
    except Exception as ex:
        print("Elasticsearch error occurred:", ex)
        pass
Example No. 7
def delete_last_phase(index, event_date):

    ES_HOST = {"host": "localhost", "port": 9200}
    es = Elasticsearch()
    project_name = 'measure_' + index

    query = {"query": {"match": {"insert_date": event_date}}}

    es.delete_by_query(index=project_name, doc_type='metric', body=query)
Example No. 8
class zebrasearch():
    """
    连接Elaticsearch
    """
    def connect_es(self, host, port):
        self.es = Elasticsearch([{u'host': host, u'port': port}], timeout=3600)

    """
    连接到mongodb
    """

    def connect_mongo(self, host, port):
        self.client = MongoClient(host, port)

    """
    将mongodb中的db数据库的collection插入
    elaticsearch的index索引的types中
    """

    def mongo2es(self, db, collection, index, types):
        db = self.client[db]
        collection = db[collection]
        count = 0
        inserted = False
        actions = []
        tmp = collection.find().skip(SKIPNUM * ONCE).limit(ONCE)
        for item in tmp:
            inserted = False
            item = dict(item)
            item.pop('_id')
            # for p in item['paper']:
            #     if '_id' in p.keys():
            #         p.pop('_id')
            action = {"_index": index, "_type": types, "_source": item}
            actions.append(action)
            count += 1
            print('Paper ' + str(SKIPNUM * ONCE + count) + ' added to the action list')
            try:
                if len(actions) == INSERT_NUM:
                    print("Preparing to insert papers up to number " + str(SKIPNUM * ONCE + count))
                    helpers.bulk(client=self.es, actions=actions)
                    inserted = True
                    actions.clear()
            except Exception:
                actions.clear()
                ERROR_ELE.append(SKIPNUM * ONCE + count)
        if not inserted:
            helpers.bulk(client=self.es, actions=actions)

    """
    将es的index索引的types清空
    """

    def cleartypes(self, index, types):
        query = {'query': {'match_all': {}}}
        self.es.delete_by_query(index=index, body=query, doc_type=types)
Example No. 9
def delete_records(index,
                   filter_key,
                   filter_value,
                   host="localhost",
                   port=9200):
    es = Elasticsearch(hosts=[{'host': host, 'port': port}])

    if es.indices.exists(index):
        query = fill_base_query(filter_key, filter_value)
        es.delete_by_query(index=index, body=query, refresh=True)
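fill_base_query is defined elsewhere in the project; a minimal sketch of what it might return, assuming it builds a single-field term filter:

def fill_base_query(filter_key, filter_value):
    # Hypothetical helper: select documents whose filter_key equals filter_value.
    return {'query': {'term': {filter_key: filter_value}}}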
Example No. 10
def delete_from_ES(event_id):

    es_host = "https://vpc-tasktrigger-domain-bmmm3cd2xeh4x3iex5aug35u3q.us-east-1.es.amazonaws.com"
    es_port = 443
    es = Elasticsearch([
        es_host + ":" + str(es_port),
    ])

    doc = {"query": {"term": {"event_id": event_id}}}
    es.delete_by_query(index='events', doc_type='Event', body=doc)
    print("Event: {} deleted from ES".format(event_id))
Example No. 11
def delById(index, ids, hosts=ELASTICSEARCH_SERVER):
    """
    根据失败的详情博客id删除对应的图片信息
    :param index: 请求失败的博文index
    :param ids: 请求失败的博文id
    :param hosts: :param hosts: es主机地址
    :return:
    """
    es = Elasticsearch(hosts=hosts)
    query = {'query': {'match': {'_id': ids}}}
    es.delete_by_query(index=index, doc_type='_doc', body=query)
Example No. 12
File: user.py Project: SEOBJ/noteit
    def delete(self):
        el = Elasticsearch(port=port)
        el.delete_by_query(index='users',
                           doc_type='user',
                           body={'query': {
                               'match': {
                                   "_id": self._id
                               }
                           }})

        Database.remove(UserConstants.COLLECTION, {'_id': self._id})
Example No. 13
def delete_model(es_object: Elasticsearch, model_id: str, models: dict):
    es_object.delete_by_query(
        index='ai_models',
        doc_type='_doc',
        body={'query': {
            'match': {
                'id_model': model_id
            }
        }})
    es_object.indices.delete(index=model_id, ignore=[400, 404])
    models.pop(model_id, None)
Example No. 14
class Deletor(object):
    def __init__(self, url, index, username, password):
        self.es_url = url
        self.es_main_index = index
        self.es = Elasticsearch(self.es_url,
                                http_auth=(username, password),
                                use_ssl=True,
                                verify_certs=True,
                                connection_class=RequestsHttpConnection,
                                timeout=30,
                                max_retries=10,
                                retry_on_timeout=True)

    def delete(self, type):
        # make sure index exists
        indice = client.IndicesClient(self.es)
        try:
            if indice.exists(self.es_main_index):
                # if type is 'all' delete everything
                if type == 'all':
                    try:
                        self.es.delete_by_query(index=self.es_main_index,
                                                body=match_all,
                                                conflicts='proceed')
                        print('Deleted ' + self.es_main_index)
                        return True
                    except ConnectionError:
                        print(
                            'There was a connection error. Check your Elastic'
                            ' Search setting and make sure Elastic Search is'
                            ' running.')
                        return False
                elif type:
                    try:
                        if indice.exists_type(index=self.es_main_index,
                                              doc_type=type):
                            self.es.delete_by_query(index=self.es_main_index,
                                                    doc_type=type,
                                                    body=match_all,
                                                    conflicts='proceed')
                            print('Deleted ' + self.es_main_index + '/' + type)
                            return True
                    except ConnectionError:
                        print(
                            'There was a connection error. Check your Elastic'
                            ' Search setting and make sure Elastic Search is'
                            ' running.')
                        return False
        except TransportError:
            print('Incorrect username or password')
            return False
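The Deletor class relies on names that sit outside the snippet (client, RequestsHttpConnection, ConnectionError, TransportError, and a module-level match_all body); a minimal sketch of those pieces plus hypothetical usage, with the URL, credentials, and index name as placeholders:

from elasticsearch import Elasticsearch, RequestsHttpConnection, client
from elasticsearch.exceptions import ConnectionError, TransportError

# match_all body used by Deletor.delete()
match_all = {'query': {'match_all': {}}}

deletor = Deletor('https://es.example.com:9200', 'my-index', 'elastic', 'secret')
deletor.delete('all')         # wipe every document in the index
deletor.delete('my_doctype')  # wipe a single mapping type (pre-7.x clusters only)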
Example No. 15
 def delete_on_elastic(self):
     el = Elasticsearch(port=port)
     body = {
         "query": {
             "match": {
                 "note_id": self._id
             }
         }
     }
     el.delete_by_query(index="notes", doc_type="note", body=body)
     del el
     return True
Example No. 16
class Elastic:

    def __init__(self, cloud_id, username, password):
        self.es = Elasticsearch(cloud_id=cloud_id,
                                http_auth=(username, password))

    def load_stream_analytics(self, **args):
        """Wrapper to load simulation data"""
        return None

    def load_data(self, data, index):
        """
        to load array of tweets to elastic search
        :param data: (list of json objects)
        :param index: (string) name of the elastic search index to load data to
        :return: None
        """

        # to make the index if it doesn't exist
        self.es.index(index, data[0], id=0)

        # Bulk insert
        actions = [{
            "_index": index,
            "_type": "_doc",
            "_id": j,
            "_source": data[j]
        } for j in range(0, len(data))]
        helpers.bulk(self.es, actions)
        print("insert successful")
        self.es.indices.refresh(index=index)
        print("data loaded to elastic")

    def delete_data(self, index, id_range):
        """
        :param index: (string) elastic search index from which docs are to be deleted
        :param id_range: (int) id range to be deleted
        :return: None
        """
        # Bulk delete
        actions = [{
            "_op_type": 'delete',
            "_index": index,
            "_id": j,
        } for j in range(0, id_range)]
        helpers.bulk(self.es, actions)

    def clear_data(self, index):
        """
        :param index: (string) elastic search index to be cleared
        :return: None
        """
        self.es.delete_by_query(index, body={"query": {"match_all": {}}})
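A brief usage sketch for the Elastic wrapper above; the cloud_id, credentials, and tweet data are placeholders:

wrapper = Elastic(cloud_id='deployment-name:ABCDEF==', username='elastic', password='changeme')
tweets = [{'text': 'hello'}, {'text': 'world'}]
wrapper.load_data(tweets, index='tweets')            # documents get ids 0..len(tweets)-1
wrapper.delete_data('tweets', id_range=len(tweets))  # bulk-delete those ids again
wrapper.clear_data('tweets')                         # or wipe the index with delete_by_query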
Example No. 17
 def delete_data(self):
     es = Elasticsearch('39.107.66.190:9200')
     # delete all documents matching the given serialNo and is_result
     # (a `match` query accepts a single field, so two clauses are combined with bool/must)
     query = {
         'query': {
             'bool': {
                 'must': [
                     {'match': {"serialNo": 368389656708132864}},
                     {'match': {"is_result": 1}}
                 ]
             }
         }
     }
     es.delete_by_query(index="monitor", body=query)
Example No. 18
def deleteSingleElement():
    logList = None

    _index = request.form.get('_index')
    _id = request.form.get('_id')

    es = Elasticsearch([{
        'host': app.config['ELASTICSEARCH_URI'],
        'port': app.config['ELASTICSEARCH_PORT']
    }])

    deleteDoc = {
        "query": {
            "bool": {
                "must": [{
                    "match": {
                        "analysis_info_id": {
                            "query": _id,
                            "type": "phrase"
                        }
                    }
                }]
            }
        }
    }

    try:
        res = es.delete(index=_index, doc_type="analysis_info", id=_id)
    except Exception as e:
        raise InvalidUsage('Elastic error ' + e.message, status_code=501)

    try:
        res = es.delete_by_query(index=_index,
                                 body=deleteDoc,
                                 doc_type="analysis_file_detail_info",
                                 request_timeout=360)
        #res = es.delete(index=_index, doc_type="analysis_file_detail_info", id=_id)
    except Exception as e:
        pass

    try:
        res = es.delete_by_query(index=_index,
                                 body=deleteDoc,
                                 doc_type="analysis_url_detail_info",
                                 request_timeout=360)
    except Exception as e:
        pass

    return json.dumps({'success': True}), 200, {
        'ContentType': 'application/json'
    }
Example No. 19
class MyElasticsearch:
    def __init__(self):
        self.es = Elasticsearch()
        self.offset = 30000
        self.doc_index = "ecg"

    def getTimeStr(self, t):
        return datetime.fromtimestamp(int(
            t / 1000)).strftime('%Y-%m-%d %H:%M:%S.') + str(int(t))[-3:]

    #
    # push data into elasticsearch
    #
    def push(self, data):
        if "ecg" not in data:
            print "'ecg' not in data"
            return
        if "start_ecg" not in data["ecg"]:
            print "'start_ecg' not in data.ecg"
            return
        ecg = data["ecg"]
        start_ecg = float(ecg["start_ecg"])
        items = ecg["data"]
        doc_type = data["client_id"]

        # delete 5 seconds older data
        oldTime = self.getTimeStr(start_ecg - 5000)

        self.es.delete_by_query(
            index=self.doc_index,
            doc_type=doc_type,
            body={"query": {
                "range": {
                    "time": {
                        "lt": oldTime
                    }
                }
            }})
        for item in items:
            doc = {}
            # convert ecg_time into readable format: "yyyy-mm-dd HH:MM:SS.sssZ"
            doc["time"] = self.getTimeStr(start_ecg)
            start_ecg += 4.7
            # elasticsearch does not accept negative items,
            # so add an offset to make them positive
            doc["ecg"] = int(item) + self.offset
            print self.es.index(index=self.doc_index,
                                doc_type=doc_type,
                                id=int(start_ecg),
                                body=doc)
Example No. 20
    def get(self, request, *args, **kwargs):
        ret = {"code": -1, "desc": ""}
        try:
            es = Elasticsearch(settings.ELASTICSEARCH_HOST)
            es.delete_by_query(index='bookshelf',
                               body={"query": {
                                   "match_all": {}
                               }})
            ret["code"] = 1
            ret["desc"] = "그리고 아무도 없었다."
        except Exception as e:
            ret["desc"] = str(e)

        return JsonResponse(ret)
Example No. 21
def _delete_subscription(es_client: Elasticsearch, uuid: str):
    _unregister_percolate(es_client, uuid)
    es_client.delete_by_query(index="_all",
                              doc_type=ESDocType.subscription.name,
                              body={
                                  "query": {
                                      "ids": {
                                          "type": ESDocType.subscription.name,
                                          "values": [uuid]
                                      }
                                  }
                              },
                              conflicts="proceed",
                              refresh=True)
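ESDocType and _unregister_percolate come from the surrounding project and are not shown; minimal stand-ins, assuming ESDocType is an Enum of document types and the percolate registration uses the subscription's uuid as its document id:

from enum import Enum, auto

class ESDocType(Enum):
    # Hypothetical document types
    doc = auto()
    query = auto()
    subscription = auto()

def _unregister_percolate(es_client: Elasticsearch, uuid: str):
    # Hypothetical cleanup: remove the registered percolate query with the same id
    es_client.delete_by_query(index='_all',
                              body={'query': {'ids': {'values': [uuid]}}},
                              conflicts='proceed',
                              refresh=True)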
Example No. 22
    def update_to_elastic(self):
        el = Elasticsearch(port=port)
        doc1 = {"query": {"match": {'note_id': self._id}}}
        doc2 = {
            'box_id': self._id,
            'name': self.name,
            'notes': self.notes,
            'created_date': self.created_date.strftime('%Y-%m-%d'),
            'maker_id': self.maker_id
        }

        el.delete_by_query(index="boxs", doc_type='box', body=doc1)
        el.index(index="boxs", doc_type='box', body=doc2)
        del el
        return True
Example No. 23
def del_es_his(content_id):
    try:
        from elasticsearch import Elasticsearch
        es = Elasticsearch(config.es_ip, port=config.es_port)

        indexs = "recommendhist"
        doc_type1 = "test"
        content_id = int(content_id)
        query = {
            "query": {
                "term": {
                    "contentId": {
                        "value": content_id
                    }
                }
            }
        }
        res = es.delete_by_query(index=indexs, doc_type=doc_type1, body=query)
        out = {
            'status': 200,
            'del_num': res.get('total')
        }

        return json.dumps(out)
    except Exception as e:
        print(e)
        out = {
            'status': 404,
            'message': 'errors on del_es_his',
            'error': str(e)
        }

        return json.dumps(out)
Example No. 24
    def update_to_elastic(self):
        el = Elasticsearch(port=port)
        doc1 = {"query": {"match": {'group_id': self._id}}}
        doc2 = {
            "group_id": self._id,
            "name": self.name,
            "members": self.members,
            "description": self.description,
            "shared_notes": self.shared_notes,
            "shared": self.shared
        }

        el.delete_by_query(index="groups", doc_type='group', body=doc1)
        el.index(index="groups", doc_type='group', body=doc2)
        del el
        return True
Example No. 25
class DB:
    def __init__(self):
        self.conn = Elasticsearch(os.environ["ELASTICSEARCH_HOST"], timeout=60)

    def query(self, method, index, query):
        method = method.lower()
        assert method in ["create", "read", "update", "delete"]

        if method == "create":
            return self.create(index, query)
        if method == "read":
            return self.read(index, query)
        if method == "update":
            return self.update(index, query)
        if method == "delete":
            return self.delete(index, query)

    def create(self, index, docs):
        response = bulk(self.conn, docs, index=index)
        return response

    def read(self, index, query):
        return [[doc["_id"], doc["_source"]]
                for doc in scan(self.conn, index=index, query=query)]

    def update(self, index, query):
        response = self.conn.update_by_query(index=index, body=query)
        return response

    def delete(self, index, query):
        response = self.conn.delete_by_query(index=index, body=query)
        return response
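A short usage sketch for the DB wrapper, assuming ELASTICSEARCH_HOST is set and that bulk and scan were imported from elasticsearch.helpers:

import os
os.environ.setdefault('ELASTICSEARCH_HOST', 'localhost:9200')  # placeholder host

db = DB()
db.query('create', 'articles', [{'title': 'hello'}, {'title': 'world'}])
rows = db.query('read', 'articles', {'query': {'match_all': {}}})        # [[_id, _source], ...]
db.query('delete', 'articles', {'query': {'match': {'title': 'hello'}}})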
Example No. 26
def lambda_handler(event, context):
    host = 'https://search-textwifi-jhk6t4hsbrgwl4636zeyrabk74.us-east-1.es.amazonaws.com'
    es = Elasticsearch([host], ca_certs=certifi.where())
    loc = event[0]["location"]
    q = {
        "query": {
            "bool": {
                "must": {
                    "match_all": {}
                },
                "filter": {
                    "geo_distance": {
                        "distance": "0.1km",
                        "location": loc
                    }
                }
            }
        }
    }

    response = es.delete_by_query(index='wifi', doc_type='hotspot', body=q)
    print(response)

    for w in event:
        print(w)
        hotspot = {}
        hotspot["name"] = str(w["name"])
        hotspot["strength"] = int(w["strength"])
        hotspot["location"] = w["location"]
        hotspot["loc_type"] = str(w["loc_type"])
        response = es.index(index='wifi', doc_type='hotspot', body=hotspot)

    return response
Example No. 27
def delete_company_events(es: Elasticsearch, company_id: str,
                          event_type: EventType, body: dict, **kwargs) -> dict:
    es_index = get_index_name(company_id, event_type.value)
    return es.delete_by_query(index=es_index,
                              body=body,
                              conflicts="proceed",
                              **kwargs)
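EventType and get_index_name are defined elsewhere; plausible stand-ins, assuming one index per company and event type:

from enum import Enum

class EventType(Enum):
    # Hypothetical event types
    TASK_CREATED = 'task_created'
    TASK_CLOSED = 'task_closed'

def get_index_name(company_id: str, event_type: str) -> str:
    # Hypothetical naming scheme: one index per company and event type
    return 'events-{}-{}'.format(company_id, event_type)

# Example call: drop every closed-task event for one company
# delete_company_events(es, 'acme', EventType.TASK_CLOSED, {'query': {'match_all': {}}})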
Example No. 28
def del_func(host, port, user, password, doc_type):
    logging.info("del_func start")
    logging.info("doc_type: %s" % doc_type)
    # es = Elasticsearch(["10.54.8.71:9200"])
    es = Elasticsearch([host], http_auth=(user, password), port=port)
    count = 1
    while count > 0:
        try:
            params = {"refresh": True}
            es.delete_by_query(index='bidder',
                               body='{"query":{"match_all":{}}}',
                               doc_type=doc_type,
                               params=params)
            count = get_count(es, doc_type)
        except ConnectionTimeout as e:
            logging.warn(e)
            count = 1  # 为了让循环不退出
            continue
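get_count is referenced but not included in the snippet; a minimal sketch, assuming it simply counts the remaining documents of that type in the bidder index:

def get_count(es, doc_type):
    # Hypothetical helper used by the deletion loop above
    res = es.count(index='bidder', doc_type=doc_type)
    return res['count']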
Example No. 29
    def update_to_elastic(self):
        el = Elasticsearch(port=port)
        doc1 = {"query": {"match": {"message_id": self._id}}}
        doc2 = {
            "title": self.title,
            "content": self.content,
            "reciver_id": self.reciver_id,
            "sender_id": self.sender_id,
            "sended_date": self.sended_date.strftime('%Y-%m-%d'),
            "readed_by_reciver": self.readed_by_reciver,
            "is_a_noteOBJ": self.is_a_noteOBJ,
            "message_id": self._id
        }

        el.delete_by_query(index="messages", doc_type='message', body=doc1)
        el.index(index="messages", doc_type='message', body=doc2)
        del el
        return True
Example No. 30
class Searcher(object):
    def __init__(self, hosts):
        self.es = Elasticsearch(hosts=hosts)

    def es_mapping(self, index, doc_type, mapping):
        return self.es.indices.put_mapping(index=index,
                                           doc_type=doc_type,
                                           body=mapping)

    def es_setting(self, index, setting):
        return self.es.indices.put_settings(index=index, body=setting)

    def delete_by_query(self, index, doc_type, query):
        return self.es.delete_by_query(index=index,
                                       doc_type=doc_type,
                                       body=query)

    def es_init(self):
        # create index mapping
        # self.es.indices.create()
        # create ingest pipeline

        pipeline_params = {
            "description":
            "attachment extractor pipeline",
            "processors": [
                {
                    "attachment": {
                        "field": "data",
                        "indexed_chars": -1
                    }
                },
                {
                    "remove": {
                        "field": "data"
                    }
                },
                # {
                #     "set":{
                #         "field":"publish",
                #         "value":"{{attachment.date}}"
                #     }
                # },
                # {
                #     "set":{
                #         "field":"language",
                #         "value":"{{attachment.language}}"
                #     }
                # },
            ]
        }
        return self.es.ingest.put_pipeline(id='attachment',
                                           body=pipeline_params)

    def es_search(self, index, doc_type, query):
        results = self.es.search(index=index, doc_type=doc_type, body=query)
        return results
Example No. 31
class MyElasticsearch:
    def __init__(self):
        self.es = Elasticsearch()
        self.offset = 30000
        self.doc_index = "ecg"

    def getTimeStr(self, t):
        return datetime.fromtimestamp(int(t/1000)).strftime('%Y-%m-%d %H:%M:%S.') + str(int(t))[-3:]

    #
    # push data into elasticsearch
    #
    def push(self, data):
        if "ecg" not in data:
            print "'ecg' not in data"
            return
        if "start_ecg" not in data["ecg"]:
            print "'start_ecg' not in data.ecg"
            return
        ecg = data["ecg"]
        start_ecg = float(ecg["start_ecg"])
        items = ecg["data"]
        doc_type = data["client_id"]

        # delete 5 seconds older data
        oldTime = self.getTimeStr(start_ecg - 5000)

        self.es.delete_by_query(index=self.doc_index, doc_type=doc_type, body={
            "query": {
                "range": {
                    "time": {"lt": oldTime}
                }
            }
        })
        for item in items:
            doc = {}
            # convert ecg_time into readable format: "yyyy-mm-dd HH:MM:SS.sssZ"
            doc["time"] = self.getTimeStr(start_ecg)
            start_ecg += 4.7
            # elasticsearch does not accept negative items,
            # so add an offset to make them positive
            doc["ecg"] = int(item) + self.offset
            print self.es.index(index=self.doc_index, doc_type=doc_type, id=int(start_ecg), body=doc)
Example No. 32
def es_post_test(esnode: Elasticsearch):
    allowed_indeces = ['review', 'comment', 'picture']
    parkid = 22222222222222
    pictureid = 111111111111111
    user = '******'
    for index in allowed_indeces:
        body = {}
        searchbody = {"query": {
                        "match": {
                            "user": user
                        }
                    }}
        if index == 'review':
            body = {
                'parkid': parkid,
                'rating': 5,
                'review': "wow great park!",
                'time': time.time(),
                'user': user
            }
        elif index == 'comment':
            body = {
                'picture_id': pictureid,
                'comment': "wow great picture!",
                'time': time.time(),
                'user': user
            }
        elif index == 'picture':
            body = {
                'parkid': parkid,
                'picture_url': "fake.com",
                'picture_id': pictureid,
                'time': time.time(),
                'user': user
            }

        esnode.index(index=index,
                     doc_type=index,
                     body=str(body).replace('\'', '\"'))
        print(esnode.search(index=index, body=str(searchbody).replace('\'', '\"')))
        time.sleep(2)
        esnode.delete_by_query(index=index, body=str(searchbody).replace('\'', '\"'))
Example No. 33
	def handle(self, *args, **options):	
		es = Elasticsearch(ES_URL)
		if args:
			if args[0]=='__ALL__':
				delete = es.delete_by_query(index="pod", doc_type='pod', body={"query":{"match_all":{}}})
				from pods.views import VIDEOS
				for pod in VIDEOS:
					res = es.index(index="pod", doc_type='pod', id=pod.id, body=pod.get_json_to_index(), refresh=True)
			else:
				for pod_id in args:
					try:
						pod = Pod.objects.get(pk=int(pod_id))
					except Pod.DoesNotExist:
						raise CommandError('Pod "%s" does not exist' % pod_id)
					res = es.index(index="pod", doc_type='pod', id=pod.id, body=pod.get_json_to_index(), refresh=True)
		else:
			print "******* Warning : you must give some arguments : %s *******" %self.args
Example No. 34
def push():
    host = os.environ.get('ELASTICSEARCH_HOST', 'localhost')
    connection = Elasticsearch([host])

    # Delete old markers or do initial setup
    try:
        print(connection.delete_by_query(index=[INDEX], doc_type=DOC_TYPE, q='*'))
    except NotFoundError:
        set_mapping()

    if True:
        # real  0m9.839s
        bulk(connection, get_bulk_ready_data())
    else:
        # real  0m30.341s
        for row in get_data():
            connection.create(
                index=INDEX,
                doc_type=DOC_TYPE,
                body=row,
                id=row['atlas_number'],
            )
Example No. 35
else:
    res = es.search(index='%s-*' % (args.index), body={'query': {'match_all': {}}})

doccount = res['hits']['total']
if doccount > 0:
    # get user confirmation to proceed
    print '%d documents found\n' % res['hits']['total']

    if not confirm(prompt='Delete these documents permanently?', resp=False):
        print 'Will NOT delete documents.  Exiting.'
        exit(0)

    # delete the records
    if args.filepath:
        es.delete_by_query(index='%s-*' % (args.index), body={'query': {'prefix': {'source.raw': '%s' % (args.filepath)}}})

    else:
        delres = es.indices.delete(index='%s-*' % (args.index), ignore=[400, 404])

else:
    print 'No matching documents in the %s index.  Nothing to delete.' % (args.index)

### reload from source files
if args.reload:
    # display files to be re-loaded
    matches = []

    if args.index and not args.filepath:
        for filepath in index_sourcedir_mapping[args.index]:
            matches = matches + file_path_matches(filepath)
Example No. 36
class ElasticSearchStorage(storage.Storage):
    '''
    Subclass of Storage.
    '''
    DEFAULTCONF = {
        'ENABLED': False,
        'host': 'localhost',
        'port': 9200,
        'index': 'multiscanner_reports',
        'metricbeat_enabled': True,
        'metricbeat_rollover_days': 7,
    }

    def setup(self):
        host_string = self.config['host']
        host_list = []
        for host in host_string.split(','):
            host_list.append(host.strip(' '))
        self.hosts = host_list
        self.port = self.config['port']
        self.index = self.config['index']
        self.doc_type = '_doc'
        self.es = Elasticsearch(
            hosts=self.hosts,
            port=self.port
        )

        # Reduce traceback output from the elasticsearch module
        es_logger = logging.getLogger('elasticsearch')
        es_logger.setLevel(logging.ERROR)

        # Create the index if it doesn't exist
        es_indices = self.es.indices
        # Add the template for Cuckoo
        with open(ES_TEMPLATE, 'r') as file_:
            template = json.loads(file_.read())
        if not es_indices.exists_template(ES_TEMPLATE_NAME):
            es_indices.put_template(
                name=ES_TEMPLATE_NAME,
                body=json.dumps(template)
            )

        # Try to create the index, pass if it exists
        try:
            es_indices.create(self.index)
        except TransportError:
            pass

        # Set the total fields limit
        try:
            es_indices.put_settings(
                index=self.index,
                body={'index.mapping.total_fields.limit': ES_MAX},
            )
        except TransportError:
            pass

        # Create de-dot preprocessor if doesn't exist yet
        try:
            dedot = self.es.ingest.get_pipeline('dedot')
        except TransportError:
            dedot = False
        if not dedot:
            script = {
                "source": """void dedot(def field) {
                        if (field != null && field instanceof HashMap) {
                            ArrayList replacelist = new ArrayList();
                            for (String key : field.keySet()) {
                                if (key.contains('.')) {
                                    replacelist.add(key)
                                }
                            }
                            for (String oldkey : replacelist) {
                                String newkey = /\\./.matcher(oldkey).replaceAll(\"_\");
                                field.put(newkey, field.get(oldkey));
                                field.remove(oldkey);
                            }
                            for (String k : field.keySet()) {
                                dedot(field.get(k));
                            }
                        }
                    }
                    dedot(ctx);"""
            }
            self.es.ingest.put_pipeline(id='dedot', body={
                'description': 'Replace dots in field names with underscores.',
                'processors': [
                    {
                        "script": script
                    }
                ]
            })

        return True

    def store(self, report):
        sample_ids = {}
        sample_list = []
        sample_tags = {}  # track in case we need to update sample instead of create

        for filename in report:
            report[filename]['filename'] = filename
            try:
                sample_id = report[filename]['SHA256']
            except KeyError:
                sample_id = uuid4()
            # Store metadata with the sample, not the report
            sample = {'doc_type': 'sample', 'filename': filename, 'tags': []}
            for field in METADATA_FIELDS:
                if field in report[filename]:
                    if len(report[filename][field]) != 0:
                        sample[field] = report[filename][field]
                    del report[filename][field]
            report[filename]['doc_type'] = {'name': 'report', 'parent': sample_id}
            sample_tags[sample_id] = sample.get('tags', [])

            # If there are Cuckoo results in the report, some
            # cleanup is needed for the report
            if 'Cuckoo Sandbox' in report[filename].keys():
                cuckoo_report = report[filename]['Cuckoo Sandbox']
                cuckoo_doc = {
                    'target': cuckoo_report.get('target'),
                    'summary': cuckoo_report.get('behavior', {}).get('summary'),
                    'info': cuckoo_report.get('info')
                }
                signatures = cuckoo_report.get('signatures')
                if signatures:
                    cuckoo_doc['signatures'] = process_cuckoo_signatures(signatures)

                dropped = cuckoo_report.get('dropped')
                if dropped:
                    cuckoo_doc['dropped'] = dropped

                procmemory = cuckoo_report.get('procmemory')
                if procmemory:
                    cuckoo_doc['procmemory'] = procmemory

                # TODO: add the API calls to the Cuckoo Report document
                # for process in cuckoo_report.get('behavior', {}).get('processes', []):
                #     process_pid = process['pid']
                #     cuckoo_doc['calls'] = {}
                #     cuckoo_doc['calls'][process_pid] = []
                #     for call in process['calls']:
                #         cuckoo_doc['calls'][process_pid].append(call)

                report[filename]['Cuckoo Sandbox'] = cuckoo_doc

            # Store report; let ES autogenerate the ID so we can save it with the sample
            try:
                report_result = self.es.index(index=self.index, doc_type=self.doc_type,
                                              body=report[filename],
                                              pipeline='dedot', routing=sample_id)
            except (TransportError, UnicodeEncodeError) as e:
                # If fail, index empty doc instead
                print('Failed to index that report!\n{}'.format(e))
                report_body_fail = {
                    'doc_type': {
                        'name': 'report',
                        'parent': sample_id,
                    },
                    'ERROR': 'Failed to index the full report in Elasticsearch',
                }
                if 'Scan Time' in report[filename]:
                    report_body_fail['Scan Time'] = report[filename]['Scan Time']
                report_result = self.es.index(index=self.index, doc_type=self.doc_type,
                                              body=report_body_fail,
                                              pipeline='dedot', routing=sample_id)

            report_id = report_result.get('_id')
            sample['report_id'] = report_id
            sample_ids[sample_id] = report_id

            sample_list.append(
                {
                    '_op_type': 'create',
                    '_index': self.index,
                    '_type': self.doc_type,
                    '_id': sample_id,
                    '_source': sample,
                    'pipeline': 'dedot'
                }
            )

        result = helpers.bulk(self.es, sample_list, raise_on_error=False)

        creation_errors = result[1]
        if not creation_errors:
            return sample_ids

        # Some samples already exist; update them to ref the new reports
        updates_list = []
        for err in creation_errors:
            if err['create']['status'] == 409:
                sid = err['create']['_id']
                rid = sample_ids[sid]
                updates_list.append(
                    {
                        '_op_type': 'update',
                        '_index': self.index,
                        '_type': self.doc_type,
                        '_id': sid,
                        'doc': {'report_id': rid},
                        'pipeline': 'dedot'
                    }
                )
                # Update tags
                for tag in sample_tags.get(sid, []):
                    self.add_tag(sid, tag)

        result = helpers.bulk(self.es, updates_list, raise_on_error=False)
        return sample_ids

    def get_report(self, sample_id, timestamp):
        '''Find a report for the given sample at the given timestamp, and
        return the report with sample metadata included.
        '''
        ts = str(timestamp).replace(' ', 'T')
        query = {
            "query": {
                "bool": {
                    "must": [
                        {"parent_id": {
                            "type": "report",
                            "id": sample_id
                        }},
                        {
                            "term": {
                                "Scan Metadata.Scan Time": ts
                            }
                        }
                    ]
                }
            }
        }

        try:
            result_search = self.es.search(
                index=self.index, doc_type=self.doc_type, body=query
            )
            result_report = result_search['hits']['hits'][0]

            result_sample = self.es.get(
                index=self.index, doc_type=self.doc_type,
                id=sample_id
            )
            del result_sample['_source']['report_id']
            del result_sample['_source']['doc_type']
            del result_report['_source']['doc_type']
            result = result_report['_source'].copy()
            result.update(result_sample['_source'])
            return result
        except Exception as e:
            print(e)
            return None

    def build_query(self, query_string):
        return {"query": {"query_string": {
            "default_operator": 'AND',
            "query": query_string}}}

    def search(self, query_string, search_type='default'):
        '''Run a Query String query and return a list of sample_ids associated
        with the matches. Run the query against all document types.
        '''
        if search_type == 'advanced':
            query = self.build_query(query_string)
        else:
            es_reserved_chars_re = r'([\+\-=\>\<\!\(\)\{\}\[\]\^\"\~\*\?\:\\/ ])'
            query_string = re.sub(es_reserved_chars_re, r'\\\g<1>', query_string)
            if search_type == 'default':
                query = self.build_query("*" + query_string + "*")
            elif search_type == 'exact':
                query = self.build_query("\"" + query_string + "\"")
            else:
                print('Unknown search type!')
                return None
        result = helpers.scan(
            self.es, query=query, index=self.index
        )

        matches = []
        for r in result:
            if r.get('_source', {}).get('doc_type', {}) == 'sample':
                field = '_id'
            else:
                field = '_routing'
            matches.append(r[field])
        return tuple(set(matches))

    def add_tag(self, sample_id, tag):
        script = {
            "script": {
                "inline": """def i = ctx._source.tags.indexOf(params.tag);
                    if (i == -1) { ctx._source.tags.add(params.tag); }""",
                "lang": "painless",
                "params": {
                    "tag": tag
                }
            }
        }

        try:
            result = self.es.update(
                index=self.index, doc_type=self.doc_type,
                id=sample_id, body=script
            )
            return result
        except Exception as e:
            # TODO: log exception
            return None

    def remove_tag(self, sample_id, tag):
        script = {
            "script": {
                "source": """def i = ctx._source.tags.indexOf(params.tag);
                    if (i > -1) { ctx._source.tags.remove(i); }""",
                "lang": "painless",
                "params": {
                    "tag": tag
                }
            }
        }

        try:
            result = self.es.update(
                index=self.index, doc_type=self.doc_type,
                id=sample_id, body=script
            )
            return result
        except Exception as e:
            # TODO: log exception
            return None

    def get_tags(self):
        script = {
            "query": {
                "match_all": {}
            },
            "aggs": {
                "tags_agg": {
                    "terms": {
                        "field": "tags.keyword",
                        "size": ES_MAX
                    }
                }
            }
        }

        result = self.es.search(
            index=self.index, doc_type=self.doc_type, body=script
        )
        return result['aggregations']['tags_agg']['buckets']

    def get_notes(self, sample_id, search_after=None):
        query = {
            "query": {
                "parent_id": {
                    "type": "note",
                    "id": sample_id
                }
            },
            "sort": [
                {
                    "timestamp": {
                        "order": "asc"
                    }
                },
                {
                    "_id": {
                        "order": "desc"
                    }
                }
            ]
        }
        if search_after:
            query['search_after'] = search_after

        result = self.es.search(
            index=self.index, doc_type=self.doc_type, body=query
        )
        return result

    def get_note(self, sample_id, note_id):
        try:
            result = self.es.get(
                index=self.index, doc_type=self.doc_type,
                id=note_id, routing=sample_id
            )
            return result
        except Exception as e:
            # TODO: log exception
            return None

    def add_note(self, sample_id, data):
        data['doc_type'] = {'name': 'note', 'parent': sample_id}
        data['timestamp'] = datetime.now().isoformat()
        result = self.es.index(
            index=self.index, doc_type=self.doc_type, body=data,
            routing=sample_id
        )
        if result['result'] == 'created':
            return self.get_note(sample_id, result['_id'])
        return result

    def edit_note(self, sample_id, note_id, text):
        partial_doc = {
            'doc': {
                "text": text
            }
        }
        print(partial_doc)
        result = self.es.update(
            index=self.index, doc_type=self.doc_type, id=note_id,
            body=partial_doc, routing=sample_id
        )
        if result['result'] == 'created':
            return self.get_note(sample_id, result['_id'])
        return result

    def delete_note(self, sample_id, note_id):
        result = self.es.delete(
            index=self.index, doc_type=self.doc_type, id=note_id,
            routing=sample_id
        )
        return result

    def delete(self, report_id):
        try:
            self.es.delete(
                index=self.index, doc_type=self.doc_type,
                id=report_id
            )
            return True
        except Exception as e:
            # TODO: log exception
            return False

    def delete_by_task_id(self, task_id):
        query = {
            "query": {
                "term": {
                    "Scan Metadata.Task ID": task_id
                }
            }
        }

        try:
            self.es.delete_by_query(
                index=self.index, doc_type=self.doc_type, body=query
            )
            return True
        except Exception as e:
            # TODO: log exception
            return False

    def teardown(self):
        pass

    def delete_index(self, index_prefix, days):
        '''
        Delete index equal to or older than days.
        '''
        try:
            ilo = curator.IndexList(self.es)
            ilo.filter_by_regex(kind='prefix', value=index_prefix)
            ilo.filter_by_age(source='name', direction='older', timestring='%Y.%m.%d', unit='days', unit_count=days)
            delete_indices = curator.DeleteIndices(ilo)
            delete_indices.do_action()
        except Exception as e:
            # TODO: log exception
            return False
Example No. 37
class DocManager():
    """The DocManager class creates a connection to the backend engine and
        adds/removes documents, and in the case of rollback, searches for them.

        The reason for storing id/doc pairs as opposed to docs is so that
        multiple updates to the same doc reflect the most up-to-date version as
        opposed to multiple, slightly different versions of a doc.

        We are using elastic native fields for _id and ns, but we also store
        them as fields in the document, due to compatibility issues.
        """

    def __init__(self, url, auto_commit=False, unique_key='_id', **kwargs):
        """ Establish a connection to Elastic
        """
        self.elastic = Elasticsearch(hosts=[url])
        self.auto_commit = auto_commit
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        if auto_commit:
            self.run_auto_commit()

    def stop(self):
        """ Stops the instance
        """
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Elastic

        If you'd like to have different types of document in your database,
        you can store the doc type as a field in Mongo and set doc_type to
        that field. (e.g. doc_type = doc['_type'])

        """
        doc_type = self.doc_type
        index = doc['ns']
        doc[self.unique_key] = str(doc["_id"])
        doc_id = doc[self.unique_key]
        try:
            self.elastic.index(index=index, doc_type=doc_type,
                               body=bsjson.dumps(doc), id=doc_id, refresh=True)
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed("Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed("Could not index document: %s"%(
                bsjson.dumps(doc)))

    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Elastic

        docs may be any iterable
        """

        def docs_to_upsert():
            doc = None
            for doc in docs:
                index = doc["ns"]
                doc[self.unique_key] = str(doc[self.unique_key])
                doc_id = doc[self.unique_key]
                yield { "index": {"_index": index, "_type": self.doc_type,
                                  "_id": doc_id} }
                yield doc
            if not doc:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
        try:
            self.elastic.bulk(doc_type=self.doc_type,
                              body=docs_to_upsert(), refresh=True)
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed("Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed(
                "Could not bulk-insert documents into Elastic")
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up, there is no
            # config file, but nothing to dump
            pass

    def remove(self, doc):
        """Removes documents from Elastic

        The input is a python dictionary that represents a mongo document.
        """
        try:
            self.elastic.delete(index=doc['ns'], doc_type=self.doc_type,
                                id=str(doc[self.unique_key]), refresh=True)
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed("Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed("Could not remove document: %s"%(
                bsjson.dumps(doc)))

    def _remove(self):
        """For test purposes only. Removes all documents in test.test
        """
        try:
            self.elastic.delete_by_query(index="test.test",
                                         doc_type=self.doc_type,
                                         body={"match_all":{}})
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed("Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed("Could not remove test documents")
        self.commit()

    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results"""
        try:
            first_response = self.elastic.search(*args, search_type="scan",
                                                 scroll="10m", size=100,
                                                 **kwargs)
            scroll_id = first_response.get("_scroll_id")
            expected_count = first_response.get("hits", {}).get("total", 0)
            results_returned = 0
            while results_returned < expected_count:
                next_response = self.elastic.scroll(scroll_id=scroll_id,
                                                    scroll="10m")
                results_returned += len(next_response["hits"]["hits"])
                for doc in next_response["hits"]["hits"]:
                    yield doc["_source"]
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed(
                "Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed(
                "Could not retrieve documents from Elastic Search")


    def search(self, start_ts, end_ts):
        """Called to query Elastic for documents in a time range.
        """
        return self._stream_search(index="_all",
                                   body={"query": {"range": {"_ts": {
                                       "gte": start_ts,
                                       "lte": end_ts
                                   }}}})

    def _search(self):
        """For test purposes only. Performs search on Elastic with empty query.
        Does not have to be implemented.
        """
        return self._stream_search(index="test.test",
                                   body={"query": {"match_all": {}}})

    def commit(self):
        """This function is used to force a refresh/commit.
        """
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commits to the Elastic server.
        """
        self.elastic.indices.refresh()

        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Elastic engine.
        """
        try:
            result = self.elastic.search(
                index="_all",
                body={
                    "query": {"match_all": {}},
                    "sort": [{"_ts":"desc"}]
                },
                size=1
            )["hits"]["hits"]
        except (es_exceptions.ConnectionError):
            raise errors.ConnectionFailed("Could not connect to Elastic Search")
        except es_exceptions.TransportError:
            raise errors.OperationFailed(
                "Could not retrieve last document from Elastic Search")

        return result[0]["_source"] if len(result) > 0 else None
Exemplo n.º 38
0
class ElasticSearchStorage(BaseStorage):
  SETTINGS_VALIDATORS = {
    'force_refresh': is_bool,
    'hosts': is_list,
    'index_template': is_non_empty_str,
    'index_prefix': is_non_empty_str,
    'replicas': is_int,
    'rollover_size': is_pos_int,
    'rollover_check_period_seconds': is_pos_int,
    'read_size': is_pos_int,
    'shards': is_pos_int,
  }

  def __init__(self, name, namespaces, **settings):
    super(ElasticSearchStorage, self).__init__(name, namespaces, **settings)
    self.setup_elasticsearch()
    self.index_manager = IndexManager(weakref.proxy(self))

  def setup_elasticsearch(self):
    self.es = Elasticsearch(hosts=self.hosts,
                            sniff_on_start=True,
                            sniff_on_connection_fail=True)

    # Load index template.
    template_path = os.path.join(os.path.dirname(__file__), INDEX_TEMPLATE)
    with open(template_path) as f:
      template = f.read()
      for var, value in {'id_field': ID_FIELD,
                         'timestamp_field': TIMESTAMP_FIELD,
                         'index_prefix': self.index_prefix,
                         'shards': self.shards,
                         'replicas': self.replicas}.iteritems():
        template = template.replace('{{ %s }}' % var, str(value))

    # Always update template (in case it's missing, or it was updated).
    self.es.indices.put_template(name=self.index_template, body=template)

  def is_alive(self):
    return self.es.ping()

  def _insert(self, namespace, stream, events, configuration):
    """
    `namespace` acts as db for different streams
    `stream` is the name of a stream and `events` is a list of events to
    insert.
    """
    index = self.index_manager.get_index(namespace)
    start_dts_to_add = set()

    def actions():
      for _id, event in events:
        dt = kronos_time_to_datetime(uuid_to_kronos_time(_id))
        start_dts_to_add.add(_round_datetime_down(dt))
        event['_index'] = index
        event['_type'] = stream
        event[LOGSTASH_TIMESTAMP_FIELD] = dt.isoformat()

        yield event

    list(es_helpers.streaming_bulk(self.es, actions(), chunk_size=1000,
                                   refresh=self.force_refresh))
    self.index_manager.add_aliases(namespace,
                                   index,
                                   start_dts_to_add)
  
  def _delete(self, namespace, stream, start_id, end_time, configuration):
    """
    Delete events with id > `start_id` and end_time <= `end_time`.
    """
    start_time = uuid_to_kronos_time(start_id)
    body_query = {
      'query': {
        'filtered': {
          'query': {'match_all': {}},
          'filter': {
            'bool': {
              'should': [
                {'range': {TIMESTAMP_FIELD: {'gt': start_time,
                                             'lte': end_time}}},
                {'bool': {
                  'must': [
                    {'range': {ID_FIELD: {'gt': str(start_id)}}},
                    {'term': {TIMESTAMP_FIELD: start_time}}
                    ]
                  }
                 }
                ]
              }
            }
          }
        }
      }
    query = {'index': self.index_manager.get_index(namespace),
             'doc_type': stream,
             'body': body_query,
             'ignore': 404,
             'allow_no_indices': True,
             'ignore_unavailable': True}
    try:
      # XXX: ElasticSearch does not return stats on deletions.
      # https://github.com/elasticsearch/elasticsearch/issues/6519
      count = self.es.count(**query).get('count', 0)
      if count:
        self.es.delete_by_query(**query)
      return count, []
    except Exception, e:
      return 0, [repr(e)]
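
# A hedged sketch (not part of the original class) of the count-then-delete
# workaround used in _delete() above: since delete_by_query historically did
# not report how many documents it removed, the count API is queried first.
# The index, doc_type and query body below are illustrative assumptions.
from elasticsearch import Elasticsearch

def delete_and_count(es, index, doc_type, body):
    # Count matching documents first, then issue the delete if there are any.
    count = es.count(index=index, doc_type=doc_type, body=body).get('count', 0)
    if count:
        es.delete_by_query(index=index, doc_type=doc_type, body=body)
    return count

# es = Elasticsearch()
# removed = delete_and_count(es, 'kronos_demo', 'demo_stream',
#                            {'query': {'match_all': {}}})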
Exemplo n.º 39
0
class ElasticSearchService:
    '''
    Service for operating with Elasticsearch APIs. Used when data indices are created/deleted and
    data is deleted/reindexed.
    '''

    ES_CONFIG_DIR = 'resources/es-config/'

    REFERENCE_DATA_INDEX_NAME = 'reference_data'
    REFERENCE_DATA_INDEX_FILENAME = ES_CONFIG_DIR + 'reference_data_index.json'
    REFERENCE_DATA_TYPE_MAPPING_FILENAME = ES_CONFIG_DIR + 'reference_data_type_mapping.json'

    ORGANIZATION_DATA_INDEX_NAME = 'organization_data'
    ORGANIZATION_DATA_INDEX_FILENAME = ES_CONFIG_DIR + 'organization_data_index.json'
    ORGANIZATION_DATA_TYPE_MAPPING_FILENAME = ES_CONFIG_DIR + 'organization_data_type_mapping.json'

    def __init__(self):
        self.es = Elasticsearch(
            [
            'http://localhost:9200/'
            ]
        )

    def index_exists(self, index):
        return self.es.indices.exists(index=index)

    def create_index(self, index, filename):
        print("Trying to create index " + index)
        return self._operation_ok(self.es.indices.create(index=index,body=self._get_json_file_as_str(filename)))

    def delete_index(self, index):
        print("Trying to delete index " + index)
        return self._operation_ok(self.es.indices.delete(index=index, ignore=[404]))

    def create_type_mapping(self, index, doc_type, filename):
        print("Trying to create mapping type " + doc_type + " for index " + index)
        return self._operation_ok(self.es.indices.put_mapping(index=index, doc_type=doc_type, body=self._get_json_file_as_str(filename)))

    def delete_and_update_indexable_data(self, index, doc_type, indexable_data_list):
        if len(indexable_data_list) > 0:
            self._delete_all_documents_from_index_with_type(index, doc_type)
            bulk_update_str = "\n".join(map(lambda idx_data: self._create_bulk_update_row_for_indexable_data(index, doc_type, idx_data), indexable_data_list))
            print("Trying to bulk update reference data with type " + doc_type + " to index " + index)
            return self._operation_ok(self.es.bulk(body=bulk_update_str))
        return None

    def _delete_all_documents_from_index(self, index):
        print("Trying to delete all documents from index " + index)
        return self._operation_ok(self.es.delete_by_query(index=index, body="{\"query\": { \"match_all\": {}}}"))

    def _delete_all_documents_from_index_with_type(self, index, doc_type):
        print("Trying to delete all documents from index " + index + " having type " + doc_type)
        return self._operation_ok(self.es.delete_by_query(index=index, doc_type=doc_type, body="{\"query\": { \"match_all\": {}}}"))

    def _create_bulk_update_row_for_indexable_data(self, index, doc_type, indexable_data_item):
        return "{\"index\":{\"_index\": \"" + index + "\", \"_type\": \"" + doc_type + "\", \"_id\":\"" + indexable_data_item.get_es_document_id() + "\"}}\n" + indexable_data_item.to_es_document()

    def _create_bulk_delete_row_indexable_data(self, index, doc_type, indexable_data_item):
       return "{\"delete\":{\"_index\": \"" + index + "\", \"_type\": \"" + doc_type + "\", \"_id\":\"" + indexable_data_item.get_es_document_id() + "\"}}"

    def _operation_ok(self, op_response):
        if op_response.get('acknowledged'):
            print("OK")
            return True
        return False

    def _get_json_file_as_str(self, filename):
        with open(filename) as json_data:
            return json.load(json_data)
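
# A hedged usage sketch for ElasticSearchService above (not part of the
# original); the stand-in data item class, doc_type and sample values are
# illustrative assumptions. Indexable data items only need to provide
# get_es_document_id() and to_es_document().
import json

class DemoIndexableData:
    def __init__(self, doc_id, body):
        self.doc_id = doc_id
        self.body = body

    def get_es_document_id(self):
        return self.doc_id

    def to_es_document(self):
        return json.dumps(self.body)

# service = ElasticSearchService()
# if not service.index_exists(ElasticSearchService.REFERENCE_DATA_INDEX_NAME):
#     service.create_index(ElasticSearchService.REFERENCE_DATA_INDEX_NAME,
#                          ElasticSearchService.REFERENCE_DATA_INDEX_FILENAME)
# service.delete_and_update_indexable_data(
#     ElasticSearchService.REFERENCE_DATA_INDEX_NAME, 'reference_data',
#     [DemoIndexableData('ref_1', {'code': 'fi', 'label': 'Finland'})])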
Exemplo n.º 40
0
class VWCollection(VWCallback):
    
    def __init__(self,items=[],**kwargs):
        self.bulk_chunk_size = kwargs.get('bulk_chunk_size',
            config.bulk_chunk_size)
        self._sort = []
        self.results_per_page = kwargs.get('results_per_page',
            config.results_per_page)
        self._querybody = querybuilder.QueryBody() # sets up the new query bodies

        if kwargs.get('base_obj'):
            self.base_obj = kwargs.get('base_obj')
        else:
            try:
                self.base_obj = self.__class__.__model__
            except AttributeError:
                raise AttributeError('Base object must contain a model or pass base_obj')

        self._es = Elasticsearch(config.dsn)
        self._esc = client.IndicesClient(self._es)

        if '__index__' in dir(self.base_obj):
            idx = self.base_obj.__index__
        else:
            idx = config.default_index

        self._search_params = []
        self._raw = {}
        self.idx = idx
        self.type = self.base_obj.__type__
        self._special_body = {}
        
        # special list of items that can be committed in bulk
        self._items = items 

    def search(self,q):
        self._search_params.append(q)
        return self

    # setup a raw request
    def raw(self, raw_request):
        self._raw = raw_request
        return self

    def filter_by(self, condition = 'and',**kwargs):
        if kwargs.get('condition'):
            condition=kwargs.get('condition')
            del kwargs['condition']

        condition = self._translate_bool_condition(condition)

        for k,v in kwargs.iteritems():
            if k == 'id' or k == 'ids':
                id_filter = v
                if not isinstance(id_filter, list):
                    id_filter = [id_filter]

                self._querybody.chain(qdsl.ids(id_filter), condition=condition)
            else:
                try:
                    analyzed = is_analyzed(getattr(self.base_obj, k))
                except AttributeError:
                    analyzed = is_analyzed(v)

                q_type = 'filter'
                if analyzed:
                    q_type = 'query'

                if isinstance(v, list):
                    # lists are treated like "OR" (using terms() on not_analyzed, bool/match on analyzed)
                    if analyzed:
                        match_queries = []
                        for item in v:
                            match_queries.append( qdsl.match(k,item) )
                        self._querybody.chain( qdsl.bool(qdsl.should(match_queries)), condition=condition,type=q_type )
                    else:
                        self._querybody.chain( qdsl.terms(k,v),condition=condition,
                            type=q_type)
                else:
                    #search_value = unicode(v)
                    if analyzed:
                        self._querybody.chain(qdsl.match(unicode(k), v), condition=condition,type=q_type)
                    else:
                        self._querybody.chain(qdsl.term(unicode(k), v), condition=condition,type=q_type)

        return self

    def multi_match(self, fields, query, **kwargs):
        self._querybody.chain(qdsl.multi_match(query, fields), condition=kwargs.get('condition', None), type='query')
        return self

    def exact(self, field, value,**kwargs):
        try:
            field_template = getattr( self.base_obj, field)

            if type(field_template) != ESType:
                field_template = create_es_type(field_template)

            for estype in [String,IP,Attachment]:
                if isinstance(field_template, estype) and field_template.analyzed == True:
                    logger.warn('%s types may not exact match correctly if they are analyzed' % unicode(estype.__class__.__name__))

        except AttributeError:
            logger.warn('%s is not in the base model.' % unicode(field))

        kwargs['type'] = 'filter'
        if isinstance(value, list):
            self._querybody.chain(qdsl.terms(field,value), **kwargs)
        else:
            self._querybody.chain(qdsl.term(field, value), **kwargs)

        return self


    def or_(self,*args):
        return ' OR '.join(args)

    def and_(self,*args):
        return ' AND '.join(args)

    def get(self,id, **kwargs):
        try:
            params = {'index':self.idx, 'doc_type':self.type, 'id':id}
            params.update(kwargs)
            doc = self._es.get(**params)
            if doc:
                return VWCollectionGen(self.base_obj, {'docs':[doc]})[0]

            return None

        except:
            # TODO. Discuss this. Should get() return None even on exceptions?
            return None

    def refresh(self, **kwargs):
        self._esc.refresh(index=self.idx, **kwargs)

    def get_in(self, ids,**kwargs):
        if len(ids) > 0: # check for ids. empty list returns an empty list (instead of exception)
            params = {'index':self.idx, 'doc_type':self.type, 'body':{'ids':ids}}
            params.update(kwargs);
            res = self._es.mget(**params)
            if res and res.get('docs'):
                return VWCollectionGen(self.base_obj, res)

        return []

    def get_like_this(self,doc_id,**kwargs):
        params = {'index':self.idx,'doc_type':self.type,'id':doc_id}
        params.update(kwargs)
        res = self._es.mlt(**params)

        if res and res.get('docs'):
            return VWCollectionGen(self.base_obj, res)
        else:
            return []

    def sort(self, **kwargs):
        for k,v in kwargs.iteritems():
            v = v.lower()
            if v not in ['asc','desc']:
                v = 'asc'

            self._sort.append('%s:%s' % (k,v))
        return self

    def clear_previous_search(self):
        self._raw = {}
        self._search_params = []
        self._special_body = {}
        self._querybody = querybuilder.QueryBody()

    def _create_search_params( self, **kwargs ):
        # before_query_build() is allowed to manipulate the object's internal state before we do stuff
        self._querybody = self.execute_callbacks('before_query_build', self._querybody )

        q = {
            'index': self.idx,
            'doc_type': self.type
        }

        if self._raw:
            q['body'] = self._raw
        elif len(self._search_params) > 0:
            kwargs['type'] = 'query'
            self._querybody.chain(qdsl.query_string(self.and_(*self._search_params)), **kwargs)
        else:
            q['body'] = qdsl.query(qdsl.match_all())

        if self._querybody.is_filtered() or self._querybody.is_query():
            q['body'] = self._querybody.build()

        # after_query_build() can manipulate the final query before being sent to ES
        # this is generally considered a bad idea but might be useful for logging
        q = self.execute_callbacks( 'after_query_build', q )

        logger.debug(json.dumps(q))
        return q

    def count(self):
        params = self._create_search_params()
        resp = self._es.count(**params)
        return resp.get('count')

    def __len__(self):
        return self.count()

    def limit(self,count):
        self.results_per_page = count
        return self

    def all(self,**kwargs):

        params = self._create_search_params()
        if not params.get('size'):
            params['size'] = self.results_per_page

        if kwargs.get('results_per_page') != None:
            kwargs['size'] = kwargs.get('results_per_page')
            del kwargs['results_per_page']

        if kwargs.get('start') != None:
            kwargs['from_'] = kwargs.get('start')
            del kwargs['start']

        logger.debug(json.dumps(self._sort))

        params.update(kwargs)
        if len(self._sort) > 0:
            if params.get('sort') and isinstance(params['sort'], list):
                params['sort'].extend(self._sort)
            else:
                params['sort'] = self._sort

        if params.get('sort'):
            if isinstance(params['sort'], list):
                params['sort'] = ','.join(params.get('sort'))
            else:
                raise TypeError('"sort" argument must be a list')

        logger.debug(json.dumps(params))
        results = self._es.search(**params)

        return VWCollectionGen(self.base_obj,results)

    def one(self,**kwargs):
        kwargs['results_per_page'] = 1
        results = self.all(**kwargs)
        try:
            return results[0]
        except IndexError:
            raise NoResultsFound('No result found for one()')

    # this is for legacy purposes in filter_by
    def _translate_bool_condition(self,_bool_condition):
        if _bool_condition == 'and':
            _bool_condition = 'must'
        elif _bool_condition == 'or':
            _bool_condition = 'should'
        elif _bool_condition == 'not':
            _bool_condition = 'must_not'

        # this is for things like geo_distance where we explicitly want the true and/or/not
        elif _bool_condition == 'explicit_and':
            _bool_condition = 'and'
        elif _bool_condition == 'explicit_or':
            _bool_condition = 'or'
        elif _bool_condition == 'explicit_not':
            _bool_condition = 'not'

        return _bool_condition

    def range(self, field, **kwargs):
        search_options = {}
        for opt in ['condition','minimum_should_match']:
            if opt in kwargs:
                search_options[opt] = kwargs.get(opt)
                del kwargs[opt]

        q = qdsl.range(field, **kwargs)
        if self._querybody.is_filtered():
            d = {'filter': q}
        else:
            d = {'query': q}

        if search_options:
            d.update(search_options)

        self._querybody.chain(d)

        return self

    def search_geo(self, field, distance, lat, lon,**kwargs):
        condition = kwargs.get('condition', 'and')
        if 'condition' in kwargs:
            del kwargs['condition']

        self._querybody.chain(qdsl.filter_(qdsl.geo_distance(field, [lon,lat], distance, **kwargs)), condition=condition)
        return self

    def missing( self, field, **kwargs):
        self._querybody.chain(qdsl.filter_(qdsl.missing(field)))
        return self

    def exists( self, field, **kwargs):
        self._querybody.chain(qdsl.filter_(qdsl.exists(field, **kwargs)))
        return self

    def delete(self, **kwargs):
        params = self._create_search_params()
        params.update(kwargs)
        self._es.delete_by_query(**params)

    def delete_in(self, ids):
        if not isinstance(ids, list):
            raise TypeError('argument to delete_in must be a list.')

        bulk_docs = []
        for i in ids:
            this_id = i
            this_type = self.base_obj.__type__
            this_idx = self.idx
            if isinstance(i, VWBase):
                this_id = i.id
                this_type = i.__type__
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass

            bulk_docs.append({'_op_type': 'delete', '_type': this_type, '_index': this_idx, '_id': this_id })

        return helpers.bulk( self._es, bulk_docs, chunk_size=self.bulk_chunk_size)

    # commits items in bulk
    def commit(self, callback=None):
        bulk_docs = []

        if callback:
            if not callable(callback):
                raise TypeError('Argument 2 to commit() must be callable')

        # allow a search to work if there are no _items
        if len(self._items) == 0:
            items = self.all()
        else:
            items = self._items

        for i in items:
            if callback:
                i = callback(i)

            i = self.execute_callbacks('on_bulk_commit', i)

            this_dict = {}
            this_id = ''
            this_idx = self.idx
            this_type = self.base_obj.__type__
            if isinstance(i, VWBase):
                this_dict = i._create_source_document()
                this_type = i.__type__
                this_id = i.id
                try:
                    this_idx = i.__index__
                except AttributeError:
                    pass

            elif isinstance(i,dict):
                this_dict = i
                this_id = i.get('id')

            else:
                raise TypeError('Elements passed to the collection must be of type "dict" or "VWBase"')

            if not this_id:
                this_id = str(uuid4())

            bulk_docs.append({'_op_type': 'index', '_type': this_type, '_index': this_idx, '_id': this_id, '_source': this_dict})

        return helpers.bulk(self._es,bulk_docs,chunk_size=self.bulk_chunk_size)
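
# A hedged usage sketch for VWCollection above (not part of the original).
# The model class, field names and values are illustrative assumptions; a
# concrete collection would bind a VWBase subclass via __model__ or base_obj.
#
# class ArticleCollection(VWCollection):
#     __model__ = Article  # assumed VWBase subclass with __type__/__index__
#
# articles = (ArticleCollection()
#             .filter_by(author='kimchy', published=True, condition='and')
#             .sort(created_at='desc')
#             .limit(25)
#             .all())
# first = ArticleCollection().exact('slug', 'hello-world').one()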
Exemplo n.º 41
0
class ElasticSearchDB(object):

  """
  .. class:: ElasticSearchDB

  :param str url: the url to the database for example: el.cern.ch:9200
  :param str gDebugFile: is used to save the debug information to a file
  :param int timeout: the default time out to Elasticsearch
  :param int RESULT_SIZE: The number of data points which will be returned by the query.
  """
  __chunk_size = 1000
  __url = ""
  __timeout = 120
  clusterName = ''
  RESULT_SIZE = 10000

  ########################################################################
  def __init__(self, host, port, user=None, password=None, indexPrefix='', useSSL=True):
    """ c'tor
    :param self: self reference
    :param str host: name of the host where Elasticsearch is running, for example: el.cern.ch
    :param int port: port on which Elasticsearch is listening, for example: 9200
    :param str user: user name to access the db
    :param str password: if the db is password protected we need to provide a password
    :param str indexPrefix: it is the indexPrefix used to get all indexes
    :param bool useSSL: We can disable using secure connection. By default we use secure connection.
    """

    self.__indexPrefix = indexPrefix
    self._connected = False
    if user and password:
      gLogger.debug("Specified username and password")
      self.__url = "https://%s:%s@%s:%d" % (user, password, host, port)
    else:
      gLogger.debug("Username and password not specified")
      self.__url = "http://%s:%d" % (host, port)

    gLogger.verbose("Connecting to %s:%s, useSSL = %s" % (host, port, useSSL))

    if useSSL:
      bd = BundleDeliveryClient()
      retVal = bd.getCAs()
      casFile = None
      if not retVal['OK']:
        gLogger.error("CAs file does not exists:", retVal['Message'])
        casFile = certifi.where()
      else:
        casFile = retVal['Value']

      self.__client = Elasticsearch(self.__url,
                                    timeout=self.__timeout,
                                    use_ssl=True,
                                    verify_certs=True,
                                    ca_certs=casFile)
    else:
      self.__client = Elasticsearch(self.__url, timeout=self.__timeout)

    self.__tryToConnect()

  def getIndexPrefix(self):
    """
    It returns the DIRAC setup.
    """
    return self.__indexPrefix

  ########################################################################
  def query(self, index, query):
    """ Executes a query and returns its result (uses ES DSL language).

    :param self: self reference
    :param basestring index: index name
    :param dict query: It is the query in ElasticSearch DSL language

    """
    try:
      esDSLQueryResult = self.__client.search(index=index, body=query)
      return S_OK(esDSLQueryResult)
    except RequestError as re:
      return S_ERROR(re)

  def _Search(self, indexname):
    """
    it returns the object which can be used for retrieving a certain value from the DB
    """
    return Search(using=self.__client, index=indexname)

  ########################################################################
  def _Q(self, name_or_query='match', **params):
    """
    It is a wrapper to ElasticDSL Query module used to create a query object.
    :param str name_or_query: the type of the query
    """
    return Q(name_or_query, **params)

  def _A(self, name_or_agg, aggsfilter=None, **params):
    """
    It is a wrapper to ElasticDSL aggregation module, used to create an aggregation
    """
    return A(name_or_agg, aggsfilter, **params)
  ########################################################################

  def __tryToConnect(self):
    """Before we use the database we try to connect and retrieve the cluster name

    :param self: self reference

    """
    try:
      if self.__client.ping():
        # Returns True if the cluster is running, False otherwise
        result = self.__client.info()
        self.clusterName = result.get("cluster_name", " ")  # pylint: disable=no-member
        gLogger.info("Database info", result)
        self._connected = True
      else:
        self._connected = False
        gLogger.error("Cannot ping ElasticsearchDB!")
    except ConnectionError as e:
      gLogger.error(repr(e))
      self._connected = False

  ########################################################################
  def getIndexes(self):
    """
    It returns the available indexes...
    """

    # we only return indexes which belong to a specific prefix, for example 'lhcb-production' or 'dirac-production', etc.
    return [index for index in self.__client.indices.get_alias("%s*" % self.__indexPrefix)]

  ########################################################################
  def getDocTypes(self, indexName):
    """
    :param str indexName: the name of the index...
    :return: S_OK or S_ERROR
    """
    result = []
    try:
      gLogger.debug("Getting mappings for ", indexName)
      result = self.__client.indices.get_mapping(indexName)
    except Exception as e:  # pylint: disable=broad-except
      gLogger.error(e)
    doctype = ''
    for indexConfig in result:
      if not result[indexConfig].get('mappings'):
        # there is a case when the mapping exists and the value is None...
        # this is usually an empty index or a corrupted index.
        gLogger.warn("Index %s does not have a mapping!" % indexConfig)
        continue
      if result[indexConfig].get('mappings'):
        doctype = result[indexConfig]['mappings']
        break  # we suppose the mappings of all indexes are the same...

    if not doctype:
      return S_ERROR("%s does not exists!" % indexName)

    return S_OK(doctype)

  ########################################################################
  def exists(self, indexName):
    """
    it checks the existence of an index
    :param str indexName: the name of the index
    """
    return self.__client.indices.exists(indexName)

  ########################################################################

  def createIndex(self, indexPrefix, mapping, period=None):
    """
    :param str indexPrefix: it is the index name.
    :param dict mapping: the configuration of the index.
    :param str period: We can specify, which kind of index will be created.
                       Currently only daily and monthly indexes are supported.

    """
    fullIndex = generateFullIndexName(indexPrefix, period)  # we have to create an index each day...
    if self.exists(fullIndex):
      return S_OK(fullIndex)

    try:
      gLogger.info("Create index: ", fullIndex + str(mapping))
      self.__client.indices.create(fullIndex, body={'mappings': mapping})
      return S_OK(fullIndex)
    except Exception as e:  # pylint: disable=broad-except
      gLogger.error("Can not create the index:", e)
      return S_ERROR("Can not create the index")

  def deleteIndex(self, indexName):
    """
    :param str indexName: the name of the index to be deleted...
    """
    try:
      retVal = self.__client.indices.delete(indexName)
    except NotFoundError as e:
      return S_ERROR(DErrno.EELNOFOUND, e)
    except ValueError as e:
      return S_ERROR(DErrno.EVALUE, e)

    if retVal.get('acknowledged'):
      # if the value exists and the value is not None
      return S_OK(indexName)

    return S_ERROR(retVal)

  def index(self, indexName, doc_type, body):
    """
    :param str indexName: the name of the index to be used...
    :param str doc_type: the type of the document
    :param dict body: the data which will be indexed
    :return: the index name in case of success.
    """
    try:
      res = self.__client.index(index=indexName,
                                doc_type=doc_type,
                                body=body)
    except TransportError as e:
      return S_ERROR(e)

    if res.get('created') or res.get('result') == 'created':
      # the created index exists but the value can be None.
      return S_OK(indexName)

    return S_ERROR(res)

  def bulk_index(self, indexprefix, doc_type, data, mapping=None, period=None):
    """
    :param str indexprefix: index name.
    :param str doc_type: the type of the document
    :param list data: contains a list of dictionaries
    :param dict mapping: the mapping used by elasticsearch
    :param str period: We can specify which kind of indices will be created.
                       Currently only daily and monthly indexes are supported.
    """
    gLogger.info("%d records will be insert to %s" % (len(data), doc_type))
    if mapping is None:
      mapping = {}

    indexName = generateFullIndexName(indexprefix, period)
    gLogger.debug("inserting datat to %s index" % indexName)
    if not self.exists(indexName):
      retVal = self.createIndex(indexprefix, mapping, period)
      if not retVal['OK']:
        return retVal
    docs = []
    for row in data:
      body = {
          '_index': indexName,
          '_type': doc_type,
          '_source': {}
      }
      body['_source'] = row

      if 'timestamp' not in row:
        gLogger.warn("timestamp is not given! Note: the actual time is used!")

      # if the timestamp is not provided, we use the current utc time.
      timestamp = row.get('timestamp', int(Time.toEpoch()))
      try:
        if isinstance(timestamp, datetime):
          body['_source']['timestamp'] = int(timestamp.strftime('%s')) * 1000
        elif isinstance(timestamp, basestring):
          timeobj = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S.%f')
          body['_source']['timestamp'] = int(timeobj.strftime('%s')) * 1000
        else:  # we assume the timestamp is a Unix epoch time (integer).
          body['_source']['timestamp'] = timestamp * 1000
      except (TypeError, ValueError) as e:
        # in case we are not able to convert the timestamp to epoch time....
        gLogger.error("Wrong timestamp", e)
        body['_source']['timestamp'] = int(Time.toEpoch()) * 1000
      docs += [body]
    try:
      res = bulk(self.__client, docs, chunk_size=self.__chunk_size)
    except BulkIndexError as e:
      return S_ERROR(e)

    if res[0] == len(docs):
      # we have inserted all documents...
      return S_OK(len(docs))
    else:
      return S_ERROR(res)

  def getUniqueValue(self, indexName, key, orderBy=False):
    """
    :param str indexName: the name of the index which will be used for the query
    :param dict orderBy: a dictionary in case we want to order the result, e.g. {key: 'desc'} or {key: 'asc'}
    It returns a list of unique values for a certain key from the dictionary.
    """

    query = self._Search(indexName)

    endDate = datetime.utcnow()

    startDate = endDate - timedelta(days=30)

    timeFilter = self._Q('range',
                         timestamp={'lte': int(Time.toEpoch(endDate)) * 1000,
                                    'gte': int(Time.toEpoch(startDate)) * 1000, })
    query = query.filter('bool', must=timeFilter)
    if orderBy:
      query.aggs.bucket(key,
                        'terms',
                        field=key,
                        size=self.RESULT_SIZE,
                        order=orderBy).metric(key,
                                              'cardinality',
                                              field=key)
    else:
      query.aggs.bucket(key,
                        'terms',
                        field=key,
                        size=self.RESULT_SIZE).metric(key,
                                                      'cardinality',
                                                      field=key)

    try:
      query = query.extra(size=self.RESULT_SIZE)  # do not need the raw data.
      gLogger.debug("Query", query.to_dict())
      result = query.execute()
    except TransportError as e:
      return S_ERROR(e)

    values = []
    for bucket in result.aggregations[key].buckets:
      values += [bucket['key']]
    del query
    gLogger.debug("Nb of unique rows retrieved", len(values))
    return S_OK(values)

  def pingDB(self):
    """
    Try to connect to the database
    :return: S_OK(TRUE/FALSE)
    """
    connected = False
    try:
      connected = self.__client.ping()
    except ConnectionError as e:
      gLogger.error("Cannot connect to the db", repr(e))
    return S_OK(connected)

  def deleteByQuery(self, indexName, query):
    """
    Delete data by query

    :param str indexName: the name of the index
    :param str query: the query that we want to issue the delete on
    """
    try:
      self.__client.delete_by_query(index=indexName, body=query)
    except Exception as inst:
      gLogger.error("ERROR: Couldn't delete data")
      return S_ERROR(inst)
    return S_OK('Successfully deleted data from index %s' % indexName)
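
# A hedged usage sketch for ElasticSearchDB above (not part of the original);
# host, port, index prefix, document body and field names are illustrative
# assumptions.
# db = ElasticSearchDB('localhost', 9200, indexPrefix='lhcb-production',
#                      useSSL=False)
# res = db.index('lhcb-production_jobs_index', 'JobRecord',
#                {'timestamp': 1520000000, 'Status': 'Running', 'Jobs': 12})
# if res['OK']:
#     print(db.getUniqueValue('lhcb-production_jobs_index', 'Status'))
# db.deleteByQuery('lhcb-production_jobs_index',
#                  {'query': {'term': {'Status': 'Failed'}}})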
Exemplo n.º 42
        if 'redirected' not in x:
            temp+=x+' '
    text.append(temp)
descriptions=[]
for item in data.description:
    temp=''
    for x in item:
        temp+=x+' '
    descriptions.append(temp)
#text=data.text
#text=map(str,text)
links=data.link
links=map(str,links)

es.delete_by_query(index='web_content',
                 doc_type='SBA',
                 body={'query':{'match_all':{}}})

standard_descriptions=[]
standard_links=[]
standard_text=[]

# STANDARDIZE LINKS AND DESCRIPTIONS
m=0
for m in range(len(text)):
    y = text[m].replace('/',' ').replace('-',' ').replace('&','').replace('.','').replace(',','').replace('(','').replace(')','').lower()  # split / and - into separate words
    y=re.sub(r':',' ',y)
    y=re.sub(r'businesses|business|small|sba','',y)
    z = descriptions[m].replace('/',' ').replace('-',' ').replace('&','').replace('.','').replace(',','').replace('(','').replace(')','').lower()  # split / and - into separate words
    z=z.replace('|','').replace('the us small business administration','').replace('sbagov','')
    z=re.sub(r':',' ',z)
Exemplo n.º 43
0
class RecastElasticSearch(object):
    """ High level elasticsearch API for Recast. """
    
    def __init__(self, config):

        self.config = config
        self.es = Elasticsearch([{'host': self.config.host(),
                                  'port': self.config.port(),
                                  'use_ssl': self.config.use_ssl(),
                                  'http_auth': self.config.auth(),
                                  'verify_certs': self.config.verify_certs(),
                                  'ca_certs':certifi.where()
                                  }])

    def create(self):
        """ creates index. """
        # ignore index already exists error
        self.es.indices.create(index=self.config.index(), ignore=400)

    def delete(self):
        """ (careful) deletes index and everything contained. """
        self.es.indices.delete(index=self.config.index(), ignore=[400, 404])

    def delete_requests(self):
        """ deletes all requests. """
        self.es.delete_by_query(index=self.config.index(),
                                doc_type=self.config.request_doc_type(),
                                body={"query": {"match_all": {}}})

    def delete_analyses(self):
        """ deletes all analyses. """
        self.es.delete_by_query(index=self.config.index(),
                                doc_type=self.config.analysis_doc_type(),
                                body={"query": {"match_all": {}}})

    def delete_record(self, doc_type, query):
        """ deletes by query. """
        query = self.query_builder(query)
        print query
        self.es.delete_by_query(index=self.config.index(),
                                doc_type=doc_type,
                                body=query)
        
    def query_builder(self, query):
        """ builds query body for search. """
        body = {
            'query': {
                'filtered': {
                    'query': {
                        'query_string': {
                            'query': query
                            }
                        }
                    }
                }
            }
        return body

    def pretty_response(self, response):
        """ Cleans response, removes unrelevant info. """
        if response.has_key('hits'):
            if response['hits'].has_key('hits'):
                    return response['hits']['hits']

        raise Exception('Unconventional response!')

    def isEmpty(self, doc_type):
        """ checks if elastic search is empty.
        
        :returns True if empty otherwise false.
        """
        if not doc_type:
            raise Exception('No doc_type provided!')
        if doc_type == self.config.analysis_doc_type():
            return not bool(self.all_analyses())
        if doc_type == self.config.request_doc_type():
            return not bool(self.all_requests())

    def all_analyses(self):
        """ returns all analyses. """
        response = self.es.search(index=self.config.index(),
                                  doc_type=self.config.analysis_doc_type(),
                                  body={"query": {"match_all": {}}})
        return self.pretty_response(response)

    def all_requests(self):
        """ returns all requests. """
        response = self.es.search(index=self.config.index(),
                                  doc_type=self.config.request_doc_type(),
                                  body={"query": {"match_all": {}}})
        return self.pretty_response(response)

    def advanced_search(self, doc_type, body):
        """ Perform advanced search, user provides search body. """
        response = self.es.search(index=self.config.index(),
                                  doc_type=doc_type,
                                  body=body)
        return self.pretty_response(response)
                                                 
    def simple_search(self, query, doc_type):
        """ simple search. 
        
        :returns list containing 
        """
        if not query:
            raise Exception('Empty query!')
        if not doc_type:
            raise Exception('No doc type provided!')
        
        body = self.query_builder(query=query)
        response = self.es.search(index=self.config.index(),
                                  doc_type=doc_type,
                                  body=body)
        return self.pretty_response(response)
                                      
    def search_analysis(self, query):
        """ Search for analysis.

        :param query: search keywords or string
        :Returns JSON object containing ranked results.
        """
        body = self.query_builder(query=query)
        print self.config.index()
        print self.config.analysis_doc_type()
        print body
        response = self.es.search(index=self.config.index(),
                                  doc_type=self.config.analysis_doc_type(),
                                  body = body)

        return self.pretty_response(response)

    def search_request(self, query):
        """ Search for requests.
        
        :param query: search keywords or string
        :Returns JSON object containing ranked results.
        """
        body = self.query_builder(query=query)
        response = self.es.search(index=self.config.index(),
                                  doc_type=self.config.request_doc_type(),
                                  body=body)
        return self.pretty_response(response)

    def clean_api_data(self, data):
        """ function to remove unneccessary field given from response of API. """
        #data = json.loads(data)
        if type(data) is list:
            for d in data:
                try:
                    del d['_updated']
                    del d['_created']
                    del d['_links']
                    del d['_id']
                except Exception, e:
                    pass
            return data

        try:
            del data['_updated']
            del data['_created']
            del data['_links']
            del data['_id']
        except Exception, e:
            pass
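
# A hedged usage sketch for RecastElasticSearch above (not part of the
# original); the config object is an illustrative assumption and only needs
# to expose the accessors used in __init__ (host(), port(), index(), ...).
# search = RecastElasticSearch(config)
# search.create()
# analyses = search.search_analysis('dark matter')
# requests = search.all_requests()
# search.delete_requests()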
Exemplo n.º 44
0
class SearchEngine(object):

    def __init__(self):
        self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS, **settings.ELASTICSEARCH_CONNECTION_OPTIONS)
        self.logger = logging.getLogger(__name__)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index, doc_type, and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        body = kwargs.get('body', None)
        if body != None:
            try:
                return self.es.delete_by_query(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to delete document by query: %s \nException detail: %s\n' % (datetime.now(), body, detail))
                raise detail   
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to delete document: %s \nException detail: %s\n' % (datetime.now(), body, detail))
                raise detail   

    def delete_index(self, **kwargs):
        """
        Deletes an entire index

        """

        index = kwargs.get('index', '').strip()
        print 'deleting index : %s' % index
        return self.es.indices.delete(index=index, ignore=[400, 404])

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index, doc_type, and id to get a specific document
        Pass a body with a query dsl to perform a search

        """

        body = kwargs.get('body', None)
        index = kwargs.get('index', None)
        id = kwargs.get('id', None)

        if index is None:
            raise NotImplementedError("You must specify an 'index' in your call to search")

        if id:
            if isinstance(id, list):
                kwargs.setdefault('body', {'ids': kwargs.pop('id')})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)
        
        ret = None
        try: 
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning('%s: WARNING: search failed for query: %s \nException detail: %s\n' % (datetime.now(), body, detail))
            pass   

        return ret

    def index_term(self, term, id, context='', ewstatus='', options={}):
        """
        If the term is already indexed, then simply increment the count and add the id of the term to the existing index.
        If the term isn't indexed then add the index.

        id: a unique id associated with the term
        context: a uuid of a concept to associate with the term to render in the ui
        options: any additional information to associate with the term

        """

        if term.strip(' \t\n\r') != '':
            already_indexed = False
            count = 1
            ids = [id]
            
            try:
                #_id = unicode(term, errors='ignore').decode('utf-8').encode('ascii')
                _id = uuid.uuid3(uuid.NAMESPACE_DNS, '%s%s' % (hash(term), hash(context)))
                result = self.es.get(index='term', doc_type='value', id=_id, ignore=404)

                #print 'result: %s' % result
                if result['found'] == True:
                    ids = result['_source']['ids']
                    if id not in ids:
                        ids.append(id)
                else:
                    ids = [id]
                # ewstatus is indexed only if it's not a dict
                if (type(ewstatus) is dict):
                    self.index_data('term', 'value', {'term': term, 'context': context, 'options': options, 'count': len(ids), 'ids': ids}, id=_id)
                else:
                    self.index_data('term', 'value', {'term': term, 'context': context, 'ewstatus': ewstatus, 'options': options, 'count': len(ids), 'ids': ids}, id=_id)
                
            except Exception as detail:
                self.logger.warning('%s: WARNING: search failed to index term: %s \nException detail: %s\n' % (datetime.now(), term, detail))
                raise detail   
                  
    def delete_terms(self, ids):
        """
        If the term is referenced more than once, simply decrement the
        count and remove the id of the deleted term from the existing index.

        If the term is only referenced once, then delete the index.

        """

        if not isinstance(ids, list):
            ids = [ids]

        for id in ids:
            result = self.es.search(index='term', doc_type='value', body={
                "query": {
                    "filtered": {
                        "filter":{
                            "terms": {
                                "ids": [id]
                            }
                        }, 
                        "query": {
                            "match_all": {}
                        }
                    }
                }, 
                "from": 0, 
                "size": 10
            }, ignore=404)

            if 'hits' in result:
                for document in result['hits']['hits']:
                    document['_source']['ids'].remove(id)
                    count = len(document['_source']['ids'])
                    if count > 0:
                        document['_source']['count'] = count
                        self.index_data('term', 'value', document['_source'], id=document['_id'])
                        self.es.indices.refresh(index='term')
                    else:
                        self.delete(index='term', doc_type='value', id=document['_id'])

    def create_mapping(self, index, doc_type, fieldname='', fieldtype='string', fieldindex='analyzed', body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name

        """

        if not body:
            if fieldtype == 'geo_shape':
                body =  { 
                    doc_type : {
                        'properties' : {
                            fieldname : { 'type' : 'geo_shape', 'tree' : 'geohash', 'precision': '1m' }
                        }
                    }
                } 
            else:           
                body =  { 
                    doc_type : {
                        'properties' : {
                            fieldname : { 'type' : fieldtype, 'index' : fieldindex }
                        }
                    }
                }

        self.create_index(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type=doc_type, body=body)

    def create_index(self, **kwargs):
        self.es.indices.create(**kwargs)

    def index_data(self, index=None, doc_type=None, body=None, idfield=None, id=None, **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the 
            document itself and use the value found for the id of the document

        """

        if not isinstance(body, list):
            body = [body]

        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document,idfield)

            try:
                self.es.index(index=index, doc_type=doc_type, body=document, id=id, **kwargs)
            except Exception as detail:
                self.logger.warning('%s: WARNING: failed to index document: %s \nException detail: %s\n' % (datetime.now(), document, detail))
                raise detail


    def bulk_index(self, data):
        return helpers.bulk(self.es, data, chunk_size=500, raise_on_error=True)

    def create_bulk_item(self, index, type, id, data):
        if not(self.isempty_or_none(index) or self.isempty_or_none(type) or self.isempty_or_none(id)):
            return [
                { "index" : { "_index" : index, "_type" : type, "_id" : id } },
                data
            ]
        else:
            return False
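
# A hedged usage sketch for SearchEngine above (not part of the original);
# index, doc_type, field and id values are illustrative assumptions.
# se = SearchEngine()
# se.create_mapping('term', 'value', fieldname='term', fieldtype='string',
#                   fieldindex='not_analyzed')
# se.index_term('Bronze Age', id='resource-1', context='period-scheme')
# hits = se.search(index='term', doc_type='value',
#                  body={'query': {'match': {'term': 'bronze'}}})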
Exemplo n.º 45
0
class ElasticSearchUtil:
    def __init__(self, host):
        self.host = host
        self.conn = Elasticsearch([self.host])

    def __del__(self):
        self.close()

    def check(self):
        '''
        Return information about the current ES cluster
        :return:
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        Insert a single document `body` into the given index and type; an id can be
        specified, otherwise ES will generate one automatically
        :param index: index to insert into
        :param type: type to insert into
        :param body: data to insert -> dict
        :param id: custom id value
        :return:
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        Bulk insert interface;
        the data list structure required by the bulk API is: [{{optionType}: {Condition}}, {data}]
        where optionType can be index, delete or update,
        Condition can set the index and type for each record,
        and data is the single record to insert/update
        :param index: default index to insert into
        :param type: default type to insert into
        :param dataFrame: data set to insert
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        insertHeadInfoList = [{"index": {}} for i in range(len(dataList))]
        temp = [dict] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        try:
            return self.conn.bulk(index=index, doc_type=type, body=temp)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        Delete the document matching the given index, type and id
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        Delete all documents in the index that match the query
        :param index:
        :param query: must follow the DSL syntax
        :param type:
        :return:
        '''
        return self.conn.delete_by_query(index=index, body=query, doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        Delete all documents in the given index
        :param index:
        :return:
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index, body=query, doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Search for all documents in the index that match the criteria
        :param index:
        :param type:
        :param body: filter statement, following the DSL syntax
        :return:
        '''
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        Get the document matching the given index, type and id
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        Update the document matching the given index, type and id
        :param index:
        :param type:
        :param id:
        :param body: values to update
        :return:
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)


    def close(self):
        if self.conn is not None:
            try:
                self.conn.close()
            except Exception as e:
                pass
            finally:
                self.conn = None
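
# A hedged usage sketch for ElasticSearchUtil above (not part of the
# original); the host, index/type names and the pandas DataFrame contents are
# illustrative assumptions.
# import pandas as pd
# util = ElasticSearchUtil('localhost:9200')
# frame = pd.DataFrame([{'name': 'alpha', 'value': 1},
#                       {'name': 'beta', 'value': 2}])
# util.insertDataFrame('demo_index', 'demo_type', frame)
# print(util.searchDoc('demo_index', 'demo_type',
#                      {'query': {'match_all': {}}}))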
Exemplo n.º 46
0
#!/usr/bin/env python

from elasticsearch import Elasticsearch
import json

INPUT = "../data/sentences.json"
ES_HOST = {"host" : "localhost", "port" : 9200}
INDEX_NAME = 'dd'
TYPE_NAME = 'docs'
N = 1000

es = Elasticsearch(hosts = [ES_HOST])

es.delete_by_query(index = INDEX_NAME, body = {
      "query": {
        "match_all": {}
      }
  })

with open(INPUT, 'r') as f:
    bulk_data = []

    for line in f:
        src = json.loads(line)
        id = src['doc_id'] + '__' + src['sent_id']
        content = ' '.join(src['words'])
        op_dict = {
            "index": {
                "_index": INDEX_NAME,
                "_type": TYPE_NAME,
        	"_id": id 
Exemplo n.º 47
0
class ElasticSearchManager(object):
    """
    This class handle loading of definition from a MongoDB Server
    """
    def __init__(self, index='element', logger=None):
        self.client = Elasticsearch()
        self.index = index
        self.logger = logger

        mappings = dict()
        # need to iterate over node types available to alter the definition
        mappings["node"] = {
            "_source": {"enabled": True},
            "properties": {
                "uuid":   {"type": "string", "index": "not_analyzed"},
                "parent": {"type": "string", "index": "not_analyzed"},
                "type":   {"type": "string", "index": "not_analyzed"},
                "tags":   {"type": "string", "index": "not_analyzed",  "index_name": "tag"}
            }
        }

        self.client.indices.create(index=self.index, ignore=400, body={
            "mappings": mappings
        })

    def get_id(self, id):
        return id

    def retrieve(self, uuid):
        res = self.client.search(index=self.index, body={
            "query": {
                "term": {
                    "uuid": uuid
                }
            },
            "size": 10
        })

        # print "UUID", uuid, res
        results = self.normalize(res)

        if len(results) > 0:
            return results[0]

        return None

    def exists(self, uuid):
        return self.client.count(index=self.index, body={
            "query": {
                "term": {
                    "uuid": uuid
                }
            },
            "size": 1
        })['count'] > 0

    def delete(self, uuid):
        result = self.client.delete_by_query(index=self.index, body={
            "query": {
                "term": {
                    "uuid": uuid
                }
            },
            "size": 1
        })

        return True

    def resolve_parents(self, data):
        if 'parent' not in data:
            data['parent'] = None

    def fix_children(self, data):
        children = self.find(**{
            'parent': data['uuid']
        })

        for child in children:
            path = "%s/%s" % (data['path'], child['slug'])

            if child['path'] == path:
                continue

            child['path'] = path

            self.client.update(index=self.index, id=child['id'], refresh=True, doc_type="node", body={
                'doc': {'path': path}
            })

            self.fix_children(child)

    def fix_paths(self, data):
        path = False

        if not data['parent']:  # no parent
            path = ""

        if 'slug' not in data:  # no slug
            raise InvalidDataFormat("No slug property defined for the data")

        if data['parent']:
            parent = self.retrieve(data['parent'])

            if not parent:
                raise InvalidTreeState("The parent %s defined in %s does not exist" % (data['uuid'], data['parent']))

            if 'path' not in parent:
                raise InvalidTreeState("The parent %s does not contains a `path`" % (parent['uuid']))

            path = parent['path']

        if path == False:
            raise InvalidTreeState("Unable to resolve the path for %s" % (data))

        data['path'] = "%s/%s" % (path, data['slug'])


    def save(self, uuid, data):
        """
        Save data and resolve the path for the children
        """
        if 'slug' not in data:
            raise InvalidDataFormat("The data must contain a `slug` key: %s" % (data))

        if not uuid:
            uuid = generate_uuid()

        data['uuid'] = uuid

        kwargs = {
            'index': self.index,
            'doc_type': 'node',
            'body': data,
            # refresh=True, this is not ideal but required to have real time data inside the index
            'refresh': True
        }

        if 'id' in data:
            kwargs['id'] = data['id']
            del(data['id'])

        self.resolve_parents(data)
        self.fix_paths(data)


        res = self.client.index(**kwargs)

        data['id'] = res['_id']

        self.fix_children(data)
        # self.normalize([data])

        return data

    def find(self, **kwargs):
        """
        Of course this is not optimized at all

            supported options:
                - path: the path to look up
                - type: the node type
                - types: retrieve types defined
                - tags: retrieve node matching tags
                - category: retrieve node matching the category

        """
        find_kwargs = {
            'index': self.index,
            'body': {
                'size': 25,
                'from': 0,
                'query': {
                    'filtered': {
                        'query':  {},
                        'filter': {}
                    }
                }
            }
        }

        query = {
            "bool": {
                "must": []
            }
        }

        lookup_types = []
        if 'types' in kwargs:
            lookup_types += kwargs['types']

        if 'type' in kwargs:
            lookup_types += [kwargs['type']]

        for type in lookup_types:
            query['bool']['must'].append({'match': {'type': type}})

        if 'tags' in kwargs and kwargs['tags'] and len(kwargs['tags']) > 0:
            for tag in kwargs['tags']:
                query['bool']['must'].append({'match': {'tag': tag}})

        if 'category' in kwargs and kwargs['category'] != None:
            query['bool']['must'].append({'match': {'category': kwargs['category']}})

        if 'parent' in kwargs and kwargs['parent'] != None:
            query['bool']['must'].append({'match': {'parent': kwargs['parent']}})

        if 'limit' in kwargs:
            find_kwargs['body']['size'] = int(kwargs['limit'])

        if 'offset' in kwargs:
            find_kwargs['body']['from'] = int(kwargs['offset']) * find_kwargs['body']['size']

        # if 'alias' in kwargs and kwargs['alias']:
        #     find_kwargs['spec']['path'] = kwargs['alias']
        #
        # if 'path' in kwargs and kwargs['path']:
        #     find_kwargs['spec']['path'] = {'$regex': "^" + kwargs['path']}
        #
        # if self.logger:
        #     self.logger.info("element.manager.mongo: find:%s" % (find_kwargs))
        #

        #
        # if 'order_by' in kwargs:
        #     query.sort(kwargs['order_by'])
        # else:
        #     query.sort([('created_at', pymongo.DESCENDING)])

        if len(query['bool']['must']) == 0:
            # we need to match all documents, otherwise we get an exception
            find_kwargs['body']['query']['filtered']['query'] = {"bool": {"should": [{"match_all": {}}]}}
        else:
            find_kwargs['body']['query']['filtered']['query'] = query

        res = self.client.search(**find_kwargs)

        return self.normalize(res)

    def find_one(self, **kwargs):
        return self.find(**kwargs)[0]

    def normalize(self, cursor):
        """
        Far from being perfect
        """
        nodes = []
        for data in cursor['hits']['hits']:
            data['_source'].update({'id': data['_id']})

            nodes.append(data['_source'])

        return nodes
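
# Usage sketch for the manager above. The class name and constructor are not
# shown in this excerpt, so `NodeManager()` is an assumption:
#   manager = NodeManager()
#   page = manager.save(None, {'slug': 'about', 'type': 'page'})
#   children = manager.find(parent=page['uuid'], limit=10, offset=0)
#   manager.delete(page['uuid'])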
Exemplo n.º 48
0
time_start = time.time()

es = Elasticsearch(hosts=[{'host': '192.168.69.41', 'port': 9200}])

TYPE_NEW_USER_ACCUMULATED = "new_user_accumulated"
INDEX_RESULT_TEMP = "result_temp"
INTERVAL = "day"

#Remove temp result
query_body = {
    "query": {
      "match_all": {}
    }
}

res = es.delete_by_query(index=INDEX_RESULT_TEMP, doc_type=TYPE_NEW_USER_ACCUMULATED, body=query_body)

#Query from raw data
body = {
    "aggs" : {
        "user_over_time" : {
            "date_histogram" : {
                "field" : "created_at",
                "interval" : INTERVAL,
                "min_doc_count" : 0
            }
        }
    }
}
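
# The excerpt stops before the aggregation above is executed. A minimal sketch
# of how it might be run and accumulated into the temp index; the source index
# name "raw_data" is an assumption, not part of the original snippet.
res = es.search(index="raw_data", body=body)
total = 0
for bucket in res['aggregations']['user_over_time']['buckets']:
    total += bucket['doc_count']
    es.index(index=INDEX_RESULT_TEMP, doc_type=TYPE_NEW_USER_ACCUMULATED,
             body={'date': bucket['key_as_string'], 'accumulated': total})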

Exemplo n.º 49
0
class ThreadingTests(TestCase):
    def setUp(self):
        self.es = Elasticsearch(ES_NODES)
        print GAME_QUEUE, Tasks.redis.llen(GAME_QUEUE)
        print USER_QUEUE, Tasks.redis.llen(USER_QUEUE)
        print GAME_SET, Tasks.redis.scard(GAME_SET)
        print USER_SET, Tasks.redis.scard(USER_SET)
        print TO_CRUNCHER, Tasks.redis.llen(TO_CRUNCHER)
        Tasks.new_games = 0
        print "Deleting the above-listed Redis keys."
        for key in GAME_QUEUE, USER_QUEUE, GAME_SET, USER_SET, TO_CRUNCHER:
            Tasks.redis.delete(key)
        self.es.delete_by_query(index=TEST_ES_INDEX, doc_type=GAME_DOCTYPE, body={"query": {"match_all": {}}})
        print "Be patient (10s) - making sure API is available"
        sleep(10)
        print "Ready!"

    def test_games_make_it_to_elasticsearch_in_reasonable_time(self):
        Tasks.add(TEST_GAMES, [])
        wt = WatcherThread(TEST_KEY, cycles=1)
        wt.start()
        REASONABLE_TIME = 20  # seconds
        with timeout(REASONABLE_TIME):
            while True:
                try:
                    # TODO - assert that all items made it to ES
                    docs = self.es.mget(index=TEST_ES_INDEX, doc_type=GAME_DOCTYPE, body={'ids': TEST_GAMES})['docs']
                    assert all([d['found'] for d in docs])
                    break
                except:
                    pass
                sleep(0.1)
        wt.join()

        # 1. check that the game queue is now empty
        ONE_SHITLOAD = 10000
        self.assertGreater(ONE_SHITLOAD, Tasks.redis.llen(GAME_QUEUE))
        newly_queued_games = Tasks.redis._bulk_rpop(GAME_QUEUE, ONE_SHITLOAD)
        self.assertEquals(len(set(newly_queued_games)), 0)

        # 2. check that processed games made it to the GAME_SET
        self.assertEquals(Tasks.redis.scard(GAME_SET), len(set(TEST_GAMES)))
        items, is_old = zip(*Tasks.redis._intersect(GAME_SET, TEST_GAMES, insert=False))
        self.assertTrue(all(is_old))

    def test_games_and_users_properly_queued(self):
        # Init with 10 games and 5 users
        Tasks.add(TEST_GAMES, TEST_USERS)
        wt = WatcherThread(TEST_KEY, cycles=1)
        wt.run()

        # 1. check that none of the test games are now currently queued
        ONE_SHITLOAD = 10000
        newly_queued_games = Tasks.redis._bulk_rpop(GAME_QUEUE, ONE_SHITLOAD)
        self.assertEquals(len(set(newly_queued_games) & set(TEST_GAMES)), 0)

        # 2. check that seeded TEST_GAMEs are still in GAME_SET after the second iteration
        items, is_old = zip(*Tasks.redis._intersect(GAME_SET, TEST_GAMES, insert=False))
        self.assertTrue(all(is_old))

        # 3. check that some new users got added
        self.assertNotEqual(Tasks.redis.scard(USER_SET), 0)

        # 4. check that some new games got added
        self.assertNotEqual(Tasks.redis.scard(GAME_SET), 0)

        # 5. check that game counts are accurate
        self.assertEquals(Tasks.new_games, len(TEST_GAMES) + len(newly_queued_games))

    def test_multi_thread(self):
        Tasks.add(TEST_MANY_GAMES, TEST_USERS)
        wt1 = WatcherThread(TEST_KEY, cycles=1)
        wt2 = WatcherThread(TEST_KEY2, cycles=1)

        wt1.start()
        wt2.start()

        wt1.join()
        wt2.join()

        # 1. check that the game counts are accurate
        self.assertEquals(Tasks.new_games, len(TEST_MANY_GAMES) + Tasks.redis.llen(GAME_QUEUE))
Exemplo n.º 50
0
class TestESNetLogs(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        TestESNetLogs.oid1 = OID("1.1.1.1.1", "test_oid1", "public", "127.0.0.1", 55, "test target 1", True)
        TestESNetLogs.oid2 = OID("1.1.1.1.1.0", "test_oid2", "private", "127.0.0.1", 255, "test target 2", True)
        TestESNetLogs.oid3 = OID("1.1.1.2.1", "test_oid3", "public", "127.0.0.1", 155, "test target 3", False)
        TestESNetLogs.oid4 = OID("1.1.1.2.1.0", "test_oid4", "private", "127.0.0.1", 233, "test target 4", False)
        config.SNMP_OIDS.append(TestESNetLogs.oid1)
        config.SNMP_OIDS.append(TestESNetLogs.oid2)
        config.SNMP_OIDS.append(TestESNetLogs.oid3)
        config.SNMP_OIDS.append(TestESNetLogs.oid4)
        config.ES_INDEX = "pypro_tests"
        config.PRINT_CONSOLE = False

    def setUp(self):
        if os.path.exists(utils.log_dir):
            shutil.rmtree(utils.log_dir, ignore_errors=True)
        self.es = Elasticsearch()
        self.es.delete_by_query(config.ES_INDEX, body='{"query":{"match_all":{}}}', ignore=[404])

    def test_numeric_oid_es(self):
        log = ESNetLogger()
        log.start(0)
        log.value(1, TestESNetLogs.oid1, 1)
        log.value(2, TestESNetLogs.oid1, 5)
        log.value(3, TestESNetLogs.oid2, 7)
        log.value(4, TestESNetLogs.oid2, 9)
        log.value(5, TestESNetLogs.oid1, 11)
        log.value(6, TestESNetLogs.oid1, 12)
        log.close()

        self.assert_es('expected_numeric.es')

    def test_string_oid_es(self):
        log = ESNetLogger()
        log.start(0)
        log.value(1, TestESNetLogs.oid3, "omg its bob\n\ron the line")
        log.value(2, TestESNetLogs.oid4, "now for bunnies..")
        log.value(3, TestESNetLogs.oid3, "this is a test\nwith linefeed")
        log.value(4, TestESNetLogs.oid4, "once upon a time")
        log.value(5, TestESNetLogs.oid3, "hello")
        log.value(6, TestESNetLogs.oid3, 11)
        log.value(7, TestESNetLogs.oid3, "ok passed?")
        log.close()

        self.assert_es('expected_string.es')

    def test_mixed_oid_es(self):
        log = ESNetLogger()
        log.start(0)
        log.value(1, TestESNetLogs.oid3, "omg its bob\n\ron the line")
        log.value(2, TestESNetLogs.oid4, "now for bunnies..")
        log.value(3, TestESNetLogs.oid1, 1)
        log.value(7, TestESNetLogs.oid1, 5)
        log.value(8, TestESNetLogs.oid2, 7)
        log.value(33, TestESNetLogs.oid3, "this is a test\nwith linefeed")
        log.value(555, TestESNetLogs.oid4, "once upon a time")
        log.value(3331, TestESNetLogs.oid3, "hello")
        log.value(11111, TestESNetLogs.oid1, 12)
        log.value(42322, TestESNetLogs.oid3, 11)
        log.value(111111, TestESNetLogs.oid3, "ok passed?")
        log.value(112223, TestESNetLogs.oid2, 9)
        log.value(22, TestESNetLogs.oid1, 11)
        log.close()

        self.assert_es('expected_mixed.es')

    def test_error_es(self):
#        time.sleep(1)
        log = ESNetLogger()

        log.error(11111, "epic fail")
        log.error(11111, "fail")
        log.error(11112, "little fail")

        self.assert_es('expected_errors.es')

    def assert_es(self, expected_file_name):
        time.sleep(1)
#        query='{"query":{"match_all":{}}, "sort": { "time": { "order": "asc" }}}'
        data = self.es.search(index=config.ES_INDEX, scroll="10m",
                              body='{"query":{"match_all":{}}, "sort": { "time": { "order": "asc" }}}')
        scroll_id = data['_scroll_id']
        actual = self.read_es(data, "")
        items = data["hits"]["hits"]
        while len(items) > 0:
            data = self.es.scroll(scroll_id=scroll_id, scroll= "10m")
            items = data["hits"]["hits"]
            actual = self.read_es(data, actual)
#        print(actual)
        expected = pkg_resources.resource_string(__name__, expected_file_name).decode('utf8')
        t_assert.equal(actual, expected)

#this will produce only the first 10 items (or the size given... however that is done)
#        data = self.es.search(index=config.ES_INDEX, body='{"query":{"match_all":{}}, "sort": { "time": { "order": "asc" }}}')
#this is a scan, which loses the sort but is effective for very large data sets
#        response = self.es.search(index=config.ES_INDEX, search_type="scan", scroll="10m",
#                              body='{"query":{"match_all":{}}, "sort": { "time": { "order": "asc" }}}')
#        data = helpers.scan(client = self.es, query=query, scroll= "10m", index=config.ES_INDEX, timeout="1m")


    def read_es(self, data, actual):
        items = data["hits"]["hits"]
        for item in items:
            #sort the keys for deterministic comparison
            src = sorted(item["_source"].items(), key=itemgetter(0))
            #the linefeed at the end is not really needed but it makes for more readable error reports
            actual += str(src)+"\n"
        return actual
Exemplo n.º 51
0
class Docstore():
    hosts = None
    indexname = None
    facets = None
    es = None

    def __init__(self, hosts=config.DOCSTORE_HOST, index=config.DOCSTORE_INDEX, connection=None):
        self.hosts = hosts
        self.indexname = index
        if connection:
            self.es = connection
        else:
            self.es = Elasticsearch(hosts, timeout=config.DOCSTORE_TIMEOUT)
    
    def __repr__(self):
        return "<%s.%s %s:%s>" % (
            self.__module__, self.__class__.__name__, self.hosts, self.indexname
        )
    
    def print_configs(self):
        print('CONFIG_FILES:           %s' % config.CONFIG_FILES)
        print('')
        print('DOCSTORE_HOST:          %s' % config.DOCSTORE_HOST)
        print('DOCSTORE_INDEX:         %s' % config.DOCSTORE_INDEX)
        print('')
    
    def health(self):
        return self.es.cluster.health()
    
    def index_exists(self, index):
        """
        """
        return self.es.indices.exists(index=index)
    
    def status(self):
        """Returns status information from the Elasticsearch cluster.
        
        >>> docstore.Docstore().status()
        {
            u'indices': {
                u'ddrpublic-dev': {
                    u'total': {
                        u'store': {
                            u'size_in_bytes': 4438191,
                            u'throttle_time_in_millis': 0
                        },
                        u'docs': {
                            u'max_doc': 2664,
                            u'num_docs': 2504,
                            u'deleted_docs': 160
                        },
                        ...
                    },
                    ...
                }
            },
            ...
        }
        """
        return self.es.indices.stats()
    
    def index_names(self):
        """Returns list of index names
        """
        return [name for name in self.status()['indices'].keys()]
     
    def aliases(self):
        """
        @param hosts: list of dicts containing host information.
        """
        return _parse_cataliases(
            self.es.cat.aliases(h=['index','alias'])
        )
    
    def delete_alias(self, alias, index):
        """Remove specified alias.
        
        @param alias: Name of the alias
        @param index: Name of the alias' target index.
        """
        logger.debug('deleting alias %s -> %s' % (alias, index))
        alias = make_index_name(alias)
        index = make_index_name(index)
        if alias not in [a for i, a in self.aliases()]:
            logger.error('Alias does not exist: "%s".' % alias)
            return
        result = self.es.indices.delete_alias(index=index, name=alias)
        logger.debug(result)
        logger.debug('DONE')
        return result
    
    def create_alias(self, alias, index):
        """Point alias at specified index; create index if doesn't exist.
        
        IMPORTANT: There should only ever be ONE alias per index.
        Existing aliases are deleted before specified one is created.
        
        @param alias: Name of the alias
        @param index: Name of the alias' target index.
        """
        logger.debug('creating alias %s -> %s' % (alias, index))
        alias = make_index_name(alias)
        index = make_index_name(index)
        # delete existing alias
        for i,a in self.aliases():
            removed = ''
            if a == alias:
                self.es.indices.delete_alias(
                    # NOTE: "i" is probably not the arg "index".  That's what
                    #       we want. We only want the arg "index".
                    index=i,
                    name=alias
                )
                removed = ' (removed)'
            print('%s -> %s%s' % (a,i,removed))
        result = self.es.indices.put_alias(index=index, name=alias, body='')
        logger.debug(result)
        logger.debug('DONE')
        return result
     
    def target_index(self, alias):
        """Get the name of the index to which the alias points
        
        >>> es.cat.aliases(h=['alias','index'])
        u'documents0 wd5000bmv-2 \n'
        
        @param alias: Name of the alias
        @returns: name of target index
        """
        alias = make_index_name(alias)
        target = []
        for i,a in _parse_cataliases(self.es.cat.aliases(h=['index','alias'])):
            if a == alias:
                target = i
        return target
     
    def create_index(self, index=None):
        """Creates the specified index if it does not already exist.
        
        @returns: JSON dict with status codes and responses
        """
        if not index:
            index = self.indexname
        logger.debug('creating new index: %s' % index)
        body = {
            'settings': {},
            'mappings': {}
            }
        status = self.es.indices.create(index=index, body=body)
        logger.debug(status)
        statuses = self.init_mappings()
        self.model_fields_lists()
        logger.debug('DONE')
     
    def delete_index(self, index=None):
        """Delete the specified index.
        
        @returns: JSON dict with status code and response
        """
        if not index:
            index = self.indexname
        logger.debug('deleting index: %s' % index)
        if self.index_exists(index):
            status = self.es.indices.delete(index=index)
        else:
            status = '{"status":500, "message":"Index does not exist"}'
        logger.debug(status)
        return status
    
    def init_mappings(self):
        """Initializes mappings for Elasticsearch objects
        
        Mappings for objects in (ddr-defs)repo_models.elastic.ELASTICSEARCH_CLASSES
                
        @returns: JSON dict with status code and response
        """
        logger.debug('registering doc types')
        statuses = []
        for class_ in ELASTICSEARCH_CLASSES['all']:
            logger.debug('- %s' % class_['doctype'])
            print('- %s' % class_)
            status = class_['class'].init(index=self.indexname, using=self.es)
            statuses.append( {'doctype':class_['doctype'], 'status':status} )
        return statuses

    def model_fields_lists(self):
        """
        Lists of class-specific fields for each class, in order,
        so documents may be emitted as OrderedDicts with fields in order.
        HOSTS:PORT/INDEX/modelfields/collection/
        HOSTS:PORT/INDEX/modelfields/entity/
        HOSTS:PORT/INDEX/modelfields/segment/
        HOSTS:PORT/INDEX/modelfields/file/
        
        identifier.MODEL_REPO_MODELS
        Identifier.fields_module
        """
        DOCTYPE = 'esobjectfields'
        EXCLUDED = [
            'id', 'title', 'description',
        ]
        for model in MODEL_REPO_MODELS.keys():
            module = module_for_name(MODEL_REPO_MODELS[model]['module'])
            fields = [
                f['name'] for f in module.FIELDS
                if f['elasticsearch']['public'] and (f['name'] not in EXCLUDED)
            ]
            data = {
                'model': model,
                'fields': fields,
            }
            self.post_json(
                doc_type=DOCTYPE,
                document_id=model,
                json_text=json.dumps(data),
            )
    
    def get_mappings(self, raw=False):
        """Get mappings for ESObjects
        
        @param raw: boolean Use lower-level function to get all mappings
        @returns: str JSON
        """
        if raw:
            return self.es.indices.get_mapping(self.indexname)
        return {
            class_['doctype']: elasticsearch_dsl.Mapping.from_es(
                index=self.indexname,
                doc_type=class_['doctype'],
                using=self.es,
            ).to_dict()
            for class_ in ELASTICSEARCH_CLASSES['all']
        }
    
    def post_vocabs(self, path=config.VOCABS_URL):
        """Posts ddr-vocab facets,terms to ES.
        
        curl -XPUT 'http://localhost:9200/meta/facet/format' -d '{ ... }'
        >>> elasticsearch.post_facets(
            '192.168.56.120:9200', 'meta',
            '/opt/ddr-local/ddr-vocab'
            )
        
        @param path: Absolute path to dir containing facet files.
        @returns: JSON dict with status code and response
        """
        logger.debug('index_facets(%s, %s)' % (self.indexname, path))
        vocabs = vocab.get_vocabs(path)
        
        # get classes from ddr-defs
        Facet = ELASTICSEARCH_CLASSES_BY_MODEL['facet']
        FacetTerm = ELASTICSEARCH_CLASSES_BY_MODEL['facetterm']
        
        # push facet data
        statuses = []
        for v in vocabs.keys():
            fid = vocabs[v]['id']
            facet = Facet()
            facet.meta.id = fid
            facet.id = fid
            facet.model = 'facet'
            facet.links_html = fid
            facet.links_json = fid
            facet.links_children = fid
            facet.title = vocabs[v]['title']
            facet.description = vocabs[v]['description']
            logging.debug(facet)
            status = facet.save(using=self.es, index=self.indexname)
            statuses.append(status)
            
            for t in vocabs[v]['terms']:
                tid = t.get('id')
                facetterm_id = '-'.join([
                    str(fid),
                    str(tid),
                ])
                term = FacetTerm()
                term.meta.id = facetterm_id
                term.facet = fid
                term.term_id = tid
                term.links_html = facetterm_id
                term.links_json = facetterm_id
                # TODO doesn't handle location_geopoint
                for field in FacetTerm._doc_type.mapping.to_dict()[
                        FacetTerm._doc_type.name]['properties'].keys():
                    if t.get(field):
                        setattr(term, field, t[field])
                term.id = facetterm_id  # overwrite term.id from original
                logging.debug(term)
                status = term.save(using=self.es, index=self.indexname)
                statuses.append(status)
        
        forms_choices = {
            'topics-choices': vocab.topics_choices(
                vocab.get_vocabs(config.VOCABS_URL)['topics'],
                ELASTICSEARCH_CLASSES_BY_MODEL['facetterm']
            ),
            'facility-choices': vocab.form_vocab_choices(
                vocab.get_vocabs(config.VOCABS_URL)['facility'],
                'facility'
            ),
            'format-choices': vocab.form_vocab_choices(
                vocab.get_vocabs(config.VOCABS_URL)['format'],
                'format'
            ),
            'genre-choices': vocab.form_vocab_choices(
                vocab.get_vocabs(config.VOCABS_URL)['genre'],
                'genre'
            ),
            'rights-choices': vocab.form_vocab_choices(
                vocab.get_vocabs(config.VOCABS_URL)['rights'],
                'rights'
            ),
        }
        self.post_json('forms', 'forms-choices', forms_choices)
        return statuses
    
    def facet_terms(self, facet, order='term', all_terms=True, model=None):
        """Gets list of terms for the facet.
        
        $ curl -XGET 'http://192.168.56.101:9200/ddr/entity/_search?format=yaml' -d '{
          "fields": ["id"],
          "query": { "match_all": {} },
          "facets": {
            "genre_facet_result": {
              "terms": {
                "order": "count",
                "field": "genre"
              }
            }
          }
        }'
        Sample results:
            {
              u'_type': u'terms',
              u'missing': 203,
              u'total': 49,
              u'other': 6,
              u'terms': [
                {u'term': u'photograph', u'count': 14},
                {u'term': u'ephemera', u'count': 6},
                {u'term': u'advertisement', u'count': 6},
                {u'term': u'book', u'count': 5},
                {u'term': u'architecture', u'count': 3},
                {u'term': u'illustration', u'count': 2},
                {u'term': u'fieldnotes', u'count': 2},
                {u'term': u'cityscape', u'count': 2},
                {u'term': u'blank_form', u'count': 2},
                {u'term': u'portrait', u'count': 1}
              ]
            }
        
        @param facet: Name of field
        @param order: term, count, reverse_term, reverse_count
        @param model: (optional) Type of object ('collection', 'entity', 'file')
        @returns raw output of facet query
        """
        payload = {
            "fields": ["id"],
            "query": { "match_all": {} },
            "facets": {
                "results": {
                    "terms": {
                        "size": MAX_SIZE,
                        "order": order,
                        "all_terms": all_terms,
                        "field": facet
                    }
                }
            }
        }
        results = self.es.search(index=self.indexname, doc_type=model, body=payload)
        return results['facets']['results']

    def _repo_org(self, path, doctype, remove=False):
        """
        seealso DDR.models.common.DDRObject.to_esobject
        """
        # get and validate file
        data = load_json(path)
        if (not (data.get('id') and data.get('repo'))):
            raise Exception('Data file is not well-formed.')
        oi = Identifier(id=data['id'])
        d = OrderedDict()
        d['id'] = oi.id
        d['model'] = oi.model
        d['parent_id'] = oi.parent_id(stubs=1)
        # links
        d['links_html'] = oi.id
        d['links_json'] = oi.id
        d['links_img'] = '%s/logo.png' % oi.id
        d['links_thumb'] = '%s/logo.png' % oi.id
        d['links_parent'] = oi.parent_id(stubs=1)
        d['links_children'] = oi.id
        # title,description
        d['title'] = data['title']
        d['description'] = data['description']
        d['url'] = data['url']
        # ID components (repo, org, cid, ...) as separate fields
        idparts = deepcopy(oi.idparts)
        idparts.pop('model')
        for k in ID_COMPONENTS:
            d[k] = '' # ensure all fields present
        for k,v in idparts.iteritems():
            d[k] = v
        # add/update
        if remove and self.exists(doctype, oi.id):
            results = self.es.delete(
                index=self.indexname, doc_type=doctype, id=oi.id
            )
        else:
            results = self.es.index(
                index=self.indexname, doc_type=doctype, id=oi.id, body=d
            )
        return results
    
    def repo(self, path, remove=False):
        """Add/update or remove base repository metadata.
        
        @param path: str Absolute path to repository.json
        @param remove: bool Remove record from ES
        @returns: dict
        """
        return self._repo_org(path, 'repository', remove)
    
    def org(self, path, remove=False):
        """Add/update or remove base organization metadata.
        
        @param path: str Absolute path to organization.json
        @param remove: bool Remove record from ES
        @returns: dict
        """
        return self._repo_org(path, 'organization', remove)
    
    def narrators(self, path):
        """Add/update or remove narrators metadata.
        
        @param path: str Absolute path to narrators.json
        @returns: dict
        """
        DOC_TYPE = 'narrator'
        data = load_json(path)
        for document in data['narrators']:
            document['model'] = 'narrator'
            has_published = document.get('has_published', '')
            if has_published.isdigit():
                has_published = int(has_published)
            if has_published:
                result = self.post_json(DOC_TYPE, document['id'], json.dumps(document))
                logging.debug('%s %s', document['id'], result)
            else:
                logging.debug('%s not published' % document['id'])
                if self.get(DOC_TYPE, document['id'], fields=[]):
                    self.delete(document['id'])
    
    def post_json(self, doc_type, document_id, json_text):
        """POST the specified JSON document as-is.
        
        @param doc_type: str
        @param document_id: str
        @param json_text: str JSON-formatted string
        @returns: dict Status info.
        """
        logger.debug('post_json(%s, %s, %s)' % (
            self.indexname, doc_type, document_id
        ))
        return self.es.index(
            index=self.indexname, doc_type=doc_type, id=document_id, body=json_text
        )

    def post(self, document, public_fields=[], additional_fields={}, parents={}, force=False):
        """Add a new document to an index or update an existing one.
        
        This function can produce ElasticSearch documents in two formats:
        - old-style list-of-dicts used in the DDR JSON files.
        - normal dicts used by ddr-public.
        
        DDR metadata JSON files are structured as a list of fieldname:value dicts.
        This is done so that the fields are always in the same order, making it
        possible to easily see the difference between versions of a file.
        [IMPORTANT: documents MUST contain an 'id' field!]
        
        In ElasticSearch, documents are structured in a normal dict so that faceting
        works properly.
        
        curl -XPUT 'http://localhost:9200/ddr/collection/ddr-testing-141' -d '{ ... }'
        
        @param document: Collection,Entity,File The object to post.
        @param public_fields: list
        @param additional_fields: dict
        @param parents: dict Basic metadata for parent documents.
        @param force: boolean Bypass status and public checks.
        @returns: JSON dict with status code and response
        """
        logger.debug('post(%s, %s, %s)' % (
            self.indexname, document, force
        ))

        if force:
            publishable = True
            public = False
        else:
            if not parents:
                parents = _parents_status([document.identifier.path_abs()])
            publishable = _publishable([document.identifier.path_abs()], parents)
            public = True
        if not publishable:
            return {'status':403, 'response':'object not publishable'}

        d = document.to_esobject(public_fields=public_fields, public=public)
        logger.debug('saving')
        status = d.save(using=self.es, index=self.indexname)
        logger.debug(str(status))
        return status
    
    def post_multi(self, path, recursive=False, force=False):
        """Publish (index) specified document and (optionally) its children.
        
        After receiving a list of metadata files, index() iterates through the
        list several times.  The first pass weeds out paths to objects that can
        not be published (e.g. object or its parent is unpublished).
        
        In the final pass, a list of public/publishable fields is chosen based
        on the model.  Additional fields not in the model (e.g. parent ID, parent
        organization/collection/entity ID) are packaged.  Then everything is sent
        off to post().
        
        @param path: Absolute path to directory containing object metadata files.
        @param recursive: Whether or not to recurse into subdirectories.
        @param force: boolean Just publish the damn collection already.
        @returns: number successful,list of paths that didn't work out
        """
        logger.debug('index(%s, %s, %s, %s)' % (self.indexname, path, recursive, force))
        
        publicfields = _public_fields()
        
        # process a single file if requested
        if os.path.isfile(path):
            paths = [path]
        else:
            # files listed first, then entities, then collections
            paths = util.find_meta_files(path, recursive, files_first=1)
        
        # Store value of public,status for each collection,entity.
        # Values will be used by entities and files to inherit these values
        # from their parent.
        parents = _parents_status(paths)
        
        # Determine if paths are publishable or not
        paths = _publishable(paths, parents, force=force)
        
        skipped = 0
        successful = 0
        bad_paths = []
        
        num = len(paths)
        for n,path in enumerate(paths):
            oi = path.get('identifier')
            if not oi:
                path['note'] = 'No identifier'
                bad_paths.append(path)
                continue

            # TODO write logs instead of print
            print('%s | %s/%s %s %s %s' % (
                datetime.now(config.TZ), n+1, num, path['action'], oi.id, path['note'])
            )

            try:
                document = oi.object()
            except Exception as err:
                path['note'] = 'Could not instantiate: %s' % err
                bad_paths.append(path)
                continue
            if not document:
                path['note'] = 'No document'
                bad_paths.append(path)
                continue
            
            # see if document exists
            existing_v = None
            d = self.get(oi.model, oi.id)
            if d:
                existing_v = d.meta.version
            
            # post document
            if path['action'] == 'POST':
                created = self.post(document, parents=parents, force=True)
                # force=True bypasses _publishable in post() function
            # delete previously published items now marked incomplete/private
            elif existing_v and (path['action'] == 'SKIP'):
                print('%s | %s/%s DELETE' % (datetime.now(config.TZ), n+1, num))
                self.delete(oi.id)
            
            if path['action'] == 'SKIP':
                skipped += 1
                continue
            
            # version is incremented with each update
            posted_v = None
            # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment'
            es_model = ELASTICSEARCH_CLASSES_BY_MODEL[oi.model]._doc_type.name
            d = self.get(es_model, oi.id)
            if d:
                posted_v = d.meta.version

            # success: created, or version number incremented
            status = 'ERROR - unspecified'
            if posted_v and not existing_v:
                status = 'CREATED'
                successful += 1
            elif (existing_v and posted_v) and (existing_v < posted_v):
                status = 'UPDATED'
                successful += 1
            elif not posted_v:
                status = 'ERROR: not created'
                bad_paths.append(path)
                print(status)
            
        logger.debug('INDEXING COMPLETED')
        return {'total':len(paths), 'skipped':skipped, 'successful':successful, 'bad':bad_paths}
     
    def exists(self, model, document_id):
        """
        @param model:
        @param document_id:
        """
        return self.es.exists(index=self.indexname, doc_type=model, id=document_id)
     
    def get(self, model, document_id, fields=None):
        """
        @param model:
        @param document_id:
        @param fields: boolean Only return these fields
        """
        if self.exists(model, document_id):
            ES_Class = ELASTICSEARCH_CLASSES_BY_MODEL[model]
            return ES_Class.get(document_id, using=self.es, index=self.indexname)
        return None

    def count(self, doctypes=[], query={}):
        """Executes a query and returns number of hits.
        
        The "query" arg must be a dict that conforms to the Elasticsearch query DSL.
        See docstore.search_query for more info.
        
        @param doctypes: list Type of object ('collection', 'entity', 'file')
        @param query: dict The search definition using Elasticsearch Query DSL
        @returns raw ElasticSearch query output
        """
        logger.debug('count(index=%s, doctypes=%s, query=%s)' % (
            self.indexname, doctypes, query
        ))
        if not query:
            raise Exception("Can't do an empty search. Give me something to work with here.")
        
        doctypes = ','.join(doctypes)
        logger.debug(json.dumps(query))
        
        return self.es.count(
            index=self.indexname,
            doc_type=doctypes,
            body=query,
        )
    
    def delete(self, document_id, recursive=False):
        """Delete a document and optionally its children.
        
        @param document_id:
        @param recursive: True or False
        """
        identifier = Identifier(id=document_id)
        if recursive:
            if identifier.model == 'collection': doc_type = 'collection,entity,file'
            elif identifier.model == 'entity': doc_type = 'entity,file'
            elif identifier.model == 'file': doc_type = 'file'
            query = 'id:"%s"' % identifier.id
            try:
                return self.es.delete_by_query(
                    index=self.indexname, doc_type=doc_type, q=query
                )
            except TransportError:
                pass
        else:
            try:
                return self.es.delete(
                    index=self.indexname, doc_type=identifier.model, id=identifier.id
                )
            except TransportError:
                pass

    def search(self, doctypes=[], query={}, sort=[], fields=[], from_=0, size=MAX_SIZE):
        """Executes a query, get a list of zero or more hits.
        
        The "query" arg must be a dict that conforms to the Elasticsearch query DSL.
        See docstore.search_query for more info.
        
        @param doctypes: list Type of object ('collection', 'entity', 'file')
        @param query: dict The search definition using Elasticsearch Query DSL
        @param sort: list of (fieldname,direction) tuples
        @param fields: str
        @param from_: int Index of document from which to start results
        @param size: int Number of results to return
        @returns raw ElasticSearch query output
        """
        logger.debug('search(index=%s, doctypes=%s, query=%s, sort=%s, fields=%s, from_=%s, size=%s)' % (
            self.indexname, doctypes, query, sort, fields, from_, size
        ))
        if not query:
            raise Exception("Can't do an empty search. Give me something to work with here.")
        
        doctypes = ','.join(doctypes)
        logger.debug(json.dumps(query))
        _clean_dict(sort)
        sort_cleaned = _clean_sort(sort)
        fields = ','.join(fields)
        
        results = self.es.search(
            index=self.indexname,
            doc_type=doctypes,
            body=query,
            sort=sort_cleaned,
            from_=from_,
            size=size,
            _source_include=fields,
        )
        return results
    
    def reindex(self, source, dest):
        """Copy documents from one index to another.
        
        @param source: str Name of source index.
        @param dest: str Name of destination index.
        @returns: number successful,list of paths that didn't work out
        """
        logger.debug('reindex(%s, %s)' % (source, dest))
        
        if self.index_exists(source):
            logger.info('Source index exists: %s' % source)
        else:
            return '{"status":500, "message":"Source index does not exist"}'
        
        if self.index_exists(dest):
            logger.info('Destination index exists: %s' % dest)
        else:
            return '{"status":500, "message":"Destination index does not exist"}'
        
        version = self.es.info()['version']['number']
        logger.debug('Elasticsearch version %s' % version)
        
        if tuple(int(n) for n in version.split('.')[:2]) >= (2, 3):
            logger.debug('new API')
            body = {
                "source": {"index": source},
                "dest": {"index": dest}
            }
            results = self.es.reindex(
                body=json.dumps(body),
                refresh=None,
                requests_per_second=0,
                timeout='1m',
                wait_for_active_shards=1,
                wait_for_completion=False,
            )
        else:
            logger.debug('pre-2.3 legacy API')
            from elasticsearch import helpers
            results = helpers.reindex(
                self.es, source, dest,
                #query=None,
                #target_client=None,
                #chunk_size=500,
                #scroll=5m,
                #scan_kwargs={},
                #bulk_kwargs={}
            )
        return results
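
# Usage sketch for the Docstore wrapper above. The host and index values are
# assumptions for illustration, not taken from the original project:
#   ds = Docstore(hosts='localhost:9200', index='ddr-test')
#   ds.create_index()
#   ds.post_json('repository', 'ddr', '{"id": "ddr", "title": "Test repository"}')
#   ds.exists('repository', 'ddr')                                       # -> True
#   ds.count(doctypes=['repository'], query={'query': {'match_all': {}}})
#   ds.delete_index()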
Exemplo n.º 52
0
class Index(object):
    """
    Base class to define some common methods across indexes.
    """

    # The _index and _type define the URL path to Elasticsearch, e.g.:
    #   http://localhost:9200/{_index}/{_type}/_search
    _index = "readthedocs"
    _type = None

    def __init__(self):
        self.es = Elasticsearch(settings.ES_HOSTS)

    def get_settings(self, settings_override=None):
        """
        Returns settings to be passed to ES create_index.

        If `settings_override` is provided, this will use `settings_override`
        to override the defaults defined here.

        """
        default_settings = {
            "number_of_replicas": settings.ES_DEFAULT_NUM_REPLICAS,
            "number_of_shards": settings.ES_DEFAULT_NUM_SHARDS,
            "refresh_interval": "5s",
            "store.compress.tv": True,
            "store.compress.stored": True,
            "analysis": self.get_analysis(),
        }
        if settings_override:
            default_settings.update(settings_override)

        return default_settings

    def get_analysis(self):
        """
        Returns the analysis dict to be used in settings for create_index.

        For languages that ES supports we define either the minimal or light
        stemming, which isn't as aggressive as the snowball stemmer. We also
        define the stopwords for that language.

        For all languages we've customized we're using the ICU plugin.

        """
        analyzers = {}
        filters = {}

        # The default is used for fields that need ICU but are composed of
        # many languages.
        analyzers["default_icu"] = {
            "type": "custom",
            "tokenizer": "icu_tokenizer",
            "filter": ["word_delimiter", "icu_folding", "icu_normalizer"],
        }

        # Customize the word_delimiter filter to set various options.
        filters["custom_word_delimiter"] = {"type": "word_delimiter", "preserve_original": True}

        return {"analyzer": analyzers, "filter": filters}

    def timestamped_index(self):
        return "{0}-{1}".format(self._index, datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    def create_index(self, index=None):
        """
        Creates index.

        This uses `get_settings` and `get_mappings` to define the index.

        """
        index = index or self._index
        body = {"settings": self.get_settings()}
        self.es.indices.create(index=index, body=body)

    def put_mapping(self, index=None):
        index = index or self._index
        self.es.indices.put_mapping(self._type, self.get_mapping(), index)

    def bulk_index(self, data, index=None, chunk_size=500, parent=None, routing=None):
        """
        Given a list of documents, uses Elasticsearch bulk indexing.

        For each doc this calls `extract_document`, then indexes.

        `chunk_size` defaults to the elasticsearch lib's default. Override per
        your document size as needed.

        """
        index = index or self._index
        docs = []
        for d in data:
            source = self.extract_document(d)
            doc = {"_index": index, "_type": self._type, "_id": source["id"], "_source": source}
            if parent:
                doc["_parent"] = parent
            if routing:
                doc["_routing"] = routing
            docs.append(doc)

        # TODO: This doesn't work with the new ES setup.
        bulk_index(self.es, docs, chunk_size=chunk_size)

    def index_document(self, data, index=None, parent=None, routing=None):
        doc = self.extract_document(data)
        kwargs = {"index": index or self._index, "doc_type": self._type, "body": doc, "id": doc["id"]}
        if parent:
            kwargs["parent"] = parent
        if routing:
            kwargs["routing"] = routing
        self.es.index(**kwargs)

    def delete_document(self, body, index=None, parent=None, routing=None):
        kwargs = {"index": index or self._index, "doc_type": self._type, "body": body}
        if parent:
            kwargs["parent"] = parent
        if routing:
            kwargs["routing"] = routing
        return self.es.delete_by_query(**kwargs)

    def get_mapping(self):
        """
        Returns the mapping for this _index and _type.
        """
        raise NotImplementedError

    def extract_document(self, pk, obj):
        """
        Extracts the Elasticsearch document for this object instance.
        """
        raise NotImplementedError

    def update_aliases(self, new_index, delete=True):
        """
        Points `_index` to `new_index` and deletes `_index` if delete=True.

        The ES `update_aliases` is atomic.
        """
        old_index = None

        # Get current alias, if any.
        try:
            aliases = self.es.indices.get_alias(name=self._index)
            if aliases and aliases.keys():
                old_index = aliases.keys()[0]
        except exceptions.NotFoundError:
            pass

        actions = []
        if old_index:
            actions.append({"remove": {"index": old_index, "alias": self._index}})
        actions.append({"add": {"index": new_index, "alias": self._index}})

        self.es.indices.update_aliases(body={"actions": actions})

        # Delete old index if any and if specified.
        if delete and old_index:
            self.es.indices.delete(index=old_index)

    def search(self, body, **kwargs):
        return self.es.search(index=self._index, doc_type=self._type, body=body, **kwargs)
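
# A minimal sketch of a concrete subclass of the Index base class above.
# The doc type and field names ("project", "id", "name", "description") are
# assumptions for illustration, not taken from the original project.
class ProjectIndex(Index):
    _type = "project"

    def get_mapping(self):
        # Mapping body keyed by doc type, as expected by put_mapping().
        return {
            self._type: {
                "properties": {
                    "id": {"type": "long"},
                    "name": {"type": "string", "analyzer": "default_icu"},
                    "description": {"type": "string"},
                }
            }
        }

    def extract_document(self, data):
        # Callers such as bulk_index() and index_document() pass a single
        # object and expect a dict with at least an "id" key.
        return {
            "id": data["id"],
            "name": data.get("name", ""),
            "description": data.get("description", ""),
        }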
Exemplo n.º 53
0
# Module-level defaults and the parse_args() header are assumed here; they are
# implied by the parse_args() call further below.
queryFile = None
esServer = None
index = None

def parse_args():
    global queryFile, esServer, index
    for arg_idx, arg in enumerate(sys.argv):
        if arg == "--query":
            queryFile = sys.argv[arg_idx+1]
            continue
        if arg == "--server":
            esServer = sys.argv[arg_idx+1]
            continue
        if arg == "--index":
            index = sys.argv[arg_idx+1]
            continue

def die():
    print "Please input the required parameters"
    print "Usage: deleteFromElasticSearch.py --query <Name of query file> --server <Server with port and auth info> --index <index name>"
    exit(1)

parse_args()
if esServer is None or queryFile is None or index is None:
    die()

queryBody = None
with open(queryFile, 'r') as queryFile_handle:
    queryBody = queryFile_handle.read()

print "Connnect:" + esServer
es = Elasticsearch([esServer])
#print es.info()
print "Delete from index:" + index + ", query:" + queryBody
es.delete_by_query(index, body=queryBody)
print "Done deleting from index"
Exemplo n.º 54
0
class TestESLogs(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        config.DB_NAME = "pypro_tests"
        config.PRINT_CONSOLE = False

    def setUp(self):
        if os.path.exists(utils.log_dir):
            shutil.rmtree(utils.log_dir, ignore_errors=True)
        self.es = Elasticsearch()
        self.es.delete_by_query(config.DB_NAME, body='{"query":{"match_all":{}}}', ignore=[404])

    def test_cpu_sys_es(self):
        log = ESNetLogger()
        log.cpu_sys(0, 1, 1, 1, 1)
        log.cpu_sys(1, 3, 2, 5, 6)
        log.cpu_sys(3, 22, 99, 11, 4)
        log.cpu_sys(5, 155, 122, 12, 22)
        log.close()
        time.sleep(1)
        data = self.es.search(index=config.DB_NAME, body='{"query":{"match_all":{}}, "sort": { "time": { "order": "asc" }}}')
        items = data["hits"]["hits"]
        self.assertEqual(4, len(items), "number of cpu sys items logged")
        self.assert_cpu_sys(items[0]["_source"], 0, 1, 1, 1, 1)
        self.assert_cpu_sys(items[1]["_source"], 1, 3, 2, 5, 6)
        self.assert_cpu_sys(items[2]["_source"], 3, 22, 99, 11, 4)
        self.assert_cpu_sys(items[3]["_source"], 5, 155, 122, 12, 22)

    def assert_cpu_sys(self, item, time, user_count, system_count, idle_count, percent):
        self.assertEqual(5, len(item), "number of properties for a cpu sys item")
        self.assertEqual(item['time'], time)
        self.assertEqual(item['percent'], percent)
        self.assertEqual(item['idle_count'], idle_count)
        self.assertEqual(item['system_count'], system_count)
        self.assertEqual(item['user_count'], user_count)

    def test_cpu_proc_es(self):
        log = ESNetLogger()
        log.cpu_proc(0, 1, 1, 1, 1, 1, 1, 1, "p1")
        log.cpu_proc(1, 2, 1, 3, 4, 2, 3, 1, "p2")
        log.cpu_proc(2, 3, 2, 122, 7, 5, 8, 11, "p3")
        log.cpu_proc(10, 1, 1, 1, 1, 1, 1, 1, "p1")
        log.cpu_proc(11, 2, 1, 3, 4, 2, 3, 1, "p2")
        log.cpu_proc(12, 3, 2, 122, 7, 5, 8, 11, "p3")
        log.cpu_proc(20, 1, 1, 5, 1, 4, 3, 2, "p1")
        log.cpu_proc(21, 3, 2, 555, 7, 11, 55, 32, "p3")
        log.close()
        time.sleep(1)
        data = self.es.search(index=config.DB_NAME, body='{"query":{"match_all":{}}, "sort": { "time": { "order": "asc" }}}')
        items = data["hits"]["hits"]
        self.assertEqual(8, len(items), "number of cpu proc items logged")
#        print(str(items[0]["_source"]))
        self.assert_cpu_proc(items[0]["_source"], 0, 1, 1, 1, 1, 1, 1, 1, "p1")
        self.assert_cpu_proc(items[1]["_source"], 1, 2, 1, 3, 4, 2, 3, 1, "p2")
        self.assert_cpu_proc(items[2]["_source"], 2, 3, 2, 122, 7, 5, 8, 11, "p3")
        self.assert_cpu_proc(items[3]["_source"], 10, 1, 1, 1, 1, 1, 1, 1, "p1")
        self.assert_cpu_proc(items[4]["_source"], 11, 2, 1, 3, 4, 2, 3, 1, "p2")
        self.assert_cpu_proc(items[5]["_source"], 12, 3, 2, 122, 7, 5, 8, 11, "p3")
        self.assert_cpu_proc(items[6]["_source"], 20, 1, 1, 5, 1, 4, 3, 2, "p1")
        self.assert_cpu_proc(items[7]["_source"], 21, 3, 2, 555, 7, 11, 55, 32, "p3")

    def assert_cpu_proc(self, item, time, pid, priority, ctx_count, n_threads, cpu_user, cpu_system, percent, pname):
        self.assertEqual(9, len(item), "number of properties for a cpu proc item")
        self.assertEqual(item['time'], time)
        self.assertEqual(item['pid'], pid)
        self.assertEqual(item['priority'], priority)
        self.assertEqual(item['context_switches'], ctx_count)
        self.assertEqual(item['threads'], n_threads)
        self.assertEqual(item['cpu_user'], cpu_user)
        self.assertEqual(item['cpu_system'], cpu_system)
        self.assertEqual(item['percent'], percent)
        self.assertEqual(item['pname'], pname)
        # args = locals().copy()
        # del args["self"]
        # del args["item"]
        # expected_len = len(args) #reduce self and item
        # actual_len = len(item)
        # self.assertEqual(expected_len, actual_len, "number of properties for a cpu proc")
        # for arg in args:
        #     value = args[arg]
        #     if arg == "time":
        #         value *= 1000
        #     self.assertEqual(item[arg], value)

    def test_mem_sys_es(self):
        log = ESNetLogger()
        log.mem_sys(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
        log.mem_sys(10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
        log.mem_sys(12, 34, 654, 24, 33, 23, 442, 1, 13, 21, 44)
        log.mem_sys(15, 3445, 345, 345, 44, 745, 367, 32, 1111, 33, 55)
        log.mem_sys(33, 33, 453, 998, 347, 976, 8544, 45, 5555, 66, 33)
        log.close()
        time.sleep(1)
        data = self.es.search(index=config.DB_NAME, body='{"query":{"match_all":{}}, "sort": { "time": { "order": "asc" }}}')
        items = data["hits"]["hits"]
        self.assertEqual(5, len(items), "number of mem sys items logged")
        self.assert_mem_sys(items[0]["_source"], 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
        self.assert_mem_sys(items[1]["_source"], 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
        self.assert_mem_sys(items[2]["_source"], 12, 34, 654, 24, 33, 23, 442, 1, 13, 21, 44)
        self.assert_mem_sys(items[3]["_source"], 15, 3445, 345, 345, 44, 745, 367, 32, 1111, 33, 55)
        self.assert_mem_sys(items[4]["_source"], 33, 33, 453, 998, 347, 976, 8544, 45, 5555, 66, 33)

    def assert_mem_sys(self, item, time, available, percent, used, free,
                swap_total, swap_used, swap_free, swap_in, swap_out, swap_percent):
        self.assertEqual(11, len(item), "number of properties for a mem sys item")
        self.assertEqual(item['time'], time)
        self.assertEqual(item['available'], available)
        self.assertEqual(item['percent'], percent)
        self.assertEqual(item['used'], used)
        self.assertEqual(item['free'], free)
        self.assertEqual(item['swap_total'], swap_total)
        self.assertEqual(item['swap_used'], swap_used)
        self.assertEqual(item['swap_free'], swap_free)
        self.assertEqual(item['swap_in'], swap_in)
        self.assertEqual(item['swap_out'], swap_out)
        self.assertEqual(item['swap_percent'], swap_percent)

    def test_mem_proc_es(self):
        log = ESNetLogger()
        log.mem_proc(0, 1, 11, 15, 5, "p1")
        log.mem_proc(1, 2, 1, 3, 2, "p2")
        log.mem_proc(2, 5432, 21, 33, 9, "p3")
        log.mem_proc(5, 1, 22, 11, 3, "p1")
        log.mem_proc(6, 5432, 7, 55, 7, "p3")
        log.mem_proc(66, 1, 11, 15, 5, "p1")
        log.mem_proc(67, 2, 11, 0, 22, "p2")
        log.mem_proc(68, 5432, 212, 334, 44, "p3")
        log.close()
        time.sleep(1)
        data = self.es.search(index=config.DB_NAME, body='{"query":{"match_all":{}}, "sort": { "time": { "order": "asc" }}}')
        items = data["hits"]["hits"]
        self.assertEqual(8, len(items), "number of mem proc items logged")
#        print(str(items[0]["_source"]))
        self.assert_mem_proc(items[0]["_source"], 0, 1, 11, 15, 5, "p1")
        self.assert_mem_proc(items[1]["_source"], 1, 2, 1, 3, 2, "p2")
        self.assert_mem_proc(items[2]["_source"], 2, 5432, 21, 33, 9, "p3")
        self.assert_mem_proc(items[3]["_source"], 5, 1, 22, 11, 3, "p1")
        self.assert_mem_proc(items[4]["_source"], 6, 5432, 7, 55, 7, "p3")
        self.assert_mem_proc(items[5]["_source"], 66, 1, 11, 15, 5, "p1")
        self.assert_mem_proc(items[6]["_source"], 67, 2, 11, 0, 22, "p2")
        self.assert_mem_proc(items[7]["_source"], 68, 5432, 212, 334, 44, "p3")

    def assert_mem_proc(self, item, time, pid, rss, vms, percent, pname):
        self.assertEqual(6, len(item), "number of properties for a mem proc item")
        self.assertEqual(item['time'], time)
        self.assertEqual(item['pid'], pid)
        self.assertEqual(item['rss'], rss)
        self.assertEqual(item['vms'], vms)
        self.assertEqual(item['percent'], percent)
        self.assertEqual(item['pname'], pname)

    def test_io_sys_es(self):
        log = ESNetLogger()
        log.io_sys(11111, 22, 22, 34, 43, 11, 11, 5, 3)
        log.io_sys(22222, 55, 23, 44, 34, 23, 17, 15, 4)
        log.io_sys(22233, 65, 23, 777, 44, 28, 18, 35, 5)
        log.io_sys(25555, 78, 44, 1911, 53, 99434, 43, 43, 21)
        log.close()
        time.sleep(1)
        data = self.es.search(index=config.DB_NAME, body='{"query":{"match_all":{}}, "sort": { "time": { "order": "asc" }}}')
        items = data["hits"]["hits"]
        self.assertEqual(4, len(items), "number of io sys items logged")
#        print(str(items[0]["_source"]))
        self.assert_io_sys(items[0]["_source"], 11111, 22, 22, 34, 43, 11, 11, 5, 3)
        self.assert_io_sys(items[1]["_source"], 22222, 55, 23, 44, 34, 23, 17, 15, 4)
        self.assert_io_sys(items[2]["_source"], 22233, 65, 23, 777, 44, 28, 18, 35, 5)
        self.assert_io_sys(items[3]["_source"], 25555, 78, 44, 1911, 53, 99434, 43, 43, 21)

    def assert_io_sys(self, item, time, bytes_sent, bytes_recv, packets_sent, packets_recv, errin, errout, dropin, dropout):
        self.assertEqual(9, len(item), "number of properties for an io sys item")
        self.assertEqual(item['time'], time)
        self.assertEqual(item['bytes_sent'], bytes_sent)
        self.assertEqual(item['bytes_recv'], bytes_recv)
        self.assertEqual(item['packets_sent'], packets_sent)
        self.assertEqual(item['packets_received'], packets_recv)
        self.assertEqual(item['errors_in'], errin)
        self.assertEqual(item['errors_out'], errout)
        self.assertEqual(item['dropped_in'], dropin)
        self.assertEqual(item['dropped_out'], dropout)

    def test_proc_error_es(self):
        log = ESNetLogger()
        log.proc_error(11111, 22, "epic fail")
        log.proc_error(11112, 9758, "fail")
        log.proc_error(11113, 7364, "little fail")
        log.close()
        time.sleep(1)
        data = self.es.search(index=config.DB_NAME, body='{"query":{"match_all":{}}, "sort": { "time": { "order": "asc" }}}')
        items = data["hits"]["hits"]
        self.assertEqual(3, len(items), "number of proc error items logged")
#        print(str(items[0]["_source"]))
        self.assert_proc_error(items[0]["_source"], 11111, 22, "epic fail")
        self.assert_proc_error(items[1]["_source"], 11112, 9758, "fail")
        self.assert_proc_error(items[2]["_source"], 11113, 7364, "little fail")

    def assert_proc_error(self, item, time, pid, name):
        self.assertEqual(3, len(item), "number of properties for a proc error item")
        self.assertEqual(item['time'], time)
        self.assertEqual(item['pid'], pid)
        self.assertEqual(item['name'], name)

    def test_proc_info_es(self):
        log = ESNetLogger()
        log.proc_info(11111, 22, "proc1")
        log.proc_info(11111, 9758, "proc2")
        log.proc_info(11111, 7364, "proc4")
        log.proc_info(11111, 3332, "proc3")
        log.close()
        time.sleep(1)
        data = self.es.search(index=config.DB_NAME, body='{"query":{"match_all":{}}, "sort": { "pid": { "order": "asc" }}}')
        items = data["hits"]["hits"]
        self.assertEqual(4, len(items), "number of proc info items logged")
#        print(str(items[0]["_source"]))
        self.assert_proc_info(items[0]["_source"], 11111, 22, "proc1")
        self.assert_proc_info(items[1]["_source"], 11111, 3332, "proc3")
        self.assert_proc_info(items[2]["_source"], 11111, 7364, "proc4")
        self.assert_proc_info(items[3]["_source"], 11111, 9758, "proc2")

    def assert_proc_info(self, item, time, pid, name):
        self.assertEqual(3, len(item), "number of properties for a proc info item")
        self.assertEqual(item['time'], time)
        self.assertEqual(item['pid'], pid)
        self.assertEqual(item['name'], name)
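
The tests above exercise ESNetLogger without showing its implementation. Below is a minimal, hypothetical sketch of a logger that would satisfy the field names and counts checked by the assert helpers; it assumes the same Elasticsearch client and config.DB_NAME index that the tests use, and is not the original class.

from elasticsearch import Elasticsearch

import config  # same config module the tests read DB_NAME from


class MinimalESNetLogger(object):
    """Hypothetical stand-in for ESNetLogger: one document indexed per call."""

    def __init__(self):
        self.es = Elasticsearch()
        self.id = 0

    def _index(self, doc):
        # doc_type "event" is an assumption; the tests only query by index.
        self.es.index(index=config.DB_NAME, doc_type="event", id=self.id, body=doc)
        self.id += 1

    def mem_proc(self, time, pid, rss, vms, percent, pname):
        self._index({'time': time, 'pid': pid, 'rss': rss, 'vms': vms,
                     'percent': percent, 'pname': pname})

    def io_sys(self, time, bytes_sent, bytes_recv, packets_sent, packets_recv,
               errin, errout, dropin, dropout):
        self._index({'time': time, 'bytes_sent': bytes_sent,
                     'bytes_recv': bytes_recv, 'packets_sent': packets_sent,
                     'packets_received': packets_recv, 'errors_in': errin,
                     'errors_out': errout, 'dropped_in': dropin,
                     'dropped_out': dropout})

    def proc_error(self, time, pid, name):
        self._index({'time': time, 'pid': pid, 'name': name})

    def proc_info(self, time, pid, name):
        self._index({'time': time, 'pid': pid, 'name': name})

    def close(self):
        # Make everything searchable before the tests query the index.
        self.es.indices.refresh(index=config.DB_NAME)
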
class EscuchaSync(object):
	# Constants
	VALID_TWTR_CHARS = u"_@#{0}{1}áéíóúÁÉÍÓÚñÑ".format(string.ascii_letters, string.digits)

	def __init__(self):
		self.es = Elasticsearch('http://*****:*****@%'")
		#self.cursor.execute("select name from escucha.follow_pages")
		self.authors = {r[0][1:].lower() for r in self.cursor.fetchall()}

	def reload_sources(self):
		self.cursor.execute("SELECT id, name FROM escucha.sources")
		self.sources = {r[0]: r[1] for r in self.cursor.fetchall()}

	def reload_censored_keywords(self):
		"""
		TODO: If there are many censored keywords it would be better to filter on all of them
		but only delete the most recent ones on each iteration of the loop
		"""
		self.cursor.execute("SELECT term, in_title, in_description, in_author, blocked_on FROM escucha.blocked_in_reports")
		self.censored_in_title = []
		self.censored_in_description = []
		self.censored_in_author = []
		for r in self.cursor.fetchall():
			term = r[0].lower()
			if r[1]: # In title
				self.censored_in_title.append(term)
			if r[2]: # In description
				self.censored_in_description.append(term)
			if r[3]: # In author
				self.censored_in_author.append(term)

	def has_censored_terms(self, text, censored_list):
		if not text:
			return False
		for kw in censored_list:
			if kw in text.lower():
				return True
		return False

	def has_censored_title(self, text):
		return self.has_censored_terms(text, self.censored_in_title)

	def has_censored_description(self, text):
		return self.has_censored_terms(text, self.censored_in_description)

	def has_censored_author(self, text):
		return self.has_censored_terms(text, self.censored_in_author)

	def is_spanish(self, text):
		language, confidence = langid.classify(text)
		return language == 'es'

	def is_spanish_item(self, title, description):
		if not title and not description:
			return True
		else:
			spanish_item = False
			if title:
				spanish_item = self.is_spanish(title)
			if not spanish_item and description:
				spanish_item = self.is_spanish(description)
			return spanish_item

	def get_geometa(self, lat, lon):
		# name_0: country, name_1: autonomous community, name_2: province, name_3: region, name_4: municipality
		self.local_cursor.execute("""
			SELECT
				name_2, name_3, name_4
			FROM
				municipios
			WHERE
				ST_CONTAINS(geom, ST_PointFromText('POINT(%s %s)', 4326));
		""", (lon, lat))
		return self.local_cursor.fetchone()

	def get_geoweight(self, lat, lon):
		self.local_cursor.execute("""
			SELECT
				population
			FROM
				municipalities_weight
			WHERE
				ST_CONTAINS(geom, ST_PointFromText('POINT(%s %s)', 4326));
		""", (lon, lat))
		dbres = self.local_cursor.fetchone()
		if dbres is None:
			return None
		else:
			population = dbres[0]
			return 1.0/population if population > 0 else 0
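		# Illustrative (not in the original code): a municipality with population
		# 200000 would get geoweight 1.0/200000 = 5e-06; zero population maps to 0.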

	def twtr_filter(self, text):
		return ''.join(c for c in text if c in self.VALID_TWTR_CHARS)

	def get_twitter_entities(self, text):
		hashtags = []
		mentions = []
		for token in text.split():
			filtered = self.twtr_filter(token)
			if len(filtered) >= 2:
				if filtered.startswith('#'):
					hashtags.append(filtered.lower())
				elif filtered.startswith('@'):
					mentions.append(filtered.lower())
		return hashtags, mentions
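	# Illustrative example (not part of the original code): for the text
	# u"Gran acto en #Madrid con @alcalde", get_twitter_entities() returns
	# ([u'#madrid'], [u'@alcalde']).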

	def get_max_captured_on(self):
		# Search max captured_on
		payload = {
			"size": 0,
			"aggs": {
				"max_captured_on": {
					"max": {
						"field": "captured_on"
					}
				}
			}
		}
		r = self.es.search(index="escucha", doc_type="data_items", body=payload)
		es_res = r['aggregations']['max_captured_on']
		if es_res['value'] is None:
			# There are no data in ElasticSearch. Retrieve all
			self.cursor.execute("SELECT min(captured_on) FROM escucha.data_items")
			max_captured_on = self.cursor.fetchone()[0] - datetime.timedelta(days=1)
		else:
			max_captured_on = datetime.datetime.strptime(es_res['value_as_string'], '%Y-%m-%d %H:%M:%S')
		return max_captured_on

	def sync(self):
		logger.info('Synchronizing')

		max_captured_on = self.get_max_captured_on()
		logger.debug('Synchronizing from %s', max_captured_on)

		self.cursor.execute("""
			DECLARE C CURSOR FOR
				SELECT
					id, source_id, type, author, title, description, published_on, captured_on,
					starts_on, ends_on, reason, geom IS NOT NULL AS is_geom,
					ST_Y(geom) AS lat, ST_X(geom) AS lon, accuracy, url
				FROM
					escucha.data_items
				WHERE
					captured_on > %s
				ORDER BY
					captured_on
		""", (max_captured_on, ))

		logger.info('Indexing')
		last = None
		limit = 5000
		indexados = 0
		n_retrieved = Counter()
		n_items = Counter()
		n_censored = Counter()
		n_notspanish = Counter()
		while last is None or last >= limit:
			start_t = time.time()
			self.cursor.execute("FETCH %s FROM C", (limit,))
			batch = self.cursor.fetchall()
			statsd.timing('fetch', (time.time() - start_t)*1000)
			logger.debug('Retrieved items: %s', len(batch))

			actions = []
			n_retrieved.clear()
			n_items.clear()
			n_censored.clear()
			n_notspanish.clear()

			for dbitem in batch:
				source = self.sources[dbitem[1]]
				n_retrieved[source] += 1

				if self.has_censored_title(dbitem[4]) or self.has_censored_description(dbitem[5]) or self.has_censored_author(dbitem[3]):
					n_censored[source] += 1
					continue

				if not self.is_spanish_item(dbitem[4], dbitem[5]):
					n_notspanish[source] += 1
					continue

				n_items[source] += 1

				item = {
					'source': source,
					'type': dbitem[2],
					'author': dbitem[3],
					'title': dbitem[4],
					'description': dbitem[5],
					'published_on': dbitem[6].strftime('%Y-%m-%d %H:%M:%S'),
					'captured_on': dbitem[7].strftime('%Y-%m-%d %H:%M:%S'),
					'starts_on': dbitem[8].strftime('%Y-%m-%d %H:%M:%S') if dbitem[8] is not None else None,
					'ends_on': dbitem[9].strftime('%Y-%m-%d %H:%M:%S') if dbitem[9] is not None else None,
					'reason': dbitem[10],
					'geo': {'lat': dbitem[12], 'lon': dbitem[13]} if dbitem[11] else None,
					'geoweight': None,
					'accuracy': dbitem[14] if dbitem[11] else None,
					'province': None,
					'region': None,
					'municipality': None,
					'url': dbitem[15],
					'polarity': 0,
					'hashtags': [],
					'mentions': [],
				}
				if dbitem[5]:
					item['polarity'] = self.analyzer.polarity(dbitem[5])
					hashtags, mentions = self.get_twitter_entities(dbitem[5])
					item['hashtags'] = hashtags
					item['mentions'] = mentions
				if dbitem[11]:
					geometa = self.get_geometa(dbitem[12], dbitem[13])
					if geometa is not None:
						item['province'] = geometa[0].lower()
						item['region'] = geometa[1].lower()
						item['municipality'] = geometa[2].lower()
					item['geoweight'] = self.get_geoweight(dbitem[12], dbitem[13])
				if dbitem[3] is not None:
					item["author_in_list"] = dbitem[3].lower() in self.authors
				if len(item['mentions']) > 0:
					cleanmentions = set(x.strip('@/') for x in item['mentions'])
					item["mention_in_list"] = len(cleanmentions & self.authors) > 0
				action = {
					"_index": "escucha",
					"_type": "data_items",
					"_id": dbitem[0],
					"_source": item
				}
				actions.append(action)

			for source, num in n_retrieved.iteritems():
				statsd.gauge('retrieved_items.%s' % source, num)
			for source, num in n_items.iteritems():
				statsd.gauge('items.%s' % source, num)
			for source, num in n_censored.iteritems():
				statsd.gauge('censored_keywords.%s' % source, num)
			for source, num in n_notspanish.iteritems():
				statsd.gauge('not_spanish.%s' % source, num)

			if actions:
				start_t = time.time()
				helpers.bulk(self.es, actions)
				statsd.timing('bulk', (time.time() - start_t)*1000)
				indexados += len(actions)
				actions = []
				#print '[{0}] {1} items indexed'.format(datetime.datetime.now(), indexados)

			last = len(batch)
		self.cursor.execute("CLOSE C")
		logger.info('%s items indexed', indexados)

	def delete(self):
		logger.info('Deleting censored keywords')
		start_t = time.time()
		payload = {
		  "query": {
		    "bool": {
		      "should": []
		    }
		  }
		}
		for term in self.censored_in_title:
			payload["query"]["bool"]["should"].append({ "match": { "title":  '"' + term + '"'}})
		for term in self.censored_in_description:
			payload["query"]["bool"]["should"].append({ "match": { "description": '"' + term + '"'}})
		for term in self.censored_in_author:
			payload["query"]["bool"]["should"].append({ "match": { "author": term}})
		self.es.delete_by_query(index="escucha", doc_type="data_items", body=payload)
		statsd.timing('delete', (time.time() - start_t)*1000)

	def run(self):
		try:
			logger.info('Script started')
			while True:
				# Every 5 mins
				self.reload_sources()
				self.reload_censored_keywords()
				self.delete()
				self.sync()
				time.sleep(5*60)
		except (SystemExit, KeyboardInterrupt):
			raise
		except:
			logger.error('Unexpected error syncing escucha db to elasticsearch', exc_info=True)
		finally:
			logger.info('Script finished')
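
The launcher for EscuchaSync is not included above; a plausible entry point (an assumption on our part, with logging and statsd configured elsewhere) would be:

if __name__ == '__main__':
	EscuchaSync().run()
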
class DocManager(DocManagerBase):
    """The DocManager class creates a connection to the backend engine and
        adds/removes documents, and in the case of rollback, searches for them.

        The reason for storing id/doc pairs as opposed to docs alone is so that
        multiple updates to the same doc reflect the most up-to-date version,
        as opposed to multiple, slightly different versions of a doc.

        We use Elastic's native fields for _id and ns, but we also store
        them as fields in the document, due to compatibility issues.
        """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """ Establish a connection to Elastic
        """
        self.elastic = Elasticsearch(hosts=[url])
        self.auto_commit_interval = auto_commit_interval
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()

    def stop(self):
        """ Stops the instance
        """
        self.auto_commit_interval = None

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        """
        document = self.elastic.get(index=doc['ns'],
                                    id=str(doc['_id']))
        updated = self.apply_update(document['_source'], update_spec)
        self.upsert(updated)
        return updated

    @wrap_exceptions
    def upsert(self, doc):
        """Update or insert a document into Elastic

        If you'd like to have different types of document in your database,
        you can store the doc type as a field in Mongo and set doc_type to
        that field. (e.g. doc_type = doc['_type'])

        """
        doc_type = self.doc_type
        index = doc['ns']
        doc[self.unique_key] = str(doc["_id"])
        doc_id = doc[self.unique_key]
        self.elastic.index(index=index, doc_type=doc_type,
                           body=bsjson.dumps(doc), id=doc_id,
                           refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Elastic

        docs may be any iterable
        """
        def docs_to_upsert():
            doc = None
            for doc in docs:
                index = doc["ns"]
                doc[self.unique_key] = str(doc[self.unique_key])
                doc_id = doc[self.unique_key]
                yield {
                    "_index": index,
                    "_type": self.doc_type,
                    "_id": doc_id,
                    "_source": doc
                }
            if not doc:
                raise errors.EmptyDocsError(
                    "Cannot upsert an empty sequence of "
                    "documents into Elastic Search")
        try:
            if self.chunk_size > 0:
                responses = bulk(client=self.elastic,
                                 actions=docs_to_upsert(),
                                 chunk_size=self.chunk_size)
            else:
                responses = bulk(client=self.elastic,
                                 actions=docs_to_upsert())
            for resp in responses[1]:
                ok = resp['index'].get('ok')
                if ok is None:
                    status = resp['index'].get('status')
                    ok = (300 > status >= 200)
                if not ok:
                    logging.error(
                        "Could not bulk-upsert document "
                        "into ElasticSearch: %r" % resp)
            if self.auto_commit_interval == 0:
                self.commit()
        except errors.EmptyDocsError:
            # This can happen when mongo-connector starts up: there is no
            # config file yet, but there is also nothing to dump.
            pass

    @wrap_exceptions
    def remove(self, doc):
        """Removes documents from Elastic

        The input is a python dictionary that represents a mongo document.
        """
        self.elastic.delete(index=doc['ns'], doc_type=self.doc_type,
                            id=str(doc[self.unique_key]),
                            refresh=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _remove(self):
        """For test purposes only. Removes all documents in test.test
        """
        self.elastic.delete_by_query(index="test.test",
                                     doc_type=self.doc_type,
                                     q="*:*")
        self.commit()

    @wrap_exceptions
    def _stream_search(self, *args, **kwargs):
        """Helper method for iterating over ES search results"""
        first_response = self.elastic.search(*args, search_type="scan",
                                             scroll="10m", size=100,
                                             **kwargs)
        scroll_id = first_response.get("_scroll_id")
        expected_count = first_response.get("hits", {}).get("total", 0)
        results_returned = 0
        while results_returned < expected_count:
            next_response = self.elastic.scroll(scroll_id=scroll_id,
                                                scroll="10m")
            results_returned += len(next_response["hits"]["hits"])
            for doc in next_response["hits"]["hits"]:
                yield doc["_source"]

    def search(self, start_ts, end_ts):
        """Called to query Elastic for documents in a time range.
        """
        return self._stream_search(index="_all",
                                   body={"query": {"range": {"_ts": {
                                       "gte": start_ts,
                                       "lte": end_ts
                                   }}}})

    def _search(self):
        """For test purposes only. Performs search on Elastic with empty query.
        Does not have to be implemented.
        """
        return self._stream_search(index="test.test",
                                   body={"query": {"match_all": {}}})

    def commit(self):
        """This function is used to force a refresh/commit.
        """
        retry_until_ok(self.elastic.indices.refresh, index="")

    def run_auto_commit(self):
        """Periodically commits to the Elastic server.
        """
        self.elastic.indices.refresh()
        if self.auto_commit_interval not in [None, 0]:
            Timer(self.auto_commit_interval, self.run_auto_commit).start()

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Elastic engine.
        """
        result = self.elastic.search(
            index="_all",
            body={
                "query": {"match_all": {}},
                "sort": [{"_ts": "desc"}]
            },
            size=1
        )["hits"]["hits"]
        return result[0]["_source"] if len(result) > 0 else None
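
For orientation, a hedged usage sketch of this DocManager: the host, _id, _ts and document contents below are illustrative only, and the document carries the 'ns', '_id' and '_ts' fields that upsert(), search() and get_last_doc() rely on.

# Illustrative only: host, _id and _ts values are made up for the example.
dm = DocManager("localhost:9200", auto_commit_interval=0)

doc = {
    "_id": "507f191e810c19729de860ea",  # copied into doc[unique_key] by upsert()
    "ns": "test.test",                  # becomes the target index
    "_ts": 1400000000,                  # timestamp field used by search()
    "title": "hello world",
}
dm.upsert(doc)      # indexed and refreshed immediately (auto_commit_interval == 0)

in_range = list(dm.search(1300000000, 1500000000))
latest = dm.get_last_doc()
dm.stop()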