Example #1
 def __init__(self, name, **kwargs):
     super().__init__(name=name, **kwargs)
     if type(self.data) is dict:
         self.data = Mapper(self.data)
     if type(self.location) is not str:
         self.location = Mapper(self.location)(kwargs)
     try:
         es = Elasticsearch(
             **self.location,
             request_timeout=0.2,
             retries=False,
             ignore=404)  # TODO url=self.location, ssl_context, http_auth
         es.info()
         self.location = es
     except ImproperlyConfigured as e:
         raise NotFound("ElasticSearch rejected {}\n-----\n\t{}".format(
             pformat(self.location), e))
     except TransportError as e:
         raise NotFound(
             "Failed to reach ElasticSearch at {}\n-----\n\t{}".format(
                 pformat(self.location), e.error))
     except:
         raise NotFound(
             "Unable to connect to ElasticSearch at host:{}".format(
                 self.location.get('host')))
Example #2
def _elasticsearch_connect(timeout=300):
    """
    Connect to configured Elasticsearch domain.

    :param timeout: How long to wait before ANY request to Elasticsearch times
    out. Because we use parallel bulk uploads (which sometimes wait long periods
    of time before beginning execution), a value of at least 30 seconds is
    recommended.
    :return: An Elasticsearch connection object.
    """

    log.info('Connecting to %s %s with AWS auth', ELASTICSEARCH_URL,
             ELASTICSEARCH_PORT)
    auth = AWSRequestsAuth(aws_access_key=AWS_ACCESS_KEY_ID,
                           aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                           aws_host=ELASTICSEARCH_URL,
                           aws_region=AWS_REGION,
                           aws_service='es')
    auth.encode = lambda x: bytes(x.encode('utf-8'))
    es = Elasticsearch(host=ELASTICSEARCH_URL,
                       port=ELASTICSEARCH_PORT,
                       connection_class=RequestsHttpConnection,
                       timeout=timeout,
                       max_retries=10,
                       retry_on_timeout=True,
                       http_auth=auth,
                       wait_for_status='yellow')
    es.info()
    return es
Example #3
class ElasticService():
    def __init__(self, host, port, username, password):
        super().__init__()
        requests.packages.urllib3.disable_warnings()
        try:
            log.info(f"connecting to elastic on host: {host}")
            self.es = Elasticsearch(f"https://{host}:{port}",
                                    http_auth=(username, password),
                                    verify_certs=False)
            self.es.info()
        except Exception as e:
            log.error(e)
            raise

    def create_index(self, index_name):
        try:
            print("creating index if not exists")
            self.es.indices.create(index=index_name, ignore=400)
        except Exception as e:
            log.error(e)

    def create_doc(self, index_name, id, body):
        try:
            log.info("creating doc in elastic")
            self.es.create(index=index_name, id=id, body=body)
        except Exception as e:
            log.error(e)

    def delete_doc(self, index_name, id):
        try:
            log.info("deleting doc from elastic")
            self.es.delete(index=index_name, id=id)
        except Exception as e:
            log.error(e)
Example #4
def elastic():
    connections.configure(default={
        'hosts': os.environ['ELASTICSEARCH_HOST'],
        'port': os.environ['ELASTICSEARCH_PORT'],
        'use_ssl': True,
        'verify_certs': True,
        'ca_certs': '/run/secrets/ca.crt',
        'client_cert': '/run/secrets/certificate.crt',
        'client_key': '/run/secrets/certificate.key'
    }, )

    es = Elasticsearch(host=os.environ['ELASTICSEARCH_HOST'],
                       port=os.environ['ELASTICSEARCH_PORT'],
                       use_ssl=True,
                       verify_certs=True,
                       ca_certs='/run/secrets/ca.crt',
                       client_cert='/run/secrets/certificate.crt',
                       client_key='/run/secrets/certificate.key')
    try:
        es.info()
    except es_exceptions.ConnectionError:
        return error('Please check Elasticsearch service, %s:%s' %
                     (os.environ['ELASTICSEARCH_HOST'],
                      os.environ['ELASTICSEARCH_PORT']))
    return ({'response': es, 'status': 'OK'})
Example #5
async def insert(data):

    server = 'http://es-arques.com:9200/'
    index_name = 'crypto_price_info'
    # # elasticsearch connect
    es = Elasticsearch(server)
    es.info()

    if len(data) > 0:
        for li in data:
            _exchange_name = li['exchange_name']
            _symbol = li['symbol']
            _period = 'ticksync'
            _timestamp = li['timestamp']
            _datetime = li['datetime']
            _open = check_none_value(li['open'])
            _high = check_none_value(li['high'])
            _low = check_none_value(li['low'])
            _close = check_none_value(li['close'])
            _volume = check_none_value(li['volume'])

            params = (_exchange_name, _symbol, _period, _timestamp, _datetime, _open, _high, _low, _close, _volume)
            r = db.insert_price(_exchange_name, params)

            es.index(index=index_name, doc_type='string', body=li)

    es.indices.refresh(index=index_name)


    t = time.time()
    print(t, 'Ticker ok')
Example #6
 def elasticsearch_fail():
     es = Elasticsearch([{
         'host': 'example.com',
         'port': 9999
     }],
                        timeout=0.1)
     es.info()
Example #7
def setup():
    log = logging.getLogger("haystack")
    try:
        import elasticsearch

        if not ((1, 0, 0) <= elasticsearch.__version__ < (2, 0, 0)):
            raise ImportError
        from elasticsearch import Elasticsearch, ElasticsearchException
    except ImportError:
        log.error(
            "Skipping ElasticSearch 1 tests: 'elasticsearch>=1.0.0,<2.0.0' not installed."
        )
        raise unittest.SkipTest("'elasticsearch>=1.0.0,<2.0.0' not installed.")

    es = Elasticsearch(settings.HAYSTACK_CONNECTIONS["elasticsearch"]["URL"])
    try:
        es.info()
    except ElasticsearchException as e:
        log.error(
            "elasticsearch not running on %r" %
            settings.HAYSTACK_CONNECTIONS["elasticsearch"]["URL"],
            exc_info=True,
        )
        raise unittest.SkipTest(
            "elasticsearch not running on %r" %
            settings.HAYSTACK_CONNECTIONS["elasticsearch"]["URL"],
            e,
        )
Example #8
def setup():
    log = logging.getLogger("haystack")
    try:
        import elasticsearch

        if not ((1, 0, 0) <= elasticsearch.__version__ < (2, 0, 0)):
            raise ImportError
        from elasticsearch import Elasticsearch, ElasticsearchException
    except ImportError:
        log.error(
            "Skipping ElasticSearch 1 tests: 'elasticsearch>=1.0.0,<2.0.0' not installed."
        )
        raise unittest.SkipTest("'elasticsearch>=1.0.0,<2.0.0' not installed.")

    es = Elasticsearch(settings.HAYSTACK_CONNECTIONS["elasticsearch"]["URL"])
    try:
        es.info()
    except ElasticsearchException as e:
        log.error(
            "elasticsearch not running on %r"
            % settings.HAYSTACK_CONNECTIONS["elasticsearch"]["URL"],
            exc_info=True,
        )
        raise unittest.SkipTest(
            "elasticsearch not running on %r"
            % settings.HAYSTACK_CONNECTIONS["elasticsearch"]["URL"],
            e,
        )
Example #9
def update_my_model_data():
    es = Elasticsearch("http://localhost:9200/")
    es.info()

    index_name = 'articles'
    make_index(es, index_name)

    doc_files = []

    for i in os.listdir(
            "/Users/parkjeongseop/Desktop/Dev/NLP/hw6/ITnews623_sim383/"):
        try:
            print("processing", i)

            this_doc = open(
                '/Users/parkjeongseop/Desktop/Dev/NLP/hw6/ITnews623_sim383/' +
                i,
                'r',
                encoding='cp949').read()

            # Django Model
            this = Content()
            this.title = i
            this.content = this_doc
            this.save()

            # Elasticsearch Indexing
            doc = {'title': i, 'content': this_doc}
            es.index(index=index_name, doc_type='string', body=doc)

        except:
            print("ERROR", i)
    es.indices.refresh(index=index_name)
Example #10
def _elasticsearch_connect():
    """
    Connect to configured Elasticsearch domain.

    :return: An Elasticsearch connection object.
    """

    es_url = config("ELASTICSEARCH_URL", default="localhost")
    es_port = config("ELASTICSEARCH_PORT", default=9200, cast=int)
    es_aws_region = config("ELASTICSEARCH_AWS_REGION", default="us-east-1")

    auth = AWSRequestsAuth(
        aws_access_key=settings.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
        aws_host=es_url,
        aws_region=es_aws_region,
        aws_service="es",
    )
    auth.encode = lambda x: bytes(x.encode("utf-8"))
    _es = Elasticsearch(
        host=es_url,
        port=es_port,
        connection_class=RequestsHttpConnection,
        timeout=10,
        max_retries=1,
        retry_on_timeout=True,
        http_auth=auth,
        wait_for_status="yellow",
    )
    _es.info()
    return _es
Example #11
def _elasticsearch_connect():
    """
    Connect to configured Elasticsearch domain.

    :return: An Elasticsearch connection object.
    """
    auth = AWSRequestsAuth(
        aws_access_key=settings.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
        aws_host=settings.ELASTICSEARCH_URL,
        aws_region=settings.ELASTICSEARCH_AWS_REGION,
        aws_service='es'
    )
    auth.encode = lambda x: bytes(x.encode('utf-8'))
    _es = Elasticsearch(
        host=settings.ELASTICSEARCH_URL,
        port=settings.ELASTICSEARCH_PORT,
        connection_class=RequestsHttpConnection,
        timeout=10,
        max_retries=99,
        retry_on_timeout=True,
        http_auth=auth,
        wait_for_status='yellow'
    )
    _es.info()
    return _es
Example #12
def get_elasticsearch_client(cloud_id=None,
                             elasticsearch_url=None,
                             es_user=None,
                             es_password=None,
                             ctx=None,
                             **kwargs):
    """Get an authenticated elasticsearch client."""
    if not (cloud_id or elasticsearch_url):
        client_error("Missing required --cloud-id or --elasticsearch-url")

    # don't prompt for these until there's a cloud id or elasticsearch URL
    es_user = es_user or click.prompt("es_user")
    es_password = es_password or click.prompt("es_password", hide_input=True)
    hosts = [elasticsearch_url] if elasticsearch_url else None
    timeout = kwargs.pop('timeout', 60)

    try:
        client = Elasticsearch(hosts=hosts,
                               cloud_id=cloud_id,
                               http_auth=(es_user, es_password),
                               timeout=timeout,
                               **kwargs)
        # force login to test auth
        client.info()
        return client
    except elasticsearch.AuthenticationException as e:
        error_msg = f'Failed authentication for {elasticsearch_url or cloud_id}'
        client_error(error_msg, e, ctx=ctx, err=True)
Example #13
def _elasticsearch_connect() -> Elasticsearch:
    """
    Connect to the Elasticsearch indices at the configured domain. This method also
    handles AWS authentication using the AWS access key ID and the secret access key.

    :return: an Elasticsearch client
    """

    log.info(
        f"Connecting to {ELASTICSEARCH_URL}:{ELASTICSEARCH_PORT} with AWS auth"
    )
    auth = AWSRequestsAuth(
        aws_access_key=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_host=ELASTICSEARCH_URL,
        aws_region=AWS_REGION,
        aws_service="es",
    )
    auth.encode = lambda x: bytes(x.encode("utf-8"))
    es = Elasticsearch(
        host=ELASTICSEARCH_URL,
        port=ELASTICSEARCH_PORT,
        connection_class=RequestsHttpConnection,
        http_auth=auth,
        timeout=TWELVE_HOURS_SEC,
    )
    es.info()
    return es
Example #14
def check_index():

    try:
        # ES Connect
        es_client = Elasticsearch(
            ['121.125.71.147', '121.125.71.148', '121.125.71.149'],
            port=9200,
            timeout=20,
            http_auth=('elastic', 'wtlcnNyrDPVko01lZfIl'))
        es_client.info()

        index_name = "index-nudge-result-analysis"

        query = """
        {
          "size": 0,
          "query": {
            "bool": {
              "filter": [
                {
                  "term": {
                    "log_day": "%s"
                  }
                }
              ]
            }
          },"aggs": {
            "NAME": {
              "terms": {
                "field": "action_body.category",
                "size": 1000
              }
            }
          }
        }
        """

        response = es_client.search(index=index_name,
                                    body=query % one_days_before)
        list_day = response['aggregations']['NAME']['buckets']

        message = "<Nudge Performance Analysis>(" + today.strftime("%Y-%m-%d") + ")\n"

        if not list_day:
            message += "ALIAS ERROR"

        for day in list_day:
            message += str(day['key']) + " : " + str(day['doc_count']) + " cases\n"

        print(message)
        bot.sendMessage(chat_id='1228894509', text=str(message))
        bot.sendMessage(chat_id='976803858', text=str(message))
        bot.sendMessage(chat_id='1070666335', text=str(message))

    except Exception as es_err:
        print(es_err)
        err_message = "Nudge Performance Analysis ERROR"
        err_message += str(es_err)
        bot.sendMessage(chat_id='1228894509', text=err_message)
Example #15
def connElasticsearch(args):
    '''Try to connect to ElasticSearch'''
    es = Elasticsearch(['%s:%s'%(args['host'],args['port'])])
    try:
        # try to connect
        es.info()
    except Exception,e:
        raise Exception,'ElasticSearch <%s:%s> connection failed!'%(args['host'],args['port'])
Example #16
 def v_es(self):
     hosts = self._get_config("ELASTICSEARCH_HOSTS")
     try:
         es = Elasticsearch(hosts=hosts)
         es.info()
     except Exception:
         return False
     return True
Example #17
 def __init__(self):
     indexName = 'andernieuws'
     es = Elasticsearch()
     print es.info()
     x = Indexer()
     """ uncomment the following to create an empty index on your local Elasticsearch"""
     #indexer.createIndex(es, indexName, '../resources/settings.json', '../resources/mapping.json')
     """ uncomment the following to index a directory with ASR files"""
Example #18
def getSnapshotClient():
	global client
	es = Elasticsearch()
	client = SnapshotClient(es)
	try:
		es.info()
	except ConnectionError, e:
		return False
Example #19
class ElasticClient:
    def __init__(self, host: str, port: int):
        try:
            self.es = Elasticsearch(hosts=[
                {'host': host,
                 'port': port}])
            info = self.es.info()
            logger.info("Connected to Elasticsearch v. %s, name: %s" % (info['version']['number'], info['name']))

        except ElasticsearchException as e:
            logger.error("Elasticsearch is not available.", e)
            exit(0)

    def get_articles(self, index, doctype, batch_size):
        query = '{"query": { "bool": { "must_not": { "exists": { "field": "status" }}}}}'
        result = self.es.search(index=index, doc_type=doctype, size=batch_size, body=query)
        articles = result.get('hits').get('hits')
        return articles if articles is not None else []

    def count(self, index):
        return self.es.count(index=index)['count']

    def info(self):
        return self.es.info()

    def check_url(self, url: str, auth_index: str):
        """
        Private function to check if a URL appears in the database.

        Parameters
        ----------

        url: URL for the news stories to be scraped.

        auth_index: es index

        Returns
        -------

        found: Boolean.
                Indicates whether or not a URL was found in the database.
        """
        response = self.es.search(index=auth_index, doc_type=auth_index, body={
            "query":
                {
                    "match_phrase": {
                        "url": url
                    }
                }
        }, size=0, terminate_after=1, ignore_unavailable=True)

        return response["hits"]["total"] > 0

    def persist(self, index, doctype, payload):
        self.es.index(index=index, doc_type=doctype, body=payload)

    def update(self, index, doctype, doc_id, payload):
        self.es.update(index=index, doc_type=doctype, id=doc_id, body=payload)
Example #20
 def delete_index(cls):
     client = Elasticsearch([
         'https://search-ticker-sentiment-ohr4wryq6vcybcoqvqumx5bezm.us-east-2.es.amazonaws.com'
     ])
     print(client.info())
     client.indices.delete(index='tweet', ignore=[400, 404])
     client.indices.delete(index='twitter', ignore=[400, 404])
     client.indices.delete(index='financials', ignore=[400, 404])
     client.indices.delete(index='stocks_data.py', ignore=[400, 404])
     print(client.info())
Example #21
def _init_elasticsearch_client(host: str) -> Elasticsearch:
    client = None

    try:
        client = Elasticsearch(host)
        client.info()
    except exceptions.ConnectionError:
        logger.error(f"Failed to connect to elasticsearch server at '{host}'")

    return client
Example #22
class Catalogue():
    def __init__(self, config):
        print config
        self.config = config
        self.es = Elasticsearch(host=self.config['CATALOGUE_ES_HOST'],
                                port=self.config['CATALOGUE_ES_PORT'])
        try:
            print 'Trying to connect to the B&G catalogue'
            print self.es.info()
        except ConnectionError, e:
            print e
Example #23
 def wrap(request, *args, **kwargs):
     # check the status of the ElasticSearch connection
     try:
         es = Elasticsearch()
         es.info()
         return function(request, *args, **kwargs)
     except es_exceptions.ConnectionError as ce:
         return HttpResponseRedirect('/elastic-connection-error')
     except Exception as generic_exp:
         print str(generic_exp)
         return HttpResponseRedirect('/elastic-connection-error')
Example #24
def check_server_status(conn=None):
    if conn is None:
        conn = Elasticsearch(hosts=getElasticsearchServerHostAndPort())

    try:
        conn.info()
    except (ConnectionError, TransportError):
        return 'Connection error'

    # no errors!
    return 'OK'
Example #25
def setup():
    try:
        from elasticsearch import Elasticsearch, ElasticsearchException
    except ImportError:
        raise SkipTest("elasticsearch-py not installed.")

    es = Elasticsearch(settings.HAYSTACK_CONNECTIONS['elasticsearch']['URL'])
    try:
        es.info()
    except ElasticsearchException as e:
        raise SkipTest("elasticsearch not running on %r" % settings.HAYSTACK_CONNECTIONS['elasticsearch']['URL'], e)
Example #26
def get_es_client(user, password, elasticsearch_url=None, cloud_id=None, **kwargs):
    """Get an auth-validated Elasticsearch client."""
    assert elasticsearch_url or cloud_id, \
        'You must specify a host or cloud_id to authenticate to an elasticsearch instance'

    hosts = [elasticsearch_url] if elasticsearch_url else elasticsearch_url

    client = Elasticsearch(hosts=hosts, cloud_id=cloud_id, http_auth=(user, password), **kwargs)
    # force login to test auth
    client.info()
    return client
Example #27
def setup():
    try:
        from elasticsearch import Elasticsearch, ElasticsearchException
    except ImportError:
        raise unittest.SkipTest("elasticsearch-py not installed.")

    es = Elasticsearch(settings.HAYSTACK_CONNECTIONS['elasticsearch']['URL'])
    try:
        es.info()
    except ElasticsearchException as e:
        raise unittest.SkipTest("elasticsearch not running on %r" % settings.HAYSTACK_CONNECTIONS['elasticsearch']['URL'], e)
Example #28
class EsSearcher:
    def __init__(self):
        self.es = None

    def initialize(self, ip, port):
        try:
            self.es = Elasticsearch([ip], port=port)
            print self.es.info()
            return True
        except Exception, err:
            print "failed to connect to es, err=%s" % err
            return False
Example #29
def get_elasticsearch_pulse():
    global logger, es
    try:
        es = Elasticsearch([{
            'host': os.getenv('ES_HOST'),
            'port': os.getenv('ES_PORT')
        }])
        es.info()
        set_service_available(True)
    except Exception as e:
        logger.error("elasticsearch unreachable: {}".format(str(e)))
        set_service_available(False)
Example #30
def get_es_client(user, password, host=None, cloud_id=None, **kwargs):
    """Get an auth-validated Elasticsearch client."""
    assert host or cloud_id, 'You must specify a host or cloud-id to authenticate to an elasticsearch instance'
    hosts = [host] if host else host

    client = Elasticsearch(hosts=hosts,
                           cloud_id=cloud_id,
                           http_auth=(user, password),
                           **kwargs)
    # force login to test auth
    client.info()
    return client
Example #31
def try_es_connect(attempts=0):
    """Recursively try to connect to elasticsearch."""
    try:
        cli = Elasticsearch([ELASTIC_ENDPOINT])
        cli.info()
    except ElasticsearchException as ex:
        if attempts < ELASTIC_CONNECT_ATTEMPTS:
            sleep(ELASTIC_WAIT)
            attempts += 1
            try_es_connect(attempts)
        else:
            raise ex
Example #32
def elasticsearch_fixture(elasticsearch_dir):
    # test if an ES cluster is already running. If not, download and start an ES instance locally.
    try:
        client = Elasticsearch(hosts=[{"host": "localhost"}])
        client.info()
    except:
        print("Downloading and starting an Elasticsearch instance for the tests ...")
        thetarfile = "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.1-linux-x86_64.tar.gz"
        ftpstream = urllib.request.urlopen(thetarfile)
        thetarfile = tarfile.open(fileobj=ftpstream, mode="r|gz")
        thetarfile.extractall(path=elasticsearch_dir)
        es_server = Popen([elasticsearch_dir / "elasticsearch-7.6.1/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT)
        time.sleep(40)
Example #33
def get_es():
    url = settings.ELASTICSEARCH_URL
    timeout = settings.ELASTICSEARCH_TIMEOUT
    for attempt in service_retries():
        try:
            if not hasattr(settings, "_es_instance"):
                es = Elasticsearch(url, timeout=timeout)
                es.info()
                settings._es_instance = es
            return settings._es_instance
        except TransportError as exc:
            log.warning("ElasticSearch error: %s", exc.error)
            backoff(failures=attempt)
    raise RuntimeError("Could not connect to ElasticSearch")
Example #34
def es_auth(user, pw):
    try:
        es = Elasticsearch(
            [' '],
            http_auth=(user, pw),
            port=443,
            use_ssl=True,
            verify_certs=True,
            ca_certs=certifi.where()
        )
        es.info()
    except:
        print("Unable to Auth to ES")
        exit()
    return es
Example #35
def get_es_info(hosts):
    es = Elasticsearch(hosts)

    info = es.info()
    if not info:
        return 0

    stats = es.indices.stats()
    #print stats

    print "***********************************info of elasticsearch server : %s************************************" % hosts[0]
    for index in stats["indices"].keys():
        indices = dict(es.indices.get(index))
        #print "|Index Name\tDoc Count\tType"
        print "|",index,"\t",stats["indices"][index]["total"]["docs"]["count"],"\t",indices[index]["mappings"].keys()

        #print indices[index]["mappings"].keys()
        for type in indices[index]["mappings"].keys():

            print "----------------------------------------------%s-------------------------------------------------------" % type
            doc = es.search(index=index,doc_type=type)
            #print doc
            for hits in doc["hits"]["hits"]:
                record = dict(hits)
                for key in record["_source"].keys():
                    print key,":",record["_source"][key]
            print "\n"

    print "****************************************************end*********************************************************"
    print "\n\n"
Example #36
  def PublishSamples(self, samples):
    """Publish samples to Elasticsearch service"""
    try:
      from elasticsearch import Elasticsearch
    except ImportError:
      raise ImportError('The "elasticsearch" package is required to use '
                        'the Elasticsearch publisher. Please make sure it '
                        'is installed.')

    es = Elasticsearch([self.es_uri])
    if not es.indices.exists(index=self.es_index):
      # choose whether to use old or new mappings based on
      # the version of elasticsearch that is being used
      if int(es.info()['version']['number'].split('.')[0]) >= 5:
        es.indices.create(index=self.es_index, body=self.mapping_5_plus)
        logging.info('Create index %s and default mappings for'
                     ' elasticsearch version >= 5.0.0',
                     self.es_index)
      else:
        es.indices.create(index=self.es_index, body=self.mapping_before_5)
        logging.info('Create index %s and default mappings for'
                     ' elasticsearch version < 5.0.0',
                     self.es_index)
    for s in samples:
      sample = copy.deepcopy(s)
      # Make timestamp understandable by ES and humans.
      sample['timestamp'] = self._FormatTimestampForElasticsearch(
          sample['timestamp']
      )
      # Keys cannot have dots for ES
      sample = self._deDotKeys(sample)
      # Add the sample to the perfkit index under the result doc type, using
      # sample_uri as each ES document's unique _id
      es.create(index=self.es_index, doc_type=self.es_type,
                id=sample['sample_uri'], body=json.dumps(sample))
Example #37
def admin( request ):
    """Administrative stuff like re-indexing.
    """
    target_index = search.DOCSTORE.target_index()
    server_info = []
    index_names = []
    indices = []
    es = Elasticsearch(hosts=settings.DOCSTORE_HOSTS)
    ping = es.ping()
    no_indices = True
    if ping:
        info = es.info()
        info_status = info['status']
        if info_status == 200:
            info_status_class = 'success'
        else:
            info_status_class = 'error'
        server_info.append( {'label':'status', 'data':info_status, 'class':info_status_class} )
        
        status = es.indices.status()
        shards_success = status['_shards']['successful']
        shards_failed = status['_shards']['failed']
        if shards_failed == 0:
            shards_success_class = 'success'
            shards_failed_class = 'success'
        else:
            shards_success_class = 'error'
            shards_failed_class = 'error'
        server_info.append( {'label':'shards(successful)', 'data':shards_success, 'class':shards_success_class} )
        server_info.append( {'label':'shards(failed)', 'data':shards_failed, 'class':shards_failed_class} )
        # indices
        for name in status['indices'].keys():
            no_indices = False
            server_info.append( {'label':name, 'data':'', 'class':''} )
            size = status['indices'][name]['total']['store']['size_in_bytes']
            ONEPLACE = Decimal(10) ** -1
            size_nice = Decimal(size/1024/1024.0).quantize(ONEPLACE)
            size_formatted = '%sMB (%s bytes)' % (size_nice, size)
            num_docs = status['indices'][name]['total']['docs']['num_docs']
            server_info.append( {'label':'size', 'data':size_formatted, 'class':'info'} )
            server_info.append( {'label':'documents', 'data':num_docs, 'class':'info'} )
            
            index_names.append(name)
            index = {'name':name, 'exists':True}
            indices.append(index)
    indexform = IndexConfirmForm(request=request)
    dropform = None
    if indices:
        dropform = DropConfirmForm(request=request)
    return render(request, 'webui/search/admin.html', {
        'ping': ping,
        'no_indices': no_indices,
        'server_info': server_info,
        'indices': indices,
        'indexform': indexform,
        'dropform': dropform,
        'docstore_index': settings.DOCSTORE_INDEX,
        'target_index': target_index,
    })
Example #38
def setup():
    log = logging.getLogger('haystack')
    try:
        import elasticsearch
        if not ((2, 0, 0) <= elasticsearch.__version__ < (3, 0, 0)):
            raise ImportError
        from elasticsearch import Elasticsearch, exceptions
    except ImportError:
        log.error("Skipping ElasticSearch 2 tests: 'elasticsearch>=2.0.0,<3.0.0' not installed.")
        raise unittest.SkipTest("'elasticsearch>=2.0.0,<3.0.0' not installed.")

    url = settings.HAYSTACK_CONNECTIONS['elasticsearch']['URL']
    es = Elasticsearch(url)
    try:
        es.info()
    except exceptions.ConnectionError as e:
        log.error("elasticsearch not running on %r" % url, exc_info=True)
        raise unittest.SkipTest("elasticsearch not running on %r" % url, e)
Example #39
 def get_es_client(self):
     config = self.config
     hosts = [config.get('elastic','cluster')]
     auth = (config.get('elastic','user'), config.get('elastic','password'))
     es = Elasticsearch(hosts, timeout=40,
                                  connection_class=RequestsHttpConnection,
                                  http_auth=auth,
                                  use_ssl=False, verify_certs=False)
     print("ES client Check:")
     print(es.info())
     return es
Example #40
def snapshot_indices_from_src_to_s3(config):
    """
    Take a snapshot of all the indices specified in the config file.

    The specified indices are backed up from the ElasticSearch Node on which backup is initiated
    and are stored at the S3 location specified in the config file.
    
    Parameters:
        config: dictionary storing the configuration details 
        
    """
    
    src_seed1 = config['elasticsearch_config']['es_src_seed1']
    es_s3_repo = config['elasticsearch_config']['es_repository_name']

    try:
        src_seed2 = config['elasticsearch_config']['es_src_seed2']
        src_seed3 = config['elasticsearch_config']['es_src_seed3']
    except KeyError: # running in test mode? use a single node
        print ("\n[WARN] Only one SOURCE seed node found in the config, falling back to single SOURCE seed...")
        src_seed2 = src_seed3 = src_seed1

    try:
        src_es = Elasticsearch([src_seed1, src_seed2, src_seed3], sniff_on_start=True, 
            sniff_on_connection_fail=True, sniffer_timeout=60)

        print ("\n[INFO] Connected to src ES cluster: %s" %(src_es.info()))

        src_es.snapshot.create_repository(repository=es_s3_repo,
            body={
                "type": "s3",
                "settings": {
                    "region": config['aws_s3_config']['aws_region'],
                    "bucket": config['aws_s3_config']['s3_bucket_name'],
                    "base_path": config['aws_s3_config']['s3_base_path'],
                    "access_key": config['aws_api_keys']['aws_access_key'],
                    "secret_key": config['aws_api_keys']['aws_secret_key']
                }
            },
            request_timeout=30,
            verify=False)

        print ("\n[INFO] Snapshotting ES indices: '%s' to S3...\n" %(config['elasticsearch_config']['index_names']))

        src_es.snapshot.create(repository=es_s3_repo, 
            snapshot=config['elasticsearch_config']['snapshot_name'], 
            body={"indices": config['elasticsearch_config']['index_names']}, 
            wait_for_completion=False)

    except Exception as e:
        print ("\n\n[ERROR] Unexpected error: %s" %(str(e)))
Example #41
def get_elasticsearch_info():
    """Check Elasticsearch connection."""
    from elasticsearch import (
        Elasticsearch,
        ConnectionError as ESConnectionError
    )
    try:
        url = settings.HAYSTACK_CONNECTIONS["default"]["URL"]
    except (AttributeError, KeyError) as ex:
        log.error("No elasticsearch connection info found in settings. "
                  "Error: %s", ex)
        return {"status": NO_CONFIG}
    start = datetime.now()
    try:
        search = Elasticsearch(url, request_timeout=TIMEOUT_SECONDS)
        search.info()
    except ESConnectionError:
        return {"status": DOWN}
    del search  # The elasticsearch library has no "close" or "disconnect."
    micro = (datetime.now() - start).microseconds
    return {
        "status": UP, "response_microseconds": micro,
    }
Example #42
class Elastic(object):
    """ Base class for working with an ElasticSearch database """
    def __init__(self, di):
        self.host   = di['host']
        self.port   = int(di['port'])
        self.con    = None
        self.index  = di['index']

    def __connect(self):
        ''' Connect to the database '''
        self.con = Elasticsearch([{'host' : self.host, 'port' : self.port}])
        self.con.info()

    def search_all(self, query):
        """ Searches the database; reconnects if the connection is lost """
        res = None
        index = '{0}-*'.format(self.index)
        try:
            res = self.con.search(index=index, body=query)
        except (AttributeError, ConnectionError):
            self.__connect()
            res = self.con.search(index=index, body=query)
        return res

    def store(self, query):
        """ Sends a query to the database; reconnects if the connection is lost """
        index = '{0}-{1}'.format(self.index, strftime('%Y.%m.%d', localtime()))
        try:
            self.con.index(index=index, doc_type='logs', body=query)
        except (AttributeError, ConnectionError):
            try:
                self.__connect()
                self.con.index(index=index, doc_type='logs', body=query)
            except ConnectionError:
                print('failed to connect elasticsearch')
                print(query)
Example #43
def health(request):
    # check database
    message = 'OK'
    status = 200

    # check elasticsearch
    try:
        client = Elasticsearch(settings.ELASTIC_SEARCH_HOSTS)
        assert client.info()
    except:
        log.exception("Elasticsearch connectivity failed")
        message += "\nElasticsearch connectivity failed."
        status = 500
        return HttpResponse(message, content_type='text/plain', status=status)

    # return HttpResponse(message, content_type='text/plain', status=status)
    return check_data(request)
Example #44
def connect():
	from elasticsearch import Elasticsearch, RequestsHttpConnection
	from requests_aws4auth import AWS4Auth

	REGION = "us-west-2"
	host = 'search-search-jxvug2z72gmuoz6ysdy3rz4z44.us-west-2.es.amazonaws.com'
	awsauth = AWS4Auth("AKIAIC6D7UKE76OCB6HQ", "/OMQDN2x8+FZyVeIxa7bbtNXswhYB7uIBOkz6rDi", REGION, 'es')

	es = Elasticsearch(
		hosts=[{'host': host, 'port': 443}],
		http_auth=awsauth,
		use_ssl=True,
		verify_certs=True,
		connection_class=RequestsHttpConnection
	)
	print(es.info())
	return es
Example #45
def consumeKafkaToEs():
    es = Elasticsearch(
        hosts=[{'host': ES_HOST, 'port': 9200}],
        connection_class=RequestsHttpConnection
    )
    print(es.info())
    consumer = KafkaConsumer(TOPIC,
                         group_id='my-group',
                         bootstrap_servers=['localhost:9092'],
#                         value_deserializer=lambda m: json.loads(m.decode('utf-8'))
)
    for message in consumer:
        print(dir(message))
        print ("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                          message.offset, message.key,
                                          message.value))
        toInsert = json.loads(message.value.decode("utf-8") )
        res = es.index(index="test-index", doc_type='fb',  body=toInsert)
        print("done insert")
        time.sleep(1)
Example #46
class Elastic:
    def __init__(self,credentials):
        self.ip = credentials["elastic.ip"]
        self.port = credentials["elastic.port"]
        self.connecting = False
        self.connected = False

    def start(self):
        self.connect()
        self.refresh("_all")

    def connect(self):
         self.connecting = True
         print("[elastic] :connecting...")
         self.es = Elasticsearch(hosts=[{'host': self.ip, 'port': self.port}])

    def putIndex(self,putIndex,theBody):
        res = self.es.index(index=putIndex, doc_type='tweet', id=1, body=theBody)
        print(res['created'])

    def getIndex(self,getIndex):
        res = self.es.get(index=getIndex, doc_type='tweet', id=1)
        print(res['_source'])

    def refresh(self,refIndex):
        self.es.indices.refresh(index=refIndex)
        print("[elastic] :connected!")
        print(self.es.info())
        self.connected = True

    def search(self,index,query):
        res = self.es.search(index="test-index",
                        body={
                            "query": {
                                "match_all": {}
                                }
                            })
        print("Got %d Hits:" % res['hits']['total'])
        for hit in res['hits']['hits']:
            print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])
        return res;
Example #47
def load(tweets):    
    es = Elasticsearch(host = config.es_host, port = config.es_port)
    es_version_number = es.info()['version']['number']
    tweet_mapping = get_tweet_mapping(es_version_number)
    mapping = {doc_type: tweet_mapping
               }


    if es.indices.exists(index_name):
        print ('index {} already exists'.format(index_name))
        try:
            es.indices.put_mapping(doc_type, tweet_mapping, index_name)
        except ElasticsearchException as e:
            print('error putting mapping:\n'+str(e))
            print('deleting index {}...'.format(index_name))
            es.indices.delete(index_name)
            create_index(es, index_name, mapping)
    else:
        print('index {} does not exist'.format(index_name))
        create_index(es, index_name, mapping)
    
    counter = 0
    bulk_data = []
    list_size = len(tweets)
    for doc in tweets:
        tweet = get_tweet(doc)
        bulk_doc = {
            "_index": index_name,
            "_type": doc_type,
            "_id": tweet[id_field],
            "_source": tweet
            }
        bulk_data.append(bulk_doc)
        counter+=1
        
        if counter % bulk_chunk_size == 0 or counter == list_size:
            print "ElasticSearch bulk index (index: {INDEX}, type: {TYPE})...".format(INDEX=index_name, TYPE=doc_type)
            success, _ = bulk(es, bulk_data)
            print 'ElasticSearch indexed %d documents' % success
            bulk_data = []
Example #48
parser = argparse.ArgumentParser()
parser.add_argument("--es_host", default="localhost:9200", help="ES Connection String")
parser.add_argument("--es_user", default="elastic", help="ES User")
parser.add_argument("--es_password", default="changeme", help="ES Password")
parser.add_argument("--interval", default=300, help="Interval in Seconds", type=int)
parser.add_argument("--start_time", help="Start Time")
parser.add_argument("--end_time", help="End Time")
parser.add_argument("--watch_template", help="Watch File")
options = parser.parse_args()

start_time = datetime.strptime(options.start_time, '%Y-%m-%dT%H:%M:%SZ')
end_time = datetime.strptime(options.end_time, '%Y-%m-%dT%H:%M:%SZ')
client = Elasticsearch(hosts=[options.es_host], http_auth=(options.es_user, options.es_password), use_ssl=False,
                           timeout=300)
try:
    cluster = client.info()
except:
    print("Cluster not accessible")
    sys.exit(1)

watch_template = json.loads(open(options.watch_template).read())
next_time = start_time
while next_time < end_time:
    print("Executing for %s-%s seconds"%(next_time.strftime('%Y-%m-%dT%H:%M:%SZ'),options.interval))
    watch_body = watch_template
    watch_body["metadata"]["time_period"] = "%ss"%options.interval
    client.transport.perform_request('POST', _make_path('_xpack',
                                                     'watcher', 'watch', '_execute'), body={
        "trigger_data":{
            "scheduled_time":next_time.strftime('%Y-%m-%dT%H:%M:%SZ')
        },
Example #49
 def elasticsearch_fail():
     es = Elasticsearch([{'host': 'example.com', 'port': 9999}], timeout=0.1)
     es.info()
Example #50
class Test(BaseTest):

    def init(self):
        self.elasticsearch_url = self.get_elasticsearch_url()
        self.kibana_url = self.get_kibana_url()
        print("Using elasticsearch: {}".format(self.elasticsearch_url))
        self.es = Elasticsearch([self.elasticsearch_url])
        logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.getLogger("elasticsearch").setLevel(logging.ERROR)

        self.modules_path = os.path.abspath(self.working_dir +
                                            "/../../../../module")

        self.kibana_path = os.path.abspath(self.working_dir +
                                           "/../../../../build/kibana")

        self.filebeat = os.path.abspath(self.working_dir +
                                        "/../../../../filebeat.test")

        self.index_name = "test-filebeat-ml"

    @parameterized.expand([
        (False,),
        (True,),
    ])
    @unittest.skipIf(not INTEGRATION_TESTS,
                     "integration tests are disabled, run with INTEGRATION_TESTS=1 to enable them.")
    @unittest.skipIf(os.getenv("TESTING_ENVIRONMENT") == "2x",
                     "integration test not available on 2.x")
    @unittest.skipIf(os.name == "nt", "skipped on Windows")
    @unittest.skip("Skipped as flaky: https://github.com/elastic/beats/issues/11629")
    def test_ml_setup(self, modules_flag):
        """ Test that ML jobs and datafeeds are installed in all possible ways """
        self._run_ml_test(modules_flag)

    def _run_ml_test(self, modules_flag):
        self.init()

        from elasticsearch import AuthorizationException

        es_info = self.es.info()
        version = semver.parse(es_info["version"]["number"])
        if version["major"] < 7:
            start_trial_api_url = "/_xpack/license/start_trial?acknowledge=true"
            ml_datafeeds_url = "/_xpack/ml/datafeeds/"
            ml_anomaly_detectors_url = "/_xpack/ml/anomaly_detectors/"
        else:
            start_trial_api_url = "/_license/start_trial?acknowledge=true"
            ml_datafeeds_url = "/_ml/datafeeds/"
            ml_anomaly_detectors_url = "/_ml/anomaly_detectors/"

        try:
            output = self.es.transport.perform_request("POST", start_trial_api_url)
        except AuthorizationException:
            print("License already enabled")

        print("Test modules_flag: {}".format(modules_flag))

        # Clean any previous state
        for df in self.es.transport.perform_request("GET", ml_datafeeds_url)["datafeeds"]:
            if df["datafeed_id"] == 'filebeat-nginx-access-response_code':
                self.es.transport.perform_request(
                    "DELETE", "/_ml/datafeeds/" + df["datafeed_id"])

        for df in self.es.transport.perform_request("GET", ml_anomaly_detectors_url)["jobs"]:
            if df["job_id"] == 'datafeed-filebeat-nginx-access-response_code':
                self.es.transport.perform_request(
                    "DELETE", ml_anomaly_detectors_url + df["job_id"])

        shutil.rmtree(os.path.join(self.working_dir,
                                   "modules.d"), ignore_errors=True)

        # generate a minimal configuration
        cfgfile = os.path.join(self.working_dir, "filebeat.yml")
        self.render_config_template(
            template_name="filebeat_modules",
            output=cfgfile,
            index_name=self.index_name,
            elasticsearch_url=self.elasticsearch_url,
            kibana_url=self.kibana_url,
            kibana_path=self.kibana_path)

        if not modules_flag:
            # Enable nginx
            os.mkdir(os.path.join(self.working_dir, "modules.d"))
            with open(os.path.join(self.working_dir, "modules.d/nginx.yml"), "wb") as nginx:
                nginx.write("- module: nginx")

        cmd = [
            self.filebeat, "-systemTest",
            "-e", "-d", "*",
            "-c", cfgfile
        ]

        # Skipping dashboard loading to speed up tests
        cmd += ["-E", "setup.dashboards.enabled=false"]
        cmd += ["setup", "--machine-learning"]
        if modules_flag:
            cmd += ["--modules=nginx"]

        output_path = os.path.join(self.working_dir, "output.log")
        output = open(output_path, "ab")
        output.write(" ".join(cmd) + "\n")
        beat = subprocess.Popen(cmd,
                                stdin=None,
                                stdout=output,
                                stderr=output,
                                bufsize=0)

        # Check result
        self.wait_until(lambda: "filebeat-nginx_ecs-access-status_code_rate_ecs" in
                                (df["job_id"] for df in self.es.transport.perform_request(
                                    "GET", ml_anomaly_detectors_url)["jobs"]),
                        max_timeout=60)
        self.wait_until(lambda: "datafeed-filebeat-nginx_ecs-access-status_code_rate_ecs" in
                                (df["datafeed_id"] for df in self.es.transport.perform_request("GET", ml_datafeeds_url)["datafeeds"]))

        beat.kill()

        # check if it fails when trying to set it up again
        output = open(output_path, "ab")
        output.write(" ".join(cmd) + "\n")
        beat = subprocess.Popen(cmd,
                                stdin=None,
                                stdout=output,
                                stderr=output,
                                bufsize=0)

        output = open(output_path, "r")
        for obj in ["Datafeed", "Job", "Dashboard", "Search", "Visualization"]:
            self.wait_log_contains("{obj} already exists".format(obj=obj),
                                   logfile=output_path,
                                   max_timeout=60)

        beat.kill()
Example #51
class ElasticsearchAPI:
    """
    Each query will have its own index based on query name.
    index_name = query.name
    Doc type = query_name to make it possible to set mapping. Mapping is set per doc_type.

    All rows from a Query should look the same no matter the source.

    This makes all the data from all the servers end up in the same index.
        Comparable.
        Fewer indexes.
    """
    def __init__(self, host, port, user, password):
        logger.info("Connecting to ES %s..." % host)
        self.es = Elasticsearch(hosts=[
            {'host': host, 'port': port}, ])
        logger.debug(self.es.info())

    @staticmethod
    def from_config_manager(config_manager):
        config = config_manager.get_config('Elasticsearch')

        return ElasticsearchAPI(config['host'],
                                config['port'],
                                config['username'],
                                config['password'])

    def consume_all(self, items, doc_type, index_name, id_column_name):
        print('Pushing %s docs to index: %s' % (len(items), index_name))
        actions = []
        for doc in items:
            action = {
                "_id": doc[id_column_name],
                "_index": index_name,
                "_type": doc_type,
                "_source": doc,
                }
            actions.append(action)
        helpers.bulk(self.es, actions)
        self.es.indices.refresh()

        return len(items)

    def find_ids(self, ids, doc_type, index_name):
        body = {"ids": ids}
        result = self.es.mget(index=index_name, doc_type=doc_type, body=body)
        # print(result)
        if len(result) > 0:
            return [r['_id'] for r in result['docs'] if r['found'] is True]
        return []

    def init_indexes_for(self, sources):
        for source in sources:
            self.init_index_for_source(source)

    def set_mapping(self, doc_type, index_name, mapping):
        self.es.indices.put_mapping(
            index=index_name,
            doc_type=doc_type,
            body=mapping)

    def delete_index(self, index_name):
        print('Truncating data in index: %s' % index_name)
        self.es.indices.delete(index=index_name, ignore=404)

    def create_index(self, index_name):
        print('Creating index %s' % index_name)
        self.es.indices.create(index_name, ignore=400)
Example #52
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--numdays', help='Number of days back from today to archive, default is 0', default=0)
    parser.add_argument('--archive', help='Archive name')
    parser.add_argument('--logtype', help='Log type cwl/logstash', default='cwl')
    parser.add_argument('--esaddress', help='Elasticsearch Address', default='localhost')
    parser.add_argument('--esport', help='Elasticsearch Port', default=9200)
    parser.add_argument('--bucket', help='S3 bucket name')
    parser.add_argument('--awsaccesskey', help='AWS Access Key')
    parser.add_argument('--awssecretkey', help='AWS Secret Key')
    parser.add_argument('--awsregion', help='AWS Region', default='us-east-1')
    parser.add_argument('--dry', help='Dry run', action='store_true')

    args = parser.parse_args()

    numDays = args.numdays
    archiveName = args.archive
    now = datetime.now()
    indexDay = int(now.day) - int(numDays)
    indexName = '%s-%s.%02d.%s' % (args.logtype, now.year, now.month, indexDay)
    stuffs = []

    awsauth = AWS4Auth(args.awsaccesskey, args.awssecretkey, args.awsregion, 'es')

    es = Elasticsearch(
        hosts=[{'host': args.esaddress, 'port': int(args.esport)}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection
    )
    print(es.info())

    query={"query" : {"match_all" : {}}}

    if es.search_exists(index=indexName):
        rs = es.search(index=indexName, scroll='60s', search_type='scan', size=100, body=query)
        scroll_size = rs['hits']['total']
        while (scroll_size > 0):
            try:
                scroll_id = rs['_scroll_id']
                rs = es.scroll(scroll_id=scroll_id, scroll='60s')
                stuffs += rs['hits']['hits']
                scroll_size = len(rs['hits']['hits'])
            except:
                break
    else:
        print 'Index %s does not exist' % indexName
        exit()

    with gzip.open(archiveName + '.gz', 'wb') as f:
        for stuff in stuffs:
            f.write(str(stuff))
        f.close

    if args.dry:
        print "Not deleting index %s" % indexName
    else:
        es.indices.delete(index=indexName)

    print 'Pushing to bucket name %s' % args.bucket
    s3 = boto3.resource('s3')
    data = open(archiveName + '.gz', 'rb')
    s3.Bucket(args.bucket).put_object(Key=archiveName + '.gz', Body=data)
Example #53
    return True


parser = argparse.ArgumentParser()
parser.add_argument('inputfile', type=str, help='the input file, must be .json created from django dumpdata command')
parser.add_argument('index', type=str, help='the elastic index')
args = parser.parse_args()
input_fp = args.inputfile
index = args.index


print('connecting to elastic index %s...' % index)

es = Elasticsearch()
try:
    print(es.info())
except Exception as e:
    print(e)
    raise

print('...OK')

print('loading data from %s...' % input_fp)

DATA = json.load(open(input_fp))

print('found %s objects' % str(len(DATA)))
print('this would be a sample document:\n')
print(DATA[0]['fields'])

Example #54
class Connector:
    def __init__(self, esEndpoint, dmonPort=5001, esInstanceEndpoint=9200, index="logstash-*"):
        self.esInstance = Elasticsearch(esEndpoint)
        self.esEndpoint = esEndpoint
        self.dmonPort = dmonPort
        self.esInstanceEndpoint = esInstanceEndpoint
        self.myIndex = index

    def query(self, queryBody, allm=True, dMetrics=[], debug=False):
        res = self.esInstance.search(index=self.myIndex, body=queryBody, request_timeout=230)
        if debug == True:
            print "%---------------------------------------------------------%"
            print "Raw JSON Output"
            print res
            print("%d documents found" % res['hits']['total'])
            print "%---------------------------------------------------------%"
        termsList = []
        termValues = []
        ListMetrics = []
        for doc in res['hits']['hits']:
            if allm == False:
                if not dMetrics:
                    sys.exit("dMetrics argument not set. Please supply valid list of metrics!")
                for met in dMetrics:
                    # prints the values of the metrics defined in the metrics list
                    if debug == True:
                        print "%---------------------------------------------------------%"
                        print "Parsed Output -> ES doc id, metrics, metrics values."
                        print("doc id %s) metric %s -> value %s" % (doc['_id'], met, doc['_source'][met]))
                        print "%---------------------------------------------------------%"
                    termsList.append(met)
                    termValues.append(doc['_source'][met])
                dictValues = dict(zip(termsList, termValues))
            else:
                for terms in doc['_source']:
                    # prints the values of the metrics defined in the metrics list
                    if debug == True:
                        print "%---------------------------------------------------------%"
                        print "Parsed Output -> ES doc id, metrics, metrics values."
                        print("doc id %s) metric %s -> value %s" % (doc['_id'], terms, doc['_source'][terms]))
                        print "%---------------------------------------------------------%"
                    termsList.append(terms)
                    termValues.append(doc['_source'][terms])
                    dictValues = dict(zip(termsList, termValues))
            ListMetrics.append(dictValues)
        return ListMetrics, res

    def info(self):
        try:
            res = self.esInstance.info()
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to ES dmon with type %s at arguments %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            return "An exception has occurred with type %s at arguments %s" %(type(inst), inst.args)
            sys.exit(2)
        return res

    def roles(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/nodes/roles" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get roles url -> %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rRoles = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print "Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort)
            sys.exit(2)
        rData = rRoles.json()
        return rData

    def createIndex(self, indexName):
        try:
            self.esInstance.indices.create(index=indexName, ignore=400)
            logger.info('[%s] : [INFO] Created index %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to create index %s with %s and %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst), inst.args)

    def closeIndex(self, indexName):
        try:
            self.esInstance.indices.close(index=indexName)
            logger.info('[%s] : [INFO] Closed index %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to close index %s with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst),
                         inst.args)

    def deleteIndex(self, indexName):
        try:
            res = self.esInstance.indices.delete(index=indexName, ignore=[400, 404])
            logger.info('[%s] : [INFO] Deleted index %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to delete index %s with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst),
                         inst.args)
            return 0
        return res

    def openIndex(self, indexName):
        res = self.esInstance.indices.open(index=indexName)
        logger.info('[%s] : [INFO] Open index %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        return res

    def getIndex(self, indexName):
        res = self.esInstance.indices.get(index=indexName, human=True)
        return res

    def getIndexSettings(self, indexName):
        res = self.esInstance.indices.get_settings(index=indexName, human=True)
        return res

    def clusterHealth(self):
        res = self.esInstance.cluster.health(request_timeout=15)
        return res

    def clusterSettings(self):
        res = self.esInstance.cluster.get_settings(request_timeout=15)
        return res

    def clusterState(self):
        res = self.esInstance.cluster.stats(human=True, request_timeout=15)
        return res

    def nodeInfo(self):
        res = self.esInstance.nodes.info(request_timeout=15)
        return res

    def nodeState(self):
        res = self.esInstance.nodes.stats(request_timeout=15)
        return res

    def getStormTopology(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/detect/storm" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get storm topology url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rStormTopology = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rData = rStormTopology.json()
        return rData

    def pushAnomaly(self, anomalyIndex, doc_type, body):
        try:
            res = self.esInstance.index(index=anomalyIndex, doc_type=doc_type, body=body)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while pushing anomaly with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't push anomaly to dmon!")
            sys.exit(2)
        return res

    def getModel(self):
        return "getModel"

    def pushModel(self):
        return "push model"

    def localData(self):
        return "use local data"

    def getInterval(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/aux/interval" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get interval url -> %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rInterval = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rData = rInterval.json()
        return rData

    def aggQuery(self, queryBody):
        adt_timeout = os.environ['ADP_TIMEOUT'] = os.getenv('ADP_TIMEOUT', str(60))  # Read timeout from env variable ADP_TIMEOUT; if not set use default 60 seconds
        try:
            res = self.esInstance.search(index=self.myIndex, body=queryBody, request_timeout=float(adt_timeout))
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception while executing ES query with %s and %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(2)
        return res

    def getNodeList(self):
        '''
        :return: -> returns the list of registered nodes from dmon
        '''
        nUrl = "http://%s:%s/dmon/v1/observer/nodes" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get node url -> %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rdmonNode = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rdata = rdmonNode.json()
        nodes = []
        for e in rdata['Nodes']:
            for k in e:
                nodes.append(k)
        return nodes

    def getDmonStatus(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/core/status" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get core status url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rdmonStatus = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        return rdmonStatus.json()
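
# A minimal usage sketch of the dmon/ES adapter above. The class name
# `DmonConnector` and its constructor signature are assumptions, since the
# constructor is not part of this excerpt; only the methods defined above
# (query, getNodeList, createIndex, pushAnomaly, ...) are taken from the code.
connector = DmonConnector(esEndpoint='localhost', dmonPort=5001, index='logstash-*')

query_body = {"query": {"match_all": {}}, "size": 10}
metrics, raw_response = connector.query(query_body, allm=True)

nodes = connector.getNodeList()            # registered nodes from dmon
connector.createIndex('anomaly-index')     # ignore=400 keeps this idempotent
connector.pushAnomaly('anomaly-index', doc_type='anomaly',
                      body={'node': nodes[0] if nodes else None, 'score': 0.97})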
Exemplo n.º 55
0
class ElasticsearchDataStore(object):
    """Implements the datastore."""

    # Number of events to queue up when bulk inserting events.
    DEFAULT_FLUSH_INTERVAL = 1000
    DEFAULT_SIZE = 100
    DEFAULT_LIMIT = DEFAULT_SIZE  # Max events to return
    DEFAULT_FROM = 0
    DEFAULT_STREAM_LIMIT = 5000 # Max events to return when streaming results

    def __init__(self, host='127.0.0.1', port=9200):
        """Create a Elasticsearch client."""
        super(ElasticsearchDataStore, self).__init__()
        self.client = Elasticsearch([{'host': host, 'port': port}])
        self.import_counter = Counter()
        self.import_events = []

    @staticmethod
    def _build_label_query(sketch_id, label_name):
        """Build Elasticsearch query for Timesketch labels.

        Args:
            sketch_id: Integer of sketch primary key.
            label_name: Name of the label to search for.

        Returns:
            Elasticsearch query as a dictionary.
        """
        query_dict = {
            'query': {
                'nested': {
                    'query': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'timesketch_label.name': label_name
                                }
                            }, {
                                'term': {
                                    'timesketch_label.sketch_id': sketch_id
                                }
                            }]
                        }
                    },
                    'path': 'timesketch_label'
                }
            }
        }
        return query_dict

    @staticmethod
    def _build_events_query(events):
        """Build Elasticsearch query for one or more document ids.

        Args:
            events: List of Elasticsearch document IDs.

        Returns:
            Elasticsearch query as a dictionary.
        """
        events_list = [event['event_id'] for event in events]
        query_dict = {'query': {'ids': {'values': events_list}}}
        return query_dict

    @staticmethod
    def _build_field_aggregator(field_name):
        """Build Elasticsearch query for aggregation based on field.

        Args:
            field_name: Field to aggregate.

        Returns:
            Elasticsearch aggregation as a dictionary.
        """
        field_aggregation = {
            'field_aggregation': {
                'terms': {
                    'field': '{0:s}.keyword'.format(field_name)
                }
            }
        }
        return field_aggregation

    def build_query(self,
                    sketch_id,
                    query_string,
                    query_filter,
                    query_dsl,
                    aggregations=None):
        """Build Elasticsearch DSL query.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            aggregations: Dict of Elasticsearch aggregations

        Returns:
            Elasticsearch DSL query as a dictionary
        """
        if not query_dsl:
            if query_filter.get('star', None):
                query_dsl = self._build_label_query(sketch_id, '__ts_star')

            if query_filter.get('events', None):
                events = query_filter['events']
                query_dsl = self._build_events_query(events)

            if not query_dsl:
                query_dsl = {
                    'query': {
                        'bool': {
                            'must': [{
                                'query_string': {
                                    'query': query_string
                                }
                            }]
                        }
                    }
                }
            if query_filter.get('time_start', None):
                # TODO(jberggren): Add support for multiple time ranges.
                query_dsl['query']['bool']['filter'] = {
                    'bool': {
                        'should': [{
                            'range': {
                                'datetime': {
                                    'gte': query_filter['time_start'],
                                    'lte': query_filter['time_end']
                                }
                            }
                        }]
                    }
                }
            if query_filter.get('from', None):
                query_dsl['from'] = query_filter['from']
            if query_filter.get('size', None):
                query_dsl['size'] = query_filter['size']
            if query_filter.get('exclude', None):
                query_dsl['post_filter'] = {
                    'bool': {
                        'must_not': {
                            'terms': {
                                'data_type': query_filter['exclude']
                            }
                        }
                    }
                }
        else:
            query_dsl = json.loads(query_dsl)

        # Make sure we are sorting.
        if not query_dsl.get('sort', None):
            query_dsl['sort'] = {
                'datetime': query_filter.get('order', 'asc')
            }

        # Remove any aggregation coming from user supplied Query DSL. We have
        # no way to display this data in a good way today.
        # TODO: Revisit this and figure out if we can display the data.
        if query_dsl.get('aggregations', None):
            del query_dsl['aggregations']

        # Add any pre defined aggregations
        if aggregations:
            # post_filter happens after aggregation so we need to move the
            # filter to the query instead.
            if query_dsl.get('post_filter', None):
                query_dsl['query']['bool']['filter'] = query_dsl[
                    'post_filter']
                query_dsl.pop('post_filter', None)
            query_dsl['aggregations'] = aggregations
        return query_dsl

    def search(self,
               sketch_id,
               query_string,
               query_filter,
               query_dsl,
               indices,
               count=False,
               aggregations=None,
               return_fields=None,
               enable_scroll=False):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            count: Boolean indicating if we should only return result count
            aggregations: Dict of Elasticsearch aggregations
            return_fields: List of fields to return
            enable_scroll: If Elasticsearch scroll API should be used

        Returns:
            Set of event documents in JSON format
        """

        scroll_timeout = None
        if enable_scroll:
            scroll_timeout = '1m'  # Default to 1 minute scroll timeout

        # Exit early if we have no indices to query
        if not indices:
            return {'hits': {'hits': [], 'total': 0}, 'took': 0}

        # Check if we have specific events to fetch and get indices.
        if query_filter.get('events', None):
            indices = {
                event['index']
                for event in query_filter['events']
                if event['index'] in indices
            }

        query_dsl = self.build_query(sketch_id, query_string, query_filter,
                                     query_dsl, aggregations)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = 'query_then_fetch'

        # Only return how many documents matches the query.
        if count:
            del query_dsl['sort']
            count_result = self.client.count(
                body=query_dsl, index=list(indices))
            return count_result.get('count', 0)

        if not return_fields:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.search(
                body=query_dsl,
                index=list(indices),
                search_type=search_type,
                scroll=scroll_timeout)

        # Suppress the lint error because elasticsearch-py adds parameters
        # to the function with a decorator and this makes pylint sad.
        # pylint: disable=unexpected-keyword-arg
        return self.client.search(
            body=query_dsl,
            index=list(indices),
            search_type=search_type,
            _source_include=return_fields,
            scroll=scroll_timeout)

    def search_stream(
            self, sketch_id=None, query_string=None, query_filter=None,
            query_dsl=None, indices=None, return_fields=None):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args :
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            return_fields: List of fields to return

        Returns:
            Generator of event documents in JSON format
        """

        if not query_filter.get('size'):
            query_filter['size'] = self.DEFAULT_STREAM_LIMIT

        if not query_filter.get('terminate_after'):
            query_filter['terminate_after'] = self.DEFAULT_STREAM_LIMIT

        result = self.search(
            sketch_id=sketch_id,
            query_string=query_string,
            query_dsl=query_dsl,
            query_filter=query_filter,
            indices=indices,
            return_fields=return_fields,
            enable_scroll=True)

        scroll_id = result['_scroll_id']
        scroll_size = result['hits']['total']

        for event in result['hits']['hits']:
            yield event

        while scroll_size > 0:
            # pylint: disable=unexpected-keyword-arg
            result = self.client.scroll(scroll_id=scroll_id, scroll='5m')
            scroll_id = result['_scroll_id']
            scroll_size = len(result['hits']['hits'])
            for event in result['hits']['hits']:
                yield event

    def get_event(self, searchindex_id, event_id):
        """Get one event from the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id

        Returns:
            Event document in JSON format
        """
        try:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.get(
                index=searchindex_id,
                id=event_id,
                doc_type='_all',
                _source_exclude=['timesketch_label'])
        except NotFoundError:
            abort(HTTP_STATUS_CODE_NOT_FOUND)

    def count(self, indices):
        """Count number of documents.

        Args:
            indices: List of indices.

        Returns:
            Number of documents.
        """
        if not indices:
            return 0
        result = self.client.count(index=indices)
        return result.get('count', 0)

    def set_label(self, searchindex_id, event_id, event_type, sketch_id,
                  user_id, label, toggle=False, single_update=True):
        """Set label on event in the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id
            event_type: String of ElasticSearch document type
            sketch_id: Integer of sketch primary key
            user_id: Integer of user primary key
            label: String with the name of the label
            toggle: Optional boolean value if the label should be toggled
                (add/remove). The default is False.
            single_update: Boolean if the label should be indexed immediately.

        Returns:
            Dict with updated document body, or None if this is a single update.
        """
        # Elasticsearch painless script.
        update_body = {
            'script': {
                'lang': 'painless',
                'source': ADD_LABEL_SCRIPT,
                'params': {
                    'timesketch_label': {
                        'name': str(label),
                        'user_id': user_id,
                        'sketch_id': sketch_id
                    }
                }
            }
        }

        if toggle:
            update_body['script']['source'] = TOGGLE_LABEL_SCRIPT

        if not single_update:
            script = update_body['script']
            return dict(
                source=script['source'], lang=script['lang'],
                params=script['params']
            )

        doc = self.client.get(
            index=searchindex_id, id=event_id, doc_type='_all')
        try:
            doc['_source']['timesketch_label']
        except KeyError:
            doc = {'doc': {'timesketch_label': []}}
            self.client.update(
                index=searchindex_id,
                doc_type=event_type,
                id=event_id,
                body=doc)

        self.client.update(
            index=searchindex_id,
            id=event_id,
            doc_type=event_type,
            body=update_body)

        return None

    def create_index(self, index_name=uuid4().hex, doc_type='generic_event'):
        """Create index with Timesketch settings.

        Args:
            index_name: Name of the index. Default is a generated UUID.
            doc_type: Name of the document type. Default is generic_event.

        Returns:
            Index name in string format.
            Document type in string format.
        """
        _document_mapping = {
            doc_type: {
                'properties': {
                    'timesketch_label': {
                        'type': 'nested'
                    }
                }
            }
        }

        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(
                    index=index_name, body={'mappings': _document_mapping})
            except ConnectionError:
                raise RuntimeError('Unable to connect to Timesketch backend.')
        # We want to return unicode here to keep SQLalchemy happy.
        if not isinstance(index_name, six.text_type):
            index_name = codecs.decode(index_name, 'utf-8')

        if not isinstance(doc_type, six.text_type):
            doc_type = codecs.decode(doc_type, 'utf-8')

        return index_name, doc_type

    def delete_index(self, index_name):
        """Delete Elasticsearch index.

        Args:
            index_name: Name of the index to delete.
        """
        if self.client.indices.exists(index_name):
            try:
                self.client.indices.delete(index=index_name)
            except ConnectionError as e:
                raise RuntimeError(
                    'Unable to connect to Timesketch backend: {}'.format(e)
                )

    def import_event(
            self, index_name, event_type, event=None,
            event_id=None, flush_interval=DEFAULT_FLUSH_INTERVAL):
        """Add event to Elasticsearch.

        Args:
            flush_interval: Number of events to queue up before indexing
            index_name: Name of the index in Elasticsearch
            event_type: Type of event (e.g. plaso_event)
            event: Event dictionary
            event_id: Event Elasticsearch ID
        """
        if event:
            # Make sure we have decoded strings in the event dict.
            event = {
                k.decode('utf8'): (codecs.decode(v, 'utf8')
                                   if isinstance(v, six.binary_type) else v)
                for k, v in event.items()
            }

            # Header needed by Elasticsearch when bulk inserting.
            header = {
                'index': {
                    '_index': index_name,
                    '_type': event_type
                }
            }
            update_header = {
                'update': {
                    '_index': index_name,
                    '_type': event_type,
                    '_id': event_id
                }
            }

            if event_id:
                # Event has "lang" defined if there is a script used for import.
                if event.get('lang'):
                    event = {'script': event}
                else:
                    event = {'doc': event}
                header = update_header

            self.import_events.append(header)
            self.import_events.append(event)
            self.import_counter['events'] += 1

            if self.import_counter['events'] % int(flush_interval) == 0:
                self.client.bulk(body=self.import_events)
                self.import_events = []
        else:
            # Import the remaining events in the queue.
            if self.import_events:
                self.client.bulk(body=self.import_events)

        return self.import_counter['events']

    def flush_queued_events(self):
        if self.import_events:
            self.client.bulk(body=self.import_events)

    @property
    def version(self):
        """Get Elasticsearch version.

        Returns:
          Version number as a string.
        """
        version_info = self.client.info().get('version')
        return version_info.get('number')
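
# A short usage sketch of the ElasticsearchDataStore defined above: create an
# index, buffer a couple of events for bulk import, flush, then stream a search.
# The index name, sketch_id and event fields are placeholders for illustration.
datastore = ElasticsearchDataStore(host='127.0.0.1', port=9200)

index_name, doc_type = datastore.create_index(index_name='sketch-demo')

# import_event() buffers events and bulk-flushes every DEFAULT_FLUSH_INTERVAL docs;
# calling it without an event flushes whatever is still queued.
datastore.import_event(index_name, doc_type,
                       event={'message': 'login failed',
                              'datetime': '2020-01-01T00:00:00',
                              'timestamp_desc': 'Event time'})
datastore.import_event(index_name, doc_type)

for event in datastore.search_stream(sketch_id=1,
                                     query_string='message:login',
                                     query_filter={'order': 'asc'},
                                     query_dsl=None,
                                     indices=[index_name]):
    print(event['_id'])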
Exemplo n.º 56
0
class ElasticSearchUtil:
    def __init__(self, host):
        self.host = host
        self.conn = Elasticsearch([self.host])

    def __del__(self):
        self.close()

    def check(self):
        '''
        Return basic information about the connected ES cluster
        :return:
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        Insert a single document body into the given index and type; an id can be
        supplied, otherwise ES generates one automatically
        :param index: target index
        :param type: target doc type
        :param body: document to insert -> dict
        :param id: custom document id
        :return:
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        Bulk insert interface.
        The body expected by the bulk API is an interleaved list: [{optionType: Condition}, {data}]
        where optionType can be index, delete or update,
        Condition can set the index and type for each individual row,
        and data is the actual document to insert/update.
        :param index: default index to insert into
        :param type: default doc type to insert into
        :param dataFrame: DataFrame with the rows to insert
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        insertHeadInfoList = [{"index": {}} for i in range(len(dataList))]
        # interleave action headers and documents: [header, doc, header, doc, ...]
        temp = [dict] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        try:
            return self.conn.bulk(index=index, doc_type=type, body=temp)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        Delete the document identified by the given index, type and id
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        Delete all documents under the index that match the query
        :param index:
        :param query: query in ES DSL format
        :param type:
        :return:
        '''
        return self.conn.delete_by_query(index=index, body=query, doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        Delete all documents under the given index
        :param index:
        :return:
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index, body=query, doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Search for all documents under the index that match the given conditions
        :param index:
        :param type:
        :param body: filter statement, in ES DSL format
        :return:
        '''
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        Get the document identified by the given index, type and id
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        Update the document identified by the given index, type and id
        :param index:
        :param type:
        :param id:
        :param body: values to update
        :return:
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)


    def close(self):
        if self.conn is not None:
            try:
                self.conn.close()
            except Exception:
                pass
            finally:
                self.conn = None
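
# A brief usage sketch of the ElasticSearchUtil wrapper above, using a small
# pandas DataFrame for the bulk insert; host, index and doc type are placeholders.
import pandas as pd

util = ElasticSearchUtil('localhost:9200')
print(util.check())  # prints cluster info

df = pd.DataFrame([{'name': 'cpu', 'value': 0.42},
                   {'name': 'mem', 'value': 0.71}])
util.insertDataFrame(index='metrics', type='metric', dataFrame=df)

hits = util.searchDoc(index='metrics', type='metric',
                      body={'query': {'match': {'name': 'cpu'}}})
util.deleteAllDocByIndex('metrics', type='metric')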
def restore_indices_from_s3_to_dest(config):
    """
    Restore the specified indices from the snapshot specified in the config file.

    The indices are restored at the specified 'dest' ElasticSearch Node.
    ElasticSearch automatically replicates the indices across the ES cluster after the restore. 
    
    Parameters:
        config: dictionary storing the configuration details
        
    """
    
    dest_seed1 = config['elasticsearch_config']['es_dest_seed1']
    es_s3_repo = config['elasticsearch_config']['es_repository_name']
    index_list = config['elasticsearch_config']['index_names'].split(',')

    try:
        dest_seed2 = config['elasticsearch_config']['es_dest_seed2']
        dest_seed3 = config['elasticsearch_config']['es_dest_seed3']
    except KeyError: # running in test mode? use a single node
        print ("\n[WARN] Are you running in test mode? Have you defined >1 dest node in the conf?")
        print ("\n[WARN] Falling back to a single dest node...")
        dest_seed2 = dest_seed3 = dest_seed1

    try:
        # specify all 3 dest ES nodes in the connection string
        dest_es = Elasticsearch([dest_seed1, dest_seed2, dest_seed3], sniff_on_start=True, 
            sniff_on_connection_fail=True, sniffer_timeout=60)

        dest_es.snapshot.create_repository(repository=es_s3_repo,
            body={
                "type": "s3",
                "settings": {
                    "region": config['aws_s3_config']['aws_region'],
                    "bucket": config['aws_s3_config']['s3_bucket_name'],
                    "base_path": config['aws_s3_config']['s3_base_path'],
                    "access_key": config['aws_api_keys']['aws_access_key'],
                    "secret_key": config['aws_api_keys']['aws_secret_key']
                }
            },
            request_timeout=30,
            verify=False)

        print ("\n[INFO] Connected to dest ES cluster: %s" %(dest_es.info()))

        # must close indices before restoring:
        for index in index_list:
            try:
                print ("[INFO] Closing index: '%s'" %(index))
                dest_es.indices.close(index=index, ignore_unavailable=True)
            except NotFoundError:
                print ("\n\n[WARN] Index '%s' not present on Target ES cluster - could not close it." %(index))
            except Exception as e:
                print ("\n\n[ERROR] Unexpected error '%s' while trying to close index: '%s'" %(str(e)))
                #reopen_indices(dest_es, index_list)

        print ("\n[INFO] Restoring ES indices: '%s' from S3 snapshot...\n" %(config['elasticsearch_config']['index_names']))

        dest_es.snapshot.restore(repository=es_s3_repo, 
            snapshot=config['elasticsearch_config']['snapshot_name'], 
            body={"indices": config['elasticsearch_config']['index_names']}, 
            wait_for_completion=False)

    except Exception as e:
        print ("\n\n[ERROR] Unexpected error: %s" %(str(e)))
    finally:
        if 'dest_es' in locals():
            print ("\n[INFO] (finally) Re-opening indices: '%s'" %(str(index_list)))
            reopen_indices(dest_es, index_list)
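
# The helper reopen_indices() is called in the finally block above but is not
# defined in this excerpt. A minimal sketch, assuming it simply re-opens each
# index on the destination cluster and tolerates missing indices, could be:
def reopen_indices(es_client, index_list):
    for index in index_list:
        try:
            print ("[INFO] Re-opening index: '%s'" % (index))
            es_client.indices.open(index=index, ignore_unavailable=True)
        except Exception as e:
            print ("[WARN] Could not re-open index '%s': %s" % (index, str(e)))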
Exemplo n.º 58
0
class ElasticSearchDB(object):

  """
  .. class:: ElasticSearchDB

  :param str url: the url to the database for example: el.cern.ch:9200
  :param str gDebugFile: is used to save the debug information to a file
  :param int timeout: the default time out to Elasticsearch
  :param int RESULT_SIZE: The number of data points which will be returned by the query.
  """
  __chunk_size = 1000
  __url = ""
  __timeout = 120
  clusterName = ''
  RESULT_SIZE = 10000

  ########################################################################
  def __init__(self, host, port, user=None, password=None, indexPrefix='', useSSL=True):
    """ c'tor
    :param self: self reference
    :param str host: the host name of the Elasticsearch instance, for example: el.cern.ch
    :param str port: the port on which Elasticsearch is listening, for example: 9200
    :param str user: user name to access the db
    :param str password: if the db is password protected we need to provide a password
    :param str indexPrefix: it is the indexPrefix used to get all indexes
    :param bool useSSL: We can disable using secure connection. By default we use secure connection.
    """

    self.__indexPrefix = indexPrefix
    self._connected = False
    if user and password:
      gLogger.debug("Specified username and password")
      self.__url = "https://%s:%s@%s:%d" % (user, password, host, port)
    else:
      gLogger.debug("Username and password not specified")
      self.__url = "http://%s:%d" % (host, port)

    gLogger.verbose("Connecting to %s:%s, useSSL = %s" % (host, port, useSSL))

    if useSSL:
      bd = BundleDeliveryClient()
      retVal = bd.getCAs()
      casFile = None
      if not retVal['OK']:
        gLogger.error("CAs file does not exists:", retVal['Message'])
        casFile = certifi.where()
      else:
        casFile = retVal['Value']

      self.__client = Elasticsearch(self.__url,
                                    timeout=self.__timeout,
                                    use_ssl=True,
                                    verify_certs=True,
                                    ca_certs=casFile)
    else:
      self.__client = Elasticsearch(self.__url, timeout=self.__timeout)

    self.__tryToConnect()

  def getIndexPrefix(self):
    """
    It returns the index prefix used to name the indexes.
    """
    return self.__indexPrefix

  ########################################################################
  def query(self, index, query):
    """ Executes a query and returns its result (uses ES DSL language).

    :param self: self reference
    :param basestring index: index name
    :param dict query: It is the query in ElasticSearch DSL language

    """
    try:
      esDSLQueryResult = self.__client.search(index=index, body=query)
      return S_OK(esDSLQueryResult)
    except RequestError as re:
      return S_ERROR(re)

  def _Search(self, indexname):
    """
    it returns a Search object which can be used for retrieving certain values from the DB
    """
    return Search(using=self.__client, index=indexname)

  ########################################################################
  def _Q(self, name_or_query='match', **params):
    """
    It is a wrapper to ElasticDSL Query module used to create a query object.
    :param str name_or_query: the type of the query
    """
    return Q(name_or_query, **params)

  def _A(self, name_or_agg, aggsfilter=None, **params):
    """
    It is a wrapper to ElasticDSL aggregation module, used to create an aggregation
    """
    return A(name_or_agg, aggsfilter, **params)
  ########################################################################

  def __tryToConnect(self):
    """Before we use the database we try to connect and retrieve the cluster name

    :param self: self reference

    """
    try:
      if self.__client.ping():
        # Returns True if the cluster is running, False otherwise
        result = self.__client.info()
        self.clusterName = result.get("cluster_name", " ")  # pylint: disable=no-member
        gLogger.info("Database info", result)
        self._connected = True
      else:
        self._connected = False
        gLogger.error("Cannot ping ElasticsearchDB!")
    except ConnectionError as e:
      gLogger.error(repr(e))
      self._connected = False

  ########################################################################
  def getIndexes(self):
    """
    It returns the available indexes...
    """

    # we only return indexes which belong to a specific prefix, for example 'lhcb-production' or 'dirac-production' etc.
    return [index for index in self.__client.indices.get_alias("%s*" % self.__indexPrefix)]

  ########################################################################
  def getDocTypes(self, indexName):
    """
    :param str indexName: the name of the index...
    :return S_OK or S_ERROR
    """
    result = []
    try:
      gLogger.debug("Getting mappings for ", indexName)
      result = self.__client.indices.get_mapping(indexName)
    except Exception as e:  # pylint: disable=broad-except
      gLogger.error(e)
    doctype = ''
    for indexConfig in result:
      if not result[indexConfig].get('mappings'):
        # there is a case when the mapping exists and the value is None...
        # this is usually an empty index or a corrupted index.
        gLogger.warn("Index does not have mapping %s!" % indexConfig)
        continue
      if result[indexConfig].get('mappings'):
        doctype = result[indexConfig]['mappings']
        break  # we suppose the mapping of all indexes is the same...

    if not doctype:
      return S_ERROR("%s does not exists!" % indexName)

    return S_OK(doctype)

  ########################################################################
  def exists(self, indexName):
    """
    it checks the existence of an index
    :param str indexName: the name of the index
    """
    return self.__client.indices.exists(indexName)

  ########################################################################

  def createIndex(self, indexPrefix, mapping, period=None):
    """
    :param str indexPrefix: it is the index name.
    :param dict mapping: the configuration of the index.
    :param str period: We can specify, which kind of index will be created.
                       Currently only daily and monthly indexes are supported.

    """
    fullIndex = generateFullIndexName(indexPrefix, period)  # we have to create an index each day...
    if self.exists(fullIndex):
      return S_OK(fullIndex)

    try:
      gLogger.info("Create index: ", fullIndex + str(mapping))
      self.__client.indices.create(fullIndex, body={'mappings': mapping})
      return S_OK(fullIndex)
    except Exception as e:  # pylint: disable=broad-except
      gLogger.error("Can not create the index:", e)
      return S_ERROR("Can not create the index")

  def deleteIndex(self, indexName):
    """
    :param str indexName: the name of the index to be deleted...
    """
    try:
      retVal = self.__client.indices.delete(indexName)
    except NotFoundError as e:
      return S_ERROR(DErrno.EELNOFOUND, e)
    except ValueError as e:
      return S_ERROR(DErrno.EVALUE, e)

    if retVal.get('acknowledged'):
      # if the value exists and the value is not None
      return S_OK(indexName)

    return S_ERROR(retVal)

  def index(self, indexName, doc_type, body):
    """
    :param str indexName: the name of the index to be used...
    :param str doc_type: the type of the document
    :param dict body: the data which will be indexed
    :return: the index name in case of success.
    """
    try:
      res = self.__client.index(index=indexName,
                                doc_type=doc_type,
                                body=body)
    except TransportError as e:
      return S_ERROR(e)

    if res.get('created') or res.get('result') == 'created':
      # the created index exists but the value can be None.
      return S_OK(indexName)

    return S_ERROR(res)

  def bulk_index(self, indexprefix, doc_type, data, mapping=None, period=None):
    """
    :param str indexprefix: index name prefix.
    :param str doc_type: the type of the document
    :param list data: contains a list of dictionaries
    :param dict mapping: the mapping used by elasticsearch
    :param str period: We can specify which kind of indices will be created.
                       Currently only daily and monthly indexes are supported.
    """
    gLogger.info("%d records will be insert to %s" % (len(data), doc_type))
    if mapping is None:
      mapping = {}

    indexName = generateFullIndexName(indexprefix, period)
    gLogger.debug("inserting datat to %s index" % indexName)
    if not self.exists(indexName):
      retVal = self.createIndex(indexprefix, mapping, period)
      if not retVal['OK']:
        return retVal
    docs = []
    for row in data:
      body = {
          '_index': indexName,
          '_type': doc_type,
          '_source': {}
      }
      body['_source'] = row

      if 'timestamp' not in row:
        gLogger.warn("timestamp is not given! Note: the actual time is used!")

      # if the timestamp is not provided, we use the current utc time.
      timestamp = row.get('timestamp', int(Time.toEpoch()))
      try:
        if isinstance(timestamp, datetime):
          body['_source']['timestamp'] = int(timestamp.strftime('%s')) * 1000
        elif isinstance(timestamp, basestring):
          timeobj = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S.%f')
          body['_source']['timestamp'] = int(timeobj.strftime('%s')) * 1000
        else:  # we assume the timestamp is a unix epoch time (integer).
          body['_source']['timestamp'] = timestamp * 1000
      except (TypeError, ValueError) as e:
        # in case we are not able to convert the timestamp to epoch time....
        gLogger.error("Wrong timestamp", e)
        body['_source']['timestamp'] = int(Time.toEpoch()) * 1000
      docs += [body]
    try:
      res = bulk(self.__client, docs, chunk_size=self.__chunk_size)
    except BulkIndexError as e:
      return S_ERROR(e)

    if res[0] == len(docs):
      # we have inserted all documents...
      return S_OK(len(docs))
    else:
      return S_ERROR(res)

  def getUniqueValue(self, indexName, key, orderBy=False):
    """
    :param str indexName: the name of the index which will be used for the query
    :param dict orderBy: it is a dictionary in case we want to order the result {key:'desc'} or {key:'asc'}
    It returns a list of unique values for a certain key from the dictionary.
    """

    query = self._Search(indexName)

    endDate = datetime.utcnow()

    startDate = endDate - timedelta(days=30)

    timeFilter = self._Q('range',
                         timestamp={'lte': int(Time.toEpoch(endDate)) * 1000,
                                    'gte': int(Time.toEpoch(startDate)) * 1000, })
    query = query.filter('bool', must=timeFilter)
    if orderBy:
      query.aggs.bucket(key,
                        'terms',
                        field=key,
                        size=self.RESULT_SIZE,
                        order=orderBy).metric(key,
                                              'cardinality',
                                              field=key)
    else:
      query.aggs.bucket(key,
                        'terms',
                        field=key,
                        size=self.RESULT_SIZE).metric(key,
                                                      'cardinality',
                                                      field=key)

    try:
      query = query.extra(size=self.RESULT_SIZE)  # do not need the raw data.
      gLogger.debug("Query", query.to_dict())
      result = query.execute()
    except TransportError as e:
      return S_ERROR(e)

    values = []
    for bucket in result.aggregations[key].buckets:
      values += [bucket['key']]
    del query
    gLogger.debug("Nb of unique rows retrieved", len(values))
    return S_OK(values)

  def pingDB(self):
    """
    Try to connect to the database
    :return: S_OK(TRUE/FALSE)
    """
    connected = False
    try:
      connected = self.__client.ping()
    except ConnectionError as e:
      gLogger.error("Cannot connect to the db", repr(e))
    return S_OK(connected)

  def deleteByQuery(self, indexName, query):
    """
    Delete data by query

    :param str indexName: the name of the index
    :param str query: the query that we want to issue the delete on
    """
    try:
      self.__client.delete_by_query(index=indexName, body=query)
    except Exception as inst:
      gLogger.error("ERROR: Couldn't delete data")
      return S_ERROR(inst)
    return S_OK('Successfully deleted data from index %s' % indexName)
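
# A hedged usage sketch of the ElasticSearchDB wrapper above: bulk-insert a few
# monitoring records and read back the unique values of a field. Host, prefix,
# mapping and records are placeholders, and every call returns a DIRAC
# S_OK/S_ERROR structure.
db = ElasticSearchDB(host='el.cern.ch', port=9200, indexPrefix='monitoring', useSSL=False)

mapping = {'properties': {'host': {'type': 'keyword'}, 'cpu': {'type': 'long'}}}
records = [{'host': 'node1', 'cpu': 12, 'timestamp': 1577836800},
           {'host': 'node2', 'cpu': 40, 'timestamp': 1577836860}]

result = db.bulk_index('monitoring', doc_type='metrics', data=records, mapping=mapping)
if result['OK']:
    gLogger.info("Inserted %d documents" % result['Value'])

# getUniqueValue() expects a concrete index name, e.g. the first index for the prefix.
indexes = db.getIndexes()
if indexes:
    unique_hosts = db.getUniqueValue(indexes[0], 'host')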
class ElasticsearchAPI:
    """
    Each query will have its own index based on query name.
    index_name = query.name
    Doc type = query_name to make it possible to set mapping. Mapping is set per doc_type.

    All rows from a Query should look the same no matter the source.

    This makes all the data from all the servers in the same index.
        Comparable.
        Less indexes.
    """

    def __init__(self, host, port, user, password):
        logger.info("Connecting to ES %s..." % host)
        self.es = Elasticsearch(hosts=[{"host": host, "port": port}])
        logger.debug(self.es.info())

    @staticmethod
    def from_config_manager(config_manager):
        config = config_manager.get_config("Elasticsearch")

        return ElasticsearchAPI(config["host"], config["port"], config["password"], config["username"])

    def consume_collection(self, calculated_delta):
        assert type(calculated_delta) is CalculatedData

        query_name = calculated_delta.source.query.query_name
        db_name = calculated_delta.source.source_name
        docs = calculated_delta.delta_rows

        index_name = self.get_index_names(db_name)

        logger.debug("Pushing %s docs to index: %s" % (len(docs), index_name))
        print("Pushing %s docs to index: %s" % (len(docs), index_name))
        actions = []
        for doc in docs:
            d = doc.as_dict()
            d["measure_source"] = db_name
            action = {"_index": index_name, "_type": query_name + "_type", "_source": d}
            actions.append(action)
        helpers.bulk(self.es, actions)
        self.es.indices.refresh()

        return len(docs)

    def init_indexes_for(self, sources):
        for source in sources:
            self.init_index_for_source(source)

    def init_index_for_source(self, source):
        assert type(source) is Source
        db_name = source.source_name

        index_name = self.get_index_names(db_name=db_name)

        self.create_index(index_name)
        self.set_mapping(index_name, source.query.query_name, source.query.mapping)

    def set_mapping(self, index_name, query_name, source_mapping):
        mapping = {
            "properties": {
                "timestamp": {"type": "date", "format": "date_hour_minute_second"},
                "key_col": {"index": "not_analyzed", "type": "string"},
            }
        }
        for k, v in source_mapping.items():
            mapping["properties"][k] = v

        self.es.indices.put_mapping(index=index_name, doc_type=query_name + "_type", body=mapping)

    def delete_index(self, index_name):
        logger.info("Truncating data in index: %s" % index_name)
        self.es.indices.delete(index=index_name, ignore=404)

    def get_index_names(self, db_name):
        hist = "hist-%s" % (db_name.replace("\\", "-"))
        return hist.lower()

    def create_index(self, index_name):
        print("Creating index %s" % index_name)
        self.es.indices.create(index_name, ignore=400)
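
# A brief sketch of how the ElasticsearchAPI above might be exercised directly;
# host, credentials and the example mapping are placeholders, and only the
# ES-facing calls defined in the class are used (the Source/CalculatedData
# objects it normally consumes are assumed to come from the rest of the project).
api = ElasticsearchAPI(host='localhost', port=9200, user='elastic', password='changeme')

index_name = api.get_index_names(db_name='SERVER01\\PROD')   # -> 'hist-server01-prod'
api.create_index(index_name)
api.set_mapping(index_name, 'disk_usage',
                {'used_gb': {'type': 'float'},
                 'drive': {'index': 'not_analyzed', 'type': 'string'}})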
Exemplo n.º 60
0
class Ncli:
    _version = 1.0
    _yaml = 'nets.yaml'

    def __init__(self, yamlfile):

        with open(yamlfile) as f:
            self.parameters = yaml.safe_load(f)

        logging.basicConfig(format=self.parameters['logging']['format'])
        self.logger = logging.getLogger("NETS")
        self.logger.level = logging.INFO

        try:
            self.es = Elasticsearch(hosts=[
                {'host': self.parameters['elasticsearch']['host'],
                 'port': self.parameters['elasticsearch']['port']}])
            info = self.es.info()
            self.logger.info("Connected to Elasticsearch v. %s, name: %s" % (info['version']['number'], info['name']))

        except ElasticsearchException:
            self.logger.info("Elasticsearch is not available.")
            exit(0)

    def indexinfo(self, target):
        for item in self.parameters['elasticsearch']['indexes']:
            if item['type'] == target:
                return item['name'], item['doctype']

    # display status check and exit

    def status(self):
        idx_client = IndicesClient(self.es)
        for idx in ['raw-article', 'enhanced-article']:
            es_index = self.indexinfo(idx)[0]
            if idx_client.exists(es_index):
                self.logger.info("%s contains %s documents." % (idx, self.es.count(index=es_index)['count']))
                if idx == 'raw-article':
                    query = {"query": {"term": {"status": 1}}}
                    self.logger.info(
                        "%s articles have been processed." % self.es.count(index=es_index, body=query)['count'])
            else:
                self.logger.info("%s does not exist" % es_index)

    # initialize articles or events index.

    def initialize(self, idx):
        es_index, es_doctype = self.indexinfo(idx)
        self.logger.info("Initializing %s" % es_index)
        idx_client = IndicesClient(self.es)
        if idx_client.exists(es_index):
            idx_client.delete(es_index)
        idx_client.create(es_index)
        if idx == 'event':
            idx_client.put_mapping(doc_type=es_doctype, index=[es_index], body=event_mapping())
        self.logger.info("%s ready." % es_index)

    # find n articles and run them through the pipeline

    def pipeline(self, n):
        self.eventpipeline = Pipeline(self.parameters)
        es_index, es_doctype = self.indexinfo('raw-article')
        self.logger.info("Send %s articles through the pipeline" % n)
        query = '{"query": { "bool": { "must": { "match": { "status" : 0 }}}}}'
        result = self.es.search(index=es_index, doc_type=es_doctype, size=n, body=query)
        articles = result['hits']['hits']

        self.eventpipeline.batch(articles)

    # load articles from json files in a directory

    def load(self):
        self.logger.info("Load articles")
        es_index, es_doctype = self.indexinfo('raw-article')
        path = self.parameters['directories']['articles']
        files = [join(path, f) for f in listdir(path) if isfile(join(path, f))]
        for filename in files:
            with open(filename) as data_file:
                rows = [json.loads(row) for row in data_file.readlines()]
                for index, article in enumerate(rows):
                    if '_id' in article: del article['_id']
                    self.es.index(index=es_index, doc_type=es_doctype, body=article)

    def reset(self, n):
        resetpayload = {"doc": {"status": 0}}

        self.logger.info("reset %s  raw articles" % n)
        es_index, es_doctype = self.indexinfo('raw-article')
        query = '{"query": { "bool": { "must": { "match": { "status": "1" }}}}}'
        result = self.es.search(index=es_index, doc_type=es_doctype, size=n, body=query)
        articles = result['hits']['hits']
        tic = 0
        for article in articles:
            aid = article["_id"]
            status = article["_source"]["status"]
            self.es.update(index=es_index, doc_type=es_doctype, id=aid, body=resetpayload)
            tic = tic + 1
            if tic == 500:
                print("...", tic)
                tic = 0