import pandas as pd
from elasticsearch import Elasticsearch as es
from elasticsearch import helpers


def import_on_es(csv_enter, data_generator, sep):
    """
    Push CPF or Cnet records from a CSV file into Elasticsearch,
    bulk-indexing the actions produced by data_generator.
    """
    es_client = es(http_compress=True)
    data = pd.read_csv(csv_enter, sep=sep)
    return helpers.bulk(es_client, data_generator(data))
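
# A minimal sketch of a generator compatible with helpers.bulk, assuming a
# hypothetical target index "cpf"; the _index/_source keys are the standard
# helpers.bulk action format.
def example_row_generator(df):
    for _, row in df.iterrows():
        yield {
            "_index": "cpf",
            "_source": row.to_dict(),
        }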
Example #2
from elasticsearch import Elasticsearch as es


def start_elastic_search():
    # u6250082 Xuguang Song
    '''Create an Elasticsearch client for the local node.'''

    ip_url = ["127.0.0.1"]
    # initialize the client with a generous timeout and retries
    new_es = es(ip_url, timeout=35, max_retries=8, retry_on_timeout=True)
    return new_es
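
# Usage sketch, assuming a node is actually listening on 127.0.0.1:9200:
client = start_elastic_search()
print(client.ping())  # True when the cluster is reachable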
Example #3
 def __init__(self):
     self.es = es([{
         'host': cfg.esHost,
         'port': cfg.esPortNo
     }],
                  retry_on_timeout=True)
     if not self.es.indices.exists(cfg.indexName):
         requestBody = {
             "settings": {
                 "number_of_shards": 1,
                 "number_of_replicas": 0
             }
         }
         self.es.indices.create(index=cfg.indexName, body=requestBody)
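
# A hypothetical cfg this snippet assumes; the values are illustrative
# placeholders, not the original configuration:
class cfg:
    esHost = "127.0.0.1"
    esPortNo = 9200
    indexName = "my-index"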
Example #4
def open_es_conn(hosts=None, port=9200):
    '''
    Create a default Elasticsearch client (which establishes the
    connection to the cluster) and return it.
    '''
    if not hosts:
        # no hosts given: connect to the default localhost:9200
        e = es()
    else:
        e = es(
            hosts,
            http_auth=('', ''),
            port=port,
            use_ssl=True,
            verify_certs=False,
            sniff_on_connection_fail=True
        )
    return e
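
# Usage sketch; "es.example.com" is an illustrative host, and the empty
# http_auth credentials above would need real values in practice:
local_client = open_es_conn()
remote_client = open_es_conn(hosts=["es.example.com"], port=9200)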
Example #5
#!/usr/bin/env python
from elasticsearch import Elasticsearch as es
from datetime import datetime, timedelta
import random, sys, uuid

es_client = es("http://localhost:9200")
# ilo = cu.IndexList(es_client)
"""
create some sample data, so we can use the cleaner to test the cleaning
"""

list_of_indexes = ["logstash-", "potato-"]
# list_of_indexes = ["logstash-", "application-", "potato-"]
days_of_indexes = -10
logs_per_index = 60
"""
a negative value starts that many days in the past, so -10 runs from
10 days ago until now
"""


def random_message(i=[0]):
    """
    returns a randomly generated message, useful for generating
    dynamic content.
    """
    nouns = ("puppy", "car", "rabbit", "potato", "monkey", "kitten", "giddy")
    verbs = ("runs", "hits", "jumps", "drives", "barfs", "poops", "sings")
    adv = ("crazily", "dutifully", "foolishly", "occasionally", "playfully",
           "bravely")
    adj = ("adorable", "clueless", "happy", "odd", "stupid", "cheeky", "lucky")
    num = random.randrange(0, 6)
Example #6
    sys.exit(-1)

if not (os.path.exists(options.csv_file) and not os.path.isdir(options.csv_file)):
    print("[!] CSV doesn't exist or is a directory")
    sys.exit(-1)


f = open(options.csv_file, 'r')
try:
    reader = csv.DictReader(f)
except Exception:
    print("[!] Error parsing CSV file")
    sys.exit(-1)

try:
    elastic = es([{'host': options.server, 'port': options.port}])
except Exception:
    print("[!] Error connecting to elasticsearch instance")
    sys.exit(-1)

if not elastic.indices.exists(options.index_name):
    print("[!] Index doesn't exist!")
    sys.exit(-1)

found_hits = False
for entry in reader:
    for header_name in entry.keys():
        results = elastic.search(index=options.index_name, body={"query": {"match": {header_name: entry[header_name]}}})
        if results["hits"]["total"]:
            found_hits = True
            print("[+] Got %d hits where '%s' = '%s'" % (results["hits"]["total"], header_name, entry[header_name]))
Example #7
 def import_data(self,
                 index,
                 backend_tag_keyword="",
                 lte="",
                 gte="",
                 query="default",
                 address="https://datalab.42cloud.io:9200",
                 user="******",
                 pwd=os.environ.get("private")):
     """
     Import data from Kibana.
     :param index: Name of the elk index to query
     :param backend_tag_keyword: Filter logs based on the backend tag keyword
     :param lte: Return logs dated on or before lte
     :param gte: Return logs dated on or after gte
     :param query: Query to be sent to Kibana. See the default query in the code
     :param address: Address of the Kibana server
     :param user: User name
     :param pwd: User password
     :return: No return value. Loaded data are stored in a structured
     numpy array in self.data with columns "time" and "count"
     """
     if query != "default" and (backend_tag_keyword != "" or lte != ""
                                or gte != ""):
         raise AttributeError(
             "Cannot specify lte, gte, backend_tag_keyword if query is specified"
         )
     if query == "default" and not isinstance(lte, datetime.datetime):
         raise TypeError(
             "lte arg shall be datetime.datetime object (got {})".format(
                 type(lte)))
     if query == "default" and not isinstance(gte, datetime.datetime):
         raise TypeError(
             "gte arg shall be datetime.datetime object (got {})".format(
                 type(gte)))
     aggs_name = "byhour"
     if query == "default":
         lte = lte.strftime("%d/%m/%Y")
         gte = gte.strftime("%d/%m/%Y")
         query = {
             "size": 0,
             "query": {
                 "bool": {
                     "must": [{
                         "match": {
                             "backend.tag.keyword": {
                                 "query": backend_tag_keyword
                             }
                         }
                     }, {
                         "range": {
                             "@timestamp": {
                                 "gte": gte,
                                 "lte": lte,
                                 "format": "dd/MM/yyyy"
                             }
                         }
                     }]
                 }
             },
             "aggs": {
                 aggs_name: {
                     "date_histogram": {
                         "field": "@timestamp",
                         "interval": "hour"
                     }
                 }
             }
         }
     elastic = es(address,
                  http_auth=[user, pwd],
                  verify_certs=False,
                  use_ssl=True)
     result = elastic.search(index=index, body=query)
     result = result["aggregations"][aggs_name]["buckets"]
     self.data = np.zeros(len(result),
                          dtype=[('count', np.float64),
                                 ('time', 'datetime64[ms]')])
     for i, element in enumerate(result):
         date = element["key_as_string"]
         #            print("{} ; {}".format(element["doc_count"], element["key_as_string"]))
         if date[-1] == 'Z':
             date = date[:-1] + "+0000"
         date = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f%z")
         date = date.replace(tzinfo=datetime.timezone.utc).astimezone(
             tz=None)
         self.data[i]['time'] = date
         self.data[i]['count'] = element["doc_count"]
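
# Usage sketch, assuming `loader` is an instance of the enclosing class;
# the index name and date range are illustrative:
import datetime
loader.import_data(index="haproxy-*",
                   backend_tag_keyword="my-backend",
                   gte=datetime.datetime(2020, 1, 1),
                   lte=datetime.datetime(2020, 1, 31))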
Example #8
print("""   

    """)
body = {
    'settings': {
        'analysis': {
            'analyzer': {
                # custom analyzer for analyzing file paths
                'myngramanalyzer': {
                    'tokenizer': 'myngramtokenizer',
                }
            },
            'tokenizer': {
                'myngramtokenizer': {
                    'type': 'nGram',
                    'token_chars': ['whitespace']
                }
            }
        }
    }
}
index = 'testindex'

from elasticsearch import Elasticsearch as es
e = es()
e.indices.create(
    index=index,
    body=body,
)
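
# Sketch: exercise the custom analyzer once the index exists; the sample
# text is illustrative (indices.analyze is a standard client call):
tokens = e.indices.analyze(
    index=index,
    body={"analyzer": "myngramanalyzer", "text": "usr/local/bin"},
)
print([t["token"] for t in tokens["tokens"]])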
Example #9
"""Compare tool.

Calculate difference between public API from `elasticsearch` and `aioes`.
"""

from elasticsearch import Elasticsearch as es
from elasticsearch.client.utils import NamespacedClient
from aioes import Elasticsearch as aioes

es_set = {i for i in dir(es([])) if not i.startswith('_')}
aioes_set = {i for i in dir(aioes([])) if not i.startswith('_')}

print('-'*70)
print('Missing: ', ' '.join(sorted(es_set - aioes_set)))
print('Extra: ', ' '.join(sorted(aioes_set - es_set)))


obj = es([])
obj2 = aioes([])

for sub in dir(obj):
    if sub.startswith('_'):
        continue
    val = getattr(obj, sub)
    if isinstance(val, NamespacedClient):
        left = {i for i in dir(val) if not i.startswith('_')}
        val2 = getattr(obj2, sub, None)
        if val2 is None:
            continue
        right = {i for i in dir(val2) if not i.startswith('_')}
        print(' '*6, sub)
Example #10
from elasticsearch import Elasticsearch as es
import config
import datetime
import mod

cli = es([config.es_address])  # create the Elasticsearch client

body_map = {  # mapping body for index creation
    "properties": {
        "url": {
            "type": "keyword"
        },
        "time": {
            "type": "date"
        }
    }
}

body_sort = {  # query body for fetching documents sorted by time
    "size": mod.count,
    "query": {
        "match_all": {}
    },
    "sort": {
        "time": "asc"
    }
}
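
# Sketch: body_sort can back a "fetch oldest crawled URLs" query, assuming
# the 'crawled_urls' index created below exists:
def oldest_urls():
    return cli.search(index='crawled_urls', body=body_sort)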


def create():
    if cli.indices.exists(index='crawled_urls'):
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch as es
from bs4 import BeautifulSoup as bs
from helpers import get_html_from_folder
import os

tag = 'span'
config = {'index': 'dataset',
          'doc_type': 'data'}



if __name__ == "__main__":

    es_client = es()
    path = os.getcwd() + "/pdftxt/htmls/"
    htmls = get_html_from_folder(path)
    id = 0
    for html in htmls:
        with open(path + html, 'r') as f:
            data = f.read().strip()

        soup = bs(data, 'lxml')
        tags = soup.findAll(tag)

        for n, t in enumerate(tags):
            tag_style = t.attrs['style']
            tag_style = tag_style.split()
            doc_id = str(id)
            print "id: ", id, "html: ", html, "style: ", tag
Example #12
 def __init__(self, hosts):
     self.client = es(hosts=[hosts])
Example #13
 def open_es(self):
     self.e = es()
     return self.e
Example #14
import gzip, multiprocessing
from lxml import etree
from elasticsearch import Elasticsearch as es
from multiprocessing import Pool
from dataclasses import dataclass

POOL_THREAD_COUNT = 10
MAP_POOL_GENPULL_CHUNKSIZE = 50

elastic_search = es(["127.0.0.1"],
                    timeout=35,
                    max_retries=8,
                    retry_on_timeout=True)


@dataclass
class Abstract:
    """Wikipedia abstract"""
    ID: int
    title: str
    abstract: str
    url: str

    @property
    def fulltext(self):
        return ' '.join([self.title, self.abstract])
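
# Sketch: index a single Abstract; the "example3" index matches the curl
# command referenced below, and the sample values are illustrative:
doc = Abstract(ID=1, title="Python", abstract="Python is a language.",
               url="https://en.wikipedia.org/wiki/Python_(programming_language)")
elastic_search.index(index="example3", id=doc.ID,
                     body={"title": doc.title, "abstract": doc.abstract,
                           "url": doc.url, "fulltext": doc.fulltext})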


"""
Need to create the index first before using this script
curl -XPUT https://localhost:9200/example3 --insecure -u admin:admin