def import_on_es(csv_enter, data_generator, sep):
    """The aim of this function is to push CPF or Cnet into ElasticSearch."""
    es_client = es(http_compress=True)
    data = pd.read_csv(csv_enter, sep=sep)
    return helpers.bulk(es_client, data_generator(data))
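# Hedged usage sketch for import_on_es(): helpers.bulk() expects data_generator(data) to
# yield one action dict per DataFrame row. The index name "cpf_cnet" and the column
# handling below are assumptions, not taken from the original code.
import pandas as pd
from elasticsearch import Elasticsearch as es, helpers

def row_actions(data):
    # Yield one index action per row of the CSV-backed DataFrame.
    for _, row in data.iterrows():
        yield {"_index": "cpf_cnet", "_source": row.to_dict()}

# import_on_es("cpf.csv", row_actions, sep=";")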
def start_elastic_search():
    # u6250082 Xuguang Song
    '''Start an Elasticsearch client pointed at the local node.'''
    ip_url = ["127.0.0.1"]
    # initialize elastic search
    new_es = es(ip_url, timeout=35, max_retries=8, retry_on_timeout=True)
    return new_es
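# Hedged usage sketch: the client returned above can be sanity-checked with ping(),
# which returns True when the node at 127.0.0.1 answers.
client = start_elastic_search()
print(client.ping())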
def __init__(self):
    self.es = es([{'host': cfg.esHost, 'port': cfg.esPortNo}], retry_on_timeout=True)
    if not self.es.indices.exists(cfg.indexName):
        # Create the index with a single shard and no replicas if it is missing.
        requestBody = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            }
        }
        self.es.indices.create(index=cfg.indexName, body=requestBody)
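# A minimal sketch of the cfg module the constructor above assumes; the values are
# placeholders, not taken from the original code.
# cfg.py
esHost = "localhost"
esPortNo = 9200
indexName = "documents"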
def open_es_conn(hosts=None, port=9200, **kwargs):
    '''
    Create and return an Elasticsearch client. With no hosts the default local
    connection is used; otherwise the given hosts are contacted over SSL on `port`.
    '''
    if not hosts:
        return es()
    return es(
        hosts,
        http_auth=('', ''),
        port=port,
        use_ssl=True,
        verify_certs=False,
        sniff_on_connection_fail=True
    )
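# Hedged usage sketch for open_es_conn(); the hostname and port are placeholders.
local_client = open_es_conn()
remote_client = open_es_conn(hosts=["es-node1.example.com"], port=9243)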
#!/bin/env python
from elasticsearch import Elasticsearch as es
from datetime import datetime, timedelta
import random, sys, uuid

es = es("http://localhost:9200")
# ilo = cu.IndexList(es)

"""
create some sample data, so we can use the cleaner to test the cleaning
"""
list_of_indexes = ["logstash-", "potato-"]
# list_of_indexes = ["logstash-", "application-", "potato-"]
days_of_indexes = -10
logs_per_index = 60
"""
a negative value means: start that many days ago, and fill up until now
"""

def random_message(i=[0]):
    """
    returns a randomly generated message, useful for generating dynamic content.
    """
    nouns = ("puppy", "car", "rabbit", "potato", "monkey", "kitten", "giddy")
    verbs = ("runs", "hits", "jumps", "drives", "barfs", "poops", "sings")
    adv = ("crazily", "dutifully", "foolishly", "occasionally", "playfully", "bravely")
    adj = ("adorable", "clueless", "happy", "odd", "stupid", "cheeky", "lucky")
    num = random.randrange(0, 6)
    sys.exit(-1)

if not (os.path.exists(options.csv_file) and not os.path.isdir(options.csv_file)):
    print("[!] CSV doesn't exist or is a directory")
    sys.exit(-1)

f = open(options.csv_file, 'r')
try:
    reader = csv.DictReader(f)
except Exception:
    print("[!] Error parsing CSV file")
    sys.exit(-1)

try:
    elastic = es([{'host': options.server, 'port': options.port}])
except Exception:
    print("[!] Error connecting to elasticsearch instance")
    sys.exit(-1)

if not elastic.indices.exists(options.index_name):
    print("[!] Index doesn't exist!")
    sys.exit(-1)

found_hits = False
for entry in reader:
    for header_name in entry.keys():
        # Match each CSV value against its column's field in the index.
        results = elastic.search(index=options.index_name,
                                 body={"query": {"match": {header_name: entry[header_name]}}})
        if results["hits"]["total"]:
            found_hits = True
            print("[+] Got %d hits where '%s' = '%s'" % (results["hits"]["total"],
                                                         header_name, entry[header_name]))
def import_data(self,
                index,
                backend_tag_keyword="",
                lte="",
                gte="",
                query="default",
                address="https://datalab.42cloud.io:9200",
                user="******",
                pwd=os.environ.get("private")):
    """
    Import data from Kibana.

    :param index: Name of the elk index to query
    :param backend_tag_keyword: Filter log based on the backend tag keyword
    :param lte: Request log prior to lte date
    :param gte: Request log older than gte date
    :param query: Query to be sent to Kibana. See default query in the code
    :param address: address of the Kibana server
    :param user: user name
    :param pwd: user password
    :return: No return. Loaded data are stored in a structured numpy array in
        self.data with columns "time" and "count"
    """
    if query != "default" and (backend_tag_keyword != "" or lte != "" or gte != ""):
        raise AttributeError(
            "Cannot specify lte, gte, backend_tag_keyword if query is specified")
    if query == "default" and not isinstance(lte, datetime.datetime):
        raise TypeError(
            "lte arg shall be datetime.datetime object (got {})".format(type(lte)))
    if query == "default" and not isinstance(gte, datetime.datetime):
        raise TypeError(
            "gte arg shall be datetime.datetime object (got {})".format(type(gte)))

    aggs_name = "byhour"
    if query == "default":
        lte = lte.strftime("%d/%m/%Y")
        gte = gte.strftime("%d/%m/%Y")
        query = {
            "size": 0,
            "query": {
                "bool": {
                    "must": [
                        {"match": {"backend.tag.keyword": {"query": backend_tag_keyword}}},
                        {"range": {"@timestamp": {"gte": gte, "lte": lte,
                                                  "format": "dd/MM/yyyy"}}}
                    ]
                }
            },
            "aggs": {
                aggs_name: {
                    "date_histogram": {"field": "@timestamp", "interval": "hour"}
                }
            }
        }

    elastic = es(address, http_auth=[user, pwd], verify_certs=False, use_ssl=True)
    result = elastic.search(index=index, body=query)
    result = result["aggregations"][aggs_name]["buckets"]

    self.data = np.zeros(len(result),
                         dtype=[('count', np.float64), ('time', 'datetime64[ms]')])
    for i, element in enumerate(result):
        date = element["key_as_string"]
        # print("{} ; {}".format(element["doc_count"], element["key_as_string"]))
        if date[-1] == 'Z':
            date = date[:-1] + "+0000"
        date = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f%z")
        date = date.replace(tzinfo=datetime.timezone.utc).astimezone(tz=None)
        self.data[i]['time'] = date
        self.data[i]['count'] = element["doc_count"]
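# Hedged usage sketch for import_data(); the class name, index pattern, tag and date
# range below are placeholders, not from the original code.
# importer = LogImporter()
# importer.import_data(index="haproxy-logs-*",
#                      backend_tag_keyword="my-backend",
#                      gte=datetime.datetime(2020, 1, 1),
#                      lte=datetime.datetime(2020, 1, 31))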
print("""
""")

body = {
    'settings': {
        'analysis': {
            'analyzer': {
                # custom analyzer for analyzing file paths
                'myngramanalyzer': {
                    'tokenizer': 'myngramtokenizer',
                }
            },
            'tokenizer': {
                'myngramtokenizer': {
                    'type': 'nGram',
                    'token_chars': ['whitespace']
                }
            }
        }
    }
}

index = 'testindex'

from elasticsearch import Elasticsearch as es
e = es()
e.indices.create(
    index=index,
    body=body,
)
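# Hedged usage sketch: with the index created above, the custom analyzer can be exercised
# through the _analyze API; the sample text is a placeholder.
tokens = e.indices.analyze(index=index,
                           body={"analyzer": "myngramanalyzer",
                                 "text": "/var/log/app/app.log"})
print([t["token"] for t in tokens["tokens"]])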
"""Compare tool. Calculate difference between public API from `elasticsearch` and `aioes`. """ from elasticsearch import Elasticsearch as es from elasticsearch.client.utils import NamespacedClient from aioes import Elasticsearch as aioes es_set = {i for i in dir(es([])) if not i.startswith('_')} aioes_set = {i for i in dir(aioes([])) if not i.startswith('_')} print('-'*70) print('Missing: ', ' '.join(sorted(es_set - aioes_set))) print('Extra: ', ' '.join(sorted(aioes_set - es_set))) obj = es([]) obj2 = aioes([]) for sub in dir(obj): if sub.startswith('_'): continue val = getattr(obj, sub) if isinstance(val, NamespacedClient): left = {i for i in dir(val) if not i.startswith('_')} val2 = getattr(obj2, sub, None) if val2 is None: continue right = {i for i in dir(val2) if not i.startswith('_')} print(' '*6, sub)
from elasticsearch import Elasticsearch as es
import config
import datetime
import mod

cli = es([config.es_address])  # creating client

body_map = {  # setting body for index mapping
    "properties": {
        "url": {"type": "keyword"},
        "time": {"type": "date"}
    }
}

body_sort = {  # setting body for index sorting, in case of query
    "size": mod.count,
    "query": {"match_all": {}},
    "sort": {"time": "asc"}
}

def create():
    if cli.indices.exists(index='crawled_urls'):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch as es
from bs4 import BeautifulSoup as bs
from helpers import get_html_from_folder
import os

tag = 'span'
config = {'index': 'dataset', 'doc_type': 'data'}

if __name__ == "__main__":
    es = es()
    path = os.getcwd() + "/pdftxt/htmls/"
    htmls = get_html_from_folder(path)
    id = 0
    for html in htmls:
        with open(path + html, 'r') as f:
            data = f.read().strip()
        soup = bs(data, 'lxml')
        tags = soup.findAll(tag)
        for n, t in enumerate(tags):
            tag_style = t.attrs['style']
            tag_style = tag_style.split()
            doc_id = str(id)
            print("id: ", id, "html: ", html, "style: ", tag)
def __init__(self, hosts):
    self.client = es(hosts=[hosts])
def open_es(self):
    self.e = es()
    return self.e
import gzip, multiprocessing
from lxml import etree
from elasticsearch import Elasticsearch as es
from multiprocessing import Pool
from dataclasses import dataclass

POOL_THREAD_COUNT = 10
MAP_POOL_GENPULL_CHUNKSIZE = 50

elastic_search = es(["127.0.0.1"], timeout=35, max_retries=8, retry_on_timeout=True)

@dataclass
class Abstract:
    """Wikipedia abstract"""
    ID: int
    title: str
    abstract: str
    url: str

    @property
    def fulltext(self):
        return ' '.join([self.title, self.abstract])

"""
Need to create the index first before using this script
curl -XPUT https://localhost:9200/example3 --insecure -u admin:admin