def setup_elasticsearch_connection(self) -> None:
    _LOGGER.debug("Setting up Elasticsearch connection.")
    if not self.elasticsearch.ca_crt_path.exists():
        raise FileNotFoundError(
            f"CA-Certificate '{self.elasticsearch.ca_crt_path}' could not be found."
            " Configuration without a certificate is not supported at this time."
        )
    connections.create_connection(
        hosts=[{"host": self.elasticsearch.host, "port": self.elasticsearch.port}],
        timeout=self.elasticsearch.timeout,
        retry_on_timeout=self.elasticsearch.retry_on_timeout,
        max_retries=self.elasticsearch.max_retries,
        http_compress=self.elasticsearch.http_compress,
        scheme="https",
        use_ssl=True,
        http_auth=(
            self.elasticsearch.user,
            self.elasticsearch.password.get_secret_value(),
        ),
        verify_certs=True,
        ssl_show_warn=True,
        ca_certs=str(self.elasticsearch.ca_crt_path),
        ssl_assert_hostname=self.elasticsearch.host,
    )
def ready(self):
    from .documents import EventDocument

    connections.create_connection(hosts=[settings.ELASTICSEARCH_URL], timeout=20)
    EventDocument.init()
def processHashList(hashList):
    """hash_data main module"""
    results = dict()
    outResults = dict()
    fileDate = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")
    outFile = f"{app.config['BASE_DIR']}/templates/output/{app.config['QUERY_TAG']}_{fileDate}.json"

    # Define the default Elasticsearch client
    connections.create_connection(hosts=[app.config['ELASTICSEARCH_HOST']])

    # Set the number of worker processes to use and cap it
    # so we don't blow out the minute points on AutoFocus
    multiProcNum = (app.config['TORT_POOL_COUNT']
                    if app.config['TORT_POOL_COUNT'] <= 2 else 2)

    # DEBUG_MODE means we only run 1 at a time, rather than multi-processing
    if not app.config['DEBUG_MODE']:
        # Multiprocess the hashes
        app.logger.debug(f"Running hashes through on {multiProcNum} processes")
        with Pool(multiProcNum) as pool:
            results = pool.map(getHashInfo, hashList)
        return storeResults(results, outFile)
    else:
        for hashData in hashList:
            results = getHashInfo(hashData)
            outResults.update(storeResults(results, outFile))

    if "text" in app.config['OUTPUT_TYPE']:
        return f"{outFile}"
    else:
        return outResults
def __init__(self):
    # Create default connection
    connections.create_connection(
        hosts=[keys.ELASTICSEARCH_ENDPOINT_URL],
        timeout=20,
        http_auth=(keys.ELASTICSEARCH_USERNAME, keys.ELASTICSEARCH_PASSWORD),
        use_ssl=True,
        verify_certs=True,
        ca_certs=certifi.where()
    )

    # ES DSL method
    self.es_client = None
    try:
        self.es_client = Elasticsearch(
            [keys.ELASTICSEARCH_CLUSTER_URL],
            http_auth=(keys.ELASTICSEARCH_USERNAME, keys.ELASTICSEARCH_PASSWORD),
            port=keys.ELASTICSEARCH_CLUSTER_PORT,
            use_ssl=True,
            verify_certs=True,
            ca_certs=certifi.where())
        print("clsElasticIndex.__init__: Connected to elastic server")
        # print(self.es_client)
    except Exception as ex:
        print("Error:")
        print(ex)
def run(self, corpus, index_name="fact_corpus", document_class=Fact, **kwargs):
    connections.create_connection(hosts=["localhost"])
    document_class.init()
    documents = (
        document_class(meta={"id": id}, fact=doc["fact"]).to_dict(True)
        for id, doc in corpus.items()
    )
    logger.info(f"Building corpus index for {index_name}")
    # RayExecutor().run(documents, self.save_data, {})
    for success, info in tqdm(
        parallel_bulk(
            connections.get_connection(),
            documents,
            thread_count=kwargs.pop("batch_size", multiprocessing.cpu_count()),
            chunk_size=100000,
            max_chunk_bytes=2 * 1024 ** 3,
        )
    ):
        if not success:
            logger.error(f"A document failed: {info}")
    logger.success("Elastic index successfully built")
    return index_name
def sync_posts():
    # establish connection to postgres
    conn = psycopg2.connect(
        "dbname='{}' user='{}' host='{}' password='{}'".format(
            config.DB_DATABASE, config.DB_USER, config.DB_HOST, config.DB_PASSWORD))

    # establish connection to elasticsearch
    connections.create_connection(hosts=[config.ELASTICSEARCH_HOST], timeout=20)

    cur = conn.cursor()
    cur.execute("""SELECT * from articles""")
    columns = [desc[0] for desc in cur.description]
    rows = cur.fetchall()

    for article in rows:
        article = {key: value for key, value in zip(columns, article)}
        document = ArticleDoc.get(str(article['id']), ignore=404)
        if not document:
            document = ArticleDoc()
            document.meta.id = str(article['id'])
        document.title = article['title']
        document.description = article['description']
        document.is_published = article['is_published']
        document.created_at = article['created_at']
        document.save()

    click.echo('Posts were successfully synced to ElasticSearch!')
def processHashList(hashList, outputType, queryTag, hashType, apiKey):
    """hash_data main module"""
    results = dict()
    outResults = dict()
    fileDate = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")
    outFile = f"/tmp/{queryTag}_{fileDate}.json"

    # Define the default Elasticsearch client
    connections.create_connection(
        hosts=cnc_utils.get_config_value('ELASTICSEARCH_HOST', 'localhost'))

    # Set the number of worker processes to use and cap it
    # so we don't blow out the minute points on AutoFocus
    # multiProcNum = (app.config['TORT_POOL_COUNT']
    #                 if app.config['TORT_POOL_COUNT'] <= 2 else 2)

    # DEBUG_MODE means we only run 1 at a time, rather than multi-processing
    # if app.config['DEBUG_MODE'] != True:
    #     # Multiprocess the hashes
    #     print(f"Running hashes through on {multiProcNum} processes")
    #     with Pool(multiProcNum) as pool:
    #         results = pool.map(getHashInfo, hashList)
    #     return storeResults(results, outFile)
    # else:
    for hashData in hashList:
        results = getHashInfo(hashData, outputType, queryTag, apiKey)
        outResults.update(storeResults(results, outFile, outputType))

    if "text" in outputType:
        return f"{outFile}"
    else:
        return outResults
def connect_to_server(server: str, cookie: str):
    if "_oauth2_proxy" not in cookie:
        cookie = "_oauth2_proxy=" + cookie

    connections.create_connection(
        hosts=[server],
        headers={"cookie": cookie} if cookie else None,
        connection_class=RequestsHttpConnection)
def __init__(self, settings):
    self.database = settings['HTTPCACHE_ES_DATABASE']
    self.database_host = settings.get('HTTPCACHE_HOST', '127.0.0.1')
    connections.create_connection(hosts=[self.database_host])
    WebLink.init()
    self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
def set_hosts(hosts, use_ssl=False, ssl_cert_path=None, username=None,
              password=None, timeout=60.0):
    """
    Sets the Elasticsearch hosts to use

    Args:
        hosts (str): A single hostname or URL, or list of hostnames or URLs
        use_ssl (bool): Use an HTTPS connection to the server
        ssl_cert_path (str): Path to the certificate chain
        username (str): The username to use for authentication
        password (str): The password to use for authentication
        timeout (float): Timeout in seconds
    """
    if type(hosts) != list:
        hosts = [hosts]
    conn_params = {"hosts": hosts, "timeout": timeout}
    if use_ssl:
        conn_params['use_ssl'] = True
        if ssl_cert_path:
            conn_params['verify_certs'] = True
            conn_params['ca_certs'] = ssl_cert_path
        else:
            conn_params['verify_certs'] = False
    if username:
        conn_params['http_auth'] = (username + ":" + password)
    connections.create_connection(**conn_params)
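# A minimal usage sketch for set_hosts() above. The hostname, certificate
# path, and credentials are illustrative placeholders, not values from the
# original code.
set_hosts(
    "es.example.internal",
    use_ssl=True,
    ssl_cert_path="/etc/ssl/certs/es-ca-chain.pem",
    username="elastic",
    password="changeme",
    timeout=30.0,
)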
def main(self) -> None:
    try:
        instance.config = ConfigParser()
        instance.config.read_dict(defaultconfig())
        logger().info('Started eisp with pid %s', getpid())
        for i in [i for i in argv if i.startswith('--')]:
            try:
                mod('eisp.param.{}'.format(i[2:])).__dict__[i[2:]](argv)
            except:
                exit('Invalid parameter or argument to {}'.format(i[2:]))
        conf = dotdict(instance.config['data'])
        connections.create_connection(hosts=[conf.host])
        delete_index(conf.index_name)
        create_index(conf.elastic_mapping, conf.index_name)
        for ok, info in helpers.parallel_bulk(connections.get_connection(),
                                              actions=index_pdfs(conf.index_name, conf.root),
                                              request_timeout=60,
                                              chunk_size=100,
                                              thread_count=8,
                                              queue_size=8):
            if not ok:
                print(info)
    except KeyboardInterrupt:
        print('\N{bomb}')
    except Exception as exception:
        logger().exception(exception)
    except SystemExit as exception:
        logger().critical(str(exception))
def set_hosts(hosts, use_ssl=False, ssl_cert_path=None, ssl_client_cert=None,
              ssl_client_key=None, timeout=60.0):
    """
    Sets the Elasticsearch hosts to use

    Args:
        hosts (str): A single hostname or URL, or list of hostnames or URLs
        use_ssl (bool): Use an HTTPS connection to the server
        ssl_cert_path (str): Path to the certificate chain
        ssl_client_cert (str): Path to the client certificate
        ssl_client_key (str): Path to the client key
        timeout (float): Timeout in seconds
    """
    if type(hosts) != list:
        hosts = [hosts]
    conn_params = {"hosts": hosts, "timeout": timeout}
    if use_ssl:
        conn_params['use_ssl'] = True
        if ssl_cert_path:
            conn_params['verify_certs'] = True
            conn_params['ca_certs'] = ssl_cert_path
        else:
            conn_params['verify_certs'] = False
        if ssl_client_cert:
            conn_params["client_cert"] = ssl_client_cert
            conn_params["client_key"] = ssl_client_key
            conn_params["connection_class"] = RequestsHttpConnection
    print("=" * 72)
    print("\n")
    print(conn_params)
    connections.create_connection(**conn_params)
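# A hedged sketch of calling the client-certificate variant of set_hosts()
# above for mutual TLS; every path here is a hypothetical placeholder, not a
# value taken from the original code.
set_hosts(
    "https://es.example.internal:9200",
    use_ssl=True,
    ssl_cert_path="/etc/ssl/certs/es-ca-chain.pem",
    ssl_client_cert="/etc/ssl/private/es-client.crt",
    ssl_client_key="/etc/ssl/private/es-client.key",
)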
def __init__(self, server, index):
    self._server = server
    self._index_name = index
    connections.create_connection(hosts=[self._server])
    self._init_index()
def handle(self, *args, **options):
    print(ELASTIC_SEARCH_ROOT)
    connections.create_connection(hosts=[ELASTIC_SEARCH_ROOT])
    # es = Elasticsearch(ELASTIC_SEARCH_ROOT, use_ssl=True, ca_certs=certifi.where())
    es = Elasticsearch(ELASTIC_SEARCH_ROOT, ca_certs=certifi.where())
    es.indices.delete(index='acronyms', ignore=[400, 404])
    AcronymIndex.init()
def ready(self):
    from . import signal_receivers  # noqa
    from elasticsearch_dsl import connections

    from .settings import app_settings

    host = app_settings.CONNECTION['HOST'] or 'localhost'
    connections.create_connection(hosts=[host, ])
def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)
    connections.create_connection(hosts=[os.environ['ELASTICSEARCH_HOST']],
                                  http_auth=(os.environ['ELASTICSEARCH_USER'],
                                             os.environ['ELASTICSEARCH_PASSWORD']),
                                  timeout=20)
    self.client = Elasticsearch()
def ready(self):
    connections.create_connection(settings.ES_SETTINGS["ALIAS"], hosts=[{
        "host": settings.ES_SETTINGS["HOST"],
        "port": settings.ES_SETTINGS["PORT"]
    }])
def connect():
    hosts = elasticsearch_settings.get('hosts')
    scheme = elasticsearch_settings.get('scheme')
    port = elasticsearch_settings.get('port')
    LogService.info("Using ElasticSearch hosts: \"%s\" via %s/%i" % (hosts, scheme, port))
    connections.create_connection(scheme=scheme, hosts=hosts, port=port)
def search_for_es(search_word, n):
    page_size = 20
    connections.create_connection(hosts=['localhost:1235'])
    s = Specs.search()
    mul_match = MultiMatch(query=search_word, fields=['name', 'factory'])
    res = s.query(mul_match)[(n - 1) * page_size:n * page_size]
    for one in res:
        print(one.name)
def runLambda(q: Q, f):
    connections.create_connection()
    s = Stock.search()
    r = s.query(q)
    # for stock in r.scan():
    stock = Stock.get(id="FR0000076887")
    f(stock)
    stock.save()
def init_elasticsearch_con(host, user=None, password=None, port=None,
                           index_prefix=None, tls=None):
    http_auth = None
    # Set authentication parameters if available
    if user and password:
        http_auth = (user, password)

    if port is None:
        port = DEFAULT_ES_PORT

    # Create ssl context if enabled
    ssl_context = None
    tls = collections.ChainMap(tls or {}, DEFAULT_TLS_CONFIG)
    if tls["enabled"]:
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = tls["check_hostname"]
        ssl_context.verify_mode = getattr(ssl, tls["verify_mode"])

    # Somehow the SSL context is not enough, we must also pass use_ssl=True
    use_ssl = ssl_context is not None

    connections.create_connection(
        host=host,
        http_auth=http_auth,
        port=port,
        use_ssl=use_ssl,
        ssl_context=ssl_context,
    )

    # NOTE (felix): Hack to override the index names with the prefix from the config.
    # TODO (felix): Remove this once https://github.com/elastic/elasticsearch-dsl-py/pull/1099
    # is merged and use the pattern described in the elasticsearch-dsl documentation:
    # https://elasticsearch-dsl.readthedocs.io/en/latest/persistence.html#index
    #
    # Unfortunately, this pattern currently only works for document.init(),
    # while the search() and save() methods will still use the original index name
    # set in the index meta class.
    # This unexpected behaviour is also described in
    # https://github.com/elastic/elasticsearch-dsl-py/issues/1121 and
    # https://github.com/elastic/elasticsearch-dsl-py/issues/1091.
    if index_prefix:
        # If the user set a '-' at the end of the prefix, we don't want to end
        # up with messy index names
        index_prefix = index_prefix.rstrip("-")
        for idx_cls in [ZuulJob, AnsibleRole, ZuulTenant, GitRepo]:
            # NOTE (felix): Index.name seems to hold the constant value that we defined
            # in our index meta class for the document. _index._name on the other hand
            # holds the active value. Thus, we can use this to ensure that the prefix
            # is only prepended once, even if we call this method multiple times.
            idx_cls._index._name = "{}-{}".format(index_prefix, idx_cls.Index.name)

    ZuulJob.init()
    AnsibleRole.init()
    ZuulTenant.init()
    GitRepo.init()
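# A hedged sketch of how init_elasticsearch_con() above might be called; the
# host, credentials, prefix, and TLS settings are illustrative assumptions,
# not values taken from the original code.
init_elasticsearch_con(
    "es.example.internal",
    user="zuul",
    password="changeme",
    port=9200,
    index_prefix="ci-",
    tls={
        "enabled": True,
        "check_hostname": True,
        # Resolved via getattr(ssl, ...), so this must name an ssl module constant
        "verify_mode": "CERT_REQUIRED",
    },
)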
def main(use_elasticsearch=True, calculate_PageRank=False, tele_const=0.2):
    """
    main entry for the indexer module.
    """
    jsons_root_dir = 'JSONs201806101057/'

    # list of addresses of all json files
    all_json_dirs = glob.glob(unicode(jsons_root_dir + '*.json'))

    # first reading all json files
    jsons = []
    for jdir in all_json_dirs:
        with open(jdir, 'r') as f:
            jsn = json.load(f)
        jsons.append(jsn)
    print len(jsons), ' json files imported.'

    # now creating a set of all links and then a list of all links in json files
    print 'creating a list of all links'
    links_set = set()
    for js in jsons:
        links_set.add(js["url"])
        for l in js["outlinks"]:
            links_set.add(l)
    print len(links_set), ' links found'
    links = list(links_set)

    ## if user has selected to index documents using Elasticsearch
    # Note that when using Elasticsearch, page rank is ignored
    if use_elasticsearch:
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections

        print 'Using Elasticsearch for indexing, PageRank is ignored'
        es = Elasticsearch(serializer=JSONSerializerPython2())
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        Book.init('book-index')

        ## adding all documents to the index 'book-index'
        for idx, js in enumerate(jsons):
            if len(js['type']) == 0:
                js['type'] = ['missing']
            print idx
            print js['title']
            book = Book(average=js['average'],
                        cover=js['cover'],
                        description=js['description'].encode('utf-8', "replace"),
                        ratings=js['ratings'],
                        reviews=js['reviews'],
                        title=js['title'],
                        url=js['url'],
                        outlinks=js['outlinks'],
                        type=js['type'])
            book.add_authors(js['authors'])
            book.add_userreviews(js['userreviews'])
            book.id = idx
            book.save()
        print 'Elasticsearch index created'

    ### use pyLucene instead
    else:
        """
def elastic_search(context):
    """Behave fixture method.

    Sets up the Elasticsearch connection and makes sure our index is available.
    """
    print("Setting up Elastic Search")
    userdata = context.config.userdata
    authentication = userdata.get("esUser") + ":" + userdata.get("esPassword")
    connections.create_connection(hosts=[userdata.get("esHost")],
                                  http_auth=(authentication),
                                  timeout=20)
def index_create():
    # establish connection to elasticsearch
    connections.create_connection(hosts=[config.ELASTICSEARCH_HOST], timeout=20)

    if ArticleDoc._index.exists():
        ArticleDoc._index.delete()
    ArticleDoc.init()
    click.echo('Index was successfully created!')
def __init__(self, CONNECTION_URI=None, database_name=None, collection_name=None):
    self.CONNECTION_URI = CONNECTION_URI
    self.database_name = database_name
    self.collection_name = collection_name
    connections.create_connection(hosts=[self.CONNECTION_URI])
    self.WebLinkExtracted = self.setup_collection()
    self.WebLinkExtracted.init()
def main(): """Main entrypoint for example script querying ElasticSearch. """ parser = argparse.ArgumentParser( description= "Script runnig several example queries against Elasticsearch.") parser.add_argument( "--host", default="localhost", help="[Optional] Elasticsearch host to connect to (default: localhost)" ) parser.add_argument( "--port", default=9200, help="[Optional] Elasticsearch port to connect to (default: 9200)") parser.add_argument( "--cacert", help="[Optional] CA cert file in PEM format (if required)") parser.add_argument("--ssl", default=True, type=bool, help="[Optional] Set to false to use plaintext HTTP") parser.add_argument( "--user", default="elastic", help="[Optional] User account to bind to (default: elastic)") parser.add_argument( "--password", default=None, help="[Optional] Password for user account to bind to (default: None)") parser.add_argument( "--index", default="staging*", help="[Optional] Name of Elasticsearch index (default: staging*)") args = parser.parse_args() if not args.password and args.user: args.password = getpass("Password for " + args.user + ": ") # URL encode the user and password to enable it to used with HTTP BASIC auth safely enc_user = urllib.parse.quote_plus(args.user) enc_password = urllib.parse.quote_plus(args.password) # Create default connection to Elasticsearch instance connections.create_connection(hosts=[{ "host": args.host, "http_auth": enc_user + ":" + enc_password, "port": args.port, "timeout": 20, "use_ssl": args.ssl, "verify_certs": bool(args.cacert), "ca_certs": args.cacert, }]) query_failed_tests(args.index) query_for_successful_job(args.index)
def __init__(self, host, index, user=AnonymousUser()):
    self.host = host
    self.index = index
    self.user = user
    self.search = Search(using='default', index=index)
    connections.create_connection(alias='default', hosts=[
        self.host,
    ], timeout=60)
def set_hosts(hosts): """ Sets the Elasticsearch hosts to use Args: hosts: A single hostname or URL, or list of hostnames or URLs """ if type(hosts) != list: hosts = [hosts] connections.create_connection(hosts=hosts, timeout=20)
def __init__(self, connection_uri=None, database_name=None, collection_name=None):
    self.connection_uri = connection_uri
    self.database_name = database_name
    self.collection_name = collection_name
    connections.create_connection(hosts=[self.connection_uri])
    self.WebLinkExtracted = self.setup_collection()
    self.WebLinkExtracted.init()
def trigger_upload(url, csv_file, index_name):
    index = Index(index_name)
    index.settings = {"number_of_shards": 1, "number_of_replicas": 0}

    # index schema
    class QA(InnerDoc):
        ans_id = Integer()
        ans_str = Text(fields={'raw': Keyword()})
        query_id = Integer()
        query_str = Text()

    @index.document
    class Doc(Document):
        doc = Text()
        created_at = Date()
        qa_pair = Nested(QA)

        def add_qa_pair(self, ans_id, ans_str, query_id, query_str):
            self.qa_pair.append(
                QA(ans_id=ans_id, ans_str=ans_str,
                   query_id=query_id, query_str=query_str))

        def save(self, **kwargs):
            self.created_at = datetime.now()
            return super().save(**kwargs)

    # connect to ES instance and start indexing
    connections.create_connection(hosts=[url])
    qa_pairs = pd.read_csv(csv_file).fillna('nan').to_dict('records')

    print('uploading docs')
    counter = 0
    st.markdown('Progress Bar')
    progress_bar = st.progress(0)
    qa_pairs_len = len(qa_pairs)
    chunks = 1 / qa_pairs_len
    print(chunks)

    for i, pair in enumerate(qa_pairs):
        first = Doc(doc=pair['ans_str'])
        print(first)
        first.add_qa_pair(pair['ans_id'], pair['ans_str'],
                          pair['query_id'], pair['query_str'])
        first.save()
        counter += 1
        chunks = chunks + (i / 10)
        if (chunks > 1):
            progress_bar.progress(chunks - 1)
            progress_bar.progress(100)
            break
        progress_bar.progress(chunks)

    print("indexing finished")
    print(f'indexed {counter} documents')
    return 'Done'
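# A hedged sketch of querying the index built by trigger_upload() above with a
# nested query on the qa_pair field. It assumes the default connection already
# exists; the index name and search text are illustrative placeholders.
from elasticsearch_dsl import Q, Search

def search_qa(index_name, text):
    # nested query so the match runs against individual qa_pair objects
    s = Search(index=index_name).query(
        "nested",
        path="qa_pair",
        query=Q("match", qa_pair__query_str=text),
    )
    for hit in s.execute():
        print(hit.meta.score, hit.doc)

# search_qa("my-qa-index", "how do I reset my password")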
def PullUniques(self, field_name):
    self.dataDict.clear()
    self.fieldname = field_name

    # initiate the default connection to elasticsearch
    connections.create_connection(hosts=self.ES_HOST, timeout=360,
                                  http_auth=(self.ES_USERNAME, self.ES_PASSWORD))

    print(" Querying Host, please wait, this could take a few minutes.. ")
    for bucket in self.scan_aggs(Search(index=self.ES_INDEX),
                                 {field_name: A("terms", field=field_name)}):
        dictBucket = bucket.to_dict()
        self.dataDict[dictBucket['key'][field_name]] = dictBucket['doc_count']
    print(" Query Complete! ", len(self.dataDict.keys()), " Records Pulled.")
def setUp(self):
    super(GeonodeElasticsearchTest, self).setUp()
    self.login()
    call_command('rebuild_index')
    # connect to the ES instance
    connections.create_connection(hosts=[settings.ES_URL])
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch_dsl import DocType, Text, Date, Integer, Boolean, Completion, Search, Nested, InnerDoc, \
    connections, Q

from yablog.client4es import es_client

connections.create_connection(host='elasticsearch', port='9200')


class BlogPostIndex(DocType):
    id = Integer()
    title = Text(analyzer='ik_max_word', search_analyzer="ik_max_word")
    content = Text(analyzer='ik_max_word', search_analyzer="ik_max_word")
    char_num = Integer()
    allow_comments = Boolean()
    vote_num = Integer()
    category = Text(analyzer='ik_max_word', search_analyzer="ik_max_word")
    tags = Text(analyzer='ik_max_word', search_analyzer="ik_max_word")
    publish_date = Date()
    suggestions = Completion()

    class Meta:
        index = 'blogpost-index'

    @classmethod
    def add(cls, **kwargs):
        id = kwargs.pop('id', None)
        if id is None:
            return False
        blog = cls(meta={'id': id}, **kwargs)
    )

    # refresh the index to make the changes visible
    es.indices.refresh(index=next_index)

    if update_alias:
        # repoint the alias to point to the newly created index
        es.indices.update_aliases(body={
            'actions': [
                {"remove": {"alias": ALIAS, "index": PATTERN}},
                {"add": {"alias": ALIAS, "index": next_index}},
            ]
        })


if __name__ == '__main__':
    # initiate the default connection to elasticsearch
    connections.create_connection()

    # create the empty index
    setup()

    # create a new document
    bp = BlogPost(
        _id=0,
        title='Hello World!',
        tags=['testing', 'dummy'],
        content=open(__file__).read()
    )
    bp.save(refresh=True)

    # create new index
    migrate()