def restore_tokens():
    connections.create_connection(hosts=ES_NODES)
    Index(INDEX_NAME).delete()

    class Token(DocType):
        username = String()
        token = String()
        expires = Date()
        read = Boolean()
        write = Boolean()
        revoked = Boolean()
        acl = String()
        groups = String()
        admin = Boolean()
        last_activity_at = Date()

        class Meta:
            index = INDEX_NAME

    Token.init()
    reindex_results = connections.get_connection().reindex(
        body={"source": {"index": BACKUP_INDEX_NAME}, "dest": {"index": INDEX_NAME}},
        request_timeout=3600)
    if reindex_results.get('created') + reindex_results.get('updated') == reindex_results.get('total'):
        return 'Tokens restored to previous schema successfully!'
    else:
        return 'Tokens did not restore from backup properly'
def handle(self, *args, **options):
    s3 = boto3.resource(
        's3',
        aws_access_key_id=settings.AWS_ACCESS_KEY,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
    dt = options['start_date'].replace(hour=0, minute=0, second=0, microsecond=0)
    if not options['to_stdout']:
        connections.create_connection(hosts=[options['es_url']], **settings.ES_CONNECTION_PARAMS)
        CRECDoc.init()
    while dt < options['end_date']:
        logger.info('Processing files for {0}.'.format(dt))
        try:
            response = s3.Object(
                options['source_bucket'], crec_s3_key('mods.xml', dt)
            ).get()
        except botocore.exceptions.ClientError as e:
            logger.info('Could not find mods file for {0}.'.format(dt))
            response = None
        if response is not None and response.get('Body'):
            try:
                crecs = extract_crecs_from_mods(response['Body'])
                logger.info('Found {0} new records.'.format(len(crecs)))
                if options['to_stdout']:
                    logger.info('Using stdout:')
                for crec in crecs:
                    if not crec.is_skippable():
                        if options['to_stdout']:
                            logger.info(crec.to_es_doc())
                        else:
                            es_doc = crec.to_es_doc()
                            es_doc.save()
                            upload_speaker_word_counts(crec)
            except Exception as e:
                logger.exception('Error processing data for {0}.'.format(dt.strftime('%Y-%m-%d')))
        dt += timedelta(days=1)
def __init__(self, nodes=ES_NODES, **kwargs):
    if isinstance(nodes, str):
        nodes = nodes.split(',')
    if not nodes:
        nodes = ES_NODES
    self.indicators_prefix = kwargs.get('indicators_prefix', 'indicators')
    self.tokens_prefix = kwargs.get('tokens_prefix', 'tokens')

    logger.info('setting es nodes {}'.format(nodes))
    connections.create_connection(hosts=nodes)

    # Block until the cluster answers the health check.
    self._alive = False
    while not self._alive:
        if not self._health_check():
            logger.warning('ES cluster not accessible')
            logger.info('retrying connection in 30s')
            sleep(30)
        else:
            self._alive = True
    logger.info('ES connection successful')

    self.tokens = TokenManager()
    self.indicators = IndicatorManager()
def update_all():
    """Check with Proxy source and update authors and articles.

    IMPORTANT: Will lock if unable to connect to MediaWiki server!
    """
    # authors
    connections.create_connection(hosts=config.DOCSTORE_HOSTS)
    index = Index(config.DOCSTORE_INDEX)
    mw_authors = Proxy.authors(cached_ok=False)
    es_authors = self.authors()
    authors_new, authors_delete = self.authors_to_update(mw_authors, es_authors)
    for n, title in enumerate(authors_delete):
        logging.debug('%s/%s %s' % (n, len(authors_delete), title))
        author = Author.get(url_title=title)
        author.delete()
    for n, title in enumerate(authors_new):
        logging.debug('%s/%s %s' % (n, len(authors_new), title))
        mwauthor = Proxy.page(title)
        author = Author.from_mw(mwauthor)
        author.save()
    # articles
    connections.create_connection(hosts=config.DOCSTORE_HOSTS)
    index = Index(config.DOCSTORE_INDEX)
    # authors need to be refreshed
    mw_authors = Proxy.authors(cached_ok=False)
    mw_articles = Proxy.articles_lastmod()
    es_authors = self.authors()
    es_articles = self.articles()
    articles_update, articles_delete = self.articles_to_update(
        mw_authors, mw_articles, es_authors, es_articles)
    self.delete_articles(titles=articles_delete)
    self.index_articles(titles=articles_update)
def setUp(self):
    from django.conf import settings
    SEARCH = getattr(settings, 'SEARCH')
    connections.create_connection('testing', **SEARCH['default']['connections'])
    self.index = Index(SEARCH['default']['index'], using='testing')
    # This is needed for test_documents, but has side effects in all running tests
    doctypes_list = (
        value for name, value in inspect.getmembers(documents)
        if not name.startswith('_')
        and inspect.isclass(value)
        and issubclass(value, DocType)
        and name != DocType.__name__
    )
    for doctype in doctypes_list:
        # Remove assigned index
        doctype._doc_type.index = None
        # Associate docs with test index
        self.index.doc_type(doctype)
    if self.index.exists():
        self.index.delete(ignore=404)
    self.index.create()
    self.search = Search(index=SEARCH['default']['index'])
def test_it_does_raise_if_bad_connection_is_queried(self, request):
    request.addfinalizer(remove_connection)
    connections.create_connection(alias='foobar', hosts=['localhost:2323'])
    with pytest.raises(ConnectionError):
        Index('whatever', using='foobar').exists()
def __init__(self, nodes=ES_NODES, **kwargs):
    self.logger = logging.getLogger(__name__)
    if isinstance(nodes, str):
        nodes = nodes.split(',')
    self.logger.info('setting es nodes {}'.format(nodes))
    connections.create_connection(hosts=nodes)
def __init__(self):
    """Create the default Elasticsearch connection from TurboGears config."""
    from elasticsearch_dsl.connections import connections
    from tg import config
    connections.create_connection(hosts=[config.get('elasticsearch.host')],
                                  send_get_body_as="POST",
                                  timeout=20)
def __init__(self, remote='localhost:9200', index='indicators', **kwargs):
    super(_ElasticSearch, self).__init__(remote)
    self.index = index
    if isinstance(self.remote, str):
        self.remote = self.remote.split(',')
    connections.create_connection(hosts=self.remote)
def __init__(self):
    if not self.index:
        raise ValueError("No index specified")
    if not self.doc_types:
        raise ValueError("No doc_types specified")
    connections.create_connection(hosts=settings.ELASTIC_SEARCH_HOSTS)
def __init__(self, **kwargs):
    self.hosts = kwargs.get('hosts', 'localhost')
    self.client = Elasticsearch(self.hosts)
    timeout = kwargs.get('timeout', 10)
    os.environ['LOGGO_REQUEST_TIMEOUT'] = str(timeout)
    max_retries = kwargs.get('max_retries', 2)
    connections.create_connection(hosts=self.hosts,
                                  connection_class=CustomUrllib3HttpConnection,
                                  max_retries=max_retries)
    index_name = kwargs.get('index', None)
    self.create_index_if_not_exists(index_name)
def app():
    print("Running notifications app...")
    # Define a default Elasticsearch client
    connections.create_connection(hosts=[ES_SERVER])
    # App logic runs here
    # rabbitmq_conf()
    sample_data()
    # Display cluster health
    print(connections.get_connection().cluster.health())
def __init__(self):
    if not self.index:
        raise ValueError("No index specified")
    if not self.doc_types:
        raise ValueError("No doc_types specified")
    connections.create_connection(
        hosts=settings.ELASTIC_SEARCH_HOSTS,
        # sniff_on_start=True,
        retry_on_timeout=True,
    )
def _configure(self):
    if "endpoints" in self.config:
        self.endpoints = self.config["endpoints"]
    else:
        self.host = self.config["host"]
        self.port = self.config["port"]
        self.endpoints = ["{}:{}".format(self.host, self.port)]
    connections.create_connection(hosts=self.endpoints,
                                  timeout=self.config.get("timeout", 15),
                                  retry_on_timeout=True,
                                  maxsize=25)
    return
def setup_mapping(command, conf, vars):
    # Setup Elasticsearch's database mapping
    print("Setting up Elasticsearch's model")
    connections.create_connection(
        hosts=[config.get('elasticsearch.host')],
        send_get_body_as='POST')
    # Setup the jobs index
    _setup_index(model.JobElastic)
    # Setup the company index
    _setup_index(model.CompanyElastic)
    # Setup the geocomplete index
    _setup_index(model.Geocomplete)
def create_connections():
    """Create connections to elasticsearch as defined in settings.py."""
    for alias, params in CONNECTIONS.items():
        processed_params = {}
        for param_name, param_value in params.items():
            if param_name == 'serializer' and isinstance(param_value, str):
                serializer_class = import_string(param_value)
                param_value = serializer_class()
            processed_params[param_name] = param_value
        connections.create_connection(alias, **processed_params)
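# A hypothetical settings.py fragment that create_connections() above would
# consume; the alias names, hosts, and serializer path are illustrative only,
# not taken from the source.
CONNECTIONS = {
    'default': {
        'hosts': ['localhost:9200'],
        'timeout': 20,
    },
    'analytics': {
        'hosts': ['es-analytics:9200'],
        # Dotted path: resolved via import_string() and instantiated.
        'serializer': 'myapp.search.serializers.CustomJSONSerializer',
    },
}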
def reindex_tokens():
    # The default connection must exist before any DocType.init() call.
    connections.create_connection(hosts=ES_NODES)
    TokenBackup.init()
    backup_results = connections.get_connection().reindex(
        body={"source": {"index": INDEX_NAME}, "dest": {"index": BACKUP_INDEX_NAME}},
        request_timeout=3600)
    if backup_results.get('created') + backup_results.get('updated') == backup_results.get('total'):
        Index(INDEX_NAME).delete()
    else:
        return 'Tokens did not backup properly'
    time.sleep(1)
    Token.init()
    reindex_results = connections.get_connection().reindex(
        body={"source": {"index": BACKUP_INDEX_NAME}, "dest": {"index": INDEX_NAME}},
        request_timeout=3600)
    if reindex_results.get('created') + reindex_results.get('updated') == reindex_results.get('total'):
        return 'Tokens reindexed successfully!'
    else:
        return 'Tokens did not reindex from backup properly'
def prepare_connection():
    """Set the default connection for Elasticsearch.

    .. warning::
        When using multiprocessing/multithreading, the connection will
        probably be initialized in the main process/thread, and the same
        connection (socket) will then be shared by all processes/threads.
        This causes unexpected timeouts on pushes to Elasticsearch, so make
        sure this function is called again in each process/thread so that
        each one uses its own connection.
    """
    elasticsearch_host = getattr(settings, 'ELASTICSEARCH_HOST', 'localhost')
    elasticsearch_port = getattr(settings, 'ELASTICSEARCH_PORT', 9200)
    connections.create_connection(hosts=['{}:{}'.format(elasticsearch_host, elasticsearch_port)])
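# A minimal sketch of the per-process pattern the warning above describes:
# each worker re-creates the connection instead of inheriting the parent's
# socket. push_doc and the pool size are illustrative, not from the source.
from multiprocessing import Pool

def push_doc(doc_id):
    # Uses the process-local default connection set up by the initializer.
    ...

if __name__ == '__main__':
    with Pool(processes=4, initializer=prepare_connection) as pool:
        pool.map(push_doc, range(100))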
def __init__(self, config_file='config.cfg'):
    super(Elastic, self).__init__()
    self.percentage = 10.0
    self.minimum_occurrences = 250
    # The ConfigParser documentation points out that there's no way to force
    # default config options outside the "DEFAULT" section.
    config = ConfigParser()
    config.read(config_file)
    if not config.has_section('elastic'):
        config.add_section('elastic')
    for option, value in {'use_ssl': 'True', 'host': '127.0.0.1', 'version': '2',
                          'index': 'nxapi', 'doc_type': 'events'}.items():
        if not config.has_option('elastic', option):
            config.set('elastic', option, value)

    self.version = config.getint('elastic', 'version')
    self.index = config.get('elastic', 'index')
    use_ssl = config.getboolean('elastic', 'use_ssl')
    host = config.get('elastic', 'host')
    self.doc_type = config.get('elastic', 'doc_type')

    self.client = connections.create_connection(hosts=[host],
                                                use_ssl=use_ssl,
                                                index=self.index,
                                                version=self.version,
                                                doc_type=self.doc_type,
                                                timeout=30,
                                                retry_on_timeout=True)
    Event.init(index=self.index)
    index = Index(self.index, using=self.client)
    index.doc_type(Event)
    self.initialize_search()
def __init__(self):
    es_url = app.config['ELASTICSEARCH_URL']
    es_port = app.config['ELASTICSEARCH_PORT']
    logstash_host = app.config['LOGSTASH_HOST']
    logstash_port = int(app.config['LOGSTASH_PORT'])
    self.measure = Measure(app.config['ELASTICSEARCH_CLIENT'],
                           (logstash_host, logstash_port))
    self.es = connections.create_connection(hosts=[es_url + ':' + es_port])
def get_conn(*, verify=True, verify_indices=None):
    """
    Lazily create the connection.

    Args:
        verify (bool): If true, check the presence of indices and mappings
        verify_indices (list of str): If set, check the presence of these
            indices. Else use the defaults.

    Returns:
        elasticsearch.client.Elasticsearch: An Elasticsearch client
    """
    # pylint: disable=global-statement
    global _CONN
    global _CONN_VERIFIED

    do_verify = False
    if _CONN is None:
        http_auth = settings.ELASTICSEARCH_HTTP_AUTH
        use_ssl = http_auth is not None
        _CONN = connections.create_connection(
            hosts=[settings.ELASTICSEARCH_URL],
            http_auth=http_auth,
            use_ssl=use_ssl,
            # make sure we verify SSL certificates (off by default)
            verify_certs=use_ssl
        )
        # Verify connection on first connect if verify=True.
        do_verify = verify

    if verify and not _CONN_VERIFIED:
        # If we have a connection but haven't verified before, do it now.
        do_verify = True

    if not do_verify:
        if not verify:
            # We only skip verification if we're reindexing or
            # deleting the index. Make sure we verify next time we connect.
            _CONN_VERIFIED = False
        return _CONN

    # Make sure everything exists.
    if verify_indices is None:
        verify_indices = set()
        for index_type in ALL_INDEX_TYPES:
            verify_indices = verify_indices.union(get_aliases(index_type))
    for verify_index in verify_indices:
        if not _CONN.indices.exists(verify_index):
            raise ReindexException("Unable to find index {index_name}".format(
                index_name=verify_index))

    _CONN_VERIFIED = True
    return _CONN
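# Hypothetical call sites for get_conn() above; the 'course_v2' index name
# is illustrative, not from the source.
conn = get_conn()                              # verify indices on first use
conn = get_conn(verify=False)                  # while reindexing or deleting
conn = get_conn(verify_indices=['course_v2'])  # check one specific index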
def applyConfig(self):
    try:
        print("Connecting to '%s', index '%s'" % (self.confESHost, self.confESIndex))
        res = connections.create_connection(hosts=[self.confESHost])
        idx = Index(self.confESIndex)
        idx.doc_type(DocHTTPRequestResponse)
        DocHTTPRequestResponse.init()
        try:
            idx.create()
        except:
            pass
    except Exception as e:
        JOptionPane.showMessageDialog(
            self.panel,
            "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>" % (str(e)),
            "Error",
            JOptionPane.ERROR_MESSAGE)
def get_es_client(enforce_new=False, retry=True):
    """Returns the singleton Elasticsearch-client object connected to the ES
    server specified by the environment variable ES_HOST, with the default
    timeout specified by the environment variable ES_TIMEOUT.
    """
    global CLIENT
    if enforce_new or not CLIENT:
        # ES_TIMEOUT arrives as a string; convert it before handing it to
        # the client.
        CLIENT = connections.create_connection(
            hosts=[os.environ['ES_HOST']],
            timeout=float(os.environ['ES_TIMEOUT']))
    if CLIENT.ping():
        return CLIENT
    elif retry:
        return get_es_client(enforce_new=True, retry=False)
    else:
        return CLIENT
def form_sinks(self):
    model_modules = [c for c in listdir(self.model_location)
                     if isfile(join(self.model_location, c)) and c != '__init__.py']
    model_modules = [m for m in model_modules if PY_FILE_REGEX.match(m)]
    for model_module in model_modules:
        # get the name of the class
        model_module = model_module.split('.')[0]
        try:
            module_path = self.load_path + '.' + model_module
            model_class = model_module
            module = importlib.import_module(module_path)
            self.models.append(ModelIdentifier(index=module.index,
                                               mapping=module.mapping,
                                               model_class=module.model_class))
        except (ImportError, Exception) as e:
            raise RuntimeError("Error importing the module ", e)

    for model in self.models:
        model_name = model.index + "." + model.mapping
        connections.create_connection(model_name, hosts=[ELASTIC_HOST], port=ELASTIC_PORT)
        data_sink = ElasticDataSink(model_name,
                                    connections.get_connection(model_name),
                                    model)
        self.data_sinks[model_name] = data_sink
def registerExtenderCallbacks(self, callbacks):
    self.callbacks = callbacks
    self.helpers = callbacks.getHelpers()
    callbacks.setExtensionName("Storing HTTP Requests/Responses into ElasticSearch")
    self.callbacks.registerHttpListener(self)
    self.callbacks.registerContextMenuFactory(self)
    self.out = callbacks.getStdout()

    res = connections.create_connection(hosts=[ES_host])
    idx = Index(ES_index)
    idx.doc_type(DocHTTPRequestResponse)
    try:
        idx.create()
    except:
        print("Index already exists")
def get_conn(verify=True):
    """
    Lazily create the connection.
    """
    # pylint: disable=global-statement
    # This is ugly. Any suggestions on a way that doesn't require "global"?
    global _CONN
    global _CONN_VERIFIED

    do_verify = False
    if _CONN is None:
        _CONN = connections.create_connection(hosts=[URL])
        # Verify connection on first connect if verify=True.
        do_verify = verify

    if verify and not _CONN_VERIFIED:
        # If we have a connection but haven't verified before, do it now.
        do_verify = True

    if not do_verify:
        if not verify:
            # We only skip verification if we're reindexing or
            # deleting the index. Make sure we verify next time we connect.
            _CONN_VERIFIED = False
        return _CONN

    # Make sure everything exists.
    if not _CONN.indices.exists(INDEX_NAME):
        raise ReindexException("Unable to find index {index_name}".format(
            index_name=INDEX_NAME))

    mapping = _CONN.indices.get_mapping()
    if INDEX_NAME not in mapping:
        raise ReindexException(
            "No mappings found in index {index_name}".format(
                index_name=INDEX_NAME))

    mappings = mapping[INDEX_NAME]["mappings"]
    if DOC_TYPE not in mappings.keys():
        raise ReindexException("Mapping {doc_type} not found".format(
            doc_type=DOC_TYPE))

    _CONN_VERIFIED = True
    return _CONN
def applyConfig(self):
    try:
        print("Connecting to '%s', index '%s'" % (self.confESHost, self.confESIndex))
        self.es = connections.create_connection(hosts=[self.confESHost])
        self.idx = Index(self.confESIndex)
        self.idx.doc_type(DocHTTPRequestResponse)
        if self.idx.exists():
            self.idx.open()
        else:
            self.idx.create()
        self.callbacks.saveExtensionSetting("elasticburp.host", self.confESHost)
        self.callbacks.saveExtensionSetting("elasticburp.index", self.confESIndex)
        self.callbacks.saveExtensionSetting("elasticburp.tools", str(self.confBurpTools))
        self.callbacks.saveExtensionSetting("elasticburp.onlyresp", str(int(self.confBurpOnlyResp)))
    except Exception as e:
        JOptionPane.showMessageDialog(
            self.panel,
            "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>" % (str(e)),
            "Error",
            JOptionPane.ERROR_MESSAGE)
def test_event_send_and_store(options):
    inputServer = options['loginput']
    esServer = options['esserver']
    uuids = []
    # create a sample test event
    anevent = json.loads(r'''{
        "category": "pytest",
        "processid": "0",
        "severity": "DEBUG",
        "utctimestamp": "",
        "hostname": "testhost.pytest.com",
        "summary": "a test event for pytest from test_basic_event_send",
        "eventsource": "pytest",
        "details": {
            "processid": "14148",
            "hostname": "testvictim.pytest.com",
            "program": "pytest",
            "sourceipaddress": "10.1.2.3"
        }
    }''')
    # send events
    for i in range(0, 5):
        anevent['timestamp'] = datetime.utcnow().isoformat()
        anevent['details']['uuid'] = str(uuid.uuid1())
        uuids.append(anevent['details']['uuid'])
        if options["verbose"]:
            print('sending {0}'.format(anevent))
        r = requests.put(url="http://{0}/events".format(inputServer),
                         data=json.dumps(anevent))
        if options["verbose"]:
            print(r)
        assert r.status_code == 200
    # search for events to have landed in ES
    es = connections.create_connection(hosts=['{0}'.format(esServer)])
    for u in uuids:
        for hit in scan(es,
                        query={"query": {"match": {"details.uuid": "{0}".format(u)}}},
                        index="events",
                        doc_type="event"):
            assert u == hit['_source']['details']['uuid']
def _init():
    es_url = settings.ELASTIC_SEARCH["url"]
    if not es_url:
        return
    connection = connections.create_connection(
        hosts=[es_url],
        verify_certs=es_url.startswith("https"),
        ca_certs=certifi.where(),
        timeout=20)
    # Create any indices that are missing
    indices = connection.indices.get("*")
    for item in doctypes():
        if item._doc_type.index not in indices:
            item.init()
            connection.indices.put_mapping(doc_type=item._doc_type.name,
                                           index=item._doc_type.index,
                                           body={"_routing": {"required": True}})
    return connection
def setup_database(config):
    settings = dictset(config.registry.settings).mget('elasticsearch')
    params = {}
    params['chunk_size'] = settings.get('chunk_size', 500)
    params['hosts'] = []
    for hp in split_strip(settings['hosts']):
        h, p = split_strip(hp, ':')
        params['hosts'].append(dict(host=h, port=p))
    if settings.asbool('sniff'):
        params['sniff_on_start'] = True
        params['sniff_on_connection_fail'] = True
    # XXX if this connection has to deal with mongo and sqla objects,
    # then we'll need to use their es serializers instead. should
    # probably clean up that part of the engine interface - there's
    # lots of repeated code, plus other engines shouldn't have to know
    # about es - they should just know how to serialize their
    # documents to JSON.
    conn = connections.create_connection(serializer=JSONSerializer(), **params)
    setup_index(conn, settings)
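# An assumed .ini fragment matching the keys setup_database() reads via
# mget('elasticsearch'); host names and values are illustrative only.
#
#   elasticsearch.hosts = es1:9200, es2:9200
#   elasticsearch.sniff = true
#   elasticsearch.chunk_size = 500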
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
import os
import sys

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../../")

from med_base.storage.es.models import EntityDisease, EntityBodypart, EntityDepartment, \
    EntityDrug, EntityExam, EntityOperation, EntitySymptom
from elasticsearch_dsl.connections import connections
from conf.settings import ES_HOST

connections.create_connection(hosts=[ES_HOST])

from jk39.items import DiseaseItem, ExamItem, DrugItem, OperationItem, SymptomItem


class EntityItemPipeline(object):
    def process_item(self, item, spider):
        if isinstance(item, DiseaseItem):
            spider.logger.info('====== SAVE A Entity Disease: name={} ======'.format(item.get('name', '').strip()))
            meta_dict = {}
            for key in ['name', 'describe', 'is_infect', 'highrisk_group', 'source_url',
                        'treatment_cycle', 'treatment_cost']:
                if item.get(key, '').strip():
                    meta_dict[key] = item.get(key, '').strip()
def setUp(self):
    from django.conf import settings
    self.settings = getattr(settings, 'SEARCH')
    connections.create_connection(
        'testing', **self.settings['default']['connections'])
from elasticsearch_dsl import Document, Text, Keyword, Date, analyzer
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['127.0.0.1'])

ik_analyzer = analyzer('ik_max_word')


class CnblogsType(Document):
    title = Text(analyzer='ik_max_word')
    description = Text(analyzer='ik_max_word')
    url = Keyword()
    riqi = Date()

    class Index:
        name = 'cnblog_text'
        settings = {
            'number_of_shards': 5,
        }


# Reuse the default connection created above; create_connection() expects an
# alias, not a Document class.
es = connections.get_connection()

if __name__ == '__main__':
    CnblogsType.init()
def build_index(self, document_parquet, section_parquet, tables_parquet,
                figures_parquet, equations_parquet):
    if self.awsauth is not None:
        connections.create_connection(hosts=self.hosts,
                                      http_auth=self.awsauth,
                                      use_ssl=True,
                                      verify_certs=True,
                                      connection_class=RequestsHttpConnection)
    else:
        connections.create_connection(hosts=self.hosts)

    logger.info('Building elastic index')
    Object.init()
    FullDocument.init()

    # This is a parquet file to load from
    df = pd.read_parquet(document_parquet)
    for ind, row in df.iterrows():
        FullDocument(name=row['pdf_name'],
                     dataset_id=row['dataset_id'],
                     content=row['content']).save()
    logger.info('Done building document index')

    df = pd.read_parquet(section_parquet)
    for ind, row in df.iterrows():
        Object(
            cls='Section',
            dataset_id=row['dataset_id'],
            content=row['content'],
            header_content=row['section_header'],
            area=50,
            detect_score=row['detect_score'],
            postprocess_score=row['postprocess_score'],
            pdf_name=row['pdf_name'],
        ).save()
    logger.info('Done building section index')

    if tables_parquet != '':
        df = pd.read_parquet(tables_parquet)
        for ind, row in df.iterrows():
            Object(
                cls='Table',
                dataset_id=row['dataset_id'],
                content=row['content'],
                header_content=row['caption_content'],
                area=50,
                detect_score=row['detect_score'],
                postprocess_score=row['postprocess_score'],
                pdf_name=row['pdf_name'],
                img_pth=row['img_pth'],
            ).save()
        logger.info('Done building tables index')

    if figures_parquet != '':
        df = pd.read_parquet(figures_parquet)
        for ind, row in df.iterrows():
            Object(
                cls='Figure',
                dataset_id=row['dataset_id'],
                content=row['content'],
                header_content=row['caption_content'],
                area=50,
                detect_score=row['detect_score'],
                postprocess_score=row['postprocess_score'],
                pdf_name=row['pdf_name'],
                img_pth=row['img_pth'],
            ).save()
        logger.info('Done building figures index')

    if equations_parquet != '':
        df = pd.read_parquet(equations_parquet)
        for ind, row in df.iterrows():
            Object(
                cls='Equation',
                dataset_id=row['dataset_id'],
                content=row['content'],
                header_content='',
                area=50,
                detect_score=row['detect_score'],
                postprocess_score=row['postprocess_score'],
                pdf_name=row['pdf_name'],
                img_pth=row['img_pth'],
            ).save()
        logger.info('Done building equations index')
    logger.info('Done building object index')
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import pymysql.cursors
from elasticsearch_dsl.connections import connections
# adbapi (provided by Twisted) makes the MySQL inserts asynchronous
from twisted.enterprise import adbapi

from ArticleSpider.models.es_types import ArticleType
from ArticleSpider import settings

es = connections.create_connection(hosts=[settings.ES_ADDRESS])


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlTwistPipeline(object):
    """Insert data into MySQL asynchronously."""

    def __init__(self, dbpool):
        self.dbpool = dbpool
def index(
    buildroot: Path,
    url: str,
    update=False,
    no_progressbar=False,
):
    # We can confidently use a single host here because we're not searching
    # a cluster.
    connections.create_connection(hosts=[url], retry_on_timeout=True)
    connection = connections.get_connection()
    health = connection.cluster.health()
    status = health["status"]
    if status not in ("green", "yellow"):
        raise click.ClickException(f"status {status} not green or yellow")

    count_todo = 0
    for file in walk(buildroot):
        count_todo += 1

    click.echo(f"Found {count_todo:,} (potential) documents to index")

    if update:
        for name in connection.indices.get_alias():
            if name.startswith(f"{INDEX_ALIAS_NAME}_"):
                document_index = Index(name)
                break
        else:
            raise IndexAliasError(
                f"Unable to find an index called {INDEX_ALIAS_NAME}_*")
    else:
        # Confusingly, `._index` is actually not a private API.
        # It's the documented way you're supposed to reach it.
        document_index = Document._index
        click.echo("Deleting any possible existing index "
                   f"and creating a new one called {document_index._name!r}")
        document_index.delete(ignore=404)
        document_index.create()

    skipped = []

    def generator():
        root = Path(buildroot)
        for doc in walk(root):
            # The reason for specifying the exact index name is that we might
            # be doing an update and if you don't specify it, elasticsearch_dsl
            # will fall back to using whatever Document._meta.Index automatically
            # becomes in this moment.
            search_doc = to_search(doc, _index=document_index._name)
            if search_doc:
                yield search_doc.to_dict(True)
            else:
                # The reason something might be chosen to be skipped is because
                # there's logic that kicks in only when the `index.json` file
                # has been opened and parsed.
                # Keep a count of all of these. It's used to make sure the
                # progressbar, if used, ticks as many times as the estimate
                # count was.
                skipped.append(1)

    def get_progressbar():
        if no_progressbar:
            return VoidProgressBar()
        return click.progressbar(length=count_todo, label="Indexing", width=0)

    count_done = count_worked = count_errors = 0
    count_shards_worked = count_shards_failed = 0
    errors_counter = Counter()
    t0 = time.time()
    with get_progressbar() as bar:
        for success, info in parallel_bulk(
            connection,
            generator(),
            # If the bulk indexing failed, it will by default raise a BulkIndexError.
            # Setting this to 'False' will suppress that.
            raise_on_exception=False,
            # If the bulk operation failed for some other reason like a
            # ReadTimeoutError, it will raise whatever the error is by default.
            # We prefer to swallow all errors under the assumption that the
            # holes will hopefully be fixed in the next attempt.
            raise_on_error=False,
        ):
            if success:
                count_shards_worked += info["index"]["_shards"]["successful"]
                count_shards_failed += info["index"]["_shards"]["failed"]
                count_worked += 1
            else:
                count_errors += 1
                errors_counter[info["index"]["error"]] += 1
            count_done += 1
            bar.update(1)

        for skip in skipped:
            bar.update(1)

    # Now when the index has been filled, we need to make sure we
    # correct any previous indexes.
    if update:
        # When you do an update, Elasticsearch will internally delete the
        # previous docs (based on the _id primary key we set).
        # Normally, Elasticsearch will do this when you restart the cluster
        # but that's not something we usually do.
        # See https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-forcemerge.html
        document_index.forcemerge()
    else:
        # Now we're going to bundle the change to set the alias to point
        # to the new index and delete all old indexes.
        # The reason for doing this together in one update is to make it atomic.
        alias_updates = [{
            "add": {
                "index": document_index._name,
                "alias": INDEX_ALIAS_NAME,
            }
        }]
        for index_name in connection.indices.get_alias():
            if index_name.startswith(f"{INDEX_ALIAS_NAME}_"):
                if index_name != document_index._name:
                    alias_updates.append(
                        {"remove_index": {"index": index_name}})
                    click.echo(f"Delete old index {index_name!r}")

        connection.indices.update_aliases({"actions": alias_updates})
        click.echo(f"Reassign the {INDEX_ALIAS_NAME!r} alias from old index "
                   f"to {document_index._name}")

    t1 = time.time()
    took = t1 - t0
    rate = count_done / took
    click.echo(f"Took {format_time(took)} to index {count_done:,} documents. "
               f"Approximately {rate:.1f} docs/second")
    click.echo(f"Count shards - successful: {count_shards_worked:,} "
               f"failed: {count_shards_failed:,}")
    click.echo(f"Counts - worked: {count_worked:,} errors: {count_errors:,}")
    if errors_counter:
        click.echo("Most common errors....")
        for error, count in errors_counter.most_common():
            click.echo(f"{count:,}\t{error[:80]}")
""" Elasticsearch base configruation. """ import os from elasticsearch import Elasticsearch from elasticsearch_dsl.connections import connections ELASTICSEARCH_AVAILABLE = False ES_CLIENT = None es_host = os.environ.get('ELASTICSEARCH_URL') if es_host: ES_CLIENT = Elasticsearch(['{}'.format(es_host)]) connections.create_connection(hosts=['{}'.format(es_host)]) ELASTICSEARCH_AVAILABLE = True
import os

import elasticsearch
import tqdm
from nltk.tokenize import word_tokenize
from jinja2 import Environment, PackageLoader
from elasticsearch_dsl.connections import connections

from qanta.wikipedia.cached_wikipedia import Wikipedia
from qanta.datasets.abstract import QuestionText
from qanta.guesser.abstract import AbstractGuesser
from qanta.spark import create_spark_context
from qanta.config import conf
from qanta.util.io import get_tmp_dir, safe_path
from qanta import qlogging


log = qlogging.get(__name__)
ES_PARAMS = 'es_params.pickle'
connections.create_connection(hosts=['localhost'])


def create_es_config(output_path, host='localhost', port=9200, tmp_dir=None):
    if tmp_dir is None:
        tmp_dir = get_tmp_dir()
    data_dir = safe_path(os.path.join(tmp_dir, 'elasticsearch/data/'))
    log_dir = safe_path(os.path.join(tmp_dir, 'elasticsearch/log/'))
    env = Environment(loader=PackageLoader('qanta', 'templates'))
    template = env.get_template('elasticsearch.yml')
    config_content = template.render({
        'host': host,
        'port': port,
        'log_dir': log_dir,
        'data_dir': data_dir,
    })
import time

import elasticsearch.client
from django.conf import settings
from elasticsearch_dsl import Document, InnerDoc, Date, Integer, Long, Text, Object, GeoPoint, Keyword, Boolean
from elasticsearch_dsl.connections import connections

from blog.models import Article

ELASTICSEARCH_ENABLED = hasattr(settings, 'ELASTICSEARCH_DSL')

if ELASTICSEARCH_ENABLED:
    connections.create_connection(
        hosts=[settings.ELASTICSEARCH_DSL['default']['hosts']])

    from elasticsearch import Elasticsearch
    es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])

    from elasticsearch.client import IngestClient
    c = IngestClient(es)
    try:
        c.get_pipeline('geoip')
    except elasticsearch.exceptions.NotFoundError:
        c.put_pipeline('geoip', body='''{
            "description" : "Add geoip info",
            "processors" : [
                {
                    "geoip" : {
                        "field" : "ip"
                    }
# -*- coding:utf-8 -*-
import random
import hashlib

from elasticsearch_dsl import DocType, Keyword
from elasticsearch_dsl.connections import connections

index_prefix = 'skyeye_cloud_sandbox_s3_index_%s'

connections.create_connection(hosts=['10.95.166.208', '10.95.166.209', '10.95.166.210'])


class Doc(DocType):
    task_id = Keyword()
    file_name = Keyword()

    class Meta:
        index = 'skyeye_cloud_sandbox_s3_index_*'

    def save(self, **kwargs):
        # Generate the target index name from the index template
        index = index_prefix % (self.task_id[0])
        return super(Doc, self).save(index=index, **kwargs)


if __name__ == '__main__':
    for i in range(100):
        s3index = Doc(
            task_id=hashlib.md5(str(random.randint(0, 100)).encode()).hexdigest(),
            file_name=hashlib.md5(str(random.randint(0, 100)).encode()).hexdigest())
        s3index.save()
    print('finish done!')
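# A sketch (assumed, not from the original file) of the index template that
# would give the per-task indices written by Doc.save() a shared mapping;
# the template name and the 'doc' mapping-type key are assumptions.
connections.get_connection().indices.put_template(
    name='skyeye_cloud_sandbox_s3_template',
    body={
        'template': 'skyeye_cloud_sandbox_s3_index_*',
        'mappings': {'doc': {'properties': {
            'task_id': {'type': 'keyword'},
            'file_name': {'type': 'keyword'},
        }}},
    },
)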
from datetime import datetime

from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["140.143.211.106"])


class CustomAnalyzer(_CustomAnalyzer):
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class BaiduType(DocType):
    suggest = Completion(analyzer=ik_analyzer)
    url = Keyword()
    title = Text(analyzer="ik_max_word")
    summary = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
        index = "baidu"
        doc_type = "baike"


def gen_suggest(index, info_tuple):
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
import redis

from tools.es_models import NewsClsType
from elasticsearch_dsl.connections import connections

es = connections.create_connection(NewsClsType._doc_type.using)
redis_cli = redis.StrictRedis(host="127.0.0.1")


def gen_suggest(index, info_tuple):
    # Build the array of search-suggestion strings from the input strings
    user_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Call the ES analyzer API to analyze the string
            words = es.indices.analyze(index=index,
                                       analyzer="ik_max_word",
                                       params={'filter': ["lowercase"]},
                                       body=text)
            # analyzed_words = set([r["token"] for r in words if len(r["token"]) > 1])
            analyzed_words = set(
                [r["token"] for r in words["tokens"] if len(r["token"]) > 1])
# coding=utf8
# Copy the jobbole.com data from MySQL into ES
import MySQLdb

from ArticleSpider.moudles.es_types import ArticleType
from elasticsearch_dsl.connections import connections
from w3lib.html import remove_tags

# Connect to the database and get a cursor
connection = MySQLdb.connect('127.0.0.1', 'root', '111111', 'spider',
                             charset="utf8", use_unicode=True)
cursor = connection.cursor()

es = connections.create_connection(ArticleType._doc_type.using)


def gen_suggests(index, info_tuple):
    # Build the search-suggestion array from the input strings
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Call the ES analyze API to analyze the string
            words = es.indices.analyze(index=index,
                                       analyzer="ik_max_word",
                                       params={'filter': ["lowercase"]},
                                       body=text)
            analyzed_words = set([r["token"] for r in words["tokens"] if len(r["token"]) > 1])
            new_words = analyzed_words - used_words
        else:
            new_words = set()
        if new_words:
            suggests.append({"input": list(new_words), "weight": weight})
    return suggests


cursor.execute("select title, url, create_date, praise_nums, comment_nums, fav_nums, "
               "front_image_url, tags, content from article")
# Elasticsearch

# aws, localhost, or govuk-paas
ELASTICSEARCH_PROVIDER = env.str('ELASTICSEARCH_PROVIDER', 'aws').lower()

if ELASTICSEARCH_PROVIDER == 'govuk-paas':
    services = {
        item['instance_name']: item
        for item in VCAP_SERVICES['elasticsearch']
    }
    ELASTICSEARCH_INSTANCE_NAME = env.str(
        'ELASTICSEARCH_INSTANCE_NAME',
        VCAP_SERVICES['elasticsearch'][0]['instance_name'])
    connections.create_connection(
        alias='default',
        hosts=[services[ELASTICSEARCH_INSTANCE_NAME]['credentials']['uri']],
        connection_class=RequestsHttpConnection,
    )
elif ELASTICSEARCH_PROVIDER == 'localhost':
    connections.create_connection(alias='default',
                                  hosts=['localhost:9200'],
                                  use_ssl=False,
                                  verify_certs=False,
                                  connection_class=RequestsHttpConnection)
else:
    raise NotImplementedError()

ELASTICSEARCH_COMPANY_INDEX_ALIAS = env.str(
    'ELASTICSEARCH_COMPANY_INDEX_ALIAS', 'ch-companies')

# health check
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import DocType, Text, Date, Search
from elasticsearch.helpers import bulk
from elasticsearch import Elasticsearch
from . import models

connections.create_connection(hosts=['192.168.99.100'], port=32771)


class ResourceIndex(DocType):
    Name = Text()
    created_date = Date()
    description = Text()
    link = Text()
    image = Text()

    class Meta:
        index = 'takethekids'


def bulk_indexing():
    ResourceIndex.init()
    es = Elasticsearch([{'host': '192.168.99.100', 'port': 32771}])
    bulk(client=es,
         actions=(b.indexing() for b in models.Resource.objects.all().iterator()))
""" Phải khai báo lại class Meta với doc_type và tên index đúng như dưới """ doc_type = 'CivilArticle' index = index_name class Index: """ Từ bản 0.10.2 trở đi phải khai báo thêm cả class Index có thuộc tính name là tên index trong elastic search như dưới đây """ name = index_name # Create connection es = Elasticsearch() connections.create_connection(hosts=['localhost'], timeout=20) connections.add_connection('CivilArticle', es) CivilArticle.init(index_name) def preprocess_content(content): res = [] lines = content.split('\n') for line in lines: line = remove_numbering(line) words = pre_process_text(line) res.append(' '.join(words)) return ' '.join(res)
def _initialize(self):
    """
    Initialize a connection to an ES cluster and create an index template
    if it does not exist.
    """
    if not self._initialized:
        http_auth = None
        if self._access_key and self._secret_key and self._aws_region:
            http_auth = AWS4Auth(self._access_key, self._secret_key,
                                 self._aws_region, "es")
        elif self._access_key and self._secret_key:
            http_auth = (self._access_key, self._secret_key)
        else:
            logger.warn("Connecting to Elasticsearch without HTTP auth")

        self._client = connections.create_connection(
            hosts=[{"host": self._host, "port": self._port}],
            http_auth=http_auth,
            use_ssl=self._use_ssl,
            verify_certs=True,
            connection_class=RequestsHttpConnection,
            timeout=ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT,
        )

        # Create a second connection with a timeout of 60s vs 10s.
        # For some reason the PUT template API can take anywhere between
        # 10s and 30s on the test cluster.
        # This only needs to be done once to initialize the index template
        connections.create_connection(
            alias=ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS,
            hosts=[{"host": self._host, "port": self._port}],
            http_auth=http_auth,
            use_ssl=self._use_ssl,
            verify_certs=True,
            connection_class=RequestsHttpConnection,
            timeout=ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT,
        )

        try:
            force_template_update = ELASTICSEARCH_FORCE_INDEX_TEMPLATE_UPDATE.lower() == "true"
            self._client.indices.get_template(self._index_prefix)
            LogEntry.init(
                self._index_prefix,
                self._index_settings,
                skip_template_init=not force_template_update,
            )
        except NotFoundError:
            LogEntry.init(self._index_prefix, self._index_settings,
                          skip_template_init=False)
        finally:
            try:
                connections.remove_connection(ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS)
            except KeyError as ke:
                logger.exception(
                    "Elasticsearch connection not found to remove %s: %s",
                    ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS, ke,
                )

        self._initialized = True
from django_elasticsearch_dsl import Index, fields
from django_elasticsearch_dsl.documents import Document
from api.models.article_model import ArticleOfInterest
from elasticsearch_dsl.connections import connections
from django_elasticsearch_dsl.registries import registry
from elasticsearchapp.custom_analyzers import greek_analyzer

connections.create_connection()

article_index = Index('articles')
article_index.settings(number_of_shards=1, number_of_replicas=0)


@registry.register_document
@article_index.document
class ArticleDocument(Document):
    title = fields.TextField(analyzer=greek_analyzer)
    date = fields.DateField()
    body = fields.TextField(analyzer=greek_analyzer)
    tags = fields.TextField(analyzer=greek_analyzer)
    author = fields.TextField()
    link = fields.TextField()
    type = fields.TextField()
    scope = fields.TextField()

    class Django:
        model = ArticleOfInterest
from elasticsearch_dsl import DocType, String, Boolean, Long, FacetedSearch, Date
from elasticsearch_dsl.connections import connections

from config import parse_config

connections.create_connection(hosts=[parse_config("db")['url']])


class Function(DocType):
    function_id = String(index='not_analyzed')
    tenant_id = String(index='not_analyzed')
    user_id = String(index='not_analyzed')
    image_id = String(index='not_analyzed')
    name = String(index='not_analyzed')
    description = String(index='not_analyzed')
    type = String(index='not_analyzed')
    event = String(index='not_analyzed')
    public = Boolean()
    endpoint = String(index='not_analyzed')
    runtime = String(index='not_analyzed')
    memory = Long()
    zip_location = String(index='not_analyzed')
    tags = String()
    status = String(index='not_analyzed')

    class Meta:
        index = 'pratai'


class FunctionSearch(FacetedSearch):
    doc_types = [
# -*- coding: UTF-8 -*-
from elasticsearch_dsl.connections import connections

from ultis.commons import ComFunc

# server connect
ELASTIC_HOST = "192.168.9.199"
ELASTIC_PORT = 9200
connections.create_connection(
    hosts=['{0}:{1}'.format(ELASTIC_HOST, ELASTIC_PORT)])


class CommonEs(object):
    """Functions shared by the ES helpers."""

    time_zone = "Asia/Shanghai"

    @classmethod
    def debug_query(cls, s):
        """
        Debug a search query dict.

        :param s: search object
        :return:
        """
        print('=' * 30)
        print(ComFunc.to_json_string(s.to_dict()))
        print('=' * 30)
from django.db import models

# Create your models here.
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])  # connect to the server


# Avoid analyzer errors
class CustonAnalyzer(_CustomAnalyzer):
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustonAnalyzer("ik_max_word", filter=["lowercase"])


class LagouType(DocType):
    # Lagou job-posting type
    suggest = Completion(analyzer=ik_analyzer)  # add a suggester
    url = Keyword()
    url_object_id = Keyword()
    title = Text(analyzer="ik_max_word")
    salary = Text(analyzer="ik_max_word")
    job_city = Text(analyzer="ik_max_word")
    work_years = Text(analyzer="ik_max_word")
def __init__(self, url_path, query_dict):
    self.url_path = url_path
    self.query_dict = query_dict
    # create ES connection
    connections.create_connection()
def enable_es():
    ES_URL = 'http://search:9200'
    connections.create_connection('default', hosts=[ES_URL])
import os
from urllib.parse import urlparse

import tldextract
from redis import Redis
from scrapy.crawler import CrawlerProcess
from elasticsearch_dsl import Index, Search, Mapping
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Document, DateRange, Keyword, Range, Text

# Initiate the elasticsearch connection
# hosts = "localhost"
# http_auth = ("elastic", "changeme")
# port = "9200"
hosts = [os.getenv("HOST")]
http_auth = (os.getenv("USERNAME"), os.getenv("PASSWORD"))
port = os.getenv("PORT")
client = connections.create_connection(hosts=hosts, http_auth=http_auth, port=port)

# initiate Redis connection
# redis_conn = Redis("127.0.0.1", os.getenv("REDIS_PORT", 6379))
redis_conn = Redis(os.getenv("REDIS_HOST", "redis"), os.getenv("REDIS_PORT", 6379))


def domains(url):
    """Get the domain of the url."""
    return tldextract.extract(url).registered_domain
import os
import sys
import time
import pickle

from elasticsearch_dsl.connections import connections

from okcom_tokenizer.tokenizers import CCEmojiJieba, UniGram
from marginalbear_elastic.query import post_search, post_multifield_query
from marginalbear_elastic.utils import concat_tokens
from marginalbear_elastic.ranking import avg_pmi

client = connections.create_connection(hosts=['elastic:changeme@localhost'], timeout=20)

ccjieba = CCEmojiJieba()
unigram = UniGram()

package_dir = os.path.dirname(os.path.realpath(__name__))


def query_ccjieba(input_sentence, pairs_cnt, total_pairs_cnt):
    query = ccjieba.cut(input_sentence.strip())
    results = post_search(client,
                          index='post',
                          tokenizer='ccjieba',
                          query=concat_tokens(query, pos=False),
                          top=100)
    tokenized_query = [str(i['word']) for i in query]
    sorted_ans = avg_pmi(tokenized_query, results, pairs_cnt, total_pairs_cnt,
# -*- coding: utf-8 -*-
from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])


class CustomAnalyzer(_CustomAnalyzer):
    def get_analysis_definition(self):
        return {}


ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])


class ArticleType(DocType):
    # jobbole.com article type
    suggest = Completion(analyzer=ik_analyzer)
    title = Text(analyzer="ik_max_word")
    create_date = Date()
    url = Keyword()
    url_obj_id = Keyword()
    img_url = Keyword()
    img_path = Keyword()
    praise_count = Integer()
    comment_count = Integer()
    collect_count = Integer()
    tags = Text(analyzer="ik_max_word")
    content = Text(analyzer="ik_max_word")

    class Meta:
# _*_ coding:utf-8 _*_
from es_config import host
from elasticsearch_dsl import DocType, Text, Date, Integer, Q, Keyword, analyzer, token_filter, tokenizer
from elasticsearch_dsl.connections import connections

# Define a default Elasticsearch client
connections.create_connection(hosts=[host])

pinyin_analyzer = analyzer('pinyin_analyzer',
                           tokenizer=tokenizer('my_pinyin', type='pinyin', lowercase=True))

# The two synonym filters originally shared the name "local_synonym" with
# different intervals; give the 60s variant its own name so the two
# definitions don't clash in one index.
local_dynamic_synonym_filter = token_filter(name_or_instance="local_dynamic_synonym",
                                            type="dynamic_synonym",
                                            synonyms_path="synonyms.txt",
                                            interval=60)

local_synonym = token_filter(name_or_instance="local_synonym",
                             type="dynamic_synonym",
                             synonyms_path="synonyms.txt",
                             interval=30)

ik_synonym_analyzer = analyzer("remote_ik_synonym_analyzer",
                               tokenizer='ik_max_word',
                               filter=[local_synonym])

ik_smart_synonym = analyzer("ik_smart_synonym",
                            tokenizer='ik_smart',
                            filter=[local_dynamic_synonym_filter])

ik_max_word_synonym = analyzer("ik_max_word_synonym",
                               tokenizer='ik_max_word',
                               filter=[local_dynamic_synonym_filter])
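# Hedged usage sketch: attaching the analyzers above to Document fields so
# the synonym and pinyin analysis apply at index time. The Goods document
# and its fields are illustrative, not from the source.
class Goods(DocType):
    name = Text(analyzer=ik_max_word_synonym)
    name_pinyin = Text(analyzer=pinyin_analyzer)

    class Meta:
        index = 'goods'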
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy

from .models.es_types import CompanyType
from elasticsearch_dsl.connections import connections

es = connections.create_connection(CompanyType._doc_type.using)


class HuijuCompanyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class CompanyItem(scrapy.Item):
    company_name = scrapy.Field()        # company name
    legal_person = scrapy.Field()        # legal representative
    registered_capital = scrapy.Field()  # registered capital
    telephone_number = scrapy.Field()    # telephone
    email = scrapy.Field()               # email
    company_url = scrapy.Field()         # company website
    address = scrapy.Field()             # address
    registration_time = scrapy.Field()   # registration date
    company_state = scrapy.Field()       # company status
from elasticsearch_dsl import Document, Date, Integer
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=['elastic:[email protected]'])


class ReservationDocument(Document):
    reserved_by = Integer()
    room = Integer()
    settle_date = Date()
    leave_date = Date()

    class Index:
        name = 'reservation'
        settings = {
            "number_of_shards": 1,
        }

    def save(self, **kwargs):
        return super(ReservationDocument, self).save(**kwargs)
"""
This file generates an elasticsearch index. Run this before running
web_app.py. Make sure that elasticsearch is running in the background
beforehand.
"""
import csv
import time

from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch_dsl import Index, Document, Text
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl.analysis import analyzer

# Connect to local host server
connections.create_connection(hosts=['127.0.0.1'])

# Create elasticsearch object
es = Elasticsearch()

# Analyzers for both the predicate and arguments.
predicate_analyzer = analyzer(
    'covid_predicate_analyzer',
    tokenizer='whitespace',
    # Use a stemmer to capture predicates with similar stems.
    filter=['lowercase', 'stemmer'])

argument_analyzer = analyzer('covid_argument_analyzer',
                             tokenizer='whitespace',
                             filter=['lowercase'])