def refresh(self, index=None):
    index = index or Index.objects.get_current().prefixed_name
    # Any time we're doing a refresh, we're making sure that the
    # index is ready to be queried. Given that, it's almost
    # always the case that we want to run all the generated tasks,
    # then refresh.
    connections.get_connection().indices.refresh(index=index)
def destroy(self):
    """Destroy an index."""
    self._refresh_connection()
    self.push_queue = []
    index_name = self.document_class()._get_index()  # pylint: disable=protected-access
    connections.get_connection().indices.delete(index_name, ignore=404)
    self._mapping_created = False
def store():
    with Store(store_type='elasticsearch', nodes='192.168.99.100:9200') as s:
        s._load_plugin(nodes='192.168.99.100:9200')
        try:
            connections.get_connection().indices.delete(index='indicators-*')
            connections.get_connection().indices.delete(index='tokens')
        except Exception:
            pass
        yield s
def create(self, data):
    logger.debug(data)
    for v in ['admin', 'read', 'write']:
        if data.get(v):
            data[v] = True

    if data.get('token') is None:
        data['token'] = self._generate()

    t = Token(**data)
    if t.save():
        connections.get_connection().indices.flush(index='tokens')
        return t.to_dict()
def reindex_tokens():
    TokenBackup.init()
    connections.create_connection(hosts=ES_NODES)

    backup_results = connections.get_connection().reindex(
        body={"source": {"index": INDEX_NAME}, "dest": {"index": BACKUP_INDEX_NAME}},
        request_timeout=3600,
    )
    if backup_results.get('created') + backup_results.get('updated') == backup_results.get('total'):
        Index(INDEX_NAME).delete()
    else:
        return 'Tokens did not backup properly'

    time.sleep(1)
    Token.init()
    reindex_results = connections.get_connection().reindex(
        body={"source": {"index": BACKUP_INDEX_NAME}, "dest": {"index": INDEX_NAME}},
        request_timeout=3600,
    )
    if reindex_results.get('created') + reindex_results.get('updated') == reindex_results.get('total'):
        return 'Tokens reindexed successfully!'
    else:
        return 'Tokens did not reindex from backup properly'
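# For reference, a hedged sketch of the reindex API response shape that the
# success checks above rely on (counts are illustrative, not from the source;
# created + updated == total signals every source document made it across):
example_response = {
    'took': 147,      # milliseconds the operation took
    'total': 120,     # documents the operation attempted to copy
    'created': 118,   # documents newly created in the destination index
    'updated': 2,     # documents that already existed and were overwritten
    'failures': [],   # per-document errors, empty on success
}
assert example_response['created'] + example_response['updated'] == example_response['total']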
def delete(self, data):
    if not (data.get('token') or data.get('username')):
        return 'username or token required'

    rv = list(self.search(data, raw=True))
    if not rv:
        return 0

    for t in rv:
        t = Token.get(t['_id'])
        t.delete()

    connections.get_connection().indices.flush(index='tokens')
    return len(rv)
def token_edit(self, data):
    if not data.get('token'):
        return 'token required for updating'

    s = Token.search()
    s = s.filter('term', token=data['token'])
    rv = s.execute()

    if not rv.hits.total > 0:
        return 'token not found'

    d = rv.hits.hits[0]
    d.update(fields=data)
    connections.get_connection().indices.flush(index='tokens')
def _perform_index_sync(self, sql_table_cls, es_doc_cls, id_logger):
    es_doc = es_doc_cls()
    elasticsearch_conn = connections.get_connection()
    sync_timestamp = current_server_timestamp()

    pending_insertions = self._compute_dirty_documents(
        sql_table_cls, es_doc.doc_type)
    bulk_op = self._synchronisation_op(es_doc, pending_insertions)

    self._logging(logging.INFO, 'Performing synchronization.')
    for ok, info in parallel_bulk(elasticsearch_conn, bulk_op):
        obj_id = info['index']['_id'] \
            if 'index' in info else info['update']['_id']
        if ok:
            # Mark the task as handled so we don't treat it again next time
            self._logging(logging.INFO,
                          'Document %s has been synced successfully.' % obj_id)
            sql_table_cls.update_last_sync(obj_id, sync_timestamp)
        else:
            id_logger(obj_id, logging.ERROR,
                      'Error while syncing document %s index.' % obj_id)

    # Refresh the index so the synced documents become searchable right away
    elasticsearch_dsl.Index(es_doc.index).refresh()
def handle(self, *args, **options):
    usings = options.get("using") or settings.ELASTICSEARCH_CONNECTIONS.keys()
    for using in usings:
        # figure out if there is a conflict with the analysis defined in ES
        # and the analysis defined in Python land for this connection
        index_name = settings.ELASTICSEARCH_CONNECTIONS[using]['index_name']
        es = connections.get_connection(using)
        result = is_analysis_compatible(using)
        if result is False:
            if options.get("clopen"):
                # get the existing analysis settings in ES, and combine
                # those with the ones defined in Python. Close the index,
                # update the settings, and re-open it
                analysis = combined_analysis(using)
                es.indices.close(index=index_name, ignore=[404])
                es.indices.put_settings(index=index_name, body={'analysis': analysis}, ignore=[404])
                es.indices.open(index=index_name, ignore=[404])
            else:
                self.stderr.write(
                    "The analysis defined in ES and the analysis defined by your "
                    "Indexes are not compatible. Aborting. "
                    "Use --clopen to close the index, update the analysis, and "
                    "open the index again."
                )
                self.stderr.write(diff_analysis(using))
                exit(1)

    super().handle(*args, **options)
    if self.confirmed:
        call_command("update_index", *args, **options)
def main():
    es = connections.get_connection()
    dry = '--dry' in sys.argv
    if not dry:
        utils.add_file_logger(logger, __file__)

    preprints = Preprint.objects.filter(
        primary_file__isnull=False
    ).select_related('primary_file', 'provider')
    total_preprints = preprints.count()
    logger.info('Collecting data on {} preprints...'.format(total_preprints))

    batch_to_update = []
    for i, preprint in enumerate(preprints, 1):
        preprint_id = preprint._id
        provider_id = preprint.provider._id
        file_id = preprint.primary_file._id
        page_counters = (
            PageCounter.objects
            .filter(
                _id__startswith='download:{preprint_id}:{file_id}:'.format(
                    preprint_id=preprint_id,
                    file_id=file_id
                )
            ).values_list('_id', 'date')
        )
        for page_counter in page_counters:
            page_counter__id, date_totals = page_counter
            version_num = page_counter__id.split(':')[-1]
            for date, totals in date_totals.items():
                timestamp = datetime.datetime.strptime(date, '%Y/%m/%d').replace(tzinfo=pytz.utc)
                batch_to_update.append({
                    '_index': 'osf_preprintdownload_{}'.format(
                        timestamp.strftime(settings.ELASTICSEARCH_METRICS_DATE_FORMAT)),
                    '_source': {
                        'count': totals['total'],
                        'path': '/{}'.format(file_id),
                        'preprint_id': preprint_id,
                        'provider_id': provider_id,
                        'timestamp': timestamp,
                        'user_id': None,  # PageCounter never tracked this
                        'version': int(version_num) + 1
                    },
                    '_type': 'doc'
                })

                if len(batch_to_update) >= MAX_BATCH_SIZE:
                    logger.info('Bulk-indexing data from {} PageCounter records'.format(len(batch_to_update)))
                    if not dry:
                        bulk(es, batch_to_update, max_retries=3, chunk_size=CHUNK_SIZE, request_timeout=REQUEST_TIMEOUT)
                    batch_to_update = []
                    # Allow elasticsearch to catch up
                    print('{}/{} preprints completed ({:.2f}%)'.format(i, total_preprints, i / total_preprints * 100))
                    sleep(THROTTLE_PERIOD)

    # Index final batch
    if batch_to_update:
        logger.info('Bulk-indexing data from {} PageCounter records'.format(len(batch_to_update)))
        if not dry:
            bulk(es, batch_to_update, max_retries=3, chunk_size=CHUNK_SIZE, request_timeout=REQUEST_TIMEOUT)
        logger.info('This will migrate {} Pagecounter entries to Elasticsearch'.format(len(batch_to_update)))
def index_model(label):
    logger.debug('index_model')
    Model = None
    SerializerClass = None
    try:
        Model = apps.get_model(label)
    except LookupError as e:
        logger.error(e)
        raise e
    try:
        SerializerClass = search_config.get_serializer_for_model(label)
    except LookupError as e:
        logger.error(e)
        raise e

    if Model and SerializerClass:
        serializer = SerializerClass()
        conn = connections.get_connection()  # Get default connection
        queryset = Model.objects.all()
        if hasattr(queryset, 'published'):
            queryset = queryset.published()
        if serializer.related_object_fields:
            queryset = queryset.prefetch_related(*serializer.related_object_fields)
        model_docs = (serializer.create_document(item) for item in queryset)
        doc_dicts = (doc.to_dict(include_meta=True) for doc in model_docs)
        return es_bulk(conn, doc_dicts)
def query_articles(self, query, prefs):
    client = connections.get_connection()
    search = Search(using=client, index='articles')
    q = Q('bool', must=[
        Q('exists', field='watson_analyzed'),
        Q('match', watson_success=True),
        Q('match', body=query),
    ])
    search = search.query(q)
    search.execute()

    documents = []
    for hit in search[:100]:
        if '#' not in hit.url and '?' not in hit.url:
            documents.append({
                'id': hit.meta.id,
                'title': hit.title,
                'body': hit.body,
                'url': hit.url,
                'score': hit.meta.score,
                'tone': dict(
                    joy=hit.tone.joy,
                    fear=hit.tone.fear,
                    sadness=hit.tone.sadness,
                    disgust=hit.tone.disgust,
                    anger=hit.tone.anger
                ),
                'top_image': hit.top_image
            })

    if len(documents) < 10:
        return documents
    else:
        return select_k_and_sort(documents, prefs)
def update_sentiments(self):
    from watson_developer_cloud import ToneAnalyzerV3Beta
    tone_analyzer = ToneAnalyzerV3Beta(username='******',
                                       password='******',
                                       version='2016-02-11')
    client = connections.get_connection()
    search = Search(using=client, index='articles', doc_type='article')
    q = Q('bool', must=[Q('missing', field='watson_analyzed')])
    search = search.query(q)

    counter = 0
    for result in search.scan():
        doc = Article.get(result.meta.id)
        try:
            analysis = tone_analyzer.tone(text=doc.body)
            tone_categories = analysis['document_tone']['tone_categories']
            emotion_tones = list(filter(lambda x: x['category_id'] == 'emotion_tone',
                                        tone_categories))[0]
            doc.tone = {}
            for tone in emotion_tones['tones']:
                doc.tone[tone['tone_id']] = tone['score']
            doc.watson_success = True
        except WatsonException:
            continue
        finally:
            doc.watson_analyzed = True
            doc.save()
            counter += 1
            print(counter)

    if counter == 0:
        raise RealError()
def handle(self, *args, **options):
    Index(ElasticAddress._doc_type.index).delete(ignore=404)
    ElasticAddress.init()

    es = connections.get_connection('default')
    es.indices.put_settings(
        index=ElasticAddress._doc_type.index,
        body={
            "number_of_replicas": 0,
            "index.max_result_window": 50000
        }
    )

    Address.objects.reindex()
    self.stdout.write(
        'Loaded {} addresses to persistence storage'.format(
            Address.objects.count()))

    ownership_idx.delete(ignore=404)
    ownership_idx.create()
    ElasticOwnership.init()
    Ownership.objects.select_related("prop__address").reindex()
    self.stdout.write(
        'Loaded {} ownerships to persistence storage'.format(
            Ownership.objects.count()))
def _index_all_blogitems(self):
    iterator = BlogItem.objects.all()
    category_names = dict((x.id, x.name) for x in Category.objects.all())
    categories = defaultdict(list)
    for e in BlogItem.categories.through.objects.all():
        categories[e.blogitem_id].append(category_names[e.category_id])

    es = connections.get_connection()
    report_every = 100
    count = 0
    doc_type_name = _get_doc_type_name(BlogItem)
    t0 = time.time()
    for success, doc in streaming_bulk(
        es,
        (m.to_search(all_categories=categories).to_dict(True) for m in iterator),
        index=settings.ES_BLOG_ITEM_INDEX,
        doc_type=doc_type_name,
    ):
        if not success:
            print("NOT SUCCESS!", doc)
        count += 1
        if not count % report_every:
            print(count)
    t1 = time.time()
    self.out("DONE Indexing {} blogitems in {} seconds".format(count, t1 - t0))
def restore_tokens():
    connections.create_connection(hosts=ES_NODES)
    Index(INDEX_NAME).delete()

    class Token(DocType):
        username = String()
        token = String()
        expires = Date()
        read = Boolean()
        write = Boolean()
        revoked = Boolean()
        acl = String()
        groups = String()
        admin = Boolean()
        last_activity_at = Date()

        class Meta:
            index = INDEX_NAME

    Token.init()
    reindex_results = connections.get_connection().reindex(
        body={"source": {"index": BACKUP_INDEX_NAME}, "dest": {"index": INDEX_NAME}},
        request_timeout=3600,
    )
    if reindex_results.get('created') + reindex_results.get('updated') == reindex_results.get('total'):
        return 'Tokens restored to previous schema successfully!'
    else:
        return 'Tokens did not restore from backup properly'
def get_es(alias='default'):
    """Retrieve Elasticsearch instance

    :arg alias: the alias in ES_URLS for this Elasticsearch connection
    """
    return connections.get_connection(alias=alias)
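# A minimal usage sketch for get_es(), assuming elasticsearch-dsl's connection
# registry has been configured with the aliases named in ES_URLS (the host
# below is an illustrative assumption, not from the original source):
from elasticsearch_dsl import connections

connections.configure(default={'hosts': ['http://localhost:9200']})
es = get_es()          # resolves the 'default' alias
es.cluster.health()    # any elasticsearch-py client method is available from here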
def handle(self, *args, **options):
    es = connections.get_connection()
    self.stdout.write('Deleting all the indices')
    es.indices.delete('blog-search')
    es.indices.create('blog-search', settings.ES_INDICES_SETTINGS)
    for post in Post.objects.all():
        PostES.index_post(post)
def setUpClass(cls):
    super(ElasticTestCase, cls).setUpClass()
    if not getattr(settings, 'ES_URLS', None):
        cls.skipme = True
        return
    try:
        connections.get_connection().cluster.health()
    except ConnectionError:
        cls.skipme = True
        return
    cls._old_es_index_prefix = settings.ES_INDEX_PREFIX
    settings.ES_INDEX_PREFIX = 'test-%s' % settings.ES_INDEX_PREFIX
    cls._old_es_live_index = settings.ES_LIVE_INDEX
    settings.ES_LIVE_INDEX = True
def token_last_activity_at(self, token, timestamp=None):
    s = Token.search()
    s = s.filter('term', token=token.decode('utf-8'))
    rv = s.execute()

    if rv.hits.total > 0:
        rv = rv.hits.hits[0]
        rv = Token.get(rv['_id'])
        if timestamp:
            self.logger.debug('updating timestamp to: {}'.format(timestamp))
            rv.update(last_activity_at=timestamp)
            connections.get_connection().indices.flush(index='tokens')
            return timestamp
        else:
            return rv.last_activity_at
    else:
        return timestamp
def search_results_page(request):
    es_conn = connections.get_connection()
    term = request.GET.get('q', '').strip().lower()
    days_ago = request.GET.get('days_ago', 30)
    size = request.GET.get('size', 10)
    offset = request.GET.get('offset', 0)
    start_date, end_date = get_date_range_from_args(request)
    prev_start_date = start_date - timedelta(days=int(days_ago))
    prev_end_date = end_date - timedelta(days=int(days_ago))
    current_histogram = get_term_counts_histogram(
        es_conn, term, start_date, end_date
    )
    prev_histogram = get_term_counts_histogram(
        es_conn, term, prev_start_date, prev_end_date
    )
    current_total = sum(current_histogram.values())
    prev_total = sum(prev_histogram.values())
    docs = get_text_search_results(
        start_date,
        end_date,
        {'content': term},
        size=size,
        offset=offset
    )
    for doc in docs:
        doc['mentions'] = doc['content'].lower().count(term.lower())
        doc['search_phrase'] = term
        date_issued = datetime.strptime(doc['date_issued'], '%Y-%m-%d')
        doc['human_date'] = date_issued.strftime('%b %d, %Y')
        i = doc['content'].lower().find(term.lower())
        if i >= 0:  # find() returns -1 when the term is absent
            start = max(0, i - 100)
            end = min(len(doc['content']), i + 200)
            doc['snippet'] = doc['content'][start:end]
        speakers = doc.get('speakers', '').split(',')
        doc['speakers'] = []
        for s in speakers:
            matched_bioguide_data = match_speaker_to_bioguide(s)
            if matched_bioguide_data:
                doc['speakers'].append(matched_bioguide_data)
    return JsonResponse(
        {
            'delta': int(100 * ((current_total - prev_total) / float(max(prev_total, 1)))),
            'docs': docs,
            'term': term,
            'current_period': {
                'daily_breakdown': [
                    {'date': k, 'count': v} for k, v in current_histogram.items()
                ],
                'total_count': current_total
            },
            'previous_period': {
                'daily_breakdown': [
                    {'date': k, 'count': v} for k, v in prev_histogram.items()
                ],
                'total_count': prev_total
            },
            'start_date': start_date,
            'end_date': end_date,
        }
    )
def __init__(self, config='cdr', size=2000):
    """
    :param config: str Name of the elasticsearch connection alias to use
    :param size: int Size limit to set on elasticsearch query
    """
    self.conn = connections.get_connection(config)
    self.elastic = Search('cdr', extra={'size': size})
def existing_analysis(using):
    """
    Get the existing analysis for the `using` Elasticsearch connection
    """
    es = connections.get_connection(using)
    index_name = settings.ELASTICSEARCH_CONNECTIONS[using]['index_name']
    if es.indices.exists(index=index_name):
        return stringer(
            es.indices.get_settings(index=index_name)[index_name]['settings']['index'].get('analysis', {})
        )
    return DOES_NOT_EXIST
def search(cls, **kwargs):
    options = {
        'using': connections.get_connection(),
        'index': cls.get_index(),
        'doc_type': {cls._doc_type.name: cls.from_es},
    }
    options.update(kwargs)
    sq = Search(**options)
    return sq
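# Illustrative only: the Search object returned by search() chains like any
# elasticsearch-dsl query (the Article document class, field names, and query
# are hypothetical assumptions, not from the original source).
results = (
    Article.search()
    .filter('term', published=True)
    .query('match', title='elasticsearch')[:10]
    .execute()
)
for hit in results:
    print(hit.meta.id, hit.meta.score)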
def search(cls, **kwargs): options = { "using": connections.get_connection(), "index": cls.get_index(), "doc_type": {cls._doc_type.name: cls.from_es}, } options.update(kwargs) sq = Search(**options) return sq
def handle(self, *args, **options):
    doc_types = ','.join(args) or None
    output = self.stdout
    output.write('[')
    es = connections.get_connection()
    for idx, doc in enumerate(scan(es, index=options['index'], doc_type=doc_types)):
        if idx > 0:
            output.write(',')
        output.write(json.dumps(doc, indent=options['indent']), ending='')
    output.write(']')
def bulk_update(cls, dicts, client=None):
    def upsert(doc):
        d = doc.to_dict(True)
        d['_op_type'] = 'update'
        d['doc'] = d['_source']
        d['doc_as_upsert'] = True
        del d['_source']
        return d

    client = client or connections.get_connection()
    return bulk(client, (upsert(d) for d in dicts))
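# A hedged sketch of what upsert() above hands to the bulk helper for one
# document (the Article class, id, and field values are hypothetical):
#
#   {'_index': 'articles', '_id': 42, '_op_type': 'update',
#    'doc': {'title': 'Hello'}, 'doc_as_upsert': True}
#
# so a call might look like:
Article.bulk_update([Article(meta={'id': 42}, title='Hello')])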
def bulk_load(docs_to_index):
    conn = connections.get_connection()
    index = NameVariant._doc_type.index
    for response in streaming_bulk(
            conn,
            docs_to_index,
            index=index,
            doc_type=NameVariant._doc_type.name):
        pass
def bulk_save(cls, dicts):
    objects = (
        dict(
            d.to_dict(include_meta=True),
            **{'_index': cls.set_index_name(int(d.ano_eleicao))}
        )
        for d in dicts
    )
    client = connections.get_connection()
    return bulk(client, objects)
def test_delete_index(self):
    # first create and populate the index
    index = Index.objects.create()
    index.populate()
    # then delete it and check if recreating works without blowing up
    index.delete()
    es = connections.get_connection()
    es.indices.create(index.prefixed_name)
    es.indices.delete(index.prefixed_name)
def create_app(config_name='default'):
    """
    Create Flask app

    :param config_name:
    :return: Flask
    """
    from .api import blueprint as api_blueprint

    app = Flask(__name__)
    CORS(app, resources={
        r"/api/*": {"origins": "*"}
    })
    app.config.from_object(config[config_name])
    config[config_name].init_app(app)

    connections.create_connection(
        hosts=app.config['ELASTICSEARCH_HOST'],
        http_auth=(app.config['ELASTICSEARCH_USER'], app.config['ELASTICSEARCH_SECRET']),
        timeout=20
    )
    # presumably here to surface connection misconfiguration at startup
    connections.get_connection()

    app.register_blueprint(api_blueprint)
    extensions(app)

    @app.after_request
    def after_request(response):
        response.headers.add('Access-Control-Allow-Origin', '*')
        if request.method == 'OPTIONS':
            response.headers['Access-Control-Allow-Methods'] = 'DELETE, GET, POST, PUT'
            headers = request.headers.get('Access-Control-Request-Headers')
            if headers:
                response.headers['Access-Control-Allow-Headers'] = headers
        return response

    return app
def process_batch(self, jobs):
    # query redis in a pipeline
    pipe = listinghash_db.pipeline()
    for job in jobs:
        pipe.get(job['job_data']['listingHash'])
    job_info_xs = pipe.execute()

    # deserialize jobs
    job_info_xs = [json.loads(job) if job else job for job in job_info_xs]

    # detect duplicates via the listinghash cache
    _to_update = []
    _to_norm = []
    for job_info, job in zip(job_info_xs, jobs):
        if job_info:
            _to_update.append((job['job_data'], job_info))
        else:
            _to_norm.append(job)

    # bulk update old jobs
    # TODO: log old jobs to a topic for destination reasoning
    # the logger slows down the processor, so don't use it here
    process_seq = r_db.get(KEY_PROCESS_SEQ)
    if not process_seq:
        self.logger.error("process_seq is empty")
        is_ok = False
    else:
        _better_job = []
        for job, job_info in _to_update:
            if better_job(job, job_info) >= 0:
                # job = job.copy()
                job['_id'] = job_info['_id']
                _better_job.append(job)
        es_actions = bulk_update_actions(_better_job, process_seq)
        conn = connections.get_connection()
        is_ok = bulk_execute(self.logger, conn, es_actions)

    # if the bulk update failed, we could norm the old jobs as well, so that
    # [_to_update] ++ [_to_norm] <- jobs
    # update: just discard them instead
    if not is_ok:
        # _to_norm = jobs
        self.logger.warn("Failed to bulk update existing jobs")

    # reporting
    count_all_job = len(jobs)
    count_old_job = len(_to_update)
    count_new_job = len(_to_norm)
    r_db.incr(r_total_job_key, count_all_job)
    r_db.incr(r_new_job_key, count_new_job)
    r_db.incr(r_old_job_key, count_old_job)

    # send to normalizer
    for job in _to_norm:
        self.produce_msg(**job)
def save(self, using=None, index=None, **kwargs):
    es = connections.get_connection()
    doc_meta = dict(
        (k, self.meta[k]) for k in DOC_META_FIELDS if k in self.meta)
    doc_meta.update(kwargs)
    meta = es.index(index=self._get_index(),
                    doc_type=self._doc_type.name,
                    body=self.serializer.data,
                    **doc_meta)
    return meta
def setUp(self):
    self.es_conn = connections.get_connection()
    self.test_crecs = []
    for i in range(20):
        self.test_crecs.append(
            CRECDoc(
                title=str(i),
                content='foo bar baz Foo',
                date_issued=datetime(2017, 1, i % 5 + 1)
            )
        )
    self.index = Index(settings.ES_CW_INDEX)
    CRECDoc.init()
    for c in self.test_crecs:
        c.save(refresh=True)
    self.client = Client()
def tokens_create(self, data):
    self.logger.debug(data)

    if data.get('admin'):
        data['admin'] = True
    if data.get('read'):
        data['read'] = True
    if data.get('write'):
        data['write'] = True

    if not data.get('token'):
        data['token'] = self._token_generate()

    self.logger.debug(data)
    t = Token(**data)
    if t.save():
        connections.get_connection().indices.flush(index='tokens')
        return t.__dict__['_d_']
def setUpClass(cls):
    try:
        super(ElasticTestCase, cls).setUpClass()
    except AttributeError:
        # python 2.6 has no setUpClass, but that's okay
        pass
    if not getattr(settings, 'ES_URLS', None):
        cls.skipme = True
        return
    try:
        connections.get_connection().cluster.health()
    except ConnectionError:
        cls.skipme = True
        return
    cls._old_es_index_prefix = settings.ES_INDEX_PREFIX
    settings.ES_INDEX_PREFIX = 'test-%s' % settings.ES_INDEX_PREFIX
    # TODO: cleanup after upgrading test-utils (also in tearDownClass)
    cls._old_es_live_index = settings.ES_LIVE_INDEX
    settings.ES_LIVE_INDEX = True
def es_conn(server=settings.ES_SERVER):
    """Standardized connection to the ES cluster.

    :param server: a server definition of the form [host:port, ...]. See
        https://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch
        for alternate host specification options.
    :return: an Elasticsearch connection instance
    """
    connections.configure(default=server, max_retries=1, sniff_on_start=False)
    return connections.get_connection()
def index_video():
    """Index all the video objects to the search server.

    Called by the pustakalaya_search app's index_pustakalaya management command.
    """
    from .models import Video

    # Create an index and populate the mappings
    VideoDoc.init()

    # Get the elasticsearch client
    es = connections.get_connection()

    # Bulk index all published videos
    print("Indexing videos...")
    bulk(client=es, actions=(b.bulk_index() for b in Video.objects.all().iterator() if b.published == "yes"))
def setUpClass(cls):
    try:
        super(ElasticTestCase, cls).setUpClass()
    except AttributeError:
        # python 2.6 has no setUpClass, but that's okay
        pass
    if not getattr(settings, 'ES_URLS', None):
        cls.skipme = True
        return
    try:
        connections.get_connection().cluster.health()
    except ConnectionError:
        cls.skipme = True
        return
    cls._old_es_index_prefix = settings.ES_INDEX_PREFIX
    settings.ES_INDEX_PREFIX = 'test-{0!s}'.format(settings.ES_INDEX_PREFIX)
    cls._old_es_live_index = settings.ES_LIVE_INDEX
    settings.ES_LIVE_INDEX = True
def setUp(self):
    super(TestsWithData, self).setUp()
    self.docs = [
        self.TestDoc(title='doc-' + str(i))
        for i in range(1000)
    ]
    actions = [d.to_dict(include_meta=True) for d in self.docs]
    inserted, errors = bulk(connections.get_connection(), actions=actions, refresh=True)
    self.assertEqual(inserted, len(actions))
    self.assertEqual(len(errors), 0)
def build_index(self, document_parquet):
    if self.awsauth is not None:
        connections.create_connection(
            hosts=self.hosts,
            http_auth=self.awsauth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection)
    else:
        connections.create_connection(hosts=self.hosts)

    logger.info('Building elastic index')
    Page.init()

    # This is a parquet file to load from
    df = pd.read_parquet(document_parquet)
    unique_pages = df.groupby(
        ['pdf_name', 'page_num', 'dataset_id', 'img_pth']).agg(lambda x: list(x))
    to_add = []
    for i, row in unique_pages.iterrows():
        to_add.append(
            Page(pdf_name=i[0],
                 page_num=i[1],
                 dataset_id=i[2],
                 img_pth=i[3],
                 pdf_dims=row['pdf_dims'][0].tolist(),
                 bbox=[j.tolist() for j in row['bounding_box']],
                 classes=[j.tolist() for j in row['classes']],
                 scores=[j.tolist() for j in row['scores']],
                 postprocess_cls=row['postprocess_cls'],
                 postprocess_score=row['postprocess_score'],
                 detect_cls=row['detect_cls'],
                 detect_score=row['detect_score']))
        if len(to_add) == 1000:
            bulk(connections.get_connection(), (upsert(d) for d in to_add))
            to_add = []
    bulk(connections.get_connection(), (upsert(d) for d in to_add))
    logger.info('Done building page index')
def index_all(cls, using=None, delete=False, **kwargs):
    def actions_generator():
        for obj in cls.index_queryset().iterator():
            yield cls.from_django(obj).to_dict(include_meta=True)

    client = connections.get_connection(using or cls._doc_type.using)
    if delete:
        client.indices.delete(index=cls._doc_type.index, ignore=[400, 404])
    cls._doc_type.init()
    for ok, item in streaming_bulk(client, actions_generator(), refresh=True, **kwargs):
        yield ok, item
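# Illustrative only: index_all() is a generator, so the caller drives the
# indexing by iterating it (ArticleDoc is a hypothetical subclass; extra
# kwargs such as chunk_size are forwarded to streaming_bulk).
for ok, item in ArticleDoc.index_all(delete=True, chunk_size=500):
    if not ok:
        print('failed to index:', item)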
def index_all(cls, index_name, using=None, **kwargs):
    def actions_generator():
        for obj in cls.index_queryset().iterator():
            elastic_data = cls.from_django(obj).to_dict(include_meta=True)
            elastic_data['_index'] = index_name
            yield elastic_data

    client = connections.get_connection(using or cls._doc_type.using)
    cls.init(index_name)
    for ok, item in streaming_bulk(client, actions_generator(), chunk_size=90, **kwargs):
        yield ok, item
def search(self, token, filters, sort='reporttime', raw=False, timeout=TIMEOUT):
    limit = filters.get('limit', LIMIT)

    s = Indicator.search(index='{}-*'.format(self.indicators_prefix))
    s = s.params(size=limit, timeout=timeout)
    s = s.sort('-reporttime', '-lasttime')
    s = filter_build(s, filters, token=token)

    logger.debug(s.to_dict())

    start = time.time()
    try:
        es = connections.get_connection(s._using)
        old_serializer = es.transport.deserializer

        if raw:
            rv = es.search(index=s._index, doc_type=s._doc_type, body=s.to_dict(), **s._params)
        else:
            es.transport.deserializer = self.Deserializer()
            rv = es.search(index=s._index, doc_type=s._doc_type, body=s.to_dict(),
                           filter_path=['hits.hits._source'], **s._params)

        # transport caches this, so the tokens mis-fire
        es.transport.deserializer = old_serializer

    except elasticsearch.exceptions.RequestError as e:
        logger.error(e)
        es.transport.deserializer = old_serializer
        return

    # catch all other es errors
    except elasticsearch.ElasticsearchException as e:
        logger.error(e)
        es.transport.deserializer = old_serializer
        raise CIFException

    logger.debug('query took: %0.2f' % (time.time() - start))

    return rv
def sync_orders():
    highest_id = None
    try:
        r = Search(index='py-orders').sort('-_id')[0].execute()
        highest_id = int(r.hits[0].meta.id)
    except TransportError as e:
        if e.status_code == 404:
            highest_id = 0

    order_docs = []
    for order in models.Order.objects.filter(
            id__gt=highest_id).prefetch_related('customer'):
        order_docs.append(
            documents.Order(**order.to_search()).to_dict(include_meta=True))

    bulk(connections.get_connection(), order_docs)
def init_app(app, db):
    email_service.sender.init_app(app)
    connections.create_connection(
        hosts=[{'host': app.config['ES_HOST'], 'port': app.config['ES_PORT']}],
        use_ssl=app.config['ES_USE_SSL'],
        connection_class=RequestsHttpConnection,
        timeout=120
    )
    app.elasticsearch = connections.get_connection()
    db.event.listen(db.session, 'before_commit', search.before_commit)
    db.event.listen(db.session, 'after_commit', search.after_commit)
def __init__(self, *args, **kwargs):
    assert self.document is not None
    self.client = connections.get_connection(
        self.document._get_using()
    )
    self.index = self.document._index._name
    self.mapping = self.document._doc_type.mapping.properties.name
    self.search = Search(
        using=self.client,
        index=self.index,
        doc_type=self.document._doc_type.name
    )
    super(BaseDocumentViewSet, self).__init__(*args, **kwargs)
def do_multi_search(queries: List[ElasticSearchMultiSearchQuery], connection_type=DATA_CONNECTION):
    try:
        conn = connections.get_connection(alias=connection_type)
        multi_search_body = []
        for query_i in queries:
            multi_search_body.append({'index': query_i.index})
            if query_i.body is None:
                query_i.body = {}
            query_i.body['track_total_hits'] = True
            multi_search_body.append(query_i.body)
        return conn.msearch(body=multi_search_body)
    except Exception:
        traceback.print_exc()
        raise Exception("ERROR: can't retrieve elastic search data!")
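# A hedged sketch of the interleaved header/body layout msearch expects, as
# built above (the index names and queries are illustrative assumptions):
multi_search_body = [
    {'index': 'articles'},
    {'query': {'match_all': {}}, 'track_total_hits': True},
    {'index': 'tokens'},
    {'query': {'term': {'username': 'alice'}}, 'track_total_hits': True},
]
# conn.msearch serializes this as one header/body pair per query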
def tokens_delete(self, data):
    if not (data.get('token') or data.get('username')):
        return 'username or token required'

    s = Token.search()
    if data.get('username'):
        s = s.filter('term', username=data['username'])
    if data.get('token'):
        s = s.filter('term', token=data['token'])
    rv = s.execute()

    if rv.hits.total > 0:
        for t in rv.hits.hits:
            t = Token.get(t['_id'])
            t.delete()
        connections.get_connection().indices.flush(index='tokens')
        return rv.hits.total
    else:
        return 0
def initiate_es_specific_state_if_is_enabled(self):
    """
    Initiates elasticsearch specific state if elasticsearch is enabled.

    Should be called in the class `__init__` method.
    """
    if not settings.ES_DISABLED:
        self.client = connections.get_connection(
            self.document._get_using())
        self.index = self.document._index._name
        self.mapping = self.document._doc_type.mapping.properties.name
        self.search = Search(using=self.client,
                             index=self.index,
                             doc_type=self.document._doc_type.name)
def test_smrt_elasticsearch():
    with Smrt(remote=REMOTE, client='elasticsearch') as s:
        assert type(s) is Smrt

        x = s.process('test/smrt/rules/csirtg.yml', feed='port-scanners')
        assert len(x) > 0

        x = s.process('test/smrt/rules/csirtg.yml', feed='port-scanners')
        assert len(x) > 0

    # cleanup
    es = connections.get_connection()
    cli = elasticsearch.client.IndicesClient(es)
    cli.delete(index='indicators-*')
def _health_check(self):
    try:
        x = connections.get_connection().cluster.health()
    except ConnectionError as e:
        logger.warn('elasticsearch connection error')
        logger.error(e)
        return
    except Exception:
        logger.error(traceback.format_exc())
        return

    logger.info('ES cluster is: %s' % x['status'])
    return x
def test_delete_index(self):
    # first create and populate the index
    index = Index.objects.create()
    index.populate()
    # then delete it and check if recreating works without blowing up
    index.delete()
    es = connections.get_connection()
    try:
        es.indices.create(index.prefixed_name)
    except RequestError:
        assert False
    es.indices.delete(index.prefixed_name)
def community_detail(request, community_name):
    """Community detail page"""
    client = connections.get_connection()
    # s = Search(using=client, index=settings.ES_INDEX).query("match", )
    # Query the total number of items in elastic search having the name of this community
    # es_count = Search(index="pustakalaya").using(client).query("match", communities=community_name).count()

    # Context data
    context = {}
    community_name = " ".join(community_name.split("-"))

    # Query all the collections that contain this community_name from the ORM
    collections = Collection.objects.filter(community_name=community_name)

    collection_list = []
    all_total = 0
    for collection in collections:
        # Get the total number of items having this collection name in elastic search
        # item_count_per_collection = Search(index="pustakalaya").using(client).query("match", communities=collection).count()
        item_count_per_collection = Search(
            index="pustakalaya").using(client).query(
            "match", collections=collection.collection_name).count()
        all_total += item_count_per_collection
        pk = collection.pk
        # Create a list entry containing collection_name and total count
        collection_list.append({
            "collection_name": collection.collection_name,
            "total_count": all_total,
            "es_count": item_count_per_collection,
            "pk": pk,
        })

    # TODO: implement total count
    # TODO: sort the list to display in alphabetical order
    context["collection_list"] = collection_list
    context["community_name"] = community_name

    return render(request, "collection/community_detail.html", context)
def handle(self, *args, **options): if "datasource" not in options: self.stderr.write("You need to specify datasource to reindex") return config = django_apps.app_configs[options["datasource"]] ElasticModel = config.elastic_model Model = config.data_model idx = config.elastic_index conn = connections.get_connection("default") if options["drop_indices"]: idx.delete(ignore=404) idx.create() ElasticModel.init() conn.indices.put_settings( index=ElasticModel._doc_type.index, body={"index.max_result_window": int(Model.objects.count() * 2. + 1)}, ) Model.setup_indexing() qs = Model.objects.all() if options["only_last_n_days"] is not None: qs = qs.filter( last_updated_from_dataset__gte=date.today() - relativedelta(days=options["only_last_n_days"]) ) docs_to_index = [] with tqdm(total=qs.count()) as pbar: for p in qs.iterator(): pbar.update(1) doc = p.to_dict() if doc is None: self.stderr.write("Cannot parse {} document".format(p)) continue docs_to_index.append(ElasticModel(**doc)) if len(docs_to_index) > options["batch_size"]: self.bulk_write(conn, docs_to_index) docs_to_index = [] self.bulk_write(conn, docs_to_index) self.stdout.write( "{} of {} records indexed into ES".format(qs.count(), config.name) )
def search(cls, **kwargs):
    kwargs.update({
        'using': connections.get_connection(),
        'index': cls.get_index(),
        'doc_type': {cls._doc_type.name: cls.from_es},
    })
    sq = Search(**kwargs)

    # Add highlighting.
    sq = sq.highlight(*cls.excerpt_fields)
    sq = sq.highlight_options(order='score')

    return sq
def push(self):
    """Push built documents to ElasticSearch."""
    self._refresh_connection()
    self.create_mapping()

    if not self.push_queue:
        logger.debug("No documents to push, skipping push.")
        return

    logger.debug("Found %s documents to push to Elasticsearch.", len(self.push_queue))

    bulk(connections.get_connection(),
         (doc.to_dict(True) for doc in self.push_queue),
         refresh=True)

    self.push_queue = []
    logger.debug("Finished pushing built documents to the Elasticsearch server.")
def handle(self, *args, **options):
    for index in options['indices']:
        if index == 'declarations_v2':
            doc_type = Declaration
        elif index == 'nacp_declarations':
            doc_type = NACPDeclaration

        es = connections.get_connection('default')
        if es.indices.exists(index=index):
            self.stdout.write('Index "{}" already exists, not creating.'.format(index))
            return

        doc_type.init()
        es.indices.put_settings(index=index, body=CATALOG_INDEX_SETTINGS)
        self.stdout.write('Created index "{}".'.format(index))
def run():
    # create the mappings in elasticsearch
    DocumentIndex.init()

    # create and save a document
    document = DocumentIndex(meta={'id': 42})
    document.ocr_json = {'hello': 'ok'}
    document.save()

    document = DocumentIndex.get(id=42)
    # print(document.ocr_json)

    # Display cluster health
    print(connections.get_connection().cluster.health())
def _create_index(self):
    # monthly index naming: 'indicators-YYYY.MM'
    dt = datetime.utcnow()
    dt = dt.strftime('%Y.%m')
    es = connections.get_connection()

    if not es.indices.exists('indicators-{}'.format(dt)):
        index = Index('indicators-{}'.format(dt))
        index.aliases(live={})
        index.doc_type(Indicator)
        index.create()

        m = Mapping('indicator')
        m.field('indicator_ipv4', 'ip')
        m.field('indicator_ipv4_mask', 'integer')
        m.save('indicators-{}'.format(dt))
    return 'indicators-{}'.format(dt)
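# Illustrative only: the live={} alias attached above lets callers query across
# all the monthly indices through a single name (assumes a registered default
# connection; the IP value is a hypothetical example):
from elasticsearch_dsl import Search

hits = Search(index='live').query('term', indicator_ipv4='198.51.100.7').execute()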