def setUp(self):
    """Connect to the local ES node and build a user-search helper on a clean index."""
    super(TestUserSearch, self).setUp()
    loop = self.io_loop
    self.es_connection = ESConnection("localhost", 9200, loop)
    search_kwargs = dict(
        index_mapping=user_index_mapping,
        index_name="index",
        type_name="user",
        analyze_fields=user_analyze_fields,
        none_analyze_fields=user_none_analyze_fields,
        io_loop=loop,
    )
    self.us = ESSearch(**search_kwargs)
    # Wipe the index synchronously so each test starts from a known state.
    loop.run_sync(self.setup_coro)
def test_use_of_custom_http_clients(self):
    """A custom HTTP client handed to ESConnection must receive the fetch call."""
    fake_client = Mock()
    fake_client.fetch = Mock()
    connection = ESConnection(
        "localhost", "9200", self.io_loop, custom_client=fake_client
    )
    term_query = {"query": {"term": {"ID": "171171"}}}
    connection.search(
        callback=self.stop, source=term_query, type="materia", index="teste"
    )
    fake_client.fetch.assert_called()
def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None):
    """Build sync (pyelasticsearch) and async (tornadoes) ES clients from *config*."""
    self.debug = False
    self.config = config
    # An externally supplied session wins; otherwise connect_to_db() may set one.
    if db is not None:
        self.db = db
    endpoint = (
        '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s'
        % config
    )
    self.syncES = ElasticSearch(endpoint)
    self.asyncES = ESConnection(
        host=config.get('ELASTIC_SEARCH_HOST'),
        port=config.get('ELASTIC_SEARCH_PORT'),
        io_loop=io_loop,
        protocol=config.get('ELASTIC_SEARCH_PROTOCOL'),
    )
    self.index = config.get('ELASTIC_SEARCH_INDEX')
    self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES')
def make_app():
    """Assemble the Tornado application wired to MongoDB and Elasticsearch."""
    route_table = [
        (r"/", MainHandler),
        (r"/news/new", NewsCreateHandler),
        (r"/news/create", NewsCreateHandler),
        (r"/news/search", NewsSearchHandler),
        (r"/news/([^/]+)", NewsShowHandler),
    ]
    mongo_client = motor.motor_tornado.MotorClient("mongodb://mongodb:27017")
    here = os.path.dirname(__file__)
    app_settings = dict(
        template_path=os.path.join(here, "templates"),
        static_path=os.path.join(here, "static"),
        xsrf_cookies=True,
        cookie_secret="__TODO:_GENERATE_YOUR_OWN_RANDOM_VALUE_HERE__",
        debug=True,
        db=mongo_client.test,
        es=ESConnection("elasticsearch", 9200),
        es_index="feed",
    )
    return tornado.web.Application(route_table, **app_settings)
class ESConnectionTestBase(AsyncTestCase):
    """Shared fixture: connects to a local ES node and records its version."""

    def setUp(self):
        super(ESConnectionTestBase, self).setUp()
        self.es_connection = ESConnection("localhost", "9200", self.io_loop)
        self._set_version()

    def tearDown(self):
        # Only close loops we own; never close the process-wide singleton.
        owns_private_loop = (
            not IOLoop.initialized() or self.io_loop is not IOLoop.instance()
        )
        if owns_private_loop:
            self.io_loop.close(all_fds=True)
        super(AsyncTestCase, self).tearDown()

    def _set_version(self):
        # Fetch the server banner from "/" and parse its version triple.
        self.es_connection.get_by_path("/", self.stop)
        banner = escape.json_decode(self.wait().body)
        number = banner['version']['number']
        self.version = [int(piece) for piece in number.split('.') if piece.isdigit()]

    def _set_count_query(self, query):
        # Pre-1.0 servers expect the bare query body for _count requests.
        return query['query'] if self.version[0] < 1 else query
class SearchHandler(tornado.web.RequestHandler):
    """Serves a match_all search against the configured ES cluster as JSON."""

    # One shared connection for every request handled by this class.
    es_connection = ESConnection(ES_HOST, ES_PORT)

    @tornado.web.asynchronous
    @coroutine
    def get(self, indice="index", tipo="user"):
        match_everything = {"query": {"match_all": {}}}
        response = yield self.es_connection.search(
            index=indice, type=tipo, source=match_everything
        )
        self.write(json.loads(response.body))
        self.finish()
import json
import tornado.ioloop
from tornado import web
from tornadoes import ESConnection

# Module-level connection shared by every request.
es = ESConnection('local.elasticsearch.com', '9200')


class MainHandler(tornado.web.RequestHandler):
    """Renders the sitemap index from an Elasticsearch match_all query."""

    @web.gen.coroutine
    def get(self):
        raw = yield es.search(index='sitemap_g1', body={"query": {"match_all": {}}})
        payload = json.loads(raw.body.decode('utf-8'))
        self.render('templates/index.xml', sitemaps=payload['hits']['hits'])


def make_app():
    """Build the single-route Tornado application."""
    return web.Application([
        (r"/", MainHandler),
    ])


# Created at import time so external runners can reference `app`.
app = make_app()

if __name__ == "__main__":
    app.listen(8888)
    tornado.ioloop.IOLoop.current().start()
def setUp(self):
    """Create a fresh IOLoop and point an ESConnection at the local node."""
    loop = self.get_new_ioloop()
    self.io_loop = loop
    self.es_connection = ESConnection("localhost", "9200", loop)
def setUp(self):
    """Run the base fixture, then open a connection to the local Elasticsearch."""
    super(TestESConnectionWithTornadoGen, self).setUp()
    connection = ESConnection("localhost", "9200", self.io_loop)
    self.es_connection = connection
class ElasticSearchProvider(SearchProvider):
    """Search provider backed by Elasticsearch.

    Keeps two clients: ``syncES`` (pyelasticsearch, blocking) for indexing and
    admin work, and ``asyncES`` (tornadoes, callback-based) for queries served
    from the web layer. Python 2 era code (uses ``e.message``).
    """

    def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None):
        # db may be supplied here or created later via connect_to_db().
        self.debug = False
        self.config = config
        if db is not None:
            self.db = db
        self.syncES = ElasticSearch(
            '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s' % config
        )
        self.asyncES = ESConnection(
            host=config.get('ELASTIC_SEARCH_HOST'),
            port=config.get('ELASTIC_SEARCH_PORT'),
            io_loop=io_loop,
            protocol=config.get('ELASTIC_SEARCH_PROTOCOL'),
        )
        self.index = config.get('ELASTIC_SEARCH_INDEX')
        self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES')

    def activate_debug(self):
        # Also makes connect_to_db() echo SQL if called afterwards.
        self.debug = True

    def connect_to_db(self):
        # Deferred import: SQLAlchemy is only needed for reindexing jobs.
        from sqlalchemy import create_engine
        from sqlalchemy.orm import scoped_session, sessionmaker
        conn_string = self.config.get('SQLALCHEMY_CONNECTION_STRING')
        engine = create_engine(
            conn_string,
            convert_unicode=True,
            pool_size=1,
            max_overflow=0,
            echo=self.debug
        )
        maker = sessionmaker(bind=engine, autoflush=True)
        self.db = scoped_session(maker)

    def _assemble_inner_query(self, domain=None, page_filter=None):
        # Restrict by URL prefix only when both a domain and a filter are given.
        if page_filter and domain:
            page_prefix = '%s/%s' % (domain.url, page_filter)
        else:
            page_prefix = None
        if page_prefix:
            return {
                'prefix': {
                    'page_url': page_prefix
                }
            }
        else:
            return {
                'match_all': {}
            }

    def _assemble_outer_query(self, inner_query, filter_terms):
        # "filtered"/"and" is pre-2.0 Elasticsearch query DSL.
        return {
            'filtered': {
                'query': inner_query,
                'filter': {
                    'and': [{
                        'term': filter_term
                    } for filter_term in filter_terms]
                }
            }
        }

    def _assemble_filter_terms(self, key_id=None, domain=None):
        # Term filters consumed by _assemble_outer_query.
        filter_terms = []
        if key_id:
            filter_terms.append({'keys.id': key_id})
        if domain:
            filter_terms.append({'domain_id': domain.id})
        return filter_terms

    def gen_doc(self, review):
        """Flatten a review ORM object into the ES document shape."""
        return {
            'keys': [{'id': violation.key_id} for violation in review.violations],
            'uuid': str(review.uuid),
            'completed_date': review.completed_date,
            'violation_count': review.violation_count,
            'page_id': review.page_id,
            'page_uuid': str(review.page.uuid),
            'page_url': review.page.url,
            'page_last_review_date': review.page.last_review_date,
            'domain_id': review.domain_id,
            'domain_name': review.domain.name,
        }

    def index_review(self, review):
        """Index one review synchronously, retrying transient failures.

        Sleeps 1s between attempts; re-raises after the last attempt fails.
        """
        for attempt in range(self.max_retries):
            try:
                self.syncES.send_request(
                    method='POST',
                    path_components=[self.index, 'review', review.page_id],
                    body=dumps(self.gen_doc(review)),
                    encode_body=False
                )
                break
            except (Timeout, ConnectionError, ElasticHttpError, InvalidJsonResponseError) as e:
                values = review.id, review.page_id, str(e)
                logging.error('Could not index review (review_id:{0}, page_id:{1}): {2}'.format(*values))
                time.sleep(1)
                if attempt >= self.max_retries - 1:
                    raise
        else:
            # NOTE(review): this for/else arm is reached only when
            # max_retries <= 0 (loop body never ran); a bare raise here has
            # no active exception — confirm intent upstream.
            raise

    def index_reviews(self, reviewed_pages, reviews_count, batch_size):
        """Bulk-index the last review of each page, batch_size docs per request."""
        action = {'index': {'_type': 'review'}}
        for i in range(0, reviews_count, batch_size):
            body_bits = []
            for page in reviewed_pages[i:i + batch_size]:
                doc = self.gen_doc(page.last_review)
                action['index']['_id'] = doc['page_id']
                body_bits.append(dumps(action))
                body_bits.append(dumps(doc))
            # Yes, that trailing newline IS necessary
            body = '\n'.join(body_bits) + '\n'
            self.syncES.send_request(
                method='POST',
                path_components=[self.index, '_bulk'],
                body=body,
                encode_body=False
            )
        logging.info('Done!')

    @return_future
    def get_by_violation_key_name(self, key_id, current_page=1, page_size=10, domain=None, page_filter=None, callback=None):
        """Page through reviews that violate *key_id*, newest first.

        Invokes *callback* with either a reviews/reviewsCount payload or an
        {'error': ...} dict on failure.
        """
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})
                    reviews_data = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        reviews_data.append({
                            'uuid': hit['_source']['uuid'],
                            'page': {
                                'uuid': hit['_source']['page_uuid'],
                                'url': hit['_source']['page_url'],
                                'completedAt': completedAt
                            },
                            'domain': hit['_source']['domain_name']
                        })
                    reviews_count = hits.get('total', 0)
                    callback({
                        'reviews': reviews_data,
                        'reviewsCount': reviews_count
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain, page_filter)
        filter_terms = self._assemble_filter_terms(key_id, domain)
        query = self._assemble_outer_query(inner_query, filter_terms)
        # Newest reviews first, ties broken by violation count.
        sort_ = [{
            'completed_date': {
                'order': 'desc'
            }
        }, {
            'violation_count': {
                'order': 'desc'
            }
        }]
        source = {'query': query, 'sort': sort_}
        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    @return_future
    def get_domain_active_reviews(self, domain, current_page=1, page_size=10, page_filter=None, callback=None):
        """Page through *domain*'s active reviews, most violations first."""
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})
                    pages = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        pages.append({
                            'url': hit['_source']['page_url'],
                            'uuid': hit['_source']['page_uuid'],
                            'violationCount': len(hit['_source']['keys']),
                            'completedAt': completedAt,
                            'reviewId': hit['_source']['uuid']
                        })
                    reviews_count = hits.get('total', 0)
                    callback({
                        'reviewsCount': reviews_count,
                        'pages': pages
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain=domain, page_filter=page_filter)
        filter_terms = self._assemble_filter_terms(domain=domain)
        query = self._assemble_outer_query(inner_query, filter_terms)
        sort_ = [{
            'violation_count': {
                'order': 'desc'
            }
        }, {
            'completed_date': {
                'order': 'desc'
            }
        }]
        source = {'query': query, 'sort': sort_}
        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    def refresh(self):
        # Best-effort: a failed refresh is logged, never raised.
        try:
            self.syncES.refresh(index=self.index)
        except Exception as e:
            logging.error('Could not refresh index (%s)' % e)

    def get_index_settings(cls):
        # NOTE(review): takes `cls` but is NOT a @classmethod; it is called on
        # instances (see setup_index), so `cls` is actually the instance.
        return {
            'index': {
                'number_of_shards': 4
            }
        }

    def get_index_mapping(cls):
        # NOTE(review): same `cls`-without-@classmethod quirk as
        # get_index_settings above.
        return {
            'review': {
                'properties': {
                    'keys': {
                        'properties': {
                            'id': {
                                'type': 'integer'
                            }
                        }
                    },
                    'uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'completed_date': {
                        'type': 'integer'
                    },
                    'violation_count': {
                        'type': 'float'
                    },
                    'page_id': {
                        'type': 'integer'
                    },
                    'page_uuid': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_url': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'page_last_review_date': {
                        'type': 'integer'
                    },
                    'domain_id': {
                        'type': 'integer'
                    },
                    'domain_name': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    }
                }
            }
        }

    def setup_index(self):
        """Create the index and install the review mapping."""
        try:
            settings = self.get_index_settings()
            self.syncES.create_index(index=self.index, settings=settings)
            mapping = self.get_index_mapping()
            self.syncES.put_mapping(index=self.index, doc_type='review', mapping=mapping)
            logging.info('Index %s created.' % self.index)
        except Exception as e:
            raise e

    def delete_index(self):
        try:
            self.syncES.delete_index(index=self.index)
            logging.info('Index %s deleted.' % self.index)
        except Exception as e:
            raise e

    def _get_max_page_id_from_index(self, must_have_domain_name=False):
        """Return the highest page_id already indexed (0 for an empty index)."""
        if must_have_domain_name:
            # Skip docs indexed before domain_name existed in the mapping.
            inner_query = {
                'constant_score': {
                    'filter': {
                        'not': {
                            'missing': {
                                'field': 'domain_name'
                            }
                        }
                    }
                }
            }
        else:
            inner_query = {
                'match_all': {}
            }
        query = {
            'query': inner_query,
            'sort': [{
                'page_id': {
                    'order': 'desc'
                }
            }]
        }
        results = self.syncES.search(query, index=self.index, doc_type='review')
        if results['hits']['total'] > 0:
            return results['hits']['hits'][0]['_id'] or 0
        return 0

    def index_all_reviews(self, keys=None, batch_size=200, replace=False):
        """Reindex reviews from the database, optionally restricted to *keys*.

        Without --replace only pages newer than the max indexed page_id are
        processed (incremental mode).
        """
        logging.info('Querying database...')
        self.connect_to_db()
        if keys is not None:
            keys = [k.id for k in self.db.query(Key.id).filter(Key.name.in_(keys)).all()]
        try:
            max_page_id = self._get_max_page_id_from_index(must_have_domain_name=True)
        except Exception:
            logging.error('Could not retrieve max page_id! Use with --replace (with caution)')
            return

        def apply_filters(query):
            # Shared filter chain for both the count and the page queries.
            if keys is not None:
                query = query \
                    .filter(Violation.review_id == Page.last_review_id) \
                    .filter(Violation.key_id.in_(keys))
            if not replace:
                query = query.filter(Page.id > max_page_id)
            return query.filter(Page.last_review_id != None)

        reviews_count = apply_filters(self.db.query(func.count(Page))).scalar()
        query = self.db.query(Page).options(joinedload('last_review'))
        reviewed_pages = apply_filters(query).order_by(Page.id.asc())
        logging.info('Indexing %d reviews...' % reviews_count)
        self.index_reviews(reviewed_pages, reviews_count, batch_size)

    @classmethod
    def new_instance(cls, config):
        return ElasticSearchProvider(config)

    @classmethod
    def main(cls):
        """CLI entry point: create/recreate/delete the index and/or reindex keys."""
        import sys
        parser = cls.argparser()
        args = parser.parse_args()
        config = {}
        host = None
        port = None
        index = None
        es = None
        levels = ['ERROR', 'WARNING', 'INFO', 'DEBUG']
        log_level = levels[args.verbose]
        logging.basicConfig(level=getattr(logging, log_level), format='%(levelname)s - %(message)s')
        if not (args.create or args.recreate or args.delete or args.keys or args.all_keys):
            parser.print_help()
            sys.exit(1)
        if args.conf:
            from derpconf.config import ConfigurationError
            from holmes.config import Config
            try:
                config = Config().load(args.conf[0])
                host = config['ELASTIC_SEARCH_HOST']
                port = config['ELASTIC_SEARCH_PORT']
                index = config['ELASTIC_SEARCH_INDEX']
            except ConfigurationError:
                logging.error('Could not load config! Use --conf conf_file')
                sys.exit(1)
            except KeyError:
                logging.error('Could not parse config! Check it\'s contents')
                sys.exit(1)
        if args.server:
            # --server overrides whatever the config file provided.
            try:
                host, port = args.server[0].split(':')
                config['ELASTIC_SEARCH_HOST'] = host
                config['ELASTIC_SEARCH_PORT'] = port
            except Exception:
                logging.error('Could not parse server host and port! Use --server host:port')
                sys.exit(1)
        if args.index:
            index = args.index[0]
            config['ELASTIC_SEARCH_INDEX'] = index
        from pyelasticsearch.exceptions import IndexAlreadyExistsError, ElasticHttpNotFoundError, InvalidJsonResponseError
        from requests.exceptions import ConnectionError
        try:
            if args.create or args.recreate or args.delete:
                if host is None or port is None:
                    logging.error('Need either a host and port or a config file to perform such operation!')
                    sys.exit(1)
                if index is None:
                    logging.error('Need either an index name or a config file to perform such operation!')
                    sys.exit(1)
                else:
                    es = cls.new_instance(config)
                    if args.recreate or args.delete:
                        try:
                            es.delete_index()
                        except ElasticHttpNotFoundError:
                            # Deleting a missing index is fine.
                            pass
                        except InvalidJsonResponseError as e:
                            logging.error('Invalid response! Reason: %s' % e)
                            sys.exit(1)
                    if args.create or args.recreate:
                        es.setup_index()
            if args.keys or args.all_keys:
                if config is None:
                    logging.error('Need a config file to perform such operation! Use --conf conf_file')
                else:
                    batch_size = args.batch_size[0] if args.batch_size else 200
                    # Reuse the provider created above when possible.
                    es = cls.new_instance(config) if not es else es
                    try:
                        if args.verbose > 2:
                            es.activate_debug()
                        if args.keys:
                            es.index_all_reviews(args.keys, replace=args.replace, batch_size=batch_size)
                        elif args.all_keys:
                            es.index_all_reviews(replace=args.replace, batch_size=batch_size)
                    except InvalidJsonResponseError as e:
                        logging.error('Invalid response! Reason: %s' % e)
                        sys.exit(1)
        except IndexAlreadyExistsError:
            logging.error('Index %s already exists! Use --recreate (with caution) to recreate' % index)
        except ConnectionError:
            logging.error('Could not connect to server at %s:%s' % (host, port))
        except KeyError:
            logging.error('Could not get host nor port! Use either -conf or --server')
            sys.exit(1)
def test_initilize_client_from_uri(self):
    """from_uri must store the base URL without its trailing slash."""
    connection = ESConnection.from_uri("https://dummy.server:1234/")
    self.assertEqual(connection.url, "https://dummy.server:1234")
class TestESConnection(AsyncTestCase):
    """Callback-style integration tests against a seeded local ES node.

    Document counts (28 total, 14 in "outroteste", etc.) assume the fixture
    data loaded by the test environment.
    """

    def setUp(self):
        self.io_loop = self.get_new_ioloop()
        self.es_connection = ESConnection("localhost", "9200", self.io_loop)

    def tearDown(self):
        # Only close loops we own; never close the process-wide singleton.
        if (not IOLoop.initialized() or self.io_loop is not IOLoop.instance()):
            self.io_loop.close(all_fds=True)
        super(AsyncTestCase, self).tearDown()

    def test_simple_search(self):
        # Raw path query; the document _id is itself a URL, hence the escaping.
        self.es_connection.get_by_path("/_search?q=_id:http\:\/\/localhost\/noticia\/2\/fast", self.stop)
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["hits"]["total"], 1)
        self.assertEqual(response["hits"]["hits"][0]["_id"], u'http://localhost/noticia/2/fast')

    def test_search_for_specific_type_with_query(self):
        self.es_connection.search(callback=self.stop,
                                  source={"query": {"text": {"ID": "171171"}}},
                                  type="materia", index="teste")
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["hits"]["total"], 1)
        self.assertEqual(response["hits"]["hits"][0]["_id"], u'171171')

    def test_search_all_entries(self):
        self.es_connection.search(self.stop)
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["hits"]["total"], 28)

    def test_search_specific_index(self):
        self.es_connection.search(callback=self.stop, index="outroteste")
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["hits"]["total"], 14)

    def test_search_apecific_type(self):
        self.es_connection.search(self.stop, type='galeria')
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["hits"]["total"], 2)

    def test_should_access_specific_documento(self):
        # get() already decodes the document; the callback receives _source fields.
        self.es_connection.get(index="teste", type="materia", uid="171171", callback=self.stop)
        response = self.wait()
        self.assertEqual(response['Portal'], "G1")
        self.assertEqual(response['Macrotema'], "Noticias")

    def test_should_accumulate_searches_before_search(self):
        # multi_search only queues; the bulk list holds header+query pairs.
        source = {"query": {"text": {"_id": "171171"}}}
        self.es_connection.multi_search("teste", source=source)
        source = {"query": {"text": {"body": "multisearch"}}}
        self.es_connection.multi_search("neverEndIndex", source=source)
        self.assertListEqual(['{"index": "teste"}\n{"query": {"text": {"_id": "171171"}}}',
                             '{"index": "neverEndIndex"}\n{"query": {"text": {"body": "multisearch"}}}'
                             ], self.es_connection.bulk.bulk_list)

    def test_should_generate_empty_header_with_no_index_specified(self):
        source = {"query": {"text": {"_id": "171171"}}}
        self.es_connection.multi_search(index=None, source=source)
        source = {"query": {"text": {"body": "multisearch"}}}
        self.es_connection.multi_search(index=None, source=source)
        self.assertListEqual(['{}\n{"query": {"text": {"_id": "171171"}}}',
                             '{}\n{"query": {"text": {"body": "multisearch"}}}'
                             ], self.es_connection.bulk.bulk_list)

    def test_should_make_two_searches(self):
        self._make_multisearch()
        response = self._verify_status_code_and_return_response()
        # First queued query matches; second targets a nonexistent index.
        self.assertEqual(response['responses'][0]['hits']['hits'][0]['_id'], "171171")
        self.assertFalse("hits" in response['responses'][1])

    def test_should_clean_search_list_after_search(self):
        self._make_multisearch()
        self.wait()
        self.assertListEqual([], self.es_connection.bulk.bulk_list)

    def _make_multisearch(self):
        # Queue two searches (one against a nonexistent index) and fire them.
        source = {"query": {"text": {"_id": "171171"}}}
        self.es_connection.multi_search(index="teste", source=source)
        source = {"query": {"text": {"_id": "101010"}}}
        self.es_connection.multi_search(index="neverEndIndex", source=source)
        self.es_connection.apply_search(callback=self.stop)

    def _verify_status_code_and_return_response(self):
        response = self.wait()
        return self._verify_response_and_returns_dict(response)

    def _verify_response_and_returns_dict(self, response):
        # Accept 200/201 and decode the JSON body.
        self.assertTrue(response.code in [200, 201], "Wrong response code: %d." % response.code)
        response = escape.json_decode(response.body)
        return response

    def test_can_put_and_delete_document(self):
        try:
            doc_id = str(uuid4())
            self.es_connection.put("test", "document", doc_id, {
                "test": "document",
                "other": "property"
            }, parameters={'refresh': True}, callback=self.stop)
            response = self.wait()
            response_dict = self._verify_response_and_returns_dict(response)
            self.assertEqual(response_dict['_index'], 'test')
            self.assertEqual(response_dict['_type'], 'document')
            self.assertTrue(response_dict['ok'])
            self.assertEqual(response_dict['_id'], doc_id)
            self.assertIn('refresh=True', response.request.url)
        finally:
            # Always clean up the document, even when the put assertions fail.
            self.es_connection.delete("test", "document", doc_id,
                                      parameters={'refresh': True}, callback=self.stop)
            response = self._verify_status_code_and_return_response()
            self.assertTrue(response['found'])
            self.assertTrue(response['ok'])
            self.assertEqual(response['_index'], 'test')
            self.assertEqual(response['_type'], 'document')
            self.assertEqual(response['_id'], doc_id)

    def test_count_all_entries(self):
        self.es_connection.count(callback=self.stop)
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["count"], 28)

    def test_count_specific_index(self):
        self.es_connection.count(callback=self.stop, index="outroteste")
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["count"], 14)

    def test_count_specific_type(self):
        self.es_connection.count(callback=self.stop, type='galeria')
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["count"], 2)

    def test_count_specific_query(self):
        source = {"query": {"text": {"_id": "171171"}}}
        self.es_connection.count(callback=self.stop, source=source)
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["count"], 1)

    def test_count_specific_query_with_parameters(self):
        source = {"query": {"text": {"_id": "171171"}}}
        parameters = {'refresh': True}
        self.es_connection.count(callback=self.stop, source=source, parameters=parameters)
        response = self.wait()
        response_dict = self._verify_response_and_returns_dict(response)
        self.assertEqual(response_dict["count"], 1)
        # Parameters must be appended to the _count URL as a query string.
        self.assertTrue(response.request.url.endswith('_count?refresh=True'))

    def test_count_specific_query_with_many_parameters(self):
        source = {"query": {"text": {"_id": "171171"}}}
        parameters = {'df': '_id', 'test': True}
        self.es_connection.count(callback=self.stop, source=source, parameters=parameters)
        response = self.wait()
        response_dict = self._verify_response_and_returns_dict(response)
        self.assertEqual(response_dict["count"], 1)
        self.assertTrue(response.request.url.endswith('_count?df=_id&test=True'))
def test_initilize_client_from_invalid_uri(self):
    """A malformed URI must make from_uri raise ValueError."""
    malformed = "<<invalid:1234uri/"
    with self.assertRaises(ValueError):
        ESConnection.from_uri(malformed)
def setUp(self):
    """Run the base fixture, connect to local ES, then cache the server version."""
    super(ESConnectionTestBase, self).setUp()
    connection = ESConnection("localhost", "9200", self.io_loop)
    self.es_connection = connection
    self._set_version()
from greplin import scales
from greplin.scales.meter import MeterStat
import json
from hashlib import md5
import re
import unicodedata

# Throughput meter for indexed documents.
STATS = scales.collection('/index', MeterStat('docs'))
BASE_PATH = '/Users/jisaacso/Documents/projects/bayes-impact/team-thorn/data/escort_all'
FBDUMP = os.path.join(BASE_PATH, 'escort_all.tsv')
es = ESConnection('localhost', 9200)
# Very long timeouts: bulk indexing requests can take a while.
es.httprequest_kwargs = {
    'request_timeout': 1500.00,
    'connect_timeout': 1500.00
}
# Collapses runs of spaces to a single space.
wspaceNuker = re.compile(' +')


def fold_accents(raw):
    # Python 2 code (`unicode`): NFKD-normalize and drop non-ASCII marks.
    if type(raw) == str:
        raw = unicode(raw, 'utf-8')
    return ''.join([c for c in unicodedata.normalize('NFKD', raw).encode('ascii', 'ignore')])


def isspecialchar(char):
    # Characters that need special handling downstream.
    specialchars = ['$', '.']
    return char in specialchars


# NOTE(review): definition truncated in this excerpt — body continues
# beyond the visible source.
def fb_to_es(line):
def __init__(self, index=settings.DB_NAME):
    """Open a keep-alive Elasticsearch connection bound to *index*."""
    connection = ESConnection()
    connection.httprequest_kwargs['headers'] = {'Connection': 'keep-alive'}
    self.es = connection
    self.index = index
class TestESConnectionWithTornadoGen(AsyncTestCase):
    """Coroutine-style (gen_test) integration tests against a seeded local ES node.

    Document counts assume the fixture data loaded by the test environment.
    Fix: the two parametrized count tests previously passed
    ``callback=self.stop`` while also yielding the returned future — a
    leftover from the callback-style test class. The stray callbacks are
    removed so every test here uses the coroutine interface consistently,
    matching its siblings (e.g. test_count_specific_index).
    """

    def setUp(self):
        super(TestESConnectionWithTornadoGen, self).setUp()
        self.es_connection = ESConnection("localhost", "9200", self.io_loop)

    def tearDown(self):
        # Only close loops we own; never close the process-wide singleton.
        if (not IOLoop.initialized() or self.io_loop is not IOLoop.instance()):
            self.io_loop.close(all_fds=True)
        super(AsyncTestCase, self).tearDown()

    @gen_test
    def test_simple_search(self):
        # NOTE(review): self.stop is passed positionally as the callback while
        # the future is also yielded; kept as-is since get_by_path's callback
        # handling is not visible here.
        response = yield self.es_connection.get_by_path("/_search?q=_id:http\:\/\/localhost\/noticia\/2\/fast", self.stop)
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 1)
        self.assertEqual(response["hits"]["hits"][0]["_id"], u'http://localhost/noticia/2/fast')

    @gen_test
    def test_search_for_specific_type_with_query(self):
        response = yield self.es_connection.search(
            source={"query": {"term": {"ID": "171171"}}},
            type="materia",
            index="teste"
        )
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 1)
        self.assertEqual(response["hits"]["hits"][0]["_id"], u'171171')

    @gen_test
    def test_search_specific_index(self):
        response = yield self.es_connection.search(index="outroteste")
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 14)

    @gen_test
    def test_search_apecific_type(self):
        response = yield self.es_connection.search(type='galeria')
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 2)

    @gen_test
    def test_should_access_specific_document_using_tornado_gen(self):
        response = yield self.es_connection.get(index="teste", type="materia", uid="171171")
        response = response["_source"]
        self.assertEqual(response['Portal'], "G1")
        self.assertEqual(response['Macrotema'], "Noticias")

    @gen_test
    def test_should_make_two_searches(self):
        self._make_multisearch()
        response = yield self.es_connection.apply_search()
        response = self._verify_status_code_and_return_response(response)
        # First queued query matches; second targets a nonexistent index.
        self.assertEqual(response['responses'][0]['hits']['hits'][0]['_id'], "171171")
        self.assertFalse("hits" in response['responses'][1])

    @gen_test
    def test_should_clean_search_list_after_search(self):
        self._make_multisearch()
        response = yield self.es_connection.apply_search()
        response = self._verify_status_code_and_return_response(response)
        self.assertListEqual([], self.es_connection.bulk.bulk_list)

    @gen_test
    def test_can_put_and_delete_document(self):
        try:
            doc_id = str(uuid4())
            response = yield self.es_connection.put("test", "document", doc_id, {
                "test": "document",
                "other": "property"
            }, parameters={'refresh': True})
            response_dict = self._verify_status_code_and_return_response(response)
            self.assertEqual(response_dict['_index'], 'test')
            self.assertEqual(response_dict['_type'], 'document')
            self.assertEqual(response_dict['_id'], doc_id)
            self.assertIn('refresh=True', response.request.url)
        finally:
            # Always clean up the document, even when the put assertions fail.
            response = yield self.es_connection.delete("test", "document", doc_id,
                                                       parameters={'refresh': True})
            response = self._verify_status_code_and_return_response(response)
            self.assertTrue(response['found'])
            self.assertEqual(response['_index'], 'test')
            self.assertEqual(response['_type'], 'document')
            self.assertEqual(response['_id'], doc_id)

    @gen_test
    def test_count_specific_index(self):
        response = yield self.es_connection.count(index="outroteste")
        self.assertCount(response, 14)

    @gen_test
    def test_count_specific_type(self):
        response = yield self.es_connection.count(type='galeria')
        self.assertCount(response, 2)

    @gen_test
    def test_count_specific_query(self):
        source = {"query": {"term": {"_id": "171171"}}}
        response = yield self.es_connection.count(source=source)
        self.assertCount(response, 1)

    @gen_test
    def test_count_specific_query_with_parameters(self):
        source = {"query": {"term": {"_id": "171171"}}}
        parameters = {'refresh': True}
        # Fixed: dropped the stray callback=self.stop (coroutine style only).
        response = yield self.es_connection.count(source=source, parameters=parameters)
        self.assertCount(response, 1)
        # Parameters must be appended to the _count URL as a query string.
        self.assertTrue(response.request.url.endswith('_count?refresh=True'))

    @gen_test
    def test_count_specific_query_with_many_parameters(self):
        source = {"query": {"term": {"_id": "171171"}}}
        parameters = {'df': '_id', 'test': True}
        # Fixed: dropped the stray callback=self.stop (coroutine style only).
        response = yield self.es_connection.count(source=source, parameters=parameters)
        self.assertCount(response, 1)
        self.assertTrue('df=_id' in response.request.url)
        self.assertTrue('test=True' in response.request.url)

    def assertCount(self, response, count):
        response_dict = self._verify_status_code_and_return_response(response)
        self.assertEqual(response_dict["count"], count)

    def _make_multisearch(self):
        # Queue two searches; one targets a nonexistent index on purpose.
        source = {"query": {"term": {"_id": "171171"}}}
        self.es_connection.multi_search(index="teste", source=source)
        source = {"query": {"term": {"_id": "101010"}}}
        self.es_connection.multi_search(index="neverEndIndex", source=source)

    def _verify_status_code_and_return_response(self, response):
        # Accept 200/201 and decode the JSON body.
        self.assertTrue(response.code in [200, 201], "Wrong response code: %d." % response.code)
        response = escape.json_decode(response.body)
        return response
class ES:
    """Facade over tornadoes' ESConnection for relay/bridge document lookups."""

    # Class-level defaults; __init__ overwrites es and index per instance.
    url = None
    index = None
    es = None

    def __init__(self, index=settings.DB_NAME):
        self.es = ESConnection()
        self.es.httprequest_kwargs['headers'] = {'Connection': 'keep-alive'}
        self.index = index

    @gen.coroutine
    def search(self, mapping, query, extra_params):
        """Run a filtered query_string search against *mapping*.

        Returns (via gen.Return) the list of _source dicts for every hit;
        raises Exception with the server's error payload on failure.
        """
        fields = settings.RELAY_ES_FIELDS
        return_data = list()
        if query:
            # Require every whitespace-separated token to match.
            query = " AND ".join(query.split())
        else:
            query = "*"
        time_q = extra_params['time']
        query += self.__parse_extra(extra_params)
        if mapping == "bridges":
            fields = settings.BRIDGE_ES_FIELDS
        # Date-range filter over the caller-selected timestamp field.
        body_query = {"_source": fields,
                      "query": {
                          "filtered": {
                              "query": {
                                  "query_string": {
                                      "query": query
                                  }
                              },
                              "filter": {
                                  "range": {
                                      time_q['type']: {
                                          "gte": time_q['range'][0] + " 00:00:00",
                                          "lte": time_q['range'][1] + " 23:59:59"
                                      }
                                  }
                              }
                          }
                      }}
        if settings.DEBUG:
            # Python 2 print statement: this module targets py2.
            print body_query
        result = yield self.es.search(index=self.index,
                                      type=mapping,
                                      source=body_query,
                                      size=settings.ES_RESULT_SIZE)
        result = escape.json_decode(result.body)
        if 'hits' in result and 'hits' in result['hits']:
            for hit in result['hits']['hits']:
                return_data.append(hit['_source'])
        else:
            raise Exception(result['error'])
        raise gen.Return(return_data)

    def __parse_extra(self, extra):
        """Translate optional country/flags filters into query_string clauses."""
        query = ""
        if 'country' in extra:
            # OR together all requested geo codes inside one parenthesized group.
            new_country = list()
            geo_split = extra['country'].split(',')
            for geo in geo_split:
                new_country.append("geo:" + geo)
            query = "{0} AND (".format(query) + " OR ".join(new_country) + ")"
        if 'flags' in extra:
            # Each flag becomes its own AND clause; .title() matches ES casing.
            flags_split = extra['flags'].split(',')
            for flag in flags_split:
                query = "{0} AND flags:{1}".format(query, flag.title())
        return query
class TestUserSearch(AsyncTestCase):
    """End-to-end tests for ESSearch's user-document helpers.

    Fixture data comes from the module-level ``test_case`` list; the index is
    wiped in setUp so counts are deterministic.
    """

    def setUp(self):
        super(TestUserSearch, self).setUp()
        self.es_connection = ESConnection("localhost", 9200, self.io_loop)
        self.us = ESSearch(index_mapping=user_index_mapping,
                           index_name="index",
                           type_name="user",
                           analyze_fields=user_analyze_fields,
                           none_analyze_fields=user_none_analyze_fields,
                           io_loop=self.io_loop)
        # Run the async cleanup synchronously before each test.
        self.io_loop.run_sync(self.setup_coro)

    @gen.coroutine
    def setup_coro(self):
        yield self.us.clean_all()

    @gen_test
    def test_insert(self):
        uid, info_dict = test_case[0]
        response = yield self.us.insert(uid, info_dict)
        response_dict = self._verify_status_code_and_return_response(response)
        self.assertEqual(response_dict['_index'], 'index')
        self.assertEqual(response_dict['_type'], 'user')
        self.assertEqual(response_dict['_id'], uid)
        # insert must request an immediate refresh so reads see the doc.
        self.assertIn('refresh=True', response.request.url)

    @gen_test
    def test_get(self):
        uid, info_dict = test_case[0]
        yield self.us.insert(uid, info_dict)
        res = yield self.us.get(uid)
        self.assertEqual(info_dict, res['_source'])

    @gen_test
    def test_delete(self):
        uid, info_dict = test_case[0]
        # Deleting a missing document must report 404.
        res = yield self.us.delete(uid)
        self.assertEqual(res.code, 404)
        response = yield self.us.insert(uid, info_dict)
        res = yield self.us.delete(uid)
        self.assertEqual(res.code, 200)

    @gen_test
    def test_update_field(self):
        uid, info_dict = test_case[0]
        yield self.us.insert(uid, info_dict)
        response = yield self.us.update_field(uid, 'real_name', '小美')
        response_dict = self._verify_status_code_and_return_response(response)
        self.assertEqual(response_dict['_index'], 'index')
        self.assertEqual(response_dict['_type'], 'user')
        self.assertEqual(response_dict['_id'], uid)
        # Read back through the raw connection to confirm the stored value.
        response = yield self.es_connection.get(index="index", type="user", uid=uid)
        response = response["_source"]
        self.assertEqual(response['real_name'], u"小美")

    @gen_test
    def test_update_multi_fields(self):
        uid, info_dict = test_case[0]
        yield self.us.insert(uid, info_dict)
        response = yield self.us.update_multi_fields(uid, {'real_name': '小美', 'tags_list': ['a', 'b']})
        response_dict = self._verify_status_code_and_return_response(response)
        self.assertEqual(response_dict['_index'], 'index')
        self.assertEqual(response_dict['_type'], 'user')
        self.assertEqual(response_dict['_id'], uid)
        response = yield self.es_connection.get(index="index", type="user", uid=uid)
        response = response["_source"]
        self.assertEqual(response['real_name'], u"小美")
        self.assertEqual(response['tags_list'], ['a', 'b'])

    @gen_test
    def test_push(self):
        uid, info_dict = test_case[0]
        tags_list = info_dict['tags_list']
        yield self.us.insert(uid, info_dict)
        # push appends and preserves duplicates; mirror it locally.
        yield self.us.push(uid, "tags_list", 'a')
        tags_list.append('a')
        response = yield self.es_connection.get(index="index", type="user", uid=uid)
        response = response["_source"]
        self.assertEqual(response['tags_list'], tags_list)
        yield self.us.push(uid, "tags_list", ['a', 'b'])
        tags_list.extend(['a', 'b'])
        response = yield self.es_connection.get(index="index", type="user", uid=uid)
        response = response["_source"]
        self.assertEqual(response['tags_list'], tags_list)

    @gen_test
    def test_add_to_set(self):
        uid, info_dict = test_case[0]
        tags_list = info_dict['tags_list']
        yield self.us.insert(uid, info_dict)
        # add_to_set keeps the field duplicate-free; compare order-insensitively.
        yield self.us.add_to_set(uid, "tags_list", 'a')
        tags_list.append('a')
        response = yield self.es_connection.get(index="index", type="user", uid=uid)
        response = response["_source"]
        self.assertEqual(sorted(response['tags_list']), sorted(tags_list))
        yield self.us.add_to_set(uid, "tags_list", ['a', 'b'])
        tags_list = set(tags_list).union(['a', 'b'])
        tags_list = list(tags_list)
        response = yield self.es_connection.get(index="index", type="user", uid=uid)
        response = response["_source"]
        self.assertEqual(sorted(response['tags_list']), sorted(tags_list))

    @gen_test
    def test_query(self):
        for uid, info_dict in test_case[:2]:
            yield self.us.insert(uid, info_dict)
        # Expected hit counts/ids depend on the fixture docs and analyzers.
        response = yield self.us.query(u"校园")
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 2)
        response = yield self.us.query(u"软件大赛")
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 1)
        self.assertEqual(response["hits"]["hits"][0]["_id"], '1')
        response = yield self.us.query(u'电子科技大学')
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 2)
        self.assertEqual(response["hits"]["hits"][0]["_id"], '2')

    def _verify_status_code_and_return_response(self, response):
        # Accept 200/201 and decode the JSON body.
        self.assertTrue(response.code in [200, 201], "Wrong response code: %d." % response.code)
        response = escape.json_decode(response.body)
        return response
from greplin import scales
from greplin.scales.meter import MeterStat
import json
from hashlib import md5
import re
import unicodedata

# Meter counting how many documents have been indexed, exposed at /index.
STATS = scales.collection('/index', MeterStat('docs'))

# Location of the raw TSV dump to be indexed.
BASE_PATH = '/Users/jisaacso/Documents/projects/bayes-impact/team-thorn/data/escort_all'
FBDUMP = os.path.join(BASE_PATH, 'escort_all.tsv')

es = ESConnection('localhost', 9200)
# Very generous (25 min) timeouts: bulk-indexing large batches can keep a
# single request busy far longer than Tornado's defaults allow.
es.httprequest_kwargs = {
    'request_timeout': 1500.00,
    'connect_timeout': 1500.00
}

# Collapses runs of consecutive spaces down to one.
wspaceNuker = re.compile(' +')


def fold_accents(raw):
    """Fold accented characters in *raw* to their plain-ASCII base form.

    Accepts text or UTF-8-encoded bytes and returns a text string with
    every combining mark dropped (e.g. u'caf\u00e9' -> 'cafe').
    """
    # Was `type(raw) == str` followed by the Python-2-only `unicode()`
    # builtin, which raises NameError on Python 3; isinstance(raw, bytes)
    # covers both interpreters (bytes is str on Py2).
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    # NFKD splits each accented character into its base character plus
    # combining marks; encoding to ASCII with 'ignore' then drops the
    # marks.  The original's ''.join over the encoded result was a no-op.
    return unicodedata.normalize('NFKD', raw).encode('ascii', 'ignore').decode('ascii')
class TestESConnection(AsyncTestCase):
    """Callback-style integration tests for ESConnection.

    Expects a live Elasticsearch on localhost:9200 pre-loaded with the
    "teste" and "outroteste" fixture indices; the hit totals asserted
    below (28, 14, 2, ...) come from that fixture data.
    """

    def setUp(self):
        self.io_loop = self.get_new_ioloop()
        self.es_connection = ESConnection("localhost", "9200", self.io_loop)

    def tearDown(self):
        # Close the per-test loop unless it is the shared global instance.
        if (not IOLoop.initialized() or self.io_loop is not IOLoop.instance()):
            self.io_loop.close(all_fds=True)
        # NOTE(review): super(AsyncTestCase, self) deliberately skips
        # AsyncTestCase.tearDown (which would touch the already-closed
        # loop) and jumps straight to its parent -- confirm intent.
        super(AsyncTestCase, self).tearDown()

    def test_simple_search(self):
        # Lucene query-string search; the backslashes escape ':' and '/'
        # inside the document id being matched.
        self.es_connection.get_by_path("/_search?q=_id:http\:\/\/localhost\/noticia\/2\/fast", self.stop)
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["hits"]["total"], 1)
        self.assertEqual(response["hits"]["hits"][0]["_id"], u'http://localhost/noticia/2/fast')

    def test_search_for_specific_type_with_query(self):
        # Restricts the query to index "teste" / type "materia".
        self.es_connection.search(callback=self.stop, source={"query": {"text": {"ID": "171171"}}}, type="materia", index="teste")
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["hits"]["total"], 1)
        self.assertEqual(response["hits"]["hits"][0]["_id"], u'171171')

    def test_search_all_entries(self):
        # No index/type restriction: every fixture document is a hit.
        self.es_connection.search(self.stop)
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["hits"]["total"], 28)

    def test_search_specific_index(self):
        self.es_connection.search(callback=self.stop, index="outroteste")
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["hits"]["total"], 14)

    def test_search_apecific_type(self):
        # NOTE(review): "apecific" is a typo for "specific"; kept as-is so
        # the unittest-discovered test name stays stable.
        self.es_connection.search(self.stop, type='galeria')
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response["hits"]["total"], 2)

    def test_should_access_specific_documento(self):
        self.es_connection.get(index="teste", type="materia", uid="171171", callback=self.stop)
        response = self.wait()
        # The callback receives a dict exposing the document fields
        # directly (presumably its _source -- verify against get()).
        self.assertEqual(response['Portal'], "G1")
        self.assertEqual(response['Macrotema'], "Noticias")

    def test_should_accumulate_searches_before_search(self):
        # multi_search only queues header+body pairs on bulk_list; nothing
        # is sent to ES until apply_search() is called.
        source = {"query": {"text": {"_id": "171171"}}}
        self.es_connection.multi_search("teste", source=source)
        source = {"query": {"text": {"body": "multisearch"}}}
        self.es_connection.multi_search("neverEndIndex", source=source)
        self.assertListEqual(['{"index": "teste"}\n{"query": {"text": {"_id": "171171"}}}',
                             '{"index": "neverEndIndex"}\n{"query": {"text": {"body": "multisearch"}}}'
                             ], self.es_connection.bulk.bulk_list)

    def test_should_generate_empty_header_with_no_index_specified(self):
        # With index=None the _msearch header line must be an empty object.
        source = {"query": {"text": {"_id": "171171"}}}
        self.es_connection.multi_search(index=None, source=source)
        source = {"query": {"text": {"body": "multisearch"}}}
        self.es_connection.multi_search(index=None, source=source)
        self.assertListEqual(['{}\n{"query": {"text": {"_id": "171171"}}}',
                             '{}\n{"query": {"text": {"body": "multisearch"}}}'
                             ], self.es_connection.bulk.bulk_list)

    def test_should_make_two_searches(self):
        self._make_multisearch()
        response = self._verify_status_code_and_return_response()
        self.assertEqual(response['responses'][0]['hits']['hits'][0]['_id'], "171171")
        # The second query targets a nonexistent index, so its response
        # slot carries an error instead of hits.
        self.assertFalse("hits" in response['responses'][1])

    def test_should_clean_search_list_after_search(self):
        self._make_multisearch()
        self.wait()
        # Applying the search must flush the queued bulk list.
        self.assertListEqual([], self.es_connection.bulk.bulk_list)

    def _make_multisearch(self):
        # Queue one match against an existing doc and one against a
        # missing index, then fire both in a single _msearch request.
        source = {"query": {"text": {"_id": "171171"}}}
        self.es_connection.multi_search(index="teste", source=source)
        source = {"query": {"text": {"_id": "101010"}}}
        self.es_connection.multi_search(index="neverEndIndex", source=source)
        self.es_connection.apply_search(callback=self.stop)

    def _verify_status_code_and_return_response(self):
        # Block on wait() for the callback result, insist on HTTP 200 and
        # decode the JSON body.
        response = self.wait()
        self.assertEqual(response.code, 200, "Wrong response code.")
        response = escape.json_decode(response.body)
        return response
class TestESConnectionWithTornadoGen(AsyncTestCase):
    """gen_test (yield-based) counterparts of the callback-style tests.

    Same live-Elasticsearch fixtures as TestESConnection; every call is
    awaited as a future instead of driving stop()/wait() by hand.
    """

    def setUp(self):
        self.io_loop = self.get_new_ioloop()
        self.es_connection = ESConnection("localhost", "9200", self.io_loop)

    def tearDown(self):
        # Close the per-test loop unless it is the shared global instance.
        if (not IOLoop.initialized() or self.io_loop is not IOLoop.instance()):
            self.io_loop.close(all_fds=True)
        # NOTE(review): super(AsyncTestCase, self) skips
        # AsyncTestCase.tearDown and jumps to its parent -- confirm intent.
        super(AsyncTestCase, self).tearDown()

    @gen_test
    def test_simple_search(self):
        # NOTE(review): passes self.stop as a callback AND yields the
        # result -- presumably the library supports both styles at once;
        # confirm against ESConnection.get_by_path.
        response = yield self.es_connection.get_by_path(
            "/_search?q=_id:http\:\/\/localhost\/noticia\/2\/fast", self.stop)
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 1)
        self.assertEqual(response["hits"]["hits"][0]["_id"], u'http://localhost/noticia/2/fast')

    @gen_test
    def test_search_for_specific_type_with_query(self):
        response = yield self.es_connection.search(
            source={"query": {
                "term": {
                    "ID": "171171"
                }
            }},
            type="materia",
            index="teste")
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 1)
        self.assertEqual(response["hits"]["hits"][0]["_id"], u'171171')

    @gen_test
    def test_search_specific_index(self):
        response = yield self.es_connection.search(index="outroteste")
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 14)

    @gen_test
    def test_search_apecific_type(self):
        # NOTE(review): "apecific" typo kept so the test name stays stable.
        response = yield self.es_connection.search(type='galeria')
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response["hits"]["total"], 2)

    @gen_test
    def test_should_access_specific_documento(self):
        response = yield self.es_connection.get(index="teste", type="materia", uid="171171")
        # The yielded value exposes document fields directly (presumably
        # the _source dict -- verify against ESConnection.get).
        self.assertEqual(response['Portal'], "G1")
        self.assertEqual(response['Macrotema'], "Noticias")

    @gen_test
    def test_should_make_two_searches(self):
        self._make_multisearch()
        response = yield self.es_connection.apply_search()
        response = self._verify_status_code_and_return_response(response)
        self.assertEqual(response['responses'][0]['hits']['hits'][0]['_id'], "171171")
        # Second query hits a nonexistent index: error slot, no hits.
        self.assertFalse("hits" in response['responses'][1])

    @gen_test
    def test_should_clean_search_list_after_search(self):
        self._make_multisearch()
        response = yield self.es_connection.apply_search()
        response = self._verify_status_code_and_return_response(response)
        # Applying the search must flush the queued bulk list.
        self.assertListEqual([], self.es_connection.bulk.bulk_list)

    @gen_test
    def test_can_put_and_delete_document(self):
        # try/finally guarantees the test document is removed even when
        # one of the put-side assertions fails.
        try:
            doc_id = str(uuid4())
            response = yield self.es_connection.put(
                "test", "document", doc_id, {
                    "test": "document",
                    "other": "property"
                },
                parameters={'refresh': True})
            response_dict = self._verify_status_code_and_return_response(
                response)
            self.assertEqual(response_dict['_index'], 'test')
            self.assertEqual(response_dict['_type'], 'document')
            self.assertEqual(response_dict['_id'], doc_id)
            # The refresh parameter must be forwarded on the query string.
            self.assertIn('refresh=True', response.request.url)
        finally:
            response = yield self.es_connection.delete(
                "test", "document", doc_id, parameters={'refresh': True})
            response = self._verify_status_code_and_return_response(response)
            self.assertTrue(response['found'])
            self.assertEqual(response['_index'], 'test')
            self.assertEqual(response['_type'], 'document')
            self.assertEqual(response['_id'], doc_id)

    @gen_test
    def test_count_specific_index(self):
        response = yield self.es_connection.count(index="outroteste")
        self.assertCount(response, 14)

    @gen_test
    def test_count_specific_type(self):
        response = yield self.es_connection.count(type='galeria')
        self.assertCount(response, 2)

    @gen_test
    def test_count_specific_query(self):
        source = {"query": {"term": {"_id": "171171"}}}
        response = yield self.es_connection.count(source=source)
        self.assertCount(response, 1)

    @gen_test
    def test_count_specific_query_with_parameters(self):
        # NOTE(review): callback=self.stop combined with yield -- same
        # dual-style usage as test_simple_search; confirm it is supported.
        source = {"query": {"term": {"_id": "171171"}}}
        parameters = {'refresh': True}
        response = yield self.es_connection.count(callback=self.stop,
                                                  source=source,
                                                  parameters=parameters)
        self.assertCount(response, 1)
        self.assertTrue(response.request.url.endswith('_count?refresh=True'))

    @gen_test
    def test_count_specific_query_with_many_parameters(self):
        source = {"query": {"term": {"_id": "171171"}}}
        parameters = {'df': '_id', 'test': True}
        response = yield self.es_connection.count(callback=self.stop,
                                                  source=source,
                                                  parameters=parameters)
        self.assertCount(response, 1)
        # Multiple parameters must be joined with '&' on the query string.
        self.assertTrue(
            response.request.url.endswith('_count?df=_id&test=True'))

    def assertCount(self, response, count):
        # Shared assertion: valid HTTP response whose body carries the
        # expected document count.
        response_dict = self._verify_status_code_and_return_response(response)
        self.assertEqual(response_dict["count"], count)

    def _make_multisearch(self):
        # Queue one match against an existing doc and one against a
        # missing index; callers fire both via apply_search().
        source = {"query": {"term": {"_id": "171171"}}}
        self.es_connection.multi_search(index="teste", source=source)
        source = {"query": {"term": {"_id": "101010"}}}
        self.es_connection.multi_search(index="neverEndIndex", source=source)

    def _verify_status_code_and_return_response(self, response):
        """Assert an HTTP 200/201 response and return its decoded JSON body."""
        self.assertTrue(response.code in [200, 201],
                       "Wrong response code: %d." % response.code)
        response = escape.json_decode(response.body)
        return response
class TestUserSearch(AsyncTestCase):
    """Integration tests for ESSearch user documents.

    Runs against a live Elasticsearch on localhost:9200; each test starts
    from an empty "index"/"user" mapping populated from the `test_case`
    fixtures.
    """

    def setUp(self):
        super(TestUserSearch, self).setUp()
        self.es_connection = ESConnection("localhost", 9200, self.io_loop)
        self.us = ESSearch(index_mapping=user_index_mapping,
                           index_name="index",
                           type_name="user",
                           analyze_fields=user_analyze_fields,
                           none_analyze_fields=user_none_analyze_fields,
                           io_loop=self.io_loop)
        # Wipe the index synchronously so every test starts clean.
        self.io_loop.run_sync(self.setup_coro)

    @gen.coroutine
    def setup_coro(self):
        yield self.us.clean_all()

    @gen.coroutine
    def _fetch_source(self, uid):
        # Read the document straight from ES and hand back its _source.
        doc = yield self.es_connection.get(index="index", type="user", uid=uid)
        raise gen.Return(doc["_source"])

    @gen_test
    def test_insert(self):
        uid, info_dict = test_case[0]
        result = yield self.us.insert(uid, info_dict)
        body = self._verify_status_code_and_return_response(result)
        self.assertEqual(body['_index'], 'index')
        self.assertEqual(body['_type'], 'user')
        self.assertEqual(body['_id'], uid)
        # insert() is expected to force an index refresh.
        self.assertIn('refresh=True', result.request.url)

    @gen_test
    def test_get(self):
        uid, info_dict = test_case[0]
        yield self.us.insert(uid, info_dict)
        fetched = yield self.us.get(uid)
        self.assertEqual(info_dict, fetched['_source'])

    @gen_test
    def test_delete(self):
        uid, info_dict = test_case[0]
        # Deleting a missing document must report 404 ...
        missing = yield self.us.delete(uid)
        self.assertEqual(missing.code, 404)
        # ... and deleting an existing one must succeed with 200.
        yield self.us.insert(uid, info_dict)
        deleted = yield self.us.delete(uid)
        self.assertEqual(deleted.code, 200)

    @gen_test
    def test_update_field(self):
        uid, info_dict = test_case[0]
        yield self.us.insert(uid, info_dict)
        result = yield self.us.update_field(uid, 'real_name', '小美')
        body = self._verify_status_code_and_return_response(result)
        self.assertEqual(body['_index'], 'index')
        self.assertEqual(body['_type'], 'user')
        self.assertEqual(body['_id'], uid)
        source = yield self._fetch_source(uid)
        self.assertEqual(source['real_name'], u"小美")

    @gen_test
    def test_update_multi_fields(self):
        uid, info_dict = test_case[0]
        yield self.us.insert(uid, info_dict)
        result = yield self.us.update_multi_fields(uid, {
            'real_name': '小美',
            'tags_list': ['a', 'b']
        })
        body = self._verify_status_code_and_return_response(result)
        self.assertEqual(body['_index'], 'index')
        self.assertEqual(body['_type'], 'user')
        self.assertEqual(body['_id'], uid)
        source = yield self._fetch_source(uid)
        self.assertEqual(source['real_name'], u"小美")
        self.assertEqual(source['tags_list'], ['a', 'b'])

    @gen_test
    def test_push(self):
        uid, info_dict = test_case[0]
        # Alias, not a copy: mutations below track the expected state.
        expected = info_dict['tags_list']
        yield self.us.insert(uid, info_dict)
        # A scalar push appends a single element.
        yield self.us.push(uid, "tags_list", 'a')
        expected.append('a')
        source = yield self._fetch_source(uid)
        self.assertEqual(source['tags_list'], expected)
        # A list push appends every element; duplicates are kept.
        yield self.us.push(uid, "tags_list", ['a', 'b'])
        expected.extend(['a', 'b'])
        source = yield self._fetch_source(uid)
        self.assertEqual(source['tags_list'], expected)

    @gen_test
    def test_add_to_set(self):
        uid, info_dict = test_case[0]
        expected = info_dict['tags_list']
        yield self.us.insert(uid, info_dict)
        yield self.us.add_to_set(uid, "tags_list", 'a')
        expected.append('a')
        source = yield self._fetch_source(uid)
        # Order is not guaranteed, so compare sorted copies.
        self.assertEqual(sorted(source['tags_list']), sorted(expected))
        # Adding a list behaves like a set-union with the stored values.
        yield self.us.add_to_set(uid, "tags_list", ['a', 'b'])
        expected = list(set(expected).union(['a', 'b']))
        source = yield self._fetch_source(uid)
        self.assertEqual(sorted(source['tags_list']), sorted(expected))

    @gen_test
    def test_query(self):
        # Index the first two fixture users, then probe the analyzer.
        for uid, info_dict in test_case[:2]:
            yield self.us.insert(uid, info_dict)
        result = yield self.us.query(u"校园")
        body = self._verify_status_code_and_return_response(result)
        self.assertEqual(body["hits"]["total"], 2)
        result = yield self.us.query(u"软件大赛")
        body = self._verify_status_code_and_return_response(result)
        self.assertEqual(body["hits"]["total"], 1)
        self.assertEqual(body["hits"]["hits"][0]["_id"], '1')
        result = yield self.us.query(u'电子科技大学')
        body = self._verify_status_code_and_return_response(result)
        self.assertEqual(body["hits"]["total"], 2)
        self.assertEqual(body["hits"]["hits"][0]["_id"], '2')

    def _verify_status_code_and_return_response(self, response):
        """Assert an HTTP 200/201 response and return its decoded JSON body."""
        self.assertTrue(response.code in [200, 201],
                        "Wrong response code: %d." % response.code)
        return escape.json_decode(response.body)