def search_posts_elasticsearch(token):
    es = ElasticSearch('http://localhost:9200/')
    #for result in es.search("_type:post", index=token.lower())['hits']['hits']:
    #    print result["_source"]
    print es.search("id:sdifhsdihf", index="caacedeose0cban4zbmltsbcyxgzbzfrvq7uiqksk1uxep0njzgza7jtxei59ekp1izcjbg9czbum5qm0ojjuekaa3vwnn8tnxezcplgyaa2esvpi1dzcycai6xyvfwbrzco8quwns9orejsbecktw738yglnevljlqeascfgdfc0xdrjc1s0n40uun4ypytklsjarzand9gtfazdzd")
class GetSuggestions():
    '''
    Used to search in elastic for a similar prepid as given
    '''
    def __init__(self, typeof):
        self.es = ElasticSearch(config.DATABASE_URL)
        self.overflow = 20
        self.announced = (typeof == 'announced')
        self.growing = (typeof == 'growing')
        self.historical = (typeof == 'historical')
        self.performance = (typeof == 'performance')

    def get(self, query):
        searchable = query.replace('-', '\-')
        if '-' in query:
            search = ('prepid:%s' % searchable)
            search_stats = ('pdmv_request_name:%s' % searchable)
        else:
            search = ('prepid:*%s*' % searchable)
            search_stats = ('pdmv_request_name:*%s*' % searchable)
        ext0 = []
        ext1 = []
        ext2 = []
        if (self.historical or self.growing or self.announced or self.performance):
            # campaigns are expected in all modes
            ext0 = [s['_id'] for s in self.es.search(search, index='campaigns',
                                                     size=self.overflow)['hits']['hits']]
        # extended search for historical
        if self.historical:
            ext1 = [s['_id'] for s in self.es.search(search, index='requests',
                                                     size=self.overflow)['hits']['hits']]
            ext2 = [s['_id'] for s in self.es.search(search_stats, index='stats',
                                                     size=self.overflow)['hits']['hits']]
        # extended search for growing
        if self.growing:
            ext1 = [s['_id'] for s in self.es.search(search, index="chained_campaigns",
                                                     size=self.overflow)['hits']['hits']]
            ext2 = [s['_id'] for s in self.es.search(search, index="chained_requests",
                                                     size=self.overflow)['hits']['hits']]
        # order of ext does matter because of the typeahead in bootstrap
        return json.dumps({"results": ext0 + ext1 + ext2})
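# Usage sketch for the GetSuggestions class above (hedged: assumes the
# surrounding module provides `config.DATABASE_URL` and populated 'campaigns',
# 'requests' and 'stats' indices; the prepid fragment below is hypothetical).
suggester = GetSuggestions('historical')
print suggester.get('Fall13')   # -> '{"results": [<campaign ids>, <request ids>, <stats ids>]}'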
def get_image(url, output_path=""): es_server = 'http://localhost:9200/' if environ.get('ELASTICSEARCH_SERVER'): es_server = environ['ELASTICSEARCH_SERVER'] es = ElasticSearch(es_server) if output_path: output_path = output_path+'/' if url: query = { "query": { "term": { "url": url } }, "fields": ["thumbnail"] } res = es.search(query, index='memex', doc_type='page') hits = res['hits'] if (len(hits) > 0): img = base64.b64decode(hits['hits'][0]['fields']['thumbnail'][0]) with open(output_path+urllib2.quote(url).replace("/", "%2F")+'.png','wb') as f: f.write(img) else: print "No thumbnail found"
class ENC_Collection(object):
    def __init__(self, connection, supplied_name, frame="object"):
        if supplied_name.endswith("s"):
            self.name = supplied_name.replace("_", "-")
            self.search_name = supplied_name.rstrip("s").replace("-", "_")
            self.schema_name = self.search_name + ".json"
        elif supplied_name.endswith(".json"):
            # str.rstrip strips a character set, not a suffix, so slice instead
            base = supplied_name[:-len(".json")]
            self.name = base.replace("_", "-")
            self.search_name = base.replace("-", "_")
            self.schema_name = supplied_name
        else:
            self.name = supplied_name.replace("_", "-") + "s"
            self.search_name = supplied_name.replace("-", "_")
            self.schema_name = supplied_name.replace("-", "_") + ".json"
        schema_uri = "/profiles/" + self.schema_name
        self.connection = connection
        self.server = connection.server
        self.schema = get_ENCODE(schema_uri, connection)
        self.frame = frame
        search_string = "/search/?format=json&limit=all&type=%s&frame=%s" % (self.search_name, frame)
        collection = get_ENCODE(search_string, connection)
        self.items = collection["@graph"]
        self.es_connection = None

    def query(self, query_dict, maxhits=10000):
        from pyelasticsearch import ElasticSearch
        if self.es_connection is None:
            es_server = self.server.rstrip("/") + ":9200"
            self.es_connection = ElasticSearch(es_server)
        results = self.es_connection.search(query_dict, index="encoded",
                                            doc_type=self.search_name,
                                            size=maxhits)
        return results
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']
    es = ElasticSearch(es_server)
    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and '.join(queryStr),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
def get_image(url):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    if url:
        query = {
            "query": {"term": {"url": url}},
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
        hits = res['hits']['hits']
        if len(hits) > 0:
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print "No thumbnail found"
        else:
            print "No thumbnail found"
    return [None, None]
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": ' '.join(queryStr),
                        "minimum_should_match": "100%"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
                        size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": ' '.join(queryStr),
                        "minimum_should_match": "100%"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query,
                        index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                        doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page',
                        size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']
    es = ElasticSearch(es_server)
    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and '.join(queryStr),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
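# The ELASTICSEARCH_SERVER/INDEX/DOC_TYPE lookups above recur across several
# of these snippets; a minimal sketch of the same pattern factored into one
# helper (the name `es_from_env` is this sketch's invention, not part of the
# original code):
from os import environ
from pyelasticsearch import ElasticSearch

def es_from_env():
    server = environ.get('ELASTICSEARCH_SERVER') or 'http://localhost:9200'
    index = environ.get('ELASTICSEARCH_INDEX') or 'memex'
    doc_type = environ.get('ELASTICSEARCH_DOC_TYPE') or 'page'
    return ElasticSearch(server), index, doc_type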
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'notice', notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)
            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_part': part}}
        else:
            query = {'match_all': {}}
        query = {'fields': ['effective_on', 'fr_url', 'publication_date'],
                 'query': query}
        notices = []
        results = self.es.search(query, doc_type='notice', size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices
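# Round-trip sketch for ESNotices (assumptions: Django settings define
# ELASTIC_SEARCH_URLS and ELASTIC_SEARCH_INDEX and ES is reachable; the
# document number and payload below are hypothetical).
backend = ESNotices()
backend.put('2013-12345', {'cfr_part': '1026', 'fr_url': 'http://example.com',
                           'effective_on': '2013-06-01',
                           'publication_date': '2013-05-01'})
print backend.get('2013-12345')      # the stored _source dict, or None
print backend.listing(part='1026')   # field dicts, each with document_number added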
class ENC_Collection:
    def __init__(self, connection, supplied_name, frame='object'):
        if supplied_name.endswith('s'):
            self.name = supplied_name.replace('_', '-')
            self.search_name = supplied_name.rstrip('s').replace('-', '_')
            self.schema_name = self.search_name + '.json'
        elif supplied_name.endswith('.json'):
            # str.rstrip strips a character set, not a suffix, so slice instead
            base = supplied_name[:-len('.json')]
            self.name = base.replace('_', '-')
            self.search_name = base.replace('-', '_')
            self.schema_name = supplied_name
        else:
            self.name = supplied_name.replace('_', '-') + 's'
            self.search_name = supplied_name.replace('-', '_')
            self.schema_name = supplied_name.replace('-', '_') + '.json'
        schema_uri = '/profiles/' + self.schema_name
        self.connection = connection
        self.server = connection.server
        self.schema = get_ENCODE(schema_uri, connection)
        self.frame = frame
        search_string = '/search/?format=json&limit=all&type=%s&frame=%s' % (
            self.search_name, frame)
        collection = get_ENCODE(search_string, connection)
        self.items = collection['@graph']
        self.es_connection = None

    def query(self, query_dict, maxhits=10000):
        from pyelasticsearch import ElasticSearch
        if self.es_connection is None:
            es_server = self.server.rstrip('/') + ':9200'
            self.es_connection = ElasticSearch(es_server)
        results = self.es_connection.search(query_dict, index='encoded',
                                            doc_type=self.search_name,
                                            size=maxhits)
        return results
def search(field, queryStr):
    es_server = 'http://localhost:9200/'
    es_index = 'memex'
    es_doc_type = 'page'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    if environ.get('ELASTICSEARCH_INDEX'):
        es_index = environ['ELASTICSEARCH_INDEX']
    if environ.get('ELASTICSEARCH_DOC_TYPE'):
        es_doc_type = environ['ELASTICSEARCH_DOC_TYPE']
    es = ElasticSearch(es_server)
    if len(queryStr) > 0:
        query = {
            "query": {
                "query_string": {
                    "fields": [field],
                    "query": ' and '.join(queryStr),
                }
            },
            "fields": [field]
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type)
        hits = res['hits']
        print 'Documents found: %d' % hits['total']
        return hits['hits']
def term_search(field, queryStr):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    if len(queryStr) > 0:
        query = {
            "query": {
                "match": {
                    field: {
                        "query": queryStr,
                        "operator": "and"
                    }
                }
            },
            "fields": ["url"]
        }
        print query
        res = es.search(query, index='memex', doc_type='page', size=500)
        hits = res['hits']
        urls = []
        for hit in hits['hits']:
            urls.append(hit['_id'])
        return urls
def get_context(terms):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    if len(terms) > 0:
        query = {
            "query": {
                "match": {
                    "text": {
                        "query": ' and '.join(terms),
                        "operator": "and"
                    }
                }
            },
            "highlight": {
                "fields": {
                    "text": {
                        "fragment_size": 100,
                        "number_of_fragments": 1
                    }
                }
            }
        }
        print query
        res = es.search(query, index='memex', doc_type='page')
        hits = res['hits']
        print 'Documents found: %d' % hits['total']
        highlights = []
        for hit in hits['hits']:
            highlights.append(hit['highlight']['text'])
        return highlights
def get_available_domains(es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")
    query = {
        "query": {"match_all": {}},
    }
    res = es.search(query, index='config', doc_type='domains', size=100)
    hits = res['hits']['hits']
    res = []
    for hit in hits:
        res.append(hit['_source'])
    for i in range(0, len(res)):
        res[i]['timestamp'] = long(convert_to_epoch(
            datetime.strptime(res[i]['timestamp'], '%Y-%m-%dT%H:%M:%S.%f')))
        print datetime.utcfromtimestamp(res[i]['timestamp'])
    return res
def get_context(terms, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")
    if len(terms) > 0:
        query = {
            "query": {
                "match": {
                    "text": {
                        "query": ' and '.join(terms),
                        "operator": "and"
                    }
                }
            },
            "highlight": {
                "fields": {
                    "text": {
                        "fragment_size": 100,
                        "number_of_fragments": 1
                    }
                }
            }
        }
        print query
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']
        highlights = []
        for hit in hits['hits']:
            highlights.append(hit['highlight']['text'][0])
        return highlights
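# Example call (assumptions: the default 'memex'/'page' index holds documents
# with a 'text' field; the search terms here are hypothetical).
for fragment in get_context(['forum', 'escrow']):
    print fragment   # one ~100-character highlighted snippet per matching page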
def _get(self):
    """Build and run the ES query"""
    opts = self.opts
    es = ElasticSearch(opts.url)
    query = {'sort': {'@timestamp': 'desc'}, 'size': 1}
    if opts.query:
        query['query'] = {
            'filtered': {
                'query': {
                    'query_string': {
                        'query': opts.query
                    }
                }
            }
        }
    # ElasticSearch allows us to pass an array of indices. However,
    # it will throw an exception if any of these don't exist. This
    # isn't the right behavior, because there may not actually be
    # a logstash index from X days ago. Instead, we need to iterate
    # through the daily log indexes in reverse order until we get a
    # non-error response.
    result = None
    for index in self._indexes():
        try:
            result = es.search(query, index=index)
            break
        except ElasticHttpNotFoundError:
            pass
def get_documents(urls):
    host = environ['ELASTICSEARCH_SERVER'] if environ.get('ELASTICSEARCH_SERVER') else 'http://localhost:9200'
    es = ElasticSearch(host)
    if len(urls) > 0:
        results = {}
        for url in urls:
            query = {
                "query": {"term": {"url": url}},
                "fields": ["text"]
            }
            res = es.search(query,
                            index=environ['ELASTICSEARCH_INDEX'] if environ.get('ELASTICSEARCH_INDEX') else 'memex',
                            doc_type=environ['ELASTICSEARCH_DOC_TYPE'] if environ.get('ELASTICSEARCH_DOC_TYPE') else 'page')
            hits = res['hits']
            try:
                results[url] = hits['hits'][0]['fields']['text'][0]
            except KeyError, e:
                print url, e, " not found in database"
            except IndexError, e:
                print url, e, " not found in database"
        return results
def range(field, from_val, to_val, ret_fields=[], epoch=None,
          es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")
    if not (epoch is None):
        if epoch:
            from_val = datetime.utcfromtimestamp(long(from_val)).strftime('%Y-%m-%dT%H:%M:%S')
            to_val = datetime.utcfromtimestamp(long(to_val)).strftime('%Y-%m-%dT%H:%M:%S')
    query = {
        "query": {
            "range": {
                field: {
                    "from": from_val,
                    "to": to_val
                }
            },
        },
        "fields": ret_fields
    }
    res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
    hits = res['hits']['hits']
    results = []
    for hit in hits:
        results.append(hit['fields'])
    return results
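# Example: fetch 'url' fields for documents whose 'retrieved' date falls in a
# one-day window, passed as epoch seconds (assumptions: 'retrieved' is a date
# field in the target index; the timestamps are hypothetical). Note the
# function deliberately shadows the builtin `range` within this module.
day_docs = range('retrieved', 1420070400, 1420156800,
                 ret_fields=['url'], epoch=True)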
def get_image(url, es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch("http://localhost:9200")
    if url:
        query = {
            "query": {"term": {"url": url}},
            "fields": ["thumbnail", "thumbnail_name"]
        }
        res = es.search(query, index=es_index, doc_type=es_doc_type, size=500)
        hits = res['hits']['hits']
        if len(hits) > 0:
            try:
                img = base64.b64decode(hits[0]['fields']['thumbnail'][0])
                img_name = hits[0]['fields']['thumbnail_name'][0]
                return [img_name, img]
            except KeyError:
                print "No thumbnail found"
        else:
            print "No thumbnail found"
    return [None, None]
def get_documents(terms, term_field, fields=["text"],
                  es_index='memex', es_doc_type='page', es=None):
    if es is None:
        es = ElasticSearch('http://localhost:9200/')
    if len(terms) > 0:
        results = {}
        for term in terms:
            query = {
                "query": {"term": {term_field: term}},
                "fields": fields
            }
            res = es.search(query, index=es_index, doc_type=es_doc_type)
            if res['hits']['hits']:
                hits = res['hits']['hits'][0]
                if hits.get('fields') is not None:
                    hits = hits['fields']
                record = {}
                for field in fields:
                    if hits.get(field) is not None:
                        record[field] = hits[field][0]
                results[term] = record
        return results
class ENC_Collection(object):
    def __init__(self, connection, supplied_name, frame='object'):
        if supplied_name.endswith('s'):
            self.name = supplied_name.replace('_', '-')
            self.search_name = supplied_name.rstrip('s').replace('-', '_')
            self.schema_name = self.search_name + '.json'
        elif supplied_name.endswith('.json'):
            # str.rstrip strips a character set, not a suffix, so slice instead
            base = supplied_name[:-len('.json')]
            self.name = base.replace('_', '-')
            self.search_name = base.replace('-', '_')
            self.schema_name = supplied_name
        else:
            self.name = supplied_name.replace('_', '-') + 's'
            self.search_name = supplied_name.replace('-', '_')
            self.schema_name = supplied_name.replace('-', '_') + '.json'
        schema_uri = '/profiles/' + self.schema_name
        self.connection = connection
        self.server = connection.server
        self.schema = get_ENCODE(schema_uri, connection)
        self.frame = frame
        # the original continuation-backslash inside the string literal leaked
        # whitespace into the URL; build it in one piece instead
        search_string = '/search/?format=json&limit=all&type=%s&frame=%s' % (
            self.search_name, frame)
        collection = get_ENCODE(search_string, connection)
        self.items = collection['@graph']
        self.es_connection = None

    def query(self, query_dict, maxhits=10000):
        from pyelasticsearch import ElasticSearch
        if self.es_connection is None:
            es_server = self.server.rstrip('/') + ':9200'
            self.es_connection = ElasticSearch(es_server)
        results = self.es_connection.search(query_dict, index='encoded',
                                            doc_type=self.search_name,
                                            size=maxhits)
        return results
def search(q):
    """ Implement search method with ElasticSearch """
    # Create connection
    es = ElasticSearch(ES_URL)
    # Get results from index
    results = es.search(
        {
            "query": {
                "query_string": {
                    "query": q
                }
            }
        },
        index=[ES_INDEX],
        doc_type=['watch']
    )
    return {
        'count': results['hits']['total'],
        'results': [hh.get('_source') for hh in results['hits']['hits']]
    }
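# Usage sketch (assumptions: ES_URL and ES_INDEX are module-level constants as
# the snippet implies, and the 'watch' doc type is populated; the query text
# is hypothetical).
out = search('solar panels')
print out['count'], len(out['results'])   # total hits and this page of _source dicts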
def search(request, doc_type, search_args):
    """Search elastic search for any matches in the node's text"""
    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': search_args.page * search_args.page_size,
        'size': search_args.page_size,
    }
    text_match = {'match': {'text': search_args.q, 'doc_type': doc_type}}
    if search_args.version or search_args.regulation:
        term = {}
        if search_args.version:
            term['version'] = search_args.version
        if search_args.regulation:
            term['regulation'] = search_args.regulation
        if search_args.is_root is not None:
            term['is_root'] = search_args.is_root
        if search_args.is_subpart is not None:
            term['is_subpart'] = search_args.is_subpart
        query['query'] = {'filtered': {
            'query': text_match,
            'filter': {'term': term}
        }}
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)
    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results([h['fields'] for h in results['hits']['hits']])
    })
def _query_applications(product_group, indices):
    hosts = [_url_for_host(env)]
    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices)
    applications = map((lambda result: result['term']),
                       es_results['facets']['applications']['terms'])
    return applications
def _query_applications(indices):
    hosts = [_url_for_host(env)]
    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices,
                           query_params={'ignore_unavailable': 'true'})
    applications = map((lambda result: result['key']),
                       es_results['aggregations']['applications']['buckets'])
    return applications
def query(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    query = {"query": {"bool": {}}}
    # Building the query
    dict_value = dict(request.POST)
    for key in dict_value['query']:
        value = ast.literal_eval(key)
        AndQueries = []
        OrQueries = []
        for index, key in enumerate(value['exact_query']):
            if key['condition'] == 'is equal to':
                query_values = {"term": {key['column']: key['value']}}
            if key['condition'] == 'is less than':
                query_values = {"range": {key['column']: {"lt": key['value']}}}
            if key['condition'] == 'is greater than':
                query_values = {"range": {key['column']: {"gt": key['value']}}}
            if key['condition'] == 'is less than or equal to':
                query_values = {"range": {key['column']: {"lte": key['value']}}}
            if key['condition'] == 'is greater than or equal to':
                query_values = {"range": {key['column']: {"gte": key['value']}}}
            if key['condition'] == 'is not equal to':
                query_values = {"must_not": {"term": {key['column']: key['value']}}}
            if key['operation'] == 'and':
                AndQueries.append(query_values)
            if key['operation'] == 'or':
                OrQueries.append(query_values)
            if key['operation'] == '':
                if index < (len(value['exact_query']) - 1):
                    next_value = value['exact_query'][index + 1]
                    if next_value['operation'] == 'and':
                        AndQueries.append(query_values)
                    if next_value['operation'] == 'or':
                        OrQueries.append(query_values)
                else:
                    query['query']['bool']['must'] = query_values
        if len(AndQueries) != 0:
            query['query']['bool']['must'] = AndQueries
        if len(OrQueries) != 0:
            query['query']['bool']['should'] = OrQueries
    results = es.search(query, index=dict_value['index'][0], size=10000)
    return HttpResponse(json.dumps({
        'success': "Added successfully",
        'results': results
    }), content_type="application/json")
def cli(index_name, doc_type, file_name, size):
    """
    Export data from ElasticSearch to CSV file.

    \b
    Help:     python es2csv.py --help
    \b
    Example:  python es2csv.py --index-name=index_name --doc-type=typename --file-name=/tmp/save_file.csv
    """
    es = ElasticSearch(ES_CONF['host'])
    mapping = es.get_mapping(index=index_name, doc_type=doc_type)
    fieldnames = mapping[index_name]['mappings'][doc_type]['properties'].keys()
    print "Fields Total: %d" % len(fieldnames)
    writer = csv.writer(file(file_name, 'wb'), quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(fieldnames)
    print fieldnames
    data = es.search("*", index=index_name, doc_type=doc_type, size=1)
    total = data['hits']['total']
    print "Total: %d" % total
    size = 1000
    for es_from in range(0, total + 1, size):
        data = es.search("*", index=index_name, doc_type=doc_type,
                         es_from=es_from, size=size)
        data = data['hits']['hits']
        format_data = []
        for row in data:
            for k in fieldnames:
                if k not in row['_source']:
                    row['_source'][k] = ''
            format_data.append([row['_source'][k] for k in fieldnames])
        writer.writerows(format_data)
        print "Saved count %d" % (es_from + size)
    print 'ok'
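# Design note on the paging above: pyelasticsearch spells the ES 'from'
# parameter as 'es_from' ('from' is a Python keyword), so each loop iteration
# re-runs the query at a deeper offset. That is simple but costs O(offset) on
# the ES side per page; for large exports the scan/scroll API is the usual
# alternative, though this from/size loop is fine for modest indices.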
class GetAnnounced():
    '''
    Used to return list of requests with some properties in a given campaign
    '''
    def __init__(self):
        self.es = ElasticSearch(config.DATABASE_URL)
        self.overflow = 1000000

    def get(self, campaign):
        # change all to wildcard
        if campaign == 'all':
            campaign = '*'
        # get list of requests - field has to be not analyzed by es
        res = [s['_source'] for s in
               self.es.search(('member_of_campaign:%s' % campaign),
                              index='requests',
                              size=self.overflow)['hits']['hits']]
        # loop over and parse the db data
        for r in res:
            # requests that are done should have completed events value
            if r['status'] == 'done':
                r['total_events'] = r['completed_events']
                try:
                    # requests without output_dataset should have zero events
                    if not len(r['output_dataset']):
                        r['total_events'] = 0
                except KeyError:
                    r['total_events'] = 0
            if r['status'] == 'submitted':
                try:
                    if not len(r['reqmgr_name']):
                        r['total_events'] = 0
                except KeyError:
                    r['total_events'] = 0
            # requests that are new (-1) should have zero events
            if r['total_events'] == -1:
                r['total_events'] = 0
            if r['time_event'] == -1:
                r['time_event'] = 0
            # remove unnecessary fields to speed up api
            try:
                del r['completed_events']
                del r['reqmgr_name']
                del r['history']
                del r['output_dataset']
            except KeyError:
                print r['prepid']
        return json.dumps({"results": res})
def fuzzysearch(name):
    es = ElasticSearch(settings.HAYSTACK_CONNECTIONS['default']['URL'])
    query = {
        "query": {
            "fuzzy": {
                "_all": str(name)
            }
        }
    }
    res = es.search(query, index=settings.HAYSTACK_CONNECTIONS['default']['INDEX_NAME'])
    id_list = get_template_id(res)
    return (Template.objects.filter(id=id).first() for id in id_list)
def get_posts_elasticsearch(token):
    es = ElasticSearch('http://localhost:9200/')
    r = []
    for result in es.search("_type:post", index=token.lower(), size=1000)['hits']['hits']:
        r.append(result["_source"])
    return r
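# Example (assumptions: per-user indices are named after a lowercased token,
# as the snippet implies; both the token value and the 'id' field on each post
# are hypothetical).
for post in get_posts_elasticsearch('SomeUserToken'):
    print post.get('id')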
def _query_applications(product_group, indices):
    hosts = [_url_for_host(env)]
    es = ElasticSearch(hosts, port=port)
    es_results = es.search(APPLICATIONS_QUERY, index=indices)
    applications = map((lambda result: result['term']),
                       es_results['facets']['applications']['terms'])
    return applications
def search(request, doc_type):
    """Search elastic search for any matches in the node's text"""
    term = request.GET.get('q', '')
    version = request.GET.get('version', '')
    regulation = request.GET.get('regulation', '')
    is_root = request.GET.get('is_root')
    is_subpart = request.GET.get('is_subpart')
    try:
        page = int(request.GET.get('page', '0'))
    except ValueError:
        page = 0
    if not term:
        return user_error('No query term')
    if not validate_boolean(is_root):
        return user_error('Parameter "is_root" must be "true" or "false"')
    if not validate_boolean(is_subpart):
        return user_error('Parameter "is_subpart" must be "true" or "false"')
    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': page * PAGE_SIZE,
        'size': PAGE_SIZE,
    }
    text_match = {'match': {'text': term, 'doc_type': doc_type}}
    if version or regulation:
        term = {}
        if version:
            term['version'] = version
        if regulation:
            term['regulation'] = regulation
        if is_root:
            term['is_root'] = is_root
        if is_subpart:
            term['is_subpart'] = is_subpart
        query['query'] = {
            'filtered': {
                'query': text_match,
                'filter': {'term': term}
            }
        }
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)
    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results([h['fields'] for h in results['hits']['hits']])
    })
def cli(index_name, doc_type, file_name, size):
    """
    Export data from ElasticSearch to CSV file.

    \b
    Help:     python es2csv.py --help
    \b
    Example:  python es2csv.py --index-name=index_name --doc-type=typename --file-name=/tmp/save_file.csv
    """
    es = ElasticSearch(ES_CONF['host'])
    mapping = es.get_mapping(index=index_name, doc_type=doc_type)
    fieldnames = mapping[index_name]['mappings'][doc_type]['properties'].keys()
    print "Fields Total: %d" % len(fieldnames)
    writer = csv.writer(file(file_name, 'wb'), quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(fieldnames)
    print fieldnames
    data = es.search("*", index=index_name, doc_type=doc_type, size=1)
    total = data['hits']['total']
    print "Total: %d" % total
    size = 1000
    for es_from in range(0, total + 1, size):
        data = es.search("*", index=index_name, doc_type=doc_type,
                         es_from=es_from, size=size)
        data = data['hits']['hits']
        format_data = []
        for row in data:
            for k in fieldnames:
                if k not in row['_source']:
                    row['_source'][k] = ''
            format_data.append([row['_source'][k] for k in fieldnames])
        writer.writerows(format_data)
        print "Saved count %d" % (es_from + size)
    print 'ok'
def dump_as_index_file():
    es = ElasticSearch(CONTEXT['datahub-store'])
    total = es.count("owner:public AND display:timeline",
                     index=CONTEXT['datahub-index'], doc_type='_all')
    series = es.search("owner:public AND display:timeline",
                       index=CONTEXT['datahub-index'],
                       size=total['count'], doc_type='_all')
    f = open(CONTEXT['correlation-index-path'], mode='w')
    for serie in series['hits']['hits']:
        f.write("%s;%s;%s;%s\n" % (
            serie['_id'],
            serie['_source']['name'],
            cjson.encode(serie['_source']['data']['series'][0]['data']),
            serie['_source']['category']))
    f.close()
def search(elastic_config, fqdn):
    pattern = elastic_config.index_pattern
    lookback = elastic_config.lookback
    indices = common.get_indexes(lookback, pattern)
    hosts = elastic_config.hosts
    port = elastic_config.port
    username = elastic_config.username
    password = elastic_config.password
    environment = elastic_config.environment
    es = ElasticSearch(hosts, port=port, username=username, password=password)
    #try:
    doc = es.search(common.build_query(fqdn, environment), index=indices)
    return doc, fqdn
class GetSuggestions():
    def __init__(self, typeof):
        self.es = ElasticSearch(config.DATABASE_URL)
        self.overflow = 20
        self.lifetime = (typeof == 'lifetime')
        self.on = (typeof == 'true')

    def get(self, query):
        searchable = query.replace('-', '\-')
        if self.lifetime:
            if '-' in query:
                search_string = ('prepid:%s' % searchable)
                search_stats = ('pdmv_request_name:%s' % searchable)
            else:
                search_string = ('prepid:*%s*' % searchable)
                search_stats = ('pdmv_request_name:*%s*' % searchable)
            campa = [s['_id'] for s in self.es.search(search_string, index='campaigns',
                                                      size=self.overflow)['hits']['hits']]
            reque = [s['_id'] for s in self.es.search(search_string, index='requests',
                                                      size=self.overflow)['hits']['hits']]
            stats = [s['_id'] for s in self.es.search(search_stats, index='stats',
                                                      size=self.overflow)['hits']['hits']]
            return json.dumps({'results': campa + reque + stats})
        else:
            if '-' in query:
                search_string = ('prepid:%s' % searchable)
            else:
                search_string = ('prepid:*%s*' % searchable)
            if self.on:
                return json.dumps(
                    {"results": [s['_id'] for s in self.es.search(search_string,
                                                                  index="chained_campaigns",
                                                                  size=self.overflow)['hits']['hits']] +
                                 [s['_id'] for s in self.es.search(search_string,
                                                                   index="chained_requests",
                                                                   size=self.overflow)['hits']['hits']]})
            else:
                return json.dumps(
                    {"results": [s['_id'] for s in self.es.search(search_string,
                                                                  index="campaigns",
                                                                  size=self.overflow)['hits']['hits']]})
def search(request, doc_type):
    """Search elastic search for any matches in the node's text"""
    term = request.GET.get('q', '')
    version = request.GET.get('version', '')
    regulation = request.GET.get('regulation', '')
    is_root = request.GET.get('is_root')
    is_subpart = request.GET.get('is_subpart')
    try:
        page = int(request.GET.get('page', '0'))
    except ValueError:
        page = 0
    if not term:
        return user_error('No query term')
    if not validate_boolean(is_root):
        return user_error('Parameter "is_root" must be "true" or "false"')
    if not validate_boolean(is_subpart):
        return user_error('Parameter "is_subpart" must be "true" or "false"')
    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': page * PAGE_SIZE,
        'size': PAGE_SIZE,
    }
    text_match = {'match': {'text': term, 'doc_type': doc_type}}
    if version or regulation:
        term = {}
        if version:
            term['version'] = version
        if regulation:
            term['regulation'] = regulation
        if is_root:
            term['is_root'] = is_root
        if is_subpart:
            term['is_subpart'] = is_subpart
        query['query'] = {'filtered': {
            'query': text_match,
            'filter': {'term': term}
        }}
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)
    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results([h['fields'] for h in results['hits']['hits']])
    })
class GetCampaign():
    def __init__(self):
        self.es = ElasticSearch(config.DATABASE_URL)
        self.overflow = 1000000

    def get(self, campaign):
        if campaign == 'all':
            campaign = '*'
        return json.dumps(
            {"results": [s['_source'] for s in
                         self.es.search(('member_of_campaign:%s' % campaign),
                                        index='requests',
                                        size=self.overflow)['hits']['hits']]})
class ESRegulations(object):
    """Implementation of Elastic Search as regulations backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def get(self, label, version):
        """Find the regulation label + version"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                                 version + '/' + label)
            reg_node = result['_source']
            del reg_node['regulation']
            del reg_node['version']
            del reg_node['label_string']
            del reg_node['id']
            return reg_node
        except ElasticHttpNotFoundError:
            return None

    def _transform(self, reg, version):
        """Add some meta data fields which are ES specific"""
        node = dict(reg)    # copy
        node['version'] = version
        node['label_string'] = '-'.join(node['label'])
        node['regulation'] = node['label'][0]
        node['id'] = version + '/' + node['label_string']
        node['root'] = len(node['label']) == 1
        return node

    def bulk_put(self, regs, version, root_label):
        """Store all reg objects"""
        self.es.bulk_index(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                           map(lambda r: self._transform(r, version), regs))

    def listing(self, label=None):
        """List regulation version-label pairs that match this label (or are
        root, if label is None)"""
        if label is None:
            query = {'match': {'root': True}}
        else:
            query = {'match': {'label_string': label}}
        query = {'fields': ['label_string', 'version'], 'query': query}
        result = self.es.search(query, index=settings.ELASTIC_SEARCH_INDEX,
                                doc_type='reg_tree', size=100)
        return sorted((res['fields']['version'], res['fields']['label_string'])
                      for res in result['hits']['hits'])
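# Usage sketch for ESRegulations (assumptions: Django settings configured as
# above and the index populated; the label and version values below are
# hypothetical).
backend = ESRegulations()
node = backend.get('1026-2', '2013-12345')   # None when the id is absent
print backend.listing('1026')                # sorted (version, label_string) pairs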
def search(request):
    """Search elastic search for any matches in the node's text"""
    term = request.GET.get('q', '')
    version = request.GET.get('version', '')
    regulation = request.GET.get('regulation', '')
    try:
        page = int(request.GET.get('page', '0'))
    except ValueError:
        page = 0
    if not term:
        return user_error('No query term')
    query = {
        'fields': ['text', 'label', 'version', 'regulation', 'title',
                   'label_string'],
        'from': page * PAGE_SIZE,
        'size': PAGE_SIZE,
    }
    text_match = {'match': {'text': term}}
    if version or regulation:
        term = {}
        if version:
            term['version'] = version
        if regulation:
            term['regulation'] = regulation
        query['query'] = {
            'filtered': {
                'query': text_match,
                'filter': {'term': term}
            }
        }
    else:
        query['query'] = text_match
    es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)
    results = es.search(query, index=settings.ELASTIC_SEARCH_INDEX)
    return success({
        'total_hits': results['hits']['total'],
        'results': transform_results([h['fields'] for h in results['hits']['hits']])
    })
def search(query):
    pattern = elastic_config.index_pattern
    lookback = elastic_config.lookback
    indices = common.get_indexes(lookback, pattern)
    hosts = elastic_config.hosts
    port = elastic_config.port
    username = elastic_config.username
    password = elastic_config.password
    environment = elastic_config.environment
    es = ElasticSearch(hosts, port=port, username=username, password=password)
    try:
        logging.info("Querying Elasticsearch using {0}".format(query))
        doc = es.search(query, index=indices)
        return doc
    except Exception:
        logging.error("Unexpected error searching for {0}. Passing".format(query))
class SearchModel(object):
    def __init__(self):
        connection_url = settings.HAYSTACK_CONNECTIONS['default']['URL']
        self.index = settings.HAYSTACK_CONNECTIONS['default']['INDEX_NAME']
        self.elastic = ElasticSearch(connection_url)

    def find(self, field=None, term=None):
        search = self.elastic.search('{0}:{1}'.format(field, term),
                                     index=self.index)
        results = None
        hits = search.get('hits', None)
        if hits is not None:
            results = hits.get('hits', None)
        return results
def get_documents(urls):
    es_server = 'http://localhost:9200/'
    if environ.get('ELASTICSEARCH_SERVER'):
        es_server = environ['ELASTICSEARCH_SERVER']
    es = ElasticSearch(es_server)
    if len(urls) > 0:
        results = {}
        for url in urls:
            query = {"query": {"term": {"url": url}},
                     "fields": ["text"]}
            res = es.search(query, index='memex', doc_type='page')
            hits = res['hits']
            try:
                results[url] = hits['hits'][0]['fields']['text'][0]
            except KeyError, e:
                print url, e, " not found in database"
            except IndexError, e:
                print url, e, " not found in database"
        return results
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'notice', notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)
            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_parts': part}}
        else:
            query = {'match_all': {}}
        query = {
            'fields': ['effective_on', 'fr_url', 'publication_date'],
            'query': query
        }
        notices = []
        results = self.es.search(query, doc_type='notice', size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices
def ElasticSearchJSON(server, query, object_type, hitnum):
    '''
    Run an elasticsearch query and return JSON objects
    server: should be currently set to 'http://submit.encodedcc.org:9200'
    query: a dict formatted as specified by elasticsearch.
        the default match_all query is {'query': {'match_all': {}}}
    object_type: the name of the object type.  for example 'biosample'
        this can also be a list of object types
    hitnum: the maximum number of returned json objects
        set this as high as you can take it (10000 will do for now)
    '''
    # make instance of elastic search
    connection = ElasticSearch(server)
    # run query on server for index
    results = connection.search(query, index=object_type, size=hitnum)
    # result objects are embedded in a dict of search result metrics
    result_objects = results['hits']['hits']
    # extract the json objects from the results
    json_objects = []
    for result_object in result_objects:
        json_objects.append(result_object[u'_source'])
    return json_objects
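# Example (hedged): fetch up to 100 objects of one type from the server named
# in the docstring; the organism filter is hypothetical and only illustrates
# the query shape.
biosample_query = {'query': {'match': {'organism.name': 'human'}}}
biosamples = ElasticSearchJSON('http://submit.encodedcc.org:9200',
                               biosample_query, 'biosample', 100)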
def list(config):
    """List the indices the catalog knows about, with metadata.

    Dangling index pointers are conspicuously pointed out.

    """
    secho('Current format: %s' % FORMAT, fg='green')
    echo('Catalog: %s\n' % config.es_catalog_index)
    es = ElasticSearch(config.es_hosts)
    query = {
        'query': {'match_all': {}},
        'sort': ['name', 'format']
    }
    catalog_docs = sources(es.search(query,
                                     index=config.es_catalog_index,
                                     doc_type=TREE,
                                     size=10000)['hits']['hits'])
    aliases = alias_to_index_map(es, [d['es_alias'] for d in catalog_docs])
    lines = []
    colors = []
    for d in catalog_docs:
        index_missing = d['es_alias'] not in aliases
        colors.append('red' if index_missing else
                      ('green' if d['format'] == FORMAT else None))
        lines.append([d['name'],
                      d['format'],
                      d['es_alias'],
                      'MISSING!' if index_missing else aliases[d['es_alias']],
                      d['generated_date']])
    table = tabulate(lines,
                     headers=['Name', 'Format', 'Alias', 'Index', 'Generated'],
                     tablefmt='simple').splitlines()
    echo(table[0])
    echo(table[1])
    for line, color in izip(table[2:], colors):
        secho(line, fg=color)
auxStartDate = dateBeginDate + datetime.timedelta(days=i)
auxEndDate = dateBeginDate + datetime.timedelta(days=i + 1)
#print str(auxStartDate)
#print str(auxEndDate)
query = {
    'query': {
        "range": {
            "art_date": {
                "gte": str(auxStartDate),
                "lte": str(auxEndDate)
            }
        }
    }
}
#print query
result = es.search(query, size=10000, index=index)
for r in result['hits']['hits']:
    #print r['_source']['pub_content']
    query2 = {
        'query': {
            "bool": {
                "must": [{
                    "match_phrase": {
                        "art_date": r['_source']['art_date']
                    }
                }, {
                    "match": {
                        "art_name_press_source": r['_source']['art_name_press_source']
                    }
                }],
class ElasticSearchProvider(SearchProvider):
    def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None):
        self.debug = False
        self.config = config
        if db is not None:
            self.db = db
        self.syncES = ElasticSearch(
            '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s' % config
        )
        self.asyncES = ESConnection(
            host=config.get('ELASTIC_SEARCH_HOST'),
            port=config.get('ELASTIC_SEARCH_PORT'),
            io_loop=io_loop,
            protocol=config.get('ELASTIC_SEARCH_PROTOCOL'),
        )
        self.index = config.get('ELASTIC_SEARCH_INDEX')
        self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES')

    def activate_debug(self):
        self.debug = True

    def connect_to_db(self):
        from sqlalchemy import create_engine
        from sqlalchemy.orm import scoped_session, sessionmaker
        conn_string = self.config.get('SQLALCHEMY_CONNECTION_STRING')
        engine = create_engine(
            conn_string,
            convert_unicode=True,
            pool_size=1,
            max_overflow=0,
            echo=self.debug
        )
        maker = sessionmaker(bind=engine, autoflush=True)
        self.db = scoped_session(maker)

    def _assemble_inner_query(self, domain=None, page_filter=None):
        if page_filter and domain:
            page_prefix = '%s/%s' % (domain.url, page_filter)
        else:
            page_prefix = None
        if page_prefix:
            return {'prefix': {'page_url': page_prefix}}
        else:
            return {'match_all': {}}

    def _assemble_outer_query(self, inner_query, filter_terms):
        return {
            'filtered': {
                'query': inner_query,
                'filter': {
                    'and': [{'term': filter_term} for filter_term in filter_terms]
                }
            }
        }

    def _assemble_filter_terms(self, key_id=None, domain=None):
        filter_terms = []
        if key_id:
            filter_terms.append({'keys.id': key_id})
        if domain:
            filter_terms.append({'domain_id': domain.id})
        return filter_terms

    def gen_doc(self, review):
        return {
            'keys': [{'id': violation.key_id} for violation in review.violations],
            'uuid': str(review.uuid),
            'completed_date': review.completed_date,
            'violation_count': review.violation_count,
            'page_id': review.page_id,
            'page_uuid': str(review.page.uuid),
            'page_url': review.page.url,
            'page_last_review_date': review.page.last_review_date,
            'domain_id': review.domain_id,
            'domain_name': review.domain.name,
        }

    def index_review(self, review):
        for attempt in range(self.max_retries):
            try:
                self.syncES.send_request(
                    method='POST',
                    path_components=[self.index, 'review', review.page_id],
                    body=dumps(self.gen_doc(review)),
                    encode_body=False
                )
                break
            except (Timeout, ConnectionError, ElasticHttpError, InvalidJsonResponseError) as e:
                values = review.id, review.page_id, str(e)
                logging.error('Could not index review (review_id:{0}, page_id:{1}): {2}'.format(*values))
                time.sleep(1)
                if attempt >= self.max_retries - 1:
                    raise
        else:
            raise

    def index_reviews(self, reviewed_pages, reviews_count, batch_size):
        action = {'index': {'_type': 'review'}}
        for i in range(0, reviews_count, batch_size):
            body_bits = []
            for page in reviewed_pages[i:i + batch_size]:
                doc = self.gen_doc(page.last_review)
                action['index']['_id'] = doc['page_id']
                body_bits.append(dumps(action))
                body_bits.append(dumps(doc))
            # Yes, that trailing newline IS necessary
            body = '\n'.join(body_bits) + '\n'
            self.syncES.send_request(
                method='POST',
                path_components=[self.index, '_bulk'],
                body=body,
                encode_body=False
            )
        logging.info('Done!')

    @return_future
    def get_by_violation_key_name(self, key_id, current_page=1, page_size=10,
                                  domain=None, page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})
                    reviews_data = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        reviews_data.append({
                            'uuid': hit['_source']['uuid'],
                            'page': {
                                'uuid': hit['_source']['page_uuid'],
                                'url': hit['_source']['page_url'],
                                'completedAt': completedAt
                            },
                            'domain': hit['_source']['domain_name']
                        })
                    reviews_count = hits.get('total', 0)
                    callback({
                        'reviews': reviews_data,
                        'reviewsCount': reviews_count
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain, page_filter)
        filter_terms = self._assemble_filter_terms(key_id, domain)
        query = self._assemble_outer_query(inner_query, filter_terms)
        sort_ = [{
            'completed_date': {'order': 'desc'}
        }, {
            'violation_count': {'order': 'desc'}
        }]
        source = {'query': query, 'sort': sort_}
        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    @return_future
    def get_domain_active_reviews(self, domain, current_page=1, page_size=10,
                                  page_filter=None, callback=None):
        def treat_response(response):
            if response.error is None:
                try:
                    hits = loads(response.body).get('hits', {'hits': []})
                    pages = []
                    for hit in hits['hits']:
                        completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date'])
                        pages.append({
                            'url': hit['_source']['page_url'],
                            'uuid': hit['_source']['page_uuid'],
                            'violationCount': len(hit['_source']['keys']),
                            'completedAt': completedAt,
                            'reviewId': hit['_source']['uuid']
                        })
                    reviews_count = hits.get('total', 0)
                    callback({
                        'reviewsCount': reviews_count,
                        'pages': pages
                    })
                except Exception as e:
                    reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message)
                    logging.error(reason)
                    callback({'error': {'status_code': 500, 'reason': reason}})
            else:
                reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body)
                logging.error(reason)
                callback({'error': {'status_code': 500, 'reason': reason}})

        inner_query = self._assemble_inner_query(domain=domain, page_filter=page_filter)
        filter_terms = self._assemble_filter_terms(domain=domain)
        query = self._assemble_outer_query(inner_query, filter_terms)
        sort_ = [{
            'violation_count': {'order': 'desc'}
        }, {
            'completed_date': {'order': 'desc'}
        }]
        source = {'query': query, 'sort': sort_}
        self.asyncES.search(
            callback=treat_response,
            index=self.index,
            type='review',
            source=source,
            page=current_page,
            size=page_size,
        )

    def refresh(self):
        try:
            self.syncES.refresh(index=self.index)
        except Exception as e:
            logging.error('Could not refresh index (%s)' % e)

    def get_index_settings(cls):
        return {
            'index': {
                'number_of_shards': 4
            }
        }

    def get_index_mapping(cls):
        return {
            'review': {
                'properties': {
                    'keys': {
                        'properties': {
                            'id': {'type': 'integer'}
                        }
                    },
                    'uuid': {'type': 'string', 'index': 'not_analyzed'},
                    'completed_date': {'type': 'integer'},
                    'violation_count': {'type': 'float'},
                    'page_id': {'type': 'integer'},
                    'page_uuid': {'type': 'string', 'index': 'not_analyzed'},
                    'page_url': {'type': 'string', 'index': 'not_analyzed'},
                    'page_last_review_date': {'type': 'integer'},
                    'domain_id': {'type': 'integer'},
                    'domain_name': {'type': 'string', 'index': 'not_analyzed'}
                }
            }
        }

    def setup_index(self):
        try:
            settings = self.get_index_settings()
            self.syncES.create_index(index=self.index, settings=settings)
            mapping = self.get_index_mapping()
            self.syncES.put_mapping(index=self.index, doc_type='review', mapping=mapping)
            logging.info('Index %s created.' % self.index)
        except Exception as e:
            raise e

    def delete_index(self):
        try:
            self.syncES.delete_index(index=self.index)
            logging.info('Index %s deleted.' % self.index)
        except Exception as e:
            raise e

    def _get_max_page_id_from_index(self, must_have_domain_name=False):
        if must_have_domain_name:
            inner_query = {
                'constant_score': {
                    'filter': {
                        'not': {
                            'missing': {
                                'field': 'domain_name'
                            }
                        }
                    }
                }
            }
        else:
            inner_query = {'match_all': {}}
        query = {
            'query': inner_query,
            'sort': [{
                'page_id': {'order': 'desc'}
            }]
        }
        results = self.syncES.search(query, index=self.index, doc_type='review')
        if results['hits']['total'] > 0:
            return results['hits']['hits'][0]['_id'] or 0
        return 0

    def index_all_reviews(self, keys=None, batch_size=200, replace=False):
        logging.info('Querying database...')
        self.connect_to_db()
        if keys is not None:
            keys = [k.id for k in self.db.query(Key.id).filter(Key.name.in_(keys)).all()]
        try:
            max_page_id = self._get_max_page_id_from_index(must_have_domain_name=True)
        except Exception:
            logging.error('Could not retrieve max page_id! Use with --replace (with caution)')
            return

        def apply_filters(query):
            if keys is not None:
                query = query \
                    .filter(Violation.review_id == Page.last_review_id) \
                    .filter(Violation.key_id.in_(keys))
            if not replace:
                query = query.filter(Page.id > max_page_id)
            return query.filter(Page.last_review_id != None)

        reviews_count = apply_filters(self.db.query(func.count(Page))).scalar()
        query = self.db.query(Page).options(joinedload('last_review'))
        reviewed_pages = apply_filters(query).order_by(Page.id.asc())
        logging.info('Indexing %d reviews...' % reviews_count)
        self.index_reviews(reviewed_pages, reviews_count, batch_size)

    @classmethod
    def new_instance(cls, config):
        return ElasticSearchProvider(config)

    @classmethod
    def main(cls):
        import sys
        parser = cls.argparser()
        args = parser.parse_args()
        config = {}
        host = None
        port = None
        index = None
        es = None
        levels = ['ERROR', 'WARNING', 'INFO', 'DEBUG']
        log_level = levels[args.verbose]
        logging.basicConfig(level=getattr(logging, log_level),
                            format='%(levelname)s - %(message)s')
        if not (args.create or args.recreate or args.delete or args.keys or args.all_keys):
            parser.print_help()
            sys.exit(1)
        if args.conf:
            from derpconf.config import ConfigurationError
            from holmes.config import Config
            try:
                config = Config().load(args.conf[0])
                host = config['ELASTIC_SEARCH_HOST']
                port = config['ELASTIC_SEARCH_PORT']
                index = config['ELASTIC_SEARCH_INDEX']
            except ConfigurationError:
                logging.error('Could not load config! Use --conf conf_file')
                sys.exit(1)
            except KeyError:
                logging.error('Could not parse config! Check its contents')
                sys.exit(1)
        if args.server:
            try:
                host, port = args.server[0].split(':')
                config['ELASTIC_SEARCH_HOST'] = host
                config['ELASTIC_SEARCH_PORT'] = port
            except Exception:
                logging.error('Could not parse server host and port! Use --server host:port')
                sys.exit(1)
        if args.index:
            index = args.index[0]
            config['ELASTIC_SEARCH_INDEX'] = index
        from pyelasticsearch.exceptions import IndexAlreadyExistsError, ElasticHttpNotFoundError, InvalidJsonResponseError
        from requests.exceptions import ConnectionError
        try:
            if args.create or args.recreate or args.delete:
                if host is None or port is None:
                    logging.error('Need either a host and port or a config file to perform such operation!')
                    sys.exit(1)
                if index is None:
                    logging.error('Need either an index name or a config file to perform such operation!')
                    sys.exit(1)
                else:
                    es = cls.new_instance(config)
                    if args.recreate or args.delete:
                        try:
                            es.delete_index()
                        except ElasticHttpNotFoundError:
                            pass
                        except InvalidJsonResponseError as e:
                            logging.error('Invalid response! Reason: %s' % e)
                            sys.exit(1)
                    if args.create or args.recreate:
                        es.setup_index()
            if args.keys or args.all_keys:
                if config is None:
                    logging.error('Need a config file to perform such operation! Use --conf conf_file')
                else:
                    batch_size = args.batch_size[0] if args.batch_size else 200
                    es = cls.new_instance(config) if not es else es
                    try:
                        if args.verbose > 2:
                            es.activate_debug()
                        if args.keys:
                            es.index_all_reviews(args.keys, replace=args.replace,
                                                 batch_size=batch_size)
                        elif args.all_keys:
                            es.index_all_reviews(replace=args.replace,
                                                 batch_size=batch_size)
                    except InvalidJsonResponseError as e:
                        logging.error('Invalid response! Reason: %s' % e)
                        sys.exit(1)
        except IndexAlreadyExistsError:
            logging.error('Index %s already exists! Use --recreate (with caution) to recreate' % index)
        except ConnectionError:
            logging.error('Could not connect to server at %s:%s' % (host, port))
        except KeyError:
            logging.error('Could not get host nor port! Use either --conf or --server')
            sys.exit(1)
class ElasticSearch(object):
    conn = None
    url = settings.ELASTICSEARCH_URL
    index_name = settings.ELASTICSEARCH_INDEX_NAME
    stdout = None
    stderr = None

    def __init__(self, index_name=None, stdout=None, stderr=None):
        self.conn = PyElasticSearch()
        if index_name:
            self.index_name = index_name
        if stdout:
            self.stdout = stdout
        if stderr:
            self.stderr = stderr

    def create_index(self, delete=True):
        if delete:
            try:
                self.conn.delete_index(self.index_name)
            except ElasticHttpNotFoundError:
                pass
        mappings = dict((k, v) for k, v in get_elasticsearch_properties().items())
        self.conn.create_index(self.index_name, settings={'mappings': mappings})

    def index_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.index_activity(activity)

    def delete_activity_by_id(self, activity_id):
        activity = HistoricalActivity.objects.get(pk=activity_id)
        return self.delete_activity(activity)

    def index_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            docs = self.get_activity_documents(activity, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc,
                                                       id=doc.pop('id'),
                                                       parent=doc.pop('_parent', None))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_investor(self, investor):
        for doc_type in DOC_TYPES_INVESTOR:
            docs = self.get_investor_documents(investor, doc_type=doc_type)
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    def index_activity_documents(self, activity_identifiers=[]):
        activity_identifiers = activity_identifiers or HistoricalActivity.objects.filter(
            fk_status__in=(
                HistoricalActivity.STATUS_ACTIVE,
                HistoricalActivity.STATUS_PENDING,
                HistoricalActivity.STATUS_OVERWRITTEN,
                HistoricalActivity.STATUS_DELETED)).distinct().values_list(
            'activity_identifier', flat=True).distinct()
        for doc_type in DOC_TYPES_ACTIVITY:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i deals...' % (doc_type, len(activity_identifiers)))
            for activity_identifier in activity_identifiers:
                for activity in self.get_activity_versions(activity_identifier):
                    docs.extend(self.get_activity_documents(activity, doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' % (len(docs), doc_type))
            if len(docs) > 0:
                paginator = Paginator(docs, 1000)
                for page in paginator.page_range:
                    try:
                        self.conn.bulk((self.conn.index_op(doc,
                                                           id=doc.pop('id'),
                                                           parent=doc.pop('_parent', None))
                                        for doc in paginator.page(page)),
                                       index=self.index_name,
                                       doc_type=doc_type)
                    except BulkError as e:
                        for error in e.errors:
                            msg = '%s: %s on ID %s' % (
                                error['index']['error']['type'],
                                error['index']['error']['reason'],
                                error['index']['_id'])
                            if 'caused_by' in error['index']['error']:
                                msg += ' (%s: %s)' % (
                                    error['index']['error']['caused_by']['type'],
                                    error['index']['error']['caused_by']['reason'])
                            self.stderr and self.stderr.write(msg)
            self.conn.refresh()

    def index_investor_documents(self):
        investors = Investor.objects.public().order_by(
            'investor_identifier', '-id').distinct('investor_identifier')
        for doc_type in DOC_TYPES_INVESTOR:
            docs = []
            # Collect documents
            self.stdout and self.stdout.write(
                'Collect %ss for %i investors...' % (doc_type, investors.count()))
            for investor in investors:
                docs.extend(self.get_investor_documents(investor, doc_type=doc_type))
            # Bulk index documents
            self.stdout and self.stdout.write('Index %i %ss...' % (len(docs), doc_type))
            if len(docs) > 0:
                try:
                    self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id'))
                                    for doc in docs),
                                   index=self.index_name,
                                   doc_type=doc_type)
                except BulkError as e:
                    for error in e.errors:
                        msg = '%s: %s on ID %s' % (
                            error['index']['error']['type'],
                            error['index']['error']['reason'],
                            error['index']['_id'])
                        if 'caused_by' in error['index']['error']:
                            msg += ' (%s: %s)' % (
                                error['index']['error']['caused_by']['type'],
                                error['index']['error']['caused_by']['reason'])
                        self.stderr and self.stderr.write(msg)

    #def index_activity_by_version(self, activity_identifier):
    #    for doc_type in get_elasticsearch_properties().keys():
    #        docs = self.get_documents_for_activity_version(activity_identifier, doc_type=doc_type)
    #        if len(docs) > 0:
    #            try:
    #                self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs),
    #                               index=self.index_name,
    #                               doc_type=doc_type)
    #            except BulkError as e:
    #                for error in e.errors:
    #                    stderr and stderr.write('%s: %s (caused by %s: %s, ID: %s)' % (
    #                        error['index']['error']['type'],
    #                        error['index']['error']['reason'],
    #                        error['index']['error']['caused_by']['type'],
    #                        error['index']['error']['caused_by']['reason'],
    #                        error['index']['_id']
    #                    ))

    def get_activity_versions(self, activity_identifier):
        versions = []
        # get the newest non-pending, readable historic version:
        try:
            newest = HistoricalActivity.objects.filter(
                activity_identifier=activity_identifier,
                fk_status__in=(
                    HistoricalActivity.STATUS_ACTIVE,
                    HistoricalActivity.STATUS_OVERWRITTEN,
                    HistoricalActivity.STATUS_DELETED)).distinct().latest()
            if newest and not newest.fk_status_id == HistoricalActivity.STATUS_DELETED:
                versions.append(newest)
        except HistoricalActivity.DoesNotExist:
            newest = None
        # get newer pendings
        pendings = HistoricalActivity.objects.filter(
            activity_identifier=activity_identifier,
            fk_status_id=HistoricalActivity.STATUS_PENDING).distinct()
        if newest:
            pendings.filter(history_date__gt=newest.history_date)
        versions.extend(pendings)
        return versions

    def get_activity_documents(self, activity, doc_type='deal'):
        docs = []
        deal_attrs = {
            'id': activity.id,
            'activity_identifier': activity.activity_identifier,
            'historical_activity_id': activity.id,
            'status': activity.fk_status_id,
        }
        # Todo: Is there a nice way to prevent this extra Activity query?
        # e.g. if we save is_public/deal_scope as ActivityAttributes
        public_activity = Activity.objects.filter(
            activity_identifier=activity.activity_identifier).order_by('-id').first()
        if public_activity:
            deal_attrs.update({
                'is_public': public_activity.is_public,
                'deal_scope': public_activity.deal_scope,
                'deal_size': public_activity.deal_size,
                'current_negotiation_status': public_activity.negotiation_status,
                'top_investors': public_activity.top_investors,
                'fully_updated_date': public_activity.fully_updated_date,
            })
        else:
            # Fixme: This should not happen
            self.stderr and self.stderr.write(
                _('Missing activity for historical activity %i (Activity identifier: #%i)' % (
                    activity.id, activity.activity_identifier)))
        #except Activity.MultipleObjectsReturned:
        #    # Fixme: This should not happen
        #    self.stderr and self.stderr.write(_('Too much activities for historical activity %i (Activity identifier: #%i)' % (
        #        activity.id,
        #        activity.activity_identifier
        #    )))
        for a in activity.attributes.select_related('fk_group__name').order_by('fk_group__name'):
            # do not include the django object id
            if a.name == 'id':
                continue
            attribute = None
            attribute_key = '%s_attr' % a.name
            if attribute_key in get_elasticsearch_properties()['deal']['properties'].keys():
                attribute = {
                    'value': a.value,
                    'value2': a.value2,
                    'date': a.date,
                    'is_current': a.is_current,
                }
            value = a.value
            # Area field?
            if a.name and 'area' in a.name and a.polygon is not None:
                # Get polygon
                #value = json.loads(a.polygon.json)
                # Apparently this is case sensitive: MultiPolygon as provided by the GeoJSON does not work
                #value['type'] = 'multipolygon'
                value = a.polygon.json or ''
            # do not include empty values
            if value is None or value == '':
                continue

            # Doc types: location, data_source or contract
            group_match = a.fk_group and a.fk_group.name or ''
            group_match = re.match(r'(?P<doc_type>location|data_source|contract)_(?P<count>\d+)',
                                   group_match)
            if group_match:
                dt, count = group_match.groupdict()['doc_type'], int(
                    group_match.groupdict()['count'])
                if doc_type == dt:
                    while len(docs) < count:
                        docs.append({
                            '_parent': activity.activity_identifier,
                            'id': a.id,  #'%i_%i' % (a.id, count),
                        })
                    docs[count - 1][a.name] = [value, ]
                # Set doc type counter within deal doc type (for location/data_source/contract)
                elif doc_type == 'deal':
                    # Set counter
                    key = '%s_count' % dt
                    if key not in deal_attrs.keys():
                        deal_attrs[key] = count
                    elif deal_attrs[key] < count:
                        deal_attrs[key] = count
                    # Create list with correct length to ensure formset values have the same index
                    if not a.name in deal_attrs:
                        deal_attrs[a.name] = [''] * count
                        if attribute:
                            deal_attrs[attribute_key] = [''] * count
                    else:
                        while len(deal_attrs[a.name]) < count:
                            deal_attrs[a.name].append('')
                            if attribute:
                                deal_attrs[attribute_key].append('')
                    deal_attrs[a.name][count - 1] = value
                    if attribute:
                        deal_attrs['%s_attr' % a.name][count - 1] = attribute
            # Doc type: deal and not formset
            elif doc_type == 'deal':
                if a.name in deal_attrs:
                    deal_attrs[a.name].append(value)
                    if '%s_attr' % a.name in get_elasticsearch_properties()['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name].append(attribute)
                else:
                    deal_attrs[a.name] = [value, ]
                    if '%s_attr' % a.name in get_elasticsearch_properties()['deal']['properties'].keys():
                        deal_attrs['%s_attr' % a.name] = [attribute, ]

        if doc_type == 'deal':
            # Additionally save operational company attributes
            oc = Investor.objects.filter(
                investoractivityinvolvement__fk_activity__activity_identifier=activity.activity_identifier)
            if oc.count() > 0:
                oc = oc.first()
                for field in Investor._meta.fields:
                    if isinstance(field, ForeignKey):
                        deal_attrs['operational_company_%s' % field.name] = getattr(
                            oc, '%s_id' % field.name)
                    else:
                        deal_attrs['operational_company_%s' % field.name] = getattr(
                            oc, field.name)
            else:
                pass
                #self.stderr and self.stderr.write("Missing operational company for deal #%i" % activity.activity_identifier)

        # Create single document for each location
        # FIXME: Saving single deals for each location might be deprecated since we have doc_type location now?
        spatial_names = list(get_spatial_properties())
        for i in range(deal_attrs.get('location_count', 0)):
            doc = deal_attrs.copy()
            for name in spatial_names:
                if not name in doc:
                    continue
                if len(deal_attrs[name]) > i:
                    doc[name] = deal_attrs[name][i]
                else:
                    doc[name] = ''
            # Set unique ID for location (deals can have multiple locations)
            doc['id'] = '%s_%i' % (doc['id'], i)
            point_lat = doc.get('point_lat', None)
            point_lon = doc.get('point_lon', None)
            if point_lat and point_lon:
                # Parse values
                try:
                    parsed_lat, parsed_lon = float(point_lat), float(point_lon)
                    doc['geo_point'] = '%s,%s' % (point_lat, point_lon)
                except ValueError:
                    doc['geo_point'] = '0,0'
            else:
                doc['point_lat'] = '0'
                doc['point_lon'] = '0'
                doc['geo_point'] = '0,0'
            # FIXME: we dont really need 'point_lat' and 'point_lon' here,
            # so we should pop them from doc when adding 'geo_point'
            docs.append(doc)
        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))
        return docs

    def get_export_properties(self, doc, doc_type='deal'):
        if doc_type == 'investor':
            return ExportInvestorForm.export(doc)
        elif doc_type == 'involvement':
            return InvestorVentureInvolvementForm.export(doc)
        else:
            properties = {
                'deal_scope_export': doc.get('deal_scope', ''),
                'is_public_export': doc.get('is_public', False) and str(_('Yes')) or str(_('No')),
                'deal_size_export': doc.get('deal_size', ''),
                'current_negotiation_status_export': doc.get('current_negotiation_status', ''),
                'top_investors_export': doc.get('top_investors', ''),
                'fully_updated_date_export': doc.get('fully_updated_date', ''),
            }
            # Doc types: deal, location, contract and data_source
            for form in ChangeDealView.FORMS:
                formset_name = hasattr(form, "form") and form.Meta.name or None
                form = formset_name and form.form or form
                properties.update(form.export(doc, formset=formset_name))
            properties.update(
                ExportInvestorForm.export(doc, prefix='operational_company_'))
            return properties

    def get_investor_documents(self, investor, doc_type='investor'):
        docs = []
        # Doc types: involvement and investor
        if doc_type == 'involvement':
            ivis = InvestorVentureInvolvement.objects.filter(
                Q(fk_venture=investor) | Q(fk_investor=investor))
            for ivi in ivis:
                doc = {}
                for field in ivi._meta.local_fields:
                    if isinstance(field, ForeignKey):
                        doc[field.name] = getattr(ivi, '%s_id' % field.name)
                    else:
                        doc[field.name] = getattr(ivi, field.name)
                docs.append(doc)
        elif doc_type == 'investor':
            doc = {}
            for field in investor._meta.local_fields:
                if isinstance(field, ForeignKey):
                    doc[field.name] = getattr(investor, '%s_id' % field.name)
                else:
                    doc[field.name] = getattr(investor, field.name)
            docs.append(doc)
        # Update docs with export values
        for doc in docs:
            doc.update(self.get_export_properties(doc, doc_type=doc_type))
        return docs

    def refresh_index(self):
        self.conn.refresh(self.index_name)

    def search(self, elasticsearch_query, doc_type='deal', sort=[]):
        """ Executes paginated queries until all results have been retrieved.
            @return: The full list of hits.
        """
        start = 0
        # 10000 is the default elasticsearch max_window_size
        # (pagination is cheap, so more is not necessarily better)
        size = 10000
        raw_result_list = []
        done = False
        while not done:
            query = {
                'query': elasticsearch_query,
                'from': start,
                'size': size,
            }
            if sort:
                query['sort'] = sort
            query_result = self.conn.search(query,
                                            index=self.index_name,
                                            doc_type=doc_type)
            raw_result_list.extend(query_result['hits']['hits'])
            results_total = query_result['hits']['total']
            if len(raw_result_list) >= results_total:
                done = True
            else:
                start = len(raw_result_list)
        print('\nElasticsearch returned %i documents from a total of %i \n\n' % (
            len(raw_result_list), query_result['hits']['total']))
        return raw_result_list

    def delete_activity(self, activity):
        for doc_type in DOC_TYPES_ACTIVITY:
            try:
                if doc_type == 'deal':
                    self.conn.delete(id=activity.activity_identifier,
                                     index=self.index_name,
                                     doc_type=doc_type)
                else:
                    self.conn.delete_by_query(query={
                        "parent_id": {
                            "type": "deal",
                            "id": str(activity.activity_identifier),
                        }
                    }, index=self.index_name, doc_type=doc_type)
            except ElasticHttpNotFoundError:
                pass

    def get_deals_by_activity_identifier(self, activity_identifier, doc_type='deal'):
        return self.search({
            "constant_score": {
                "filter": {
                    "term": {
                        "activity_identifier": activity_identifier
                    }
                }
            }
        })
#!/usr/bin/env python
from pyelasticsearch import ElasticSearch

from settings import HOST, INDEX, DOCTYPE

es = ElasticSearch(HOST)

# '*:*' is a Lucene query string that matches every document.
results = es.search('*:*', index=INDEX, doc_type=DOCTYPE)
hits = results['hits']['hits']
print hits
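pyelasticsearch also accepts a dict request body instead of a Lucene query string, so the match-all above can be written as a structured query with an explicit result size. A sketch under the same HOST/INDEX/DOCTYPE settings; the size value is arbitrary:

# Same match-all expressed as a structured query body (sketch).
results = es.search({'query': {'match_all': {}}, 'size': 50},
                    index=INDEX, doc_type=DOCTYPE)
print(len(results['hits']['hits']))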
from pyelasticsearch import ElasticSearch

es = ElasticSearch('http://localhost:9200/')
es.search('name:Russell', index='agile_data_science')
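The call above discards the response; a short sketch of how the hits would typically be consumed (same index and query as above):

results = es.search('name:Russell', index='agile_data_science')
for hit in results['hits']['hits']:
    print(hit['_source'])  # the indexed document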
# Bulk-index the documents, using each doc's own 'id' as the document id.
es.bulk((es.index_op(doc, id=doc.pop('id')) for doc in docs),
        index='test', doc_type='test')
es.refresh('test')
res1 = es.get('test', 'test', 1)

# Full-text match; note that Chinese and English text are tokenized differently.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html
res8 = es.search(index='test', size=2,
                 query={"query": {"query_string": {"query": "抽"}}})

# Prefix match query; only accepts lowercase (indexed terms are lowercased
# by the standard analyzer).
res12 = es.search(index='test', query={"query": {"prefix": {"title": "p"}}})

# Search: a bool query with a must/match clause first, then a filter.
res2 = es.search(index='test', query={
    "query": {
        "bool": {
            "must": [{
                "match": {
                    "name": 'Jessica'
                }
            }],
            # (The original snippet is truncated here; presumably a
            # "filter" clause followed the "must" match.)
        }
    }
})
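Since the tokenization note above matters for the Chinese-language query: Elasticsearch ships a built-in 'cjk' analyzer that indexes CJK text as bigrams instead of single characters. A sketch of mapping a field to it with pyelasticsearch's put_mapping, assuming the same 'test' index and 'name' field as above; the mapping must be in place before the documents are indexed:

# Sketch: analyze the 'name' field with the built-in CJK analyzer
# ('string' type as in Elasticsearch 1.x/2.x, the pyelasticsearch era).
es.put_mapping(index='test', doc_type='test', mapping={
    'test': {
        'properties': {
            'name': {'type': 'string', 'analyzer': 'cjk'},
        }
    }
})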
print 100 * '-'
print thunder_name

# _download_link = thunder_song.download_link
# url = kcloud + _download_link
# req = urllib2.Request(url)
# res = urllib2.urlopen(req)
# res = res.read()
# res = json.loads(res)
# download_link = res.get('result')
# urllib.urlretrieve(download_link, 't_music/' + thunder_name + '.ts')

# Look up the three best matches for the song name in the 'song' index.
es_songs = es.search(index='song', size=3,
                     query={'query': {'match': {'name': thunder_name}}})
es_songs = es_songs['hits']['hits']
es_songs = [item['_source'] for item in es_songs]
for item in es_songs:
    print item.get('id')
    print item.get('name')
    print item.get('artist')
    o2o_id = item.get('id')
    o2o_name = item.get('name')
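If the loose match query above pairs songs too eagerly (any shared token can produce a hit), a match_phrase query requires the tokens of thunder_name to appear in order, which is usually a safer way to resolve titles. A sketch under the same assumptions as the snippet above:

# Sketch: stricter title matching; only documents whose 'name' contains
# the tokens of thunder_name in order will match.
es_songs = es.search(index='song', size=3,
                     query={'query': {'match_phrase': {'name': thunder_name}}})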