class ES(object):
    """Thin wrapper around a local Elasticsearch instance managing the
    "test-index" index of goods descriptions."""

    def __init__(self):
        self.es = Elasticsearch()
        self.id = 0  # auto-incrementing Elasticsearch document id

    def insert_es(self, id, good, description):
        """Index one goods-description document under the internal counter id.

        NOTE(review): the *id* argument is stored inside the document body,
        while the Elasticsearch document id is self.id — confirm this split
        is intentional.
        """
        doc = {
            'id': id,
            'good': good,
            'description': description,
        }
        res = self.es.index(index="test-index", doc_type='description_goods',
                            id=self.id, body=doc)
        res = self.es.get(index="test-index", doc_type='description_goods',
                          id=self.id)
        # Refresh so the new document is immediately visible to searches.
        self.es.indices.refresh(index="test-index")
        self.id += 1

    def search_es(self, what, query):
        """Full-text match on field *what*; return the matching _source dicts."""
        res = self.es.search(index="test-index",
                             body={"query": {"match": {what: query}}})
        print("Got %d Hits" % res['hits']['total'])
        documents = []
        for hit in res['hits']['hits']:
            documents.append(hit['_source'])
        return documents

    def del_by_query(self, query):
        """Delete every document matching *query* (a {field: value} mapping).

        Bug fix: the original passed {"match": {query}}, which is a Python
        *set* literal — not serializable and not a valid Elasticsearch body.
        """
        res = self.es.delete_by_query(index="test-index",
                                      body={"query": {"match": query}})

    def del_all(self):
        """Delete every document in the index."""
        res = self.es.delete_by_query(index="test-index",
                                      body={"query": {"match_all": {}}})
def annotate(config, documentId):
    # POS-tag the text fields of one corpus document and store the tags in the
    # processor index under the same document id. No-op when the config
    # explicitly disables "getPosTags".
    if "getPosTags" in config and config["getPosTags"] == False:
        return
    esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    corpusIndex = config["corpus"]["index"]
    corpusType = config["corpus"]["type"]
    corpusFields = config["corpus"]["text_fields"]
    processorIndex = config["processor"]["index"]
    processorType = config["processor"]["type"]
    document = esClient.get(index=corpusIndex, doc_type=corpusType, id = documentId, fields=corpusFields)
    # Concatenate all configured text fields into one sentence-delimited blob.
    content = ""
    if "fields" in document:
        for field in corpusFields:
            if field in document["fields"]:
                if type(document["fields"][field]) is list:
                    for element in document["fields"][field]:
                        content += element + ". "
                else:
                    content += document["fields"][field] + ". "
    annotatedDocument = {}
    sentences = nltk.sent_tokenize(content)
    posTaggedSentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) > 1:
            # Normalize hyphens/periods before tokenizing and tagging.
            sentence = sentence.replace("-", " ")
            sentenceWords = nltk.word_tokenize(sentence.lower())
            sentenceWords = map(lambda x: x.replace(".", ""), sentenceWords)
            posTags = nltk.pos_tag(sentenceWords)
            posTaggedSentences.append(posTags)
    # Merge into any existing annotation document rather than overwriting it.
    if esClient.exists(index=processorIndex, doc_type=processorType, id=document["_id"]):
        annotatedDocument = esClient.get(index=processorIndex, doc_type=processorType, id=document["_id"])["_source"]
    annotatedDocument["pos_tagged_sentences"] = posTaggedSentences
    esClient.index(index=processorIndex, doc_type=processorType, id=document["_id"], body=annotatedDocument)
    config["logger"].info("pos-processor: Annotated document '" + document["_id"] + "'")
class TestMemcachedConnection(ElasticTestCase):
    """Integration tests for talking to Elasticsearch through the memcached
    transport plugin; skipped when pylibmc or the plugin is unavailable."""

    def setUp(self):
        try:
            import pylibmc
        except ImportError:
            raise SkipTest("No pylibmc.")
        super(TestMemcachedConnection, self).setUp()
        nodes = self.client.nodes.info()
        # Use the first node exposing a memcached endpoint.
        for node_id, node_info in nodes["nodes"].items():
            if 'memcached_address' in node_info:
                connection_info = ADDRESS_RE.search(node_info['memcached_address']).groupdict()
                self.mc_client = Elasticsearch(
                    [connection_info],
                    connection_class=MemcachedConnection
                )
                break
        else:
            raise SkipTest("No memcached plugin.")

    def test_index(self):
        self.mc_client.index("test_index", "test_type", {"answer": 42}, id=1)
        self.assertTrue(self.client.exists("test_index", doc_type="test_type", id=1))

    def test_get(self):
        self.client.index("test_index", "test_type", {"answer": 42}, id=1)
        # assertEqual: assertEquals is a deprecated unittest alias.
        self.assertEqual({"answer": 42}, self.mc_client.get("test_index", doc_type="test_type", id=1)["_source"])

    def test_unicode(self):
        self.mc_client.index("test_index", "test_type", {"answer": u"你好"}, id=u"你好")
        self.assertEqual({"answer": u"你好"}, self.mc_client.get("test_index", doc_type="test_type", id=u"你好")["_source"])

    def test_missing(self):
        self.assertRaises(NotFoundError, self.mc_client.get, "test_index", doc_type="test_type", id=42)
def commit(self, index_name, user_name):
    """ Commit the current state of factor network to a local Elastic instance

    The index_name should remain constant for an organization. The user_name
    refers to the specific user and provides the functionality to maintain the
    user provenance by making it the Elastic document type.

    Specifically, split the state into 3 components
    (1) root (the datum with which you started)
    (2) extension (the data you've confirmed based on factor network suggestions)
    (3) suggestions (the suggested extensions to your data)

    We index a factor network by taking the root and appending a _x to it.
    We loop through get requests on that particular lead to get based on the
    most recently committed root_x and we add 1 to x.
    """
    es = Elasticsearch()
    # Classify nodes by whether they appear as edge sources, targets, or both.
    source = set()
    target = set()
    edges = self.G.edges()
    for edge in edges:
        source.add(edge[0])
        target.add(edge[1])

    def split(intersection, edges):
        # Collect every edge that touches any node in *intersection*.
        result = []
        for i in intersection:
            for edge in edges:
                if i in edge:
                    result.append(edge)
        return result

    state = {}
    state["root"] = split(source.difference(target), edges)
    state["extension"] = split(target.intersection(source), edges)
    state["suggestions"] = split(target.difference(source), edges)
    # Probe root_1, root_2, ... until an id is free; a failed get ends the loop.
    i = 1
    preexisting = True
    while preexisting:
        try:
            index_id = state["root"][0][0] + "_" + str(i)
            es.get(index=index_name, id=index_id, doc_type=user_name)
            i += 1
        except Exception:
            # Was a bare except; narrowed so Ctrl-C / SystemExit still propagate.
            preexisting = False
    res = es.index(index=index_name, id=index_id, doc_type=user_name, body=state)
    current_state = es.get(index=index_name, id=index_id, doc_type=user_name)
    return current_state
class ESClient:
    """Client for the metric_metadata / enums Elasticsearch indexes; every
    request is routed by tenant id."""

    def __init__(self, es_params):
        self.es = Elasticsearch(es_params)

    def get_metric_metadata(self, metric_name, tenant_id):
        """
        Get document from index metric_metadata for a given metric name and tenant id
        """
        document_id = self.get_document_id(tenant_id=tenant_id, metric_name=metric_name)
        try:
            return self.es.get(index='metric_metadata', doc_type='metrics',
                               id=document_id, routing=tenant_id)
        except NotFoundError as e:
            # Surface the error payload rather than raising.
            return e.info

    def get_enums_data(self, metric_name, tenant_id):
        """
        Get document from index enums for a given metric name and tenant id
        """
        document_id = self.get_document_id(tenant_id=tenant_id, metric_name=metric_name)
        try:
            return self.es.get(index='enums', doc_type='metrics',
                               id=document_id, routing=tenant_id)
        except NotFoundError as e:
            return e.info

    def delete_metric_metadata(self, metric_name, tenant_id):
        """
        Delete document from index metric_metadata for metric_metadata
        dictionary(obtained from get_metric_metadata call) and tenant id
        """
        document_id = self.get_document_id(tenant_id=tenant_id, metric_name=metric_name)
        self.es.delete(index='metric_metadata', doc_type='metrics',
                       id=document_id, routing=tenant_id)
        # print as a function (same output on py2, also parses on py3).
        print('Deleted from index metric_metadata for _id: [%s] routing: [%s]' % (document_id, tenant_id))

    def delete_enums_data(self, metric_name, tenant_id):
        """
        Delete document from index enums for enums dictionary(obtained from
        get_enums_data call) and tenant id
        """
        document_id = self.get_document_id(tenant_id=tenant_id, metric_name=metric_name)
        self.es.delete(index='enums', doc_type='metrics', id=document_id, routing=tenant_id)
        print('Deleted from index enums for _id: [%s] routing: [%s]' % (document_id, tenant_id))

    def get_document_id(self, tenant_id, metric_name):
        """
        Construct _id of elastic search from tenant id and metric name
        """
        return tenant_id + ':' + metric_name
class CommonElasticsearch(object):
    """
    Class for interfacing with Elasticsearch or docker logging directly
    """

    def __init__(self, index_type='mediakraken', es_host='mkelk', es_port=9200,
                 debug_override=None):
        # The DEBUG env var selects the log sink ('es', 'sys', 'print');
        # debug_override is used when the env var is absent.
        if 'DEBUG' in os.environ and debug_override is None:
            self.debug = os.environ['DEBUG'].lower()
            if self.debug == 'es':
                self.es_inst = Elasticsearch([{'host': es_host, 'port': es_port}])
                self.es_index = index_type
        else:
            # Identical to the original two-way branch: None stays None.
            self.debug = debug_override

    def com_elastic_index(self, log_type, body_data):
        """Write one log record to the sink chosen in __init__."""
        # write log to elk
        if self.debug == 'es':
            # best-effort: don't let the container fail if mkelk isn't accepting
            try:
                self.es_inst.index(index=self.es_index,
                                   doc_type='MediaKraken',
                                   body={"text": {"type": log_type,
                                                  "data": json.dumps(body_data),
                                                  "timestamp": time.strftime("%Y%m%d%H%M%S")}})
            except Exception:  # was bare except: keep SystemExit/KeyboardInterrupt alive
                print((log_type, body_data))
        # write log to host syslog
        elif self.debug == 'sys':
            try:
                sys.stdout.write(str({"type": log_type,
                                      "data": json.dumps(body_data),
                                      "timestamp": time.strftime("%Y%m%d%H%M%S")}))
            except Exception:
                # Fallback without the (possibly unserializable) data payload.
                sys.stdout.write(str({"type": log_type,
                                      "timestamp": time.strftime("%Y%m%d%H%M%S")}))
        # write log via plain print
        elif self.debug == 'print':
            try:
                print(str({"type": log_type,
                           "data": json.dumps(body_data),
                           "timestamp": time.strftime("%Y%m%d%H%M%S")}))
            except Exception:
                print(str({"type": log_type,
                           "data": str(body_data),
                           "timestamp": time.strftime("%Y%m%d%H%M%S")}))

    def com_elastic_get(self, id):
        """Fetch one previously indexed log record by id.

        NOTE(review): the result is not returned — confirm callers expect None.
        """
        self.es_inst.get(index=self.es_index, doc_type='MediaKraken', id=id)
class ElasticSearchDb(PersistenceBase):
    """Persistence backend storing objects as JSON documents in one
    Elasticsearch index, with one doc_type per logical table."""

    def __init__(self):
        # Bug fix: the original created a throwaway PersistenceBase and
        # re-initialized *it*, leaving self untouched by the base class.
        super().__init__()
        self.session = Elasticsearch()
        self.database = Config().elasticsearchindex
        if not self.session.indices.exists(index=self.database):
            self.session.indices.create(index=self.database)

    def selectalltables(self):
        """Return the doc_type (table) names found in the index mapping."""
        tables = []
        res = self.session.indices.get_mapping(index=self.database)
        for mapping_name in res[self.database]['mappings']:
            tables.append(mapping_name)
        return tables

    def selectallcolumns(self, tablename):
        """Return the property (column) names of one doc_type mapping."""
        columns = []
        res = self.session.indices.get_mapping(index=self.database)
        for column in res[self.database]['mappings'][tablename]['properties']:
            columns.append(column)
        return columns

    def selectall(self, tablename):
        # NOTE(review): es.get without an id is not a valid API call; this
        # probably should be a match_all search — confirm intended behavior.
        res = self.session.get(index=self.database, doc_type=tablename)
        return res

    def selectone(self, tablename, id):
        """Fetch one document by its stored uuid field."""
        queryfilter = {'uuid': id}
        res = self.session.get(index=self.database, doc_type=tablename,
                               body={'query': queryfilter})
        return res

    def insert(self, obj, tablename):
        """Stamp updatedAt/uuid on *obj* and index it under its uuid."""
        obj.updatedAt = datetime.isoformat(datetime.now())
        obj.uuid = str(self.getuuid())
        serialized_obj = self.getallvaluesfromobject(obj)
        self.session.index(index=self.database, doc_type=tablename,
                           body=serialized_obj, id=obj.uuid)

    def update(self, obj, tablename):
        """Re-serialize *obj* and apply it as a partial update.

        Bug fix: the original called es.update without a body, which the
        Elasticsearch API rejects.
        """
        obj.updatedAt = datetime.isoformat(datetime.now())
        doc = json.loads(self.getallvaluesfromobject(obj))
        self.session.update(index=self.database, doc_type=tablename,
                            id=obj.uuid, body={"doc": doc})

    def delete(self, obj, tablename):
        # todo: add to a table that manages deleted items (just to know the obj
        # was deleted and we don't have to add it again)
        self.session.delete(index=self.database, doc_type=tablename, id=obj.uuid)

    def getallvaluesfromobject(self, obj):
        """Serialize any object to JSON via its __dict__."""
        print(obj)
        ret = json.dumps(obj, default=lambda o: o.__dict__)
        return ret
def test_foo(sm_config):
    """Index two mocked annotations and verify their stored _source bodies."""
    mock_annotations = [
        ('test_ds', 'test_db', 'H20', '+H', [], []),
        ('test_ds', 'test_db', 'Au', '+H', [], []),
    ]
    mock_db = MagicMock(DB)
    mock_db.select.return_value = mock_annotations

    exporter = ESExporter(sm_config)
    exporter.index_ds(mock_db, 'test_ds', 'test_db')

    es_client = Elasticsearch()
    expected = {
        'test_ds_test_db_H20_+H': {'ds_name': 'test_ds', 'db_name': 'test_db',
                                   'sf': 'H20', 'adduct': '+H',
                                   'comp_names': '', 'comp_ids': ''},
        'test_ds_test_db_Au_+H': {'ds_name': 'test_ds', 'db_name': 'test_db',
                                  'sf': 'Au', 'adduct': '+H',
                                  'comp_names': '', 'comp_ids': ''},
    }
    for doc_id, source in expected.items():
        doc = es_client.get(index='sm', id=doc_id, doc_type='annotation', _source=True)
        assert doc['_source'] == source
def index():
    """Search view: '#tag' looks a note up by id, anything else matches bodies."""
    search_string = ""
    if request.method == 'POST':
        search_string = request.form['searchString']
    elif request.method == "GET":
        search_string = request.args.get("searchString", "")

    notes = []
    if len(search_string) > 0:
        es = Elasticsearch(['http://159.203.66.191:9200'])
        search_string = search_string.strip()
        if search_string.startswith("#"):
            # Hashtag query: direct id lookup; redirect to the editor on miss.
            tag = search_string[1:]
            try:
                res = es.get(index="brahman", doc_type='note', id=tag)
                notes.append({"title": tag,
                              "body": str(res['_source']['body']).strip()})
            except TransportError as e:
                app.logger.error(e.info)
                return redirect(url_for('addNote') + "?id=" + tag)
        else:
            # Free-text query against note bodies.
            res = es.search(index="brahman", doc_type="note",
                            body={"query": {"match": {"body": search_string}}})
            for hit in res['hits']['hits']:
                notes.append({"title": str(hit["_id"]),
                              "body": str(hit["_source"]['body']).strip()})
    return render_template("index.html", notes=notes)
class ElasticSearchManager(object):
    """Minimal CRUD manager bound to one Elasticsearch index/doc_type pair."""

    def __init__(self, index=None, doc_type=None, *args, **kwargs):
        self.index = index
        self.doc_type = doc_type
        self.obj_es = Elasticsearch()

    def search(self, query=None, *args, **kwargs):
        """Run a match query and return the hit sources."""
        response = self.obj_es.search(index=self.index, doc_type=self.doc_type,
                                      body={"query": {"match": query}})
        return fetch_source(response['hits']['hits'])

    def get(self, *args, **kwargs):
        """Fetch one document by kwargs['id'] and return its _source."""
        response = self.obj_es.get(index=self.index, doc_type=self.doc_type,
                                   id=kwargs['id'])
        return response['_source']

    def get_list(self, *args, **kwargs):
        """Return the sources of every document in the index."""
        response = self.obj_es.search(index=self.index,
                                      body={"query": {"match_all": {}}})
        return fetch_source(response['hits']['hits'])

    def insert(self, data=None):
        """Flatten a tweet-like JSON payload and index it under its id."""
        record = json.loads(data)
        record['user_name'] = record['user']['screen_name']
        del record['user']
        del record['entities']
        result = self.obj_es.index(index=self.index, doc_type=self.doc_type,
                                   id=record['id'], body=record)
        logger.info("Getting stream:{0}".format(result))

    def delete(self, data=None):
        pass

    def update(self, data=None):
        pass
def main():
    # Evaluate algorithm detection for npm sample packages stored in Redis:
    # compare Elasticsearch-indexed results against the expected Redis labels
    # and tally confusion-matrix outcomes.
    es = Elasticsearch([{"host": "localhost", "port": 9200}])
    r = redis.StrictRedis()
    samples = r.smembers("samples")
    conditions = Counter()
    for pkgName in samples:
        print "================"
        printPkgContent(pkgName)
        # ignore=404 suppresses the not-found error; doc["found"] tells instead.
        doc = es.get(id="npm:%s:js" % pkgName, index="throwtable", doc_type="implementation", ignore=404)
        if doc["found"]:
            actual = doc["_source"]["algorithm"]
        else:
            actual = []
        expected = r.smembers("%s:map" % pkgName)
        result = checkPkg(pkgName, actual, expected, r, es)
        print result
        # Bucket the package by its confusion-matrix outcome for later review.
        r.sadd("samples-%s" % result, pkgName)
        conditions[result] += 1
    for (k, v) in conditions.items():
        print "%s: %s" % (k, v)
    # NOTE(review): raises ZeroDivisionError when a denominator is zero.
    print "Precision:", 1.0 * conditions[TRUE_POSITIVE] / (conditions[TRUE_POSITIVE] + conditions[FALSE_POSITIVE])
    print "Recall:", 1.0 * conditions[TRUE_POSITIVE] / (conditions[TRUE_POSITIVE] + conditions[FALSE_NEGATIVE])
def rcr(index='itest01', type='ttest01', id='dtest01'):
    """
    Demonstrates the retrieve-change-reindex cycle for updating a
    document in Elasticsearch.
    """
    body = '{"alist": ["element1"]}'
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    es.index(index=index, doc_type=type, id=id, body=body)
    res = es.get(index=index, doc_type=type, id=id)
    print("\nOriginal Document\n-----------------")
    pprint(res)
    # Change step: mutate the retrieved source...
    res['_source']['alist'].extend(['element2', 'element3'])
    # ...and reindex. Bug fix: index only _source — the original indexed the
    # whole get response, nesting _index/_id/_source inside the document.
    es.index(index=index, doc_type=type, id=id, body=res['_source'])
    res = es.get(index=index, doc_type=type, id=id)
    print("\nUpdated Document\n-----------------")
    pprint(res)
class ElasticsearchUtils(object):
    """Small helper around the Elasticsearch client. Every call is a no-op
    (returns None) until init_connect() has been invoked."""

    def __init__(self, host_ports):
        # host_ports format: [{'host': 'xxx', 'port': 9200}, {...}]
        self.host_ports = host_ports
        self.es = None

    def init_connect(self):
        """Open the connection; True when the cluster answers a ping."""
        self.es = Elasticsearch(self.host_ports)
        return self.es.ping()

    def get_search_result(self, index_name, type_name, query_body):
        """Run *query_body*; None when not connected."""
        if self.es:
            return self.es.search(index=index_name, doc_type=type_name, body=query_body)
        return

    def get_id_result(self, index_name, type_name, doc_id):
        """Fetch a document's _source by id; None when not connected."""
        if self.es:
            return self.es.get(index=index_name, doc_type=type_name, id=doc_id)['_source']
        return

    def add_index_doc(self, index_name, type_name, doc_id, doc_body):
        """Index a document; a None doc_id lets Elasticsearch generate one.

        Consistency fix: now guards self.es like the read methods instead of
        raising AttributeError when called before init_connect().
        """
        if not self.es:
            return
        if doc_id:
            self.es.index(index=index_name, doc_type=type_name, id=doc_id, body=doc_body)
        else:
            self.es.index(index=index_name, doc_type=type_name, body=doc_body)

    def batch_index(self, index_name, type_name, doc_body_lines):
        """Bulk-index newline-delimited action/source lines (guarded as above)."""
        if not self.es:
            return
        self.es.bulk(index=index_name, doc_type=type_name, body=doc_body_lines)
def search(**kwargs):
    """Query the ecommerce/product index; return the _source of each hit.

    Optional kwargs: query (default match_all), size (default 10),
    sort (default ['_score']).
    """
    body = {
        'query': kwargs.get('query', dict(match_all={})),
        'size': kwargs.get('size', 10),
        'sort': kwargs.get('sort', ['_score']),
    }
    response = Elasticsearch().search(index='ecommerce', doc_type='product', body=body)
    return [hit.get('_source') for hit in response.get('hits').get('hits')]
def hashtagNote(tag):
    """Fetch note *tag* and return its body with newlines rendered as
    ``<br />`` and tabs collapsed to single spaces."""
    client = Elasticsearch(['http://159.203.66.191:9200'])
    doc = client.get(index="brahman", doc_type='note', id=tag)
    body = doc['_source']['body']
    body = body.replace("\n", "<br />")
    return body.replace("\t", " ")
def getTermStatistics():
    # Build per-document term statistics for every query term in qtToQno,
    # filling the caches qtToDoc, docTermStats and docToLength.
    # NOTE(review): qtToQno, docTermStats, docToLength, qtToDoc and
    # ignoreWordsList are presumably module-level globals — confirm.
    startTime = time.time()
    es = Elasticsearch()
    for term in qtToQno.keys():
        qtDocList= []
        results = es.search(index='ap_dataset', doc_type="document", body={"query": {"match": {"TEXT": "'"+term +"'"}}} ,size=9000)
        for doc in results['hits']['hits'] :
            dictToAddInDStats = {}
            qtDocList.append(doc["_id"])
            # Skip documents whose term statistics were already collected.
            if doc['_id'] in docTermStats.keys():
                continue
            else:
                # Unique-word count of the document minus the ignore list.
                docToLength[doc['_id']] = len(set(es.get(index='ap_dataset',doc_type="document", id=doc['_id'])["_source"]["TEXT"].split()) - set(ignoreWordsList))
                ts = es.termvector(index='ap_dataset', doc_type="document",id= doc['_id'], term_statistics = True,field_statistics = False)
                # Keep statistics only for terms that are also query terms.
                keysToKeep = set(qtToQno.keys()) & set(ts["term_vectors"]["TEXT"]["terms"].keys())
                for tKeys in keysToKeep:
                    dictToAddInDStats[tKeys] = ts["term_vectors"]["TEXT"]["terms"][tKeys]
                docTermStats[doc['_id']] = dictToAddInDStats
                print(len(docTermStats))
        qtToDoc[term] = qtDocList
    elapsedTime= time.time() - startTime
def addNote():
    # Flask view: POST saves a note under its "maintag" id and redirects home;
    # GET pre-fills the edit form with the existing note body (empty when new).
    es = Elasticsearch(['http://159.203.66.191:9200'])
    id = ""
    noteStr = ""
    if request.method == 'POST':
        id = request.form['id']
        noteStr = request.form['note']
        # Only persist when both the id and the body are non-blank.
        if len(noteStr.strip()) > 0 and len(id.strip()):
            note = {};
            note["maintag"] = id
            note["body"] = noteStr
            # The note id doubles as the Elasticsearch document id.
            es.index(index="brahman", doc_type='note', id=note["maintag"], body=note)
        return redirect(url_for('index'))
    elif request.method == "GET":
        id = request.args.get("id", "")
        if (len(id) > 0):
            note = {}
            try:
                res = es.get(index="brahman", doc_type='note', id=id)
                note["title"] = id;
                note["body"] = str(res['_source']['body']).strip()
            except TransportError as e:
                # Missing note: present an empty body so the user can create it.
                note["title"] = id;
                note["body"] = ""
            return render_template("addNote.html",note=note);
class ObjectManager(object):
    """CRUD gateway mapping domain models onto one Elasticsearch index/type."""

    def __init__(self, index, doc_type, model_class):
        super(ObjectManager, self).__init__()
        self.index = index
        self.doc_type = doc_type
        self.model_class = model_class
        self.es = Elasticsearch()
        self.mapper = ObjectMapper()

    def find_one(self, pk):
        """Load the document with id *pk* and map it to a model instance."""
        raw = self.es.get(index=self.index, doc_type=self.doc_type, id=pk)
        return self.mapper.from_dict_to_model(raw, self.model_class)

    def save(self, model):
        """Index *model* under its identity; True when newly created."""
        payload = self.mapper.from_model_to_dict(model)
        response = self.es.index(index=self.index, doc_type=self.doc_type,
                                 id=model.get_identity(), body=payload)
        return response['created']

    def find_all(self):
        """Return every document in the type, mapped to model instances."""
        hits = self.es.search(index=self.index, doc_type=self.doc_type,
                              body={"query": {"match_all": {}}})['hits']['hits']
        return [self.mapper.from_dict_to_model(hit, self.model_class) for hit in hits]

    def update(self, model):
        """Partial-update the document identified by model.pk."""
        payload = self.mapper.from_model_to_dict(model)
        return self.es.update(index=self.index, doc_type=self.doc_type,
                              id=model.pk, body={"doc": payload})

    def delete(self, pk):
        """Remove the document with id *pk*."""
        return self.es.delete(index=self.index, doc_type=self.doc_type, id=pk)
def main():
    # Evaluate algorithm detection for Rosetta Code tasks: compare the
    # Elasticsearch-indexed algorithms against expected Redis labels and tally
    # confusion-matrix outcomes.
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    site = mw.Site('rosettacode.org', path='/mw/')
    r = redis.StrictRedis()
    samples = r.smembers('samples')
    conditions = Counter()
    counter = 0
    for taskName in samples:
        print 'task # %d ================' % counter
        counter += 1
        printTaskContent(taskName, site)
        # Task names map to implementation ids via a Redis hash.
        impl_id = r.hget('rosetta-id-taskname-mapping', normalize(taskName))
        if impl_id is None:
            actual = []
        else:
            # ignore=404 suppresses the not-found error; check 'found' instead.
            result = es.get(index='throwtable', doc_type='implementation', id=impl_id, ignore=404)
            if result['found']:
                actual = result['_source']['algorithm']
            else:
                actual = []
        expected = r.smembers("%s:map" % taskName)
        result = checkPkg(taskName, actual, expected, r)
        print result
        # Bucket the task by its confusion-matrix outcome for later review.
        r.sadd('samples-%s' % result, taskName)
        conditions[result] += 1
    for (k, v) in conditions.items():
        print "%s: %s" % (k, v)
    # NOTE(review): raises ZeroDivisionError when a denominator is zero.
    print "Precision:", 1.0 * conditions[TRUE_POSITIVE] / (conditions[TRUE_POSITIVE] + conditions[FALSE_POSITIVE])
    print "Recall:", 1.0 * conditions[TRUE_POSITIVE] / (conditions[TRUE_POSITIVE] + conditions[FALSE_NEGATIVE])
def get_rdap_asn(asn):
    # Serve RDAP data for an AS number: try the local Elasticsearch cache
    # first, otherwise fall back to the remote RDAP bootstrap service.
    es = Elasticsearch()
    # NOTE(review): existence is checked in index 'whois'/type 'asn_rdap' but
    # the document is fetched from index 'rdap'/type 'asn' — confirm these are
    # meant to differ.
    does_exist = es.exists(index='whois', doc_type='asn_rdap', id = asn)
    print does_exist
    if does_exist is True:
        status = 200
        print "Found it!"
        get_record = es.get(index='rdap',doc_type='asn', id = asn)
        results = jsonify(get_record['_source'])
    else:
        try:
            # Cache miss: query the public RDAP bootstrap endpoint.
            url = 'http://hailey.opendnsbl.net:8080/rdapbootstrap/autnum/%s' % asn
            r = requests.get(url)
            status = 200
            b = r.json()
            results = jsonify(b)
        except Exception as e:
            print e
            # NOTE(review): results_raw is never used afterwards.
            results_raw = jsonify({'status': "not_found"})
            status = 404
            results = jsonify({'status': "not_found"})
    return results,status
class Feedback(object):
    # Helper around the day-partitioned "result_YYYYMMDD" Elasticsearch index.
    host = '127.0.0.1'
    # Index name is fixed at import time to today's date.
    index='result_'+date.today().strftime('%Y%m%d')
    index_pattern = 'result_*'

    def __init__(self, doctype='feedback'):
        self.server = Elasticsearch([{'host': self.host}])
        self.doctype = doctype

    def get(self, docid):
        # Fetch and pretty-print one document by id; print error info on failure.
        try:
            doc = self.server.get(index=self.index, doc_type=self.doctype, id=docid)
            pprint(doc)
        except elasticsearch.ElasticsearchException as e:
            print e.info

    def update(self, docid, content):
        # Apply a partial update; *content* must follow the ES update body format.
        try:
            res = self.server.update(index=self.index, doc_type=self.doctype, id=docid, body=content)
            print res
        except elasticsearch.ElasticsearchException as e:
            print e.info

    def search(self, content):
        # Run a query body, honoring the module-level opt_size/opt_source
        # display options (assumed defined at module scope — TODO confirm).
        global opt_size, opt_source
        try:
            res = self.server.search(index=self.index, doc_type=self.doctype, body=content, _source=opt_source, size=opt_size)
            pprint(res, width=120)
        except elasticsearch.ElasticsearchException as e:
            print e.info
class ES():
    # Wrapper holding a default index/doc_type and offering simple CRUD.
    def __init__(self):
        self.es = Elasticsearch()

    def setIndex(self, index):
        self.index = index

    def getIndex(self):
        return self.index

    def setDocType(self, doc_type):
        self.doc_type = doc_type

    def getDocType(self):
        return self.doc_type

    # Insert a document.
    # Creates it when absent, updates it otherwise.
    # @params id
    # @params document
    #
    # @return document
    def insertOrUpdate(self, id, document):
        index = self.getIndex()
        doc_type = self.getDocType()
        exist = self.get(doc_type, id)
        # get() returns the string 'null' (not None) when the doc is missing.
        if exist != 'null':
            result = self.update(id, document)
        else:
            result = self.es.index(index=index, doc_type=doc_type, id=id, body=document)
        return document

    def insert(self, id, document):
        index = self.getIndex()
        doc_type = self.getDocType()
        result = self.es.index(index=index, doc_type=doc_type, id=id, body=document)
        return result

    # Update an existing document (partial "doc" update).
    def update(self, id, document):
        index = self.getIndex()
        doc_type = self.getDocType()
        return self.es.update(index=index, doc_type=doc_type, id=id, body={"doc" : document})

    # Fetch a document: returns its _source, or the string 'null' on any
    # lookup/transport error.
    def get(self, doc_type, id):
        index = self.getIndex()
        try:
            document = self.es.get(index = index, doc_type = doc_type, id = id)
            return document['_source']
        except NotFoundError, e:
            return 'null'
        except TransportError, e:
            return 'null'
class NmapElasticsearchPlugin(NmapBackendPlugin):
    """
    This class enables the user to store and manipulate nmap reports \
    in a elastic search db.
    """

    def __init__(self, index=None):
        # Default to one index per day: nmap.YYYY-MM-DD.
        if index is None:
            self.index = "nmap.{0}".format(datetime.now().strftime('%Y-%m-%d'))
        else:
            self.index = index
        self._esapi = Elasticsearch()

    def insert(self, report, doc_type=None):
        """
        insert NmapReport in the backend
        :param NmapReport:
        :return: str the ident of the object in the backend for future usage
        or None
        """
        if doc_type is None:
            doc_type = 'NmapReport'
        # Round-trip through JSON so Elasticsearch receives a plain dict.
        j = json.dumps(report, cls=ReportEncoder)
        res = self._esapi.index(
            index=self.index, doc_type=doc_type, body=json.loads(j))
        rc = res['_id']
        return rc

    def delete(self, id):
        """
        delete NmapReport if the backend
        :param id: str
        """
        raise NotImplementedError

    def get(self, id):
        """
        retreive a NmapReport from the backend
        :param id: str
        :return: NmapReport
        """
        res = self._esapi.get(index=self.index, doc_type="NmapReport",
                              id=id)['_source']
        return res

    def getall(self, filter=None):
        """
        :return: collection of tuple (id,NmapReport)
        :param filter: Nice to have implement a filter capability

        NOTE(review): currently only prints the raw search response for
        debugging and implicitly returns None.
        """
        rsearch = self._esapi.search(index=self.index,
                                     body={"query": {"match_all": {}}})
        print("--------------------")
        print(type(rsearch))
        print(rsearch)
        print("------------")
class PeragroClient():
    """ An audio search client backed by Elasticsearch """

    def __init__(self):
        """ initialize client object with elasticsearch object """
        self.es = Elasticsearch()

    def set_index(self, index):
        """ set index for to lookup in elasticsearch
            Input:
                -index: an elasticsearch index
        """
        self.index = index

    def get_sound(self, id_):
        """ Get sound by its id
            input:
                -id: id of sound
            output:
                -sound: sound details if it exists otherwise None
            Usage:
            >>> id = "X2VFAB12GH"
            >>> sound = c.get_sound(id)
        """
        # Guard clause: bail out early when no document has this id.
        if not self.es.exists(index=self.index, doc_type='_all', id=id_):
            return None
        return self.es.get(index=self.index, id=id_)

    def text_search(self, query):
        """ Get sound results based on text query. It also has support
            for field queries.
            Usage:
            >>> query = "tum hi ho"
            >>> sounds = c.text_search(query)
            >>> # OR field query
            >>> query = "tags:'interscope' genre:'hip hop'"
            >>> sounds = c.text_search(query)
        """
        response = self.es.search(index=self.index, q=query)
        print("Got %d Hits:" % response['hits']['total'])
        return response
def run(node):
    """Merge the two factor-network documents named by node['id_a']/['id_b']."""
    first_id = node.get('id_a', '63166071_1')
    second_id = node.get('id_b', '63166071_2')
    es = Elasticsearch()
    first = es.get(index="factor_state2016", doc_type='factor_network', id=first_id)
    second = es.get(index="factor_state2016", doc_type='factor_network', id=second_id)
    factor = ElasticFactor(cfg["cdr_elastic_search"]["hosts"] + cfg["cdr_elastic_search"]["index"])
    return factor.merge(first["_source"], second["_source"])
class ESIndex:
    """Convenience wrapper around an Elasticsearch cluster carrying default
    index / doc_type values that individual calls may override."""

    def __init__(self, hosts, index="", doc_type=""):
        self.es = Elasticsearch(hosts)
        # Bug fix: the original stored these as self.index / self.doc_type,
        # which shadowed the index() method on every instance and made it
        # uncallable (str object is not callable). Stored privately instead.
        self._index = index
        self._doc_type = doc_type

    def _resolve(self, index, doc_type):
        # Fall back to the instance defaults when the caller passes "".
        index_ = self._index if index == "" else index
        doc_type_ = self._doc_type if doc_type == "" else doc_type
        return index_, doc_type_

    def index(self, doc_id, body, index="", doc_type=""):
        """Index *body* under *doc_id*."""
        index_, doc_type_ = self._resolve(index, doc_type)
        return self.es.index(index=index_, doc_type=doc_type_, body=body, id=doc_id)

    def delete(self, doc_id, index="", doc_type=""):
        """Delete the document with id *doc_id*."""
        index_, doc_type_ = self._resolve(index, doc_type)
        return self.es.delete(index=index_, doc_type=doc_type_, id=doc_id)

    def bulk(self, docs, index="", doc_type="", op_type='index'):
        '''
        bulk sample:
        {"_op_type":"index", "_index" : "test", "_type" : "type1", "_id" : "1",
         "_source":{"field1":"value1", "field2":"value2"}}
        {"_op_type":"delete", "_index" : "test", "_type" : "type1", "_id" : "2"}
        '''
        index_, doc_type_ = self._resolve(index, doc_type)
        allow_op = ['index', 'delete']
        if op_type not in allow_op:
            raise exceptions.RequestError(400, '{"msg":"op_type is not allowed, you can use index or delete"}')
        actions = []
        for doc in docs:
            action = {}
            action["_index"] = index_
            action["_type"] = doc_type_
            action["_id"] = doc["_id"]
            if op_type == 'index':
                # _id lives in the action metadata, not the source document.
                del doc["_id"]
                action["_source"] = doc
            action["_op_type"] = op_type
            actions.append(action)
        return helpers.parallel_bulk(self.es, actions)

    def getDoc(self, doc_id, index="", doc_type=""):
        """Fetch one document by id."""
        index_, doc_type_ = self._resolve(index, doc_type)
        return self.es.get(index=index_, doc_type=doc_type_, id=doc_id)

    def putMapping(self, body, index="", doc_type=""):
        """Install a type mapping."""
        index_, doc_type_ = self._resolve(index, doc_type)
        return self.es.indices.put_mapping(index=index_, doc_type=doc_type_, body=body)

    def create(self, body=None, index="", timeout=30):
        """Create the index.

        Bug fix: body={} was a shared mutable default; a None sentinel is used.
        NOTE(review): *timeout* was and is unused — confirm before wiring it up.
        """
        index_ = self._index if index == "" else index
        return self.es.indices.create(index_, body=body if body is not None else {})
class ProjectDB(BaseProjectDB):
    # Elasticsearch-backed implementation of the project database.
    __type__ = 'project'

    def __init__(self, hosts, index='pyspider'):
        self.index = index
        self.es = Elasticsearch(hosts=hosts)
        # ignore=400: the index may already exist.
        self.es.indices.create(index=self.index, ignore=400)
        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):
            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={
                "_all": {"enabled": False},
                "properties": {
                    "updatetime": {"type": "double"}
                }
            })

    def insert(self, name, obj={}):
        # obj={} mutable default is safe here: obj is copied before mutation.
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()
        obj.setdefault('group', '')
        obj.setdefault('status', 'TODO')
        obj.setdefault('script', '')
        obj.setdefault('comments', '')
        obj.setdefault('rate', 0)
        obj.setdefault('burst', 0)
        # refresh=True makes the project immediately visible to readers.
        return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name, refresh=True)

    def update(self, name, obj={}, **kwargs):
        # Merge kwargs over obj and stamp the update time; ignore=404 tolerates
        # a missing project.
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self.es.update(index=self.index, doc_type=self.__type__, body={'doc': obj}, id=name, refresh=True, ignore=404)

    def get_all(self, fields=None):
        # Stream every project, fetching only the requested _source fields.
        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, query={'query': {"match_all": {}}}, _source_include=fields or []):
            yield record['_source']

    def get(self, name, fields=None):
        # None when the project does not exist (ignore=404 + .get fallback).
        ret = self.es.get(index=self.index, doc_type=self.__type__, id=name, _source_include=fields or [], ignore=404)
        return ret.get('_source', None)

    def check_update(self, timestamp, fields=None):
        # Yield projects modified at or after *timestamp*.
        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, query={'query': {"range": {
            "updatetime": {"gte": timestamp}
        }}}, _source_include=fields or []):
            yield record['_source']

    def drop(self, name):
        return self.es.delete(index=self.index, doc_type=self.__type__, id=name, refresh=True)
class Search():
    """Wrapper around a single Elasticsearch index: recreates the index on
    construction and exposes index/map/search/get/delete helpers."""

    def __init__(self, index_name):
        super().__init__()
        self.logger = logging.getLogger(__name__)
        self.__es = Elasticsearch(['pulsing.jhk.org:9200'], sniff_on_start=True)
        self.__index_name = index_name
        # Start from a clean slate: drop any existing index with this name.
        if self.__es.indices.exists(self.__index_name):
            self.logger.debug('index exists so deleting ' + self.__index_name)
            self.__es.indices.delete(self.__index_name)
        self.__es.indices.create(self.__index_name)
        self.__es.cluster.health(wait_for_status='yellow')

    def index(self, type_name, id_value, content):
        """Index *content* under type/id."""
        self.logger.debug('index %s/%s : %s', type_name, id_value, content)
        self.__es.index(index=self.__index_name, doc_type=type_name, id=id_value, body=content)

    def map(self, type_name, mapping):
        """Install a type mapping."""
        self.logger.debug('map %s', type_name)
        self.__es.indices.put_mapping(index=self.__index_name, doc_type=type_name,
                                      body={type_name: mapping})

    def search(self, type_name, query={'match_all': {}}):
        """Run *query* against one type. The shared default dict is never
        mutated here, so the mutable-default is benign."""
        self.logger.debug('search %s : %s', type_name, query)
        return self.__es.search(index=self.__index_name, doc_type=type_name,
                                body={'query': query})

    def get(self, type_name, id_value):
        """Fetch one document by type/id."""
        self.logger.debug('get %s/%s', type_name, id_value)
        document = self.__es.get(index=self.__index_name, doc_type=type_name, id=id_value)
        # Bug fix: 'got document ' + document raised TypeError (str + dict);
        # use lazy %s formatting instead.
        self.logger.debug('got document %s', document)
        return document

    def delete(self, type_name, id_value):
        """Remove one document by type/id."""
        self.logger.debug('delete %s/%s', type_name, id_value)
        self.__es.delete(index=self.__index_name, doc_type=type_name, id=id_value)

    def optimize(self):
        """
        forcemerge allows removal of deleted documents and reducing the number
        of segments (documents are marked as tombstone [like cassandra] but not
        purged from the segment's index for performance reasons)
        """
        self.logger.debug('optimize')
        self.__es.forcemerge(self.__index_name)

    @property
    def es(self):
        return self.__es

    def __eq__(self, other):
        return self.__es == other.__es

    def __str__(self):
        return self.__es.__str__()

    def __hash__(self):
        return self.__es.__hash__()
def getNote(tag):
    """Return note *tag* as JSON with newlines rendered as ``<br />`` and
    tabs collapsed to single spaces."""
    client = Elasticsearch(['http://159.203.66.191:9200'])
    doc = client.get(index="brahman", doc_type='note', id=tag)
    body = doc['_source']['body']
    body = body.replace("\n", "<br />")
    note = {"body": body.replace("\t", " ")}
    return jsonify(note)
class Elastic_Search:
    """Convenience wrapper around an Elasticsearch cluster.

    Defaults to a localhost node; when both an index and an AWS secret id
    are supplied, credentials are pulled from AWS Secrets Manager and an
    HTTPS cloud connection is used instead.
    """

    def __init__(self, index='iis-logs-', aws_secret_id=None):
        self.timestamp = datetime.datetime.utcnow()
        self.index = index
        # BUG FIX: this setup call was duplicated back-to-back, creating a
        # second throwaway client; one call is sufficient.
        self._setup_Elastic_on_localhost()  # default to localhost
        self._result = None
        if index and aws_secret_id:
            self._setup_Elastic_on_cloud_via_AWS_Secret(index, aws_secret_id)

    def _setup_Elastic_on_localhost(self):
        self.host = 'localhost'
        self.port = 9200
        self.scheme = 'http'
        self.es = Elasticsearch([{'host': self.host, 'port': self.port}])

    def _setup_Elastic_on_cloud_via_AWS_Secret(self, index, secret_id):
        """Resolve connection credentials from an AWS Secret and connect."""
        credentials = json.loads(Secrets(secret_id).value())
        self.host = credentials['host']
        self.username = credentials['username']
        self.password = credentials['password']
        self.port = credentials['port']
        self.index = index
        self._setup_Elastic_on_cloud(self.host, self.port, self.username,
                                     self.password)
        return self

    def _setup_Elastic_on_cloud(self, host, port, username, password):
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.scheme = 'https'
        self.es = Elasticsearch([host], http_auth=(username, password),
                                scheme="https", port=port)
        return self

    def add_data_with_timestamp(self, data):
        # Stamp the document with the wrapper's creation time.
        data["@timestamp"] = self.timestamp
        return self.es.index(index=self.index, doc_type='item', body=data)

    def add(self, data, id_key=None):
        """Index one document; id_key (if given) names the field used as _id."""
        try:
            if id_key is not None:
                return self.es.index(index=self.index, doc_type='item',
                                     body=data, id=data[id_key])
            return self.es.index(index=self.index, doc_type='item', body=data)
        except Exception as error:
            print("elk-error", error)
            return {"elk-error": "{0}".format(error)}

    def add_bulk(self, data, id_key=None, pipeline=None):
        """Bulk-index documents; returns the number successfully indexed."""
        ok = 0
        if data:
            actions = []
            for item in data:
                item_data = {
                    "_index": self.index,
                    "_type": 'item',
                    "_source": item,
                }
                if id_key is not None:
                    item_data["_id"] = item[id_key]
                actions.append(item_data)
            if pipeline is None:
                ok, _ = helpers.bulk(self.es, actions, index=self.index)
            else:
                ok, _ = helpers.bulk(self.es, actions, index=self.index,
                                     pipeline=pipeline)
        return ok

    def create_index(self, body=None):
        """Create the index (no-op if it already exists)."""
        # BUG FIX: 'body={}' was a shared mutable default; use a sentinel.
        if self.exists() is False:
            self._result = self.es.indices.create(index=self.index,
                                                  body=body or {})
        return self

    def create_index_with_location_geo_point(self, field="location"):
        """Create the index with a geo_point mapping on the given field."""
        body = {
            "mappings": {
                "item": {
                    "properties": {
                        field: {"type": "geo_point"}
                    }
                }
            }
        }
        self.create_index(body)
        return self

    def create_index_pattern(self, add_time_field=True):
        """Register a Kibana index pattern for this index."""
        if add_time_field:
            payload = {
                "type": "index-pattern",
                "index-pattern": {
                    "title": self.index + '*',
                    "timeFieldName": "date"
                }
            }
        else:
            print('creating index without index pattern')
            payload = {
                "type": "index-pattern",
                "index-pattern": {
                    "title": self.index + '*'
                }
            }
        data = json.dumps(payload)
        headers = {'Content-Type': 'application/json'}
        if self.host == 'localhost':
            url = 'http://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                self.host, self.port, self.index)
            self._result = json.loads(PUT(url, data, headers))
        else:
            url = 'https://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                self.host, self.port, self.index)
            response = requests.put(url, data, headers=headers,
                                    auth=HTTPBasicAuth(self.username,
                                                       self.password))
            self._result = json.loads(response.text)
        return self

    def delete_index_pattern(self):
        """Remove the Kibana index pattern (best-effort; errors captured
        into self._result rather than raised)."""
        try:
            if self.host == 'localhost':
                url = 'http://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                    self.host, self.port, self.index)
                self._result = json.loads(DELETE(url))
            else:
                url = 'https://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                    self.host, self.port, self.index)
                response = requests.delete(url,
                                           auth=HTTPBasicAuth(self.username,
                                                              self.password))
                self._result = json.loads(response.text)
        except Exception as error:
            self._result = {'error': error}
        return self

    def delete_data_by_id(self, id):
        return self.es.delete(index=self.index, doc_type='item', id=id)

    def get_data(self, id):
        """Fetch one document, or None if it does not exist."""
        try:
            return self.es.get(index=self.index, doc_type='item', id=id)
        except NotFoundError:
            return None

    def get_many(self, ids):
        """mget by ids; missing documents map to None."""
        data = self.es.mget(index=self.index, doc_type='item',
                            body={'ids': ids})
        results = {}
        for item in data['docs']:
            results[item['_id']] = item['_source'] if item['found'] else None
        return results

    def get_data_First_10(self):
        """Yield _source of the first (default-sized) match_all page."""
        results = self.es.search(index=self.index,
                                 body={"query": {"match_all": {}}})
        for result in results['hits']['hits']:
            yield result['_source']

    def get_index_settings(self):
        # NOTE(review): embedding credentials in the URL leaks them into
        # logs/history; prefer requests' auth= parameter.
        url = 'https://{3}:{4}@{0}:{1}/{2}/_settings'.format(
            self.host, self.port, self.index, self.username, self.password)
        return json.loads(requests.get(url).text)

    def search_using_lucene(self, query, size=10000, sort=None):
        """Lucene query-string search; yields _source of each hit.

        For query syntax see
        https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-query-string-query.html#query-string-syntax
        """
        # fix the quotes we receive from Slack
        query = query.replace('“', '"').replace('”', '"')
        results = self.es.search(index=self.index, q=query, size=size,
                                 sort=sort)
        for result in results['hits']['hits']:
            yield result['_source']

    def search_using_lucene_index_by_id(self, query, size=10000, sort=None):
        """Like search_using_lucene but returns a {_id: _source} dict."""
        # fix the quotes we receive from Slack
        query = query.replace('“', '"').replace('”', '"')
        elk_results = self.es.search(index=self.index, q=query, size=size,
                                     sort=sort)
        results = {}
        for result in elk_results['hits']['hits']:
            results[result['_id']] = result['_source']
        return results

    def search_using_lucene_sort_by_date(self, query, size=10000):
        """Like search_using_lucene but sorted date-descending, returning a
        list of {'id': ..., 'value': ...} dicts."""
        # fix the quotes we receive from Slack
        query = query.replace('“', '"').replace('”', '"')
        elk_results = self.es.search(index=self.index, q=query, size=size,
                                     sort="date:desc")
        results = []
        for result in elk_results['hits']['hits']:
            results.append({"id": result['_id'], "value": result['_source']})
        return results

    def search_using_query(self, query, size=10000):
        """Run a full query-DSL body; yields _source of each hit."""
        results = self.es.search(index=self.index, body=query, size=size)
        for result in results['hits']['hits']:
            yield result['_source']

    def search_on_field_for_value(self, field, value, size=10000):
        query = {"query": {"match": {field: {"query": value}}}}
        return self.search_using_query(query, size=size)

    def search_on_field_for_values(self, field, values):
        query = {
            "query": {
                "constant_score": {
                    "filter": {
                        "terms": {
                            field: values
                        }
                    }
                }
            }
        }
        return self.search_using_query(query)

    def set_index_settings(self, settings):
        headers = {'Content-Type': 'application/json'}
        url = 'https://{0}:{1}/{2}/_settings'.format(self.host, self.port,
                                                     self.index)
        response = requests.put(url, json.dumps(settings), headers=headers,
                                auth=HTTPBasicAuth(self.username,
                                                   self.password))
        return response.text

    def set_index_settings_total_fields(self, value):
        self.set_index_settings({"index.mapping.total_fields.limit": value})
        return self

    def delete_using_query(self, query):
        return self.es.delete_by_query(index=self.index, body=query)

    def delete_index(self):
        if self.exists():
            self._result = self.es.indices.delete(self.index)
        return self

    def index_list(self):
        return set(self.es.indices.get_alias())

    def exists(self):
        return self.es.indices.exists(self.index)

    def set_index(self, index):
        self.index = index
        return self
class ElasticClient:
    """Collaborative-filtering store keeping user->movies ('users' index,
    field 'ratings') and movie->users ('movies' index, field 'whoRated')
    relations in Elasticsearch."""

    def __init__(self, address='localhost:10000'):
        self.es = Elasticsearch(address)

    # ------ Simple operations ------
    def index_documents(self):
        """Load the ratings dump, mean-center each user's ratings, and
        bulk-index the liked-movies / liked-by lists."""
        df = pd \
            .read_csv('data/user_ratedmovies.dat', delimiter='\t', nrows=100000) \
            .loc[:, ['userID', 'movieID', 'rating']]
        means = df.groupby(['userID'], as_index=False, sort=False) \
            .mean() \
            .loc[:, ['userID', 'rating']] \
            .rename(columns={'rating': 'ratingMean'})
        df = pd.merge(df, means, on='userID', how="left", sort=False)
        # Center ratings on each user's mean so "liked" == positive value.
        df['ratingNormal'] = df['rating'] - df['ratingMean']
        ratings = df.loc[:, ['userID', 'movieID', 'ratingNormal']] \
            .rename(columns={'ratingNormal': 'rating'}) \
            .pivot_table(index='userID', columns='movieID', values='rating') \
            .fillna(0)
        print("Indexing users...")
        index_users = [{
            "_index": "users",
            "_type": "user",
            "_id": index,
            "_source": {
                'ratings': row[row > 0]
                .sort_values(ascending=False)
                .index.values.tolist()
            }
        } for index, row in ratings.iterrows()]
        helpers.bulk(self.es, index_users)
        print("Done")
        print("Indexing movies...")
        index_movies = [{
            "_index": "movies",
            "_type": "movie",
            "_id": column,
            "_source": {
                "whoRated": ratings[column][ratings[column] > 0]
                .sort_values(ascending=False)
                .index.values.tolist()
            }
        } for column in ratings]
        helpers.bulk(self.es, index_movies)
        print("Done")

    def get_movies_liked_by_user(self, user_id, index='users'):
        user_id = int(user_id)
        return self.es.get(index=index, doc_type="user", id=user_id)["_source"]

    def get_users_that_like_movie(self, movie_id, index='movies'):
        movie_id = int(movie_id)
        return self.es.get(index=index, doc_type="movie", id=movie_id)["_source"]

    def get_preselection_for_user(self, user_id, index='users'):
        """Movies liked by users with overlapping taste, excluding the ones
        this user already likes."""
        user_id = int(user_id)
        movies_liked = self.es.search(
            index=index, body={"query": {"term": {"_id": user_id}}}
        )["hits"]["hits"][0]["_source"]["ratings"]
        users_with_similar_taste = self.es.search(
            index=index, body={"query": {"terms": {"ratings": movies_liked}}}
        )["hits"]["hits"]
        recommended_set = set()
        for hit in users_with_similar_taste:
            if hit["_id"] != user_id:
                for movie in hit["_source"]["ratings"]:
                    if movie not in movies_liked:
                        recommended_set.add(movie)
        return list(recommended_set)

    def get_preselection_for_movie(self, movie_id, index='movies'):
        """Candidates related to movie_id via a shared audience."""
        movie_id = int(movie_id)
        users_liking = self.es.search(
            index=index, body={"query": {"term": {"_id": movie_id}}}
        )["hits"]["hits"][0]["_source"]["whoRated"]
        movies_liked_by_the_same_people = self.es.search(
            index=index, body={"query": {"terms": {"whoRated": users_liking}}}
        )["hits"]["hits"]
        recommended_set = set()
        for hit in movies_liked_by_the_same_people:
            if hit["_id"] != movie_id:
                for entry in hit["_source"]["whoRated"]:
                    if entry not in users_liking:
                        recommended_set.add(entry)
        return list(recommended_set)

    def add_user_document(self, user_id, movies_liked,
                          user_index='users', movie_index='movies'):
        """Create a user doc and add the user to each liked movie's list."""
        user_id = int(user_id)
        self.es.index(index=user_index, doc_type='user', id=user_id,
                      body={"ratings": movies_liked})
        for e in movies_liked:
            temp = list(
                self.get_users_that_like_movie(e, movie_index)["whoRated"])
            temp.append(user_id)
            self.update_movie_document(int(e), temp, movie_index)

    def add_movie_document(self, movie_id, users_liking,
                           movie_index='movies', user_index='users'):
        """Create a movie doc and add the movie to each rater's list."""
        movie_id = int(movie_id)
        self.es.index(index=movie_index, doc_type='movie', id=movie_id,
                      body={"whoRated": users_liking})
        for e in users_liking:
            temp = list(
                self.get_movies_liked_by_user(e, user_index)["ratings"])
            temp.append(movie_id)
            self.update_user_document(int(e), temp, user_index)

    def update_user_document(self, user_id, movies_liked, user_index='users'):
        user_id = int(user_id)
        self.es.index(index=user_index, doc_type='user', id=user_id,
                      body={"ratings": movies_liked})

    def update_movie_document(self, movie_id, users_liking,
                              movie_index='movies'):
        movie_id = int(movie_id)
        self.es.index(index=movie_index, doc_type='movie', id=movie_id,
                      body={"whoRated": users_liking})

    def bulk_user_update(self, body, user_index):
        """Replace each user's liked-movies list, keeping the reverse
        movie->users lists in sync."""
        for e in body:
            user_id = int(e["user_id"])
            movies_liked_before = self.get_movies_liked_by_user(
                user_id, user_index)["ratings"]
            for movie in list(movies_liked_before):
                temp = list(self.get_users_that_like_movie(movie)["whoRated"])
                if user_id in temp:
                    temp.remove(user_id)
                self.update_movie_document(int(movie), temp)
            self.es.index(index=user_index, doc_type='user', id=user_id,
                          body={"ratings": e["liked_movies"]})
            movies_liked_now = list(e["liked_movies"])
            for movie in list(movies_liked_now):
                temp = list(self.get_users_that_like_movie(movie)["whoRated"])
                temp.append(user_id)
                self.update_movie_document(int(movie), temp)

    def bulk_movie_update(self, body, movie_index):
        """Replace each movie's raters list, keeping the reverse
        user->movies lists in sync.

        BUG FIX: this method now mirrors bulk_user_update correctly —
        movie docs are written under "whoRated" (not "ratings"), the
        per-user lists are read via get_movies_liked_by_user (not
        get_users_that_like_movie), and the USER documents are updated
        (previously it overwrote movie documents with user ids).
        """
        for e in body:
            movie_id = int(e["movie_id"])
            users_liking_before = self.get_users_that_like_movie(
                movie_id, movie_index)["whoRated"]
            for user in list(users_liking_before):
                temp = list(self.get_movies_liked_by_user(user)["ratings"])
                if movie_id in temp:
                    temp.remove(movie_id)
                self.update_user_document(int(user), temp)
            self.es.index(index=movie_index, doc_type='movie', id=movie_id,
                          body={"whoRated": e["users_who_liked_movie"]})
            users_liking_now = list(e["users_who_liked_movie"])
            for user in list(users_liking_now):
                temp = list(self.get_movies_liked_by_user(user)["ratings"])
                temp.append(movie_id)
                self.update_user_document(int(user), temp)

    def delete_user_document(self, user_id, user_index, movie_index='movies'):
        """Delete a user doc and remove the user from all movie lists."""
        user_id = int(user_id)
        movies_liked = self.get_movies_liked_by_user(user_id,
                                                     user_index)["ratings"]
        self.es.delete(index=user_index, doc_type="user", id=user_id)
        for e in list(movies_liked):
            temp = list(
                self.get_users_that_like_movie(e, movie_index)["whoRated"])
            if user_id in temp:
                temp.remove(user_id)
            self.update_movie_document(int(e), temp)

    def delete_movie_document(self, movie_id, movie_index, user_index='users'):
        """Delete a movie doc and remove the movie from all user lists."""
        movie_id = int(movie_id)
        users_liking = self.get_users_that_like_movie(movie_id,
                                                      movie_index)["whoRated"]
        self.es.delete(index=movie_index, doc_type="movie", id=movie_id)
        for e in list(users_liking):
            temp = list(
                self.get_movies_liked_by_user(e, user_index)["ratings"])
            if movie_id in temp:
                temp.remove(movie_id)
            self.update_user_document(int(e), temp)

    def create_index(self, index):
        self.es.indices.create(index=index,
                               body={
                                   "settings": {
                                       "number_of_shards": 5,
                                       "number_of_replicas": 1
                                   }
                               })

    def get_indexes(self):
        return self.es.indices.get_alias()

    def reindex(self, old_index, new_index):
        helpers.reindex(self.es, source_index=old_index,
                        target_index=new_index)

    def delete_index(self, index):
        self.es.indices.delete(index=index, ignore=[400, 404])
class ElasticSearchClass(object):
    """Authenticated Elasticsearch wrapper with basic CRUD/search helpers."""

    def __init__(self, host, port, user, passwrod):
        # NOTE(review): 'passwrod' is a typo, but it is kept so existing
        # keyword-argument callers are not broken.
        self.host = host
        self.port = port
        self.user = user
        self.password = passwrod
        self.connect()

    def connect(self):
        """Build the HTTP-auth client and cache it on self.es."""
        self.es = Elasticsearch(hosts=[{
            'host': self.host,
            'port': self.port
        }], http_auth=(self.user, self.password))
        return self.es

    def insertDocument(self, index, type, body, id=None):
        """Insert one document (body) into index/type.

        :param index: target index name
        :param type: target document type
        :param body: document to insert (dict)
        :param id: optional custom id; Elasticsearch generates one if omitted
        """
        return self.es.index(index=index, doc_type=type, body=body, id=id)

    def count(self, indexname):
        """Return the total document count for an index."""
        # BUG FIX: this used self.conn, which is never defined on this
        # class (the client lives on self.es), so it raised AttributeError.
        return self.es.count(index=indexname)

    def delete(self, indexname, doc_type, id):
        """Delete one specific document from the index."""
        self.es.delete(index=indexname, doc_type=doc_type, id=id)

    def get(self, doc_type, indexname, id):
        return self.es.get(index=indexname, doc_type=doc_type, id=id)

    def searchindex(self, index):
        """Search all documents in the index; prints and swallows errors."""
        try:
            return self.es.search(index=index)
        except Exception as err:
            print(err)

    def searchDoc(self, index=None, type=None, body=None):
        """Search the index for documents matching body.

        :param body: filter clause in query-DSL syntax
        """
        return self.es.search(index=index, doc_type=type, body=body)

    def search(self, index, type, body, size=10, scroll='10s'):
        """Search by index/type with scrolling; size defaults to 10 and
        must not exceed 10000."""
        return self.es.search(index=index, doc_type=type, body=body,
                              size=size, scroll=scroll)

    def scroll(self, scroll_id, scroll):
        """Continue a prior scrolling search to fetch the remaining hits."""
        return self.es.scroll(scroll_id=scroll_id, scroll=scroll)
flag += 1 for il in pages[1:]: dSet.add(il) if (flag == 1000): break if len(dSet) <= 500: baseSet = baseSet.union(dSet) else: dSet = random.sample(dSet, 500) textFile.close() print(len(baseSet)) for link in baseSet: olSet = set() res = es.get(index="hw3_crawl", doc_type='document', id=link) outLinks = set(res['_source'].get("outlinks").strip().split('\n')) for ol in outLinks: if ol in graphPages: olSet.add(ol) graphPages[link] = Page(link, 1.0, 1.0, set(), olSet) print(len(graphPages)) with open("linkgraph.txt", 'r') as textFile: for line in textFile.readlines(): plSet = set() pages = line.replace(' \n', '').replace('\n', '').split(' ') for link in baseSet: if (pages[0] == link): for p in pages[1:]: if p in graphPages:
class ElasticsearchDDL(object):
    """Index management plus a few canned span/phrase searches."""

    def __init__(self, host='localhost', port='9200'):
        # BUG FIX: the constructor ignored its host/port arguments and
        # always connected to localhost:9200; honor the parameters.
        self.es = Elasticsearch([{'host': host, 'port': port}], timeout=100)

    def createIndex(self, indexname):
        self.es.indices.create(index=indexname)

    def deleteIndex(self, indexname):
        self.es.indices.delete(index=indexname)

    def bulkInsert(self, indexname, doctype, data, no):
        """Bulk-index `data` in batches of roughly `no` documents.

        Items may be dicts (with optional 'title'/'text' keys) or bare
        strings (indexed as 'text').
        """
        datadim = len(data)
        bulk_data = []
        i = 0
        for elem in data:
            data_dict = {"id": i}
            if type(elem) is dict:
                if "title" in elem:
                    data_dict["title"] = elem["title"]
                if "text" in elem:
                    data_dict["text"] = elem["text"]
            else:
                data_dict["text"] = elem
            op_dict = {
                "index": {
                    "_index": indexname,
                    "_type": doctype,
                    "_id": data_dict["id"]
                }
            }
            bulk_data.append(op_dict)
            bulk_data.append(data_dict)
            # Flush a batch every `no` documents (and after the last one).
            if i % no == 0 or i == datadim - 1:
                self.es.bulk(index=indexname, body=bulk_data, refresh=True)
                bulk_data = []
            i += 1

    def searchByCollocation(self, indexname, w1, w2):
        """Find w1 and w2 within 6 positions of each other, in order."""
        return self.es.search(index=indexname,
                              body={
                                  "query": {
                                      "span_near": {
                                          "clauses": [{
                                              "span_term": {
                                                  "text": w1
                                              }
                                          }, {
                                              "span_term": {
                                                  "text": w2
                                              }
                                          }],
                                          "slop": 6,
                                          "in_order": True
                                      }
                                  },
                                  "highlight": {
                                      "fields": {
                                          "text": {}
                                      }
                                  }
                              },
                              size=5)

    def searchByBigram(self, indexname, bigram):
        """Phrase-match the bigram against the text field."""
        return self.es.search(index=indexname,
                              body={
                                  "query": {
                                      "multi_match": {
                                          "query": bigram,
                                          "type": "phrase",
                                          "fields": ["text"]
                                      }
                                  },
                                  "highlight": {
                                      "fields": {
                                          "text": {}
                                      }
                                  }
                              },
                              size=5)

    def selectByQuery(self, indexname, query=None):
        # BUG FIX: mutable default argument replaced with a None sentinel.
        res = self.es.search(index=indexname,
                             body={"query": {"match_all": query or {}}})
        return res

    def selectOneByID(self, indexname, doctype, id):
        """Return the _source of one document by id."""
        # BUG FIX: the result was fetched but never returned (the method
        # always returned None).
        return self.es.get(index=indexname, doc_type=doctype,
                           id=id)['_source']

    def indexExists(self, indexname):
        return self.es.indices.exists(index=indexname)
'port': 9200 }], sniff_on_start=True, # refresh nodes after a node fails to respond sniff_on_connection_fail=True, # and also every 60 seconds sniffer_timeout=60, # set sniffing request timeout to 10 seconds sniff_timeout=10) # Index some test data print("#############\nFirst data test\n#############") es.index(index='test-index', doc_type='test', id=1, body={'test': 'test'}) # Test if they are there res = es.get(index='test-index', doc_type='test', id=1) print(json.dumps(res['_source'], indent=4, sort_keys=True)) # Delete test data and try with something more interesting delete = es.delete(index='test-index', doc_type='test', id=1) print(delete) print('\n') if __debug__: print("No sleep") else: time.sleep(3) # Index some more complicated test data print("#############\nSecond data test\n#############") es.index(index='sw',
class DCIESEngine(object):
    """Log-document store on top of Elasticsearch (doc_type 'log')."""

    def __init__(self, conf, index="global", timeout=30):
        self.esindex = index
        self.conn = Elasticsearch(conf['ES_HOST'], port=conf['ES_PORT'],
                                  timeout=timeout)

    def create_index(self):
        self.conn.indices.create(index=self.esindex)

    def get(self, id, team_id=None):
        """Fetch a log by id; when team_id is given, documents belonging
        to a different team come back as an empty dict."""
        res = self.conn.get(index=self.esindex, doc_type='log', id=id)
        if team_id and res and res['_source']['team_id'] != team_id:
            res = {}
        return res

    def delete(self, id):
        self.conn.delete(index=self.esindex, doc_type='log', id=id)
        return True

    def list(self, include=None, exclude=None):
        """Return up to 10000 logs, optionally filtering _source fields.

        Returns None when the index does not exist.
        """
        query = {"size": 10000, "query": {"match_all": {}}}
        # BUG FIX: passing both include and exclude silently dropped the
        # include filter (the second assignment to query['_source']
        # overwrote the first); build a single spec honoring both.
        source = {}
        if include:
            source['include'] = include
        if exclude:
            source['exclude'] = exclude
        if source:
            query['_source'] = source
        if self.conn.indices.exists(index=self.esindex):
            return self.conn.search(index=self.esindex, body=query)
        return None

    def index(self, values):
        """Index one log document under its own 'id' field."""
        return self.conn.index(index=self.esindex, doc_type='log',
                               id=values['id'], body=values)

    def refresh(self):
        return self.conn.indices.refresh(index=self.esindex, force=True)

    def search_content(self, pattern, team_id=None):
        """Full-text search on 'content', optionally scoped to a team."""
        if team_id:
            query = {
                "query": {
                    "filtered": {
                        "filter": {
                            "match": {
                                "team_id": team_id
                            }
                        },
                        "query": {
                            "match": {
                                "content": pattern
                            }
                        }
                    }
                }
            }
        else:
            query = {"query": {"match": {"content": pattern}}}
        return self.conn.search(index=self.esindex, body=query,
                                request_cache=False, size=100)

    def cleanup(self):
        """Drop the whole index if it exists."""
        if self.conn.indices.exists(index=self.esindex):
            return self.conn.indices.delete(index=self.esindex)
d["summary"] = re.sub("<[^<]+?>", "", jo["summary"]) ldocs.append(d) # connect to elastic es = Elasticsearch([{"host": "localhost", "port": 9200}]) # iterate through documents indexing them for doc in ldocs: es.index(index="tvshows", doc_type="bigbang", id=doc["id"], body=json.dumps(doc)) # python elasticsearch get by id print("###########") print(es.get(index="tvshows", doc_type="bigbang", id=2915)) print("###########") # term search print("term search") print( es.search( index="tvshows", doc_type="bigbang", body={"query": { "match": { "summary": "rivalry" } }}, ))
+ DASHBOARD_NAME dashboard['uiStateJSON'] = '{}' dashboard['optionsJSON'] = '{"darkTheme":false}' dashboard['version'] = 1 dashboard['timeRestore'] = False dashboard['kibanaSavedObjectMeta'] = { 'searchSourceJSON': '{"filter":[{"query":{"query_string":{"query":"*","analyze_wildcard":true}}}],' '"highlightAll":true,"version":true}' } # Check if visualizations already present in dashboard. If present, don't add, else, add at end ES_ID = DASHBOARD_NAME try: res = es.get(index='.kibana', doc_type='dashboard', id=ES_ID) print(json.dumps(res, indent=4)) # No exeception occured means dashboard found dashboard_found = True except exceptions.NotFoundError as e: print('No visualizations found') dashboard_found = False except Exception as e: print(e) print('Error Occurred') vis_ids_present = set() panelsJSON = [] if dashboard_found: panelsJSON = yaml.safe_load(res['_source']['panelsJSON'])
from elasticsearch import Elasticsearch

# Fetch one news article by its document id and show its source body.
client = Elasticsearch()
doc = client.get(index="trec_news", doc_type='news_articles',
                 id='00f57310e5c8ec7833d6756ba637332e')
print(doc['_source'])
class KnowledgeBase(object):
    """
    Represents interface to Knowledge Base.

    The Knowledge base uses Elasticsearch to provide primary storage and
    indexing for the articles. It leverages the default analyzer for text
    preprocessing at indexing time.

    View counts for articles are stored in Redis. Keeping them as a field
    in the ES mapping would force a reindex every time an article is
    viewed, so an in-memory key-value store is used for fast access and
    update instead.

    The class encapsulates interactions with ElasticSearch including
    initializing indices and mappings.
    """

    # ES Index and Type name.
    INDEX = 'articles'
    TYPE = 'article'

    # Path to JSON mappings for Index and Type.
    INDEX_PATH = 'mappings/index_mapping.json'
    TYPE_PATH = 'mappings/type_mapping.json'

    # Configuration items for initializing connections with databases.
    # Ideally these would be stored separately (YAML / Zookeeper / etc.).
    HOSTS = ['localhost']
    USERNAME = '******'
    PASSWORD = '******'

    def __init__(self):
        # Initialize persistent connections to ES and Redis.
        self.client = Elasticsearch(
            hosts=self.HOSTS,
            http_auth=(self.USERNAME, self.PASSWORD),
        )
        self.redis = Redis(
            host=self.HOSTS[0],
        )

    def search(self, query_text, locale=None, fields=None):
        """
        Return relevant articles given search text.

        Finding the query term in the title is weighted twice as much as
        finding it in the body. The most relevant hits are then re-ranked
        by the Ranker module (currently view counts).

        Args:
            query_text(str): Text to be searched.
            locale(str): String to filter results by location.
            fields(list(str)): If specified, restrict the fields returned.

        Returns:
            list[dict]: Ranked article dicts with id/title/body/locale keys.
        """
        s = Search(
            using=self.client,
            index=self.INDEX,
            doc_type=self.TYPE
        ).query(
            'multi_match',
            query=query_text,
            fields=['title^2', 'body']
        )
        # If locale is provided, use it to filter the candidate documents.
        if locale:
            s = s.filter('term', locale=locale)
        # Restrict fields if specified.
        s = s.source(fields)
        response = s.execute()
        results, result_dict = [], {}
        for hit in response:
            article_id = hit.meta['id']
            result_dict[article_id] = hit.__dict__['_d_']
            result_dict[article_id]['id'] = article_id
            # Retrieve view count for each relevant article.
            results.append((article_id, self.redis.get(article_id)))
        # Rank results using Ranking function (sorts by view counts).
        ranked_results = Ranker.rank(results)
        return [result_dict[article_id] for article_id in ranked_results]

    def get(self, article_id, fields=None):
        """
        Return the article with the given id, or None if not found.

        Increments the article's view count on success.

        Args:
            article_id(str): Unique article ID.
            fields(list[str]): If specified, restrict the fields returned.

        Returns:
            dict | None: {'id', 'title', 'body', 'locale'} dict, or None.
        """
        try:
            response = self.client.get(
                index=self.INDEX,
                doc_type=self.TYPE,
                id=article_id,
                _source=fields,
            )
        # BUG FIX: narrowed from a bare 'except:' which also swallowed
        # KeyboardInterrupt/SystemExit.
        except Exception:
            return None
        else:
            # Increment view count for accessed article.
            self.redis.incr(article_id)
            article = response['_source']
            article['id'] = article_id
            return article

    def index(self, article, refresh=True):
        """
        Index an article and initialize its view count.

        Args:
            article(dict): Article dict following the mapping's field names.
            refresh(bool): Refresh the index so the doc is searchable now.

        Returns:
            tuple(bool, str): (True, article_id) on success,
            (False, None) otherwise.
        """
        try:
            response = self.client.index(
                index=self.INDEX,
                doc_type=self.TYPE,
                body=article,
                refresh=refresh,
            )
        except Exception:
            return False, None
        else:
            # Initialize view count for newly indexed article.
            self.redis.set(response['_id'], 0)
            return response['created'], response['_id']

    def delete(self, article_id, refresh=True):
        """
        Delete an article and its view count.

        Args:
            article_id(str): Unique article ID.
            refresh(bool): Refresh the index after the delete.

        Returns:
            bool: True if the article was deleted, False otherwise.
        """
        try:
            response = self.client.delete(
                index=self.INDEX,
                doc_type=self.TYPE,
                id=article_id,
                refresh=refresh,
            )
        except Exception:
            return False
        else:
            # Remove article key from Redis.
            self.redis.delete(article_id)
            return response['found']

    def _in_bulk(self, objects):
        """
        Helper function to facilitate bulk operations.

        Args:
            objects(list[dict]): Bulk actions, each with '_op_type'
                ('index', 'create', 'update' or 'delete'), 'body' with the
                new/updated document, and/or 'id' for deletes.
        """
        bulk(self.client, objects, index=self.INDEX)

    def _init_index(self):
        """
        Initialize the Knowledge base store: create the Elasticsearch
        index and install the article type mapping from the JSON files.

        Returns:
            tuple(bool, str): (success flag, status message).
        """
        # BUG FIX: open the mapping files via context managers so the
        # handles are closed, and target self.INDEX / self.TYPE instead of
        # repeating the string literals.
        with open(self.INDEX_PATH) as index_file:
            index_mapping = json.load(index_file)
        with open(self.TYPE_PATH) as type_file:
            type_mapping = json.load(type_file)
        try:
            self.client.indices.create(
                index=self.INDEX,
                body=index_mapping,
            )
        except Exception:
            return False, 'Failed to create Index'
        try:
            self.client.indices.put_mapping(
                index=self.INDEX,
                doc_type=self.TYPE,
                body=type_mapping,
            )
        except Exception:
            return False, 'Failed to put Mapping'
        return True, 'Successfully initialized Index'
#!/usr/bin/env python
'''
Licensed to Elasticsearch B.V under one or more agreements.
Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
See the LICENSE file in the project root for more information
'''
from elasticsearch import Elasticsearch

es = Elasticsearch()

# Divider between an example's label and its printed result.
SEPARATOR = "---------------------------------------"

print("fbcf5078a6a9e09790553804054c36b3 - L:9")
# tag::fbcf5078a6a9e09790553804054c36b3[]
response = es.get(
    index='twitter',
    id=0,
)
# end::fbcf5078a6a9e09790553804054c36b3[]
print(SEPARATOR)
print(response)
print(SEPARATOR)

print("98234499cfec70487cec5d013e976a84 - L:46")
# tag::98234499cfec70487cec5d013e976a84[]
response = es.exists(
    index='twitter',
    id=0,
)
# end::98234499cfec70487cec5d013e976a84[]
print(SEPARATOR)
print(response)
class ServerConector():
    """Thin Elasticsearch gateway; every method returns a dict carrying
    'status' ('Success'/'Error') and an HTTP-like 'status_code'."""

    def __init__(self, **kwargs):
        self.res = {'status': None, 'status_code': None}
        try:
            server = os.environ.get('ELASTIC_SERVER')
            self.el = Elasticsearch([{'host': server, 'port': 9200}])
            if not self.el.ping():
                self.res['status'] = 'Error'
                self.res['status_code'] = 503
        except Exception as e:
            self.res['status'] = 'Error'
            self.res['status_code'] = 503
            app.logger.error('Error Conecting Elasticserch')
            app.logger.error('Error detail: {}'.format(e))

    def insert_el(self, index, data):
        """Index one document; returns self.res with doc_id/status."""
        try:
            resp_dict = self.el.index(index=index, body=data, refresh=True)
            self.res['doc_id'] = resp_dict['_id']
            self.res['status'] = 'Success'
            self.res['status_code'] = 200
        except Exception as e:
            self.res['status'] = 'Error'
            if not self.res.get('status_code'):
                self.res['status_code'] = 400
            self.res['doc_id'] = None
            app.logger.error(self.res['status'])
            # BUG FIX: log message was a backslash-continued literal that
            # embedded a long run of indentation whitespace.
            app.logger.error('Error inserting document in index {}. '
                             'Error detail: {}'.format(index, e))
        return self.res

    def update_el(self, index, _id, data):
        """Partial-update a document by id."""
        try:
            self.el.update(index=index, id=_id, body={'doc': data},
                           refresh=True)
            self.res['status'] = 'Success'
            self.res['status_code'] = 200
        except Exception as e:
            self.res['status'] = 'Error'
            if not self.res.get('status_code'):
                self.res['status_code'] = 400
            app.logger.error(self.res['status'])
            app.logger.error('Error updating document in index {}. '
                             'Error detail: {}'.format(index, e))
        return self.res

    def retrieve_id_el(self, index, _id, model=None):
        """Fetch one document by id, merged over an optional model dict."""
        try:
            if not model:
                model = {}
            self.res = self.el.get(index=index, id=_id)
            model['doc_id'] = _id
            for key in self.res['_source']:
                model[key] = self.res['_source'][key]
            self.res = model
            self.res['status'] = 'Success'
            self.res['status_code'] = 200
        except Exception as e:
            self.res['status'] = 'Error'
            # BUG FIX: self.res may hold a raw ES response here (no
            # 'status_code' key) and generic exceptions have no
            # .status_code attribute; use safe lookups with a 400 fallback.
            if not self.res.get('status_code'):
                self.res['status_code'] = getattr(e, 'status_code', 400)
            app.logger.error(self.res['status'])
            app.logger.error('Error getting document in index {}. '
                             'Error detail: {}'.format(index, e))
        return self.res

    def retrieve_el(self, index, unique_hash_field, unique_hash_value,
                    model=None):
        """Exact-phrase search on one field; returns resp_list + status
        (404 status_code when nothing matched)."""
        try:
            if not model:
                model = {}
            self.res = self.el.search(index=index,
                                      body={
                                          'query': {
                                              'match_phrase': {
                                                  unique_hash_field:
                                                  unique_hash_value
                                              }
                                          }
                                      })
            self.res['status'] = 'Success'
            self.res['status_code'] = 200
            self.res['resp_list'] = []
            for doc in self.res['hits']['hits']:
                resp_dict = model.copy()
                for key in doc['_source']:
                    resp_dict[key] = doc['_source'][key]
                resp_dict['doc_id'] = doc['_id']
                self.res['resp_list'].append(resp_dict)
            if not self.res['resp_list']:
                self.res['status_code'] = 404
            return {
                'resp_list': self.res['resp_list'],
                'status': self.res['status'],
                'status_code': self.res['status_code']
            }
        except Exception as e:
            self.res['status'] = 'Error'
            # BUG FIX: same safe status_code handling as retrieve_id_el.
            if not self.res.get('status_code'):
                self.res['status_code'] = getattr(e, 'status_code', 400)
            app.logger.error(self.res['status'])
            app.logger.error('Error getting document in index {}. '
                             'Error detail: {}'.format(index, e))
        return self.res

    def retrieve_all_el(self, index):
        """match_all search; returns resp_list + status (404 when empty)."""
        try:
            self.res = self.el.search(index=index,
                                      body={'query': {'match_all': {}}})
            self.res['status'] = 'Success'
            self.res['status_code'] = 200
            self.res['resp_list'] = []
            for doc in self.res['hits']['hits']:
                resp_dict = {'doc_id': doc['_id']}
                for key in doc['_source']:
                    resp_dict[key] = doc['_source'][key]
                self.res['resp_list'].append(resp_dict)
            if not self.res['resp_list']:
                self.res['status_code'] = 404
            return {
                'resp_list': self.res['resp_list'],
                'status': self.res['status'],
                'status_code': self.res['status_code']
            }
        except Exception as e:
            self.res['status'] = 'Error'
            # BUG FIX: same safe status_code handling as retrieve_id_el.
            if not self.res.get('status_code'):
                self.res['status_code'] = getattr(e, 'status_code', 400)
            app.logger.error(self.res['status'])
            app.logger.error('Error getting document in index {}. '
                             'Error detail: {}'.format(index, e))
        return self.res
class SearchEngine(object):
    """Prefix-aware wrapper around the Elasticsearch client.

    Every index name passed to the public methods is transparently
    prefixed with ``<prefix>_`` (see _add_prefix) so multiple
    deployments can share a single cluster.
    """

    def __init__(self, prefix=settings.ELASTICSEARCH_PREFIX):
        # Custom serializer so project types round-trip through the
        # client (JSONSerializer/JSONDeserializer come from the project).
        serializer = JSONSerializer()
        serializer.mimetype = 'application/json'
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS,
                                serializer=serializer,
                                **settings.ELASTICSEARCH_CONNECTION_OPTIONS)
        self.logger = logging.getLogger(__name__)
        self.prefix = prefix.lower()

    def _add_prefix(self, *args, **kwargs):
        """Prepend the configured prefix to each comma-separated index name.

        Called positionally, returns the prefixed index string; called
        with keyword args, returns the kwargs dict with ``index``
        replaced.  Raises NotImplementedError when no index was given.
        """
        if args:
            index = args[0].strip()
        else:
            index = kwargs.get('index', '').strip()
        if index is None or index == '':
            raise NotImplementedError("Elasticsearch index not specified.")
        prefix = '%s_' % self.prefix.strip(
        ) if self.prefix and self.prefix.strip() != '' else ''
        ret = []
        for idx in index.split(','):
            ret.append('%s%s' % (prefix, idx))
        index = ','.join(ret)
        if args:
            return index
        else:
            return dict(kwargs, index=index)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index and id to delete a specific document
        Pass a body with a query dsl to delete by query
        """
        kwargs = self._add_prefix(**kwargs)
        kwargs['doc_type'] = kwargs.pop('doc_type', '_doc')
        body = kwargs.pop('body', None)
        if body is not None:
            try:
                data = []
                refresh = kwargs.pop('refresh', False)
                # Collect every matching hit and turn it into a bulk
                # delete operation.
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit['_op_type'] = 'delete'
                    data.append(hit)
                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                # Ignore 404 (index_not_found_exception) only.
                # BUG FIX: the previous code silently swallowed *every*
                # error that carried a status_code; now only 404 is
                # ignored and anything else is logged and re-raised.
                if getattr(detail, 'status_code', None) == 404:
                    return None
                self.logger.warning(
                    '%s: WARNING: failed to delete document by query: %s \nException detail: %s\n'
                    % (datetime.now(), body, detail))
                raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to delete document: %s \nException detail: %s\n'
                    % (datetime.now(), body, detail))
                raise detail

    def delete_index(self, **kwargs):
        """
        Deletes an entire index
        """
        kwargs = self._add_prefix(**kwargs)
        # BUG FIX: was a Python 2 print statement (syntax error on py3).
        print('deleting index : %s' % kwargs.get('index'))
        return self.es.indices.delete(ignore=[400, 404], **kwargs)

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index and id to get a specific document
        Pass a body with a query dsl to perform a search
        """
        kwargs = self._add_prefix(**kwargs)
        kwargs['doc_type'] = kwargs.pop('doc_type', '_doc')
        body = kwargs.get('body', None)
        id = kwargs.get('id', None)
        if id:
            if isinstance(id, list):
                # Multi-get: the id list becomes the mget body.
                kwargs.setdefault('body', {'ids': kwargs.pop('id')})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)
        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            # Search failures are logged and None is returned (best effort).
            self.logger.warning(
                '%s: WARNING: search failed for query: %s \nException detail: %s\n'
                % (datetime.now(), body, detail))
        return ret

    def create_mapping(self, index, fieldname='', fieldtype='string',
                       fieldindex=None, body=None):
        """
        Creates an Elasticsearch body for a single field given an index
        name and type name
        """
        index = self._add_prefix(index)
        if not body:
            if fieldtype == 'geo_shape':
                body = {
                    '_doc': {
                        'properties': {
                            fieldname: {
                                'type': 'geo_shape',
                                'tree': 'geohash',
                                'precision': '1m'
                            }
                        }
                    }
                }
            else:
                fn = {'type': fieldtype}
                if fieldindex:
                    fn['index'] = fieldindex
                body = {'_doc': {'properties': {fieldname: fn}}}
        # ignore=400: index may already exist.
        self.es.indices.create(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type='_doc', body=body)
        # BUG FIX: was a Python 2 print statement (syntax error on py3).
        print('creating index : %s' % (index))

    def create_index(self, **kwargs):
        """Create an index (ignoring 'already exists' errors)."""
        kwargs = self._add_prefix(**kwargs)
        self.es.indices.create(ignore=400, **kwargs)
        # BUG FIX: was a Python 2 print statement (syntax error on py3).
        print('creating index : %s' % kwargs.get('index', ''))

    def index_data(self, index=None, body=None, idfield=None, id=None,
                   **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the
        document itself and use the value found for the id of the document
        """
        index = self._add_prefix(index)
        if not isinstance(body, list):
            body = [body]
        for document in body:
            if idfield is not None:
                # Pull the id out of the document itself.
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document, idfield)
            try:
                self.es.index(index=index, doc_type='_doc', body=document,
                              id=id)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to index document: %s \nException detail: %s\n'
                    % (datetime.now(), document, detail))
                raise detail

    def bulk_index(self, data, **kwargs):
        """Send pre-built bulk operations straight to the helpers API."""
        return helpers.bulk(self.es, data, **kwargs)

    def create_bulk_item(self, op_type='index', index=None, id=None,
                         data=None):
        """Build one bulk-API operation dict with the prefixed index name."""
        return {
            '_op_type': op_type,
            '_index': self._add_prefix(index),
            '_type': '_doc',
            '_id': id,
            '_source': data
        }

    def count(self, **kwargs):
        """Return the number of documents matching the (optional) query."""
        kwargs = self._add_prefix(**kwargs)
        kwargs['doc_type'] = kwargs.pop('doc_type', '_doc')
        body = kwargs.pop('body', None)
        # need to only pass in the query key as other keys (eg: _source)
        # are not allowed by the count API
        if body:
            query = body.pop('query', None)
            if query:
                kwargs['body'] = {'query': query}
        count = self.es.count(**kwargs)
        if count is not None:
            return count['count']
        else:
            return None

    def BulkIndexer(outer_self, batch_size=500, **kwargs):
        """Return a context-manager that batches bulk operations.

        Operations queued via add() are flushed every ``batch_size``
        items and once more on close()/context exit.
        """

        class _BulkIndexer(object):
            def __init__(self, **kwargs):
                self.queue = []
                self.batch_size = kwargs.pop('batch_size', 500)
                self.kwargs = kwargs

            def add(self, op_type='index', index=None, id=None, data=None):
                doc = {
                    '_op_type': op_type,
                    '_index': outer_self._add_prefix(index),
                    '_type': '_doc',
                    '_id': id,
                    '_source': data
                }
                self.queue.append(doc)
                if len(self.queue) >= self.batch_size:
                    outer_self.bulk_index(self.queue, **self.kwargs)
                    del self.queue[:]  # clear out the array

            def close(self):
                # Flush whatever is left in the queue.
                outer_self.bulk_index(self.queue, **self.kwargs)

            def __enter__(self, **kwargs):
                return self

            def __exit__(self, type, value, traceback):
                return self.close()

        return _BulkIndexer(batch_size=batch_size, **kwargs)
DEBUG = True if __name__ == '__main__' else False faker = Factory.create() es = Elasticsearch() def get_name(): return { 'name': faker.name(), 'email': faker.email(), 'address': faker.address(), 'timestamp': dt.now(), } @test_speed def insert_all(max_records): for n in range(max_records): res = es.index(index='testing_index', doc_type='test', id=n, body=get_name()) print(res) if DEBUG: with Section('ElasticSearch (via ElasticSearch-py)'): insert_all(10) res = es.get(index='testing_index', doc_type='test', id=1) prnt('ES Results:', res)
print('start:', start) many = 0 count = 0 fail = 0 while True: if (many < 1): sval = input('How many messages:') if (len(sval) < 1): break many = int(sval) start = start + 1 # Skip rows that are already retrieved try: res = es.get(index='gmane', doc_type='message', id=start) print(res) continue except: pass many = many - 1 url = baseurl + str(start) + '/' + str(start + 1) text = 'None' try: # Open with a timeout of 30 seconds response = requests.get(url) text = response.text status = response.status_code if status != 200:
class ElasticDocRanker(object):
    """ Connect to an ElasticSearch index.
        Score pairs based on Elasticsearch
    """

    def __init__(self, elastic_url=None, elastic_index=None,
                 elastic_fields=None, elastic_field_doc_name=None,
                 strict=True, elastic_field_content=None):
        """
        Args:
            elastic_url: URL of the ElasticSearch server containing port
            elastic_index: Index name of ElasticSearch
            elastic_fields: Fields of the Elasticsearch index to search in
            elastic_field_doc_name: Field containing the name of the document (index)
            strict: fail on empty queries or continue (and return empty result)
            elastic_field_content: Field containing the content of document in plaint text
        """
        # Load from disk
        elastic_url = elastic_url or DEFAULTS['elastic_url']
        logger.info('Connecting to %s' % elastic_url)
        self.es = Elasticsearch(hosts=elastic_url)
        self.elastic_index = elastic_index
        self.elastic_fields = elastic_fields
        self.elastic_field_doc_name = elastic_field_doc_name
        self.elastic_field_content = elastic_field_content
        self.strict = strict

    # Elastic Ranker

    def get_doc_index(self, doc_id):
        """Convert doc_id --> doc_index"""
        field_index = self.elastic_field_doc_name
        # A nested field is addressed with dotted notation in the query.
        if isinstance(field_index, list):
            field_index = '.'.join(field_index)
        result = self.es.search(
            index=self.elastic_index,
            body={'query': {
                'match': {
                    field_index: doc_id
                }
            }})
        return result['hits']['hits'][0]['_id']

    def get_doc_id(self, doc_index):
        """Convert doc_index --> doc_id"""
        result = self.es.search(index=self.elastic_index,
                                body={'query': {
                                    'match': {
                                        "_id": doc_index
                                    }
                                }})
        source = result['hits']['hits'][0]['_source']
        return utils.get_field(source, self.elastic_field_doc_name)

    def closest_docs(self, query, k=1):
        """Closest docs by using ElasticSearch

        Returns (doc_ids, doc_scores) for the top-k multi_match hits.
        """
        results = self.es.search(index=self.elastic_index,
                                 body={
                                     'size': k,
                                     'query': {
                                         'multi_match': {
                                             'query': query,
                                             'type': 'most_fields',
                                             'fields': self.elastic_fields
                                         }
                                     }
                                 })
        hits = results['hits']['hits']
        doc_ids = [
            utils.get_field(row['_source'], self.elastic_field_doc_name)
            for row in hits
        ]
        doc_scores = [row['_score'] for row in hits]
        return doc_ids, doc_scores

    def batch_closest_docs(self, queries, k=1, num_workers=None):
        """Process a batch of closest_docs requests multithreaded.
        Note: we can use plain threads here as scipy is outside of the GIL.
        """
        with ThreadPool(num_workers) as threads:
            closest_docs = partial(self.closest_docs, k=k)
            results = threads.map(closest_docs, queries)
        return results

    # Elastic DB

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # BUG FIX: the class defined __enter__ but no __exit__, so using
        # it in a `with` block raised AttributeError on scope exit.
        self.close()

    def close(self):
        """Close the connection to the database."""
        self.es = None

    def get_doc_ids(self):
        """Fetch all ids of docs stored in the db."""
        results = self.es.search(index=self.elastic_index,
                                 body={"query": {
                                     "match_all": {}
                                 }})
        doc_ids = [
            utils.get_field(result['_source'], self.elastic_field_doc_name)
            for result in results['hits']['hits']
        ]
        return doc_ids

    def get_doc_text(self, doc_id):
        """Fetch the raw text of the doc for 'doc_id'."""
        idx = self.get_doc_index(doc_id)
        result = self.es.get(index=self.elastic_index, doc_type='_doc',
                             id=idx)
        return result if result is None else result['_source'][
            self.elastic_field_content]
import json ip = sys.argv[1] port = int(sys.argv[2]) # 9200 try: es = Elasticsearch("{}:{}".format(ip, port), timeout=5) # 连接Elasticsearch,延时5秒 es.indices.create(index='unauth_text') print('[+] 成功连接 :{}'.format(ip)) print('[+] {} -> 成功创建测试节点unauth_text'.format(ip)) es.index(index="unauth_text", doc_type="test-type", id=2, body={"text": "text"}) print('[+] {} -> 成功往节点unauth_text插入数据'.format(ip)) ret = es.get(index="unauth_text", doc_type="test-type", id=2) print('[+] {} -> 成功获取节点unauth_text数据 : {}'.format(ip, ret)) es.indices.delete(index='unauth_text') print('[+] {} -> 清除测试节点unauth_text数据'.format(ip)) print('[ok] {} -> 存在ElasticSearch未授权漏洞'.format(ip)) print('尝试获取节点信息:↓') text = json.loads( requests.get(url='http://{}:{}/_nodes'.format(ip, port), timeout=5).text) nodes_total = text['_nodes']['total'] nodes = list(text['nodes'].keys()) print('[ok] {} -> [{}] : {}'.format(ip, nodes_total, nodes)) except Exception as e: error = e.args
class ElasticsearchDataStore(object):
    """Implements the datastore."""

    # Number of events to queue up when bulk inserting events.
    DEFAULT_FLUSH_INTERVAL = 1000
    DEFAULT_SIZE = 100
    DEFAULT_LIMIT = DEFAULT_SIZE  # Max events to return
    DEFAULT_FROM = 0
    DEFAULT_STREAM_LIMIT = 5000  # Max events to return when streaming results

    def __init__(self, host='127.0.0.1', port=9200):
        """Create a Elasticsearch client."""
        super(ElasticsearchDataStore, self).__init__()
        self._error_container = {}
        self.client = Elasticsearch([{'host': host, 'port': port}])
        self.import_counter = Counter()
        self.import_events = []

    @staticmethod
    def _build_labels_query(sketch_id, labels):
        """Build Elasticsearch query for Timesketch labels.

        Args:
            sketch_id: Integer of sketch primary key.
            labels: List of label names.

        Returns:
            Elasticsearch query as a dictionary.
        """
        label_query = {'bool': {'must': []}}
        for label in labels:
            # Labels are nested documents; each label must match both its
            # name and the owning sketch.
            nested_query = {
                'nested': {
                    'query': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'timesketch_label.name.keyword': label
                                }
                            }, {
                                'term': {
                                    'timesketch_label.sketch_id': sketch_id
                                }
                            }]
                        }
                    },
                    'path': 'timesketch_label'
                }
            }
            label_query['bool']['must'].append(nested_query)
        return label_query

    @staticmethod
    def _build_events_query(events):
        """Build Elasticsearch query for one or more document ids.

        Args:
            events: List of Elasticsearch document IDs.

        Returns:
            Elasticsearch query as a dictionary.
        """
        events_list = [event['event_id'] for event in events]
        query_dict = {'query': {'ids': {'values': events_list}}}
        return query_dict

    def build_query(self, sketch_id, query_string, query_filter,
                    query_dsl=None, aggregations=None):
        """Build Elasticsearch DSL query.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            aggregations: Dict of Elasticsearch aggregations

        Returns:
            Elasticsearch DSL query as a dictionary
        """
        if query_dsl:
            query_dsl = json.loads(query_dsl)
            # Remove any aggregation coming from user supplied Query DSL.
            # We have no way to display this data in a good way today.
            if query_dsl.get('aggregations', None):
                del query_dsl['aggregations']
            return query_dsl

        if query_filter.get('events', None):
            events = query_filter['events']
            return self._build_events_query(events)

        query_dsl = {
            'query': {
                'bool': {
                    'must': [],
                    'must_not': [],
                    'filter': []
                }
            }
        }

        # TODO: Remove when old UI has been deprecated.
        if query_filter.get('star', None):
            label_query = self._build_labels_query(sketch_id, ['__ts_star'])
            query_string = '*'
            query_dsl['query']['bool']['must'].append(label_query)

        # TODO: Remove when old UI has been deprecated.
        if query_filter.get('time_start', None):
            query_dsl['query']['bool']['filter'] = [{
                'bool': {
                    'should': [{
                        'range': {
                            'datetime': {
                                'gte': query_filter['time_start'],
                                'lte': query_filter['time_end']
                            }
                        }
                    }]
                }
            }]

        if query_string:
            query_dsl['query']['bool']['must'].append(
                {'query_string': {
                    'query': query_string
                }})

        # New UI filters
        if query_filter.get('chips', None):
            labels = []
            must_filters = query_dsl['query']['bool']['must']
            must_not_filters = query_dsl['query']['bool']['must_not']
            datetime_ranges = {
                'bool': {
                    'should': [],
                    'minimum_should_match': 1
                }
            }

            for chip in query_filter['chips']:
                # Exclude chips that the user disabled
                if not chip.get('active', True):
                    continue

                if chip['type'] == 'label':
                    labels.append(chip['value'])

                elif chip['type'] == 'term':
                    term_filter = {
                        'match_phrase': {
                            '{}'.format(chip['field']): {
                                'query': "{}".format(chip['value'])
                            }
                        }
                    }

                    if chip['operator'] == 'must':
                        must_filters.append(term_filter)

                    elif chip['operator'] == 'must_not':
                        must_not_filters.append(term_filter)

                elif chip['type'] == 'datetime_range':
                    start = chip['value'].split(',')[0]
                    end = chip['value'].split(',')[1]
                    range_filter = {
                        'range': {
                            'datetime': {
                                'gte': start,
                                'lte': end
                            }
                        }
                    }
                    datetime_ranges['bool']['should'].append(range_filter)

            label_filter = self._build_labels_query(sketch_id, labels)
            must_filters.append(label_filter)
            must_filters.append(datetime_ranges)

        # Pagination
        if query_filter.get('from', None):
            query_dsl['from'] = query_filter['from']

        # Number of events to return
        if query_filter.get('size', None):
            query_dsl['size'] = query_filter['size']

        # Make sure we are sorting.
        if not query_dsl.get('sort', None):
            query_dsl['sort'] = {'datetime': query_filter.get('order', 'asc')}

        # Add any pre defined aggregations
        if aggregations:
            # post_filter happens after aggregation so we need to move the
            # filter to the query instead.
            if query_dsl.get('post_filter', None):
                query_dsl['query']['bool']['filter'] = query_dsl[
                    'post_filter']
                query_dsl.pop('post_filter', None)
            query_dsl['aggregations'] = aggregations

        return query_dsl

    def search(self, sketch_id, query_string, query_filter, query_dsl,
               indices, count=False, aggregations=None, return_fields=None,
               enable_scroll=False):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            count: Boolean indicating if we should only return result count
            aggregations: Dict of Elasticsearch aggregations
            return_fields: List of fields to return
            enable_scroll: If Elasticsearch scroll API should be used

        Returns:
            Set of event documents in JSON format
        """
        scroll_timeout = None
        if enable_scroll:
            scroll_timeout = '1m'  # Default to 1 minute scroll timeout

        # Exit early if we have no indices to query
        if not indices:
            return {'hits': {'hits': [], 'total': 0}, 'took': 0}

        # Check if we have specific events to fetch and get indices.
        if query_filter.get('events', None):
            indices = {
                event['index']
                for event in query_filter['events']
                if event['index'] in indices
            }

        query_dsl = self.build_query(sketch_id, query_string, query_filter,
                                     query_dsl, aggregations)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = 'query_then_fetch'

        # Only return how many documents matches the query.
        if count:
            # BUG FIX: was `del query_dsl['sort']`, which raises KeyError
            # for queries without a sort key (events queries, user DSL).
            query_dsl.pop('sort', None)
            count_result = self.client.count(body=query_dsl,
                                             index=list(indices))
            return count_result.get('count', 0)

        if not return_fields:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.search(body=query_dsl,
                                      index=list(indices),
                                      search_type=search_type,
                                      scroll=scroll_timeout)

        # The argument " _source_include" changed to "_source_includes" in
        # ES version 7. This check add support for both version 6 and 7 clients.
        # pylint: disable=unexpected-keyword-arg
        if self.version.startswith('6'):
            _search_result = self.client.search(body=query_dsl,
                                                index=list(indices),
                                                search_type=search_type,
                                                _source_include=return_fields,
                                                scroll=scroll_timeout)
        else:
            _search_result = self.client.search(
                body=query_dsl,
                index=list(indices),
                search_type=search_type,
                _source_includes=return_fields,
                scroll=scroll_timeout)

        return _search_result

    def search_stream(self, sketch_id=None, query_string=None,
                      query_filter=None, query_dsl=None, indices=None,
                      return_fields=None, enable_scroll=True):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args :
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            return_fields: List of fields to return
            enable_scroll: Boolean determing whether scrolling is enabled.

        Returns:
            Generator of event documents in JSON format
        """
        if not query_filter.get('size'):
            query_filter['size'] = self.DEFAULT_STREAM_LIMIT

        if not query_filter.get('terminate_after'):
            query_filter['terminate_after'] = self.DEFAULT_STREAM_LIMIT

        result = self.search(sketch_id=sketch_id,
                             query_string=query_string,
                             query_dsl=query_dsl,
                             query_filter=query_filter,
                             indices=indices,
                             return_fields=return_fields,
                             enable_scroll=enable_scroll)

        if enable_scroll:
            scroll_id = result['_scroll_id']
            scroll_size = result['hits']['total']
        else:
            scroll_id = None
            scroll_size = 0

        # Elasticsearch version 7.x returns total hits as a dictionary.
        # TODO: Refactor when version 6.x has been deprecated.
        if isinstance(scroll_size, dict):
            scroll_size = scroll_size.get('value', 0)

        for event in result['hits']['hits']:
            yield event

        while scroll_size > 0:
            # pylint: disable=unexpected-keyword-arg
            result = self.client.scroll(scroll_id=scroll_id, scroll='5m')
            scroll_id = result['_scroll_id']
            scroll_size = len(result['hits']['hits'])
            for event in result['hits']['hits']:
                yield event

    def get_filter_labels(self, sketch_id, indices):
        """Aggregate labels for a sketch.

        Args:
            sketch_id: The Sketch ID
            indices: List of indices to aggregate on

        Returns:
            List with label names.
        """
        # This is a workaround to return all labels by setting the max buckets
        # to something big. If a sketch has more than this amount of labels
        # the list will be incomplete but it should be uncommon to have >10k
        # labels in a sketch.
        max_labels = 10000

        # pylint: disable=line-too-long
        aggregation = {
            'aggs': {
                'nested': {
                    'nested': {
                        'path': 'timesketch_label'
                    },
                    'aggs': {
                        'inner': {
                            'filter': {
                                'bool': {
                                    'must': [{
                                        'term': {
                                            'timesketch_label.sketch_id':
                                            sketch_id
                                        }
                                    }]
                                }
                            },
                            'aggs': {
                                'labels': {
                                    'terms': {
                                        'size': max_labels,
                                        'field':
                                        'timesketch_label.name.keyword'
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        labels = []
        # pylint: disable=unexpected-keyword-arg
        result = self.client.search(index=indices, body=aggregation, size=0)
        buckets = result.get('aggregations',
                             {}).get('nested',
                                     {}).get('inner',
                                             {}).get('labels',
                                                     {}).get('buckets', [])
        for bucket in buckets:
            # Filter out special labels like __ts_star etc.
            if bucket['key'].startswith('__'):
                continue
            labels.append(bucket['key'])
        return labels

    def get_event(self, searchindex_id, event_id):
        """Get one event from the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id

        Returns:
            Event document in JSON format
        """
        try:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            if self.version.startswith('6'):
                event = self.client.get(
                    index=searchindex_id,
                    id=event_id,
                    doc_type='_all',
                    _source_exclude=['timesketch_label'])
            else:
                event = self.client.get(
                    index=searchindex_id,
                    id=event_id,
                    doc_type='_all',
                    _source_excludes=['timesketch_label'])
            return event
        except NotFoundError:
            abort(HTTP_STATUS_CODE_NOT_FOUND)

    def count(self, indices):
        """Count number of documents.

        Args:
            indices: List of indices.

        Returns:
            Number of documents.
        """
        if not indices:
            return 0
        try:
            result = self.client.count(index=indices)
        except (NotFoundError, RequestError):
            es_logger.error('Unable to count indexes (index not found)',
                            exc_info=True)
            return 0
        return result.get('count', 0)

    def set_label(self, searchindex_id, event_id, event_type, sketch_id,
                  user_id, label, toggle=False, remove=False,
                  single_update=True):
        """Set label on event in the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id
            event_type: String of ElasticSearch document type
            sketch_id: Integer of sketch primary key
            user_id: Integer of user primary key
            label: String with the name of the label
            remove: Optional boolean value if the label should be removed
            toggle: Optional boolean value if the label should be toggled
            single_update: Boolean if the label should be indexed immediately.

        Returns:
            Dict with updated document body, or None if this is a single
            update.
        """
        # Elasticsearch painless script.
        update_body = {
            'script': {
                'lang': 'painless',
                'source': UPDATE_LABEL_SCRIPT,
                'params': {
                    'timesketch_label': {
                        'name': str(label),
                        'user_id': user_id,
                        'sketch_id': sketch_id
                    },
                    # BUG FIX: the key was the *variable* `remove` (a bool)
                    # instead of the string 'remove', so the painless script
                    # never received its params.remove flag.
                    'remove': remove
                }
            }
        }

        if toggle:
            update_body['script']['source'] = TOGGLE_LABEL_SCRIPT

        if not single_update:
            script = update_body['script']
            return dict(source=script['source'],
                        lang=script['lang'],
                        params=script['params'])

        doc = self.client.get(index=searchindex_id, id=event_id,
                              doc_type='_all')
        try:
            doc['_source']['timesketch_label']
        except KeyError:
            # Document has no label field yet; create an empty one first.
            doc = {'doc': {'timesketch_label': []}}
            self.client.update(index=searchindex_id,
                               doc_type=event_type,
                               id=event_id,
                               body=doc)

        self.client.update(index=searchindex_id,
                           id=event_id,
                           doc_type=event_type,
                           body=update_body)

        return None

    def create_index(self, index_name=uuid4().hex, doc_type='generic_event'):
        """Create index with Timesketch settings.

        Args:
            index_name: Name of the index. Default is a generated UUID.
            doc_type: Name of the document type. Default id generic_event.

        Returns:
            Index name in string format.
            Document type in string format.
        """
        _document_mapping = {
            'properties': {
                'timesketch_label': {
                    'type': 'nested'
                },
                'datetime': {
                    'type': 'date'
                }
            }
        }

        # TODO: Remove when we deprecate Elasticsearch version 6.x
        if self.version.startswith('6'):
            _document_mapping = {doc_type: _document_mapping}

        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(
                    index=index_name, body={'mappings': _document_mapping})
            except ConnectionError:
                raise RuntimeError('Unable to connect to Timesketch backend.')
            except RequestError:
                index_exists = self.client.indices.exists(index_name)
                es_logger.warning(
                    'Attempting to create an index that already exists '
                    '({0:s} - {1:s})'.format(index_name, str(index_exists)))

        # We want to return unicode here to keep SQLalchemy happy.
        if six.PY2:
            if not isinstance(index_name, six.text_type):
                index_name = codecs.decode(index_name, 'utf-8')
            if not isinstance(doc_type, six.text_type):
                doc_type = codecs.decode(doc_type, 'utf-8')

        return index_name, doc_type

    def delete_index(self, index_name):
        """Delete Elasticsearch index.

        Args:
            index_name: Name of the index to delete.
        """
        if self.client.indices.exists(index_name):
            try:
                self.client.indices.delete(index=index_name)
            except ConnectionError as e:
                raise RuntimeError(
                    'Unable to connect to Timesketch backend: {}'.format(e))

    def import_event(self, index_name, event_type, event=None, event_id=None,
                     flush_interval=DEFAULT_FLUSH_INTERVAL):
        """Add event to Elasticsearch.

        Args:
            index_name: Name of the index in Elasticsearch
            event_type: Type of event (e.g. plaso_event)
            event: Event dictionary
            event_id: Event Elasticsearch ID
            flush_interval: Number of events to queue up before indexing
        """
        if event:
            # BUG FIX: iterate over a snapshot — decoding a bytes key adds
            # a *new* key, and mutating a dict while iterating it raises
            # RuntimeError on Python 3.
            for k, v in list(event.items()):
                if not isinstance(k, six.text_type):
                    k = codecs.decode(k, 'utf8')
                # Make sure we have decoded strings in the event dict.
                if isinstance(v, six.binary_type):
                    v = codecs.decode(v, 'utf8')
                event[k] = v

            # Header needed by Elasticsearch when bulk inserting.
            header = {
                'index': {
                    '_index': index_name,
                }
            }
            update_header = {'update': {'_index': index_name,
                                        '_id': event_id}}

            # TODO: Remove when we deprecate Elasticsearch version 6.x
            if self.version.startswith('6'):
                header['index']['_type'] = event_type
                update_header['update']['_type'] = event_type

            if event_id:
                # Event has "lang" defined if there is a script used for
                # import.
                if event.get('lang'):
                    event = {'script': event}
                else:
                    event = {'doc': event}
                header = update_header

            self.import_events.append(header)
            self.import_events.append(event)
            self.import_counter['events'] += 1

            if self.import_counter['events'] % int(flush_interval) == 0:
                _ = self.flush_queued_events()
                self.import_events = []
        else:
            # Import the remaining events in the queue.
            if self.import_events:
                _ = self.flush_queued_events()

        return self.import_counter['events']

    def flush_queued_events(self):
        """Flush all queued events.

        Returns:
            dict: A dict object that contains the number of events
                that were sent to Elastic as well as information
                on whether there were any errors, and what the
                details of these errors if any.
        """
        if not self.import_events:
            return {}

        return_dict = {
            'number_of_events': len(self.import_events) / 2,
            'total_events': self.import_counter['events'],
        }

        try:
            results = self.client.bulk(body=self.import_events)
        except (ConnectionTimeout, socket.timeout):
            # TODO: Add a retry here.
            es_logger.error('Unable to add events', exc_info=True)
            # BUG FIX: `results` was unbound here and execution fell
            # through, crashing below with NameError. Report the failure
            # and keep the queue so a later flush can retry.
            return_dict['errors_in_upload'] = True
            return return_dict

        errors_in_upload = results.get('errors', False)
        return_dict['errors_in_upload'] = errors_in_upload

        if errors_in_upload:
            items = results.get('items', [])
            return_dict['errors'] = []

            es_logger.error('Errors while attempting to upload events.')
            for item in items:
                index = item.get('index', {})
                index_name = index.get('_index', 'N/A')

                _ = self._error_container.setdefault(index_name, {
                    'errors': [],
                    'types': Counter(),
                    'details': Counter()
                })

                error_counter = self._error_container[index_name]['types']
                error_detail_counter = self._error_container[index_name][
                    'details']
                error_list = self._error_container[index_name]['errors']

                error = index.get('error', {})
                status_code = index.get('status', 0)
                doc_id = index.get('_id', '')
                caused_by = error.get('caused_by', {})

                caused_reason = caused_by.get('reason',
                                              'Unkown Detailed Reason')

                error_counter[error.get('type')] += 1
                detail_msg = '{0:s}/{1:s}'.format(
                    caused_by.get('type', 'Unknown Detailed Type'),
                    ' '.join(caused_reason.split()[:5]))
                error_detail_counter[detail_msg] += 1

                error_msg = '<{0:s}> {1:s} [{2:s}/{3:s}]'.format(
                    error.get('type', 'Unknown Type'),
                    error.get('reason', 'No reason given'),
                    caused_by.get('type', 'Unknown Type'),
                    caused_reason,
                )
                error_list.append(error_msg)
                es_logger.error(
                    'Unable to upload document: {0:s} to index {1:s} - '
                    '[{2:d}] {3:s}'.format(doc_id, index_name, status_code,
                                           error_msg))

        return_dict['error_container'] = self._error_container

        self.import_events = []
        return return_dict

    @property
    def version(self):
        """Get Elasticsearch version.

        Returns:
          Version number as a string.
        """
        version_info = self.client.info().get('version')
        return version_info.get('number')
class Report:
    """Fetch Weibo report documents from Elasticsearch and export them
    to Excel (.xlsx) or Word (.docx) files under xnr/static/doc/.

    NOTE(review): ES host/port are hard-coded; several locals shadow
    builtins (``id``, ``dict``, ``list``, ``file``) — candidates for a
    later cleanup.
    """

    def __init__(self, id_list, username, password, index_name):
        # Hard-coded cluster location for the report index.
        self.ES_HOST = '192.168.169.37'
        self.ES_PORT = 9206
        # self.INDEX_NAME = 'weibo_report_management'
        self.INDEX_NAME = index_name
        self.TYPE = 'report'
        self.es = Elasticsearch([{'host': self.ES_HOST,
                                  'port': self.ES_PORT}])
        self.results = []
        self.id_list = id_list
        # Credentials kept for the (disabled) screenshot step below.
        self.username = username
        self.password = password
        # Timestamp used to build unique output filenames.
        self.currentTime = int(time.time())

    def userList(self):
        """Flatten every report doc in id_list into one dict per post."""
        self.results = []
        for id in self.id_list:
            result = self.es.get(index=self.INDEX_NAME, doc_type=self.TYPE,
                                 id=id)['_source']
            event_name = result['event_name']
            report_time = result['report_time']
            report_type = result['report_type']
            xnr_user_no = result['xnr_user_no']
            # report_content is a JSON string; posts live under fb_list.
            weibo_list = json.loads(result['report_content'])['fb_list']
            for each in weibo_list:
                text = each['text']
                timestamp = each['timestamp']
                try:
                    # Prefer the display name; fall back to the raw uid.
                    user = each['nick_name']
                except:
                    user = each['uid']
                uid = each['uid']
                fid = each['fid']
                dict = {'event_name':event_name, 'report_time':report_time, 'report_type':report_type,\
                        'xnr_user_no':xnr_user_no, 'text':text, 'timestamp':timestamp, 'user':user,\
                        'uid':uid, 'fid':fid}
                self.results.append(dict)
        return self.results

    # def screen_shot(self, results):
    #     for result in results:
    #         screen = Screen(self.username, self.password)
    #         screen.screenShot(result['uid'], result['mid'])

    def save_excel(self):
        """Write the flattened results to a timestamped .xlsx file.

        Returns the output filename (even when there were no results).
        """
        results = self.userList()
        filename = 'xnr/static/doc/' + str(self.currentTime) + '.xlsx'
        if results:
            # Spreadsheet column letters, indexed by field position.
            letters = "ABCDEFGHIJKLMN"
            #self.screen_shot(results)
            file = xlsxwriter.Workbook(filename)
            table = file.add_worksheet()
            # Header row: translate field names to Chinese labels via a
            # join/replace/split round-trip on the '^&*' separator.
            field = [each for each in results[0].keys()]
            field = '^&*'.join(field).replace('event_name', u'上报名称').replace(
                'report_time', u'上报时间').replace('report_type', u'上报类型').replace(
                    'xnr_user_no', u'虚拟人').replace('text', u'文本内容').replace(
                        'user', u'发博用户').replace('timestamp', u'发博时间') + u"^&*截图"
            field = field.split('^&*')
            for a, b in enumerate(field):
                table.write(letters[a] + str(1), b)
            lists = []
            for result in results:
                list = [each for each in result.values()]
                lists.append(list)
            for i, k in enumerate(lists):
                # Screenshot image goes into the column after the data.
                # NOTE(review): always uses results[0]['fid'] — presumably
                # should be the current row's fid; confirm.
                table.insert_image(letters[len(k)] + str(i + 2),
                                   results[0]['fid'] + '.png', {
                                       'x_scale': 0.05,
                                       'y_scale': 0.05
                                   })
                for c, d in enumerate(k):
                    qq = letters[c] + str(i + 2)
                    table.write(qq, d)
            file.close()
        return filename

    def save_word(self):
        """Write the flattened results to a timestamped .docx file.

        Each post becomes one paragraph (field names translated to
        Chinese) followed by a page break.  Returns the output filename.
        """
        results = self.userList()
        filename = 'xnr/static/doc/' + str(self.currentTime) + '.docx'
        if results:
            #self.screen_shot(results)
            document = Document()
            for result in results:
                result_str = json.dumps(result, ensure_ascii=False).replace(
                    '{', '').replace(
                        '}', '').replace('event_name', u'上报名称').replace(
                            'report_time', u'上报时间').replace('report_type', u'上报类型').replace(
                                'xnr_user_no', u'虚拟人').replace('text', u'文本内容').replace(
                                    'user', u'发博用户').replace('timestamp', u'发博时间')
                document.add_paragraph(result_str)
                #document.add_picture(result['fid']+'.png', width=Inches(1.25))
                document.add_page_break()
            document.save(filename)
        return filename
class ElasticsearchWrapper(metaclass=DatabaseMeta): LOG_LEVEL = 1 def __init__(self, host=None, port=None, user=None, password=None, *args, **kwargs): assert host is not None, 'host can not be None.' assert port is not None, 'port can not be None.' self.__connector = Elasticsearch(host, http_auth=(user, password), scheme='http', port=port, http_compress=True, verify_certs=False) @elastic_verify def insert(self, index=None, body=None, *args, **kwargs): assert body is not None, '[insert] body can not be None.' return self.__connector.index(index=index, body=body) @elastic_verify def delete(self, index=None, _id=None, *args, **kwargs): assert _id is not None, '[delete] _id can not be None.' return self.__connector.delete(index=index, id=_id) @elastic_verify def update(self, index=None, _id=None, body=None, *args, **kwargs): assert _id is not None, '[update] _id can not be None.' assert body is not None, '[update] body can not be None.' return self.__connector.update(index=index, id=_id, body=body) @elastic_verify def get(self, index=None, _id=None, *args, **kwargs): assert _id is not None, '[get] _id can not be None.' return self.__connector.get(index=index, id=_id) @elastic_verify def create_index(self, index=None, *args, **kwargs): return self.__connector.create(index=index) @elastic_verify def delete_index(self, index=None, *args, **kwargs): return self.__connector.delete(index=index) @elastic_verify def is_index_exists(self, index=None, *args, **kwargs): if self.__connector.indices.exists(index=index): return True else: return False @elastic_verify def search_topic(self, index=None, query=None, *args, **kwargs): if query is None: logger.info('[search] query is None.') return [] results = self.__connector.search(index=index, body=query)['hits']['hits'] return [result['_source'] for result in results]
def main(args):
    """Merge freshly-crawled documents into an existing Elasticsearch index.

    For every pickled crawl result in ``args.cdp``:
      * if the URL already exists in ``args.out_index``, union its stored
        inlinks with the locally-known ones and issue a partial update;
      * otherwise, index the document (head/text/inlinks/outlinks) anew.

    Args:
        args: parsed CLI namespace providing ``ckp``, ``ckp_no``, ``cdp``,
            ``out_index`` and ``doc_type``.

    Raises:
        Exception: when the checkpointed frontier map file is missing.
    """
    # NOTE(review): credentials are hard-coded here; move them to config/env
    # and rotate the exposed password.
    es2 = Elasticsearch(
        "https://96aa4157ead74b5ca4926523b1d1994e.us-east-1.aws.found.io:9243",
        http_auth=('elastic', 'MrkfJ5hxIcCOzTMfOa1Nftzy'))

    checkpoint_path = fjoin(args.ckp, "checkpoint.%d." % args.ckp_no)
    if isfile(checkpoint_path + "frontier_map.pt"):
        # `with` closes the handle; the original leaked the open file object.
        with open(checkpoint_path + "frontier_map.pt", "rb") as fh:
            frontier_map = pickle.load(fh)
    else:
        raise Exception("checkpoint not found")

    filesadded = 0
    filesupdated = 0
    # Load all the pickles of the crawled data.
    for fname in os.listdir(args.cdp):  # renamed from `file` (shadowed builtin)
        path = fjoin(args.cdp, fname)
        with open(path, "rb") as fh:
            res = pickle.load(fh)
        url = res['docno']
        inlink_data = list(frontier_map[url].inlinks)
        j_inlinks = json.dumps(inlink_data)
        logging.info("Checking for url {}".format(url))

        # Look the url up in the merged index; ignore=404 turns a miss into
        # a response with found == False instead of an exception.
        result = es2.get(index=args.out_index, id=url, ignore=404)
        if result['found']:
            logging.info("inlinks from local {}".format(
                len(set(frontier_map[url].inlinks))))
            logging.info("inlinks retrieved {}".format(
                len(set(result['_source']['inlinks']))))
            retrieved_inlinks = json.loads(result['_source']['inlinks'])
            # Merge locally-known and previously-stored inlinks. (The original
            # round-tripped inlink_data through json.dumps/loads to no effect.)
            final_inlinkset = merge_inlinks([retrieved_inlinks, inlink_data])
            logging.info("length of final list {}".format(
                len(final_inlinkset)))
            es2.update(index=args.out_index,
                       id=url,
                       doc_type=args.doc_type,
                       body={"doc": {
                           "inlinks": json.dumps(final_inlinkset)
                       }})
            filesupdated += 1
            logging.info("doc updated for url {}".format(url))
        else:
            # Index the data for a url that isn't in the merged index yet.
            logging.info("value of res in else {}: ".format(len(result)))
            doc = {
                'head': res['head'],
                'text': res['text'],
                'inlinks': j_inlinks,
                'outlinks': json.dumps(list(frontier_map[url].outlinks)),
            }
            es2.index(index=args.out_index,
                      id=url,
                      body=doc,
                      doc_type=args.doc_type)
            filesadded += 1
            logging.info("doc added for url {}: ".format(url))

    logging.info("doc added {} and updated {}: ".format(
        filesadded, filesupdated))
class ES(Singleton):
    """Singleton Elasticsearch helper with best-effort CRUD operations.

    Most methods swallow/log errors rather than raising, so callers get an
    empty result on failure.
    """

    def __init__(self, address=ADDRESS):
        super(ES, self).__init__()
        # Singleton: only build the connection the first time through.
        if not hasattr(self, '_es'):
            try:
                self._es = Elasticsearch(address.split(','))
            except Exception:
                raise
            else:
                log.debug('连接到Elasticsearch')

    def add(self, table, data, data_id=None, doc_type=''):
        """Index a document.

        Args:
            table: index name (lower-cased before use).
            data: the document body (JSON-serializable dict).
            data_id: document id; when omitted Elasticsearch generates one,
                when it already exists the document is updated.
            doc_type: document type; defaults to the index name. Can be used
                to distinguish different meanings of the same structure, e.g.
                a url table typed by site name.

        Returns:
            bool: True on success, False when indexing failed (error logged).
        """
        try:
            table = table.lower()
            self._es.index(index=table, doc_type=doc_type or table,
                           id=data_id, body=data)
        except Exception as e:
            log.error(e)
            return False
        else:
            return True

    def get(self, table, data_id, doc_type='_all'):
        """Fetch a document by id.

        Args:
            table: index name.
            data_id: document id, e.g. the data with ID=1.
            doc_type: document type; '_all' matches any type.

        Returns:
            dict: the response, or {} when the lookup failed (missing
            documents are expected, so errors are deliberately swallowed).
        """
        datas = {}
        try:
            table = table.lower()
            datas = self._es.get(index=table, doc_type=doc_type, id=data_id)
        except Exception:
            # best-effort lookup: a missing document is not an error here
            pass
        return datas

    def search(self, table, body=None):
        """Run a search query.

        Args:
            table: index name.
            body: the query body; defaults to an empty query. (Fixed: the
                original used a mutable default argument ``body={}``.)

        Returns:
            dict: the search response, or {} on error (error logged).
        """
        if body is None:
            body = {}
        datas = {}
        try:
            table = table.lower()
            datas = self._es.search(index=table, body=body)
        except Exception as e:
            log.error(e)
        return datas

    def update_by_id(self, table, data_id, data, doc_type=''):
        """Partially update a document.

        Args:
            table: index name.
            data_id: document id.
            data: mapping of field names to new values, e.g. {"TITLE": "xxx"}.
            doc_type: document type; defaults to the index name.
        """
        self._es.update(index=table, doc_type=doc_type or table,
                        body={"doc": data}, id=data_id)

    def delete_by_id(self, table, data_id, doc_type=''):
        """Delete the document with the given id."""
        self._es.delete(index=table, doc_type=doc_type or table, id=data_id)

    def set_mapping(self, table, mapping, doc_type=''):
        """Create the index with ``mapping`` when it does not exist yet.

        Example mapping::

            mapping = {
                doc_type: {
                    "properties": {
                        "document_id": {"type": "integer"},
                        "title": {"type": "string"},
                        "content": {"type": "string"}
                    }
                }
            }
        """
        if not self._es.indices.exists(index=table):
            # create the index and its mapping (ignore 400 = already exists)
            self._es.indices.create(index=table, body=mapping, ignore=400)
            self._es.indices.put_mapping(index=table,
                                         doc_type=doc_type or table,
                                         body=mapping)
class ElasticsearchClient:
    """
    Elasticsearch client for politylink endpoint
    """

    def __init__(self, url='http://localhost:9200'):
        def to_node(url):
            # split "http://host:port" into the host/port dict the client expects
            res = urlparse(url)
            return {'host': res.hostname, 'port': res.port}

        self.client = Elasticsearch(hosts=[to_node(url)])

    def index(self, obj):
        """
        create or update a document
        """
        assert isinstance(obj, AbstractText)
        try:
            return self.client.index(index=obj.index, id=obj.id, body=obj.__dict__)
        except Exception as e:
            raise ElasticsearchException(f'failed to index {obj}') from e

    def get(self, id_):
        """
        get a document by politylink id (ref idgen)
        """
        try:
            if id_.startswith('News'):
                cls = NewsText
            elif id_.startswith('Bill'):
                cls = BillText
            else:
                # fixed: an unknown prefix previously fell through and raised
                # a confusing NameError on the unbound `cls`
                raise ValueError(f'unknown id prefix: {id_}')
            res = self.client.get(index=cls.index, id=id_)
            return cls(res['_source'])
        except Exception as e:
            raise ElasticsearchException(f'failed to get {id_}') from e

    def search(self, cls, query=None):
        """
        search $cls documents by query
        return all documents when query is empty
        """
        if query:
            query_doc = {
                'query': {
                    'multi_match': {
                        'query': query,
                        'fields': cls.get_all_fields()
                    }
                }
            }
        else:
            query_doc = {'query': {'match_all': {}}}
        try:
            res = self.client.search(index=cls.index, body=query_doc)
            return [cls(hit['_source']) for hit in res['hits']['hits']]
        except Exception as e:
            # fixed: the message hard-coded "NewsText" regardless of cls
            raise ElasticsearchException(
                f'failed to search {cls.__name__} for {query_doc}') from e
es.index(index='project_data', doc_type='projectData', id=1, body=json.load(open_file)) with open('Data/project_features.json') as open_file: es.index(index='project_features_index', doc_type='project_features_doc', id=1, body=json.load(open_file)) # with open('project.json') as open_file: # es.index(index='project_features',doc_type='project_feature',id= 1,body=json.load(open_file)) project_data_json = es.get(index='project_data', doc_type='projectData', id=1) project_data_json = project_data_json['_source'] project_features_json = es.get(index='project_features_index', doc_type='project_features_doc', id=1) project_features_json = project_features_json['_source'] ## project_data = es.get(index='ready_to_move',doc_type='project_data',id=1) ## project_data = project_data['_source'] ## project_features = es.get(index='project_features',doc_type='project_feature',id=1) ## project_features = project_features['_source'] allKeys = getAllKeys(project_features_json)
print "OK" es.indices.get(index='classes', ignore=[400, 404]) # 그냥 가져오면 오류 es.indices.create(index="classes") #인덱스 만들기 es.indices.get(index="classes") es.indices.delete(index="classes") es.indices.get(index='classes', ignore=[400, 404]) # 지웟으니 오류 #POST http://localhost:9200/classes/class/1 -d 인덱스/타입/id body값 body = {"title": "Algorithm", "professor": "John"} es.index(index="classes", doc_type="class", id=1, body=body) #GET http://localhost:9200/classes/class/1 인덱스/타입/id body값 res = es.get(index="classes", id=1) print(json.dumps(res, indent=2)) # 이쁘게 출력 가능 #json 파일 불러와서 저장하기 import os print(os.getcwd()) with open( 'D:\\github\\team-crawlcrawl\\crawling_code\\elasticsearch_test\\oneclass.json' ) as data_file: data = json.load(data_file) es.index(index="classes", doc_type="class", id=1, body=data) res = es.get(index="classes", id=1) print(json.dumps(res, indent=2)) # 이쁘게 출력 가능 ''' 데이터 업데이트 '''
# Index a sample document, read it back, refresh the index, then run a
# match query on the LastName field. (A commented-out `megacorp`/employee
# example was removed as dead code.)
index_ack = es.index(index="test-index", doc_type='tweet', id=1, body=doc)
# print(index_ack['result'])
stored_doc = es.get(index="test-index", doc_type='tweet', id=1)
# print(stored_doc['_source'])

# Make the freshly indexed document visible to search.
es.indices.refresh(index="test-index")

match_query = {'query': {'match': {'LastName': 'Agrawal'}}}
res = es.search(index='test-index', body=match_query)
print(res['hits']['hits'])
# Route the elasticsearch trace logger (full request/response bodies) to a file.
tracer = logging.getLogger('elasticsearch.trace')
tracer.setLevel(logging.INFO)
tracer.addHandler(logging.FileHandler('/tmp/es_trace.log'))

# instantiate es client, connects to localhost:9200 by default
es = Elasticsearch()

# we load the repo and all commits
load_repo(es)

# run the bulk operations
success, _ = bulk(es, REPO_ACTIONS, index='git', raise_on_error=True)
print('Performed %d actions' % success)

# now we can retrieve the documents
es_repo = es.get(index='git', doc_type='repos', id='elasticsearch')
print('%s: %s' % (es_repo['_id'], es_repo['_source']['description']))

# update - add java to es tags
# NOTE(review): this is the pre-5.x inline-script body shape ("script" as a
# bare string with top-level "params"); ES 5+ expects
# {"script": {"source": ..., "params": ...}} — confirm the target cluster version.
es.update(index='git', doc_type='repos', id='elasticsearch', body={
    "script": "ctx._source.tags += tag",
    "params": {
        "tag": "java"
    }
})

# refresh to make the documents available for search
es.indices.refresh(index='git')
"_id": "20fbba1230cabbc0f4644f917c6c2be52b8a63e8", "_op_type": "update", "doc": {"initial_commit": True}, }, { "_type": "_doc", "_id": "ae0073c8ca7e24d237ffd56fba495ed409081bf4", "_op_type": "update", "doc": {"release": "5.0.0"}, }, ] success, _ = bulk(client, UPDATES, index="git") client.indices.refresh(index="git") initial_commit = client.get(index="git", id="20fbba1230cabbc0f4644f917c6c2be52b8a63e8") # and now we can count the documents print(client.count(index="git")["count"], "documents in index") import csv with open("cars.csv") as csvfile: reader = csv.DictReader(csvfile, delimiter=";") ret = bulk(client, reader, index="cars") result = client.search( index="git", body={ "query": {
# -*-coding:utf-8-*-
from elasticsearch import Elasticsearch
from datetime import datetime

# Single local Elasticsearch node.
ES_HOSTS = [{"host": "127.0.0.1", "port": "9200"}]
es = Elasticsearch(ES_HOSTS)

# Sample tweet-like document to index.
tweet_body = {
    'author': 'stone',
    'text': "今天的天气不太热",
    'timestamp': datetime.now(),
}

indexed = es.index(index='test-index', doc_type='tweet', id=1, body=tweet_body)
print(indexed)  # full index acknowledgement

fetched = es.get(index='test-index', doc_type='tweet', id=1)
print(fetched['_source'])  # the stored document body

# Make the document visible to search before querying.
es.indices.refresh(index="test-index")

# Match-all search over the index.
results = es.search(index="test-index", body={"query": {"match_all": {}}})
print("Got %d Hits:" % results['hits']['total']['value'])
for hit in results['hits']['hits']:
    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])
def create_index(data):
    """Upsert a parking-garage availability document keyed by garage name.

    Args:
        data: a pair where data[0] is (time, garage_name, lat, lon) and
            data[1] is the current availability count.

    Returns:
        dict: the document body that was built from ``data``.
    """
    # connect to the elasticsearch instance
    es = Elasticsearch("http://ec2-52-3-61-194.compute-1.amazonaws.com:9200")
    INDEX_NAME = 'parktest'

    d = {
        'time': data[0][0],
        'garage_name': data[0][1],
        'location': {'lat': data[0][2], 'lon': data[0][3]},
        'availability': data[1],
    }

    # Look up the document with id = garage_name; ignore=404 turns a miss
    # into a response with found == False instead of an exception.
    res = es.get(index=INDEX_NAME, doc_type=INDEX_NAME, id=data[0][1],
                 ignore=404)
    if not res['found']:
        # First sighting of this garage: create the document.
        es.index(index=INDEX_NAME, doc_type=INDEX_NAME, id=data[0][1],
                 body=d, refresh=True)
    else:
        # Fixed: the partial-update body was built by string concatenation
        # ('{"doc": ...' + str(data[1]) + ...), which produces invalid JSON
        # for any non-numeric availability value; pass a dict instead.
        es.update(index=INDEX_NAME, doc_type=INDEX_NAME, id=data[0][1],
                  body={"doc": {"availability": data[1]}})
    return d