Example #1
class ES(object):
    def __init__(self):
        self.es = Elasticsearch()
        self.id = 0

    def insert_es(self, id, good, description):
        doc = {
            'id': id,
            'good': good,
            'description': description
            }
        res = self.es.index(index="test-index", doc_type='description_goods', id=self.id, body=doc)
        #print(res['created'])
        res = self.es.get(index="test-index", doc_type='description_goods', id=self.id)
        #print(res['_source'])
        self.es.indices.refresh(index="test-index")
        self.id += 1

    def search_es(self, what, query):
        res = self.es.search(index="test-index", body={"query": {"match": {what: query}}})  #"author": 'kimchy'
        print("Got %d Hits" % res['hits']['total'])
        documents = []
        for hit in res['hits']['hits']:
            #print hit
            documents.append(hit['_source'])
        return documents

    def del_by_query(self, query):
        # query is a field-to-value mapping, e.g. {"description": "blue"}
        res = self.es.delete_by_query(index="test-index", body={"query": {"match": query}})

    def del_all(self):
        res = self.es.delete_by_query(index="test-index", body={"query": {"match_all": {}}}) #{"match_all": {}}
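A minimal usage sketch for the wrapper above, assuming a local Elasticsearch node is running and the elasticsearch-py client is installed; the sample goods are invented:

from elasticsearch import Elasticsearch  # required by the ES class above

store = ES()
store.insert_es(1, "coffee mug", "ceramic mug, 350 ml, dishwasher safe")
store.insert_es(2, "travel mug", "insulated steel mug with a lid")

# full-text match on the description field; search_es returns the matching _source dicts
for doc in store.search_es("description", "mug"):
    print(doc["good"], "-", doc["description"])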
Example #2
def annotate(config, documentId):
  if "getPosTags" in config and config["getPosTags"] == False: return
  esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
  corpusIndex = config["corpus"]["index"]
  corpusType = config["corpus"]["type"]
  corpusFields = config["corpus"]["text_fields"]
  processorIndex = config["processor"]["index"]
  processorType = config["processor"]["type"]
  document = esClient.get(index=corpusIndex, doc_type=corpusType, id = documentId, fields=corpusFields)
  content = ""
  if "fields" in document:
    for field in corpusFields:
      if field in document["fields"]:
        if type(document["fields"][field]) is list:
          for element in document["fields"][field]:
            content += element + ". "
        else:
          content += document["fields"][field] + ". "
      
  annotatedDocument = {}
  sentences = nltk.sent_tokenize(content)
  posTaggedSentences = []
  for sentence in sentences:
    sentence = sentence.strip()
    if len(sentence) > 1:
      sentence = sentence.replace("-", " ")
      sentenceWords = nltk.word_tokenize(sentence.lower())
      sentenceWords = [word.replace(".", "") for word in sentenceWords]
      posTags = nltk.pos_tag(sentenceWords)
      posTaggedSentences.append(posTags)
  if esClient.exists(index=processorIndex, doc_type=processorType, id=document["_id"]):
    annotatedDocument = esClient.get(index=processorIndex, doc_type=processorType, id=document["_id"])["_source"]
  annotatedDocument["pos_tagged_sentences"] = posTaggedSentences
  esClient.index(index=processorIndex, doc_type=processorType, id=document["_id"], body=annotatedDocument)
  config["logger"].info("pos-processor: Annotated document '" + document["_id"] + "'")
Example #3
class TestMemcachedConnection(ElasticTestCase):
    def setUp(self):
        try:
            import pylibmc
        except ImportError:
            raise SkipTest("No pylibmc.")
        super(TestMemcachedConnection, self).setUp()
        nodes = self.client.nodes.info()
        for node_id, node_info in nodes["nodes"].items():
            if 'memcached_address' in node_info:
                connection_info = ADDRESS_RE.search(node_info['memcached_address']).groupdict()
                self.mc_client = Elasticsearch(
                    [connection_info],
                    connection_class=MemcachedConnection
                )
                break
        else:
            raise SkipTest("No memcached plugin.")

    def test_index(self):
        self.mc_client.index("test_index", "test_type", {"answer": 42}, id=1)
        self.assertTrue(self.client.exists("test_index", doc_type="test_type", id=1))

    def test_get(self):
        self.client.index("test_index", "test_type", {"answer": 42}, id=1)
        self.assertEquals({"answer": 42}, self.mc_client.get("test_index", doc_type="test_type", id=1)["_source"])

    def test_unicode(self):
        self.mc_client.index("test_index", "test_type", {"answer": u"你好"}, id=u"你好")
        self.assertEquals({"answer": u"你好"}, self.mc_client.get("test_index", doc_type="test_type", id=u"你好")["_source"])

    def test_missing(self):
        self.assertRaises(NotFoundError, self.mc_client.get, "test_index", doc_type="test_type", id=42)
Example #4
    def commit(self, index_name, user_name):
        """
        Commit the current state of factor network to a local Elastic instance

        The index_name should remain constant for an organization. The user_name identifies the specific user and preserves user provenance by becoming the Elastic document type.

        Specifically, the state is split into three components: (1) root (the datum you started with), (2) extension (the data you have confirmed based on factor network suggestions), and (3) suggestions (the suggested extensions to your data).

        We index a factor network by taking the root id and appending a _x suffix to it. We loop through get requests on that particular lead, adding 1 to x each time, until we move past the most recently committed root_x.

        The results of the commit will look as follows in Elastic:

        {
            "_index": "Your_Index_Name",
            "_type": "adam",
            "_id": "rootid_x",
            "_score": 1,
            "_source": {
                "root": [[0,1],[0,7],...],
                "extension": {[[1,2],[2,3],...]},
                "suggestions": {[[3,4],[...],...]}
            }
        }
        """
        es = Elasticsearch()
        source = set()
        target = set()
        edges = self.G.edges()
        for edge in edges:
            source.add(edge[0])
            target.add(edge[1])

        def split(intersection, edges):
            result = []
            for i in intersection:
                for edge in edges:
                    if i in edge:
                        result.append(edge)
            return result

        state = {}
        state["root"] = split(source.difference(target), edges)
        state["extension"] = split(target.intersection(source), edges)
        state["suggestions"] = split(target.difference(source), edges)

        i = 1
        preexisting = True
        while preexisting:
            try:
                index_id = state["root"][0][0] + "_" + str(i)
                es.get(index=index_name, id=index_id, doc_type=user_name)
                i = i + 1
            except NotFoundError:
                # no document exists yet for this suffix, so commit under it
                preexisting = False

        res = es.index(index=index_name, id=index_id, doc_type=user_name, body=state)
        current_state = es.get(index=index_name, id=index_id, doc_type=user_name)
        return current_state
Example #5
class ESClient:
    def __init__(self, es_params):
        self.es = Elasticsearch(es_params)

    def get_metric_metadata(self, metric_name, tenant_id):
        """
        Get document from index metric_metadata for a given metric name and tenant id
        """

        document_id = self.get_document_id(tenant_id=tenant_id, metric_name=metric_name)
        try:
            return self.es.get(index='metric_metadata', doc_type='metrics', id=document_id, routing=tenant_id)
        except NotFoundError as e:
            return e.info

    def get_enums_data(self, metric_name, tenant_id):
        """
        Get document from index enums for a given metric name and tenant id
        """

        document_id = self.get_document_id(tenant_id=tenant_id, metric_name=metric_name)

        try:
            return self.es.get(index='enums', doc_type='metrics', id=document_id, routing=tenant_id)
        except NotFoundError as e:
            return e.info

    def delete_metric_metadata(self, metric_name, tenant_id):
        """
        Delete document from index metric_metadata for metric_metadata dictionary(obtained from get_metric_metadata
        call) and tenant id
        """

        document_id = self.get_document_id(tenant_id=tenant_id, metric_name=metric_name)
        self.es.delete(index='metric_metadata', doc_type='metrics', id=document_id, routing=tenant_id)
        print 'Deleted from index metric_metadata for _id: [%s] routing: [%s]' % (document_id, tenant_id)

    def delete_enums_data(self, metric_name, tenant_id):
        """
        Delete document from index enums for enums dictionary(obtained from get_enums_data
        call) and tenant id
        """
        document_id = self.get_document_id(tenant_id=tenant_id, metric_name=metric_name)
        self.es.delete(index='enums', doc_type='metrics', id=document_id, routing=tenant_id)
        print 'Deleted from index enums for _id: [%s] routing: [%s]' % (document_id, tenant_id)

    def get_document_id(self, tenant_id, metric_name):
        """
        Construct _id of elastic search from tenant id and metric name
        """
        return tenant_id + ':' + metric_name
Example #6
class CommonElasticsearch(object):
    """
    Class for interfacing with Elasticsearch or docker logging directly
    """

    def __init__(self, index_type='mediakraken', es_host='mkelk', es_port=9200,
                 debug_override=None):
        if 'DEBUG' in os.environ and debug_override is None:
            self.debug = os.environ['DEBUG'].lower()
        else:
            self.debug = debug_override
        # create the client whenever 'es' logging is requested, whether it came
        # from the environment or from debug_override
        if self.debug == 'es':
            self.es_inst = Elasticsearch([{'host': es_host, 'port': es_port}])
            self.es_index = index_type

    def com_elastic_index(self, log_type, body_data):
        # write log to elk
        if self.debug == 'es':
            # leave the try....as I don't want the container to fail if mkelk not accepting
            try:
                self.es_inst.index(index=self.es_index, doc_type='MediaKraken',
                                   body={"text": {"type": log_type,
                                                  "data": json.dumps(body_data),
                                                  "timestamp": time.strftime("%Y%m%d%H%M%S")}})
            except:
                print((log_type, body_data))
        # write log to host syslog
        elif self.debug == 'sys':
            try:
                sys.stdout.write(str({"type": log_type,
                                      "data": json.dumps(body_data),
                                      "timestamp": time.strftime("%Y%m%d%H%M%S")}))
            except:
                sys.stdout.write(str({"type": log_type,
                                      "timestamp": time.strftime("%Y%m%d%H%M%S")}))
        # write log to host syslog
        elif self.debug == 'print':
            try:
                print(str({"type": log_type,
                           "data": json.dumps(body_data),
                           "timestamp": time.strftime("%Y%m%d%H%M%S")}))
            except:
                print(str({"type": log_type,
                           "data": str(body_data),
                           "timestamp": time.strftime("%Y%m%d%H%M%S")}))

    def com_elastic_get(self, id):
        # es_inst only exists when Elasticsearch logging is enabled; return the fetched document
        if self.debug == 'es':
            return self.es_inst.get(index=self.es_index, doc_type='MediaKraken', id=id)
Example #7
class ElasticSearchDb(PersistenceBase):
    def __init__(self):
        PersistenceBase.__init__(self)
        self.session = Elasticsearch()
        self.database = Config().elasticsearchindex
        if (not self.session.indices.exists(index=self.database)):
            self.session.indices.create(index=self.database)

    def selectalltables(self):
        tables = []
        res = self.session.indices.get_mapping(index=self.database)
        for map in res[self.database]['mappings']:
            tables.append(map)
        return tables

    def selectallcolumns(self, tablename):
        columns = []
        res = self.session.indices.get_mapping(index=self.database)
        for column in res[self.database]['mappings'][tablename]['properties']:
            columns.append(column)
        return columns

    def selectall(self, tablename):
        # get() requires a document id, so list all documents of this type with a match_all search
        res = self.session.search(index=self.database, doc_type=tablename,
                                  body={'query': {'match_all': {}}})
        return res

    def selectone(self, tablename, id):
        # documents are indexed with their uuid as the _id (see insert), so fetch directly by id
        res = self.session.get(index=self.database, doc_type=tablename, id=id)
        return res

    def insert(self, obj, tablename):
        obj.updatedAt = datetime.isoformat(datetime.now())
        obj.uuid = str(self.getuuid())
        serialized_obj = self.getallvaluesfromobject(obj)
        self.session.index(index=self.database, doc_type=tablename, body=serialized_obj, id=obj.uuid)

    def update(self, obj, tablename):
        obj.updatedAt = datetime.isoformat(datetime.now())
        serialized_obj = self.getallvaluesfromobject(obj)
        self.session.update(index=self.database, doc_type=tablename, id=obj.uuid,
                            body={'doc': json.loads(serialized_obj)})

    def delete(self, obj, tablename):
        #todo: add to a table that manage deleted items (just to know if the obj was deleted and we don't have to add again)
        self.session.delete(index=self.database, doc_type=tablename, id=obj.uuid)

    def getallvaluesfromobject(self, obj):
        print(obj)
        ret = json.dumps(obj, default=lambda o: o.__dict__)
        return ret
Example #8
def test_foo(sm_config):
    annotations = [('test_ds', 'test_db', 'H20', '+H', [], []), ('test_ds', 'test_db', 'Au', '+H', [], [])]
    db_mock = MagicMock(DB)
    db_mock.select.return_value = annotations

    es_exp = ESExporter(sm_config)
    es_exp.index_ds(db_mock, 'test_ds', 'test_db')

    es = Elasticsearch()

    d = es.get(index='sm', id='test_ds_test_db_H20_+H', doc_type='annotation', _source=True)
    assert d['_source'] == {'ds_name': 'test_ds', 'db_name': 'test_db', 'sf': 'H20', 'adduct': '+H', 'comp_names': '', 'comp_ids': ''}

    d = es.get(index='sm', id='test_ds_test_db_Au_+H', doc_type='annotation', _source=True)
    assert d['_source'] == {'ds_name': 'test_ds', 'db_name': 'test_db', 'sf': 'Au', 'adduct': '+H', 'comp_names': '', 'comp_ids': ''}
Example #9
File: main.py Project: phamal/aageno
def index():
    searchString = ""
    if request.method == 'POST':
       searchString = request.form['searchString']
    elif request.method == "GET":
       searchString = request.args.get("searchString", "")

    notes = []
    if(len(searchString) > 0):
        note = {}
        es = Elasticsearch(['http://159.203.66.191:9200'])
        searchString = searchString.strip();
        if searchString.startswith("#"):
            tag = searchString[1:len(searchString)]
            try:
                res = es.get(index="brahman", doc_type='note', id=tag)
                note["title"] = tag;
                note["body"] = str(res['_source']['body']).strip()
                notes.append(note)
            except TransportError as e:
                app.logger.error(e.info)
                return redirect(url_for('addNote')+"?id="+tag)
        else:
            res = es.search(index="brahman", doc_type="note", body={"query": {"match": {"body": searchString}}})
            returnString = ""
            for hit in res['hits']['hits']:
                note = {}
                note["title"] = str(hit["_id"])
                notestr = str(hit["_source"]['body'])
                note["body"] = notestr.strip()
                notes.append(note)

    return render_template("index.html",notes = notes);
Example #10
class ElasticSearchManager(object):

	def __init__(self, index=None, doc_type=None, *args, **kwargs):
		self.index = index
		self.doc_type = doc_type
		self.obj_es = Elasticsearch()

	def search(self, query = None, *args, **kwargs):
		data = self.obj_es.search(index=self.index, doc_type=self.doc_type, body={"query":{"match":query}})
		return fetch_source(data['hits']['hits'])

	def get(self, *args, **kwargs):
		data=self.obj_es.get(index=self.index, doc_type=self.doc_type, id=kwargs['id'])
		return data['_source']

	def get_list(self, *args, **kwargs):
		data = self.obj_es.search(index=self.index, body={"query": {"match_all": {}}})
		return fetch_source(data['hits']['hits'])

	def insert(self, data = None):
		data = json.loads(data)
		data['user_name'] = data['user']['screen_name']
		del data['user']
		del data['entities']
		res = self.obj_es.index(index=self.index, doc_type=self.doc_type, id=data['id'], body=data)
		logger.info("Getting stream:{0}".format(res))

	def delete(self, data = None):
		pass

	def update(self, data = None):
		pass
Example #11
def main():
    es = Elasticsearch([{"host": "localhost", "port": 9200}])
    r = redis.StrictRedis()
    samples = r.smembers("samples")
    conditions = Counter()
    for pkgName in samples:
        print "================"
        printPkgContent(pkgName)

        doc = es.get(id="npm:%s:js" % pkgName, index="throwtable", doc_type="implementation", ignore=404)
        if doc["found"]:
            actual = doc["_source"]["algorithm"]
        else:
            actual = []
        expected = r.smembers("%s:map" % pkgName)

        result = checkPkg(pkgName, actual, expected, r, es)
        print result

        r.sadd("samples-%s" % result, pkgName)
        conditions[result] += 1

    for (k, v) in conditions.items():
        print "%s: %s" % (k, v)

    print "Precision:", 1.0 * conditions[TRUE_POSITIVE] / (conditions[TRUE_POSITIVE] + conditions[FALSE_POSITIVE])
    print "Recall:", 1.0 * conditions[TRUE_POSITIVE] / (conditions[TRUE_POSITIVE] + conditions[FALSE_NEGATIVE])
Example #12
def rcr(index='itest01', type='ttest01', id='dtest01'):
    """
    Demonstrates the retrieve-change-reindex cycle for updating
    a document in Elasticsearch.
    """
    body = '{"alist": ["element1"]}'
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    es.index(index=index, doc_type=type, id=id, body=body)
    res = es.get(index=index, doc_type=type, id=id)
    print("\nOriginal Document\n-----------------")
    pprint(res)
    res['_source']['alist'].extend(['element2', 'element3'])
    # re-index only the modified _source, not the whole get() response
    es.index(index=index, doc_type=type, id=id, body=res['_source'])
    res = es.get(index=index, doc_type=type, id=id)
    print("\nUpdated Document\n-----------------")
    pprint(res)
Example #13
class ElasticsearchUtils(object):
    def __init__(self, host_ports):
        # host_ports format: [{'host': 'xxx', 'port': 9200}, ...]
        self.host_ports = host_ports
        self.es = None

    def init_connect(self):
        self.es = Elasticsearch(self.host_ports)
        return self.es.ping()


    def get_search_result(self, index_name, type_name, query_body):
        if self.es:
            return self.es.search(index=index_name, doc_type=type_name, body=query_body)
        return

    def get_id_result(self, index_name, type_name, doc_id):
        if self.es:
            return self.es.get(index=index_name, doc_type=type_name, id=doc_id)['_source']
        return


    # a doc_id of None means let ES generate the id automatically
    def add_index_doc(self, index_name, type_name, doc_id, doc_body):
        if doc_id:
            self.es.index(index=index_name, doc_type=type_name, id=doc_id, body=doc_body)
        else:
            self.es.index(index=index_name, doc_type=type_name, body=doc_body)

    def batch_index(self, index_name, type_name, doc_body_lines):
        self.es.bulk(index=index_name, doc_type=type_name, body=doc_body_lines)
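A hedged usage sketch for ElasticsearchUtils above; the host, index and documents are assumptions, and init_connect() must return True (the ping succeeded) before the other calls make sense:

utils = ElasticsearchUtils([{'host': 'localhost', 'port': 9200}])
if utils.init_connect():
    utils.add_index_doc('goods', 'item', 'sku-1', {'name': 'kettle', 'price': 25})
    utils.add_index_doc('goods', 'item', None, {'name': 'toaster'})  # None lets ES generate the id
    print(utils.get_id_result('goods', 'item', 'sku-1'))
    print(utils.get_search_result('goods', 'item', {'query': {'match_all': {}}}))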
Example #14
File: models.py Project: aeud/ecommerce
def search(**kwargs):
    query = kwargs.get('query', dict(match_all={}))
    size = kwargs.get('size', 10)
    sort = kwargs.get('sort', ['_score'])
    body = dict(query=query, size=size, sort=sort)
    results = Elasticsearch().search(index='ecommerce', doc_type='product', body=body)
    return [x.get('_source') for x in results.get('hits').get('hits')]
Example #15
File: main.py Project: phamal/aageno
def hashtagNote(tag):
    es = Elasticsearch(['http://159.203.66.191:9200'])
    res = es.get(index="brahman", doc_type='note', id=tag)
    note = res['_source']['body']
    note = "<br />".join(note.split("\n"))
    note = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;".join(note.split("\t"))
    return note
Example #16
def getTermStatistics():
    startTime = time.time()
    es = Elasticsearch()
    for term in qtToQno.keys():
        qtDocList= []
        results = es.search(index='ap_dataset', doc_type="document",
                            body={"query": {"match": {"TEXT": "'" + term + "'"}}}, size=9000)
        for doc in results['hits']['hits'] :
            dictToAddInDStats = {}
            #print(doc["_id"])
            qtDocList.append(doc["_id"])
            if doc['_id'] in docTermStats.keys():
                continue
            else:
                docToLength[doc['_id']] = len(set(es.get(index='ap_dataset',doc_type="document",
                                                     id=doc['_id'])["_source"]["TEXT"].split()) - set(ignoreWordsList))
                ts = es.termvector(index='ap_dataset', doc_type="document",id= doc['_id'],
                                                         term_statistics = True,field_statistics = False)
                keysToKeep = set(qtToQno.keys()) & set(ts["term_vectors"]["TEXT"]["terms"].keys())
                #print (keysToKeep)
                for tKeys in keysToKeep:
                    dictToAddInDStats[tKeys] = ts["term_vectors"]["TEXT"]["terms"][tKeys]
                docTermStats[doc['_id']] = dictToAddInDStats
                print(len(docTermStats))
        qtToDoc[term] = qtDocList
    elapsedTime= time.time() - startTime
Example #17
File: main.py Project: phamal/aageno
def addNote():
    es = Elasticsearch(['http://159.203.66.191:9200'])
    id = ""
    noteStr = ""
    if request.method == 'POST':
       id = request.form['id']
       noteStr = request.form['note']
       if len(noteStr.strip()) > 0 and len(id.strip()):
           note = {};
           note["maintag"] = id
           note["body"] = noteStr
           es.index(index="brahman", doc_type='note', id=note["maintag"], body=note)
           return redirect(url_for('index'))
    elif request.method == "GET":
       id = request.args.get("id", "")
    note = {}  # ensure note is defined even when no id was supplied
    if len(id) > 0:
        try:
            res = es.get(index="brahman", doc_type='note', id=id)
            note["title"] = id;
            note["body"] = str(res['_source']['body']).strip()
        except TransportError as e:
            note["title"] = id;
            note["body"] = ""

    return render_template("addNote.html",note=note);
Example #18
class ObjectManager(object):

    def __init__(self, index, doc_type, model_class):
        super(ObjectManager, self).__init__()
        self.index = index
        self.doc_type = doc_type
        self.model_class = model_class
        self.es = Elasticsearch()
        self.mapper = ObjectMapper()

    def find_one(self, pk):
        source_dict = self.es.get(index=self.index, doc_type=self.doc_type, id=pk)
        return self.mapper.from_dict_to_model(source_dict, self.model_class)

    def save(self, model):
        model_dict = self.mapper.from_model_to_dict(model)
        res = self.es.index(index=self.index, doc_type=self.doc_type, id=model.get_identity(), body=model_dict)
        return res['created']

    def find_all(self):
        res = self.es.search(index=self.index, doc_type=self.doc_type, body={"query": {"match_all": {}}})
        return [self.mapper.from_dict_to_model(model, self.model_class) for model in res['hits']['hits']]

    def update(self, model):
        model_dict = self.mapper.from_model_to_dict(model)
        res = self.es.update(index=self.index, doc_type=self.doc_type, id=model.pk, body={"doc": model_dict})
        return res

    def delete(self, pk):
        return self.es.delete(index=self.index, doc_type=self.doc_type, id=pk)
Example #19
def main():
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    site = mw.Site('rosettacode.org', path='/mw/')
    r = redis.StrictRedis()
    samples = r.smembers('samples')
    conditions = Counter()
    counter = 0
    for taskName in samples:
        print 'task # %d ================' % counter
        counter += 1
        printTaskContent(taskName, site)

        impl_id = r.hget('rosetta-id-taskname-mapping', normalize(taskName))
        if impl_id is None:
            actual = []
        else:
            result = es.get(index='throwtable', doc_type='implementation',
                id=impl_id, ignore=404)
            if result['found']:
                actual = result['_source']['algorithm']
            else:
                actual = []
        expected = r.smembers("%s:map" % taskName)

        result = checkPkg(taskName, actual, expected, r)

        print result
        r.sadd('samples-%s' % result, taskName)
        conditions[result] += 1

    for (k, v) in conditions.items():
        print "%s: %s" % (k, v)

    print "Precision:", 1.0 * conditions[TRUE_POSITIVE] / (conditions[TRUE_POSITIVE] + conditions[FALSE_POSITIVE])
    print "Recall:", 1.0 * conditions[TRUE_POSITIVE] / (conditions[TRUE_POSITIVE] + conditions[FALSE_NEGATIVE])
Example #20
def get_rdap_asn(asn):
    es = Elasticsearch()
    does_exist = es.exists(index='whois', doc_type='asn_rdap', id=asn)
    print does_exist
    if does_exist is True:
        status = 200
        print "Found it!"
        get_record = es.get(index='rdap', doc_type='asn', id=asn)
        results = jsonify(get_record['_source'])
    else:
        try:
            url = 'http://hailey.opendnsbl.net:8080/rdapbootstrap/autnum/%s' % asn
            r = requests.get(url)
            status = 200
            b = r.json()
            #c = json.loads(b)
            #d = c['entities']
            #print d
            #e = json.dumps(c)
            #es.index(index='rwhois', doc_type='asn', id=asn, body=json.dumps(b))
            results = jsonify(b)
        except Exception as e:
            print e
            results_raw = jsonify({'status': "not_found"})
            status = 404
            results = jsonify({'status': "not_found"})
    return results, status
Example #21
File: es.py Project: gccli/mylibrary
class Feedback(object):
  host = '127.0.0.1'
  index='result_'+date.today().strftime('%Y%m%d')
  index_pattern = 'result_*'

  def __init__(self, doctype='feedback'):
    self.server = Elasticsearch([{'host': self.host}])

    self.doctype = doctype
  

  def get(self, docid):
    try:
      doc = self.server.get(index=self.index, doc_type=self.doctype, id=docid)
      pprint(doc)
    except elasticsearch.ElasticsearchException as e:
      print e.info

  def update(self, docid, content):
    try:
      res = self.server.update(index=self.index, doc_type=self.doctype, id=docid, body=content)
      print res
    except elasticsearch.ElasticsearchException as e:
      print e.info

  def search(self, content):
    global opt_size, opt_source
    try:
      res = self.server.search(index=self.index, doc_type=self.doctype, body=content, _source=opt_source, size=opt_size)
      pprint(res, width=120)
    except elasticsearch.ElasticsearchException as e:
      print e.info
Example #22
class ES():

	def __init__(self):
		self.es = Elasticsearch()

	def setIndex(self, index):
		self.index = index

	def getIndex(self):
		return self.index

	def setDocType(self, doc_type):
		self.doc_type = doc_type

	def getDocType(self):
		return self.doc_type

	# Add a document
	# If it does not exist yet, insert it; otherwise update it
	# @params id
	# @params document
	#
	# @return document
	def insertOrUpdate(self, id, document):
		index = self.getIndex()
		doc_type = self.getDocType()

		exist = self.get(doc_type, id)

		if exist != 'null':
			result = self.update(id, document)
		else:
			result = self.es.index(index=index, doc_type=doc_type, id=id, body=document)

		return document


	def insert(self, id, document):
		index = self.getIndex()
		doc_type = self.getDocType()
		result = self.es.index(index=index, doc_type=doc_type, id=id, body=document)
		return result

	# Update a document
	def update(self, id, document):
		index = self.getIndex()
		doc_type = self.getDocType()
		return self.es.update(index=index, doc_type=doc_type, id=id, body={"doc" : document})

	# Get a document
	def get(self, doc_type, id):
		index = self.getIndex()
		try:
			document = self.es.get(index = index, doc_type = doc_type, id = id)
			return document['_source']
		except (NotFoundError, TransportError):
			return 'null'
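A short sketch of the insert-or-update flow described in the comments above; the index, doc type and documents are invented:

client = ES()
client.setIndex('articles')
client.setDocType('article')
client.insertOrUpdate('a-1', {'title': 'Hello', 'body': 'first draft'})   # not found yet, so it is indexed
client.insertOrUpdate('a-1', {'body': 'second draft'})                    # found, so update() is used instead
print(client.get('article', 'a-1'))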
Example #23
File: es.py Project: 0x24bin/wyportmap
class NmapElasticsearchPlugin(NmapBackendPlugin):
    """
        This class enables the user to store and manipulate nmap reports \
        in an Elasticsearch db.
    """
    def __init__(self, index=None):
        if index is None:
            self.index = "nmap.{0}".format(datetime.now().strftime('%Y-%m-%d'))
        else:
            self.index = index
        self._esapi = Elasticsearch()

    def insert(self, report, doc_type=None):
        """
            insert NmapReport into the backend
            :param report: NmapReport to store
            :return: str, the ident of the object in the backend for future
            usage, or None
        """
        if doc_type is None:
            doc_type = 'NmapReport'
        j = json.dumps(report, cls=ReportEncoder)
        res = self._esapi.index(
            index=self.index,
            doc_type=doc_type,
            body=json.loads(j))
        rc = res['_id']
        return rc

    def delete(self, id):
        """
            delete NmapReport from the backend
            :param id: str
        """
        raise NotImplementedError

    def get(self, id):
        """
            retrieve a NmapReport from the backend
            :param id: str
            :return: NmapReport
        """
        res = self._esapi.get(index=self.index,
                              doc_type="NmapReport",
                              id=id)['_source']
        return res

    def getall(self, filter=None):
        """
            :return: collection of tuple (id,NmapReport)
            :param filter: nice to have; a filter capability is not implemented yet
        """
        rsearch = self._esapi.search(index=self.index,
                                     body={"query": {"match_all": {}}})
        print("--------------------")
        print(type(rsearch))
        print(rsearch)
        print("------------")
Example #24
class PeragroClient():
    """
    An audio search client
    """
    def __init__(self):
        """
        initialize client object with elasticsearch object
        """
        self.es = Elasticsearch()

    def set_index(self, index):
        """
        set index for to lookup in elasticsearch

        Input:
            -index: an elasticsearch index
        """
        self.index = index

    def get_sound(self, id_):
        """
        Get sound by its id

        input:
            -id: id of sound

        output:
            -sound: sound details if it exists otherwise None

        Usage:

        >>> id = "X2VFAB12GH"
        >>> sound = c.get_sound(id)
        """
        if self.es.exists(index=self.index, doc_type='_all', id=id_):
            res = self.es.get(index=self.index, id=id_)
            return res
        else:
            return None

    def text_search(self, query):
        """
        Get sound results based on text query.
        It also has support for field queries.

        Usage:

        >>> query = "tum hi ho"
        >>> sounds = c.text_search(query)

        >>> # OR field query
        >>> query = "tags:'interscope' genre:'hip hop'"
        >>> sounds = c.text_search(query)
        """
        # print self.index
        # print self.es.search(index=self.index)
        res = self.es.search(index=self.index, q=query)
        print("Got %d Hits:" % res['hits']['total'])
        return res
Example #25
def run(node):
    id_a, id_b 	= node.get('id_a', '63166071_1'), node.get('id_b', '63166071_2')
    es 			= Elasticsearch()
    data_a 		= es.get(index="factor_state2016", doc_type='factor_network', id=id_a)
    data_b 		= es.get(index="factor_state2016", doc_type='factor_network', id=id_b)
    constructor = ElasticFactor(cfg["cdr_elastic_search"]["hosts"] + cfg["cdr_elastic_search"]["index"])
    merged 		= constructor.merge(data_a["_source"], data_b["_source"])
    return merged
Example #26
class ESIndex:
    def __init__(self, hosts, index = "", doc_type = ""):
        self.es = Elasticsearch(hosts)
        # stored as index_name so the attribute does not shadow the index() method below
        self.index_name = index
        self.doc_type = doc_type

    def index(self, doc_id, body, index = "", doc_type = ""):
        index_ = self.index_name if index == "" else index
        doc_type_ = self.doc_type if doc_type == "" else doc_type
        return self.es.index(index=index_, doc_type=doc_type_, body=body, id=doc_id)

    def delete(self,doc_id, index = "", doc_type = ""):
        index_ = self.index_name if index == "" else index
        doc_type_ = self.doc_type if doc_type == "" else doc_type
        return self.es.delete(index=index_, doc_type = doc_type_, id = doc_id)
        
    def bulk(self, docs, index = "", doc_type = "", op_type = 'index'):
        '''
        bulk sample:
        {"_op_type":"index", _index" : "test", "_type" : "type1", "_id" : "1" , "_source":{"field1":"value1", "field2":"value2"}}
        { "_op_type":"delete" ,  "_index" : "test", "_type" : "type1", "_id" : "2" } 

        '''
        index_ = self.index_name if index == "" else index
        doc_type_ = self.doc_type if doc_type == "" else doc_type
 
        allow_op = ['index', 'delete']
        if op_type not in allow_op:
            raise exceptions.RequestError(400, '{"msg":"op_type is not allowed, you can use index or delete"}')

        actions = []
        for doc in docs:
            action = {}
            action["_index"] = index_
            action["_type"] = doc_type_
            action["_id"] = doc["_id"]
            if op_type == 'index':
                del doc["_id"]
                action["_source"] = doc
            action["_op_type"] = op_type
            actions.append(action)

        return helpers.parallel_bulk(self.es, actions)

    def getDoc(self,doc_id, index = "", doc_type = ""):
        index_ = self.index_name if index == "" else index
        doc_type_ = self.doc_type if doc_type == "" else doc_type
 
        return self.es.get(index=index_, doc_type=doc_type_, id=doc_id)

    def putMapping(self, body, index = "", doc_type =""):
        index_ = self.index_name if index == "" else index
        doc_type_ = self.doc_type if doc_type == "" else doc_type
        return self.es.indices.put_mapping(index=index_, doc_type=doc_type_, body=body)

    def create(self, body = {}, index = "", timeout = 30):
        index_ = self.index_name if index == "" else index
        return self.es.indices.create(index_, body=body)
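A hedged sketch of driving the bulk() helper above; the hosts and documents are assumptions, and the class relies on the elasticsearch helpers/exceptions modules being imported. parallel_bulk() returns a lazy generator, so the caller has to iterate it before anything is sent:

es_idx = ESIndex([{'host': 'localhost', 'port': 9200}], index='test', doc_type='type1')
docs = [
    {'_id': '1', 'field1': 'value1', 'field2': 'value2'},
    {'_id': '2', 'field1': 'value3'},
]
for ok, item in es_idx.bulk(docs, op_type='index'):
    if not ok:
        print('bulk item failed:', item)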
Example #27
class ProjectDB(BaseProjectDB):
    __type__ = 'project'

    def __init__(self, hosts, index='pyspider'):
        self.index = index
        self.es = Elasticsearch(hosts=hosts)

        self.es.indices.create(index=self.index, ignore=400)
        if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__):
            self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={
                "_all": {"enabled": False},
                "properties": {
                    "updatetime": {"type": "double"}
                }
            })

    def insert(self, name, obj={}):
        obj = dict(obj)
        obj['name'] = name
        obj['updatetime'] = time.time()

        obj.setdefault('group', '')
        obj.setdefault('status', 'TODO')
        obj.setdefault('script', '')
        obj.setdefault('comments', '')
        obj.setdefault('rate', 0)
        obj.setdefault('burst', 0)

        return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name,
                             refresh=True)

    def update(self, name, obj={}, **kwargs):
        obj = dict(obj)
        obj.update(kwargs)
        obj['updatetime'] = time.time()
        return self.es.update(index=self.index, doc_type=self.__type__,
                              body={'doc': obj}, id=name, refresh=True, ignore=404)

    def get_all(self, fields=None):
        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,
                                                 query={'query': {"match_all": {}}},
                                                 _source_include=fields or []):
            yield record['_source']

    def get(self, name, fields=None):
        ret = self.es.get(index=self.index, doc_type=self.__type__, id=name,
                          _source_include=fields or [], ignore=404)
        return ret.get('_source', None)

    def check_update(self, timestamp, fields=None):
        for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__,
                                                 query={'query': {"range": {
                                                     "updatetime": {"gte": timestamp}
                                                 }}}, _source_include=fields or []):
            yield record['_source']

    def drop(self, name):
        return self.es.delete(index=self.index, doc_type=self.__type__, id=name, refresh=True)
Example #28
class Search():

    def __init__(self, index_name):
        super().__init__()
        self.logger = logging.getLogger(__name__)
        self.__es = Elasticsearch(['pulsing.jhk.org:9200'], sniff_on_start=True)
        
        self.__index_name = index_name
        if self.__es.indices.exists(self.__index_name):
            self.logger.debug('index exists so deleting ' + self.__index_name)
            self.__es.indices.delete(self.__index_name)
        
        self.__es.indices.create(self.__index_name)
        self.__es.cluster.health(wait_for_status='yellow')
    
    def index(self, type_name, id_value, content):
        self.logger.debug('index %s/%s : %s', type_name, id_value, content)
        self.__es.index(index=self.__index_name, doc_type=type_name, id=id_value, body=content)
    
    def map(self, type_name, mapping):
        self.logger.debug('map %s', type_name)
        self.__es.indices.put_mapping(index=self.__index_name, doc_type=type_name, body={type_name: mapping})
    
    def search(self, type_name, query={'match_all': {}}):
        self.logger.debug('search %s : %s', type_name, query)
        return self.__es.search(index=self.__index_name, doc_type=type_name, body={'query': query})
    
    def get(self, type_name, id_value):
        self.logger.debug('get %s/%s', type_name, id_value)
        document = self.__es.get(index=self.__index_name, doc_type=type_name, id=id_value)
        self.logger.debug('got document %s', document)
        return document
    
    def delete(self, type_name, id_value):
        self.logger.debug('delete %s/%s', type_name, id_value)
        self.__es.delete(index=self.__index_name, doc_type=type_name, id=id_value)

    def optimize(self):
        """ 
        forcemerge allows removal of deleted documents and reducing the number of segments
        (documents are marked as tombstone [like cassandra] but not purged from the segment's 
        index for performance reasons)
        """
        self.logger.debug('optimize')
        self.__es.indices.forcemerge(self.__index_name)

    @property
    def es(self):
        return self.__es

    def __eq__(self, other):
        return self.__es == other.__es

    def __str__(self):
        return self.__es.__str__()

    def __hash__(self):
        return self.__es.__hash__()
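A brief usage sketch for the Search wrapper above; the type name, mapping and documents are assumptions, and the constructor connects to the cluster hard-coded in __init__:

s = Search('posts')
s.map('post', {'properties': {'title': {'type': 'text'}}})
s.index('post', 1, {'title': 'hello world'})
print(s.get('post', 1)['_source'])
print(s.search('post', {'match': {'title': 'hello'}}))
s.optimize()  # forcemerge, dropping tombstoned documents as the docstring describes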
Example #29
File: main.py Project: phamal/aageno
def getNote(tag):
    es = Elasticsearch(['http://159.203.66.191:9200'])
    res = es.get(index="brahman", doc_type='note', id=tag)
    note = {}
    noteBody = res['_source']['body']
    #note["body"] = noteBody;
    noteBody = "<br />".join(noteBody.split("\n"))
    note["body"] = "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;".join(noteBody.split("\t"))
    return jsonify(note);
Example #30
class Elastic_Search:
    def __init__(self, index='iis-logs-', aws_secret_id=None):
        self.timestamp = datetime.datetime.utcnow()
        self.index = index
        self._setup_Elastic_on_localhost()  # default to localhost
        self._result = None

        if index and aws_secret_id:
            self._setup_Elastic_on_cloud_via_AWS_Secret(index, aws_secret_id)

    def _setup_Elastic_on_localhost(self):
        self.host = 'localhost'
        self.port = 9200
        self.scheme = 'http'
        self.es = Elasticsearch([{'host': self.host, 'port': self.port}])

    def _setup_Elastic_on_cloud_via_AWS_Secret(self, index, secret_id):
        credentials = json.loads(Secrets(secret_id).value())
        self.host = credentials['host']
        self.username = credentials['username']
        self.password = credentials['password']
        self.port = credentials['port']
        self.index = index
        self._setup_Elastic_on_cloud(self.host, self.port, self.username,
                                     self.password)
        return self

    def _setup_Elastic_on_cloud(self, host, port, username, password):
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.scheme = 'https'
        self.es = Elasticsearch([host],
                                http_auth=(username, password),
                                scheme="https",
                                port=port)
        return self

    def add_data_with_timestamp(self, data):
        data["@timestamp"] = self.timestamp
        return self.es.index(index=self.index, doc_type='item', body=data)

    def add(self, data, id_key=None):
        try:
            if id_key is not None:
                return self.es.index(index=self.index,
                                     doc_type='item',
                                     body=data,
                                     id=data[id_key])
            else:
                return self.es.index(index=self.index,
                                     doc_type='item',
                                     body=data)
        except Exception as error:
            print("elk-error", error)
            return {"elk-error": "{0}".format(error)}

    def add_bulk(self, data, id_key=None, pipeline=None):
        ok = 0
        if data:
            actions = []
            for item in data:
                item_data = {
                    "_index": self.index,
                    "_type": 'item',
                    "_source": item,
                }
                if id_key is not None:
                    item_data["_id"] = item[id_key]
                actions.append(item_data)

            if pipeline is None:
                ok, _ = helpers.bulk(self.es, actions, index=self.index)
            else:
                ok, _ = helpers.bulk(self.es,
                                     actions,
                                     index=self.index,
                                     pipeline=pipeline)
        return ok

    def create_index(self, body={}):
        if self.exists() is False:
            self._result = self.es.indices.create(index=self.index, body=body)
        return self

    def create_index_with_location_geo_point(self, field="location"):
        body = {
            "mappings": {
                "item": {
                    "properties": {
                        field: {
                            "type": "geo_point"
                        }
                    }
                }
            }
        }
        self.create_index(body)
        return self

    def create_index_pattern(self, add_time_field=True):
        if add_time_field:
            payload = {
                "type": "index-pattern",
                "index-pattern": {
                    "title": self.index + '*',
                    "timeFieldName": "date"
                }
            }
        else:
            print('creating index without index pattern')
            payload = {
                "type": "index-pattern",
                "index-pattern": {
                    "title": self.index + '*'
                }
            }
        data = json.dumps(payload)
        headers = {'Content-Type': 'application/json'}

        if self.host == 'localhost':
            url = 'http://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                self.host, self.port, self.index)
            self._result = json.loads(PUT(url, data, headers))

        else:
            url = 'https://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                self.host, self.port, self.index)
            response = requests.put(url,
                                    data,
                                    headers=headers,
                                    auth=HTTPBasicAuth(self.username,
                                                       self.password))
            self._result = json.loads(response.text)

        return self

    def delete_index_pattern(self):
        try:
            if self.host == 'localhost':
                url = 'http://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                    self.host, self.port, self.index)
                self._result = json.loads(DELETE(url))
            else:
                url = 'https://{0}:{1}/.kibana/doc/index-pattern:{2}'.format(
                    self.host, self.port, self.index)
                response = requests.delete(url,
                                           auth=HTTPBasicAuth(
                                               self.username, self.password))
                self._result = json.loads(response.text)
        except Exception as error:
            self._result = {'error': error}
        return self

    def delete_data_by_id(self, id):
        return self.es.delete(index=self.index, doc_type='item', id=id)

    def get_data(self, id):
        try:
            return self.es.get(index=self.index, doc_type='item', id=id)
        except NotFoundError:
            return None

    def get_many(self, ids):
        data = self.es.mget(index=self.index,
                            doc_type='item',
                            body={'ids': ids})
        results = {}
        for item in data['docs']:
            _id = item['_id']
            if item['found'] is False:
                results[_id] = None
            else:
                results[_id] = item['_source']
        return results

    def get_data_First_10(self):
        results = self.es.search(index=self.index,
                                 body={"query": {
                                     "match_all": {}
                                 }})
        for result in results['hits']['hits']:
            yield result['_source']

    def get_index_settings(self):
        url = 'https://{3}:{4}@{0}:{1}/{2}/_settings'.format(
            self.host, self.port, self.index, self.username, self.password)
        return json.loads(requests.get(url).text)

    def search_using_lucene(
        self,
        query,
        size=10000,
        sort=None
    ):  # for syntax and examples of lucene queries see https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-query-string-query.html#query-string-syntax
        query = query.replace('“', '"').replace(
            '”', '"')  # fix the quotes we receive from Slack
        results = self.es.search(index=self.index,
                                 q=query,
                                 size=size,
                                 sort=sort)
        for result in results['hits']['hits']:
            yield result['_source']

    def search_using_lucene_index_by_id(
        self,
        query,
        size=10000,
        sort=None
    ):  # for syntax and examples of lucene queries see https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-query-string-query.html#query-string-syntax
        query = query.replace('“', '"').replace(
            '”', '"')  # fix the quotes we receive from Slack
        elk_results = self.es.search(index=self.index,
                                     q=query,
                                     size=size,
                                     sort=sort)
        results = {}
        for result in elk_results['hits']['hits']:
            id = result['_id']
            value = result['_source']
            results[id] = value
        return results

    def search_using_lucene_sort_by_date(
        self,
        query,
        size=10000
    ):  # for syntax and examples of lucene queries see https://www.elastic.co/guide/en/elasticsearch/reference/6.4/query-dsl-query-string-query.html#query-string-syntax
        query = query.replace('“', '"').replace(
            '”', '"')  # fix the quotes we receive from Slack
        elk_results = self.es.search(index=self.index,
                                     q=query,
                                     size=size,
                                     sort="date:desc")
        results = []
        for result in elk_results['hits']['hits']:
            id = result['_id']
            value = result['_source']
            item = {"id": id, "value": value}
            results.append(item)
        return results

    def search_using_query(self, query, size=10000):
        results = self.es.search(index=self.index, body=query, size=size)
        for result in results['hits']['hits']:
            yield result['_source']

    def search_on_field_for_value(self, field, value, size=10000):
        query = {"query": {"match": {field: {"query": value}}}}
        return self.search_using_query(query, size=size)

    def search_on_field_for_values(self, field, values):
        query = {
            "query": {
                "constant_score": {
                    "filter": {
                        "terms": {
                            field: values
                        }
                    }
                }
            }
        }
        return self.search_using_query(query)

    # this is not working
    # def search_get_unique_field_values(self, field,size = 10000):
    #     query = {
    #         "size": 0,
    #         "aggs": {
    #             "unique_ids": {
    #                 "terms": {
    #                     "field": 'field',
    #                     "size": size
    #                 }
    #             }
    #         }
    #     }
    #     return self.search_using_query(query)

    def set_index_settings(self, settings):
        headers = {'Content-Type': 'application/json'}
        url = 'https://{0}:{1}/{2}/_settings'.format(self.host, self.port,
                                                     self.index)
        response = requests.put(url,
                                json.dumps(settings),
                                headers=headers,
                                auth=HTTPBasicAuth(self.username,
                                                   self.password))
        return response.text

    def set_index_settings_total_fields(self, value):
        self.set_index_settings({"index.mapping.total_fields.limit": value})
        return self

    def delete_using_query(self, query):
        results = self.es.delete_by_query(index=self.index, body=query)
        return results

    def delete_index(self):
        if self.exists():
            self._result = self.es.indices.delete(self.index)
        return self

    def index_list(self):
        return set(self.es.indices.get_alias())

    def exists(self):
        return self.es.indices.exists(self.index)

    def set_index(self, index):
        self.index = index
        return self
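A hedged usage sketch for the Elastic_Search helper above; the index name, field and values are invented. Note that search_using_query() and search_on_field_for_value() are generators, so their results have to be iterated:

logs = Elastic_Search(index='iis-logs-2019.01.02')
logs.create_index()
logs.add({'status': 500, 'path': '/checkout', 'date': '2019-01-02'}, id_key='path')
for hit in logs.search_on_field_for_value('status', 500):
    print(hit)
logs.delete_index()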
Example #31
class ElasticClient:
    def __init__(self, address='localhost:10000'):
        self.es = Elasticsearch(address)

    # ------ Simple operations ------
    def index_documents(self):
        df = pd \
                 .read_csv('data/user_ratedmovies.dat', delimiter='\t', nrows=100000) \
                 .loc[:, ['userID', 'movieID', 'rating']]
        means = df.groupby(['userID'], as_index=False, sort=False) \
                    .mean() \
                    .loc[:, ['userID', 'rating']] \
            .rename(columns={'rating': 'ratingMean'})
        df = pd.merge(df, means, on='userID', how="left", sort=False)
        df['ratingNormal'] = df['rating'] - df['ratingMean']

        ratings = df.loc[:, ['userID', 'movieID', 'ratingNormal']] \
            .rename(columns={'ratingNormal': 'rating'}) \
            .pivot_table(index='userID', columns='movieID', values='rating') \
            .fillna(0)

        print("Indexing users...")
        index_users = [{
            "_index": "users",
            "_type": "user",
            "_id": index,
            "_source": {
                'ratings': row[row > 0] \
                    .sort_values(ascending=False) \
                    .index.values.tolist()
            }
        } for index, row in ratings.iterrows()]
        helpers.bulk(self.es, index_users)
        print("Done")
        print("Indexing movies...")
        index_movies = [{
            "_index": "movies",
            "_type": "movie",
            "_id": column,
            "_source": {
                "whoRated": ratings[column][ratings[column] > 0] \
                    .sort_values(ascending=False) \
                    .index.values.tolist()
            }
        } for column in ratings]
        helpers.bulk(self.es, index_movies)
        print("Done")

    def get_movies_liked_by_user(self, user_id, index='users'):
        user_id = int(user_id)
        return self.es.get(index=index, doc_type="user", id=user_id)["_source"]

    def get_users_that_like_movie(self, movie_id, index='movies'):
        movie_id = int(movie_id)
        return self.es.get(index=index, doc_type="movie",
                           id=movie_id)["_source"]

    def get_preselection_for_user(self, user_id, index='users'):
        user_id = int(user_id)

        movies_liked = self.es.search(
            index=index, body={"query": {
                "term": {
                    "_id": user_id
                }
            }})["hits"]["hits"][0]["_source"]["ratings"]

        users_with_similar_taste = self.es.search(
            index=index, body={"query": {
                "terms": {
                    "ratings": movies_liked
                }
            }})["hits"]["hits"]

        recommended_set = set()
        for ratings in users_with_similar_taste:
            if ratings["_id"] != user_id:
                ratings = ratings["_source"]["ratings"]
                for rating in ratings:
                    if rating not in movies_liked:
                        recommended_set.add(rating)

        return list(recommended_set)

    def get_preselection_for_movie(self, movie_id, index='movies'):
        movie_id = int(movie_id)

        users_liking = self.es.search(
            index=index, body={"query": {
                "term": {
                    "_id": movie_id
                }
            }})["hits"]["hits"][0]["_source"]["whoRated"]

        movies_liked_by_the_same_people = self.es.search(
            index=index, body={"query": {
                "terms": {
                    "whoRated": users_liking
                }
            }})["hits"]["hits"]

        recommended_set = set()
        for ratings in movies_liked_by_the_same_people:
            if ratings["_id"] != movie_id:
                ratings = ratings["_source"]["whoRated"]
                for rating in ratings:
                    if rating not in users_liking:
                        recommended_set.add(rating)

        return list(recommended_set)

    def add_user_document(self,
                          user_id,
                          movies_liked,
                          user_index='users',
                          movie_index='movies'):
        user_id = int(user_id)
        self.es.index(index=user_index,
                      doc_type='user',
                      id=user_id,
                      body={"ratings": movies_liked})
        for e in movies_liked:
            temp = list(
                self.get_users_that_like_movie(e, movie_index)["whoRated"])
            temp.append(user_id)
            self.update_movie_document(int(e), temp, movie_index)

    def add_movie_document(self,
                           movie_id,
                           users_liking,
                           movie_index='movies',
                           user_index='users'):
        movie_id = int(movie_id)
        self.es.index(index=movie_index,
                      doc_type='movie',
                      id=movie_id,
                      body={"whoRated": users_liking})
        for e in users_liking:
            temp = list(
                self.get_movies_liked_by_user(e, user_index)["ratings"])
            temp.append(movie_id)
            self.update_user_document(int(e), temp, user_index)

    def update_user_document(self, user_id, movies_liked, user_index='users'):
        user_id = int(user_id)
        self.es.index(index=user_index,
                      doc_type='user',
                      id=user_id,
                      body={"ratings": movies_liked})

    def update_movie_document(self,
                              movie_id,
                              users_liking,
                              movie_index='movies'):
        movie_id = int(movie_id)
        self.es.index(index=movie_index,
                      doc_type='movie',
                      id=movie_id,
                      body={"whoRated": users_liking})

    def bulk_user_update(self, body, user_index):
        for e in body:
            user_id = int(e["user_id"])

            movies_liked_before = self.get_movies_liked_by_user(
                user_id, user_index)["ratings"]
            for movie in list(movies_liked_before):
                temp = list(self.get_users_that_like_movie(movie)["whoRated"])
                if user_id in temp:
                    temp.remove(user_id)
                self.update_movie_document(int(movie), temp)

            self.es.index(index=user_index,
                          doc_type='user',
                          id=user_id,
                          body={"ratings": e["liked_movies"]})

            movies_liked_now = list(e["liked_movies"])
            for movie in list(movies_liked_now):
                temp = list(self.get_users_that_like_movie(movie)["whoRated"])
                temp.append(user_id)
                self.update_movie_document(int(movie), temp)

    def bulk_movie_update(self, body, movie_index):
        for e in body:
            movie_id = int(e["movie_id"])

            users_liking_before = self.get_users_that_like_movie(
                movie_id, movie_index)["whoRated"]
            for user in list(users_liking_before):
                temp = list(self.get_movies_liked_by_user(user)["ratings"])
                if movie_id in temp:
                    temp.remove(movie_id)
                self.update_user_document(int(user), temp)
            self.es.index(index=movie_index,
                          doc_type='movie',
                          id=movie_id,
                          body={"ratings": e["users_who_liked_movie"]})

            users_liking_now = list(e["users_who_liked_movie"])
            for user in list(users_liking_now):
                temp = list(self.get_movies_liked_by_user(user)["ratings"])
                temp.append(movie_id)
                self.update_user_document(int(user), temp)

    def delete_user_document(self, user_id, user_index, movie_index='movies'):
        user_id = int(user_id)
        movies_liked = self.get_movies_liked_by_user(user_id,
                                                     user_index)["ratings"]
        self.es.delete(index=user_index, doc_type="user", id=user_id)
        for e in list(movies_liked):
            temp = list(
                self.get_users_that_like_movie(e, movie_index)["whoRated"])
            if user_id in temp:
                temp.remove(user_id)
            self.update_movie_document(int(e), temp)

    def delete_movie_document(self, movie_id, movie_index, user_index='users'):
        movie_id = int(movie_id)
        users_liking = self.get_users_that_like_movie(movie_id,
                                                      movie_index)["whoRated"]
        self.es.delete(index=movie_index, doc_type="movie", id=movie_id)
        for e in list(users_liking):
            temp = list(
                self.get_movies_liked_by_user(e, user_index)["ratings"])
            if movie_id in temp:
                temp.remove(movie_id)
            self.update_user_document(int(e), temp)

    def create_index(self, index):
        self.es.indices.create(index=index,
                               body={
                                   "settings": {
                                       "number_of_shards": 5,
                                       "number_of_replicas": 1
                                   }
                               })

    def get_indexes(self):
        return self.es.indices.get_alias()

    def reindex(self, old_index, new_index):
        helpers.reindex(self.es,
                        source_index=old_index,
                        target_index=new_index)

    def delete_index(self, index):
        self.es.indices.delete(index=index, ignore=[400, 404])
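
A minimal usage sketch, not part of the original snippet: the class name MovieRecommender and its constructor are assumptions (the actual definition sits above this excerpt and only needs self.es to be an Elasticsearch client). It shows how the user and movie indices are kept in sync and how a preselection is obtained.

# Hypothetical usage of the recommender wrapper above; ids and index contents are invented.
rec = MovieRecommender()                   # assumed: __init__ sets self.es = Elasticsearch()
rec.create_index('users')
rec.create_index('movies')
rec.add_movie_document(1, [])              # movie 1, no likes yet
rec.add_movie_document(2, [])
rec.add_user_document(10, [1])             # user 10 likes movie 1
rec.add_user_document(11, [1, 2])          # user 11 likes movies 1 and 2
rec.es.indices.refresh(index='users')      # make the new documents searchable
print(rec.get_preselection_for_user(10))   # expected: [2], liked by a user with similar taste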
Example #32
0
class ElasticSearchClass(object):
    def __init__(self, host, port, user, password):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.connect()

    def connect(self):
        self.es = Elasticsearch(hosts=[{
            'host': self.host,
            'port': self.port
        }],
                                http_auth=(self.user, self.password))
        return self.es

    def insertDocument(self, index, type, body, id=None):
        """
        插入一条数据body到指定的index、指定的type下;可指定Id,若不指定,ES会自动生成
        :param index: 待插入的index值
        :param type: 待插入的type值
        :param body: 待插入的数据 -> dict型
        :param id: 自定义Id值
        :return:
        """
        return self.es.index(index=index, doc_type=type, body=body, id=id)

    def count(self, indexname):
        """
        :param indexname:
        :return: 统计index总数
        """
        return self.conn.count(index=indexname)

    def delete(self, indexname, doc_type, id):
        """
        :param indexname:
        :param doc_type:
        :param id:
        :return: deletes a single document from the index
        """
        self.es.delete(index=indexname, doc_type=doc_type, id=id)

    def get(self, doc_type, indexname, id):
        return self.es.get(index=indexname, doc_type=doc_type, id=id)

    def searchindex(self, index):
        """
        Return all documents in the index.
        """
        try:
            return self.es.search(index=index)
        except Exception as err:
            print(err)

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Find all documents in the index that match the given query.
        :param index:
        :param type:
        :param body: query statement in Elasticsearch DSL format
        :return:
        '''
        return self.es.search(index=index, doc_type=type, body=body)

    def search(self, index, type, body, size=10, scroll='10s'):
        """
        Search by index and type.
        size defaults to 10 hits; it can be increased, but not above 10000.
        """
        return self.es.search(index=index,
                              doc_type=type,
                              body=body,
                              size=size,
                              scroll=scroll)

    def scroll(self, scroll_id, scroll):
        """
        Continue the previous search and fetch all remaining matching documents.
        """
        return self.es.scroll(scroll_id=scroll_id, scroll=scroll)
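
A brief usage sketch, not from the original: the host, credentials, and index name are assumptions. It shows how search() and scroll() are meant to be paired to page through every hit of a query.

# Hypothetical usage of ElasticSearchClass; connection details and index are assumptions.
client = ElasticSearchClass('localhost', 9200, 'elastic', 'changeme')
page = client.search(index='logs', type='log',
                     body={'query': {'match_all': {}}}, size=100, scroll='2m')
scroll_id = page['_scroll_id']
hits = page['hits']['hits']
while hits:
    for hit in hits:
        print(hit['_source'])
    page = client.scroll(scroll_id=scroll_id, scroll='2m')
    scroll_id = page['_scroll_id']
    hits = page['hits']['hits']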
Example #33
0
                flag += 1
                for il in pages[1:]:
                    dSet.add(il)
        if (flag == 1000):
            break
    if len(dSet) <= 500:
        baseSet = baseSet.union(dSet)
    else:
        dSet = random.sample(dSet, 500)
    textFile.close()

print(len(baseSet))

for link in baseSet:
    olSet = set()
    res = es.get(index="hw3_crawl", doc_type='document', id=link)
    outLinks = set(res['_source'].get("outlinks").strip().split('\n'))
    for ol in outLinks:
        if ol in graphPages:
            olSet.add(ol)
    graphPages[link] = Page(link, 1.0, 1.0, set(), olSet)

print(len(graphPages))
with open("linkgraph.txt", 'r') as textFile:
    for line in textFile.readlines():
        plSet = set()
        pages = line.replace(' \n', '').replace('\n', '').split(' ')
        for link in baseSet:
            if (pages[0] == link):
                for p in pages[1:]:
                    if p in graphPages:
class ElasticsearchDDL(object):
    def __init__(self, host='localhost', port=9200):
        self.es = Elasticsearch([{
            'host': host,
            'port': port
        }],
                                timeout=100)

    def createIndex(self, indexname):
        self.es.indices.create(index=indexname)

    def deleteIndex(self, indexname):
        self.es.indices.delete(index=indexname)

    def bulkInsert(self, indexname, doctype, data, no):
        datadim = len(data)
        bulk_data = []
        i = 0
        for elem in data:
            data_dict = {"id": i}
            if type(elem) is dict:
                if "title" in elem:
                    data_dict["title"] = elem["title"]
                if "text" in elem:
                    data_dict["text"] = elem["text"]
            else:
                data_dict["text"] = elem

            op_dict = {
                "index": {
                    "_index": indexname,
                    "_type": doctype,
                    "_id": data_dict["id"]
                }
            }
            bulk_data.append(op_dict)
            bulk_data.append(data_dict)
            if i % no == 0 or i == datadim - 1:
                self.es.bulk(index=indexname, body=bulk_data, refresh=True)
                bulk_data = []
            i += 1

    def searchByCollocation(self, indexname, w1, w2):
        return self.es.search(index=indexname,
                              body={
                                  "query": {
                                      "span_near": {
                                          "clauses": [{
                                              "span_term": {
                                                  "text": w1
                                              }
                                          }, {
                                              "span_term": {
                                                  "text": w2
                                              }
                                          }],
                                          "slop":
                                          6,
                                          "in_order":
                                          True
                                      }
                                  },
                                  "highlight": {
                                      "fields": {
                                          "text": {}
                                      }
                                  }
                              },
                              size=5)

    def searchByBigram(self, indexname, bigram):
        return self.es.search(index=indexname,
                              body={
                                  "query": {
                                      "multi_match": {
                                          "query": bigram,
                                          "type": "phrase",
                                          "fields": ["text"]
                                      }
                                  },
                                  "highlight": {
                                      "fields": {
                                          "text": {}
                                      }
                                  }
                              },
                              size=5)

    def selectByQuery(self, indexname, query={}):
        res = self.es.search(index=indexname,
                             body={"query": {
                                 "match_all": query
                             }})
        return res

    def selectOneByID(self, indexname, doctype, id):
        result = self.es.get(index=indexname, doc_type=doctype,
                             id=id)['_source']
        return result

    def indexExists(self, indexname):
        return self.es.indices.exists(index=indexname)
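
A minimal usage sketch with an invented index name and toy data (not part of the original): bulkInsert emits an action line plus a source line per document and flushes a batch every `no` documents, so it can be fed a plain list of strings or dicts.

# Hypothetical usage of ElasticsearchDDL; index name, doc type, and data are assumptions.
ddl = ElasticsearchDDL(host='localhost', port=9200)
if not ddl.indexExists('demo_texts'):
    ddl.createIndex('demo_texts')
docs = ['the quick brown fox', {'title': 'greeting', 'text': 'hello world'}]
ddl.bulkInsert('demo_texts', 'doc', docs, no=100)
print(ddl.searchByBigram('demo_texts', 'hello world')['hits']['total'])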
Example #35
0
        'port': 9200
    }],
    sniff_on_start=True,
    # refresh nodes after a node fails to respond
    sniff_on_connection_fail=True,
    # and also every 60 seconds
    sniffer_timeout=60,
    # set sniffing request timeout to 10 seconds
    sniff_timeout=10)

# Index some test data
print("#############\nFirst data test\n#############")
es.index(index='test-index', doc_type='test', id=1, body={'test': 'test'})

# Test if they are there
res = es.get(index='test-index', doc_type='test', id=1)
print(json.dumps(res['_source'], indent=4, sort_keys=True))

# Delete test data and try with something more interesting
delete = es.delete(index='test-index', doc_type='test', id=1)
print(delete)

print('\n')
if __debug__:
    print("No sleep")
else:
    time.sleep(3)

# Index some more complicated test data
print("#############\nSecond data test\n#############")
es.index(index='sw',
Example #36
0
class DCIESEngine(object):
    def __init__(self, conf, index="global", timeout=30):
        self.esindex = index
        self.conn = Elasticsearch(conf['ES_HOST'],
                                  port=conf['ES_PORT'],
                                  timeout=timeout)

    def create_index(self):
        self.conn.indices.create(index=self.esindex)

    def get(self, id, team_id=None):
        res = self.conn.get(index=self.esindex, doc_type='log', id=id)
        if team_id:
            if res:
                if res['_source']['team_id'] != team_id:
                    res = {}
        return res

    def delete(self, id):
        self.conn.delete(index=self.esindex, doc_type='log', id=id)
        return True

    def list(self, include=None, exclude=None):

        query = {"size": 10000, "query": {"match_all": {}}}
        if include:
            query['_source'] = {'include': include}
        if exclude:
            query['_source'] = {'exclude': exclude}
        if self.conn.indices.exists(index=self.esindex):
            return self.conn.search(index=self.esindex, body=query)
        else:
            return None

    def index(self, values):
        return self.conn.index(index=self.esindex,
                               doc_type='log',
                               id=values['id'],
                               body=values)

    def refresh(self):
        return self.conn.indices.refresh(index=self.esindex, force=True)

    def search_content(self, pattern, team_id=None):
        if team_id:
            query = {
                "query": {
                    "filtered": {
                        "filter": {
                            "match": {
                                "team_id": team_id
                            }
                        },
                        "query": {
                            "match": {
                                "content": pattern
                            }
                        }
                    }
                }
            }
        else:
            query = {"query": {"match": {"content": pattern}}}

        return self.conn.search(index=self.esindex,
                                body=query,
                                request_cache=False,
                                size=100)

    def cleanup(self):
        if self.conn.indices.exists(index=self.esindex):
            return self.conn.indices.delete(index=self.esindex)
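
A short usage sketch under assumed configuration (the ES_HOST/ES_PORT values, index name, and sample document are not from the original):

# Hypothetical usage of DCIESEngine.
engine = DCIESEngine({'ES_HOST': 'localhost', 'ES_PORT': 9200}, index='jobs')
engine.create_index()
engine.index({'id': 'job-1', 'team_id': 'team-a', 'content': 'first log line'})
engine.refresh()
print(engine.get('job-1', team_id='team-a'))    # full document
print(engine.get('job-1', team_id='team-b'))    # {} because the team_id does not match
print(engine.search_content('log', team_id='team-a'))
engine.cleanup()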
Example #37
0
    d["summary"] = re.sub("<[^<]+?>", "", jo["summary"])
    ldocs.append(d)

# connect to elastic
es = Elasticsearch([{"host": "localhost", "port": 9200}])

# iterate through documents indexing them
for doc in ldocs:
    es.index(index="tvshows",
             doc_type="bigbang",
             id=doc["id"],
             body=json.dumps(doc))

# python elasticsearch get by id
print("###########")
print(es.get(index="tvshows", doc_type="bigbang", id=2915))
print("###########")

# term search
print("term search")

print(
    es.search(
        index="tvshows",
        doc_type="bigbang",
        body={"query": {
            "match": {
                "summary": "rivalry"
            }
        }},
    ))
Example #38
0
    + DASHBOARD_NAME
dashboard['uiStateJSON'] = '{}'
dashboard['optionsJSON'] = '{"darkTheme":false}'
dashboard['version'] = 1
dashboard['timeRestore'] = False
dashboard['kibanaSavedObjectMeta'] = {
    'searchSourceJSON':
    '{"filter":[{"query":{"query_string":{"query":"*","analyze_wildcard":true}}}],'
    '"highlightAll":true,"version":true}'
}

# Check whether the visualizations are already present in the dashboard. If they are, don't add them; otherwise, append them at the end.

ES_ID = DASHBOARD_NAME
try:
    res = es.get(index='.kibana', doc_type='dashboard', id=ES_ID)
    print(json.dumps(res, indent=4))
    # No exception occurred, so the dashboard was found
    dashboard_found = True
except exceptions.NotFoundError as e:
    print('No visualizations found')
    dashboard_found = False
except Exception as e:
    print(e)
    print('Error Occurred')

vis_ids_present = set()
panelsJSON = []

if dashboard_found:
    panelsJSON = yaml.safe_load(res['_source']['panelsJSON'])
Example #39
0
from elasticsearch import Elasticsearch
es = Elasticsearch()

res = es.get(index="trec_news", doc_type='news_articles', id='00f57310e5c8ec7833d6756ba637332e')
print(res['_source'])
class KnowledgeBase(object):
    """ 
    Represents interface to Knowledge Base.

    The Knowledge base uses Elasticsearch to provide primary storage and
    indexing for the articles. It leverages the default analyzer for text
    preprocessing at indexing time.
    
    View counts for articles are stored in Redis. The count could be a field in
    the ES mapping for the article, but that would mean reindexing the document
    every time an article is viewed. To avoid that unnecessary burden on the
    primary storage, the knowledge base uses an in-memory key-value store that
    is fast to read and to update.

    The class encapsulates interactions with ElasticSearch including initializing
    indices and mappings.
    """

    # ES Index and Type name.
    INDEX = 'articles'
    TYPE = 'article'
    
    # Path to JSON mappings for Index and Type.
    INDEX_PATH = 'mappings/index_mapping.json'
    TYPE_PATH = 'mappings/type_mapping.json'

    # Configuration items for initializing connections with
    # databases. Ideally, these would be stored separately
    # in YAML format or managed using something like
    # Zookeeper.
    HOSTS = ['localhost']
    USERNAME = '******'
    PASSWORD = '******'

    def __init__(self):
        # Initialize persistent connections to ES and Redis.
        self.client = Elasticsearch(
            hosts=self.HOSTS,
            http_auth=(self.USERNAME, self.PASSWORD),
        )
        self.redis = Redis(
            host=self.HOSTS[0],
        )

    def search(self, query_text, locale=None, fields=None):
        """
        Return relevant articles given search text.

        Finding the query term in the title of an article is given twice
        as much weight as finding the text in the body.

        After the most relevant articles are obtained, they are ranked by the
        ranking module (uses view counts here, but can be easily extended).
        
        Args:
            query_text(str): Text to be searched.
            locale(str): String to filter results by location.
            fields(list(str)): If specified, restrict the fields returned to
                this list.

        Returns:
            list[dict]: Returns a ranked list of dictionaries representing articles
                [
                    {
                        'id': str,
                        'title': str,
                        'body': str,
                        'locale': str,
                    },
                    .
                    .
                ]
                
        """
        # Create Search object to "match" query text against the title and body
        # of articles stored in the Knowledge base.
        s = Search(
            using=self.client,
            index=self.INDEX,
            doc_type=self.TYPE
        ).query(
            'multi_match',
            query=query_text,
            fields=['title^2', 'body']
        )
        
        # If locale is provided, use it to filter the set of documents that are
        # queried for.
        if locale:
            s = s.filter('term', locale=locale)

        # Restrict fields if specified.
        s = s.source(fields)
   
        response = s.execute()
        results, result_dict = [], {}
        for hit in response:
            article_id = hit.meta['id']
            result_dict[article_id] = hit.__dict__['_d_']
            result_dict[article_id]['id'] = article_id

            # Retrieve view count for each relevant article.
            results.append((article_id, self.redis.get(article_id)))

        # Rank results using Ranking function. Currently sorts relevant results by
        # view counts.
        ranked_results = Ranker.rank(results)
        ranked_articles = [result_dict[article_id] for article_id in ranked_results]

        return ranked_articles

    def get(self, article_id, fields=None):
        """
        Return an article specified by the given article_id.

        Increments view count for the specific article as well.

        Args:
            article_id(str): Unique ID representing an article in the knowledge base.
            fields(list[str]): If specified, restrict the fields returned to
                this list.

        Returns:
            dict: Dictionary representing a document, of the following format
                {
                    'id': str,
                    'title': str,
                    'body': str,
                    'locale': str,
                }
                Returns None if no article matching the id is found.
        """ 
        try:
            response = self.client.get(
                index=self.INDEX,
                doc_type=self.TYPE,
                id=article_id,
                _source=fields,
            )
        except:
            return None
        else:
            # Increment view count for accessed article.
            self.redis.incr(article_id)
            article = response['_source']
            article['id'] = article_id
            
            return article

    def index(self, article, refresh=True):
        """
        Index an article in the Knowledge Base.

        Initializes view count for the indexed article as well.

        Args:
            article(dict): Dictionary representing an article in the knowledge base.
                Must follow the field names defined in the mapping.

        Returns:
            tuple(bool, str): Returns (True, article_id) if article is successfully indexed.
                (False, None) otherwise.
        """
        try:
            response = self.client.index(
                index=self.INDEX,
                doc_type=self.TYPE,
                body=article,
                refresh=refresh,
            )
        except:
            return False, None
        else:
            # Initialize view count for newly indexed article.
            self.redis.set(response['_id'], 0)
            return response['created'], response['_id'] 

    def delete(self, article_id, refresh=True):
        """
        Delete an article from the Knowledge Base.

        Removes view count for the specific article as well.

        Args:
            article_id(str): Unique ID representing an article in the knowledge base.

        Returns:
            bool: Returns True if article is successfully deleted. False otherwise.
        """
        try:
            response = self.client.delete(
                index=self.INDEX,
                doc_type=self.TYPE,
                id=article_id,
                refresh=refresh,
            )
        except:
            return False
        else:
            # Remove article key from Redis.
            self.redis.delete(article_id)
            return response['found']

    def _in_bulk(self, objects):
        """
        Helper function to facilitate bulk operations.

        Args:
            objects(list[dict]): A list of dictionaries of the format
                [
                    {
                        '_op_type': String representing operation, valid choices are
                            'index', 'create', 'update' and 'delete',
                        'body': Contains updated document or new document to be created,
                        'id': ID of article to be deleted. 
                    },
                    .
                    .
                ]
        """
        bulk(self.client, objects, index=self.INDEX)

    def _init_index(self):
        """
        Helper method to initialize Knowledge base store.

        Uses the JSON mappings to initialize an Elasticsearch index with a type mapping
        for storing articles.

        Returns:
            tuple(bool, str): Returns a boolean value representing whether the index and
                mapping were initialized and a string representing the status.
        """
        index_mapping = json.load(open(self.INDEX_PATH))
        type_mapping = json.load(open(self.TYPE_PATH))

        try:
            self.client.indices.create(
                index='articles',
                body=index_mapping,
            )
        except:
            return False, 'Failed to create Index'

        try:
            self.client.indices.put_mapping(
                index='articles',
                doc_type='article',
                body=type_mapping,
            )
        except:
            return False, 'Failed to put Mapping'
        
        return True, 'Successfully initialized Index'
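
A minimal usage sketch (assumes a local Elasticsearch and Redis, and that the mapping files referenced by INDEX_PATH and TYPE_PATH exist; the article fields are invented):

# Hypothetical usage of KnowledgeBase.
kb = KnowledgeBase()
kb._init_index()
ok, article_id = kb.index({'title': 'Resetting your password',
                           'body': 'Go to settings and choose reset.',
                           'locale': 'en-US'})
if ok:
    print(kb.get(article_id))                          # also bumps the Redis view count
    print(kb.search('reset password', locale='en-US'))
    kb.delete(article_id)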
Example #41
0
#!/usr/bin/env python
'''
Licensed to Elasticsearch B.V under one or more agreements.
Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
See the LICENSE file in the project root for more information
'''

from elasticsearch import Elasticsearch

es = Elasticsearch()

print("fbcf5078a6a9e09790553804054c36b3 - L:9")
# tag::fbcf5078a6a9e09790553804054c36b3[]
response = es.get(
    index='twitter',
    id=0,
)
# end::fbcf5078a6a9e09790553804054c36b3[]
print("---------------------------------------")
print(response)
print("---------------------------------------")

print("98234499cfec70487cec5d013e976a84 - L:46")
# tag::98234499cfec70487cec5d013e976a84[]
response = es.exists(
    index='twitter',
    id=0,
)
# end::98234499cfec70487cec5d013e976a84[]
print("---------------------------------------")
print(response)
Example #42
0
class ServerConector():
    def __init__(self, **kwargs):
        self.res = {'status': None, 'status_code': None}
        try:
            server = os.environ.get('ELASTIC_SERVER')
            self.el = Elasticsearch([{'host': server, 'port': 9200}])
            if not self.el.ping():
                self.res['status'] = 'Error'
                self.res['status_code'] = 503
        except Exception as e:
            self.res['status'] = 'Error'
            self.res['status_code'] = 503
            app.logger.error('Error connecting to Elasticsearch')
            app.logger.error('Error detail: {}'.format(e))

    def insert_el(self, index, data):
        try:
            resp_dict = self.el.index(index=index, body=data, refresh=True)
            self.res['doc_id'] = resp_dict['_id']
            self.res['status'] = 'Success'
            self.res['status_code'] = 200
        except Exception as e:
            self.res['status'] = 'Error'
            if not self.res['status_code']:
                self.res['status_code'] = 400
            self.res['doc_id'] = None

            app.logger.error(self.res['status'])
            app.logger.error('Error inserting document in index {}. \
                    Error detail: {}'.format(index, e))
        return self.res

    def update_el(self, index, _id, data):
        try:
            self.el.update(index=index,
                           id=_id,
                           body={'doc': data},
                           refresh=True)

            self.res['status'] = 'Success'
            self.res['status_code'] = 200
        except Exception as e:
            self.res['status'] = 'Error'
            if not self.res['status_code']:
                self.res['status_code'] = 400
            app.logger.error(self.res['status'])
            app.logger.error('Error updating document in index {}. \
                Error detail: {}'.format(index, e))
        return self.res

    def retrieve_id_el(self, index, _id, model=None):
        try:
            if not model:
                model = {}

            self.res = self.el.get(index=index, id=_id)

            model['doc_id'] = _id
            for key in self.res['_source']:
                model[key] = self.res['_source'][key]

            self.res = model
            self.res['status'] = 'Success'
            self.res['status_code'] = 200

        except Exception as e:
            self.res['status'] = 'Error'
            if not self.res['status_code']:
                self.res['status_code'] = e.status_code
            app.logger.error(self.res['status'])
            app.logger.error('Error getting document in index {}. \
                Error detail: {}'.format(index, e))
        return self.res

    def retrieve_el(self,
                    index,
                    unique_hash_field,
                    unique_hash_value,
                    model=None):
        try:
            if not model:
                model = {}

            self.res = self.el.search(index=index,
                                      body={
                                          'query': {
                                              'match_phrase': {
                                                  unique_hash_field:
                                                  unique_hash_value
                                              }
                                          }
                                      })
            self.res['status'] = 'Success'
            self.res['status_code'] = 200

            self.res['resp_list'] = []

            for doc in self.res['hits']['hits']:

                resp_dict = model.copy()

                for key in doc['_source']:
                    resp_dict[key] = doc['_source'][key]

                resp_dict['doc_id'] = doc['_id']

                self.res['resp_list'].append(resp_dict)

            if not self.res['resp_list']:
                self.res['status_code'] = 404

            return {
                'resp_list': self.res['resp_list'],
                'status': self.res['status'],
                'status_code': self.res['status_code']
            }

        except Exception as e:
            self.res['status'] = 'Error'
            if not self.res['status_code']:
                self.res['status_code'] = e.status_code
            app.logger.error(self.res['status'])
            app.logger.error('Error getting document in index {}. \
                Error detail: {}'.format(index, e))
        return self.res

    def retrieve_all_el(self, index):
        try:

            self.res = self.el.search(index=index,
                                      body={'query': {
                                          'match_all': {}
                                      }})
            self.res['status'] = 'Success'
            self.res['status_code'] = 200

            if self.res['status'] == 'Success':

                self.res['resp_list'] = []

                for doc in self.res['hits']['hits']:

                    resp_dict = {}

                    resp_dict['doc_id'] = doc['_id']

                    for key in doc['_source']:
                        resp_dict[key] = doc['_source'][key]

                    self.res['resp_list'].append(resp_dict)

                if not self.res['resp_list']:
                    self.res['status_code'] = 404

                return {
                    'resp_list': self.res['resp_list'],
                    'status': self.res['status'],
                    'status_code': self.res['status_code']
                }

        except Exception as e:
            self.res['status'] = 'Error'
            if not self.res['status_code']:
                self.res['status_code'] = e.status_code
            app.logger.error(self.res['status'])
            app.logger.error('Error getting document in index {}. \
                Error detail: {}'.format(index, e))
        return self.res
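
A short usage sketch (assumes the ELASTIC_SERVER environment variable points at a reachable node and that `app` is the enclosing Flask application; the index name and payloads are invented):

# Hypothetical usage of ServerConector.
conector = ServerConector()
created = conector.insert_el('profiles', {'hash': 'abc123', 'name': 'Ada'})
found = conector.retrieve_el('profiles', 'hash', 'abc123')
updated = conector.update_el('profiles', created['doc_id'], {'name': 'Ada Lovelace'})
print(found['resp_list'], updated['status_code'])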
Example #43
0
class SearchEngine(object):
    def __init__(self, prefix=settings.ELASTICSEARCH_PREFIX):
        #
        serializer = JSONSerializer()
        serializer.mimetype = 'application/json'
        serializer.dumps = serializer.serialize
        serializer.loads = JSONDeserializer().deserialize
        self.es = Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS,
                                serializer=serializer,
                                **settings.ELASTICSEARCH_CONNECTION_OPTIONS)
        self.logger = logging.getLogger(__name__)
        self.prefix = prefix.lower()

    def _add_prefix(self, *args, **kwargs):
        if args:
            index = args[0].strip()
        else:
            index = kwargs.get('index', '').strip()
        if index is None or index == '':
            raise NotImplementedError("Elasticsearch index not specified.")

        prefix = '%s_' % self.prefix.strip(
        ) if self.prefix and self.prefix.strip() != '' else ''
        ret = []
        for idx in index.split(','):
            ret.append('%s%s' % (prefix, idx))

        index = ','.join(ret)
        if args:
            return index
        else:
            return dict(kwargs, index=index)

    def delete(self, **kwargs):
        """
        Deletes a document from the index
        Pass an index and id to delete a specific document
        Pass a body with a query dsl to delete by query

        """

        kwargs = self._add_prefix(**kwargs)
        kwargs['doc_type'] = kwargs.pop('doc_type', '_doc')
        body = kwargs.pop('body', None)
        if body != None:
            try:
                data = []
                refresh = kwargs.pop('refresh', False)
                for hit in helpers.scan(self.es, query=body, **kwargs):
                    hit['_op_type'] = 'delete'
                    data.append(hit)

                return helpers.bulk(self.es, data, refresh=refresh, **kwargs)
            except Exception as detail:
                try:
                    # ignore 404 errors (index_not_found_exception)
                    if detail.status_code == 404:
                        pass
                except:
                    self.logger.warning(
                        '%s: WARNING: failed to delete document by query: %s \nException detail: %s\n'
                        % (datetime.now(), body, detail))
                    raise detail
        else:
            try:
                return self.es.delete(ignore=[404], **kwargs)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to delete document: %s \nException detail: %s\n'
                    % (datetime.now(), body, detail))
                raise detail

    def delete_index(self, **kwargs):
        """
        Deletes an entire index

        """

        kwargs = self._add_prefix(**kwargs)
        print('deleting index : %s' % kwargs.get('index'))
        return self.es.indices.delete(ignore=[400, 404], **kwargs)

    def search(self, **kwargs):
        """
        Search for an item in the index.
        Pass an index and id to get a specific document
        Pass a body with a query dsl to perform a search

        """

        kwargs = self._add_prefix(**kwargs)
        kwargs['doc_type'] = kwargs.pop('doc_type', '_doc')
        body = kwargs.get('body', None)
        id = kwargs.get('id', None)

        if id:
            if isinstance(id, list):
                kwargs.setdefault('body', {'ids': kwargs.pop('id')})
                return self.es.mget(**kwargs)
            else:
                return self.es.get(**kwargs)

        ret = None
        try:
            ret = self.es.search(**kwargs)
        except Exception as detail:
            self.logger.warning(
                '%s: WARNING: search failed for query: %s \nException detail: %s\n'
                % (datetime.now(), body, detail))
            pass

        return ret

    def create_mapping(self,
                       index,
                       fieldname='',
                       fieldtype='string',
                       fieldindex=None,
                       body=None):
        """
        Creates an Elasticsearch body for a single field given an index name and type name

        """

        index = self._add_prefix(index)
        if not body:
            if fieldtype == 'geo_shape':
                body = {
                    '_doc': {
                        'properties': {
                            fieldname: {
                                'type': 'geo_shape',
                                'tree': 'geohash',
                                'precision': '1m'
                            }
                        }
                    }
                }
            else:
                fn = {'type': fieldtype}
                if fieldindex:
                    fn['index'] = fieldindex
                body = {'_doc': {'properties': {fieldname: fn}}}

        self.es.indices.create(index=index, ignore=400)
        self.es.indices.put_mapping(index=index, doc_type='_doc', body=body)
        print('creating index : %s' % index)

    def create_index(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        self.es.indices.create(ignore=400, **kwargs)
        print('creating index : %s' % kwargs.get('index', ''))

    def index_data(self,
                   index=None,
                   body=None,
                   idfield=None,
                   id=None,
                   **kwargs):
        """
        Indexes a document or list of documents into Elasticsearch

        If "id" is supplied then will use that as the id of the document

        If "idfield" is supplied then will try to find that property in the
            document itself and use the value found for the id of the document

        """

        index = self._add_prefix(index)
        if not isinstance(body, list):
            body = [body]

        for document in body:
            if idfield is not None:
                if isinstance(document, dict):
                    id = document[idfield]
                else:
                    id = getattr(document, idfield)

            try:
                self.es.index(index=index,
                              doc_type='_doc',
                              body=document,
                              id=id)
            except Exception as detail:
                self.logger.warning(
                    '%s: WARNING: failed to index document: %s \nException detail: %s\n'
                    % (datetime.now(), document, detail))
                raise detail

    def bulk_index(self, data, **kwargs):
        return helpers.bulk(self.es, data, **kwargs)

    def create_bulk_item(self,
                         op_type='index',
                         index=None,
                         id=None,
                         data=None):
        return {
            '_op_type': op_type,
            '_index': self._add_prefix(index),
            '_type': '_doc',
            '_id': id,
            '_source': data
        }

    def count(self, **kwargs):
        kwargs = self._add_prefix(**kwargs)
        kwargs['doc_type'] = kwargs.pop('doc_type', '_doc')
        body = kwargs.pop('body', None)

        # need to only pass in the query key as other keys (eg: _source) are not allowed
        if body:
            query = body.pop('query', None)
            if query:
                kwargs['body'] = {'query': query}

        count = self.es.count(**kwargs)
        if count is not None:
            return count['count']
        else:
            return None

    def BulkIndexer(outer_self, batch_size=500, **kwargs):
        class _BulkIndexer(object):
            def __init__(self, **kwargs):
                self.queue = []
                self.batch_size = kwargs.pop('batch_size', 500)
                self.kwargs = kwargs

            def add(self, op_type='index', index=None, id=None, data=None):
                doc = {
                    '_op_type': op_type,
                    '_index': outer_self._add_prefix(index),
                    '_type': '_doc',
                    '_id': id,
                    '_source': data
                }
                self.queue.append(doc)

                if len(self.queue) >= self.batch_size:
                    outer_self.bulk_index(self.queue, **self.kwargs)
                    del self.queue[:]  #clear out the array

            def close(self):
                outer_self.bulk_index(self.queue, **self.kwargs)

            def __enter__(self, **kwargs):
                return self

            def __exit__(self, type, value, traceback):
                return self.close()

        return _BulkIndexer(batch_size=batch_size, **kwargs)
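
A short usage sketch (the prefix, index name, and documents are assumptions; settings.ELASTICSEARCH_* must already be configured): BulkIndexer is meant to be used as a context manager so the final partial batch is flushed by close().

# Hypothetical usage of SearchEngine and its BulkIndexer.
engine = SearchEngine(prefix='arches')
engine.create_index(index='resources')
with engine.BulkIndexer(batch_size=250) as indexer:
    for i in range(1000):
        indexer.add(op_type='index', index='resources', id=i,
                    data={'displayname': 'resource %s' % i})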
Example #44
0
DEBUG = True if __name__ == '__main__' else False

faker = Factory.create()
es = Elasticsearch()


def get_name():
    return {
        'name': faker.name(),
        'email': faker.email(),
        'address': faker.address(),
        'timestamp': dt.now(),
    }


@test_speed
def insert_all(max_records):
    for n in range(max_records):
        res = es.index(index='testing_index',
                       doc_type='test',
                       id=n,
                       body=get_name())
        print(res)


if DEBUG:
    with Section('ElasticSearch (via ElasticSearch-py)'):
        insert_all(10)
        res = es.get(index='testing_index', doc_type='test', id=1)
        prnt('ES Results:', res)
Example #45
0
print('start:', start)

many = 0
count = 0
fail = 0
while True:
    if (many < 1):
        sval = input('How many messages:')
        if (len(sval) < 1): break
        many = int(sval)

    start = start + 1

    # Skip rows that are already retrieved
    try:
        res = es.get(index='gmane', doc_type='message', id=start)
        print(res)
        continue
    except:
        pass

    many = many - 1
    url = baseurl + str(start) + '/' + str(start + 1)

    text = 'None'
    try:
        # Open with a timeout of 30 seconds
        response = requests.get(url)
        text = response.text
        status = response.status_code
        if status != 200:
Example #46
0
class ElasticDocRanker(object):
    """ Connect to an ElasticSearch index.
        Score query-document pairs using Elasticsearch relevance scores.
    """
    def __init__(self,
                 elastic_url=None,
                 elastic_index=None,
                 elastic_fields=None,
                 elastic_field_doc_name=None,
                 strict=True,
                 elastic_field_content=None):
        """
        Args:
            elastic_url: URL of the ElasticSearch server containing port
            elastic_index: Index name of ElasticSearch
            elastic_fields: Fields of the Elasticsearch index to search in
            elastic_field_doc_name: Field containing the name of the document (index)
            strict: fail on empty queries or continue (and return empty result)
            elastic_field_content: Field containing the content of the document in plain text
        """
        # Load from disk
        elastic_url = elastic_url or DEFAULTS['elastic_url']
        logger.info('Connecting to %s' % elastic_url)
        self.es = Elasticsearch(hosts=elastic_url)
        self.elastic_index = elastic_index
        self.elastic_fields = elastic_fields
        self.elastic_field_doc_name = elastic_field_doc_name
        self.elastic_field_content = elastic_field_content
        self.strict = strict

    # Elastic Ranker

    def get_doc_index(self, doc_id):
        """Convert doc_id --> doc_index"""
        field_index = self.elastic_field_doc_name
        if isinstance(field_index, list):
            field_index = '.'.join(field_index)
        result = self.es.search(
            index=self.elastic_index,
            body={'query': {
                'match': {
                    field_index: doc_id
                }
            }})
        return result['hits']['hits'][0]['_id']

    def get_doc_id(self, doc_index):
        """Convert doc_index --> doc_id"""
        result = self.es.search(index=self.elastic_index,
                                body={'query': {
                                    'match': {
                                        "_id": doc_index
                                    }
                                }})
        source = result['hits']['hits'][0]['_source']
        return utils.get_field(source, self.elastic_field_doc_name)

    def closest_docs(self, query, k=1):
        """Closest docs by using ElasticSearch
        """
        results = self.es.search(index=self.elastic_index,
                                 body={
                                     'size': k,
                                     'query': {
                                         'multi_match': {
                                             'query': query,
                                             'type': 'most_fields',
                                             'fields': self.elastic_fields
                                         }
                                     }
                                 })
        hits = results['hits']['hits']
        doc_ids = [
            utils.get_field(row['_source'], self.elastic_field_doc_name)
            for row in hits
        ]
        doc_scores = [row['_score'] for row in hits]
        return doc_ids, doc_scores

    def batch_closest_docs(self, queries, k=1, num_workers=None):
        """Process a batch of closest_docs requests multithreaded.
        Note: we can use plain threads here as scipy is outside of the GIL.
        """
        with ThreadPool(num_workers) as threads:
            closest_docs = partial(self.closest_docs, k=k)
            results = threads.map(closest_docs, queries)
        return results

    # Elastic DB

    def __enter__(self):
        return self

    def close(self):
        """Close the connection to the database."""
        self.es = None

    def get_doc_ids(self):
        """Fetch all ids of docs stored in the db."""
        results = self.es.search(index=self.elastic_index,
                                 body={"query": {
                                     "match_all": {}
                                 }})
        doc_ids = [
            utils.get_field(result['_source'], self.elastic_field_doc_name)
            for result in results['hits']['hits']
        ]
        return doc_ids

    def get_doc_text(self, doc_id):
        """Fetch the raw text of the doc for 'doc_id'."""
        idx = self.get_doc_index(doc_id)
        result = self.es.get(index=self.elastic_index, doc_type='_doc', id=idx)
        return result if result is None else result['_source'][
            self.elastic_field_content]
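
A minimal usage sketch (the URL, index name, and field names are assumptions; DEFAULTS and utils come from the surrounding module):

# Hypothetical usage of ElasticDocRanker.
ranker = ElasticDocRanker(elastic_url='localhost:9200',
                          elastic_index='wiki',
                          elastic_fields=['title', 'text'],
                          elastic_field_doc_name='title',
                          elastic_field_content='text')
doc_ids, doc_scores = ranker.closest_docs('who invented the telephone', k=5)
for doc_id, score in zip(doc_ids, doc_scores):
    print(score, doc_id, ranker.get_doc_text(doc_id)[:80])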
Example #47
0
import json

ip = sys.argv[1]
port = int(sys.argv[2])  # 9200
try:
    es = Elasticsearch("{}:{}".format(ip, port),
                       timeout=5)  # 连接Elasticsearch,延时5秒
    es.indices.create(index='unauth_text')
    print('[+] 成功连接 :{}'.format(ip))
    print('[+] {} -> 成功创建测试节点unauth_text'.format(ip))
    es.index(index="unauth_text",
             doc_type="test-type",
             id=2,
             body={"text": "text"})
    print('[+] {} -> inserted data into test index unauth_text'.format(ip))
    ret = es.get(index="unauth_text", doc_type="test-type", id=2)
    print('[+] {} -> fetched data from test index unauth_text: {}'.format(ip, ret))
    es.indices.delete(index='unauth_text')
    print('[+] {} -> removed test index unauth_text'.format(ip))
    print('[ok] {} -> Elasticsearch unauthenticated-access vulnerability present'.format(ip))

    print('Trying to fetch node information:')
    text = json.loads(
        requests.get(url='http://{}:{}/_nodes'.format(ip, port),
                     timeout=5).text)
    nodes_total = text['_nodes']['total']
    nodes = list(text['nodes'].keys())
    print('[ok] {} -> [{}] : {}'.format(ip, nodes_total, nodes))

except Exception as e:
    error = e.args
Example #48
0
class ElasticsearchDataStore(object):
    """Implements the datastore."""

    # Number of events to queue up when bulk inserting events.
    DEFAULT_FLUSH_INTERVAL = 1000
    DEFAULT_SIZE = 100
    DEFAULT_LIMIT = DEFAULT_SIZE  # Max events to return
    DEFAULT_FROM = 0
    DEFAULT_STREAM_LIMIT = 5000  # Max events to return when streaming results

    def __init__(self, host='127.0.0.1', port=9200):
        """Create a Elasticsearch client."""
        super(ElasticsearchDataStore, self).__init__()
        self._error_container = {}
        self.client = Elasticsearch([{'host': host, 'port': port}])
        self.import_counter = Counter()
        self.import_events = []

    @staticmethod
    def _build_labels_query(sketch_id, labels):
        """Build Elasticsearch query for Timesketch labels.

        Args:
            sketch_id: Integer of sketch primary key.
            labels: List of label names.

        Returns:
            Elasticsearch query as a dictionary.
        """
        label_query = {'bool': {'must': []}}

        for label in labels:
            nested_query = {
                'nested': {
                    'query': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'timesketch_label.name.keyword': label
                                }
                            }, {
                                'term': {
                                    'timesketch_label.sketch_id': sketch_id
                                }
                            }]
                        }
                    },
                    'path': 'timesketch_label'
                }
            }
            label_query['bool']['must'].append(nested_query)
        return label_query

    @staticmethod
    def _build_events_query(events):
        """Build Elasticsearch query for one or more document ids.

        Args:
            events: List of Elasticsearch document IDs.

        Returns:
            Elasticsearch query as a dictionary.
        """
        events_list = [event['event_id'] for event in events]
        query_dict = {'query': {'ids': {'values': events_list}}}
        return query_dict

    def build_query(self,
                    sketch_id,
                    query_string,
                    query_filter,
                    query_dsl=None,
                    aggregations=None):
        """Build Elasticsearch DSL query.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            aggregations: Dict of Elasticsearch aggregations

        Returns:
            Elasticsearch DSL query as a dictionary
        """

        if query_dsl:
            query_dsl = json.loads(query_dsl)
            # Remove any aggregation coming from user supplied Query DSL.
            # We have no way to display this data in a good way today.
            if query_dsl.get('aggregations', None):
                del query_dsl['aggregations']
            return query_dsl

        if query_filter.get('events', None):
            events = query_filter['events']
            return self._build_events_query(events)

        query_dsl = {
            'query': {
                'bool': {
                    'must': [],
                    'must_not': [],
                    'filter': []
                }
            }
        }

        # TODO: Remove when old UI has been deprecated.
        if query_filter.get('star', None):
            label_query = self._build_labels_query(sketch_id, ['__ts_star'])
            query_string = '*'
            query_dsl['query']['bool']['must'].append(label_query)

        # TODO: Remove when old UI has been deprecated.
        if query_filter.get('time_start', None):
            query_dsl['query']['bool']['filter'] = [{
                'bool': {
                    'should': [{
                        'range': {
                            'datetime': {
                                'gte': query_filter['time_start'],
                                'lte': query_filter['time_end']
                            }
                        }
                    }]
                }
            }]

        if query_string:
            query_dsl['query']['bool']['must'].append(
                {'query_string': {
                    'query': query_string
                }})

        # New UI filters
        if query_filter.get('chips', None):
            labels = []
            must_filters = query_dsl['query']['bool']['must']
            must_not_filters = query_dsl['query']['bool']['must_not']
            datetime_ranges = {
                'bool': {
                    'should': [],
                    'minimum_should_match': 1
                }
            }

            for chip in query_filter['chips']:
                # Exclude chips that the user disabled
                if not chip.get('active', True):
                    continue

                if chip['type'] == 'label':
                    labels.append(chip['value'])

                elif chip['type'] == 'term':
                    term_filter = {
                        'match_phrase': {
                            '{}'.format(chip['field']): {
                                'query': "{}".format(chip['value'])
                            }
                        }
                    }

                    if chip['operator'] == 'must':
                        must_filters.append(term_filter)

                    elif chip['operator'] == 'must_not':
                        must_not_filters.append(term_filter)

                elif chip['type'] == 'datetime_range':
                    start = chip['value'].split(',')[0]
                    end = chip['value'].split(',')[1]
                    range_filter = {
                        'range': {
                            'datetime': {
                                'gte': start,
                                'lte': end
                            }
                        }
                    }
                    datetime_ranges['bool']['should'].append(range_filter)

            label_filter = self._build_labels_query(sketch_id, labels)
            must_filters.append(label_filter)
            must_filters.append(datetime_ranges)

        # Pagination
        if query_filter.get('from', None):
            query_dsl['from'] = query_filter['from']

        # Number of events to return
        if query_filter.get('size', None):
            query_dsl['size'] = query_filter['size']

        # Make sure we are sorting.
        if not query_dsl.get('sort', None):
            query_dsl['sort'] = {'datetime': query_filter.get('order', 'asc')}

        # Add any pre defined aggregations
        if aggregations:
            # post_filter happens after aggregation so we need to move the
            # filter to the query instead.
            if query_dsl.get('post_filter', None):
                query_dsl['query']['bool']['filter'] = query_dsl['post_filter']
                query_dsl.pop('post_filter', None)
            query_dsl['aggregations'] = aggregations

        return query_dsl

    def search(self,
               sketch_id,
               query_string,
               query_filter,
               query_dsl,
               indices,
               count=False,
               aggregations=None,
               return_fields=None,
               enable_scroll=False):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            count: Boolean indicating if we should only return result count
            aggregations: Dict of Elasticsearch aggregations
            return_fields: List of fields to return
            enable_scroll: If Elasticsearch scroll API should be used

        Returns:
            Set of event documents in JSON format
        """

        scroll_timeout = None
        if enable_scroll:
            scroll_timeout = '1m'  # Default to 1 minute scroll timeout

        # Exit early if we have no indices to query
        if not indices:
            return {'hits': {'hits': [], 'total': 0}, 'took': 0}

        # Check if we have specific events to fetch and get indices.
        if query_filter.get('events', None):
            indices = {
                event['index']
                for event in query_filter['events']
                if event['index'] in indices
            }

        query_dsl = self.build_query(sketch_id, query_string, query_filter,
                                     query_dsl, aggregations)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = 'query_then_fetch'

        # Only return how many documents match the query.
        if count:
            del query_dsl['sort']
            count_result = self.client.count(body=query_dsl,
                                             index=list(indices))
            return count_result.get('count', 0)

        if not return_fields:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.search(body=query_dsl,
                                      index=list(indices),
                                      search_type=search_type,
                                      scroll=scroll_timeout)

        # The argument " _source_include" changed to "_source_includes" in
        # ES version 7. This check add support for both version 6 and 7 clients.
        # pylint: disable=unexpected-keyword-arg
        if self.version.startswith('6'):
            _search_result = self.client.search(body=query_dsl,
                                                index=list(indices),
                                                search_type=search_type,
                                                _source_include=return_fields,
                                                scroll=scroll_timeout)
        else:
            _search_result = self.client.search(body=query_dsl,
                                                index=list(indices),
                                                search_type=search_type,
                                                _source_includes=return_fields,
                                                scroll=scroll_timeout)

        return _search_result
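
    # Hedged usage sketch (not part of the original source): assuming
    # `datastore` is an instance of this class and '<index-name>' exists,
    # a plain search and a count-only search might look like this:
    #
    #   response = datastore.search(
    #       sketch_id=1, query_string='foo', query_filter={}, query_dsl=None,
    #       indices=['<index-name>'], return_fields=['datetime', 'message'])
    #   for hit in response['hits']['hits']:
    #       print(hit['_source'])
    #
    #   total = datastore.search(
    #       sketch_id=1, query_string='foo', query_filter={}, query_dsl=None,
    #       indices=['<index-name>'], count=True)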

    def search_stream(self,
                      sketch_id=None,
                      query_string=None,
                      query_filter=None,
                      query_dsl=None,
                      indices=None,
                      return_fields=None,
                      enable_scroll=True):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args :
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            return_fields: List of fields to return
            enable_scroll: Boolean determing whether scrolling is enabled.

        Returns:
            Generator of event documents in JSON format
        """

        if not query_filter.get('size'):
            query_filter['size'] = self.DEFAULT_STREAM_LIMIT

        if not query_filter.get('terminate_after'):
            query_filter['terminate_after'] = self.DEFAULT_STREAM_LIMIT

        result = self.search(sketch_id=sketch_id,
                             query_string=query_string,
                             query_dsl=query_dsl,
                             query_filter=query_filter,
                             indices=indices,
                             return_fields=return_fields,
                             enable_scroll=enable_scroll)

        if enable_scroll:
            scroll_id = result['_scroll_id']
            scroll_size = result['hits']['total']
        else:
            scroll_id = None
            scroll_size = 0

        # Elasticsearch version 7.x returns total hits as a dictionary.
        # TODO: Refactor when version 6.x has been deprecated.
        if isinstance(scroll_size, dict):
            scroll_size = scroll_size.get('value', 0)

        for event in result['hits']['hits']:
            yield event

        while scroll_size > 0:
            # pylint: disable=unexpected-keyword-arg
            result = self.client.scroll(scroll_id=scroll_id, scroll='5m')
            scroll_id = result['_scroll_id']
            scroll_size = len(result['hits']['hits'])
            for event in result['hits']['hits']:
                yield event
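
    # Hedged usage sketch (assuming `datastore` is an instance of this class):
    # search_stream() is a generator, so results can be consumed lazily. The
    # index name and fields below are placeholders.
    #
    #   for event in datastore.search_stream(
    #           sketch_id=1, query_string='*', query_filter={},
    #           indices=['<index-name>'], return_fields=['datetime']):
    #       handle(event['_source'])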

    def get_filter_labels(self, sketch_id, indices):
        """Aggregate labels for a sketch.

        Args:
            sketch_id: The Sketch ID
            indices: List of indices to aggregate on

        Returns:
            List with label names.
        """
        # This is a workaround to return all labels by setting the max buckets
        # to something big. If a sketch has more than this amount of labels
        # the list will be incomplete but it should be uncommon to have >10k
        # labels in a sketch.
        max_labels = 10000

        # pylint: disable=line-too-long
        aggregation = {
            'aggs': {
                'nested': {
                    'nested': {
                        'path': 'timesketch_label'
                    },
                    'aggs': {
                        'inner': {
                            'filter': {
                                'bool': {
                                    'must': [{
                                        'term': {
                                            'timesketch_label.sketch_id':
                                            sketch_id
                                        }
                                    }]
                                }
                            },
                            'aggs': {
                                'labels': {
                                    'terms': {
                                        'size': max_labels,
                                        'field':
                                        'timesketch_label.name.keyword'
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        labels = []
        # pylint: disable=unexpected-keyword-arg
        result = self.client.search(index=indices, body=aggregation, size=0)
        buckets = result.get('aggregations',
                             {}).get('nested',
                                     {}).get('inner',
                                             {}).get('labels',
                                                     {}).get('buckets', [])
        for bucket in buckets:
            # Filter out special labels like __ts_star etc.
            if bucket['key'].startswith('__'):
                continue
            labels.append(bucket['key'])
        return labels

    def get_event(self, searchindex_id, event_id):
        """Get one event from the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id

        Returns:
            Event document in JSON format
        """
        try:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            if self.version.startswith('6'):
                event = self.client.get(index=searchindex_id,
                                        id=event_id,
                                        doc_type='_all',
                                        _source_exclude=['timesketch_label'])
            else:
                event = self.client.get(index=searchindex_id,
                                        id=event_id,
                                        doc_type='_all',
                                        _source_excludes=['timesketch_label'])

            return event

        except NotFoundError:
            abort(HTTP_STATUS_CODE_NOT_FOUND)

    def count(self, indices):
        """Count number of documents.

        Args:
            indices: List of indices.

        Returns:
            Number of documents.
        """
        if not indices:
            return 0
        try:
            result = self.client.count(index=indices)
        except (NotFoundError, RequestError):
            es_logger.error('Unable to count indexes (index not found)',
                            exc_info=True)
            return 0
        return result.get('count', 0)

    def set_label(self,
                  searchindex_id,
                  event_id,
                  event_type,
                  sketch_id,
                  user_id,
                  label,
                  toggle=False,
                  remove=False,
                  single_update=True):
        """Set label on event in the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id
            event_type: String of ElasticSearch document type
            sketch_id: Integer of sketch primary key
            user_id: Integer of user primary key
            label: String with the name of the label
            remove: Optional boolean value if the label should be removed
            toggle: Optional boolean value if the label should be toggled
            single_update: Boolean if the label should be indexed immediately.

        Returns:
            Dict with the update script if single_update is False, otherwise None.
        """
        # Elasticsearch painless script.
        update_body = {
            'script': {
                'lang': 'painless',
                'source': UPDATE_LABEL_SCRIPT,
                'params': {
                    'timesketch_label': {
                        'name': str(label),
                        'user_id': user_id,
                        'sketch_id': sketch_id
                    },
                    'remove': remove
                }
            }
        }

        if toggle:
            update_body['script']['source'] = TOGGLE_LABEL_SCRIPT

        if not single_update:
            script = update_body['script']
            return dict(source=script['source'],
                        lang=script['lang'],
                        params=script['params'])

        doc = self.client.get(index=searchindex_id,
                              id=event_id,
                              doc_type='_all')
        try:
            doc['_source']['timesketch_label']
        except KeyError:
            doc = {'doc': {'timesketch_label': []}}
            self.client.update(index=searchindex_id,
                               doc_type=event_type,
                               id=event_id,
                               body=doc)

        self.client.update(index=searchindex_id,
                           id=event_id,
                           doc_type=event_type,
                           body=update_body)

        return None
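
    # Hedged usage sketch (assuming `datastore` is an instance of this class):
    # tag a single event with a label. The ids below are placeholders; with
    # single_update=False the method only returns the update script so callers
    # can batch the updates themselves.
    #
    #   datastore.set_label(
    #       searchindex_id='<index-name>', event_id='<event-id>',
    #       event_type='generic_event', sketch_id=1, user_id=1,
    #       label='__ts_star', single_update=True)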

    def create_index(self, index_name=uuid4().hex, doc_type='generic_event'):
        """Create index with Timesketch settings.

        Args:
            index_name: Name of the index. Default is a generated UUID.
            doc_type: Name of the document type. Default is generic_event.

        Returns:
            Index name in string format.
            Document type in string format.
        """
        _document_mapping = {
            'properties': {
                'timesketch_label': {
                    'type': 'nested'
                },
                'datetime': {
                    'type': 'date'
                }
            }
        }

        # TODO: Remove when we deprecate Elasticsearch version 6.x
        if self.version.startswith('6'):
            _document_mapping = {doc_type: _document_mapping}

        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(
                    index=index_name, body={'mappings': _document_mapping})
            except ConnectionError:
                raise RuntimeError('Unable to connect to Timesketch backend.')
            except RequestError:
                index_exists = self.client.indices.exists(index_name)
                es_logger.warning(
                    'Attempting to create an index that already exists '
                    '({0:s} - {1:s})'.format(index_name, str(index_exists)))

        # We want to return unicode here to keep SQLalchemy happy.
        if six.PY2:
            if not isinstance(index_name, six.text_type):
                index_name = codecs.decode(index_name, 'utf-8')

            if not isinstance(doc_type, six.text_type):
                doc_type = codecs.decode(doc_type, 'utf-8')

        return index_name, doc_type
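
    # Hedged usage sketch (assuming `datastore` is an instance of this class):
    # create an index with the Timesketch mappings. Note that the default
    # index_name above is evaluated once when the class is defined, so pass an
    # explicit name when creating more than one index.
    #
    #   index_name, doc_type = datastore.create_index(index_name='my-timeline')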

    def delete_index(self, index_name):
        """Delete Elasticsearch index.

        Args:
            index_name: Name of the index to delete.
        """
        if self.client.indices.exists(index_name):
            try:
                self.client.indices.delete(index=index_name)
            except ConnectionError as e:
                raise RuntimeError(
                    'Unable to connect to Timesketch backend: {}'.format(e))

    def import_event(self,
                     index_name,
                     event_type,
                     event=None,
                     event_id=None,
                     flush_interval=DEFAULT_FLUSH_INTERVAL):
        """Add event to Elasticsearch.

        Args:
            index_name: Name of the index in Elasticsearch
            event_type: Type of event (e.g. plaso_event)
            event: Event dictionary
            event_id: Event Elasticsearch ID
            flush_interval: Number of events to queue up before indexing
        """
        if event:
            for k, v in event.items():
                if not isinstance(k, six.text_type):
                    k = codecs.decode(k, 'utf8')

                # Make sure we have decoded strings in the event dict.
                if isinstance(v, six.binary_type):
                    v = codecs.decode(v, 'utf8')

                event[k] = v

            # Header needed by Elasticsearch when bulk inserting.
            header = {
                'index': {
                    '_index': index_name,
                }
            }
            update_header = {'update': {'_index': index_name, '_id': event_id}}

            # TODO: Remove when we deprecate Elasticsearch version 6.x
            if self.version.startswith('6'):
                header['index']['_type'] = event_type
                update_header['update']['_type'] = event_type

            if event_id:
                # Event has "lang" defined if there is a script used for import.
                if event.get('lang'):
                    event = {'script': event}
                else:
                    event = {'doc': event}
                header = update_header

            self.import_events.append(header)
            self.import_events.append(event)
            self.import_counter['events'] += 1

            if self.import_counter['events'] % int(flush_interval) == 0:
                _ = self.flush_queued_events()
                self.import_events = []
        else:
            # Import the remaining events in the queue.
            if self.import_events:
                _ = self.flush_queued_events()

        return self.import_counter['events']
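
    # Hedged usage sketch (assuming `datastore` is an instance of this class):
    # events are queued and flushed every flush_interval events; calling
    # import_event() without an event flushes whatever remains in the queue.
    #
    #   for event in events:
    #       datastore.import_event(index_name, 'generic_event', event=event)
    #   datastore.import_event(index_name, 'generic_event')  # final flush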

    def flush_queued_events(self):
        """Flush all queued events.

        Returns:
            dict: A dict object that contains the number of events
                that were sent to Elastic as well as information
                on whether there were any errors, and the details
                of those errors, if any.
        """
        if not self.import_events:
            return {}

        return_dict = {
            'number_of_events': len(self.import_events) // 2,
            'total_events': self.import_counter['events'],
        }

        try:
            results = self.client.bulk(body=self.import_events)
        except (ConnectionTimeout, socket.timeout):
            # TODO: Add a retry here.
            es_logger.error('Unable to add events', exc_info=True)
            return return_dict

        errors_in_upload = results.get('errors', False)
        return_dict['errors_in_upload'] = errors_in_upload

        if errors_in_upload:
            items = results.get('items', [])
            return_dict['errors'] = []

            es_logger.error('Errors while attempting to upload events.')
            for item in items:
                index = item.get('index', {})
                index_name = index.get('_index', 'N/A')

                _ = self._error_container.setdefault(index_name, {
                    'errors': [],
                    'types': Counter(),
                    'details': Counter()
                })

                error_counter = self._error_container[index_name]['types']
                error_detail_counter = self._error_container[index_name][
                    'details']
                error_list = self._error_container[index_name]['errors']

                error = index.get('error', {})
                status_code = index.get('status', 0)
                doc_id = index.get('_id', '')
                caused_by = error.get('caused_by', {})

                caused_reason = caused_by.get('reason',
                                              'Unknown Detailed Reason')

                error_counter[error.get('type')] += 1
                detail_msg = '{0:s}/{1:s}'.format(
                    caused_by.get('type', 'Unknown Detailed Type'),
                    ' '.join(caused_reason.split()[:5]))
                error_detail_counter[detail_msg] += 1

                error_msg = '<{0:s}> {1:s} [{2:s}/{3:s}]'.format(
                    error.get('type', 'Unknown Type'),
                    error.get('reason', 'No reason given'),
                    caused_by.get('type', 'Unknown Type'),
                    caused_reason,
                )
                error_list.append(error_msg)
                es_logger.error(
                    'Unable to upload document: {0:s} to index {1:s} - '
                    '[{2:d}] {3:s}'.format(doc_id, index_name, status_code,
                                           error_msg))

        return_dict['error_container'] = self._error_container

        self.import_events = []
        return return_dict

    @property
    def version(self):
        """Get Elasticsearch version.

        Returns:
          Version number as a string.
        """
        version_info = self.client.info().get('version')
        return version_info.get('number')
示例#49
0
class Report:
    def __init__(self, id_list, username, password, index_name):
        self.ES_HOST = '192.168.169.37'
        self.ES_PORT = 9206
        # self.INDEX_NAME = 'weibo_report_management'
        self.INDEX_NAME = index_name
        self.TYPE = 'report'
        self.es = Elasticsearch([{'host': self.ES_HOST, 'port': self.ES_PORT}])
        self.results = []
        self.id_list = id_list
        self.username = username
        self.password = password
        self.currentTime = int(time.time())

    def userList(self):
        self.results = []
        for id in self.id_list:
            result = self.es.get(index=self.INDEX_NAME,
                                 doc_type=self.TYPE,
                                 id=id)['_source']
            event_name = result['event_name']
            report_time = result['report_time']
            report_type = result['report_type']
            xnr_user_no = result['xnr_user_no']

            weibo_list = json.loads(result['report_content'])['fb_list']
            for each in weibo_list:
                text = each['text']
                timestamp = each['timestamp']
                try:
                    user = each['nick_name']
                except KeyError:
                    user = each['uid']
                uid = each['uid']
                fid = each['fid']

                row = {'event_name': event_name, 'report_time': report_time,
                       'report_type': report_type, 'xnr_user_no': xnr_user_no,
                       'text': text, 'timestamp': timestamp, 'user': user,
                       'uid': uid, 'fid': fid}
                self.results.append(row)
        return self.results

    # def screen_shot(self, results):
    # 	for result in results:
    # 		screen = Screen(self.username, self.password)
    # 		screen.screenShot(result['uid'], result['mid'])

    def save_excel(self):
        results = self.userList()
        filename = 'xnr/static/doc/' + str(self.currentTime) + '.xlsx'
        if results:
            letters = "ABCDEFGHIJKLMN"
            #self.screen_shot(results)
            # Rename the field names to Chinese column headers and append a
            # trailing screenshot column.
            workbook = xlsxwriter.Workbook(filename)
            table = workbook.add_worksheet()
            field = [each for each in results[0].keys()]
            field = '^&*'.join(field).replace('event_name', u'上报名称').replace(
                'report_time',
                u'上报时间').replace('report_type', u'上报类型').replace(
                    'xnr_user_no', u'虚拟人').replace('text', u'文本内容').replace(
                        'user', u'发博用户').replace('timestamp',
                                                 u'发博时间') + u"^&*截图"
            field = field.split('^&*')
            for a, b in enumerate(field):
                table.write(letters[a] + '1', b)
            lists = []
            for result in results:
                lists.append([each for each in result.values()])
            for i, k in enumerate(lists):
                table.insert_image(letters[len(k)] + str(i + 2),
                                   results[0]['fid'] + '.png', {
                                       'x_scale': 0.05,
                                       'y_scale': 0.05
                                   })
                for c, d in enumerate(k):
                    cell = letters[c] + str(i + 2)
                    table.write(cell, d)
            workbook.close()
        return filename

    def save_word(self):
        results = self.userList()
        filename = 'xnr/static/doc/' + str(self.currentTime) + '.docx'
        if results:
            #self.screen_shot(results)
            document = Document()
            for result in results:
                result_str = json.dumps(result, ensure_ascii=False).replace(
                    '{', '').replace(
                        '}', '').replace('event_name', u'上报名称').replace(
                            'report_time',
                            u'上报时间').replace('report_type', u'上报类型').replace(
                                'xnr_user_no',
                                u'虚拟人').replace('text', u'文本内容').replace(
                                    'user',
                                    u'发博用户').replace('timestamp', u'发博时间')
                document.add_paragraph(result_str)
                #document.add_picture(result['fid']+'.png', width=Inches(1.25))
            document.add_page_break()
            document.save(filename)
        return filename
示例#50
0
class ElasticsearchWrapper(metaclass=DatabaseMeta):
    LOG_LEVEL = 1

    def __init__(self,
                 host=None,
                 port=None,
                 user=None,
                 password=None,
                 *args,
                 **kwargs):
        assert host is not None, 'host can not be None.'
        assert port is not None, 'port can not be None.'

        self.__connector = Elasticsearch(host,
                                         http_auth=(user, password),
                                         scheme='http',
                                         port=port,
                                         http_compress=True,
                                         verify_certs=False)

    @elastic_verify
    def insert(self, index=None, body=None, *args, **kwargs):
        assert body is not None, '[insert] body can not be None.'
        return self.__connector.index(index=index, body=body)

    @elastic_verify
    def delete(self, index=None, _id=None, *args, **kwargs):
        assert _id is not None, '[delete] _id can not be None.'
        return self.__connector.delete(index=index, id=_id)

    @elastic_verify
    def update(self, index=None, _id=None, body=None, *args, **kwargs):
        assert _id is not None, '[update] _id can not be None.'
        assert body is not None, '[update] body can not be None.'
        return self.__connector.update(index=index, id=_id, body=body)

    @elastic_verify
    def get(self, index=None, _id=None, *args, **kwargs):
        assert _id is not None, '[get] _id can not be None.'
        return self.__connector.get(index=index, id=_id)

    @elastic_verify
    def create_index(self, index=None, *args, **kwargs):
        # Use the indices API; Elasticsearch.create() indexes a document, not an index.
        return self.__connector.indices.create(index=index)

    @elastic_verify
    def delete_index(self, index=None, *args, **kwargs):
        return self.__connector.indices.delete(index=index)

    @elastic_verify
    def is_index_exists(self, index=None, *args, **kwargs):
        return self.__connector.indices.exists(index=index)

    @elastic_verify
    def search_topic(self, index=None, query=None, *args, **kwargs):
        if query is None:
            logger.info('[search] query is None.')
            return []

        results = self.__connector.search(index=index,
                                          body=query)['hits']['hits']
        return [result['_source'] for result in results]
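
# A hedged usage sketch for ElasticsearchWrapper above; host, port and
# credentials are placeholders, and DatabaseMeta/elastic_verify/logger are
# assumed to be defined elsewhere in the same project.
db = ElasticsearchWrapper(host='localhost', port=9200,
                          user='elastic', password='changeme')
if not db.is_index_exists(index='demo-index'):
    db.create_index(index='demo-index')
db.insert(index='demo-index', body={'title': 'hello'})
hits = db.search_topic(index='demo-index',
                       query={'query': {'match_all': {}}})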
示例#51
0
def main(args):
    #es2 = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    es2 = Elasticsearch(
        "https://96aa4157ead74b5ca4926523b1d1994e.us-east-1.aws.found.io:9243",
        http_auth=('elastic', 'MrkfJ5hxIcCOzTMfOa1Nftzy'))

    #elasticsearch.helpers.reindex(es1, "church_data", args.out_index, query=None, target_client=None,
    #           chunk_size=500, scroll='5m', scan_kwargs={}, bulk_kwargs={})

    checkpoint_path = fjoin(args.ckp, "checkpoint.%d." % args.ckp_no)
    if isfile(checkpoint_path + "frontier_map.pt"):
        frontier_map = pickle.load(
            open(checkpoint_path + "frontier_map.pt", "rb"))
    else:
        raise Exception("checkpoint not found")

    filesadded = 0
    filesupdated = 0
    # Load all the pickles of the crawled data
    for file in os.listdir(args.cdp):
        path = fjoin(args.cdp, file)
        res = pickle.load(open(path, "rb"))
        url = res['docno']

        inlinkData = list(frontier_map[url].inlinks)
        j_inlinks = json.dumps(inlinkData)
        logging.info("Checking for url {}".format(url))

        #Finding if the url is in the merged index
        result = es2.get(index=args.out_index, id=url, ignore=404)

        if result['found'] is True:
            logging.info("inlinks from local  {}".format(
                len(set((frontier_map[url].inlinks)))))
            logging.info("inlinks retrieved {}".format(
                len(set(result['_source']['inlinks']))))
            existing_inlinks = json.loads(j_inlinks)
            retrieved_inlinks = json.loads(result['_source']['inlinks'])

            #merging the inlinks from both local and merged set and updating the inlinks
            final_inlinkset = merge_inlinks(
                [retrieved_inlinks, existing_inlinks])
            logging.info("length of final list {}".format(
                len(final_inlinkset)))
            es2.update(index=args.out_index,
                       id=url,
                       doc_type=args.doc_type,
                       body={"doc": {
                           "inlinks": json.dumps(final_inlinkset)
                       }})
            filesupdated += 1
            logging.info("doc updated for url {}".format(url))

        else:
            # indexing the data for the url which doesn't match any url in merged data index
            logging.info("value of res in else {}: ".format(len(result)))
            title = res['head']
            content = res['text']
            inlinks = j_inlinks
            outlinkData = list(frontier_map[url].outlinks)
            outlinks = json.dumps(outlinkData)
            doc = {
                'head': title,
                'text': content,
                'inlinks': inlinks,
                'outlinks': outlinks
            }
            es2.index(index=args.out_index,
                      id=url,
                      body=doc,
                      doc_type=args.doc_type)
            filesadded += 1
            logging.info("doc added for url {}: ".format(url))

    logging.info("doc added {} and updated {}: ".format(
        filesadded, filesupdated))
示例#52
0
class ES(Singleton):
    def __init__(self, address = ADDRESS):
        super(ES, self).__init__()
        if not hasattr(self,'_es'):
            try:
                self._es = Elasticsearch(address.split(','))
            except Exception as e:
                raise
            else:
                log.debug('Connected to Elasticsearch')

    def add(self, table, data, data_id = None, doc_type = ''):
        '''
        @summary: Index a document.
        ---------
        @param table: index name
        @param data: document body (JSON-serializable dict)
        @param doc_type: document type; defaults to the index name when empty.
            doc_type distinguishes different meanings of the same structure,
            e.g. for a url index the doc_type could be the site name.
        @param data_id: if not given, Elasticsearch generates an id; if the id
            already exists the document is updated.
        ---------
        @result: True on success, False on failure
        '''
        try:
            table = table.lower()
            self._es.index(index = table, doc_type = doc_type or table ,id = data_id, body = data)
        except Exception as e:
            log.error(e)
            return False
        else:
            return True

    def get(self, table, data_id, doc_type = '_all'):
        '''
        @summary: Fetch a document by id.
        ---------
        @param table: index name
        @param data_id: document id, e.g. the document with ID=1
        @param doc_type: document type; _all matches any type
        ---------
        @result: dict (empty if not found)
        '''
        datas = {}

        try:
            table = table.lower()
            datas = self._es.get(index = table, doc_type = doc_type, id = data_id)

        except Exception as e:
            # log.error(e)
            pass

        return datas


    def search(self, table, body = {}):
        '''
        @summary: Run a search query.
        ---------
        @param table: index name
        @param body: query DSL
        ---------
        @result: dict (empty on failure)
        '''

        datas = {}

        try:
            table = table.lower()
            datas = self._es.search(index = table, body = body)

        except Exception as e:
            log.error(e)

        return datas

    def update_by_id(self, table, data_id, data, doc_type = ''):
        '''
        @summary: Partially update a document by id.
        ---------
        @param table: index name
        @param data_id: document id
        @param data: fields and values to update, e.g. {"TITLE": "xxx"}
        @param doc_type: document type; defaults to the index name when empty
        ---------
        @result:
        '''


        self._es.update(index = table, doc_type = doc_type or table, body = {"doc": data}, id = data_id)

    def delete_by_id(self, table, data_id, doc_type = ''):
        """
        根据给定的id,删除文档
        :return:
        """
        self._es.delete(index = table, doc_type = doc_type or table, id = data_id)

    def set_mapping(self, table, mapping, doc_type = ''):
        '''
        @summary:
        ---------
        @param table:
        @param mapping:
        mapping = {
            doc_type: {
                "properties": {
                    "document_id": {
                        "type": "integer"
                    },
                    "title": {
                        "type": "string"
                    },
                    "content": {
                        "type": "string"
                    }
                }
            }
        }
        @param doc_type:
        ---------
        @result:
        '''

        if not self._es.indices.exists(index = table):
            # Create the index and apply the mapping
            self._es.indices.create(index = table, body = mapping, ignore=400)
            self._es.indices.put_mapping(index = table, doc_type = doc_type or table, body = mapping)
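
# A hedged usage sketch for the ES singleton above; the index name and fields
# are placeholders, and ADDRESS/log are assumed to be defined in this module.
es_client = ES()
if es_client.add('news', {'title': 'hello', 'content': 'world'}, data_id=1):
    doc = es_client.get('news', data_id=1)
    hits = es_client.search('news', body={'query': {'match': {'title': 'hello'}}})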
示例#53
0
class ElasticsearchClient:
    """
    Elasticsearch client for politylink endpoint
    """
    def __init__(self, url='http://localhost:9200'):
        def to_node(url):
            res = urlparse(url)
            return {'host': res.hostname, 'port': res.port}

        self.client = Elasticsearch(hosts=[to_node(url)])

    def index(self, obj):
        """
        create or update a document
        """

        assert isinstance(obj, AbstractText)
        try:
            return self.client.index(index=obj.index,
                                     id=obj.id,
                                     body=obj.__dict__)
        except Exception as e:
            raise ElasticsearchException(f'failed to index {obj}') from e

    def get(self, id_):
        """
        get a document by politylink id (ref idgen)
        """

        try:
            if id_.startswith('News'):
                cls = NewsText
            elif id_.startswith('Bill'):
                cls = BillText
            else:
                raise ValueError(f'unknown id prefix: {id_}')
            res = self.client.get(index=cls.index, id=id_)
            return cls(res['_source'])
        except Exception as e:
            raise ElasticsearchException(f'failed to get {id_}') from e

    def search(self, cls, query=None):
        """
        search $cls documents by query
        return all documents when query is empty
        """

        if query:
            query_doc = {
                'query': {
                    'multi_match': {
                        'query': query,
                        'fields': cls.get_all_fields()
                    }
                }
            }
        else:
            query_doc = {'query': {'match_all': {}}}
        try:
            res = self.client.search(index=cls.index, body=query_doc)
            return list(
                map(lambda hit: cls(hit['_source']), res['hits']['hits']))
        except Exception as e:
            raise ElasticsearchException(
                f'failed to search {cls.__name__} for {query_doc}') from e
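
# A hedged usage sketch for ElasticsearchClient above; the id and query are
# placeholders, and BillText is assumed to be importable from the same package.
client = ElasticsearchClient('http://localhost:9200')
bill = client.get('Bill:example-id')
bills = client.search(BillText, query='budget')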
示例#54
0
        es.index(index='project_data',
                 doc_type='projectData',
                 id=1,
                 body=json.load(open_file))

    with open('Data/project_features.json') as open_file:
        es.index(index='project_features_index',
                 doc_type='project_features_doc',
                 id=1,
                 body=json.load(open_file))

    # with open('project.json') as open_file:
    # es.index(index='project_features',doc_type='project_feature',id= 1,body=json.load(open_file))

    project_data_json = es.get(index='project_data',
                               doc_type='projectData',
                               id=1)
    project_data_json = project_data_json['_source']

    project_features_json = es.get(index='project_features_index',
                                   doc_type='project_features_doc',
                                   id=1)
    project_features_json = project_features_json['_source']

    ##    project_data = es.get(index='ready_to_move',doc_type='project_data',id=1)
    ##    project_data = project_data['_source']

    ##    project_features = es.get(index='project_features',doc_type='project_feature',id=1)
    ##    project_features = project_features['_source']

    allKeys = getAllKeys(project_features_json)
示例#55
0
    print "OK"

es.indices.get(index='classes', ignore=[400, 404])  # 그냥 가져오면 오류

es.indices.create(index="classes")  #인덱스 만들기
es.indices.get(index="classes")

es.indices.delete(index="classes")
es.indices.get(index='classes', ignore=[400, 404])  # 지웟으니 오류

#POST http://localhost:9200/classes/class/1 -d    인덱스/타입/id body값
body = {"title": "Algorithm", "professor": "John"}
es.index(index="classes", doc_type="class", id=1, body=body)

#GET http://localhost:9200/classes/class/1    인덱스/타입/id body값
res = es.get(index="classes", id=1)
print(json.dumps(res, indent=2))  # 이쁘게 출력 가능

#json 파일 불러와서 저장하기
import os
print(os.getcwd())
with open(
        'D:\\github\\team-crawlcrawl\\crawling_code\\elasticsearch_test\\oneclass.json'
) as data_file:
    data = json.load(data_file)
es.index(index="classes", doc_type="class", id=1, body=data)
res = es.get(index="classes", id=1)
print(json.dumps(res, indent=2))  # pretty-print the response
'''
Updating data
'''
示例#56
0
#
#e1={
#    "first_name":"nitin",
#    "last_name":"panwar",
#    "age": 27,
#    "about": "Love to play cricket",
#    "interests": ['sports','music'],
#}
#
##print(e1)
#
##Now let's store this document in Elasticsearch 
#res = es.index(index='megacorp',doc_type='employee',id=1,body=e1)

res = es.index(index="test-index", doc_type='tweet', id=1, body=doc)
#print(res['result'])

res = es.get(index="test-index", doc_type='tweet', id=1)
#print(res['_source'])

es.indices.refresh(index="test-index")

#res = es.search(index="test-index", body={"query": {"match_all": {}}})
#print("Got %d Hits:" % res['hits']['total'])

res = es.search(index='test-index', body={'query': {'match': {'LastName': 'Agrawal'}}})
print(res['hits']['hits'])

#print(res['hits']['hits'])
#for hit in res['hits']['hits']:
#    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])
示例#57
0
    tracer = logging.getLogger('elasticsearch.trace')
    tracer.setLevel(logging.INFO)
    tracer.addHandler(logging.FileHandler('/tmp/es_trace.log'))

    # instantiate es client, connects to localhost:9200 by default
    es = Elasticsearch()

    # we load the repo and all commits
    load_repo(es)

    # run the bulk operations
    success, _ = bulk(es, REPO_ACTIONS, index='git', raise_on_error=True)
    print('Performed %d actions' % success)

    # now we can retrieve the documents
    es_repo = es.get(index='git', doc_type='repos', id='elasticsearch')
    print('%s: %s' % (es_repo['_id'], es_repo['_source']['description']))

    # update - add java to es tags
    es.update(index='git',
              doc_type='repos',
              id='elasticsearch',
              body={
                  "script": "ctx._source.tags += tag",
                  "params": {
                      "tag": "java"
                  }
              })

    # refresh to make the documents available for search
    es.indices.refresh(index='git')
示例#58
0
        "_id": "20fbba1230cabbc0f4644f917c6c2be52b8a63e8",
        "_op_type": "update",
        "doc": {"initial_commit": True},
    },
    {
        "_type": "_doc",
        "_id": "ae0073c8ca7e24d237ffd56fba495ed409081bf4",
        "_op_type": "update",
        "doc": {"release": "5.0.0"},
    },
]

success, _ = bulk(client, UPDATES, index="git")
client.indices.refresh(index="git")

initial_commit = client.get(index="git", id="20fbba1230cabbc0f4644f917c6c2be52b8a63e8")

# and now we can count the documents
print(client.count(index="git")["count"], "documents in index")

import csv

with open("cars.csv") as csvfile:
    reader = csv.DictReader(csvfile, delimiter=";")
    ret = bulk(client, reader, index="cars")


result = client.search(
    index="git",
    body={
        "query": {
示例#59
0
# -*-coding:utf-8-*-
from elasticsearch import Elasticsearch

from datetime import datetime

es_service = [{"host": "127.0.0.1", "port": "9200"}]

es = Elasticsearch(es_service)

date = {
    'author': 'stone',
    'text': "今天的天气不太热",
    'timestamp': datetime.now(),
}

res = es.index(index='test-index', doc_type='tweet', id=1, body=date)
print(res)  # print the indexing response

res = es.get(index='test-index', doc_type='tweet', id=1)
print(res['_source'])  # print the document source

es.indices.refresh(index="test-index")

# custom search query
res = es.search(index="test-index", body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total']['value'])
for hit in res['hits']['hits']:
    # print(hit["_source"])
    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])
示例#60
-7
def create_index(data):
    # connect to the elasticsearch instance
    es = Elasticsearch("http://ec2-52-3-61-194.compute-1.amazonaws.com:9200")

    INDEX_NAME = 'parktest'

    d = {}
    d['time'] = data[0][0]
    d['garage_name'] = data[0][1]
    location = {}
    location['lat'] = data[0][2]
    location['lon'] = data[0][3]
    d['location'] = location
    d['availability'] = data[1]

    # get the details about the document with id = garage_name
    res = es.get(index=INDEX_NAME, doc_type=INDEX_NAME, id=data[0][1], ignore=404)

    # if the document with that id does not exist, create it
    if not res['found']:
        es.index(index=INDEX_NAME, doc_type=INDEX_NAME, id=data[0][1], body=d, refresh=True)
    else:
        # update the document
        qq = '{"doc": { "availability":' + str(data[1]) + '  }}'
        es.update(index=INDEX_NAME, doc_type=INDEX_NAME, id=data[0][1], body=qq)

    return d
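
# A hedged usage sketch for create_index() above: `data` is assumed to be a
# ((time, garage_name, lat, lon), availability) tuple, matching how the
# document is built above.
sample = (('2016-01-01T12:00:00', 'garage-42', 40.7128, -74.0060), 17)
print(create_index(sample))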