def index():
    """Bulk-index JSON-lines from the file named in sys.argv[1] into the
    'test-index' index, printing progress every 10000 lines.

    Lines must be JSON objects with an integer-convertible 'post_id'.
    Indexing failures are counted, not raised.
    """
    import time
    line_count = 0
    conn = ES(["localhost:9200"])
    #conn.create_index('test-index')
    # time.time(): wall-clock elapsed; time.clock() measured CPU time and
    # was removed in Python 3.8
    start = time.time()
    numb_exceptions = 0
    # context manager: the file is closed even if indexing raises
    with open(sys.argv[1], 'rb') as fptr:
        for line in fptr:
            if line_count % 10000 == 0:
                minutes = (time.time() - start) / 60.0
                print('Done with %d took %f min. ' % (line_count, minutes))
                print('number of exceptions %d' % numb_exceptions)
            line_count += 1
            data = json.loads(line)
            post_id = int(data['post_id'])
            if post_id and data:
                try:
                    conn.index(data, "test-index", "test-type", post_id)
                except Exception:
                    # best-effort load: count the failure and keep going
                    numb_exceptions += 1
                    continue
    print('number of exceptions %d' % numb_exceptions)
def init():
    """(Re)build the 'zhihu' index: drop it if present, create it with an
    explicit 'answer' mapping, load everything from Data(), then redirect
    to the listing page."""
    conn = ES('127.0.0.1:9200')
    try:
        conn.delete_index("zhihu")
    except Exception:
        # index may not exist yet; deleting is best-effort.
        # (was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt)
        pass
    conn.create_index("zhihu")
    mapping = {
        u'id': {'store': 'yes', 'type': u'integer'},
        u'link': {'store': 'yes', 'type': u'string'},
        u'title': {'boost': 1.0,
                   'index': 'analyzed',
                   'store': 'yes',
                   'type': u'string'},
    }
    conn.put_mapping("answer", {'properties': mapping}, ["zhihu"])
    for item in Data().getData():
        conn.index(item, "zhihu", "answer", item['id'])
    conn.refresh(["zhihu"])
    return redirect('/list')
class ElasticSearchPipeline(object):
    """Scrapy pipeline that ships items to Elasticsearch via pyes,
    using basic auth and project settings."""

    def __init__(self):
        self.settings = get_project_settings()
        basic_auth = {'username': self.settings['ELASTICSEARCH_USERNAME'],
                      'password': self.settings['ELASTICSEARCH_PASSWORD']}
        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                             self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
        self.es = ES([uri], basic_auth=basic_auth)

    def process_item(self, item, spider):
        """Index *item*; doc id is the item's own id when no unique key is
        configured, otherwise a SHA1 of the unique key's value."""
        if self.__get_uniq_key() is None:
            log.msg("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(dict(item),
                          self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          id=item['id'], op_type='create')
        else:
            log.msg("Generation SHA1")
            self.es.index(dict(item),
                          self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          self._get_item_key(item))
        log.msg("Item send to Elastic Search %s"
                % (self.settings['ELASTICSEARCH_INDEX']),
                level=log.DEBUG, spider=spider)
        return item

    def _get_item_key(self, item):
        """SHA1 doc id built from the unique key's value(s).

        Supports a single field name or a list of field names (consistent
        with the other pipeline variants in this file; previously a list
        key crashed with an unhashable-type error).
        """
        uniq = self.__get_uniq_key()
        if isinstance(uniq, list):
            value = ''.join(item[key] for key in uniq)
        else:
            value = item[uniq]
        # sha1 requires bytes under Python 3
        return hashlib.sha1(value.encode('utf-8')).hexdigest()

    def __get_uniq_key(self):
        """Configured unique key, or None when unset/empty."""
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
class ElasticSearchPipeline(object):
    """Scrapy pipeline indexing items into Elasticsearch (no-auth variant)."""

    def __init__(self):
        from pyes import ES
        self.settings = get_project_settings()
        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                             self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
        self.es = ES([uri])

    def process_item(self, item, spider):
        """Index *item*; doc id is the item's own id when no unique key is
        configured, otherwise a SHA1 of the unique key's value."""
        if self.__get_uniq_key() is None:
            self.es.index(dict(item),
                          self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          id=item['id'], op_type='create')
        else:
            self.es.index(dict(item),
                          self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          self._get_item_key(item))
        return item

    def _get_item_key(self, item):
        """SHA1 of the unique key's value(s) in *item*.

        ELASTICSEARCH_UNIQ_KEY may be one field name or a list of field
        names; list values are concatenated before hashing.
        """
        uniq = self.__get_uniq_key()
        if isinstance(uniq, list):
            value = ''.join(item[key] for key in uniq)
        else:
            # BUG FIX: previously `value = uniq` hashed the key *name*,
            # so every item collapsed onto one document id.
            value = item[uniq]
        # sha1 requires bytes under Python 3
        return hashlib.sha1(value.encode('utf-8')).hexdigest()

    def __get_uniq_key(self):
        """Configured unique key, or None when unset/empty."""
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
class ElasticSearchPipeline(object):
    """Pipeline exposing index_item() to push single items into ES."""

    def __init__(self):
        self.settings = get_project_settings()
        basic_auth = {'username': self.settings['ELASTICSEARCH_USERNAME'],
                      'password': self.settings['ELASTICSEARCH_PASSWORD']}
        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                             self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
        self.es = ES([uri], basic_auth=basic_auth)

    def index_item(self, item):
        """Index one item.

        Doc id is a SHA1 of the configured unique key's value when
        ELASTICSEARCH_UNIQ_KEY is set, otherwise the item's own id with
        op_type='create'.
        """
        # BUG FIX: original read `item[uniq_key)]` (mismatched bracket,
        # SyntaxError) and `uniq_key` was never defined.
        uniq_key = self.settings['ELASTICSEARCH_UNIQ_KEY']
        if uniq_key:
            local_id = hashlib.sha1(item[uniq_key]).hexdigest()
            log.msg("Generated unique key %s" % local_id,
                    level=self.settings['ELASTICSEARCH_LOG_LEVEL'])
            op_type = 'none'
        else:
            op_type = 'create'
            local_id = item['id']
        self.es.index(dict(item),
                      self.settings['ELASTICSEARCH_INDEX'],
                      self.settings['ELASTICSEARCH_TYPE'],
                      id=local_id,
                      op_type=op_type)
class StashHandler(logging.Handler):
    """Logging handler that ships records to Elasticsearch using a
    logstash-compatible document layout.

    whitelist/blacklist filter which LogRecord attributes end up in the
    '@fields' section: a key is kept when it is not blacklisted and, if a
    whitelist is given, also whitelisted.
    """

    def __init__(self, constr, whitelist=None, blacklist=None):
        logging.Handler.__init__(self)
        self.conn = ES(constr)
        self.whitelist = whitelist
        self.blacklist = set() if blacklist is None else blacklist
        self.record_type = 'record'

    @property
    def index_name(self):
        """Daily logstash index, e.g. 'logstash-2024.01.31'."""
        today = datetime.date.today()
        return 'logstash-' + today.strftime('%Y.%m.%d')

    def emit(self, record):
        """Serialize *record* into a logstash v0 document and index it."""
        fields = {}
        for key, value in record.__dict__.items():
            if key in self.blacklist:
                continue
            if self.whitelist is not None and key not in self.whitelist:
                continue
            fields[key] = value
        entry = {
            "@fields": fields,
            "@message": record.msg,
            "@source": "gelf://localhost",
            "@source_host": "gelf://localhost",
            "@source_path": "/",
            "@tags": [],
            "@timestamp": datetime.datetime.utcnow().isoformat(),
            "@type": self.record_type,
        }
        self.conn.index(entry, self.index_name, self.record_type)
def es_index(self):
    """Index this object's search document into its tenant's index."""
    connection = ES(settings.ES_SERVERS, basic_auth=settings.ES_AUTH)
    document = self.get_search_kwargs()
    connection.index(
        doc=document,
        index=self.tenant.slug,
        doc_type=self.Meta.document_type,
        id=unicode(self.id),
    )
class ElasticSearchPipeline(object):
    """Minimal pipeline: every scraped item is indexed into the 'qrator'
    index, under a doc type named after the spider that produced it."""

    def __init__(self):
        self.conn = ES('localhost:9200')

    def process_item(self, item, spider):
        """Index *item* and hand it on unchanged."""
        self.conn.index(dict(item), "qrator", spider.name)
        return item
class ElasticSearchPipeline(object):
    """Scrapy pipeline indexing items into Elasticsearch with basic auth."""

    def __init__(self):
        self.settings = get_project_settings()
        basic_auth = {
            'username': self.settings['ELASTICSEARCH_USERNAME'],
            'password': self.settings['ELASTICSEARCH_PASSWORD']
        }
        if self.settings['ELASTICSEARCH_PORT']:
            uri = "%s:%d" % (self.settings['ELASTICSEARCH_SERVER'],
                             self.settings['ELASTICSEARCH_PORT'])
        else:
            uri = "%s" % (self.settings['ELASTICSEARCH_SERVER'])
        self.es = ES([uri], basic_auth=basic_auth)

    def process_item(self, item, spider):
        """Index *item*; uses the item id when no unique key is configured,
        otherwise a SHA1 of the unique key's value(s)."""
        if self.__get_uniq_key() is None:
            log.msg("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(
                dict(item),
                self.settings['ELASTICSEARCH_INDEX'],
                self.settings['ELASTICSEARCH_TYPE'],
                id=item['id'], op_type='create',
            )
        else:
            self.es.index(dict(item),
                          self.settings['ELASTICSEARCH_INDEX'],
                          self.settings['ELASTICSEARCH_TYPE'],
                          self._get_item_key(item))
        log.msg("Item send to Elastic Search %s"
                % (self.settings['ELASTICSEARCH_INDEX']),
                level=log.DEBUG, spider=spider)
        return item

    def _get_item_key(self, item):
        """SHA1 of the unique key's value(s) in *item*; supports a single
        field name or a list of field names."""
        uniq = self.__get_uniq_key()
        if isinstance(uniq, list):
            value = ''.join(item[key] for key in uniq)
        else:
            # BUG FIX: previously `value = uniq` hashed the key *name*,
            # collapsing every item onto a single document id.
            value = item[uniq]
        # sha1 requires bytes under Python 3
        return hashlib.sha1(value.encode('utf-8')).hexdigest()

    def __get_uniq_key(self):
        """Configured unique key, or None when unset/empty."""
        if not self.settings['ELASTICSEARCH_UNIQ_KEY'] or self.settings[
                'ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return self.settings['ELASTICSEARCH_UNIQ_KEY']
def processData(esurl, esindex, estype, shpPath, simplify, tolerance, startfrom):
    """Read features from a shapefile and bulk-index them into Elasticsearch.

    Features with id <= startfrom are skipped (resume support). Geometry is
    optionally simplified with *tolerance* and validated both before and
    after simplification; invalid geometry is logged and skipped.
    """
    # fail early (with a message) if the shapefile is unreadable
    try:
        with open(shpPath):
            pass
    except IOError:
        print("Unable to locate file: " + shpPath)

    # open the es connection
    from pyes import ES
    conn = ES(esurl, timeout=60, bulk_size=10)

    # a tolerance is mandatory when simplifying
    if simplify and tolerance is None:
        raise ValueError("You must pass a valid tolerance if simplifying geometry")

    # use fiona to open the shapefile and read it
    # (the original wrapped this in `try: ... except: raise`, a no-op)
    with fiona.open(shpPath) as source:
        for f in source:
            featid = int(f["id"])
            if featid <= startfrom:
                continue
            # grab the geom
            from shapely.geometry import shape
            geom = shape(f["geometry"])
            if validateGeometry(geom):
                # simplify if required
                if simplify:
                    geom = simplifyGeometry(geom, tolerance)
                # only index geometry still valid after simplification
                if validateGeometry(geom):
                    data = json.dumps(f)
                    key = f["id"]
                    conn.index(data, esindex, estype, key, bulk=True)
                else:
                    logging.error("Invalid Geometry: " + f["id"])
def processData(esurl, esindex, estype, shpPath, simplify, tolerance, startfrom):
    """Read features from a shapefile and bulk-index them into Elasticsearch.

    Features with id <= startfrom are skipped (resume support). Geometry is
    optionally simplified with *tolerance*, and validated before and after
    simplification; invalid geometry is logged and skipped.
    """
    # fail early (with a message) if the shapefile is unreadable
    try:
        with open(shpPath):
            pass
    except IOError:
        print('Unable to locate file: ' + shpPath)

    #open the es connection
    from pyes import ES
    conn = ES(esurl, timeout=60, bulk_size=10)

    #check that a tolerance is passed when simplifying
    if simplify and tolerance is None:
        raise ValueError(
            'You must pass a valid tolerance if simplifying geometry')

    #use fiona to open the shapefile and read it
    # (original wrapped this in `try: ... except: raise`, which is a no-op)
    with fiona.open(shpPath) as source:
        for f in source:
            featid = int(f['id'])
            if featid <= startfrom:
                continue
            #grab the geom
            from shapely.geometry import shape
            geom = shape(f['geometry'])
            if validateGeometry(geom):
                #simplify if required
                if simplify:
                    geom = simplifyGeometry(geom, tolerance)
                #if the geom is valid then push it into es
                if validateGeometry(geom):
                    data = json.dumps(f)
                    key = f['id']
                    conn.index(data, esindex, estype, key, bulk=True)
                else:
                    logging.error('Invalid Geometry: ' + f['id'])
class ElasticSearchSink(object):
    """Event sink writing events into Elasticsearch.

    A list of events becomes a single bulk request; a lone event is
    indexed immediately.
    """

    def __init__(self, server, index, type):
        from pyes import ES
        self.cxn = ES(server)
        self.index = index
        self.type = type

    def __call__(self, event):
        """Index *event* (or each element of it, when given a list)."""
        if not isinstance(event, list):
            self.cxn.index(event, self.index, self.type)
            return
        self.cxn.bulk_size = len(event)
        for single in event:
            self.cxn.index(single, self.index, self.type, bulk=True)
        self.cxn.flush_bulk()
class Importer(object):
    """Load one TicketNetwork CSV feed ('performers', 'events' or
    'venues') and bulk-index each sanitized row into 'oedi_sources'."""

    base_filename = "TicketNetworkDataFeed"
    model_map = {
        "performers": {"file": "Performers.csv", "model": Performer},
        "events": {"file": "Events.csv", "model": Event},
        "venues": {"file": "Venues.csv", "model": Venue},
    }

    def __init__(self, data_type, csv_path="/tmp/",
                 es_hosts=("http://localhost:9200", )):
        self.data_type = data_type
        self.doc_type = "ticketnetwork_%s" % self.data_type
        self.csv_path = csv_path
        self.es = ES(es_hosts)

    def model(self):
        """Model class for the configured data type."""
        return self.model_map[self.data_type]["model"]

    def filepath(self):
        """Full CSV path, e.g. /tmp/TicketNetworkDataFeed-Events.csv."""
        filename = '-'.join(
            [self.base_filename, self.model_map[self.data_type]["file"]])
        return os.path.join(self.csv_path, filename)

    def __call__(self, *args, **kwargs):
        """Read the CSV and bulk-index every row, flushing at the end."""
        with open(self.filepath()) as f:
            for entry in DictReader(f):
                sanitize(entry)
                model = self.model()(entry)
                self.es.index(model.dict(), "oedi_sources", self.doc_type,
                              model.hash(), bulk=True)
            self.es.flush_bulk(True)
def main(argv):
    """Bulk-index one million random 26-field documents into
    'pyindex'/'pytype', printing a CSV throughput line whenever pyes
    flushes a bulk batch (es.index only returns a result on flush)."""
    start = 1
    if len(sys.argv) > 1 and sys.argv[1]:
        # BUG FIX: argv entries are strings; range() below needs an int
        start = int(sys.argv[1])
    bulksize = 1000
    es = ES(("http", "localhost", 9200), bulk_size=bulksize)
    c0 = 0
    t0 = time.time()
    c1 = 0
    t1 = time.time()
    for n in range(start, start + 1000000):
        doc = {key: random_string_generator()
               for key in 'abcdefghijklmnopqrstuvwxyz'}
        result = es.index(doc, 'pyindex', 'pytype', n, bulk=True)
        # counters advance by bulksize per doc; the rate columns divide by
        # bulksize again, so the printed figures come out as docs/sec
        c0 = c0 + bulksize
        c1 = c1 + bulksize
        if result:
            d0 = time.time() - t0
            d1 = time.time() - t1
            now = datetime.datetime.utcnow()
            print("{0},{1},{2},{3},{4},{5},{6},{7}".format(
                now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), result.took,
                c0, d0, c0 / (d0 * bulksize), c1, d1, c1 / (d1 * bulksize)))
            c1 = 0
            t1 = time.time()
class Indexer:
    """Bulk-index lines of a shapefile-derived JSON dump into Elasticsearch,
    retrying once when the server drops out."""

    def __init__(self, es_host, batch_mode=True, batch_size=100):
        self.client = ES(es_host)
        self.batch_mode = batch_mode
        self.client.bulk_size = int(batch_size)

    def bulk_index(self, index, type, shapefile, sleep_time=0.1):
        """Index every line of *shapefile*, pausing *sleep_time* seconds
        after each full batch to avoid overrunning the server."""
        print('Indexing [%s] docs into [%s] from %s' % (type, index, shapefile))
        index_count = 0
        # BUG FIX: raw strings -- '\s' and '\(' in ordinary literals are
        # invalid escape sequences (DeprecationWarning, then SyntaxError)
        id_re = re.compile(r'^.*?"id"\s*:\s*"([^"]+)"')
        parens_re = re.compile(r'\(.*?\)')
        for line in input(shapefile):
            id = id_re.match(line).group(1)
            # cleanup any lines that contain parentheticals
            line = parens_re.sub('', line).strip()
            # normalize encodings
            line = line.decode('latin-1').encode('utf-8')
            id = id.decode('latin-1').encode('utf-8')
            try:
                self.client.index(line, index, type, id, bulk=self.batch_mode)
            except UnicodeDecodeError as e:
                print("Error processing line with id %s: %s" % (id, e.message))
            except NoServerAvailable as e:
                # NOTE(review): the message reports sleep_time but the code
                # sleeps 5 seconds -- confirm which is intended
                print("The server failed to respond while indexing %s: [%s]. Sleeping %d seconds and retrying..." % (id, e.message, sleep_time))
                sleep(5)
                try:
                    print("Retrying indexing of %s" % id)
                    self.client.index(line, index, type, id, bulk=self.batch_mode)
                except NoServerAvailable as e:
                    print("Failed to reconnect again. Skipping indexing %s" % id)
                except Exception as e:
                    print("This happened: %s" % e)
            index_count += 1
            if index_count % int(self.client.bulk_size) == 0:
                print('Indexing batch of %d, starting from %s' % (self.client.bulk_size, id))
                sleep(sleep_time)
        # index remaining bulk entries
        self.client.force_bulk()
class ElasticSearchPipeline(object):
    """Scrapy pipeline (from_crawler variant) indexing items into ES."""

    def __init__(self, settings):
        basic_auth = {'username': settings.get('ELASTICSEARCH_USERNAME'),
                      'password': settings.get('ELASTICSEARCH_PASSWORD')}
        if settings.get('ELASTICSEARCH_PORT'):
            uri = "%s:%d" % (settings.get('ELASTICSEARCH_SERVER'),
                             settings.get('ELASTICSEARCH_PORT'))
        else:
            uri = "%s" % (settings.get('ELASTICSEARCH_SERVER'))
        self.es = ES([uri], basic_auth=basic_auth)
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy factory hook."""
        return cls(crawler.settings)

    def process_item(self, item, spider):
        """Index *item*; uses the item id when no unique key is configured,
        otherwise a SHA1 of the unique key's value(s)."""
        if self.__get_uniq_key() is None:
            log.info("ELASTICSEARCH_UNIQ_KEY is NONE")
            self.es.index(dict(item),
                          self.settings.get('ELASTICSEARCH_INDEX'),
                          self.settings.get('ELASTICSEARCH_TYPE'),
                          id=item['id'], op_type='create')
        else:
            self.es.index(dict(item),
                          self.settings.get('ELASTICSEARCH_INDEX'),
                          self.settings.get('ELASTICSEARCH_TYPE'),
                          self._get_item_key(item))
        log.debug("Item send to Elastic Search %s"
                  % (self.settings.get('ELASTICSEARCH_INDEX')), spider=spider)
        return item

    def _get_item_key(self, item):
        """SHA1 of the unique key's value(s) in *item*."""
        uniq = self.__get_uniq_key()
        if isinstance(uniq, list):
            value = ''.join(item[key] for key in uniq)
        else:
            # BUG FIX: previously `value = uniq` hashed the key *name*
            # instead of the item's value for that key.
            value = item[uniq]
        # sha1 requires bytes under Python 3
        return hashlib.sha1(value.encode('utf-8')).hexdigest()

    def __get_uniq_key(self):
        """Configured unique key, or None when unset/empty.

        BUG FIX: the branches were inverted -- the method returned None
        exactly when ELASTICSEARCH_UNIQ_KEY *was* configured, so the
        unique-key code path could never run.
        """
        uniq_key = self.settings.get('ELASTICSEARCH_UNIQ_KEY')
        if not uniq_key:
            return None
        return uniq_key
def index(fname, index_name, keys_to_tag):
    """Index tagged posts from a JSON-lines file into *index_name*.

    Posts without tags, or without any of the *keys_to_tag* fields, are
    skipped. 'content' is HTML-stripped before indexing. Indexing
    failures are counted, not raised.
    """
    line_count = 0
    conn = ES(["localhost:9200"])
    if not conn.exists_index(index_name):
        conn.create_index(index_name)
    # time.time(): wall-clock elapsed; time.clock() measured CPU time and
    # was removed in Python 3.8
    start = time.time()
    numb_exceptions = 0
    # context manager: the file is closed even if indexing raises
    with open(fname, 'rb') as fptr:
        for line in fptr:
            if line_count % 10000 == 0:
                minutes = (time.time() - start) / 60.0
                print('File: %s Done with %d took %f min. ' % (fname, line_count, minutes))
                print('number of exceptions %d' % numb_exceptions)
            line_count += 1
            data = json.loads(line)
            if not data.get('tags'):
                continue
            post_id = int(data['post_id'])
            # skip posts that carry none of the requested fields
            if not any(data.get(k) for k in keys_to_tag):
                continue
            index_data = dict()
            for k in keys_to_tag:
                value = data.get(k)
                if value and k == 'content':
                    try:
                        stripped_value = utils.strip_tags(value)
                    except Exception:
                        stripped_value = value
                    # NOTE(review): as in the original, only 'content' is
                    # ever copied into index_data -- confirm the other
                    # keys_to_tag are meant to be dropped
                    index_data[k] = stripped_value
            if post_id and data:
                try:
                    conn.index(index_data, index_name, "test-type", post_id)
                except Exception:
                    numb_exceptions += 1
                    continue
    print('number of exceptions %d' % numb_exceptions)
def main(fn, args):
    """Bulk-index a (optionally gzipped) file of JSON docs, one per line.

    A doc's '_id' field, when present, is removed from the body and used
    as the Elasticsearch document id.
    """
    conn = ES(args.host, bulk_size=10 * args.bulksize)
    fp = gzip.open(fn) if fn.endswith(".gz") else open(fn)
    count = 0
    total = 0
    try:
        for line in fp:
            doc = json.loads(line.strip())
            # pop: the id must not remain in the document body.
            # (previously a falsy '_id' was left in the doc while id=None
            # was sent -- inconsistent; now it is always removed)
            _id = doc.pop("_id", None)
            conn.index(doc=doc, index=args.index, doc_type=args.doctype,
                       id=_id, bulk=True)
            count += 1
            total += 1
            if count % args.bulksize == 0:
                flush(conn, count)
                count = 0
    except BaseException:
        # report then re-raise (was a bare `except:` with the same intent)
        print("traceback %s" % "".join(traceback.format_exception(*sys.exc_info())))
        raise
    finally:
        fp.close()
    try:
        # best-effort final flush + refresh
        flush(conn, count)
        conn.refresh(args.index)
    except Exception:
        pass
    print("Indexed %s docs total" % total)
class ElasticSearchPipeline(object):
    """Scrapy pipeline using the module-level `settings` object."""

    def __init__(self):
        uri = "%s:%d" % (settings['ELASTICSEARCH_SERVER'],
                         settings['ELASTICSEARCH_PORT'])
        self.es = ES([uri])

    def process_item(self, item, spider):
        """Index *item*, with a SHA1 doc id when a unique key is set."""
        if self.__get_uniq_key() is None:
            self.es.index(dict(item),
                          settings['ELASTICSEARCH_INDEX'],
                          settings['ELASTICSEARCH_TYPE'])
        else:
            self.es.index(dict(item),
                          settings['ELASTICSEARCH_INDEX'],
                          settings['ELASTICSEARCH_TYPE'],
                          hashlib.sha1(item[self.__get_uniq_key()]).hexdigest())
        # BUG FIX: the log line read the misspelled 'ELASTIC_SEARCH_INDEX'
        # key, logging the wrong (missing) setting
        log.msg("Item send to Elastic Search %s"
                % (settings['ELASTICSEARCH_INDEX']),
                level=log.DEBUG, spider=spider)
        return item

    def __get_uniq_key(self):
        """Configured unique key, or None when unset/empty."""
        if not settings['ELASTICSEARCH_UNIQ_KEY'] or settings['ELASTICSEARCH_UNIQ_KEY'] == "":
            return None
        return settings['ELASTICSEARCH_UNIQ_KEY']
def main(argv):
    """Bulk-index one million random 26-field documents into
    'pyindex'/'pytype', printing a CSV throughput line whenever pyes
    flushes a bulk batch (es.index only returns a result on flush)."""
    start = 1
    if len(sys.argv) > 1 and sys.argv[1]:
        # BUG FIX: argv entries are strings; range() below needs an int
        start = int(sys.argv[1])
    bulksize = 1000
    es = ES(("http", "localhost", 9200), bulk_size=bulksize)
    c0 = 0
    t0 = time.time()
    c1 = 0
    t1 = time.time()
    for n in range(start, start + 1000000):
        doc = {key: random_string_generator()
               for key in string.ascii_lowercase}
        result = es.index(doc, 'pyindex', 'pytype', n, bulk=True)
        # counters advance by bulksize per doc; the rate columns divide by
        # bulksize again, so the printed figures come out as docs/sec
        c0 = c0 + bulksize
        c1 = c1 + bulksize
        if result:
            d0 = time.time() - t0
            d1 = time.time() - t1
            now = datetime.datetime.utcnow()
            print("{0},{1},{2},{3},{4},{5},{6},{7}"
                  .format(now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), result.took,
                          c0, d0, c0 / (d0 * bulksize),
                          c1, d1, c1 / (d1 * bulksize)))
            c1 = 0
            t1 = time.time()
class Importer(object):
    """Import a TicketNetwork CSV feed into the 'oedi_sources' ES index.

    *data_type* selects which feed ('performers', 'events', 'venues')
    and which model class wraps each sanitized CSV row.
    """

    base_filename = "TicketNetworkDataFeed"
    model_map = {
        "performers": {"file": "Performers.csv", "model": Performer},
        "events": {"file": "Events.csv", "model": Event},
        "venues": {"file": "Venues.csv", "model": Venue},
    }

    def __init__(self, data_type, csv_path="/tmp/",
                 es_hosts=("http://localhost:9200",)):
        self.data_type = data_type
        self.doc_type = "ticketnetwork_%s" % self.data_type
        self.csv_path = csv_path
        self.es = ES(es_hosts)

    def model(self):
        """Return the model class for this feed."""
        return self.model_map[self.data_type]["model"]

    def filepath(self):
        """Absolute path of the feed's CSV file."""
        entry = self.model_map[self.data_type]
        return os.path.join(self.csv_path,
                            '-'.join([self.base_filename, entry["file"]]))

    def __call__(self, *args, **kwargs):
        """Parse the CSV and bulk-index each row; flush when done."""
        with open(self.filepath()) as f:
            reader = DictReader(f)
            for entry in reader:
                sanitize(entry)
                instance = self.model()(entry)
                self.es.index(instance.dict(), "oedi_sources",
                              self.doc_type, instance.hash(), bulk=True)
            self.es.flush_bulk(True)
def init():
    """(Re)build the 'zhihu' index from Data() and redirect to /list."""
    conn = ES('127.0.0.1:9200')
    try:
        conn.delete_index("zhihu")
    except Exception:
        # index may not exist yet; deleting is best-effort.
        # (was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt)
        pass
    conn.create_index("zhihu")
    mapping = {
        u'id': {'store': 'yes', 'type': u'integer'},
        u'link': {'store': 'yes', 'type': u'string'},
        u'title': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes',
                   'type': u'string'},
    }
    conn.put_mapping("answer", {'properties': mapping}, ["zhihu"])
    for item in Data().getData():
        conn.index(item, "zhihu", "answer", item['id'])
    conn.refresh(["zhihu"])
    return redirect('/list')
def callback(body, message):
    """Do actual work: extract granule metadata from an AIRS HDF file,
    publish the file, index the metadata, then ack the message."""
    logger.info("body in callback() is %s" % body)

    # pull lat/lon, time
    path = body
    sd = SD(path)
    lat = N.array(sd.select('Latitude').get())
    lon = N.array(sd.select('Longitude').get())
    t = N.array(sd.select('Time').get())
    sd.end()
    #logger.info("lat: %s" % str(lat.shape))
    #logger.info("lon: %s" % str(lon.shape))
    #logger.info("time: %s" % str(t.shape))

    # build metadata json; the polygon walks the four granule corners
    # ([0,0] -> [0,29] -> [44,29] -> [44,0]) and closes on the first one
    prod_id = os.path.basename(path)
    md = {
        "id": prod_id,
        "dataset": "AIRX2RET",
        "starttime": t[0, 0],
        "endtime": t[44, 29],
        "location": {
            "coordinates": [[
                [lon[0, 0], lat[0, 0]],
                [lon[0, 29], lat[0, 29]],
                [lon[44, 29], lat[44, 29]],
                [lon[44, 0], lat[44, 0]],
                [lon[0, 0], lat[0, 0]],
            ]],
            "type": "polygon",
        },
        "urls": "http://mozart/data/public/products/%s" % prod_id,
    }

    # publish
    pub_dir = '/data/public/products'
    ensure_dir(pub_dir)
    shutil.move(path, os.path.join(pub_dir, prod_id))

    # insert into ElasticSearch
    index = doctype = 'airs'
    conn = ES('http://localhost:9200')
    mapping = json.load(open('grq_mapping.json'))
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index, mapping)
    conn.indices.put_mapping(doctype, mapping, index)
    ret = conn.index(md, index, doctype, md['id'])
    message.ack()
class KVStore(KVStoreBase):
    """sorl-thumbnail key-value store backed by Elasticsearch."""

    def __init__(self, *args, **kwargs):
        super(KVStore, self).__init__(*args, **kwargs)
        self.connection = ES(settings.THUMBNAIL_ELASTIC_SEARCH_SERVERS)

    def _get_raw(self, key):
        """Return the stored value for *key*, or None when missing."""
        try:
            value = self.connection.get(
                settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                key)
            return value['_source']['value']
        except Exception:
            # missing doc or unreachable server both read as "not cached"
            # (was a bare `except:`, which also caught SystemExit)
            return None

    def _set_raw(self, key, value):
        """Store *value* under *key*; returns the ES 'ok' flag."""
        ret = self.connection.index(
            {"value": value},
            settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
            settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
            key)
        return ret['ok']

    def _delete_raw(self, *keys):
        """Delete *keys*; returns one success boolean per key."""
        rets = []
        for key in keys:
            try:
                ret = self.connection.delete(
                    settings.THUMBNAIL_ELASTIC_SEARCH_INDEX,
                    settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE,
                    key)
                rets.append(ret['ok'])
            except Exception:
                rets.append(False)
        return rets

    def _find_keys_raw(self, prefix):
        """Return the ids of all docs whose _id starts with *prefix*
        (capped at 1000 hits)."""
        search = Search(query=PrefixQuery("_id", prefix),
                        size=1000, start=0, fields=[])
        results = self.connection.search(
            search,
            indexes=[settings.THUMBNAIL_ELASTIC_SEARCH_INDEX, ],
            doc_types=[settings.THUMBNAIL_ELASTIC_SEARCH_DOCUMENT_TYPE, ])
        return [hit['_id'] for hit in results['hits']['hits']]
def es_index(self):
    """Push this record's search document into its tenant's ES index."""
    es = ES(settings.ES_SERVERS, basic_auth=settings.ES_AUTH)
    es.index(doc=self.get_search_kwargs(),
             index=self.tenant.slug,
             doc_type=self.Meta.document_type,
             id=unicode(self.id))
class ESIndexerBase(object):
    # Base class for gene-index builders: wraps a pyes connection with
    # helpers to (re)create the index, manage per-type mappings, and
    # bulk-load documents.
    ES_HOST = ES_HOST
    ES_INDEX_NAME = ES_INDEX_NAME
    ES_INDEX_TYPE = 'gene'

    def __init__(self):
        self.conn = ES(self.ES_HOST, default_indexes=[self.ES_INDEX_NAME],
                       timeout=10.0)
        # batch size used by loaders built on this class
        self.step = 10000

    def create_index(self):
        # Open the index if it exists, otherwise create it; pyes raises
        # IndexMissingException on a missing index.
        try:
            print self.conn.open_index(self.ES_INDEX_NAME)
        except IndexMissingException:
            print self.conn.create_index(self.ES_INDEX_NAME)

    def delete_index_type(self, index_type):
        '''Delete all indexes for a given index_type.'''
        index_name = self.ES_INDEX_NAME
        # index_type = self.ES_INDEX_TYPE
        #Check if index_type exists
        mapping = self.conn.get_mapping(index_type, index_name)
        if index_name not in mapping or index_type not in mapping[index_name]:
            print 'Error: index type "%s" does not exist in index "%s".' % (index_type, index_name)
            return
        path = '/%s/%s' % (index_name, index_type)
        # interactive confirmation before destructive delete
        if ask('Confirm to delete all data under "%s":' % path) == 'Y':
            return self.conn.delete_mapping(index_name, index_type)

    def index(self, doc, index_type, id=None):
        '''add a doc to the index. If id is not None,
            the existing doc will be updated.
        '''
        # index_type = self.ES_INDEX_TYPE
        return self.conn.index(doc, self.ES_INDEX_NAME, index_type, id=id)

    def delete_index(self, index_type, id):
        '''delete a doc from the index based on passed id.'''
        # index_type = self.ES_INDEX_TYPE
        return self.conn.delete(self.ES_INDEX_NAME, index_type, id)

    def optimize(self):
        # merge index segments; blocks until the merge finishes
        return self.conn.optimize(self.ES_INDEX_NAME, wait_for_merge=True)

    def get_field_mapping(self):
        # Reload the dataload module so mapping changes are picked up
        # without restarting the process.
        import dataload
        reload(dataload)
        dataload.register_sources()
        return dataload.get_mapping()

    def build_index(self, doc_d, update_mapping=False, bulk=True):
        # Load every doc in doc_d (a dict of id -> doc) into the index,
        # optionally refreshing the type mapping first.
        index_name = self.ES_INDEX_NAME
        index_type = self.ES_INDEX_TYPE
        #Test if index exists
        try:
            print "Opening index...", self.conn.open_index(index_name)
        except NotFoundException:
            print 'Error: index "%s" does not exist. Create it first.' % index_name
            return -1
        try:
            cur_mapping = self.conn.get_mapping(index_type, index_name)
            empty_mapping = False
        except ElasticSearchException:
            #if no existing mapping available for index_type
            #force update_mapping to True
            empty_mapping = True
            update_mapping = True
        # empty_mapping = not cur_mapping[index_name].get(index_type, {})
        # if empty_mapping:
        #     #if no existing mapping available for index_type
        #     #force update_mapping to True
        #     update_mapping = True
        if update_mapping:
            print "Updating mapping...",
            if not empty_mapping:
                print "\n\tRemoving existing mapping...",
                print self.conn.delete_mapping(index_name, index_type)
            _mapping = self.get_field_mapping()
            print self.conn.put_mapping(index_type, _mapping, [index_name])
        print "Building index..."
        t0 = time.time()
        for doc_id, doc in doc_d.items():
            self.conn.index(doc, index_name, index_type, doc_id, bulk=bulk)
        # flush pending bulk ops, then refresh so docs become searchable
        print self.conn.flush()
        print self.conn.refresh()
        print "Done[%s]" % timesofar(t0)

    def query(self, qs, fields='symbol,name', **kwargs):
        # Free-text query string search returning the selected fields.
        _q = StringQuery(qs)
        res = self.conn.search(_q, fields=fields, **kwargs)
        return res
class ElasticCatalog(object):
    # zelastic catalog: mirrors container documents into a single ES index
    # (one doc type per container), keeping only the fields declared as
    # indexes in the container's metadata.

    # every document carries its persistent key, stored un-analyzed
    default_indexes = {
        'zelastic_doc_id': {
            'type': 'string',
            'index': 'not_analyzed'
        }
    }

    def __init__(self, connection_string, elastic_name, storage, bulk=False, bulk_size=400):
        self.conn = ES(connection_string, bulk_size=bulk_size)
        self.bulk_size = bulk_size
        self.name = elastic_name
        self.storage = storage
        self.bulk = bulk

    def update_mapping(self, name):
        # Translate the container's declared index types ('str', 'full',
        # 'bool', 'int', 'datetime'/'date', 'float') into an ES mapping
        # and push it; creating the index is idempotent.
        meta = self.storage.meta(name)
        indexes = meta['indexes']
        properties = self.default_indexes.copy()
        try:
            self.conn.create_index(self.name)
        except IndexAlreadyExistsException:
            pass
        for index_name, _type in indexes.items():
            index = None
            if _type == 'str':
                # exact-match string, not tokenized
                index = {
                    'type': 'string',
                    'index': 'not_analyzed',
                }
            elif _type == 'full':
                # full-text string, analyzed for search
                index = {
                    'type': 'string',
                    'index': 'analyzed',
                }
            elif _type == 'bool':
                index = {
                    'type': 'boolean'
                }
            elif _type == 'int':
                index = {
                    'type': 'integer',
                }
            elif _type in ('datetime', 'date'):
                index = {
                    'type': 'date',
                }
            elif _type == 'float':
                index = {
                    'type': 'float',
                }
            if index is not None:
                properties[index_name] = index
        self.conn.indices.put_mapping(
            doc_type=name,
            mapping={
                'ignore_conflicts': True,
                'properties': properties
            },
            indices=[self.name])

    def id(self, container_name, key):
        # doc ids are namespaced by container to avoid cross-container clashes
        return '%s-%s' % (container_name, key)

    def index(self, container_name, doc, key):
        # need to add data to the index that isn't actually persisted
        data = {
            'zelastic_doc_id': key
        }
        meta = self.storage.meta(container_name)
        indexes = meta['indexes']
        # copy only the declared indexed fields from the document
        for index in indexes.keys():
            if index in doc:
                data[index] = doc[index]
        self.conn.index(
            data,
            self.name,
            container_name,
            self.id(container_name, key),
            bulk=self.bulk)

    def delete(self, container_name, key):
        self.conn.delete(
            self.name,
            container_name,
            self.id(container_name, key),
            bulk=self.bulk)

    def delete_all(self, container_name):
        # dropping the doc-type mapping removes every doc of the container
        self.conn.delete_mapping(
            self.name,
            container_name)

    def search(self, container_name, query, **kwargs):
        return self.conn.search(
            query,
            indexes=[self.name],
            doc_types=[container_name],
            **kwargs)

    def getFacets(self, container_name, field, size=100):
        # term facet over *field*: all terms, ordered lexicographically
        return self.conn.search_raw({
            "facets": {
                field: {
                    "terms": {
                        "all_terms": True,
                        "field": field,
                        "size": size,
                        "order": "term"
                    }
                }
            }
        }, indexes=[self.name], doc_type=container_name)
class Elastic(object):
    """Thin pyes wrapper used by the app for order_index search and CRUD."""

    def init_app(self, app):
        self.conn = ES(app.config['ELASTIC_URL'], timeout=2)
        #self.remote_conns = [ES(url) for url in app.config['REMOTE_ELASTIC_URL']]

    def search(self, start=0, size=20, doc_types='resource',
               indices='order_index', sort=None, **kwargs):
        """Filtered (and optionally sorted) search over *indices*.

        Extra keyword args become term filters, except 'complete_time',
        which is treated as a range of one or two bounds.
        """
        # set filter
        filters = []
        for k, v in kwargs.items():
            if k and k != 'complete_time':
                filters.append(TermFilter(k, v))
            elif k and v != '' and k == 'complete_time':
                ct = kwargs['complete_time']
                if len(ct) == 2:
                    filters.append(RangeFilter(ESRange(
                        'complete_time', from_value=ct[0], to_value=ct[1])))
                else:
                    filters.append(RangeFilter(ESRange(
                        'complete_time', from_value=ct[0])))
        _filter = None
        if filters:
            _filter = ANDFilter(filters)
        bq = MatchAllQuery()
        # filtered
        q = FilteredQuery(bq, _filter)
        # sort
        if sort:
            sf = SortFactory()
            for s in sort:
                sf.add(s)
            s = Search(q, sort=sf)
        else:
            s = Search(q)
        # result
        return self.conn.search(s, indices=indices, doc_types=doc_types,
                                start=start, size=size)

    def delete(self, index='order_index', doc_type='resource', id=''):
        return self.conn.delete(index=index, doc_type=doc_type, id=id)

    def create(self, index='order_index', doc_type='resource', doc=None):
        """Index *doc* (keyed by doc['id']); best-effort when ES is down."""
        try:
            return self.conn.index(doc, index, doc_type, id=doc['id'])
        except Exception:
            # not connected -- cache writes are best-effort
            # (was a bare `except:`, which also swallowed SystemExit)
            pass

    def multi_create(self, index='order_index', doc_type='resource', doc=None):
        """Index *doc* locally, then mirror it to remote clusters.

        NOTE: use celery when syncing the cache to remote clusters.
        NOTE(review): self.remote_conns is never assigned (the line in
        init_app is commented out), so the remote loop always fails and
        is swallowed below -- confirm intent.
        """
        try:
            return self.conn.index(doc, index, doc_type, id=doc['id'])
        except Exception:
            # not connected
            pass
        try:
            for rconn in self.remote_conns:
                rconn.index(doc, index, doc_type, id=doc['id'])
        except Exception:
            print('--------sync cache to remote error------')
class ElasticSearchHandler(logging.Handler):
    """Logging handler that sends the logs to a Elasticsearch instance."""

    def __init__(self, conn_strs=None, record_type="record",
                 level=logging.NOTSET, fqdn=False, service_name=None,
                 deploy_name=None, version=0):
        """Initialize the handler.

        Args:
            conn_strs (list): List of Elasticsearch connection strings.
            record_type (str): The record type always will be 'record'.
            level (str): Logging level. Default: NOTSET.
            fqdn (bool): If True, the host field in the log record will be
                the fully qualified domain; otherwise the system hostname.
            service_name (str): Service name.
            deploy_name (str): Deploy name.
            version (int): 1 selects the Logstash formatter version 1;
                anything else selects version 0.
        """
        logging.Handler.__init__(self, level=level)
        self.conn_strs = conn_strs if conn_strs else ["127.0.0.1:9200"]
        self.connected = False
        self.conn = None
        self.try_conn()
        self.record_type = record_type
        formatter_cls = (logstash.LogstashFormatterVersion1 if version == 1
                         else logstash.LogstashFormatterVersion0)
        self.formatter = formatter_cls(record_type, fqdn, service_name,
                                       deploy_name)

    def try_conn(self):
        """Try a new connection to the Elasticsearch."""
        try:
            self.conn = ES(self.conn_strs, timeout=5)
            self.connected = True
        except NoServerAvailable:
            print("Error connecting to elasticsearch for logging")

    @property
    def index_name(self):
        """Daily logstash index name, e.g. 'logstash-2024.01.31'."""
        return "logstash-" + datetime.date.today().strftime("%Y.%m.%d")

    def emit(self, record):
        """Format *record* and index it; reconnect lazily when offline."""
        entry = self.formatter.format(record)
        if self.connected:
            self.conn.index(entry, self.index_name, self.record_type)
        else:
            self.try_conn()
class SampleMaker(object):
    # Builds a throwaway "test-index" on a local Elasticsearch node and
    # fills it with randomly generated sample documents.

    def __init__(self, name):
        # Open the curl dump log at *name* and (re)create the test index.
        log = open(name, "wb")
        self.log = log
        # log_curl/dump_curl make pyes record every request as a curl
        # command into the log file, useful for replaying/debugging.
        self.conn = ES(("http", "127.0.0.1", 9200), timeout=300.0,
                       log_curl=True, dump_curl=log)
        self.index_name = "test-index"
        self.document_type = "test-type"
        # Start from a clean slate on every run.
        self.conn.delete_index_if_exists(self.index_name)
        self.init_default_index()

    def init_default_index(self):
        # Create the index with an explicit mapping for every sample field.
        from pyes.helpers import SettingsBuilder
        settings = SettingsBuilder()
        from pyes.mappings import DocumentObjectField
        from pyes.mappings import IntegerField
        from pyes.mappings import NestedObject
        from pyes.mappings import StringField, DateField, BooleanField, GeoPointField, FloatField
        docmapping = DocumentObjectField(name=self.document_type)
        docmapping.add_property(
            StringField(name="description", store=True,
                        term_vector="with_positions_offsets", index="analyzed"))
        docmapping.add_property(
            StringField(name="name", store=True,
                        term_vector="with_positions_offsets", index="analyzed"))
        # tag is filter-only: stored but not analyzed
        docmapping.add_property(
            StringField(name="tag", store=True, index="not_analyzed"))
        docmapping.add_property(IntegerField(name="age", store=True))
        docmapping.add_property(FloatField(name="price"))
        docmapping.add_property(DateField(name="date", store=True))
        docmapping.add_property(
            BooleanField(name="in_stock", store=True, index="not_analyzed"))
        docmapping.add_property(GeoPointField(name="position"))
        # "metadata" is a list of nested name/value/num objects.
        nested_object = NestedObject(name="metadata")
        nested_object.add_property(StringField(name="name", store=True))
        nested_object.add_property(StringField(name="value", store=True))
        nested_object.add_property(IntegerField(name="num", store=True))
        docmapping.add_property(nested_object)
        settings.add_mapping(docmapping)
        self.conn.ensure_index(self.index_name, settings)

    def generate_datafile(self, number_items=1000):
        """
        Generate a dataset with number_items elements.
        """
        names = get_names()
        totalnames = len(names)
        #init random seeder
        random.seed()
        #calculate items
        # names = random.sample(names, number_items)
        for i in xrange(number_items):
            data = {
                "name": names[random.randint(0, totalnames - 1)],
                "age": random.randint(1, 100),
                "price": random.random() * 100.0,
                "tag": [words(1, False) for r in xrange(random.randint(1, 5))],
                "in_stock": random.choice([True, False]),
                # random date within +/- 1000 days of now
                "date": datetime.now() + timedelta(
                    days=random.choice([1, -1]) * random.randint(0, 1000)),
                "position": {
                    "lat": random.choice([1, -1]) * random.random() * 90.0,
                    "lon": random.choice([1, -1]) * random.random() * 180.0
                },
                "description": words(random.randint(1, 100), False),
                "metadata": [{
                    "name": names[random.randint(0, totalnames - 1)],
                    "value": str(random.randint(1, 5)),
                    "num": random.randint(1, 50)
                } for r in xrange(random.randint(1, 5))]
            }
            # ids are 1-based strings
            self.conn.index(data, self.index_name, self.document_type,
                            id=str(i + 1))

    def close(self):
        # Flush pending writes before closing the curl log.
        self.conn.flush(self.index_name)
        self.log.close()
class ProcessSpiderData(Task):
    # Celery task: re-indexes one spider's scraped job postings into a
    # fresh Elasticsearch index, swaps the alias atomically, deletes the
    # old indices, and stores a bz2-compressed backup of the processed
    # source in the database.
    # NOTE(review): block reconstructed from collapsed source; the exact
    # indentation of the backup section relative to the main `if` could
    # not be recovered with certainty — verify against the original file.

    def run(self, spider_name):
        cities = []
        backup_source = []
        backup_created_date = None

        self.elastic = ES(settings.SEARCH_HOSTS, timeout=22.0, bulk_size=1500)
        # JVM bridge: boilerplate-stripping extractor (jpype)
        java = JavaInterface()
        self.extractor = java.ArticleSentencesExtractor.INSTANCE
        self.logger = ProcessSpiderData.get_logger()

        spider = Data.objects.get(name=spider_name)
        source = spider.source
        if spider and len(source):
            backup_created_date = spider.created_date

            # Timestamped index name, later aliased to the plain spider name.
            index_new = '%s_%d' % (spider.name, int(time.time()))

            # create new index (not connected to alias)
            self.elastic.create_index(index_new)
            self.elastic.put_mapping('job', {'job': {'properties': mapping}}, index_new)

            for item in source:
                item = self._process_content(item)
                item = self._get_location(item)

                if item.has_key('city'):
                    cities.append(item['city'])

                self._create_index(index_new, item)
                backup_source.append(item)

            # save new index (in bulk)
            self.elastic.force_bulk()

            # create alias
            indices_old = self.elastic.get_alias(spider.name)
            self.elastic.set_alias(spider.name, [index_new])

            # delete all indices
            for index in indices_old:
                self.elastic.delete_index_if_exists(index)

            # optimize
            self.elastic.optimize(index_new, refresh=True)

        # save backup (currently processed data)
        if len(backup_source) and backup_created_date:
            self._process_cities(set(cities), spider_name)
            cache.clear()

            obj = DataBackup.objects.get_or_create(
                name=spider_name,
                created_date=backup_created_date
            )
            obj[0].source = binascii.hexlify(bz2.compress(
                JSONEncoder().encode(backup_source)
            ))
            obj[0].save()

        # force java & ES garbage collection
        self.elastic.connection.close()
        del self.extractor
        del java

        return True

    def _process_content(self, item):
        # Strip boilerplate: keep only the main article sentences.
        if len(item['content']):
            item['content'] = self.extractor.getText(jpype.JString(item['content']))
        return item

    def _get_location(self, item):
        # Geocode the item's city into a lat/lon "pin"; best effort —
        # any geocoder failure is silently ignored.
        if not item.has_key('city'):
            return item
        try:
            geo = geocoders.GeoNames()
            places = geo.geocode(item['city'].encode('utf-8'), exactly_one=False)
            if places:
                # geocode may return a list of matches or a single match
                place, (lat, lon) = places[0] if isinstance(places, list) else places
                if place:
                    item['pin'] = {
                        'location': {'lat': lat, 'lon': lon}
                    }
        except:
            pass
        return item

    def _create_index(self, index, item):
        id = item['id']
        del item['id']
        try:
            # index only when the document is not already present
            self.elastic.get(index, 'job', id)
        except ElasticSearchException:
            self.elastic.index(
                dumps(item, cls=DjangoJSONEncoder),
                index, 'job', id, bulk=True
            )

    def _process_cities(self, cities, spider_name):
        cities_current = City.objects.filter(indices__contains='"%s"' % spider_name)

        # save lists of saved cities
        cities_old_single = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices
            and len(city.indices) == 1
        ]
        cities_old_multi = [
            city.name for city in cities_current
            if city.indices and spider_name in city.indices
            and len(city.indices) > 1
        ]

        for city in cities:
            # normalise the city name before storing it
            city = unicode(city.strip().lower())
            city = normalize_spaces.sub(' ', city)
            city = remove_braces.sub('', city)
            city_clean = [remove_none_chars.sub('', word) for word in city.split(' ')]
            city_clean = ' '.join(filter(None, city_clean))

            city, created = City.objects.get_or_create(name=city_clean[:255])
            if created:
                city.indices = [spider_name]
            else:
                city.indices.append(spider_name)
                city.indices = list(set(city.indices))
            city.save()

            if city.name in cities_old_single:
                cities_old_single.remove(city.name)
            if city.name in cities_old_multi:
                cities_old_multi.remove(city.name)

        # remove unlinked citie
        City.objects.filter(name__in=cities_old_single).delete()
        for item in City.objects.filter(name__in=cities_old_multi):
            if spider_name in item.indices:
                item.indices.remove(spider_name)
                item.save()
class DocManager():
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that
    multiple updates to the same doc reflect the most up to date version as
    opposed to multiple, slightly different versions of a doc.

    We are using elastic native fields for _id and ns, but we also store
    them as fields in the document, due to compatibility issues.
    """

    def __init__(self, url, auto_commit=True, unique_key='_id'):
        """Verify Elastic URL and establish a connection."""
        if verify_url(url) is False:
            raise SystemError
        self.elastic = ES(server=url)
        self.auto_commit = auto_commit
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        if auto_commit:
            self.run_auto_commit()

    def stop(self):
        """Stops the instance (ends the auto-commit timer chain)."""
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Elastic.

        If you'd like to have different types of document in your
        database, you can store the doc type as a field in Mongo and set
        doc_type to that field. (e.g. doc_type = doc['_type'])
        """
        doc_type = self.doc_type
        index = doc['ns']
        doc[self.unique_key] = str(doc[self.unique_key])
        doc_id = doc[self.unique_key]
        # Fix: the original issued a TextQuery('_id', doc_id) search here
        # whose result was never read — a wasted round trip, removed.
        try:
            self.elastic.index(bsjson.dumps(doc), index, doc_type, doc_id)
        except ValueError:
            logging.info("Could not update %s" % (doc,))
        self.elastic.refresh()

    def remove(self, doc):
        """Removes documents from Elastic.

        The input is a python dictionary that represents a mongo document.
        Missing index/type/doc is treated as already removed.
        """
        try:
            self.elastic.delete(doc['ns'], 'string', str(doc[self.unique_key]))
        except (NotFoundException, TypeMissingException, IndexMissingException):
            pass

    def _remove(self):
        """For test purposes only. Removes all documents in test.test."""
        try:
            self.elastic.delete('test.test', 'string', '')
        except (NotFoundException, TypeMissingException, IndexMissingException):
            pass

    def search(self, start_ts, end_ts):
        """Called to query Elastic for documents in a time range."""
        res = ESRange('_ts', from_value=start_ts, to_value=end_ts)
        results = self.elastic.search(RangeQuery(res))
        return results

    def _search(self):
        """For test purposes only. Performs search on Elastic with empty
        query. Does not have to be implemented.
        """
        results = self.elastic.search(MatchAllQuery())
        return results

    def commit(self):
        """This function is used to force a refresh/commit."""
        retry_until_ok(self.elastic.refresh)

    def run_auto_commit(self):
        """Periodically commits to the Elastic server (re-arms a 1 second
        Timer while auto_commit stays True).
        """
        self.elastic.refresh()
        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Elastic engine
        (highest _ts), or None when the index is empty.
        """
        result = self.elastic.search(MatchAllQuery(), size=1, sort='_ts:desc')
        for item in result:
            return item
# Minimal pyes demo: index a JSON dataset, then run a match-all search
# with a terms aggregation.
# NOTE(review): this chunk is truncated — the body of the final
# `for i in result:` loop is missing from the visible source, so the
# snippet ends with a dangling loop header exactly as in the original.
from tools.FileTools import FileTools
from tools.FormatTranslator import FormatTranslator
from pyes.aggs import TermsAgg

ftool = FileTools()
ftrans = FormatTranslator()

# 1. Create Connection
conn = ES()

# 2. Index Data
dataset_json = open("../dataset.json")
dataset = json.load(dataset_json)['data']
for data in dataset:
    # NOTE(review): dataset.index(data) is an O(n) scan per item;
    # enumerate would be cheaper — left unchanged here.
    conn.index(data, "example_index", "example_type",
               "example_id_" + str(dataset.index(data)))

# 3. Create Simple Query
query = MatchAllQuery()

# 4. Create Simple Aggregation
agg = TermsAgg('agg1', field="name", sub_aggs=[], size=100)

# 5. Get Result
search = Search(query, size=5)
search.agg.add(agg)
print search.serialize()

result = conn.search(search, "example_index", "example_type")
for i in result:
def import_instruments(instrs, es_url, index, alias):
    """Create JSON ES docs and import."""
    # PROV-ES namespace prefixes embedded into every generated document.
    prefix = {
        "bibo": "http://purl.org/ontology/bibo/",
        "dcterms": "http://purl.org/dc/terms/",
        "eos": "http://nasa.gov/eos.owl#",
        "gcis": "http://data.globalchange.gov/gcis.owl#",
        "hysds": "http://hysds.jpl.nasa.gov/hysds/0.1#",
        "info": "http://info-uri.info/",
        "xlink": "http://www.w3.org/1999/xlink"
    }
    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)

    # track agencies/organizations
    orgs = {}

    for instr in instrs:
        identifier = "eos:%s" % instr['Instrument Name Short']
        # NOTE(review): id is computed but never used below (and shadows
        # the builtin).
        id = hashlib.md5(identifier).hexdigest()
        # Pick the most specific non-empty field as the sensor, in order:
        # Instrument Technology > Instrument Type > Subtype > Type > Class.
        if 'Instrument Technology' in instr and not EMPTY.search(instr['Instrument Technology']):
            sensor = "eos:%s" % instr['Instrument Technology']
        else:
            if 'Instrument Type' in instr and not EMPTY.search(instr['Instrument Type']):
                sensor = "eos:%s" % instr['Instrument Type']
            else:
                if 'Subtype' in instr and not EMPTY.search(instr['Subtype']):
                    sensor = "eos:%s" % instr['Subtype']
                else:
                    if 'Type' in instr and not EMPTY.search(instr['Type']):
                        sensor = "eos:%s" % instr['Type']
                    else:
                        if 'Class' in instr and not EMPTY.search(instr['Class']):
                            sensor = "eos:%s" % instr['Class']
                        else:
                            sensor = None
        #print(instr['Instrument Technology'], sensor)
        platform = None  # never populated in this chunk
        if 'Instrument Agencies' in instr and not EMPTY.search(instr['Instrument Agencies']):
            org = "eos:%s" % instr['Instrument Agencies']
            if org not in orgs:
                orgs[org] = {
                    "prov_es_json": {
                        "prefix": prefix,
                        "agent": {
                            org: {
                                "prov:type": {
                                    "type": "prov:QualifiedName",
                                    "$": "prov:Organization",
                                },
                            },
                        },
                    },
                    "identifier": org,
                    "prov:type": "prov:Organization",
                }
                # index the agent only if not already present in the alias
                if len(conn.search(query=TermQuery("_id", org), indices=[alias])) > 0:
                    pass
                else:
                    conn.index(orgs[org], index, 'agent', org)
        else:
            org = None
        doc = {
            "prov_es_json": {
                "prefix": prefix,
                "entity": {
                    identifier: {
                        "gcis:hasSensor": sensor,
                        "gcis:inPlatform": platform,
                        "prov:type": "eos:instrument",
                        "gcis:hasGoverningOrganization": org,
                    },
                },
            },
            "gcis:hasSensor": sensor,
            "gcis:inPlatform": platform,
            "prov:type": "eos:instrument",
            "gcis:hasGoverningOrganization": org,
            "identifier": identifier,
        }
        # index the entity only if not already present in the alias
        if len(conn.search(query=TermQuery("_id", identifier), indices=[alias])) > 0:
            pass
        else:
            conn.index(doc, index, 'entity', identifier)
def ext_process(listname, hostname, url, filepath, msg):
    """Index a just-archived mailing list message into Elasticsearch.

    Arguments here are the list name, the host name, the URL to the just
    archived message, the file system path to the just archived message
    and the message object.  These can be replaced or augmented as needed.

    Fix: the two bare ``except:`` clauses now catch ``Exception`` so
    SystemExit/KeyboardInterrupt are no longer swallowed; behaviour is
    otherwise unchanged (index-creation fallback, best-effort logging).
    """
    from pyes import ES
    from pyes.exceptions import ClusterBlockException, NoServerAvailable
    import datetime

    #CHANGE this settings to reflect your configuration
    _ES_SERVERS = ['127.0.0.1:9500'] # I prefer thrift
    _indexname = "mailman"
    _doctype = "mail"
    date = datetime.datetime.today()

    try:
        iconn = ES(_ES_SERVERS)
        status = None
        try:
            # Probe the index; a failure means it does not exist yet.
            status = iconn.status(_indexname)
            logger.debug("Indexer status:%s" % status)
        except Exception:
            # Create the index and install the mail mapping once.
            iconn.create_index(_indexname)
            time.sleep(1)
            status = iconn.status(_indexname)
            mappings = {
                u'text': {
                    'store': 'true',
                    'type': u'text',
                    "term_vector": "with_positions_offsets"
                },
                u'url': {
                    'store': 'true',
                    'type': u'keyword'
                },
                u'title': {
                    'store': 'true',
                    'type': u'text',
                    "term_vector": "with_positions_offsets"
                },
                u'date': {
                    'store': 'true',
                    'type': u'date'
                }
            }
            time.sleep(1)
            status = iconn.put_mapping(_doctype, mappings, _indexname)
        # Index the message itself.
        data = dict(url=url, title=msg.get('subject'), date=date, text=str(msg))
        iconn.index(data, _indexname, _doctype)
        syslog('debug', 'listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except ClusterBlockException:
        syslog(
            'error',
            'Cluster in revocery state: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
            listname, hostname, url, filepath, msg)
    except NoServerAvailable:
        syslog(
            'error',
            'No server available: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
            listname, hostname, url, filepath, msg)
    except Exception:
        import traceback
        syslog(
            'error',
            'Unknown: listname: %s, hostname: %s, url: %s, path: %s, msg: %s\nstacktrace: %s',
            listname, hostname, url, filepath, msg,
            repr(traceback.format_exc()))
    return
def import_instruments(instrs, es_url, index, alias):
    """Create JSON ES docs and import.

    For each instrument record, builds a PROV-ES entity document (and an
    agent document for its governing organization) and indexes it into
    *index*, skipping ids already present under *alias*.

    Improvements: the 5-deep nested if/else choosing the sensor field is
    now a priority loop over candidate fields (same order, same result);
    the unused ``id = hashlib.md5(...)`` local (which shadowed the
    builtin) was removed.
    """
    # PROV-ES namespace prefixes embedded into every generated document.
    prefix = {
        "bibo": "http://purl.org/ontology/bibo/",
        "dcterms": "http://purl.org/dc/terms/",
        "eos": "http://nasa.gov/eos.owl#",
        "gcis": "http://data.globalchange.gov/gcis.owl#",
        "hysds": "http://hysds.jpl.nasa.gov/hysds/0.1#",
        "info": "http://info-uri.info/",
        "xlink": "http://www.w3.org/1999/xlink"
    }
    conn = ES(es_url)
    if not conn.indices.exists_index(index):
        conn.indices.create_index(index)

    # track agencies/organizations
    orgs = {}

    for instr in instrs:
        identifier = "eos:%s" % instr['Instrument Name Short']

        # Most specific non-empty candidate field wins.
        sensor = None
        for field in ('Instrument Technology', 'Instrument Type',
                      'Subtype', 'Type', 'Class'):
            if field in instr and not EMPTY.search(instr[field]):
                sensor = "eos:%s" % instr[field]
                break
        #print(instr['Instrument Technology'], sensor)

        platform = None  # never populated in this chunk

        if 'Instrument Agencies' in instr and not EMPTY.search(
                instr['Instrument Agencies']):
            org = "eos:%s" % instr['Instrument Agencies']
            if org not in orgs:
                orgs[org] = {
                    "prov_es_json": {
                        "prefix": prefix,
                        "agent": {
                            org: {
                                "prov:type": {
                                    "type": "prov:QualifiedName",
                                    "$": "prov:Organization",
                                },
                            },
                        },
                    },
                    "identifier": org,
                    "prov:type": "prov:Organization",
                }
                # index the agent only if not already present in the alias
                if len(conn.search(query=TermQuery("_id", org),
                                   indices=[alias])) == 0:
                    conn.index(orgs[org], index, 'agent', org)
        else:
            org = None

        doc = {
            "prov_es_json": {
                "prefix": prefix,
                "entity": {
                    identifier: {
                        "gcis:hasSensor": sensor,
                        "gcis:inPlatform": platform,
                        "prov:type": "eos:instrument",
                        "gcis:hasGoverningOrganization": org,
                    },
                },
            },
            "gcis:hasSensor": sensor,
            "gcis:inPlatform": platform,
            "prov:type": "eos:instrument",
            "gcis:hasGoverningOrganization": org,
            "identifier": identifier,
        }
        # index the entity only if not already present in the alias
        if len(conn.search(query=TermQuery("_id", identifier),
                           indices=[alias])) == 0:
            conn.index(doc, index, 'entity', identifier)
# pyes indexing demo: create a mapping, then index parent and child
# documents ("my_type" and "my_type2" with parent links).
# NOTE(review): this chunk is truncated — the final es.index(...) call is
# cut off mid-dictionary, so the snippet below ends with a syntax error
# exactly as in the source.
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"

from utils_pyes import create_and_add_mapping

create_and_add_mapping(es, index_name, type_name)

es.index(doc={
    "name": "Joe Tester",
    "parsedtext": "Joe Testere nice guy",
    "uuid": "11111",
    "position": 1
}, index=index_name, doc_type=type_name, id=1)
# child document: type "my_type2", linked to parent id 1
es.index(doc={
    "name": "data1",
    "value": "value1"
}, index=index_name, doc_type=type_name + "2", id=1, parent=1)
es.index(doc={
    "name": "Bill Baloney",
    "parsedtext": "Bill Testere nice guy",
def insertElasticsearch(_index, _type, data):
    """Index *data* into the local Elasticsearch node.

    Args:
        _index: target index name.
        _type: target document type.
        data: document (dict) to index; id is assigned by Elasticsearch.

    Previously-tried hosts, kept for reference:
    127.0.0.1:9200 / 192.168.30.63:9200 / 140.92.13.186:9200
    """
    conn = ES('localhost:9200', timeout=3.5)
    conn.index(data, _index, _type)
# Minimal pyes demo: index a JSON dataset, then run a match-all search
# with a terms aggregation.
# NOTE(review): this chunk is truncated — the body of the final
# `for i in result:` loop is missing from the visible source, so the
# snippet ends with a dangling loop header exactly as in the original.
from pyes import ES, Search, MatchAllQuery
from tools.FileTools import FileTools
from tools.FormatTranslator import FormatTranslator
from pyes.aggs import TermsAgg

ftool = FileTools()
ftrans = FormatTranslator()

# 1. Create Connection
conn = ES()

# 2. Index Data
dataset_json = open("../dataset.json")
dataset = json.load(dataset_json)['data']
for data in dataset:
    # NOTE(review): dataset.index(data) is an O(n) scan per item;
    # enumerate would be cheaper — left unchanged here.
    conn.index(data, "example_index", "example_type",
               "example_id_" + str(dataset.index(data)))

# 3. Create Simple Query
query = MatchAllQuery()

# 4. Create Simple Aggregation
agg = TermsAgg('agg1', field="name", sub_aggs=[], size=100)

# 5. Get Result
search = Search(query, size=5)
search.agg.add(agg)
print search.serialize()

result = conn.search(search, "example_index", "example_type")
for i in result:
dataset = shelve.open("samples.shelve") mapping = { u'description': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string', "term_vector" : "with_positions_offsets" }, u'name': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string', "term_vector" : "with_positions_offsets" }, u'age': {'store': 'yes', 'type': u'integer'}, } conn.create_index("test-index") conn.put_mapping("test-type", {'properties':mapping}, ["test-index"]) start = datetime.now() for k, userdata in dataset.items(): # conn.index(userdata, "test-index", "test-type", k) conn.index(userdata, "test-index", "test-type", k, bulk=True) conn.force_bulk() end = datetime.now() print "time:", end-start dataset.close()
dataset = shelve.open("samples.shelve") mapping = { u'description': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string', "term_vector" : "with_positions_offsets" }, u'name': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string', "term_vector" : "with_positions_offsets" }, u'age': {'store': 'yes', 'type': u'integer'}, } conn.create_index("test-index") conn.put_mapping("test-type", {'properties':mapping}, ["test-index"]) start = datetime.now() for k, userdata in dataset.items(): # conn.index(userdata, "test-index", "test-type", k) conn.index(userdata, "test-index", "test-type", k, bulk=True) conn.force_bulk() end = datetime.now() print "time:", end - start dataset.close()
# pyes CRUD walkthrough: parent/child indexing, scripted updates and
# deletes, mixing immediate and bulk operations.
from pyes import ES

es = ES()
index_name = "my_index"
type_name = "my_type"
child_type = type_name + "2"  # child documents live in a sibling type

from utils_pyes import create_and_add_mapping

create_and_add_mapping(es, index_name, type_name)

# Parents 1-3 and children linked via parent=; 2 and 3 go through the
# bulk buffer, flushed below.
es.index(doc={"name": "Joe Tester", "parsedtext": "Joe Testere nice guy",
              "uuid": "11111", "position": 1},
         index=index_name, doc_type=type_name, id=1)
es.index(doc={"name": "data1", "value": "value1"},
         index=index_name, doc_type=child_type, id=1, parent=1)
es.index(doc={"name": "Bill Baloney", "parsedtext": "Bill Testere nice guy",
              "uuid": "22222", "position": 2},
         index=index_name, doc_type=type_name, id=2, bulk=True)
es.index(doc={"name": "data2", "value": "value2"},
         index=index_name, doc_type=child_type, id=2, parent=2, bulk=True)
es.index(doc={"name": "Bill Clinton", "parsedtext": """Bill is not nice guy""",
              "uuid": "33333", "position": 3},
         index=index_name, doc_type=type_name, id=3, bulk=True)
es.force_bulk()

# Scripted updates: bump doc 2's position twice (second one via bulk).
es.update(index=index_name, doc_type=type_name, id=2,
          script='ctx._source.position += 1')
es.update(index=index_name, doc_type=type_name, id=2,
          script='ctx._source.position += 1', bulk=True)

# Deletes: doc 1 queued in bulk, doc 3 immediate; then flush and refresh.
es.delete(index=index_name, doc_type=type_name, id=1, bulk=True)
es.delete(index=index_name, doc_type=type_name, id=3)
es.force_bulk()
es.indices.refresh(index_name)
class SampleMaker(object):
    # Builds a disposable "test-index" on a local Elasticsearch node and
    # populates it with randomly generated sample documents.

    def __init__(self, name):
        # Open the curl dump log at *name* and (re)create the test index.
        log = open(name, "wb")
        self.log = log
        # log_curl/dump_curl make pyes record every request as a curl
        # command into the log file, useful for replaying/debugging.
        self.conn = ES(("http", "127.0.0.1", 9200),
                       timeout=300.0,
                       log_curl=True,
                       dump_curl=log)
        self.index_name = "test-index"
        self.document_type = "test-type"
        # Start from a clean slate on every run.
        self.conn.delete_index_if_exists(self.index_name)
        self.init_default_index()

    def init_default_index(self):
        # Create the index with an explicit mapping for every sample field.
        from pyes.helpers import SettingsBuilder
        settings = SettingsBuilder()
        from pyes.mappings import DocumentObjectField
        from pyes.mappings import IntegerField
        from pyes.mappings import NestedObject
        from pyes.mappings import StringField, DateField, BooleanField, GeoPointField, FloatField
        docmapping = DocumentObjectField(name=self.document_type)
        docmapping.add_property(
            StringField(name="description", store=True,
                        term_vector="with_positions_offsets", index="analyzed"))
        docmapping.add_property(
            StringField(name="name", store=True,
                        term_vector="with_positions_offsets", index="analyzed"))
        # tag is filter-only: stored but not analyzed
        docmapping.add_property(
            StringField(name="tag", store=True, index="not_analyzed"))
        docmapping.add_property(IntegerField(name="age", store=True))
        docmapping.add_property(FloatField(name="price"))
        docmapping.add_property(DateField(name="date", store=True))
        docmapping.add_property(
            BooleanField(name="in_stock", store=True, index="not_analyzed"))
        docmapping.add_property(GeoPointField(name="position"))
        # "metadata" is a list of nested name/value/num objects.
        nested_object = NestedObject(name="metadata")
        nested_object.add_property(StringField(name="name", store=True))
        nested_object.add_property(StringField(name="value", store=True))
        nested_object.add_property(IntegerField(name="num", store=True))
        docmapping.add_property(nested_object)
        settings.add_mapping(docmapping)
        self.conn.ensure_index(self.index_name, settings)

    def generate_datafile(self, number_items=1000):
        """
        Generate a dataset with number_items elements.
        """
        names = get_names()
        totalnames = len(names)
        #init random seeder
        random.seed()
        #calculate items
        # names = random.sample(names, number_items)
        for i in xrange(number_items):
            data = {
                "name": names[random.randint(0, totalnames - 1)],
                "age": random.randint(1, 100),
                "price": random.random() * 100.0,
                "tag": [words(1, False) for r in xrange(random.randint(1, 5))],
                "in_stock": random.choice([True, False]),
                # random date within +/- 1000 days of now
                "date": datetime.now() + timedelta(
                    days=random.choice([1, -1]) * random.randint(0, 1000)),
                "position": {
                    "lat": random.choice([1, -1]) * random.random() * 90.0,
                    "lon": random.choice([1, -1]) * random.random() * 180.0
                },
                "description": words(random.randint(1, 100), False),
                "metadata": [{
                    "name": names[random.randint(0, totalnames - 1)],
                    "value": str(random.randint(1, 5)),
                    "num": random.randint(1, 50)
                } for r in xrange(random.randint(1, 5))]
            }
            # ids are 1-based strings
            self.conn.index(data, self.index_name, self.document_type,
                            id=str(i + 1))

    def close(self):
        # Flush pending writes before closing the curl log.
        self.conn.flush(self.index_name)
        self.log.close()
def ext_process(listname, hostname, url, filepath, msg):
    """Here's where you put your code to deal with the just archived message.

    Arguments here are the list name, the host name, the URL to the just
    archived message, the file system path to the just archived message
    and the message object.

    These can be replaced or augmented as needed.
    """
    from pyes import ES
    from pyes.exceptions import ClusterBlockException, NoServerAvailable
    import datetime

    #CHANGE this settings to reflect your configuration
    _ES_SERVERS = ['127.0.0.1:9500'] # I prefer thrift
    _indexname = "mailman"
    _doctype = "mail"
    date = datetime.datetime.today()

    try:
        iconn = ES(_ES_SERVERS)
        status = None
        try:
            # Probe the index; a failure means it does not exist yet.
            status = iconn.status(_indexname)
            logger.debug("Indexer status:%s" % status)
        except:
            # Index missing: create it and install the mail mapping.
            # NOTE(review): bare except is deliberate best-effort here.
            iconn.create_index(_indexname)
            time.sleep(1)
            status = iconn.status(_indexname)
            mappings = {
                u'text': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string', "term_vector": "with_positions_offsets"},
                u'url': {'boost': 1.0, 'index': 'not_analyzed', 'store': 'yes', 'type': u'string', "term_vector": "no"},
                u'title': {'boost': 1.0, 'index': 'analyzed', 'store': 'yes', 'type': u'string', "term_vector": "with_positions_offsets"},
                u'date': {'store': 'yes', 'type': u'date'}}
            time.sleep(1)
            status = iconn.put_mapping(_doctype, mappings, _indexname)
        # Index the message itself.
        data = dict(url=url, title=msg.get('subject'), date=date,
                    text=str(msg)
                    )
        iconn.index(data, _indexname, _doctype)
        syslog('debug', 'listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except ClusterBlockException:
        syslog('error',
               'Cluster in revocery state: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except NoServerAvailable:
        syslog('error',
               'No server available: listname: %s, hostname: %s, url: %s, path: %s, msg: %s',
               listname, hostname, url, filepath, msg)
    except:
        # last-resort logging with the full traceback
        import traceback
        syslog('error',
               'Unknown: listname: %s, hostname: %s, url: %s, path: %s, msg: %s\nstacktrace: %s',
               listname, hostname, url, filepath, msg,
               repr(traceback.format_exc()))
    return
class ElasticSearchServer(ESDBRequests):
    """
    An object representing the Elasticsearch server; use it to create the
    index/mapping and to insert, delete and search documents.

    (The original docstrings referenced CouchDB — this class was ported
    from a CouchDB server wrapper.)
    """

    def __init__(self, dburl='http://localhost:9200', indices=None, types=None,
                 usePYCurl=False, ckey=None, cert=None, capath=None):
        """
        Set up a connection to the Elasticsearch server.

        Fix: the original signature declared the non-default parameters
        ``indices`` and ``types`` AFTER the defaulted ``dburl``, which is
        a SyntaxError in Python.  They now default to None; positional
        callers are unaffected and ``check_name`` still validates them.
        """
        check_server_url(dburl)
        # PYCurl TODO
        # Same with cert and key
        self.url = dburl
        self.ESconn = ES(dburl)
        self.ckey = ckey
        self.cert = cert
        check_name(indices)
        check_name(types)
        self.indices = indices
        self.types = types

    def listDatabases(self):
        "List all the databases the server hosts"
        # TODO: '/_all_dbs' is a CouchDB endpoint; needs an ES equivalent.
        return self.get('/_all_dbs')

    def createDatabase(self, schema):
        """
        Create the index if missing and apply *schema* as the properties
        of the configured type mapping.
        """
        self.ESconn.indices.create_index_if_missing(self.indices)
        self.ESconn.indices.put_mapping(self.types, {'properties': schema},
                                        [self.indices])

    def insertDoc(self, doc, _id):
        """Index *doc* under the configured index/type with id *_id*."""
        self.ESconn.index(doc, self.indices, self.types, _id)

    def deleteDoc(self, _id):
        """Delete the document with id *_id*."""
        self.ESconn.delete(self.indices, self.types, _id)

    def termBoolQuery(self, query):
        """
        Run a boolean term query.

        query - dict
            must:     list of {field: value} — all must match
            should:   list of {field: value} — at least one should match
            must_not: list of {field: value} — none may match

        Returns a dict: {"status_code": 200, "message": "Successful",
        "content": [hits...]}.
        """
        queryMust = []
        queryShould = []
        queryMustNot = []
        for item in ["must", "should", "must_not"]:
            if item in query:
                for dictVals in query[item]:
                    for dictKey in dictVals:
                        tempq = TermQuery(dictKey, dictVals[dictKey])
                        if item == "must":
                            queryMust.append(tempq)
                        elif item == "should":
                            queryShould.append(tempq)
                        elif item == "must_not":
                            queryMustNot.append(tempq)
        # empty clause lists are passed as None, matching BoolQuery's API
        query = BoolQuery(must=None if not queryMust else queryMust,
                          should=None if not queryShould else queryShould,
                          must_not=None if not queryMustNot else queryMustNot)
        search = Search(query)
        results = self.ESconn.search(search, self.indices)
        response = {"status_code": 200, "message": "Successful", "content": []}
        response["content"] = [result for result in results]
        return response
# Stream every row of the MySQL PERIODS table into the local
# Elasticsearch "shrimp" index as ordered JSON documents.
conn = ES('localhost:9200')
type_name = 'shrimp'
es_index = 'shrimp'

db = MySQLdb.connect(
    host="localhost",  # your host, usually localhost
    user="******",  # your username
    passwd="password",  # your password
    db="shrimp",  # name of the data base
    cursorclass=MySQLdb.cursors.SSCursor)  # server-side cursor: stream rows

cur = db.cursor()
cur.execute("select * from PERIODS;")

row = cur.fetchone()
while row is not None:
    # OrderedDict keeps the field order stable in the serialized JSON.
    record = collections.OrderedDict()
    record['period'] = row[4]
    record['a'] = row[1]
    record['b'] = row[2]
    record['c'] = row[3]
    conn.index(json.dumps(record), es_index, type_name, id=id_generator())
    row = cur.fetchone()
# get source and destination index src = sys.argv[1] dest = sys.argv[2] # get connection and create destination index conn = ES(es_url) if not conn.indices.exists_index(dest): conn.indices.create_index(dest) # index all docs from source index to destination index query = { "fields": "_source", "query": { "match_all": {} } } r = requests.post('%s/%s/_search?search_type=scan&scroll=60m&size=100' % (es_url, src), data=json.dumps(query)) scan_result = r.json() count = scan_result['hits']['total'] scroll_id = scan_result['_scroll_id'] results = [] while True: r = requests.post('%s/_search/scroll?scroll=60m' % es_url, data=scroll_id) res = r.json() scroll_id = res['_scroll_id'] if len(res['hits']['hits']) == 0: break for hit in res['hits']['hits']: doc = hit['_source'] conn.index(hit['_source'], dest, hit['_type'], hit['_id']) print "indexed %s" % hit['_id']