def delete(config, tree_names, all, force):
    """Delete indices and their catalog entries.

    This deletes the indices that have the format version of the copy of DXR
    this runs under.

    """
    es = ElasticSearch(config.es_hosts)
    if all:
        echo('Deleting catalog...')
        es.delete_index(config.es_catalog_index)
        # TODO: Delete tree indices as well.
    else:
        for tree_name in tree_names:
            frozen_id = '%s/%s' % (FORMAT, tree_name)
            try:
                frozen = es.get(config.es_catalog_index, TREE, frozen_id)
            except ElasticHttpNotFoundError:
                raise ClickException('No tree "%s" in catalog.' % tree_name)
            # Delete the index first. That way, if that fails, we can still
            # try again; we won't have lost the catalog entry. Refresh is
            # infrequent enough that we wouldn't avoid a race around a
            # catalogued but deleted instance the other way around.
            try:
                es.delete_index(frozen['_source']['es_alias'])
            except ElasticHttpNotFoundError:
                # It's already gone. Fine. Just remove the catalog entry.
                pass
            es.delete(config.es_catalog_index, TREE, frozen_id)
def main():
    """ Method to kick things off """
    # Setup workers
    pool = Pool(processes=CPU_COUNT)

    # Prepare URLs
    urls = []
    for url in CRAWL_URLS:
        urls.append(str(BASE_URL + url))

    if USE_ES:
        # Create connection
        es = ElasticSearch(ES_URL)
        try:
            # Delete the existing index
            es.delete_index(ES_INDEX)
        except:
            # In case the index does not exist
            pass
        # Create the index to use
        es.create_index(ES_INDEX)
    else:
        # Setup the database tables, connect
        init_db()

    # Scrape and store async
    pool.map(scrape, urls)
def IndexData(request):
    es = ElasticSearch(settings.ELASTIC_SEARCH)
    for file in fileHolder:
        index = file['segment_name'].lower()
        rawfiles = file['rawfiles']
        data_for_es = file['dataFrames']

        try:
            es.delete_index(index.replace(" ", ""))
        except:
            pass
        es.create_index(index.replace(" ", ""))

        ## Loop over the dataframe and index it into elasticsearch
        docs = json.loads(data_for_es.to_json(orient='records'))
        es.bulk((es.index_op(doc) for doc in docs),
                index=index.replace(" ", ""),
                doc_type=index)

        ## Create segment template
        file_names = []
        for file in rawfiles:
            file_names.append(file.name)
        segment = Segments(name=index,
                           files_added=",".join(file_names),
                           es_index=index.replace(" ", ""))
        segment.save()

    segment = Segments.objects.get(name=index)
    return render(request, 'analyse.html', {'segment': segment})
def cli(index_name, delete_index, mapping_file, settings_file, doc_type,
        import_file, delimiter, tab, host, docs_per_chunk, bytes_per_chunk,
        parallel, quiet):
    """
    Bulk import a delimited file into a target Elasticsearch instance. Common
    delimited files include things like CSV and TSV.

    \b
    Load a CSV file:
      csv2es --index-name potatoes --doc-type potato --import-file potatoes.csv

    \b
    For a TSV file, note the tab delimiter option
      csv2es --index-name tomatoes --doc-type tomato \
             --import-file tomatoes.tsv --tab

    \b
    For a nifty pipe-delimited file (delimiters must be one character):
      csv2es --index-name pipes --doc-type pipe --import-file pipes.psv \
             --delimiter '|'

    """
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if delete_index:
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete', quiet)

    try:
        if settings_file:
            echo('Applying mapping from: ' + settings_file, quiet)
            with open(settings_file) as f:
                settings = json.loads(f.read())
            es.create_index(index_name, settings)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except ElasticHttpError as e:
        if e.error['type'] == 'index_already_exists_exception':
            echo('Index ' + index_name + ' already exists', quiet)
        else:
            raise

    echo('Using document type: ' + doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: ' + mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.put_mapping(index_name, doc_type, mapping)

    target_delimiter = sanitize_delimiter(delimiter, tab)
    documents = documents_from_file(es, import_file, target_delimiter, quiet)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
def cli(index_name, delete_index, mapping_file, settings_file, doc_type, host,
        docs_per_chunk, bytes_per_chunk, parallel, quiet, parser, config_file,
        user, passwd):
    with open(config_file, "rb") as f:
        con = json.loads(f.read())

    host = con['es_config']['host']
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if con['db']['type'] == "oracle":
        db = import_module('cx_Oracle')
        collection = db.connect(user, passwd, con['db']['con_str'])
    else:
        db = import_module('MySQLdb')
        collection = db.connect(con['db']['con_str'][0], user, passwd,
                                con['db']['con_str'][1],
                                charset=con['db']['con_str'][2])

    if delete_index:
        # Delete the existing index
        try:
            stamp = 0
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete', quiet)

    try:
        if settings_file:
            with open(settings_file, 'r') as f:
                settings_json = json.loads(f.read())
            es.create_index(index_name, settings=settings_json)
        else:
            es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except Exception:
        echo('Index ' + index_name + ' already exists', quiet)

    echo('Using document type: ' + doc_type, quiet)
    es.put_mapping(index_name, doc_type, con['mapping'])

    parser_fun = None
    if parser is not None:
        # Load the parser function
        parser_fun = import_module(PARSER_PATH + '.' + parser)

    documents = documents_from_file(es, collection, quiet, parser_fun, con)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)

    print "end:" + time.strftime(ISOTIMEFORMAT, time.localtime()) + \
        '\n all records import complete.'
class ElasticSearchTestCase(unittest.TestCase):
    def setUp(self):
        self.conn = ElasticSearch('http://localhost:9200/')

    def tearDown(self):
        self.conn.delete_index("test-index")

    def assertResultContains(self, result, expected):
        for (key, value) in expected.items():
            self.assertEquals(value, result[key])
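# A minimal sketch (not part of the original suite) of how a test might build
# on this base class: it assumes Elasticsearch auto-creates "test-index" on
# the first index() call, and relies on tearDown() above to remove it. The
# class and document names here are illustrative only.
class ExampleUsageTestCase(ElasticSearchTestCase):
    def test_index_and_get(self):
        self.conn.index("test-index", "doc", {"name": "smith"}, id=1)
        self.conn.refresh("test-index")
        result = self.conn.get("test-index", "doc", 1)
        self.assertResultContains(result['_source'], {"name": "smith"})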
def import_json_into_es(types, inputfolder, logger):
    """
    Imports entities from the *name.json.bz2* files (one entity per line)
    into the local elasticsearch instance.

    :param types: json string like {'person': 'http://www.wikidata.org/entity/Q5', 'name': 'Wikidata-URI'}
    :param inputfolder:
    :param logger:
    :return:
    """

    es = ElasticSearch(config.ELASTICSEARCH_URL)

    try:
        es.delete_index('wikidata')
        es.create_index('wikidata')
        logger.info('rebuild index [wikidata]')
    except:
        logger.warning("can't delete wikidata index")

    # convert type dictionary
    wd_types = dict()
    for key in types.keys():
        value = int(types[key].split('/')[-1][1:])
        wd_types[value] = {'type': key,
                           'filename': path.join(inputfolder, '{}.json.bz2'.format(key))}

    # import each given type
    for key in wd_types:
        logger.info(wd_types[key])

        done = 0
        items = []

        for line in BZ2File(wd_types[key]['filename'], 'rb'):
            line = line.strip()
            item = loads(line)
            item['uri'] = 'http://wikidata.org/wiki/' + item['id']

            items.append(item)
            done += 1

            if done % 5000 == 0:
                es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')
                items = []

            # if done % len(wd_types) / 10 == 0:  # log 10% steps
            #     logger.info('imported {}: {:,d} ({:,d})'.format(wd_types[key]['type'], done, 100 * len(wd_types) / done))

            if done % 10000 == 0:
                logger.info('imported {}: {}'.format(wd_types[key]['type'], format(done, ',d')))

        if len(items) > 0:
            es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')

        logger.info('imported {}: {}'.format(wd_types[key]['type'], format(done, ',d')))
def feed(index='monolith', type='downloads', es_port=9200):
    client = ElasticSearch('http://0.0.0.0:%d/' % es_port)
    platforms = ['Mac OS X', 'Windows 8', 'Ubuntu']

    # indexing a year of data (2012)
    first_day = datetime.datetime(2012, 1, 1)
    last_day = datetime.datetime(2012, 12, 31)
    day_range = last_day - first_day

    for month in range(1, 13):
        name = 'time_2012-%.2d' % month
        try:
            client.delete_index(name)
        except Exception:
            pass
        client.create_index(name, settings={
            'number_of_shards': 1,
            'number_of_replicas': 0,
            'analysis': {
                'analyzer': {
                    'default': {
                        'type': 'custom',
                        'tokenizer': 'keyword'
                    }
                }
            },
            'store': {
                'compress': {
                    'stored': 'true'
                }
            },
        })

    # indexing 100 apps
    for add_on in range(100):
        docs = defaultdict(list)
        for delta in range(day_range.days):
            date = first_day + datetime.timedelta(days=delta)
            data = {
                'date': date,
                'os': random.choice(platforms),
                'downloads_count': random.randint(1000, 1500),
                'users_count': random.randint(10000, 15000),
                'add_on': add_on + 1
            }
            docs[date.month].append(data)

        for month, values in docs.items():
            client.bulk_index('time_2012-%.2d' % month, type, values)

        sys.stdout.write('.')
        sys.stdout.flush()

    client.optimize('time_*', max_num_segments=1, wait_for_merge=True)
    client.flush()
    sys.stdout.write('\nDone!\n')
def index_data(data_source, index_name, doc_type):
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except:
        pass
    es.create_index(index_name)
    try:
        es.bulk_index(index_name, doc_type, data_source)
    except:
        print("Error! Skipping Document...!")
        pass
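# Hypothetical usage of the helper above, assuming a local Elasticsearch node
# and a small list of dicts; the "products"/"product" names and the sample
# documents are illustrative only.
sample_docs = [
    {"id": 1, "name": "laptop", "price": 899.0},
    {"id": 2, "name": "monitor", "price": 249.0},
]
index_data(sample_docs, "products", "product")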
def setUp(self):
    es_connection = ElasticSearch('http://localhost:9200')
    try:
        es_connection.delete_index('unit_tests')
    except:
        pass
    es_connection.create_index('unit_tests')

    class TestModel(SearchModel):
        index_name = 'unit_tests'

    self.model = TestModel
def cli(index_name, delete_index, mapping_file, doc_type, import_file,
        delimiter, tab, host, docs_per_chunk, bytes_per_chunk, parallel,
        quiet, document_id_in_file):
    """
    Bulk import a delimited file into a target Elasticsearch instance. Common
    delimited files include things like CSV and TSV.

    \b
    Load a CSV file:
      csv2es --index-name potatoes --doc-type potato --import-file potatoes.csv

    \b
    For a TSV file, note the tab delimiter option
      csv2es --index-name tomatoes --doc-type tomato --import-file tomatoes.tsv --tab

    \b
    For a nifty pipe-delimited file (delimiters must be one character):
      csv2es --index-name pipes --doc-type pipe --import-file pipes.psv --delimiter '|'

    """
    echo('Using host: ' + host, quiet)
    es = ElasticSearch(host)

    if delete_index:
        try:
            es.delete_index(index_name)
            echo('Deleted: ' + index_name, quiet)
        except ElasticHttpNotFoundError:
            echo('Index ' + index_name + ' not found, nothing to delete', quiet)

    try:
        es.create_index(index_name)
        echo('Created new index: ' + index_name, quiet)
    except IndexAlreadyExistsError:
        echo('Index ' + index_name + ' already exists', quiet)
    except ElasticHttpError as exception:
        echo('Error creating index %s. ElasticHttpError [%s]' %
             (index_name, exception.error), quiet)

    echo('Using document type: ' + doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: ' + mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.loads(f.read())
        es.put_mapping(index_name, doc_type, mapping)

    target_delimiter = sanitize_delimiter(delimiter, tab)
    documents = documents_from_file(es, import_file, target_delimiter, quiet,
                                    document_id_in_file)
    perform_bulk_index(host, index_name, doc_type, documents, docs_per_chunk,
                       bytes_per_chunk, parallel)
def es_indexer():
    es = ElasticSearch('http://localhost:9200/')
    if es:
        # Delete index /sentiment_analysis if it already exists
        try:
            es.delete_index("sentiment_analysis")
            print "Deleted index sentiment_analysis if it already existed."
        except ElasticHttpNotFoundError:
            # The index did not exist yet, so there is nothing to delete.
            pass
        finally:
            print "Creating index sentiment_analysis ...."
            es.create_index("sentiment_analysis", {
                'settings': {
                    'index': {
                        'store': {
                            'type': "default"
                        },
                        'number_of_shards': 1,
                        'number_of_replicas': 1
                    },
                    'analysis': {
                        'analyzer': {
                            'default_english': {
                                'type': 'english'
                            }
                        }
                    }
                },
                "mappings": {
                    "document": {
                        "properties": {
                            "text": {
                                "type": "string",
                                "store": True,
                                "index": "analyzed",
                                "term_vector": "with_positions_offsets_payloads",
                                "analyzer": "default_english"
                            },
                            "sentiment": {
                                "type": "string",
                                "store": True,
                                "index": "analyzed",
                                "analyzer": "default_english"
                            }
                        }
                    }
                }
            })
        print "Created index 'sentiment_analysis' with type 'document' and an analyzed field 'text'."
    else:
        print "ElasticSearch is not running or the default cluster is down."
class Indexer(object):
    def __init__(self, input):
        self.input = input
        self.es = ElasticSearch()
        self.index_name = "psim"
        self.doc_type = 'book'

    def delete_index(self):
        # Delete index if already found one
        try:
            self.es.delete_index(index=self.index_name)
        except Exception:
            pass

    def create_index(self):
        self.es.create_index(index=self.index_name,
                             settings=self.get_index_settings())

    def get_index_settings(self):
        settings = {
            "mappings": {
                "book": {
                    "_all": {"enabled": "false"},
                    "properties": {
                        "codes": {"type": "string", "term_vector": "yes", "store": "true"},
                        "pid": {"type": "string"},
                        "embedding": {"type": "float", "store": "true"},
                        "magnitude": {"type": "float", "store": "true"}
                    }
                }
            }
        }
        return settings

    def documents(self):
        with open(self.input) as input_file:
            for line in input_file:
                json_doc = json.loads(line)
                yield self.es.index_op(json_doc, doc_type=self.doc_type)

    def index(self):
        self.delete_index()
        self.create_index()
        for chunk in bulk_chunks(self.documents(), docs_per_chunk=1000):
            self.es.bulk(chunk, index=self.index_name, doc_type=self.doc_type)
        self.es.refresh(self.index_name)
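# Hypothetical usage of the Indexer class above, assuming an input file with
# one JSON document per line; "books.jsonl" is an illustrative filename only.
indexer = Indexer("books.jsonl")
indexer.index()  # drops any existing "psim" index, recreates it, and bulk-loads the file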
def index_data(data_path, chunksize, index_name, doc_type):
    f = open(data_path)
    csvfile = pd.read_csv(f, iterator=True, chunksize=chunksize)
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except:
        pass
    es.create_index(index_name)
    for i, df in enumerate(csvfile):
        records = df.where(pd.notnull(df), None).T.to_dict()
        records_list = [records[i] for i in records]
        try:
            es.bulk_index(index_name, doc_type, records_list)
        except:
            print("Error! Skipping chunk...!")
            pass
            'adId': {'type': 'integer'},
            'adUrl': {'type': 'string'},
            'adType': {'type': 'string'},
            'adSize': {'type': 'string'},
            'dateCreated': {'type': 'date', 'format': 'YYYY-MM-dd HH:mm:ss'},
            'websiteId': {'type': 'integer'},
            'website': {'type': 'string', 'analyzer': 'simple'},
            'category': {'type': 'string'},
            'subCategory': {'type': 'string'}
        }
    }
}

es.health(wait_for_status='yellow')
es.delete_index('write-ads')
es.create_index('write-ads', settings={'mappings': ad_mapping})

dateYMD = args["date"]
prepareDataFromDB(dateYMD)

dir = DATA_FILES_JSON + '/' + dateYMD
for filename in os.listdir(dir):
    if filename.endswith('.json'):
        with open(dir + '/' + filename) as open_file:
            json_docs = json.load(open_file)
            es.bulk((es.index_op(doc) for doc in json_docs),
                    index='write-ads', doc_type='ad')

es.refresh("write-ads")
class IbbdElasticSearch:
    """
    ES operations wrapper.
    Docs: http://pyelasticsearch.readthedocs.io/en/latest/
    """
    es = None
    config = {}
    mapping_is_set = False  # whether the es mapping has already been set

    def __init__(self, config):
        """
        ES initialization.

        Config parameters:
            host: es connection string
            indexName: name of the index
            deleteIndex: whether to delete an already existing index; defaults to false (do not delete)
            settings: index settings. See the es documentation for the available options.
            settingsFile: index settings as a json file. See the es documentation for details.
            mappings: mappings configuration. See the es documentation for details.
            mappingsFile: mappings configuration as a json file. See the es documentation for details.
            idField: id field, for data that already contains an id field.

        Note: at most one of settings and settingsFile may be given,
        and likewise at most one of mappings and mappingsFile.
        """
        self.es = ElasticSearch(config['host'])

        if 'docType' not in config:
            config['docType'] = config['indexName']
        self.config = config

        if 'deleteIndex' in config and config['deleteIndex']:
            try:
                self.es.delete_index(config['indexName'])
                print('delete index ' + config['indexName'] + ' success!')
            except ElasticHttpNotFoundError:
                # If the index did not exist in the first place, just print a notice
                print('Index ' + config['indexName']
                      + ' not found, nothing to delete!')
            except:
                raise Exception('Index ' + config['indexName'] + ' delete error!')

        try:
            if 'settings' in config:
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            elif 'settingsFile' in config:
                with open(config['settingsFile'], 'r') as f:
                    config['settings'] = json.loads(f.read())
                self.es.create_index(config['indexName'],
                                     settings=config['settings'])
            else:
                self.es.create_index(config['indexName'])
            print('create index ' + config['indexName'] + ' success!')
        except Exception:
            raise Exception("create index " + config['indexName'] + ' error!')

    def _putMapping(self, row):
        """
        Set the es mapping.
        A default configuration can be generated from row.
        """
        try:
            if 'mappingsFile' in self.config:
                with open(self.config['mappingsFile'], 'r') as f:
                    self.config['mappings'] = json.loads(f.read())
            if 'mappings' in self.config:
                self.es.put_mapping(self.config['indexName'],
                                    self.config['docType'],
                                    self.config['mappings'])
                print("put mapping " + self.config['indexName'] + ' success!')
        except Exception:
            raise Exception("put mapping " + self.config['indexName'] + ' error!')

    def read(self):
        pass

    def batchRead(self):
        pass

    def write(self, row):
        """Write a single row."""
        return self.batchWrite([row])

    def batchWrite(self, rows):
        """Write multiple rows."""
        if not self.mapping_is_set:
            # Set the mapping once, based on the first row
            self.mapping_is_set = True
            self._putMapping(rows[0])

        docs = ()
        if 'idField' in self.config:
            docs = (self.es.index_op(doc, id=doc.pop(self.config['idField']))
                    for doc in rows)
        else:
            docs = (self.es.index_op(doc) for doc in rows)

        self.es.bulk(docs,
                     index=self.config['indexName'],
                     doc_type=self.config['docType'])
        return True
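# A minimal sketch of a config dict for IbbdElasticSearch, based on the
# parameters described in the __init__ docstring above; every value here is
# illustrative, not taken from the original project.
config = {
    'host': 'http://localhost:9200/',
    'indexName': 'articles',
    'deleteIndex': True,             # drop any existing "articles" index first
    'settingsFile': 'settings.json',
    'mappingsFile': 'mappings.json',
    'idField': 'id',                 # documents already carry their own id
}
writer = IbbdElasticSearch(config)
writer.batchWrite([{'id': 1, 'title': 'hello'}, {'id': 2, 'title': 'world'}])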
def drop_elastic(settings):
    try:
        es = ElasticSearch(settings['ELASTICSEARCH_URL'])
        es.delete_index(settings['ELASTICSEARCH_INDEX'])
    except:
        pass
es = ElasticSearch(ELASTICSEARCH_HOST, timeout=120)

FILES = [
    "nofly/shapefile/us_national_parks",
    "nofly/shapefile/us_military",
    "nofly/shapefile/5_mile_airport",
]

try:
    es.delete_all(ELASTICSEARCH_DOC, ELASTICSEARCH_INDEX)
except ElasticHttpError:
    pass

try:
    es.delete_index(ELASTICSEARCH_INDEX)
except ElasticHttpError:
    pass

index_settings = {
    "number_of_shards": 3,
    "number_of_replicas": 1,
    "mappings": {
        ELASTICSEARCH_DOC: {
            "properties": {
                "location": {
                    "type": "geo_shape",
                    "tree": "quadtree",
                    "precision": "1m"
                }
            }
class ESWrapper(BaseDB): def __init__(self, index_name, doc_type, host='http://localhost', port=9200): self.eserver = ElasticSearch(urls=host, port=port, timeout=60, max_retries=3) #self._base_query = {"query": {"bool": {"must": {"match": {}}}}} #self._base_query = {"query": {"bool": {}}} self._geo_filter = {"distance": "20km", "coordinates": {}} self._population_filter = {'population': {'gte': 5000}} self._index = index_name self._doctype = doc_type def getByid(self, geonameId): maincondition = {"match": {"id": geonameId}} q = {"query": {"bool": {"must": maincondition}}} return self.eserver.search( q, index=self._index, doc_type=self._doctype)['hits']['hits'][0]['_source'] def _query(self, qkey, qtype="exact", analyzer=None, min_popln=None, size=10, **kwargs): """ qtype values are exact, relaxed or geo_distance Always limit results to 10 """ q = {"query": {"bool": {}}} query_name = kwargs.pop('query_name', 'must') query_name = "should" if query_name == "should": q["query"]["bool"]["minimum_number_should_match"] = 1 maincondition = {} if qtype == "exact": maincondition = [{ "term": { "name.raw": { "value": qkey } } }, { "term": { "asciiname.raw": { "value": qkey } } }, { "term": { "alternatenames": { "value": qkey } } }] #maincondition["match"] = {"name.raw": {"query": qkey}} if analyzer: maincondition["match"]["name.raw"]["analyzer"] = analyzer elif qtype == "relaxed": maincondition["match"] = {"alternatenames": {"query": qkey}} if analyzer: maincondition["match"]["alternatenames"]["analyzer"] = analyzer #q["query"]["bool"][query_name]["match"].pop("name.raw", "") elif qtype == "combined": maincondition = [{ "bool": { "must": { "multi_match": { "query": qkey, "fields": ["name.raw", "asciiname", "alternatenames"] } }, "filter": { "bool": { "should": [{ "range": { "population": { "gte": 5000 } } }, { "terms": { "featureCode": [ "pcla", "pcli", "cont", "rgn", "admd", "adm1", "adm2" ] } }] } } } }, { "term": { "name.raw": { "value": qkey } } }, { "term": { "asciiname.raw": { "value": qkey } } }, { "term": { "alternatenames": { "value": qkey[1:] } } }, { "match": { "alternatenames": { "query": qkey, 'fuzziness': kwargs.pop("fuzzy", 0), "max_expansions": kwargs.pop("max_expansion", 5), "prefix_length": kwargs.pop("prefix_length", 1) } } }] if maincondition: q["query"]["bool"][query_name] = maincondition if min_popln: filter_cond = [{"range": {"population": {"gte": min_popln}}}] else: filter_cond = [] if kwargs: #filter_cond = [{"range": {"population": {"gte": min_popln}}}] filter_cond += [{ "term": { key: val } } for key, val in kwargs.viewitems()] # print(filter_cond) q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}} elif min_popln: filter_cond = [{ "range": { "population": { "gte": min_popln } } }, { "terms": { "featureCode": ["ppla", "pplx"] } }] q["query"]["bool"]["filter"] = { "bool": { "should": filter_cond } } return self.eserver.search(q, index=self._index, doc_type=self._doctype) def query(self, qkey, min_popln=None, **kwargs): #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits'] res = self._query(qkey, min_popln=min_popln, **kwargs)['hits'] #max_score = sum([r['_score'] for r in res]) max_score = res['max_score'] #sum([r['_score'] for r in res]) #for t in res: gps = [] if max_score == 0.0: ## no results were obtained by elasticsearch instead it returned a random/very ## low scoring one res['hits'] = [] for t in res['hits']: t['_source']['geonameid'] = t["_source"]["id"] #t['_source']['_score'] = t[1] / max_score t['_source']['_score'] = t['_score'] / 
max_score pt = GeoPoint(**t["_source"]) if t['_source']['featureCode'].lower() == "cont": gps = [pt] break gps.append(pt) if len(gps) == 1: gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) / max(float(len(gps[0].name)), float(len(qkey)))) return gps def near_geo(self, geo_point, min_popln=5000, **kwargs): q2 = { "query": { "bool": { "must": { "match_all": {} }, "filter": [{ "geo_distance": { "distance": "30km", "coordinates": geo_point } }, { "terms": { "featureCode": ["pcli", "ppl", "ppla2", "adm3"] } }] } }, "sort": { "population": "desc" } } res = self.eserver.search(q2, index=self._index, doc_type=self._doctype, **kwargs)['hits']['hits'][0]['_source'] res['confidence'] = 1.0 return [GeoPoint(**res)] def create(self, datacsv, confDir="../data/"): with open(os.path.join(confDir, "es_settings.json")) as jf: settings = json.load(jf) settings['mappings'][self._doctype] = settings['mappings'].pop( 'places') try: self.eserver.create_index(index=self._index, settings=settings) except: self.eserver.delete_index(self._index) self.eserver.create_index(index=self._index, settings=settings) for chunk in bulk_chunks(self._opLoader(datacsv, confDir), docs_per_chunk=1000): self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype) print "..", self.eserver.refresh(self._index) def _opLoader(self, datacsv, confDir): with DataReader(datacsv, os.path.join(confDir, 'geonames.conf')) as reader: cnt = 0 for row in reader: try: row['coordinates'] = [ float(row['longitude']), float(row['latitude']) ] try: row['population'] = int(row["population"]) except: row['population'] = -1 try: row['elevation'] = int(row['elevation']) except: row['elevation'] = -1 del (row['latitude']) del (row['longitude']) #print row['name'] row['alternatenames'] = row['alternatenames'].split(",") cnt += 1 yield self.eserver.index_op(row, index=self._index, doc_type=self._doctype) except: print json.dumps(row) continue
from pyelasticsearch import ElasticSearch
import simplejson, sys

s = ElasticSearch("http://localhost:9200")

if "init" in sys.argv:
    try:
        s.delete_index("flights")
    except Exception, e:
        print e
    try:
        s.create_index("flights")
    except Exception, e:
        print e
    else:
        print "Created flights"
    s.put_mapping("flights", "flight", simplejson.loads('{"flight":{"properties":{"datum":{"type":"string","index":"not_analyzed","omit_norms":true,"index_options":"docs"},"type": { "type": "string", "index" : "not_analyzed" }, "duration":{"type":"double"},"end":{"properties":{"alt":{"type":"integer"},"dist":{"type":"float"},"speed":{"type":"integer"},"time":{"type":"date","format":"dateOptionalTime"},"town":{"type":"string","analyzer":"keyword"},"country":{"type":"string","analyzer":"keyword"}}},"flight":{"type":"string","store":true,"analyzer":"keyword"},"hex":{"type":"string","store":true,"analyzer":"keyword"},"id":{"type":"string","store":true},"radar":{"type":"string","store":true,"analyzer":"keyword"},"reg":{"type":"string","store":true,"analyzer":"keyword"},"route":{"properties":{"coordinates":{"type":"double"},"type":{"type":"string"}}},"start":{"properties":{"alt":{"type":"integer"},"dist":{"type":"float"},"speed":{"type":"integer"},"time":{"type":"date","format":"dateOptionalTime"},"town":{"type":"string","analyzer":"keyword"},"country":{"type":"string","analyzer":"keyword"}}}}}}'))


def md(a):
    a["datum"] = a["starttime"][:10]
    return a


def makets(a):
    for f in ("starttime", "endtime"):
        a[f] = maket(a[f])
    return a
def delete_all():
    es = ElasticSearch(CONTEXT["datahub-store"])
    es.delete_index(CONTEXT["datahub-index"])
def documents_from_mails(mails):
    """Build documents from mails."""
    for mail in mails:
        if 'Date' in mail.headers:  # Some mails seem broken.
            yield {
                '@source': 'stuff://',
                '@type': 'mailadmin',
                '@tags': [mail.headers['From']],
                '@fields': mail.headers,
                '@timestamp': parse_date(mail.headers['Date']),
                '@source_host': 'localhost',
                '@source_path': 'mail/admin ',
                '@message': mail.body,
                'id': mail.headers['Message-Id']
            }


if __name__ == '__main__':
    # Instantiate it with an url
    es = ElasticSearch(sys.argv[1])
    # Kibana needs this kind of name
    NAME = 'logstash-2013.06.13'
    try:
        es.delete_index(NAME)
    except ElasticHttpNotFoundError:
        pass  # Nobody cares
    emails = mbox(sys.argv[2])
    for n, docs in enumerate(bulk_iterate(documents_from_mails(emails), 100)):
        es.bulk_index(NAME, 'mailadmin', docs)
        print(n)
    print es.refresh(NAME)
                'analyzer': 'mmseg',
                'boost': 0.7,
                'term_vector': 'with_positions_offsets'
            },
            'categories': {
                'type': 'nested',
                'properties': {
                    'url': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                    'name': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    },
                }
            }
        }
    }
}
}

es = ElasticSearch(HOST)

try:
    es.delete_index(INDEX)
except ElasticHttpNotFoundError:
    # No index found
    pass

es.create_index(INDEX, settings=index_settings)
def getPets(self, breed, page):
    query = breed['ename']
    if '宠物' not in query:
        query = '宠物' + query
    url = self.pet_site_url % (query, page)
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    result = response.read().decode('gbk')
    if result is not None:
        data = json.loads(result)['data'][0]['disp_data']
        if data:
            return data


es = ElasticSearch('http://localhost:9200/')
es.delete_index('pet')

spider = Spider()
breeds = spider.getPetBreeds()
p = Pinyin()
for breed in breeds:
    flg = 1
    page = 1
    pet_list = []
    while flg:
        pets = spider.getPets(breed, (page - 1) * spider.limit)
        if not pets:
            flg = 0
        else:
            page = page + 1
            for pet in pets:
                pet_obj = {}
class ElasticSearchProvider(SearchProvider): def __init__(self, config, db=None, authnz_wrapper=None, io_loop=None): self.debug = False self.config = config if db is not None: self.db = db self.syncES = ElasticSearch( '%(ELASTIC_SEARCH_PROTOCOL)s://%(ELASTIC_SEARCH_HOST)s:%(ELASTIC_SEARCH_PORT)s' % config ) self.asyncES = ESConnection( host=config.get('ELASTIC_SEARCH_HOST'), port=config.get('ELASTIC_SEARCH_PORT'), io_loop=io_loop, protocol=config.get('ELASTIC_SEARCH_PROTOCOL'), ) self.index = config.get('ELASTIC_SEARCH_INDEX') self.max_retries = config.get('ELASTIC_SEARCH_MAX_RETRIES') def activate_debug(self): self.debug = True def connect_to_db(self): from sqlalchemy import create_engine from sqlalchemy.orm import scoped_session, sessionmaker conn_string = self.config.get('SQLALCHEMY_CONNECTION_STRING') engine = create_engine( conn_string, convert_unicode=True, pool_size=1, max_overflow=0, echo=self.debug ) maker = sessionmaker(bind=engine, autoflush=True) self.db = scoped_session(maker) def _assemble_inner_query(self, domain=None, page_filter=None): if page_filter and domain: page_prefix = '%s/%s' % (domain.url, page_filter) else: page_prefix = None if page_prefix: return { 'prefix': { 'page_url': page_prefix } } else: return { 'match_all': {} } def _assemble_outer_query(self, inner_query, filter_terms): return { 'filtered': { 'query': inner_query, 'filter': { 'and': [{ 'term': filter_term } for filter_term in filter_terms] } } } def _assemble_filter_terms(self, key_id=None, domain=None): filter_terms = [] if key_id: filter_terms.append({'keys.id': key_id}) if domain: filter_terms.append({'domain_id': domain.id}) return filter_terms def gen_doc(self, review): return { 'keys': [{'id': violation.key_id} for violation in review.violations], 'uuid': str(review.uuid), 'completed_date': review.completed_date, 'violation_count': review.violation_count, 'page_id': review.page_id, 'page_uuid': str(review.page.uuid), 'page_url': review.page.url, 'page_last_review_date': review.page.last_review_date, 'domain_id': review.domain_id, 'domain_name': review.domain.name, } def index_review(self, review): for attempt in range(self.max_retries): try: self.syncES.send_request( method='POST', path_components=[self.index, 'review', review.page_id], body=dumps(self.gen_doc(review)), encode_body=False ) break except (Timeout, ConnectionError, ElasticHttpError, InvalidJsonResponseError) as e: values = review.id, review.page_id, str(e) logging.error('Could not index review (review_id:{0}, page_id:{1}): {2}'.format(*values)) time.sleep(1) if attempt >= self.max_retries - 1: raise else: raise def index_reviews(self, reviewed_pages, reviews_count, batch_size): action = {'index': {'_type': 'review'}} for i in range(0, reviews_count, batch_size): body_bits = [] for page in reviewed_pages[i:i + batch_size]: doc = self.gen_doc(page.last_review) action['index']['_id'] = doc['page_id'] body_bits.append(dumps(action)) body_bits.append(dumps(doc)) # Yes, that trailing newline IS necessary body = '\n'.join(body_bits) + '\n' self.syncES.send_request( method='POST', path_components=[self.index, '_bulk'], body=body, encode_body=False ) logging.info('Done!') @return_future def get_by_violation_key_name(self, key_id, current_page=1, page_size=10, domain=None, page_filter=None, callback=None): def treat_response(response): if response.error is None: try: hits = loads(response.body).get('hits', {'hits': []}) reviews_data = [] for hit in hits['hits']: completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date']) 
reviews_data.append({ 'uuid': hit['_source']['uuid'], 'page': { 'uuid': hit['_source']['page_uuid'], 'url': hit['_source']['page_url'], 'completedAt': completedAt }, 'domain': hit['_source']['domain_name'] }) reviews_count = hits.get('total', 0) callback({ 'reviews': reviews_data, 'reviewsCount': reviews_count }) except Exception as e: reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message) logging.error(reason) callback({'error': {'status_code': 500, 'reason': reason}}) else: reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body) logging.error(reason) callback({'error': {'status_code': 500, 'reason': reason}}) inner_query = self._assemble_inner_query(domain, page_filter) filter_terms = self._assemble_filter_terms(key_id, domain) query = self._assemble_outer_query(inner_query, filter_terms) sort_ = [{ 'completed_date': { 'order': 'desc' } }, { 'violation_count': { 'order': 'desc' } }] source = {'query': query, 'sort': sort_} self.asyncES.search( callback=treat_response, index=self.index, type='review', source=source, page=current_page, size=page_size, ) @return_future def get_domain_active_reviews(self, domain, current_page=1, page_size=10, page_filter=None, callback=None): def treat_response(response): if response.error is None: try: hits = loads(response.body).get('hits', {'hits': []}) pages = [] for hit in hits['hits']: completedAt = datetime.utcfromtimestamp(hit['_source']['completed_date']) pages.append({ 'url': hit['_source']['page_url'], 'uuid': hit['_source']['page_uuid'], 'violationCount': len(hit['_source']['keys']), 'completedAt': completedAt, 'reviewId': hit['_source']['uuid'] }) reviews_count = hits.get('total', 0) callback({ 'reviewsCount': reviews_count, 'pages': pages }) except Exception as e: reason = 'ElasticSearchProvider: invalid response (%s [%s])' % (type(e), e.message) logging.error(reason) callback({'error': {'status_code': 500, 'reason': reason}}) else: reason = 'ElasticSearchProvider: erroneous response (%s [%s])' % (response.error.message, response.body) logging.error(reason) callback({'error': {'status_code': 500, 'reason': reason}}) inner_query = self._assemble_inner_query(domain=domain, page_filter=page_filter) filter_terms = self._assemble_filter_terms(domain=domain) query = self._assemble_outer_query(inner_query, filter_terms) sort_ = [{ 'violation_count': { 'order': 'desc' } }, { 'completed_date': { 'order': 'desc' } }] source = {'query': query, 'sort': sort_} self.asyncES.search( callback=treat_response, index=self.index, type='review', source=source, page=current_page, size=page_size, ) def refresh(self): try: self.syncES.refresh(index=self.index) except Exception as e: logging.error('Could not refresh index (%s)' % e) def get_index_settings(cls): return { 'index': { 'number_of_shards': 4 } } def get_index_mapping(cls): return { 'review': { 'properties': { 'keys': { 'properties': { 'id': { 'type': 'integer' } } }, 'uuid': { 'type': 'string', 'index': 'not_analyzed' }, 'completed_date': { 'type': 'integer' }, 'violation_count': { 'type': 'float' }, 'page_id': { 'type': 'integer' }, 'page_uuid': { 'type': 'string', 'index': 'not_analyzed' }, 'page_url': { 'type': 'string', 'index': 'not_analyzed' }, 'page_last_review_date': { 'type': 'integer' }, 'domain_id': { 'type': 'integer' }, 'domain_name': { 'type': 'string', 'index': 'not_analyzed' } } } } def setup_index(self): try: settings = self.get_index_settings() self.syncES.create_index(index=self.index, settings=settings) mapping 
= self.get_index_mapping() self.syncES.put_mapping(index=self.index, doc_type='review', mapping=mapping) logging.info('Index %s created.' % self.index) except Exception as e: raise e def delete_index(self): try: self.syncES.delete_index(index=self.index) logging.info('Index %s deleted.' % self.index) except Exception as e: raise e def _get_max_page_id_from_index(self, must_have_domain_name=False): if must_have_domain_name: inner_query = { 'constant_score': { 'filter': { 'not': { 'missing': { 'field': 'domain_name' } } } } } else: inner_query = { 'match_all': {} } query = { 'query': inner_query, 'sort': [{ 'page_id': { 'order': 'desc' } }] } results = self.syncES.search(query, index=self.index, doc_type='review') if results['hits']['total'] > 0: return results['hits']['hits'][0]['_id'] or 0 return 0 def index_all_reviews(self, keys=None, batch_size=200, replace=False): logging.info('Querying database...') self.connect_to_db() if keys is not None: keys = [k.id for k in self.db.query(Key.id).filter(Key.name.in_(keys)).all()] try: max_page_id = self._get_max_page_id_from_index(must_have_domain_name=True) except Exception: logging.error('Could not retrieve max page_id! Use with --replace (with caution)') return def apply_filters(query): if keys is not None: query = query \ .filter(Violation.review_id == Page.last_review_id) \ .filter(Violation.key_id.in_(keys)) if not replace: query = query.filter(Page.id > max_page_id) return query.filter(Page.last_review_id != None) reviews_count = apply_filters(self.db.query(func.count(Page))).scalar() query = self.db.query(Page).options(joinedload('last_review')) reviewed_pages = apply_filters(query).order_by(Page.id.asc()) logging.info('Indexing %d reviews...' % reviews_count) self.index_reviews(reviewed_pages, reviews_count, batch_size) @classmethod def new_instance(cls, config): return ElasticSearchProvider(config) @classmethod def main(cls): import sys parser = cls.argparser() args = parser.parse_args() config = {} host = None port = None index = None es = None levels = ['ERROR', 'WARNING', 'INFO', 'DEBUG'] log_level = levels[args.verbose] logging.basicConfig(level=getattr(logging, log_level), format='%(levelname)s - %(message)s') if not (args.create or args.recreate or args.delete or args.keys or args.all_keys): parser.print_help() sys.exit(1) if args.conf: from derpconf.config import ConfigurationError from holmes.config import Config try: config = Config().load(args.conf[0]) host = config['ELASTIC_SEARCH_HOST'] port = config['ELASTIC_SEARCH_PORT'] index = config['ELASTIC_SEARCH_INDEX'] except ConfigurationError: logging.error('Could not load config! Use --conf conf_file') sys.exit(1) except KeyError: logging.error('Could not parse config! Check it\'s contents') sys.exit(1) if args.server: try: host, port = args.server[0].split(':') config['ELASTIC_SEARCH_HOST'] = host config['ELASTIC_SEARCH_PORT'] = port except Exception: logging.error('Could not parse server host and port! 
Use --server host:port') sys.exit(1) if args.index: index = args.index[0] config['ELASTIC_SEARCH_INDEX'] = index from pyelasticsearch.exceptions import IndexAlreadyExistsError, ElasticHttpNotFoundError, InvalidJsonResponseError from requests.exceptions import ConnectionError try: if args.create or args.recreate or args.delete: if host is None or port is None: logging.error('Need either a host and port or a config file to perform such operation!') sys.exit(1) if index is None: logging.error('Need either an index name or a config file to perform such operation!') sys.exit(1) else: es = cls.new_instance(config) if args.recreate or args.delete: try: es.delete_index() except ElasticHttpNotFoundError: pass except InvalidJsonResponseError as e: logging.error('Invalid response! Reason: %s' % e) sys.exit(1) if args.create or args.recreate: es.setup_index() if args.keys or args.all_keys: if config is None: logging.error('Need a config file to perform such operation! Use --conf conf_file') else: batch_size = args.batch_size[0] if args.batch_size else 200 es = cls.new_instance(config) if not es else es try: if args.verbose > 2: es.activate_debug() if args.keys: es.index_all_reviews(args.keys, replace=args.replace, batch_size=batch_size) elif args.all_keys: es.index_all_reviews(replace=args.replace, batch_size=batch_size) except InvalidJsonResponseError as e: logging.error('Invalid response! Reason: %s' % e) sys.exit(1) except IndexAlreadyExistsError: logging.error('Index %s already exists! Use --recreate (with caution) to recreate' % index) except ConnectionError: logging.error('Could not connect to server at %s:%s' % (host, port)) except KeyError: logging.error('Could not get host nor port! Use either -conf or --server') sys.exit(1)
class TestClient(unittest.TestCase):

    def setUp(self):
        super(TestClient, self).setUp()
        docs = []
        self.es_host = os.environ.get('ES_HOST', 'http://*****:*****

    @mock.patch('monolith.client.util.iterweeks')
    def test_datetime_ranges(self, _mock):
        "Test datetime ranges get converted to dates."
        client = self._make_one()
        start = datetime.datetime(2012, 1, 1, 12, 34, 56)
        end = datetime.datetime(2012, 1, 31, 12, 34, 56)
        list(client('downloads_count', start, end, interval='week'))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        assert not isinstance(_mock.call_args[0][0], datetime.datetime)
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))
        assert not isinstance(_mock.call_args[0][1], datetime.datetime)

    def test_date_order(self):
        # Ensure fill doesn't change date ordering.
        client = self._make_one()
        prev_date = datetime.date(2000, 1, 1)
        # Addon 1 doesn't have downloads for every month and the client will
        # fill zeroes for the missing dates.
        hits = list(
            client('downloads_count', START, '2012-05-01', interval='month',
                   add_on='1'))
        for hit in hits:
            d = hit['date']
            assert prev_date < d
            prev_date = d
import sys, os
from pyelasticsearch import ElasticSearch, ElasticHttpNotFoundError
from flask import *

sys.path.insert(0, '..')
from app import app
from flask.ext.sqlalchemy import SQLAlchemy
from apps.glyph.models import *

app.testing = True
client = app.test_client()
ctx = app.test_request_context()
ctx.push()

es = ElasticSearch('http://localhost:9200/')
try:
    es.delete_index('cdpp')
except ElasticHttpNotFoundError:
    # we can safely ignore this, because it might be an initial run
    pass

res = db.session.query(Sign).all()
for r in res:
    d = r.__dict__
    d.pop('_sa_instance_state', None)

# bulk-index the cleaned signs
es.bulk_index('cdpp', 'sign', [r.__dict__ for r in res], id_field='id')

tablets = db.session.query(Tablet).all()
repr = []
for result in tablets:
from pyelasticsearch import ElasticSearch
from pyelasticsearch.exceptions import ElasticHttpNotFoundError, IndexAlreadyExistsError
import requests
import time

IGNORED_GENRES = ("9", "15", "19")  # We only care about stations that play music.

import settings

es = ElasticSearch(settings.ES_URL)
INDEX_NAME = settings.ES_INDEX

try:
    es.delete_index(INDEX_NAME)
except ElasticHttpNotFoundError:
    pass

try:
    es.create_index(INDEX_NAME)
except IndexAlreadyExistsError:
    pass

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36'
}

failures = 0
pk = 0
while failures < 200:
    pk += 1
    r = requests.get("http://www.iheart.com/a/live/station/%d/" % pk, headers=headers)
t0 = time()
csv_filename = 'robinhood-daily-rets.csv'

# size of the bulk
chunksize = 5000

# parse csv with pandas
csvfile = pd.read_csv(csv_filename)

# init ElasticSearch
es = ElasticSearch('http://104.236.201.91:9200/')

# init index
try:
    es.delete_index("robinhood")
except:
    pass
es.create_index("robinhood")

# start bulk indexing
print("now indexing %s..." % (csv_filename))
records = csvfile.where(pd.notnull(csvfile), None).T.to_dict()
list_records = [records[it] for it in records]
try:
    es.bulk_index("robinhood", "myPortfolio", list_records)
except:
    print("error!, skipping a date")
    pass
class ElasticSearch(object): conn = None url = settings.ELASTICSEARCH_URL index_name = settings.ELASTICSEARCH_INDEX_NAME stdout = None stderr = None def __init__(self, index_name=None, stdout=None, stderr=None): self.conn = PyElasticSearch() if index_name: self.index_name = index_name if stdout: self.stdout = stdout if stderr: self.stderr = stderr def create_index(self, delete=True): if delete: try: self.conn.delete_index(self.index_name) except ElasticHttpNotFoundError as e: pass mappings = dict( (k, v) for k, v in get_elasticsearch_properties().items()) self.conn.create_index(self.index_name, settings={'mappings': mappings}) def index_activity_by_id(self, activity_id): activity = HistoricalActivity.objects.get(pk=activity_id) return self.index_activity(activity) def delete_activity_by_id(self, activity_id): activity = HistoricalActivity.objects.get(pk=activity_id) return self.delete_activity(activity) def index_activity(self, activity): for doc_type in DOC_TYPES_ACTIVITY: docs = self.get_activity_documents(activity, doc_type=doc_type) if len(docs) > 0: try: self.conn.bulk((self.conn.index_op( doc, id=doc.pop('id'), parent=doc.pop('_parent', None)) for doc in docs), index=self.index_name, doc_type=doc_type) except BulkError as e: for error in e.errors: msg = '%s: %s on ID %s' % ( error['index']['error']['type'], error['index']['error']['reason'], error['index']['_id']) if 'caused_by' in error['index']['error']: msg += ' (%s: %s)' % ( error['index']['error']['caused_by']['type'], error['index']['error']['caused_by']['reason']) self.stderr and self.stderr.write(msg) def index_investor(self, investor): for doc_type in DOC_TYPES_INVESTOR: docs = self.get_investor_documents(investor, doc_type=doc_type) if len(docs) > 0: try: self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs), index=self.index_name, doc_type=doc_type) except BulkError as e: for error in e.errors: msg = '%s: %s on ID %s' % ( error['index']['error']['type'], error['index']['error']['reason'], error['index']['_id']) if 'caused_by' in error['index']['error']: msg += ' (%s: %s)' % ( error['index']['error']['caused_by']['type'], error['index']['error']['caused_by']['reason']) self.stderr and self.stderr.write(msg) def index_activity_documents(self, activity_identifiers=[]): activity_identifiers = activity_identifiers or HistoricalActivity.objects.filter( fk_status__in=( HistoricalActivity.STATUS_ACTIVE, HistoricalActivity.STATUS_PENDING, HistoricalActivity.STATUS_OVERWRITTEN, HistoricalActivity.STATUS_DELETED)).distinct().values_list( 'activity_identifier', flat=True).distinct() for doc_type in DOC_TYPES_ACTIVITY: docs = [] # Collect documents self.stdout and self.stdout.write( 'Collect %ss for %i deals...' % (doc_type, len(activity_identifiers))) for activity_identifier in activity_identifiers: for activity in self.get_activity_versions( activity_identifier): docs.extend( self.get_activity_documents(activity, doc_type=doc_type)) # Bulk index documents self.stdout and self.stdout.write('Index %i %ss...' 
% (len(docs), doc_type)) if len(docs) > 0: paginator = Paginator(docs, 1000) for page in paginator.page_range: try: self.conn.bulk( (self.conn.index_op(doc, id=doc.pop('id'), parent=doc.pop( '_parent', None)) for doc in paginator.page(page)), index=self.index_name, doc_type=doc_type) except BulkError as e: for error in e.errors: msg = '%s: %s on ID %s' % ( error['index']['error']['type'], error['index']['error']['reason'], error['index']['_id']) if 'caused_by' in error['index']['error']: msg += ' (%s: %s)' % (error['index']['error'] ['caused_by']['type'], error['index']['error'] ['caused_by']['reason']) self.stderr and self.stderr.write(msg) self.conn.refresh() def index_investor_documents(self): investors = Investor.objects.public().order_by( 'investor_identifier', '-id').distinct('investor_identifier') for doc_type in DOC_TYPES_INVESTOR: docs = [] # Collect documents self.stdout and self.stdout.write( 'Collect %ss for %i investors...' % (doc_type, investors.count())) for investor in investors: docs.extend( self.get_investor_documents(investor, doc_type=doc_type)) # Bulk index documents self.stdout and self.stdout.write('Index %i %ss...' % (len(docs), doc_type)) if len(docs) > 0: try: self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs), index=self.index_name, doc_type=doc_type) except BulkError as e: for error in e.errors: msg = '%s: %s on ID %s' % ( error['index']['error']['type'], error['index']['error']['reason'], error['index']['_id']) if 'caused_by' in error['index']['error']: msg += ' (%s: %s)' % ( error['index']['error']['caused_by']['type'], error['index']['error']['caused_by']['reason']) self.stderr and self.stderr.write(msg) #def index_activity_by_version(self, activity_identifier): # for doc_type in get_elasticsearch_properties().keys(): # docs = self.get_documents_for_activity_version(activity_identifier, doc_type=doc_type) # if len(docs) > 0: # try: # self.conn.bulk((self.conn.index_op(doc, id=doc.pop('id')) for doc in docs), # index=self.index_name, # doc_type=doc_type) # except BulkError as e: # for error in e.errors: # stderr and stderr.write('%s: %s (caused by %s: %s, ID: %s)' % ( # error['index']['error']['type'], # error['index']['error']['reason'], # error['index']['error']['caused_by']['type'], # error['index']['error']['caused_by']['reason'], # error['index']['_id'] # )) def get_activity_versions(self, activity_identifier): versions = [] # get the newest non-pending, readable historic version: try: newest = HistoricalActivity.objects.filter( activity_identifier=activity_identifier, fk_status__in=( HistoricalActivity.STATUS_ACTIVE, HistoricalActivity.STATUS_OVERWRITTEN, HistoricalActivity.STATUS_DELETED)).distinct().latest() if newest and not newest.fk_status_id == HistoricalActivity.STATUS_DELETED: versions.append(newest) except HistoricalActivity.DoesNotExist: newest = None # get newer pendings pendings = HistoricalActivity.objects.filter( activity_identifier=activity_identifier, fk_status_id=HistoricalActivity.STATUS_PENDING).distinct() if newest: pendings.filter(history_date__gt=newest.history_date) versions.extend(pendings) return versions def get_activity_documents(self, activity, doc_type='deal'): docs = [] deal_attrs = { 'id': activity.id, 'activity_identifier': activity.activity_identifier, 'historical_activity_id': activity.id, 'status': activity.fk_status_id, } # Todo: Is there a nice way to prevent this extra Activity query? # e.g. 
if we save is_public/deal_scope as ActivityAttributes public_activity = Activity.objects.filter( activity_identifier=activity.activity_identifier).order_by( '-id').first() if public_activity: deal_attrs.update({ 'is_public': public_activity.is_public, 'deal_scope': public_activity.deal_scope, 'deal_size': public_activity.deal_size, 'current_negotiation_status': public_activity.negotiation_status, 'top_investors': public_activity.top_investors, 'fully_updated_date': public_activity.fully_updated_date, }) else: # Fixme: This should not happen self.stderr and self.stderr.write( _('Missing activity for historical activity %i (Activity identifier: #%i)' % (activity.id, activity.activity_identifier))) #except Activity.MultipleObjectsReturned: # # Fixme: This should not happen # self.stderr and self.stderr.write(_('Too much activities for historical activity %i (Activity identifier: #%i)' % ( # activity.id, # activity.activity_identifier # ))) for a in activity.attributes.select_related('fk_group__name').order_by( 'fk_group__name'): # do not include the django object id if a.name == 'id': continue attribute = None attribute_key = '%s_attr' % a.name if attribute_key in get_elasticsearch_properties( )['deal']['properties'].keys(): attribute = { 'value': a.value, 'value2': a.value2, 'date': a.date, 'is_current': a.is_current, } value = a.value # Area field? if a.name and 'area' in a.name and a.polygon is not None: # Get polygon #value = json.loads(a.polygon.json) # Apparently this is case sensitive: MultiPolygon as provided by the GeoJSON does not work #value['type'] = 'multipolygon' value = a.polygon.json or '' # do not include empty values if value is None or value == '': continue # Doc types: location, data_source or contract group_match = a.fk_group and a.fk_group.name or '' group_match = re.match( '(?P<doc_type>location|data_source|contract)_(?P<count>\d+)', group_match) if group_match: dt, count = group_match.groupdict()['doc_type'], int( group_match.groupdict()['count']) if doc_type == dt: while len(docs) < count: docs.append({ '_parent': activity.activity_identifier, 'id': a.id, #'%i_%i' % (a.id, count), }) docs[count - 1][a.name] = [ value, ] # Set doc type counter within deal doc type (for location/data_source/contract) elif doc_type == 'deal': # Set counter key = '%s_count' % dt if key not in deal_attrs.keys(): deal_attrs[key] = count elif deal_attrs[key] < count: deal_attrs[key] = count # Create list with correct length to ensure formset values have the same index if not a.name in deal_attrs: deal_attrs[a.name] = [''] * count if attribute: deal_attrs[attribute_key] = [''] * count else: while len(deal_attrs[a.name]) < count: deal_attrs[a.name].append('') if attribute: deal_attrs[attribute_key].append('') deal_attrs[a.name][count - 1] = value if attribute: deal_attrs['%s_attr' % a.name][count - 1] = attribute # Doc type: deal and not formset elif doc_type == 'deal': if a.name in deal_attrs: deal_attrs[a.name].append(value) if '%s_attr' % a.name in get_elasticsearch_properties( )['deal']['properties'].keys(): deal_attrs['%s_attr' % a.name].append(attribute) else: deal_attrs[a.name] = [ value, ] if '%s_attr' % a.name in get_elasticsearch_properties( )['deal']['properties'].keys(): deal_attrs['%s_attr' % a.name] = [ attribute, ] if doc_type == 'deal': # Additionally save operational company attributes oc = Investor.objects.filter( investoractivityinvolvement__fk_activity__activity_identifier= activity.activity_identifier) if oc.count() > 0: oc = oc.first() for field in Investor._meta.fields: 
if isinstance(field, ForeignKey): deal_attrs['operational_company_%s' % field.name] = getattr( oc, '%s_id' % field.name) else: deal_attrs['operational_company_%s' % field.name] = getattr(oc, field.name) else: pass #self.stderr and self.stderr.write("Missing operational company for deal #%i" % activity.activity_identifier) # Create single document for each location # FIXME: Saving single deals for each location might be deprecated since we have doc_type location now? spatial_names = list(get_spatial_properties()) for i in range(deal_attrs.get('location_count', 0)): doc = deal_attrs.copy() for name in spatial_names: if not name in doc: continue if len(deal_attrs[name]) > i: doc[name] = deal_attrs[name][i] else: doc[name] = '' # Set unique ID for location (deals can have multiple locations) doc['id'] = '%s_%i' % (doc['id'], i) point_lat = doc.get('point_lat', None) point_lon = doc.get('point_lon', None) if point_lat and point_lon: # Parse values try: parsed_lat, parsed_lon = float(point_lat), float(point_lon) doc['geo_point'] = '%s,%s' % (point_lat, point_lon) except ValueError: doc['geo_point'] = '0,0' else: doc['point_lat'] = '0' doc['point_lon'] = '0' doc['geo_point'] = '0,0' # FIXME: we dont really need 'point_lat' and 'point_lon' here, # so we should pop them from doc when adding 'geo_point' docs.append(doc) # Update docs with export values for doc in docs: doc.update(self.get_export_properties(doc, doc_type=doc_type)) return docs def get_export_properties(self, doc, doc_type='deal'): if doc_type == 'investor': return ExportInvestorForm.export(doc) elif doc_type == 'involvement': return InvestorVentureInvolvementForm.export(doc) else: properties = { 'deal_scope_export': doc.get('deal_scope', ''), 'is_public_export': doc.get('is_public', False) and str(_('Yes')) or str(_('No')), 'deal_size_export': doc.get('deal_size', ''), 'current_negotiation_status_export': doc.get('current_negotiation_status', ''), 'top_investors_export': doc.get('top_investors', ''), 'fully_updated_date_export': doc.get('fully_updated_date', ''), } # Doc types: deal, location, contract and data_source for form in ChangeDealView.FORMS: formset_name = hasattr(form, "form") and form.Meta.name or None form = formset_name and form.form or form properties.update(form.export(doc, formset=formset_name)) properties.update( ExportInvestorForm.export(doc, prefix='operational_company_')) return properties def get_investor_documents(self, investor, doc_type='investor'): docs = [] # Doc types: involvement and investor if doc_type == 'involvement': ivis = InvestorVentureInvolvement.objects.filter( Q(fk_venture=investor) | Q(fk_investor=investor)) for ivi in ivis: doc = {} for field in ivi._meta.local_fields: if isinstance(field, ForeignKey): doc[field.name] = getattr(ivi, '%s_id' % field.name) else: doc[field.name] = getattr(ivi, field.name) docs.append(doc) elif doc_type == 'investor': doc = {} for field in investor._meta.local_fields: if isinstance(field, ForeignKey): doc[field.name] = getattr(investor, '%s_id' % field.name) else: doc[field.name] = getattr(investor, field.name) docs.append(doc) # Update docs with export values for doc in docs: doc.update(self.get_export_properties(doc, doc_type=doc_type)) return docs def refresh_index(self): self.conn.refresh(self.index_name) def search(self, elasticsearch_query, doc_type='deal', sort=[]): """ Executes paginated queries until all results have been retrieved. @return: The full list of hits. 
""" start = 0 size = 10000 # 10000 is the default elasticsearch max_window_size (pagination is cheap, so more is not necessarily better) raw_result_list = [] done = False while not done: query = { 'query': elasticsearch_query, 'from': start, 'size': size, } if sort: query['sort'] = sort query_result = self.conn.search(query, index=self.index_name, doc_type=doc_type) raw_result_list.extend(query_result['hits']['hits']) results_total = query_result['hits']['total'] if len(raw_result_list) >= results_total: done = True else: start = len(raw_result_list) print('\nElasticsearch returned %i documents from a total of %i \n\n' % (len(raw_result_list), query_result['hits']['total'])) return raw_result_list def delete_activity(self, activity): for doc_type in DOC_TYPES_ACTIVITY: try: if doc_type == 'deal': self.conn.delete(id=activity.activity_identifier, index=self.index_name, doc_type=doc_type) else: self.conn.delete_by_query(query={ "parent_id": { "type": "deal", "id": str(activity.activity_identifier), } }, index=self.index_name, doc_type=doc_type) except ElasticHttpNotFoundError as e: pass def get_deals_by_activity_identifier(self, activity_identifier, doc_type='deal'): return self.search({ "constant_score": { "filter": { "term": { "activity_identifier": activity_identifier } } } })
class ESWrapper(BaseDB): def __init__(self, index_name, doc_type, host='http://localhost', port=9200): self.eserver = ElasticSearch(urls=host, port=port, timeout=60, max_retries=3) #self._base_query = {"query": {"bool": {"must": {"match": {}}}}} #self._base_query = {"query": {"bool": {}}} self._geo_filter = {"distance": "20km", "coordinates": {}} self._population_filter = {'population': {'gte': 5000}} self._index = index_name self._doctype = doc_type def getByid(self, geonameId): maincondition = {"match": {"id": geonameId}} q = {"query": {"bool": {"must": maincondition}}} return self.eserver.search( q, index=self._index, doc_type=self._doctype)['hits']['hits'][0]['_source'] def _query(self, qkey, **kwargs): q = {"query": {"bool": {}}} query_name = "should" q["query"]["bool"]["minimum_number_should_match"] = 1 kwargs.pop("qtype", "") placetokens = [ l.strip() for l in tokenizer.split(qkey) if l and l not in STOP_WORDS and l[-1] != '.' ] if placetokens: reduced_placename = u" ".join(placetokens[0:]) if len(placetokens[0]) < 3 and len( placetokens) > 1 and 3.0 / len(placetokens) >= .5: reduced_placename = u" ".join(placetokens[1:]) else: reduced_placename = qkey # print "qkey", qkey, "reduced", reduced_placename maincondition = [ { "bool": { "must": [{ "multi_match": { "query": qkey, "fields": ["name.raw^5", "asciiname^5", "alternatenames"], "operator": "and" } }, { "terms": { "featureClass": ["a", "p"] } }], } }, { "term": { "name.raw": { "value": qkey } } }, { "term": { "asciiname.raw": { "value": qkey } } }, { "term": { "normalized_asciiname": { "value": qkey } } }, # {"term": {"alternatenames": {"value": qkey[1:]}}}, { "term": { "alternatenames": { "value": qkey } } }, # {"multi_match": {"query": reduced_placename if 'fuzzy' in kwargs else unicode(unidecode(reduced_placename)), { "multi_match": { "query": reduced_placename if 'fuzzy' in kwargs else unicode( unidecode(reduced_placename)), 'fuzziness': kwargs.pop("fuzzy", 0), "max_expansions": kwargs.pop("max_expansion", 10), "prefix_length": kwargs.pop("prefix_length", 1), 'operator': kwargs.pop("operator", "and"), "fields": [ "name^3", "asciiname^3", "alternatenames", "normalized_asciiname^3" ] } } ] q["query"]["bool"][query_name] = maincondition if kwargs: filter_cond = [] if 'min_popln' in kwargs: popln = kwargs.pop("min_popln") if popln is not None: filter_cond.append( {"range": { "population": { "gte": popln } }}) for key, val in kwargs.viewitems(): if not isinstance(val, basestring): val = list([(v) for v in val]) filter_cond.append({"terms": {key: val}}) else: filter_cond.append({"term": {key: (val)}}) q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}} q['from'] = 0 q['size'] = 50 return self.eserver.search(q, index=self._index, doc_type=self._doctype) def query(self, qkey, min_popln=None, **kwargs): #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits'] res = self._query(qkey, min_popln=min_popln, **kwargs)['hits'] #max_score = sum([r['_score'] for r in res]) max_score = res['max_score'] #sum([r['_score'] for r in res]) #for t in res: # print(max_score) gps = [] if max_score == 0.0: ## no results were obtained by elasticsearch instead it returned a random/very ## low scoring one res['hits'] = [] for t in res['hits']: t['_source']['geonameid'] = t["_source"]["id"] #t['_source']['_score'] = t[1] / max_score t['_source']['_score'] = t['_score'] / max_score pt = GeoPoint(**t["_source"]) if t['_source']['featureCode'].lower() == "cont": gps = [pt] break gps.append(pt) if len(gps) == 1: gps[0]._score = 
(min(float(len(gps[0].name)), float(len(qkey))) / max(float(len(gps[0].name)), float(len(qkey)))) return gps def _oldquery(self, qkey, qtype="exact", analyzer=None, min_popln=None, size=10, **kwargs): """ qtype values are exact, relaxed or geo_distance Always limit results to 10 """ q = {"query": {"bool": {}}} query_name = kwargs.pop('query_name', 'must') query_name = "should" if query_name == "should": q["query"]["bool"]["minimum_number_should_match"] = 1 maincondition = {} if qtype == "exact": maincondition = [{ "term": { "name.raw": { "value": qkey } } }, { "term": { "asciiname.raw": { "value": qkey } } }, { "term": { "alternatenames": { "value": qkey } } }] if analyzer: maincondition["match"]["name.raw"]["analyzer"] = analyzer elif qtype == "relaxed": maincondition["match"] = {"alternatenames": {"query": qkey}} if analyzer: maincondition["match"]["alternatenames"]["analyzer"] = analyzer #q["query"]["bool"][query_name]["match"].pop("name.raw", "") elif qtype == "combined": maincondition = [{ "bool": { "must": { "multi_match": { "query": qkey, "fields": ["name.raw", "asciiname", "alternatenames"] } }, "filter": { "bool": { "should": [{ "range": { "population": { "gte": 5000 } } }, { "terms": { "featureCode": [ "pcla", "pcli", "cont", "rgn", "admd", "adm1", "adm2" ] } }] } } } }, { "term": { "name.raw": { "value": qkey } } }, { "term": { "asciiname.raw": { "value": qkey } } }, { "term": { "alternatenames": { "value": qkey[1:] } } }, { "match": { "alternatenames": { "query": qkey, 'fuzziness': kwargs.pop("fuzzy", 0), "max_expansions": kwargs.pop("max_expansion", 5), "prefix_length": kwargs.pop("prefix_length", 1) } } }] if maincondition: q["query"]["bool"][query_name] = maincondition if min_popln: filter_cond = [{"range": {"population": {"gte": min_popln}}}] else: filter_cond = [] if kwargs: #filter_cond = [{"range": {"population": {"gte": min_popln}}}] filter_cond += [{ "term": { key: val } } for key, val in kwargs.viewitems()] # print(filter_cond) q["query"]["bool"]["filter"] = {"bool": {"must": filter_cond}} elif min_popln: filter_cond = [{ "range": { "population": { "gte": min_popln } } }, { "terms": { "featureCode": ["ppla", "pplx"] } }] q["query"]["bool"]["filter"] = { "bool": { "should": filter_cond } } return self.eserver.search(q, index=self._index, doc_type=self._doctype) def oldquery(self, qkey, min_popln=None, **kwargs): #res = self._query(qkey, min_popln=min_popln, **kwargs)['hits']['hits'] res = self._query(qkey, min_popln=min_popln, **kwargs)['hits'] #max_score = sum([r['_score'] for r in res]) max_score = res['max_score'] #sum([r['_score'] for r in res]) #for t in res: gps = [] if max_score == 0.0: ## no results were obtained by elasticsearch instead it returned a random/very ## low scoring one res['hits'] = [] for t in res['hits']: t['_source']['geonameid'] = t["_source"]["id"] #t['_source']['_score'] = t[1] / max_score t['_source']['_score'] = t['_score'] / max_score pt = GeoPoint(**t["_source"]) if t['_source']['featureCode'].lower() == "cont": gps = [pt] break gps.append(pt) if len(gps) == 1: gps[0]._score = (min(float(len(gps[0].name)), float(len(qkey))) / max(float(len(gps[0].name)), float(len(qkey)))) return gps def near_geo(self, geo_point, min_popln=5000, **kwargs): q2 = { "query": { "bool": { "must": { "match_all": {} }, "filter": [ { "geo_distance": { "distance": "30km", "coordinates": geo_point } }, { "terms": # {"featureCode": # ["pcli", "ppl", "ppla2", "adm3"]} { "featureClass": ["a", "h", "l", "t", "p", "v"] } } ] } }, "sort": { "population": "desc" } } if 
kwargs: for key in kwargs: q2['query']['bool']['filter'].append( {"term": { key: kwargs[key] }}) res = self.eserver.search( q2, index=self._index, doc_type=self._doctype)['hits']['hits'][0]['_source'] res['confidence'] = 1.0 return [GeoPoint(**res)] def create(self, datacsv, confDir="../data/"): with open(os.path.join(confDir, "es_settings.json")) as jf: settings = json.load(jf) settings['mappings'][self._doctype] = settings['mappings'].pop( 'places') try: self.eserver.create_index(index=self._index, settings=settings) except: self.eserver.delete_index(self._index) self.eserver.create_index(index=self._index, settings=settings) for chunk in bulk_chunks(self._opLoader(datacsv, confDir), docs_per_chunk=1000): self.eserver.bulk(chunk, index=self._index, doc_type=self._doctype) print "..", self.eserver.refresh(self._index) def _opLoader(self, datacsv, confDir): ere = re.compile("[^\sa-zA-Z0-9]") with DataReader(datacsv, os.path.join(confDir, 'geonames.conf')) as reader: cnt = 0 for row in reader: try: row['coordinates'] = [ float(row['longitude']), float(row['latitude']) ] try: row['population'] = int(row["population"]) except: row['population'] = -1 try: row['elevation'] = int(row['elevation']) except: row['elevation'] = -1 del (row['latitude']) del (row['longitude']) #print row['name'] row['alternatenames'] = row['alternatenames'].lower( ).split(",") row['normalized_asciiname'] = (re.sub( r'\s+', r' ', ere.sub("", row['asciiname']))).strip() cnt += 1 yield self.eserver.index_op(row, index=self._index, doc_type=self._doctype) except: print json.dumps(row) continue def remove_dynamic_stopwords(self, term): # cc = {} # ttl = 0 words = [w for t in term.split("-") for w in t.split() if len(w) > 1] if len(words) == 1: return term stopword_removed = "" for word in words: try: t = self.eserver.count(word)['count'] if t >= 20000: continue except: pass stopword_removed += (word + " ") # else: # print(term, "stopword ", word) return stopword_removed.strip()
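# A compact sketch of the geo_distance lookup that near_geo() above builds,
# assuming the same pyelasticsearch client and a geonames-style index whose
# documents carry a "coordinates" geo_point field and a "population" field.
# The index/doc_type names in the usage comment are placeholders.
from pyelasticsearch import ElasticSearch

def nearest_place(es, index, doc_type, lon, lat, distance='30km'):
    """Return the most populous place within `distance` of (lon, lat), if any."""
    query = {
        'query': {
            'bool': {
                'must': {'match_all': {}},
                'filter': [
                    {'geo_distance': {'distance': distance,
                                      'coordinates': [lon, lat]}},
                ],
            }
        },
        'sort': {'population': 'desc'},
        'size': 1,
    }
    hits = es.search(query, index=index, doc_type=doc_type)['hits']['hits']
    return hits[0]['_source'] if hits else None

# es = ElasticSearch('http://localhost:9200/')
# print(nearest_place(es, 'geonames', 'places', -73.98, 40.75))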
def import_json_into_es(types, inputfolder, logger):
    """
    Imports entities from the *name.json.bz2* files (one entity per line) into the local Elasticsearch.

    :param types: dict (parsed from JSON) like {'person': 'http://www.wikidata.org/entity/Q5', 'name': 'Wikidata-URI'}
    :param inputfolder: folder containing the .json.bz2 dumps
    :param logger: logger instance
    :return:
    """
    es = ElasticSearch(config.ELASTICSEARCH_URL)

    # Drop any existing index, then (re)create it. create_index() is kept outside
    # the try block so the index is still built when there was nothing to delete.
    try:
        es.delete_index('wikidata')
    except:
        logger.warning("can't delete wikidata index (it may not exist yet)")
    es.create_index('wikidata')
    logger.info('rebuilt index [wikidata]')

    # convert type dictionary: map numeric Wikidata ids to type name and dump file
    wd_types = dict()
    for key in types.keys():
        value = int(types[key].split('/')[-1][1:])
        wd_types[value] = {
            'type': key,
            'filename': path.join(inputfolder, '{}.json.bz2'.format(key))
        }

    # import each given type
    for key in wd_types:
        logger.info(wd_types[key])

        done = 0
        items = []

        for line in BZ2File(wd_types[key]['filename'], 'rb'):
            line = line.strip()
            item = loads(line)
            item['uri'] = 'http://wikidata.org/wiki/' + item['id']
            items.append(item)
            done += 1

            # flush a batch of 5000 documents to the bulk API
            if done % 5000 == 0:
                es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')
                items = []

            # if done % len(wd_types) / 10 == 0:  # log 10% steps
            #     logger.info('imported {}: {:,d} ({:,d})'.format(wd_types[key]['type'], done, 100*len(wd_types)/done))

            if done % 10000 == 0:
                logger.info('imported {}: {}'.format(wd_types[key]['type'], format(done, ',d')))

        # index the remaining documents of the last, partial batch
        if len(items) > 0:
            es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')

        logger.info('imported {}: {}'.format(wd_types[key]['type'], format(done, ',d')))
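# The flush-every-5000 pattern above can also be written with pyelasticsearch's
# bulk_chunks()/index_op() helpers, the same idiom ESWrapper.create() uses
# further up. A minimal sketch, assuming an `es` client and an iterable of dicts
# that each carry an 'id' field; the index and doc_type names are placeholders.
from pyelasticsearch import ElasticSearch, bulk_chunks

def bulk_import(es, docs, index='wikidata', doc_type='item'):
    """Index `docs` in batches of 1000 documents via the bulk API."""
    ops = (es.index_op(doc, id=doc['id']) for doc in docs)
    for chunk in bulk_chunks(ops, docs_per_chunk=1000):
        es.bulk(chunk, index=index, doc_type=doc_type)
    es.refresh(index)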
# assumed imports for this snippet: pandas and the pyelasticsearch client
import pandas as pd
from pyelasticsearch import ElasticSearch

# size of the bulk
chunksize = 5000

# open csv file (raw_data_path and csv_filename are assumed to be defined earlier)
f = open(raw_data_path + csv_filename)

# parse csv with pandas, reading it lazily in chunks
csvfile = pd.read_csv(f, iterator=True, chunksize=chunksize)

# init ElasticSearch
es = ElasticSearch('http://localhost:9200/')

# init index: drop it if it already exists, then create it
try:
    es.delete_index("weiboscope")
except:
    pass
es.create_index("weiboscope")

# start bulk indexing
print("now indexing %s..." % (csv_filename))
for i, df in enumerate(csvfile):
    print(i)
    # replace NaN with None and convert each row to a dict
    records = df.where(pd.notnull(df), None).T.to_dict()
    list_records = [records[it] for it in records]
    try:
        es.bulk_index("weiboscope", "tweet", list_records)
    except:
        # the original snippet breaks off here; skipping chunks that fail to index is assumed
        pass
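# A tiny worked example of the DataFrame-to-records conversion used above:
# .where(pd.notnull(df), None) turns NaN into None (a valid JSON null), and
# transposing before .to_dict() yields one dict per CSV row. Purely
# illustrative; the column names are made up.
import pandas as pd

df = pd.DataFrame({'mid': ['m1', 'm2'], 'text': ['hello', None]})
records = df.where(pd.notnull(df), None).T.to_dict()
list_records = [records[it] for it in records]
# -> [{'mid': 'm1', 'text': 'hello'}, {'mid': 'm2', 'text': None}]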
class TestClient(unittest.TestCase): def setUp(self): super(TestClient, self).setUp() docs = [] self.es_host = os.environ.get('ES_HOST', 'http://*****:*****@mock.patch('monolith.client.util.iterweeks') def test_datetime_ranges(self, _mock): "Test datetime ranges get converted to dates." client = self._make_one() start = datetime.datetime(2012, 1, 1, 12, 34, 56) end = datetime.datetime(2012, 1, 31, 12, 34, 56) list(client('downloads_count', start, end, interval='week')) self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1)) assert not isinstance(_mock.call_args[0][0], datetime.datetime) self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31)) assert not isinstance(_mock.call_args[0][1], datetime.datetime) def test_date_order(self): # Ensure fill doesn't change date ordering. client = self._make_one() prev_date = datetime.date(2000, 1, 1) # Addon 1 doesn't have downloads for every month and the client will # fill zeroes for the missing dates. hits = list(client('downloads_count', START, '2012-05-01', interval='month', add_on='1')) for hit in hits: d = hit['date'] assert prev_date < d prev_date = d
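# For reference, a minimal self-contained sketch of the mock.patch pattern used
# in test_datetime_ranges above: the decorator injects the patched callable as
# an extra test argument, and call_args exposes the arguments of the last call.
# The Util class here is a stand-in, not part of monolith.client.
import datetime
import unittest
import mock  # standalone mock package; unittest.mock on Python 3

class Util(object):
    @staticmethod
    def iterweeks(start, end):
        return []

class TestPatchExample(unittest.TestCase):

    @mock.patch.object(Util, 'iterweeks')
    def test_receives_dates(self, _mock):
        Util.iterweeks(datetime.date(2012, 1, 1), datetime.date(2012, 1, 31))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))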