def delete_post():
    # "POST /delete/post" -> renders "deleted.html"
    title = get_title('Deletion complete')
    isbn10 = request.form['isbn10']  # ISBN-10 code of the book to delete
    es = Elasticsearch('elasticsearch')
    book_title = es.get_source(index='book', id=isbn10)['title']  # title of the book to delete
    es.delete(index='book', id=isbn10)  # remove the book from the 'book' index
    logger.debug('Successfully deleted book (ISBN-10: {})'.format(isbn10))
    es.indices.refresh(index='book')  # refresh the 'book' index so the deleted book cannot leak into the D2V retraining below
    es.close()
    # Rebuild the Doc2Vec model on every deletion so the deleted book is no longer recommended
    global d2v
    d2v = Doc2VecWrapper(model_path=Path('/projects/model/d2v.model'), initialize=True)
    d2v.train()
    return render_template('deleted.html',
                           shishosan=config['shishosan'],
                           title=title,
                           isbn10=isbn10,
                           book_title=book_title)
def test_add_messages(self):
    """
    Test adding a thread that has messages associated with it.

    Adding a thread that has messages associated with it should also add
    those messages to the search index.
    """
    thread = create_thread()
    message = create_message(thread=thread)
    self.backend.add(thread)
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    source = es.get_source(
        index=self.backend.index, doc_type='message', id=message.pk)
    source_json = json.dumps(source)
    expected = {
        'body': message.body,
    }
    expected_json = json.dumps(expected)
    self.assertJSONEqual(expected_json, source_json)
def history():
    # "GET /history" -> renders "history.html"
    title = get_title('Browsing history')
    user_history = get_user_history(user=current_user)
    # Show at most 30 books in the browsing history
    history_max_size, unique_user_history, bIds_set = 30, [], set()
    # Fetch newest first (timestamp descending) -> drop duplicate entries
    es = Elasticsearch('elasticsearch')
    for lId, log in sorted(user_history.items(), reverse=True):
        if len(unique_user_history) == history_max_size:
            break
        if log['bId'] in bIds_set:  # book viewed again later in the timeline -> skip
            continue
        bIds_set.add(log['bId'])
        unique_user_history.append(log)
        unique_user_history[-1]['book'] = es.get_source(
            index='book', id=log['bId'])  # fetch book details
    es.close()
    # No books viewed yet -> None
    if len(user_history) == 0:
        unique_user_history = None
    return render_template('history.html',
                           shishosan=config['shishosan'],
                           title=title,
                           user_history=unique_user_history)
def book(isbn10=None):
    # "GET /search/<isbn10>" -> renders "book.html"
    # Similar books: non-personalized recommendations (Doc2Vec)
    # Recommended books: personalized recommendations (proposed SBRS)
    title = get_title('Book: {0}'.format(isbn10))
    es = Elasticsearch('elasticsearch')
    book = es.get_source(index='book', id=isbn10)  # fetch from the 'book' index
    n_book = es.count(index='book')['count']  # total number of books
    if n_book >= 10:  # 10 books or more -> D2V model already built -> non-personalized recommendation (fetch similar books)
        try:
            sim_books_isbn10 = d2v.get_similar_books(
                isbn10=isbn10, topn=6, verbose=False)  # ISBN-10 codes of similar books
            sim_books = [
                es.get_source(index='book', id=sb[0])
                for sb in sim_books_isbn10
            ]  # fetch details of the similar books
        except KeyError:  # embedding not built yet (model being rebuilt) -> cancel non-personalized recommendation
            sim_books = None
    else:
        sim_books = None
    log = record_history(user=current_user, bId=isbn10)  # record the book view
    rec_books_isbn10 = prop_sbrs.update(log=log)  # ISBN-10 codes of recommended books
    # No recommendations (the proposed SBRS lacked the information to generate any) -> show similar books only
    if rec_books_isbn10 is None:
        rec_books = None
    else:
        # fetch the book details for each ISBN-10
        rec_books = [
            es.get_source(index='book', id=isbn10)
            for isbn10 in rec_books_isbn10
        ]
    es.close()
    return render_template('book.html',
                           shishosan=config['shishosan'],
                           title=title,
                           isbn10=isbn10,
                           book=book,
                           sim_books=sim_books,
                           rec_books=rec_books)
def test_remove(self): """ Test removing an object from the search index. Removing an object from the search index should make it inaccessible to elasticsearch. """ thread = create_thread() self.backend.add(thread) self.backend.remove(thread) es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) with self.assertRaises(NotFoundError): es.get_source( index=self.backend.index, doc_type='thread', id=thread.pk)
def test_remove_message(self):
    """
    Test removing a thread with messages.

    If a thread has messages associated with it, those messages should be
    removed from the search backend when the thread instance is removed.
    """
    thread = create_thread()
    message = create_message(thread=thread)
    self.backend.add(thread)
    self.backend.remove(thread)
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    with self.assertRaises(NotFoundError):
        es.get_source(
            index=self.backend.index, doc_type='message', id=message.pk)
def test_update_old_threads(self): """ Test updating the index with old threads. If there was a thread that was previously in the index and has since been deleted, then it should be removed from the index. """ thread = create_thread() thread_pk = thread.pk backend = ElasticSearch() backend.add(thread) thread.delete() call_command('updateindex', stdout=self.out) es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) with self.assertRaises(NotFoundError): es.get_source( index='test', doc_type='thread', id=thread_pk)
def get_title_from_isbn10(self, isbn10: str) -> str:
    """Fetch a book title by ISBN-10 (via Elasticsearch).

    Args:
        isbn10 (str): ISBN-10 code

    Returns:
        str: book title
    """
    es = Elasticsearch('elasticsearch')
    title = es.get_source(index="book", id=isbn10)['title']
    es.close()
    return title
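# A minimal companion sketch (assumptions: the same 'book' index and an Elasticsearch
# node reachable at the hostname 'elasticsearch' as above; get_title_or_none is a
# hypothetical helper, not part of the original code). get_source raises NotFoundError
# for unknown ids, so a caller that cannot guarantee the ISBN exists may prefer a
# lookup that degrades to None instead of raising.
from typing import Optional

from elasticsearch import Elasticsearch, NotFoundError


def get_title_or_none(isbn10: str) -> Optional[str]:
    """Like get_title_from_isbn10, but returns None for unknown ISBN-10 codes."""
    es = Elasticsearch('elasticsearch')
    try:
        return es.get_source(index='book', id=isbn10)['title']
    except NotFoundError:
        return None
    finally:
        es.close()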
class ElasticsearchBackend: def __init__(self): self.client = Elasticsearch([ELASTIC_HOST], http_auth=ELASTIC_AUTH) self.index = ELASTIC_CACHE_INDEX def get(self, id_): try: return self.client.get_source(index=self.index, id=id_) except NotFoundError: return def set(self, id_, body): body['created'] = datetime.now().isoformat() return self.client.index(index=self.index, id=id_, body=body)
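# Hypothetical usage sketch for the cache backend above (ELASTIC_HOST, ELASTIC_AUTH
# and ELASTIC_CACHE_INDEX are assumed to come from the surrounding module's settings;
# the id and payload below are invented for illustration):
backend = ElasticsearchBackend()
if backend.get('report-42') is None:  # miss: get() returns None via NotFoundError
    backend.set('report-42', {'payload': {'rows': 10}})
cached = backend.get('report-42')  # hit: the source dict, including the 'created' timestamp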
class ResultDB(BaseResultDB):
    collection_prefix = ''

    def __init__(self, url, database='resultdb'):
        self.conn = Elasticsearch()
        self.database = database

    def _parse(self, data):
        return data["_source"]

    def _stringify(self, data):
        if 'result' in data:
            data['result'] = json.dumps(data['result'])
        return data

    def save(self, project, taskid, url, result):
        obj = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self.conn.index(index=self.database, doc_type=project,
                               id=taskid, body=obj)

    def select(self, project, fields=None, offset=0, limit=0):
        if limit == 0:
            limit = 10
        items = self.conn.search(index=self.database, doc_type=project,
                                 fields=fields, _source=True,
                                 from_=offset, size=limit)
        return [self._parse(item) for item in items["hits"]["hits"]]

    def count(self, project):
        r = self.conn.count(index=self.database, doc_type=project)
        return r['count']

    def get(self, project, taskid, fields=None):
        return self.conn.get_source(index=self.database, doc_type=project,
                                    id=taskid)
def test_add(self): """ Test adding an object to the search index. Adding an object to the search index should make it searchable by elasticsearch. """ thread = create_thread() self.backend.add(thread) es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) source = es.get_source( index=self.backend.index, doc_type='thread', id=thread.pk) source_json = json.dumps(source) expected = { 'title': thread.title, } expected_json = json.dumps(expected) self.assertJSONEqual(expected_json, source_json)
def delete_inquire():
    # "GET /delete" -> renders "delete.html"
    if request.args.get('isbn10') and current_user.uId not in guest_uIds_set:
        # access came via the delete button on the book page
        title = get_title('Confirm deletion')
        isbn10 = request.args['isbn10']  # ISBN-10 code of the book to delete
        # fetch details of the book being queried for deletion
        es = Elasticsearch('elasticsearch')
        book = es.get_source(index='book', id=isbn10)
        es.close()
        return render_template('delete.html',
                               shishosan=config['shishosan'],
                               title=title,
                               isbn10=isbn10,
                               book=book)
    else:
        # access did not come via the delete button
        title = get_title('Deletion not allowed')
        return render_template('delete.html',
                               shishosan=config['shishosan'],
                               title=title)
def test_update(self): """ Test updating search index with threads. Updating the index should add all existing threads to the index. """ thread = create_thread() call_command('updateindex', stdout=self.out) es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) source = es.get_source( index='test', doc_type='thread', id=thread.pk) source_json = json.dumps(source) expected = { 'title': thread.title, } expected_json = json.dumps(expected) self.assertJSONEqual(expected_json, source_json) self.assertIn("Updated 1 thread(s)", self.out.getvalue())
    verify_certs=True)

index = 'test-index'
doc_type = 'test-type'
doc_id = '1'  # utils.get_id()
body = {'id': '1', 'name': 'wxnaawefaefawcy', 'pwd': 'www'}

# data = es.cluster.health(wait_for_status='yellow', request_timeout=1)
# Create an index
# data = es.indices.create(index=index)
# Create an index, ignoring certain errors
# data = es.indices.create(index='test-index', ignore=400)
# Create a document
# data = es.create(index=index, doc_type=doc_type, id=doc_id, body=body)
# Delete a document
# data = es.delete(index=index, doc_type=doc_type, id='1482204285.41')
# Check whether a document exists -> returns a boolean
# data = es.exists(index=index, doc_type=doc_type, id='1482204824.82')
# Get a document
# data = es.get(index=index, doc_type=doc_type, id='1482204824.82')
# Get a document's source
data = es.get_source(index=index, doc_type=doc_type, id='1')
# Search all documents
# data = es.search(index='test-index')
# data = es.msearch(index=index, doc_type=doc_type, 'name')
print(data)
print(data is None)
print(es.info())
class APIDatabase:
    def __init__(self, elastic_index='address-book', *args, **kwargs):
        # calls Elasticsearch() to connect to the database and creates the
        # index for the address book if needed
        import json
        from elasticsearch import Elasticsearch
        from elasticsearch import exceptions as es_exceptions

        # hold on to the exceptions so they can be recognized in the
        # try...except blocks later
        self.es_exceptions = es_exceptions

        # host and port information for Elasticsearch() is in a separate json file
        try:
            with open('./elastic_host_config.json') as f:
                elastic_host_info = json.load(f)
        except FileNotFoundError:
            elastic_host_info = {'host': 'localhost', 'port': 9200}

        self.database = Elasticsearch(elastic_host_info, *args, **kwargs)
        self.elastic_index = elastic_index

        # ensure the Elasticsearch index exists
        self.database.indices.create(index=elastic_index, ignore=400)

    def get_contact_by_query(self, page_size, page_num, query_string):
        # searches the data store using query_string and returns page_size
        # entries starting on page page_num
        if page_size < 0:
            return {
                'error': 'pageSize must be a nonnegative integer',
                'status': 400
            }
        elif page_num < 0:
            return {
                'error': 'page must be a nonnegative integer',
                'status': 400
            }
        try:
            result = self.database.search(index=self.elastic_index,
                                          from_=page_num,
                                          q=query_string,
                                          size=page_size)
            return [
                contact['_source']['doc']
                for contact in result['hits']['hits']
            ]
        except self.es_exceptions.RequestError as err:
            return {
                'error': err.info['error']['root_cause'][0]['reason'],
                'status': err.status_code
            }

    def get_contact_by_name(self, name):
        # returns the contact with the given name
        try:
            return self.database.get_source(index=self.elastic_index,
                                            id=name)['doc']
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}

    def create_contact(self, contact_details):
        # creates a contact with the given contact_details (which includes a name)
        try:
            self.database.create(index=self.elastic_index,
                                 id=contact_details['name'],
                                 body={'doc': contact_details})
            return {'message': 'created', 'status': 200}
        except self.es_exceptions.ConflictError:
            return {'error': 'contact already exists', 'status': 409}

    def update_contact(self, name, contact_details):
        # updates a contact with the new contact_details
        try:
            result = self.database.update(
                index=self.elastic_index,
                id=name,
                body={'doc': {
                    'doc': contact_details
                }})
            return {'message': result['result'], 'status': 200}
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}

    def delete_contact(self, name):
        # deletes the contact with the given name
        try:
            self.database.delete(index=self.elastic_index, id=name)
            return {'message': 'deleted', 'status': 200}
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}
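# Illustrative driver for APIDatabase (a sketch; the contact data and the Lucene
# query string are invented, and the status codes mirror the dicts returned above):
db = APIDatabase(elastic_index='address-book')
db.create_contact({'name': 'Ada Lovelace', 'phone': '555-0100'})  # {'message': 'created', 'status': 200}
db.get_contact_by_name('Ada Lovelace')  # the stored contact dict, or a 404 dict
db.get_contact_by_query(page_size=10, page_num=0, query_string='doc.name:Ada*')
db.delete_contact('Ada Lovelace')  # {'message': 'deleted', 'status': 200}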
class ElasticSearchWrapper(object): SOURCE = "_source" HITS = "hits" TEXT = "text" PROPERTIES = "properties" MAPPING = "mappings" def __init__(self, configurations: ElasticSearchConfigurations): self.configurations = configurations self.es = Elasticsearch(hosts=[{ 'host': self.configurations.host, 'port': self.configurations.port }], ) def create(self, body: dict, documentId): ans = self.es.create(index=self.configurations.index, doc_type=self.configurations.docType, body=body, id=documentId) return ans def exists(self, documentId): return self.es.exists(index=self.configurations.index, doc_type=self.configurations.docType, id=documentId) def get(self, documentId): try: return self.es.get_source(index=self.configurations.index, doc_type=self.configurations.docType, id=documentId) except ElasticSearchNotFoundError as e: raise e @classmethod def constructPrefixFieldQuery(cls, fields: List[str], prefix: str): prefixes = { "query": { "bool": { "should": [{ "prefix": { field: prefix, } } for field in fields] } } } return prefixes def getByPrefix(self, prefix: str): try: textFields = self.getTextFields() ans = self.es.search(index=self.configurations.index, doc_type=self.configurations.docType, body=self.constructPrefixFieldQuery( fields=textFields, prefix=prefix)) return [elm[self.SOURCE] for elm in ans[self.HITS][self.HITS]] except ElasticSearchNotFoundError as e: raise e def getTextFields(self): mapping = self.getMapping() return [ k for k, v in mapping[self.configurations.index][self.MAPPING][ self.configurations.docType][self.PROPERTIES].items() if v["type"] == "text" ] def getMapping(self) -> dict: return self.es.indices.get_mapping( index=self.configurations.index, doc_type=self.configurations.docType)
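# For reference, constructPrefixFieldQuery(['title', 'body'], 'ela') produces the
# following bool/should prefix query (the field names here are illustrative; a
# match on either text field counts as a hit):
# {
#     "query": {
#         "bool": {
#             "should": [
#                 {"prefix": {"title": "ela"}},
#                 {"prefix": {"body": "ela"}}
#             ]
#         }
#     }
# }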
class ElasticDataManager(object):
    """
    The transaction machinery calls this data manager in the following order:

    abort      - Called if needed (e.g. when a previous data manager aborted),
                 before this data manager's two-phase commit even begins.
    tpc_begin  - Prepare for the transaction.
    commit     - A dry commit: check for potential errors before committing.
    tpc_vote   - After the commit, vote and tell the transaction manager
                 whether this manager is fine to go or not.
    tpc_finish - Final commit; no turning back after this.
    tpc_abort  - If this manager voted no, this function is called for cleanup.
    """
    transaction_manager = transaction.manager

    def __init__(self):
        self._resources = []
        self.current = 0
        self._connection = None

    def connect(self, settings, default_index="", auto_create_index=False):
        """ Establish an elasticsearch connection """
        eshosts = settings['elasticsearch_hosts']
        self._connection = Elasticsearch(
            eshosts,
            # sniff before doing anything
            sniff_on_start=True,
            # refresh nodes after a node fails to respond
            sniff_on_connection_fail=True,
            # and also every 60 seconds
            sniffer_timeout=60,
            # request timeout
            timeout=30)
        self.default_index = default_index
        # applicable for 6.0
        self.auto_create_index = auto_create_index
        self.versions = self._getVersion()
        self.isVersion6 = self._isVersion6_in_cluster()

    @property
    def connection(self):
        """ Get the existing established elasticsearch connection """
        return self._connection

    def get_connection(self):
        return self._connection

    def refresh(self, index="_all"):
        self._connection.indices.refresh(index)

    def add(self, item):
        """ Add a document to the elasticsearch index.

        Required in the item dictionary:
        _id = ID of the document to be saved
        _type = Type of the document
        _source = Source/body to be saved
        _index (optional) = Optional if default_index is set

        This will be committed during the transaction process.
        """
        log = logging.getLogger(__name__)
        log.info("Adding elasticsearch item")
        if (len(self._resources) == 0):
            log.info("Joining transaction")
            self.transaction_manager.get().join(self)
        item['_op'] = "add"
        item['processed'] = False
        item['_index'] = self._get_index(item)
        item['index_created'] = False
        self._check_type(item)
        self._check_id(item)
        self._resources.append(item)

    def remove(self, item, check_existence=False):
        """ Remove a document from the elasticsearch index.

        Required in the item dictionary:
        _id = ID of the document to be removed
        _type = Type of the document
        _index (optional) = Optional if default_index is set

        This will be committed during the transaction process.
        """
        log = logging.getLogger(__name__)
        log.info("Removing elasticsearch item")
        item['_op'] = "remove"
        item['processed'] = False
        item['_index'] = self._get_index(item)
        item['index_created'] = False
        self._check_type(item)
        self._check_id(item)
        if check_existence and not self._check_if_exists(item):
            return
        if (len(self._resources) == 0):
            log.info("Joining transaction")
            self.transaction_manager.get().join(self)
        self._resources.append(item)

    def update(self, item, check_existence=False):
        """ Update a document already present in the elasticsearch index.

        Required in the item dictionary:
        _id = ID of the document to be updated
        _type = Type of the document
        _index (optional) = Optional if default_index is set
        _source = Partial or full source to be updated

        This will be committed during the transaction process.
        If the document isn't already present in the index, then this will
        be converted to an add request.
        """
        log = logging.getLogger(__name__)
        log.info("Update elasticsearch item")
        item['_op'] = "update"
        item['processed'] = False
        item['_index'] = self._get_index(item)
        item['index_created'] = False
        self._check_type(item)
        self._check_id(item)
        if '_source' not in item:
            raise ElasticSearchParamMissingError(
                "_source data to update missing")
        if check_existence and not self._check_if_exists(item):
            return
        if (len(self._resources) == 0):
            log.info("Joining transaction")
            self.transaction_manager.get().join(self)
        self._resources.append(item)

    def update_by_query(self, item):
        """ Update documents matching a query in the elasticsearch index.

        Required in the item dictionary:
        _type = Type of the document
        _index (optional) = Optional if default_index is set
        _query = Query DSL matching all the documents to update
        _script = Script applying the partial or full update

        This will be committed during the transaction process.
        """
        log = logging.getLogger(__name__)
        log.info("Update elasticsearch item")
        if (len(self._resources) == 0):
            log.info("Joining transaction")
            self.transaction_manager.get().join(self)
        if '_query' not in item:
            raise ElasticSearchParamMissingError("_query input missing")
        if '_script' not in item:
            raise ElasticSearchParamMissingError(
                "_script data to update missing")
        item['_op'] = "update_by_query"
        item['processed'] = False
        item['_index'] = self._get_index(item)
        item['index_created'] = False
        self._check_type(item)
        self._resources.append(item)

    def delete_by_query(self, item):
        """ Delete documents matching a query from the elasticsearch index.

        Required in the item dictionary:
        _type = Type of the document
        _index (optional) = Optional if default_index is set
        _query = Query DSL matching all the documents to delete

        This will be committed during the transaction process.
        """
        log = logging.getLogger(__name__)
        log.info("Delete elasticsearch items by query")
        if (len(self._resources) == 0):
            log.info("Joining transaction")
            self.transaction_manager.get().join(self)
        if '_query' not in item:
            raise ElasticSearchParamMissingError("_query input missing")
        item['_op'] = "delete_by_query"
        item['processed'] = False
        item['_index'] = self._get_index(item)
        item['index_created'] = False
        self._check_type(item)
        self._resources.append(item)

    def _getVersion(self):
        data = self._connection.cluster.stats()
        if 'nodes' in data and 'versions' in data['nodes']:
            return data['nodes']['versions']
        return []

    def _isVersion6_in_cluster(self):
        for version in self.versions:
            split_version = version.split('.')
            if len(split_version) > 0 and split_version[0] == '6':
                return True
        return False

    def _check_if_exists(self, request):
        try:
            self._connection.get_source(index=request['_index'],
                                        doc_type=request['_type'],
                                        id=request['_id'])
        except NotFoundError:
            return False
        return True

    def _get_index(self, item):
        if ('_index' not in item and len(self.default_index) == 0):
            raise ElasticSearchParamMissingError(
                "_index input missing and default index is not set")
        return item['_index'] if '_index' in item else self.default_index

    def _check_type(self, item):
        if '_type' not in item:
            raise ElasticSearchParamMissingError("_type input missing")
        if self.isVersion6 and item['_type'] != 'doc':
            raise ElasticSearchException(
                "custom _type not supported in 6.x. _type should by default be 'doc'"
            )

    def _check_id(self, item):
        if '_id' not in item:
            raise ElasticSearchParamMissingError("_id input missing")

    def _refresh_if_needed(self, last_operation, currentoperation,
                           unique_indices):
        """
        Flush the in-memory buffer to a segment so that records can be
        searched and updated immediately after creation.
        https://www.elastic.co/guide/en/elasticsearch/guide/current/near-real-time.html
        """
        if last_operation == currentoperation or last_operation == "":
            return last_operation, unique_indices
        else:
            self._connection.indices.refresh(list(unique_indices))
            unique_indices.clear()
            return currentoperation, unique_indices

    def _checkAndCreateIndex(self, item):
        if not self._connection.indices.exists(item['_index'],
                                               ignore=[400, 404]):
            # index doesn't exist
            self._connection.indices.create(index=item['_index'],
                                            ignore=[400])
            return True
        return False

    @property
    def savepoint(self):
        """
        Savepoints are only supported when all connections support
        subtransactions.
        """
        return ElasticSavepoint(self)

    def abort(self, transaction):
        """
        Outside of the two-phase commit proper, a transaction can be aborted
        before the commit is even attempted, in case we come across some
        error condition that makes it impossible to commit. The abort method
        is used for aborting a transaction and forgetting all changes, as
        well as ending the participation of a data manager in the current
        transaction.
        """
        log = logging.getLogger(__name__)
        log.info("abort")
        self.uncommitted = {'add': [], 'remove': []}

    def tpc_begin(self, transaction):
        """
        The tpc_begin method is called at the start of the commit to perform
        any necessary steps for saving the data.
        """
        log = logging.getLogger(__name__)
        log.info("tpc_begin")

    def commit(self, transaction):
        """
        Record and back up the existing data, then perform the operation.
        If any of the other transaction managers vote to abort, the data
        backed up during this commit process is restored.
        """
        # This is the step where data managers need to prepare to save the
        # changes and make sure that any conflicts or errors that could occur
        # during the save operation are handled. Changes should be ready but
        # not made permanent, because the transaction could still be aborted
        # if other transaction managers are not able to commit.
        log = logging.getLogger(__name__)
        log.info(__name__)
        log.info("commit")
        unique_indices = set()
        last_operation = ""
        # Commit and keep track of the items that are committed. If we get
        # an abort request, those items are rolled back.
        for item in self._resources:
            last_operation, unique_indices = self._refresh_if_needed(
                last_operation, item['_op'], unique_indices)
            unique_indices.add(item['_index'])
            if item['_op'] == 'add':
                # In version 6 there is no support for types:
                # all documents get their own index
                if self.isVersion6:
                    item['index_created'] = self._checkAndCreateIndex(item)
                self._connection.create(index=item['_index'],
                                        doc_type=item['_type'],
                                        id=item['_id'],
                                        body=item['_source'])
            elif item['_op'] == 'remove':
                if (self._connection.exists(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'])):
                    item['_backup'] = self._connection.get_source(
                        index=item['_index'],
                        doc_type=item['_type'],
                        id=item['_id'])
                    self._connection.delete(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'])
                else:
                    raise ElasticSearchException(
                        "Unable to find " + item['_id'] + " in type " +
                        item['_type'] + " and in index " + item['_index'])
            elif item['_op'] == "update":
                if (self._connection.exists(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'])):
                    item['_backup'] = self._connection.get_source(
                        index=item['_index'],
                        doc_type=item['_type'],
                        id=item['_id'])
                    # Don't fetch the source after the update
                    self._connection.update(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'],
                                            body={'doc': item['_source']},
                                            _source=False)
                else:
                    # The item was not present in the first place,
                    # so treat this as an add
                    self._connection.create(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'],
                                            body=item['_source'])
                    # Move the operation to add. In case of an abort we will
                    # only remove the newly created document.
                    item['_op'] = 'add'
            elif item['_op'] == "update_by_query":
                # back up the documents matching the query
                item['_backup'] = self._connection.search(
                    index=item['_index'],
                    doc_type=item['_type'],
                    body={"query": item['_query']},
                    _source=True)
                # example script:
                # "script": {
                #     "inline": "ctx._source.description = params.description;"
                #               "ctx._source.grp_hash = params.grp_hash",
                #     "params": {
                #         "description": "Srikanth group",
                #         "grp_hash": "3433"
                #     }
                # }
                toupdate = {
                    "script": item['_script'],
                    "query": item['_query'],
                }
                self._connection.update_by_query(index=item['_index'],
                                                 doc_type=item['_type'],
                                                 body=toupdate,
                                                 _source=True)
            elif item['_op'] == "delete_by_query":
                # back up the documents matching the query
                item['_backup'] = self._connection.search(
                    index=item['_index'],
                    doc_type=item['_type'],
                    body={"query": item['_query']},
                    _source=True)
                self._connection.delete_by_query(
                    index=item['_index'],
                    doc_type=item['_type'],
                    body={"query": item['_query']})
            item['processed'] = True

    def tpc_vote(self, transaction):
        """
        The vote is the last chance for a data manager to make sure that the
        data can be saved. The way to vote 'no' is to raise an exception
        here.
        """
        log = logging.getLogger(__name__)
        log.info("tpc_vote")

    def tpc_finish(self, transaction):
        """
        This method is only called if the manager voted 'yes' (no exceptions
        raised) during the voting step. This makes the changes permanent and
        should never fail. Any errors here could leave the database in an
        inconsistent state. In other words, only do things here that are
        guaranteed to work or you may have a serious error on your hands.
        """
        # We are done, clean up
        self._resources = []
        # Refresh all indices once
        self.refresh()
        log = logging.getLogger(__name__)
        log.info("tpc_finish")

    def tpc_abort(self, transaction):
        """
        This method is only called if the manager voted 'no' by raising an
        exception during the voting step. It abandons all changes and ends
        the transaction.
        """
        log = logging.getLogger(__name__)
        log.info("tpc_abort")
        unique_indices = set()
        last_operation = ""
        for item in self._resources:
            last_operation, unique_indices = self._refresh_if_needed(
                last_operation, item['_op'], unique_indices)
            unique_indices.add(item['_index'])
            if item['processed']:
                if item['_op'] == 'add':
                    self._connection.delete(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'])
                    if self.isVersion6 and item['index_created']:
                        # We created the index in the commit phase, so we
                        # need to delete it if we are aborting the
                        # transaction.
                        self._connection.indices.delete(
                            index=item['_index'], ignore=[400, 404])
                elif item['_op'] == 'remove':
                    self._connection.create(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'],
                                            body=item['_backup'])
                elif item['_op'] == "update":
                    self._connection.update(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'],
                                            body={'doc': item['_backup']},
                                            _source=False)
                elif item['_op'] == "update_by_query":
                    for thing in item['_backup']['hits']['hits']:
                        # restore the old value only if the document exists
                        if self._connection.exists(index=item['_index'],
                                                   doc_type=item['_type'],
                                                   id=thing['_id']):
                            self._connection.update(
                                index=item['_index'],
                                doc_type=item['_type'],
                                id=thing['_id'],
                                body={'doc': thing['_source']},
                                _source=False)
                elif item['_op'] == "delete_by_query":
                    for thing in item['_backup']['hits']['hits']:
                        self._connection.create(index=item['_index'],
                                                doc_type=item['_type'],
                                                id=thing['_id'],
                                                body=thing['_source'])

    def sortKey(self):
        """
        The transaction manager sorts all data managers alphabetically by
        this key. A data manager that needs to commit last should return a
        key starting with '~'; here we don't care about the ordering.
        """
        return 'elasticsearch' + str(id(self))
import collections
import os

import nltk
from elasticsearch import Elasticsearch, NotFoundError
from tqdm import tqdm

from cluseter import termgraph

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
articles_path = '../crawler/articles'
articles = []
titles = {}
abstracts = {}
dictionary = {}
stop_words = set(nltk.corpus.stopwords.words('english'))

for article_name in os.listdir(articles_path):
    try:
        article = es.get_source(index="rg",
                                doc_type="article",
                                id=int(article_name.split(".")[0]))
        articles.append(article)
        abstracts[article['id']] = collections.Counter(
            x for x in nltk.word_tokenize(article.get('abstract').lower())
            if x not in stop_words)
        titles[article.get('id')] = collections.Counter(
            x for x in nltk.word_tokenize(article.get('title').lower())
            if x not in stop_words)
    except NotFoundError:
        pass

for doc in abstracts.values():
    for t, v in doc.items():
        dictionary[t] = max(dictionary.get(t, 0), v)


def calc_clusters(cluster_num):
    MAX_LEVEL = 5
    mean = [{} for cnum in range(cluster_num)]
class ESClient:
    def __init__(self, config):
        """
        Init & configure the ES connection
        :param config: dict, config
        """
        self.config = config or {}
        if self.config:
            self.hosts = [{
                'host': h['host'],
                'port': h['port']
            } for h in self.config.get('hosts')]
        else:
            self.hosts = [{'host': 'localhost'}]
        self.bulk_size = self.config.get("bulk_size") or BULK_SIZE
        self.connection = None
        self.connect()

    def connect(self):
        """
        Establish ES connection
        """
        try:
            self.connection = Elasticsearch(self.hosts,
                                            retry_on_timeout=RETRY_ON_TIMEOUT,
                                            timeout=REQUEST_TIMEOUT_SEC)
        except Exception as e:  # pragma: no cover
            logging.error(
                "ESClient.connect failed with params {}, error {}".format(
                    self.hosts, e))

    def count_index(self, index):
        """
        Count docs in an index
        :param index:
        :return:
        """
        return self.connection.count(index).get('count')

    def search(self, query=None, index_name=None, retries=0,
               query_type='search'):
        """
        ES search query
        :param query: dict, es query
        :param index_name: str, index to query against
        :param retries: int, current retry attempt
        :param query_type: str, search or aggregation
        :return: list, found docs
        """
        resp = []
        try:
            resp = self.connection.search(body=query, index=index_name)
            if query_type == 'search':
                found = resp['hits']['hits']
            else:  # elif query_type == 'aggregation':
                found = resp['aggregations']
        except KeyError:  # No hits key in response
            logging.critical(
                "ESClient.search invalid response {}".format(resp))
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                logging.error(
                    "ESClient.search max attempts exceeded (key error)")
                raise
            found = self.search(query=query,
                                index_name=index_name,
                                retries=retries + 1)
        except es_exceptions.RequestError as e:  # pragma: no cover
            logging.critical("ESClient.search error {} on query {}".format(
                e, query))
            raise
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError,
                es_exceptions.TransportError):  # pragma: no cover
            logging.warning("ESClient.search connection failed, retrying...")
            # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                logging.error("ESClient.search max attempts exceeded")
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            self.connect()  # Not sure if this is helpful
            found = self.search(query=query,
                                index_name=index_name,
                                retries=retries + 1)
        except Exception as e:  # pragma: no cover
            logging.critical("ESClient.search error {} on query {}".format(
                e, query))
            raise
        return found

    def msearch(self, queries, index_name, doc_type='_doc', retries=0,
                chunk_size=100):
        """
        ES multi-search query
        :param queries: list of dict, es queries
        :param index_name: str, index to query against
        :param doc_type: str, defined doc type i.e. _doc
        :param retries: int, current retry attempt
        :param chunk_size: int, how many queries to send to es at a time.
            Increase the search queue size before sending too many requests,
            i.e. threadpool.search.queue_size: 50000 in the es config
        :return: dict, found doc status
        """
        search_header = json.dumps({'index': index_name, 'type': doc_type})

        def chunk_queries():
            for i in range(0, len(queries), chunk_size):
                yield queries[i:i + chunk_size]

        chunked_queries = chunk_queries()
        found = []
        for query_chunk in chunked_queries:
            request = ''
            for q in query_chunk:  # request head, body pairs
                request += '{}\n{}\n'.format(search_header, json.dumps(q))
            resp = {}
            try:
                resp = self.connection.msearch(body=request,
                                               index=index_name)
                found.extend([r['hits']['hits'] for r in resp['responses']])
            except (es_exceptions.ConnectionTimeout,
                    es_exceptions.ConnectionError,
                    es_exceptions.TransportError,
                    KeyError) as e:  # pragma: no cover
                if retries > RETRY_ATTEMPTS:  # pragma: no cover
                    logging.error(
                        "ESClient.msearch max attempts exceeded, error {}".
                        format(e))
                    raise
                logging.warning(
                    "ESClient.msearch connection failed, retrying...")
                # Retry on timeout. No hits key in response; don't retry on
                # es_rejected_execution_exception
                if e.__class__ == KeyError:
                    # 'hits' missing, could be es_rejected_execution_exception,
                    # queue capacity reached
                    logging.critical(
                        "ESClient.msearch invalid response {}".format(
                            resp.get('responses')))
                    # if 'search_phase_execution_exception' not in str(resp):  # reason 'all shards failed'
                    if 'es_rejected_execution_exception' in str(resp):
                        # raise if the underlying error is ConnectionRefusedError
                        # in urllib3 caused by NewConnectionError
                        logging.error(
                            "ESClient.msearch query rejected, error {}".
                            format(e))
                        raise
                time.sleep(RECONNECT_SLEEP_SEC)
                self.connect()  # Not sure if useful
                found = self.msearch(queries=queries,
                                     index_name=index_name,
                                     retries=retries + 1)
            except Exception as e:  # pragma: no cover
                logging.critical(
                    "ESClient.msearch error {} on query {}".format(
                        e, queries))
                raise
        return found

    def get_document(self, doc_id, index_name, doc_type='_doc', retries=0):
        """
        Get the contents of a document by id
        :param doc_id: int, the document id
        :param index_name: str, document name
        :param doc_type: str, document type
        :param retries: int, current retry attempt
        :return: dict, resulting document
        """
        try:
            result = self.connection.get_source(id=doc_id,
                                                doc_type=doc_type,
                                                index=index_name,
                                                _source=True)
        except es_exceptions.NotFoundError:
            result = None
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning(
                "ESClient.get_document connection failed, retrying...")
            # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            result = self.get_document(doc_id, index_name, doc_type,
                                       retries=retries + 1)
        return result

    def upsert_document(self, index_name, body, doc_id, doc_type='_doc',
                        retries=0):
        """
        Upsert a document into an es index: adds a new doc or updates an
        existing one
        :param index_name: str, index name
        :param body: dict, doc
        :param doc_type: str, i.e. _doc
        :param doc_id: int, document id
        :param retries: int, number of retries of the function
        :return: dict, result
        """
        # To avoid "Field [_id] is a metadata field and cannot be added
        # inside a document. Use the index API request parameters."
        if ES_ID_FIELD in body:
            body = copy.deepcopy(body)
            del body[ES_ID_FIELD]  # reserved field
        try:
            result = self.connection.update(index=index_name,
                                            doc_type=doc_type,
                                            id=doc_id,
                                            body={
                                                'doc': body,
                                                'doc_as_upsert': True
                                            })
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning(
                "ESClient.upsert_document connection failed, retrying...")
            # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            result = self.upsert_document(index_name, body, doc_id, doc_type,
                                          retries=retries + 1)
        return result

    def remove_document(self, index_name, doc_id, doc_type='_doc', retries=0):
        """
        Remove a document from an es index
        :param index_name: str, index name
        :param doc_id: int, doc id
        :param doc_type: str, i.e. _doc
        :param retries: int, number of retries of the function
        :return: dict, result
        """
        try:
            result = self.connection.delete(index=index_name,
                                            doc_type=doc_type,
                                            id=doc_id)
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning(
                "ESClient.remove_document connection failed, retrying...")
            # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            result = self.remove_document(index_name, doc_id, doc_type,
                                          retries=retries + 1)
        return result

    def bulk_update_index(self, documents, index_name, doc_type='_doc',
                          id_field='_id'):
        """
        Bulk-populate an es index with doc data. Can also be used to add a
        single doc to an index
        :param index_name: str, index name
        :param documents: list of dicts, index docs
        :param doc_type: str, document type for es
        :param id_field: str, document id field name
        :return: bool, success
        """
        copied_docs = copy.deepcopy(documents)
        bulk_data = []
        for body in copied_docs:
            doc_id = body[id_field]
            if ES_ID_FIELD in body:
                del body[ES_ID_FIELD]
            bulk_data.append({
                '_index': index_name,
                '_type': doc_type,
                '_source': json.dumps(body),
                '_id': doc_id
            })
        success = False
        for attempt in range(1, RETRY_ATTEMPTS + 1):
            try:
                helpers.bulk(self.connection, actions=bulk_data)
                self.connection.indices.refresh(index=index_name)
                success = True
                break
            except (es_exceptions.ConnectionTimeout,
                    es_exceptions.ConnectionError):  # pragma: no cover
                logging.warning(
                    "ESClient.bulk_update_index connection timeout")
                # Retry on timeout
                self.connect()  # Not sure if this is helpful, or connection is lazy?
                continue
        return success

    def create_index(self, index_name, body=None, replace=False):
        """
        Create an index by name, populate with body
        :param index_name: str, name of index
        :param body: dict, optional document to create
        :param replace: bool, force replace existing index
        :return: dict, created status info
        """
        result = None
        for _attempt in range(1, RETRY_ATTEMPTS + 1):
            try:
                result = self.connection.indices.create(index=index_name,
                                                        ignore=400,
                                                        body=body)
                result = bool('acknowledged' in result)
                break
            except es_exceptions.AuthorizationException:  # pragma: no cover
                result = False
                break
            except (es_exceptions.ConnectionTimeout,
                    es_exceptions.ConnectionError) as e:  # pragma: no cover
                logging.error(
                    "ESClient.create_index connection error: {}".format(e))
                # Retry on timeout
                time.sleep(RECONNECT_SLEEP_SEC)
                self.connect()  # Not sure if this is helpful, or connection is lazy?
                continue
        if replace and not result:
            logging.warning(
                "ESClient.create_index replacing existing index {}".format(
                    index_name))
            self.delete_index(index_name)
            result = self.connection.indices.create(index=index_name,
                                                    ignore=400,
                                                    body=body)
        if result:
            self.connection.indices.refresh(index_name)
        return result

    def setup_index(self, index_name, doc_mapping, index_settings):
        """
        Set up an index
        :param index_name: str, index name
        :param index_settings: str or dict, index settings document
        :param doc_mapping: str or dict, index doc mapping schema
        :return: bool, setup settings and index success
        """
        # index_settings = index_settings or self.INDEX_SETTINGS
        doc_type = list(doc_mapping.keys())[0]
        settings = mapped = None
        for attempt in range(1, RETRY_ATTEMPTS + 1):
            try:
                # close index to modify settings
                self.connection.indices.close(index=index_name)
                # Creates es analyzer, filter settings
                settings = self.connection.indices.put_settings(
                    index=index_name, body=index_settings)
                self.connection.indices.open(index=index_name)
                # Sets up document structure / mapping
                mapped = self.connection.indices.put_mapping(
                    index=index_name, doc_type=doc_type, body=doc_mapping)
                break
            except (es_exceptions.ConnectionTimeout,
                    es_exceptions.ConnectionError):  # pragma: no cover
                logging.warning("ESClient.setup_index connection timeout")
                # Retry on timeout
                self.connect()  # Not sure if this is helpful, or connection is lazy?
                continue
        return settings and mapped

    def delete_index(self, index_name):
        """
        Delete an index by name
        :param index_name: str, index name
        :return: dict, removed status
        """
        result = None
        for attempt in range(1, RETRY_ATTEMPTS + 1):
            try:
                result = self.connection.indices.delete(index=index_name)
                break
            except es_exceptions.NotFoundError:  # pragma: no cover
                result = False
                break
            except (es_exceptions.ConnectionTimeout,
                    es_exceptions.ConnectionError):  # pragma: no cover
                logging.warning("ESClient.delete_index connection timeout")
                # Retry on timeout
                self.connect()  # Not sure if this is helpful, or connection is lazy?
                continue
        if not result:  # pragma: no cover
            logging.warning(
                "ESClient.delete_index failed for {}".format(index_name))
        return result

    def add_alias(self, indexes, alias_name, retries=0):
        """
        Set the alias current for a new index.
        Note: it is possible to have one alias for multiple indexes, but
        bulk populate will fail for that alias
        :param indexes: list (or single str) of index names
        :param alias_name: str, alias to use for the index
        :param retries: int, number of retries of the function
        :return: dict, added info
        """
        try:
            added = self.connection.indices.put_alias(index=indexes,
                                                      name=alias_name)
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning(
                "ESClient.add_alias connection failed, retrying...")
            # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            added = self.add_alias(indexes, alias_name, retries=retries + 1)
        return added

    def get_alias(self, alias_name=None, index_name=None, retries=0):
        """
        Return alias information, i.e. the indexes behind an alias name, or
        the aliases for an index
        :param alias_name: str, alias to use for the index
        :param index_name: str, name of index
        :param retries: int, number of retries of the function
        :return:
        """
        try:
            alias = self.connection.indices.get_alias(name=alias_name,
                                                      index=index_name)
        except es_exceptions.NotFoundError:  # pragma: no cover
            alias = None
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning(
                "ESClient.get_alias connection failed, retrying...")
            # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            alias = self.get_alias(alias_name, index_name,
                                   retries=retries + 1)
        return alias

    def delete_alias(self, index_name, alias_name, retries=0):
        """
        Remove an alias
        :param index_name: str, index name
        :param alias_name: str, alias to use for the index
        :param retries: int, number of retries of the function
        :return: dict, removed status
        """
        try:
            removed = self.connection.indices.delete_alias(name=alias_name,
                                                           index=index_name)
        except es_exceptions.NotFoundError:  # pragma: no cover
            return False
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning(
                "ESClient.delete_alias connection failed, retrying...")
            # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            removed = self.delete_alias(index_name, alias_name,
                                        retries=retries + 1)
        return removed
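# Hedged usage sketch for ESClient (the config shape mirrors what __init__ reads;
# module constants such as BULK_SIZE, RETRY_ON_TIMEOUT and RETRY_ATTEMPTS are
# assumed to be defined alongside the class; the index and doc values are invented):
client = ESClient({'hosts': [{'host': 'localhost', 'port': 9200}]})
client.create_index('articles')
client.upsert_document('articles', {'title': 'hello'}, doc_id=1)
doc = client.get_document(1, 'articles')  # None when the id is missing
hits = client.search({'query': {'match_all': {}}}, index_name='articles')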
}, {
    'Book': 'A Tale of Two Cities',
    'Author': 'Charles Dickens',
    'year': 2003,
    'volumes': 3
}]

"""
for i in range(1, len(doc) + 1):
    res = es.index(index="practise", doc_type="writing", id=i, body=doc[i - 1])
    print(res['created'])
"""

# res = es.mget(index="practise", doc_type="writing", body={"ids": [1, 2, 3]}, _source=True, realtime=True)
res = es.get_source(index="practise", doc_type="writing", id="3")
# res = es.search(index="practise", doc_type="writing")
res = es.suggest(index="practise", body=doc)
# print(res['_source'])
es.indices.refresh(index="practise")
# res = es.search(index="practise", body={"query": {"match_all": {}}})
res = json.dumps(res, indent=4, sort_keys=True)
es.indices.refresh(index="practise")
print(res)
"""
print("we got %d Hits:" % res['hits']['total'])
# tag::69a7be47f85138b10437113ab2f0d72d[] response = es.get( index='twitter', id=2, routing='user1', stored_fields='tags,counter', ) # end::69a7be47f85138b10437113ab2f0d72d[] print("---------------------------------------") print(response) print("---------------------------------------") print("89a8ac1509936acc272fc2d72907bc45 - L:229") # tag::89a8ac1509936acc272fc2d72907bc45[] response = es.get_source( index='twitter', id=1, ) # end::89a8ac1509936acc272fc2d72907bc45[] print("---------------------------------------") print(response) print("---------------------------------------") print("d222c6a6ec7a3beca6c97011b0874512 - L:238") # tag::d222c6a6ec7a3beca6c97011b0874512[] response = es.get_source( index='twitter', id=1, _source_includes='*.id', _source_excludes='entities', ) # end::d222c6a6ec7a3beca6c97011b0874512[]
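# For contrast with the two calls above: es.get() returns the full envelope
# (_index, _id, _version, found, _source, ...), while es.get_source() returns only
# the bare _source body, so `response` in the last two snippets is the tweet
# document itself rather than a wrapper.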
return text if __name__ == "__main__": BILL_CACHE = 'bills.p' # Load bill list ids = pd.read_csv('../../data/ncsl/ncsl_data_from_sample_matched.csv') if not os.path.isfile(BILL_CACHE): # Initialize database for bill retrieval es = ES("localhost:9200", timeout=60) # Retrieve bills bills = [ es.get_source(index="state_bills", id=id_, doc_type="_all") for id_ in ids['matched_from_db'] ] print("Retrieved {} bills".format(len(bills))) pickle.dump(bills, open(BILL_CACHE, 'wb')) else: bills = pickle.load(open(BILL_CACHE, 'rb')) # Initialize text cleaner cleaner = TextCleaner() # Initialize dictionary dictionary = corpora.Dictionary()
class ElasticConnection(): def __init__(self, host="localhost", port=9200): self.es_connection = Elasticsearch([{ 'host': host, 'port': port }], timeout=200) # creates index for bills and model legislation stored in # data_path, overwriting index if it is already created def create_state_bill_index(self, data_path): if self.es_connection.indices.exists(STATE_BILL_INDEX): print("deleting '%s' index..." % (STATE_BILL_INDEX)) self.es_connection.indices.delete(index=STATE_BILL_INDEX) mapping_doc = json.loads( open(os.environ['POLICY_DIFFUSION'] + "/db/state_bill_mapping.json").read()) settings_doc = json.loads( open(os.environ['POLICY_DIFFUSION'] + "/db/state_bill_index.json").read()) print("creating '%s' index..." % (STATE_BILL_INDEX)) res = self.es_connection.indices.create(index=STATE_BILL_INDEX, body=settings_doc) print("adding mapping for bill_documents") res = self.es_connection.indices.put_mapping(index=STATE_BILL_INDEX, doc_type="bill_document", body=mapping_doc) bulk_data = [] for i, line in enumerate(open(data_path)): json_obj = json.loads(line.strip()) if json_obj is None: continue op_dict = { "index": { "_index": STATE_BILL_INDEX, "_type": "bill_document", "_id": json_obj["unique_id"] } } bulk_data.append(op_dict) bulk_data.append(json_obj) if len(bulk_data) == 1000: self.es_connection.bulk(index=STATE_BILL_INDEX, body=bulk_data, timeout=300) del bulk_data bulk_data = [] def create_evaluation_index_all_bills(self, data_path1, data_path2): ''' data_path1 corresponds to evaluation data data_path2 corresponds to rest of bill data ''' if self.es_connection.indices.exists(EVALUATION_INDEX_ALL_BILLS): print("deleting '%s' index..." % (EVALUATION_INDEX_ALL_BILLS)) self.es_connection.indices.delete(index=EVALUATION_INDEX_ALL_BILLS) #use same mapping as in state index mapping_doc = json.loads( open(os.environ['POLICY_DIFFUSION'] + "/db/evaluation_mapping.json").read()) settings_doc = json.loads( open(os.environ['POLICY_DIFFUSION'] + "/db/state_bill_index.json").read()) print("creating '%s' index..." % (EVALUATION_INDEX_ALL_BILLS)) res = self.es_connection.indices.create( index=EVALUATION_INDEX_ALL_BILLS, body=settings_doc, timeout=30) print("adding mapping for bill_documents") res = self.es_connection.indices.put_mapping( index=EVALUATION_INDEX_ALL_BILLS, doc_type="bill_document", body=mapping_doc) #load in evaluation data first bulk_data = [] for i, line in enumerate(open(data_path1)): json_obj = json.loads(line.strip()) if json_obj is None: continue op_dict = { "index": { "_index": EVALUATION_INDEX_ALL_BILLS, "_type": "bill_document", "_id": i } } bulk_data.append(op_dict) bulk_data.append(json_obj) self.es_connection.bulk(index=EVALUATION_INDEX_ALL_BILLS, body=bulk_data, timeout=300) #load in rest of state bill data bulk_data = [] for i, line in enumerate(open(data_path2)): json_obj = json.loads(line.strip()) if json_obj is None: continue op_dict = { "index": { "_index": EVALUATION_INDEX_ALL_BILLS, "_type": "bill_document", "_id": json_obj["unique_id"] } } bulk_data.append(op_dict) bulk_data.append(json_obj) if len(bulk_data) == 1000: self.es_connection.bulk(index=EVALUATION_INDEX_ALL_BILLS, body=bulk_data, timeout=300) del bulk_data bulk_data = [] def create_evaluation_index(self, data_path): if self.es_connection.indices.exists(EVALUATION_INDEX): print("deleting '%s' index..." 
% (EVALUATION_INDEX)) self.es_connection.indices.delete(index=EVALUATION_INDEX) #use same mapping as in state index mapping_doc = json.loads( open(os.environ['POLICY_DIFFUSION'] + "/db/evaluation_mapping.json").read()) settings_doc = json.loads( open(os.environ['POLICY_DIFFUSION'] + "/db/state_bill_index.json").read()) print("creating '%s' index..." % (EVALUATION_INDEX)) res = self.es_connection.indices.create(index=EVALUATION_INDEX, body=settings_doc, timeout=30) print("adding mapping for bill_documents") res = self.es_connection.indices.put_mapping(index=EVALUATION_INDEX, doc_type="bill_document", body=mapping_doc) bulk_data = [] for i, line in enumerate(open(data_path)): json_obj = json.loads(line.strip()) if json_obj is None: continue op_dict = { "index": { "_index": EVALUATION_INDEX, "_type": "bill_document", "_id": i } } bulk_data.append(op_dict) bulk_data.append(json_obj) self.es_connection.bulk(index=EVALUATION_INDEX, body=bulk_data, timeout=300) def get_all_doc_ids(self, index): count = self.es_connection.count(index)['count'] q = {"query": {"match_all": {}}, "fields": []} results = self.es_connection.search(index=index, body=q, size=count) doc_ids = [res['_id'] for res in results['hits']['hits']] return doc_ids def similar_doc_query(self, query, state_id=None, num_results=100, return_fields=["state"], index=STATE_BILL_INDEX, fields="bill_document_last"): json_query = """ { "query": { "more_like_this": { "fields": ["section_txt"], "like": "", "min_doc_freq": 2 } } } """ json_query = json.loads(json_query) json_query['query']['more_like_this']['like'] = query results = self.es_connection.search(index=index, body=json_query, size=num_results) results = results['hits']['hits'] result_docs = [] for idx, res in enumerate(results): doc = {} doc['score'] = res['_score'] doc['id'] = res['_id'] doc['sec_id'] = res['_source']['section_id'] doc['sec_txt'] = res['_source']['section_txt'] # print("#%d: sec_id is %s with score %f" % (idx+1, doc['sec_id'], doc['score'])) result_docs.append(doc) return result_docs def similar_doc_query_for_testing_lucene(self, query, match_group, state_id=None, num_results=100, return_fields=["state"], index=STATE_BILL_INDEX): ''' description: only for testing lucene scores match_group represents the group of bills that an evaluation bill belongs to (e.g., all the stand your ground bills) ''' json_query = """ { "query": { "filtered": { "query": { "more_like_this": { "fields": [ "bill_document_last" ], "like_text": "", "max_query_terms": 70, "min_term_freq": 1, "min_doc_freq": 2, "minimum_should_match": 1 } }, "filter": { "bool": { "must_not": { "term": { "bill_document.state": "" } } } } } } } """ json_query = json.loads(json_query) json_query['query']['filtered']['query']['more_like_this'][ 'like_text'] = query json_query['query']['filtered']['filter']['bool']['must_not']['term'][ 'bill_document.state'] = str(state_id) results = self.es_connection.search(index=index, body=json_query, fields=return_fields, size=num_results) results = results['hits']['hits'] result_docs = [] for res in results: doc = {} for f in res['fields']: doc[f] = res['fields'][f][0] doc['score'] = res['_score'] doc['id'] = res['_id'] #if applicable, only return docs that are from different states if doc['state'] != state_id: result_docs.append(doc) return result_docs def get_bill_by_id(self, id, index='state_bills'): match = self.es_connection.get_source(index=index, id=id) return match def get_model_legislation_by_id(self, id, index=MODEL_LEGISLATION_INDEX): match = 
self.es_connection.get_source(index=index, id=id) return match def get_constitution_by_id(self, id, index=CONSTITUTIONS_INDEX): match = self.es_connection.get_source(index=index, id=id) return match def get_all_bills(self, step=3000): es = self.es_connection # fix with .format: '{"from" :{0}, "size" :{1}'.format(start,size) body_gen = lambda start, size: '{"from" :' + str( start) + ', "size" : ' + str( size) + ', "query":{"bool":{"must":{"match_all":{}}}}} ' body = body_gen(0, 0) bills = es.search(index="state_bills", body=body) total = bills['hits']['total'] all_bills = [] start = 0 bad_count = 0 while start <= total: print(start) body = body_gen(start, step) bills = es.search(index="state_bills", body=body) bill_list = bills['hits']['hits'] all_bills.append(bill_list) start += step return all_bills def get_bills_by_state(self, state, num_bills='all', step=3000): es = self.es_connection if num_bills == 'all': bills = es.search(index='state_bills', doc_type='bill_document', q='state:' + state) total = bills['hits']['total'] else: total = num_bills #fix as above body_gen = lambda start, size: '{"from" :' + str( start) + ', "size" : ' + str( size ) + ',"query":{"term":{"bill_document.state":"' + state + '"}}}' all_bills = [] start = 0 bad_count = 0 if step >= total: body = body_gen(start, total) bills = es.search(index="state_bills", body=body) bill_list = bills['hits']['hits'] all_bills.extend(bill_list) else: while start <= total: body = body_gen(start, step) bills = es.search(index="state_bills", body=body) bill_list = bills['hits']['hits'] all_bills.extend(bill_list) start += step return all_bills
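# Hypothetical lookup using the connection wrapper above (the bill id mirrors the
# 'az_...' identifiers used elsewhere in this codebase):
conn = ElasticConnection(host='localhost', port=9200)
bill = conn.get_bill_by_id('az_49th-3rd-special_SB1010', index='state_bills')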
ES_PASS = os.environ.get('ES_PASS', 'changeme') es = Elasticsearch([ES_HOST], http_auth=(ES_USER, ES_PASS), port=9200, use_ssl=False) def log2es(body): es.index(index='nexpose-process-log', doc_type='nexpose-process-log', body=body) try: es.get_source(index='nexpose-log-user', doc_type='user', id='admin') except NotFoundError as e: es.create(index='nexpose-log-user', doc_type='user', id='admin', body=json.dumps({'_password': '******'})) def get_admin_user(username): try: return es.get_source(index='nexpose-log-user', doc_type='user', id=username) except NotFoundError as e: return None
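# get_admin_user() above returns the stored user document, or None when the id is
# absent, so a caller can branch on it directly (a hypothetical usage):
user = get_admin_user('admin')
if user is None:
    log2es({'event': 'missing-admin-user'})  # illustrative log body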
class ElasticSearchManager():

    ########################## Constructor ######################################
    def __init__(self, indexName, typeName, type):
        self.client = Elasticsearch()
        self._indexName = indexName
        self._typeName = typeName
        self._uniqeTermInCorpus = 0
        self._totalNoOfDocsInCorpus = 0
        self._avgDocumentLength = 0
        self._lengthOfAllDocuments = 0
        if type == 1:
            print(type)
            self.__CreateIndex__()
            self.__SetMappingForIndex__()

    ########################## CREATES INDEX ######################################
    def __CreateIndex__(self):
        # DELETE index if it already exists
        if self.client.indices.exists(self._indexName):
            print("deleting '%s' index..." % (self._indexName))
            res = self.client.indices.delete(index=self._indexName)
            print(" response: '%s'" % (res), '\n')
        # CREATE a new index
        print("creating '%s' index..." % (self._indexName))
        res = self.client.indices.create(index=self._indexName,
                                         body=Resource.INDEX_REQUEST_BODY)
        print(" response: '%s'" % (res), '\n')

    ########################### SETS MAPPING FOR INDEX ##############################
    def __SetMappingForIndex__(self):
        # PUT_MAPPING - registers a mapping definition for a specific type
        print("creating '%s' mapping..." % (self._typeName))
        res2 = self.client.indices.put_mapping(doc_type=self._typeName,
                                               index=self._indexName,
                                               body=Resource.MAPPING_BODY)
        print(" response: '%s'" % (res2))

    ########################### GENERATE LOGICAL DOC TO BE INDEXED #################
    def __ConsituteDocument__(self, documentId, documentText):
        doc = {
            "docno": documentId,
            "text": documentText
        }
        return doc

    ########################## PERFORMS INDEXING ######################################
    def __UploadDocumentToIndex__(self, docCounterForTesting, documentId,
                                  documentText):
        doc = self.__ConsituteDocument__(documentId, documentText)
        res = self.client.index(index=self._indexName,
                                doc_type=self._typeName,
                                id=documentId,
                                body=doc)

    def __CurrentIndexStats__(self):
        res = self.client.count(index=self._indexName,
                                doc_type=self._typeName)
        return res

    ######################################################################################################
    # RETRIEVE
    ######################################################################################################
    def __GetAllUnigramsAsFeatures__(self):
        requestBody = Resource.REQUEST_BODY_FIND_ALL_FEATURES
        # Response shape:
        # "aggregations": {
        #     "features": {
        #         "doc_count_error_upper_bound": 0,
        #         "sum_other_doc_count": 0,
        #         "buckets": [
        #             {"key": "http", "doc_count": 4274},
        #             {"key": "s", "doc_count": 3958}
        #         ]
        #     }
        # }
        res = self.client.search(self._indexName, self._typeName,
                                 body=requestBody, search_type="count")
        return res

    ################################################################################################################
    def __GetDocumentText__(self, docId):
        # {u'_source': {u'text': u" fatalities have increased an estimated 31 percent o"}}
        res = self.client.get_source(index=self._indexName,
                                     doc_type=self._typeName,
                                     id=docId)
        return res['text']

    #####################################################################################################
    def __GetHitsForAllDocuments__(self):
        print("Fetching hits for text for all documents from ElasticSearch ...")
        requestBody = Resource.REQUEST_BODY_FIND_TEXT_FOR_ALLDOCS
        res = self.client.search(index=self._indexName,
                                 doc_type=self._typeName,
                                 body=requestBody)
        return res['hits']['hits']
from elasticsearch import Elasticsearch
from pprint import pprint
import sys
import numpy as np

es = Elasticsearch('http://172.27.125.139:9200/', timeout=10,
                   retry_on_timeout=True, max_retries=1)

doc = es.get_source(index="state_bills", id='az_49th-3rd-special_SB1010',
                    doc_type="_all")
print(len(doc['bill_document_last']))
sys.exit()

with open('bill_ids.txt') as infile:
    ids = [x.strip('\n') for x in infile]

# o = np.zeros((len(ids)))
# for i, id_ in enumerate(ids):
#     doc = None
#     s = 'failed'
#     doc = es.get_source(index="state_bills", id=id_, doc_type="_all")
#     if doc is not None:
#         o[i] = 1
#         s = 'worked'
#     print('{}: {}, {}'.format(s, i, id_))
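The commented-out loop above issues one `get_source` round-trip per id; if the goal is only an existence vector, a batched `mget` needs far fewer requests. A minimal sketch under that assumption, reusing `es`, `ids`, and `np` from the script (the batch size is arbitrary):

# Sketch: existence check in batches via mget instead of one get_source per id.
o = np.zeros(len(ids))
for start in range(0, len(ids), 500):
    batch = ids[start:start + 500]
    res = es.mget(index='state_bills', doc_type='_all',
                  body={'ids': batch}, _source=False)
    for offset, doc in enumerate(res['docs']):
        if doc.get('found'):
            o[start + offset] = 1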
# Fragment of a larger CLI tool: assumes `args` (the parsed argument namespace)
# and `slashsplit` (the server URL split on "/") are defined earlier.
if len(slashsplit) > 5:
    if "?pretty" in args.server:
        args.pretty = True
        args.id = slashsplit[5].rsplit("?")[0]
    else:
        args.id = slashsplit[5]
if args.pretty:
    tabbing = 4
else:
    tabbing = None
if args.idfile:
    for json_record in esidfilegenerator(host=args.host, port=args.port, index=args.index, type=args.type, body=args.body, source=args.source, headless=args.headless, source_exclude=args.exclude, source_include=args.include, idfile=args.idfile):
        sys.stdout.write(json.dumps(json_record, indent=tabbing) + "\n")
elif args.idfile_consume:
    for json_record in esidfileconsumegenerator(host=args.host, port=args.port, index=args.index, type=args.type, body=args.body, source=args.source, headless=args.headless, source_exclude=args.exclude, source_include=args.include, idfile=args.idfile_consume):
        sys.stdout.write(json.dumps(json_record, indent=tabbing) + "\n")
elif not args.id:
    for json_record in esgenerator(host=args.host, port=args.port, index=args.index, type=args.type, body=args.body, source=args.source, headless=args.headless, source_exclude=args.exclude, source_include=args.include, verbose=True):
        sys.stdout.write(json.dumps(json_record, indent=tabbing) + "\n")
else:
    es = Elasticsearch([{"host": args.host}], port=args.port)
    json_record = None
    if not args.headless:
        json_record = es.get(index=args.index, doc_type=args.type, _source=True, _source_exclude=args.exclude, _source_include=args.include, id=args.id)
    else:
        json_record = es.get_source(index=args.index, doc_type=args.type, _source=True, _source_exclude=args.exclude, _source_include=args.include, id=args.id)
    if json_record:
        sys.stdout.write(json.dumps(json_record, indent=tabbing) + "\n")
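The `esgenerator`/`esidfilegenerator` helpers are project-specific. For reference, the non-id branch can be approximated with the official `elasticsearch.helpers.scan`; this is a sketch, not the tool's actual implementation, and the function name is made up:

# Sketch: roughly what the non-id branch does, using the official scan helper.
# Parameter names mirror the args namespace above.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

def simple_esgenerator(host, port, index, type, body, headless=True):
    es = Elasticsearch([{"host": host}], port=port)
    for hit in scan(es, index=index, doc_type=type, query=body):
        yield hit["_source"] if headless else hit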
# Excerpt from a larger module: relies on module-level helpers and constants
# (scan, bulk, settings, build_body, build_filter, splitlist, serialize,
# get_bulk_body, multidict, get_article_dict, log, EMPTY_RO_DICT, ALL_FIELDS,
# HIGHLIGHT_OPTIONS, LEAD_SCRIPT_FIELD, SearchResult, Result, ...) defined elsewhere.
class _ES(object):
    def __init__(self, index, doc_type, host, port, timeout=300, **args):
        self.host = host
        self.port = port
        self.index = index
        self.doc_type = doc_type
        self.es = Elasticsearch(hosts=[{"host": self.host, "port": self.port}, ],
                                timeout=timeout, **args)

    def check_properties(self, properties):
        """
        Check if all properties are known (e.g. have mappings), and create mappings as needed
        """
        properties = set(properties)
        if not (properties - self.get_properties()):
            return
        to_add = properties - self.get_properties()
        if to_add:
            self.add_properties(to_add)

    def add_properties(self, to_add):
        """
        Add the named properties, setting the mapping depending on the suffix
        """
        mappings = {}
        for name in to_add:
            ftype = name.rsplit("_", 1)[1] if "_" in name else 'default'
            mappings[name] = settings.ES_MAPPING_TYPES[ftype]
        self.es.indices.put_mapping(index=self.index, doc_type=self.doc_type,
                                    body={"properties": mappings})

    def get_mapping(self):
        m = self.es.indices.get_mapping(self.index, self.doc_type)
        return m[self.index]['mappings'][self.doc_type]['properties']

    def get_properties(self):
        self.check_index()
        return set(self.get_mapping().keys())

    def refresh(self):
        self.es.indices.refresh()

    def highlight_article(self, aid: int, query: str) -> dict:
        """Highlight the article given by an article id using a Lucene query.
        The resulting strings are safe to insert into an HTML document even if
        the original document contained malicious constructs. If you need the
        original article including HTML, call html.unescape on this output."""
        from amcat.tools.amcates_queryset import ESQuerySet
        qs = ESQuerySet().filter(id=aid).only("text", "title").highlight(query, mark="em")
        try:
            return next(iter(qs)).to_dict()
        except StopIteration:
            raise ValueError("Article(id={}) not found in elastic index.".format(aid))

    def clear_cache(self):
        self.es.indices.clear_cache()

    def delete_index(self):
        try:
            self.es.indices.delete(self.index)
        except NotFoundError:
            pass
        except Exception as e:
            if 'IndexMissingException' in str(e):
                return
            raise

    def create_index(self, shards=5, replicas=1):
        es_settings = settings.ES_SETTINGS.copy()
        es_settings.update({"number_of_shards": shards,
                            "number_of_replicas": replicas})
        body = {
            "settings": es_settings,
            "mappings": {settings.ES_ARTICLE_DOCTYPE: settings.ES_MAPPING}
        }
        self.es.indices.create(self.index, body)

    def check_index(self):
        """
        Check whether the server is up and the index exists.
        If the server is down, raise an exception.
        If the index does not exist, try to create it.
        """
        if not self.es.ping():
            raise Exception("Elastic server cannot be reached")
        if not self.es.indices.exists(self.index):
            log.info("Index {self.index} does not exist, creating".format(**locals()))
            self.create_index()
        return self.es.cluster.health(self.index, wait_for_status='yellow')

    def exists_type(self, doc_type, **kargs):
        return self.es.indices.exists_type(index=self.index, doc_type=doc_type, **kargs)

    def put_mapping(self, doc_type, body, **kargs):
        return self.es.indices.put_mapping(index=self.index, doc_type=doc_type,
                                           body=body, **kargs)

    def status(self):
        nodes = self.es.nodes.info()['nodes'].values()
        return {"ping": self.es.ping(),
                "nodes": [n['name'] for n in nodes],
                "index": self.index,
                "index_health": self.es.cluster.health(self.index),
                "transport_hosts": self.es.transport.hosts,
                }

    def get(self, id, **options):
        """
        Get a single article from the index
        """
        kargs = dict(index=self.index, doc_type=self.doc_type)
        kargs.update(options)
        return self.es.get_source(id=id, **kargs)

    def mget(self, ids, doc_type=None, parents=None):
        """
        Get multiple articles from the index.
        If parents is given, it should be a sequence of the same length as ids
        """
        if parents is None:
            parents = [None] * len(ids)
        if doc_type is None:
            doc_type = self.doc_type
        getdocs = [{"_index": self.index, "_id": id, "_parent": parent, "_type": doc_type}
                   for (id, parent) in zip(ids, parents)]
        return self.es.mget({"docs": getdocs})['docs']

    def search(self, body, **options):
        """
        Perform a 'raw' search on the underlying ES index
        """
        kargs = dict(index=self.index, doc_type=self.doc_type)
        kargs.update(options)
        if log.isEnabledFor(logging.DEBUG):
            # pprint can be expensive
            log.debug("Search with body:\n {}".format(pprint.pformat(body)))
        return self.es.search(body=body, **kargs)

    def scan(self, query, **kargs):
        """
        Perform a scan query on the es index
        See: http://elasticsearch-py.readthedocs.org/en/latest/helpers.html#elasticsearch.helpers.scan
        """
        return scan(self.es, index=self.index, doc_type=self.doc_type, query=query, **kargs)

    def query_ids(self, query=None, filters=EMPTY_RO_DICT, body=None, limit=None, **kwargs):
        """
        Query the index, returning a sequence of article ids for the matched articles

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filter: field filter DSL query dict
        @param body: if given, use this instead of constructing from query/filters
        @param filters: if filter is None, build the filter from filters as accepted by
                        build_query, e.g. sets=12345

        Note that query and filters can be combined in a single call
        """
        if body is None:
            body = dict(build_body(query, filters, query_as_filter=True))
        log.debug("query_ids with body:\n {}".format(pprint.pformat(body)))
        for i, a in enumerate(scan(self.es, query=body, index=self.index,
                                   doc_type=self.doc_type, size=(limit or 1000),
                                   _source=False)):
            if limit and i >= limit:
                return
            yield int(a['_id'])

    def query(self, query=None, filters=EMPTY_RO_DICT, highlight=False, lead=False,
              _source=(), score=True, **kwargs):
        """
        Execute a query for the given fields with the given query and filter

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filter: field filter DSL query dict, defaults to build_filter(**filters)
        @param kwargs: additional keyword arguments to pass to es.search, e.g. fields, sort, from_, etc
        @return: a list of named tuples containing id, score, and the requested fields
        """
        body = dict(build_body(query, filters, query_as_filter=(not (highlight or score))))
        if highlight and not score:
            body['query'] = {'constant_score': {'query': body['query']}}
        if 'sort' in kwargs:
            body['track_scores'] = True
        if highlight and query:
            if isinstance(highlight, dict):
                body['highlight'] = highlight
            else:
                body['highlight'] = HIGHLIGHT_OPTIONS
        if lead:
            # (the original condition `lead or False and query == "" and highlight`
            # reduces to just `lead`, since `False and ...` is never true)
            body['script_fields'] = {"lead": {"script": LEAD_SCRIPT_FIELD}}
        result = self.search(body, _source=_source, **kwargs)
        return SearchResult(result, _source, score, body, query=query)

    def query_all(self, *args, **kargs):
        kargs.update({"from_": 0})
        size = kargs.setdefault('size', 10000)
        result = self.query(*args, **kargs)
        total = result.total
        for offset in range(size, total, size):
            kargs['from_'] = offset
            result2 = self.query(*args, **kargs)
            result.hits += result2.hits
        return result

    def _get_used_properties(self, body__prop):
        body, prop = body__prop
        body["query"]["bool"]["must"][1]["exists"]["field"] = prop
        return bool(self.es.count(index=self.index, doc_type=self.doc_type,
                                  body=body)['count'])

    def get_used_properties(self, set_ids=None, article_ids=None, **filters):
        """
        Return a sequence of property names in use in the specified set(s) (or setids)
        """
        if set_ids is not None:
            filters["sets"] = set_ids
        if article_ids is not None:
            filters["ids"] = article_ids

        all_properties = self.get_properties()
        flexible_properties = set(all_properties) - set(ALL_FIELDS)

        body = {"query": {"bool": {"must": [
            build_filter(**filters),
            {"exists": {"field": "fakeprop"}}
        ]}}}

        bodies = (copy.deepcopy(body) for _ in range(len(flexible_properties)))
        pool = ThreadPool()
        results = pool.imap(self._get_used_properties, zip(bodies, flexible_properties))
        try:
            for found, prop in zip(results, flexible_properties):
                if found:
                    yield prop
        finally:
            pool.close()

    def add_articles(self, article_ids, batch_size=1000):
        """
        Add the given article_ids to the index. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator).
        """
        # WvA: remove redundancy with create_articles
        if not article_ids:
            return
        from amcat.models import Article, ArticleSetArticle

        n = len(article_ids) // batch_size
        for i, batch in enumerate(splitlist(article_ids, itemsperbatch=batch_size)):
            log.info("Adding batch {i}/{n}".format(**locals()))
            all_sets = multidict((aa.article_id, aa.articleset_id)
                                 for aa in ArticleSetArticle.objects.filter(article__in=batch))
            dicts = (get_article_dict(article, list(all_sets.get(article.id, [])))
                     for article in Article.objects.filter(pk__in=batch))
            self.bulk_insert(dicts, batch_size=None)

    def remove_from_set(self, setid, article_ids, flush=True):
        """Remove the given articles from the given set. This is done in batches,
        so there is no limit on the length of article_ids (which can be a generator)."""
        if not article_ids:
            return
        for batch in splitlist(article_ids, itemsperbatch=1000):
            self.bulk_update(batch, UPDATE_SCRIPT_REMOVE_FROM_SET, params={'set': setid})

    def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
        """Add the given articles to the given set. This is done in batches,
        so there is no limit on the length of article_ids (which can be a generator)."""
        if not article_ids:
            if monitor:
                monitor.update()
            return

        batches = [set(batch) for batch in splitlist(article_ids, itemsperbatch=1000)]
        monitor = monitor.submonitor(total=len(batches))
        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(message="Adding batch {iplus}/{nbatches}..".format(
                iplus=i + 1, nbatches=nbatches))
            missing = batch - set(self.in_index(batch))
            if missing:
                logging.warning("Adding {} missing articles to elastic".format(len(missing)))
                self.add_articles(missing)
            if batch - missing:
                self.bulk_update(batch - missing, UPDATE_SCRIPT_ADD_TO_SET,
                                 params={'set': setid})

    def get_tokens(self, aid: int, fields=["text", "title"]):
        """
        Get a list of all tokens (words and their positions) in the given document

        :param aid: Article ID
        :param fields: List of fields to get the terms for
        :return: a sequence of (field, position, term) tuples
        """
        fieldstr = ",".join(fields)
        data = self.es.termvectors(self.index, self.doc_type, aid, fields=fieldstr,
                                   field_statistics=False, payloads=False, offsets=False)
        for field in fields:
            if field in data['term_vectors']:
                for term, info in data['term_vectors'][field]['terms'].items():
                    for token in info['tokens']:
                        yield field, token['position'], term

    def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()):
        """
        Bulk insert the given articles in batches of batch_size
        """
        batches = list(toolkit.splitlist(dicts, itemsperbatch=batch_size)) if batch_size else [dicts]
        monitor = monitor.submonitor(total=len(batches))

        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1, **locals()))
            props, articles = set(), {}
            for d in batch:
                props |= (set(d.keys()) - ALL_FIELDS)
                articles[d["id"]] = serialize(d)
            self.check_properties(props)
            body = get_bulk_body(articles)
            resp = self.es.bulk(body=body, index=self.index,
                                doc_type=settings.ES_ARTICLE_DOCTYPE)
            if resp["errors"]:
                raise ElasticSearchError(resp)

    def update_values(self, article_id, values):
        """Update properties of an existing article.

        @param values: mapping from field name to (new) value
        @type values: dict"""
        return self.bulk_update_values({article_id: values})

    def bulk_update_values(self, articles):
        """Update a set of articles in bulk."""
        body = get_bulk_body({aid: serialize({"doc": a}) for aid, a in articles.items()},
                             action="update")
        resp = self.es.bulk(body=body, index=self.index,
                            doc_type=settings.ES_ARTICLE_DOCTYPE)
        if resp["errors"]:
            raise ElasticSearchError(resp)

    def bulk_update(self, article_ids, script, params):
        """
        Execute a bulk update script with the given params on the given article ids.
        """
        payload = serialize({"script": dict(script, params=params)})
        body = get_bulk_body({aid: payload for aid in article_ids}, action="update")
        resp = self.es.bulk(body=body, index=self.index,
                            doc_type=settings.ES_ARTICLE_DOCTYPE)
        if resp["errors"]:
            raise ElasticSearchError(resp)

    def synchronize_articleset(self, aset, full_refresh=False):
        """
        Make sure the given articleset is correctly stored in the index

        @param full_refresh: if true, re-add all articles to the index.
                             Use this after changing properties of articles
        """
        self.check_index()  # make sure the index exists and is at least 'yellow'

        log.debug("Getting SOLR ids from set")
        solr_set_ids = set(self.query_ids(filters=dict(sets=[aset.id])))
        log.debug("Getting DB ids")
        db_ids = aset.get_article_ids()
        log.debug("Getting SOLR ids")
        solr_ids = set(self.in_index(db_ids))

        to_remove = solr_set_ids - db_ids
        if full_refresh:
            to_add_docs = db_ids
            to_add_set = set()
        else:
            to_add_docs = db_ids - solr_ids
            to_add_set = (db_ids & solr_ids) - solr_set_ids

        log.warning("Refreshing index, full_refresh={full_refresh},"
                    "|solr_set_ids|={nsolrset}, |db_set_ids|={ndb}, |solr_ids|={nsolr} "
                    "|to_add| = {nta}, |to_add_set|={ntas}, |to_remove_set|={ntr}"
                    .format(nsolr=len(solr_ids), nsolrset=len(solr_set_ids), ndb=len(db_ids),
                            nta=len(to_add_docs), ntas=len(to_add_set),
                            ntr=len(to_remove), **locals()))

        log.info("Removing {} articles".format(len(to_remove)))
        self.remove_from_set(aset.id, to_remove)
        log.info("Adding {} articles to set".format(len(to_add_set)))
        self.add_to_set(aset.id, to_add_set)
        log.info("Adding {} articles to index".format(len(to_add_docs)))
        self.add_articles(to_add_docs)
        log.info("Refreshing")
        self.refresh()

    def _count(self, body):
        """Raw version of count, directly passing the given query to elastic
        while setting the index and doc_type"""
        return self.es.count(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE,
                             body=body)

    def count(self, query=None, filters=None):
        """
        Compute the number of items matching the given query / filter
        """
        filters = dict(build_body(query, filters, query_as_filter=True))
        body = {"query": {"constant_score": filters}}
        return self._count(body)["count"]

    def search_aggregate(self, aggregation, query=None, filters=None, **options):
        """
        Run an aggregate search query and return the aggregation results

        @param aggregation: raw elastic aggregation, e.g. {"terms": {"field": "medium"}}
        """
        body = dict(query={"filtered": dict(build_body(query, filters,
                                                       query_as_filter=True))},
                    aggregations={"aggregation": aggregation})
        result = self.search(body, size=0, **options)
        return result['aggregations']['aggregation']

    def _parse_terms_aggregate(self, aggregate, group_by, terms, sets):
        if not group_by:
            for term in terms:
                yield term, aggregate[term.label]['doc_count']
        else:
            for term in terms:
                yield term, self._parse_aggregate(aggregate[term.label],
                                                  list(group_by), terms, sets)

    def _parse_other_aggregate(self, aggregate, group_by, group, terms, sets):
        buckets = aggregate[group]["buckets"]
        if not group_by:
            return ((b['key'], b['doc_count']) for b in buckets)
        return ((b['key'], self._parse_aggregate(b, list(group_by), terms, sets))
                for b in buckets)

    def _parse_aggregate(self, aggregate, group_by, terms, sets):
        """Parse an aggregation result into (nested) namedtuples."""
        group = group_by.pop(0)
        if group == "terms":
            result = self._parse_terms_aggregate(aggregate, group_by, terms, sets)
        else:
            result = self._parse_other_aggregate(aggregate, group_by, group, terms, sets)

        if group == "sets" and sets is not None:
            # Filter sets if 'sets' is given
            result = ((aset_id, res) for aset_id, res in result if aset_id in set(sets))
        elif group == "date":
            # Parse timestamps as datetime objects
            result = ((get_date(stamp), aggr) for stamp, aggr in result)

        # Return results as namedtuples
        ntuple = namedtuple("Aggr", [safe_identifier(group),
                                     "buckets" if group_by else "count"])
        return [ntuple(*r) for r in result]

    def _build_aggregate(self, group_by, date_interval, terms, sets):
        """Build a nested aggregation query for a list of groups"""
        group = group_by.pop(0)

        if group == 'date':
            aggregation = {
                group: {
                    'date_histogram': {
                        'field': group,
                        'interval': date_interval,
                        "min_doc_count": 1
                    }
                }
            }
        elif group == 'terms':
            aggregation = {
                term.label: {
                    'filter': dict(build_body(term.query))['query']
                } for term in terms
            }
        else:
            aggregation = {
                group: {
                    'terms': {
                        # The default size is too small; we want to return all results
                        'size': 999999,
                        'field': group
                    }
                }
            }

        # We need to nest the other aggregations, see:
        # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-aggregations.html
        if group_by:
            nested = self._build_aggregate(group_by, date_interval, terms, sets)
            for aggr in aggregation.values():
                aggr["aggregations"] = nested

        return aggregation

    def aggregate_query(self, query=None, filters=None, group_by=None, terms=None,
                        sets=None, date_interval='month'):
        """
        Compute an aggregate query, e.g. select count(*) where <filters> group by <group_by>.
        If date is used as a group_by variable, uses date_interval to bin it.
        It does support multiple values for group_by.

        You can group_by on terms by supplying "terms" to group_by. In addition,
        you will need to supply terms as a parameter, which consists of a list of
        SearchQuery's. Query is then used as a global filter, while terms are 'local'.

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @type group_by: list / tuple
        """
        if isinstance(group_by, str):
            log.warning("Passing strings to aggregate_query(group_by) is deprecated.")
            group_by = [group_by]

        if "terms" in group_by and terms is None:
            raise ValueError("You should pass a list of terms if aggregating on it.")

        filters = dict(build_body(query, filters, query_as_filter=True))
        aggregations = self._build_aggregate(list(group_by), date_interval, terms, sets)

        body = {
            "query": {"constant_score": filters},
            "aggregations": aggregations
        }

        log.debug("es.search(body={body})".format(**locals()))
        result = self.search(body)
        result = self._parse_aggregate(result["aggregations"], list(group_by), terms, sets)
        return result

    def statistics(self, query=None, filters=None):
        """Compute and return a Result object with n, start_date and end_date
        for the selection"""
        body = {
            "query": {
                "constant_score": dict(build_body(query, filters, query_as_filter=True))
            },
            'aggregations': {
                'stats': {
                    'stats': {'field': 'date'}
                }
            }
        }
        stats = self.search(body, size=0)['aggregations']['stats']
        result = Result()
        result.n = stats['count']
        if result.n == 0:
            result.start_date, result.end_date = None, None
        else:
            result.start_date = get_date(stats['min'])
            result.end_date = get_date(stats['max'])
        return result

    def list_dates(self, query=None, filters=None, interval="day"):
        from amcat.tools.aggregate_es import aggregate, IntervalCategory
        for date, count in aggregate(query, filters, [IntervalCategory(interval)], es=self):
            yield date

    def in_index(self, ids):
        """
        Check whether the given ids are already indexed.

        @return: a sequence of ids that are in the index
        """
        if not isinstance(ids, list):
            ids = list(ids)
        log.info("Checking existence of {nids} documents".format(nids=len(ids)))
        if not ids:
            return
        for batch in splitlist(ids, itemsperbatch=10000):
            result = self.es.mget(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE,
                                  body={"ids": batch}, _source=[])
            for doc in result['docs']:
                if doc['found']:
                    yield int(doc['_id'])

    def duplicate_exists(self, article):
        """
        Check whether a duplicate of the given article already exists.
        If so, return the sets that the duplicate is a member of.

        Duplication is checked using the get_hash function, so article should
        be an object with the appropriate attributes (.title etc).

        @return: A (possibly empty) sequence of results with .id and .sets
        """
        hash = get_article_dict(article).hash
        return self.query(filters={'hashes': hash}, _source=["sets"], score=False)

    def _get_purge_actions(self, query):
        for id in self.query_ids(body=query):
            yield {
                "_op_type": "delete",
                "_id": id,
                "_index": self.index,
                "_type": settings.ES_ARTICLE_DOCTYPE
            }

    def purge_orphans(self):
        """Remove all articles without a set from the index"""
        query = {"query": {"bool": {"must_not": {"exists": {"field": "sets"}}}}}
        return bulk(self.es, self._get_purge_actions(query))

    def get_child_type_counts(self, **filters):
        """Get the number of child documents per type"""
        filters = dict(build_body(filters=filters))
        filter = {"has_parent": {"parent_type": self.doc_type,
                                 "filter": filters['filter']}}
        aggs = {"module": {"terms": {"field": "_type"}}}
        body = {"aggs": {"prep": {"filter": filter, "aggs": aggs}}}
        r = self.es.search(index=self.index, size=0, body=body)
        for b in r['aggregations']['prep']['module']['buckets']:
            yield b['key'], b['doc_count']

    def get_articles_without_child(self, child_doctype, limit=None, **filters):
        """Return the ids of all articles without a child of the given doctype"""
        nochild = {"not": {"has_child": {"type": child_doctype,
                                         "query": {"match_all": {}}}}}
        filter = dict(build_body(filters=filters))['filter']
        body = {"filter": {"bool": {"must": [filter, nochild]}}}
        return self.query_ids(body=body, limit=limit)
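A usage sketch for the class above. The index name, host, and the helper modules it depends on (settings, build_body, and friends) come from the surrounding project, so treat the values here as placeholder assumptions:

# Hypothetical wiring of _ES; 'amcat', localhost:9200, and set id 12345 are placeholders.
es = _ES(index='amcat', doc_type='article', host='localhost', port=9200)
es.check_index()  # creates the index if it is missing

# ids of articles matching a lucene query within one article set
ids = list(es.query_ids(query='piet AND (ja* OR klaas)', filters={'sets': [12345]}))

# monthly counts for the same selection
aggr = es.aggregate_query(query='piet', filters={'sets': [12345]},
                          group_by=['date'], date_interval='month')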
import collections
import os

import nltk
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError
from tqdm import tqdm
from cluseter import termgraph

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

articles_path = '../crawler/articles'
articles = []
titles = {}
abstracts = {}
dictionary = {}
stop_words = set(nltk.corpus.stopwords.words('english'))

for article_name in os.listdir(articles_path):
    try:
        article = es.get_source(index="rg", doc_type="article",
                                id=int(article_name.split(".")[0]))
        articles.append(article)
        abstracts[article['id']] = collections.Counter(
            x for x in nltk.word_tokenize(article.get('abstract').lower())
            if x not in stop_words)
        titles[article.get('id')] = collections.Counter(
            x for x in nltk.word_tokenize(article.get('title').lower())
            if x not in stop_words)
    except NotFoundError:
        pass

# Track each term's maximum per-document count across all abstracts.
for doc in abstracts.values():
    for t, v in doc.items():  # iteritems() is Python 2 only
        dictionary[t] = max(dictionary.get(t, 0), v)  # default 0 avoids comparing None
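With `dictionary` holding each term's corpus-wide maximum count, a simple normalized term weight can be derived. A sketch (the weighting scheme and the 0.5 smoothing are illustrative assumptions, not part of the original script):

# Sketch: weight each term in one abstract by its count normalized by that
# term's corpus-wide maximum count from `dictionary` above.
def term_weight(term, doc_counter):
    return 0.5 + 0.5 * doc_counter[term] / dictionary[term]

some_id = next(iter(abstracts))
weights = {t: term_weight(t, abstracts[some_id]) for t in abstracts[some_id]}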
class OmgAnalyzer(object):
    def __init__(self, config):
        """Initialization.

        Sets the database connection's wait_timeout to one hour.
        """
        self.logger = logging.getLogger('omg.analyzer')
        self.config = config
        self.interval = config.getint('default', 'check.interval.second')
        self.running = True
        self.analyze_interval = config.getint('default', 'analyze.interval.second')
        self.cleaner_on = config.get('default', 'thread_analyzed_creative_cleaner.enable')
        self.collector_on = config.get('default', 'thread_creative_collector.enable')
        self.es_index = config.get('elasticsearch', 'index')
        self.es_type = config.get('elasticsearch', 'type')
        self.es_analyzer = config.get('elasticsearch', 'analyzer')
        self.es_timeout = config.getint('elasticsearch', 'timeout')
        self.es_hosts = self.config.get('elasticsearch', 'hosts').strip().split(",")
        self.imageteller_host = self.config.get('server', 'imageteller.host')
        self.imageteller_port = self.config.getint('server', 'imageteller.port')
        self.imageteller_transport = None
        self.imageteller_client = None
        django.db.connection.cursor().execute('set wait_timeout=3600')

    def initImagetellerClient(self, host, port):
        transport = TSocket.TSocket(host, port)
        transport = TTransport.TFramedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        self.imageteller_client = OmgService.Client(protocol)
        transport.open()
        self.imageteller_transport = transport

    def initClient(self):
        self.initImagetellerClient(self.imageteller_host, self.imageteller_port)
        self.es_client = Elasticsearch(hosts=self.es_hosts)

    def closeClient(self):
        if self.imageteller_transport is not None:
            try:
                self.imageteller_transport.close()
                self.imageteller_transport = None
            except:
                pass

    def run(self):
        self.runbackthread()
        self.runAnalyzer()

    def runbackthread(self):
        # Collector thread
        if self.collector_on == "on":
            self.collector = OmgThreadCollector(self.config)
            self.collector.daemon = True
            self.collector.start()
        # Cleaner thread
        if self.cleaner_on == "on":
            self.cleaner = OmgThreadCleaner()
            self.cleaner.daemon = True
            self.cleaner.start()

    def runAnalyzer(self):
        while self.running:
            try:
                # Note: short-lived connections are used here so that a server
                # restart cannot leave this service broken for a long time.
                # On a server-side failure, this round of checks is skipped
                # and retried after the sleep.
                self.initClient()
                django.db.close_old_connections()
                self.analyze()
            except:
                self.logger.exception('analyzer error')
            finally:
                self.closeClient()
            if self.running:
                time.sleep(self.interval)

    def analyze(self):
        """Monitoring.

        Collects all state changes since the last check and sends alerts.
        """
        self.logger.info('>>> begin analyze image <<<')
        creatives = ZeusOmg.objects.filter(translated=0)[:10]
        self.logger.info('get %d creative to analyze', len(creatives))
        for creative in creatives:
            tagstr = ''
            try:
                # Fetch tags and descriptions from the image recognition service
                imageData = ImageData()
                imageData.image_url = creative.image_url
                imageAnalyzeResult = self.imageteller_client.analyzeImage(
                    ImageDataType.IDT_URL, imageData, ImageAnalyzeLanguage.IAL_EN)
                self.logger.debug("request [%s] and imageteller_client return %s",
                                  creative.image_url, imageAnalyzeResult)
                if len(imageAnalyzeResult.tags) == 0 and len(imageAnalyzeResult.descriptions) == 0:
                    creative.translated = 2
                    creative.save()
                    continue
                # Build the tag string from the results
                for imageTag in imageAnalyzeResult.tags:
                    tagstr += imageTag.tag + ' '
                for description in imageAnalyzeResult.descriptions:
                    tagstr += description + '. '
                # Tokenize the tags through the ES analyze API
                analyzeRes = self.es_client.indices.analyze(index=self.es_index,
                                                            analyzer=self.es_analyzer,
                                                            text=tagstr)
                tags = set()
                for token in analyzeRes['tokens']:
                    tags.add(token['token'])
                # If the same creative text already exists, merge in the new tags
                sourceRes = None
                try:
                    sourceRes = self.es_client.get_source(
                        index=self.es_index, doc_type=self.es_type,
                        id=hashlib.md5(creative.creative_text.encode('utf-8')).hexdigest())
                except NotFoundError:
                    sourceRes = None
                # A document with the same hash was found
                if sourceRes is not None:
                    if sourceRes['mesg'] == creative.creative_text:
                        # Different texts could in theory produce the same hash.
                        # If the stored text really differed, we would drop the
                        # previous text and tags and keep the latest; since the
                        # string comparison here confirms the texts match, the
                        # previous tags are merged in before updating.
                        for tag in sourceRes['tags'].split():
                            tags.add(tag)
                    else:
                        # Same hash but different strings: a collision, so the
                        # previous record is discarded and the later one kept.
                        self.logger.warning("[%s] and [%s] have same md5, keep later",
                                            sourceRes['mesg'], creative.creative_text)
                tagstr = ' '.join(tags)
                # Update the record in ES
                self.es_client.index(index=self.es_index, doc_type=self.es_type,
                                     body={"tags": tagstr, "mesg": creative.creative_text},
                                     id=hashlib.md5(creative.creative_text.encode('utf-8')).hexdigest(),
                                     request_timeout=self.es_timeout)
            except:
                self.logger.error('id:[%d], creative_id:[%d], text:[%s], image_url:[%s], tags:[%s]',
                                  creative.id, creative.creative_id, creative.creative_text,
                                  creative.image_url, tagstr)
                self.logger.exception('analyze creative error')
                continue
            self.logger.debug('id:[%d], creative_id:[%d], text:[%s], image_url:[%s], tags:[%s]',
                              creative.id, creative.creative_id, creative.creative_text,
                              creative.image_url, tagstr)
            # Reaching this point means the image was analyzed and its tags and
            # text were saved to ES, so mark this record as processed
            creative.translated = 1
            creative.save()
            time.sleep(self.analyze_interval)

    def stop(self):
        self.running = False
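The merge-on-hash-collision logic above is the core idea; isolated from the Thrift and Django plumbing, it reduces to three ES calls. A self-contained sketch (the index, field, and function names follow the snippet, but the standalone function itself is an assumption):

import hashlib

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError

# Sketch of the tag-merge flow: analyze text into tokens, merge with any tags
# already stored under the same md5 key, then index the result.
def merge_and_index(es_client, index, doc_type, analyzer, creative_text, raw_tags):
    tokens = es_client.indices.analyze(index=index, analyzer=analyzer, text=raw_tags)
    tags = {t['token'] for t in tokens['tokens']}
    key = hashlib.md5(creative_text.encode('utf-8')).hexdigest()
    try:
        prev = es_client.get_source(index=index, doc_type=doc_type, id=key)
        if prev['mesg'] == creative_text:  # same text: merge the older tags
            tags.update(prev['tags'].split())
    except NotFoundError:
        pass
    es_client.index(index=index, doc_type=doc_type, id=key,
                    body={'tags': ' '.join(tags), 'mesg': creative_text})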
class IndexToolManager: ''' A class used to manage the database indexation tools used in this research. Provides functions to index a database with ArangoDB, Elasticsearch and Zettair, using the BM25 IR function implemented in each of those tools. Also makes it possible to query the indexed database using BM25. Attributes ---------- indexName : str a string to refer to the current working data set bm25_b : float BM25 b parameter to adjust the document length compensation bm25_k1 : float BM25 k1 parameter to adjust the term-frequency weight bm25_k3 : float BM25 k3 parameter to adjust the term-frequency weight in the query (used for long queries) top_k : int Number of results to be retrieved when querying the database Methods ------- initializeArango() Initializes ArangoDB, connect to a client, creates/connects to collection and view. ''' def __init__(self, indexName='default_index', bm25_b=0.75, bm25_k1=1.2, bm25_k3=0.0, top_k=100): self.indexName = indexName self.bm25_b = float(bm25_b) self.bm25_k1 = float(bm25_k1) self.bm25_k3 = float(bm25_k3) self.numberResults = int(top_k) self.root_path = "/home/ruan/Documentos/git/tcc-ii-ir-features-text-mining/tool-testing/" self.zettair_query_process = None self.initializeArango() self.initializeElastic() self.resultsIndexName = 'tcc_results' body = { "settings": { "number_of_shards": 1, } } if not self.elasticClient.indices.exists(index=self.resultsIndexName): self.elasticClient.indices.create(index=self.resultsIndexName, body=body) # Create a new database named "test" if it does not exist. if not self.arango_sys_db.has_database(self.resultsIndexName): self.arango_sys_db.create_database(self.resultsIndexName) # Connect to "test" database as root user. # This returns an API wrapper for "test" database. self.arangoResultsDb = self.arangoClient.db(self.resultsIndexName, username=None, password=None) db = self.arangoResultsDb # Create a new collection named "students" if it does not exist. # This returns an API wrapper for "students" collection. if db.has_collection(self.resultsIndexName): self.arangoResultsCollection = db.collection(self.resultsIndexName) else: self.arangoResultsCollection = db.create_collection( self.resultsIndexName) def get_parameters(self): parameters = { 'indexName': str(self.indexName), 'bm25_b': str(self.bm25_b), 'bm25_k1': str(self.bm25_k1), 'bm25_k3': str(self.bm25_k3), 'top_k': str(self.numberResults), } return parameters def clean_current(self): self.delete_all([str(self.indexName)]) def clean_default(self): default_list = [] for item in default_db_names: default_list.append(str(item)) default_list.append(str(item) + '_bulk') self.delete_all(default_list) def delete_all(self, index_list): ''' Deletes the databases/indexes from all tools. Parameters ---------- index_list : list String list of the database/indexes names. ''' self.arango_delete(index_list) self.elastic_delete(index_list) def log_result(self, itemKey, itemBody): ''' Inserts a document in the Elasticsearch database. Parameters ---------- itemKey : str or number Document identifier itemBody : dict Document body/data. ''' self.elasticClient.index(index=self.resultsIndexName, doc_type=self.elasticDocumentType, id=itemKey, body=itemBody) document = {'_key': itemKey} document.update(itemBody) self.arangoResultsCollection.insert(document) def get_text_from_child(self, tag): ''' Recursive function to get full text from XML elements with tags. 
Parameters ---------- tag : XML ElementTree element Element ''' text = ' ' if tag.text is not None: text = str(text + tag.text) count = 0 for child in tag: count = count + 1 text = str(text + self.get_text_from_child(child)) return text def get_documents(self, db='authorprof', documents_xml_folder='db_authorprof/en/', truth_txt='db_authorprof/truth.txt', append_class_to_id=False): ''' Generates a list with all documents from db formatted files. Parameters ---------- db : str Database name. documents_xml_folder : str Folder that contains the XML files from the authors' documents (twits), must follow the DB_AUTHORPROF task XML format. truth_txt : str Truth TXT file with authors' classifications of gender { female | male }, must follow the DB_AUTHORPROF task TXT format. ''' if (db == 'authorprof'): return self.get_documents_DB_AUTHORPROF(documents_xml_folder, truth_txt, append_class_to_id) if (db == 'botgender'): return self.get_documents_DB_BOTGENDER(documents_xml_folder, truth_txt, append_class_to_id) if (db == 'hyperpartisan'): return self.get_documents_DB_HYPERPARTISAN(documents_xml_folder, truth_txt, append_class_to_id) if (db == 'hyperpartisan_split_42'): return self.get_documents_DB_HYPERPARTISAN_split( documents_xml_folder, truth_txt, append_class_to_id) return [] def get_documents_DB_AUTHORPROF(self, documents_xml_folder='db_authorprof/en/', truth_txt='db_authorprof/truth.txt', append_class_to_id=False): ''' Generates a list with all documents from DB_AUTHORPROF formatted files. Parameters ---------- documents_xml_folder : str Folder that contains the XML files from the authors' documents (twits), must follow the DB_AUTHORPROF task XML format. truth_txt : str Truth TXT file with authors' classifications of gender { female | male }, must follow the DB_AUTHORPROF task TXT format. ''' documents = [] lines = [] separator = ':::' # Open the truth file with open(truth_txt) as f: lines = f.read().splitlines() # Iterates over the lines, and reads each author's XML file adding the documents to the list for line in lines: author_id, gender = line.split(separator) author_xml = documents_xml_folder + author_id + '.xml' # Open the author XML file tree_author = ET.parse(str(author_xml), parser=ET.XMLParser(encoding="utf-8")) root_author = tree_author.getroot() number = 1 for child in root_author[0]: document = { 'id': str(author_id + '-' + str(number)), 'author_id': str(author_id), 'gender': str(gender), 'class': str(gender), 'text': child.text } if append_class_to_id: document['id'] += str(':' + str(document['class'])) number = number + 1 documents.append(document) return documents def get_documents_DB_BOTGENDER(self, documents_xml_folder='db_botgender/en/', truth_txt='db_authorprof/truth.txt', append_class_to_id=False): ''' Generates a list with all documents from DB_BOTGENDER formatted files. Parameters ---------- documents_xml_folder : str Folder that contains the XML files from the authors' documents (twits), must follow the DB_BOTGENDER task XML format. truth_txt : str Truth TXT file with authors' classifications of kind {bot | human} and gender { bot | female | male }, must follow the DB_BOTGENDER task TXT format. 
''' documents = [] lines = [] separator = ':::' # Open the truth file with open(truth_txt) as f: lines = f.read().splitlines() # Iterates over the lines, and reads each author's XML file adding the documents to the list for line in lines: author_id, kind, gender = line.split(separator) author_xml = documents_xml_folder + author_id + '.xml' # Open the author XML file tree_author = ET.parse(str(author_xml), parser=ET.XMLParser(encoding="utf-8")) root_author = tree_author.getroot() number = 1 for child in root_author[0]: document = { 'id': str(author_id + '-' + str(number)), 'author_id': str(author_id), 'kind': str(kind), 'gender': str(gender), 'text': child.text, 'class': str(kind), } if append_class_to_id: document['id'] += str(':' + str(document['class'])) number = number + 1 documents.append(document) return documents def get_documents_DB_HYPERPARTISAN( self, articles_xml='db_hyperpartisan/articles.xml', ground_truth_xml='db_hyperpartisan/ground_truth.xml', append_class_to_id=False): ''' Generates a list with all documents from DB_HYPERPARTISAN formatted files. Parameters ---------- articles_xml : str Articles XML file name, the file must have articles surrounded by <article> tags, must follow the DB_HYPERPARTISAN task XML format. ground_truth_xml : str Articles ground truth XML file with articles surrounded by <article> tags, must follow the DB_HYPERPARTISAN task XML format. ''' documents = [] # Openning the XML files tree_articles = ET.parse(str(articles_xml), parser=ET.XMLParser(encoding="utf-8")) root_articles = tree_articles.getroot() tree_ground_truth = ET.parse(str(ground_truth_xml)) root_ground_truth = tree_ground_truth.getroot() for a_child, g_child in zip(root_articles, root_ground_truth): document = { **a_child.attrib, **g_child.attrib, 'text': str(self.get_text_from_child(a_child)), 'class': str(g_child.get('hyperpartisan')), } if append_class_to_id: document['id'] += str(':' + str(document['class'])) documents.append(document) return documents def get_documents_DB_HYPERPARTISAN_split( self, articles_xml='db_hyperpartisan/articles.xml', ground_truth_xml='db_hyperpartisan/ground_truth.xml', append_class_to_id=False): ''' Generates a list with all documents from DB_HYPERPARTISAN formatted files. Parameters ---------- articles_xml : str Articles XML file name, the file must have articles surrounded by <article> tags, must follow the DB_HYPERPARTISAN task XML format. ground_truth_xml : str Articles ground truth XML file with articles surrounded by <article> tags, must follow the DB_HYPERPARTISAN task XML format. ''' df = pd.read_csv('db_hyperpartisan/train_set.csv', dtype=str) documents = [] # Openning the XML files tree_articles = ET.parse(str(articles_xml), parser=ET.XMLParser(encoding="utf-8")) root_articles = tree_articles.getroot() tree_ground_truth = ET.parse(str(ground_truth_xml)) root_ground_truth = tree_ground_truth.getroot() for a_child, g_child in zip(root_articles, root_ground_truth): document = { **a_child.attrib, **g_child.attrib, 'text': str(self.get_text_from_child(a_child)), 'class': str(g_child.get('hyperpartisan')), } if (df['0'].str.contains(document['id']).any()): if append_class_to_id: document['id'] += str(':' + str(document['class'])) documents.append(document) return documents def calc_IR(self, result_df, positive_class='true'): ''' Calculates IR attributes suggested in the research: CLASS_0_BM25_AVG CLASS_0_BM25_COUNT CLASS_0_BM25_SUM CLASS_1_BM25_AVG CLASS_1_BM25_COUNT CLASS_1_BM25_SUM and returns them as a dictionary. 
    def calc_IR(self, result_df, positive_class='true'):
        '''
        Calculates the IR attributes suggested in the research:
        CLASS_0_BM25_AVG, CLASS_0_BM25_COUNT, CLASS_0_BM25_SUM,
        CLASS_1_BM25_AVG, CLASS_1_BM25_COUNT, CLASS_1_BM25_SUM,
        and returns them as a dictionary.

        Parameters
        ----------
        result_df : DataFrame
            A query result dataframe produced by the query methods.
            Must have the columns:
                * score
                * class
        positive_class : str
            Specifies which 'class' value is the positive class.
        '''
        df = result_df.copy()
        CLASS_0 = df.loc[df['class'] != positive_class]['score']
        CLASS_1 = df.loc[df['class'] == positive_class]['score']
        attrib_IR = {
            'CLASS_0_BM25_AVG':
                0 if math.isnan(CLASS_0.mean()) else CLASS_0.mean(),
            'CLASS_0_BM25_COUNT': CLASS_0.count(),
            'CLASS_0_BM25_SUM': CLASS_0.sum(),
            'CLASS_1_BM25_AVG':
                0 if math.isnan(CLASS_1.mean()) else CLASS_1.mean(),
            'CLASS_1_BM25_COUNT': CLASS_1.count(),
            'CLASS_1_BM25_SUM': CLASS_1.sum(),
        }
        return attrib_IR

    def initializeArango(self):
        '''
        Initializes ArangoDB with the specific parameters used by the
        repository and sets it up for the research, creating the collection
        and view needed.
        '''
        # Initialize the ArangoDB client.
        self.arangoClient = ArangoClient(hosts='http://localhost:8529')
        # Connect to the "_system" database as root.
        # This returns an API wrapper for the "_system" database.
        self.arango_sys_db = self.arangoClient.db('_system',
                                                  username=None,
                                                  password=None)
        index_name = self.indexName
        # Create a database named after the index if it does not exist.
        if not self.arango_sys_db.has_database(index_name):
            self.arango_sys_db.create_database(index_name)
        # Connect to that database; this returns an API wrapper for it.
        self.arangoDb = self.arangoClient.db(index_name,
                                             username=None,
                                             password=None)
        db = self.arangoDb
        # Create (or reuse) the collection named after the index.
        if db.has_collection(index_name):
            self.arangoCollection = db.collection(index_name)
        else:
            self.arangoCollection = db.create_collection(index_name)
        # Retrieve the list of views.
        view_list = db.views()
        # Create the view used by the analyzer to search and score with BM25.
        self.arangoViewName = str('v_' + index_name)
        if not view_list:
            db.create_view(name=self.arangoViewName,
                           view_type='arangosearch',
                           properties={
                               'cleanupIntervalStep': 0,
                               'consolidationIntervalMsec': 0,
                               'writebufferSizeMax': 0,
                               'links': {
                                   index_name: {
                                       'analyzers': ['text_en'],
                                       'includeAllFields': True,
                                       'storeValues': 'id'
                                   }
                               }
                           })
        # Configure the AQL query cache properties.
        db.aql.cache.configure(mode='off', max_results=100000)

    def arango_delete(self, databases):
        '''
        Deletes the given databases from ArangoDB.

        Parameters
        ----------
        databases : list
            String list of the database names.
        '''
        for db in databases:
            # Delete the database if it exists.
            if self.arango_sys_db.has_database(str(db)):
                self.arango_sys_db.delete_database(str(db))

    def insertArango(self, itemKey, itemBody):
        '''
        Inserts a document into the ArangoDB 'indexName' collection.

        Parameters
        ----------
        itemKey : str or number
            Document identifier.
        itemBody : dict
            Document body/data.
        '''
        document = {'_key': itemKey}
        document.update(itemBody)
        self.arangoCollection.insert(document)

    def insertDocumentArango(self, document):
        '''
        Inserts a document into the ArangoDB 'indexName' collection.

        Parameters
        ----------
        document : dict
            Document to be inserted; may contain a '_key' or '_id' value,
            e.g. '_key': 'document1' or '_id': 'collection_name/document1'.
        '''
        self.arangoCollection.insert(document)

    def bulkListGeneratorArango(self, bulkItems):
        '''
        Generates bulk documents ready to import into the ArangoDB
        collection.

        Parameters
        ----------
        bulkItems : list
            Bulk items to be processed; each must contain an 'id' field.
        '''
        documentList = []
        tempdict = bulkItems.copy()
        for item in tempdict:
            document = {'_key': item.pop('id'), **item}
            documentList.append(document)
        return documentList
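
    # Worked example (as comments) for calc_IR above; the scores and ids are
    # made up for illustration:
    #
    #   result_df = pd.DataFrame([[7.2, 'd1', 'true'],
    #                             [5.1, 'd2', 'false'],
    #                             [4.0, 'd3', 'true']],
    #                            columns=['score', 'id', 'class'])
    #   repo.calc_IR(result_df, positive_class='true')
    #   # -> {'CLASS_0_BM25_AVG': 5.1, 'CLASS_0_BM25_COUNT': 1,
    #   #     'CLASS_0_BM25_SUM': 5.1, 'CLASS_1_BM25_AVG': 5.6,
    #   #     'CLASS_1_BM25_COUNT': 2, 'CLASS_1_BM25_SUM': 11.2}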
    def bulkImportArango(self, documentList):
        '''
        Bulk import into the ArangoDB collection.

        Parameters
        ----------
        documentList : list of dicts
            List of documents to be inserted into the ArangoDB collection.
            Every document must have a '_key' field, e.g.:
            [{'_key': 'document1', 'field1': 'value1', 'field2': 'value2'},
             {'_key': 'document2', 'field1': 'value4', 'field2': 'value5'}]
        '''
        self.arangoCollection.import_bulk(documentList)

    def arango_query(self, query, ignore_first_result=False):
        '''
        Queries the ArangoDB view and returns a Pandas DataFrame with the
        results.

        Parameters
        ----------
        query : str
            Text to be queried against the view using the BM25 analyzer.
        ignore_first_result : bool
            If True, drops the top result (useful when the query document
            itself is indexed).
        '''
        escaped_query = str(query).replace('\\', '')
        escaped_query = str(escaped_query).replace("'", "\\'")
        nResults = int(self.numberResults)
        if ignore_first_result:
            nResults += 1
        aqlquery = (f"FOR d IN {str(self.arangoViewName)} SEARCH " +
                    f"ANALYZER(d.text IN TOKENS('{escaped_query}'" +
                    f", 'text_en'), 'text_en') " +
                    f"SORT BM25(d, {self.bm25_k1}, {self.bm25_b}) " +
                    f"DESC LIMIT {nResults} " +
                    f"LET sco = BM25(d, {self.bm25_k1}, " +
                    f"{self.bm25_b}) RETURN {{ doc: d, score: sco }}")
        cursor = self.arangoDb.aql.execute(query=aqlquery,
                                           count=True,
                                           batch_size=self.numberResults,
                                           optimizer_rules=['+all'],
                                           cache=True)
        item_list = []
        for item in cursor.batch():
            item_list.append([
                item['score'],
                item['doc']['_id'].split('/')[-1],
                item['doc']['class']
            ])
        if ignore_first_result and (len(item_list) > 0):
            item_list.pop(0)
        return pd.DataFrame(item_list, columns=['score', 'id', 'class'])

    def arango_get_document(self, key):
        '''
        Gets a document from the ArangoDB database and returns it.

        Parameters
        ----------
        key : str
            Document key.
        '''
        return self.arangoCollection.get(str(key))

    def arango_get_IR_variables(self, query, positive_class='true',
                                ignore_first_result=False):
        '''
        Queries the ArangoDB view and returns a dict with the IR variables.

        Parameters
        ----------
        query : str
            Text to be queried against the view using the BM25 analyzer.
        positive_class : str
            Specifies which 'class' value is the positive class.
        ignore_first_result : bool
            If True, drops the top result.
        '''
        result_df = self.arango_query(query,
                                      ignore_first_result=ignore_first_result)
        return self.calc_IR(result_df=result_df,
                            positive_class=positive_class)

    def initializeElastic(self):
        '''
        Initializes Elasticsearch with the specific parameters used by the
        repository, setting it up for the research.
        '''
        # Initialize the Elasticsearch client.
        self.elasticClient = Elasticsearch(hosts='http://localhost:9200')
        self.elasticDocumentType = '_doc'
        body = {
            "settings": {
                "number_of_shards": 1,
                "index": {
                    "similarity": {
                        "default": {
                            "type": "BM25",
                            "b": self.bm25_b,
                            "k1": self.bm25_k1
                        }
                    }
                }
            }
        }
        if not self.elasticClient.indices.exists(index=self.indexName):
            self.elasticClient.indices.create(index=self.indexName,
                                              body=body)

    def elastic_delete(self, indices):
        '''
        Deletes complete indices from Elasticsearch.

        Parameters
        ----------
        indices : list
            String list of the index names.
        '''
        for index in indices:
            # Delete the index if it exists.
            if self.elasticClient.indices.exists(index=str(index)):
                self.elasticClient.indices.delete(index=str(index))

    def insertElastic(self, itemKey, itemBody):
        '''
        Inserts a document into the Elasticsearch index.

        Parameters
        ----------
        itemKey : str or number
            Document identifier.
        itemBody : dict
            Document body/data.
        '''
        self.elasticClient.index(index=self.indexName,
                                 doc_type=self.elasticDocumentType,
                                 id=itemKey,
                                 body=itemBody)
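
    # Commented usage sketch for the single-document path above ('repo' is a
    # placeholder instance):
    #
    #   repo.insertElastic('doc1', {'text': 'some text', 'class': 'true'})
    #   repo.elasticClient.get_source(index=repo.indexName, id='doc1')
    #   # -> {'text': 'some text', 'class': 'true'}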
    def bulkInsertGeneratorElastic(self, bulkItems):
        '''
        Generates a bulk body of Elasticsearch index operations.

        Parameters
        ----------
        bulkItems : list
            Bulk items to be processed; each must contain an 'id' field.
        '''
        bulkBody = []
        tempdict = bulkItems.copy()
        for item in tempdict:
            action = [{
                'index': {
                    '_index': self.indexName,
                    '_id': item.pop('id')
                }
            }, item]
            bulkBody.extend(action)
        return bulkBody

    def bulkHelperInsertGeneratorElastic(self, bulkItems):
        '''
        Generates actions for the Elasticsearch bulk helper.

        Parameters
        ----------
        bulkItems : list
            Bulk items to be processed; each must contain an 'id' field.
        '''
        for item in bulkItems:
            item['_index'] = self.indexName
            item['_id'] = item.pop('id')
            item['_type'] = self.elasticDocumentType
        return bulkItems

    def bulkElastic(self, bulkBody):
        '''
        Executes bulk Elasticsearch operations.

        Parameters
        ----------
        bulkBody : list, or str with operations separated by newlines ('\n')
            Bulk operations to be executed, already in the format and order
            in which they will run. All operations must have an '_id' in
            their metadata field, e.g. an index operation on 'index_name':
            [{'index': {'_index': 'index_name', '_id': 'document_id'}},
             {'field1': 'value1'}]
        '''
        self.elasticClient.bulk(index=self.indexName, body=bulkBody)

    def bulkHelperElastic(self, bulkHelperActions):
        '''
        Executes bulk Elasticsearch operations through the bulk helper.

        Parameters
        ----------
        bulkHelperActions : list
            Actions in the helper format produced by
            bulkHelperInsertGeneratorElastic; every action must have an
            '_id' in its metadata.
        '''
        ElasticsearchHelpers.bulk(client=self.elasticClient,
                                  actions=bulkHelperActions,
                                  index=self.indexName,
                                  chunk_size=500,
                                  max_chunk_bytes=1000 * 1024 * 1024)

    def refreshElastic(self):
        '''
        Refreshes the Elasticsearch index.
        '''
        self.elasticClient.indices.refresh(index=self.indexName)

    def elastic_query(self, query, ignore_first_result=False):
        '''
        Queries the Elasticsearch index and returns a Pandas DataFrame with
        the results.

        Parameters
        ----------
        query : str
            Text to be queried against the index using the BM25 similarity
            implemented by Elasticsearch.
        ignore_first_result : bool
            If True, drops the top result.
        '''
        escaped_query = str(query).replace("'", " ")
        nResults = int(self.numberResults)
        if ignore_first_result:
            nResults += 1
        result = self.elasticClient.search(
            index=self.indexName,
            body={"query": {
                "match": {
                    "text": escaped_query
                }
            }},
            size=nResults)
        hit_list = []
        for hit in result['hits']['hits']:
            hit_list.append(
                [hit['_score'], hit['_id'], hit['_source']['class']])
        if ignore_first_result and (len(hit_list) > 0):
            hit_list.pop(0)
        return pd.DataFrame(hit_list, columns=['score', 'id', 'class'])

    def elastic_get_document(self, id):
        '''
        Gets a document from the Elasticsearch index and returns it.

        Parameters
        ----------
        id : str
            Document id.
        '''
        return self.elasticClient.get_source(index=self.indexName,
                                             id=str(id))
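
    # Commented round-trip sketch for the two bulk paths above ('repo' and
    # the documents are placeholders):
    #
    #   docs = [{'id': 'd1', 'text': 'first text', 'class': 'true'},
    #           {'id': 'd2', 'text': 'second text', 'class': 'false'}]
    #   repo.bulkElastic(repo.bulkInsertGeneratorElastic(docs))
    #   # or, via the helpers module (note both generators pop 'id'):
    #   # repo.bulkHelperElastic(repo.bulkHelperInsertGeneratorElastic(docs))
    #   repo.refreshElastic()  # make the documents searchable
    #   repo.elastic_query('first text')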
    def elastic_get_IR_variables(self, query, positive_class='true',
                                 ignore_first_result=False):
        '''
        Queries the Elasticsearch index and returns a dict with the IR
        variables.

        Parameters
        ----------
        query : str
            Text to be queried against the index using the BM25 similarity
            implemented by Elasticsearch.
        positive_class : str
            Specifies which 'class' value is the positive class.
        ignore_first_result : bool
            If True, drops the top result.
        '''
        result_df = self.elastic_query(
            query, ignore_first_result=ignore_first_result)
        return self.calc_IR(result_df=result_df,
                            positive_class=positive_class)

    def initializeZettair(self):
        '''
        Zettair needs no client-side initialization; indexing and querying
        are done through the 'zet' command-line tool.
        '''
        pass

    def saveToTrecFileZettair(self, bulkItems):
        '''
        Saves the bulk items to a TREC-formatted file named
        '<indexName>.txt', ready to be indexed by Zettair.
        '''
        filename = str(self.indexName) + '.txt'
        with open(filename, 'w+') as f:
            for d in bulkItems:
                f.write(f'<DOC>\n<DOCNO>{d["id"]}</DOCNO>\n'
                        f'{d["text"]}\n</DOC>\n')

    def zettair_index(self):
        '''
        Builds the Zettair index from the TREC file written by
        saveToTrecFileZettair.
        '''
        trecfile = str(self.indexName) + '.txt'
        cmd = f'zet -i -f {self.indexName} -t TREC --big-and-fast {trecfile}'
        res = subprocess.run(cmd,
                             shell=True,
                             universal_newlines=True,
                             check=True,
                             capture_output=True)
        print(res)

    def zettair_query(self, query, interactive=True,
                      ignore_first_result=False):
        '''
        Queries the Zettair index and returns a Pandas DataFrame with the
        results.

        Parameters
        ----------
        query : str
            Text to be queried against the index using the BM25 metric.
        interactive : bool
            If True, keeps a long-running 'zet' process and talks to it over
            stdin/stdout; otherwise spawns one process per query.
        ignore_first_result : bool
            If True, drops the top result.
        '''
        escaped_query = str(query).replace('\\', '')
        escaped_query = str(escaped_query).replace('"', ' ')
        escaped_query = str(escaped_query).replace('`', '\\`')
        nResults = int(self.numberResults)
        if ignore_first_result:
            nResults += 1
        if self.zettair_query_process is None:
            self.zettair_query_process = subprocess.Popen(
                [
                    'zet', '-f', self.root_path + self.indexName,
                    '-n', str(nResults),
                    '--okapi', f'--b={self.bm25_b}',
                    f'--k1={self.bm25_k1}', f'--k3={self.bm25_k3}',
                    '--summary=none', '--big-and-fast'
                ],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        out = ''
        lines = []
        if interactive:
            escaped_query = str(escaped_query).replace("'", " ")
            escaped_query = "'" + escaped_query + "'"
            escaped_query = str(escaped_query).replace('\n', ' ')
            self.zettair_query_process.stdin.write(
                escaped_query.encode('utf-8') + b'\n')
            self.zettair_query_process.stdin.flush()
            # Skip the echoed long line(s), then collect result lines until
            # a blank line or the prompt appears.
            fl = self.zettair_query_process.stdout.readline()
            while len(fl.decode('utf-8').split()) > 7:
                fl = self.zettair_query_process.stdout.readline()
            lines.append(fl.decode('utf-8').split('>')[1])
            while fl != b'\n' and fl != b'> \n':
                fl = self.zettair_query_process.stdout.readline()
                self.zettair_query_process.stdout.flush()
                if self.zettair_query_process.poll() is not None:
                    print('POOL\n', self.zettair_query_process.poll())
                    err = self.zettair_query_process.stderr.readline()
                    if err:
                        print('ERROR\n', err)
                lines.append(fl.decode('utf-8'))
        else:
            escaped_query = '"' + escaped_query + '"'
            cmd = (f'zet -f {self.root_path}{self.indexName} '
                   f'-n {str(nResults)} --okapi '
                   f'--b={self.bm25_b} --k1={self.bm25_k1} '
                   f'--k3={self.bm25_k3} '
                   f'--summary=none --big-and-fast {escaped_query}')
            res = subprocess.run(cmd,
                                 shell=True,
                                 universal_newlines=True,
                                 check=True,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out = res.stdout
            # Process the Zettair query result: keep non-blank lines; the
            # first blank line ends the results (the summary follows it).
            for line in out.splitlines():
                if line:
                    lines.append(line)
                else:
                    break
        # Iterate over the lines, extracting the id, class and score.
        res_list = []
        if len(lines) > 2 and lines[0] != ' ':
            for line in lines:
                line_split = line.split()
                if len(line_split) >= 4:
                    stuff = line_split[1].split(':')
                    cur_id = stuff[0]
                    if len(stuff) > 1:
                        cl = stuff[1]
                    else:
                        # Fall back to Elasticsearch for the document class.
                        cl = self.elastic_get_document(str(cur_id))['class']
                    score = line_split[3].split(',')[0]
                    res_list.append([float(score), cur_id, cl])
        if ignore_first_result and (len(res_list) > 0):
            res_list.pop(0)
        return pd.DataFrame(res_list, columns=['score', 'id', 'class'])

    def zettair_get_IR_variables(self, query, positive_class='true',
                                 interactive=True,
                                 ignore_first_result=False):
        '''
        Queries the Zettair index and returns a dict with the IR variables.

        Parameters
        ----------
        query : str
            Text to be queried against the index using the BM25 metric.
        positive_class : str
            Specifies which 'class' value is the positive class.
        interactive : bool
            Passed through to zettair_query.
        ignore_first_result : bool
            If True, drops the top result.
        '''
        result_df = self.zettair_query(
            query, interactive, ignore_first_result=ignore_first_result)
        return self.calc_IR(result_df=result_df,
                            positive_class=positive_class)

    def zettair_delete(self, index_name):
        '''
        Deletes the Zettair index files for the given index name.
        '''
        pass
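
# End-to-end sketch of the IR-feature extraction flow implemented above,
# written as comments because the repository class name and its constructor
# are not shown in this snippet:
#
#   repo = <Repository>(...)  # hypothetical constructor
#   repo.initializeElastic()
#   docs = repo.get_documents(db='authorprof')
#   repo.bulkElastic(repo.bulkInsertGeneratorElastic(docs))
#   repo.refreshElastic()
#   features = [repo.elastic_get_IR_variables(d['text'],
#                                             positive_class='female',
#                                             ignore_first_result=True)
#               for d in repo.get_documents(db='authorprof')]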
class EsSongci():
    ES_HOST = [
        "http://192.168.1.24:9200/",
        "http://192.168.1.24:9201/",
    ]

    def __init__(self, index_name="songci", index_type="songci_type",
                 hosts=None, transport_class=Transport, **kwargs):
        """
        :param index_name: index name
        :param index_type: index (document) type
        :param hosts: Elasticsearch hosts; defaults to ES_HOST
        :param transport_class: transport class
        :param kwargs: extra Elasticsearch options
        """
        self.index_name = index_name
        self.index_type = index_type
        if hosts is None:
            self.es = Elasticsearch(hosts=self.ES_HOST,
                                    transport_class=transport_class,
                                    **kwargs)
        else:
            self.es = Elasticsearch(hosts=hosts,
                                    transport_class=transport_class,
                                    **kwargs)

    def deleteIndex(self):
        if self.es.indices.exists(index=self.index_name):
            result = self.es.indices.delete(index=self.index_name,
                                            ignore=(400, 404))
            print(result)

    def createIndex(self):
        """
        Creates the mapping:
            auth: author
            title: ci (song lyric) title
            content: ci text
            md5: used as _id
        Creates the index named 'songci' with type 'songci_type'.
        """
        _index_mappings = {
            '_source': {
                'enabled': True
            },
            "properties": {
                "content": {
                    "type": "text",
                    "index": True,
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_max_word"
                },
                # "date": {
                #     "type": "text",
                #     "index": True
                # },
                "auth": {
                    "type": "keyword",
                    "index": False,
                    # To avoid analysis problems, tell Elasticsearch this
                    # field holds an exact value (the pre-5.x equivalent was
                    # "index": "not_analyzed").
                },
                "title": {
                    "type": "text",
                    "index": True,
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_max_word"
                },
                "random": {
                    "type": "integer",
                    "index": False,
                },
            }
        }
        if not self.es.indices.exists(index=self.index_name):
            # res = self.es.indices.create(index=self.index_name,
            #                              body=_index_mappings)
            self.es.indices.create(index=self.index_name, ignore=400)
            res = self.es.indices.put_mapping(index=self.index_name,
                                              doc_type=self.index_type,
                                              body=_index_mappings)
            print(res)

    def bulkIndexData(self):
        """
        Stores batch data into ES using the bulk helper.
        """
        ACTIONS = []
        i = 1
        for md5, title, auth, content in getPoetry():
            action = {
                "_index": self.index_name,
                "_type": self.index_type,
                "_id": md5,  # _id can also be left unset and auto-generated
                "_source": {
                    "title": title,
                    "auth": auth,
                    "content": content,
                    "random": i,
                }
            }
            i += 1
            ACTIONS.append(action)
        success, _ = bulk(self.es, ACTIONS, index=self.index_name,
                          raise_on_error=True)
        print('Performed %d actions' % success)

    def deleteIndexData(self, id):
        '''
        Deletes a single document from the index.
        :param id: document id
        :return:
        '''
        res = self.es.delete(index=self.index_name,
                             doc_type=self.index_type,
                             id=id)
        print(res)

    def updateDataByID(self, id, body=None):
        """
        Updates a document, e.g. with
        body={"doc": {"age": 37, "country": "china"}}.
        """
        res = self.es.update(index=self.index_name,
                             id=id,
                             doc_type=self.index_type,
                             body=body)
        print(res)

    def getDataById(self, id):
        """
        Gets document metadata and source by id.
        """
        res = self.es.get(index=self.index_name,
                          doc_type=self.index_type,
                          id=id)
        pprint.pprint(res)
        # Get only the document source
        res = self.es.get_source(index=self.index_name,
                                 id=id,
                                 doc_type=self.index_type)
        print(res)

    def getDataByBody(self):
        """
        Searches the index with a query body.
        """
        # Alternative queries kept for reference:
        # dsl = {'query': {'match_all': {}}}
        dsl_match = {
            "query": {
                "match": {
                    "auth": "吴文英111"
                }
            }
        }
        dsl_bool = {
            "query": {
                "bool": {
                    "must": {"term": {"auth": "吴文英111"}}
                }
            }
        }
        dsl = {
            "query": {
                "constant_score": {
                    "filter": {
                        "term": {
                            "random": 2
                        }
                    }
                }
            }
        }
        _searched = self.es.search(index=self.index_name, body=dsl)
        pprint.pprint(_searched)
        for hit in _searched['hits']['hits']:
            pass
            # print(hit['_source']['auth'], hit['_source']['content'],
            #       hit['_source']['title'])

    def mget(self, ids):
        """
        Multi-document query.
        """
        res = self.es.mget(index=self.index_name,
                           doc_type=self.index_type,
                           body={'ids': ids})
        pprint.pprint(res)
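
# A minimal driver for EsSongci (an illustrative sketch: it assumes the
# ES_HOST nodes above are reachable and that getPoetry() yields
# (md5, title, auth, content) tuples, as used in bulkIndexData).
if __name__ == '__main__':
    es_songci = EsSongci()
    es_songci.createIndex()    # create the index and mapping if missing
    es_songci.bulkIndexData()  # bulk-load the poems
    es_songci.getDataByBody()  # run the constant_score sample query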
try:
    connection = Elasticsearch(
        config['elasticsearch_hosts'],
        # sniff the cluster state before doing anything
        sniff_on_start=True,
        # refresh the node list after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)
    try:
        out = connection.delete(index=config['default_index'],
                                doc_type='group',
                                id='2')
        data = connection.get_source(index=config['default_index'],
                                     doc_type='group',
                                     id='2')
        print(json.dumps(data, indent=4, sort_keys=True))
    except Exception as e:
        # Expected: fetching the source of the just-deleted document fails.
        pass
except Exception as e:
    print("Failed to delete item")
    print("Test failed")
    traceback.print_exc()
    ret = 1
finally:
    print(test_name + " Test complete")
    sys.exit(ret)
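
# A tighter variant of the inner check (a sketch): catch the specific
# NotFoundError that elasticsearch-py raises for a missing document instead
# of a bare Exception.
#
#   from elasticsearch.exceptions import NotFoundError
#   try:
#       connection.get_source(index=config['default_index'],
#                             doc_type='group', id='2')
#   except NotFoundError:
#       pass  # document was deleted as expected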
class ElasticBookStorage(object):
    def __init__(self):
        self.book_index = ELASTIC_INDEX
        self.book_doc = ELASTIC_DOC
        self.ELK_HOSTNAME = ELASTIC_HOSTNAME
        self.ELK_PORT = ELASTIC_PORT
        self.es = Elasticsearch([{
            'host': self.ELK_HOSTNAME,
            'port': self.ELK_PORT
        }])

    def create_book_index(self):
        """
        The following function is used to create the book index

        :return: pass

        :Examples:
        >>> elk = ElasticBookStorage()
        >>> elk.create_book_index()
        """
        try:
            self.es.indices.create(index=self.book_index, ignore=400)
        except Exception as ex:
            print(ex)

    def bulk_insert(self, data):
        """
        The following function is used to insert bulk data into
        Elasticsearch

        :param data: list of dicts
        :return:

        Example:
        >>> data = [{
        ...     "title": "Solr in Action",
        ...     "authors": ["trey grainger", "timothy potter"],
        ...     "summary": "Comprehensive guide",
        ...     "publish_date": "2015-12-03",
        ...     "num_reviews": 18,
        ...     "publisher": "manning"
        ... }]
        >>> bulk_insert(data)
        """
        try:
            actions = [{
                "_index": self.book_index,
                "_type": self.book_doc,
                "_id": i,
                "_source": data[i]
            } for i in range(len(data))]
            helpers.bulk(self.es, actions=actions)
        except Exception as e:
            print(e)

    def create_book_doc(self, title, authors, summary, publisher,
                        num_reviews, publish_date):
        """
        The following function is used to create a book entry in
        Elasticsearch using the provided info

        :param title: book title
        :param authors: book authors
        :param summary: book summary
        :param publisher: book publisher
        :param num_reviews: book number of reviews
        :param publish_date: book publish date
        :return: pass

        :Example:
        >>> title = "Some book"
        >>> authors = ["Author1", "Author2"]
        >>> summary = "this is a book written by Author1 and Author2"
        >>> publisher = "Loki AE"
        >>> num_reviews = 20
        >>> publish_date = "2014-04-05"
        >>> elk = ElasticBookStorage()
        >>> elk.create_book_doc(title, authors, summary, publisher,
        ...                     num_reviews, publish_date)
        """
        try:
            body = {
                "title": title,
                "authors": authors,
                "summary": summary,
                "publisher": publisher,
                "num_reviews": num_reviews,
                "publish_date": publish_date
            }
            self.es.index(index=self.book_index,
                          doc_type=self.book_doc,
                          body=body)
        except Exception as e:
            print(e)

    def retrieve_book_by_id(self, book_id):
        """
        The following function is used to retrieve a book document from
        Elasticsearch using its ID

        :param book_id: book id
        :return: result document

        :Example:
        >>> elk = ElasticBookStorage()
        >>> book = elk.retrieve_book_by_id(book_id=2)
        """
        try:
            results = self.es.get_source(index=self.book_index,
                                         doc_type=self.book_doc,
                                         id=str(book_id))
            return results
        except Exception as ex:
            print(ex)

    def remove_book_doc(self, book_id):
        """
        The following function is used to remove a book entry from
        Elasticsearch using its ID

        :param book_id: book ID
        :return:

        :Example:
        >>> elk = ElasticBookStorage()
        >>> elk.remove_book_doc(book_id=2)
        """
        try:
            self.es.delete(index=self.book_index,
                           doc_type=self.book_doc,
                           id=str(book_id))
        except Exception as e:
            print(e)

    def multi_match_query(self, query):
        """
        The following function performs a basic query-string match query
        using Elasticsearch functionality

        :param query: provided query parameter
        :return: results

        :Examples:
        >>> elk = ElasticBookStorage()
        >>> results = elk.multi_match_query(query="guide")
        """
        try:
            results = self.es.search(index=self.book_index,
                                     q=query)["hits"]["hits"]
            return results
        except Exception as e:
            print(e)
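
    # Commented sketch of the pattern every query method below follows: each
    # wraps es.search(index=..., body=...) and returns
    # result["hits"]["hits"], so callers read fields through hit['_source'],
    # e.g.:
    #
    #   hits = elk.multi_match_query(query="guide")
    #   for hit in hits:
    #       print(hit['_id'], hit['_source'].get('title'))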
    def search_book_by_param(self, *args, _source=[]):
        """
        The following function retrieves books whose given field matches
        the provided query

        :param _source: source argument
        :param args: (field, query) pair to search with
        :return: results

        :Examples:
        >>> elk = ElasticBookStorage()
        >>> results = elk.search_book_by_param("title", "in action")
        """
        try:
            term = args[0]
            query = args[1]
            body = {
                "query": {
                    "match": {
                        "{}".format(term): query
                    }
                },
                "_source": _source
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ee:
            print(ee)

    def fuzzy_queries(self, query, _source=[], **kwargs):
        """
        The following function receives a query and searches the books'
        title and summary using fuzzy matching. Fuzzy matching can be
        enabled on match and multi-match queries to catch spelling errors.
        The degree of fuzziness is specified based on the Levenshtein
        distance from the original word, i.e. the number of one-character
        changes that need to be made to one string to make it the same as
        another string.

        :param query: provided query
        :return: results

        :Examples:
        >>> query = "comprihensiv guide"
        >>> elk = ElasticBookStorage()
        >>> elk.fuzzy_queries(query, fields=["title", "summary"])
        """
        try:
            fields = kwargs["fields"]
            body = {
                "query": {
                    "multi_match": {
                        "query": query,
                        "fields": fields,
                        "fuzziness": "AUTO"
                    }
                },
                "_source": _source,
                "size": 1
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ee:
            print(ee)

    def wild_card_query(self, _source=[], **kwargs):
        """
        This function is used to perform Elasticsearch wildcard queries

        :param field: field to match against
        :param query: given query
        :return:

        Example:
        >>> wild_card_query(field="authors", query="t*")
        """
        try:
            field = kwargs['field']
            query = kwargs['query']
            body = {
                "query": {
                    "wildcard": {
                        "{}".format(field): query
                    }
                },
                "highlight": {
                    "fields": {
                        "{}".format(field): {}
                    }
                },
                "_source": _source
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ex:
            print(ex)

    def regex_query(self, **kwargs):
        """
        Regexp queries allow you to specify more complex patterns than
        wildcard queries

        :param field: field to match against
        :param query: provided query
        :return:

        Example:
        >>> regex_query(field="authors", query="t[a-z]*y")
        """
        try:
            field = kwargs['field']
            query = kwargs['query']
            body = {
                "query": {
                    "regexp": {
                        "{}".format(field): query
                    }
                },
                "highlight": {
                    "fields": {
                        "{}".format(field): {}
                    }
                },
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ex:
            print(ex)
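
    # For reference, the raw query bodies the two methods above send, using
    # the field and patterns from their docstring examples:
    #
    #   {"query": {"wildcard": {"authors": "t*"}}}       # wildcard
    #   {"query": {"regexp": {"authors": "t[a-z]*y"}}}   # regexp: matches
    #                                                    # 'trey', 'timothy'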
    def match_phrase_query(self, query, **kwargs):
        """
        The match phrase query requires that all the terms in the query
        string be present in the document, be in the order specified in the
        query string and be close to each other. By default the terms must
        be exactly beside each other, but you can specify the slop value,
        which indicates how far apart terms are allowed to be while still
        considering the document a match.

        :param query: provided query
        :param kwargs: provided kwargs
        :return:

        Example:
        >>> match_phrase_query(query="search engine",
        ...                    fields=["title", "summary"], slop=3)
        """
        try:
            fields = kwargs["fields"]
            slop = kwargs["slop"]
            body = {
                "query": {
                    "multi_match": {
                        "query": query,
                        "fields": fields,
                        "type": "phrase",
                        "slop": slop
                    }
                },
                "_source": []
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ex:
            print(ex)

    def match_phrase_prefix(self, query, slop, max_expansions=10,
                            _source=[]):
        """
        Match phrase prefix queries provide search-as-you-type, or a poor
        man's version of autocomplete at query time, without needing to
        prepare the data in any way. Like the match_phrase query, it
        accepts a slop parameter to make the word order and relative
        positions somewhat less rigid. It also accepts the max_expansions
        parameter to limit the number of terms matched in order to reduce
        resource intensity.

        :param query: provided query
        :param slop: provided slop
        :param max_expansions: provided max expansions
        :return:

        Example:
        >>> match_phrase_prefix(query="search en", slop=3)
        """
        try:
            body = {
                "query": {
                    "match_phrase_prefix": {
                        "summary": {
                            "query": query,
                            "slop": slop,
                            "max_expansions": max_expansions
                        }
                    }
                },
                "_source": _source
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ex:
            print(ex)

    def term_query(self, _source=[], **kwargs):
        """
        Unlike the full-text searches above, term queries match exact
        values without analysis; when a list of terms is given, a 'terms'
        query is sent instead.

        :param kwargs: provided kwargs
        :return:

        Example:
        >>> term_query(field="publisher", term="manning")
        """
        try:
            field = kwargs["field"]
            term = kwargs["term"]
            if isinstance(term, list):
                term_or_terms = "terms"
            else:
                term_or_terms = "term"
            body = {
                "query": {
                    "{}".format(term_or_terms): {
                        "{}".format(field): term
                    }
                },
                "_source": _source
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ex:
            print(ex)

    def delete_by_query(self, query, fields):
        """
        This function is used to delete the documents matched by a query

        :param query: provided query
        :param fields: provided fields
        :return:

        Example:
        >>> delete_by_query(query="python", fields=['title'])
        """
        try:
            client = Elasticsearch()
            s = Search(using=client, index=self.book_index)
            retrieved_items = s.query(
                Q("multi_match", query=query, fields=fields))
            retrieved_items.delete()
        except Exception as ex:
            print(ex)

    def update_by_query(self, **kwargs):
        """
        This function is used to update the Elasticsearch entries matched
        by a query

        :param kwargs: provided kwargs
        :return:

        Example:
        >>> update_by_query(fields="publisher", query="oreilly",
        ...                 field_to_update="publisher", new_value="OnMedia")
        """
        try:
            client = Elasticsearch()
            ubq = UpdateByQuery(using=client, index=self.book_index)
            search_fields = kwargs["fields"]
            query = kwargs["query"]
            field_to_update = kwargs["field_to_update"]
            new_value = kwargs["new_value"]
            ubq.query("multi_match", query=query,
                      fields=search_fields).script(
                          source="ctx._source.{}='{}'".format(
                              field_to_update, new_value)).execute()
        except Exception as ex:
            print(ex)

    def query_combination(self, **kwargs):
        """
        This function performs combined bool queries

        :param kwargs: provided kwargs
        :return:

        Example:
        >>> should = [["title", "Elasticsearch"], ["title", "Solr"]]
        >>> must = [["authors", "clinton gormely"]]
        >>> must_not = [["authors", "radu george"]]
        >>> query_combination(should=should, must=must, must_not=must_not)
        """
        try:
            client = Elasticsearch()
            s = Search(using=client, index=self.book_index)
            q = Q('bool',
                  must=[
                      Q({
                          "multi_match": {
                              "query": "{}".format(m[1]),
                              "fields": ["{}".format(m[0])]
                          }
                      }) for m in kwargs["must"]
                  ] if "must" in kwargs.keys() else [],
                  should=[
                      Q({
                          "multi_match": {
                              "query": "{}".format(m[1]),
                              "fields": ["{}".format(m[0])]
                          }
                      }) for m in kwargs["should"]
                  ] if "should" in kwargs.keys() else [],
                  must_not=[
                      Q({
                          "multi_match": {
                              "query": "{}".format(m[1]),
                              "fields": ["{}".format(m[0])]
                          }
                      }) for m in kwargs["must_not"]
                  ] if "must_not" in kwargs.keys() else [],
                  minimum_should_match=1)
            response = s.query(q).execute()["hits"]["hits"]
            return response
        except Exception as ex:
            print(ex)