Example #1
def delete_post():
    # "POST /delete/post" -> "deleted.html"のレンダリング
    title = get_title('削除完了')
    isbn10 = request.form['isbn10']  # ISBN-10 code of the book to delete

    es = Elasticsearch('elasticsearch')
    book_title = es.get_source(index='book', id=isbn10)['title']  # title of the book being deleted
    es.delete(index='book', id=isbn10)  # delete the book from the "book" index
    logger.debug('書籍の削除に成功しました (ISBN-10: {})'.format(isbn10))

    es.indices.refresh(
        index='book')  # refresh the "book" index so the deleted book is not picked up when the D2V model is retrained below
    es.close()

    # Rebuild the Doc2Vec model after every deletion so the deleted book is no longer recommended
    global d2v
    d2v = Doc2VecWrapper(model_path=Path('/projects/model/d2v.model'),
                         initialize=True)
    d2v.train()

    return render_template('deleted.html',
                           shishosan=config['shishosan'],
                           title=title,
                           isbn10=isbn10,
                           book_title=book_title)
Example #2
    def test_add_messages(self):
        """ Test adding a thread that has messages associated with it.

        Adding a thread that has messages associated with it should
        also add those messages to the search index.
        """
        thread = create_thread()
        message = create_message(thread=thread)

        self.backend.add(thread)

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

        source = es.get_source(
            index=self.backend.index,
            doc_type='message',
            id=message.pk)
        source_json = json.dumps(source)

        expected = {
            'body': message.body,
        }
        expected_json = json.dumps(expected)

        self.assertJSONEqual(expected_json, source_json)
Example #3
def history():
    # "GET /history" -> "history.html"のレンダリング
    title = get_title('閲覧履歴')
    user_history = get_user_history(user=current_user)
    # Show at most 30 entries of the browsing history
    history_max_size, unique_user_history, bIds_set = 30, [], set()

    # Fetch newest first (timestamp descending) -> drop duplicate entries
    es = Elasticsearch('elasticsearch')
    for lId, log in sorted(user_history.items(), reverse=True):
        if len(unique_user_history) == history_max_size:
            break
        if log['bId'] in bIds_set:
            # The same book already appears with a newer timestamp -> skip this older entry
            continue

        bIds_set.add(log['bId'])
        unique_user_history.append(log)
        unique_user_history[-1]['book'] = es.get_source(
            index='book', id=log['bId'])  # fetch the book document
    es.close()

    # No books viewed yet -> None
    if len(user_history) == 0:
        unique_user_history = None

    return render_template('history.html',
                           shishosan=config['shishosan'],
                           title=title,
                           user_history=unique_user_history)
Example #4
def book(isbn10=None):
    # "GET /search/<isbn10>" -> "book.html"のレンダリング
    # 類似書籍: 非パーソナライズ推薦(Doc2Vec)による書籍
    # 推薦書籍: パーソナライズ推薦(提案SBRS)による書籍

    title = get_title('本:{0}'.format(isbn10))

    es = Elasticsearch('elasticsearch')
    book = es.get_source(index='book', id=isbn10)  # fetch from the "book" index
    n_book = es.count(index='book')['count']  # total number of books

    if n_book >= 10:
        # 10 books or more -> D2V model already built -> non-personalized recommendation (similar books)
        try:
            sim_books_isbn10 = d2v.get_similar_books(
                isbn10=isbn10, topn=6, verbose=False)  # ISBN-10 codes of similar books
            sim_books = [
                es.get_source(index='book', id=sb[0])
                for sb in sim_books_isbn10
            ]  # fetch the similar books' documents
        except KeyError:
            # Embeddings not built yet (model being rebuilt) -> skip the non-personalized recommendation
            sim_books = None
    else:
        sim_books = None

    log = record_history(user=current_user, bId=isbn10)  # record the browsing history
    rec_books_isbn10 = prop_sbrs.update(log=log)  # ISBN-10 codes of recommended books

    # No recommendations (the proposed SBRS could not generate any for lack of information) -> show similar books only
    if rec_books_isbn10 is None:
        rec_books = None
    else:
        # Fetch the book documents for the recommended ISBN-10 codes
        rec_books = [
            es.get_source(index='book', id=isbn10)
            for isbn10 in rec_books_isbn10
        ]
    es.close()

    return render_template('book.html',
                           shishosan=config['shishosan'],
                           title=title,
                           isbn10=isbn10,
                           book=book,
                           sim_books=sim_books,
                           rec_books=rec_books)
Example #5
    def test_remove(self):
        """ Test removing an object from the search index.

        Removing an object from the search index should make it
        inaccessible to elasticsearch.
        """
        thread = create_thread()
        self.backend.add(thread)
        self.backend.remove(thread)

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

        with self.assertRaises(NotFoundError):
            es.get_source(
                index=self.backend.index,
                doc_type='thread',
                id=thread.pk)
Example #6
    def test_remove_message(self):
        """ Test removing a thread with messages.

        If a thread has messages associated with it, those messages
        should be removed from the search backend when the thread
        instance is removed.
        """
        thread = create_thread()
        message = create_message(thread=thread)

        self.backend.add(thread)
        self.backend.remove(thread)

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

        with self.assertRaises(NotFoundError):
            es.get_source(
                index=self.backend.index,
                doc_type='message',
                id=message.pk)
Example #7
    def test_update_old_threads(self):
        """ Test updating the index with old threads.

        If there was a thread that was previously in the index and has
        since been deleted, then it should be removed from the index.
        """
        thread = create_thread()
        thread_pk = thread.pk

        backend = ElasticSearch()
        backend.add(thread)

        thread.delete()

        call_command('updateindex', stdout=self.out)

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

        with self.assertRaises(NotFoundError):
            es.get_source(
                index='test',
                doc_type='thread',
                id=thread_pk)
Example #8
    def get_title_from_isbn10(self, isbn10: str) -> str:
        """ISBN-10から書籍タイトル取得(Elasticsearch経由)

        Args:
            isbn10 (str): ISBN-10コード

        Returns:
            str: 書籍タイトル
        """
        es = Elasticsearch('elasticsearch')
        title = es.get_source(index="book", id=isbn10)['title']
        es.close()

        return title
Example #9
class ElasticsearchBackend:
    def __init__(self):
        self.client = Elasticsearch([ELASTIC_HOST], http_auth=ELASTIC_AUTH)
        self.index = ELASTIC_CACHE_INDEX

    def get(self, id_):
        try:
            return self.client.get_source(index=self.index, id=id_)
        except NotFoundError:
            return

    def set(self, id_, body):
        body['created'] = datetime.now().isoformat()
        return self.client.index(index=self.index, id=id_, body=body)
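# A minimal usage sketch (not part of the original example), assuming ELASTIC_HOST,
# ELASTIC_AUTH and ELASTIC_CACHE_INDEX are defined elsewhere in the module and the
# cache index already exists:
#
#     backend = ElasticsearchBackend()
#     backend.set('answer-42', {'value': 42})  # stores the body plus a 'created' timestamp
#     backend.get('answer-42')                 # -> the stored source dict
#     backend.get('missing-id')                # -> None, NotFoundError is swallowed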
Example #10
class ResultDB(BaseResultDB):
    collection_prefix = ''

    def __init__(self, url, database='resultdb'):
        self.conn = Elasticsearch()
        self.database = database
        #self.conn.IndicesClient(self.conn).delete(index=self.database);
        #self.save( "afxc2", "sd","http://www.5566.com",{"shopname":"sdfsdfs"} )
        #print self.count( "afxc2" )
        #print self.get( "afxc2" ,  "sd" )
        #self.select( "afxc2" )


    def _parse(self, data):
        return data["_source"]
        #if 'result' in data:
        #    data['result'] = json.loads(data['result'])
        #return data

    def _stringify(self, data):
        if 'result' in data:
            data['result'] = json.dumps(data['result'])
        return data

    def save(self, project, taskid, url, result):
        obj = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self.conn.index(index=self.database, doc_type=project, id=taskid, body=obj)

    def select(self, project, fields=None, offset=0, limit=0):
        ret = []
        if limit == 0:
            limit = 10
        items = self.conn.search(index=self.database, doc_type=project, fields=fields,
                                 _source=True, from_=offset, size=limit)
        for item in items["hits"]["hits"]:
            ret.append(self._parse(item))
        return ret

    def count(self, project):
        r = self.conn.count(index=self.database, doc_type=project)
        return r['count']

    def get(self, project, taskid, fields=None):
        return self.conn.get_source(index=self.database, doc_type=project, id=taskid)
Example #11
    def test_add(self):
        """ Test adding an object to the search index.

        Adding an object to the search index should make it searchable
        by elasticsearch.
        """
        thread = create_thread()
        self.backend.add(thread)

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

        source = es.get_source(
            index=self.backend.index,
            doc_type='thread',
            id=thread.pk)
        source_json = json.dumps(source)

        expected = {
            'title': thread.title,
        }
        expected_json = json.dumps(expected)

        self.assertJSONEqual(expected_json, source_json)
Example #12
def delete_inquire():
    # "GET /delete" -> "delete.html"のレンダリング
    if request.args.get('isbn10') and current_user.uId not in guest_uIds_set:
        # Access came from the delete button on the book page
        title = get_title('削除確認')
        isbn10 = request.args['isbn10']  # ISBN-10 code of the book to delete

        # Fetch the book whose deletion is being confirmed
        es = Elasticsearch('elasticsearch')
        book = es.get_source(index='book', id=isbn10)
        es.close()

        return render_template('delete.html',
                               shishosan=config['shishosan'],
                               title=title,
                               isbn10=isbn10,
                               book=book)
    else:
        # Access that did not come through the book page's delete button
        title = get_title('削除不可')
        return render_template('delete.html',
                               shishosan=config['shishosan'],
                               title=title)
Example #13
    def test_update(self):
        """ Test updating search index with threads.

        Updating the index should add all existing threads to the index.
        """
        thread = create_thread()

        call_command('updateindex', stdout=self.out)

        es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

        source = es.get_source(
            index='test',
            doc_type='thread',
            id=thread.pk)
        source_json = json.dumps(source)

        expected = {
            'title': thread.title,
        }
        expected_json = json.dumps(expected)

        self.assertJSONEqual(expected_json, source_json)
        self.assertIn("Updated 1 thread(s)", self.out.getvalue())
Example #14
    verify_certs=True)

index = 'test-index'
doc_type = 'test-type'
doc_id = '1'  #utils.get_id()
body = {'id': '1', 'name': 'wxnaawefaefawcy', 'pwd': 'www'}

# data = es.cluster.health(wait_for_status='yellow', request_timeout=1)
# Create an index
# data = es.indices.create(index=index)
# Create an index, ignoring certain errors
# data = es.indices.create(index='test-index', ignore=400)
# Create a document
# data = es.create(index=index, doc_type=doc_type, id=doc_id, body=body)
# Delete a document
# data = es.delete(index=index, doc_type=doc_type, id='1482204285.41')
# Check whether a document exists (returns a boolean)
# data = es.exists(index=index, doc_type=doc_type, id='1482204824.82')
# Get a document
# data = es.get(index=index, doc_type=doc_type, id='1482204824.82')
# Get only a document's source
data = es.get_source(index=index, doc_type=doc_type, id='1')
# Search all documents
# data = es.search(index='test-index')
# data = es.msearch(index=index, doc_type=doc_type,'name')

print(data)
print(data is None)

print(es.info)
Example #15
class APIDatabase:
    def __init__(self, elastic_index='address-book', *args, **kwargs):
        # calls Elasticsearch() to connect to the database and creates the index for the address book if needed

        import json
        from elasticsearch import Elasticsearch
        from elasticsearch import exceptions as es_exceptions

        # hold on to the exceptions so they can be recognized in the try...except blocks later
        self.es_exceptions = es_exceptions

        # host and port information for Elasticsearch() is in a separate json file
        try:
            with open('./elastic_host_config.json') as f:
                elastic_host_info = json.load(f)
        except FileNotFoundError:
            elastic_host_info = {'host': 'localhost', 'port': 9200}

        self.database = Elasticsearch(elastic_host_info, *args, **kwargs)
        self.elastic_index = elastic_index

        # ensure the Elasticsearch index exists
        self.database.indices.create(index=elastic_index, ignore=400)

    def get_contact_by_query(self, page_size, page_num, query_string):
        # searches the data store using query_string and returns page_size entries starting on page page_num

        if page_size < 0:
            return {
                'error': 'pageSize must be a nonnegative integer',
                'status': 400
            }
        elif page_num < 0:
            return {
                'error': 'page must be a nonnegative integer',
                'status': 400
            }

        try:
            result = self.database.search(index=self.elastic_index,
                                          from_=page_num,
                                          q=query_string,
                                          size=page_size)
            return [
                contact['_source']['doc'] for contact in result['hits']['hits']
            ]
        except self.es_exceptions.RequestError as err:
            return {
                'error': err.info['error']['root_cause'][0]['reason'],
                'status': err.status_code
            }

    def get_contact_by_name(self, name):
        # returns the contact with the given name

        try:
            return self.database.get_source(index=self.elastic_index,
                                            id=name)['doc']
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}

    def create_contact(self, contact_details):
        # creates a contact with the given contact_details (which includes a name)

        try:
            self.database.create(index=self.elastic_index,
                                 id=contact_details['name'],
                                 body={'doc': contact_details})
            return {'message': 'created', 'status': 200}
        except self.es_exceptions.ConflictError:
            return {'error': 'contact already exists', 'status': 409}

    def update_contact(self, name, contact_details):
        # updates a contact with the new contact_details

        try:
            result = self.database.update(
                index=self.elastic_index,
                id=name,
                body={'doc': {
                    'doc': contact_details
                }})
            return {'message': result['result'], 'status': 200}
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}

    def delete_contact(self, name):
        # deletes the contact with the given name

        try:
            self.database.delete(index=self.elastic_index, id=name)
            return {'message': 'deleted', 'status': 200}
        except self.es_exceptions.NotFoundError:
            return {'error': 'not found', 'status': 404}
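# A hypothetical usage sketch (not part of the original class); the contact data and the
# query string below are invented for illustration:
#
#     db = APIDatabase(elastic_index='address-book')
#     db.create_contact({'name': 'Ada Lovelace', 'email': 'ada@example.com'})
#     db.get_contact_by_name('Ada Lovelace')             # -> the stored contact dict
#     db.get_contact_by_query(10, 0, 'doc.email:ada*')   # -> up to 10 matching contacts
#     db.delete_contact('Ada Lovelace')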
Example #16
class ElasticSearchWrapper(object):
    SOURCE = "_source"
    HITS = "hits"
    TEXT = "text"
    PROPERTIES = "properties"
    MAPPING = "mappings"

    def __init__(self, configurations: ElasticSearchConfigurations):
        self.configurations = configurations
        self.es = Elasticsearch(hosts=[{
            'host': self.configurations.host,
            'port': self.configurations.port
        }], )

    def create(self, body: dict, documentId):
        ans = self.es.create(index=self.configurations.index,
                             doc_type=self.configurations.docType,
                             body=body,
                             id=documentId)
        return ans

    def exists(self, documentId):
        return self.es.exists(index=self.configurations.index,
                              doc_type=self.configurations.docType,
                              id=documentId)

    def get(self, documentId):
        try:
            return self.es.get_source(index=self.configurations.index,
                                      doc_type=self.configurations.docType,
                                      id=documentId)
        except ElasticSearchNotFoundError as e:
            raise e

    @classmethod
    def constructPrefixFieldQuery(cls, fields: List[str], prefix: str):
        prefixes = {
            "query": {
                "bool": {
                    "should": [{
                        "prefix": {
                            field: prefix,
                        }
                    } for field in fields]
                }
            }
        }

        return prefixes

    def getByPrefix(self, prefix: str):
        try:
            textFields = self.getTextFields()
            ans = self.es.search(index=self.configurations.index,
                                 doc_type=self.configurations.docType,
                                 body=self.constructPrefixFieldQuery(
                                     fields=textFields, prefix=prefix))
            return [elm[self.SOURCE] for elm in ans[self.HITS][self.HITS]]
        except ElasticSearchNotFoundError as e:
            raise e

    def getTextFields(self):
        mapping = self.getMapping()
        return [
            k for k, v in mapping[self.configurations.index][self.MAPPING][
                self.configurations.docType][self.PROPERTIES].items()
            if v["type"] == "text"
        ]

    def getMapping(self) -> dict:
        return self.es.indices.get_mapping(
            index=self.configurations.index,
            doc_type=self.configurations.docType)
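# For illustration only (not part of the original class): constructPrefixFieldQuery builds a
# bool/should query with one prefix clause per field, so for fields ['title', 'body'] and
# prefix 'ela' it returns:
#
#     {"query": {"bool": {"should": [{"prefix": {"title": "ela"}},
#                                    {"prefix": {"body": "ela"}}]}}}
#
# getByPrefix() passes this body to es.search() and unwraps the "_source" of every hit.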
Example #17
class ElasticDataManager(object):
    """
        This is the order
        abort - If needed. If any previous datamangers aborted. This before
                even begining this datamanager process.

        tpc_begin - Prepare for the transaction
        commit - This is like dry commit. Check for potential errors before commiting
        tpc_vote - After commit vote and tell the transacation manager , that I am
                    fine to go or not
        tpc_finish - Final commit, no turning back after this.


        tpc_abort - If this manager voted no, then this function will
                     be called for cleanup

    """

    transaction_manager = transaction.manager

    def __init__(self):
        self._resources = []
        self.current = 0
        self._connection = None

    def connect(self, settings, default_index="", auto_create_index=False):
        """
            Establish a elastic search connection
        """
        eshosts = settings['elasticsearch_hosts']
        self._connection = Elasticsearch(
            eshosts,
            # sniff before doing anything
            sniff_on_start=True,
            # refresh nodes after a node fails to respond
            sniff_on_connection_fail=True,
            # and also every 60 seconds
            sniffer_timeout=60,
            # request timeout
            timeout=30)
        self.default_index = default_index
        # applicable for 6.0
        self.auto_create_index = auto_create_index
        self.versions = self._getVersion()
        self.isVersion6 = self._isVersion6_in_cluster()

    @property
    def connection(self):
        """
            property get existing established elastic search connection
        """
        return self._connection

    def get_connection(self):
        return self._connection

    def refresh(self, index="_all"):
        self._connection.indices.refresh(index)

    def add(self, item):
        """
            Add a document to the elasticsearch index.
            Required in the item dictionary:
                _id = ID of the document to be saved
                _type = Type of the document
                _source = Source/body to be saved
                _index (optional) = Optional if default_index is set
            This will be committed during the transaction process.
        """
        log = logging.getLogger(__name__)
        log.info("Adding elasticsearch item")
        if (len(self._resources) == 0):
            log.info("Joining transaction")
            self.transaction_manager.get().join(self)

        item['_op'] = "add"
        item['processed'] = False
        item['_index'] = self._get_index(item)
        item['index_created'] = False
        self._check_type(item)
        self._check_id(item)

        self._resources.append(item)

    def remove(self, item, check_existence=False):
        """
            Remove a document from the elasticsearch index.
            Required in the item dictionary:
                _id = ID of the document to be removed
                _type = Type of the document
                _index (optional) = Optional if default_index is set
            This will be committed during the transaction process.
        """
        log = logging.getLogger(__name__)
        log.info("Removing elasticsearch item")

        item['_op'] = "remove"
        item['processed'] = False
        item['_index'] = self._get_index(item)
        item['index_created'] = False
        self._check_type(item)
        self._check_id(item)

        if check_existence and not self._check_if_exists(item):
            return

        if (len(self._resources) == 0):
            log.info("Joining transaction")
            self.transaction_manager.get().join(self)

        self._resources.append(item)

    def update(self, item, check_existence=False):
        """
            Update a document already present in the elasticsearch index.
            Required in the item dictionary:
                _id = ID of the document to be updated
                _type = Type of the document
                _index (optional) = Optional if default_index is set
                _source = Partial or full source to be updated
            This will be committed during the transaction process. If the document
            isn't already present in the index, the request is converted into an add
            request.
        """
        log = logging.getLogger(__name__)
        log.info("Update elasticsearch item")

        item['_op'] = "update"
        item['processed'] = False
        item['_index'] = self._get_index(item)
        item['index_created'] = False
        self._check_type(item)
        self._check_id(item)

        if '_source' not in item:
            raise ElasticSearchParamMissingError(
                "_source data to update missing")

        if check_existence and not self._check_if_exists(item):
            return

        if (len(self._resources) == 0):
            log.info("Joining transaction")
            self.transaction_manager.get().join(self)

        self._resources.append(item)

    def update_by_query(self, item):
        """
            Update all documents matching a query in the elasticsearch index.
            Required in the item dictionary:
                _type = Type of the document
                _index (optional) = Optional if default_index is set
                _query = Query DSL selecting the documents to update
                _script = Script applied to every matching document
            This will be committed during the transaction process.
        """
        log = logging.getLogger(__name__)
        log.info("Update elasticsearch item")
        if (len(self._resources) == 0):
            log.info("Joining transaction")
            self.transaction_manager.get().join(self)

        if '_query' not in item:
            raise ElasticSearchParamMissingError("_query input missing")

        if '_script' not in item:
            raise ElasticSearchParamMissingError(
                "_script data to update missing")

        item['_op'] = "update_by_query"
        item['processed'] = False
        item['_index'] = self._get_index(item)
        item['index_created'] = False
        self._check_type(item)

        self._resources.append(item)

    def delete_by_query(self, item):
        """
            Delete all documents matching a query from the elasticsearch index.
            Required in the item dictionary:
                _type = Type of the document
                _index (optional) = Optional if default_index is set
                _query = Query DSL selecting the documents to delete
            This will be committed during the transaction process.
        """
        log = logging.getLogger(__name__)
        log.info("Update elasticsearch item")
        if (len(self._resources) == 0):
            log.info("Joining transaction")
            self.transaction_manager.get().join(self)

        if '_query' not in item:
            raise ElasticSearchParamMissingError("_query input missing")

        item['_op'] = "delete_by_query"
        item['processed'] = False
        item['_index'] = self._get_index(item)
        item['index_created'] = False
        self._check_type(item)

        self._resources.append(item)

    def _getVersion(self):
        data = self._connection.cluster.stats()
        if 'nodes' in data and 'versions' in data['nodes']:
            return data['nodes']['versions']
        return []

    def _isVersion6_in_cluster(self):
        for version in self.versions:
            split_version = version.split('.')
            if len(split_version) > 0 and split_version[0] == '6':
                return True
        return False

    def _check_if_exists(self, request):
        try:
            result = self._connection.get_source(index=request['_index'],
                                                 doc_type=request['_type'],
                                                 id=request['_id'])
            print(result)
        except NotFoundError as e:
            return False

        return True

    def _get_index(self, item):

        if ('_index' not in item and len(self.default_index) == 0):
            raise ElasticSearchParamMissingError(
                "_index input missing and default index is not set")

        return item['_index'] if '_index' in item else self.default_index

    def _check_type(self, item):

        if '_type' not in item:
            raise ElasticSearchParamMissingError("_type input missing")

        if self.isVersion6 and item['_type'] != 'doc':
            raise ElasticSearchException(
                "custom _type not supported in 6.x. _type should by default be 'doc'"
            )

    def _check_id(self, item):

        if '_id' not in item:
            raise ElasticSearchParamMissingError("_id input missing")

    def _refresh_if_needed(self, last_operation, currentoperation,
                           unique_indices):
        """
            This function updates the in-memory buffer to a segment so that
            we can search and update the records immediately after creation
            https://www.elastic.co/guide/en/elasticsearch/guide/current/near-real-time.html
        """
        if last_operation == currentoperation or last_operation == "":
            return last_operation, unique_indices
        else:
            self._connection.indices.refresh(list(unique_indices))
            unique_indices.clear()
            return currentoperation, unique_indices

    def _checkAndCreateIndex(self, item):
        if not self._connection.indices.exists(item['_index'],
                                               ignore=[400, 404]):
            # index doesn't exist
            self._connection.indices.create(index=item['_index'], ignore=[400])
            return True

        return False

    @property
    def savepoint(self):
        """
            Savepoints are only supported when all connections support subtransactions
        """
        return ElasticSavepoint(self)

    def abort(self, transaction):
        """
            Outside of the two-phase commit proper, a transaction can be
            aborted before the commit is even attempted, in case we come across
            some error condition that makes it impossible to commit. The abort
            method is used for aborting a transaction and forgetting all changes, as
            well as end the participation of a data manager in the current transaction.
        """
        log = logging.getLogger(__name__)
        log.info("abort")
        self.uncommitted = {'add': [], 'remove': []}

    def tpc_begin(self, transaction):
        """
            The tpc_begin method is called at the start of the commit to perform any
            necessary steps for saving the data.
        """
        log = logging.getLogger(__name__)
        log.info("tpc_begin")

    def commit(self, transaction):
        """
            We record and back up the existing data and then perform the operation.
            If any of the other transaction managers votes 'no', the data backed up
            during this commit phase is restored in tpc_abort.
        """

        # ## This is the step where data managers need to prepare to save the changes
        # ## and make sure that any conflicts or errors that could occur during the
        # ## save operation are handled. Changes should be ready but not made
        # ## permanent, because the transaction could still be aborted if other
        # ## transaction managers are not able to commit.

        log = logging.getLogger(__name__)
        log.info(__name__)
        log.info("commit")
        unique_indices = set()
        last_operation = ""
        # Let's commit and keep track of the items that are committed. If we get
        # an abort request, those items are rolled back.
        for item in self._resources:

            last_operation, unique_indices = self._refresh_if_needed(
                last_operation, item['_op'], unique_indices)

            unique_indices.add(item['_index'])

            if item['_op'] == 'add':
                # if version 6, there is no support for types
                # All documents get into their own index
                if self.isVersion6:
                    item['index_created'] = self._checkAndCreateIndex(item)

                self._connection.create(index=item['_index'],
                                        doc_type=item['_type'],
                                        id=item['_id'],
                                        body=item['_source'])

            elif item['_op'] == 'remove':
                if (self._connection.exists(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'])):
                    item['_backup'] = self._connection.get_source(
                        index=item['_index'],
                        doc_type=item['_type'],
                        id=item['_id'])
                    self._connection.delete(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'])
                else:
                    raise ElasticSearchException("Unable to find " +
                                                 item['_id'] + " in type " +
                                                 item['_type'] +
                                                 " and in index " +
                                                 item['_index'])
            elif item['_op'] == "update":  # Update

                if (self._connection.exists(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'])):

                    item['_backup'] = self._connection.get_source(
                        index=item['_index'],
                        doc_type=item['_type'],
                        id=item['_id'])
                    # Dont get the source after update
                    self._connection.update(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'],
                                            body={'doc': item['_source']},
                                            _source=False)
                else:
                    # The item was not present in the first place
                    # moving this to add
                    self._connection.create(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'],
                                            body=item['_source'])
                    # Move the operation to add. In case of
                    # abort we will only remove the newly created
                    # document
                    item['_op'] = 'add'
            elif item['_op'] == "update_by_query":

                # get all the fields provided by the user to update
                # keys = list(item['_source'].keys())

                item['_backup'] = self._connection.search(
                    index=item['_index'],
                    doc_type=item['_type'],
                    body={"query": item['_query']},
                    _source=True)

                # print(json.dumps(item['_backup'],sort_keys=True,indent=4))

                # example script
                #   "script":{
                #           "inline":"ctx._source.description = params.description;ctx._source.grp_hash = grp_hash",
                #           "params" : {
                #                    "description" : "Srikanth group",
                #                    "grp_hash" : "3433"
                #                       }
                #           }

                toupdate = {
                    "script": item['_script'],
                    "query": item['_query'],
                }
                # print(json.dumps(toupdate,sort_keys=True,indent=4))

                self._connection.update_by_query(index=item['_index'],
                                                 doc_type=item['_type'],
                                                 body=toupdate,
                                                 _source=True)

                # print(json.dumps(t, sort_keys=True, indent=4))

            elif item['_op'] == "delete_by_query":

                # get all the fields provided by the user to update

                item['_backup'] = self._connection.search(
                    index=item['_index'],
                    doc_type=item['_type'],
                    body={"query": item['_query']},
                    _source=True)

                self._connection.delete_by_query(
                    index=item['_index'],
                    doc_type=item['_type'],
                    body={"query": item['_query']})
            item['processed'] = True

    def tpc_vote(self, transaction):
        """
            The last chance for a data manager to make sure that the data can
            be saved is the vote. The way to vote ‘no’ is to raise an exception here.
        """
        log = logging.getLogger(__name__)
        log.info("tpc_vote")

    def tpc_finish(self, transaction):
        """
            This method is only called if the manager voted ‘yes’ (no exceptions raised)
            during the voting step. This makes the changes permanent and should never
            fail. Any errors here could leave the database in an inconsistent state. In
            other words, only do things here that are guaranteed to work or you may have
            a serious error in your hands.
        """
        # The ES operations were already performed in commit(); we are done, so clean up
        self._resources = []
        # Lets refresh all indices once
        self.refresh()
        log = logging.getLogger(__name__)
        log.info("tcp_finish")

    def tpc_abort(self, transaction):
        """
            This method is only called if the manager voted ‘no’ by raising an exception
            during the voting step. It abandons all changes and ends the transaction.
        """
        log = logging.getLogger(__name__)
        log.info("tpc_abort")
        unique_indices = set()
        last_operation = ""
        for item in self._resources:
            last_operation, unique_indices = self._refresh_if_needed(
                last_operation, item['_op'], unique_indices)

            unique_indices.add(item['_index'])

            if item['processed']:
                if item['_op'] == 'add':

                    self._connection.delete(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'])
                    if self.isVersion6 and item['index_created']:
                        # We created index in the commit phase,
                        # so we need to delete it if we are aborting
                        # the transaction.
                        self._connection.indices.delete(index=item['_index'],
                                                        ignore=[400, 404])

                elif item['_op'] == 'remove':

                    self._connection.create(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'],
                                            body=item['_backup'])

                elif item['_op'] == "update":  # Update

                    self._connection.update(index=item['_index'],
                                            doc_type=item['_type'],
                                            id=item['_id'],
                                            body=item['_backup'],
                                            _source=False)

                elif item['_op'] == "update_by_query":

                    for thing in item['_backup']['hits']['hits']:
                        # update back with old value only if the document exists.
                        if self._connection.exists(index=item['_index'],
                                                   doc_type=item['_type'],
                                                   id=thing['_id']):

                            self._connection.update(
                                index=item['_index'],
                                doc_type=item['_type'],
                                id=thing['_id'],
                                body={'doc': thing['_source']},
                                _source=False)

                elif item['_op'] == "delete_by_query":

                    for thing in item['_backup']['hits']['hits']:
                        self._connection.create(index=item['_index'],
                                                doc_type=item['_type'],
                                                id=thing['_id'],
                                                body=thing['_source'])

    def sortKey(self):
        """
            The transaction manager sorts all data managers alphabetically by this key.
            If we wanted this data manager to commit last, the key would start with '~'.
            Here we don't care about the ordering.
        """
        return 'elasticsearch' + str(id(self))
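# A minimal usage sketch (not from the original project) of the data manager above; the
# settings dict and index name are assumptions:
#
#     import transaction
#
#     dm = ElasticDataManager()
#     dm.connect({'elasticsearch_hosts': ['localhost:9200']}, default_index='myindex')
#     dm.add({'_id': '1', '_type': 'doc', '_source': {'title': 'hello'}})
#     transaction.commit()  # drives tpc_begin / commit / tpc_vote / tpc_finish on the manager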
Example #18
import collections
import os

import nltk
from elasticsearch import Elasticsearch, NotFoundError
from tqdm import tqdm
from cluseter import termgraph

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

articles_path = '../crawler/articles'
articles = []
titles = {}
abstracts = {}
dictionary = {}
stop_words = set(nltk.corpus.stopwords.words('english'))

for article_name in os.listdir(articles_path):
    try:
        article = es.get_source(index="rg", doc_type="article", id=int(article_name.split(".")[0]))
        articles.append(article)
        abstracts[article['id']] = collections.Counter(
            x for x in nltk.word_tokenize(article.get('abstract').lower()) if x not in stop_words)
        titles[article.get('id')] = collections.Counter(
            x for x in nltk.word_tokenize(article.get('title').lower()) if x not in stop_words)
    except NotFoundError:
        pass
for doc in abstracts.values():
    for t, v in doc.items():
        dictionary[t] = max(dictionary.get(t, 0), v)


def calc_clusters(cluster_num):
    MAX_LEVEL = 5
    mean = [{} for cnum in range(cluster_num)]
Example #19
class ESClient:
    def __init__(self, config):
        """
        Init & Configure ES connection
        :param config: dict, config
        """
        self.config = config or {}
        if self.config:
            self.hosts = [{
                'host': h['host'],
                'port': h['port']
            } for h in self.config.get('hosts')]
        else:
            self.hosts = [{'host': 'localhost'}]

        self.bulk_size = self.config.get("bulk_size") or BULK_SIZE
        self.connection = None
        self.connect()

    def connect(self):
        """
        Establish ES connection
        """
        try:
            self.connection = Elasticsearch(self.hosts,
                                            retry_on_timeout=RETRY_ON_TIMEOUT,
                                            timeout=REQUEST_TIMEOUT_SEC)
        except Exception as e:  # pragma: no cover
            logging.error(
                "ESClient.connect failed with params {}, error {}".format(
                    self.hosts, e))

    def count_index(self, index):
        """
        Count docs in index
        :param index:
        :return:
        """
        return self.connection.count(index).get('count')

    def search(self,
               query=None,
               index_name=None,
               retries=0,
               query_type='search'):
        """
        ES search query
        :param query: dict, es query
        :param index_name: str, index to query against
        :param retries: int, current retry attempt
        :param query_type: str, search or aggregation
        :return: list, found docs
        """
        resp = []
        try:
            resp = self.connection.search(body=query, index=index_name)
            if query_type == 'search':
                found = resp['hits']['hits']
            else:  # elif query_type == 'aggregation':
                found = resp['aggregations']
        except KeyError:  # No hits key in response
            logging.critical(
                "ESClient.search invalid response {}".format(resp))
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                logging.error(
                    "ESClient.search max attempts exceeded (key error)")
                raise
            found = self.search(query=query,
                                index_name=index_name,
                                retries=retries + 1)
        except es_exceptions.RequestError as e:  # pragma: no cover
            logging.critical("ESClient.search error {} on query {}".format(
                e, query))
            raise
        except (es_exceptions.ConnectionTimeout, es_exceptions.ConnectionError,
                es_exceptions.TransportError):  # pragma: no cover
            logging.warning("ESClient.search connection failed, retrying..."
                            )  # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                logging.error("ESClient.search max attempts exceeded")
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            self.connect()  # Not sure if this is helpful
            found = self.search(query=query,
                                index_name=index_name,
                                retries=retries + 1)
        except Exception as e:  # pragma: no cover
            logging.critical("ESClient.search error {} on query {}".format(
                e, query))
            raise

        return found

    def msearch(self,
                queries,
                index_name,
                doc_type='_doc',
                retries=0,
                chunk_size=100):
        """
        Es multi-search query
        :param queries: list of dict, es queries
        :param index_name: str, index to query against
        :param doc_type: str, defined doc type i.e. _doc
        :param retries: int, current retry attempt
        :param chunk_size: int, how many queries to send to es at a time
            Increase the search queue size before sending too many requests
            I.e. threadpool.search.queue_size: 50000  in es config
        :return: dict, found doc status
        """
        search_header = json.dumps({'index': index_name, 'type': doc_type})

        def chunk_queries():
            for i in range(0, len(queries), chunk_size):
                yield queries[i:i + chunk_size]

        chunked_queries = chunk_queries()

        found = []
        for query_chunk in chunked_queries:
            request = ''
            for q in query_chunk:
                # request head, body pairs
                request += '{}\n{}\n'.format(search_header, json.dumps(q))

            resp = {}
            try:
                resp = self.connection.msearch(body=request, index=index_name)
                found.extend([r['hits']['hits'] for r in resp['responses']])
            except (es_exceptions.ConnectionTimeout,
                    es_exceptions.ConnectionError,
                    es_exceptions.TransportError,
                    KeyError) as e:  # pragma: no cover
                if retries > RETRY_ATTEMPTS:  # pragma: no cover
                    logging.error(
                        "ESClient.msearch max attempts exceeded, error {}".
                        format(e))
                    raise

                logging.warning(
                    "ESClient.msearch connection failed, retrying..."
                )  # Retry on timeout

                # No hits key in response, don't retry if es_rejected_execution_exception
                if e.__class__ == KeyError:
                    # 'hits' missing, could be es_rejected_execution_exception, queue capacity reached
                    logging.critical(
                        "ESClient.msearch invalid response {}".format(
                            resp.get('responses')))
                    # if 'search_phase_execution_exception' not in str(resp):  # reason 'all shards failed'
                    if 'es_rejected_execution_exception' in str(resp):
                        # raise if underlying error is ConnectionRefusedError in urllib3 caused by NewConnectionError
                        logging.error(
                            "ESClient.msearch query rejected, error {}".format(
                                e))
                        raise

                time.sleep(RECONNECT_SLEEP_SEC)
                self.connect()  # Not sure if useful
                found = self.msearch(queries=queries,
                                     index_name=index_name,
                                     retries=retries + 1)
            except Exception as e:  # pragma: no cover
                logging.critical(
                    "ESClient.msearch error {} on query {}".format(e, queries))
                raise

        return found
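    # Sketch of the request body that msearch() assembles above (illustrative values only):
    # each query becomes an NDJSON pair of lines, a header naming the index/type followed
    # by the query itself, e.g.
    #
    #     {"index": "my-index", "type": "_doc"}
    #     {"query": {"match": {"title": "foo"}}}
    #     {"index": "my-index", "type": "_doc"}
    #     {"query": {"match": {"title": "bar"}}}
    #
    # which is what the '{}\n{}\n'.format(search_header, json.dumps(q)) loop builds.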

    def get_document(self, doc_id, index_name, doc_type='_doc', retries=0):
        """
        Get contents of a document by id
        :param doc_id: int, the document id
        :param index_name: str, document name
        :param doc_type: str, document type
        :param retries: int, current retry attempt
        :return: dict, resulting document
        """
        try:
            result = self.connection.get_source(id=doc_id,
                                                doc_type=doc_type,
                                                index=index_name,
                                                _source=True)
        except es_exceptions.NotFoundError:
            result = None
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning(
                "ESClient.get_document connection failed, retrying..."
            )  # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            result = self.get_document(doc_id,
                                       index_name,
                                       doc_type,
                                       retries=retries + 1)

        return result

    def upsert_document(self,
                        index_name,
                        body,
                        doc_id,
                        doc_type='_doc',
                        retries=0):
        """
        Adds new or updates existing doc
        Upsert a document into an es index
        :param index_name: str, index name
        :param body: dict, doc
        :param doc_type: str, i.e. _doc
        :param doc_id: int, document id
        :param retries: int, number of retries of the function
        :return: dict, result
        """
        # Avoid the ES error "Field [_id] is a metadata field and cannot be added
        # inside a document. Use the index API request parameters."
        if ES_ID_FIELD in body:
            body = copy.deepcopy(body)
            del body[ES_ID_FIELD]  # reserved field

        try:
            result = self.connection.update(index=index_name,
                                            doc_type=doc_type,
                                            id=doc_id,
                                            body={
                                                'doc': body,
                                                'doc_as_upsert': True
                                            })
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning(
                "ESClient.upsert_document connection failed, retrying..."
            )  # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            result = self.upsert_document(index_name,
                                          body,
                                          doc_id,
                                          doc_type,
                                          retries=retries + 1)

        return result

    def remove_document(self, index_name, doc_id, doc_type='_doc', retries=0):
        """
        Remove a document from es index
        :param index_name: str, index name
        :param doc_id: int, doc id
        :param doc_type: str, i.e. _doc
        :param retries: int, number of retries of the function
        :return: dict, result
        """
        try:
            result = self.connection.delete(index=index_name,
                                            doc_type=doc_type,
                                            id=doc_id)
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning(
                "ESClient.remove_document connection failed, retrying..."
            )  # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            result = self.remove_document(index_name,
                                          doc_id,
                                          doc_type,
                                          retries=retries + 1)

        return result

    def bulk_update_index(self,
                          documents,
                          index_name,
                          doc_type='_doc',
                          id_field='_id'):
        """
        Bulk populates an es index with doc data
        Can also be used to add a single doc to index
        :param index_name: str, index name
        :param documents: list, of dicts index docs
        :param doc_type: str, document type for es
        :param id_field: str, document id field name
        :return: bool, success
        """
        copied_docs = copy.deepcopy(documents)

        bulk_data = []
        for body in copied_docs:
            doc_id = body[id_field]
            if ES_ID_FIELD in body:
                del body[ES_ID_FIELD]
            bulk_data.append({
                '_index': index_name,
                '_type': doc_type,
                '_source': json.dumps(body),
                '_id': doc_id
            })

        success = False
        for attempt in range(1, RETRY_ATTEMPTS + 1):
            try:
                helpers.bulk(self.connection, actions=bulk_data)
                self.connection.indices.refresh(index=index_name)
                success = True
                break
            except (es_exceptions.ConnectionTimeout,
                    es_exceptions.ConnectionError):  # pragma: no cover
                logging.warning("ESClient.bulk_update_index connection timeout"
                                )  # Retry on timeout
                self.connect(
                )  # Not sure if this is helpful, or connection is lazy?
                continue

        return success

    def create_index(self, index_name, body=None, replace=False):
        """
        Create an index by name, populate with body
        :param index_name: str, name of index
        :param body: dict, optional document to create
        :param replace: bool, force replace existing index
        :return: dict, created status info
        """
        result = None
        for _attempt in range(1, RETRY_ATTEMPTS + 1):
            try:
                result = self.connection.indices.create(index=index_name,
                                                        ignore=400,
                                                        body=body)
                result = bool('acknowledged' in result)
                break
            except es_exceptions.AuthorizationException:  # pragma: no cover
                result = False
                break
            except (es_exceptions.ConnectionTimeout,
                    es_exceptions.ConnectionError) as e:  # pragma: no cover
                logging.error(
                    "ESClient.create_index connection error: {}".format(
                        e))  # Retry on timeout
                time.sleep(RECONNECT_SLEEP_SEC)
                self.connect(
                )  # Not sure if this is helpful, or connection is lazy?
                continue

        if replace and not result:
            logging.warning(
                "ESClient.create_index replacing existing index {}".format(
                    index_name))
            self.delete_index(index_name)
            result = self.connection.indices.create(index=index_name,
                                                    ignore=400,
                                                    body=body)

        if result:
            self.connection.indices.refresh(index_name)

        return result

    def setup_index(self, index_name, doc_mapping, index_settings):
        """
        Set up Index
        :param index_name: str, index name
        :param index_settings: str or dict, index settings document
        :param doc_mapping: str or dict, index doc mapping schema
        :return: bool, setup settings and index success
        """
        # index_settings = index_settings or self.INDEX_SETTINGS
        doc_type = list(doc_mapping.keys())[0]
        settings = mapped = None
        for attempt in range(1, RETRY_ATTEMPTS + 1):
            try:
                # close index to modify settings
                self.connection.indices.close(index=index_name)
                # Creates es analyzer, filter settings
                settings = self.connection.indices.put_settings(
                    index=index_name, body=index_settings)
                self.connection.indices.open(index=index_name)

                # Sets up document structure / mapping
                mapped = self.connection.indices.put_mapping(index=index_name,
                                                             doc_type=doc_type,
                                                             body=doc_mapping)
                break
            except (es_exceptions.ConnectionTimeout,
                    es_exceptions.ConnectionError):  # pragma: no cover
                logging.warning("ESClient.setup_index connection timeout"
                                )  # Retry on timeout
                self.connect(
                )  # Not sure if this is helpful, or connection is lazy?
                continue

        return bool(settings and mapped)

    def delete_index(self, index_name):
        """
        Delete an index by name
        :param index_name: str, index name
        :return: dict, removed status
        """
        result = None
        for attempt in range(1, RETRY_ATTEMPTS + 1):
            try:
                result = self.connection.indices.delete(index=index_name)
                break
            except es_exceptions.NotFoundError:  # pragma: no cover
                result = False
                break
            except (es_exceptions.ConnectionTimeout,
                    es_exceptions.ConnectionError):  # pragma: no cover
                logging.warning("ESClient.delete_index connection timeout"
                                )  # Retry on timeout
                self.connect(
                )  # Not sure if this is helpful, or connection is lazy?
                continue

        if not result:  # pragma: no cover
            logging.warning(
                "ESClient.delete_index failed for {}".format(index_name))
        return result

    def add_alias(self, indexes, alias_name, retries=0):
        """
        Point the given alias at one or more indexes
        Note: It is possible to have one alias for multiple
        indexes but bulk populate will fail for that alias
        :param indexes: list (or single str) of index names
        :param alias_name: str, alias to use for the index
        :param retries: int, number of retries of the function
        :return: dict, added info
        """
        try:
            added = self.connection.indices.put_alias(index=indexes,
                                                      name=alias_name)
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning("ESClient.add_alias connection failed, retrying..."
                            )  # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            added = self.get_alias(indexes, alias_name, retries=retries + 1)
        return added

    def get_alias(self, alias_name=None, index_name=None, retries=0):
        """
        Return alias information: the indexes behind an alias name,
        or the aliases attached to a given index
        :param alias_name: str, alias to use for the index
        :param index_name: str, name of index
        :param retries: int, number of retries of the function
        :return: dict, alias information, or None if not found
        """
        try:
            alias = self.connection.indices.get_alias(name=alias_name,
                                                      index=index_name)
        except es_exceptions.NotFoundError:  # pragma: no cover
            alias = None
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning("ESClient.get_alias connection failed, retrying..."
                            )  # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            alias = self.get_alias(alias_name, index_name, retries=retries + 1)
        return alias

    def delete_alias(self, index_name, alias_name, retries=0):
        """
        Remove alias
        :param index_name: str, index name
        :param alias_name: str, alias to use for the index
        :param retries: int, number of retries of the function
        :return: dict, removed status
        """
        try:
            removed = self.connection.indices.delete_alias(name=alias_name,
                                                           index=index_name)
        except es_exceptions.NotFoundError:  # pragma: no cover
            return False
        except (es_exceptions.ConnectionTimeout,
                es_exceptions.ConnectionError):  # pragma: no cover
            logging.warning(
                "ESClient.delete_alias connection failed, retrying..."
            )  # Retry on timeout
            if retries > RETRY_ATTEMPTS:  # pragma: no cover
                raise
            time.sleep(RECONNECT_SLEEP_SEC)
            removed = self.delete_alias(index_name,
                                        alias_name,
                                        retries=retries + 1)
        return removed
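
# --- Usage sketch (not part of the original example) -----------------------
# The alias helpers above wrap indices.put_alias / get_alias / delete_alias
# with retry-on-connection-error handling. The ESClient constructor is not
# shown here, so this sketch drives the raw elasticsearch-py client directly;
# the index and alias names are hypothetical, and on pre-7.x clusters a
# doc_type/_type would also be needed, as in the class above.
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(["localhost:9200"])        # assumed local node, legacy host style
index_name, alias_name = "books_v1", "books"  # hypothetical names

es.indices.create(index=index_name, ignore=400)  # ignore "already exists"

# Bulk-load a couple of documents, then refresh so they are searchable
actions = [
    {"_index": index_name, "_id": i, "_source": {"title": t}}
    for i, t in enumerate(["Moby Dick", "Dracula"], start=1)
]
helpers.bulk(es, actions)
es.indices.refresh(index=index_name)

# Point the alias at the new index and read it back
es.indices.put_alias(index=index_name, name=alias_name)
print(es.indices.get_alias(name=alias_name))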
Example #20
}, {
	
	'Book': 'The tale of two cities',
	'Author': 'Charles Dickens',
	'year': 2003,
	'volumes': 3

}]
"""
for i in range(1,len(doc)+1):
	res = es.index(index="practise",doc_type="writing",id = i, body = doc[i-1])

print(res['created'])
"""
#res = es.mget(index="practise",doc_type="writing", body = {"ids":[1,2,3]} , _source = True, realtime = True)
res = es.get_source(index= "practise", doc_type="writing", id="3"	)

#res = es.search(index= "practise", doc_type="writing")
res = es.suggest(index= "practise", body = doc)
#print (res['_source'])

es.indices.refresh(index="practise")

#res = es.search(index = "practise",body = {"query":{"match_all":{}}})

res = json.dumps(res, indent = 4, sort_keys = True)
es.indices.refresh(index="practise")

print(res)
"""
print("we got %d Hits:" %res['hits']['total'])
Example #21
# tag::69a7be47f85138b10437113ab2f0d72d[]
response = es.get(
    index='twitter',
    id=2,
    routing='user1',
    stored_fields='tags,counter',
)
# end::69a7be47f85138b10437113ab2f0d72d[]
print("---------------------------------------")
print(response)
print("---------------------------------------")

print("89a8ac1509936acc272fc2d72907bc45 - L:229")
# tag::89a8ac1509936acc272fc2d72907bc45[]
response = es.get_source(
    index='twitter',
    id=1,
)
# end::89a8ac1509936acc272fc2d72907bc45[]
print("---------------------------------------")
print(response)
print("---------------------------------------")

print("d222c6a6ec7a3beca6c97011b0874512 - L:238")
# tag::d222c6a6ec7a3beca6c97011b0874512[]
response = es.get_source(
    index='twitter',
    id=1,
    _source_includes='*.id',
    _source_excludes='entities',
)
# end::d222c6a6ec7a3beca6c97011b0874512[]
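
# --- Usage note (not part of the original snippets) -------------------------
# get() and get_source() raise NotFoundError when the id does not exist, so
# callers usually guard with exists() or catch the exception. A minimal
# sketch against the same hypothetical 'twitter' index:
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError

es = Elasticsearch(["localhost:9200"])  # assumed local node


def get_tweet_source(doc_id):
    """Return the _source of a tweet, or None if it does not exist."""
    try:
        return es.get_source(index='twitter', id=doc_id)
    except NotFoundError:
        return None


print(get_tweet_source(1))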
Example #22
        return text


if __name__ == "__main__":

    BILL_CACHE = 'bills.p'

    # Load bill list
    ids = pd.read_csv('../../data/ncsl/ncsl_data_from_sample_matched.csv')

    if not os.path.isfile(BILL_CACHE):
        # Initialize database for bill retrieval
        es = ES("localhost:9200", timeout=60)
        # Retrieve bills
        bills = [
            es.get_source(index="state_bills", id=id_, doc_type="_all")
            for id_ in ids['matched_from_db']
        ]

        print("Retrieved {} bills".format(len(bills)))
        pickle.dump(bills, open(BILL_CACHE, 'wb'))

    else:
        bills = pickle.load(open(BILL_CACHE, 'rb'))

    # Initialize text cleaner
    cleaner = TextCleaner()

    # Initialize dictionary
    dictionary = corpora.Dictionary()
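
    # --- Hypothetical continuation (not part of the original example) -------
    # The snippet stops right after creating the empty gensim dictionary; the
    # natural next step is to tokenize each cached bill and feed it in. This
    # is only a sketch: it assumes the bills keep their text under
    # 'bill_document_last' (as in the other state_bills examples in this
    # collection) and that TextCleaner exposes a clean(text) method, neither
    # of which is shown above.
    documents = []
    for bill in bills:
        if not bill:
            continue
        raw_text = bill.get('bill_document_last') or ''  # assumed field name
        tokens = cleaner.clean(raw_text).split()         # assumed TextCleaner API
        documents.append(tokens)

    dictionary.add_documents(documents)
    print("Dictionary holds {} unique tokens".format(len(dictionary)))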
Example #23
class ElasticConnection():
    def __init__(self, host="localhost", port=9200):
        self.es_connection = Elasticsearch([{
            'host': host,
            'port': port
        }],
                                           timeout=200)

    # creates index for bills and model legislation stored in
    # data_path, overwriting index if it is already created
    def create_state_bill_index(self, data_path):
        if self.es_connection.indices.exists(STATE_BILL_INDEX):
            print("deleting '%s' index..." % (STATE_BILL_INDEX))
            self.es_connection.indices.delete(index=STATE_BILL_INDEX)

        mapping_doc = json.loads(
            open(os.environ['POLICY_DIFFUSION'] +
                 "/db/state_bill_mapping.json").read())
        settings_doc = json.loads(
            open(os.environ['POLICY_DIFFUSION'] +
                 "/db/state_bill_index.json").read())

        print("creating '%s' index..." % (STATE_BILL_INDEX))
        res = self.es_connection.indices.create(index=STATE_BILL_INDEX,
                                                body=settings_doc)

        print("adding mapping for bill_documents")
        res = self.es_connection.indices.put_mapping(index=STATE_BILL_INDEX,
                                                     doc_type="bill_document",
                                                     body=mapping_doc)

        bulk_data = []
        for i, line in enumerate(open(data_path)):
            json_obj = json.loads(line.strip())
            if json_obj is None:
                continue

            op_dict = {
                "index": {
                    "_index": STATE_BILL_INDEX,
                    "_type": "bill_document",
                    "_id": json_obj["unique_id"]
                }
            }

            bulk_data.append(op_dict)
            bulk_data.append(json_obj)
            if len(bulk_data) == 1000:
                self.es_connection.bulk(index=STATE_BILL_INDEX,
                                        body=bulk_data,
                                        timeout=300)

                del bulk_data
                bulk_data = []

    def create_evaluation_index_all_bills(self, data_path1, data_path2):
        '''
        data_path1 corresponds to evaluation data
        data_path2 corresponds to rest of bill data
        '''
        if self.es_connection.indices.exists(EVALUATION_INDEX_ALL_BILLS):
            print("deleting '%s' index..." % (EVALUATION_INDEX_ALL_BILLS))
            self.es_connection.indices.delete(index=EVALUATION_INDEX_ALL_BILLS)

        #use same mapping as in state index
        mapping_doc = json.loads(
            open(os.environ['POLICY_DIFFUSION'] +
                 "/db/evaluation_mapping.json").read())
        settings_doc = json.loads(
            open(os.environ['POLICY_DIFFUSION'] +
                 "/db/state_bill_index.json").read())

        print("creating '%s' index..." % (EVALUATION_INDEX_ALL_BILLS))
        res = self.es_connection.indices.create(
            index=EVALUATION_INDEX_ALL_BILLS, body=settings_doc, timeout=30)

        print("adding mapping for bill_documents")
        res = self.es_connection.indices.put_mapping(
            index=EVALUATION_INDEX_ALL_BILLS,
            doc_type="bill_document",
            body=mapping_doc)

        #load in evaluation data first
        bulk_data = []
        for i, line in enumerate(open(data_path1)):
            json_obj = json.loads(line.strip())
            if json_obj is None:
                continue

            op_dict = {
                "index": {
                    "_index": EVALUATION_INDEX_ALL_BILLS,
                    "_type": "bill_document",
                    "_id": i
                }
            }

            bulk_data.append(op_dict)
            bulk_data.append(json_obj)

        self.es_connection.bulk(index=EVALUATION_INDEX_ALL_BILLS,
                                body=bulk_data,
                                timeout=300)

        #load in rest of state bill data
        bulk_data = []
        for i, line in enumerate(open(data_path2)):
            json_obj = json.loads(line.strip())
            if json_obj is None:
                continue

            op_dict = {
                "index": {
                    "_index": EVALUATION_INDEX_ALL_BILLS,
                    "_type": "bill_document",
                    "_id": json_obj["unique_id"]
                }
            }

            bulk_data.append(op_dict)
            bulk_data.append(json_obj)
            if len(bulk_data) == 1000:
                self.es_connection.bulk(index=EVALUATION_INDEX_ALL_BILLS,
                                        body=bulk_data,
                                        timeout=300)

                del bulk_data
                bulk_data = []

    def create_evaluation_index(self, data_path):
        if self.es_connection.indices.exists(EVALUATION_INDEX):
            print("deleting '%s' index..." % (EVALUATION_INDEX))
            self.es_connection.indices.delete(index=EVALUATION_INDEX)

        #use same mapping as in state index
        mapping_doc = json.loads(
            open(os.environ['POLICY_DIFFUSION'] +
                 "/db/evaluation_mapping.json").read())
        settings_doc = json.loads(
            open(os.environ['POLICY_DIFFUSION'] +
                 "/db/state_bill_index.json").read())

        print("creating '%s' index..." % (EVALUATION_INDEX))
        res = self.es_connection.indices.create(index=EVALUATION_INDEX,
                                                body=settings_doc,
                                                timeout=30)

        print("adding mapping for bill_documents")
        res = self.es_connection.indices.put_mapping(index=EVALUATION_INDEX,
                                                     doc_type="bill_document",
                                                     body=mapping_doc)

        bulk_data = []
        for i, line in enumerate(open(data_path)):
            json_obj = json.loads(line.strip())
            if json_obj is None:
                continue

            op_dict = {
                "index": {
                    "_index": EVALUATION_INDEX,
                    "_type": "bill_document",
                    "_id": i
                }
            }

            bulk_data.append(op_dict)
            bulk_data.append(json_obj)

        self.es_connection.bulk(index=EVALUATION_INDEX,
                                body=bulk_data,
                                timeout=300)

    def get_all_doc_ids(self, index):
        count = self.es_connection.count(index)['count']
        q = {"query": {"match_all": {}}, "fields": []}
        results = self.es_connection.search(index=index, body=q, size=count)
        doc_ids = [res['_id'] for res in results['hits']['hits']]

        return doc_ids

    def similar_doc_query(self,
                          query,
                          state_id=None,
                          num_results=100,
                          return_fields=["state"],
                          index=STATE_BILL_INDEX,
                          fields="bill_document_last"):
        json_query = """ 
            {
                "query": {
                    "more_like_this": {
                        "fields": ["section_txt"],
                        "like": "",
                        "min_doc_freq": 2
                    }
                }
            }
        """
        json_query = json.loads(json_query)
        json_query['query']['more_like_this']['like'] = query

        results = self.es_connection.search(index=index,
                                            body=json_query,
                                            size=num_results)

        results = results['hits']['hits']
        result_docs = []
        for idx, res in enumerate(results):
            doc = {}

            doc['score'] = res['_score']
            doc['id'] = res['_id']
            doc['sec_id'] = res['_source']['section_id']
            doc['sec_txt'] = res['_source']['section_txt']

            # print("#%d: sec_id is %s with score %f" % (idx+1, doc['sec_id'], doc['score']))
            result_docs.append(doc)

        return result_docs

    def similar_doc_query_for_testing_lucene(self,
                                             query,
                                             match_group,
                                             state_id=None,
                                             num_results=100,
                                             return_fields=["state"],
                                             index=STATE_BILL_INDEX):
        '''
        description:
            only for testing lucene scores

        match_group represents the group of bills that an evaluation bill 
        belongs to (e.g., all the stand your ground bills)
        '''

        json_query = """ 
            {
              "query": {
                "filtered": {
                  "query": {
                    "more_like_this": {
                      "fields": [
                        "bill_document_last"
                      ],
                      "like_text": "",
                      "max_query_terms": 70,
                      "min_term_freq": 1,
                      "min_doc_freq": 2,
                      "minimum_should_match": 1
                    }
                  },
                  "filter": {
                    "bool": {
                      "must_not": {
                        "term": {
                          "bill_document.state": ""
                        }
                      }
                    }
                  }
                }
              }
            }
        """
        json_query = json.loads(json_query)
        json_query['query']['filtered']['query']['more_like_this'][
            'like_text'] = query
        json_query['query']['filtered']['filter']['bool']['must_not']['term'][
            'bill_document.state'] = str(state_id)

        results = self.es_connection.search(index=index,
                                            body=json_query,
                                            fields=return_fields,
                                            size=num_results)
        results = results['hits']['hits']
        result_docs = []
        for res in results:
            doc = {}
            for f in res['fields']:
                doc[f] = res['fields'][f][0]
            doc['score'] = res['_score']
            doc['id'] = res['_id']

            #if applicable, only return docs that are from different states
            if doc['state'] != state_id:
                result_docs.append(doc)

        return result_docs

    def get_bill_by_id(self, id, index='state_bills'):
        match = self.es_connection.get_source(index=index, id=id)
        return match

    def get_model_legislation_by_id(self, id, index=MODEL_LEGISLATION_INDEX):
        match = self.es_connection.get_source(index=index, id=id)
        return match

    def get_constitution_by_id(self, id, index=CONSTITUTIONS_INDEX):
        match = self.es_connection.get_source(index=index, id=id)
        return match

    def get_all_bills(self, step=3000):
        es = self.es_connection
        # paginated match_all query body
        body_gen = lambda start, size: {
            "from": start,
            "size": size,
            "query": {"bool": {"must": {"match_all": {}}}}
        }

        body = body_gen(0, 0)
        bills = es.search(index="state_bills", body=body)

        total = bills['hits']['total']

        all_bills = []
        start = 0
        bad_count = 0
        while start <= total:
            print(start)
            body = body_gen(start, step)
            bills = es.search(index="state_bills", body=body)
            bill_list = bills['hits']['hits']
            all_bills.append(bill_list)

            start += step

        return all_bills

    def get_bills_by_state(self, state, num_bills='all', step=3000):
        es = self.es_connection

        if num_bills == 'all':
            bills = es.search(index='state_bills',
                              doc_type='bill_document',
                              q='state:' + state)
            total = bills['hits']['total']
        else:
            total = num_bills

        # paginated term query body for the given state
        body_gen = lambda start, size: {
            "from": start,
            "size": size,
            "query": {"term": {"bill_document.state": state}}
        }

        all_bills = []
        start = 0
        bad_count = 0
        if step >= total:
            body = body_gen(start, total)
            bills = es.search(index="state_bills", body=body)
            bill_list = bills['hits']['hits']
            all_bills.extend(bill_list)
        else:
            while start <= total:
                body = body_gen(start, step)
                bills = es.search(index="state_bills", body=body)
                bill_list = bills['hits']['hits']
                all_bills.extend(bill_list)

                start += step

        return all_bills
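
# --- Usage sketch (not part of the original class) --------------------------
# Assuming the module-level constants (STATE_BILL_INDEX etc.) point at a
# populated index, a typical round trip with ElasticConnection might look
# like this; the query text is made up and the bill id is the one used in
# another example of this collection.
conn = ElasticConnection(host="localhost", port=9200)

# Find sections similar to an arbitrary piece of query text
hits = conn.similar_doc_query("concealed handgun permit reciprocity", num_results=5)
for hit in hits:
    print(hit['score'], hit['sec_id'])

# Fetch one bill directly by its id
bill = conn.get_bill_by_id('az_49th-3rd-special_SB1010')
print(bill.keys())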
Example #24
ES_PASS = os.environ.get('ES_PASS', 'changeme')

es = Elasticsearch([ES_HOST],
                   http_auth=(ES_USER, ES_PASS),
                   port=9200,
                   use_ssl=False)


def log2es(body):
    es.index(index='nexpose-process-log',
             doc_type='nexpose-process-log',
             body=body)


try:
    es.get_source(index='nexpose-log-user', doc_type='user', id='admin')
except NotFoundError as e:
    es.create(index='nexpose-log-user',
              doc_type='user',
              id='admin',
              body=json.dumps({'_password': '******'}))


def get_admin_user(username):
    try:
        return es.get_source(index='nexpose-log-user',
                             doc_type='user',
                             id=username)
    except NotFoundError as e:
        return None
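
# --- Usage sketch (not part of the original snippet) ------------------------
# Writing a process-log entry and looking the seeded admin user back up is
# just a couple of calls with the helpers above; the log body fields here
# are invented for illustration.
import datetime

log2es({
    'timestamp': datetime.datetime.utcnow().isoformat(),
    'message': 'nexpose export finished',  # illustrative field/value
})

admin = get_admin_user('admin')  # returns None if the user document is missing
print(admin is not None)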
class ElasticSearchManager():

    ##########################  Constructor  ######################################
    def __init__(self,indexName, typeName, type):
        self.client = Elasticsearch()
        self._indexName = indexName
        self._typeName = typeName

        self._uniqeTermInCorpus = 0
        self._totalNoOfDocsInCorpus = 0
        self._avgDocumentLength = 0
        self._lengthOfAllDocuments = 0

        if type == 1:
            print(type)
            self.__CreateIndex__()
            self.__SetMappingForIndex__()

    ##########################  CREATES INDEX  ######################################
    def __CreateIndex__(self):
        # DELETE index if already exists
        if self.client.indices.exists(self._indexName):
            print("deleting '%s' index..." % (self._indexName))
            res = self.client.indices.delete(index =  self._indexName)
            print " response: '%s'" % (res), '\n'

         # CREATE a new index
        print("creating '%s' index..." % (self._indexName))
        res = self.client.indices.create(index = self._indexName,body = Resource.INDEX_REQUEST_BODY)
        print(" response: '%s'" % (res), '\n')

    ########################### SETS MAPPING FOR INDEX  ##############################
    def __SetMappingForIndex__(self):
        # PUT_MAPPING - Registers specific mapping definition for a specific type.
        print("creating '%s' mapping for..." % (self._typeName))
        res2 = self.client.indices.put_mapping(doc_type = self._typeName,
                                               index =  self._indexName,
                                               body=Resource.MAPPING_BODY)
        print(" response: '%s'" % (res2))

    ########################### GENERATE LOGICAL DOC TO BE INDEXED   #################
    def __ConsituteDocument__(self,documentId, documentText):
        doc = {
            "docno": documentId,
            "text": documentText
        }
        return doc

    ##########################  PERFORMS INDEXING   ######################################
    def __UploadDocumentToIndex__(self,docCounterForTesting,documentId, documentText):
        doc = self.__ConsituteDocument__(documentId, documentText)
        res = self.client.index(index=self._indexName,doc_type=self._typeName,id=documentId,body=doc)

    def __CurrentIndexStats__(self):
        res = self.client.count(index=self._indexName, doc_type=self._typeName)
        return res

    ######################################################################################################
    # RETRIEVE
    ######################################################################################################
    def __GetAllUnigramsAsFeatures__(self):
        requestBody = Resource.REQUEST_BODY_FIND_ALL_FEATURES
        # "aggregations": {
        #     "features": {
        #         "doc_count_error_upper_bound": 0,
        #         "sum_other_doc_count": 0,
        #         "buckets": [
        #             {
        #                 "key": "http",
        #                 "doc_count": 4274
        #             },
        #             {
        #                 "key": "s",
        #                 "doc_count": 3958
        #             }
        #         ]
        #     }
        # }
        res = self.client.search(self._indexName, self._typeName, body = requestBody, search_type = "count")
        return res

    ################################################################################################################
    def __GetDocumentText__(self, docId):
        # {u'_source': {u'text': u"  fatalities have increased an estimated 31 percent o}}
        res = self.client.get_source(index=self._indexName, doc_type=self._typeName, id = docId)
        return res['text']

    #####################################################################################################
    def __GetHitsForAllDocuments__(self):
        print "Fetching hits for text for all documents from ElasticSearch ..."
        requestBody = Resource.REQUEST_BODY_FIND_TEXT_FOR_ALLDOCS
        res = self.client.search(index=self._indexName, doc_type=self._typeName, body = requestBody)
        return res['hits']['hits']
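
# --- Usage sketch (not part of the original class) --------------------------
# A hedged sketch of how this manager appears intended to be driven; the
# Resource constants it reads are not shown above, the index/type names are
# hypothetical, and the document text is taken from the comment inside
# __GetDocumentText__.
manager = ElasticSearchManager(indexName="ap_dataset", typeName="document", type=1)

manager.__UploadDocumentToIndex__(1, "AP890101-0001",
                                  "fatalities have increased an estimated 31 percent")
print(manager.__CurrentIndexStats__())

print(manager.__GetDocumentText__("AP890101-0001"))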
Example #26
from elasticsearch import Elasticsearch
from pprint import pprint
import sys
import numpy as np

es = Elasticsearch('http://172.27.125.139:9200/',
                   timeout=10,
                   retry_on_timeout=True,
                   max_retries=1)

doc = es.get_source(index="state_bills",
                    id='az_49th-3rd-special_SB1010',
                    doc_type="_all")
print(len(doc['bill_document_last']))
sys.exit()

with open('bill_ids.txt') as infile:
    ids = [x.strip('\n') for x in infile]
#
#o = np.zeros((len(ids)))

#for i, id_ in enumerate(ids):
#    doc = None
#    s = 'failed'
#    doc = es.get_source(index="state_bills", id=id_, doc_type="_all")
#    if doc is not None:
#        o[i] = 1
#        s = 'worked'
#
#    print('{}: {}, {}'.format(s, i, id_))
#
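# --- Batched alternative (not part of the original script) ------------------
# The commented-out loop above checks ids one get_source() call at a time;
# for thousands of ids a single mget per batch is much cheaper. A rough
# sketch of that batched existence check against the same index, using the
# same legacy doc_type convention as the client above:
o = np.zeros(len(ids))
batch_size = 500
for start in range(0, len(ids), batch_size):
    batch = ids[start:start + batch_size]
    resp = es.mget(index="state_bills", doc_type="_all",
                   body={"ids": batch}, _source=False)
    for offset, doc in enumerate(resp['docs']):
        if doc.get('found'):
            o[start + offset] = 1

print("{} of {} bills found".format(int(o.sum()), len(ids)))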
Example #27
File: amcates.py  Project: isususi/amcat
class _ES(object):
    def __init__(self, index, doc_type, host, port, timeout=300, **args):
        self.host = host
        self.port = port
        self.index = index
        self.doc_type = doc_type
        self.es = Elasticsearch(hosts=[
            {
                "host": self.host,
                "port": self.port
            },
        ],
                                timeout=timeout,
                                **args)

    def check_properties(self, properties):
        """
        Check if all properties are known (e.g. have mappings), and creates mappings as needed
        """
        properties = set(properties)
        if not (properties - self.get_properties()):
            return
        to_add = properties - self.get_properties()
        if to_add:
            self.add_properties(to_add)

    def add_properties(self, to_add):
        """
        Add the named properties, setting mapping depending on suffix
        """
        mappings = {}
        for name in to_add:
            ftype = name.rsplit("_", 1)[1] if "_" in name else 'default'
            mappings[name] = settings.ES_MAPPING_TYPES[ftype]
        self.es.indices.put_mapping(index=self.index,
                                    doc_type=self.doc_type,
                                    body={"properties": mappings})

    def get_mapping(self):
        m = self.es.indices.get_mapping(self.index, self.doc_type)
        return m[self.index]['mappings'][self.doc_type]['properties']

    def get_properties(self):
        self.check_index()
        return set(self.get_mapping().keys())

    def refresh(self):
        self.es.indices.refresh()

    def highlight_article(self, aid: int, query: str) -> dict:
        """Highlight article given by an article id using a Lucene query. The resulting strings
        are safe to insert into an HTML document even if the original document contained malicious
        constructs.

        If you need the original article including HTML, call html.unescape on this output."""
        from amcat.tools.amcates_queryset import ESQuerySet

        qs = ESQuerySet().filter(id=aid).only("text",
                                              "title").highlight(query,
                                                                 mark="em")

        try:
            return next(iter(qs)).to_dict()
        except StopIteration:
            raise ValueError(
                "Article(id={}) not found in elastic index.".format(aid))

    def clear_cache(self):
        self.es.indices.clear_cache()

    def delete_index(self):
        try:
            self.es.indices.delete(self.index)
        except NotFoundError:
            pass
        except Exception as e:
            if 'IndexMissingException' in str(e):
                return
            raise

    def create_index(self, shards=5, replicas=1):
        es_settings = settings.ES_SETTINGS.copy()
        es_settings.update({
            "number_of_shards": shards,
            "number_of_replicas": replicas
        })

        body = {
            "settings": es_settings,
            "mappings": {
                settings.ES_ARTICLE_DOCTYPE: settings.ES_MAPPING
            }
        }

        self.es.indices.create(self.index, body)

    def check_index(self):
        """
        Check whether the server is up and the index exists.
        If the server is down, raise an exception.
        If the index does not exist, try to create it.
        """
        if not self.es.ping():
            raise Exception("Elastic server cannot be reached")
        if not self.es.indices.exists(self.index):
            log.info("Index {self.index} does not exist, creating".format(
                **locals()))
            self.create_index()
        return self.es.cluster.health(self.index, wait_for_status='yellow')

    def exists_type(self, doc_type, **kargs):
        return self.es.indices.exists_type(index=self.index,
                                           doc_type=doc_type,
                                           **kargs)

    def put_mapping(self, doc_type, body, **kargs):
        return self.es.indices.put_mapping(index=self.index,
                                           doc_type=doc_type,
                                           body=body,
                                           **kargs)

    def status(self):
        nodes = self.es.nodes.info()['nodes'].values()
        return {
            "ping": self.es.ping(),
            "nodes": [n['name'] for n in nodes],
            "index": self.index,
            "index_health": self.es.cluster.health(self.index),
            "transport_hosts": self.es.transport.hosts,
        }

    def get(self, id, **options):
        """
        Get a single article from the index
        """
        kargs = dict(index=self.index, doc_type=self.doc_type)
        kargs.update(options)
        return self.es.get_source(id=id, **kargs)

    def mget(self, ids, doc_type=None, parents=None):
        """
        Get multiple articles from the index.
        If parents is given, it should be a sequence of the same length as ids
        """
        if parents is None: parents = [None] * len(ids)
        if doc_type is None: doc_type = self.doc_type
        getdocs = [{
            "_index": self.index,
            "_id": id,
            "_parent": parent,
            "_type": doc_type
        } for (id, parent) in zip(ids, parents)]
        return self.es.mget({"docs": getdocs})['docs']

    def search(self, body, **options):
        """
        Perform a 'raw' search on the underlying ES index
        """
        kargs = dict(index=self.index, doc_type=self.doc_type)
        kargs.update(options)
        return self.es.search(body=body, **kargs)

    def scan(self, query, **kargs):
        """
        Perform a scan query on the es index
        See: http://elasticsearch-py.readthedocs.org/en/latest/helpers.html#elasticsearch.helpers.scan
        """
        return scan(self.es,
                    index=self.index,
                    doc_type=self.doc_type,
                    query=query,
                    **kargs)

    def query_ids(self,
                  query=None,
                  filters=EMPTY_RO_DICT,
                  body=None,
                  limit=None,
                  **kwargs):
        """
        Query the index, returning a sequence of article ids for the matched articles

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filters: field filters as accepted by build_query, e.g. sets=12345
        @param body: if given, use this instead of constructing from query/filters

        Note that query and filters can be combined in a single call
        """
        if body is None:
            body = dict(build_body(query, filters, query_as_filter=True))
        for i, a in enumerate(
                scan(self.es,
                     query=body,
                     index=self.index,
                     doc_type=self.doc_type,
                     size=(limit or 1000),
                     fields="")):
            if limit and i >= limit:
                return
            yield int(a['_id'])

    def query(self,
              query=None,
              filters=EMPTY_RO_DICT,
              highlight=False,
              lead=False,
              fields=(),
              score=True,
              **kwargs):
        """
        Execute a query for the given fields with the given query and filter
        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filter: field filter DSL query dict, defaults to build_filter(**filters)
        @param kwargs: additional keyword arguments to pass to es.search, eg fields, sort, from_, etc
        @return: a list of named tuples containing id, score, and the requested fields
        """
        body = dict(
            build_body(query,
                       filters,
                       query_as_filter=(not (highlight or score))))
        if highlight and not score:
            body['query'] = {'constant_score': {'query': body['query']}}

        if 'sort' in kwargs:
            body['track_scores'] = True

        if highlight and query:
            if isinstance(highlight, dict):
                body['highlight'] = highlight
            else:
                body['highlight'] = HIGHLIGHT_OPTIONS
        if lead or False and query == "" and highlight:
            body['script_fields'] = {
                "lead": {
                    "script": {
                        "file": LEAD_SCRIPT_FIELD
                    }
                }
            }

        result = self.search(body, fields=fields, **kwargs)
        return SearchResult(result, fields, score, body, query=query)

    def query_all(self, *args, **kargs):
        kargs.update({"from_": 0})
        size = kargs.setdefault('size', 10000)
        result = self.query(*args, **kargs)
        total = result.total
        for offset in range(size, total, size):
            kargs['from_'] = offset
            result2 = self.query(*args, **kargs)
            result.hits += result2.hits

        return result

    def _get_used_properties(self, body__prop):
        body, prop = body__prop
        body["query"]["bool"]["must"][1]["exists"]["field"] = prop
        return bool(
            self.es.count(index=self.index, doc_type=self.doc_type,
                          body=body)['count'])

    def get_used_properties(self, set_ids=None, article_ids=None, **filters):
        """
        Returns a sequence of property names in use in the specified set(s) (or setids)
        """
        if set_ids is not None:
            filters["sets"] = set_ids

        if article_ids is not None:
            filters["ids"] = article_ids

        all_properties = self.get_properties()
        flexible_properties = set(all_properties) - set(ALL_FIELDS)

        body = {
            "query": {
                "bool": {
                    "must": [
                        build_filter(**filters), {
                            "exists": {
                                "field": "fakeprop"
                            }
                        }
                    ]
                }
            }
        }

        bodies = (copy.deepcopy(body) for _ in range(len(flexible_properties)))
        pool = ThreadPool()
        results = pool.imap(self._get_used_properties,
                            zip(bodies, flexible_properties))

        try:
            for found, prop in zip(results, flexible_properties):
                if found:
                    yield prop
        finally:
            pool.close()

    def add_articles(self, article_ids, batch_size=1000):
        """
        Add the given article_ids to the index. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator).
        """
        #WvA: remove redundancy with create_articles
        if not article_ids: return
        from amcat.models import Article, ArticleSetArticle

        n = len(article_ids) // batch_size  # integer batch count for logging
        for i, batch in enumerate(
                splitlist(article_ids, itemsperbatch=batch_size)):
            log.info("Adding batch {i}/{n}".format(**locals()))
            all_sets = multidict(
                (aa.article_id, aa.articleset_id)
                for aa in ArticleSetArticle.objects.filter(article__in=batch))
            dicts = (get_article_dict(article,
                                      list(all_sets.get(article.id, [])))
                     for article in Article.objects.filter(pk__in=batch))
            self.bulk_insert(dicts, batch_size=None)

    def remove_from_set(self, setid, article_ids, flush=True):
        """Remove the given articles from the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""
        if not article_ids: return
        for batch in splitlist(article_ids, itemsperbatch=1000):
            self.bulk_update(batch,
                             UPDATE_SCRIPT_REMOVE_FROM_SET,
                             params={'set': setid})

    def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
        """Add the given articles to the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""

        if not article_ids:
            if monitor:
                monitor.update()
            return

        batches = list(splitlist(article_ids, itemsperbatch=1000))
        monitor = monitor.submonitor(total=len(batches))

        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(message="Adding batch {iplus}/{nbatches}..".format(
                iplus=i + 1, nbatches=nbatches))
            self.bulk_update(batch,
                             UPDATE_SCRIPT_ADD_TO_SET,
                             params={'set': setid})

    def get_tokens(self, aid: int, fields=["text", "title"]):
        """
        Get a list of all tokens (words and their positions) in the given document
        :param aid: Article ID
        :param fields: List of fields to get the terms for
        :return: a sequence of (field, position, term) tuples
        """
        fieldstr = ",".join(fields)
        data = self.es.termvectors(self.index,
                                   self.doc_type,
                                   aid,
                                   fields=fieldstr,
                                   field_statistics=False,
                                   payloads=False,
                                   offsets=False)
        for field in fields:
            if field in data['term_vectors']:
                for term, info in data['term_vectors'][field]['terms'].items():
                    for token in info['tokens']:
                        yield field, token['position'], term

    def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()):
        """
        Bulk insert the given articles in batches of batch_size
        """
        batches = list(toolkit.splitlist(
            dicts, itemsperbatch=batch_size)) if batch_size else [dicts]
        monitor = monitor.submonitor(total=len(batches))
        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(
                1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1,
                                                            **locals()))
            props, articles = set(), {}
            for d in batch:
                props |= (set(d.keys()) - ALL_FIELDS)
                articles[d["id"]] = serialize(d)
            self.check_properties(props)
            body = get_bulk_body(articles)
            resp = self.es.bulk(body=body,
                                index=self.index,
                                doc_type=settings.ES_ARTICLE_DOCTYPE)
            if resp["errors"]:
                raise ElasticSearchError(resp)

    def update_values(self, article_id, values):
        """Update properties of existing article.

        @param values: mapping from field name to (new) value
        @type values: dict"""
        return self.bulk_update_values({article_id: values})

    def bulk_update_values(self, articles):
        """Updates set of articles in bulk.
        """
        body = get_bulk_body(
            {aid: serialize({"doc": a})
             for aid, a in articles.items()},
            action="update")
        resp = self.es.bulk(body=body,
                            index=self.index,
                            doc_type=settings.ES_ARTICLE_DOCTYPE)

        if resp["errors"]:
            raise ElasticSearchError(resp)

    def bulk_update(self, article_ids, script, params):
        """
        Execute a bulk update script with the given params on the given article ids.
        """
        payload = serialize({"script": {"file": script, "params": params}})
        body = get_bulk_body({aid: payload
                              for aid in article_ids},
                             action="update")
        resp = self.es.bulk(body=body,
                            index=self.index,
                            doc_type=settings.ES_ARTICLE_DOCTYPE)

        if resp["errors"]:
            raise ElasticSearchError(resp)

    def synchronize_articleset(self, aset, full_refresh=False):
        """
        Make sure the given articleset is correctly stored in the index
        @param full_refresh: if true, re-add all articles to the index. Use this
                             after changing properties of articles
        """
        self.check_index()  # make sure index exists and is at least 'yellow'

        log.debug("Getting SOLR ids from set")
        solr_set_ids = set(self.query_ids(filters=dict(sets=[aset.id])))
        log.debug("Getting DB ids")
        db_ids = aset.get_article_ids()
        log.debug("Getting SOLR ids")
        solr_ids = set(self.in_index(db_ids))

        to_remove = solr_set_ids - db_ids
        if full_refresh:
            to_add_docs = db_ids
            to_add_set = set()
        else:
            to_add_docs = db_ids - solr_ids
            to_add_set = (db_ids & solr_ids) - solr_set_ids

        log.warning(
            "Refreshing index, full_refresh={full_refresh},"
            "|solr_set_ids|={nsolrset}, |db_set_ids|={ndb}, |solr_ids|={nsolr} "
            "|to_add| = {nta}, |to_add_set|={ntas}, |to_remove_set|={ntr}".
            format(nsolr=len(solr_ids),
                   nsolrset=len(solr_set_ids),
                   ndb=len(db_ids),
                   nta=len(to_add_docs),
                   ntas=len(to_add_set),
                   ntr=len(to_remove),
                   **locals()))

        log.info("Removing {} articles".format(len(to_remove)))
        self.remove_from_set(aset.id, to_remove)
        log.info("Adding {} articles to set".format(len(to_add_set)))
        self.add_to_set(aset.id, to_add_set)
        log.info("Adding {} articles to index".format(len(to_add_docs)))
        self.add_articles(to_add_docs)
        log.info("Refreshing")
        self.refresh()

    def _count(self, body):
        """Raw version of count directly passing given query to elastic, while setting the index and doc_type"""
        return self.es.count(index=self.index,
                             doc_type=settings.ES_ARTICLE_DOCTYPE,
                             body=body)

    def count(self, query=None, filters=None):
        """
        Compute the number of items matching the given query / filter
        """
        filters = dict(build_body(query, filters, query_as_filter=True))
        body = {"query": {"constant_score": filters}}
        return self._count(body)["count"]

    def search_aggregate(self,
                         aggregation,
                         query=None,
                         filters=None,
                         **options):
        """
        Run an aggregate search query and return the aggregation results
        @param aggregation: raw elastic query, e.g. {"terms" : {"field" : "medium"}}
        """
        body = dict(query={
            "filtered":
            dict(build_body(query, filters, query_as_filter=True))
        },
                    aggregations={"aggregation": aggregation})
        result = self.search(body, size=0, search_type="count", **options)
        return result['aggregations']['aggregation']

    def _parse_terms_aggregate(self, aggregate, group_by, terms, sets):
        if not group_by:
            for term in terms:
                yield term, aggregate[term.label]['doc_count']
        else:
            for term in terms:
                yield term, self._parse_aggregate(aggregate[term.label],
                                                  list(group_by), terms, sets)

    def _parse_other_aggregate(self, aggregate, group_by, group, terms, sets):
        buckets = aggregate[group]["buckets"]
        if not group_by:
            return ((b['key'], b['doc_count']) for b in buckets)
        return ((b['key'], self._parse_aggregate(b, list(group_by), terms,
                                                 sets)) for b in buckets)

    def _parse_aggregate(self, aggregate, group_by, terms, sets):
        """Parse a aggregation result to (nested) namedtuples."""
        group = group_by.pop(0)

        if group == "terms":
            result = self._parse_terms_aggregate(aggregate, group_by, terms,
                                                 sets)
        else:
            result = self._parse_other_aggregate(aggregate, group_by, group,
                                                 terms, sets)
            if group == "sets" and sets is not None:
                # Filter sets if 'sets' is given
                result = ((aset_id, res) for aset_id, res in result
                          if aset_id in set(sets))
            elif group == "date":
                # Parse timestamps as datetime objects
                result = ((get_date(stamp), aggr) for stamp, aggr in result)

        # Return results as namedtuples
        ntuple = namedtuple("Aggr",
                            [group, "buckets" if group_by else "count"])
        return [ntuple(*r) for r in result]

    def _build_aggregate(self, group_by, date_interval, terms, sets):
        """Build nested aggregation query for list of groups"""
        group = group_by.pop(0)

        if group == 'date':
            aggregation = {
                group: {
                    'date_histogram': {
                        'field': group,
                        'interval': date_interval,
                        "min_doc_count": 1
                    }
                }
            }
        elif group == 'terms':
            aggregation = {
                term.label: {
                    'filter': dict(build_body(term.query))
                }
                for term in terms
            }
        else:
            aggregation = {
                group: {
                    'terms': {
                        # Default size is too small, we want to return all results
                        'size': 999999,
                        'field': group
                    }
                }
            }

        # We need to nest the other aggregations, see:
        # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-aggregations.html
        if group_by:
            nested = self._build_aggregate(group_by, date_interval, terms,
                                           sets)
            for aggr in aggregation.values():
                aggr["aggregations"] = nested

        return aggregation

    def aggregate_query(self,
                        query=None,
                        filters=None,
                        group_by=None,
                        terms=None,
                        sets=None,
                        date_interval='month'):
        """
        Compute an aggregate query, e.g. select count(*) where <filters> group by <group_by>. If
        date is used as a group_by variable, uses date_interval to bin it. It does support multiple
        values for group_by.

        You can group_by on terms by supplying "terms" to group_by. In addition, you will need to
        supply terms as a parameter, which consists of a list of SearchQuery's. Query is then used
        as a global filter, while terms are 'local'.

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @type group_by: list / tuple
        @param terms: list of SearchQuery's, required when grouping by "terms" (see above)
        @param sets: optional sequence of set ids used to filter the "sets" buckets
        """
        if isinstance(group_by, str):
            log.warning(
                "Passing strings to aggregate_query(group_by) is deprecated.")
            group_by = [group_by]

        if "terms" in group_by and terms is None:
            raise ValueError(
                "You should pass a list of terms if aggregating on it.")

        filters = dict(build_body(query, filters, query_as_filter=True))
        aggregations = self._build_aggregate(list(group_by), date_interval,
                                             terms, sets)

        body = {
            "query": {
                "constant_score": filters
            },
            "aggregations": aggregations
        }

        log.debug("es.search(body={body})".format(**locals()))
        result = self.search(body)
        result = self._parse_aggregate(result["aggregations"], list(group_by),
                                       terms, sets)
        return result

    def statistics(self, query=None, filters=None):
        """Compute and return a Result object with n, start_date and end_date for the selection"""
        body = {
            "query": {
                "constant_score":
                dict(build_body(query, filters, query_as_filter=True))
            },
            'aggregations': {
                'stats': {
                    'stats': {
                        'field': 'date'
                    }
                }
            }
        }

        stats = self.search(body, size=0)['aggregations']['stats']
        result = Result()
        result.n = stats['count']
        if result.n == 0:
            result.start_date, result.end_date = None, None
        else:
            result.start_date = get_date(stats['min'])
            result.end_date = get_date(stats['max'])
        return result

    def list_dates(self, query=None, filters=None, interval="day"):
        from amcat.tools.aggregate_es import aggregate, IntervalCategory
        for date, count in aggregate(query,
                                     filters, [IntervalCategory(interval)],
                                     es=self):
            yield date

    def in_index(self, ids):
        """
        Check whether the given ids are already indexed.
        @return: a sequence of ids that are in the index
        """
        if not isinstance(ids, list): ids = list(ids)
        log.info(
            "Checking existence of {nids} documents".format(nids=len(ids)))
        if not ids: return
        for batch in splitlist(ids, itemsperbatch=10000):
            result = self.es.mget(index=self.index,
                                  doc_type=settings.ES_ARTICLE_DOCTYPE,
                                  body={"ids": batch},
                                  fields=[])
            for doc in result['docs']:
                if doc['found']: yield int(doc['_id'])

    def duplicate_exists(self, article):
        """
        Check whether a duplicate of the given article already exists.
        If so, returns the sets that the duplicate is a member of.
        Duplication is checked using the get_hash function, so article
        should be an object with the appropriate attributes (.title etc)
        @return: A (possibly empty) sequence of results with .id and .sets
        """
        hash = get_article_dict(article).hash
        return self.query(filters={'hashes': hash},
                          fields=["sets"],
                          score=False)

    def _get_purge_actions(self, query):
        for id in self.query_ids(body=query):
            yield {
                "_op_type": "delete",
                "_id": id,
                "_index": self.index,
                "_type": settings.ES_ARTICLE_DOCTYPE
            }

    def purge_orphans(self):
        """Remove all articles without set from the index"""
        query = {
            "query": {
                "constant_score": {
                    "filter": {
                        "missing": {
                            "field": "sets"
                        }
                    }
                }
            }
        }
        return bulk(self.es, self._get_purge_actions(query))

    def get_child_type_counts(self, **filters):
        """Get the number of child documents per type"""
        filters = dict(build_body(filters=filters))
        filter = {
            "has_parent": {
                "parent_type": self.doc_type,
                "filter": filters['filter']
            }
        }
        aggs = {"module": {"terms": {"field": "_type"}}}
        body = {"aggs": {"prep": {"filter": filter, "aggs": aggs}}}
        r = self.es.search(index=self.index, search_type="count", body=body)
        for b in r['aggregations']['prep']['module']['buckets']:
            yield b['key'], b['doc_count']

    def get_articles_without_child(self, child_doctype, limit=None, **filters):
        """Return the ids of all articles without a child of the given doctype"""
        nochild = {
            "not": {
                "has_child": {
                    "type": child_doctype,
                    "query": {
                        "match_all": {}
                    }
                }
            }
        }
        filter = dict(build_body(filters=filters))['filter']
        body = {"filter": {"bool": {"must": [filter, nochild]}}}
        return self.query_ids(body=body, limit=limit)
        if len(slashsplit) > 5:
            if "?pretty" in args.server:
                args.pretty = True
                args.id = slashsplit[5].rsplit("?")[0]
            else:
                args.id = slashsplit[5]
    if args.pretty:
        tabbing = 4
    else:
        tabbing = None
    if args.idfile:
        for json_record in esidfilegenerator(host=args.host, port=args.port, index=args.index,
                                             type=args.type, body=args.body, source=args.source,
                                             headless=args.headless, source_exclude=args.exclude,
                                             source_include=args.include, idfile=args.idfile):
            sys.stdout.write(json.dumps(json_record, indent=tabbing) + "\n")
    elif args.idfile_consume:
        for json_record in esidfileconsumegenerator(host=args.host, port=args.port, index=args.index,
                                                    type=args.type, body=args.body, source=args.source,
                                                    headless=args.headless, source_exclude=args.exclude,
                                                    source_include=args.include, idfile=args.idfile_consume):
            sys.stdout.write(json.dumps(json_record, indent=tabbing) + "\n")
    elif not args.id:
        for json_record in esgenerator(host=args.host, port=args.port, index=args.index,
                                       type=args.type, body=args.body, source=args.source,
                                       headless=args.headless, source_exclude=args.exclude,
                                       source_include=args.include, verbose=True):
            sys.stdout.write(json.dumps(json_record, indent=tabbing) + "\n")
    else:
        es = Elasticsearch([{"host": args.host}], port=args.port)
        json_record = None
        if not args.headless:
            json_record = es.get(index=args.index, doc_type=args.type, _source=True,
                                 _source_exclude=args.exclude, _source_include=args.include,
                                 id=args.id)
        else:
            json_record = es.get_source(index=args.index, doc_type=args.type, _source=True,
                                        _source_exclude=args.exclude, _source_include=args.include,
                                        id=args.id)
        if json_record:
            sys.stdout.write(json.dumps(json_record, indent=tabbing) + "\n")
            
                
Example #29
0
File: amcates.py Project: amcat/amcat
class _ES(object):
    def __init__(self, index, doc_type, host, port, timeout=300, **args):
        self.host = host
        self.port = port
        self.index = index
        self.doc_type = doc_type
        self.es = Elasticsearch(hosts=[{"host": self.host, "port": self.port}, ], timeout=timeout, **args)

    def check_properties(self, properties):
        """
        Check whether all properties are known (i.e. have mappings), and create mappings as needed
        """
        to_add = set(properties) - self.get_properties()
        if to_add:
            self.add_properties(to_add)

    def add_properties(self, to_add):
        """
        Add the named properties, setting mapping depending on suffix
        """
        mappings = {}
        for name in to_add:
            ftype = name.rsplit("_", 1)[1] if "_" in name else 'default'
            mappings[name] = settings.ES_MAPPING_TYPES[ftype]
        self.es.indices.put_mapping(index=self.index, doc_type=self.doc_type,
                                    body={"properties": mappings})

    def get_mapping(self):
        m = self.es.indices.get_mapping(self.index, self.doc_type)
        return m[self.index]['mappings'][self.doc_type]['properties']

    def get_properties(self):
        self.check_index()
        return set(self.get_mapping().keys())

    def refresh(self):
        self.es.indices.refresh()

    def highlight_article(self, aid: int, query: str) -> dict:
        """Highlight article given by an article id using a Lucene query. The resulting strings
        are safe to insert into an HTML document even if the original document contained malicious
        constructs.

        If you need the original article including HTML, call html.unescape on this output."""
        from amcat.tools.amcates_queryset import ESQuerySet

        qs = ESQuerySet().filter(id=aid).only("text", "title").highlight(query, mark="em")

        try:
            return next(iter(qs)).to_dict()
        except StopIteration:
            raise ValueError("Article(id={}) not found in elastic index.".format(aid))

    def clear_cache(self):
        self.es.indices.clear_cache()

    def delete_index(self):
        try:
            self.es.indices.delete(self.index)
        except NotFoundError:
            pass
        except Exception as e:
            if 'IndexMissingException' in str(e):
                return
            raise

    def create_index(self, shards=5, replicas=1):
        es_settings = settings.ES_SETTINGS.copy()
        es_settings.update({"number_of_shards": shards,
                            "number_of_replicas": replicas})

        body = {
            "settings": es_settings,
            "mappings": {
                settings.ES_ARTICLE_DOCTYPE: settings.ES_MAPPING
            }
        }

        self.es.indices.create(self.index, body)

    def check_index(self):
        """
        Check whether the server is up and the index exists.
        If the server is down, raise an exception.
        If the index does not exist, try to create it.
        """
        if not self.es.ping():
            raise Exception("Elastic server cannot be reached")
        if not self.es.indices.exists(self.index):
            log.info("Index {self.index} does not exist, creating".format(**locals()))
            self.create_index()
        return self.es.cluster.health(self.index, wait_for_status='yellow')

    def exists_type(self, doc_type, **kargs):
        return self.es.indices.exists_type(index=self.index, doc_type=doc_type, **kargs)

    def put_mapping(self, doc_type, body, **kargs):
        return self.es.indices.put_mapping(index=self.index, doc_type=doc_type, body=body, **kargs)

    def status(self):
        nodes = self.es.nodes.info()['nodes'].values()
        return {"ping": self.es.ping(),
                "nodes": [n['name'] for n in nodes],
                "index": self.index,
                "index_health": self.es.cluster.health(self.index),
                "transport_hosts": self.es.transport.hosts,
                }

    def get(self, id, **options):
        """
        Get a single article from the index
        """
        kargs = dict(index=self.index, doc_type=self.doc_type)
        kargs.update(options)
        return self.es.get_source(id=id, **kargs)
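
    # Hypothetical usage sketch (assumes an _ES instance `es` and an existing article id):
    #   article = es.get(12345)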

    def mget(self, ids, doc_type=None, parents=None):
        """
        Get multiple articles from the index.
        If parents is given, it should be a sequence of the same length as ids
        """
        if parents is None: parents = [None] * len(ids)
        if doc_type is None: doc_type = self.doc_type
        getdocs = [{"_index": self.index, "_id": id, "_parent": parent, "_type": doc_type}
                   for (id, parent) in zip(ids, parents)]
        return self.es.mget({"docs": getdocs})['docs']

    def search(self, body, **options):
        """
        Perform a 'raw' search on the underlying ES index
        """
        kargs = dict(index=self.index, doc_type=self.doc_type)
        kargs.update(options)
        if log.isEnabledFor(logging.DEBUG):
            # pprint can be expensive
            log.debug("Search with body:\n {}".format(pprint.pformat(body)))
        return self.es.search(body=body, **kargs)

    def scan(self, query, **kargs):
        """
        Perform a scan query on the es index
        See: http://elasticsearch-py.readthedocs.org/en/latest/helpers.html#elasticsearch.helpers.scan
        """
        return scan(self.es, index=self.index, doc_type=self.doc_type, query=query, **kargs)

    def query_ids(self, query=None, filters=EMPTY_RO_DICT, body=None, limit=None, **kwargs):
        """
        Query the index, returning a sequence of article ids for the matched articles

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filters: field filters used to build the filter body (as accepted by build_body), e.g. sets=12345
        @param body: if given, use this body instead of constructing one from query/filters
        @param limit: if given, yield at most this many article ids

        Note that query and filters can be combined in a single call
        """
        if body is None:
            body = dict(build_body(query, filters, query_as_filter=True))

        log.debug("query_ids with body:\n {}".format(pprint.pformat(body)))
        for i, a in enumerate(scan(self.es, query=body, index=self.index, doc_type=self.doc_type,
                                   size=(limit or 1000), _source=False)):
            if limit and i >= limit:
                return
            yield int(a['_id'])
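
    # Hypothetical usage sketch (assumes an _ES instance `es`; the query and set id are illustrative):
    #   ids = list(es.query_ids(query='piet AND (ja* OR klaas)', filters={'sets': 42}, limit=100))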

    def query(self, query=None, filters=EMPTY_RO_DICT, highlight=False, lead=False, _source=(), score=True, **kwargs):
        """
        Execute a query for the given fields with the given query and filter
        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @param filters: field filters used to build the filter (via build_filter(**filters)), e.g. sets=12345
        @param kwargs: additional keyword arguments to pass to es.search, eg fields, sort, from_, etc
        @return: a list of named tuples containing id, score, and the requested fields
        """
        body = dict(build_body(query, filters, query_as_filter=(not (highlight or score))))
        if highlight and not score:
            body['query'] = {'constant_score': {'query': body['query']}}

        if 'sort' in kwargs:
            body['track_scores'] = True

        if highlight and query:
            if isinstance(highlight, dict):
                body['highlight'] = highlight
            else:
                body['highlight'] = HIGHLIGHT_OPTIONS
        if lead or False and query == "" and highlight:
            body['script_fields'] = {"lead": {"script": LEAD_SCRIPT_FIELD}}

        result = self.search(body, _source=_source, **kwargs)
        return SearchResult(result, _source, score, body, query=query)
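
    # Hypothetical usage sketch: fetch title and date of the top matches, sorted by date
    # (assumes an _ES instance `es`; the field names and set id are illustrative):
    #   hits = es.query(query='piet', filters={'sets': 42}, _source=('title', 'date'),
    #                   sort='date', size=10)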

    def query_all(self, *args, **kargs):
        kargs.update({"from_": 0})
        size = kargs.setdefault('size', 10000)
        result = self.query(*args, **kargs)
        total = result.total
        for offset in range(size, total, size):
            kargs['from_'] = offset
            result2 = self.query(*args, **kargs)
            result.hits += result2.hits

        return result

    def _get_used_properties(self, body__prop):
        body, prop = body__prop
        body["query"]["bool"]["must"][1]["exists"]["field"] = prop
        return bool(self.es.count(index=self.index, doc_type=self.doc_type, body=body)['count'])

    def get_used_properties(self, set_ids=None, article_ids=None, **filters):
        """
        Returns a sequence of property names in use in the specified set(s) and/or article(s)
        """
        if set_ids is not None:
            filters["sets"] = set_ids

        if article_ids is not None:
            filters["ids"] = article_ids

        all_properties = self.get_properties()
        flexible_properties = set(all_properties) - set(ALL_FIELDS)

        body = {"query": {"bool": {"must": [
            build_filter(**filters),
            {"exists": {"field": "fakeprop"}}
        ]}}}

        bodies = (copy.deepcopy(body) for _ in range(len(flexible_properties)))
        pool = ThreadPool()
        results = pool.imap(self._get_used_properties, zip(bodies, flexible_properties))

        try:
            for found, prop in zip(results, flexible_properties):
                if found:
                    yield prop
        finally:
            pool.close()

    def add_articles(self, article_ids, batch_size=1000):
        """
        Add the given article_ids to the index. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator).
        """
        # WvA: remove redundancy with create_articles
        if not article_ids: return
        from amcat.models import Article, ArticleSetArticle

        n = len(article_ids) // batch_size
        for i, batch in enumerate(splitlist(article_ids, itemsperbatch=batch_size)):
            log.info("Adding batch {i}/{n}".format(**locals()))
            all_sets = multidict((aa.article_id, aa.articleset_id)
                                 for aa in ArticleSetArticle.objects.filter(article__in=batch))
            dicts = (get_article_dict(article, list(all_sets.get(article.id, [])))
                     for article in Article.objects.filter(pk__in=batch))
            self.bulk_insert(dicts, batch_size=None)

    def remove_from_set(self, setid, article_ids, flush=True):
        """Remove the given articles from the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""
        if not article_ids: return
        for batch in splitlist(article_ids, itemsperbatch=1000):
            self.bulk_update(batch, UPDATE_SCRIPT_REMOVE_FROM_SET, params={'set': setid})

    def add_to_set(self, setid, article_ids, monitor=NullMonitor()):
        """Add the given articles to the given set. This is done in batches, so there
        is no limit on the length of article_ids (which can be a generator)."""

        if not article_ids:
            if monitor:
                monitor.update()
            return

        batches = [set(batch) for batch in splitlist(article_ids, itemsperbatch=1000)]
        monitor = monitor.submonitor(total=len(batches))

        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(message="Adding batch {iplus}/{nbatches}..".format(iplus=i + 1, nbatches=nbatches))
            missing = batch - set(self.in_index(batch))
            if missing:
                logging.warning("Adding {} missing articles to elastic".format(len(missing)))
                self.add_articles(missing)
            if batch - missing:
                self.bulk_update(batch - missing, UPDATE_SCRIPT_ADD_TO_SET, params={'set': setid})

    def get_tokens(self, aid: int, fields=["text", "title"]):
        """
        Get a list of all tokens (words and their positions) in the given document
        :param aid: Article ID
        :param fields: List of fields to get the terms for
        :return: a sequence of (field, position, term) tuples
        """
        fieldstr = ",".join(fields)
        data = self.es.termvectors(self.index, self.doc_type, aid, fields=fieldstr, field_statistics=False,
                                   payloads=False, offsets=False)
        for field in fields:
            if field in data['term_vectors']:
                for term, info in data['term_vectors'][field]['terms'].items():
                    for token in info['tokens']:
                        yield field, token['position'], term

    def bulk_insert(self, dicts, batch_size=1000, monitor=NullMonitor()):
        """
        Bulk insert the given articles in batches of batch_size
        """
        batches = list(toolkit.splitlist(dicts, itemsperbatch=batch_size)) if batch_size else [dicts]
        monitor = monitor.submonitor(total=len(batches))
        nbatches = len(batches)
        for i, batch in enumerate(batches):
            monitor.update(1, "Adding batch {iplus}/{nbatches}".format(iplus=i + 1, **locals()))
            props, articles = set(), {}
            for d in batch:
                props |= (set(d.keys()) - ALL_FIELDS)
                articles[d["id"]] = serialize(d)
            self.check_properties(props)
            body = get_bulk_body(articles)
            resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE)
            if resp["errors"]:
                raise ElasticSearchError(resp)

    def update_values(self, article_id, values):
        """Update properties of existing article.

        @param values: mapping from field name to (new) value
        @type values: dict"""
        return self.bulk_update_values({article_id: values})

    def bulk_update_values(self, articles):
        """Updates set of articles in bulk.
        """
        body = get_bulk_body({aid: serialize({"doc": a}) for aid, a in articles.items()}, action="update")
        resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE)

        if resp["errors"]:
            raise ElasticSearchError(resp)

    def bulk_update(self, article_ids, script, params):
        """
        Execute a bulk update script with the given params on the given article ids.
        """
        payload = serialize({"script": dict(script, params=params)})
        body = get_bulk_body({aid: payload for aid in article_ids}, action="update")
        resp = self.es.bulk(body=body, index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE)

        if resp["errors"]:
            raise ElasticSearchError(resp)

    def synchronize_articleset(self, aset, full_refresh=False):
        """
        Make sure the given articleset is correctly stored in the index
        @param full_refresh: if true, re-add all articles to the index. Use this
                             after changing properties of articles
        """
        self.check_index()  # make sure index exists and is at least 'yellow'

        log.debug("Getting SOLR ids from set")
        solr_set_ids = set(self.query_ids(filters=dict(sets=[aset.id])))
        log.debug("Getting DB ids")
        db_ids = aset.get_article_ids()
        log.debug("Getting SOLR ids")
        solr_ids = set(self.in_index(db_ids))

        to_remove = solr_set_ids - db_ids
        if full_refresh:
            to_add_docs = db_ids
            to_add_set = set()
        else:
            to_add_docs = db_ids - solr_ids
            to_add_set = (db_ids & solr_ids) - solr_set_ids

        log.warning("Refreshing index, full_refresh={full_refresh},"
                    "|solr_set_ids|={nsolrset}, |db_set_ids|={ndb}, |solr_ids|={nsolr} "
                    "|to_add| = {nta}, |to_add_set|={ntas}, |to_remove_set|={ntr}"
                    .format(nsolr=len(solr_ids), nsolrset=len(solr_set_ids), ndb=len(db_ids),
                            nta=len(to_add_docs), ntas=len(to_add_set), ntr=len(to_remove), **locals()))

        log.info("Removing {} articles".format(len(to_remove)))
        self.remove_from_set(aset.id, to_remove)
        log.info("Adding {} articles to set".format(len(to_add_set)))
        self.add_to_set(aset.id, to_add_set)
        log.info("Adding {} articles to index".format(len(to_add_docs)))
        self.add_articles(to_add_docs)
        log.info("Refreshing")
        self.refresh()

    def _count(self, body):
        """Raw version of count directly passing given query to elastic, while setting the index and doc_type"""
        return self.es.count(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE, body=body)

    def count(self, query=None, filters=None):
        """
        Compute the number of items matching the given query / filter
        """
        filters = dict(build_body(query, filters, query_as_filter=True))
        body = {"query": {"constant_score": filters}}
        return self._count(body)["count"]

    def search_aggregate(self, aggregation, query=None, filters=None, **options):
        """
        Run an aggregate search query and return the aggregation results
        @param aggregation: raw elastic query, e.g. {"terms" : {"field" : "medium"}}
        """
        body = dict(query={"filtered": dict(build_body(query, filters, query_as_filter=True))},
                    aggregations={"aggregation": aggregation})
        result = self.search(body, size=0, **options)
        return result['aggregations']['aggregation']
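
    # Hypothetical usage sketch: count matching articles per medium within one set
    # (assumes an _ES instance `es`; the field name and set id are illustrative):
    #   buckets = es.search_aggregate({"terms": {"field": "medium"}},
    #                                 query="climate", filters={"sets": 42})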

    def _parse_terms_aggregate(self, aggregate, group_by, terms, sets):
        if not group_by:
            for term in terms:
                yield term, aggregate[term.label]['doc_count']
        else:
            for term in terms:
                yield term, self._parse_aggregate(aggregate[term.label], list(group_by), terms, sets)

    def _parse_other_aggregate(self, aggregate, group_by, group, terms, sets):
        buckets = aggregate[group]["buckets"]
        if not group_by:
            return ((b['key'], b['doc_count']) for b in buckets)
        return ((b['key'], self._parse_aggregate(b, list(group_by), terms, sets)) for b in buckets)

    def _parse_aggregate(self, aggregate, group_by, terms, sets):
        """Parse a aggregation result to (nested) namedtuples."""
        group = group_by.pop(0)

        if group == "terms":
            result = self._parse_terms_aggregate(aggregate, group_by, terms, sets)
        else:
            result = self._parse_other_aggregate(aggregate, group_by, group, terms, sets)
            if group == "sets" and sets is not None:
                # Filter sets if 'sets' is given
                result = ((aset_id, res) for aset_id, res in result if aset_id in set(sets))
            elif group == "date":
                # Parse timestamps as datetime objects
                result = ((get_date(stamp), aggr) for stamp, aggr in result)

        # Return results as namedtuples
        ntuple = namedtuple("Aggr", [safe_identifier(group), "buckets" if group_by else "count"])
        return [ntuple(*r) for r in result]

    def _build_aggregate(self, group_by, date_interval, terms, sets):
        """Build nested aggregation query for list of groups"""
        group = group_by.pop(0)

        if group == 'date':
            aggregation = {
                group: {
                    'date_histogram': {
                        'field': group,
                        'interval': date_interval,
                        "min_doc_count": 1
                    }
                }
            }
        elif group == 'terms':
            aggregation = {
                term.label: {
                    'filter': dict(build_body(term.query))['query']
                } for term in terms
            }
        else:
            aggregation = {
                group: {
                    'terms': {
                        # Default size is too small, we want to return all results
                        'size': 999999,
                        'field': group
                    }
                }
            }

        # We need to nest the other aggregations, see:
        # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-aggregations.html
        if group_by:
            nested = self._build_aggregate(group_by, date_interval, terms, sets)
            for aggr in aggregation.values():
                aggr["aggregations"] = nested

        return aggregation

    def aggregate_query(self, query=None, filters=None, group_by=None, terms=None, sets=None, date_interval='month'):
        """
        Compute an aggregate query, e.g. select count(*) where <filters> group by <group_by>. If
        date is used as a group_by variable, uses date_interval to bin it. It does support multiple
        values for group_by.

        You can group_by on terms by supplying "terms" to group_by. In addition, you will need to
        supply terms as a parameter, which consists of a list of SearchQuery's. Query is then used
        as a global filter, while terms are 'local'.

        @param query: an elastic query string (i.e. lucene syntax, e.g. 'piet AND (ja* OR klaas)')
        @type group_by: list / tuple
        @param terms: list of SearchQuery objects, required when grouping on "terms"
        @param date_interval: bin size for the date histogram (e.g. 'day', 'week', 'month')
        """
        if isinstance(group_by, str):
            log.warning("Passing strings to aggregate_query(group_by) is deprecated.")
            group_by = [group_by]

        if "terms" in group_by and terms is None:
            raise ValueError("You should pass a list of terms if aggregating on it.")

        filters = dict(build_body(query, filters, query_as_filter=True))
        aggregations = self._build_aggregate(list(group_by), date_interval, terms, sets)

        body = {
            "query": {"constant_score": filters},
            "aggregations": aggregations
        }

        log.debug("es.search(body={body})".format(**locals()))
        result = self.search(body)
        result = self._parse_aggregate(result["aggregations"], list(group_by), terms, sets)
        return result
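
    # Hypothetical usage sketch: weekly article counts per set
    # (assumes an _ES instance `es`; the set ids are illustrative):
    #   rows = es.aggregate_query(filters={'sets': [42, 43]},
    #                             group_by=['sets', 'date'], date_interval='week')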

    def statistics(self, query=None, filters=None):
        """Compute and return a Result object with n, start_date and end_date for the selection"""
        body = {
            "query": {
                "constant_score": dict(
                    build_body(query, filters, query_as_filter=True)
                )
            },
            'aggregations': {
                'stats': {
                    'stats': {'field': 'date'}
                }
            }
        }

        stats = self.search(body, size=0)['aggregations']['stats']
        result = Result()
        result.n = stats['count']
        if result.n == 0:
            result.start_date, result.end_date = None, None
        else:
            result.start_date = get_date(stats['min'])
            result.end_date = get_date(stats['max'])
        return result

    def list_dates(self, query=None, filters=None, interval="day"):
        from amcat.tools.aggregate_es import aggregate, IntervalCategory
        for date, count in aggregate(query, filters, [IntervalCategory(interval)], es=self):
            yield date

    def in_index(self, ids):
        """
        Check whether the given ids are already indexed.
        @return: a sequence of ids that are in the index
        """
        if not isinstance(ids, list): ids = list(ids)
        log.info("Checking existence of {nids} documents".format(nids=len(ids)))
        if not ids: return
        for batch in splitlist(ids, itemsperbatch=10000):
            result = self.es.mget(index=self.index, doc_type=settings.ES_ARTICLE_DOCTYPE,
                                  body={"ids": batch}, _source=[])
            for doc in result['docs']:
                if doc['found']: yield int(doc['_id'])

    def duplicate_exists(self, article):
        """
        Check whether a duplicate of the given article already exists.
        If so, returns the sets that the duplicate is a member of.
        Duplication is checked using the get_hash function, so article
        should be an object with the appropriate attributes (.title etc)
        @return: A (possibly empty) sequence of results with .id and .sets
        """
        hash = get_article_dict(article).hash
        return self.query(filters={'hashes': hash}, _source=["sets"], score=False)

    def _get_purge_actions(self, query):
        for id in self.query_ids(body=query):
            yield {
                "_op_type": "delete",
                "_id": id,
                "_index": self.index,
                "_type": settings.ES_ARTICLE_DOCTYPE
            }

    def purge_orphans(self):
        """Remove all articles without set from the index"""
        query = {"query": {"bool": {"must_not": {"exists": {"field": "sets"}}}}}
        return bulk(self.es, self._get_purge_actions(query))

    def get_child_type_counts(self, **filters):
        """Get the number of child documents per type"""
        filters = dict(build_body(filters=filters))
        filter = {"has_parent": {"parent_type": self.doc_type, "filter": filters['filter']}}
        aggs = {"module": {"terms": {"field": "_type"}}}
        body = {"aggs": {"prep": {"filter": filter, "aggs": aggs}}}
        r = self.es.search(index=self.index, size=0, body=body)
        for b in r['aggregations']['prep']['module']['buckets']:
            yield b['key'], b['doc_count']

    def get_articles_without_child(self, child_doctype, limit=None, **filters):
        """Return the ids of all articles without a child of the given doctype"""
        nochild = {"not": {"has_child": {"type": child_doctype,
                                         "query": {"match_all": {}}}}}
        filter = dict(build_body(filters=filters))['filter']
        body = {"filter": {"bool": {"must": [filter, nochild]}}}
        return self.query_ids(body=body, limit=limit)
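
# A minimal usage sketch for the _ES wrapper above (assumptions: this runs inside the
# amcat codebase, so settings, build_body and the other module-level helpers are
# importable, and a local Elasticsearch node answers on localhost:9200; the index name,
# query and set id are illustrative only):
#
#   es = _ES(index="amcat", doc_type="article", host="localhost", port=9200)
#   es.check_index()                                  # create the index if it is missing
#   print(es.count(query="climate"))                  # number of matching articles
#   for aid in es.query_ids(query="climate", filters={"sets": 42}, limit=10):
#       print(aid)
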
Example #30
0
import collections
import os

import nltk
from elasticsearch import Elasticsearch, NotFoundError
from tqdm import tqdm
from cluseter import termgraph

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

articles_path = '../crawler/articles'
articles = []
titles = {}
abstracts = {}
dictionary = {}
stop_words = set(nltk.corpus.stopwords.words('english'))

for article_name in os.listdir(articles_path):
    try:
        article = es.get_source(index="rg",
                                doc_type="article",
                                id=int(article_name.split(".")[0]))
        articles.append(article)
        abstracts[article['id']] = collections.Counter(
            x for x in nltk.word_tokenize(article.get('abstract').lower())
            if x not in stop_words)
        titles[article.get('id')] = collections.Counter(
            x for x in nltk.word_tokenize(article.get('title').lower())
            if x not in stop_words)
    except NotFoundError:
        pass
for doc in abstracts.values():
    for t, v in doc.items():
        dictionary[t] = max(dictionary.get(t, 0), v)
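
# Hypothetical follow-up (not part of the original snippet): list the terms with the
# highest per-document frequency across the abstracts.
top_terms = sorted(dictionary.items(), key=lambda kv: kv[1], reverse=True)[:20]
print(top_terms)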

Example #31
0
class OmgAnalyzer(object):
    def __init__(self, config):
        """初始化

		设置数据库连接的wait_timeout是一个小时
		"""
        self.logger = logging.getLogger('omg.analyzer')
        self.config = config
        self.interval = config.getint('default', 'check.interval.second')
        self.running = True
        self.analyze_interval = config.getint('default',
                                              'analyze.interval.second')

        self.cleaner_on = config.get(
            'default', 'thread_analyzed_creative_cleaner.enable')
        self.collector_on = config.get('default',
                                       'thread_creative_collector.enable')

        self.es_index = config.get('elasticsearch', 'index')
        self.es_type = config.get('elasticsearch', 'type')
        self.es_analyzer = config.get('elasticsearch', 'analyzer')
        self.es_timeout = config.getint('elasticsearch', 'timeout')
        self.es_hosts = self.config.get('elasticsearch',
                                        'hosts').strip().split(",")

        self.imageteller_host = self.config.get('server', 'imageteller.host')
        self.imageteller_port = self.config.getint('server',
                                                   'imageteller.port')

        self.imageteller_transport = None
        self.imageteller_client = None

        django.db.connection.cursor().execute('set wait_timeout=3600')

    def initImagetellerClient(self, host, port):
        transport = TSocket.TSocket(host, port)
        transport = TTransport.TFramedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        self.imageteller_client = OmgService.Client(protocol)
        transport.open()
        self.imageteller_transport = transport

    def initClient(self):
        self.initImagetellerClient(self.imageteller_host,
                                   self.imageteller_port)

        self.es_client = Elasticsearch(hosts=self.es_hosts)

    def closeClient(self):
        if self.imageteller_transport is not None:
            try:
                self.imageteller_transport.close()
                self.imageteller_transport = None
            except:
                pass

    def run(self):
        self.runbackthread()
        self.runAnalyzer()

    def runbackthread(self):
        # Collector thread
        if self.collector_on == "on":
            self.collector = OmgThreadCollector(self.config)
            self.collector.daemon = True
            self.collector.start()
        # Cleaner thread
        if self.cleaner_on == "on":
            self.cleaner = OmgThreadCleaner()
            self.cleaner.daemon = True
            self.cleaner.start()

    def runAnalyzer(self):
        while self.running:
            try:
                # Note: a short-lived connection is used here so that a server restart
                # does not leave this service unusable for a long time.
                # If the server fails, skip this round of checks and retry after the sleep.
                self.initClient()
                django.db.close_old_connections()

                self.analyze()
            except:
                self.logger.exception('analyzer error')
            finally:
                self.closeClient()

            if self.running:
                time.sleep(self.interval)

    def analyze(self):
        """监控

		把上次检测之后的所有状态变化拿出来,发送报警
		"""

        self.logger.info('>>> begin analyze image <<<')
        creatives = ZeusOmg.objects.filter(translated=0)[:10]
        self.logger.info('get %d creative to analyze', len(creatives))

        for creative in creatives:
            tagstr = str()
            try:
                # Fetch tags and descriptions from the image recognition service
                imageData = ImageData()
                imageData.image_url = creative.image_url
                imageAnalyzeResult = self.imageteller_client.analyzeImage(
                    ImageDataType.IDT_URL, imageData,
                    ImageAnalyzeLanguage.IAL_EN)

                self.logger.debug(
                    "request [%s] and imageteller_client return %s",
                    creative.image_url, imageAnalyzeResult)

                if len(imageAnalyzeResult.tags) == 0 and len(
                        imageAnalyzeResult.descriptions) == 0:
                    creative.translated = 2
                    creative.save()
                    continue

                # Build the tag string from the analysis results
                for imageTag in imageAnalyzeResult.tags:
                    tagstr += imageTag.tag + ' '
                for description in imageAnalyzeResult.descriptions:
                    tagstr += description + '. '

                # Tokenize the tag string with Elasticsearch's analyze API
                analyzeRes = self.es_client.indices.analyze(
                    index=self.es_index,
                    analyzer=self.es_analyzer,
                    text=tagstr)

                tags = set()
                for token in analyzeRes['tokens']:
                    tags.add(token['token'])

                # If the same creative text already exists, merge in the new tags
                sourceRes = None
                try:
                    sourceRes = self.es_client.get_source(
                        index=self.es_index,
                        doc_type=self.es_type,
                        id=hashlib.md5(creative.creative_text).hexdigest())
                except NotFoundError:
                    sourceRes = None

                # A document with the same hash was found
                if sourceRes is not None:
                    if sourceRes['mesg'] == creative.creative_text:
                        # Different creative texts could, in theory, produce the same hash.
                        # If the actual texts differ, drop the previous text and tags and keep the latest;
                        # if the string comparison confirms the texts really are identical, merge the
                        # previous tags in before updating.
                        for tag in sourceRes['tags'].split():
                            tags.add(tag)
                    else:
                        # Same hash but different text: a collision, so discard the previous record
                        self.logger.warn(
                            "[%s] and [%s] have same md5, keep later",
                            sourceRes['mesg'], creative.creative_text)

                tagstr = ' '.join(tags)

                # Update the record in Elasticsearch
                self.es_client.index(index=self.es_index,
                                     doc_type=self.es_type,
                                     body={
                                         "tags": tagstr,
                                         "mesg": creative.creative_text
                                     },
                                     id=hashlib.md5(
                                         creative.creative_text).hexdigest(),
                                     request_timeout=self.es_timeout)
            except:
                self.logger.error(
                    'id:[%d], creative_id:[%d], text:[%s], image_url:[%s], tags:[%s]',
                    creative.id, creative.creative_id, creative.creative_text,
                    creative.image_url, tagstr)
                self.logger.exception('analyze creative error')
                continue

            self.logger.debug(
                'id:[%d], creative_id:[%d], text:[%s], image_url:[%s], tags:[%s]',
                creative.id, creative.creative_id, creative.creative_text,
                creative.image_url, tagstr)

            # Reaching this point means the image was analyzed and the tags and text were
            # saved to ES, so mark this record as translated
            creative.translated = 1
            creative.save()

            time.sleep(self.analyze_interval)

    def stop(self):
        self.running = False
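
# A hypothetical way to run the analyzer above (assumptions: `config` is a ConfigParser
# instance providing the sections and keys read in __init__, and the Thrift/Django setup
# from the original module is in place):
#
#   analyzer = OmgAnalyzer(config)
#   try:
#       analyzer.run()
#   except KeyboardInterrupt:
#       analyzer.stop()
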
class IndexToolManager:
    '''
    A class used to manage the database indexation tools used in this research.
    Provides functions to index a database with ArangoDB, Elasticsearch and Zettair,
    using the BM25 IR function implemented in each of those tools.
    Also makes it possible to query the indexed database using BM25.

    Attributes
    ----------
    indexName : str
        a string to refer to the current working data set

    bm25_b : float
        BM25 b parameter to adjust the document length compensation

    bm25_k1 : float
        BM25 k1 parameter to adjust the term-frequency weight

    bm25_k3 : float
        BM25 k3 parameter to adjust the term-frequency weight in the query (used for long queries)

    top_k : int
        Number of results to be retrieved when querying the database

    Methods
    -------
    initializeArango()
       Initializes ArangoDB, connect to a client, creates/connects to collection and view.

    '''
    def __init__(self,
                 indexName='default_index',
                 bm25_b=0.75,
                 bm25_k1=1.2,
                 bm25_k3=0.0,
                 top_k=100):
        self.indexName = indexName
        self.bm25_b = float(bm25_b)
        self.bm25_k1 = float(bm25_k1)
        self.bm25_k3 = float(bm25_k3)
        self.numberResults = int(top_k)
        self.root_path = "/home/ruan/Documentos/git/tcc-ii-ir-features-text-mining/tool-testing/"

        self.zettair_query_process = None

        self.initializeArango()
        self.initializeElastic()

        self.resultsIndexName = 'tcc_results'
        body = {
            "settings": {
                "number_of_shards": 1,
            }
        }
        if not self.elasticClient.indices.exists(index=self.resultsIndexName):
            self.elasticClient.indices.create(index=self.resultsIndexName,
                                              body=body)

        # Create the results database if it does not exist.
        if not self.arango_sys_db.has_database(self.resultsIndexName):
            self.arango_sys_db.create_database(self.resultsIndexName)

        # Connect to "test" database as root user.
        # This returns an API wrapper for "test" database.
        self.arangoResultsDb = self.arangoClient.db(self.resultsIndexName,
                                                    username=None,
                                                    password=None)

        db = self.arangoResultsDb
        # Create the results collection if it does not exist.
        # This returns an API wrapper for that collection.
        if db.has_collection(self.resultsIndexName):
            self.arangoResultsCollection = db.collection(self.resultsIndexName)
        else:
            self.arangoResultsCollection = db.create_collection(
                self.resultsIndexName)

    def get_parameters(self):
        parameters = {
            'indexName': str(self.indexName),
            'bm25_b': str(self.bm25_b),
            'bm25_k1': str(self.bm25_k1),
            'bm25_k3': str(self.bm25_k3),
            'top_k': str(self.numberResults),
        }

        return parameters

    def clean_current(self):
        self.delete_all([str(self.indexName)])

    def clean_default(self):
        default_list = []

        for item in default_db_names:
            default_list.append(str(item))
            default_list.append(str(item) + '_bulk')

        self.delete_all(default_list)

    def delete_all(self, index_list):
        '''
        Deletes the databases/indexes from all tools.

        Parameters
        ----------
        index_list : list
            String list of the database/indexes names.
        '''

        self.arango_delete(index_list)
        self.elastic_delete(index_list)

    def log_result(self, itemKey, itemBody):
        '''
        Inserts a document in the Elasticsearch database.

        Parameters
        ----------
        itemKey : str or number
            Document identifier

        itemBody : dict
            Document body/data.
        '''

        self.elasticClient.index(index=self.resultsIndexName,
                                 doc_type=self.elasticDocumentType,
                                 id=itemKey,
                                 body=itemBody)

        document = {'_key': itemKey}
        document.update(itemBody)
        self.arangoResultsCollection.insert(document)

    def get_text_from_child(self, tag):
        '''
        Recursive function to get full text from XML elements with tags.

        Parameters
        ----------
        tag : XML ElementTree element
            Element
        '''

        text = ' '
        if tag.text is not None:
            text = str(text + tag.text)
        count = 0
        for child in tag:
            count = count + 1
            text = str(text + self.get_text_from_child(child))
        return text

    def get_documents(self,
                      db='authorprof',
                      documents_xml_folder='db_authorprof/en/',
                      truth_txt='db_authorprof/truth.txt',
                      append_class_to_id=False):
        '''
        Generates a list with all documents from db formatted files.

        Parameters
        ----------
        db : str
            Database name.

        documents_xml_folder : str
            Folder that contains the XML files from the authors' documents (tweets),
            must follow the DB_AUTHORPROF task XML format.

        truth_txt : str
            Truth TXT file with authors' classifications of gender { female | male },
            must follow the DB_AUTHORPROF task TXT format.
        '''
        if (db == 'authorprof'):
            return self.get_documents_DB_AUTHORPROF(documents_xml_folder,
                                                    truth_txt,
                                                    append_class_to_id)
        if (db == 'botgender'):
            return self.get_documents_DB_BOTGENDER(documents_xml_folder,
                                                   truth_txt,
                                                   append_class_to_id)
        if (db == 'hyperpartisan'):
            return self.get_documents_DB_HYPERPARTISAN(documents_xml_folder,
                                                       truth_txt,
                                                       append_class_to_id)
        if (db == 'hyperpartisan_split_42'):
            return self.get_documents_DB_HYPERPARTISAN_split(
                documents_xml_folder, truth_txt, append_class_to_id)

        return []
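
    # Hypothetical usage sketch: load the author-profiling documents using the default
    # paths above and bulk import them into the ArangoDB collection
    # (assumes an IndexToolManager instance `tool`):
    #   docs = tool.get_documents(db='authorprof',
    #                             documents_xml_folder='db_authorprof/en/',
    #                             truth_txt='db_authorprof/truth.txt')
    #   tool.bulkImportArango(tool.bulkListGeneratorArango(docs))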

    def get_documents_DB_AUTHORPROF(self,
                                    documents_xml_folder='db_authorprof/en/',
                                    truth_txt='db_authorprof/truth.txt',
                                    append_class_to_id=False):
        '''
        Generates a list with all documents from DB_AUTHORPROF formatted files.

        Parameters
        ----------
        documents_xml_folder : str
            Folder that contains the XML files from the authors' documents (tweets),
            must follow the DB_AUTHORPROF task XML format.

        truth_txt : str
            Truth TXT file with authors' classifications of gender { female | male },
            must follow the DB_AUTHORPROF task TXT format.
        '''
        documents = []

        lines = []
        separator = ':::'

        # Open the truth file
        with open(truth_txt) as f:
            lines = f.read().splitlines()

        # Iterates over the lines, and reads each author's XML file adding the documents to the list
        for line in lines:
            author_id, gender = line.split(separator)
            author_xml = documents_xml_folder + author_id + '.xml'

            # Open the author XML file
            tree_author = ET.parse(str(author_xml),
                                   parser=ET.XMLParser(encoding="utf-8"))
            root_author = tree_author.getroot()

            number = 1
            for child in root_author[0]:
                document = {
                    'id': str(author_id + '-' + str(number)),
                    'author_id': str(author_id),
                    'gender': str(gender),
                    'class': str(gender),
                    'text': child.text
                }
                if append_class_to_id:
                    document['id'] += str(':' + str(document['class']))
                number = number + 1
                documents.append(document)

        return documents

    def get_documents_DB_BOTGENDER(self,
                                   documents_xml_folder='db_botgender/en/',
                                   truth_txt='db_authorprof/truth.txt',
                                   append_class_to_id=False):
        '''
        Generates a list with all documents from DB_BOTGENDER formatted files.

        Parameters
        ----------
        documents_xml_folder : str
            Folder that contains the XML files from the authors' documents (tweets),
            must follow the DB_BOTGENDER task XML format.

        truth_txt : str
            Truth TXT file with authors' classifications of kind {bot | human} and gender { bot | female | male },
            must follow the DB_BOTGENDER task TXT format.
        '''
        documents = []

        lines = []
        separator = ':::'

        # Open the truth file
        with open(truth_txt) as f:
            lines = f.read().splitlines()

        # Iterates over the lines, and reads each author's XML file adding the documents to the list
        for line in lines:
            author_id, kind, gender = line.split(separator)
            author_xml = documents_xml_folder + author_id + '.xml'

            # Open the author XML file
            tree_author = ET.parse(str(author_xml),
                                   parser=ET.XMLParser(encoding="utf-8"))
            root_author = tree_author.getroot()

            number = 1
            for child in root_author[0]:
                document = {
                    'id': str(author_id + '-' + str(number)),
                    'author_id': str(author_id),
                    'kind': str(kind),
                    'gender': str(gender),
                    'text': child.text,
                    'class': str(kind),
                }
                if append_class_to_id:
                    document['id'] += str(':' + str(document['class']))
                number = number + 1
                documents.append(document)

        return documents

    def get_documents_DB_HYPERPARTISAN(
            self,
            articles_xml='db_hyperpartisan/articles.xml',
            ground_truth_xml='db_hyperpartisan/ground_truth.xml',
            append_class_to_id=False):
        '''
        Generates a list with all documents from DB_HYPERPARTISAN formatted files.

        Parameters
        ----------
        articles_xml : str
            Articles XML file name, the file must have articles surrounded by <article> tags,
            must follow the DB_HYPERPARTISAN task XML format.

        ground_truth_xml : str
            Articles ground truth XML file with articles surrounded by <article> tags,
            must follow the DB_HYPERPARTISAN task XML format.
        '''

        documents = []
        # Opening the XML files
        tree_articles = ET.parse(str(articles_xml),
                                 parser=ET.XMLParser(encoding="utf-8"))
        root_articles = tree_articles.getroot()
        tree_ground_truth = ET.parse(str(ground_truth_xml))
        root_ground_truth = tree_ground_truth.getroot()

        for a_child, g_child in zip(root_articles, root_ground_truth):
            document = {
                **a_child.attrib,
                **g_child.attrib,
                'text': str(self.get_text_from_child(a_child)),
                'class': str(g_child.get('hyperpartisan')),
            }
            if append_class_to_id:
                document['id'] += str(':' + str(document['class']))
            documents.append(document)
        return documents

    def get_documents_DB_HYPERPARTISAN_split(
            self,
            articles_xml='db_hyperpartisan/articles.xml',
            ground_truth_xml='db_hyperpartisan/ground_truth.xml',
            append_class_to_id=False):
        '''
        Generates a list with all documents from DB_HYPERPARTISAN formatted files.

        Parameters
        ----------
        articles_xml : str
            Articles XML file name, the file must have articles surrounded by <article> tags,
            must follow the DB_HYPERPARTISAN task XML format.

        ground_truth_xml : str
            Articles ground truth XML file with articles surrounded by <article> tags,
            must follow the DB_HYPERPARTISAN task XML format.
        '''

        df = pd.read_csv('db_hyperpartisan/train_set.csv', dtype=str)

        documents = []
        # Opening the XML files
        tree_articles = ET.parse(str(articles_xml),
                                 parser=ET.XMLParser(encoding="utf-8"))
        root_articles = tree_articles.getroot()
        tree_ground_truth = ET.parse(str(ground_truth_xml))
        root_ground_truth = tree_ground_truth.getroot()

        for a_child, g_child in zip(root_articles, root_ground_truth):
            document = {
                **a_child.attrib,
                **g_child.attrib,
                'text': str(self.get_text_from_child(a_child)),
                'class': str(g_child.get('hyperpartisan')),
            }
            if (df['0'].str.contains(document['id']).any()):
                if append_class_to_id:
                    document['id'] += str(':' + str(document['class']))
                documents.append(document)
        return documents

    def calc_IR(self, result_df, positive_class='true'):
        '''
        Calculates IR attributes suggested in the research:
            CLASS_0_BM25_AVG
            CLASS_0_BM25_COUNT
            CLASS_0_BM25_SUM
            CLASS_1_BM25_AVG
            CLASS_1_BM25_COUNT
            CLASS_1_BM25_SUM
        and returns them as a dictionary.

        Parameters
        ----------
        result_df : DataFrame
            A query result dataframe produced by the query methods.
            Must have the columns:
                * score
                * class

        positive_class : str
            Specifies which 'class' is the positive class.
        '''
        df = result_df.copy()
        CLASS_0 = df.loc[(df['class'] != positive_class)]['score']
        CLASS_1 = df.loc[(df['class'] == positive_class)]['score']
        attrib_IR = {
            'CLASS_0_BM25_AVG':
            (0 if math.isnan(CLASS_0.mean()) else CLASS_0.mean()),
            'CLASS_0_BM25_COUNT':
            CLASS_0.count(),
            'CLASS_0_BM25_SUM':
            CLASS_0.sum(),
            'CLASS_1_BM25_AVG':
            (0 if math.isnan(CLASS_1.mean()) else CLASS_1.mean()),
            'CLASS_1_BM25_COUNT':
            CLASS_1.count(),
            'CLASS_1_BM25_SUM':
            CLASS_1.sum(),
        }
        return attrib_IR
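
    # Hypothetical usage sketch: IR attributes from a small, hand-made result frame
    # (assumes an IndexToolManager instance `tool`):
    #   df = pd.DataFrame([[12.3, 'a1', 'true'], [4.2, 'a2', 'false']],
    #                     columns=['score', 'id', 'class'])
    #   attribs = tool.calc_IR(df, positive_class='true')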

    def initializeArango(self):
        '''
        Initialize ArangoDB with the specific parameters used by the repository,
        also sets it up to the research, creating the collection and view needed.

        Parameters
        ----------
        none :
            none
        '''

        # Initialize the ArangoDB client.
        self.arangoClient = ArangoClient(hosts='http://localhost:8529')

        # Connect to "_system" database as root user.
        # This returns an API wrapper for "_system" database.
        self.arango_sys_db = self.arangoClient.db('_system',
                                                  username=None,
                                                  password=None)

        index_name = self.indexName
        # Create the index database if it does not exist.
        if not self.arango_sys_db.has_database(index_name):
            self.arango_sys_db.create_database(index_name)

        # Connect to "test" database as root user.
        # This returns an API wrapper for "test" database.
        self.arangoDb = self.arangoClient.db(index_name,
                                             username=None,
                                             password=None)

        db = self.arangoDb
        # Create the index collection if it does not exist.
        # This returns an API wrapper for that collection.
        if db.has_collection(index_name):
            self.arangoCollection = db.collection(index_name)
        else:
            self.arangoCollection = db.create_collection(index_name)

        # Retrieve list of views.
        view_list = db.views()

        # Creates the view used by the Analyzer to Search and use BM25
        self.arangoViewName = str('v_' + index_name)
        if not view_list:
            db.create_view(name=self.arangoViewName,
                           view_type='arangosearch',
                           properties={
                               'cleanupIntervalStep': 0,
                               'consolidationIntervalMsec': 0,
                               'writebufferSizeMax': 0,
                               'links': {
                                   index_name: {
                                       "analyzers": ["text_en"],
                                       "includeAllFields": True,
                                       "storeValues": 'id'
                                   }
                               }
                           })

        # Configure AQL query cache properties
        db.aql.cache.configure(mode='off', max_results=100000)

    def arango_delete(self, databases):
        '''
        Deletes the databases from ArangoDB.

        Parameters
        ----------
        databases : list
            String list of the database names.
        '''

        for db in databases:
            # Delete database named 'db' if it does exist.
            if self.arango_sys_db.has_database(str(db)):
                self.arango_sys_db.delete_database(str(db))

    def insertArango(self, itemKey, itemBody):
        '''
        Inserts a document in the ArangoDB 'indexName' collection.

        Parameters
        ----------
        itemKey : str or number
            Document identifier

        itemBody : dict
            Document body/data.
        '''

        document = {'_key': itemKey}
        document.update(itemBody)
        self.arangoCollection.insert(document)

    def insertDocumentArango(self, document):
        '''
        Inserts a document in the ArangoDB 'indexName' collection.

        Parameters
        ----------
        document : dict
            Document to be inserted, might contain a '_key' or '_id' value,
            e.g.: '_key' : ' document1',  or '_id' : 'collection_name/document1'
        '''

        self.arangoCollection.insert(document)

    def bulkListGeneratorArango(self, bulkItems):
        '''
        Generates bulk documents ready to import to the ArangoDB collection.

        Parameters
        ----------
        bulkItems : list
            Bulk items to be processed, must contain an 'id' field.
        '''

        documentList = []
        tempdict = bulkItems.copy()
        for item in tempdict:
            document = {'_key': item.pop('id'), **item}
            documentList.append(document)

        return documentList

    def bulkImportArango(self, documentList):
        '''
        Bulk import to ArangoDB collection.

        Parameters
        ----------
        documentList : list of dicts
            List of documents to be inserted in the ArangoDB collection.
            Every document must have an '_key' field.
            e.g. of document list:
                [{'_key': 'document1', 'field1': 'value1', 'field2': 'value2'},
                {'_key': 'document2', 'field1': 'value4', 'field2': 'value5'}]
        '''

        self.arangoCollection.import_bulk(documentList)

    def arango_query(self, query, ignore_first_result=False):
        '''
        Query ArangoDB view and returns a Pandas DataFrame with the results.

        Parameters
        ----------
        query : str
            Text to be queried to the view using BM25 analyzer.
        '''
        initial = time.time()
        escaped_query = str(query).replace('\\', '')
        escaped_query = str(escaped_query).replace("'", "\\\'")

        nResults = int(self.numberResults)
        if ignore_first_result:
            nResults += 1
        aqlquery = (f"FOR d IN {str(self.arangoViewName)} SEARCH " +
                    f"ANALYZER(d.text IN TOKENS('{escaped_query}'" +
                    f", 'text_en'), 'text_en') " +
                    f"SORT BM25(d, {self.bm25_k1}, {self.bm25_b}) " +
                    f"DESC LIMIT {nResults} " +
                    f"LET sco = BM25(d, {self.bm25_k1}, " +
                    f"{self.bm25_b}) RETURN {{ doc: d, score: sco }}")
        # print(aqlquery)
        cursor = self.arangoDb.aql.execute(query=aqlquery,
                                           count=True,
                                           batch_size=self.numberResults,
                                           optimizer_rules=['+all'],
                                           cache=True)
        item_list = []
        # print(1, time.time()-initial)
        initial = time.time()
        for item in cursor.batch():
            # print(item)
            item_list.append([
                item['score'], item['doc']['_id'].split('/')[-1],
                item['doc']['class']
            ])
        # print(2, time.time()-initial)
        if ignore_first_result and (len(item_list) > 0):
            item_list.pop(0)
        return pd.DataFrame(item_list, columns=['score', 'id', 'class'])
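    # Query sketch (assumes the same hypothetical `repo` instance with
    # `numberResults`, `bm25_k1`, `bm25_b` and the ArangoSearch view set up):
    #
    #     df = repo.arango_query('first document')
    #     # 'df' has columns ['score', 'id', 'class'], ranked by BM25 score.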

    def arango_get_document(self, key):
        '''
        Get a document from ArangoDB database, returns the document.

        Parameters
        ----------
        key : str
            Document key.
        '''
        result = self.arangoCollection.get(str(key))
        return result

    def arango_get_IR_variables(self,
                                query,
                                positive_class='true',
                                ignore_first_result=False):
        '''
        Queries the ArangoDB search view and returns a dict with the IR variables.

        Parameters
        ----------
        query : str
            Text to be queried to the view using the BM25 analyzer.
        '''
        result_df = self.arango_query(query,
                                      ignore_first_result=ignore_first_result)

        return self.calc_IR(result_df=result_df, positive_class=positive_class)

    def initializeElastic(self):
        '''
        Initializes the Elasticsearch client with the BM25 parameters used by
        the repository and creates the index if it does not exist yet.

        Parameters
        ----------
        none : none
        '''
        # Initialize the Elasticsearch client.
        self.elasticClient = Elasticsearch(hosts='http://localhost:9200')

        self.elasticDocumentType = '_doc'
        body = {
            "settings": {
                "number_of_shards": 1,
                "index": {
                    "similarity": {
                        "default": {
                            "type": "BM25",
                            "b": self.bm25_b,
                            "k1": self.bm25_k1
                        }
                    }
                }
            }
        }
        if not self.elasticClient.indices.exists(index=self.indexName):
            self.elasticClient.indices.create(index=self.indexName, body=body)
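        # Sketch (not part of the original flow): the BM25 similarity settings
        # applied above can be inspected after index creation, e.g.:
        #
        #     settings = self.elasticClient.indices.get_settings(index=self.indexName)
        #     print(settings[self.indexName]['settings']['index']['similarity'])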

    def elastic_delete(self, indices):
        '''
        Deletes complete indices from Elasticsearch.

        Parameters
        ----------
        indices : list
            String list of the indices names.
        '''

        for index in indices:
            # Delete the index named 'index' if it exists.
            if self.elasticClient.indices.exists(index=str(index)):
                self.elasticClient.indices.delete(index=str(index))

    def insertElastic(self, itemKey, itemBody):
        '''
        Inserts a document in the Elasticsearch database.

        Parameters
        ----------
        itemKey : str or number
            Document identifier

        itemBody : dict
            Document body/data.
        '''

        self.elasticClient.index(index=self.indexName,
                                 doc_type=self.elasticDocumentType,
                                 id=itemKey,
                                 body=itemBody)

    def bulkInsertGeneratorElastic(self, bulkItems):
        '''
        Generates a bulk body of insert Elasticsearch operations.

        Parameters
        ----------
        bulkItems : list
            Bulk items to be processed, must contain an 'id' field.
        '''

        bulkBody = []
        for item in bulkItems:
            # Copy each item so the caller's dicts are not mutated by pop().
            doc = dict(item)
            action = [{
                'index': {
                    "_index": self.indexName,
                    "_id": doc.pop('id')
                }
            }, doc]
            bulkBody.extend(action)

        return bulkBody

    def bulkHelperInsertGeneratorElastic(self, bulkItems):
        '''
        Generates a bulk body of insert Elasticsearch operations.

        Parameters
        ----------
        bulkItems : list
            Bulk items to be processed, must contain an 'id' field.
        '''

        actions = []
        for item in bulkItems:
            # Copy each item so the caller's dicts are not mutated.
            action = dict(item)
            action['_index'] = self.indexName
            action['_id'] = action.pop('id')
            action['_type'] = self.elasticDocumentType
            actions.append(action)

        return actions

    def bulkElastic(self, bulkBody):
        '''
        Bulk Elasticsearch operations.

        Parameters
        ----------
        bulkBody : list or str with operations separated by newlines ('\n')
            Bulk operations to be executed, already in the format and order to be executed.
            All operations must have an '_id' in their metadata field.
            e.g. of an index operation over the 'index_name' index:
                [{'index': {'_index': 'index_name', '_id': 'document_id'}}, {'field1': 'value1'}]
        '''

        self.elasticClient.bulk(index=self.indexName, body=bulkBody)
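    # Raw bulk sketch (assumes the same hypothetical `repo` instance): the
    # action/document pairs built by bulkInsertGeneratorElastic are sent in
    # a single bulk request, then the index is refreshed.
    #
    #     items = [{'id': 'doc1', 'text': 'first document', 'class': 'true'}]
    #     repo.bulkElastic(repo.bulkInsertGeneratorElastic(items))
    #     repo.refreshElastic()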

    def bulkHelperElastic(self, bulkHelperActions):
        '''
        Bulk Elasticsearch operations through the elasticsearch.helpers API.

        Parameters
        ----------
        bulkHelperActions : list of dicts
            Actions in the format expected by elasticsearch.helpers.bulk();
            every action must carry an '_id' field.
            e.g. of an index action over the repository index:
                [{'_index': 'index_name', '_id': 'document_id', 'field1': 'value1'}]
        '''

        # print(len(bulkHelperActions))
        # print(bulkHelperActions[0])
        r = ElasticsearchHelpers.bulk(
            client=self.elasticClient,
            actions=bulkHelperActions,
            index=self.indexName,  # thread_count=6,
            chunk_size=500,
            max_chunk_bytes=1000 * 1024 * 1024)
        # print(r)
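    # Equivalent sketch using the helpers-based path (same hypothetical
    # `repo` and `items` as in the previous sketch):
    #
    #     repo.bulkHelperElastic(repo.bulkHelperInsertGeneratorElastic(items))
    #     repo.refreshElastic()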

    def refreshElastic(self):
        '''
        Refreshes the repository's Elasticsearch index.

        Parameters
        ----------
        none : none
        '''
        self.elasticClient.indices.refresh(index=self.indexName)

    def elastic_query(self, query, ignore_first_result=False):
        '''
        Queries the Elasticsearch index, returns a Pandas DataFrame with the
        results.

        Parameters
        ----------
        query : str
            Text to be queried to the index using the BM25 similarity
            implemented by Elasticsearch.

        ignore_first_result : bool
            If True, the top-ranked result is discarded.
        '''
        # Strip single quotes from the query text before searching.
        escaped_query = str(query).replace("'", " ")
        nResults = int(self.numberResults)
        if ignore_first_result:
            nResults += 1
        result = self.elasticClient.search(
            index=self.indexName,
            body={"query": {
                "match": {
                    "text": escaped_query
                }
            }},
            size=nResults)
        hit_list = []
        for hit in result['hits']['hits']:
            hit_list.append(
                [hit['_score'], hit['_id'], hit['_source']['class']])
        if ignore_first_result and (len(hit_list) > 0):
            hit_list.pop(0)
        return pd.DataFrame(hit_list, columns=['score', 'id', 'class'])
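    # Query sketch (assumes the same hypothetical `repo` instance and a
    # populated index); ignore_first_result=True drops the top-ranked hit:
    #
    #     df = repo.elastic_query('first document', ignore_first_result=True)
    #     # 'df' has columns ['score', 'id', 'class'], ranked by the BM25
    #     # similarity configured in initializeElastic().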

    def elastic_get_document(self, id):
        '''
        Get a document from a Elasticsearch index, returns the document.

        Parameters
        ----------
        id : str
            Document id.
        '''
        result = self.elasticClient.get_source(index=self.indexName,
                                               id=str(id))
        return result

    def elastic_get_IR_variables(self,
                                 query,
                                 positive_class='true',
                                 ignore_first_result=False):
        '''
        Queries the Elasticsearch index, returns a dict with the IR variables.

        Parameters
        ----------
        query : str
            Text to be queried to the index using BM25 similarity
            implemented by Elasticsearch.
        '''
        result_df = self.elastic_query(query,
                                       ignore_first_result=ignore_first_result)

        return self.calc_IR(result_df=result_df, positive_class=positive_class)

    def initializeZettair(self):
        # Zettair is driven through its command-line tool ('zet'), so there
        # is no client object to initialize here.
        print('')

    def saveToTrecFileZettair(self, bulkItems):
        '''
        Writes the bulk items to a TREC-formatted text file
        ('<indexName>.txt') for Zettair to index.

        Parameters
        ----------
        bulkItems : list of dicts
            Items to be written; each must contain 'id' and 'text' fields.
        '''
        filename = str(self.indexName) + '.txt'
        with open(filename, "w+") as f:
            for d in bulkItems:
                f.write(f'<DOC>\n<DOCNO>{d["id"]}</DOCNO>\n{d["text"]}\n</DOC>\n')
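    # For reference, each item is written in TREC format, e.g. (illustrative):
    #
    #     <DOC>
    #     <DOCNO>doc1</DOCNO>
    #     first document
    #     </DOC>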

    def zettair_index(self):
        '''
        Builds the Zettair index from the TREC file written by
        saveToTrecFileZettair, using the 'zet' command-line tool.
        '''
        trecfile = str(self.indexName) + '.txt'
        cmd = f'zet -i -f {self.indexName} -t TREC --big-and-fast {trecfile}'
        res = subprocess.run(cmd,
                             shell=True,
                             universal_newlines=True,
                             check=True,
                             capture_output=True)
        print(res)

    def zettair_query(self,
                      query,
                      interactive=True,
                      ignore_first_result=False):
        '''
        Queries the Zettair index, returns a Pandas DataFrame with the results.

        Parameters
        ----------
        query : str
            Text to be queried to the index using the BM25 (Okapi) metric.

        interactive : bool
            If True, reuses a long-lived interactive 'zet' process;
            otherwise a new 'zet' process is spawned per query.

        ignore_first_result : bool
            If True, the top-ranked result is discarded.
        '''
        escaped_query = str(query).replace('\\', '')
        escaped_query = str(escaped_query).replace('"', ' ')
        escaped_query = str(escaped_query).replace('`', '\\`')
        nResults = int(self.numberResults)
        if ignore_first_result:
            nResults += 1
        if (self.zettair_query_process is None):
            self.zettair_query_process = subprocess.Popen(
                [
                    'zet', '-f', self.root_path + self.indexName, '-n',
                    str(nResults), '--okapi', f'--b={self.bm25_b}',
                    f'--k1={self.bm25_k1}', f'--k3={self.bm25_k3}',
                    '--summary=none', '--big-and-fast'
                ],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        # print(escaped_query)
        # p.terminate()
        out = ''
        lines = []
        if interactive:
            escaped_query = str(escaped_query).replace("'", " ")
            escaped_query = "'" + escaped_query + "'"
            # out, err = self.zettair_query_process.communicate(
            #     escaped_query.encode('utf-8'))
            # out = out.decode('utf-8')
            # print(out.decode('utf-8'))
            escaped_query = str(escaped_query).replace('\n', ' ')
            # print(escaped_query)
            self.zettair_query_process.stdin.write(
                escaped_query.encode('utf-8') + b'\n')
            # self.zettair_query_process.stdin.write(escaped_query+'\n')
            self.zettair_query_process.stdin.flush()
            # print(escaped_query)
            fl = self.zettair_query_process.stdout.readline()
            while len(fl.decode('utf-8').split()) > 7:
                # print(fl)
                fl = self.zettair_query_process.stdout.readline()
            # print(fl.decode('utf-8').split('>'))
            lines.append(fl.decode('utf-8').split('>')[1])
            while fl != b'\n' and fl != b'> \n':
                fl = self.zettair_query_process.stdout.readline()
                self.zettair_query_process.stdout.flush()
                if not self.zettair_query_process.poll() is None:
                    print('POOL\n', self.zettair_query_process.poll())
                    err = self.zettair_query_process.stderr.readline()
                    if err != "":
                        print('ERROR\n', err)
                lines.append(fl.decode('utf-8'))
            # for line in iter(self.zettair_query_process.stdout.readline, b'\n'):
            #     lines.append(line.decode('utf-8'))
            #     self.zettair_query_process.stdout.flush()
            #     # print(line)
            # print('END')
        else:
            escaped_query = '"' + escaped_query + '"'
            cmd = f'zet -f {self.root_path}{self.indexName} -n {str(nResults)} --okapi ' + \
                f'--b={self.bm25_b} --k1={self.bm25_k1} --k3={self.bm25_k3} ' + \
                f'--summary=none --big-and-fast {escaped_query}'
            res = subprocess.run(cmd,
                                 shell=True,
                                 universal_newlines=True,
                                 check=True,
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out = res.stdout

            # Process Zettair query result
            # linesx = out.split('>')[1].splitlines()
            linesx = out.splitlines()
            # linesx = (line for line in linesx if line)    # Non-blank lines
            for line in linesx:
                if line:
                    lines.append(line)
                else:  # breaks after first blank line, next line is the summary
                    break
        # Iterates over the lines, extracts the id and score
        res_list = []
        len_lines = len(lines)
        # if not (len_lines <= 2 and lines[0] == ' '):
        if (len_lines > 2 and (not lines[0] == ' ')):
            for line in lines:
                line_split = line.split()
                if (len(line_split) >= 4):
                    stuff = line_split[1].split(':')
                    cur_id = stuff[0]
                    # print(f'stuff: {stuff}')
                    if (len(stuff) > 1):
                        cl = stuff[1]
                    else:
                        cl = self.elastic_get_document(str(cur_id))['class']
                    score = line_split[3].split(',')[0]
                    # cl = 'true'
                    res_list.append([float(score), cur_id, cl])
        if ignore_first_result and (len(res_list) > 0):
            res_list.pop(0)
        return pd.DataFrame(res_list, columns=['score', 'id', 'class'])

    def zettair_get_IR_variables(self,
                                 query,
                                 positive_class='true',
                                 interactive=True,
                                 ignore_first_result=False):
        '''
        Queries the Zettair index, returns a dict with the IR variables.

        Parameters
        ----------
        query : str
            Text to be queried to the index using BM25 metric.
        '''
        result_df = self.zettair_query(query,
                                       interactive,
                                       ignore_first_result=ignore_first_result)

        return self.calc_IR(result_df=result_df, positive_class=positive_class)

    def zettair_delete(self, index_name):
        '''
示例#33
0
文件 (File): save.py  项目 (Project): tomsovic/Learn
class EsSongci():
    ES_HOST = [
        "http://192.168.1.24:9200/",
        "http://192.168.1.24:9201/",
    ]

    def __init__(self, index_name="songci", index_type="songci_type", hosts=None, transport_class=Transport, **kwargs):
        """
        :param index_name: index name
        :param index_type: index (document) type
        :param hosts:
        :param transport_class:
        :param kwargs:
        """
        self.index_name = index_name
        self.index_type = index_type
        if hosts is None:
            self.es = Elasticsearch(hosts=self.ES_HOST, transport_class=transport_class, **kwargs)
        else:
            self.es = Elasticsearch(hosts=hosts, transport_class=transport_class, **kwargs)

    def deleteIndex(self):
        if self.es.indices.exists(index=self.index_name):
            result = self.es.indices.delete(index=self.index_name, ignore=(400, 404))
            print(result)


    def createIndex(self):
        """ 创建映射:
        auth: 作者
        title: 宋体标题
        content: 词内容
        md5: _id
        创建索引,创建索引名称为songci,类型为songci_type的索引
        :param ex: Elasticsearch对象
        :return:
        """

        _index_mappings = {
            '_source': {
                'enabled': True
            },
            "properties": {
                "content": {
                    "type": "text",
                    "index": True,
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_max_word"
                },
                # "date": {
                #     "type": "text",
                #     "index": True
                # },
                "auth": {
                    "type": "keyword",
                    "index": False,
                    # "index" : "not_analyzed" #  为了避免这种问题,我们需要告诉 Elasticsearch 该字段具有精确值,要将其设置成 not_analyzed 无需分析的。
                },
                "title": {
                    "type": "text",
                    "index": True,
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_max_word"
                },
                "random": {
                    "type": "integer",
                    "index": False,
                    # "index" : "not_analyzed" #  为了避免这种问题,我们需要告诉 Elasticsearch 该字段具有精确值,要将其设置成 not_analyzed 无需分析的。
                },
            }
        }
        if not self.es.indices.exists(index=self.index_name):
            self.es.indices.create(index=self.index_name, ignore=400)
            # Put the mapping on this class's own index rather than a hard-coded one.
            res = self.es.indices.put_mapping(index=self.index_name, doc_type=self.index_type, body=_index_mappings)
            print(res)


    def builkIndexData(self):
        """  用bulk将批量数据存储到es """
        ACTIONS = []
        i = 1
        for md5, title, auth, content in getPoetry():
            # print(md5, title, auth, content)
            action = {
                "_index": self.index_name,
                "_type": self.index_type,
                "_id": md5,  #_id 也可以默认生成,不赋值
                "_source": {
                    "title": title,
                    "auth": auth,
                    "content": content,
                    "random": i,
                }
            }
            i += 1
            ACTIONS.append(action)
        success, _ = bulk(self.es, ACTIONS, index=self.index_name, raise_on_error=True)
        print('Performed %d actions' % success)
        # print(_)

    def deleteIndexData(self, id):
        '''
        Delete a single document from the index.
        :param id:
        :return:
        '''
        res = self.es.delete(index=self.index_name, doc_type=self.index_type, id=id)
        print(res)

    def updateDataByID(self, id, body=None):
        """ 更新文档 """
        # {"doc": {"age": 37, "country": "china"}}
        res = self.es.update(index=self.index_name, id=id, doc_type=self.index_type, body=body)
        print(res)


    def getDataById(self, id):
        """ 获取文档信息 """
        res = self.es.get(index=self.index_name, doc_type=self.index_type, id=id)
        pprint.pprint(res)
        # print(res['_source'])

        # Get the document content (source only)
        res = self.es.get_source(index=self.index_name, id=id, doc_type=self.index_type)
        print(res)

    def getDataByBody(self):
        """ 获取索引中的一条 """
        # doc = {'query': {'match_all': {}}}
        dsl = {
            "query": {
                "match": {
                    "auth": "吴文英111"
                }
            }
        }
        dsl1 = {
            "query": {
                "bool": {
                    "must": {"term": {"auth": "吴文英111"}}
                }
            }
        }
        dsl = {
            "query" : {
                "constant_score" : {
                    "filter" : {
                        "term" : { "random" : 2 }
                    }
                }
            }
        }
        _searched = self.es.search(index=self.index_name, body=dsl)
        pprint.pprint(_searched)
        for hit in _searched['hits']['hits']:
            pass
            # print (hit['_source']['auth'], hit['_source']['content'], hit['_source']['title'])


    def mget(self, ids):
        """
        多条数据查询
        """
        res = self.es.mget(index=self.index_name, doc_type=self.index_type, body={'ids': ids})
        pprint.pprint(res)
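
# A minimal driver for this class might look like the sketch below (an
# illustration, not part of the original file; it assumes the Elasticsearch
# hosts above are reachable with the IK analysis plugin installed, and that
# getPoetry() is provided by the surrounding project).
if __name__ == '__main__':
    songci = EsSongci()
    songci.deleteIndex()      # drop any previous index
    songci.createIndex()      # recreate it with the ik_max_word mapping
    songci.builkIndexData()   # bulk-load the poems from getPoetry()
    songci.getDataByBody()    # run the sample constant_score query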
示例#34
0
    try:
        connection = Elasticsearch(
            config['elasticsearch_hosts'],
            # sniff before doing anything
            sniff_on_start=True,
            # refresh nodes after a node fails to respond
            sniff_on_connection_fail=True,
            # and also every 60 seconds
            sniffer_timeout=60)

        try:
            out = connection.delete(index=config['default_index'],
                                    doc_type='group',
                                    id='2')
            data = connection.get_source(index=config['default_index'],
                                         doc_type="group",
                                         id='2')
            print(json.dumps(data, indent=4, sort_keys=True))
        except Exception as e:
            pass
        finally:
            pass
    except Exception as e:
        print("Failed to add item")
        print("Test failed")
        traceback.print_exc()
        ret = 1
    finally:
        print(test_name + " Test complete")

    sys.exit(ret)
示例#35
0
class ElasticBookStorage(object):
    def __init__(self):
        self.book_index = ELASTIC_INDEX
        self.book_doc = ELASTIC_DOC
        self.ELK_HOSTNAME = ELASTIC_HOSTNAME
        self.ELK_PORT = ELASTIC_PORT

        self.es = Elasticsearch([{
            'host': self.ELK_HOSTNAME,
            'port': self.ELK_PORT
        }])

    def create_book_index(self):
        """
        The following function is used to create the book index
        :return: None
        :Examples:
            >>> elk = ElasticBookStorage()
            >>> elk.create_book_index()
        """
        try:
            self.es.indices.create(index=self.book_index, ignore=400)
        except Exception as ex:
            print(ex)

    def bulk_insert(self, data):
        """
        The following function is used to insert bulk data to ElasticSearch

        :param data: list of dict
        :return:

        Example:
            >>> data = [{ "title": "Solr in Action", "authors": ["trey grainger", "timothy potter"], "summary" : "Comprehensive guide","publish_date" : "2015-12-03", "num_reviews": 18, "publisher": "manning" }]
            >>> bulk_insert(data)
        """
        try:
            actions = [{
                "_index": self.book_index,
                "_type": self.book_doc,
                "_id": i,
                "_source": data[i]
            } for i in range(len(data))]
            helpers.bulk(self.es, actions=actions)
        except Exception as e:
            print(e)

    def create_book_doc(self, title, authors, summary, publisher, num_reviews,
                        publish_date):
        """
        The following function is used to create a book entry to elasticsearch using the provided info
        :param title: book title
        :param authors: book authors
        :param summary: book summary
        :param publisher: book publisher
        :param num_reviews: book number of reviews
        :param publish_date: book publish date
        :return: None

        :Example:
            >>> title="Some book"
            >>> authors=["Author1", "Author2"]
            >>> summary = "this is a book written by Author1 and Author2"
            >>> publisher = "Loki AE"
            >>> num_reviews = 20
            >>> publish_date = "2014-04-05"
            >>> elk = ElasticBookStorage()
            >>> elk.create_book_doc(title, authors, summary, publisher, num_reviews, publish_date)
        """
        try:
            body = {
                "title": title,
                "authors": authors,
                "summary": summary,
                "publisher": publisher,
                "num_reviews": num_reviews,
                "publish_date": publish_date
            }
            self.es.index(index=self.book_index,
                          doc_type=self.book_doc,
                          body=body)
        except Exception as e:
            print(e)

    def retrieve_book_by_id(self, book_id):
        """
        The following function is used to retrieve a book document from the elastic search using is ID
        :param book_id: book id
        :return: result document

        :Example:
            >>> elk = ElasticBookStorage()
            >>> book = elk.retrieve_book_by_id(book_id=2)
        """
        try:
            results = self.es.get_source(index=self.book_index,
                                         doc_type=self.book_doc,
                                         id=str(book_id))
            return results
        except Exception as ex:
            print(ex)

    def remove_book_doc(self, book_id):
        """
        The following function is used to remove a book entry from elastic search
        using its ID
        :param book_id: book ID
        :return:

        :Example:
            >>> elk = ElasticBookStorage()
            >>> elk.remove_book_doc(book_id=2)
        """
        try:
            self.es.delete(index=self.book_index,
                           doc_type=self.book_doc,
                           id=str(book_id))
        except Exception as e:
            print(e)

    def multi_match_query(self, query):
        """
        The following function is used to perform a basic match query using elastic search
        functionalities

        :param query: provided query parameter
        :return: results

        :Examples:
            >>> elk = ElasticBookStorage()
            >>> results = elk.basic_match_query(query="guide")
        """
        try:
            results = self.es.search(index=self.book_index,
                                     q=query)["hits"]["hits"]
            return results
        except Exception as e:
            print(e)

    def search_book_by_param(self, *args, _source=[]):
        """
        The following function is used to retrieve results from
        elastic search searching for books that contains in their title

        the provided query
        :param _source: source argument
        :param query: provided query to search
        :return: results

        :Examples:
            >>> query = "in action"
            >>> elk = ElasticBookStorage()
            >>> results = elk.search_book_by_title("title", "in action")
        """
        try:
            term = args[0]
            query = args[1]

            body = {
                "query": {
                    "match": {
                        "{}".format(term): query
                    }
                },
                "_source": _source
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ee:
            print(ee)

    def fuzzy_queries(self, query, _source=[], **kwargs):
        """
        The following function receives a query and search to match books using the provided query
        to match books title and summary using Fuzzy matching.

        Fuzzy matching can be enabled on Match and Multi-Match queries to catch spelling errors.
        The degree of fuzziness is specified based on the Levenshtein distance from the original word,
        i.e. the number of one-character changes that need to be made to one string to make it the same
        as another string.

        :param query: provided query
        :return: results

        :Examples:
            >>> query="comprihensiv guide"
            >>> elk = ElasticBookStorage()
            >>> elk.fuzzy_queries(query, fields=["title", "summary"])
        """
        try:
            fields = kwargs["fields"]
            body = {
                "query": {
                    "multi_match": {
                        "query": query,
                        "fields": fields,
                        "fuzziness": "AUTO"
                    }
                },
                "_source": _source,
                "size": 1
            }

            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ee:
            print(ee)
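    # For instance, with "fuzziness": "AUTO" the misspelled query
    # "comprihensiv guide" can still match "comprehensive guide": terms of
    # six or more characters are allowed up to two single-character edits
    # (here, one substitution plus one insertion). Usage sketch:
    #
    #     elk = ElasticBookStorage()
    #     hits = elk.fuzzy_queries("comprihensiv guide",
    #                              fields=["title", "summary"])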

    def wild_card_query(self, _source=[], **kwargs):
        """
        This function is used to perform ElasticSearch wild card queries

        :param args: given arguments
        :param query: given query
        :return:

        Example:
            >>> wild_card_query("authors", query="t*")
        """
        try:
            field = kwargs['field']
            query = kwargs['query']

            body = {
                "query": {
                    "wildcard": {
                        "{}".format(field): query
                    }
                },
                "highlight": {
                    "fields": {
                        "".format(field): {}
                    }
                },
                "_source": _source
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ex:
            print(ex)

    def regex_query(self, **kwargs):
        """
        Regexp queries allow you to specify more complex patterns than wildcard queries

        :param query: provided query
        :param args: provided arguments
        :return:

        Example:
            >>> regex_query("authors", query="t[a-z]*y")
        """
        try:
            field = kwargs['field'],
            query = kwargs['query']

            body = {
                "query": {
                    "regexp": {
                        "{}".format(field): query
                    }
                },
                "highlight": {
                    "fields": {
                        "{}".format(field): {}
                    }
                },
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ex:
            print(ex)

    def match_phrase_query(self, query, **kwargs):
        """
        The match phrase query requires that all the terms in the query string be present in the document,
        be in the order specified in the query string and be close to each other.
        By default, the terms are required to be exactly beside each other but you can specify the slop value which
        indicates how far apart terms are allowed to be while still considering the document a match.

        :param query: provided query
        :param kwargs: provided kwargs
        :return:

        Example:
            >>> match_phrase_query(query="search engine", fields=["title", "summary"], slop=3)
        """
        try:
            fields = kwargs["fields"]
            slop = kwargs["slop"]

            body = {
                "query": {
                    "multi_match": {
                        "query": query,
                        "fields": fields,
                        "type": "phrase",
                        "slop": slop
                    }
                },
                "_source": []
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ex:
            print(ex)
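    # For instance, with slop=3 the phrase "search engine" still matches a
    # summary like "search over the engine", where the terms sit up to three
    # positions apart. Usage sketch:
    #
    #     elk = ElasticBookStorage()
    #     hits = elk.match_phrase_query("search engine",
    #                                   fields=["title", "summary"], slop=3)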

    def match_phrase_prefix(self, query, slop, max_expansions=10, _source=[]):
        """
        Match phrase prefix queries provide search-as-you-type, or a poor man's version of autocomplete, at query
        time without needing to prepare your data in any way. Like the match_phrase query,
        it accepts a slop parameter to make the word order and relative positions somewhat less rigid.
        It also accepts the max_expansions parameter to limit the number of matched terms and reduce resource usage.

        :param query: provided query
        :param slop: provided slop
        :param max_expansions: provided max expansions
        :return:

        Example:
            >>> match_phrase_prefix(query="search en", slop=3)
        """
        try:
            body = {
                "query": {
                    "match_phrase_prefix": {
                        "summary": {
                            "query": query,
                            "slop": slop,
                            "max_expansions": max_expansions
                        }
                    }
                },
                "_source": _source
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ex:
            print(ex)

    def term_query(self, _source=[], **kwargs):
        """
        The above examples have been examples of full-text search.

        :param kwargs: provided kwargs
        :return:

        Example:
            >>> term_query(field="publisher", term="manning")
        """
        try:
            field = kwargs["field"]
            term = kwargs["term"]

            if isinstance(term, list):
                term_or_terms = "terms"
            else:
                term_or_terms = "term"

            body = {
                "query": {
                    "{}".format(term_or_terms): {
                        "{}".format(field): term
                    }
                },
                "_source": _source
            }
            results = self.es.search(index=self.book_index,
                                     body=body)["hits"]["hits"]
            return results
        except Exception as ex:
            print(ex)
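    # When a list is passed, the method switches to a 'terms' query and
    # matches documents whose field holds any of the exact values. Usage
    # sketch:
    #
    #     elk = ElasticBookStorage()
    #     hits = elk.term_query(field="publisher", term=["manning", "oreilly"])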

    def delete_by_query(self, query, fields):
        """
        This function is used to delete by query

        :param query: provided query
        :param fields: provided fields
        :return:

        Example:
            >>> delete_by_query(query="python", fields=['title'])
        """
        try:
            # Reuse the client configured in __init__ instead of a default one.
            s = Search(using=self.es, index=self.book_index)

            retrieved_items = s.query(
                Q("multi_match", query=query, fields=fields))
            retrieved_items.delete()
        except Exception as ex:
            print(ex)

    def update_by_query(self, **kwargs):
        """
        This function is used to update ElasticSearch entries by record

        :param kwargs: provided kwargs
        :return:

        Example:
            >>> update_by_query(fields="publisher", query="oreilly", field_to_update="publisher", new_value="OnMedia")
        """
        try:
            # Reuse the client configured in __init__ instead of a default one.
            ubq = UpdateByQuery(using=self.es, index=self.book_index)

            search_fields = kwargs["fields"]
            query = kwargs["query"]
            field_to_update = kwargs["field_to_update"]
            new_value = kwargs["new_value"]

            ubq.query("multi_match", query=query, fields=search_fields).script(
                source="ctx._source.{}='{}'".format(field_to_update,
                                                    new_value)).execute()
        except Exception as ex:
            print(ex)

    def query_combination(self, **kwargs):
        """
        This function performs Combined bool queries

        :param kwargs: provided kwargs
        :return:

        Example:
            >>> should = [["title", "Elasticsearch"], ["title", "Solr"]]
            >>> must = [["authors", "clinton gormely"]]
            >>> must_not = [["authors", "radu george"]]
            >>> query_combination(should=should, must=must, must_not=must_not)

        """
        try:
            # Reuse the client configured in __init__ instead of a default one.
            s = Search(using=self.es, index=self.book_index)

            q = Q('bool',
                  must=[
                      Q({
                          "multi_match": {
                              "query": "{}".format(m[1]),
                              "fields": ["{}".format(m[0])]
                          }
                      }) for m in kwargs["must"]
                  ] if "must" in kwargs.keys() else [],
                  should=[
                      Q({
                          "multi_match": {
                              "query": "{}".format(m[1]),
                              "fields": ["{}".format(m[0])]
                          }
                      }) for m in kwargs["should"]
                  ] if "should" in kwargs.keys() else [],
                  must_not=[
                      Q({
                          "multi_match": {
                              "query": "{}".format(m[1]),
                              "fields": ["{}".format(m[0])]
                          }
                      }) for m in kwargs["must_not"]
                  ] if "must_not" in kwargs.keys() else [],
                  minimum_should_match=1)
            response = s.query(q).execute()["hits"]["hits"]
            return response
        except Exception as ex:
            print(ex)