Python IndicesClient.analyze 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: elasticsearch.client.indices

클래스/타입: IndicesClient

메소드/함수: analyze

hotexamples.com에서의 예제들: 2

Python IndicesClient.analyze - 2개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한, Python의 elasticsearch.client.indices.IndicesClient.analyze를 실제로 사용한 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

IndicesClient(17)

create(7)

delete(5)

exists(4)

get(3)

analyze(2)

get_settings(2)

close(1)

flush(1)

get_alias(1)

get_mapping(1)

open(1)

put_mapping(1)

예제 #1

파일 보기

파일: es_interface.py 프로젝트: pdsujnow/BioSum

class ESInterface():

    """Interface for ElasticSearch.

    Wraps one index (``index_name``) of an ElasticSearch server and offers
    helpers for searching, counting, tokenizing and fetching documents.
    Fetched pages can optionally be cached in a ``shelve`` file.
    """

    _count_total = -1  # N: number of docs in the index; -1 means "not fetched yet"
    _idf = None  # dict caching idf values per term, created lazily by get_idf

    # Characters that carry special meaning in the Lucene query syntax; this
    # is the set escaped by Lucene's own QueryParserBase.escape().
    _RESERVED_QUERY_CHARS = frozenset('\\+-!():^[]"{}~*?|&/')

    def __init__(self, host='localhost', port=9200,
                 index_name='pubmed', cred_path='.cred',
                 cache_path=('/Users/rmn/git/BioSum/biosum-supervised/'
                             'cache/pages.p')):
        """Connect to the server and open the page cache.

        Parameters:
            - host, port: address of the ElasticSearch instance
            - index_name: index every method of this object operates on
            - cred_path: path of a credentials file (stored, currently unused)
            - cache_path: shelve file used by get_page(cache=True)
        """
        self.host = host
        self.port = port
        self.index_name = index_name
        self.cred_path = cred_path
        self.es = self.__connect()
        self.ic = IndicesClient(self.es)
        self.page_cache = shelve.open(cache_path, writeback=False)

    def login(self, username, password):
        """Placeholder; does nothing."""
        pass

    @property
    def description(self):
        """Dict with 'host', 'port', 'index_name' and the index 'mapping'."""
        mapping = self.es.indices.get_mapping(self.index_name)
        return {'host': self.host,
                'port': self.port,
                'index_name': self.index_name,
                'mapping': mapping[self.index_name]['mappings']}

    @property
    def size(self):
        """Number of documents in the index, as reported by index stats."""
        stats = self.es.indices.stats()['indices'][self.index_name]
        return stats['total']['docs']['count']

    def __connect(self):
        '''Private method used to connect to the ElasticSearch instance.

        Raises OSError when the server does not answer a ping, and re-raises
        the TransportError (with a friendlier message for 403/404) when the
        index mapping cannot be fetched.
        '''
        es = ES(hosts=[{'host': self.host, 'port': self.port}])

        # checks if server exists
        if not es.ping():
            raise OSError('It appears that nothing is running at http://%s:%s'
                          % (self.host, self.port))

        # Credential loading is disabled, so the login request carries no
        # payload and the parsed response is not used; the request and the
        # JSON parse are kept so failures surface exactly as before.
        url = 'http://%s:%s/login' % (self.host, self.port)
        json.loads(requests.post(url).text)

        # checks if index exists
        try:
            es.indices.get_mapping(self.index_name)
        except TransportError as e:
            status = e.args[0]
            message = None
            if status == 403:
                message = ('Credentials not valid for %s:%s/%s' %
                           (self.host, self.port, self.index_name))
            elif status == 404:
                self.__del__()
                message = ('No index named "%s" is avaliable at %s:%s' %
                           (self.index_name, self.host, self.port))
            if message is not None:
                err = list(e.args)
                err[1] = message
                e.args = tuple(err)
            raise
        return es

    def __del__(self):
        """Best-effort logout request when the object goes away."""
        try:
            requests.post('http://%s:%s/logout' % (self.host, self.port))
        except Exception:
            # A finalizer must never raise: it may run on a half-built
            # object or during interpreter shutdown.
            pass

    def esc(self, txt):
        """Return txt with every Lucene query-syntax character escaped.

        Each reserved character is prefixed with a backslash in a single
        pass, so backslashes already present in txt are escaped exactly once.
        """
        reserved = self._RESERVED_QUERY_CHARS
        return ''.join('\\' + ch if ch in reserved else ch for ch in txt)

    def find_all(self, source_fields=None, doc_type=''):
        """Return the hits of a match_all query (size 1000000).

        When source_fields is truthy it is sent as the "fields" of the query.
        """
        q_body = {"query": {"match_all": {}}}
        if source_fields:
            q_body["fields"] = source_fields
        response = self.es.search(body=q_body, size=1000000,
                                  index=self.index_name, doc_type=doc_type)
        return response['hits']['hits']

    def multi_field_search(self,
                           field_vals,
                           fields=('sentence', 'mm-concepts', 'noun_phrases'),
                           maxsize=1000,
                           field_boost=(1, 3, 2),
                           offset=0,
                           source_fields=(),
                           doc_type='',
                           params=None):
        '''Interface for dis_max queries over several fields.
        Parameters:
            - field_vals [required]: a list of field values to query;
                                     field_vals[i] is matched against
                                     fields[i] with boost field_boost[i]
            - maxsize [optional]:   number of results to get.
                                    default is 1000.
            - params [optional]:    extra entries merged into the dis_max
                                    clause
        Returns results.'''
        queries = []
        # Indexing (rather than zip) keeps the IndexError raised when more
        # values than fields/boosts are supplied.
        for idx, value in enumerate(field_vals):
            queries.append({"match": {
                fields[idx]: {
                    "query": value,
                    "boost": field_boost[idx]
                }}})

        dis_max = {"queries": queries}
        if params is not None:
            dis_max.update(params)

        q_body = {"fields": source_fields, "query": {"dis_max": dis_max}}
        return self._cursor_search(q_body, maxsize, offset, doc_type)

    def simple_search(self, query, field='_all', maxsize=1000,
                      offset=0, source_fields=(), doc_type='',
                      operator='or', phrase_slop=0, escape=False, params=None):
        '''Interface for simple query tasks.
        Parameters:
            - query [required]: the string to query
            - maxsize [optional]:   number of results to get.
                                    default is 1000.
            - escape [optional]:    escape query-syntax characters in query
            - params [optional]:    extra entries merged into the
                                    query_string clause
        Returns results.'''
        if escape:
            query = self.esc(query)

        query_string = {
            'query': query,
            'default_operator': operator,
            'use_dis_max': True,
            'auto_generate_phrase_queries': True,
            'phrase_slop': phrase_slop
        }
        if params is not None:
            query_string.update(params)
        # Applied last so an explicit field wins over params.
        if field:
            query_string['default_field'] = field

        q_body = {"fields": source_fields,
                  'query': {'query_string': query_string}}
        return self._cursor_search(q_body, maxsize, offset, doc_type)

    def _count_or_raise(self, q):
        """Run a count request for body q; raise RuntimeError on shard failure."""
        resp = self.es.count(body=q, index=self.index_name)
        if resp['_shards']['failed'] > 0:
            raise RuntimeError("ES count failed: %s" % (resp,))
        return resp['count']

    def count(self, query, field='_all', operator="AND"):
        """Return the number of documents matching a query_string query.

        Raises RuntimeError when any shard reports a failure.
        """
        q = {
            'query': {
                "query_string": {
                    "default_field": field,
                    "default_operator": operator,
                    "query": query
                }
            }
        }
        return self._count_or_raise(q)

    def _cursor_search(self, q, maxsize, offset, doc_type):
        """Run search body q and return its list of hits."""
        response = self.es.search(index=self.index_name,
                                  body=q,
                                  size=maxsize,
                                  from_=offset,
                                  doc_type=doc_type)
        return response['hits']['hits']

    def update_field(self, docid, doc_type,
                     field_name, field_value):
        ''' Update field field_name with field_value'''
        body = {'doc': {field_name: field_value}}
        self.es.update(id=docid, doc_type=doc_type,
                       index=self.index_name, body=body)

    def get_page_by_res(self, res_dict, cache=False):
        """Fetch the page referenced by a search hit ('_id' and '_type')."""
        return self.get_page(res_dict['_id'],
                             res_dict['_type'],
                             cache=cache)

    def get_page(self, docid, doc_type, cache=False):
        ''' Retrieve a page's source from the index
        Parameters:
            - docid [required]: the ES id of the page to retrieve
            - doc_type [required]: the ES document type to retrieve
            - cache [optional]: read from / write to the shelve page cache
        '''
        key = str("-".join((docid, self.index_name, doc_type)))

        if cache and key in self.page_cache:
            return self.page_cache[key]

        page = self.es.get_source(id=docid,
                                  index=self.index_name,
                                  doc_type=doc_type)
        if cache:
            self.page_cache[key] = page
            self.page_cache.sync()
        return page

    def get_index_analyzer(self):
        """Return the name of the first analyzer in the index settings."""
        settings = self.ic.get_settings(index=self.index_name)
        analyzers = (settings[self.index_name]['settings']['index']
                     ['analysis']['analyzer'])
        # next(iter(...)) picks the same first key as keys()[0] did, and
        # also works where keys() is a view.
        return next(iter(analyzers))

    def tokenize(self, text, field="text", analyzer=None):
        ''' Return a list of tokenized tokens
        Parameters:
            - text [required]: the text to tokenize
            - field [optional]: the field whose ES analyzer
                                should be used (default: text)
            - analyzer [optional]: analyzer name passed as request parameter
        Returns an empty list when the request is rejected (RequestError).
        '''
        params = {}
        if analyzer is not None:
            params['analyzer'] = analyzer
        try:
            response = self.ic.analyze(body=text, field=field,
                                       index=self.index_name,
                                       params=params)
            return [d['token'] for d in response['tokens']]
        except RequestError:
            return []

    def _span_near_query(self, field, terms, slop, in_order):
        """Build the span_near clause matching terms as a phrase on field."""
        return {
            "span_near": {
                "clauses": [{"span_term": {field: term}} for term in terms],
                "slop": slop,  # max number of intervening unmatched pos.
                "in_order": in_order,
                "collect_payloads": False
            }
        }

    def phrase_search(self, phrase, doc_type='',
                      field='_all', slop=0, in_order=True,
                      maxsize=1000, offset=0, source_fields=()):
        ''' Retrieve documents containing a phrase.
            Does not return the documents' source.
            Returns an empty list when the phrase yields no tokens. '''
        phraseterms = self.tokenize(phrase, field=field)
        if not phraseterms:
            return []

        q = {
            "fields": source_fields,
            "query": self._span_near_query(field, phraseterms,
                                           slop, in_order)
        }
        return self._cursor_search(q, maxsize, offset, doc_type)

    def phrase_count(self, phrase, field='_all', slop=0, in_order=True):
        """Return the number of documents containing a phrase.

        Returns 0 when the phrase yields no tokens; raises RuntimeError when
        any shard reports a failure.
        """
        phraseterms = self.tokenize(phrase, field=field)
        if not phraseterms:
            # A count is a number: nothing to search for means zero matches.
            return 0

        q = {"query": self._span_near_query(field, phraseterms,
                                            slop, in_order)}
        return self._count_or_raise(q)

    def index_hash(self):
        ''' Weak hash (only considers mapping and size) of index_name '''
        ic_sts = self.ic.stats(index=self.index_name)['_all']['total']['store']
        ic_map = self.ic.get_mapping(index=self.index_name)
        s = u"_".join((u"%s" % (ic_sts,), u"%s" % (ic_map,)))
        # md5 needs bytes; encoding explicitly works on Python 2 and 3.
        return hashlib.md5(s.encode('utf-8')).hexdigest()

    def set_mappings(self, mapdict):
        ''' Set mapping for documents in index according to map_dict;
            only documents types with an entry in map dict are updated.
            No input check; PLEASE FOLLOW SPECIFICATIONS!
            format:
            {<doc_type_1>: {'properties': {'doc_field_1': {<properties>}
                                           ...
                                           'doc_filed_n': {<properties>}
                                           }
                            }
            }
        '''
        # .items() is required: iterating the dict itself yields only keys.
        for doc_type, mapping in mapdict.items():
            self.es.indices.put_mapping(index=self.index_name,
                                        doc_type=doc_type,
                                        body=mapping)

    def get_termvector(self, doc_type, docid, fields=None):
        """ Return the term vector and statistics
            for document docid of type doc_type.
            If fields is not provided, an empty field list is sent.
        """
        if fields is None:
            fields = []
        body = {
            "fields": fields,
            "offsets": True,
            "payloads": True,
            "positions": True,
            "term_statistics": True,
            "field_statistics": True
        }
        return self.es.termvector(index=self.index_name,
                                  doc_type=doc_type,
                                  id=docid,
                                  body=body)

    def add(self, index,
            doc_type,
            entry,
            docid=None):
        """Index document entry under index/doc_type, optionally with an id."""
        self.es.index(index=index, doc_type=doc_type, body=entry,
                      id=docid)

    def get_avg_size(self, field):
        '''
        Get the average of doc[field].size() over all documents,
        computed by an "avg" aggregation on a match_all query.
        '''
        q = {
            "fields": [field],
            "query": {"match_all": {}},
            "aggs": {
                "my_agg": {
                    "avg": {
                        "script": "doc['%s'].size()" % field
                    }
                }
            }
        }
        res = self.es.search(index=self.index_name, body=q)
        return res['aggregations']['my_agg']['value']

    def get_idf(self, term):
        '''
        Returns the idf of a given term on the index

        Computed as log((N - n + 0.5) / (n + 0.5)) where N is the total
        document count and n the count for term; 0 when n is 0.
        Both N and the per-term values are cached on the object.

        Args:
            term(str)

        Returns:
            float -- idf value
        '''
        if self._count_total == -1:
            self._count_total = self.count(query='*:*')

        cache = self._idf
        if cache is not None and term in cache:
            return cache[term]

        count = self.count(term)
        if count == 0:
            idf = 0
        else:
            idf = math.log(
                (self._count_total - count + 0.5) / (count + 0.5))

        if cache is None:
            # First use: create the cache on the instance, not the class.
            self._idf = {term: idf}
        else:
            cache[term] = idf
        return idf

    def scan_and_scroll(self, doc_type, scroll_size=500, scroll_timeout=10):
        """
        The scan search type allows to efficiently scroll a large result set.
        The response will include no hits, with two important results,
        the total_hits will include the total hits that match the query
        and the scroll_id that allows to start the scroll process.
        Returns a list of results

        @param scroll_size: scroll size
        @param scroll_timeout: roundtrip timeout, in minutes
        """
        timeout = str(scroll_timeout) + 'm'
        q_body = {"query": {"match_all": {}}}
        result = self.es.search(self.index_name,
                                doc_type,
                                q_body,
                                search_type='scan',
                                scroll=timeout,
                                size=scroll_size)
        res = self.es.scroll(result['_scroll_id'], scroll=timeout)
        finalres = []
        while res['hits']['hits']:
            # Progress output: size of the page just fetched.
            print(len(res['hits']['hits']))
            finalres += res['hits']['hits']
            res = self.es.scroll(res['_scroll_id'], scroll=timeout)
        return finalres

예제 #2

파일 보기

파일: elasticsearchclient.py 프로젝트: AuthEceSoftEng/agora-elasticsearch-client

class ElasticSearchClient:
    """
    Class used as a client to the Elasticsearch server.

    All operations act on a single index (``indexname``) holding two document
    types, 'projects' and 'files'; files are stored as children of projects.
    """
    def __init__(self, host, port, username, password, indexname):
        """
        Initializes this Elasticsearch Client.

        :param host: the HTTP address of the Elasticsearch server.
        :param port: the HTTP port of the Elasticsearch server.
        :param username: the username for connecting to the index.
        :param password: the password for connecting to the index.
        :param indexname: the name of the Elasticsearch index.
        """
        self.indexname = indexname
        self.client = Elasticsearch(connection_class=SafeRequestsHttpConnection,
                                    host=host, port=int(port),
                                    http_auth=[username, password])
        self.snapshotclient = SnapshotClient(self.client)
        self.indicesclient = IndicesClient(self.client)

    def _repository_name(self):
        """Returns the name of the snapshot repository used for this index."""
        return os.path.basename("backup" + self.indexname)

    def _snapshot_name(self):
        """Returns the name of the snapshot used for this index."""
        return self.indexname + "snapshot"

    def _ensure_index_exists(self):
        """Creates the index from properties/indexsettings.json if it is missing."""
        if not self.client.indices.exists(self.indexname):
            self.client.indices.create(
                index=self.indexname,
                body=load_file_to_json("properties/indexsettings.json"))

    def delete_index_and_mappings(self):
        """
        Deletes the index and all its mappings. A missing index is ignored.
        """
        try:
            self.client.indices.delete(index=self.indexname)
        except NotFoundError:
            pass

    def create_index_and_mappings(self, update_mappings=False):
        """
        Creates or updates the index and its mappings.

        :param update_mappings: boolean denoting whether the mappings should be created (False) or updated (True).
        """
        self._ensure_index_exists()

        # Fetch the mappings once and reuse the response.
        all_mappings = self.client.indices.get_mapping(self.indexname)
        mappings = {}
        if self.indexname in all_mappings:
            mappings = all_mappings[self.indexname]['mappings']

        doc_type_files = (
            ('files', "properties/filesproperties.json"),
            ('projects', "properties/projectsproperties.json"),
        )
        if update_mappings:
            self.client.indices.close(self.indexname)
        try:
            for doc_type, properties_file in doc_type_files:
                if doc_type not in mappings or update_mappings:
                    self.client.indices.put_mapping(
                        index=self.indexname, doc_type=doc_type,
                        body=load_file_to_json(properties_file))
        finally:
            # Reopen even if a mapping update fails, so an error does not
            # leave the index closed.
            if update_mappings:
                self.client.indices.open(self.indexname)

    def has_project(self, project_id):
        """
        Checks if the index contains a project.

        :param project_id: the id of the project to check if it is contained in the index.
        :returns: True if the index contains the project, or False otherwise.
        """
        return self.client.exists(index=self.indexname, doc_type='projects', id=project_id)

    def has_file(self, file_id):
        """
        Checks if the index contains a file.

        :param file_id: the id of the file to check if it is contained in the index.
        :returns: True if the index contains the file, or False otherwise.
        """
        return self.client.exists(index=self.indexname, doc_type='files', id=file_id)

    def create_project(self, project):
        """
        Creates a project in the index, using its 'fullname' as id.

        :param project: the data of the project in JSON format.
        """
        self.client.create(index=self.indexname, doc_type='projects',
                           id=project['fullname'], body=project)

    def create_file(self, afile):
        """
        Creates a file in the index, using its 'fullpathname' as id and its 'project' as parent.

        :param afile: the data of the file in JSON format.
        """
        self.client.create(index=self.indexname, doc_type='files',
                           id=afile['fullpathname'], parent=afile['project'],
                           body=afile)

    def update_file(self, afile):
        """
        Updates a file in the index.

        :param afile: the data of the file in JSON format.
        """
        self.client.update(index=self.indexname, doc_type='files',
                           id=afile['fullpathname'], parent=afile['project'],
                           body={'doc': afile})

    def delete_file(self, afile_id):
        """
        Deletes a file from the index.

        :param afile_id: the id of the file to be deleted.
        """
        # The routing value is built from the first two '/'-separated
        # components of the file id.
        routing = '/'.join(afile_id.split('/')[0:2])
        self.client.delete(index=self.indexname, doc_type='files',
                           id=afile_id, routing=routing)

    def delete_project(self, project_id):
        """
        Deletes a project from the index. Note that this function also deletes all the files of the project.

        :param project_id: the id of the project to be deleted.
        """
        files_of_project = {
            "query": {
                "bool": {
                    "must": {"match_all": {}},
                    "filter": {"term": {"_routing": project_id}}
                }
            }
        }
        self.client.delete_by_query(index=self.indexname, doc_type='files',
                                    body=files_of_project)
        self.client.delete(index=self.indexname, doc_type='projects', id=project_id)

    def get_project_fileids_and_shas(self, project_id):
        """
        Returns all the files and their corresponding shas for a project.

        :param project_id: the id of the project of which the files and the shas are returned.
        :returns: a dict containing the files of the project as keys and their shas as values.
        """
        # Limitation! Each project must have no more than 100000000 files
        response = self.client.search(
            index=self.indexname, doc_type='files',
            body={"query": {"term": {"_routing": project_id}}},
            routing=project_id, size=100000000)
        return {afile['_id']: afile['_source']['sha']
                for afile in response['hits']['hits']}

    def execute_query(self, query, doc_type='files'):
        """
        Executes a query on the index.

        :param query: the body of the query.
        :param doc_type: the document type to which the query is executed, either 'projects' or 'files'.
        :returns: the response of the query.
        """
        return self.client.search(index=self.indexname, doc_type=doc_type, body=query)

    def test_analyzer(self, analyzer, text):
        """
        Tests an analyzer of the index.

        :param analyzer: the analyzer to be tested.
        :param text: the text to be analyzed as a test.
        :returns: the list of tokens produced for the text.
        """
        result = self.indicesclient.analyze(index=self.indexname,
                                            analyzer=analyzer, body=text)
        return [r['token'] for r in result['tokens']]

    def backup(self, backupdir):
        """
        Backups the index. The repository and the snapshot are only created if they do not exist yet.

        :param backupdir: the directory used to backup the index.
        """
        repositoryname = self._repository_name()
        snapshotname = self._snapshot_name()
        # Only "not found" means the repository/snapshot must be created;
        # any other error is propagated instead of being swallowed.
        try:
            self.snapshotclient.get_repository(repository=repositoryname)
        except NotFoundError:
            self.snapshotclient.create_repository(
                repository=repositoryname,
                body={"type": "fs",
                      "settings": {"location": backupdir + os.sep + self.indexname}})
        try:
            self.snapshotclient.get(repository=repositoryname, snapshot=snapshotname)
        except NotFoundError:
            self.snapshotclient.create(repository=repositoryname,
                                       snapshot=snapshotname,
                                       body={"indices": self.indexname},
                                       wait_for_completion=True)

    def delete_backup(self):
        """
        Removes any backups of the index. If there are no backups, this function does nothing.
        """
        try:
            self.snapshotclient.delete(repository=self._repository_name(),
                                       snapshot=self._snapshot_name())
        except NotFoundError:
            pass

    def restore_backup(self):
        """
        Restores a backup of the index. The index is created if missing and is closed while the snapshot is restored.
        """
        self._ensure_index_exists()
        self.client.indices.close(self.indexname)
        self.snapshotclient.restore(repository=self._repository_name(),
                                    snapshot=self._snapshot_name(),
                                    body={"indices": self.indexname},
                                    wait_for_completion=True)
        self.client.indices.open(self.indexname)

    def flush(self):
        """
        Flushes the index.
        """
        self.indicesclient.flush(index=self.indexname)