def __to_tf_idf(client, index, file_id):
    """Build the normalized TF-IDF vector of one indexed document.

    Term and document frequencies are read from the ``text`` field of the
    document's term vectors. Each weight is
    ``(term_freq / max term_freq in the document) * log2(N / doc_freq)``,
    where ``N`` is the document count the cat API reports for ``index``.

    :param client: Elasticsearch client used for the term-vector and count
        requests
    :param index: name of the index that holds the document
    :param file_id: id of the document to vectorize
    :return: the result of ``Rocchio.__normalize`` applied to the
        ``{term: tf * idf}`` mapping; an empty mapping is normalized when
        the document exposes no ``text`` terms
    """
    response = client.termvectors(index=index, doc_type='document',
                                  id=file_id, fields=['text'],
                                  positions=False, term_statistics=True)
    term_vectors = response['term_vectors']
    terms = term_vectors['text']['terms'] if 'text' in term_vectors else {}

    if not terms:
        # Without this guard max() below raises ValueError on an empty
        # sequence; there is nothing to weight, so skip the count request.
        return Rocchio.__normalize({})

    max_freq = max(stats['term_freq'] for stats in terms.values())
    document_count = int(
        CatClient(client).count(index=[index], format='json')[0]['count'])

    vector = {}
    # Iterate in sorted term order so the mapping is filled in the same
    # order as before.
    for term in sorted(terms):
        stats = terms[term]
        tf = stats['term_freq'] / max_freq
        idf = np.log2(document_count / stats['doc_freq'])
        vector[term] = tf * idf
    return Rocchio.__normalize(vector)
def doc_count(client, index):
    """
    Count the documents stored in an index through the cat API.

    :param client: client object handed to ``CatClient``
    :param index: name of the index whose documents are counted
    :return: the ``count`` field of the first row of the cat count
        response, converted to ``int``
    """
    cat = CatClient(client)
    rows = cat.count(index=[index], format='json')
    first_row = rows[0]
    return int(first_row['count'])
def doc_count(client, index):
    """
    Look up how many documents an index contains.

    :param client: ElasticSearch client wrapped in a ``CatClient``
    :param index: index to count documents from
    :return: document count for the index as an ``int``, taken from the
        ``count`` field of the first entry of the JSON cat response
    """
    response = CatClient(client).count(index=[index], format='json')
    count_field = response[0]['count']
    return int(count_field)
def relocate():
    """Load the configuration and create the Elasticsearch client helpers.

    Reads ``cluster_address`` from ``config()`` and builds an
    ``Elasticsearch`` client plus ``CatClient`` and ``ClusterClient``
    wrappers around it. If any of that raises, the reason is printed and
    the process exits with status 1.
    """
    conf = config()
    # NOTE(review): escat and escluster are not used in the visible part of
    # this function; presumably the remainder of the body relies on them.
    try:
        es = Elasticsearch(conf['cluster_address'])
        escat = CatClient(es)
        escluster = ClusterClient(es)
    except Exception as e:
        # "except X as e" is accepted by Python 2.6+ and Python 3; the old
        # "except X, e" spelling is a SyntaxError on Python 3.
        print("Unable to connect to ES cluster. Reason: {}".format(e))
        sys.exit(1)
def cluster_status(es):
    """Print an overview of the cluster to standard output.

    Four sections are written in order: cluster health, pending tasks, one
    line per node (name and docs), and the cat shard-allocation table.

    :param es: Elasticsearch client passed to ``ClusterClient``,
        ``get_nodes_info`` and ``CatClient``
    """
    cluster = ClusterClient(es)

    # Every print below takes exactly one argument, so the output is the
    # same whether print is the Python 2 statement or the Python 3 function.
    print("\nCLUSTER HEALTH")
    pprint(cluster.health())

    print("\nPENDING TASKS")
    pprint(cluster.pending_tasks())

    print("\nNODES")
    for node in get_nodes_info(es):
        # A single pre-formatted string avoids printing a tuple on Python 2.
        print("%s %s" % (node.name, node.docs))

    print("\nSHARD ALLOCATION")
    cat = CatClient(es)
    print(cat.allocation(v=True))
def doc_count(client, index):
    """Return, as an ``int``, the document count the cat API reports for ``index``."""
    cat_client = CatClient(client)
    summary = cat_client.count(index=[index], format='json')[0]
    return int(summary['count'])
def shard_status(es):
    """Print the cat shards table, with column headers, to standard output.

    :param es: Elasticsearch client wrapped in a ``CatClient``
    """
    cat = CatClient(es)
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and is also valid Python 3.
    print(cat.shards(v=True))
import json

from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch.client import CatClient

# Connects to one Elasticsearch host, lists every index name through the cat
# API and fetches the mapping of each index.
host = '192.168.15.168'
es = Elasticsearch(hosts=host)
print(es.ping())

index_cli = IndicesClient(es)
cat_cli = CatClient(es)

# Request only the "index" column. The default cat table is aligned by
# whitespace, so picking the third whitespace-separated field breaks whenever
# a leading column is blank (NOTE(review): reportedly the case for closed
# indices on some Elasticsearch versions - confirm against the cluster).
all_index = cat_cli.indices(h='index')

# One index name per line; blank lines (such as the one after the trailing
# newline) are skipped.
index_list_source = all_index.splitlines()
index_list = [line.strip() for line in index_list_source if line.strip()]

# Fetch and pretty-serialize the mapping of every index. Only a separator is
# printed per index; print mapping_info inside the loop to see the mappings.
for i in index_list:
    mapping_info = index_cli.get_mapping(index=i)
    mapping_info = json.dumps(mapping_info, indent=2, ensure_ascii=False)
    print('============')