def index_document_by_node(self, node):
    """Index a single ``Node`` in the ElasticSearch index.

    Files are indexed with their size; folders with an empty size.
    Logs success at debug level and failure at error level.

    Raises:
        TypeError: if ``node`` is not a ``domain.node.Node`` instance.
    """
    if not isinstance(node, Node):
        raise TypeError(u'node is not of type domain.node.Node')
    size = node.get_size() if node.type == Node.FILE_TYPE else ''
    put_url = '{idx_name}/node/{id}'.format(
        idx_name=self.idx_name,
        # dual url encoding as ES decodes it
        id=uenc(uenc(node.path.encode('utf-8'))))
    payload = {
        'name': node.name,
        'parent': uenc(node.get_parent()),
        'date_modified': node.date_modified,
        'size': size,
        'type': node.type,
    }
    response = self.es_service.conn.put(put_url, data=payload)
    if response['status'] == 201:
        self.app.logger.debug(u'Indexed {name}'.format(name=node.name))
    else:
        self.app.logger.error(
            u'Couldn\'t index document: {doc} to ES'.format(doc=node.path))
def images(self, **kwargs):
    """BrooklynMuseumAPI.images: perform a collection.getImages request.

    Requires both ``item_type`` and ``item_id`` keyword arguments.

    Returns:
        The parsed API response.

    Raises:
        ValueError: if ``item_type`` or ``item_id`` is missing.
    """
    # The original bare ``raise`` had no active exception to re-raise,
    # which itself blows up (TypeError on Py2, RuntimeError on Py3);
    # raise a meaningful, catchable exception instead.
    if 'item_type' not in kwargs or 'item_id' not in kwargs:
        raise ValueError(
            "images() requires 'item_type' and 'item_id' keyword arguments")
    req_url = '%s?%s&method=collection.getImages&%s' % \
        (self.url, uenc(self._params), uenc(kwargs))
    rsp = urllib2.urlopen(req_url).read()
    return self.parse_response(rsp)
def do_folder_sync(self, node_id):
    """Reconcile one folder's ES documents against its on-disk content.

    Loads the folder both from disk and from the ES index, then deletes
    index documents with no matching disk node and indexes disk nodes
    missing from ES. Two queries are issued: a count query first, whose
    total is used as the ``size`` of the follow-up search — an
    over-large ``size`` hurts ElasticSearch query performance badly.
    Logs an error and does nothing when the folder cannot be loaded.
    """
    folder = Folder.get_instance(node_id, decode=True)
    if not folder:
        self.app.logger.error(
            'No folder found by passing node id: {node_id}'.format(
                node_id=node_id
            ))
        return

    index_id = uenc(folder.path.encode('utf-8'))
    parent_filter = {
        "bool": {
            "must": [{
                "term": {
                    "parent": index_id
                }
            }]
        }
    }
    # First query: how many children does ES hold for this folder?
    total_hits = int(self.es_service.conn.get(
        self.count_url,
        data={"query": parent_filter})['hits']['total'])
    # Second query: fetch exactly that many ids, no more.
    search_url = u'{idx_name}/node/_search'.format(idx_name=self.idx_name)
    hits = self.es_service.conn.get(search_url, data={
        "from": 0,
        "size": total_hits,
        "fields": [],
        "query": parent_filter,
    })['hits']['hits']

    indexed_ids = {doc['_id'] for doc in hits}
    nodes_on_disk = {node.index_id: node for node in (
        folder.folders + folder.files)}
    on_disk_ids = set(nodes_on_disk)

    # In ES but gone from disk -> delete; on disk but not in ES -> index.
    for stale_id in indexed_ids - on_disk_ids:
        self.delete_document_by_id(stale_id)
    for fresh_id in on_disk_ids - indexed_ids:
        self.index_document_by_node(nodes_on_disk[fresh_id])
    self.flush_index()
def search(self, **kwargs):
    """Perform a DigitalNZ records search.

    All keyword arguments are url-encoded and appended to the request
    as query parameters.

    Returns:
        DigitalNZResponse: wrapper around the raw API response body.
    """
    # Removed the dead local ``args = kwargs`` — it was never used.
    req_url = '%s/records/v%s.%s?api_key=%s&%s' % (
        self.base_url, self.version, self.format, self.api_key,
        uenc(kwargs))
    rsp = urllib2.urlopen(req_url).read()
    return DigitalNZResponse(self, rsp)
def delete_document_by_id(self, document_id):
    """Delete one document from the ES index by its id string.

    Returns:
        The ``document_id`` on success; ``None`` (after logging an
        error) when ES does not answer with status 200.
    """
    # dual url encoding as ES decodes it
    del_url = '{idx_name}/node/{id}'.format(
        idx_name=self.idx_name, id=uenc(document_id))
    response = self.es_service.conn.delete(del_url)
    if response['status'] == 200:
        return document_id
    self.app.logger.error(
        u'Couldn\'t delete document: {doc} from ES'.format(doc=del_url))
def custom_search(self, title=None, **kwargs):
    """Perform a DigitalNZ custom search identified by *title*.

    Args:
        title: name of the custom search to run; required.
        **kwargs: extra query parameters, url-encoded into the request.

    Returns:
        DigitalNZResponse: wrapper around the raw API response body.

    Raises:
        ValueError: if ``title`` is not supplied.
    """
    # The original bare ``raise`` had no active exception to re-raise,
    # which itself blows up (TypeError on Py2, RuntimeError on Py3);
    # raise a meaningful, catchable exception instead. The dead local
    # ``args = kwargs`` is also removed.
    if title is None:
        raise ValueError('custom_search() requires a title argument')
    req_url = '%s/custom_searches/v%s/%s.%s?api_key=%s&%s' % (
        self.base_url, self.version, title, self.format, self.api_key,
        uenc(kwargs))
    rsp = urllib2.urlopen(req_url).read()
    return DigitalNZResponse(self, rsp)
def index_folders_and_files(self, folder=None):
    """Bulk-index all folders (and their files) for the user.

    Each folder's url-encoded path becomes its document ``_id`` in the
    ElasticSearch index; files are indexed per folder via
    ``index_files``.

    Returns:
        int: total number of items (folders + files) indexed.
    """
    items_indexed = 0
    bulk_payload = []
    all_folders = self.find_all_folders(folder)
    # Loop variable renamed so it no longer shadows the ``folder`` arg.
    for entry in all_folders:
        bulk_payload.append(
            {'index': {'_id': uenc(entry['path'].encode('utf-8'))}})
        bulk_payload.append({
            'name': entry['name'],
            'parent': uenc(entry['parent'].encode('utf-8')),
            'date_modified': entry['date_modified'],
            'type': Node.FOLDER_TYPE,
            'size': '',
        })
        items_indexed += self.index_files(entry)
    self.es_service.bulk_insert(self.bulk_insert_url, bulk_payload)
    return items_indexed + len(all_folders)
def search(self, **kwargs):
    """BrooklynMuseumAPI.search: run a ``collection.search`` request.

    Keyword arguments are url-encoded and appended to the standard
    API parameters.

    Returns:
        The parsed API response.
    """
    base_params = uenc(self._params)
    extra_params = uenc(kwargs)
    req_url = '%s?%s&method=collection.search&%s' % (
        self.url, base_params, extra_params)
    body = urllib2.urlopen(req_url).read()
    return self.parse_response(body)