def full_dedup(limit=1000):
    """Run every indexed file document through ``IndexedFile.from_path``.

    Walks the whole files index in ``_id`` order, ``limit`` documents per
    page, using ``search_after`` to move from one page to the next.  Each
    hit that has both a name and a path is passed to
    ``IndexedFile.from_path(system, path)``; presumably that call is what
    removes duplicate documents -- confirm against ``IndexedFile.from_path``.

    Failures on individual documents are printed and do not stop the run.

    :param int limit: Number of documents fetched per page.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile

    file_search = IndexedFile.search().sort('_id').extra(size=limit)
    res = file_search.execute()
    while res.hits:
        for hit in res.hits:
            if hit.name is None or hit.path is None:
                continue
            print(hit.meta.id)
            try:
                IndexedFile.from_path(hit.system, hit.path)
            except Exception as e:
                # Deliberate best effort: report the failure and keep going
                # so one bad document does not abort the whole pass.
                print(e)
        # The sort values of the last raw hit are the cursor for the next page.
        search_after = res.hits.hits[-1]['sort']
        logger.debug(search_after)
        file_search = IndexedFile.search().sort('_id').extra(
            size=limit, search_after=search_after)
        res = file_search.execute()
def listing(self, system, file_path, user_context=None, offset=None, limit=None):
    """Perform the search and output in a serializable format.

    Matches ``self.query_string`` against file names, restricted to
    documents on ``system`` that carry a permission entry for
    ``user_context``.  Documents whose ``path._exact`` starts with
    ``'/' + user_context`` or with ``'<user_context>/.Trash'`` are excluded.

    :param str system: System id to filter on.
    :param str file_path: Not referenced by this method.
    :param str user_context: Username used for the permission filter and
        the path exclusions.  Despite the ``None`` default it must be a
        string, because it is concatenated into the prefix filters.
    :param int offset: Index of the first result to return (``None`` = 0).
    :param int limit: Maximum number of results to return.  ``None`` leaves
        the page size to the library default.
    :returns: A dict describing a ``$SEARCHSHARED`` pseudo-directory whose
        ``children`` are the matching documents as dicts.
    """
    ngram_query = Q("query_string",
                    query=self.query_string,
                    fields=["name"],
                    minimum_should_match='80%',
                    default_operator='or')
    match_query = Q("query_string",
                    query=self.query_string,
                    fields=["name._exact", "name._pattern"],
                    default_operator='and')

    search = IndexedFile.search()
    search = search.filter("nested",
                           path="permissions",
                           query=Q("term", permissions__username=user_context))
    search = search.query(ngram_query | match_query)
    search = search.query(
        Q('bool', must_not=[Q({'prefix': {'path._exact': '/' + user_context}})]))
    search = search.filter("term", system=system)
    search = search.query(
        Q('bool', must_not=[
            Q({'prefix': {'path._exact': '{}/.Trash'.format(user_context)}})
        ]))

    res = search.execute()
    children = []
    if res.hits.total.value:
        # A slice takes an end index, not a page size: slicing with
        # [offset:limit] shrinks the page as the offset grows and returns
        # nothing once offset >= limit.
        stop = None if limit is None else (offset or 0) + limit
        children = [o.to_dict() for o in search[offset:stop]]

    result = {
        'trail': [{'name': '$SEARCHSHARED', 'path': '/$SEARCH'}],
        'name': '$SEARCHSHARED',
        'path': '/$SEARCHSHARED',
        'system': system,
        'type': 'dir',
        'children': children,
        'permissions': 'READ'
    }
    return result
def test_init(self, mock_base):
    """Constructing the manager must call ``mock_base`` with ``IndexedFile`` and a search."""
    fake_request = MagicMock(query_string='test_query', username='******')
    CommunityDataSearchManager(fake_request)
    mock_base.assert_called_with(IndexedFile, IndexedFile.search())
def __init__(self, request=None, **kwargs):
    """Store the escaped query string and initialise the base manager.

    The raw query comes from ``request.GET['query_string']`` when a truthy
    ``request`` is supplied, otherwise from the ``query_string`` keyword
    argument.  Every forward slash in it is backslash-escaped before it is
    stored on ``self.query_string``.
    """
    if request:
        raw_query = request.GET.get('query_string')
    else:
        raw_query = kwargs.get('query_string')
    self.query_string = raw_query.replace("/", "\\/")

    super(PublishedDataSearchManager, self).__init__(
        IndexedFile, IndexedFile.search())
def listing(client, system, path, username, offset=0, limit=100, *args, **kwargs):
    """
    Perform a Tapis file listing, or list files shared with a user.

    When ``path`` is non-empty the call is delegated to ``agave_listing``.
    Otherwise Elasticsearch is queried for documents that carry a
    permission entry for ``username`` (or for the ``'******'`` username)
    and whose path does not start with ``'/' + username``.

    Params
    ------
    client: agavepy.agave.Agave
        Tapis client to use for the listing.
    system: str
        Tapis system ID. Only used when ``path`` is given; the
        Elasticsearch branch always filters on
        ``designsafe.storage.default``.
    path: str
        Path in which to perform the listing.
    username: str
        User whose permissions are matched and whose home prefix is
        excluded.
    offset: int
        Offset for pagination.
    limit: int
        Number of results to return.

    Returns
    -------
    dict
        For the Elasticsearch branch: ``listing`` (list of dicts containing
        file metadata) and ``reachedEnd`` (True when fewer than ``limit``
        hits came back). For a non-empty ``path``: whatever
        ``agave_listing`` returns.
    """
    if path:
        return agave_listing(client, system, path, offset, limit)

    username_q = Q('term', **{'permissions.username': username})
    world_q = Q('term', **{'permissions.username': '******'})
    pems_filter = Q('bool', should=[username_q, world_q])
    nested_filter = Q('nested', path='permissions', query=pems_filter)

    home_filter = Q('prefix', **{'path._exact': '/' + username})
    system_filter = Q('term', **{'system._exact': 'designsafe.storage.default'})
    query = Q('bool', must_not=home_filter, filter=[nested_filter, system_filter])

    search = IndexedFile.search().filter(query).sort('name._exact').extra(
        from_=int(offset), size=int(limit))
    res = search.execute()
    hits = [hit.to_dict() for hit in res]
    return {'listing': hits, 'reachedEnd': len(hits) < int(limit)}
def repair_paths(limit=1000):
    """Rewrite ``path`` and ``basePath`` on every indexed file document.

    Pages through the files index in ``_uid`` order, ``limit`` documents at
    a time, using ``search_after``.  For each hit the path returned by
    ``repair_path(name, path)`` is stored as ``path`` and its directory
    name as ``basePath``.

    :param int limit: Number of documents fetched per page.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile

    file_search = IndexedFile.search().sort('_uid').extra(size=limit)
    res = file_search.execute()
    while res.hits:
        for hit in res.hits:
            # A single pre-formatted string prints the same text under both
            # the Python 2 print statement and the Python 3 print function.
            print('%s %s' % (hit.name, hit.path))
            if hit.name is None or hit.path is None:
                # Nothing to repair without both a name and a path.
                continue
            new_path = repair_path(hit.name, hit.path)
            # One update request carrying both fields instead of two.
            hit.update(path=new_path, basePath=os.path.dirname(new_path))
            # use from_path to remove any duplicates.
            # IndexedFile.from_path(hit.system, hit.path)
        # The sort values of the last raw hit are the cursor for the next page.
        search_after = res.hits.hits[-1]['sort']
        logger.debug(search_after)
        file_search = IndexedFile.search().sort('_uid').extra(
            size=limit, search_after=search_after)
        res = file_search.execute()
def get(self, request):
    """Return the summed ``length`` of documents under the user's home prefix.

    Responds with JSON of the form ``{"total_storage_bytes": <value>}``.
    """
    username = request.user.username
    usage_search = IndexedFile.search().query(
        'bool',
        must=[Q("prefix", **{"path._exact": '/' + username})],
    ).extra(size=0)  # only the aggregation is needed, no hits
    usage_search.aggs.metric('total_storage_bytes', 'sum', field="length")

    aggregations = usage_search.execute().to_dict()["aggregations"]
    return JsonResponse(
        {"total_storage_bytes": aggregations["total_storage_bytes"]["value"]})
def __init__(self, request=None, **kwargs):
    """Store the escaped query string and username, then initialise the base.

    With a truthy ``request`` the values come from
    ``request.GET['query_string']`` and ``request.user.username``;
    otherwise from the ``query_string`` and ``username`` keyword
    arguments.  Forward slashes in the query are backslash-escaped.
    """
    params = request.GET if request else kwargs
    self.query_string = params.get('query_string').replace("/", "\\/")
    self.username = (request.user.username if request
                     else kwargs.get('username'))

    super(PrivateDataSearchManager, self).__init__(
        IndexedFile, IndexedFile.search())
def repair_paths(limit=1000):
    """Bulk-rewrite ``path`` and ``basePath`` on every indexed file document.

    Pages through the files index in ``_id`` order, ``limit`` documents at
    a time, using ``search_after``.  Each page is turned into one bulk
    request of partial-document updates; hits lacking a name or a path are
    left out of it.

    :param int limit: Number of documents fetched per page.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    files_alias = settings.ES_INDICES['files']['alias']
    es_client = Elasticsearch(
        hosts=settings.ES_CONNECTIONS[settings.DESIGNSAFE_ENVIRONMENT]['hosts'])

    def _update_action(hit):
        # Partial update carrying the repaired path and its parent directory.
        repaired = repair_path(hit.name, hit.path)
        return {
            '_op_type': 'update',
            '_index': files_alias,
            '_type': 'file',
            '_id': hit.meta.id,
            'doc': {
                'path': repaired,
                'basePath': os.path.dirname(repaired)
            }
        }

    page_opts = {'size': limit}
    while True:
        res = IndexedFile.search().sort('_id').extra(**page_opts).execute()
        if not res.hits:
            break
        # use from_path to remove any duplicates.
        # IndexedFile.from_path(hit.system, hit.path)
        bulk(es_client, [
            _update_action(hit)
            for hit in res.hits
            if hit.name is not None and hit.path is not None
        ])
        # The sort values of the last raw hit are the cursor for the next page.
        cursor = res.hits.hits[-1]['sort']
        logger.debug(cursor)
        page_opts = {'size': limit, 'search_after': cursor}
def listing_recursive(self, system='designsafe.storage.default', path='/'):
    """Search for documents on ``system`` whose ``path._path`` matches ``path``.

    Results are restricted by ``self._pems_filter()`` and sorted by
    ``name._exact`` ascending.

    :param str system: System Id. Default: designsafe.storage.default
    :param str path: Path
    :returns: Tuple of ``(response, search)``.
    """
    base_search = IndexedFile.search()
    criteria = [
        Q('term', **{'system._exact': system}),
        Q('term', **{'path._path': path}),
    ]
    scoped = Q('bool', must=criteria, filter=self._pems_filter())
    search = base_search.query(scoped).sort({'name._exact': 'asc'})
    return search.execute(), search
def search(client, system, path, offset=0, limit=100, query_string='', **kwargs):
    """
    Perform a search for files using a query string.

    Params
    ------
    client: NoneType
        Not used by this function.
    system: str
        Tapis system ID to filter on.
    path: str
        Folder to search under. A leading and a trailing slash are added
        when missing.
    offset: int
        Search offset for pagination.
    limit: int
        Number of search results to return
    query_string: str
        Query string to pass to Elasticsearch

    Returns
    -------
    dict
        ``listing`` (list of dicts containing file metadata from
        Elasticsearch) and ``reachedEnd`` (True when fewer than ``limit``
        hits came back).
    """
    ngram_query = Q("query_string",
                    query=query_string,
                    fields=["name"],
                    minimum_should_match='80%',
                    default_operator='or')
    # Each field needs its own list entry. A single comma-joined string
    # would be taken as one field name that does not exist.
    match_query = Q("query_string",
                    query=query_string,
                    fields=["name._exact", "name._pattern"],
                    default_operator='and')

    if not path.startswith('/'):
        path = '/' + path
    if not path.endswith('/'):
        path = path + '/'

    search = IndexedFile.search()
    search = search.query(ngram_query | match_query)
    search = search.filter('prefix', **{'path._exact': path})
    search = search.filter('term', **{'system._exact': system})
    search = search.extra(from_=int(offset), size=int(limit))
    res = search.execute()
    hits = [hit.to_dict() for hit in res]
    return {'listing': hits, 'reachedEnd': len(hits) < int(limit)}
def __init__(self, request=None, **kwargs):
    """Store a wildcard-wrapped query string and initialise the base manager.

    The raw query comes from ``request.GET['query_string']`` when a truthy
    ``request`` is supplied, otherwise from the ``query_string`` keyword
    argument.  It is split on single spaces and every token other than
    ``AND`` / ``OR`` / ``NOT`` (case-insensitive) is wrapped in ``*``.
    """
    params = request.GET if request else kwargs
    self.query_string = params.get('query_string')

    operators = ["AND", "OR", "NOT"]
    self.query_string = " ".join(
        token if token.upper() in operators else "*" + token + "*"
        for token in self.query_string.split(" ")
    )

    super(PublishedDataSearchManager, self).__init__(
        IndexedFile, IndexedFile.search())
def get(self, system='designsafe.storage.default', path='/', name=''):
    """Search for the document matching ``system``, ``path`` and ``name`` exactly.

    Results are restricted by ``self._pems_filter()`` and sorted by
    ``name._exact`` ascending.

    :param str system: System Id. Default: designsafe.storage.default
    :param str path: Value matched against ``path._exact``.
    :param str name: Value matched against ``name._exact``.
    :returns: Tuple of ``(response, search)``.
    """
    base_search = IndexedFile.search()
    exact_terms = [
        Q('term', **{'system._exact': system}),
        Q('term', **{'path._exact': path}),
        Q('term', **{'name._exact': name}),
    ]
    scoped = Q('bool', must=exact_terms, filter=self._pems_filter())
    search = base_search.query(scoped).sort({'name._exact': 'asc'})
    res = search.execute()
    # logger.debug('search :%s', json.dumps(search.to_dict(), indent=2))
    return res, search
def listing(self, system='designsafe.storage.default', path='/'):
    """Search for documents on ``system`` whose ``path._exact`` equals ``path``.

    Results are restricted by ``self._pems_filter()`` and sorted by
    ``name._exact`` ascending.

    :param str system: System Id. Default: designsafe.storage.default
    :param str path: Path
    :returns: Tuple of ``(response, search)``.
    """
    logger.debug('listing %s', os.path.join(system, path))

    base_search = IndexedFile.search()
    location = [
        Q('term', **{'system._exact': system}),
        Q('term', **{'path._exact': path}),
    ]
    scoped = Q('bool', must=location, filter=self._pems_filter())
    search = base_search.query(scoped).sort({'name._exact': 'asc'})

    res = search.execute()
    logger.debug('res %s', str(res.hits.total))
    return res, search
def tearDown(self):
    """Run a search for documents whose ``path._path`` matches ``ds_user``.

    NOTE(review): the response is discarded, so nothing is removed from the
    index here -- confirm whether a delete was intended.
    """
    user_docs = IndexedFile.search().query(
        'bool',
        must=[Q("match", **{"path._path": "ds_user"})],
    ).extra(size=10000)
    user_docs.execute()
def search(client, system, path, username, offset=0, limit=100, query_string='', **kwargs):
    """
    Perform a search for shared files using a query string.

    Only documents that carry a permission entry for ``username`` (or for
    the ``'******'`` username) and whose path does not start with
    ``'/' + username`` are considered.

    Params
    ------
    client: NoneType
        Not used by this function.
    system: str
        Tapis system ID. Not used by this function; the filter is always
        ``designsafe.storage.default``.
    path: str
        Folder to search under. A leading and a trailing slash are added
        when missing.
    username: str
        User whose permissions are matched and whose home prefix is
        excluded.
    offset: int
        Search offset for pagination.
    limit: int
        Number of search results to return
    query_string: str
        Query string to pass to Elasticsearch

    Returns
    -------
    dict
        ``listing`` (list of dicts containing file metadata from
        Elasticsearch) and ``reachedEnd`` (True when fewer than ``limit``
        hits came back).
    """
    # Add leading slash to match agave formatting.
    if not path.startswith('/'):
        path = '/' + path
    # Add trailing slash so that prefix search in a folder doesn't return that folder.
    if not path.endswith('/'):
        path = path + '/'

    ngram_query = Q("query_string",
                    query=query_string,
                    fields=["name"],
                    minimum_should_match='80%',
                    default_operator='or')
    # Each field needs its own list entry. A single comma-joined string
    # would be taken as one field name that does not exist.
    match_query = Q("query_string",
                    query=query_string,
                    fields=["name._exact", "name._pattern"],
                    default_operator='and')

    username_q = Q('term', **{'permissions.username': username})
    world_q = Q('term', **{'permissions.username': '******'})
    pems_filter = Q('bool', should=[username_q, world_q])
    nested_filter = Q('nested', path='permissions', query=pems_filter)

    home_filter = Q('prefix', **{'path._exact': '/' + username})
    system_filter = Q('term', **{'system._exact': 'designsafe.storage.default'})
    query = Q('bool', must_not=home_filter, filter=[nested_filter, system_filter])

    search = IndexedFile.search().filter(query)
    search = search.query(ngram_query | match_query)
    search = search.filter('prefix', **{'path._exact': path})
    search = search.extra(from_=int(offset), size=int(limit))
    res = search.execute()
    hits = [hit.to_dict() for hit in res]
    return {'listing': hits, 'reachedEnd': len(hits) < int(limit)}
# listing(system, file_path, user_context=None, offset=None, limit=None)
#
# Builds an Elasticsearch query over IndexedFile for `system` and the
# slash-stripped `file_path` (an empty/None path is first replaced by '/').
# When `user_context` is given, a nested permissions filter (that username or
# the '******' username) and a "not under '/' + user_context" filter are also
# constructed. The search is sorted by path._exact then name._exact and
# executed; on TransportError/ConnectionTimeout it is re-raised if the
# error's status_code is 404, otherwise the search is executed one more time.
#
# Returns a dict with 'trail', 'name', 'path', 'system', 'type', 'children'
# and 'permissions'; 'children' holds every hit converted with to_dict().
# When file_path ended up as '/', the dict describes the '$SHARE' root with
# permissions 'NONE'; otherwise the trail is derived from the path components
# and permissions is 'READ'.
#
# `offset` and `limit` are accepted but never referenced in the body.
#
# NOTE(review): `file_path_comps != ''` compares a list with a str, so it is
# always true and '' is always inserted at the front of the components. For
# an empty file_path this yields two root entries in the trail -- confirm
# whether `file_path != ''` was intended.
# NOTE(review): this definition has lost its line breaks and indentation.
# Whether `home_filter = ...` and the filtered `query = ...` sit inside
# `if file_path == '$SHARE':`, and which `if` the trailing
# `else: query = Q('bool', must=q)` belongs to, cannot be determined from this
# text; the code is therefore left untouched. Restore the original layout
# from version control before changing it.
def listing(system, file_path, user_context=None, offset=None, limit=None): file_path = file_path or '/' file_path = file_path.strip('/') if file_path.strip('/').split('/')[0] != user_context: if file_path == '$SHARE': q = Q('bool', must=[Q('term', **{'system._exact': system})]) else: q = Q('bool', must=[ Q('term', **{'path._path': file_path}), Q('term', **{'system._exact': system}) ]) else: q = Q('bool', must=[ Q('term', **{'path._exact': file_path}), Q('term', **{'system._exact': system}) ]) if user_context is not None: username_q = Q('term', **{'permissions.username': user_context}) world_q = Q('term', **{'permissions.username': '******'}) pems_filter = Q('bool') pems_filter.should = [username_q, world_q] nested_filter = Q('nested') nested_filter.path = 'permissions' nested_filter.query = pems_filter if file_path == '$SHARE': file_path = '/' home_filter = Q('bool', must_not=Q('term', **{'path._path': '/' + user_context})) query = Q('bool', must=q, filter=[nested_filter, home_filter]) else: query = Q('bool', must=q) search = IndexedFile.search() search.query = query search = search.sort('path._exact', 'name._exact') try: res = search.execute() except (TransportError, ConnectionTimeout) as e: if getattr(e, 'status_code', 500) == 404: raise res = search.execute() if file_path == '/': result = { 'trail': [{ 'name': '$SHARE', 'path': '/$SHARE' }], 'name': '$SHARE', 'path': '/$SHARE', 'system': system, 'type': 'dir', 'children': [], 'permissions': 'NONE' } else: file_path_comps = file_path.split('/') if file_path_comps != '': file_path_comps.insert(0, '') trail_comps = [{ 'name': file_path_comps[i] or '/', 'system': system, 'path': '/'.join(file_path_comps[0:i + 1]) or '/', } for i in range(0, len(file_path_comps))] result = { 'trail': trail_comps, 'name': os.path.split(file_path)[1], 'path': file_path, 'system': system, 'type': 'dir', 'children': [], 'permissions': 'READ' } for f in res: result['children'].append(f.to_dict()) return result