def full_dedup(limit=1000):
    """Sweep the whole files index and run ``IndexedFile.from_path`` per hit.

    The index is read ``limit`` documents at a time, sorted by ``_id`` and
    paged with ``search_after``. Hits with no ``name`` or no ``path`` are
    skipped. An exception raised for one hit is printed and the sweep
    continues with the next hit.

    NOTE(review): presumably ``from_path`` is what removes duplicate
    documents sharing a system and path -- confirm against its implementation.

    :param int limit: Page size used for every search request.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile

    def fetch_page(search_after=None):
        # Only the first page is requested without a ``search_after`` cursor.
        extra = {'size': limit}
        if search_after is not None:
            extra['search_after'] = search_after
        return IndexedFile.search().sort('_id').extra(**extra).execute()

    res = fetch_page()
    while res.hits:
        for hit in res.hits:
            if hit.name is None or hit.path is None:
                continue
            print(hit.meta.id)
            try:
                IndexedFile.from_path(hit.system, hit.path)
            except Exception as e:
                # Best effort: report the failure and keep sweeping.
                print(e)
        # The sort key of the last raw hit is the cursor for the next page.
        search_after = res.hits.hits[-1]['sort']
        logger.debug(search_after)
        res = fetch_page(search_after)
def test_delete_recursive(self, mock_children, mock_delete):
    """Deleting a folder triggers one delete for it and one for its child."""

    def make_es_file(name, path, fmt):
        doc = IndexedFile(name=name, system='test.system', path=path,
                          format=fmt)
        es_file = BaseESFile('test_user', system='test.system',
                             wrapped_doc=doc)
        # object.__setattr__ writes straight onto the instance, skipping any
        # attribute hooks defined by the class.
        for attr, value in (('_wrapped', doc), ('format', fmt),
                            ('path', path)):
            object.__setattr__(es_file, attr, value)
        return es_file

    folder = make_es_file('folder1', '/path/to/folder', 'folder')
    child = make_es_file('child1', '/path/to/child1', 'file')
    mock_children.return_value = iter([child])

    folder.delete()

    # Assert 2 delete calls: 1 for parent, 1 for child
    self.assertEqual(mock_delete.call_count, 2)
def test_children_returns_when_hits(self, mock_search, mock_get):
    """``IndexedFile.children`` returns the fetched docs and the sort key."""
    doc = IndexedFile(name='res1', system='test.system', path='/path/to/res1')
    doc.meta.id = 'MOCK ID'

    def query_chain():
        # Same filter/filter/sort/extra chain the mocks are configured on.
        return mock_search().filter().filter().sort().extra()

    query_chain().execute.return_value.hits.__len__.return_value = 1
    query_chain().execute().__iter__.return_value = [doc]
    query_chain().execute.return_value.hits.hits = [{'sort': 'MOCK SORTKEY'}]
    mock_get.return_value = doc

    children = IndexedFile.children('test_user', system='test.system',
                                    path='/')

    mock_get.assert_called_with('MOCK ID')
    self.assertEqual(children, ([doc], 'MOCK SORTKEY'))
def test_from_path_multiple_hits(self, mock_refresh, mock_get, mock_search,
                                 mock_delete):
    """
    When there are multiple files sharing a system and path, ensure we
    delete all but one and return the remaining document.
    """
    search_res = IndexedFile(name='res1', system='test.system',
                             path='/path/to/res1')

    # Mock a search response that reports 3 hits for the same system/path.
    mock_res = MagicMock()
    mock_res.hits.total.value = 3
    mock_search().filter().execute.return_value = mock_res
    mock_get.return_value = search_res

    doc_from_path = IndexedFile.from_path('test.system', '/path/to/res1')

    # Exactly one delete is issued on the filtered search.
    self.assertEqual(mock_search().filter().delete.call_count, 1)
    self.assertEqual(doc_from_path, search_res)
def test_from_path_multiple_hits(self, mock_search, mock_delete):
    """
    With three documents sharing one system and path, ``from_path`` should
    delete two of them and hand back the one that is left.
    """
    survivor = IndexedFile(name='res1', system='test.system',
                           path='/path/to/res1')

    def fake_getitem(key):
        # A slice yields the two extra documents; an index yields one.
        return [survivor, survivor] if isinstance(key, slice) else survivor

    # Search response reporting 3 hits that supports indexing and slicing.
    response = MagicMock()
    response.hits.total = 3
    response.__getitem__.side_effect = fake_getitem
    mock_search().filter().filter().execute.return_value = response

    doc_from_path = IndexedFile.from_path('test.system', '/path/to/res1')

    mock_search().filter.assert_called_with(
        'term', **{'system._exact': 'test.system'})
    mock_search().filter().filter.assert_called_with(
        'term', **{'path._exact': '/path/to/res1'})
    self.assertEqual(mock_delete.call_count, 2)
    self.assertEqual(doc_from_path, survivor)
def test_children_function(self, mock_index):
    """``BaseESFile.children`` keeps paging with the returned sort key."""
    first_child = IndexedFile(name='child1', system='test.system',
                              path='/path/to/child1')
    second_child = IndexedFile(name='child2', system='test.system',
                               path='/path/to/child2')
    # Two one-item pages followed by an empty page.
    mock_index.return_value.children.side_effect = [
        ([first_child], 'KEY1'),
        ([second_child], 'KEY2'),
        ([], None),
    ]

    parent_doc = IndexedFile(name='file1', system='test.system',
                             path='/path/to/file')
    base = BaseESFile('test_user', system='test.system',
                      wrapped_doc=parent_doc)

    # Need to set attrs manually because the custom setter/getter in
    # BaseESResource are mocked
    manual_attrs = (
        ('username', 'test_user'),
        ('_reindex', False),
        ('system', 'test.system'),
        ('path', '/path/to/file'),
    )
    for attr, value in manual_attrs:
        object.__setattr__(base, attr, value)

    child_generator = base.children(limit=1)
    for _ in child_generator:
        pass

    target = ('test_user', 'test.system', '/path/to/file')
    mock_index().children.assert_has_calls([
        call(*target, limit=1),
        call(*target, limit=1, search_after='KEY1'),
        call(*target, limit=1, search_after='KEY2'),
    ])

    # Check that iteration ends after all children have been listed.
    with self.assertRaises(StopIteration):
        next(child_generator)
def listing(self, system, file_path, user_context=None, offset=None,
            limit=None):
    """Run the shared-files name search and return a serializable listing.

    Matches ``self.query_string`` against file names, keeps only documents
    whose nested ``permissions`` name ``user_context`` and whose ``system``
    equals ``system``, and excludes paths with the prefixes
    ``'/' + user_context`` and ``user_context + '/.Trash'``.

    ``file_path`` is accepted but not used here.

    NOTE(review): the children slice is ``search[offset:limit]``, i.e. the
    slice end is ``limit`` itself rather than ``offset + limit`` -- confirm
    that this is intended.
    """

    def without_prefix(prefix):
        return Q('bool', must_not=[Q({'prefix': {'path._exact': prefix}})])

    loose_name = Q("query_string",
                   query=self.query_string,
                   fields=["name"],
                   minimum_should_match='80%',
                   default_operator='or')
    strict_name = Q("query_string",
                    query=self.query_string,
                    fields=["name._exact", "name._pattern"],
                    default_operator='and')

    search = (
        IndexedFile.search()
        .filter("nested", path="permissions",
                query=Q("term", permissions__username=user_context))
        .query(loose_name | strict_name)
        .query(without_prefix('/' + user_context))
        .filter("term", system=system)
        .query(without_prefix('{}/.Trash'.format(user_context)))
    )

    res = search.execute()
    if res.hits.total.value:
        children = [o.to_dict() for o in search[offset:limit]]
    else:
        children = []

    return {
        'trail': [{'name': '$SEARCHSHARED', 'path': '/$SEARCH'}],
        'name': '$SEARCHSHARED',
        'path': '/$SEARCHSHARED',
        'system': system,
        'type': 'dir',
        'children': children,
        'permissions': 'READ'
    }
def test_init(self, mock_base):
    """Constructing the manager calls the patched base with model and search."""
    request = MagicMock(query_string='test_query', username='******')

    CommunityDataSearchManager(request)

    mock_base.assert_called_with(IndexedFile, IndexedFile.search())
def test_search(self, mock_search, mock_base):
    """``PrivateDataSearchManager.listing`` serializes the single mocked hit."""
    request = MagicMock(query_string='test_query', username='******')

    # Search response with one hit that iterates to a single IndexedFile.
    response = MagicMock()
    response.hits.total.value = 1
    response.__iter__.return_value = [IndexedFile(name='file01')]
    mock_search().query().extra().execute.return_value = response

    manager = PrivateDataSearchManager(request)
    expected_result = {
        'trail': [{'name': '$SEARCH', 'path': '/$SEARCH'}],
        'name': '$SEARCH',
        'path': '/',
        'system': 'test.system',
        'type': 'dir',
        'children': [{'name': 'file01'}],
        'permissions': 'READ'
    }

    self.assertEqual(manager.listing('test.system', '/'), expected_result)
def test_children_returns_when_no_hits(self, mock_search):
    """With zero hits, ``IndexedFile.children`` returns ``([], None)``."""
    execute = mock_search().filter().filter().sort().extra().execute
    execute.return_value.hits.__len__.return_value = 0

    result = IndexedFile.children('test_user', system='test.system',
                                  path='/')

    self.assertEqual(result, ([], None))
def __init__(self, request=None, **kwargs):
    """Store the escaped query string and initialise the base manager.

    ``query_string`` is read from ``request.GET`` when a truthy ``request``
    is given, otherwise from the keyword arguments. Every ``/`` in it is
    replaced by ``\\/`` before it is stored on ``self.query_string``.
    """
    params = request.GET if request else kwargs
    self.query_string = params.get('query_string').replace("/", "\\/")
    super(PublishedDataSearchManager, self).__init__(IndexedFile,
                                                     IndexedFile.search())
def listing(client, system, path, username, offset=0, limit=100, *args,
            **kwargs):
    """
    Perform a Tapis file listing

    When ``path`` is truthy the call is delegated to ``agave_listing``.
    Otherwise the listing is built from an Elasticsearch query over
    ``IndexedFile``: documents whose nested ``permissions`` name either
    ``username`` or the ``'******'`` entry, excluding paths under
    ``'/' + username``, sorted by ``name._exact``.

    NOTE(review): the Elasticsearch branch filters on the fixed system
    ``designsafe.storage.default`` rather than on ``system`` -- confirm
    that this is intended.

    Params
    ------
    client: agavepy.agave.Agave
        Tapis client to use for the listing.
    system: str
        Tapis system ID.
    path: str
        Path in which to perform the listing.
    username: str
        User whose permissions are matched and whose home prefix is excluded.
    offset: int
        Offset for pagination.
    limit: int
        Number of results to return.

    Returns
    -------
    dict
        For the Elasticsearch branch: ``{'listing': [...], 'reachedEnd':
        bool}``, where ``listing`` holds one dict per hit and ``reachedEnd``
        is True when fewer than ``limit`` hits came back. For a truthy
        ``path``: whatever ``agave_listing`` returns.
    """
    if path:
        return agave_listing(client, system, path, offset, limit)

    # Convert the paging values once; both are reused below.
    offset, limit = int(offset), int(limit)

    pems_filter = Q('bool', should=[
        Q('term', **{'permissions.username': username}),
        Q('term', **{'permissions.username': '******'}),
    ])
    nested_filter = Q('nested', path='permissions', query=pems_filter)

    home_filter = Q('prefix', **{'path._exact': '/' + username})
    system_filter = Q('term',
                      **{'system._exact': 'designsafe.storage.default'})
    query = Q('bool',
              must_not=home_filter,
              filter=[nested_filter, system_filter])

    search = (IndexedFile.search()
              .filter(query)
              .sort('name._exact')
              .extra(from_=offset, size=limit))
    hits = [hit.to_dict() for hit in search.execute()]

    return {'listing': hits, 'reachedEnd': len(hits) < limit}
def repair_paths(limit=1000):
    """Rewrite ``path`` and ``basePath`` on every indexed file.

    Pages through the index ``limit`` documents at a time (sorted by
    ``_uid``, continued with ``search_after``). For each hit the new path
    comes from ``repair_path(hit.name, hit.path)`` and ``basePath`` is set
    to its directory name.

    NOTE(review): sorting on ``_uid`` depends on the Elasticsearch version
    in use -- confirm it is still available for the target cluster.

    :param int limit: Page size used for every search request.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile
    file_search = IndexedFile.search().sort('_uid').extra(size=limit)
    res = file_search.execute()

    while res.hits:
        for hit in res.hits:
            print(hit.name, hit.path)
            new_path = repair_path(hit.name, hit.path)
            # Write both fields in a single update request.
            hit.update(path=new_path, basePath=os.path.dirname(new_path))

            # use from_path to remove any duplicates.
            # IndexedFile.from_path(hit.system, hit.path)

        # The sort key of the last raw hit is the cursor for the next page.
        search_after = res.hits.hits[-1]['sort']
        logger.debug(search_after)
        file_search = IndexedFile.search().sort('_uid').extra(
            size=limit, search_after=search_after)
        res = file_search.execute()
def get(self, request):
    """Return the summed ``length`` of the requesting user's files as JSON.

    Runs a size-0 search over documents whose ``path._exact`` starts with
    ``'/' + username`` and reads the ``total_storage_bytes`` sum
    aggregation from the response.
    """
    home_prefix = Q("prefix",
                    **{"path._exact": '/' + request.user.username})
    usage_search = (IndexedFile.search()
                    .query('bool', must=[home_prefix])
                    .extra(size=0))
    usage_search.aggs.metric('total_storage_bytes', 'sum', field="length")

    aggregations = usage_search.execute().to_dict()["aggregations"]
    total = aggregations["total_storage_bytes"]["value"]
    return JsonResponse({"total_storage_bytes": total})
def test_attrs(self):
    """A bare ``IndexedFile`` exposes every expected attribute."""
    indexed_file = IndexedFile()
    expected_attrs = ('name', 'path', 'lastModified', 'length', 'format',
                      'mimeType', 'type', 'system')
    for attr in expected_attrs:
        self.assertTrue(hasattr(indexed_file, attr))
def test_update(self, mock_update):
    """``update`` passes its keyword arguments on to the patched update."""
    doc = IndexedFile(name='folder1', system='test.system',
                      path='/path/to/folder', format='folder')
    resource = BaseESResource(wrapped_doc=doc)

    resource.update(name='folder2')

    mock_update.assert_called_with(name='folder2')
def test_init(self, mock_wrap):
    """Constructing the resource calls the patched wrap with the document."""
    doc = IndexedFile(name='folder1', system='test.system',
                      path='/path/to/folder', format='folder')

    BaseESResource(wrapped_doc=doc)

    mock_wrap.assert_called_with(doc)
def test_from_path_1_hit(self, mock_search):
    """With exactly one hit, ``from_path`` returns that document."""
    only_doc = IndexedFile(name='res1', system='test.system',
                           path='/path/to/res1')

    response = MagicMock()
    response.hits.total = 1
    response.__getitem__.return_value = only_doc
    mock_search().filter().filter().execute.return_value = response

    doc_from_path = IndexedFile.from_path('test.system', '/path/to/res1')

    mock_search().filter.assert_called_with(
        'term', **{'system._exact': 'test.system'})
    mock_search().filter().filter.assert_called_with(
        'term', **{'path._exact': '/path/to/res1'})
    self.assertEqual(doc_from_path, only_doc)
def __init__(self, request=None, **kwargs):
    """Store the escaped query string and username, then init the base.

    With a truthy ``request`` the query string comes from ``request.GET``
    and the username from ``request.user``; otherwise both are read from
    the keyword arguments. Every ``/`` in the query string is replaced by
    ``\\/``.
    """
    params = request.GET if request else kwargs
    self.query_string = params.get('query_string').replace("/", "\\/")
    self.username = (request.user.username
                     if request else kwargs.get('username'))
    super(PrivateDataSearchManager, self).__init__(IndexedFile,
                                                   IndexedFile.search())
def repair_paths(limit=1000):
    """Bulk-update ``path`` and ``basePath`` for every indexed file.

    Reads the index ``limit`` documents at a time (sorted by ``_id``,
    continued with ``search_after``). Each page becomes one ``bulk`` call
    of update operations. Hits with no ``name`` or no ``path`` are skipped.

    :param int limit: Page size used for every search request.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    files_alias = settings.ES_INDICES['files']['alias']
    HOSTS = settings.ES_CONNECTIONS[settings.DESIGNSAFE_ENVIRONMENT]['hosts']
    es_client = Elasticsearch(hosts=HOSTS)

    def as_update_op(hit):
        # One partial-document update carrying the repaired path fields.
        new_path = repair_path(hit.name, hit.path)
        new_basepath = os.path.dirname(new_path)
        return {
            '_op_type': 'update',
            '_index': files_alias,
            '_type': 'file',
            '_id': hit.meta.id,
            'doc': {
                'path': new_path,
                'basePath': new_basepath
            }
        }

    page = IndexedFile.search().sort('_id').extra(size=limit).execute()
    while page.hits:
        update_ops = [
            as_update_op(hit) for hit in page.hits
            if not (hit.name is None or hit.path is None)
        ]

        # use from_path to remove any duplicates.
        # IndexedFile.from_path(hit.system, hit.path)

        bulk(es_client, update_ops)

        # The sort key of the last raw hit is the cursor for the next page.
        cursor = page.hits.hits[-1]['sort']
        logger.debug(cursor)
        page = IndexedFile.search().sort('_id').extra(
            size=limit, search_after=cursor).execute()
def test_to_dict(self, mock_to_dict):
    """``to_dict`` calls the patched ``to_dict`` with no arguments."""
    doc = IndexedFile(name='folder1', system='test.system',
                      path='/path/to/folder', format='folder')
    resource = BaseESResource(wrapped_doc=doc)

    resource.to_dict()

    mock_to_dict.assert_called_with()
def listing_recursive(self, system='designsafe.storage.default', path='/'):
    """Search for the documents matching ``system`` and ``path._path``.

    Results pass through ``self._pems_filter()`` and are sorted by
    ``name._exact`` ascending.

    :returns: ``(response, search)`` -- the executed response together
        with the search object that produced it.
    """
    conditions = Q(
        'bool',
        must=[
            Q('term', **{'system._exact': system}),
            Q('term', **{'path._path': path}),
        ],
        filter=self._pems_filter(),
    )
    search = (IndexedFile.search()
              .query(conditions)
              .sort({'name._exact': 'asc'}))
    return search.execute(), search
def test_class_init_with_wrap(self):
    """Init with a wrapped doc calls the base init and sets two attributes."""
    doc = IndexedFile(name='file1', system='test.system',
                      path='/path/to/file')

    BaseESFile('test_user', wrapped_doc=doc)

    self.mock_base_init.assert_called_with(doc)
    expected_setattr_calls = [
        call('username', 'test_user'),
        call('_reindex', False),
    ]
    self.mock_base_setattr.assert_has_calls(expected_setattr_calls)
def search(client, system, path, offset=0, limit=100, query_string='',
           **kwargs):
    """
    Perform a search for files using a query string.

    The query string is matched against file names in two ways that are
    OR-ed together: a loose match on ``name`` (``minimum_should_match`` of
    80%) and a strict match on ``name._exact`` / ``name._pattern``.
    Results are restricted to ``system`` and to paths starting with
    ``path``.

    Params
    ------
    client: NoneType
        Not used by this function.
    system: str
        Tapis system ID to filter on.
    path: str or NoneType
        Path prefix to search under. It is normalised to start and end
        with ``/``; ``None`` is treated like the empty string, i.e. ``/``.
    offset: int
        Search offset for pagination.
    limit: int
        Number of search results to return
    query_string: str
        Query string to pass to Elasticsearch

    Returns
    -------
    dict
        ``{'listing': [...], 'reachedEnd': bool}`` where ``listing`` is a
        list of dicts containing file metadata from Elasticsearch and
        ``reachedEnd`` is True when fewer than ``limit`` hits came back.
    """
    offset, limit = int(offset), int(limit)

    ngram_query = Q("query_string",
                    query=query_string,
                    fields=["name"],
                    minimum_should_match='80%',
                    default_operator='or')
    # Each field must be its own list element; a single comma-joined string
    # would be sent to Elasticsearch as one field name.
    match_query = Q("query_string",
                    query=query_string,
                    fields=["name._exact", "name._pattern"],
                    default_operator='and')

    # Normalise the prefix so that it is always of the form "/.../".
    path = path or ''
    if not path.startswith('/'):
        path = '/' + path
    if not path.endswith('/'):
        path = path + '/'

    search = IndexedFile.search()
    search = search.query(ngram_query | match_query)
    search = search.filter('prefix', **{'path._exact': path})
    search = search.filter('term', **{'system._exact': system})
    search = search.extra(from_=offset, size=limit)
    res = search.execute()
    hits = [hit.to_dict() for hit in res]

    return {'listing': hits, 'reachedEnd': len(hits) < limit}
def setUp(self):
    """Set the test user's password and index two one-byte files."""
    # configure regular user
    regular_user = get_user_model().objects.get(pk=2)
    regular_user.set_password('user/password')
    regular_user.save()

    # Two documents with the same length and path, saved one after another.
    for _ in range(2):
        IndexedFile(length=1, path="ds_user/test").save(refresh=True)
def __init__(self, request=None, **kwargs):
    """Store a wildcard-wrapped query string and initialise the base.

    ``query_string`` is read from ``request.GET`` when a truthy ``request``
    is given, otherwise from the keyword arguments. It is split on single
    spaces; every token other than ``AND``/``OR``/``NOT`` (compared
    case-insensitively) is wrapped in ``*...*`` before the tokens are
    joined back together with spaces.
    """
    params = request.GET if request else kwargs
    self.query_string = params.get('query_string')

    operators = ("AND", "OR", "NOT")
    wrapped_tokens = [
        token if token.upper() in operators else "*" + token + "*"
        for token in self.query_string.split(" ")
    ]
    self.query_string = " ".join(wrapped_tokens)

    super(PublishedDataSearchManager, self).__init__(IndexedFile,
                                                     IndexedFile.search())
def test_wrap(self, mock_update):
    """The doc is kept as ``_wrapped``; extra kwargs reach the patched update."""
    doc = IndexedFile(name='folder1', system='test.system',
                      path='/path/to/folder', format='folder')

    resource = BaseESResource(wrapped_doc=doc)
    self.assertEqual(resource._wrapped, doc)

    BaseESResource(wrapped_doc=doc, name='folder2')
    mock_update.assert_called_with(name='folder2')
def get(self, system='designsafe.storage.default', path='/', name=''):
    """Search for the file matching ``system``, ``path`` and ``name`` exactly.

    Results pass through ``self._pems_filter()`` and are sorted by
    ``name._exact`` ascending.

    :returns: ``(response, search)`` -- the executed response together
        with the search object that produced it.
    """
    exact_terms = (
        ('system._exact', system),
        ('path._exact', path),
        ('name._exact', name),
    )
    bool_query = Q(
        'bool',
        must=[Q('term', **{field: value}) for field, value in exact_terms],
        filter=self._pems_filter(),
    )
    search = (IndexedFile.search()
              .query(bool_query)
              .sort({'name._exact': 'asc'}))
    res = search.execute()
    # logger.debug('search :%s', json.dumps(search.to_dict(), indent=2))
    return res, search
def test_delete_no_dir(self, mock_delete):
    """Deleting a non-folder file calls the patched delete without arguments."""
    doc = IndexedFile(name='file1', system='test.system',
                      path='/path/to/file', format='file')
    es_file = BaseESFile('test_user', system='test.system', wrapped_doc=doc)
    # Written straight onto the instance, skipping any attribute hooks
    # defined by the class.
    object.__setattr__(es_file, '_wrapped', doc)

    es_file.delete()

    mock_delete.assert_called_with()
def test_getter_and_setter(self):
    """Known attributes go to the wrapped doc; unknown ones stay on the wrapper."""
    doc = IndexedFile(name='folder1', system='test.system',
                      path='/path/to/folder', format='folder')
    resource = BaseESResource(wrapped_doc=doc)

    # An attribute the wrapped document already has is written through.
    resource.name = 'folder2'
    self.assertEqual(resource.name, 'folder2')
    self.assertEqual(resource._wrapped.name, 'folder2')

    # A brand-new attribute is readable on the wrapper only.
    extra_value = 'this attr is not in the wrapped doc'
    resource.newAttr = extra_value
    self.assertEqual(resource.newAttr, extra_value)
    self.assertFalse(hasattr(resource._wrapped, 'newAttr'))