Exemplo n.º 1
0
def full_dedup(limit=1000):
    """
    Walk the whole file index and re-resolve every document by path.

    The index is read ``limit`` documents at a time, ordered by ``_id`` and
    paginated with ``search_after``. For each hit that has both a name and a
    path, ``IndexedFile.from_path(system, path)`` is called.
    NOTE(review): sibling maintenance scripts describe ``from_path`` as the
    call that removes duplicate documents -- confirm against the model.

    Params
    ------
    limit: int
        Number of documents fetched per page.

    Returns
    -------
    None
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile

    def fetch_page(search_after=None):
        # The first page has no cursor; later pages resume after the last
        # sort value seen.
        extra = {'size': limit}
        if search_after is not None:
            extra['search_after'] = search_after
        return IndexedFile.search().sort('_id').extra(**extra).execute()

    res = fetch_page()
    while res.hits:
        for hit in res.hits:
            if hit.name is None or hit.path is None:
                continue

            print(hit.meta.id)
            try:
                IndexedFile.from_path(hit.system, hit.path)
            except Exception as e:
                # Best effort: report the failure and keep going so a single
                # bad document does not abort the whole pass.
                print(e)

        search_after = res.hits.hits[-1]['sort']
        logger.debug(search_after)
        res = fetch_page(search_after)
Exemplo n.º 2
0
    def listing(self,
                system,
                file_path,
                user_context=None,
                offset=None,
                limit=None):
        """
        Search shared files by name and wrap the hits in a listing dict.

        Matches ``self.query_string`` against file names, keeps only documents
        on ``system`` whose nested permissions name ``user_context``, and
        excludes paths prefixed with ``'/' + user_context`` or
        ``'<user_context>/.Trash'``. ``file_path`` is accepted but not used.

        Returns a dict describing a ``$SEARCHSHARED`` directory whose
        ``children`` are the ``to_dict()`` form of ``search[offset:limit]``
        (empty when the search reports no hits).
        """
        # Loose match on the analyzed name field.
        fuzzy_name_q = Q("query_string",
                         query=self.query_string,
                         fields=["name"],
                         minimum_should_match='80%',
                         default_operator='or')
        # Strict match on the exact/pattern sub-fields.
        strict_name_q = Q("query_string",
                          query=self.query_string,
                          fields=["name._exact", "name._pattern"],
                          default_operator='and')

        granted_to_user_q = Q("term", permissions__username=user_context)
        outside_home_q = Q(
            'bool',
            must_not=[Q({'prefix': {'path._exact': '/' + user_context}})])
        outside_trash_q = Q(
            'bool',
            must_not=[
                Q({'prefix': {'path._exact': '{}/.Trash'.format(user_context)}})
            ])

        search = (IndexedFile.search()
                  .filter("nested", path="permissions", query=granted_to_user_q)
                  .query(fuzzy_name_q | strict_name_q)
                  .query(outside_home_q)
                  .filter("term", system=system)
                  .query(outside_trash_q))
        res = search.execute()

        children = []
        if res.hits.total.value:
            # Slicing the search issues the paginated request.
            children = [doc.to_dict() for doc in search[offset:limit]]

        return {
            'trail': [{
                'name': '$SEARCHSHARED',
                'path': '/$SEARCH'
            }],
            'name': '$SEARCHSHARED',
            'path': '/$SEARCHSHARED',
            'system': system,
            'type': 'dir',
            'children': children,
            'permissions': 'READ'
        }
Exemplo n.º 3
0
    def test_init(self, mock_base):
        """Building the manager must call the base initializer with IndexedFile and a search.

        ``mock_base`` is presumably the patched base-class ``__init__``
        injected by a decorator outside this block -- TODO confirm.
        """
        fake_request = MagicMock(query_string='test_query', username='******')

        CommunityDataSearchManager(fake_request)

        mock_base.assert_called_with(IndexedFile, IndexedFile.search())
Exemplo n.º 4
0
    def __init__(self, request=None, **kwargs):
        """
        Store the escaped query string and initialise the base manager.

        The query string is read from ``request.GET`` when a request is
        given, otherwise from the ``query_string`` keyword argument. Every
        ``/`` in it is replaced with ``\\/``.
        """
        params = request.GET if request else kwargs
        self.query_string = params.get('query_string').replace("/", "\\/")

        super(PublishedDataSearchManager, self).__init__(
            IndexedFile, IndexedFile.search())
Exemplo n.º 5
0
def listing(client,
            system,
            path,
            username,
            offset=0,
            limit=100,
            *args,
            **kwargs):
    """
    List files shared with a user.

    With a non-empty ``path`` the call is delegated to ``agave_listing``.
    With an empty ``path`` Elasticsearch is queried for files whose nested
    permissions name either ``username`` or the wildcard user, excluding
    anything whose path starts with ``'/' + username``.

    Params
    ------
    client: agavepy.agave.Agave
        Tapis client to use for the listing.
    system: str
        Tapis system ID. Only forwarded to ``agave_listing``; the
        Elasticsearch branch always filters on
        ``designsafe.storage.default``.
    path: str
        Path in which to perform the listing. Falsy means "root of the
        shared view" and triggers the Elasticsearch branch.
    username: str
        User whose shared files are listed.
    offset: int
        Offset for pagination.
    limit: int
        Number of results to return.

    Returns
    -------
    dict
        For the Elasticsearch branch: ``{'listing': [file dicts],
        'reachedEnd': bool}``. Otherwise whatever ``agave_listing`` returns.
    """

    if path:
        return agave_listing(client, system, path, offset, limit)

    offset = int(offset)
    limit = int(limit)

    username_q = Q('term', **{'permissions.username': username})
    world_q = Q('term', **{'permissions.username': '******'})
    pems_filter = Q('bool', should=[username_q, world_q])
    nested_filter = Q('nested', path='permissions', query=pems_filter)

    home_filter = Q('prefix', **{'path._exact': '/' + username})
    # NOTE(review): the ``system`` argument is not used here; the storage
    # system is hard-coded. Confirm this is intentional.
    system_filter = Q('term',
                      **{'system._exact': 'designsafe.storage.default'})
    query = Q('bool',
              must_not=home_filter,
              filter=[nested_filter, system_filter])

    search = IndexedFile.search().filter(query).sort('name._exact').extra(
        from_=offset, size=limit)
    res = search.execute()

    hits = [hit.to_dict() for hit in res]

    # A short page means there is nothing left to fetch.
    return {'listing': hits, 'reachedEnd': len(hits) < limit}
Exemplo n.º 6
0
def repair_paths(limit=1000):
    """
    Rewrite ``path`` and ``basePath`` on every indexed file.

    The index is read ``limit`` documents at a time, ordered by ``_uid`` and
    paginated with ``search_after``. For each hit the path returned by
    ``repair_path(name, path)`` is stored as ``path`` and its directory name
    as ``basePath``.

    Params
    ------
    limit: int
        Number of documents fetched per page.

    Returns
    -------
    None
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile

    def fetch_page(search_after=None):
        # The first page has no cursor; later pages resume after the last
        # sort value seen.
        extra = {'size': limit}
        if search_after is not None:
            extra['search_after'] = search_after
        return IndexedFile.search().sort('_uid').extra(**extra).execute()

    res = fetch_page()
    while res.hits:
        for hit in res.hits:
            # %-formatting keeps the output identical on Python 2 and 3.
            print('%s %s' % (hit.name, hit.path))
            new_path = repair_path(hit.name, hit.path)
            # One partial update for both fields instead of two round trips.
            hit.update(path=new_path, basePath=os.path.dirname(new_path))

        search_after = res.hits.hits[-1]['sort']
        logger.debug(search_after)
        res = fetch_page(search_after)
Exemplo n.º 7
0
 def get(self, request):
     """Return, as JSON, the summed ``length`` of files under the requesting user's home prefix."""
     home_prefix = '/' + request.user.username
     in_home_q = Q("prefix", **{"path._exact": home_prefix})

     # size=0: only the aggregation is needed, not the matching documents.
     usage_search = IndexedFile.search().query('bool', must=[in_home_q]).extra(size=0)
     usage_search.aggs.metric('total_storage_bytes', 'sum', field="length")

     aggregations = usage_search.execute().to_dict()["aggregations"]
     total_bytes = aggregations["total_storage_bytes"]["value"]
     return JsonResponse({"total_storage_bytes": total_bytes})
Exemplo n.º 8
0
    def __init__(self, request=None, **kwargs):
        """
        Store the escaped query string and username, then initialise the base manager.

        Values come from the request (``GET['query_string']`` and
        ``user.username``) when one is given, otherwise from the
        ``query_string`` and ``username`` keyword arguments. Every ``/`` in
        the query string is replaced with ``\\/``.
        """
        params = request.GET if request else kwargs
        self.query_string = params.get('query_string').replace("/", "\\/")
        self.username = (request.user.username
                         if request else kwargs.get('username'))

        super(PrivateDataSearchManager, self).__init__(
            IndexedFile, IndexedFile.search())
Exemplo n.º 9
0
def repair_paths(limit=1000):
    """
    Bulk-rewrite ``path`` and ``basePath`` on every indexed file.

    Reads the index ``limit`` documents at a time (ordered by ``_id``,
    paginated with ``search_after``) and sends one bulk request of partial
    updates per page. Hits with no name or no path are skipped.
    """
    from designsafe.apps.data.models.elasticsearch import IndexedFile
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    index_alias = settings.ES_INDICES['files']['alias']
    client = Elasticsearch(
        hosts=settings.ES_CONNECTIONS[settings.DESIGNSAFE_ENVIRONMENT]['hosts'])

    def build_update(hit):
        # Partial-update action carrying the repaired path and its parent dir.
        fixed_path = repair_path(hit.name, hit.path)
        return {
            '_op_type': 'update',
            '_index': index_alias,
            '_type': 'file',
            '_id': hit.meta.id,
            'doc': {
                'path': fixed_path,
                'basePath': os.path.dirname(fixed_path)
            }
        }

    page = IndexedFile.search().sort('_id').extra(size=limit).execute()
    while page.hits:
        actions = [
            build_update(hit) for hit in page.hits
            if hit.name is not None and hit.path is not None
        ]
        bulk(client, actions)

        cursor = page.hits.hits[-1]['sort']
        logger.debug(cursor)
        page = IndexedFile.search().sort('_id').extra(
            size=limit, search_after=cursor).execute()
Exemplo n.º 10
0
 def listing_recursive(self, system='designsafe.storage.default', path='/'):
     """Search files on ``system`` whose ``path._path`` term equals ``path``.

     Results are restricted by ``self._pems_filter()`` and sorted by
     ``name._exact`` ascending. Returns ``(response, search)``.
     NOTE(review): ``path._path`` is presumably a path-hierarchy sub-field,
     which would make this match every descendant -- confirm in the mapping.
     """
     on_system_q = Q('term', **{'system._exact': system})
     under_path_q = Q('term', **{'path._path': path})
     criteria = Q('bool',
                  must=[on_system_q, under_path_q],
                  filter=self._pems_filter())

     search = IndexedFile.search().query(criteria).sort({'name._exact': 'asc'})
     return search.execute(), search
Exemplo n.º 11
0
def search(client,
           system,
           path,
           offset=0,
           limit=100,
           query_string='',
           **kwargs):
    """
    Perform a search for files using a query string.

    Params
    ------
    client: NoneType
        Unused; accepted for signature compatibility.
    system: str
        Tapis system ID to filter on.
    path: str
        Folder to search under. A leading and a trailing slash are added
        when missing, and only files whose path starts with the result are
        returned.
    offset: int
        Search offset for pagination.
    limit: int
        Number of search results to return
    query_string: str
        Query string to pass to Elasticsearch

    Returns
    -------
    dict
        ``{'listing': [file dicts from Elasticsearch], 'reachedEnd': bool}``

    """
    ngram_query = Q("query_string",
                    query=query_string,
                    fields=["name"],
                    minimum_should_match='80%',
                    default_operator='or')
    # Each field must be its own list entry; a single comma-joined string is
    # treated as one (non-existent) field name.
    match_query = Q("query_string",
                    query=query_string,
                    fields=["name._exact", "name._pattern"],
                    default_operator='and')

    if not path.startswith('/'):
        path = '/' + path
    if not path.endswith('/'):
        path = path + '/'
    search = IndexedFile.search()
    search = search.query(ngram_query | match_query)
    search = search.filter('prefix', **{'path._exact': path})
    search = search.filter('term', **{'system._exact': system})
    search = search.extra(from_=int(offset), size=int(limit))
    res = search.execute()
    hits = [hit.to_dict() for hit in res]

    # A short page means there is nothing left to fetch.
    return {'listing': hits, 'reachedEnd': len(hits) < int(limit)}
Exemplo n.º 12
0
    def __init__(self, request=None, **kwargs):
        """
        Store a wildcard-wrapped query string and initialise the base manager.

        The raw query string comes from ``request.GET`` when a request is
        given, otherwise from the ``query_string`` keyword argument. It is
        split on single spaces and every token other than ``AND``, ``OR`` or
        ``NOT`` (case-insensitive) is wrapped in ``*``.
        """
        params = request.GET if request else kwargs
        self.query_string = params.get('query_string')

        boolean_operators = ("AND", "OR", "NOT")
        tokens = [
            token if token.upper() in boolean_operators else "*" + token + "*"
            for token in self.query_string.split(" ")
        ]
        self.query_string = " ".join(tokens)

        super(PublishedDataSearchManager, self).__init__(
            IndexedFile, IndexedFile.search())
Exemplo n.º 13
0
 def get(self, system='designsafe.storage.default', path='/', name=''):
     """Search for the file with exactly this ``system``, ``path`` and ``name``.

     Results are restricted by ``self._pems_filter()`` and sorted by
     ``name._exact`` ascending. Returns ``(response, search)``.
     """
     exact_terms = [
         Q('term', **{'system._exact': system}),
         Q('term', **{'path._exact': path}),
         Q('term', **{'name._exact': name}),
     ]
     criteria = Q('bool', must=exact_terms, filter=self._pems_filter())

     search = IndexedFile.search().query(criteria).sort({'name._exact': 'asc'})
     return search.execute(), search
Exemplo n.º 14
0
    def listing(self, system='designsafe.storage.default', path='/'):
        """Search for documents stored at exactly ``path`` on ``system``.

        Results are restricted by ``self._pems_filter()`` and sorted by
        ``name._exact`` ascending.

        :param str system: System Id. Default: designsafe.storage.default
        :param str path: Path
        :returns: ``(response, search)`` tuple
        """
        logger.debug('listing %s', os.path.join(system, path))

        criteria = Q('bool',
                     must=[
                         Q('term', **{'system._exact': system}),
                         Q('term', **{'path._exact': path}),
                     ],
                     filter=self._pems_filter())
        search = IndexedFile.search().query(criteria).sort(
            {'name._exact': 'asc'})

        response = search.execute()
        logger.debug('res %s', str(response.hits.total))
        return response, search
Exemplo n.º 15
0
 def tearDown(self):
     """Run a search for up to 10000 documents whose ``path._path`` matches ``ds_user``.

     The response is discarded; nothing is deleted here.
     """
     ds_user_q = Q("match", **{"path._path": "ds_user"})
     search = IndexedFile.search().query('bool', must=[ds_user_q]).extra(size=10000)
     search.execute()
Exemplo n.º 16
0
def search(client,
           system,
           path,
           username,
           offset=0,
           limit=100,
           query_string='',
           **kwargs):
    """
    Perform a search for shared files using a query string.

    Only files whose nested permissions name ``username`` or the wildcard
    user are returned, and anything whose path starts with
    ``'/' + username`` is excluded.

    Params
    ------
    client: NoneType
        Unused; accepted for signature compatibility.
    system: str
        Tapis system ID. Not used by the query, which always filters on
        ``designsafe.storage.default``.
    path: str
        Folder to search under. A leading and a trailing slash are added
        when missing.
    username: str
        User whose shared files are searched.
    offset: int
        Search offset for pagination.
    limit: int
        Number of search results to return
    query_string: str
        Query string to pass to Elasticsearch

    Returns
    -------
    dict
        ``{'listing': [file dicts from Elasticsearch], 'reachedEnd': bool}``

    """
    # Add leading slash to match agave formatting.
    if not path.startswith('/'):
        path = '/' + path
    # Add trailing slash so that prefix search in a folder doesn't return that folder.
    if not path.endswith('/'):
        path = path + '/'

    ngram_query = Q("query_string",
                    query=query_string,
                    fields=["name"],
                    minimum_should_match='80%',
                    default_operator='or')
    # Each field must be its own list entry; a single comma-joined string is
    # treated as one (non-existent) field name.
    match_query = Q("query_string",
                    query=query_string,
                    fields=["name._exact", "name._pattern"],
                    default_operator='and')

    username_q = Q('term', **{'permissions.username': username})
    world_q = Q('term', **{'permissions.username': '******'})
    pems_filter = Q('bool', should=[username_q, world_q])

    nested_filter = Q('nested')
    nested_filter.path = 'permissions'
    nested_filter.query = pems_filter

    home_filter = Q('prefix', **{'path._exact': '/' + username})
    # NOTE(review): the ``system`` argument is not used here; the storage
    # system is hard-coded. Confirm this is intentional.
    system_filter = Q('term',
                      **{'system._exact': 'designsafe.storage.default'})
    query = Q('bool',
              must_not=home_filter,
              filter=[nested_filter, system_filter])

    search = IndexedFile.search().filter(query)
    search = search.query(ngram_query | match_query)
    search = search.filter('prefix', **{'path._exact': path})

    search = search.extra(from_=int(offset), size=int(limit))
    res = search.execute()
    hits = [hit.to_dict() for hit in res]

    # A short page means there is nothing left to fetch.
    return {'listing': hits, 'reachedEnd': len(hits) < int(limit)}
Exemplo n.º 17
0
    def listing(system, file_path, user_context=None, offset=None, limit=None):
        """
        Build a directory-style listing dict from the file index.

        ``file_path`` defaults to ``'/'`` and is stripped of leading and
        trailing slashes. The special value ``'$SHARE'`` lists files on
        ``system`` that are visible to ``user_context`` through the nested
        permissions filter and that are not under ``'/' + user_context``.
        Any other value lists documents matching that path on ``system``.

        ``offset`` and ``limit`` are accepted but not used in this body.

        Returns a dict with ``trail``, ``name``, ``path``, ``system``,
        ``type``, ``children`` (``to_dict()`` of each hit) and
        ``permissions`` (``'NONE'`` for the ``$SHARE`` root, ``'READ'``
        otherwise).

        Raises ``TransportError``/``ConnectionTimeout`` when the search fails
        with status 404, or when the single retry also fails.
        """
        file_path = file_path or '/'
        file_path = file_path.strip('/')
        # Paths outside the user's own top-level folder are matched on
        # ``path._path``; paths inside it are matched on ``path._exact``.
        if file_path.strip('/').split('/')[0] != user_context:
            if file_path == '$SHARE':
                q = Q('bool', must=[Q('term', **{'system._exact': system})])
            else:
                q = Q('bool',
                      must=[
                          Q('term', **{'path._path': file_path}),
                          Q('term', **{'system._exact': system})
                      ])
        else:
            q = Q('bool',
                  must=[
                      Q('term', **{'path._exact': file_path}),
                      Q('term', **{'system._exact': system})
                  ])
        # Permission filter: the user themselves or the wildcard user.
        # It is only defined when a user context is given and only used by
        # the ``$SHARE`` branch below.
        if user_context is not None:
            username_q = Q('term', **{'permissions.username': user_context})
            world_q = Q('term', **{'permissions.username': '******'})
            pems_filter = Q('bool')
            pems_filter.should = [username_q, world_q]
            nested_filter = Q('nested')
            nested_filter.path = 'permissions'
            nested_filter.query = pems_filter

        if file_path == '$SHARE':
            # From here on '/' marks "this is the $SHARE root listing".
            file_path = '/'
            # NOTE(review): with user_context=None the concatenation below
            # raises TypeError (and nested_filter would be undefined) --
            # confirm callers always pass a user for $SHARE.
            home_filter = Q('bool',
                            must_not=Q('term',
                                       **{'path._path': '/' + user_context}))
            query = Q('bool', must=q, filter=[nested_filter, home_filter])
        else:
            query = Q('bool', must=q)

        search = IndexedFile.search()
        search.query = query
        search = search.sort('path._exact', 'name._exact')

        try:
            res = search.execute()
        except (TransportError, ConnectionTimeout) as e:
            # A 404 is re-raised as-is; any other failure gets one retry.
            if getattr(e, 'status_code', 500) == 404:
                raise
            res = search.execute()

        # file_path was stripped of slashes above, so it can only equal '/'
        # when it was reset in the $SHARE branch.
        if file_path == '/':
            result = {
                'trail': [{
                    'name': '$SHARE',
                    'path': '/$SHARE'
                }],
                'name': '$SHARE',
                'path': '/$SHARE',
                'system': system,
                'type': 'dir',
                'children': [],
                'permissions': 'NONE'
            }
        else:
            file_path_comps = file_path.split('/')
            # NOTE(review): this compares a list with a string, so it is
            # always True and the empty root component is always inserted.
            if file_path_comps != '':
                file_path_comps.insert(0, '')

            # One breadcrumb per path component, starting with the root.
            trail_comps = [{
                'name': file_path_comps[i] or '/',
                'system': system,
                'path': '/'.join(file_path_comps[0:i + 1]) or '/',
            } for i in range(0, len(file_path_comps))]
            result = {
                'trail': trail_comps,
                'name': os.path.split(file_path)[1],
                'path': file_path,
                'system': system,
                'type': 'dir',
                'children': [],
                'permissions': 'READ'
            }

        for f in res:
            result['children'].append(f.to_dict())

        return result