Example #1
 def _make_dataclass_offline(self, eid, dataset_types=None, cache_dir=None, **kwargs):
     if self._cache.size == 0:
         return SessionDataInfo()
     # select the session
     npeid = parquet.str2np(eid)[0]
     df = self._cache[self._cache['eid_0'] == npeid[0]]
     df = df[df['eid_1'] == npeid[1]]
     # select datasets
     df = df[ismember(df['dataset_type'], dataset_types)[0]]
     return SessionDataInfo.from_pandas(df, self._get_cache_dir(cache_dir))
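Example #1 filters the library's local parquet cache instead of querying Alyx: the session eid is converted to a pair of int64 halves and matched against the `eid_0`/`eid_1` columns, then the rows are narrowed to the requested dataset types. Below is a minimal pandas sketch of that filtering pattern; the cache contents and dataset-type names are made up, the `npeid` tuple stands in for `parquet.str2np(eid)[0]`, and `isin()` plays the role of `ismember()`.

import pandas as pd

# Toy cache table: the real cache stores each session eid split into two int64 halves.
cache = pd.DataFrame({
    'eid_0': [11, 11, 22],
    'eid_1': [99, 99, 88],
    'dataset_type': ['spikes.times', 'spikes.clusters', 'trials.intervals'],
})
npeid = (11, 99)  # stand-in for parquet.str2np(eid)[0]
df = cache[(cache['eid_0'] == npeid[0]) & (cache['eid_1'] == npeid[1])]
df = df[df['dataset_type'].isin(['spikes.times'])]  # isin() stands in for ismember()[0]
print(df)  # one row: the spikes.times entry for that session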
Example #2
 def _make_dataclass(self, eid, dataset_types=None, cache_dir=None, dry_run=False,
                     clobber=False, offline=False, keep_uuid=False):
     # resolve the cache directory (falls back to the parameter-file default)
     cache_dir = self._get_cache_dir(cache_dir)
     # get session json information as a dictionary from the alyx API
     try:
         ses = self.alyx.rest('sessions', 'read', id=eid)
     except requests.HTTPError:
         raise requests.HTTPError('Session ' + eid + ' does not exist')
     # filter by dataset types
     dc = SessionDataInfo.from_session_details(ses, dataset_types=dataset_types, eid=eid)
     # loop over each dataset and download if necessary
     with concurrent.futures.ThreadPoolExecutor(max_workers=NTHREADS) as executor:
         futures = []
         for ind in range(len(dc)):
             if dc.url[ind] is None or dry_run:
                 futures.append(None)
             else:
                 futures.append(executor.submit(
                     self.download_dataset, dc.url[ind], cache_dir=cache_dir, clobber=clobber,
                     offline=offline, keep_uuid=keep_uuid, file_size=dc.file_size[ind],
                     hash=dc.hash[ind]))
         concurrent.futures.wait(list(filter(lambda x: x is not None, futures)))
         for ind, future in enumerate(futures):
             if future is None:
                 continue
             dc.local_path[ind] = future.result()
     # filter by dataset types and update the cache
     self._update_cache(ses, dataset_types=dataset_types)
     return dc
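The interesting part of Example #2 is the download fan-out: one future per dataset, `None` placeholders for datasets without a URL (or for dry runs), a `wait()` on the real futures, then results collected back by index. A self-contained sketch of that pattern, with a dummy download function standing in for `self.download_dataset`:

import concurrent.futures

def fake_download(url):
    # Stand-in for self.download_dataset: pretend the file lands in /tmp.
    return '/tmp/' + url.rsplit('/', 1)[-1]

urls = ['http://example.org/a.npy', None, 'http://example.org/b.npy']  # None = nothing to fetch
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [None if u is None else executor.submit(fake_download, u) for u in urls]
    concurrent.futures.wait([f for f in futures if f is not None])
    local_paths = [None if f is None else f.result() for f in futures]
print(local_paths)  # ['/tmp/a.npy', None, '/tmp/b.npy']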
Example #3
 def _load(self, eid, dataset_types=None, dclass_output=False, dry_run=False, cache_dir=None,
           download_only=False, clobber=False, offline=False, keep_uuid=False):
     """
     From a Session ID and dataset types, queries the Alyx database, downloads the data
     from Globus, and loads it into numpy arrays. Single session only.
     """
     # if the input is a UUID, add the beginning of the URL to it
     cache_dir = self._get_cache_dir(cache_dir)
     if is_uuid_string(eid):
         eid = '/sessions/' + eid
     eid_str = eid[-36:]
     # get session json information as a dictionary from the alyx API
     try:
         ses = self.alyx.get('/sessions/' + eid_str)
     except requests.HTTPError:
         raise requests.HTTPError('Session ' + eid_str + ' does not exist')
     # ses = ses[0]
     # if no dataset_type is provided:
     # a) force the output to be a dictionary that provides context to the data
     # b) download all types that have a data url specified within the alf folder
     dataset_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
     if not dataset_types or dataset_types == ['__all__']:
         dclass_output = True
     dc = SessionDataInfo.from_session_details(ses, dataset_types=dataset_types, eid=eid_str)
     # loop over each dataset and download if necessary
     for ind in range(len(dc)):
         if dc.url[ind] and not dry_run:
             relpath = PurePath(dc.url[ind].replace(self._par.HTTP_DATA_SERVER, '.')).parents[0]
             cache_dir_file = PurePath(cache_dir, relpath)
             Path(cache_dir_file).mkdir(parents=True, exist_ok=True)
             dc.local_path[ind] = self._download_file(
                 dc.url[ind], str(cache_dir_file), clobber=clobber, offline=offline,
                 keep_uuid=keep_uuid, file_size=dc.file_size[ind], hash=dc.hash[ind])
     # load the files content in variables if requested
     if not download_only:
         for ind, fil in enumerate(dc.local_path):
             dc.data[ind] = load_file_content(fil)
     # parse output arguments
     if dclass_output:
         return dc
     # if required, parse the output as a list that matches dataset_types requested
     list_out = []
     for dt in dataset_types:
         if dt not in dc.dataset_type:
             _logger.warning('dataset ' + dt + ' not found for session: ' + eid_str)
             list_out.append(None)
             continue
         for i, x in enumerate(dc.dataset_type):
             if dt == x:
                 if dc.data[i] is not None:
                     list_out.append(dc.data[i])
                 else:
                     list_out.append(dc.local_path[i])
     return list_out
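In Example #3 each dataset's local destination is derived from its URL by stripping the data-server prefix and mirroring the remaining folder hierarchy under the cache directory. A standalone sketch of that path construction; the server address, URL and cache directory below are placeholders, not values from the library:

from pathlib import PurePath

HTTP_DATA_SERVER = 'https://data.example.org'  # placeholder for self._par.HTTP_DATA_SERVER
url = HTTP_DATA_SERVER + '/lab/Subjects/ZM_001/2020-01-01/001/alf/spikes.times.npy'
relpath = PurePath(url.replace(HTTP_DATA_SERVER, '.')).parents[0]
cache_dir_file = PurePath('/tmp/one_cache', relpath)
print(cache_dir_file)  # /tmp/one_cache/lab/Subjects/ZM_001/2020-01-01/001/alf
# the method then creates this folder with Path(cache_dir_file).mkdir(parents=True, exist_ok=True)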
Example #4
 def setUp(self):
     # Create two populated SessionDataInfo instances and an empty one for the tests
     dc1 = SessionDataInfo(
         dataset_type='first dtype',
         dataset_id='first uuid data',
         local_path='/first/path',
         eid='first uuid session',
         url='first url',
         data=np.array([1, 2, 3]),
     )
     dc2 = SessionDataInfo(
         dataset_type='second dtype',
         dataset_id='second uuid data',
         local_path='/second/path',
         eid='second uuid session',
         url='second url',
         data=np.array([1, 2, 3]),
     )
     self.dc1 = dc1
     self.dc2 = dc2
     self.dce = SessionDataInfo()
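Example #4 is a unittest setUp that builds two populated SessionDataInfo instances and one empty one for later assertions. For readers unfamiliar with the class, here is a hypothetical stand-in dataclass mirroring only the fields exercised above; the real SessionDataInfo is defined in the library and may store lists per field rather than scalars.

from dataclasses import dataclass
import numpy as np

@dataclass
class FakeSessionDataInfo:  # hypothetical stand-in, not the library class
    dataset_type: str = None
    dataset_id: str = None
    local_path: str = None
    eid: str = None
    url: str = None
    data: np.ndarray = None

dc1 = FakeSessionDataInfo(dataset_type='first dtype', data=np.array([1, 2, 3]))
print(dc1.dataset_type, dc1.data)  # first dtype [1 2 3]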
Example #5
 def _load(self,
           eid,
           dataset_types=None,
           dclass_output=False,
           dry_run=False,
           cache_dir=None,
           download_only=False,
           clobber=False,
           offline=False,
           keep_uuid=False):
     """
     From a Session ID and dataset types, queries the Alyx database, downloads the data
     from Globus, and loads it into numpy arrays. Single session only.
     """
     # if the input is a UUID, add the beginning of the URL to it
     cache_dir = self._get_cache_dir(cache_dir)
     if is_uuid_string(eid):
         eid = '/sessions/' + eid
     eid_str = eid[-36:]
     # get session json information as a dictionary from the alyx API
     try:
         ses = self.alyx.get('/sessions/' + eid_str)
     except requests.HTTPError:
         raise requests.HTTPError('Session ' + eid_str + ' does not exist')
     # ses = ses[0]
     # if no dataset_type is provided:
     # a) force the output to be a dictionary that provides context to the data
     # b) download all types that have a data url specified within the alf folder
     dataset_types = [dataset_types] if isinstance(dataset_types,
                                                   str) else dataset_types
     if not dataset_types or dataset_types == ['__all__']:
         dclass_output = True
     # this performs the filtering
     dc = SessionDataInfo.from_session_details(ses,
                                               dataset_types=dataset_types,
                                               eid=eid_str)
     # loop over each dataset and download if necessary
     with concurrent.futures.ThreadPoolExecutor(
             max_workers=NTHREADS) as executor:
         futures = []
         for ind in range(len(dc)):
             if dc.url[ind] is None or dry_run:
                 futures.append(None)
             else:
                 futures.append(
                     executor.submit(self.download_dataset,
                                     dc.url[ind],
                                     cache_dir=cache_dir,
                                     clobber=clobber,
                                     offline=offline,
                                     keep_uuid=keep_uuid,
                                     file_size=dc.file_size[ind],
                                     hash=dc.hash[ind]))
         concurrent.futures.wait(
             list(filter(lambda x: x is not None, futures)))
         for ind, future in enumerate(futures):
             if future is None:
                 continue
             dc.local_path[ind] = future.result()
     # load the files content in variables if requested
     if not download_only:
         for ind, fil in enumerate(dc.local_path):
             dc.data[ind] = load_file_content(fil)
     # parse output arguments
     if dclass_output:
         return dc
     # if required, parse the output as a list that matches dataset_types requested
     list_out = []
     for dt in dataset_types:
         if dt not in dc.dataset_type:
             _logger.warning('dataset ' + dt + ' not found for session: ' +
                             eid_str)
             list_out.append(None)
             continue
         for i, x in enumerate(dc.dataset_type):
             if dt == x:
                 if dc.data[i] is not None:
                     list_out.append(dc.data[i])
                 else:
                     list_out.append(dc.local_path[i])
     return list_out
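Example #5 is a re-formatted variant of Example #3 that also adopts the threaded download from Example #2. One piece worth isolating is the final matching loop, which pairs each requested dataset type with its loaded array, with the local file path when the content was not loaded, or with None when the session has no such dataset. A self-contained sketch with made-up values standing in for the dataclass fields:

import numpy as np

dc_dataset_type = ['spikes.times', 'spikes.clusters']   # stand-in for dc.dataset_type
dc_data = [np.array([0.1, 0.2]), None]                  # stand-in for dc.data
dc_local_path = ['/cache/spikes.times.npy', '/cache/spikes.clusters.npy']

list_out = []
for dt in ['spikes.times', 'spikes.clusters', 'trials.intervals']:
    if dt not in dc_dataset_type:
        list_out.append(None)  # requested type absent from this session
        continue
    for i, x in enumerate(dc_dataset_type):
        if dt == x:
            list_out.append(dc_data[i] if dc_data[i] is not None else dc_local_path[i])
print(list_out)  # [array([0.1, 0.2]), '/cache/spikes.clusters.npy', None]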
Example #6
    def load(self,
             eid,
             dataset_types=None,
             dclass_output=False,
             dry_run=False,
             cache_dir=None,
             download_only=False,
             clobber=False):
        """
        From a Session ID and dataset types, queries the Alyx database, downloads the data
        from Globus, and loads it into numpy arrays.

        :param eid: Experiment ID, for IBL this is the UUID of the Session as per Alyx
         database. Could be a full Alyx URL:
         'http://localhost:8000/sessions/698361f6-b7d0-447d-a25d-42afdef7a0da' or only the UUID:
         '698361f6-b7d0-447d-a25d-42afdef7a0da'
        :type eid: str
        :param dataset_types: [None]: Alyx dataset types to be returned.
        :type dataset_types: list
        :param dclass_output: [False]: forces the output to be a dataclass to provide context.
         If None or an empty dataset_types is specified, the output is a dataclass by default.
        :type dclass_output: bool
        :param cache_dir: temporarily overrides the cache_dir from the parameter file
        :type cache_dir: str
        :param download_only: do not attempt to load data in memory, just download the files
        :type download_only: bool
        :param clobber: force downloading even if files exist locally
        :type clobber: bool

        :return: List of numpy arrays matching the size of dataset_types parameter, OR
         a dataclass containing arrays and context data.
        :rtype: list, dict, dataclass SessionDataInfo
        """
        # if the input is a UUID, add the beginning of the URL to it
        cache_dir = self._get_cache_dir(cache_dir)
        if is_uuid_string(eid):
            eid = '/sessions/' + eid
        eid_str = eid[-36:]
        # get session json information as a dictionary from the alyx API
        ses = self.alyx.get('/sessions?id=' + eid_str)
        if not ses:
            raise FileNotFoundError('Session ' + eid_str + ' does not exist')
        ses = ses[0]
        # if no dataset_type is provided:
        # a) force the output to be a dictionary that provides context to the data
        # b) download all types that have a data url specified
        dataset_types = [dataset_types] if isinstance(dataset_types,
                                                      str) else dataset_types
        if not dataset_types:
            dclass_output = True
            dataset_types = [
                d['dataset_type'] for d in ses['data_dataset_session_related']
                if d['data_url']
            ]
        dc = SessionDataInfo.from_session_details(ses,
                                                  dataset_types=dataset_types)
        # loop over each dataset and download if necessary
        for ind in range(len(dc)):
            if dc.url[ind] and not dry_run:
                relpath = PurePath(dc.url[ind].replace(
                    self._par.HTTP_DATA_SERVER, '.')).parents[0]
                cache_dir_file = PurePath(cache_dir, relpath)
                Path(cache_dir_file).mkdir(parents=True, exist_ok=True)
                dc.local_path[ind] = self._download_file(
                    dc.url[ind], str(cache_dir_file), clobber)
        # load the files content in variables if requested
        if not download_only:
            for ind, fil in enumerate(dc.local_path):
                dc.data[ind] = _load_file_content(fil)
        # parse output arguments
        if dclass_output:
            return dc
        # if required, parse the output as a list that matches dataset_types requested
        list_out = []
        for dt in dataset_types:
            if dt not in dc.dataset_type:
                list_out.append(None)
                continue
            for i, x in enumerate(dc.dataset_type):
                if dt == x:
                    list_out.append(dc.data[i])
        return list_out
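Example #6 is the public load() method; its docstring shows that the eid may be either a bare session UUID or a full Alyx URL. The normalisation is just a suffix slice, since a UUID is always 36 characters long. A tiny self-contained check, using the URL from the docstring above:

eid = 'http://localhost:8000/sessions/698361f6-b7d0-447d-a25d-42afdef7a0da'
eid_str = eid[-36:]  # the trailing 36 characters are the session UUID
print(eid_str)       # 698361f6-b7d0-447d-a25d-42afdef7a0da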