def get_start_date(self, exchange: str, data_type: str, pair: str) -> float:
    """Return the earliest stored timestamp (epoch seconds) for the given
    exchange / data_type / pair, or None when nothing is stored yet.

    Collects the first stored parquet entry from local disk (unless
    `self.del_file` is set) and from every configured remote backend,
    verifies all backends agree on that first entry, then reads the first
    value of its 'timestamp' column.

    NOTE(review): despite the `-> float` annotation, this returns None when
    no data exists anywhere — callers must handle that.
    """
    objs = []   # per-remote-backend listing results (limit=1 each)
    files = []  # locally kept parquet files, if any
    if not self.del_file:
        # Local files are retained after upload, so the earliest matching
        # local file is a candidate for the start date.
        file_pattern = f'{exchange}-{data_type}-{pair}-[0-9]*.parquet'
        files = glob.glob(file_pattern)
    if self._read:
        # Ask each remote backend for its first stored object under the
        # exchange/data_type/pair prefix.
        for func, bucket, prefix, kwargs in zip(self._list, self.bucket, self.prefix, self.kwargs):
            path = f'{exchange}/{data_type}/{pair}/'
            if prefix:
                path = f"{prefix}/{path}"
            ret = func(bucket, path, limit=1, **kwargs)
            objs.append(ret)
    if not files and not any(objs):
        # Nothing stored locally or remotely yet.
        return None
    if files:
        files = sorted(files)
        start = files[0]
    else:
        start = objs[0][0]
    # Every backend must agree on the earliest stored entry, otherwise we
    # cannot safely decide where backfilling should stop.
    # NOTE(review): when local files exist, `start` is a local filename while
    # each `entry[0]` is a remote listing key — confirm both use the same
    # naming scheme, or this check may raise spuriously.
    for entry in objs:
        if entry[0] != start:
            raise InconsistentStorage(
                "Stored data differs, cannot backfill")
    if files:
        # Read the first timestamp straight out of the local parquet file.
        return float(
            pq.read_table(files[0], columns=['timestamp']).to_pandas().timestamp[0])
    else:
        # Download the first remote object to a temp file, read its first
        # timestamp, then remove the temp file.
        tmp = f'{exchange}-{pair}-temp.parquet'
        self._read[0](self.bucket[0], objs[0][0], tmp, **self.kwargs[0])
        start = float(
            pq.read_table(tmp, columns=['timestamp']).to_pandas().timestamp[0])
        os.remove(tmp)
        return start
def __init__(self, creds: str, exchanges: dict, prefix: str, folder_name_sep: str, path: Callable[[str, str, str], str]):
    """
    Initialize a `drive` service, and create the list of folders' IDs,
    either retrieving them if already existing, or creating them if not
    existing.

    Parameters:
        creds (str): Path to credential file. If falsy, Application
            Default Credentials are used instead (environment variable
            GOOGLE_APPLICATION_CREDENTIALS).
        exchanges (dict): List of exchanges with related data types and
            pairs that are retrieved by cryptostore.
        prefix (str): Base folder into which storing recorded data. Must
            already exist in Google Drive and be unique by name.
        folder_name_sep (str): Separator to be used between `exchange`,
            `data_type` and `pair` in Google Drive folder name.
        path (Callable[[str, str, str], str]): Function from which deriving
            folders' name.

    Raises:
        InconsistentStorage: If the prefix folder is missing, duplicated,
            or if two existing child folders share the same name.
    """
    httplib2 = StorageEngines['httplib2']
    self.folder_name_sep = folder_name_sep
    # Initialize a drive service, with an authorized caching-enabled
    # `http` object.
    if creds:
        google = StorageEngines['google.oauth2.service_account']
        self.creds = google.oauth2.service_account.Credentials.from_service_account_file(
            creds).with_scopes(['https://www.googleapis.com/auth/drive'])
    else:
        # Use environment variable GOOGLE_APPLICATION_CREDENTIALS
        google = StorageEngines['google.auth']
        self.creds, _ = google.auth.default(
            scopes=['https://www.googleapis.com/auth/drive'])
    googleapiclient = StorageEngines['googleapiclient._auth']
    auth_http = googleapiclient._auth.authorized_http(self.creds)
    # NOTE(review): `self.cache_path` is not set in this method — presumably
    # a class attribute defined elsewhere; confirm.
    auth_http.cache = httplib2.FileCache(self.cache_path)
    googleapiclient = StorageEngines['googleapiclient.discovery']
    self.drive = googleapiclient.discovery.build('drive', 'v3', http=auth_http)
    files = self.drive.files()
    # Retrieve candidates for child and parent folders in Google Drive,
    # paging through all non-trashed folders.
    # `pageSize` is by default 100 and is limited to 1000.
    g_drive_folders = []
    request = files.list(
        q="mimeType = 'application/vnd.google-apps.folder' and trashed = false",
        pageSize=800,
        fields='nextPageToken, files(id, name, parents)')
    while request is not None:
        res = request.execute()
        g_drive_folders.extend(res.get('files', []))
        # `list_next` returns None once there are no more pages.
        request = files.list_next(request, res)
    # Retrieve parent folder ID (prefix).
    p_folders = [
        folder['id'] for folder in g_drive_folders
        if folder['name'] == prefix
    ]
    if len(p_folders) > 1:
        # If 2 or more folders share the prefix name, throw an error. We do
        # not know which one is the right one to record data.
        raise InconsistentStorage(
            "At least 2 parent folders identified with \
name {!s}. Please, make sure to provide a prefix corresponding to a unique \
folder name in your Google Drive space.".format(prefix))
    elif not p_folders:
        # If parent folder is not found, ask the user to create one.
        raise InconsistentStorage(
            "No existing folder found with name {!s}. \
Please, make sure to provide a prefix corresponding to an existing and \
accessible folder.".format(prefix))
    else:
        p_folder_id = p_folders[0]
    # Manage child folders. Build list of folders' names expected from the
    # configured exchanges / data types / pairs.
    c_folders = []
    for exchange in exchanges:
        for dtype in exchanges[exchange]:
            # Skip over the retries arg in the config if present.
            if dtype in {'retries', 'channel_timeouts'}:
                continue
            for pair in exchanges[exchange][
                    dtype] if 'symbols' not in exchanges[exchange][
                        dtype] else exchanges[exchange][dtype]['symbols']:
                c_folders.append(
                    folder_name_sep.join(
                        path(exchange, dtype, pair).split('/')))
    # Retrieve ID for existing ones (must sit directly under the prefix
    # folder).
    existing_childs = [
        (folder['name'], folder['id']) for folder in g_drive_folders
        if ((folder['name'] in c_folders) and ('parents' in folder) and (
            p_folder_id in folder['parents']))
    ]
    # If duplicates in folder names, throw an exception: dict() collapses
    # duplicate names, so a size difference reveals them.
    existing_as_dict = dict(existing_childs)
    n = len(existing_childs) - len(existing_as_dict)
    if n != 0:
        raise InconsistentStorage(
            "{!s} existing folder(s) share(s) same name with another. \
Please, clean content of {!s} folder.".format(n, prefix))
    # Get missing ones and create corresponding child folders in batch.
    missing_childs = list(set(c_folders) - set(existing_as_dict))
    # Number of calls in batch is limited to 1000.
    call_limit = 800
    missing_in_chunks = [
        missing_childs[x:x + call_limit]
        for x in range(0, len(missing_childs), call_limit)
    ]
    # Setup & operate requests in batch. The callback records each newly
    # created folder's ID under its name, directly into `existing_as_dict`.
    def _callback(request_id, response, exception, keep=existing_as_dict):
        keep[response['name']] = response['id']
        return
    for sub_list in missing_in_chunks:
        batch = self.drive.new_batch_http_request(_callback)
        for folder in sub_list:
            folder_metadata = {
                'name': folder,
                'mimeType': 'application/vnd.google-apps.folder',
                'parents': [p_folder_id]
            }
            batch.add(files.create(body=folder_metadata, fields='id, name'))
        batch.execute()
    # Final mapping: folder name -> folder ID, for all expected folders.
    self.folders = existing_as_dict
def _get_folder_in_parent(drive: gad.Resource, path: str) -> Tuple[str, str]: """ Retrieve folder ID from given name and parent folder name. If not existing, it is created. Parameters: drive (gad.Resource): Service with which interacting with Google Drive. path (str): path = '{prefix}/{exchange}/{data_type}/{pair}/ {exchange}-{data_type}-{pair}-{int(timestamp)}.parquet' String from which is retrieved `prefix` (parent folder) and name of child folder '{exchange}-{data_type}-{pair}'. Returns: folder_id, folder_name (Tuple[str, str]): Id of child folder '{exchange}-{data_type}-{pair}'. Create it if not existing. """ # Retrieve parent folder (prefix), and child folder. path_struct = path.split('/') folder_name = '-'.join(path_struct[1:4]) if len(path_struct) > 5: # If larger than 5, it means prefix is more than a single folder. # This case is not supported. raise InconsistentStorage("Prefix {!s} appears to be a path. Only a single folder name is accepted.".format(folder_name)) parent_name = path_struct[0] # Retrieve candidates for child and parent folders. res = drive.files().list(q="(name = '" + parent_name + "' or name = '" + folder_name + "') and mimeType = 'application/vnd.google-apps.folder' and trashed = false", pageSize=20, fields='files(id, name, parents)').execute() folders = res.get('files', []) # Manage parent folder. p_folders = [(folder['id'], folder['name']) for folder in folders if folder['name'] == parent_name] if len(p_folders) > 1: # If more than 2 folders with the same name, throw an error. We do not # know which one is the right one to record data. raise InconsistentStorage("At least 2 parent folders identified with \ name {!s}. Please, make sure to provide a prefix corresponding to a unique \ folder name in your Google Drive space.".format(parent_name)) elif not p_folders: # If parent folder is not found, ask the user to create one. raise InconsistentStorage("No existing folder found with name {!s}. 
\ Please, make sure to provide a prefix corresponding to an existing and \ accessible folder.".format(parent_name)) else: p_folder_id = p_folders[0][0] # Manage child folder. c_folders = [(folder['id'], folder['name']) for folder in folders if ((folder['name'] == folder_name) and ('parents' in folder) and (p_folder_id in folder['parents']))] if len(c_folders) > 1: # If more than 2 folders with the same name, throw an error. We do not # know which one is the right one to record data. raise InconsistentStorage("At least 2 folders identified with name {!s}. Please, clean content of parent folder.".format(folder_name)) elif not c_folders: # If folder not found, create it. folder_metadata = {'name': folder_name, 'mimeType': 'application/vnd.google-apps.folder', 'parents': [p_folder_id]} folder = drive.files().create(body=folder_metadata, fields='id')\ .execute() return folder.get('id'), folder_name else: # Single folder found. return folders[0]['id'], folder_name
def _worker(self, exchange):
    """Backfill historical trades for every configured pair on `exchange`.

    For each pair, waits until stored data exists (so there is a known
    endpoint), then fetches trades via REST backwards in time, one
    day-segment at a time, from the earliest stored timestamp down to the
    configured backfill start date. Per-pair failures are logged and do not
    abort the remaining pairs.
    """
    r = Rest()
    storage = Storage(self.config)
    for pair in self.config.backfill[exchange]:
        try:
            start = self.config.backfill[exchange][pair].start
            # Wait until data exists in storage: its earliest timestamp is
            # the (exclusive) end of the range we must backfill.
            while True:
                # `get_start_date` here returns one entry per storage
                # medium; all media must agree before we trust the value.
                # NOTE(review): `end[0]` assumes a non-empty list — confirm
                # at least one storage medium is always configured.
                end = storage.get_start_date(exchange, 'trades', pair)
                if not all(e == end[0] for e in end):
                    raise InconsistentStorage(
                        "Stored data differs, cannot backfill")
                end = end[0]
                if end:
                    break
                time.sleep(10)
            # Step just before the first stored trade to avoid overlap.
            end = Timestamp(end, unit='s')
            end -= Timedelta(microseconds=1)
            start = Timestamp(start)
            if end <= Timestamp(start):
                LOG.info(
                    "Data in storage is earlier than backfill start date for %s - %s",
                    exchange, pair)
                continue
            LOG.info("Backfill - Starting for %s - %s for range %s - %s",
                     exchange, pair, start, str(end))
            # Backfill from end date to start date, 1 day at a time, in reverse order (from end -> start)
            while start < end:
                # Segment begins at midnight of `end`'s day, clamped to the
                # requested start date.
                seg_start = end.replace(hour=0,
                                        minute=0,
                                        second=0,
                                        microsecond=0,
                                        nanosecond=0)
                if start > seg_start:
                    seg_start = start
                LOG.info("Backfill - Reading %s to %s for %s - %s",
                         seg_start, end, exchange, pair)
                trades = []
                try:
                    for t in r[exchange].trades(pair, str(seg_start), str(end)):
                        trades.extend(t)
                except Exception:
                    # Best-effort retry: wait 5 minutes and re-attempt the
                    # same segment.
                    LOG.warning(
                        "Backfill - encountered error backfilling %s - %s, trying again...",
                        exchange, pair, exc_info=True)
                    time.sleep(300)
                    continue
                if not trades:
                    # Empty segment: move to the previous day without
                    # writing anything.
                    end = seg_start - Timedelta(nanoseconds=1)
                    continue
                storage.aggregate(trades)
                storage.write(exchange, 'trades', pair, end.timestamp())
                LOG.info("Backfill - Wrote %s to %s for %s - %s", seg_start,
                         end, exchange, pair)
                # Step just before this segment's start for the next pass.
                end = seg_start - Timedelta(nanoseconds=1)
            LOG.info("Backfill for %s - %s completed", exchange, pair)
        except Exception:
            # Keep going with other pairs even if one fails entirely.
            LOG.error("Backfill failed for %s - %s",
                      exchange, pair, exc_info=True)