def get_browser(browser, headless=True, download='', *args):
    """
    Get a selenium browser instance

    Args:
        browser: one of 'firefox' or 'chrome'
        headless: whether to run the browser in headless mode
        download: folder to download files to - relative to ~/Downloads
                  files in the folder WILL BE REMOVED when browser is returned

    Returns:
        Browser instance with the requested options applied
    """
    br, br_opt = BROWSERS[browser]
    if headless:
        br_opt.add_argument('--headless')
        br_opt.add_argument('--disable-gpu')
    # Experimental options exist on Chrome / Edge options only - skip them
    # for browsers (e.g. Firefox) whose options object lacks the method
    if hasattr(br_opt, 'add_experimental_option'):
        br_opt.add_experimental_option('excludeSwitches', ['enable-logging'])
    if download:
        dl_path = get_rel_path(folder=f'Downloads/{download}')
        # Start from a clean download folder
        if files.exists(dl_path):
            shutil.rmtree(dl_path, ignore_errors=True)
        files.create_folder(dl_path)
        if hasattr(br_opt, 'add_experimental_option'):
            br_opt.add_experimental_option(
                'prefs', {
                    'download.default_directory': dl_path,
                    'download.prompt_for_download': False,
                    'download.directory_upgrade': True,
                })
    for arg in args:
        br_opt.add_argument(arg)
    return br(options=br_opt)
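# Usage sketch (hypothetical URL; assumes BROWSERS maps browser names to
# (selenium webdriver class, options instance) pairs, e.g.
# {'chrome': (webdriver.Chrome, webdriver.ChromeOptions())}):
#     >>> # browser = get_browser('chrome', headless=True, download='reports')
#     >>> # browser.get('https://example.com')  # files land in ~/Downloads/reports
#     >>> # browser.quit()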
def save_data(data, file_fmt, append=False, drop_dups=None, info=None, **kwargs):
    """
    Save data to file

    Args:
        data: pd.DataFrame
        file_fmt: data file format in terms of f-strings
        append: whether to append data to existing data
        drop_dups: list, drop duplicates in columns
        info: dict, information to be hashed and passed to f-strings
        **kwargs: additional parameters for f-strings

    Examples:
        >>> data = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
        >>> # save_data(
        >>> #     data, '{ROOT}/daily/{typ}.parq',
        >>> #     ROOT='tests/data', typ='earnings'
        >>> # )
    """
    d_file = data_file(file_fmt=file_fmt, info=info, **kwargs)
    if append and files.exists(d_file):
        data = pd.DataFrame(
            pd.concat([pd.read_parquet(d_file), data], sort=False))
    if drop_dups is not None:
        data.drop_duplicates(subset=utils.tolist(drop_dups), inplace=True)
    if not data.empty:
        data.to_parquet(d_file)
    return data
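# Usage sketch for incremental saves (hypothetical ROOT / typ values;
# appends to the existing parquet and de-duplicates on column 'a'):
#     >>> # save_data(
#     >>> #     data, '{ROOT}/daily/{typ}.parq', append=True, drop_dups=['a'],
#     >>> #     ROOT='tests/data', typ='earnings',
#     >>> # )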
def load_file(data_file: str, load_func=None, **kwargs):
    """
    Load data from cache

    Args:
        data_file: data file path
        load_func: custom callable to load the file
        **kwargs: log: logging level

    Returns:
        Loaded data, or None if the file is missing or the format is unsupported
    """
    logger = logs.get_logger(load_file, level=kwargs.get('log', 'info'))
    if (not data_file) or (not files.exists(data_file)):
        return
    if callable(load_func):
        return load_func(data_file)
    ext = data_file.split('.')[-1]
    if ext not in LOAD_FUNC:
        return
    logger.debug(f'Reading from {data_file} ...')
    return LOAD_FUNC[ext](data_file)
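# Usage sketch (assumes LOAD_FUNC maps file extensions to reader callables,
# e.g. {'parq': pd.read_parquet, 'pkl': pd.read_pickle}; paths hypothetical):
#     >>> # cached = load_file('tests/data/daily/earnings.parq')
#     >>> # Or with a custom loader for an unsupported extension:
#     >>> # cached = load_file('tests/data/daily/earnings.csv', load_func=pd.read_csv)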
def from_json(cls, json_file):
    """
    Instantiate class from json file

    Args:
        json_file: json file path

    Returns:
        Class instance
    """
    if not files.exists(json_file):
        raise FileNotFoundError(
            f'{json_file} does not exist - cannot instantiate {cls.__name__}')
    with open(json_file, 'r') as fp:
        return cls(**json.load(fp=fp))
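# Usage sketch, assuming from_json is bound as a classmethod on the target
# class (hypothetical class and file; json keys must match __init__ kwargs):
#     >>> # class Config:
#     >>> #     def __init__(self, host, port): ...
#     >>> #     from_json = classmethod(from_json)
#     >>> # cfg = Config.from_json('config.json')  # {"host": "...", "port": 8080}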
def wrapper(*args, **kwargs):
    # Merge decorator defaults with call-time kwargs (call-time values win)
    default.update(kwargs)
    kwargs.update(default)
    cur_mod = sys.modules[func.__module__]
    logger = logs.get_logger(
        name_or_func=f'{cur_mod.__name__}.{func.__name__}', types='stream')
    root_path = cur_mod.DATA_PATH
    date_type = kwargs.pop('date_type', 'date')
    save_static = kwargs.pop('save_static', True)
    save_dynamic = kwargs.pop('save_dynamic', True)
    symbol = kwargs.get('symbol')
    file_kw = dict(
        func=func, symbol=symbol, root=root_path, date_type=date_type)
    # Dynamic file carries a date stamp (history); static file is the
    # undated latest snapshot
    d_file = cache_file(has_date=True, **file_kw)
    s_file = cache_file(has_date=False, **file_kw)

    # Serve the static snapshot directly when caching is requested
    cached = kwargs.pop('cached', False)
    if cached and save_static and files.exists(s_file):
        logger.info(f'Reading data from {s_file} ...')
        return pd.read_parquet(s_file)

    data = func(*args, **kwargs)

    if save_static:
        files.create_folder(s_file, is_file=True)
        save_data(data=data, file_fmt=s_file, append=False)
        logger.info(f'Saved data file to {s_file} ...')

    if save_dynamic:
        drop_dups = kwargs.pop('drop_dups', None)
        files.create_folder(d_file, is_file=True)
        save_data(
            data=data, file_fmt=d_file, append=True, drop_dups=drop_dups)
        logger.info(f'Saved data file to {d_file} ...')

    return data
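# Usage sketch: behaviour once the enclosing decorator factory applies this
# wrapper (hypothetical decorator name and function; module must define
# DATA_PATH):
#     >>> # @save_static_dynamic  # hypothetical name for the factory above
#     >>> # def earnings(symbol, **kwargs): ...
#     >>> # earnings(symbol='AAPL')               # computes, saves static + dated copies
#     >>> # earnings(symbol='AAPL', cached=True)  # short-circuits to the static parquet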
def wrapper(*args, **kwargs):
    # Check function parameters
    param = inspect.signature(func).parameters
    all_kw = {
        k: args[n] if n < len(args) else v.default
        for n, (k, v) in enumerate(param.items())
    }
    all_kw.update(utils.func_kwarg(func=func, **kwargs))
    kwargs.update(all_kw)

    # Data path and file name
    cur_dt = utils.cur_time(
        trading=False, tz=kwargs.get('_tz_', utils.DEFAULT_TZ),
    )
    if data_root:
        root_path = data_root
    else:
        root_path = getattr(sys.modules[func.__module__], 'DATA_PATH')
    if file_fmt:
        file_name = target_file_name(fmt=file_fmt, **all_kw)
    else:
        file_name = f'{func.__name__}/[date].pkl'
    if callable(file_func):
        name_pattern = ''
        data_file = f'{root_path}/{file_func(**kwargs)}'
    else:
        name_pattern = (
            f'{root_path}/{file_name}'
            .replace('\\', '/')
            .replace('[today]', '[date]')
        )
        data_file = name_pattern.replace('[date]', cur_dt)

    # Reload data and override cache if necessary
    use_cache = not kwargs.get('_reload_', False)

    # Load data if exists
    if files.exists(data_file) and use_cache:
        return load_file(data_file=data_file, load_func=load_func, **kwargs)

    # Load data if it was updated within update frequency
    if update_freq and use_cache and ('[date]' in name_pattern):
        start_dt = pd.date_range(end=cur_dt, freq=update_freq, periods=2)[0]
        for dt in pd.date_range(start=start_dt, end=cur_dt, normalize=True)[1:][::-1]:
            cur_file = name_pattern.replace('[date]', dt.strftime('%Y-%m-%d'))
            if files.exists(cur_file):
                return load_file(data_file=cur_file, load_func=load_func, **kwargs)

    # Retrieve data
    data = func(**all_kw)

    # Save data to cache
    save_file(data=data, data_file=data_file, save_func=save_func, **kwargs)
    return data
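# Usage sketch for the enclosing cache decorator (hypothetical decorator name
# and arguments; '[date]' in file_fmt enables the dated lookback within
# update_freq):
#     >>> # @with_cache(file_fmt='prices/[date].parq', update_freq='3D')
#     >>> # def prices(ticker='SPY'): ...
#     >>> # prices()                 # served from any cache file within 3 days
#     >>> # prices(_reload_=True)    # bypasses cache and regenerates today's file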