def loader_func(**kwargs):
    """Load one sheet of an Excel workbook (local path or URL) into a DataFrame.

    Pops ``path``, ``sheet`` and ``proxy`` from ``kwargs``; the remaining
    kwargs that match ``LOADER_PROPS`` are forwarded to ``pd.read_excel``.

    :raises Exception: if the download fails, pandas returns no data, or the
        requested sheet is missing.
    """
    path = kwargs.pop("path")
    # xlrd only supports legacy .xls; openpyxl handles .xlsx and friends
    engine = "xlrd" if path.endswith("xls") else "openpyxl"
    sheet_name = kwargs.pop("sheet", None)
    if path.startswith("http://") or path.startswith("https://"):  # add support for URLs
        proxy = kwargs.pop("proxy", None)
        req_kwargs = {}
        if proxy is not None:
            req_kwargs["proxies"] = dict(http=proxy, https=proxy)
        resp = requests.get(path, **req_kwargs)
        # was "assert resp.status_code == 200": assert is stripped under
        # "python -O", so validate explicitly and raise instead
        if resp.status_code != 200:
            raise Exception(
                "Failed to load Excel file from {} (HTTP {})".format(
                    path, resp.status_code
                )
            )
        path = BytesIO(resp.content) if PY3 else StringIO(resp.content.decode("utf-8"))
    dfs = pd.read_excel(
        path,
        sheet_name=sheet_name,
        engine=engine,
        **{k: v for k, v in kwargs.items() if k in loader_prop_keys(LOADER_PROPS)}
    )
    if dfs is None or not len(dfs):
        raise Exception("Failed to load Excel file. Returned no data.")
    if sheet_name:
        # NOTE(review): with a string sheet_name pandas returns a single
        # DataFrame, so this membership test looks at column labels, not
        # sheet names — verify against callers before relying on it
        if sheet_name not in dfs:
            raise Exception(
                "Excel file loaded but there was no sheet named '{}'.".format(
                    sheet_name
                )
            )
        return dfs[sheet_name]
    # this is required because there is no support for loading multiple datasets at once from the CLI
    # I can add this later...
    return dfs[list(dfs.keys())[0]]
def loader_func(**kwargs):
    """Read a CSV (local path or URL, resolved by ``handle_path``) into a DataFrame."""
    path = handle_path(kwargs.pop("path"), kwargs)
    prop_keys = loader_prop_keys(LOADER_PROPS)
    read_kwargs = {k: kwargs[k] for k in kwargs if k in prop_keys}
    return pd.read_csv(path, **read_kwargs)
def loader_func(**kwargs):
    """Read a parquet file into a DataFrame.

    :raises ImportError: if neither pyarrow nor fastparquet is installed.
    """
    # prefer pyarrow, fall back to fastparquet; fail loudly if neither exists
    try:
        import pyarrow  # noqa: F401

        backend_available = True
    except ImportError:
        backend_available = False
    if not backend_available:
        try:
            import fastparquet  # noqa: F401
        except ImportError:
            raise ImportError(
                "In order to use the parquet loader you must install either pyarrow or fastparquet!"
            )
    file_path = kwargs.pop("path")
    prop_keys = loader_prop_keys(LOADER_PROPS)
    return pd.read_parquet(
        file_path, **{k: kwargs[k] for k in kwargs if k in prop_keys}
    )
def loader_func(**kwargs):
    """Load a CSV (local path or URL) into a DataFrame.

    Pops ``path`` and ``proxy`` from ``kwargs``; remaining kwargs matching
    ``LOADER_PROPS`` are forwarded to ``pd.read_csv``.

    :raises Exception: if the URL download does not return HTTP 200.
    """
    path = kwargs.pop("path")
    if path.startswith("http://") or path.startswith(
        "https://"
    ):  # add support for URLs
        proxy = kwargs.pop("proxy", None)
        req_kwargs = {}
        if proxy is not None:
            req_kwargs["proxies"] = dict(http=proxy, https=proxy)
        resp = requests.get(path, **req_kwargs)
        # was "assert resp.status_code == 200": assert is stripped under
        # "python -O", so validate explicitly and raise instead
        if resp.status_code != 200:
            raise Exception(
                "Failed to load CSV from {} (HTTP {})".format(path, resp.status_code)
            )
        path = BytesIO(resp.content)
    return pd.read_csv(
        path,
        **{k: v for k, v in kwargs.items() if k in loader_prop_keys(LOADER_PROPS)}
    )
def loader_func(**kwargs):
    """Load JSON data (local path or URL) into a DataFrame.

    When ``normalize`` is truthy the payload is flattened with
    ``json_normalize``; otherwise it is handed to ``pd.read_json``.

    :raises Exception: if the URL download does not return HTTP 200.
    """
    path = kwargs.pop('path')
    normalize = kwargs.pop('normalize', False)
    if path.startswith('http://') or path.startswith('https://'):  # add support for URLs
        proxy = kwargs.pop('proxy', None)
        req_kwargs = {}
        if proxy is not None:
            req_kwargs['proxies'] = dict(http=proxy, https=proxy)
        resp = requests.get(path, **req_kwargs)
        # was "assert resp.status_code == 200": assert is stripped under
        # "python -O", so validate explicitly and raise instead
        if resp.status_code != 200:
            raise Exception(
                'Failed to load JSON from {} (HTTP {})'.format(path, resp.status_code)
            )
        # json_normalize wants parsed objects; read_json accepts raw text
        path = resp.json() if normalize else resp.text
    if normalize:
        # json_normalize moved to the top-level pandas namespace in 1.0
        normalize_func = pd.json_normalize if is_pandas1() else pd.io.json.json_normalize
        return normalize_func(path, **kwargs)
    return pd.read_json(
        path,
        **{k: v for k, v in kwargs.items() if k in loader_prop_keys(LOADER_PROPS)}
    )
def load_file(sheet_name=None, **kwargs):
    """Read an Excel workbook and return the parsed sheet(s).

    :param sheet_name: forwarded to ``pd.read_excel`` (``None`` loads all sheets)
    :raises Exception: when pandas returns no data.
    """
    file_path = kwargs.pop("path")
    # legacy .xls needs the xlrd engine; anything else goes through openpyxl
    if file_path.endswith("xls"):
        engine = "xlrd"
    else:
        engine = "openpyxl"
    file_path = handle_path(file_path, kwargs)
    prop_keys = loader_prop_keys(LOADER_PROPS)
    dfs = pd.read_excel(
        file_path,
        sheet_name=sheet_name,
        engine=engine,
        **{k: kwargs[k] for k in kwargs if k in prop_keys}
    )
    if dfs is None or not len(dfs):
        raise Exception("Failed to load Excel file. Returned no data.")
    return dfs
def loader_func(**kwargs):
    """Load JSON (local path or URL) into a DataFrame, optionally flattening it."""
    normalize = kwargs.pop("normalize", False)

    def resp_handler(resp):
        # json_normalize needs parsed objects; read_json takes raw text
        if normalize:
            return resp.json()
        return resp.text

    path = handle_path(kwargs.pop("path"), kwargs, resp_handler=resp_handler)
    if normalize:
        # json_normalize moved to the top-level pandas namespace in 1.0
        if is_pandas1():
            normalize_func = pd.json_normalize
        else:
            normalize_func = pd.io.json.json_normalize
        return normalize_func(path, **kwargs)
    prop_keys = loader_prop_keys(LOADER_PROPS)
    return pd.read_json(path, **{k: kwargs[k] for k in kwargs if k in prop_keys})
def loader_func(**kwargs):
    """Load a CSV (local path or URL) into a DataFrame.

    Pops ``path`` and ``proxy`` from ``kwargs``; remaining kwargs matching
    ``LOADER_PROPS`` are forwarded to ``pd.read_csv``.

    :raises Exception: if the URL download does not return HTTP 200.
    """
    path = kwargs.pop('path')
    if path.startswith('http://') or path.startswith(
            'https://'):  # add support for URLs
        proxy = kwargs.pop('proxy', None)
        req_kwargs = {}
        if proxy is not None:
            req_kwargs['proxies'] = dict(http=proxy, https=proxy)
        resp = requests.get(path, **req_kwargs)
        # was "assert resp.status_code == 200": assert is stripped under
        # "python -O", so validate explicitly and raise instead
        if resp.status_code != 200:
            raise Exception(
                'Failed to load CSV from {} (HTTP {})'.format(path, resp.status_code)
            )
        # py3 can feed bytes straight to pandas; py2 needs a decoded text buffer
        path = BytesIO(resp.content) if PY3 else StringIO(
            resp.content.decode('utf-8'))
    return pd.read_csv(
        path, **{
            k: v for k, v in kwargs.items()
            if k in loader_prop_keys(LOADER_PROPS)
        })
def loader_func(**kwargs):
    """Load an Excel workbook (via ``handle_path``) and return a single DataFrame.

    :raises Exception: if pandas returns no data or the requested sheet is missing.
    """
    file_path = kwargs.pop("path")
    # legacy .xls requires xlrd; everything else goes through openpyxl
    engine = "xlrd" if file_path.endswith("xls") else "openpyxl"
    sheet_name = kwargs.pop("sheet", None)
    file_path = handle_path(file_path, kwargs)
    prop_keys = loader_prop_keys(LOADER_PROPS)
    dfs = pd.read_excel(
        file_path,
        sheet_name=sheet_name,
        engine=engine,
        **{k: kwargs[k] for k in kwargs if k in prop_keys}
    )
    if dfs is None or not len(dfs):
        raise Exception("Failed to load Excel file. Returned no data.")
    if sheet_name:
        if sheet_name not in dfs:
            raise Exception(
                "Excel file loaded but there was no sheet named '{}'.".format(
                    sheet_name
                )
            )
        return dfs[sheet_name]
    # this is required because there is no support for loading multiple datasets at once from the CLI
    # I can add this later...
    return dfs[list(dfs.keys())[0]]