def save_webfile(url: str, fpath: str):
    """Download a file from the web into *fpath* and validate it.

    Returns a ``(format, size)`` tuple on success.
    Raises ExternalSourceError when the download is empty or the file's
    signature does not match any known format.
    """
    byte_count = download(url, fpath)
    if not byte_count:
        raise ExternalSourceError("Empty file")

    file_format = identify_file_format(fpath)
    if not file_format:
        raise ExternalSourceError("File's signature does not match its type.")

    return file_format, byte_count
def extract_by_fileslist(apath, files_list):
    """Extract files from archive. Supports only rar, zip archives.

    For each target path in *files_list*, find an archive member with the
    same basename, extract it next to the target and move it into place.
    Returns the list of extracted target paths.
    Raises ExternalSourceError for unknown or unsupported archive formats.
    """
    # identify format
    frmt = identify_file_format(apath)
    if not frmt:
        raise ExternalSourceError("Not supported format")
    if frmt == 'zip':
        arch_obj = ZipFile(apath)
    elif frmt == 'rar':
        arch_obj = RarFile(apath)
    else:
        # BUG FIX: any other recognized format used to fall through with
        # arch_obj=None and crash with AttributeError below; fail explicitly.
        raise ExternalSourceError("Not supported format")

    paths = []
    try:
        for f in files_list:
            folder = os.path.abspath(os.path.dirname(f))
            fname = os.path.basename(f)
            for _f in arch_obj.namelist():
                if os.path.basename(_f) == fname:
                    arch_obj.extract(_f, folder)
                    # build path for just new extracted file
                    src = os.path.join(folder, _f).replace('/', os.sep)
                    move(src, f)
                    paths.append(f)
    finally:
        # BUG FIX: the archive handle was never closed (resource leak).
        arch_obj.close()
    return paths
def parse_cut_id():
    """Return the id of the rcut whose name contains the current cut date.

    NOTE(review): this function uses ``self`` but declares no ``self``
    parameter — presumably it is a closure nested inside a method where
    ``self`` is captured from the enclosing scope; confirm against the
    full file. ``rcuts_url`` and ``host`` are also taken from outer scope.

    Raises ExternalSourceError when no matching rcut exists or the host
    is unreachable.
    """
    try:
        # Fetch the full list of rcuts as JSON.
        r = requests.get(rcuts_url)
        rcuts = r.json()
        cut_date = self.month_as_cut_date()
        try:
            # Keep only rcuts whose name mentions the cut date and take
            # the last match; .pop() on an empty list raises IndexError.
            rcut = list(
                filter(lambda _rcut: cut_date in Box(_rcut).name,
                       rcuts)).pop()
            rcut_id = Box(rcut).id
        except IndexError:
            raise ExternalSourceError(f'No rcut on {cut_date}')
        return rcut_id
    except (ConnectionError, HTTPError, Timeout) as e:
        raise ExternalSourceError(f'{host} rcuts not available')
def parse_cut_id():
    """Return the rcut id for the configured month from stat.gov.kz.

    ``get``, ``rcuts_url`` and ``month`` come from the enclosing scope.
    Raises ExternalSourceError when the service is unreachable.
    """
    try:
        payload = get(rcuts_url)
        rcuts = json.loads(payload)
        return rcuts[month]["id"]
    except (ConnectionError, HTTPError, Timeout) as e:
        raise ExternalSourceError('stat.gov.kz rcuts not available')
def parse_json_from_js(raw, pattern):
    """Extract a JSON payload embedded in the <script> tags of an HTML page.

    *pattern* is a regex whose second group captures the JSON text.
    Raises ExternalSourceError when no script matches the pattern.
    """
    encoded = codecs.encode(raw, encoding="utf8")
    soup = BeautifulSoup(encoded, 'lxml')
    script_text = ''.join(str(tag) for tag in soup.find_all('script'))

    match = re.search(pattern, script_text)
    if not match:
        raise ExternalSourceError('Javascript data not found')
    return json.loads(match.group(2))
def load_data(url, struct):
    """Fetch *url* and convert its JSON payload into CSV rows via *struct*.

    Raises ExternalSourceError if the service answered with an error object
    instead of data.
    """
    r = get(url, HEADERS)
    # BUG FIX: json.loads() was called with no argument, a guaranteed
    # TypeError at runtime. Parse the response body returned by get()
    # (the sibling parse_cut_id does json.loads(r) the same way —
    # presumably get() returns the raw text; confirm against its def).
    raw = json.loads(r)
    if isinstance(raw, dict):
        o = Box(raw)
        if hasattr(o, 'error'):
            # raise error if instead of data we get error dict in response
            raise ExternalSourceError(o.error)
    return [dict_to_csvrow(d, struct) for d in raw]
def extract_by_wildcard(arch_fpath: str, directory: str = None,
                        wildcard: str = '*.xlsx', names=None):
    """Extract files from archive. Supports only zip and rar formats.

    Members matching *wildcard* are extracted into *directory* (defaults
    to the archive's own directory). When *names* is given, only the first
    ``len(names)`` matches are taken and each one is renamed accordingly.
    Returns the list of extracted destination paths.
    Raises ExternalSourceError for an unrecognized archive format.
    """
    frmt = identify_file_format(arch_fpath)  # detect archive format
    if not frmt:
        raise ExternalSourceError("Not supported format")
    if frmt == 'rar':
        arch_obj = RarFile(arch_fpath)
    else:
        arch_obj = ZipFile(arch_fpath)

    if directory:
        _dir = directory
    else:
        _dir = os.path.abspath(os.path.dirname(arch_fpath))

    extracted_files_list = []
    try:
        # filter by wildcard
        _flist = fnmatch.filter(arch_obj.namelist(), wildcard)
        if names:
            _flist = _flist[:len(names)]
        # extracting
        for i, f in enumerate(_flist):
            _fname = os.path.basename(f)
            for _f in arch_obj.namelist():
                if os.path.basename(_f) == _fname:
                    arch_obj.extract(_f, _dir)
                    src = os.path.join(_dir, _f).replace('/', os.sep)
                    dest = os.path.join(_dir, _fname)
                    if names:
                        dest = os.path.join(_dir, names[i])
                    if _fname:
                        move(src, dest)
                        extracted_files_list.append(dest)
    finally:
        # BUG FIX: the archive handle was never closed (resource leak).
        arch_obj.close()
    return extracted_files_list
def download(url, fpath):
    """Download file using stream.

    Writes *url*'s body to *fpath* in 8 KiB chunks and returns the number
    of bytes written. On network failure the partial file is removed and
    ExternalSourceError is raised.
    """
    try:
        # we always specify verify to False
        # cause we don't use certificate into
        # Kazakhtelecom network
        # NOTE(review): verify=False disables TLS validation entirely.
        with requests.get(url, stream=True, verify=False) as r:
            r.raise_for_status()
            f_size = 0
            with open(fpath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        f_size += len(chunk)
            return f_size
    except (ConnectionError, HTTPError, Timeout) as e:
        # BUG FIX: Timeout was not caught, so a timed-out request escaped
        # as a raw exception AND left a partial file on disk; the sibling
        # fetchers (parse_cut_id) already treat Timeout as an
        # ExternalSourceError.
        if os.path.exists(fpath):
            os.remove(fpath)
        raise ExternalSourceError('Could not download file {}'.format(fpath))