Example #1
def save_webfile(url: str, fpath: str):
    """ Save file from web. """
    size = download(url, fpath)
    if size == 0:
        raise ExternalSourceError("Empty file")

    _format = identify_file_format(fpath)

    if not _format:
        raise ExternalSourceError("File's signature does not match its type.")

    return _format, size
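
A minimal usage sketch of save_webfile, assuming it and ExternalSourceError are importable from the project (the module name below is hypothetical) and using placeholder URL and path:

# hypothetical module path; adjust to wherever these helpers actually live
from webfile_helpers import save_webfile, ExternalSourceError

try:
    file_format, size = save_webfile('https://example.com/report.xlsx', '/tmp/report.xlsx')
    print(f'Saved {size} bytes, detected format: {file_format}')
except ExternalSourceError as err:
    print(f'Could not save the file: {err}')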
Example #2
def extract_by_fileslist(apath, files_list):
    """ Extract files from an archive. Supports only rar and zip archives. """
    # identify archive format
    frmt = identify_file_format(apath)

    if not frmt:
        raise ExternalSourceError("Not supported format")

    arch_obj = None

    if frmt == 'zip':
        arch_obj = ZipFile(apath)
    elif frmt == 'rar':
        arch_obj = RarFile(apath)

    paths = []

    for f in files_list:
        folder = os.path.abspath(os.path.dirname(f))
        fname = os.path.basename(f)
        for _f in arch_obj.namelist():
            if os.path.basename(_f) == fname:
                arch_obj.extract(_f, folder)
                # build the path of the file that was just extracted
                src = os.path.join(folder, _f).replace('/', os.sep)
                move(src, f)
                paths.append(f)

    return paths
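
A minimal usage sketch of extract_by_fileslist; the archive path and destination paths are placeholders, and each destination's basename is assumed to match a member of the archive:

# hypothetical module path for the extractor and its error type
from archive_helpers import extract_by_fileslist, ExternalSourceError

try:
    extracted = extract_by_fileslist(
        '/tmp/report.zip',
        ['/tmp/data/prices.xlsx', '/tmp/data/volumes.xlsx'],
    )
    print('Extracted:', extracted)
except ExternalSourceError as err:
    print(f'Extraction failed: {err}')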
Example #3
def parse_cut_id(self):
    try:
        r = requests.get(rcuts_url)
        rcuts = r.json()
        cut_date = self.month_as_cut_date()
        try:
            # pick the cut whose name contains the requested date
            rcut = list(
                filter(lambda _rcut: cut_date in Box(_rcut).name,
                       rcuts)).pop()
            rcut_id = Box(rcut).id
        except IndexError:
            raise ExternalSourceError(f'No rcut on {cut_date}')
        # _cut_id = rcuts[self.month]["id"]
        return rcut_id
    except (ConnectionError, HTTPError, Timeout) as e:
        raise ExternalSourceError(f'{host} rcuts not available')
Example #4
def parse_cut_id():
    try:
        r = get(rcuts_url)
        rcuts = json.loads(r)
        _cut_id = rcuts[month]["id"]
        return _cut_id
    except (ConnectionError, HTTPError, Timeout) as e:
        raise ExternalSourceError('stat.gov.kz rcuts not available')
Example #5
def parse_json_from_js(raw, pattern):
    # normalize the raw page to utf-8 bytes before parsing
    raw = codecs.encode(raw, encoding="utf8")
    soup = BeautifulSoup(raw, 'lxml')
    # join all <script> blocks and search them for the embedded JSON
    scripts = ''.join([str(s) for s in soup.find_all('script')])
    r = re.search(pattern, scripts)
    if r:
        # the JSON payload is expected in the second capturing group
        return json.loads(r.group(2))
    else:
        raise ExternalSourceError('Javascript data not found')
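
A minimal usage sketch of parse_json_from_js with a hypothetical page snippet; the pattern must place the JSON literal in its second capturing group, which is what the function decodes:

# hypothetical HTML and pattern; group(2) captures the JSON object literal
raw_html = '<html><script>var data = {"rows": [1, 2, 3]};</script></html>'
pattern = r'(var data = )(\{.*?\});'
data = parse_json_from_js(raw_html, pattern)
print(data['rows'])  # [1, 2, 3]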
Example #6
def load_data(url, struct):

    r = get(url, HEADERS)

    # parse the response body (the project's `get` helper is assumed to return text)
    raw = json.loads(r)

    if isinstance(raw, dict):
        o = Box(raw)
        if hasattr(o, 'error'):
            # raise an error if the response holds an error dict instead of data
            raise ExternalSourceError(o.error)

    return [dict_to_csvrow(d, struct) for d in raw]
Example #7
def extract_by_wildcard(arch_fpath: str,
                        directory: str = None,
                        wildcard: str = '*.xlsx',
                        names=None):
    """ Extract files from archive. Supports only zip and rar formats. """
    frmt = identify_file_format(arch_fpath)

    # detect archive format
    if not frmt:
        raise ExternalSourceError("Not supported format")

    if frmt == 'rar':
        arch_obj = RarFile(arch_fpath)
    else:
        arch_obj = ZipFile(arch_fpath)

    if directory:
        _dir = directory
    else:
        _dir = os.path.abspath(os.path.dirname(arch_fpath))

    # filter by wildcard
    _flist = fnmatch.filter(arch_obj.namelist(), wildcard)

    if names:
        _flist = _flist[:len(names)]

    extracted_files_list = []

    # extracting
    for i, f in enumerate(_flist):
        _fname = os.path.basename(f)
        for _f in arch_obj.namelist():
            if os.path.basename(_f) == _fname:
                arch_obj.extract(_f, _dir)
                src = os.path.join(_dir, _f).replace('/', os.sep)
                dest = os.path.join(_dir, _fname)
                if names:
                    dest = os.path.join(_dir, names[i])
                if _fname:
                    move(src, dest)
                    extracted_files_list.append(dest)

    return extracted_files_list
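
A minimal usage sketch of extract_by_wildcard with a placeholder archive path; when names is given, the matched members are renamed in order:

# extract every .xlsx member next to the archive itself
xlsx_paths = extract_by_wildcard('/tmp/stats.zip', wildcard='*.xlsx')

# extract into a chosen directory and rename the first two matches
renamed = extract_by_wildcard(
    '/tmp/stats.zip',
    directory='/tmp/extracted',
    wildcard='*.xlsx',
    names=['prices.xlsx', 'volumes.xlsx'],
)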
Example #8
def download(url, fpath):
    """ Download file using stream """
    try:
        # verify is always set to False because no valid certificate
        # is available inside the Kazakhtelecom network
        with requests.get(url, stream=True, verify=False) as r:
            r.raise_for_status()
            f_size = 0
            with open(fpath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        f_size += len(chunk)

        return f_size

    except (ConnectionError, HTTPError) as e:
        if os.path.exists(fpath):
            os.remove(fpath)
        raise ExternalSourceError('Could not download file {}'.format(fpath))
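
Because verify=False makes urllib3 emit an InsecureRequestWarning on every request, callers often silence that warning once at import time; a minimal sketch, assuming the trade-off described in the comment above is acceptable:

import urllib3

# suppress the warning urllib3 raises for unverified HTTPS requests
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)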