Пример #1
0
def download_file(url,
                  data_path=BIGDATA_PATH,
                  filename=None,
                  size=None,
                  chunk_size=4096,
                  verbose=True):
    """Stream `url` into a file under `data_path` and return the local file path.

    Uses stream=True and a reasonable chunk size to be able to download large (GB) files over https.

    Args:
        url: a URL string, or an iterable of URL strings (each one is downloaded in turn).
        data_path: directory that the file is written into.
        filename: local file name; when None it is derived from the URL with `dropbox_basename`.
        size: expected remote size in bytes; when None the `Content-Length` header is used.
            A missing or non-numeric size is treated as unknown (-1).
        chunk_size: number of bytes requested per streamed chunk.
        verbose: when True, wrap the download in a `tqdm` progress bar and log the data directory.

    Returns:
        The local file path, or a list of local file paths when `url` is an iterable of URLs.

    Note:
        An existing local file is kept (not downloaded again) when its size is at least the remote
        size. Because an unknown remote size is represented as -1, an existing file is then kept too.
    """
    if not isinstance(url, (basestring, str)):
        # Handle collections first: the default file name has to be derived from each
        # individual URL, so `dropbox_basename` must not be applied to the collection itself.
        return [
            download_file(s,
                          data_path=data_path,
                          filename=filename,
                          size=size,
                          chunk_size=chunk_size,
                          verbose=verbose) for s in url
        ]
    if filename is None:
        filename = dropbox_basename(url)
    filepath = expand_filepath(os.path.join(data_path, filename))
    if url.endswith('dl=0'):
        url = url[:-1] + '1'  # noninteractive Dropbox download
    tqdm_prog = tqdm if verbose else no_tqdm
    logger.info('requesting URL: {}'.format(url))
    r = requests.get(url, stream=True, allow_redirects=True)
    try:
        size = r.headers.get('Content-Length', -1) if size is None else size
        try:
            size = int(size)
        except (TypeError, ValueError):
            size = -1  # unknown remote size

        logger.info('remote size: {}'.format(size))

        stat = path_status(filepath)
        logger.info('local size: {}'.format(stat.get('size', None)))
        if stat['type'] == 'file' and stat[
                'size'] >= size:  # TODO: check md5 or get the right size of remote file
            logger.info('retained: {}'.format(filepath))
            return filepath

        filedir = os.path.dirname(filepath)
        created_dir = mkdir_p(filedir)
        if verbose:
            logger.info('data path created: {}'.format(created_dir))
        assert os.path.isdir(filedir)
        assert created_dir.endswith(filedir)

        # Only give the progress bar a total when the remote size is actually known;
        # otherwise the chunk count computed from -1 would be meaningless.
        if size > 0 and chunk_size:
            total = int(ceil(size / float(chunk_size)))
        else:
            total = None
        with open(filepath, 'wb') as f:
            for chunk in tqdm_prog(r.iter_content(chunk_size=chunk_size),
                                   total=total):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
        # Logged after the write loop so the message is only emitted once the file is complete.
        logger.info('downloaded: {}'.format(filepath))
    finally:
        # Always release the streamed connection, including on errors and on the early return.
        r.close()

    logger.debug('nlpia.loaders.download_file: return filepath=' +
                 str(filepath))
    return filepath
Пример #2
0
def download_file(url,
                  data_path=BIGDATA_PATH,
                  filename=None,
                  size=None,
                  chunk_size=4096,
                  verbose=True):
    """Stream `url` into a file under `data_path` and return the local file path.

    Uses stream=True and a reasonable chunk size to be able to download large (GB) files over https.

    Args:
        url: URL string to download.
        data_path: directory that the file is written into.
        filename: local file name; when None it is derived from the URL with `dropbox_basename`.
        size: expected remote size in bytes; when None the `Content-Length` header is used.
        chunk_size: number of bytes requested per streamed chunk.
        verbose: when True, print the requested URL and wrap the download in a `tqdm` progress bar.

    Returns:
        The local file path (`data_path` joined with `filename`).

    Note:
        An existing local file is kept (not downloaded again) when its size is at least the remote
        size. A missing or non-numeric remote size is treated as -1, so an existing file is then
        kept as well.
    """
    if filename is None:
        filename = dropbox_basename(url)
    file_path = os.path.join(data_path, filename)
    if url.endswith('?dl=0'):
        url = url[:-1] + '1'  # noninteractive download
    if verbose:
        tqdm_prog = tqdm
        print('requesting URL: {}'.format(url))
    else:
        tqdm_prog = no_tqdm
    r = requests.get(url, stream=True, allow_redirects=True)
    try:
        size = r.headers.get('Content-Length', None) if size is None else size
        print('remote size: {}'.format(size))
        # Header values are strings (or None when absent); normalise to an int so the
        # comparison with the local size below is numeric rather than int-vs-str.
        try:
            remote_size = int(size)
        except (TypeError, ValueError):
            remote_size = -1  # unknown remote size

        stat = path_status(file_path)
        print('local size: {}'.format(stat.get('size', None)))
        if stat['type'] == 'file' and stat[
                'size'] >= remote_size:  # TODO: check md5 or get the right size of remote file
            return file_path

        print('Downloading to {}'.format(file_path))

        with open(file_path, 'wb') as f:
            for chunk in tqdm_prog(r.iter_content(chunk_size=chunk_size)):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    finally:
        # Always release the streamed connection, including on errors and on the early return.
        r.close()

    return file_path