示例#1
0
 def test_recursive_rsync(self, tmpdir, volume_ids, pairtree_volume_paths):
     """Download every id in pairtree format, keeping the directory layout.

     Calls ``utils.download_file`` with ``keep_dirs=True`` and then asserts
     that each entry of ``pairtree_volume_paths`` exists relative to the
     output directory.
     """
     # NOTE(review): the output directory is ``tmpdir.dirname`` rather than
     # ``tmpdir`` itself -- presumably the parent of the fixture directory;
     # confirm this is intended.
     out_root = tmpdir.dirname
     utils.download_file(
         htids=volume_ids,
         outdir=out_root,
         keep_dirs=True,
         format='pairtree',
     )
     for rel_path in pairtree_volume_paths:
         downloaded = os.path.join(out_root, rel_path)
         assert os.path.exists(downloaded)
示例#2
0
 def test_rsync_single_file(self, tmpdir, volume_ids,
                            pairtree_volume_paths):
     """Download only the first id (pairtree format) into a flat directory.

     The expected file name is the final path component of the first entry
     of ``pairtree_volume_paths``; the test asserts that a file with that
     name exists directly inside the output directory.
     """
     out_root = tmpdir.dirname
     first_id = volume_ids[0]
     # Only the basename matters: no ``keep_dirs`` is passed, so the file is
     # looked up directly in the output directory.
     _, expected_fname = os.path.split(pairtree_volume_paths[0])
     utils.download_file(htids=first_id, outdir=out_root, format='pairtree')
     target = os.path.join(out_root, expected_fname)
     assert os.path.exists(target)
示例#3
0
 def test_rsync_multi_file(self, tmpdir, volume_ids, pairtree_volume_paths):
     """Download all ids (pairtree format) into a flat output directory.

     For every entry of ``pairtree_volume_paths`` the test asserts that a
     file with the same basename exists directly inside the output
     directory.
     """
     out_root = tmpdir.dirname
     utils.download_file(htids=volume_ids, outdir=out_root, format='pairtree')
     for rel_path in pairtree_volume_paths:
         # Directory components are discarded; only the file name is checked.
         basename = os.path.basename(rel_path)
         assert os.path.exists(os.path.join(out_root, basename))
def download(df_path, out_dir):
    """Download HTRC features volumes listed in a JSON file.

    The file at ``df_path`` is loaded with ``pd.read_json`` and its ``htid``
    column is split with ``chunked(..., 100)`` (presumably batches of up to
    100 ids -- confirm against the ``chunked`` helper in use). Each batch is
    handed to ``download_file`` with ``outdir=out_dir``.

    A failure in one batch is printed and does not stop the remaining
    batches. Nothing is returned.
    """
    frame = pd.read_json(df_path)
    all_htids = list(frame.htid)

    for batch in tqdm(chunked(all_htids, 100)):
        try:
            download_file(batch, outdir=out_dir)
        except Exception as err:
            # Best effort: report the problem and carry on with the next batch.
            print(err)
示例#5
0
def download_vols(ids, output_dir=None):
    """Download extracted-features files for ``ids`` and return found paths.

    Args:
        ids: Volume ids, passed to ``download_file`` as ``htids`` and then
            iterated to build the expected file paths.
        output_dir: Directory to download into. When ``None`` a new
            temporary directory is created with ``tempfile.mkdtemp()``.

    Returns:
        list: The ``'<output_dir>/<id>.json.bz2'`` paths that exist on disk
        after the download call.
    """
    # Fall back to a throwaway temp directory when the caller gave none.
    target_dir = tempfile.mkdtemp() if output_dir is None else output_dir

    # Download extracted features
    download_file(htids=ids, outdir=target_dir)

    candidates = ('{}/{}.json.bz2'.format(target_dir, vol_id) for vol_id in ids)
    return [candidate for candidate in candidates if os.path.exists(candidate)]
示例#6
0
def download_vols(ids, output_dir=None):
    """Download extracted-features files for ``ids`` and return found paths.

    Args:
        ids: Iterable of volume id strings. Any literal ``'volume_id'``
            entry (presumably a CSV header row -- confirm against callers)
            is skipped. The caller's object is left unmodified.
        output_dir: Directory to download into. When ``None`` a new
            temporary directory is created with ``tempfile.mkdtemp()``.

    Returns:
        list: The ``'<output_dir>/<id>.json.bz2'`` paths that exist on disk
        after the download attempt.

    If ``download_file`` raises ``subprocess.CalledProcessError``, the ids
    whose files are absent are written to ``error_missing.log`` in the
    current working directory, a summary is printed, and the function
    continues with whatever was downloaded.
    """
    # If no explicit output directory is specified, just create a temporary one
    if output_dir is None:
        output_dir = tempfile.mkdtemp()

    # Work on a filtered copy instead of calling ``ids.remove(...)``: this
    # avoids mutating the caller's list and also accepts tuples/generators.
    ids = [vol_id for vol_id in ids if vol_id != 'volume_id']

    # Download extracted features
    expected = {vol_id: '{}/{}.json.bz2'.format(output_dir, vol_id)
                for vol_id in ids}
    try:
        download_file(htids=ids, outdir=output_dir)
    except subprocess.CalledProcessError:
        missing = [vol_id for vol_id, path in expected.items()
                   if not os.path.exists(path)]
        with open('error_missing.log', 'w') as outfile:
            outfile.write('\n'.join(missing))

        # Singular only for exactly one; "0 volumes" and "2 volumes" otherwise.
        print("{} volume{} failed to download. "
              "See `error_missing.log`.".format(
                  len(missing), '' if len(missing) == 1 else 's'))
        print("Continuing with volumes that succesfully downloaded...")

    return [path for path in expected.values() if os.path.exists(path)]
示例#7
0
def download_vols(ids, output_dir=None):
    """Download extracted-features files for ``ids`` and return found paths.

    Args:
        ids: Iterable of volume id strings. Any literal ``'volume_id'``
            entry (presumably a CSV header row -- confirm against callers)
            is skipped. The caller's object is left unmodified.
        output_dir: Directory to download into. When ``None`` a new
            temporary directory is created with ``tempfile.mkdtemp()``.

    Returns:
        list: The ``'<output_dir>/<id>.json.bz2'`` paths that exist on disk
        after the download attempt.

    If ``download_file`` raises ``subprocess.CalledProcessError``, the ids
    whose files are absent are written to ``error_missing.log`` in the
    current working directory, a summary is printed, and the function
    continues with whatever was downloaded.
    """
    # If no explicit output directory is specified, just create a temporary one
    if output_dir is None:
        output_dir = tempfile.mkdtemp()

    # Build a filtered copy rather than removing in place, so the caller's
    # list is not mutated and non-list iterables are accepted too.
    ids = [vol_id for vol_id in ids if vol_id != 'volume_id']

    # Download extracted features
    expected = {vol_id: '{}/{}.json.bz2'.format(output_dir, vol_id)
                for vol_id in ids}
    try:
        download_file(htids=ids, outdir=output_dir)
    except subprocess.CalledProcessError:
        missing = [vol_id for vol_id, path in expected.items()
                   if not os.path.exists(path)]
        with open('error_missing.log', 'w') as outfile:
            outfile.write('\n'.join(missing))

        # Use the plural form for every count except exactly one.
        plural = '' if len(missing) == 1 else 's'
        print("{} volume{} failed to download. "
              "See `error_missing.log`.".format(len(missing), plural))
        print("Continuing with volumes that succesfully downloaded...")

    return [path for path in expected.values() if os.path.exists(path)]
示例#8
0
 def test_rsync_multi_file(self, tmpdir, volume_ids, volume_paths):
     """Download all ids into a flat output directory.

     For every entry of ``volume_paths`` the test asserts that a file with
     the same basename exists directly inside the output directory.
     """
     out_root = tmpdir.dirname
     utils.download_file(htids=volume_ids, outdir=out_root)
     for rel_path in volume_paths:
         # Directory components are discarded; only the file name is checked.
         basename = os.path.basename(rel_path)
         assert os.path.exists(os.path.join(out_root, basename))
示例#9
0
 def test_rsync_single_file(self, tmpdir, volume_ids, volume_paths):
     """Download only the first id into a flat output directory.

     The expected file name is the final path component of the first entry
     of ``volume_paths``; the test asserts that a file with that name
     exists directly inside the output directory.
     """
     out_root = tmpdir.dirname
     _, expected_fname = os.path.split(volume_paths[0])
     utils.download_file(htids=volume_ids[0], outdir=out_root)
     target = os.path.join(out_root, expected_fname)
     assert os.path.exists(target)
示例#10
0
 def test_recursive_rsync(self, tmpdir, volume_ids, volume_paths):
     """Download every id while keeping the remote directory layout.

     Calls ``utils.download_file`` with ``keep_dirs=True`` and then asserts
     that each entry of ``volume_paths`` exists relative to the output
     directory.
     """
     out_root = tmpdir.dirname
     utils.download_file(
         htids=volume_ids,
         outdir=out_root,
         keep_dirs=True,
     )
     for rel_path in volume_paths:
         downloaded = os.path.join(out_root, rel_path)
         assert os.path.exists(downloaded)