def test_download_tarfile(self):
    # this is done after the small file sorting happens,
    # so pick UUIDs that would be grouped together
    files_to_dl = ['small_no_friends']

    index_client = GDCIndexClient(base_url)
    index_client._get_metadata(files_to_dl)

    client = GDCHTTPDownloadClient(
        uri=base_url,
        index_client=index_client,
        **client_kwargs)

    # it will remove redundant uuids
    tarfile_name, errors = client._download_tarfile(files_to_dl)

    assert tarfile_name is not None
    assert os.path.exists(tarfile_name)
    assert tarfile.is_tarfile(tarfile_name)

    with tarfile.open(tarfile_name, 'r') as t:
        for member in t.getmembers():
            m = t.extractfile(member)
            contents = m.read()
            assert contents == uuids[m.name]['contents']

    os.remove(tarfile_name)
def test_rel_mock_get_metadata(self):
    index = GDCIndexClient(uri=base_url)
    index._get_metadata(['small_rel'])

    assert index.get_access('small_rel') == uuids['small_rel']['access']
    assert index.get_filesize('small_rel') == uuids['small_rel']['file_size']
    assert index.get_md5sum('small_rel') == uuids['small_rel']['md5sum']
    assert index.get_related_files('small_rel') == \
        uuids['small_rel']['related_files']
    assert index.get_annotations('small_rel') == []
def test_md5_members(self):
    files_to_tar = ['small', 'small_ann', 'small_rel', 'small_no_friends']

    tarfile_name = make_tarfile(files_to_tar)

    index_client = GDCIndexClient(base_url)
    index_client._get_metadata(files_to_tar)

    client = GDCHTTPDownloadClient(
        uri=base_url,
        index_client=index_client,
        **client_kwargs)

    client._untar_file(tarfile_name)
    errors = client._md5_members(files_to_tar)
    assert errors == []

    for f in files_to_tar:
        assert os.path.exists(f)
        os.remove(f)
def test_fix_url(self):
    index_client = GDCIndexClient(base_url)
    client = GDCHTTPDownloadClient(
        uri=base_url,
        index_client=index_client,
        **client_kwargs)

    assert client.fix_url('api.gdc.cancer.gov') == \
        'https://api.gdc.cancer.gov/'
    assert client.fix_url('http://api.gdc.cancer.gov/') == \
        'http://api.gdc.cancer.gov/'
    assert client.fix_url('api.gdc.cancer.gov/') == \
        'https://api.gdc.cancer.gov/'
def test_big_full_separate_small_files(self):
    index = GDCIndexClient(uri=base_url)
    bigs, smalls = index.separate_small_files(
        ['big'],
        HTTP_CHUNK_SIZE,
        related_files=True,
        annotations=True)

    assert index.get_access('big') == uuids['big']['access']
    assert index.get_filesize('big') == uuids['big']['file_size']
    assert index.get_md5sum('big') == uuids['big']['md5sum']
    assert index.get_related_files('big') == uuids['big']['related_files']
    assert index.get_annotations('big') == uuids['big']['annotations']

    assert bigs == ['big']
    assert smalls == []
def test_untar_file(self):
    files_to_tar = ['small', 'small_ann', 'small_rel', 'small_no_friends']

    tarfile_name = make_tarfile(files_to_tar)

    index_client = GDCIndexClient(base_url)
    client = GDCHTTPDownloadClient(
        uri=base_url,
        index_client=index_client,
        **client_kwargs)

    client._untar_file(tarfile_name)

    for f in files_to_tar:
        assert os.path.exists(f)
        os.remove(f)
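# The make_tarfile() helper used by test_md5_members and test_untar_file is not
# shown in this excerpt. Below is a minimal sketch of what such a fixture helper
# could look like, assuming the `uuids` fixture maps each test name to a dict
# containing its 'contents' as bytes. The name and behavior are hypothetical,
# not the project's actual implementation.
def make_tarfile_sketch(names, tarfile_name='temp.tar'):
    """Hypothetical helper: write one member per name into an uncompressed tar."""
    with tarfile.open(tarfile_name, 'w') as t:
        for name in names:
            # materialize the fixture contents on disk, add it, then clean up
            with open(name, 'wb') as f:
                f.write(uuids[name]['contents'])
            t.add(name)
            os.remove(name)
    return tarfile_name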
def test_small_invalid_separate_small_files(self):
    """If no metadata can be found for a file, attempt the download
    using the big-file method."""

    invalid = 'invalid uuid'

    index = GDCIndexClient(uri=base_url)
    bigs, smalls = index.separate_small_files(
        [invalid],
        HTTP_CHUNK_SIZE,
        related_files=True,
        annotations=True)

    assert index.get_access(invalid) is None
    assert index.get_filesize(invalid) is None
    assert index.get_md5sum(invalid) is None
    assert index.get_related_files(invalid) == []
    assert index.get_annotations(invalid) == []

    assert bigs == [invalid]
    assert smalls == []
def test_small_full_separate_small_files(self):
    """Currently, if a file has related or annotation files, the DTT
    treats it as a big file so that it goes through the old download
    method, regardless of size.

    NOTE: This will probably change in the future.
    """

    index = GDCIndexClient(uri=base_url)
    bigs, smalls = index.separate_small_files(
        ['small'],
        HTTP_CHUNK_SIZE,
        related_files=True,
        annotations=True)

    assert index.get_access('small') == uuids['small']['access']
    assert index.get_filesize('small') == uuids['small']['file_size']
    assert index.get_md5sum('small') == uuids['small']['md5sum']
    assert index.get_related_files('small') == uuids['small']['related_files']
    assert index.get_annotations('small') == uuids['small']['annotations']

    assert bigs == ['small']
    assert smalls == []
def test_no_metadata_get_filesize(self):
    index = GDCIndexClient(uri=base_url)
    results = index.get_filesize(uuids['small'])
    assert results is None
def test_no_metadata_get_md5sum(self):
    index = GDCIndexClient(uri=base_url)
    results = index.get_md5sum(uuids['small'])
    assert results is None
def test_no_metadata_get_annotations(self):
    index = GDCIndexClient(uri=base_url)
    results = index.get_annotations(uuids['small'])
    assert results == []
def test_no_metadata_get_related_files(self):
    index = GDCIndexClient(uri=base_url)
    results = index.get_related_files(uuids['small'])
    assert results == []
def test_big_and_small_full_separate_small_files(self):
    index = GDCIndexClient(uri=base_url)
    bigs, smalls = index.separate_small_files(
        ['big', 'small'],
        HTTP_CHUNK_SIZE,
        related_files=True,
        annotations=True)

    assert index.get_access('big') == uuids['big']['access']
    assert index.get_filesize('big') == uuids['big']['file_size']
    assert index.get_md5sum('big') == uuids['big']['md5sum']
    assert index.get_related_files('big') == uuids['big']['related_files']
    assert index.get_annotations('big') == uuids['big']['annotations']

    assert index.get_access('small') == uuids['small']['access']
    assert index.get_filesize('small') == uuids['small']['file_size']
    assert index.get_md5sum('small') == uuids['small']['md5sum']
    assert index.get_related_files('small') == uuids['small']['related_files']
    assert index.get_annotations('small') == uuids['small']['annotations']

    # if a uuid has related files or annotations then it
    # is downloaded as a big file
    assert bigs == ['big', 'small']
    assert smalls == []
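# Taken together, the tests above imply the partition rule behind
# separate_small_files: a uuid is treated as "big" when its size is at or above
# the chunk size (exact comparison assumed), when it has related files or
# annotations, or when no metadata could be fetched for it; everything else is
# "small". The sketch below restates that rule in isolation; it is not the
# GDCIndexClient implementation, and it omits the grouping of small ids into
# tarfile-sized bundles that the real client performs before downloading.
def separate_small_files_sketch(index, ids, chunk_size):
    bigs, smalls = [], []
    for uuid in ids:
        size = index.get_filesize(uuid)
        if (size is None                        # no metadata: fall back to the big-file path
                or size >= chunk_size           # large enough to stream on its own
                or index.get_related_files(uuid)
                or index.get_annotations(uuid)):
            bigs.append(uuid)
        else:
            smalls.append(uuid)
    return bigs, smalls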
def download(parser, args):
    """Downloads data from the GDC.

    Smaller files (~KB range) are combined into grouped downloads. The API
    supports combining UUIDs into one uncompressed tarfile using the ?tarfile
    url parameter. Combining many smaller files into one download decreases
    the number of connections we have to open.
    """

    successful_count = 0
    unsuccessful_count = 0
    big_errors = []
    small_errors = []
    total_download_count = 0

    validate_args(parser, args)

    # use a set to drop duplicate ids
    ids = set(args.file_ids)
    for i in args.manifest:
        if not i.get('id'):
            log.error('Invalid manifest')
            break
        ids.add(i['id'])

    index_client = GDCIndexClient(args.server)
    client = get_client(args, index_client)

    # separate the smaller files from the larger files
    bigs, smalls = index_client.separate_small_files(
        ids,
        args.http_chunk_size,
        client.related_files,
        client.annotations)

    # the big files are downloaded individually,
    # the small files are joined together into tarfiles
    if smalls:
        log.debug('Downloading smaller files...')

        # download small files grouped into an uncompressed tarfile
        small_errors, count = client.download_small_groups(smalls)
        successful_count += count

        i = 0
        while i < args.retry_amount and small_errors:
            time.sleep(args.wait_time)
            log.debug('Retrying failed grouped downloads')
            small_errors, count = client.download_small_groups(small_errors)
            successful_count += count
            i += 1

    # client.download_files is located in parcel, which calls
    # self.parallel_download, which goes back to gdc-client's parallel_download
    if bigs:
        log.debug('Downloading big files...')

        # create URLs to send to parcel for download
        bigs = [urlparse.urljoin(client.data_uri, b) for b in bigs]
        downloaded_files, big_error_dict = client.download_files(bigs)
        not_downloaded_url = ''

        if args.retry_amount > 0:
            for url, reason in big_error_dict.iteritems():
                # only retry the download if it wasn't a controlled-access error
                if '403' not in reason:
                    not_downloaded_url = retry_download(
                        client,
                        url,
                        args.retry_amount,
                        args.no_auto_retry,
                        args.wait_time)
                else:
                    big_errors.append(url)
                    not_downloaded_url = ''

                # record the url if the retries did not succeed either
                if not_downloaded_url:
                    big_errors.append(not_downloaded_url)

        if big_errors:
            log.debug('Big files not downloaded: {0}'.format(
                ', '.join([b.split('/')[-1] for b in big_errors])))

        successful_count += len(bigs) - len(big_errors)

    unsuccessful_count = len(ids) - successful_count

    log.info('{0}: {1}'.format(
        colored('Successfully downloaded', 'green'), successful_count))

    if unsuccessful_count > 0:
        log.info('{0}: {1}'.format(
            colored('Failed downloads', 'red'), unsuccessful_count))

    return small_errors or big_errors
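# The docstring above describes the grouped download that download_small_groups()
# relies on: several UUIDs fetched as one uncompressed tarfile via the data
# endpoint's ?tarfile parameter. Below is a minimal sketch of such a request
# using the `requests` library; the payload shape ({'ids': [...]}) and the
# X-Auth-Token header are assumptions for illustration, not the client's actual
# request-building code.
import requests

def fetch_small_group_sketch(data_uri, uuid_group, token=None):
    """Hypothetical illustration: POST a group of ids and save the returned tar."""
    headers = {'X-Auth-Token': token} if token else {}
    r = requests.post(
        data_uri + '?tarfile',
        json={'ids': uuid_group},
        headers=headers,
        stream=True)
    r.raise_for_status()

    tar_name = 'gdc_download_group.tar'
    with open(tar_name, 'wb') as f:
        # stream the response body to disk in small chunks
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)
    return tar_name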