def test_make_filename(self):
    """download.py: Test directory structure creation from make_filename

    Exercises three expectations: the returned path is ``dir`` joined
    with ``name``; a missing directory tree is not created by default
    (EnvironmentError is raised instead); and the tree is created when
    ``makedirs=True`` is passed.
    """
    # Existing directory: the result is the plain join of dir and name.
    in_existing_dir = dl.make_filename(name='full_path.html',
                                       dir=self.test_dir)
    self.assertEqual(in_existing_dir,
                     os.path.join(self.test_dir, 'full_path.html'))

    # A nested directory that does not exist yet.
    nested_dir = os.path.join(self.test_dir, 'alpha', 'beta', 'gamma')
    request = {'name': 'foobar.html', 'dir': nested_dir}

    # Negative case: without makedirs the call must fail and must not
    # leave the directory tree behind.
    with self.assertRaises(EnvironmentError):
        dl.make_filename(**request)
    self.assertFalse(os.path.isdir(nested_dir))

    # Positive case: with makedirs switched on the tree gets created.
    dl.make_filename(makedirs=True, **request)
    self.assertTrue(os.path.isdir(nested_dir))
def download(cls, name, store=False, silent=True, retries=0):
    """Download a reference genome of the given name, and return a GRCGenome

    Fetches the named reference assembly from the web, and creates a new
    GRCGenome object to handle it.

    If store is False (default), the data will be kept in a temporary
    file (created with ``tempfile.NamedTemporaryFile``), and will be
    destroyed as soon as the object is released.

    If True, the entire assembly will be saved in the current directory
    - ValueError will be raised if this file seems to already exist. The
    resulting file will have the '.assembly' suffix, and may be either a
    .zip, a .tar, a .tar.gz, or a .tar.bz2 file. See
    ``tigerlily.utility.archive.Archive`` for more information.

    If store is a string, it will be assumed to be a path to a directory
    (trailing slash optional) in which the archive should be stored, again
    as '<name>.assembly'. (Again, ValueError will be raised if the file
    already exists.) If necessary, any intermediate directories will be
    created.

    If silent is False, status messages will be printed using print() to
    keep the user informed of the progress. This is usually very
    important in command line applications as the reference archives are
    about 900 MB in size and may take minutes or hours to download
    depending on the internet connection.

    Because of the large size of these files, it is highly recommended
    that the store option be set. Please do not use Tiger Lily to abuse
    the UCSC Genome Browser group's generosity in hosting these large
    files to the general public.

    >>> refgen = GRCGenome.download('test1')
    >>> refgen2 = GRCGenome.download('test1',store=True)
    >>> import os
    >>> os.path.isfile('test1.assembly')
    True
    >>> os.unlink('test1.assembly')

    Only supported reference genome assemblies are allowed, otherwise
    ValueError will be raised.

    >>> GRCGenome.download('invalid')
    Traceback (most recent call last):
      ...
    ValueError: Unknown or unsupported reference genome specified

    When an MD5 value is stored for the assembly, the downloaded file is
    hashed and compared against it to see whether the download completed
    correctly. A mismatching file is deleted and the download is repeated
    up to ``retries`` additional times; if it still does not match, an
    exception is raised and the download is aborted.

    Raises:
        ValueError: if ``name`` is not a key of SUPPORTED_ASSEMBLIES.
        EnvironmentError: if the MD5 check fails on every attempt.
    """
    if name not in SUPPORTED_ASSEMBLIES:
        raise ValueError('Unknown or unsupported reference genome'
                         ' specified')

    # Each entry is indexed as (download URL, expected MD5 hex digest or
    # None when no checksum is available).
    entry = SUPPORTED_ASSEMBLIES[name]
    source_url = entry[0]
    expected_md5 = entry[1]

    archive_name = '{}.assembly'.format(name)
    # A non-positive ``retries`` still means one download attempt.
    attempts = max(retries, 0) + 1

    for _ in range(attempts):
        client = ConsoleDownloader()
        temp = None
        if store is True:
            filename = make_filename(name=archive_name)
        elif store:
            # ``store`` is the target directory. The ``name`` parameter is
            # deliberately left untouched so that retries keep using the
            # real assembly name.
            filename = make_filename(name=archive_name, dir=store,
                                     makedirs=True)
        else:
            temp = tempfile.NamedTemporaryFile()
            filename = temp.name

        client.retrieve(source_url, filename=filename, silent=silent)

        if expected_md5 is None:
            # Nothing to verify against; accept the download as it is.
            return GRCGenome.load_archive(Archive(filepath=filename))

        # Hash in fixed-size chunks: the archives are far too large to
        # read into memory in one go.
        digest = hashlib.md5()
        with open(filename, 'rb') as infile:
            for chunk in iter(lambda: infile.read(1 << 20), b''):
                digest.update(chunk)

        if digest.hexdigest() == expected_md5:
            return GRCGenome.load_archive(Archive(filepath=filename))

        # Corrupt download: discard it before the next attempt.
        if temp is not None:
            # Closing a NamedTemporaryFile deletes it; removing the path
            # behind its back would make the later close fail on unlink.
            temp.close()
        else:
            os.remove(filename)

    raise EnvironmentError(
        'MD5sum failed after {} attempt(s), download aborted'.format(attempts))