def test_download_extract_archive(self):
  # Generate a gzipped tarfile
  output_filename = os.path.join(self.base_path, 'subfolder.tar.gz')
  output_dirname = os.path.join(self.base_path, 'subfolder')
  extracted_filename = os.path.join(output_dirname, 'subfolder_text.txt')
  with tarfile.open(output_filename, 'w:gz') as tar:
    tar.add(output_dirname, arcname='subfolder')
  shutil.rmtree(output_dirname)

  sha1_hash = download_from_google_storage.get_sha1(output_filename)
  input_filename = '%s/%s' % (self.base_url, sha1_hash)
  self.queue.put((sha1_hash, output_filename))
  self.queue.put((None, None))
  stdout_queue = Queue.Queue()
  download_from_google_storage._downloader_worker_thread(
      0, self.queue, True, self.base_url, self.gsutil,
      stdout_queue, self.ret_codes, True, True, delete=False)
  expected_calls = [
      ('check_call', ('ls', input_filename)),
      ('check_call', ('cp', input_filename, output_filename))]
  if sys.platform != 'win32':
    expected_calls.append(
        ('check_call', ('stat', 'gs://sometesturl/%s' % sha1_hash)))
  expected_output = ['0> Downloading %s...' % output_filename]
  expected_output.extend([
      '0> Extracting 3 entries from %s to %s' % (output_filename,
                                                 output_dirname)])
  expected_ret_codes = []
  self.assertEqual(list(stdout_queue.queue), expected_output)
  self.assertEqual(self.gsutil.history, expected_calls)
  self.assertEqual(list(self.ret_codes.queue), expected_ret_codes)
  self.assertTrue(os.path.exists(output_dirname))
  self.assertTrue(os.path.exists(extracted_filename))
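The test above expects the worker thread to report an '0> Extracting 3 entries from ... to ...' line and to leave the unpacked tree on disk next to the archive. The helper below is a minimal sketch of that extraction step, assuming a plain tarfile-based implementation; _extract_archive, its signature, and the destination logic are illustrative, not the actual _downloader_worker_thread code.

import os
import tarfile


def _extract_archive(archive_path, out_queue, thread_num=0):
  # Illustrative sketch only; the real worker's extraction logic may differ.
  # Report '.../subfolder.tar.gz' as being extracted to '.../subfolder'.
  destination = archive_path[:-len('.tar.gz')]
  with tarfile.open(archive_path, 'r:gz') as tar:
    members = tar.getmembers()
    out_queue.put('%d> Extracting %d entries from %s to %s' % (
        thread_num, len(members), archive_path, destination))
    # Entries were added with arcname='subfolder', so extracting into the
    # archive's parent directory recreates the 'subfolder' tree.
    tar.extractall(path=os.path.dirname(archive_path))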
def upload_to_google_storage(
    input_filenames, base_url, gsutil, force,
    use_md5, num_threads, skip_hashing):
  # We only want one MD5 calculation happening at a time to avoid HD thrashing.
  md5_lock = threading.Lock()

  # Start up all the worker threads plus the printer thread.
  all_threads = []
  ret_codes = Queue.Queue()
  ret_codes.put((0, None))
  upload_queue = Queue.Queue()
  upload_timer = time.time()
  stdout_queue = Queue.Queue()
  printer_thread = threading.Thread(target=printer_worker,
                                    args=[stdout_queue])
  printer_thread.daemon = True
  printer_thread.start()
  for thread_num in range(num_threads):
    t = threading.Thread(
        target=_upload_worker,
        args=[thread_num, upload_queue, base_url, gsutil, md5_lock,
              force, use_md5, stdout_queue, ret_codes])
    t.daemon = True
    t.start()
    all_threads.append(t)

  # We want to hash everything in a single thread since it's faster.
  # The bottleneck is in disk IO, not CPU.
  hashing_start = time.time()
  for filename in input_filenames:
    if not os.path.exists(filename):
      stdout_queue.put('Main> Error: %s not found, skipping.' % filename)
      continue
    if os.path.exists('%s.sha1' % filename) and skip_hashing:
      stdout_queue.put(
          'Main> Found hash for %s, sha1 calculation skipped.' % filename)
      with open(filename + '.sha1', 'rb') as f:
        sha1_file = f.read(1024)
      if not re.match('^([a-z0-9]{40})$', sha1_file):
        print >> sys.stderr, 'Invalid sha1 hash file %s.sha1' % filename
        return 1
      upload_queue.put((filename, sha1_file))
      continue
    stdout_queue.put('Main> Calculating hash for %s...' % filename)
    sha1_sum = get_sha1(filename)
    with open(filename + '.sha1', 'wb') as f:
      f.write(sha1_sum)
    stdout_queue.put('Main> Done calculating hash for %s.' % filename)
    upload_queue.put((filename, sha1_sum))
  hashing_duration = time.time() - hashing_start

  # Wait for everything to finish.
  for _ in all_threads:
    upload_queue.put((None, None))  # To mark the end of the work queue.
  for t in all_threads:
    t.join()
  stdout_queue.put(None)
  printer_thread.join()

  # Print timing information.
  print 'Hashing %s files took %1f seconds' % (
      len(input_filenames), hashing_duration)
  print 'Uploading took %1f seconds' % (time.time() - upload_timer)

  # See if we ran into any errors.
  max_ret_code = 0
  for ret_code, message in ret_codes.queue:
    max_ret_code = max(ret_code, max_ret_code)
    if message:
      print >> sys.stderr, message

  if not max_ret_code:
    print 'Success!'

  return max_ret_code
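upload_to_google_storage hands all user-facing output to a single printer thread and shuts it down by queuing None once the upload workers have joined. A minimal sketch of the loop printer_worker is assumed to run follows; the real implementation lives elsewhere in the script and may differ.

def printer_worker(output_queue):
  # Sketch of the assumed printer loop: drain messages until the None
  # sentinel that upload_to_google_storage pushes after joining the workers.
  while True:
    line = output_queue.get()
    if line is None:
      break
    print line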
def test_get_sha1(self):
  lorem_ipsum = os.path.join(self.base_path, 'lorem_ipsum.txt')
  self.assertEqual(
      download_from_google_storage.get_sha1(lorem_ipsum),
      '7871c8e24da15bad8b0be2c36edc9dc77e37727f')
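Both tests call download_from_google_storage.get_sha1 to hash a file on disk. The sketch below shows an equivalent chunked hashlib helper, assuming a straightforward implementation; the 1 MB chunk size is an illustrative choice, not taken from the real get_sha1.

import hashlib


def get_sha1(filename):
  # Assumed equivalent of the helper the tests exercise: hash the file in
  # fixed-size chunks so large files do not have to fit in memory.
  sha1 = hashlib.sha1()
  with open(filename, 'rb') as f:
    chunk = f.read(1024 * 1024)
    while chunk:
      sha1.update(chunk)
      chunk = f.read(1024 * 1024)
  return sha1.hexdigest()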