import json
from csv import writer
from multiprocessing.pool import ThreadPool
from zipfile import ZipFile, ZIP_STORED

# Project-specific helpers used below (StreamingBytesIO, DummyError, Study, Participant,
# s3_upload, create_client_key_pair, construct_s3_raw_data_path, batch_retrieve_s3,
# batch_retrieve_pipeline_s3, determine_file_name) are assumed to be imported from elsewhere
# in this codebase.


def zip_generator_for_pipeline(files_list):
    pool = ThreadPool(3)
    zip_output = StreamingBytesIO()
    zip_input = ZipFile(zip_output, mode="w", compression=ZIP_STORED, allowZip64=True)
    try:
        # chunks_and_content is an iterator of tuples of the chunk and the content of the file.
        # chunksize (a keyword argument of imap, not to be confused with Beiwe Chunks) is the
        # size of the batches that are handed to the pool.  We always want to add the next file
        # to retrieve to the pool as soon as possible, so we want a chunksize of 1.
        # (The documentation mentions a timeout; it is irrelevant under this construction.)
        chunks_and_content = pool.imap_unordered(batch_retrieve_pipeline_s3, files_list, chunksize=1)
        for pipeline_upload, file_contents in chunks_and_content:
            # file_name = determine_file_name(chunk)
            zip_input.writestr("data/" + pipeline_upload.file_name, file_contents)
            # These can be large, and we don't want them sticking around in memory while we wait
            # for the yield.
            del file_contents, pipeline_upload
            yield zip_output.getvalue()  # yield the (compressed) file information
            zip_output.empty()

        # close, then yield all remaining data in the zip.
        zip_input.close()
        yield zip_output.getvalue()
    except DummyError:
        # The try-except-finally block is here to guarantee the ThreadPool is closed and
        # terminated.  We don't handle any errors; we just re-raise any error that shows up.
        # (A with statement does not work for this.)
        raise
    finally:
        # We rely on the finally block to ensure that the ThreadPool will be closed and
        # terminated, and also to print an error to the log if we need to.
        pool.close()
        pool.terminate()
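
# Every generator in this module leans on the same StreamingBytesIO pattern: write into a buffer,
# hand the caller everything accumulated so far with getvalue(), then call empty() so memory use
# stays flat no matter how large the overall download is.  StreamingBytesIO itself is a
# project-specific helper defined elsewhere; the class below is only a hypothetical sketch of the
# interface these generators rely on, not the real implementation.  One detail it illustrates:
# tell() should keep reporting the cumulative stream offset even after empty(), because ZipFile
# records absolute positions for the archive's central directory.
class _StreamingBytesIOSketch(object):
    """ Illustrative stand-in for StreamingBytesIO; not used by the functions in this module. """

    def __init__(self):
        self._chunks = []        # data written since the last empty() call
        self._total_written = 0  # cumulative byte count across the whole logical stream

    def write(self, data):
        self._chunks.append(data)
        self._total_written += len(data)

    def flush(self):
        pass  # everything already lives in self._chunks

    def tell(self):
        # Absolute position in the logical stream, not the size of the in-memory buffer.
        return self._total_written

    def getvalue(self):
        # Only the portion of the stream that has not yet been handed to the caller.
        return b"".join(self._chunks)

    def empty(self):
        # Forget data the caller has already yielded; tell() is intentionally unaffected.
        self._chunks = []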
def csv_generator(study_id, number_of_new_patients):
    si = StreamingBytesIO()
    filewriter = writer(si)
    filewriter.writerow(['Patient ID', "Registration password"])
    study_object_id = Study.objects.filter(pk=study_id).values_list('object_id', flat=True).get()
    for _ in xrange(number_of_new_patients):
        patient_id, password = Participant.create_with_password(study_id=study_id)
        # Create an empty file on S3 indicating that this user exists.
        s3_upload(patient_id, "", study_object_id)
        create_client_key_pair(patient_id, study_object_id)
        filewriter.writerow([patient_id, password])
        yield si.getvalue()
        si.empty()
def participant_credential_generator(study_id, number_of_new_patients, desired_filename):
    si = StreamingBytesIO()
    filewriter = writer(si)
    filewriter.writerow(['Patient ID', "Registration password"])
    study_object_id = Study.objects.filter(pk=study_id).values_list('object_id', flat=True).get()
    study_name = Study.objects.filter(pk=study_id).values_list('name', flat=True).get()
    for _ in xrange(number_of_new_patients):
        patient_id, password = Participant.create_with_password(study_id=study_id)
        # Create an empty file on S3 indicating that this user exists.
        s3_upload(construct_s3_raw_data_path(study_object_id, patient_id), "", study_object_id, raw_path=True)
        filewriter.writerow([patient_id, password])
        yield si.getvalue()
        si.empty()
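
# Example usage: these credential generators are meant to be handed directly to a streaming HTTP
# response, so the browser's CSV download begins while participants are still being created.
# Note that desired_filename is not used inside the generator itself; presumably the caller
# applies it when naming the download, as in the hypothetical sketch below.  The view name,
# patient count, and routing are illustrative assumptions, not part of this codebase; with Flask,
# Response(generator, mimetype="text/csv") plus the same Content-Disposition header works too.
def example_credentials_download_view(request, study_id):
    from django.http import StreamingHttpResponse  # local import: illustration only

    filename = "new_participant_credentials.csv"  # hypothetical filename
    response = StreamingHttpResponse(
        participant_credential_generator(study_id, 10, filename),
        content_type="text/csv",
    )
    response['Content-Disposition'] = 'attachment; filename="%s"' % filename
    return response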
def zip_generator(files_list, construct_registry=False):
    """ Pulls in data from S3 in a multithreaded network operation and constructs a zip file of
    that data.  This is a generator; the advantage is that it starts returning data (file by
    file, but wrapped in zip compression) almost immediately. """
    processed_files = set()
    duplicate_files = set()
    pool = ThreadPool(3)
    # 3 threads has been heuristically determined to be a good value: it does not cause the
    # server to be overloaded, and it provides more-or-less the maximum data download speed.
    # This was tested on an m4.large instance (dual core, 8GB of RAM).
    file_registry = {}
    zip_output = StreamingBytesIO()
    zip_input = ZipFile(zip_output, mode="w", compression=ZIP_STORED, allowZip64=True)
    try:
        # chunks_and_content is an iterator of tuples of the chunk and the content of the file.
        # chunksize (a keyword argument of imap, not to be confused with Beiwe Chunks) is the
        # size of the batches that are handed to the pool.  We always want to add the next file
        # to retrieve to the pool as soon as possible, so we want a chunksize of 1.
        # (The documentation mentions a timeout; it is irrelevant under this construction.)
        chunks_and_content = pool.imap_unordered(batch_retrieve_s3, files_list, chunksize=1)
        total_size = 0
        for chunk, file_contents in chunks_and_content:
            if construct_registry:
                file_registry[chunk['chunk_path']] = chunk["chunk_hash"]

            file_name = determine_file_name(chunk)
            if file_name in processed_files:
                duplicate_files.add((file_name, chunk['chunk_path']))
                continue
            processed_files.add(file_name)

            zip_input.writestr(file_name, file_contents)
            # These can be large, and we don't want them sticking around in memory while we wait
            # for the yield.
            del file_contents, chunk

            x = zip_output.getvalue()
            total_size += len(x)
            # print "%s: %sK, %sM" % (random_id, total_size / 1024, total_size / 1024 / 1024)
            yield x  # yield the (compressed) file information
            del x
            zip_output.empty()

        if construct_registry:
            zip_input.writestr("registry", json.dumps(file_registry))
            yield zip_output.getvalue()
            zip_output.empty()

        # close, then yield all remaining data in the zip.
        zip_input.close()
        yield zip_output.getvalue()
    except DummyError:
        # The try-except-finally block is here to guarantee the ThreadPool is closed and
        # terminated.  We don't handle any errors; we just re-raise any error that shows up.
        # (A with statement does not work for this.)
        raise
    finally:
        # We rely on the finally block to ensure that the ThreadPool will be closed and
        # terminated, and also to print an error to the log if we need to.
        pool.close()
        pool.terminate()
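
# Example usage: zip_generator is likewise designed to be streamed straight to the client, so the
# download begins before the last file has even been fetched from S3.  The sketch below is a
# hypothetical illustration (the view name is an assumption, and files_list must be whatever shape
# batch_retrieve_s3 expects); a Flask Response(generator, mimetype="application/zip") behaves the
# same way.
def example_zip_download_view(request, files_list):
    from django.http import StreamingHttpResponse  # local import: illustration only

    response = StreamingHttpResponse(
        zip_generator(files_list, construct_registry=True),
        content_type="application/zip",
    )
    response['Content-Disposition'] = 'attachment; filename="data.zip"'
    return response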