def __init__(self, out_store=None, overrides=None):
    """
    Make a new context, so we can run the library.

    Args:
        out_store: optional IOStore location string to dump output files
            to; None means output only goes to the Toil file store.
        overrides: optional argparse Namespace of overrides for default
            toil-vg and Toil configuration values. Overrides can also
            have a "config" key, in which case that config file will be
            loaded. Defaults to an empty Namespace.
    """
    # Fix for a mutable default argument: the old `overrides=Namespace()`
    # default was created once at def time and shared (mutably) across
    # every call that omitted the argument. Create a fresh one per call.
    if overrides is None:
        overrides = Namespace()

    # Load configuration and apply overrides. If the overrides are from
    # command-line options, we might also get a bunch of tool-specific
    # fields.
    self.config = apply_config_file_args(overrides)

    # Make a container runner for running tools
    self.runner = ContainerRunner(
        container_tool_map=get_container_tool_map(self.config),
        realtime_stderr=self.config.realTimeStderr)

    if out_store is not None:
        # We want to dump files to an IOStore
        # Make it an absolute path while we're getting set up.
        self.out_store_string = IOStore.absolute(out_store)
    else:
        # We don't want to use an out store
        self.out_store_string = None
def get_out_store(self):
    """
    Return the IOStore to write output files to, or None if they should
    not be written anywhere besides the Toil file store.
    """
    # Guard clause: no out store was configured.
    if self.out_store_string is None:
        return None
    return IOStore.get(self.out_store_string)
def copy_everything(job, options):
    """
    Download the file list and copy all the files.
    """
    # Connect to both IO stores up front.
    in_store = IOStore.get(options.in_store)
    out_store = IOStore.get(options.out_store)

    # Walk everything in the input store, keeping only the names that
    # match the requested glob pattern.
    matching_names = (name for name
                      in in_store.list_input_directory("", recursive=True)
                      if fnmatch.fnmatchcase(name, options.pattern))

    queued = 0
    for chunk in group(matching_names, options.batch_size):
        # group() pads the final chunk out with Nones; drop the padding.
        real_names = [name for name in chunk if name is not None]

        # Hand the whole chunk off to a child job to do the copying.
        job.addChildJobFn(copy_batch, options, real_names,
                          cores=1, memory="1G", disk="10G")

        queued += 1
        if queued % 10 == 0:
            RealtimeLogger.info("Queued {} batches...".format(queued))

    RealtimeLogger.info("Queued {} total batches".format(queued))
def _assertOutput(self, sample_name, outstore, f1_threshold=0.90):
    """
    Assert that the F1 score recorded in the given output store is at
    least f1_threshold.
    """
    # The score file is named vcfeval_output_f1.txt, optionally prefixed
    # with the sample name.
    remote_name = 'vcfeval_output_f1.txt'
    if sample_name:
        remote_name = sample_name + '_' + remote_name

    # Fetch it to a local scratch file in the working directory.
    local_f1 = os.path.join(self.workdir, 'f1.txt')
    IOStore.get(outstore).read_input_file(remote_name, local_f1)

    # The first line of the file holds the score.
    with open(local_f1) as f1_file:
        f1_score = float(f1_file.readline().strip())

    print(f1_score)
    self.assertGreaterEqual(f1_score, f1_threshold)
def copy_batch(job, options, batch):
    """
    Copy a batch of files from input to output.

    Existing files in the output store are skipped unless
    options.overwrite is set; with options.check_size, existing files
    whose sizes differ from the input are re-copied. Copies run 10 at a
    time on a thread pool.
    """
    RealtimeLogger.info("Copying a batch")

    # Set up the IO stores.
    in_store = IOStore.get(options.in_store)
    out_store = IOStore.get(options.out_store)

    # Start some threads
    pool = ThreadPool(10)

    def download(filename):
        """
        Download each file
        """
        try:
            if (not options.overwrite) and out_store.exists(filename):
                # File exists. But make sure its size is correct.
                if not options.check_size:
                    # Skip existing file. No need to check the length.
                    RealtimeLogger.info("Skipped {}".format(filename))
                    return
                out_size = out_store.get_size(filename)
                in_size = in_store.get_size(filename)
                if out_size != in_size:
                    # Complain about size mismatch and copy
                    RealtimeLogger.warning(
                        "Redownloading {}! Size was {} and not {}!".format(
                            filename, out_size, in_size))
                else:
                    # Skip existing file
                    RealtimeLogger.info("Skipped {}".format(filename))
                    return

            # Make a temp file
            (handle, path) = tempfile.mkstemp(
                dir=job.fileStore.getLocalTempDir())
            os.close(handle)

            RealtimeLogger.debug("Download {}".format(filename))

            try:
                # Download
                in_store.read_input_file(filename, path)
                # Store
                out_store.write_output_file(path, filename)
            finally:
                # Clean up the temp file even if the copy failed, so
                # failed batches don't leak scratch files.
                os.unlink(path)

        except Exception:
            # Thread pools can mangle worker tracebacks; flatten this one
            # into the exception message so it survives the trip back to
            # the calling thread. (Was a bare `except:`, which would also
            # have caught KeyboardInterrupt/SystemExit.)
            raise Exception("".join(
                traceback.format_exception(*sys.exc_info())))

        RealtimeLogger.info("Copied {}".format(filename))

    try:
        # Run all the downloads in parallel
        pool.map(download, batch)
    finally:
        # Shut the pool down so its worker threads don't leak past the
        # end of the job.
        pool.close()
        pool.join()