class PostprocessPredictionsTask(luigi.Task):
    """Postprocess prediction from fairseq."""

    subset = luigi.Parameter()
    model = luigi.EnumParameter(enum=ModelType, description='Model type')
    result_path = luigi.Parameter('./outputs')

    def output(self) -> luigi.Target:
        return luigi.LocalTarget(
            Path(self.result_path) / self.model.name / self.subset / 'out.tsv')

    def requires(self) -> Dict[str, luigi.Task]:
        requirements = {
            'model': DownloadModelTask(model=self.model),
            'predictions': GeneratePredictionsTask(model=self.model,
                                                   subset=self.subset),
        }
        return requirements

    @property
    def processor(self):
        # Use a single underscore so the hasattr() check matches the attribute
        # set below; a double-underscore name would be mangled on assignment
        # and the processor would be rebuilt on every access.
        if not hasattr(self, '_processor'):
            self._processor = self.requires()['model'].get_processor()
        return self._processor

    def extract_predictions(self,
                            line: str,
                            property_names: Optional[List[str]] = None
                            ) -> Tuple[int, List[str]]:
        items = line.strip().split('\t')
        line_idx = int(items[0][2:])
        tokens = items[2].strip().split(' ')
        decoded_text = self.processor.decode_pieces(tokens)
        predictions = []
        for prediction in decoded_text.split('###'):
            if property_names:
                property_name = property_names[line_idx]
                prediction = f'{property_name}_:_{prediction.strip()}'
            if len(prediction.strip().replace(' ', '_').split('_:_')) == 2:
                name, value = prediction.strip().replace(' ', '_').split(
                    '_:_', maxsplit=1)
                predictions.append(f'{name}={value}')
            else:
                if prediction:
                    predictions.append(prediction.strip().replace(' ', '_'))
        return line_idx, predictions

    def aggregate_by_article(
            self, predictions: Dict[int, List[str]]) -> Dict[int, List[str]]:
        preprocess_task = self.requires()['predictions'].requires(
        )['data'].requires()['prepare-data']
        with open(preprocess_task.output()['indices'].path) as index_file:
            indices = [int(line.strip()) for line in index_file]
        aggregated_predictions: DefaultDict[int, List[str]] = defaultdict(list)
        for line_idx, doc_idx in enumerate(indices):
            aggregated_predictions[doc_idx].extend(predictions[line_idx])
        return aggregated_predictions

    def remove_duplicates(self, predictions: DefaultDict[int, List[str]]):
        for idx in predictions:
            predictions[idx] = list(set(predictions[idx]))

    def read_property_names(self) -> List[str]:
        preprocess_task = self.requires()['predictions'].requires(
        )['data'].requires()['prepare-data']
        with open(
                preprocess_task.output()['property_names'].path) as name_file:
            names = [line.strip() for line in name_file]
        return names

    def run(self):
        property_names = self.read_property_names(
        ) if self.model == ModelType.T5 else None
        final_predictions = defaultdict(list)
        with open(self.input()['predictions'].path) as generated_file:
            for line in generated_file:
                if line.startswith('H-'):
                    line_idx, predictions = self.extract_predictions(
                        line, property_names)
                    final_predictions[line_idx] = predictions
        if self.model == ModelType.T5:
            final_predictions = self.aggregate_by_article(final_predictions)
        self.remove_duplicates(final_predictions)
        # if self.filter_extra_properties:
        # Keep only predictions whose property name occurs among the
        # article's input properties.
        property_names = []
        with gzip.open(
                Path('dataset') / 'wikireading-recycled' / self.subset /
                'in.tsv.gz', 'rt') as source_file:
            for line in source_file:
                property_names.append(line.strip().split('\t')[0].split(' '))
        for doc_idx in range(max(final_predictions.keys()) + 1):
            final_predictions[doc_idx] = [
                prediction for prediction in final_predictions[doc_idx]
                if prediction.split('=')[0] in property_names[doc_idx]
            ]
        with open(self.output().path, 'wt') as out_file:
            for i in range(max(final_predictions.keys()) + 1):
                out_file.write(' '.join(sorted(final_predictions[i])) + '\n')
class EvaluateModelTask(luigi.Task):
    model = luigi.EnumParameter(enum=ModelType)
    subset = luigi.ChoiceParameter(
        choices=[
            'all', 'unseen', 'rare', 'categorical', 'relational',
            'exact-match', 'long-articles'
        ],
        default='all',
        var_type=str,
    )
    split = luigi.ChoiceParameter(
        choices=['dev-0', 'test-A', 'test-B'],
        default='test-B',
        var_type=str,
    )

    def requires(self):
        if self.subset in ('exact-match', 'long-articles'):
            subset_to_generate = f'{self.split}-{self.subset}'
        else:
            subset_to_generate = self.split
        requirements = {
            'predictions': PostprocessPredictionsTask(
                model=self.model, subset=subset_to_generate),
            'dataset': DownloadDatasetTask(),
        }
        return requirements

    def output(self):
        if self.subset == 'all':
            full_subset_name = self.split
        else:
            full_subset_name = f'{self.split}-{self.subset}'
        return luigi.LocalTarget(
            Path('./results') / self.model.name / full_subset_name)

    def run(self):
        Path(self.output().path).parent.mkdir(parents=True, exist_ok=True)
        if self.subset == 'all':
            properties = None
            reference_file = Path(
                self.input()['dataset'].path) / self.split / 'expected.tsv'
        elif self.subset in ('exact-match', 'long-articles'):
            properties = None
            reference_file = Path(self.input(
            )['dataset'].path) / f'{self.split}-{self.subset}' / 'expected.tsv'
        else:
            properties = (Path(self.input()['dataset'].path) /
                          f'{self.split}-{self.subset}.properties').open()
            reference_file = Path(
                self.input()['dataset'].path) / self.split / 'expected.tsv'
        evaluate(
            prediction_file=Path(self.input()['predictions'].path).open(),
            reference_file=reference_file.open(),
            separator='=',
            output_file=Path(self.output().path).open('w'),
            metric='mean-F1',
            properties=properties,
            ignore_case=False,
        )
class DownloadModelTask(luigi.Task):
    output_path = luigi.Parameter(
        './models',
        description=
        'the path where the dataset will be downloaded and extracted')
    model = luigi.EnumParameter(enum=ModelType, description='Model type')

    def output(self):
        out = {}
        if self.model == ModelType.T5:
            out['dict'] = luigi.LocalTarget(
                Path(self.output_path) / 't5' / 'dict.txt')
            out['model'] = luigi.LocalTarget(
                Path(self.output_path) / 't5' / 't5_best.pt')
            out['sentencepiece'] = luigi.LocalTarget(
                Path(self.output_path) / 't5' / 'sentencepiece.model')
        elif self.model == ModelType.DUAL_ROBERTA_TRANSFORMER:
            out['dict'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-roberta' / 'dict.txt')
            out['model'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-roberta' /
                'roberta_best.pt')
            out['vocab.bpe'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-roberta' / 'vocab.bpe')
            out['encoder.json'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-roberta' /
                'encoder.json')
        elif self.model == ModelType.DUAL_SOURCE_TRANSFORMER:
            out['dict'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-transformer' /
                'dict.txt')
            out['model'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-transformer' /
                'vanilla_best.pt')
            out['sentencepiece'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-transformer' /
                'spm.model')
        return out

    def requires(self):
        return None

    def run(self):
        logger.info(
            f'Downloading {self.model} model dataset to {self.output_path}')
        urls = {
            ModelType.DUAL_ROBERTA_TRANSFORMER:
            'https://applica-public.s3-eu-west-1.amazonaws.com/multi-property-extraction/fairseq-models/dual-source-roberta.tar.gz',
            ModelType.DUAL_SOURCE_TRANSFORMER:
            'https://applica-public.s3-eu-west-1.amazonaws.com/multi-property-extraction/fairseq-models/dual-source-transformer.tar.gz',
            ModelType.T5:
            'https://applica-public.s3-eu-west-1.amazonaws.com/multi-property-extraction/fairseq-models/t5.tar.gz',
        }
        response = requests.get(urls[self.model], stream=True)
        obj = io.BytesIO(response.content)
        tarfile.TarFile(
            mode='r',
            fileobj=gzip.GzipFile(fileobj=obj, mode='rb')).extractall(
                self.output_path)

    def get_processor(self):
        if self.model is ModelType.DUAL_SOURCE_TRANSFORMER:
            return SentencePieceProcessor(
                self.output()['sentencepiece'].path,
                tokens_to_end=['▁###'],
                tokens_to_ignore=[],
            )
        elif self.model is ModelType.DUAL_ROBERTA_TRANSFORMER:
            return RobertaProcessor(self.output()['encoder.json'].path,
                                    self.output()['vocab.bpe'].path)
        elif self.model is ModelType.T5:
            return SentencePieceProcessor(
                path=self.output()['sentencepiece'].path,
                tokens_to_end=['▁#'],
                tokens_to_ignore=['##'],
            )
        else:
            raise Exception(f'Unsupported model type: "{self.model}".')
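# A minimal usage sketch, not part of the original source: assuming the tasks
# above live in an importable module (the name `pipeline_tasks` below is a
# placeholder), the T5 evaluation could be scheduled with luigi's local
# scheduler like this.
import luigi

from pipeline_tasks import EvaluateModelTask, ModelType  # hypothetical module

if __name__ == '__main__':
    luigi.build(
        [EvaluateModelTask(model=ModelType.T5, subset='all', split='test-B')],
        local_scheduler=True,
    )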
class SynapsePipelineTask(luigi.Task): '''The synapse pipeline task processes the synapses in a volume''' task_namespace = "ariadne_microns_pipeline" volume = VolumeParameter(description="The volume to process") output_location = luigi.Parameter( description="Directory for the segmentation .h5 file") temp_dirs = luigi.ListParameter( description="Locations for temp files") classifier_location= luigi.Parameter( description="Location for the classifier pickle file") experiment = luigi.Parameter( description="The Butterfly experiment that produced the dataset") sample = luigi.Parameter( description="The ID of the biological sample that was imaged") dataset = luigi.Parameter( description="The name of the volume that was imaged") channel = luigi.Parameter( default="raw", description="The name of the channel from which we take data") # # Optional parameters # butterfly_url = luigi.Parameter( default="http://localhost:2001/api", description="The URL for the butterfly server") block_width = luigi.IntParameter( default=2048, description="The width of a block") block_height = luigi.IntParameter( default=2048, description="The height of a block") block_depth = luigi.IntParameter( default=100, description="The depth of a block") xy_overlap = luigi.IntParameter( default=50, description="The amount of overlap between blocks in the x and y " "directions.") z_overlap = luigi.IntParameter( default=10, description="The amount of overlap between blocks in the z direction.") synapse_class_name = luigi.Parameter( default="synapse", description="The name of the synapse class in the classifier") # # FindSynapsesTask parameters # synapse_xy_erosion = luigi.IntParameter( default=4, description = "# of pixels to erode the neuron segmentation in the " "X and Y direction prior to synapse segmentation.") synapse_z_erosion = luigi.IntParameter( default=1, description = "# of pixels to erode the neuron segmentation in the " "Z direction prior to synapse segmentation.") synapse_xy_sigma = luigi.FloatParameter( description="Sigma for smoothing Gaussian for synapse segmentation " "in the x and y directions.", default=1) synapse_z_sigma = luigi.FloatParameter( description="Sigma for smoothing Gaussian for symapse segmentation " "in the z direction.", default=.5) synapse_min_size_2d = luigi.IntParameter( default=100, description="Remove isolated synapse foreground in a plane if " "less than this # of pixels") synapse_max_size_2d = luigi.IntParameter( default=15000, description = "Remove large patches of mislabeled synapse in a plane " "that have an area greater than this") synapse_min_size_3d = luigi.IntParameter( default=500, description = "Minimum size in voxels of a synapse") min_synapse_depth = luigi.IntParameter( default=5, description="Minimum acceptable size of a synapse in the Z direction") synapse_threshold = luigi.FloatParameter( description="Threshold for synapse voxels vs background voxels", default=128.) 
# # connected components parameters # joining_method = luigi.EnumParameter( enum=JoiningMethod, default=JoiningMethod.PAIRWISE_MULTIMATCH, description="Algorithm to use to join neuroproofed segmentation blocks") min_percent_connected = luigi.FloatParameter( default=75.0, description="Minimum overlap required to join segments across blocks") min_overlap_volume = luigi.IntParameter( default=1000, description="The minimum # of voxels of overlap between two objects " "required to join them across blocks") max_poly_matches = luigi.IntParameter( default=1) dont_join_orphans = luigi.BoolParameter() orphan_min_overlap_ratio = luigi.FloatParameter( default=0.9) orphan_min_overlap_volume = luigi.IntParameter( default=1000, description="The minimum # of voxels of overlap needed to join " "an orphan segment") halo_size_xy = luigi.IntParameter( default=5, description="The number of pixels on either side of the origin to " "use as context when extracting the slice to be joined, " "joining slices in the x and y directions") halo_size_z = luigi.IntParameter( default=1, description="The number of pixels on either side of the origin to " "use as context when extracting the slice to be joined, " "joining slices in the z direction") def output(self): return luigi.LocalTarget(self.output_location+".done") def run(self): with self.output().open("w") as fd: fd.write("Done") def get_dirs(self, x, y, z): '''Return a directory suited for storing a file with the given offset Create a hierarchy of directories in order to limit the number of files in any one directory. ''' return [os.path.join(temp_dir, self.experiment, self.sample, self.dataset, self.channel, str(x), str(y), str(z)) for temp_dir in self.temp_dirs] def get_pattern(self, dataset_name): return "{x:09d}_{y:09d}_{z:09d}_"+dataset_name def get_dataset_location(self, volume, dataset_name): return DatasetLocation(self.get_dirs(volume.x, volume.y, volume.z), dataset_name, self.get_pattern(dataset_name)) def requires(self): self.compute_requirements() return self.requirements def compute_requirements(self): if hasattr(self, "requirements"): return try: rh_logger.logger.report_event("Assembling pipeline") except: rh_logger.logger.start_process("Ariadne pipeline", "Assembling pipeline") # # Configuration turns off the luigi-interface logger # import logging logging.getLogger("luigi-interface").disabled = False self.task_factory = AMTaskFactory() rh_logger.logger.report_event( "Loading classifier from %s" % self.classifier_location) self.pixel_classifier = PixelClassifierTarget(self.classifier_location) self.compute_coordinates() self.compute_block_requirements() self.compute_stitching_requirements() def compute_coordinates(self): '''Compute the coordinates of the blocks''' self.n_x = int(np.ceil(float(self.volume.width) / self.block_width)) self.n_y = int(np.ceil(float(self.volume.height) / self.block_height)) self.n_z = int(np.ceil(float(self.volume.depth) / self.block_depth)) x = np.linspace(self.volume.x, self.volume.x1, self.n_x+1).astype(int) self.xs = x[:-1] self.xe = x[1:] y = np.linspace(self.volume.y, self.volume.y1, self.n_y+1).astype(int) self.ys = y[:-1] self.ye = y[1:] z = np.linspace(self.volume.z, self.volume.z1, self.n_z+1).astype(int) self.zs = z[:-1] self.ze = z[1:] def compute_block_requirements(self): self.segmentation_tasks = \ np.zeros((self.n_z, self.n_y, self.n_x), object) for zi in range(self.n_z): for yi in range(self.n_y): for xi in range(self.n_x): self.segmentation_tasks[zi, yi, xi] = \ self.compute_block_requirement(xi, yi, zi) def 
compute_block_requirement(self, xi, yi, zi): x0 = self.xs[xi] x1 = self.xe[xi] y0 = self.ys[yi] y1 = self.ye[yi] z0 = self.zs[zi] z1 = self.ze[zi] # Account for overlap if x0 != self.volume.x: x0 -= self.xy_overlap if x1 != self.volume.x1: x1 += self.xy_overlap if y0 != self.volume.y: y0 -= self.xy_overlap if y1 != self.volume.y1: y1 += self.xy_overlap if z0 != self.volume.z: z0 -= self.z_overlap if z1 != self.volume.z: z1 += self.z_overlap volume = Volume(x0, y0, z0, x1 - x0, y1 - y0, z1 - z0) # # Get the classifier input block coordinates # classifier_xpad = self.pixel_classifier.classifier.get_x_pad() classifier_ypad = self.pixel_classifier.classifier.get_y_pad() classifier_zpad = self.pixel_classifier.classifier.get_z_pad() cx0 = x0 - classifier_xpad cx1 = x1 + classifier_xpad cy0 = y0 - classifier_ypad cy1 = y1 + classifier_ypad cz0 = z0 - classifier_zpad cz1 = z1 + classifier_zpad classifier_input_volume = Volume( cx0, cy0, cz0, cx1 - cx0, cy1 - cy0, cz1 - cz0) # # The dataset locations # dl_butterfly = self.get_dataset_location(classifier_input_volume, "image") dl_synapse = self.get_dataset_location(volume, "synapse-prediction") dl_segmentation = self.get_dataset_location( volume, "synapse-segmentation") # # Pipeline flow is Butterfly -> classifier -> shim -> find synapses # btask = self.task_factory.gen_get_volume_task( experiment=self.experiment, sample=self.sample, dataset=self.dataset, channel=self.channel, url=self.butterfly_url, volume=classifier_input_volume, location=dl_butterfly) paths = self.get_dirs(x0, y0, z0) ctask = self.task_factory.gen_classify_task( paths=paths, datasets={self.synapse_class_name:"synapse-prediction"}, pattern=self.get_pattern("synapse-prediction"), img_volume=btask.volume, img_location=btask.output().dataset_location, classifier_path=self.classifier_location) ctask.set_requirement(btask) shim_task = ClassifyShimTask.make_shim( classify_task=ctask, dataset_name="synapse-prediction") find_synapses_task = self.task_factory.gen_find_synapses_task( volume=volume, syn_location=shim_task.output().dataset_location, neuron_segmentation=EMPTY_DATASET_LOCATION, erosion_xy=self.synapse_xy_erosion, erosion_z=self.synapse_z_erosion, sigma_xy=self.synapse_xy_sigma, sigma_z=self.synapse_z_sigma, threshold=self.synapse_threshold, min_size_2d=self.synapse_min_size_2d, max_size_2d=self.synapse_max_size_2d, min_size_3d=self.synapse_min_size_3d, min_slice=self.min_synapse_depth, output_location=dl_segmentation) find_synapses_task.set_requirement(shim_task) return find_synapses_task def compute_stitching_requirements(self): '''Compute the tasks needed to stitch the blocks''' # # Pipeline is # block -> # x-connections / y-connections / z-connections -> # all-connected-components -> # stitch segmentation # cc_tasks = [] # # The x-blocks # for xi in range(self.n_x-1): for yi in range(self.n_y): for zi in range(self.n_z): cc_tasks.append( self.compute_x_connected_components_task(xi, yi, zi)) # # The y-blocks # for yi in range(self.n_y-1): for xi in range(self.n_x): for yi in range(self.n_z): cc_tasks.append( self.compute_y_connected_components_task(xi, yi, zi)) # # The z-blocks # for zi in range(self.n_z-1): for xi in range(self.n_x): for yi in range(self.n_y): cc_tasks.append( self.compute_z_connected_components_task(xi, yi, zi)) # # The all-connected-components task # acc_location = os.path.join( self.get_dirs(self.xs[0], self.ys[0], self.zs[0])[0], "connectivity-graph.json") if len(cc_tasks) > 0: acc_task = self.task_factory.gen_all_connected_components_task( 
[_.output().path for _ in cc_tasks], acc_location) for task in cc_tasks: acc_task.set_requirement(task) else: # only one block - do a fake connected components seg_tgt = self.segmentation_tasks[0, 0, 0].output() acc_task = FakeAllConnectedComponentsTask( volume=seg_tgt.volume, location=seg_tgt.dataset_location, output_location=acc_location) for task in self.segmentation_tasks.flatten(): acc_task.set_requirement(task) # # The stitching task # output_location = DatasetLocation( [self.output_location], "synapse_segmentation", self.get_pattern("synapse_segmentation")) stask = self.task_factory.gen_stitch_segmentation_task( [], acc_task.output().path, self.volume, output_location) stask.x_padding = self.xy_overlap / 2 stask.y_padding = self.xy_overlap / 2 stask.z_padding = self.z_overlap / 2 stask.set_requirement(acc_task) self.requirements = [stask] def configure_connected_components_task(self, task): task.joining_method = self.joining_method task.min_overlap_percent = self.min_percent_connected task.min_overlap_volume = self.min_overlap_volume task.max_poly_matches = self.max_poly_matches task.dont_join_orphans = self.dont_join_orphans task.orphan_min_overlap_ratio = self.orphan_min_overlap_ratio task.orphan_min_overlap_volume = self.orphan_min_overlap_volume def compute_x_connected_components_task(self, xi, yi, zi): task1 = self.segmentation_tasks[zi, yi, xi] tgt1 = task1.output() task2 = self.segmentation_tasks[zi, yi, xi+1] tgt2 = task2.output() y0 = max(tgt1.volume.y, tgt2.volume.y) y1 = min(tgt1.volume.y1, tgt2.volume.y1) z0 = max(tgt1.volume.z, tgt2.volume.z) z1 = min(tgt1.volume.z1, tgt2.volume.z1) overlap_volume = Volume( (tgt1.volume.x1 + tgt2.volume.x) / 2 - self.halo_size_xy / 2, y0, z0, self.halo_size_xy, y1-y0, z1-z0) output_location = os.path.join( self.get_dirs(tgt1.x, tgt1.y, tgt1.z)[0], "connected-components-x.json") cctask = self.task_factory.gen_connected_components_task( volume1=tgt1.volume, location1=tgt1.dataset_location, volume2=tgt2.volume, location2=tgt2.dataset_location, overlap_volume=overlap_volume, output_location=output_location) self.configure_connected_components_task(cctask) cctask.set_requirement(task1) cctask.set_requirement(task2) return cctask def compute_y_connected_components_task(self, xi, yi, zi): task1 = self.segmentation_tasks[zi, yi, xi] tgt1 = task1.output() task2 = self.segmentation_tasks[zi, yi+1, xi] tgt2 = task2.output() x0 = max(tgt1.volume.x, tgt2.volume.x) x1 = min(tgt1.volume.x1, tgt2.volume.x1) z0 = max(tgt1.volume.z, tgt2.volume.z) z1 = min(tgt1.volume.z1, tgt2.volume.z1) overlap_volume = Volume( x0, (tgt1.volume.y1 + tgt2.volume.y) / 2 - self.halo_size_xy / 2, z0, x1 - x0, self.halo_size_xy, z1-z0) output_location = os.path.join( self.get_dirs(tgt1.x, tgt1.y, tgt1.z)[0], "connected-components-y.json") cctask = self.task_factory.gen_connected_components_task( volume1=tgt1.volume, location1=tgt1.dataset_location, volume2=tgt2.volume, location2=tgt2.dataset_location, overlap_volume=overlap_volume, output_location=output_location) self.configure_connected_components_task(cctask) cctask.set_requirement(task1) cctask.set_requirement(task2) return cctask def compute_z_connected_components_task(self, xi, yi, zi): task1 = self.segmentation_tasks[zi, yi, xi] tgt1 = task1.output() task2 = self.segmentation_tasks[zi+1, yi, xi] tgt2 = task2.output() x0 = max(tgt1.volume.x, tgt2.volume.x) x1 = min(tgt1.volume.x1, tgt2.volume.x1) y0 = max(tgt1.volume.y, tgt2.volume.y) y1 = min(tgt1.volume.y1, tgt2.volume.y1) overlap_volume = Volume( x0, y0, 
(tgt1.volume.z1 + tgt2.volume.z) / 2 - self.halo_size_z / 2, x1 - x0, y1 - y0, self.halo_size_z) output_location = os.path.join( self.get_dirs(tgt1.x, tgt1.y, tgt1.z)[0], "connected-components-z.json") cctask = self.task_factory.gen_connected_components_task( volume1=tgt1.volume, location1=tgt1.dataset_location, volume2=tgt2.volume, location2=tgt2.dataset_location, overlap_volume=overlap_volume, output_location=output_location) self.configure_connected_components_task(cctask) cctask.set_requirement(task1) cctask.set_requirement(task2) return cctask
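# A small worked sketch (not from the original source) of the block-grid
# scheme used by SynapsePipelineTask.compute_coordinates() above: np.linspace
# over [x, x1] with n_x + 1 points yields n_x contiguous blocks whose start
# coordinates are edges[:-1] and end coordinates are edges[1:].
import numpy as np

width, block_width, x0 = 5000, 2048, 0
n_x = int(np.ceil(float(width) / block_width))             # -> 3 blocks
edges = np.linspace(x0, x0 + width, n_x + 1).astype(int)   # [0, 1666, 3333, 5000]
xs, xe = edges[:-1], edges[1:]
print(list(zip(xs.tolist(), xe.tolist())))  # [(0, 1666), (1666, 3333), (3333, 5000)]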
class Bar(RunOnceTask):
    eparam = luigi.EnumParameter(enum=Color)
class Baz(RunOnceTask):
    eparam = luigi.EnumParameter(enum=Color)
    another_param = luigi.IntParameter()
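# Hedged illustration (the Color.RED member below is an assumption; only the
# enum name `Color` appears in the tasks above): luigi's EnumParameter is
# given by member *name* on the command line, e.g.
#
#   luigi --module <module> Baz --eparam RED --another-param 3
#
# and is parsed back to the enum member / serialized to its name.
import luigi

p = luigi.EnumParameter(enum=Color)
assert p.parse('RED') is Color.RED
assert p.serialize(Color.RED) == 'RED'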
class PerformObjectTracking2D(luigi.Task): base_name = luigi.Parameter() tracking_type = luigi.EnumParameter(enum=TrackingType) timestep_interval = luigi.ListParameter(default=[]) U_offset = luigi.ListParameter(default=[]) run_in_temp_dir = luigi.BoolParameter(default=True) def requires(self): if REGEX_INSTANTENOUS_BASENAME.match(self.base_name): raise Exception("Shouldn't pass base_name with timestep suffix" " (`.tn`) to tracking util") required_vars = uclales_2d_tracking.get_required_vars( tracking_type=self.tracking_type) return [ TimeCrossSectionSlices2D(base_name=self.base_name, var_name=var_name) for var_name in required_vars ] def run(self): meta = _get_dataset_meta_info(self.base_name) if len(self.timestep_interval) == 0: tn_start = 0 N_timesteps = { input.fn: int(input.open().time.count()) for input in self.input() } if len(set(N_timesteps.values())) == 1: tn_end = list(N_timesteps.values())[0] - 1 else: s_files = "\n\t".join([ "{fn}: {N}".format(fn=k, N=v) for (k, v) in N_timesteps.items() ]) raise Exception( "The input files required for tracking don't currently have" " the same number of timesteps, maybe some of them need" " recreating? Required files and number of timesteps:\n" f"\n\t{s_files}") else: tn_start, tn_end = self.timestep_interval if tn_start != 0: warnings.warn("There is currently a bug in the cloud-tracking " "code which causes it to crash when not starting " "at time index 0 (fortran index 1). Setting " "tn_start=0") tn_start = 0 if meta.get("no_tracking_calls", False): filename = Path(self.output().fn).name p_source = Path(meta["path"]) p_source_tracking = p_source / "tracking_output" / filename if p_source_tracking.exists(): Path(self.output().fn).parent.mkdir(parents=True, exist_ok=True) os.symlink(p_source_tracking, self.output().fn) else: raise Exception( "Automatic tracking calls have been disabled and" f" couldn't find tracking output." 
" Please run tracking utility externally and place output" f" in `{p_source_tracking}`") else: dataset_name = meta["experiment_name"] if self.run_in_temp_dir: tempdir = tempfile.TemporaryDirectory() p_data = Path(tempdir.name) # symlink the source data files to the temporary directory for input in self.input(): os.symlink( Path(input.fn).absolute(), p_data / Path(input.fn).name) fn_track = f"{dataset_name}.out.xy.track.nc" # and the file for the tracking tool to write to Path(self.output().fn).parent.mkdir(exist_ok=True, parents=True) os.symlink( Path(self.output().fn).absolute(), p_data / fn_track) else: p_data = Path(self.input()[0].fn).parent fn_tracking = uclales_2d_tracking.call( data_path=p_data, dataset_name=dataset_name, tn_start=tn_start + 1, tn_end=tn_end, tracking_type=self.tracking_type, U_offset=self.U_offset, ) if not self.run_in_temp_dir: Path(self.output().fn).parent.mkdir(exist_ok=True, parents=True) shutil.move(fn_tracking, self.output().fn) def output(self): type_id = uclales_2d_tracking.TrackingType.make_identifier( self.tracking_type) if len(self.timestep_interval) == 0: interval_id = "__all__" else: tn_start, tn_end = self.timestep_interval interval_id = "{}__{}".format(tn_start, tn_end) if self.U_offset: offset_s = "u{}_v{}_offset".format(*self.U_offset) else: offset_s = "no_offset" FN_2D_FORMAT = ("{experiment_name}.tracking.{type_id}" ".{interval_id}.{offset}.nc") meta = _get_dataset_meta_info(self.base_name) experiment_name = meta["experiment_name"] fn = FN_2D_FORMAT.format( experiment_name=experiment_name, type_id=type_id, interval_id=interval_id, offset=offset_s, ) p = get_workdir() / self.base_name / "tracking_output" / fn return XArrayTargetUCLALESTracking(str(p))
class SqlScriptTask(DBAccessTask): '''Task to run a stylized SQL script. As seen in `script_lib`, a script may require (in the luigi sense) other scripts and it is complete iff its last query says so. Running a script may be parameterized with bind params and/or Oracle sqlplus style defined `&&variables`: >>> variables = dict(I2B2STAR='I2B2DEMODATA', CMS_RIF='CMS', ... cms_source_cd='X', bene_id_source='b') >>> txform = SqlScriptTask( ... account='sqlite:///', passkey=None, ... script=Script.cms_patient_mapping, ... param_vars=variables) >>> [task.script for task in txform.requires()] ... #doctest: +ELLIPSIS [<Package(cms_keys)>] >>> txform.complete() False ''' script = cast(Script, luigi.EnumParameter(enum=Script)) param_vars = cast(Environment, luigi.DictParameter(default={})) _log = logging.getLogger('sql_scripts') # ISSUE: ambient. magic-string @property def variables(self) -> Environment: '''Defined variables for this task (or task family). ''' return self.param_vars @property def vars_for_deps(self) -> Environment: '''Defined variables to supply to dependencies. ''' return self.variables def requires(self) -> List[luigi.Task]: '''Wrap each of `self.script.deps()` in a SqlScriptTask. ''' return [ SqlScriptTask(script=s, param_vars=self.vars_for_deps, account=self.account, passkey=self.passkey, echo=self.echo) for s in self.script.deps() ] def log_info(self) -> Dict[str, Any]: '''Include script, filename in self.log_info(). ''' return dict(DBAccessTask.log_info(self), script=self.script.name, filename=self.script.fname) def complete(self) -> bool: '''Each script's last query tells whether it is complete. It should be a scalar query that returns non-zero for done and either zero or an error for not done. ''' last_query = self.last_query() params = params_used(self.complete_params(), last_query) with self.connection(event=self.task_family + ' complete query: ' + self.script.name) as conn: try: result = conn.scalar(sql_text(last_query), params) return bool(result) except DatabaseError as exc: conn.log.warning('%(event)s: %(exc)s', dict(event='complete query error', exc=exc)) return False def last_query(self) -> SQL: """ Note: In order to support run-only variables as in UploadTask, we skip statements with unbound &&variables. """ return self.script.statements(skip_unbound=True, variables=self.variables)[-1] def complete_params(self) -> Dict[str, Any]: '''Make `task_id` available to complete query as a bind param. ''' return dict(task_id=self.task_id) def run(self) -> None: '''Run each statement in the script without any bind parameters. ''' self.run_bound() def run_bound(self, script_params: Opt[Params] = None) -> None: '''Run with a (default emtpy) set of parameters bound. ''' with self.connection(event='run script') as conn: self.run_event(conn, script_params=script_params) def run_event(self, conn: LoggedConnection, run_vars: Opt[Environment] = None, script_params: Opt[Params] = None) -> int: '''Run script inside a LoggedConnection event. @param run_vars: variables to define for this run @param script_params: parameters to bind for this run @return: count of rows bulk-inserted always 0 for this class, but see UploadTask To see how a script can ignore errors, see :mod:`script_lib`. 
''' bulk_rows = 0 ignore_error = False run_params = dict(script_params or {}, task_id=self.task_id) fname = self.script.fname variables = dict(run_vars or {}, **self.variables) each_statement = self.script.each_statement(variables=variables) for line, _comment, statement in each_statement: try: if self.is_bulk(statement): bulk_rows = self.bulk_insert(conn, fname, line, statement, run_params, bulk_rows) else: ignore_error = self.execute_statement( conn, fname, line, statement, run_params, ignore_error) except DatabaseError as exc: db = self._dbtarget().engine err = SqlScriptError(exc, self.script, line, statement, str(db)) if ignore_error: conn.log.warning('%(event)s: %(error)s', dict(event='ignore', error=err)) else: raise err from None if bulk_rows > 0: conn.step.msg_parts.append(' %(rowtotal)s total rows') conn.step.argobj.update(dict(rowtotal=bulk_rows)) return self.loaded_record(conn, bulk_rows) def loaded_record(self, conn: LoggedConnection, bulk_rows: int) -> int: return bulk_rows def execute_statement(self, conn: LoggedConnection, fname: str, line: int, statement: SQL, run_params: Params, ignore_error: bool) -> bool: '''Log and execute one statement. ''' sqlerror = Script.sqlerror(statement) if sqlerror is not None: return sqlerror params = params_used(run_params, statement) self.set_status_message('%s:%s:\n%s\n%s' % (fname, line, statement, params)) conn.execute(statement, params) return ignore_error def is_bulk(self, statement: SQL) -> bool: '''always False for this class, but see UploadTask ''' return False def bulk_insert(self, conn: LoggedConnection, fname: str, line: int, statement: SQL, run_params: Params, bulk_rows: int) -> int: raise NotImplementedError( 'overriding is_bulk() requires overriding bulk_insert()')
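# Hedged illustration of the convention SqlScriptTask.complete() relies on:
# a script's *last* statement is a scalar query that returns non-zero once
# the work is done.  The table and column names below are made up for the
# example; only the :task_id bind parameter comes from complete_params() above.
example_script = """
insert into upload_status (task_id, loaded)
values (:task_id, 1);

/* final statement: the completion check evaluated by complete() */
select count(*) from upload_status
 where task_id = :task_id and loaded = 1;
"""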
class NeuroproofRunMixin: neuroproof = luigi.Parameter( description="Location of the neuroproof_graph_predict binary") neuroproof_ld_library_path = luigi.Parameter( description="Library paths to Neuroproof's shared libraries. " "This should include paths to CILK stdc++ libraries, Vigra libraries, " "JSONCPP libraries, and OpenCV libraries.") classifier_filename = luigi.Parameter( description="The Vigra random forest classifier or OpenCV random " "forest agglomeration classifier. In addition, there may be a file " "with the given filename with \"_ignore.txt\" appended which gives " "the indices of the features to ignore and similarly a file with " "\"_config.json\" appended which gives configuration information to " "neuroproof.") threshold = luigi.FloatParameter( default=0.2, description="Segmentation threshold for neuroproof") watershed_threshold = luigi.IntParameter( default=0, description="Threshold used for removing small bodies as a " "post-processing step") neuroproof_version = luigi.EnumParameter( enum=NeuroproofVersion, default=NeuroproofVersion.MIT, description="The command-line convention to be used to run the " "Neuroproof binary") def ariadne_run(self): '''Run the neuroproof subprocess''' if self.neuroproof_version == NeuroproofVersion.MINIMAL: self.run_standard() elif self.neuroproof_version == NeuroproofVersion.FLY_EM: self.run_optimized_with_copy() elif self.neuroproof_version == NeuroproofVersion.FAST: self.run_fast() else: self.run_optimized() def run_standard(self): '''Run the out-of-the-box neuroproof''' # # Write the segmentation and membrane probabilities to one # big temporary hdf5 file # prob_volume = DestVolumeReader(self.prob_loading_plan_path) seg_volume = DestVolumeReader(self.input_seg_loading_plan_path) additional_maps = \ [DestVolumeReader(_) for _ in self.additional_loading_plan_paths] h5file = tempfile.mktemp(".h5") probfile = tempfile.mktemp(".h5") rh_logger.logger.report_event("Neuroproof watershed: %s" % h5file) rh_logger.logger.report_event("Neuroproof probabilities: %s" % probfile) pool = multiprocessing.Pool(2) seg_result = pool.apply_async(write_seg_volume, args=(h5file, seg_volume, "segmentation")) duplicate = None if len(additional_maps) > 0 else False prob_result = pool.apply_async(write_prob_volume, args=(prob_volume, additional_maps, probfile, "probabilities", False, duplicate)) pool.close() pool.join() seg_result.get() prob_result.get() outfile = tempfile.mktemp(".h5") rh_logger.logger.report_event("Neuroproof output: %s" % outfile) try: args = [ self.neuroproof, "-threshold", str(self.threshold), "-algorithm", "1", "-nomito", "-min_region_sz", "0", "-watershed", h5file, "segmentation", "-prediction", probfile, "probabilities", "-output", outfile, "segmentation", "-classifier", self.classifier_filename ] rh_logger.logger.report_event(" ".join(args)) # # Inject the custom LD_LIBRARY_PATH into the subprocess environment # env = os.environ.copy() if "LD_LIBRARY_PATH" in env: ld_library_path = self.neuroproof_ld_library_path + os.pathsep +\ env["LD_LIBRARY_PATH"] else: ld_library_path = self.neuroproof_ld_library_path env["LD_LIBRARY_PATH"] = ld_library_path self.configure_env(env) # # Do the dirty deed... 
# subprocess.check_call(args, env=env, close_fds=True) # # There's an apparent bug in Neuroproof where it writes # the output to "fo.h5" for example, when you've asked it # to send the output to "foo.h5" # alt_outfile = os.path.splitext(outfile)[0][:-1] + ".h5" if (not os.path.exists(outfile)) and os.path.exists(alt_outfile): outfile = alt_outfile # # Finish the output volume # with h5py.File(outfile, "r") as fd: self.output().imwrite(fd["segmentation"][:].astype(np.uint32)) finally: os.remove(h5file) os.remove(probfile) if os.path.exists(outfile): os.remove(outfile) def run_optimized_with_copy(self): '''Run the MIT neuroproof, but copying everything''' inputs = self.input() prob_volume = inputs.next() seg_volume = inputs.next() additional_maps = list(inputs) h5file = tempfile.mktemp(".h5") probfile = tempfile.mktemp(".h5") rh_logger.logger.report_event("Neuroproof watershed: %s" % h5file) rh_logger.logger.report_event("Neuroproof probabilities: %s" % probfile) pool = multiprocessing.Pool(2) seg_result = pool.apply_async(write_seg_volume, args=(h5file, seg_volume, "stack")) prob_result = pool.apply_async(write_prob_volume, args=(prob_volume, additional_maps, probfile, "volume/predictions")) pool.close() pool.join() seg_result.get() prob_result.get() outfile = tempfile.mktemp(".h5") rh_logger.logger.report_event("Neuroproof output: %s" % outfile) try: args = [ self.neuroproof, h5file, probfile, self.classifier_filename, "--output-file", outfile, "--threshold", str(self.threshold), "--watershed-threshold", str(self.watershed_threshold) ] rh_logger.logger.report_event(" ".join(args)) # # Inject the custom LD_LIBRARY_PATH into the subprocess environment # env = os.environ.copy() if "LD_LIBRARY_PATH" in env: ld_library_path = self.neuroproof_ld_library_path + os.pathsep +\ env["LD_LIBRARY_PATH"] else: ld_library_path = self.neuroproof_ld_library_path env["LD_LIBRARY_PATH"] = ld_library_path self.configure_env(env) # # Do the dirty deed... # subprocess.check_call(args, env=env, close_fds=True) # # There's an apparent bug in Neuroproof where it writes # the output to "fo.h5" for example, when you've asked it # to send the output to "foo.h5" # alt_outfile = os.path.splitext(outfile)[0][:-1] + ".h5" if (not os.path.exists(outfile)) and os.path.exists(alt_outfile): outfile = alt_outfile # # Finish the output volume # with h5py.File(outfile, "r") as fd: self.output().imwrite(fd["stack"][:].astype(np.uint32)) finally: os.remove(h5file) os.remove(probfile) if os.path.exists(outfile): os.remove(outfile) def run_optimized(self): '''Run the MIT neuroproof''' # # The arguments for neuroproof_graph_predict: # output_target = self.output() output_target.create_directories() output = self.storage_plan # # neuroproof_graph_predict will take a .json file in place of a # prediction file. It has the following format: # # { "probabilities": [ # "<probability-loading-plan-1>", # ... # "<probability-loading-plan-N" # ] # "config": { # "invert": [ True or False per probability ], # "use-loading-plan": True, # "use-storage-plan": True # } # "watershed": "watershed-loading-plan", # "output": "output-storage-plan" } # # config is optional as are its key/value pairs. Predictably, # "invert" is False by default. 
# probabilities = \ [self.prob_loading_plan_path] + \ list(self.additional_loading_plan_paths) watershed = self.input_seg_loading_plan_path config_path = \ os.path.splitext(self.classifier_filename)[0] + "_config.json" if os.path.isfile(config_path): config = json.load(open(config_path, "r")) else: config = {} config["use-loading-plans"] = True config["use-storage-plans"] = True d = dict(config=config, probabilities=probabilities, watershed=watershed, output=output) fd, json_path = tempfile.mkstemp(".json") f = os.fdopen(fd, "w") json.dump(d, f) f.close() try: args = [ self.neuroproof, "--threshold", str(self.threshold), "--watershed-threshold", str(self.watershed_threshold), json_path, json_path, self.classifier_filename ] # # Inject the custom LD_LIBRARY_PATH into the subprocess environment # env = os.environ.copy() if "LD_LIBRARY_PATH" in env: ld_library_path = self.neuroproof_ld_library_path + os.pathsep +\ env["LD_LIBRARY_PATH"] else: ld_library_path = self.neuroproof_ld_library_path env["LD_LIBRARY_PATH"] = ld_library_path self.configure_env(env) # # Do the dirty deed... # subprocess.check_call(args, env=env, close_fds=True) # # Finish the output volume # # We collect some summary statistics here that are added to # the JSON file. # data = output_target.imread() d = json.load(open(output_target.storage_plan_path)) areas = np.bincount(data.ravel()) areas[0] = 0 labels = np.where(areas > 0)[0] areas = areas[labels] d["areas"] = areas.tolist() d["labels"] = labels.tolist() with output_target.open("w") as fd: json.dump(d, fd) finally: os.remove(json_path) def run_fast(self): '''Run using Tim Kaler's speedup + NeuroProof_plan''' # # Make the target directories for the .tif files # output_target = self.output() output_target.create_directories() arguments = [ self.neuroproof, "-watershed", self.input_seg_loading_plan_path, "-prediction", self.prob_loading_plan_path ] for path in self.additional_loading_plan_paths: arguments.append("-prediction") arguments.append(path) arguments += [ "-classifier", self.classifier_filename, "-output", self.storage_plan, "-threshold", str(self.threshold), "-algorithm", "1", "-nomito", "-min_region_sz", str(self.watershed_threshold) ] rh_logger.logger.report_event("Executing %s" % (" ".join(arguments))) # # Inject the custom LD_LIBRARY_PATH into the subprocess environment # env = os.environ.copy() if "LD_LIBRARY_PATH" in env: ld_library_path = self.neuroproof_ld_library_path + os.pathsep +\ env["LD_LIBRARY_PATH"] else: ld_library_path = self.neuroproof_ld_library_path env["LD_LIBRARY_PATH"] = ld_library_path self.configure_env(env) # # Do the dirty deed... # subprocess.check_call(arguments, env=env) # # Finish the output volume # # We collect some summary statistics here that are added to # the JSON file. # data = output_target.imread() d = json.load(open(output_target.storage_plan_path)) areas = np.bincount(data.ravel()) areas[0] = 0 labels = np.where(areas > 0)[0] areas = areas[labels] d["areas"] = areas.tolist() d["labels"] = labels.tolist() with output_target.open("w") as fd: json.dump(d, fd)
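# Hedged example of the JSON hand-off file built in run_optimized() above.
# All plan paths are placeholders; the "config" block mirrors the keys the
# method itself sets ("use-loading-plans" / "use-storage-plans") plus the
# optional "invert" list mentioned in the comment block.
import json

example_neuroproof_config = {
    "probabilities": [
        "/plans/membrane.loading.plan",         # hypothetical loading plans
        "/plans/additional-map.loading.plan",
    ],
    "config": {
        "invert": [False, False],
        "use-loading-plans": True,
        "use-storage-plans": True,
    },
    "watershed": "/plans/segmentation.loading.plan",
    "output": "/plans/neuroproof.storage.plan",
}
print(json.dumps(example_neuroproof_config, indent=2))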
class Backtest(luigi.Task): trade_fn = luigi.EnumParameter( enum=TradeFunction, default=TradeFunction.b_cross ) rand_seed = luigi.IntParameter( default=random.randrange(sys.maxsize) ) def requires(self): return [BollingerBands()] def output(self): if self._is_stochastic(): return luigi.LocalTarget( config.data_dir + "trading/backtest_{}/{:06d}.csv".format( self.trade_fn, self.rand_seed ) ) else: return luigi.LocalTarget( config.data_dir + "trading/backtest_{}.csv".format( self.trade_fn ) ) def _is_stochastic(self): """Return true if selected trade_fn is stochastic""" return self.trade_fn in [TradeFunction.random] def run(self): random.seed(self.rand_seed) # Read input print(self.input()[0]["bollinger"].path) dta = pd.read_csv( self.input()[0]["bollinger"].path, usecols=[ 'Date(UTC)', 'Value', 'EMA', 'STD', 'Upper Band', 'Lower Band' ], parse_dates=['Date(UTC)'], converters={ 'Value': float, 'EMA': float, # 'STD': float, # 'Upper Band': float, # 'Lower Band': float, }, ) assets_dta = self._get_backtest_assets(dta) # Write to CSV assets_dta.to_csv(self.output().path, index=False) def _get_backtest_assets(self, price_dta, skip_first_n=1): """ parameters: ----------- skip_first_n : int allows calculations to catch up. must be >= 1 """ assert skip_first_n > 0 wallet = Wallet() assets = [{ **wallet.asset_dict(), 'date_time': price_dta['Date(UTC)'][0], 'netHoldings': 0, 'trade': 0, }] # Iterate over all rows, adding trade data # trades = pd.DataFrame(columns=['date_time', 'price', 'trade']) for index, row in price_dta.iterrows(): if index < skip_first_n: continue # implied else trade_amt = trade_function_map[self.trade_fn.value]( price=row['Value'], bollinger_lower=row['Lower Band'], bollinger_upper=row['Upper Band'], max_trade=floor(assets[-1]['btc']*0.5), eth_btc_ratio=(1+assets[-1]['eth']) / (1+assets[-1]['btc']), ) # if (trade_amt != 0): # trades = trades.append({ # "price": row['Value'], # "trade": trade_amt # }, ignore_index=True) # else: # trades = trades.append([ # row['Date(UTC)'], # row['Value'], # 0 # ]) # trades.to_csv(self.output()["trades"].path, index=False) trade_penalty = .05 if trade_amt != 0: wallet.trade( {'btc': - trade_amt}, {'eth': - trade_amt / row['Value'] * (1 - trade_penalty)}, ) assets.append({ "date_time": row['Date(UTC)'], **wallet.asset_dict(), "trade": trade_amt }) # Convert values to exchange currency assets[-1]['eth'] = assets[-1]['eth'] * row['Value'] # Calculate net value of holdings assets[-1]['netHoldings'] = ( assets[-1]['btc'] + assets[-1]['eth'] ) # Convert List to DataFrame return pd.DataFrame(assets)
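# Hedged sketch of the call signature Backtest._get_backtest_assets() expects
# from entries in trade_function_map.  The function name and the band-crossing
# rule are illustrative only, and the sign convention (positive = spend BTC to
# buy ETH) is an assumption, not taken from the original code.
def example_band_cross_trade(price, bollinger_lower, bollinger_upper,
                             max_trade, eth_btc_ratio):
    if price < bollinger_lower:
        return max_trade      # price below the lower band: buy
    if price > bollinger_upper:
        return -max_trade     # price above the upper band: sell
    return 0                  # inside the bands: hold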
class DerivedLabels2D(luigi.Task):
    """
    Produce 2D label array at a specific time from tracked objects with
    specific properties (these conditions are implemented for each
    `label_type`)
    """

    label_type = luigi.Parameter(default="newlyformed_singlecore_clouds")
    base_name = luigi.Parameter()
    time = NumpyDatetimeParameter()
    tracking_type = luigi.EnumParameter(enum=TrackingType)
    tracking_timestep_interval = luigi.ListParameter(default=[])
    offset_labels_by_gal_transform = luigi.BoolParameter(default=False)
    track_without_gal_transform = luigi.BoolParameter(default=False)

    def requires(self):
        tasks = {}
        kws = dict(
            base_name=self.base_name,
            time=self.time,
            tracking_type=self.tracking_type,
            offset_labels_by_gal_transform=self.offset_labels_by_gal_transform,
            track_without_gal_transform=self.track_without_gal_transform,
            tracking_timestep_interval=self.tracking_timestep_interval,
        )

        if self.label_type == "newlyformed_singlecore_clouds":
            # `kws` is a dict of keyword arguments, so it must be unpacked
            # with ** (unpacking it with * would pass the keys positionally)
            tasks["labels"] = TrackingLabels2D(
                label_var="cloud",
                **kws,
            )
            tasks["object_type"] = TrackingVariable2D(
                var_name="object_type",
                **kws,
            )
            tasks["object_age"] = TrackingVariable2D(
                var_name="object_age",
                **kws,
            )

        return tasks

    def run(self):
        if self.label_type == "newlyformed_singlecore_clouds":
            da_labels = self.input()["labels"].open().fillna(0).astype(int)
            raise NotImplementedError(da_labels)

        da_labels_filtered = None

        Path(self.output().fn).parent.mkdir(exist_ok=True, parents=True)
        da_labels_filtered.to_netcdf(self.output().fn)

    def output(self):
        type_id = uclales_2d_tracking.TrackingType.make_identifier(
            self.tracking_type)

        if self.tracking_timestep_interval:
            interval_id = "tn{}_to_tn{}".format(
                *self.tracking_timestep_interval)
        else:
            interval_id = "__all__"

        name_parts = [
            self.var_name,
            f"of_{self.label_var}",
            f"tracked_{type_id}",
            interval_id,
            self.time.isoformat(),
        ]

        if self.dx:
            name_parts.insert(1, f"{self.dx}_{self.op}")
        else:
            name_parts.insert(1, self.op)

        if self.offset_labels_by_gal_transform:
            meta = _get_dataset_meta_info(self.base_name)
            u_gal, v_gal = meta["U_gal"]
            name_parts.append(f"go_labels_{u_gal}_{v_gal}")

        if self.track_without_gal_transform:
            name_parts.append("go_track")

        fn = f"{'.'.join(name_parts)}.nc"

        p = get_workdir(
        ) / self.base_name / "cross_sections" / "aggregated" / fn
        return XArrayTarget(str(p))
class MergePredictionsPipeline(luigi.Task): task_namespace="ariadne_microns_pipeline" operation = luigi.EnumParameter( enum=MergeOperation, default=MergeOperation.Average, description="The operation to perform") invert = luigi.BoolParameter( description="Subtract the result from the maximum allowed value") connectivity_graph_path = luigi.Parameter( description="The location of the connectivity graph") input_dataset_names = luigi.ListParameter( description="The dataset names of the inputs to be merged.") output_dataset_name = luigi.Parameter( description="The dataset name of the outputs to be generated.") index_file_name = luigi.Parameter( description="The name of the index file containing the ouput " "dataset's loading and storage plans") def output(self): return luigi.LocalTarget(self.index_file_name) def requires(self): if not hasattr(self, "requirements"): try: rh_logger.logger.start_process( "MergePredictions", "starting", []) except: pass self.compute_requirements() return self.requirements def compute_requirements(self): self.cg = ConnectivityGraph.load(open(self.connectivity_graph_path)) # # Find the loading plans of the input channels # rh_logger.logger.report_event("Finding input channel loading plans") self.find_input_channel_loading_plans() # # Find the storage plans of the input channels. These get used # to write loading plans that match the storage plans and to # write storage plans for the output channel. # rh_logger.logger.report_event("Finding input channel storage plans") self.find_input_channel_storage_plans() # # Write the loading plans for the input channels # rh_logger.logger.report_event("Writing input channel loading plans") self.write_input_channel_loading_plans() # # Write the storage plans for the output channel # rh_logger.logger.report_event("Writing output channel storage plans") self.write_output_channel_storage_plans() # # Write the loading plans for the output channel # rh_logger.logger.report_event("Write output channel loading plans") self.write_output_loading_plans() # # Make the needed tasks # self.requirements = self.make_merge_tasks() def find_input_channel_loading_plans(self): self.input_channel_lps = dict( [(channel, {}) for channel in self.input_dataset_names]) # # We get the input channel loading plans from the # Neuroproof loading plans by hacking their names # for volume, location in self.cg.locations.items(): location_dir = os.path.dirname(location) paths = glob.glob(os.path.join(location_dir, "*.loading.plan")) for channel in self.input_dataset_names: for path in paths: if os.path.split(path)[1].startswith(channel): self.input_channel_lps[channel][volume] = path def find_input_channel_storage_plans(self): self.input_channel_sps = dict( [(channel, {}) for channel in self.input_dataset_names]) # # We enumerate all the storage plans in each loading plan # for channel in self.input_dataset_names: d = self.input_channel_sps[channel] for volume, lp in self.input_channel_lps[channel].items(): for sp in DestVolumeReader(lp).get_source_targets(): d[to_hashable(sp.volume)] = sp.storage_plan_path def write_input_channel_loading_plans(self): '''Write loading plans that mirror the input channel storage plans''' self.input_channel_block_lps = dict( [(channel, {}) for channel in self.input_dataset_names]) for channel in self.input_dataset_names: d = self.input_channel_block_lps[channel] for volume, sp in self.input_channel_sps[channel].items(): sp_dir = os.path.dirname(sp) lp_path = os.path.join( sp_dir, "%s_%d-%d_%d-%d_%d-%d.loading_plan" % (channel, 
volume.x, volume.x1, volume.y, volume.y1, volume.z, volume.z1)) d[volume] = lp_path storage_plan = SrcVolumeTarget(sp) storage_plan.write_loading_plan(lp_path) def write_output_channel_storage_plans(self): '''Write a storage plan for each block to be merged''' self.output_channel_storage_plans = {} # # Copy channel 0's storage plan # ch0 = self.input_dataset_names[0] for volume, sp in self.input_channel_sps[ch0].items(): spd = json.load(open(sp)) sp_dir, sp0_file = os.path.split(sp) sp_file = "%s_%d-%d_%d-%d_%d-%d.storage.plan" % ( self.output_dataset_name, spd["x"], spd["x"] + spd["dimensions"][2], spd["y"], spd["y"] + spd["dimensions"][1], spd["z"], spd["z"] + spd["dimensions"][0]) sp_path = os.path.join(sp_dir, sp_file) spd["dataset_name"] = self.output_dataset_name blocks = spd["blocks"] spd["blocks"] = [] for v, tif_path in blocks: tif_file = "%s_%d-%d_%d-%d_%d-%d.tif" % ( self.output_dataset_name, v["x"], v["x"] + v["width"], v["y"], v["y"] + v["height"], v["z"], v["z"] + v["depth"]) tif_path = os.path.join(os.path.dirname(tif_path), tif_file) spd["blocks"].append((v, tif_path)) json.dump(spd, open(sp_path, "w")) self.output_channel_storage_plans[volume] = sp_path def write_output_loading_plans(self): '''Write loading plans for the output channel based on the input lps''' self.output_channel_loading_plans = {} # # Copy channel 0's loading plans # ch0 = self.input_dataset_names[0] for volume, lp in self.input_channel_lps[ch0].items(): lpd = json.load(open(lp)) lp_dir, lp0_file = os.path.split(lp) lp_file = "%s_%d-%d_%d-%d_%d-%d.loading.plan" % ( self.output_dataset_name, lpd["x"], lpd["x"] + lpd["dimensions"][2], lpd["y"], lpd["y"] + lpd["dimensions"][1], lpd["z"], lpd["z"] + lpd["dimensions"][0]) lp_path = os.path.join(lp_dir, lp_file) lpd["dataset_name"] = self.output_dataset_name blocks = lpd["blocks"] lpd["blocks"] = [] for tif_path, v in blocks: tif_file = "%s_%d-%d_%d-%d_%d-%d.tif" % ( self.output_dataset_name, v["x"], v["x"] + v["width"], v["y"], v["y"] + v["height"], v["z"], v["z"] + v["depth"]) tif_path = os.path.join(os.path.dirname(tif_path), tif_file) lpd["blocks"].append((tif_path, v)) json.dump(lpd, open(lp_path, "w")) self.output_channel_loading_plans[volume] = lp_path def make_merge_tasks(self): '''Make one merge task per block''' tasks = [] for volume, sp in self.output_channel_storage_plans.items(): lps = [self.input_channel_block_lps[channel][volume] for channel in self.input_dataset_names] task = MergePredictionsTask( storage_plan=sp, loading_plans=lps, operation=self.operation, invert=self.invert) tasks.append(task) return tasks def run(self): '''Make an index file with the details of the run''' d = dict(output_loading_plans=[], output_storage_plans=[]) lists = [] for channel in self.input_dataset_names: cd = d[channel] = {} for name in ("input_channel_loading_plans", "input_channel_block_loading_plans", "input_channel_storage_plans"): d1 = cd[name] = [] d2 = getattr(self, name)[channel] lists.append((d1, d2)) lists.append(d["output_channel_loading_plans"], self.output_channel_loading_plans) lists.append(d["output_channel_storage_plans"], self.output_channel_storage_plans) for l1, d2 in lists: for v, path in d2.items(): l1.append((to_json_serializable(v), path)) with self.output().open("w") as fd: json.dump(d, fd)
class LinkwaglOutputs(luigi.Task):

    """
    Link all the multifile outputs from wagl into a single file.
    """

    level1 = luigi.Parameter()
    work_root = luigi.Parameter()
    granule = luigi.OptionalParameter(default='')
    acq_parser_hint = luigi.OptionalParameter(default='')
    workflow = luigi.EnumParameter(enum=Workflow)
    vertices = luigi.TupleParameter(default=(5, 5))
    pixel_quality = luigi.BoolParameter()
    method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)
    dsm_fname = luigi.Parameter(significant=False)
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)

    def requires(self):
        container = acquisitions(self.level1, self.acq_parser_hint)
        for group in container.supported_groups:
            kwargs = {
                'level1': self.level1,
                'work_root': self.work_root,
                'granule': self.granule,
                'group': group,
                'workflow': self.workflow,
                'vertices': self.vertices,
                'pixel_quality': self.pixel_quality,
                'method': self.method,
                'dsm_fname': self.dsm_fname,
                'buffer_distance': self.buffer_distance
            }
            yield DataStandardisation(**kwargs)

    def output(self):
        out_fname = pjoin(dirname(self.work_root),
                          '{}.h5'.format(self.granule))
        return luigi.LocalTarget(out_fname)

    def run(self):
        with self.output().temporary_path() as out_fname:
            for root, _, files in os.walk(self.work_root):
                # skip any private files
                if basename(root)[0] == '_':
                    continue

                for file_ in files:
                    if splitext(file_)[1] == '.h5':
                        fname = pjoin(root, file_)
                        grp_name = basename(
                            dirname(fname.replace(self.work_root, '')))

                        with h5py.File(fname, 'r') as fid:
                            groups = [g for g in fid]

                        for pth in groups:
                            new_path = ppjoin(self.granule, grp_name, pth)
                            create_external_link(fname, pth, out_fname,
                                                 new_path)

            # open in append mode (the historical h5py default) to record the
            # source level1 uri on the linked file
            with h5py.File(out_fname, 'a') as fid:
                fid.attrs['level1_uri'] = self.level1
class WriteTp5(luigi.Task):

    """Output the `tp5` formatted files."""

    level1 = luigi.Parameter()
    work_root = luigi.Parameter(significant=False)
    granule = luigi.OptionalParameter(default='')
    vertices = luigi.TupleParameter()
    acq_parser_hint = luigi.OptionalParameter(default='')
    workflow = luigi.EnumParameter(enum=Workflow)
    base_dir = luigi.Parameter(default='_atmospherics', significant=False)
    compression = luigi.EnumParameter(enum=H5CompressionFilter,
                                      default=H5CompressionFilter.LZF,
                                      significant=False)
    filter_opts = luigi.DictParameter(default=None, significant=False)

    def requires(self):
        container = acquisitions(self.level1, self.acq_parser_hint)
        tasks = {}

        tasks['ancillary'] = AncillaryData(self.level1, self.work_root,
                                           self.granule, self.vertices,
                                           self.workflow)

        for group in container.supported_groups:
            args = [self.level1, self.work_root, self.granule, group]
            tsks = {
                'sat_sol': CalculateSatelliteAndSolarGrids(*args),
                'lon_lat': CalculateLonLatGrids(*args)
            }
            tasks[group] = tsks

        return tasks

    def output(self):
        out_fname = pjoin(self.work_root, 'atmospheric-inputs.h5')
        return luigi.LocalTarget(out_fname)

    def run(self):
        container = acquisitions(self.level1, self.acq_parser_hint)
        acqs, group = container.get_highest_resolution(granule=self.granule)

        # output filename format
        output_fmt = pjoin(POINT_FMT, ALBEDO_FMT,
                           ''.join([POINT_ALBEDO_FMT, '.tp5']))

        # input filenames
        ancillary_fname = self.input()['ancillary'].path
        sat_sol_fname = self.input()[group]['sat_sol'].path
        lon_lat_fname = self.input()[group]['lon_lat'].path

        with self.output().temporary_path() as out_fname:
            tp5_data = _format_tp5(acqs, sat_sol_fname, lon_lat_fname,
                                   ancillary_fname, out_fname, self.workflow)

            # keep this as an indented block, that way the target will remain
            # atomic and be moved upon closing
            for key in tp5_data:
                point, albedo = key
                tp5_fname = output_fmt.format(p=point, a=albedo.value)
                target = pjoin(dirname(out_fname), self.base_dir, tp5_fname)
                with luigi.LocalTarget(target).open('w') as src:
                    src.writelines(tp5_data[key])
class FindSeedsRunMixin(DatasetMixin): dimensionality = luigi.EnumParameter( enum=Dimensionality, description="Whether to find seeds in each 2D plane or in the " "volume as a whole") method = luigi.EnumParameter( enum=SeedsMethodEnum, description="The algorithm used to find seeds") sigma_xy = luigi.FloatParameter( description= "The sigma of the smoothing Gaussian in the x & y directions", default=3) sigma_z = luigi.FloatParameter( description="The sigma of the smoothing Gaussian in the z direction", default=.4) threshold = luigi.FloatParameter( description="The intensity threshold cutoff for the seeds", default=1) minimum_distance_xy = luigi.FloatParameter( default=5, description="The minimum distance allowed between seeds") minimum_distance_z = luigi.FloatParameter( default=1.5, description="The minimum distance allowed between seed in the z dir") structuring_element = luigi.EnumParameter( enum=Shape, default=Shape.Cube, description="The shape of the structuring element." " Ellipsoid is slower, but honors the distances." " Cube is faster, but excludes due to extrema at the corners of " "the cube") distance_threshold = luigi.FloatParameter( default=20, description="The distance threshold cutoff for the seeds in nm") # # Parameters for block management of the distance threshold calculation # xy_nm = luigi.FloatParameter( default=4.0, description="Size of a voxel in the X and Y direction") z_nm = luigi.FloatParameter( default=30.0, description="Size of a voxel in the Z direction") dt_xy_overlap = luigi.IntParameter( default=40, description="Overlap between distance transform blocks in the x and y " "directions") dt_z_overlap = luigi.IntParameter( default=5, description="Overlap between distance transform blocks in the z " "direction") dt_xy_block_size = luigi.IntParameter( default=512, description="Block size in the x and y directions for the distance " "transform.") dt_z_block_size = luigi.IntParameter( default=40, description="Block size in the z direction for the distance transform") dt_n_cpus = luigi.IntParameter( default=4, description="Number of CPUs to use when computing the distance " "transform") def make_strel(self): '''make the structuring element for the minimum distance''' if self.structuring_element == Shape.Cube: return np.ones([ int(np.floor(_) * 2 + 1) for _ in self.minimum_distance_z, self.minimum_distance_xy, self.minimum_distance_xy ], bool) ixy = int(np.floor(self.minimum_distance_xy)) iz = int(np.floor(self.minimum_distance_z)) z, y, x = np.mgrid[-iz:iz + 1, -ixy:ixy + 1, -ixy:ixy + 1].astype(np.float32) strel = ((z / self.minimum_distance_z)**2 + (y / self.minimum_distance_xy)**2 + (x / self.minimum_distance_xy)**2) <= 1 return strel def find_using_2d_smoothing(self, probs): '''Find seeds in each plane, smoothing, then thresholding :param probs: the probability volume ''' offset = 0 seeds = [] for plane in probs.astype(np.float32): smoothed = gaussian_filter(plane.astype(np.float32), self.sigma_xy) size = self.minimum_distance_xy eroded = grey_erosion(smoothed, size) thresholded = (smoothed < self.threshold) & (smoothed == eroded) labels, count = label(thresholded) labels[labels != 0] += offset offset += count seeds.append(labels) return np.array(seeds) def find_using_3d_smoothing(self, probs): '''Find seeds after smoothing and thresholding :param probs: the probability volume ''' sigma = (self.sigma_z, self.sigma_xy, self.sigma_xy) smoothed = gaussian_filter(probs.astype(np.float32), sigma) eroded = grey_erosion(smoothed, footprint=self.make_strel()) thresholded = 
(smoothed < self.threshold) & (smoothed == eroded) labels, count = label(thresholded) rh_logger.logger.report_event("Found %d seeds" % count) return labels def find_using_2d_distance(self, probs): '''Find seeds in each plane by distance transform :param probs: the probability volume ''' offset = 0 seeds = [] for plane in probs.astype(np.float32): thresholded = plane < self.threshold distance = distance_transform_edt(thresholded) dilated = grey_dilation(distance, size=self.minimum_distance_xy) mask = (distance == dilated) & (distance >= self.distance_threshold) labels, count = label(mask) labels[labels != 0] += offset offset += count seeds.append(labels) return np.array(seeds) def find_using_3d_distance(self, probs): distance = [] thresholded = probs < self.threshold distance = parallel_distance_transform(thresholded, self.xy_nm, self.z_nm, self.dt_xy_overlap, self.dt_z_overlap, self.dt_xy_block_size, self.dt_z_block_size, self.dt_n_cpus) dilated = grey_dilation(distance, footprint=self.make_strel()) mask = (distance == dilated) & (distance >= self.distance_threshold) labels, count = label(mask) rh_logger.logger.report_event("Found %d seeds" % count) return labels def ariadne_run(self): prob_target = DestVolumeReader(self.prob_loading_plan_path) probs = prob_target.imread() if self.method == SeedsMethodEnum.Smoothing: if self.dimensionality == Dimensionality.D2: seeds = self.find_using_2d_smoothing(probs) else: seeds = self.find_using_3d_smoothing(probs) else: if self.dimensionality == Dimensionality.D2: seeds = self.find_using_2d_distance(probs) else: seeds = self.find_using_3d_distance(probs) seeds = seeds.astype(np.uint32) self.output().imwrite(seeds)
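# Toy check (not part of the original code) of the ellipsoidal structuring
# element built in FindSeedsRunMixin.make_strel() above, using
# minimum_distance_xy=2 and minimum_distance_z=1: the mask is True where
# (z/1)^2 + (y/2)^2 + (x/2)^2 <= 1.
import numpy as np

ixy, iz = 2, 1
z, y, x = np.mgrid[-iz:iz + 1, -ixy:ixy + 1, -ixy:ixy + 1].astype(np.float32)
strel = (z / 1.0) ** 2 + (y / 2.0) ** 2 + (x / 2.0) ** 2 <= 1
print(strel.shape)               # (3, 5, 5)
print(strel[1].astype(int))      # central plane: a filled disc of radius 2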