class FindMergesBase(luigi.Task):
    task_name = 'find_merges'
    src_file = os.path.abspath(__file__)
    allow_retry = False

    path = luigi.Parameter()
    key = luigi.Parameter()
    out_path = luigi.Parameter()
    clear_ids = luigi.ListParameter()
    min_overlap = luigi.IntParameter()
    dependency = luigi.TaskParameter()

    def requires(self):
        return self.dependency

    def run_impl(self):
        # get the global config and init configs
        shebang = self.global_config_values()[0]
        self.init(shebang)

        # load the task config
        config = self.get_task_config()
        config.update({'path': self.path, 'key': self.key,
                       'clear_ids': self.clear_ids, 'out_path': self.out_path,
                       'min_overlap': self.min_overlap})

        # prime and run the jobs
        n_jobs = 1
        self.prepare_jobs(n_jobs, None, config)
        self.submit_jobs(n_jobs)

        # wait till jobs finish and check for job success
        self.wait_for_jobs()
        self.check_jobs(n_jobs)
class NeuroproofLearnTaskMixin:
    prob_loading_plan_path = luigi.Parameter(
        description="Location of the probability prediction volume")
    additional_locations = luigi.ListParameter(
        default=[],
        description="Additional probability map locations for Neuroproof")
    seg_loading_plan_path = luigi.Parameter(
        description="Location of the pipeline's watershed segmentation")
    gt_loading_plan_path = luigi.Parameter(
        description="Location of the ground truth segmentation")
    output_location = luigi.Parameter(
        description="Location for the classifier file. Use an .xml extension "
        "to use the OpenCV random forest classifier. Use an .h5 extension "
        "to use the Vigra random forest classifier")

    def input(self):
        loading_plans = [self.prob_loading_plan_path,
                         self.seg_loading_plan_path,
                         self.gt_loading_plan_path] + \
            list(self.additional_locations)
        for loading_plan in loading_plans:
            for tgt in DestVolumeReader(loading_plan).get_source_targets():
                yield tgt

    def output(self):
        return luigi.LocalTarget(self.output_location)
class ScheduledExternalProgramTask(ExternalProgramTask):
    """
    Variant of :class:`luigi.contrib.external_program.ExternalProgramTask`
    that executes the task with a :class:`Scheduler`.
    """
    scheduler = luigi.ChoiceParameter(
        default=cfg.scheduler,
        choices=['local'] + [blurb for blurb in _schedulers],
        positional=False, significant=False,
        description='Scheduler to use for running the task')
    scheduler_partition = luigi.OptionalParameter(
        default=cfg.scheduler_partition, positional=False, significant=False,
        description='Scheduler partition (or queue) to use if supported')
    scheduler_extra_args = luigi.ListParameter(
        default=cfg.scheduler_extra_args, positional=False, significant=False,
        description='Extra arguments to pass to the scheduler')

    walltime = luigi.TimeDeltaParameter(
        default=datetime.timedelta(), positional=False, significant=False,
        description='Amount of time to allocate for the task; the default '
                    'value of zero implies unlimited time')
    cpus = luigi.IntParameter(
        default=1, positional=False, significant=False,
        description='Number of CPUs to allocate for the task')
    memory = luigi.FloatParameter(
        default=1, positional=False, significant=False,
        description='Amount of memory (in gigabytes) to allocate for the task')

    def __init__(self, *args, **kwargs):
        super(ScheduledExternalProgramTask, self).__init__(*args, **kwargs)
        try:
            if self.scheduler != 'local':
                self._scheduler = _schedulers[self.scheduler]
        except KeyError:
            raise ValueError('Unsupported scheduler {}'.format(self.scheduler))

    @property
    def resources(self):
        if self.scheduler == 'local':
            # local_jobs is actually constrained by the number of workers
            return {'cpus': self.cpus, 'memory': self.memory}
        else:
            return {'{}_jobs'.format(self.scheduler): 1}

    def run(self):
        if self.scheduler == 'local':
            return super(ScheduledExternalProgramTask, self).run()
        else:
            return self._scheduler.run_task(self)
class BootstrapSpokeAsTask(tasks.PuppetTask):
    puppet_account_id = luigi.Parameter()
    account_id = luigi.Parameter()
    iam_role_arns = luigi.ListParameter()
    role_name = luigi.Parameter()
    permission_boundary = luigi.Parameter()
    puppet_role_name = luigi.Parameter()
    puppet_role_path = luigi.Parameter()

    def params_for_results_display(self):
        return {
            "puppet_account_id": self.puppet_account_id,
            "account_id": self.account_id,
        }

    def run(self):
        partition = config.get_partition()
        iam_role_arns_to_use = [
            iam_role_arn for iam_role_arn in self.iam_role_arns
        ]
        iam_role_arns_to_use.append(
            f"arn:{partition}:iam::{self.account_id}:role/{self.role_name}")
        sdk.bootstrap_spoke_as(
            self.puppet_account_id,
            iam_role_arns_to_use,
            self.permission_boundary,
            self.puppet_role_name,
            self.puppet_role_path,
        )
        self.write_output(self.params_for_results_display())
class DeleteStoragePlan(RunMixin, RequiresMixin, luigi.Task):
    '''a task to delete a storage plan's .tif files'''

    task_namespace = "ariadne_microns_pipeline"

    dependency_outputs = luigi.ListParameter(
        default=[],
        description="The outputs of this task's dependencies. The task "
        "requests these as inputs so that all of them must be present "
        "before the storage plan is deleted.")
    storage_plan_path = luigi.Parameter(description="Storage plan to delete")

    def input(self):
        yield SrcVolumeTarget(self.storage_plan_path)
        for dependency_output in self.dependency_outputs:
            yield luigi.LocalTarget(dependency_output)

    def output(self):
        return luigi.LocalTarget(
            SrcVolumeTarget.storage_plan_path_to_deleted_file(
                self.storage_plan_path))

    def ariadne_run(self):
        # the first input is the storage plan target itself; remove it
        next(self.input()).remove()
        with self.output().open("w") as fd:
            fd.write("So sorry.\n")
class StringTieMerge(SlurmExecutableTask, CheckTargetNonEmpty):
    lib_list = luigi.ListParameter()
    output_prefix = luigi.Parameter()
    library = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set the SLURM request params for this task
        self.mem = 2000
        self.n_cpu = 4
        self.partition = "nbi-medium"

    def requires(self):
        return [self.clone(StringTie, library=lib) for lib in self.lib_list]

    def output(self):
        return LocalTarget(os.path.join(self.base_dir, VERSION, PIPELINE,
                                        self.output_prefix, 'stringtie.gtf'))

    def work_script(self):
        self.temp = TemporaryFile()
        return '''#!/bin/bash
                  source stringtie-1.3.0;
                  set -euo pipefail

                  echo '{input}' > {temp}
                  stringtie -p {n_cpu} --merge {temp} > {output}.temp
                  mv {output}.temp {output}
                  '''.format(input="\n".join([x.path for x in self.input()]),
                             output=self.output().path,
                             temp=self.temp.path,
                             n_cpu=self.n_cpu)
class CreateBwaIndices(FtarcTask):
    fa_path = luigi.Parameter()
    bwa = luigi.Parameter(default='bwa')
    use_bwa_mem2 = luigi.BoolParameter(default=False)
    add_index_args = luigi.ListParameter(default=list())
    sh_config = luigi.DictParameter(default=dict())
    priority = 100

    def output(self):
        return [
            luigi.LocalTarget(f'{self.fa_path}.{s}') for s in (
                ['0123', 'amb', 'ann', 'pac', 'bwt.2bit.64']
                if self.use_bwa_mem2 else
                ['pac', 'bwt', 'ann', 'amb', 'sa']
            )
        ]

    def run(self):
        fa = Path(self.fa_path)
        run_id = fa.stem
        self.print_log(f'Create BWA indices:\t{run_id}')
        self.setup_shell(run_id=run_id, commands=self.bwa, cwd=fa.parent,
                         **self.sh_config)
        self.run_shell(
            args=(f'set -e && {self.bwa} index'
                  + ''.join(f' {a}' for a in self.add_index_args)
                  + f' {fa}'),
            input_files_or_dirs=fa,
            output_files_or_dirs=[o.path for o in self.output()])
class CybersourceDataValidationTask(WarehouseMixin, luigi.WrapperTask):
    import_date = luigi.DateParameter()
    cybersource_merchant_ids = luigi.ListParameter(
        config_path={'section': 'payment', 'name': 'cybersource_merchant_ids'},
    )

    def requires(self):
        config = get_config()
        for merchant_id in self.cybersource_merchant_ids:
            section_name = 'cybersource:' + merchant_id
            interval_start = luigi.DateParameter().parse(
                config.get(section_name, 'interval_start'))
            interval_end = self.import_date

            merchant_close_date = config.get(section_name, 'merchant_close_date', '')
            if merchant_close_date:
                parsed_date = luigi.DateParameter().parse(merchant_close_date)
                interval_end = min(self.import_date, parsed_date)

            cybersource_interval = date_interval.Custom(interval_start, interval_end)

            for date in cybersource_interval:
                filename = "cybersource_{}.tsv".format(merchant_id)
                url = url_path_join(self.warehouse_path, 'payments',
                                    'dt=' + date.isoformat(), filename)
                yield ExternalURL(url=url)
class SmaliList(luigi.Task):
    pkg = luigi.Parameter()
    apks = luigi.ListParameter(significant=False)

    def requires(self):
        return [ApiExtractorRun(file_name=fn, pkg=self.pkg) for fn in self.apks]

    def output(self):
        output_file = os.path.join(cfg.soot_smalilist_folder, self.pkg,
                                   self.pkg + ".json")
        return ExternalFileTarget(output_file)

    def run(self):
        app_smalilist = {}
        for i in self.input():
            with i['loc'].open() as data_file:
                _, ver, _ = commons().get_apk_data(i['loc'].path)
                smali = json.load(data_file)
                # cast to set to remove duplicates
                app_smalilist[ver] = list(set(smali))

        with self.output().open('w') as data_file:
            json.dump(app_smalilist, data_file, indent=2)
class TestDockerBuildBase(DockerBuildBase):
    goals = luigi.ListParameter([])

    def get_goal_class_map(self) -> Dict[str, DockerAnalyzeImageTask]:
        goal_class_map = {
            "test-analyze-image-1": self.create_child_task(
                task_class=TestDockerBuildBaseTestAnalyzeImage,
                task_name="test-analyze-image-1"),
            "test-analyze-image-2": self.create_child_task(
                TestDockerBuildBaseTestAnalyzeImage,
                task_name="test-analyze-image-2")
        }
        return goal_class_map

    def get_default_goals(self) -> Set[str]:
        goals = {"test-analyze-image-1"}
        return goals

    def get_goals(self):
        return self.goals

    def run_task(self):
        build_tasks = self.create_build_tasks(False)
        image_infos_futures = yield from self.run_dependencies(build_tasks)
        image_infos = self.get_values_from_futures(image_infos_futures)
        self.return_object(image_infos)
class PostImportDatabaseTask(SchemaManagementTask):
    """
    Task needed to run after importing database into warehouse.
    """
    # Override the standard roles here since these tables will be rather raw.
    # We may want to restrict access to a subset of users.
    roles = luigi.ListParameter(
        config_path={
            'section': 'vertica-export',
            'name': 'business_intelligence_team_roles'
        },
    )

    @property
    def queries(self):
        return [
            "DROP SCHEMA IF EXISTS {schema} CASCADE;".format(schema=self.schema),
            "ALTER SCHEMA {schema_loading} RENAME TO {schema};".format(
                schema_loading=self.schema_loading, schema=self.schema),
            "GRANT USAGE ON SCHEMA {schema} TO {roles};".format(
                schema=self.schema, roles=self.vertica_roles),
            "GRANT SELECT ON ALL TABLES IN SCHEMA {schema} TO {roles};".format(
                schema=self.schema, roles=self.vertica_roles),
        ]

    @property
    def marker_name(self):
        return 'post_database_import_{schema}_{date}'.format(
            schema=self.schema, date=self.date.strftime('%Y-%m-%d'))
class lvl1(luigi.Task):
    files = luigi.ListParameter()
    outputpath = ""

    def __init__(self, *args, **kwargs):
        super(lvl1, self).__init__(*args, **kwargs)
        self.conf = initConf()
        self.preparePaths()

    @basicLoggerDecorator(pipelineLogger)
    def preparePaths(self):
        self.outputpath = self.conf['lvl1_outputpath']

    @basicLoggerDecorator(pipelineLogger)
    def requires(self):
        return []

    @basicLoggerDecorator(pipelineLogger)
    def output(self):
        return luigi.LocalTarget(self.outputpath)

    @basicLoggerDecorator(pipelineLogger)
    def run(self):
        with self.output().open('w') as f:
            dakd.extract.prepareFiles(self.files, f)
class RGBAnnotationMapImage(luigi.Task):
    input_path = luigi.Parameter()
    rgb_components = luigi.ListParameter(default=[0, 1, 2])
    src_data_path = luigi.Parameter()
    render_tiles = luigi.BoolParameter(default=False)

    def make_plot(self, da_emb):
        return make_rgb_annotation_map_image(
            da=da_emb,
            rgb_components=self.rgb_components,
            dataset_path=self.src_data_path,
        )

    def run(self):
        da_emb = xr.open_dataarray(self.input_path)
        fig, axes = self.make_plot(da_emb=da_emb)
        Path(self.output().fn).parent.mkdir(exist_ok=True, parents=True)
        fig.savefig(self.output().fn, bbox_inches="tight")

    def output(self):
        image_fullpath = Path(self.input_path)
        src_path, src_fn = image_fullpath.parent, image_fullpath.name

        fn_out = src_fn.replace(
            ".nc",
            ".rgb_map.{}__comp.png".format(
                "_".join([str(v) for v in self.rgb_components])),
        )

        p = Path(src_path) / fn_out
        return luigi.LocalTarget(str(p))
class PreprocessingPipeline(luigi.Task):
    datasets = luigi.ListParameter(description="Names of the datasets to use")
    export_csv = luigi.BoolParameter(
        description="If specified, exports spectra as csv files",
        significant=False,
        visibility=luigi.parameter.ParameterVisibility.HIDDEN)
    pool_size = luigi.IntParameter(
        default=os.cpu_count() or 1,
        description='Size of parallel pool to use for computations. Choose '
                    'carefully to not exceed the memory.',
        significant=False,
        visibility=luigi.parameter.ParameterVisibility.HIDDEN)

    def requires(self):
        for dataset in self.datasets:
            yield AssembleMetadata(dataset=dataset, pool_size=self.pool_size)
        for dataset in self.datasets:
            yield MergeDataset(dataset=dataset, datasets=self.datasets,
                               pool_size=self.pool_size)
        if self.export_csv:
            for dataset in self.datasets:
                yield ExportCsv(dataset=dataset, datasets=self.datasets,
                                pool_size=self.pool_size)
class NeuralNetworkClassificatorTask(ClassificatorTask):
    _name = 'neural_network'

    solver = luigi.Parameter(default='lbfgs')
    activation = luigi.Parameter(default='relu')
    hidden_layer_sizes = luigi.ListParameter()
    batch_size = luigi.Parameter(default='auto')

    def build_and_train(self, x, y):
        from sklearn.neural_network import MLPClassifier
        from sklearn.multiclass import OneVsRestClassifier

        lx, ly, lz = x.shape
        self.meta.update({
            "word_vec_size": lz,
            "len_words": ly,
        })
        x_train = x.reshape(lx, ly * lz)

        model = MLPClassifier(
            hidden_layer_sizes=list(self.hidden_layer_sizes),
            solver=self.solver,
            activation=self.activation,
            batch_size='auto' if self.batch_size == 'auto' else int(self.batch_size)
        )
        if self.ovr_strategy:
            model = OneVsRestClassifier(model)

        model.fit(x_train, y)
        score = model.score(x_train, y)
        self.meta['score'] = score
        self.set_status_message(f'Model fit complete. Score {score}')
        return model
class UploadFilesToAzureAndRecord(luigi.Task):
    """
    Just copies the result of the parent task into the task DB
    """
    part_id = luigi.Parameter()
    path_list = luigi.ListParameter()

    task_namespace = 'azure'

    def requires(self):
        return UploadFilesToAzure(self.part_id, self.path_list)

    def run(self):
        # Record output in DB too:
        for item in self.path_list:
            tr = UploadToAzure(item).output()
            if not tr.exists():
                tr.touch()

        # All done, so log this task as complete:
        self.output().touch()

    def output(self):
        """If this all works, record success in the DB"""
        return taskdb_target('azure_upload_set', '%s UPLOADED' % self.part_id)
class MergeBam(SlurmExecutableTask, CheckTargetNonEmpty):
    base_dir = luigi.Parameter(significant=False)
    scratch_dir = luigi.Parameter(default="/tgac/scratch/buntingd/", significant=False)
    lib_list = luigi.ListParameter()
    output_prefix = luigi.Parameter()
    library = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set the SLURM request params for this task
        self.mem = 16000
        self.n_cpu = 3
        self.partition = "nbi-medium"

    def requires(self):
        return [self.clone(Library.MarkDuplicates, library=lib) for lib in self.lib_list]

    def output(self):
        return LocalTarget(os.path.join(self.scratch_dir, VERSION, PIPELINE,
                                        self.output_prefix, 'merged.bam'))

    def work_script(self):
        self.temp = TemporaryFile()
        return '''#!/bin/bash
                  source samtools-1.3;
                  set -euo pipefail

                  echo '{input}' > {temp}
                  samtools merge -f {output}.temp.bam -b {temp} --threads 2
                  mv {output}.temp.bam {output}
                  '''.format(input="\n".join([x.path for x in self.input()]),
                             output=self.output().path,
                             temp=self.temp.path)
class DownloadArticle(luigi.Task):
    lexology_urls = luigi.ListParameter()  # luigi parameter

    def run(self):
        for lexology_url in self.lexology_urls:
            r = requests.get(lexology_url)
            html = r.text
            soup = BeautifulSoup(html, 'html.parser')
            article_header = soup.select('h4')

            article_link_list = []
            for links in article_header:
                article_links = links.find_all('a', href=True)
                for link in article_links:
                    article_link = link.get('href')
                    article_link_list.append(article_link)
            # print(article_link_list)

            all_text = []
            for article_link in article_link_list:
                try:
                    text = get_article(article_link)
                    all_text.append(text)
                except Exception as e:
                    print(f'error in {article_link}, {e}')

            df = pd.DataFrame(all_text,
                              columns=['article_title', 'article_date', 'text'])
            df.to_pickle(self.output().path)

    def output(self):
        return luigi.LocalTarget('data/raw/raw_articles.pkl')
class PairwiseDistanceWorkflow(WorkflowBase):
    input_path = luigi.Parameter()
    input_key = luigi.Parameter()
    morphology_path = luigi.Parameter()
    morphology_key = luigi.Parameter()
    output_path = luigi.Parameter()
    max_distance = luigi.FloatParameter()
    resolution = luigi.ListParameter()
    max_size = luigi.IntParameter(default=None)

    def requires(self):
        distance_task = getattr(distance_tasks,
                                self._get_task_name('ObjectDistances'))
        dep = distance_task(tmp_folder=self.tmp_folder,
                            max_jobs=self.max_jobs,
                            config_dir=self.config_dir,
                            input_path=self.input_path,
                            input_key=self.input_key,
                            morphology_path=self.morphology_path,
                            morphology_key=self.morphology_key,
                            max_distance=self.max_distance,
                            resolution=self.resolution,
                            max_size=self.max_size)
        dep = MergePairwiseDistances(tmp_folder=self.tmp_folder,
                                     max_jobs=self.max_jobs,
                                     output_path=self.output_path,
                                     dependency=dep)
        return dep

    @staticmethod
    def get_config():
        configs = super(PairwiseDistanceWorkflow,
                        PairwiseDistanceWorkflow).get_config()
        configs.update({'object_distances':
                        distance_tasks.ObjectDistancesLocal.default_task_config()})
        return configs
class MergeSampleBams(luigi.Task):
    """Merge Multiple Bam Files for One Sample and Coverage Statistics

    Attributes:
        inbams (list): a list of bam files
        outbam (str): output bam filename

    Output:
        - {outdir}/mapping/{sample}.merged.bam
    """
    resources = {"cpu": 2, "memory": 1}
    outbam = luigi.Parameter()
    inbams = luigi.ListParameter()

    def requires(self):
        return []

    def output(self):
        return luigi.LocalTarget(self.outbam)

    def run(self):
        cmd = """samtools merge - {bams} | tee {outfile} \
              | samtools index - {outfile}.bai """.format(
            bams=" ".join(self.inbams),
            outfile=self.outbam,
        )
        logging.info(cmd)
        subprocess.run(cmd, shell=True)
class CreateSequenceDictionary(FtarcTask):
    fa_path = luigi.Parameter()
    gatk = luigi.Parameter(default='gatk')
    add_createsequencedictionary_args = luigi.ListParameter(default=list())
    n_cpu = luigi.IntParameter(default=1)
    memory_mb = luigi.FloatParameter(default=4096)
    sh_config = luigi.DictParameter(default=dict())
    priority = 70

    def output(self):
        fa = Path(self.fa_path).resolve()
        return luigi.LocalTarget(fa.parent.joinpath(f'{fa.stem}.dict'))

    def run(self):
        run_id = Path(self.fa_path).stem
        self.print_log(f'Create a sequence dictionary:\t{run_id}')
        fa = Path(self.fa_path).resolve()
        seq_dict_path = self.output().path
        self.setup_shell(
            run_id=run_id, commands=self.gatk, cwd=fa.parent, **self.sh_config,
            env={
                'JAVA_TOOL_OPTIONS': self.generate_gatk_java_options(
                    n_cpu=self.n_cpu, memory_mb=self.memory_mb)
            })
        self.run_shell(
            args=(f'set -e && {self.gatk} CreateSequenceDictionary'
                  + f' --REFERENCE {fa}'
                  + ''.join(f' {a}' for a in self.add_createsequencedictionary_args)
                  + f' --OUTPUT {seq_dict_path}'),
            input_files_or_dirs=fa,
            output_files_or_dirs=seq_dict_path)
class Freebayes(luigi.Task):
    """Freebayes Calling Variants

    Attributes:
        inbam (list): input dedup bam file(s)
        outvcf (str): output vcf file
    """
    resources = {"cpu": 1, "memory": 1}
    inbam = luigi.ListParameter()
    outvcf = luigi.Parameter()

    def requires(self):
        return []

    def output(self):
        return luigi.LocalTarget(self.outvcf)

    def run(self):
        cmd = "freebayes -f {genome} {bam} > {outfile}".format(
            genome=Reference().genome,
            bam=" ".join(self.inbam),
            outfile=self.outvcf
        )
        # bcftools filter -e 'QUAL < 20' -s LOWQUAL {rawvcf} {outvcf}
        logging.info(cmd)
        subprocess.run(cmd, shell=True)
class WideRecommender(ClassifierWithTransferLearningKerasModelTraining):
    input_shape: Tuple[int, int] = luigi.TupleParameter(default=(100,))
    batch_size: int = luigi.IntParameter(default=10)
    learning_rate = luigi.FloatParameter(default=1e-5)
    dense_layers: List[int] = luigi.ListParameter(default=[512, 512])
    dropout: float = luigi.FloatParameter(default=None)
    activation_function: str = luigi.ChoiceParameter(
        choices=KERAS_ACTIVATION_FUNCTIONS.keys(), default="relu")
    kernel_initializer: str = luigi.ChoiceParameter(
        choices=KERAS_WEIGHT_INIT.keys(), default="glorot_uniform")

    def create_base_model(self) -> Model:
        x_input = Input(shape=self.input_shape, name='wide_inp')
        wide = Dense(self.input_shape[0],
                     activation=self.activation_function,
                     kernel_initializer=self.kernel_initializer,
                     name='wide_mlp')(x_input)
        output = Dense(1, activation='sigmoid',
                       kernel_initializer=self.kernel_initializer)(wide)
        model = Model(x_input, output, name='Wide')
        return model

    def create_model_with(self, base_model: Model) -> Model:
        return base_model
class Mpileup(luigi.Task):
    """Bcftools Mpileup calling variants

    Attributes:
        inbam (list): input dedup bam file(s)
        outvcf (str): output vcf file
    """
    resources = {"cpu": 1, "memory": 1}
    inbam = luigi.ListParameter()
    outvcf = luigi.Parameter()

    def requires(self):
        return []

    def output(self):
        return luigi.LocalTarget(self.outvcf)

    def run(self):
        cmd = """bcftools mpileup -f {genome} {bam} \
              | bcftools call -mv --ploidy {ploidy} -o {outvcf}""".format(
            bam=" ".join(self.inbam),
            outvcf=self.outvcf,
            genome=Reference().genome,
            ploidy=Reference().genome_version
        )
        logging.info(cmd)
        subprocess.run(cmd, shell=True)
class AbyssSealerReduced(CheckTargetNonEmpty, SlurmExecutableTask):
    sealer_klist = luigi.ListParameter()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Set the SLURM request params for this task
        self.mem = 6000
        self.n_cpu = 2
        self.partition = "nbi-medium"

    def output(self):
        return LocalTarget(os.path.join(self.base_dir, PIPELINE, VERSION,
                                        "sealer", "SOAP", 'K' + str(self.K),
                                        'K' + str(self.K) + "_scaffold.fa"))

    def requires(self):
        return {'bloomfilters': [self.clone(Assemble.AbyssBloomBuild, bloom_k=k)
                                 for k in self.sealer_klist],
                'scaffolds': self.clone(SOAPNremap)}

    def work_script(self):
        return '''#!/bin/bash
                  source abyss-2.0.2;
                  mkdir -p {output}/temp
                  set -euo pipefail

                  abyss-sealer {k_args} -P25 --flank-length=150 -j {n_cpu} -o {output}/temp/{prefix} -S {scaffolds} {bloomfilters}

                  mv {output}/temp/{prefix}* {output}/
                  '''.format(k_args=' '.join(['-k' + str(k) for k in self.sealer_klist]),
                             bloomfilters=' '.join(['-i ' + x.path for x in self.input()['bloomfilters']]),
                             scaffolds=self.input()['scaffolds'].path,
                             n_cpu=self.n_cpu,
                             output=os.path.dirname(self.output().path),
                             prefix='K' + str(self.K))
class _Tracking2DExtraction(luigi.Task):
    """
    Base task for extracting fields from object tracking in 2D. This should
    never be called directly. Instead use either TrackingVariable2D or
    TrackingLabels2D
    """
    base_name = luigi.Parameter()
    track_without_gal_transform = luigi.BoolParameter(default=False)
    tracking_type = luigi.EnumParameter(enum=TrackingType)
    tracking_timestep_interval = luigi.ListParameter(default=[])

    def requires(self):
        U_tracking_offset = None
        if self.track_without_gal_transform:
            meta = _get_dataset_meta_info(self.base_name)
            U_tracking_offset = meta.get("U_gal", None)
            if U_tracking_offset is None:
                raise Exception(
                    "To remove the Galilean transformation before tracking"
                    " please define the transform velocity"
                    " as `U_gal` in datasources.yaml for"
                    " dataset `{}`".format(self.base_name))

        return PerformObjectTracking2D(
            base_name=self.base_name,
            tracking_type=self.tracking_type,
            timestep_interval=self.tracking_timestep_interval,
            U_offset=U_tracking_offset,
        )
class RunFeatureUnion(PickleTask):
    dataset = luigi.Parameter()
    levels = luigi.ListParameter()
    sig_type = luigi.Parameter(default='logsig')

    def output(self):
        levels_name = '_'.join(map(str, self.levels))
        filename = f"{self.sig_type}_concat_{levels_name}.pkl"
        return luigi.LocalTarget(PIPELINE_DIR / self.dataset / filename)

    def run(self):
        X_train, y_train, X_test, y_test = load_data(self.dataset)
        logit = LogisticRegression(random_state=42)

        r = []
        for level in self.levels:
            m = classifiers.create_concatenator(logit, sig_type=self.sig_type,
                                                level=level)
            # start timing
            start = timeit.default_timer()
            m.fit(X_train, y_train)
            elapsed = timeit.default_timer() - start
            # end timing
            r.append([m.score(X_test, y_test), elapsed])

        self.dump(
            pd.DataFrame(r, columns=["Score", "Elapsed"], index=self.levels))
class MultiSampleWorkflow(sl.WorkflowTask):
    """
    This workflow is meant to take an entire dataset description and run
    the SingleSampleWorkflow on each sample
    """
    midas_db = sl.Parameter()
    dataset_description = sl.Parameter()
    workdir = sl.Parameter()
    contaminant_removal_method = sl.Parameter(default="bbsplit")
    filter_genomes = luigi.ListParameter()
    ref_info_dir = sl.Parameter()
    ref_combo_hash = sl.Parameter()

    def workflow(self):
        # dataset_description is expected to be the path to a JSON file
        with open(self.dataset_description) as fh:
            dataset_spec = json.load(fh)

        tasks = []
        if len(self.filter_genomes) > 0:
            index_task = self.new_task("ref_index", CreateIndexForContamRemoval)
            tasks.append(index_task)

        # Samples are in an array in the json. Each sample has a prefix and two read files
        for sample in dataset_spec["samples"]:
            wf = self.new_task('SampleWorkflow_' + sample["prefix"],
                               SingleSampleWorkflow,
                               workdir=self.workdir,
                               prefix=sample["prefix"],
                               in_fastq1=sample["in_fastq1"],
                               in_fastq2=sample["in_fastq2"],
                               midas_db=self.midas_db,
                               filter_genomes=self.filter_genomes,
                               ref_info_dir=self.ref_info_dir,
                               ref_combo_hash=self.ref_combo_hash)
            tasks.append(wf)

        return tasks
class AggregateArtists(luigi.Task):
    months = luigi.ListParameter()
    user_id = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget(
            "output/artist_streams_{}_{}.csv".format(
                self.user_id, "-".join(self.months)
            ),
            format=luigi.format.Nop,
        )

    def requires(self):
        return [Streams(month, self.user_id) for month in self.months]

    def run(self):
        main_df = None
        for t in self.input():
            with t.open("r") as in_file:
                df = pd.read_csv(in_file)
                if main_df is None:
                    main_df = df
                else:
                    main_df = main_df.append(df)

        counts = (
            main_df.groupby("Artist Name")
            .count()["Apple Id Number"]
            .sort_values(ascending=False)
            .rename("Count")
        )

        with self.output().open("w") as out_file:
            counts.to_csv(out_file)
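# A minimal usage sketch (not part of the original project): luigi.ListParameter
# values are parsed from JSON on the command line, so a task such as
# AggregateArtists above can be invoked either programmatically or via the CLI.
# The month strings and user id below are hypothetical placeholders.
import luigi

if __name__ == "__main__":
    luigi.build(
        [AggregateArtists(months=["2023-01", "2023-02"], user_id="user-123")],
        local_scheduler=True,
    )
    # Equivalent CLI call (the list parameter is passed as a JSON array):
    #   luigi --module <module> AggregateArtists \
    #       --months '["2023-01", "2023-02"]' --user-id user-123 --local-scheduler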
class B(PipeTask):
    """ B required by A """
    int_array = luigi.ListParameter(default=None)

    def pipe_run(self):
        print("B saving type [{}]".format(type(self.int_array)))
        return self.int_array
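# A minimal standalone sketch (assumption: standard luigi parameter handling,
# not taken from the pipeline that defines B above): ListParameter normalizes
# incoming lists to tuples, which is why B.pipe_run reports the parameter's
# type; with default=None and no value supplied, the attribute stays None.
# _ListParamDemo is a hypothetical task used only for illustration.
import luigi

class _ListParamDemo(luigi.Task):
    values = luigi.ListParameter(default=None)

    def run(self):
        # Prints <class 'tuple'> when a list is supplied, <class 'NoneType'> otherwise.
        print(type(self.values))

# e.g. _ListParamDemo(values=[1, 2, 3]).values == (1, 2, 3)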