class WideRecommender(ClassifierWithTransferLearningKerasModelTraining):
    """Keras model with one hidden dense layer and a single sigmoid output.

    The hidden layer has ``input_shape[0]`` units. ``dense_layers`` and
    ``dropout`` are declared as parameters but are not read by the methods
    defined in this class.
    """

    input_shape: Tuple[int, int] = luigi.TupleParameter(default=(100, ))
    batch_size: int = luigi.IntParameter(default=10)
    learning_rate = luigi.FloatParameter(default=1e-5)
    dense_layers: List[int] = luigi.ListParameter(default=[512, 512])
    dropout: float = luigi.FloatParameter(default=None)
    activation_function: str = luigi.ChoiceParameter(
        choices=KERAS_ACTIVATION_FUNCTIONS.keys(), default="relu")
    kernel_initializer: str = luigi.ChoiceParameter(
        choices=KERAS_WEIGHT_INIT.keys(), default="glorot_uniform")

    def create_base_model(self) -> Model:
        """Build the input -> dense -> sigmoid graph and wrap it in a Model."""
        inputs = Input(shape=self.input_shape, name='wide_inp')

        # Hidden layer is as wide as the first input dimension.
        hidden = Dense(
            self.input_shape[0],
            activation=self.activation_function,
            kernel_initializer=self.kernel_initializer,
            name='wide_mlp',
        )(inputs)

        prediction = Dense(
            1,
            activation='sigmoid',
            kernel_initializer=self.kernel_initializer,
        )(hidden)

        return Model(inputs, prediction, name='Wide')

    def create_model_with(self, base_model: Model) -> Model:
        """Return ``base_model`` unchanged."""
        return base_model
class MLPClassifier(ClassifierWithTransferLearningKerasModelTraining):
    """Multi-layer perceptron with a single sigmoid output unit.

    One dense layer is created per entry of ``dense_layers``; the output
    layer uses the Keras default kernel initializer.
    """

    input_shape: Tuple[int, int] = luigi.TupleParameter(default=(100, ))
    batch_size: int = luigi.IntParameter(default=10)
    learning_rate = luigi.FloatParameter(default=1e-5)
    dense_layers: List[int] = luigi.ListParameter(default=[512, 512])
    dropout: float = luigi.FloatParameter(default=None)
    activation_function: str = luigi.ChoiceParameter(
        choices=KERAS_ACTIVATION_FUNCTIONS.keys(), default="relu")
    kernel_initializer: str = luigi.ChoiceParameter(
        choices=KERAS_WEIGHT_INIT.keys(), default="glorot_uniform")

    def create_base_model(self) -> Model:
        """Stack the configured dense layers and return the resulting Model."""

        def hidden_layer(units):
            # Every hidden layer shares the same activation and initializer.
            return Dense(units,
                         activation=self.activation_function,
                         kernel_initializer=self.kernel_initializer)

        inputs = Input(shape=self.input_shape)
        hidden = hidden_layer(self.dense_layers[0])(inputs)

        # NOTE(review): dropout is applied after each layer except the first
        # one - confirm that skipping the first layer is intended.
        for units in self.dense_layers[1:]:
            hidden = hidden_layer(units)(hidden)
            if self.dropout:
                hidden = Dropout(self.dropout)(hidden)

        prediction = Dense(1, activation='sigmoid')(hidden)
        return Model(inputs, prediction, name='BaseMLP')

    def create_model_with(self, base_model: Model) -> Model:
        """Return ``base_model`` unchanged."""
        return base_model
class SeqrMTToESTask(HailElasticSearchTask):
    """Export the matrix table produced by ``SeqrVCFToMTTask`` to Elasticsearch.

    Completion is tracked with an ``_EXPORTED_TO_ES`` marker file written
    under ``dest_path`` once the export has finished.
    """
    source_paths = luigi.Parameter(default="[]", description='Path or list of paths of VCFs to be loaded.')
    dest_path = luigi.Parameter(description='Path to write the matrix table.')
    genome_version = luigi.Parameter(description='Reference Genome Version (37 or 38)')
    vep_runner = luigi.ChoiceParameter(choices=['VEP', 'DUMMY'], default='VEP', description='Choice of which vep runner to annotate vep.')
    reference_ht_path = luigi.Parameter(default=None, description='Path to the Hail table storing the reference variants.')
    clinvar_ht_path = luigi.Parameter(default=None, description='Path to the Hail table storing the clinvar variants.')
    hgmd_ht_path = luigi.Parameter(default=None, description='Path to the Hail table storing the hgmd variants.')
    sample_type = luigi.ChoiceParameter(default="WES", choices=['WGS', 'WES'], description='Sample type, WGS or WES')
    dont_validate = luigi.BoolParameter(description='Disable checking whether the dataset matches the specified '
                                                    'genome version and WGS vs. WES sample type.')
    dataset_type = luigi.ChoiceParameter(choices=['VARIANTS', 'SV'], default='VARIANTS', description='VARIANTS or SV.')
    remap_path = luigi.OptionalParameter(default=None, description="Path to a tsv file with two columns: s and seqr_id.")
    subset_path = luigi.OptionalParameter(default=None, description="Path to a tsv file with one column of sample IDs: s.")
    vep_config_json_path = luigi.OptionalParameter(default=None, description="Path of hail vep config .json file")

    def __init__(self, *args, **kwargs):
        """Inject ``source_path`` and derive the completion-marker path."""
        # TODO: instead of hardcoded index, generate from project_guid, etc.
        # NOTE(review): this line runs before super().__init__(), so
        # self.dest_path presumably still resolves to the class-level
        # parameter declaration rather than the supplied value - confirm
        # that source_path receives what the parent class expects.
        kwargs['source_path'] = self.dest_path
        super().__init__(*args, **kwargs)

        self.completed_marker_path = os.path.join(self.dest_path, '_EXPORTED_TO_ES')

    def requires(self):
        """Require the VCF-to-matrix-table task, forwarding every loading parameter."""
        return [SeqrVCFToMTTask(
            source_paths=self.source_paths,
            dest_path=self.dest_path,
            genome_version=self.genome_version,
            vep_runner=self.vep_runner,
            reference_ht_path=self.reference_ht_path,
            clinvar_ht_path=self.clinvar_ht_path,
            hgmd_ht_path=self.hgmd_ht_path,
            sample_type=self.sample_type,
            dont_validate=self.dont_validate,
            dataset_type=self.dataset_type,
            remap_path=self.remap_path,
            subset_path=self.subset_path,
            vep_config_json_path=self.vep_config_json_path,
        )]

    def output(self):
        """Return the target for the ``_EXPORTED_TO_ES`` marker file."""
        # TODO: Use https://luigi.readthedocs.io/en/stable/api/luigi.contrib.esindex.html.
        return GCSorLocalTarget(filename=self.completed_marker_path)

    def complete(self):
        """Report completion based on the existence of the marker file."""
        # Complete is called by Luigi to check if the task is done and will skip if it is.
        # By default it checks to see that the output exists, but we want to check for the
        # _EXPORTED_TO_ES file to make sure it was not terminated halfway.
        return GCSorLocalTarget(filename=self.completed_marker_path).exists()

    def run(self):
        """Import the matrix table, export its rows, then write the marker."""
        mt = self.import_mt()
        row_table = SeqrVariantsAndGenotypesSchema.elasticsearch_row(mt)
        self.export_table_to_elasticsearch(row_table, self._mt_num_shards(mt))

        # The marker is written only after the export call has returned.
        with hl.hadoop_open(self.completed_marker_path, "w") as f:
            f.write(".")

        self.cleanup()
class mapReadsToGenome(luigi.Task):
    """Schedule one ``alignReads`` task per sample and log completion.

    Sample names are read from ``sample_list/pe_samples.lst`` or
    ``sample_list/se_samples.lst`` under the current working directory,
    depending on ``read_library_type``.
    """

    # Global Parameters
    project_name = luigi.Parameter(default="RNASeqAnalysis")
    read_library_type = GlobalParameter().read_library_type
    adapter = luigi.Parameter(default="./tasks/utility/adapters.fasta.gz")
    genome_name = GlobalParameter().genome_name
    organism_domain = GlobalParameter().organism_domain
    threads = GlobalParameter().threads
    maxMemory = GlobalParameter().maxMemory

    # Local Parameters
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    rnaseq_aligner = luigi.ChoiceParameter(
        choices=["subread", "star", "hisat2", "dart", "segemehl", "bowtie2"],
        var_type=str)
    annotation_file_type = luigi.ChoiceParameter(choices=["GFF", "GTF"],
                                                 var_type=str)

    def requires(self):
        """Return one ``alignReads`` task per line of the sample list.

        Returns ``None`` when ``read_library_type`` is neither ``"pe"`` nor
        ``"se"``.
        """
        sample_lists = {"pe": "pe_samples.lst", "se": "se_samples.lst"}
        list_name = sample_lists.get(self.read_library_type)
        if list_name is None:
            return None

        list_path = os.path.join(os.getcwd(), "sample_list", list_name)
        # Read inside a context manager so the file handle is always closed.
        with open(list_path) as sample_list:
            sample_names = [line.strip() for line in sample_list]

        return [
            alignReads(pre_process_reads=self.pre_process_reads,
                       annotation_file_type=self.annotation_file_type,
                       rnaseq_aligner=self.rnaseq_aligner,
                       sampleName=name)
            for name in sample_names
        ]

    def output(self):
        """Return a log-file target whose name embeds the current time.

        The timestamp is recomputed on every call, so calls made in
        different seconds return different targets.
        """
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget(
            os.path.join(
                os.getcwd(), "task_logs",
                'task.align.read.to.genome.complete.{t}'.format(t=timestamp)))

    def run(self):
        """Write the completion message to the log target."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write('Read Alignment finished at {t}'.format(t=timestamp))
class alignmentFreeQuant(luigi.Task):
    """Schedule one ``transQuant`` task per sample and log completion.

    Sample names are read from ``sample_list/pe_samples.lst`` or
    ``sample_list/se_samples.lst`` under the current working directory,
    depending on ``read_library_type``.
    """

    project_name = luigi.Parameter(default="RNASeqAnalysis")
    organism_domain = GlobalParameter().organism_domain
    genome_name = GlobalParameter().genome_name
    read_library_type = GlobalParameter().read_library_type
    adapter = GlobalParameter().adapter
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    annotation_file_type = luigi.ChoiceParameter(choices=["GFF", "GTF", "NA"],
                                                 var_type=str)
    quant_method = luigi.ChoiceParameter(choices=["salmon", "kallisto"],
                                         var_type=str)

    def requires(self):
        """Return one ``transQuant`` task per line of the sample list.

        Returns ``None`` when ``read_library_type`` is neither ``"pe"`` nor
        ``"se"``.
        """
        sample_lists = {"pe": "pe_samples.lst", "se": "se_samples.lst"}
        list_name = sample_lists.get(self.read_library_type)
        if list_name is None:
            return None

        list_path = os.path.join(os.getcwd(), "sample_list", list_name)
        # Read inside a context manager so the file handle is always closed.
        with open(list_path) as sample_list:
            sample_names = [line.strip() for line in sample_list]

        return [
            transQuant(quant_method=self.quant_method,
                       pre_process_reads=self.pre_process_reads,
                       annotation_file_type=self.annotation_file_type,
                       sampleName=name)
            for name in sample_names
        ]

    def output(self):
        """Return a log-file target whose name embeds the current time.

        The timestamp is recomputed on every call, so calls made in
        different seconds return different targets.
        """
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget(
            os.path.join(
                os.getcwd(), "task_logs",
                'task.generate.transcript.count.complete.{t}'.format(
                    t=timestamp)))

    def run(self):
        """Write the completion message to the log target."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write('Generate Transcript Count finished at {t}'.format(
                t=timestamp))
class alignmentBasedQuant(luigi.Task):
    """Wrap a ``featureCounts`` run and write a timestamped completion log.

    ``annotation_file_type`` is declared here but is not forwarded to
    ``featureCounts`` by ``requires``.
    """

    project_name = luigi.Parameter(default="RNASeqAnalysis")
    read_library_type = GlobalParameter().read_library_type
    threads = GlobalParameter().threads
    genome_name = GlobalParameter().genome_name
    adapter = GlobalParameter().adapter
    organism_domain = GlobalParameter().domain
    feature_type = GlobalParameter().feature_type
    annotation_file_type = GlobalParameter().annotation_suffix
    rnaseq_aligner = luigi.ChoiceParameter(
        choices=["subread", "star", "hisat2", "dart", "segemehl", "bowtie2"],
        var_type=str)
    attribute_type = luigi.Parameter(
        default="gene_id",
        description='''Specify attribute type in GTF annotation. string(=[gene_id])''')
    strandType = luigi.ChoiceParameter(
        default="0",
        choices=['0', '1', '2'],
        description='''perform strand-specific read counting. int([=0]unstranded) OR [=1] stranded] OR [=2] reversely-stranded. default[ =0]''')
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)

    def requires(self):
        """Depend on a single ``featureCounts`` task with this task's settings."""
        counting_task = featureCounts(
            pre_process_reads=self.pre_process_reads,
            rnaseq_aligner=self.rnaseq_aligner,
            attribute_type=self.attribute_type,
            strandType=self.strandType,
        )
        return [counting_task]

    def output(self):
        """Return a log target under ``task_logs`` named with the current time."""
        stamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        log_name = 'task.generate.count.complete.{t}'.format(t=stamp)
        log_path = os.path.join(os.getcwd(), "task_logs", log_name)
        return luigi.LocalTarget(log_path)

    def run(self):
        """Write the completion message to the log target."""
        stamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        message = 'Count File Generation finished at {t}'.format(t=stamp)
        with self.output().open('w') as log_file:
            log_file.write(message)
class DecisionTreeClassifierTraining(BaseModelTraining):
    """Train a ``DecisionTreeClassifier`` and save its feature importances.

    After the parent ``run`` finishes, ``feature_importances.json`` and
    ``tree.png`` are written into the task's output directory.
    """

    criterion: str = luigi.ChoiceParameter(choices=["gini", "entropy"],
                                           default="gini")
    splitter: str = luigi.ChoiceParameter(choices=["best", "random"],
                                          default="best")
    max_depth: int = luigi.IntParameter(default=None)
    min_samples_split: int = luigi.IntParameter(default=2)
    min_samples_leaf: int = luigi.IntParameter(default=1)
    min_weight_fraction_leaf: float = luigi.FloatParameter(default=0.0)
    max_features: int = luigi.IntParameter(default=None)
    max_leaf_nodes: int = luigi.IntParameter(default=None)
    min_impurity_decrease: float = luigi.FloatParameter(default=0.0)
    ccp_alpha: float = luigi.FloatParameter(default=0.0)

    def create_model(self) -> BaseEstimator:
        """Instantiate the classifier from the task parameters."""
        # The string "none" stands for "no class weighting".
        if self.class_weight != "none":
            class_weight = self.class_weight
        else:
            class_weight = None

        return DecisionTreeClassifier(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            ccp_alpha=self.ccp_alpha,
            class_weight=class_weight,
            random_state=self.seed,
        )

    def _save_feature_importances(self, metrics: dict):
        """Dump ``metrics`` as indented JSON into the output directory."""
        json_path = os.path.join(self.output().path,
                                 "feature_importances.json")
        with open(json_path, "w") as json_file:
            json.dump(metrics, json_file, indent=4)

    def run(self):
        """Run the parent training, then export importances and the tree plot."""
        super().run()

        importances = {
            "feature_names": self.feature_names,
            "feature_importances": self.model.feature_importances_.tolist(),
        }
        self._save_feature_importances(importances)

        if not isinstance(self.model, DecisionTreeClassifier):
            return

        figure = plot_decision_tree(self.model, self.feature_names,
                                    ["No Stroke", "Stroke"])
        figure.savefig(os.path.join(self.output().path, "tree.png"))
class ConvertPV(luigi.Task):
    """
    Convert Landsat-8 data from the Biome or 38-Clouds dataset to the same
    spectral and spatial resolution as Proba-V.

    Follows the transformation proposed in:
        Transferring deep learning models for cloud detection between
        Landsat-8 and Proba-V
        https://www.sciencedirect.com/science/article/abs/pii/S0924271619302801

    The Landsat-8 file is exported in the same format as Proba-V (HDF5 files).

    Example:
        python convert_landsat_probav.py ConvertPV --l8img BC/LC80010112014080LGN00
    """

    l8img = luigi.Parameter(description="Folder with Landsat 8 image")
    outfolder = luigi.Parameter(default="landsataspv")
    resolution = luigi.ChoiceParameter(choices=["333M", "100M", "1KM"],
                                       default="333M")
    type_product = luigi.ChoiceParameter(
        choices=["biome", "38c", "landsat8"],
        default="landsat8",
        description="Flag that indicates if the product has a manually annotated "
                    "cloud mask from Biome or 38-Clouds dataset")

    def l8obj(self):
        """Return the image object for ``l8img``, building it on first use."""
        if hasattr(self, "l8obj_computed"):
            return getattr(self, "l8obj_computed")

        # The product type decides which reader wraps the image folder.
        if self.type_product == "biome":
            image = l8image.Biome(self.l8img)
        elif self.type_product == "38c":
            image = l8image.L8_38Clouds(self.l8img)
        else:
            image = l8image.L8Image(self.l8img)

        setattr(self, "l8obj_computed", image)
        return getattr(self, "l8obj_computed")

    def output(self):
        """Return the HDF5 target ``<outfolder>/<image name>_<resolution>.HDF5``."""
        image = self.l8obj()
        file_name = image.name + "_" + str(self.resolution) + ".HDF5"
        return luigi.LocalTarget(os.path.join(self.outfolder, file_name))

    def run(self):
        """Create the output directory and run the conversion."""
        image = self.l8obj()
        target = self.output()
        target.makedirs()
        landsat_as_pv.convert_landsat_to_probav(image,
                                                self.resolution,
                                                file_out=target.path)
class dbg2olc(luigi.Task):
    """Run DBG2OLC on the MINIA contigs and the long-read sample list.

    The shell command is built from paths under the current working
    directory and executed through ``run_cmd``.
    """

    projectName = luigi.Parameter(default="GenomeAssembly")
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"], var_type=str)
    read_library_type = luigi.ChoiceParameter(
        description="Choose From['pe-lr: paired-end and long read',"
                    "'pe-mp-lr: paired-end, mate-pair and long read'",
        choices=["pe-lr", "pe-mp-lr"],
        var_type=str)

    def requires(self):
        """Depend on the ``minia`` assembly for the supported library types."""
        if self.read_library_type in ("pe-lr", "pe-mp-lr"):
            upstream = minia(read_library_type=self.read_library_type,
                             pre_process_reads=self.pre_process_reads)
            return [upstream]

    def output(self):
        """Return the expected ``DBG2OLC_contigs.fa`` target."""
        assembly_dir = os.path.join(os.getcwd(), "GenomeAssembly", "DBG2OLC" + "/")
        contigs = luigi.LocalTarget(assembly_dir + "DBG2OLC_contigs.fa")
        return {'out': contigs}

    def run(self):
        """Assemble the DBG2OLC command line and execute it."""
        assembly_root = os.path.join(os.getcwd(), "GenomeAssembly")
        minia_dir = os.path.join(assembly_root, "MINIA" + "/")
        dbg2olc_dir = os.path.join(assembly_root, "DBG2OLC" + "/")
        log_dir = os.path.join(os.getcwd(), "log", "GenomeAssembly", "DBG2OLC" + "/")

        # The k-mer size is derived from the MINIA file-of-filenames.
        kmer = minia_kmer(os.path.join(assembly_root, "MINIA", "minia.fofn"))
        print("Optimal Kmer: ", kmer)

        long_read_args = dbg2olc_formater(
            os.path.join(os.getcwd(), "sample_list", "lr_samples.lst"))

        command_template = (
            "[ -d {dbg2olc_assembly_folder} ] || mkdir -p {dbg2olc_assembly_folder}; "
            "mkdir -p {DBG2OLC_assembly_log_folder}; cd {dbg2olc_assembly_folder}; "
            "/usr/bin/time -v DBG2OLC "
            "k {kmer} Contigs {minia_assembly_folder}minia.contigs.fa "
            "KmerCovTh 2 MinOverlap 20 AdaptiveTh 0.005 "
            "{dbg2olc_input} "
            "2>&1 | tee {DBG2OLC_assembly_log_folder}dbg2olc_assembly.log "
        )
        command = command_template.format(
            minia_assembly_folder=minia_dir,
            dbg2olc_assembly_folder=dbg2olc_dir,
            dbg2olc_input=long_read_args,
            DBG2OLC_assembly_log_folder=log_dir,
            kmer=kmer,
        )

        if self.read_library_type in ("pe-lr", "pe-mp-lr"):
            print("****** NOW RUNNING COMMAND ******: " + command)
            print(run_cmd(command))
class DownloadSample(TaskWithOutputMixin, WrapperTask):
    """
    Generic task for downloading one sample of an experiment.

    The concrete download task is chosen from ``source``. The 'gemma'
    source is routed to the same task as 'geo'.
    """

    experiment_id = luigi.Parameter()
    sample_id = luigi.Parameter()

    source = luigi.ChoiceParameter(
        default='local',
        choices=['gemma', 'geo', 'arrayexpress', 'local', 'sra'],
        positional=False)

    def requires(self):
        """Return the source-specific download task.

        Raises:
            ValueError: if ``source`` is not one of the handled values.
        """
        source = self.source

        if source in ['geo', 'gemma']:
            return DownloadGeoSample(self.sample_id)
        if source == 'sra':
            return DownloadSraExperiment(self.sample_id)
        if source == 'arrayexpress':
            return DownloadArrayExpressSample(self.experiment_id,
                                              self.sample_id)
        if source == 'local':
            return DownloadLocalSample(self.experiment_id, self.sample_id)

        raise ValueError('Unknown source for sample: {}.'.format(source))
class Merge(BcftoolsTask):
    """
    Merge the samples of two or more VCF files.
    """

    input_file = luigi.ListParameter()

    filter_logic = luigi.ChoiceParameter(default='+',
                                         choices=['x', '+'],
                                         positional=False)
    info_rules = luigi.ListParameter(default=[], positional=False)

    output_file = luigi.Parameter()
    output_format = luigi.Parameter(positional=False, default='z')

    def subcommand_args(self):
        """Return the ``merge`` subcommand followed by its options."""
        command = ['merge', '--filter-logic', self.filter_logic]

        # --info-rules is only passed when at least one rule is configured.
        if self.info_rules:
            command += ['--info-rules', ','.join(self.info_rules)]

        command += ['--output-type', self.output_format]
        command += ['--output', self.output_file]
        return command

    def subcommand_input_args(self):
        """Return the input files to merge."""
        return self.input_file

    def output(self):
        """Return the local target for the merged output file."""
        return luigi.LocalTarget(self.output_file)
class FollowFilteredEdgelist(luigi.Task):
    '''Edge list filtered down to users that have a home location.

    Edges whose left-hand side is an unknown user are removed, so that only
    data from users with an assigned home location remains.

    Args:
        --name   name used in the save path, so that the LocationUserList
                 and UnknownList it belongs to can be identified
        --month  month of the edge list
    '''

    month = luigi.MonthParameter()
    name = luigi.Parameter()
    type = luigi.ChoiceParameter(choices=['followers', 'following'])
    sources = luigi.TupleParameter(default=('followers', 'following'))

    def requires(self):
        """Depend on the raw edge list and the remaining home locations."""
        raw_edges = TwitterFollowRawEdgelist(month=self.month, type=self.type)
        home_locations = RemainedHomeLocation(name=self.name,
                                              month=self.month)
        return {'edgelist': raw_edges, 'hl': home_locations}

    def output(self):
        """Return ``<NETWORK_DIR>/filtered/<name>/<YYYYMM>_<type>.tsv.gz``."""
        name_pattern = '%Y%m_{}.tsv.gz'.format(self.type)
        file_name = self.month.strftime(name_pattern)
        return luigi.LocalTarget(
            os.path.join(NETWORK_DIR, 'filtered', self.name, file_name))

    def run(self):
        """Pipe the edge list through the edge filter into a gzip file."""
        pipeline = ('zcat {edgelist.path} | python -m snlocest.scripts.edgefilter '
                    '-i {hl.path} | gzip > {}')
        # Write to a temporary path that luigi moves into place on success.
        with self.output().temporary_path() as tmp_path:
            shell_command = pipeline.format(tmp_path, **self.input())
            run(shell_command, shell=True, check=True)
class DownloadImageResults(ee_ipl_uv.luigi_utils.DownloadImage):
    """Download a Landsat-8 image together with cloud score and ground truth.

    The downloaded image is clipped to the polygon stored for
    ``image_index`` / ``split`` in the location splits.
    """

    split = luigi.Parameter()
    method = luigi.ChoiceParameter(
        choices=["percentile", "persistence", "linear", "kernel"],
        var_type=str,
        default="percentile")

    def output(self):
        """Return the raster target ``<basepath>/<image_index>_<split>_<method>``."""
        raster_name = self.image_index + "_" + self.split + "_" + self.method
        raster_path = os.path.join(self.basepath, raster_name)
        return ee_ipl_uv.luigi_utils.RasterTarget(raster_path)

    def load_region_of_interest(self):
        """Return the split polygon with each point's two coordinates swapped."""
        splits = get_location_splits()
        polygon = splits[str(self.image_index)][str(self.split)][0]
        return [[point[1], point[0]] for point in polygon]

    def load_image(self):
        """Build the image to download and the list of properties to keep."""
        landsat_image = ee.Image('LANDSAT/LC8_L1T_TOA_FMASK/' +
                                 str(self.image_index))

        # Select region of interest (lng,lat)
        region = ee.Geometry.Polygon(self.load_region_of_interest())

        cloud_score, prediction = multitemporal_cloud_masking.CloudClusterScore(
            landsat_image, region, method_pred=self.method)

        ground_truth = ee.Image("users/gonzmg88/LANDSAT8_CLOUDS/" +
                                self.image_index + "_fixedmask")

        # Append the renamed cloud score, ground truth and prediction bands.
        with_score = landsat_image.addBands(
            cloud_score.select(["cluster"], ["cloudscore"]))
        with_truth = with_score.addBands(
            ground_truth.select(["b1"], ["fixedmask"]))
        image_download = with_truth.addBands(prediction).clip(region).toFloat()

        properties = ["system:time_start", 'system:index']
        return image_download, properties
class compositionProfiling(luigi.Task):
    """Schedule one ``metaphlan`` task per sample and log completion.

    Sample names are read from ``config/pe_samples.lst`` under the current
    working directory.
    """

    project_name = GlobalParameter().projectName
    adapter = GlobalParameter().adapter
    threads = GlobalParameter().threads
    max_memory = GlobalParameter().maxMemory
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"], var_type=str)
    read_library_type = GlobalParameter().seq_platforms

    def requires(self):
        """Return one ``metaphlan`` task per line of the sample list."""
        list_path = os.path.join(os.getcwd(), "config", "pe_samples.lst")
        # Read inside a context manager so the file handle is always closed.
        with open(list_path) as sample_list:
            sample_names = [line.strip() for line in sample_list]

        return [
            metaphlan(pre_process_reads=self.pre_process_reads,
                      sampleName=name)
            for name in sample_names
        ]

    def output(self):
        """Return a log-file target whose name embeds the current time.

        The timestamp is recomputed on every call, so calls made in
        different seconds return different targets.
        """
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget(
            os.path.join(os.getcwd(), "task_logs",
                         'task.genome.binning.complete.{t}'.format(t=timestamp)))

    def run(self):
        """Write the completion message to the log target."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write('Metagenome binning finished at {t}'.format(t=timestamp))
class profileTaxonomy(luigi.Task):
    """Run ``ampvis.r`` on the metaphlan OTU table of the project.

    All paths are built below ``<cwd>/<projectName>/metaphlan_analysis``.
    """

    project_name = GlobalParameter().projectName
    adapter = GlobalParameter().adapter
    threads = GlobalParameter().threads
    max_memory = GlobalParameter().maxMemory
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"], var_type=str)
    read_library_type = GlobalParameter().seq_platforms
    condition_column = luigi.Parameter(default="conditions")

    def requires(self):
        """Depend on the ``graphlan`` task."""
        upstream = graphlan(pre_process_reads=self.pre_process_reads)
        return [upstream]

    def output(self):
        """Return the target for the family heatmap figure."""
        figures_dir = os.path.join(os.getcwd(),
                                   GlobalParameter().projectName,
                                   "metaphlan_analysis",
                                   "figures" + "/")
        heatmap = luigi.LocalTarget(figures_dir + "/" + "family_heatmap.tiff")
        return {'out1': heatmap}

    def run(self):
        """Build the ampvis command line and execute it through ``run_cmd``."""
        project_root = os.path.join(os.getcwd(), GlobalParameter().projectName)
        analysis_dir = os.path.join(project_root, "metaphlan_analysis" + "/")
        table_dir = os.path.join(project_root, "metaphlan_analysis",
                                 "data_for_images" + "/")
        conditions_file = os.path.join(os.getcwd(), "config",
                                       "metagenome_condition.tsv")

        command_template = (
            "[ -d {ampvis_folder} ] || mkdir -p {ampvis_folder} ; cd {ampvis_folder} ;"
            "ampvis.r -t {map_file} "
            "-v {condition_column} "
            "-a {inDir}/otu_table_ampvis2.txt"
        )
        command = command_template.format(
            map_file=conditions_file,
            inDir=table_dir,
            ampvis_folder=analysis_dir,
            condition_column=self.condition_column,
        )

        print("****NOW RUNNING COMMAND****:" + command)
        print(run_cmd(command))
class DownloadExperiment(TaskWithPriorityMixin, TaskWithOutputMixin, WrapperTask):
    """
    Generic task that selects the download task matching the experiment's
    data source, so that downstream tasks can process it regardless of
    where the data comes from.

    :source: Origin of the experiment; defaults to 'local'.
    """

    experiment_id = luigi.Parameter()

    source = luigi.ChoiceParameter(
        default='local',
        choices=['gemma', 'geo', 'sra', 'arrayexpress', 'local'],
        positional=False)

    def requires(self):
        """Return the source-specific experiment download task.

        Raises:
            ValueError: if ``source`` is not one of the handled values.
        """
        if self.source == 'gemma':
            return DownloadGemmaExperiment(self.experiment_id)
        elif self.source == 'geo':
            return DownloadGeoSeries(self.experiment_id)
        elif self.source == 'sra':
            return DownloadSraProject(self.experiment_id)
        elif self.source == 'arrayexpress':
            return DownloadArrayExpressExperiment(self.experiment_id)
        elif self.source == 'local':
            return DownloadLocalExperiment(self.experiment_id)
        else:
            # Fill in the placeholder so the message names the bad source.
            raise ValueError(
                'Unknown download source for experiment: {}.'.format(
                    self.source))
class QualityControlExperiment(TaskWithPriorityMixin, DynamicTaskWithOutputMixin, DynamicWrapperTask):
    """
    Quality control all the samples in a given experiment.
    """

    experiment_id = luigi.Parameter()

    source = luigi.ChoiceParameter(
        default='local',
        choices=['gemma', 'geo', 'sra', 'arrayexpress', 'local'],
        positional=False)

    def requires(self):
        """Require what the source-specific download task itself requires."""
        download = DownloadExperiment(self.experiment_id, source=self.source)
        return download.requires().requires()

    def run(self):
        """Yield one ``QualityControlSample`` per downloaded sample task."""
        download = DownloadExperiment(self.experiment_id, source=self.source)
        # The first value produced by the download task's run() is the
        # collection of per-sample download tasks.
        sample_downloads = next(download.requires().run())

        yield [
            QualityControlSample(self.experiment_id,
                                 sample_download.sample_id,
                                 source=self.source)
            for sample_download in sample_downloads
        ]
class MercadoLivreTraining(SupervisedModelTraining):
    """Supervised training with an optional class-weighted loss function."""

    loss_function: str = luigi.ChoiceParameter(choices=["ce", "custom_ce"],
                                               default="ce")

    def class_weights(self):
        """Return inverse class frequencies prefixed by three zeros.

        Frequencies are counted over the output column of the training and
        validation frames together, ordered by class value.
        """
        target_column = self.project_config.output_column.name

        targets = pd.concat([
            self.train_dataset._data_frame[[target_column]],
            self.val_dataset._data_frame[[target_column]],
        ])
        class_counts = targets[target_column].value_counts().sort_index()
        inverse_frequency = 1 / class_counts.values

        # NOTE(review): the three leading zeros presumably cover reserved
        # class indices - confirm against the dataset encoding.
        return np.array([0, 0, 0] + list(inverse_frequency))

    def _get_loss_function(self):
        """Instantiate the configured loss, adding class weights for custom_ce."""
        if self.loss_function == 'custom_ce':
            # Work on a plain dict copy before adding the extra key.
            self.loss_function_params = dict(self.loss_function_params)
            self.loss_function_params['class_weights'] = self.class_weights()

        loss_factory = TORCH_LOSS_FUNCTIONS[self.loss_function]
        return loss_factory(**self.loss_function_params)
class PrepareReference(ScheduledExternalProgramTask):
    """Build an RSEM reference by running ``rsem-prepare-reference``."""

    task_namespace = 'rsem'

    annotation_file = luigi.Parameter()
    reference_fasta_files = luigi.ListParameter()
    reference_name = luigi.Parameter()

    aligner = luigi.ChoiceParameter(choices=['star'], positional=False)
    star_path = luigi.OptionalParameter(default=None, positional=False)

    def program_args(self):
        """Return the command line for ``rsem-prepare-reference``."""
        command = [
            join(cfg.rsem_dir, 'rsem-prepare-reference'),
            '--gtf', self.annotation_file,
        ]

        if self.aligner == 'star':
            command.append('--star')
            # Only pass an explicit STAR location when one is configured.
            if self.star_path is not None:
                command.extend(['--star-path', self.star_path])

        command.extend(['-p', self.cpus])
        command.extend(self.reference_fasta_files)
        command.append(self.reference_name)
        return command

    def output(self):
        """Return the ``RsemReference`` target named ``reference_name``."""
        return RsemReference(self.reference_name)
class Pull_data(lu.Task):
    """Download a borough's StreetEasy CSV and store it as a local file."""

    v = lu.NumericalParameter(default=0.1,
                              var_type=float,
                              min_value=0,
                              max_value=100)
    boro = lu.ChoiceParameter(default='Queens',
                              var_type=str,
                              choices=['Queens', 'Brooklyn', 'Manhattan'])
    prod = lu.BoolParameter()

    def output(self):
        """Return ``data/<prod|staging>/<boro>/raw_<v>.csv`` under ``this_folder``."""
        environment = "prod" if self.prod else 'staging'
        relative_path = f'data/{environment}/{self.boro}/raw_{self.v}.csv'
        return lu.LocalTarget(str(this_folder / relative_path))

    def run(self):
        """Read the remote CSV, then write it to the output path."""
        url = f'https://raw.githubusercontent.com/Codecademy/datasets/master/streeteasy/{self.boro.lower()}.csv'
        frame = pd.read_csv(url)

        # The parent directories are created only after the download worked.
        self.output().makedirs()
        frame.to_csv(self.output().path)
class binRefinement(luigi.Task):
    """Wrap a ``refineM`` run and write a timestamped completion log."""

    project_name = GlobalParameter().projectName
    adapter = GlobalParameter().adapter
    threads = GlobalParameter().threads
    max_memory = GlobalParameter().maxMemory
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    read_library_type = GlobalParameter().seq_platforms
    # The default is an int so it has the same type as a value parsed from
    # the command line or a config file by IntParameter.
    min_contig_length = luigi.IntParameter(default=1500)

    def requires(self):
        """Depend on a single ``refineM`` task with this task's settings."""
        return [
            refineM(pre_process_reads=self.pre_process_reads,
                    min_contig_length=self.min_contig_length)
        ]

    def output(self):
        """Return a log-file target whose name embeds the current time.

        The timestamp is recomputed on every call, so calls made in
        different seconds return different targets.
        """
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget(
            os.path.join(
                os.getcwd(), "task_logs",
                'task.bin.refinement.complete.{t}'.format(t=timestamp)))

    def run(self):
        """Write the completion message to the log target."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write('Bin Refinement finished at {t}'.format(t=timestamp))
class ScheduledExternalProgramTask(ExternalProgramTask):
    """
    Variant of :class:`luigi.contrib.external_program.ExternalProgramTask`
    that executes the task with a :class:`Scheduler`.

    With the 'local' scheduler the parent implementation runs the program;
    otherwise the run is delegated to the scheduler registered under the
    chosen name.
    """

    scheduler = luigi.ChoiceParameter(
        default=cfg.scheduler,
        choices=['local'] + list(_schedulers),
        positional=False,
        significant=False,
        description='Scheduler to use for running the task')
    scheduler_partition = luigi.OptionalParameter(
        default=cfg.scheduler_partition,
        positional=False,
        significant=False,
        description='Scheduler partition (or queue) to use if supported')
    scheduler_extra_args = luigi.ListParameter(
        default=cfg.scheduler_extra_args,
        positional=False,
        significant=False,
        description='Extra arguments to pass to the scheduler')

    walltime = luigi.TimeDeltaParameter(
        default=datetime.timedelta(),
        positional=False,
        significant=False,
        description='Amout of time to allocate for the task, default value of zero implies unlimited time')
    cpus = luigi.IntParameter(
        default=1,
        positional=False,
        significant=False,
        description='Number of CPUs to allocate for the task')
    memory = luigi.FloatParameter(
        default=1,
        positional=False,
        significant=False,
        description='Amount of memory (in gigabyte) to allocate for the task')

    def __init__(self, *args, **kwargs):
        """Initialise the task and resolve the non-local scheduler, if any.

        Raises:
            ValueError: if ``scheduler`` is not 'local' and is not registered.
        """
        super(ScheduledExternalProgramTask, self).__init__(*args, **kwargs)

        if self.scheduler == 'local':
            return

        try:
            self._scheduler = _schedulers[self.scheduler]
        except KeyError:
            raise ValueError('Unsupported scheduler {}'.format(self.scheduler))

    @property
    def resources(self):
        """Return the luigi resources consumed by this task."""
        if self.scheduler != 'local':
            # One job slot on the named scheduler.
            return {'{}_jobs'.format(self.scheduler): 1}

        # local_jobs is actually constrained by the number of workers
        return {'cpus': self.cpus, 'memory': self.memory}

    def run(self):
        """Run locally through the parent class or delegate to the scheduler."""
        if self.scheduler != 'local':
            return self._scheduler.run_task(self)
        return super(ScheduledExternalProgramTask, self).run()
class LeaveOneOutPrediction(luigi.Task):
    """Run the ``snlocest.scripts.loocv`` script for one dataset and method."""

    name = luigi.Parameter()  # dataset name
    edgetype = luigi.ChoiceParameter(
        choices=['linked', 'mutual', 'followee', 'follower'])
    method = luigi.Parameter()
    extra = luigi.BoolParameter(default=False)

    def requires(self):
        """Depend on the dataset's edge list and its home locations."""
        edges = Edgelist(name=self.name, edgetype=self.edgetype)
        truth = HomeLocation(name=self.name)
        return {'edgelist': edges, 'truth': truth}

    def output(self):
        """Return the predictions file for this name, method and edge type."""
        file_name = 'f_{}.tsv'.format(self.edgetype)
        return luigi.LocalTarget(
            os.path.join('data/experiments/loocv/predicted', self.name,
                         self.method, file_name))

    def run(self):
        """Execute the script, redirecting its output to the target file."""
        extra_flag = '--extra' if self.extra else ''
        template = 'python -m snlocest.scripts.loocv {edgelist.path} {truth.path} {} --fast {} > {}'

        # Write to a temporary path that luigi moves into place on success.
        with self.output().temporary_path() as tmp_path:
            shell_command = template.format(self.method, extra_flag, tmp_path,
                                            **self.input())
            run(shell_command, shell=True, check=True)
class CloudMask(luigi.Task):
    """Base task that predicts a cloud mask and saves it as a ``.tif`` file.

    Subclasses must implement ``satobj`` and ``satname``.
    """

    namemodel = luigi.ChoiceParameter(
        description="name to save the binary cloud mask",
        choices=["rgbi", "rgbiswir"],
        default="rgbiswir")

    def satobj(self):
        """Return the satellite image object; must be overridden."""
        raise NotImplementedError("Must add a satname")

    def satname(self):
        """Return the satellite name; must be overridden."""
        raise NotImplementedError("Must add a satname")

    def cloud_detection_model(self):
        """Return the cloud detection model, creating it on first use."""
        if not hasattr(self, "model_clouds"):
            self.model_clouds = utils.Model(satname=self.satname(),
                                            namemodel=self.namemodel)
        return self.model_clouds

    def output(self):
        """Return ``dluvclouds_<namemodel>.tif`` in the image folder."""
        mask_name = "dluvclouds_" + self.namemodel + ".tif"
        mask_path = os.path.join(self.satobj().folder, mask_name)
        return luigi.LocalTarget(mask_path)

    def run(self):
        """Predict the cloud mask and write it to the output path."""
        image = self.satobj()
        detector = self.cloud_detection_model()
        cloud_mask = detector.predict(image)

        # Save the cloud mask
        utils.save_cloud_mask(image, cloud_mask, self.output().path)
class AlignExperiment(TaskWithPriorityMixin, DynamicTaskWithOutputMixin, DynamicWrapperTask):
    """
    Align all the samples in a given experiment.

    The output is one sample alignment output per sample contained in the
    experiment.
    """

    experiment_id = luigi.Parameter()

    source = luigi.ChoiceParameter(
        default='local',
        choices=['gemma', 'geo', 'sra', 'arrayexpress', 'local'],
        positional=False)

    taxon = luigi.Parameter(default='human', positional=False)
    reference_id = luigi.Parameter(default='hg38_ncbi', positional=False)
    scope = luigi.Parameter(default='genes', positional=False)

    def requires(self):
        """Require what the source-specific download task itself requires."""
        download = DownloadExperiment(self.experiment_id, source=self.source)
        return download.requires().requires()

    def run(self):
        """Yield one ``AlignSample`` per downloaded sample task."""
        download = DownloadExperiment(self.experiment_id, source=self.source)
        # The first value produced by the download task's run() is the
        # collection of per-sample download tasks.
        sample_downloads = next(download.requires().run())

        yield [
            AlignSample(self.experiment_id,
                        sample_download.sample_id,
                        source=self.source,
                        taxon=self.taxon,
                        reference_id=self.reference_id,
                        scope=self.scope)
            for sample_download in sample_downloads
        ]
class DTA(luigi.Task):
    """Run the selected transcript assembler and write a completion log.

    ``requires`` returns ``None`` when no branch matches, for example
    "rockhopper" with an ``organism_domain`` other than "prokaryote".
    """

    project_name = luigi.Parameter(default="RNASeqAnalysis")
    adapter = GlobalParameter().adapter
    organism_domain = GlobalParameter().organism_domain
    threads = GlobalParameter().threads
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    read_library_type = GlobalParameter().read_library_type
    rnaseq_assembler = luigi.ChoiceParameter(
        choices=["trinity", "spades", "rockhopper"], var_type=str)

    def requires(self):
        """Return the assembler task chosen by ``rnaseq_assembler``."""
        if (self.organism_domain == "prokaryote"
                and self.rnaseq_assembler == "rockhopper"):
            return [
                rockhopper(project_name=self.project_name,
                           pre_process_reads=self.pre_process_reads)
            ]

        if self.rnaseq_assembler == "spades":
            # Pass the configured project name, as the other branches do,
            # instead of a hard-coded "RNASeqAnalysis".
            return [
                spades(project_name=self.project_name,
                       read_library_type=self.read_library_type,
                       mode="rna",
                       pre_process_reads=self.pre_process_reads)
            ]

        if self.rnaseq_assembler == "trinity":
            return [
                trinity(project_name=self.project_name,
                        pre_process_reads=self.pre_process_reads)
            ]

        return None

    def output(self):
        """Return a log-file target whose name embeds the current time.

        The timestamp is recomputed on every call, so calls made in
        different seconds return different targets.
        """
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget(
            os.path.join(
                os.getcwd(), "task_logs",
                'task.assemble.transcript.complete.{t}'.format(t=timestamp)))

    def run(self):
        """Write the completion message to the log target."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write(
                'transcript assembly finished at {t}'.format(t=timestamp))
class ParameterizedTask(luigi.Task):
    """Task declaring one parameter of each of several Luigi parameter types."""

    # Declaration order is kept as-is: it defines the positional order.
    example_str = luigi.Parameter(default="foo")
    example_bool = luigi.BoolParameter(default=True)
    example_int = luigi.IntParameter(default=0)
    example_float = luigi.FloatParameter(default=10.5)
    example_dict = luigi.DictParameter(default={"fizz": "buzz"})
    # The default date is computed once, when this class body is executed.
    example_date = luigi.DateParameter(default=datetime.date.today())
    # No default is given for this parameter.
    example_choice = luigi.ChoiceParameter(
        choices=[1, 2, 3],
        var_type=int,
    )
class quantifyDAT(luigi.Task):
    """Wrapper task that requires one ``denovoQuant`` task per listed sample.

    Sample names are read from ``<cwd>/sample_list/pe_samples.lst`` or
    ``<cwd>/sample_list/se_samples.lst`` depending on ``read_library_type``.
    """
    project_name = luigi.Parameter(default="RNASeqAnalysis")
    organism_domain = GlobalParameter().organism_domain
    read_library_type = GlobalParameter().read_library_type
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    adapter = GlobalParameter().adapter
    genome_name = GlobalParameter().genome_name
    rnaseq_assembler = luigi.ChoiceParameter(
        choices=["trinity", "spades", "rockhopper"], var_type=str)
    threads = GlobalParameter().threads

    def requires(self):
        """Return one ``denovoQuant`` task per line of the sample list.

        Returns ``None`` when ``read_library_type`` is neither ``"pe"`` nor
        ``"se"``, as before.
        """
        list_names = {"pe": "pe_samples.lst", "se": "se_samples.lst"}
        list_name = list_names.get(self.read_library_type)
        if list_name is None:
            return None

        sample_list = os.path.join(os.getcwd(), "sample_list", list_name)
        # Read inside a context manager so the file handle is closed; this
        # method may be called more than once.
        with open(sample_list) as handle:
            sample_names = [line.strip() for line in handle]

        return [
            denovoQuant(rnaseq_assembler=self.rnaseq_assembler,
                        pre_process_reads=self.pre_process_reads,
                        project_name=self.project_name,
                        sampleName=sample_name)
            for sample_name in sample_names
        ]

    def output(self):
        """Return a marker target whose name embeds the current local time."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget('workflow.complete.{t}'.format(t=timestamp))

    def run(self):
        """Write the completion message into the marker target."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write('workflow finished at {t}'.format(t=timestamp))
class TrivagoModelTrainingMixin(object):
    """Mixin that declares model hyper-parameters and builds a ``SimpleLinearModel``.

    The host class is expected to provide ``train_data_frame``,
    ``metadata_data_frame``, ``project_config``, ``index_mapping`` and
    ``vocab_size``; all of them are read here but none is defined here.
    """
    recommender_module_class: str = None
    recommender_extra_params: dict = None

    # ``loss_function``, ``weight_init`` and ``activation_function`` are
    # declared here but are not forwarded by ``create_module`` below.
    loss_function: str = luigi.ChoiceParameter(
        choices=TORCH_LOSS_FUNCTIONS.keys(), default="crm")
    n_factors: int = luigi.IntParameter(default=128)
    weight_init: str = luigi.ChoiceParameter(
        choices=TORCH_WEIGHT_INIT.keys(), default="lecun_normal")
    dropout_prob: float = luigi.FloatParameter(default=0.1)
    dropout_module: str = luigi.ChoiceParameter(
        choices=TORCH_DROPOUT_MODULES.keys(), default="alpha")
    activation_function: str = luigi.ChoiceParameter(
        choices=TORCH_ACTIVATION_FUNCTIONS.keys(), default="selu")
    filter_sizes: List[int] = luigi.ListParameter(default=[1, 3, 5])
    num_filters: int = luigi.IntParameter(default=64)

    @property
    def window_hist_size(self):
        """``window_hist_size`` of the first training row, cached after first use."""
        if hasattr(self, "_window_hist_size"):
            return self._window_hist_size

        first_row = self.train_data_frame.iloc[0]
        self._window_hist_size = int(first_row["window_hist_size"])
        return self._window_hist_size

    @property
    def metadata_size(self):
        """Column count of the metadata frame minus three, cached after first use."""
        if hasattr(self, "_meta_data_size"):
            return self._meta_data_size

        # NOTE(review): presumably three columns are not metadata features;
        # confirm against the code that builds the frame.
        column_count = self.metadata_data_frame.shape[1]
        self._meta_data_size = int(column_count - 3)
        return self._meta_data_size

    def create_module(self) -> nn.Module:
        """Build the ``SimpleLinearModel`` from this task's settings."""
        module_kwargs = dict(
            project_config=self.project_config,
            index_mapping=self.index_mapping,
            window_hist_size=self.window_hist_size,
            vocab_size=self.vocab_size,
            metadata_size=self.metadata_size,
            n_factors=self.n_factors,
            filter_sizes=self.filter_sizes,
            num_filters=self.num_filters,
            dropout_prob=self.dropout_prob,
            dropout_module=TORCH_DROPOUT_MODULES[self.dropout_module],
        )
        return SimpleLinearModel(**module_kwargs)
class metaphlan(luigi.Task):
    """Run ``metaphlan`` on the paired-end reads of one sample.

    Results are written to
    ``<cwd>/<projectName>/metaphlan_analysis/profiled_samples/<sampleName>.txt``
    and ``.out``.
    """
    project_name = luigi.Parameter(default="MetagenomeAnalysis")
    adapter = GlobalParameter().adapter
    threads = GlobalParameter().threads
    max_memory = GlobalParameter().maxMemory
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    read_library_type = GlobalParameter().seq_platforms
    sampleName = luigi.Parameter(
        description="name of the sample to be analyzed. (string)")

    def requires(self):
        """Return one read-preparation task per sample in ``config/pe_samples.lst``.

        ``cleanFastq`` is used when ``pre_process_reads`` is ``"yes"`` and
        ``reformat`` when it is ``"no"``. Returns ``None`` when
        ``read_library_type`` is not ``"pe"``.
        """
        if self.read_library_type != "pe":
            return None

        if self.pre_process_reads == "yes":
            upstream_task = cleanFastq
        elif self.pre_process_reads == "no":
            upstream_task = reformat
        else:
            return None

        sample_list = os.path.join(os.getcwd(), "config", "pe_samples.lst")
        # Read inside a context manager so the file handle is closed.
        with open(sample_list) as handle:
            sample_names = [line.strip() for line in handle]

        return [upstream_task(sampleName=name) for name in sample_names]

    def output(self):
        """Return the ``.txt`` and ``.out`` targets of this sample."""
        profiled_samples = os.path.join(os.getcwd(),
                                        GlobalParameter().projectName,
                                        "metaphlan_analysis",
                                        "profiled_samples")
        # Join directory and file name into one path: the second positional
        # argument of LocalTarget is ``format``, not a path component.
        return {
            'out1': luigi.LocalTarget(
                os.path.join(profiled_samples, self.sampleName + ".txt")),
            'out2': luigi.LocalTarget(
                os.path.join(profiled_samples, self.sampleName + ".out")),
        }

    def run(self):
        """Build the ``metaphlan`` shell command and run it through ``run_cmd``."""
        # Both folders end with "/" because the command appends file names
        # to them directly.
        profiled_samples = os.path.join(os.getcwd(),
                                        GlobalParameter().projectName,
                                        "metaphlan_analysis",
                                        "profiled_samples" + "/")

        if self.pre_process_reads == "no":
            pe_read_folder = os.path.join(os.getcwd(),
                                          GlobalParameter().projectName,
                                          "ReadQC", "VerifiedReads",
                                          "PE-Reads" + "/")
        elif self.pre_process_reads == "yes":
            pe_read_folder = os.path.join(os.getcwd(),
                                          GlobalParameter().projectName,
                                          "ReadQC", "CleanedReads",
                                          "PE-Reads" + "/")

        # The command creates the output folder if it is missing and pipes
        # the combined stdout/stderr of metaphlan through ``tee``.
        run_metaphlan = (
            "[ -d {profiled_samples} ] || mkdir -p {profiled_samples} ; "
            "metaphlan --input_type fastq "
            "{pe_read_folder}{sample}_1.fastq,{pe_read_folder}{sample}_2.fastq "
            "--nproc {threads} "
            "--bt2_ps very-sensitive "
            "--bowtie2out {profiled_samples}{sample}.out 2>&1 | tee {profiled_samples}{sample}.txt "
        ).format(pe_read_folder=pe_read_folder,
                 profiled_samples=profiled_samples,
                 sample=self.sampleName,
                 threads=self.threads)

        print("****NOW RUNNING COMMAND****:" + run_metaphlan)
        print(run_cmd(run_metaphlan))