예제 #1
0
class WideRecommender(ClassifierWithTransferLearningKerasModelTraining):
    """Keras classifier with one "wide" dense layer, configured via luigi.

    The network built here is ``Input -> Dense(input_shape[0]) -> Dense(1,
    sigmoid)``.  ``dense_layers`` and ``dropout`` are declared as parameters
    but are not read by the methods defined in this class.
    """

    input_shape: Tuple[int, int] = luigi.TupleParameter(default=(100, ))
    batch_size: int = luigi.IntParameter(default=10)
    learning_rate = luigi.FloatParameter(default=1e-5)
    dense_layers: List[int] = luigi.ListParameter(default=[512, 512])
    dropout: float = luigi.FloatParameter(default=None)
    activation_function: str = luigi.ChoiceParameter(
        choices=KERAS_ACTIVATION_FUNCTIONS.keys(), default="relu")
    kernel_initializer: str = luigi.ChoiceParameter(
        choices=KERAS_WEIGHT_INIT.keys(), default="glorot_uniform")

    def create_base_model(self) -> Model:
        """Assemble and return the Keras model named ``'Wide'``."""
        features = Input(shape=self.input_shape, name='wide_inp')

        # Hidden layer is as wide as the first input dimension.
        wide_layer = Dense(self.input_shape[0],
                           activation=self.activation_function,
                           kernel_initializer=self.kernel_initializer,
                           name='wide_mlp')
        hidden = wide_layer(features)

        # Single sigmoid unit as the output layer.
        prediction_layer = Dense(1,
                                 activation='sigmoid',
                                 kernel_initializer=self.kernel_initializer)
        prediction = prediction_layer(hidden)

        return Model(features, prediction, name='Wide')

    def create_model_with(self, base_model: Model) -> Model:
        """Return ``base_model`` unchanged; no extra head is added."""
        return base_model
예제 #2
0
파일: mlp.py 프로젝트: marlesson/recsys_ctr
class MLPClassifier(ClassifierWithTransferLearningKerasModelTraining):
    """Multi-layer perceptron classifier configured through luigi parameters.

    ``dense_layers`` gives the width of each hidden layer, in order.  When
    ``dropout`` is truthy, a ``Dropout`` layer follows every hidden layer
    except the first one.
    """

    input_shape: Tuple[int, int] = luigi.TupleParameter(default=(100, ))
    batch_size: int = luigi.IntParameter(default=10)
    learning_rate = luigi.FloatParameter(default=1e-5)
    dense_layers: List[int] = luigi.ListParameter(default=[512, 512])
    dropout: float = luigi.FloatParameter(default=None)
    activation_function: str = luigi.ChoiceParameter(
        choices=KERAS_ACTIVATION_FUNCTIONS.keys(), default="relu")
    kernel_initializer: str = luigi.ChoiceParameter(
        choices=KERAS_WEIGHT_INIT.keys(), default="glorot_uniform")

    def create_base_model(self) -> Model:
        """Build and return the Keras model named ``'BaseMLP'``.

        Raises:
            IndexError: if ``dense_layers`` is empty (the first hidden layer
                is taken from index 0).
        """

        def hidden_layer(units):
            # Every hidden layer shares activation and initializer settings.
            return Dense(units,
                         activation=self.activation_function,
                         kernel_initializer=self.kernel_initializer)

        x_input = Input(shape=self.input_shape)

        # First hidden layer: never followed by dropout.
        tensor = hidden_layer(self.dense_layers[0])(x_input)

        # Remaining hidden layers, each optionally followed by dropout.
        for units in self.dense_layers[1:]:
            tensor = hidden_layer(units)(tensor)
            if self.dropout:
                tensor = Dropout(self.dropout)(tensor)

        prediction = Dense(1, activation='sigmoid')(tensor)
        return Model(x_input, prediction, name='BaseMLP')

    def create_model_with(self, base_model: Model) -> Model:
        """Return ``base_model`` unchanged; no extra head is added."""
        return base_model
class SeqrMTToESTask(HailElasticSearchTask):
    """Export the matrix table written by ``SeqrVCFToMTTask`` to Elasticsearch.

    Completion is tracked with an ``_EXPORTED_TO_ES`` marker file placed under
    ``dest_path`` once the export has finished.
    """

    source_paths = luigi.Parameter(default="[]", description='Path or list of paths of VCFs to be loaded.')
    dest_path = luigi.Parameter(description='Path to write the matrix table.')
    genome_version = luigi.Parameter(description='Reference Genome Version (37 or 38)')
    vep_runner = luigi.ChoiceParameter(choices=['VEP', 'DUMMY'], default='VEP', description='Choice of which vep runner to annotate vep.')

    reference_ht_path = luigi.Parameter(default=None, description='Path to the Hail table storing the reference variants.')
    clinvar_ht_path = luigi.Parameter(default=None, description='Path to the Hail table storing the clinvar variants.')
    hgmd_ht_path = luigi.Parameter(default=None, description='Path to the Hail table storing the hgmd variants.')
    sample_type = luigi.ChoiceParameter(default="WES", choices=['WGS', 'WES'], description='Sample type, WGS or WES')
    dont_validate = luigi.BoolParameter(description='Disable checking whether the dataset matches the specified '
                                                    'genome version and WGS vs. WES sample type.')
    dataset_type = luigi.ChoiceParameter(choices=['VARIANTS', 'SV'], default='VARIANTS', description='VARIANTS or SV.')
    remap_path = luigi.OptionalParameter(default=None, description="Path to a tsv file with two columns: s and seqr_id.")
    subset_path = luigi.OptionalParameter(default=None, description="Path to a tsv file with one column of sample IDs: s.")
    vep_config_json_path = luigi.OptionalParameter(default=None, description="Path of hail vep config .json file")

    def __init__(self, *args, **kwargs):
        # TODO: instead of hardcoded index, generate from project_guid, etc.
        # NOTE(review): self.dest_path is read before super().__init__() has
        # run -- confirm it already holds the resolved value at this point.
        kwargs['source_path'] = self.dest_path
        super().__init__(*args, **kwargs)

        self.completed_marker_path = os.path.join(self.dest_path, '_EXPORTED_TO_ES')

    def _completion_marker(self):
        """Return a fresh target pointing at the ``_EXPORTED_TO_ES`` marker."""
        return GCSorLocalTarget(filename=self.completed_marker_path)

    def requires(self):
        """Depend on the VCF-to-matrix-table task, forwarding every parameter."""
        forwarded = (
            'source_paths',
            'dest_path',
            'genome_version',
            'vep_runner',
            'reference_ht_path',
            'clinvar_ht_path',
            'hgmd_ht_path',
            'sample_type',
            'dont_validate',
            'dataset_type',
            'remap_path',
            'subset_path',
            'vep_config_json_path',
        )
        upstream_kwargs = {name: getattr(self, name) for name in forwarded}
        return [SeqrVCFToMTTask(**upstream_kwargs)]

    def output(self):
        # TODO: Use https://luigi.readthedocs.io/en/stable/api/luigi.contrib.esindex.html.
        return self._completion_marker()

    def complete(self):
        """Report whether the ``_EXPORTED_TO_ES`` marker file exists.

        Luigi calls this to decide whether the task can be skipped.  The
        marker is written only at the end of ``run``, so an export that was
        terminated halfway is not treated as done.
        """
        return self._completion_marker().exists()

    def run(self):
        """Export the table to Elasticsearch, write the marker, then clean up."""
        mt = self.import_mt()
        es_rows = SeqrVariantsAndGenotypesSchema.elasticsearch_row(mt)
        shard_count = self._mt_num_shards(mt)
        self.export_table_to_elasticsearch(es_rows, shard_count)

        # Marker is written only after the export call has returned.
        with hl.hadoop_open(self.completed_marker_path, "w") as marker:
            marker.write(".")

        self.cleanup()
예제 #4
0
class mapReadsToGenome(luigi.Task):
    """Fan out one ``alignReads`` task per sample and log completion.

    Sample names come from ``sample_list/pe_samples.lst`` or
    ``sample_list/se_samples.lst`` under the current working directory,
    selected by ``read_library_type``.
    """

    # Global Parameters
    project_name = luigi.Parameter(default="RNASeqAnalysis")
    read_library_type = GlobalParameter().read_library_type
    adapter = luigi.Parameter(default="./tasks/utility/adapters.fasta.gz")
    genome_name = GlobalParameter().genome_name
    organism_domain = GlobalParameter().organism_domain
    threads = GlobalParameter().threads
    maxMemory = GlobalParameter().maxMemory

    # Local Parameters
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    rnaseq_aligner = luigi.ChoiceParameter(
        choices=["subread", "star", "hisat2", "dart", "segemehl", "bowtie2"],
        var_type=str)
    annotation_file_type = luigi.ChoiceParameter(choices=["GFF", "GTF"],
                                                 var_type=str)

    def requires(self):
        """Return one ``alignReads`` task per line of the sample list.

        Returns:
            A list of ``alignReads`` tasks when ``read_library_type`` is
            ``"pe"`` or ``"se"``; ``None`` for any other value.
        """
        sample_lists = {"pe": "pe_samples.lst", "se": "se_samples.lst"}
        list_name = sample_lists.get(self.read_library_type)
        if list_name is None:
            # Unrecognised library type: no dependencies are declared.
            return None

        list_path = os.path.join(os.getcwd(), "sample_list", list_name)
        # Read the whole list inside a context manager so the file handle is
        # closed before any task objects are built.
        with open(list_path) as sample_file:
            sample_names = [line.strip() for line in sample_file]

        return [
            alignReads(pre_process_reads=self.pre_process_reads,
                       annotation_file_type=self.annotation_file_type,
                       rnaseq_aligner=self.rnaseq_aligner,
                       sampleName=name) for name in sample_names
        ]

    def output(self):
        """Return a log-file target under ``task_logs`` named with the current time.

        The timestamp has one-second resolution, so calls made in different
        seconds yield different paths.
        """
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget(
            os.path.join(
                os.getcwd(), "task_logs",
                'task.align.read.to.genome.complete.{t}'.format(t=timestamp)))

    def run(self):
        """Write a one-line completion message to the output target."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write('Read Alignment finished at {t}'.format(t=timestamp))
class alignmentFreeQuant(luigi.Task):
    """Fan out one ``transQuant`` task per sample and log completion.

    Sample names come from ``sample_list/pe_samples.lst`` or
    ``sample_list/se_samples.lst`` under the current working directory,
    selected by ``read_library_type``.
    """

    project_name = luigi.Parameter(default="RNASeqAnalysis")
    organism_domain = GlobalParameter().organism_domain
    genome_name = GlobalParameter().genome_name
    read_library_type = GlobalParameter().read_library_type
    adapter = GlobalParameter().adapter
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    annotation_file_type = luigi.ChoiceParameter(choices=["GFF", "GTF", "NA"],
                                                 var_type=str)
    quant_method = luigi.ChoiceParameter(choices=["salmon", "kallisto"],
                                         var_type=str)

    def requires(self):
        """Return one ``transQuant`` task per line of the sample list.

        Returns:
            A list of ``transQuant`` tasks when ``read_library_type`` is
            ``"pe"`` or ``"se"``; ``None`` for any other value.
        """
        sample_lists = {"pe": "pe_samples.lst", "se": "se_samples.lst"}
        list_name = sample_lists.get(self.read_library_type)
        if list_name is None:
            # Unrecognised library type: no dependencies are declared.
            return None

        list_path = os.path.join(os.getcwd(), "sample_list", list_name)
        # Read the whole list inside a context manager so the file handle is
        # closed before any task objects are built.
        with open(list_path) as sample_file:
            sample_names = [line.strip() for line in sample_file]

        return [
            transQuant(quant_method=self.quant_method,
                       pre_process_reads=self.pre_process_reads,
                       annotation_file_type=self.annotation_file_type,
                       sampleName=name) for name in sample_names
        ]

    def output(self):
        """Return a log-file target under ``task_logs`` named with the current time.

        The timestamp has one-second resolution, so calls made in different
        seconds yield different paths.
        """
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget(
            os.path.join(
                os.getcwd(), "task_logs",
                'task.generate.transcript.count.complete.{t}'.format(
                    t=timestamp)))

    def run(self):
        """Write a one-line completion message to the output target."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write('Generate Transcript Count finished at {t}'.format(
                t=timestamp))
예제 #6
0
class alignmentBasedQuant(luigi.Task):
    """Require a ``featureCounts`` task and log when it has finished."""

    project_name = luigi.Parameter(default="RNASeqAnalysis")
    read_library_type = GlobalParameter().read_library_type
    threads = GlobalParameter().threads
    genome_name = GlobalParameter().genome_name
    adapter = GlobalParameter().adapter
    organism_domain = GlobalParameter().domain
    feature_type = GlobalParameter().feature_type
    annotation_file_type = GlobalParameter().annotation_suffix

    rnaseq_aligner = luigi.ChoiceParameter(
        choices=["subread", "star", "hisat2", "dart", "segemehl", "bowtie2"],
        var_type=str)

    attribute_type = luigi.Parameter(
        default="gene_id",
        description='''Specify attribute type in GTF annotation. 
												 string(=[gene_id])''')

    strandType = luigi.ChoiceParameter(
        default="0",
        choices=['0', '1', '2'],
        description=
        '''perform strand-specific read counting. int([=0]unstranded) 
																OR [=1] stranded] OR [=2] reversely-stranded. default[
																=0]''')
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)

    def requires(self):
        """Depend on a single ``featureCounts`` task.

        ``annotation_file_type`` is not forwarded to it.
        """
        counting_task = featureCounts(pre_process_reads=self.pre_process_reads,
                                      rnaseq_aligner=self.rnaseq_aligner,
                                      attribute_type=self.attribute_type,
                                      strandType=self.strandType)
        return [counting_task]

    def output(self):
        """Return a log-file target under ``task_logs`` named with the current time."""
        stamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        marker_name = f'task.generate.count.complete.{stamp}'
        marker_path = os.path.join(os.getcwd(), "task_logs", marker_name)
        return luigi.LocalTarget(marker_path)

    def run(self):
        """Write a one-line completion message to the output target."""
        finished_at = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        message = f'Count File Generation finished at {finished_at}'
        with self.output().open('w') as marker:
            marker.write(message)
예제 #7
0
class DecisionTreeClassifierTraining(BaseModelTraining):
    """Train a ``DecisionTreeClassifier`` and save importances plus a tree plot.

    Each luigi parameter declared here is passed to the
    ``DecisionTreeClassifier`` keyword argument of the same name.
    """

    criterion: str = luigi.ChoiceParameter(choices=["gini", "entropy"],
                                           default="gini")
    splitter: str = luigi.ChoiceParameter(choices=["best", "random"],
                                          default="best")
    max_depth: int = luigi.IntParameter(default=None)
    min_samples_split: int = luigi.IntParameter(default=2)
    min_samples_leaf: int = luigi.IntParameter(default=1)
    min_weight_fraction_leaf: float = luigi.FloatParameter(default=0.0)
    max_features: int = luigi.IntParameter(default=None)
    max_leaf_nodes: int = luigi.IntParameter(default=None)
    min_impurity_decrease: float = luigi.FloatParameter(default=0.0)
    ccp_alpha: float = luigi.FloatParameter(default=0.0)

    def create_model(self) -> BaseEstimator:
        """Instantiate the classifier from this task's parameters.

        The string ``"none"`` for ``class_weight`` is translated to ``None``.
        """
        passthrough = (
            "criterion",
            "splitter",
            "max_depth",
            "min_samples_split",
            "min_samples_leaf",
            "min_weight_fraction_leaf",
            "max_features",
            "max_leaf_nodes",
            "min_impurity_decrease",
            "ccp_alpha",
        )
        tree_kwargs = {name: getattr(self, name) for name in passthrough}

        if self.class_weight != "none":
            class_weight = self.class_weight
        else:
            class_weight = None

        return DecisionTreeClassifier(class_weight=class_weight,
                                      random_state=self.seed,
                                      **tree_kwargs)

    def _save_feature_importances(self, metrics: dict):
        """Dump ``metrics`` as ``feature_importances.json`` in the output dir."""
        destination = os.path.join(self.output().path,
                                   "feature_importances.json")
        with open(destination, "w") as handle:
            json.dump(metrics, handle, indent=4)

    def run(self):
        """Run the base training, then write importances and the tree image."""
        super().run()

        importances = {
            "feature_names": self.feature_names,
            "feature_importances": self.model.feature_importances_.tolist(),
        }
        self._save_feature_importances(importances)

        if not isinstance(self.model, DecisionTreeClassifier):
            return

        figure = plot_decision_tree(self.model, self.feature_names,
                                    ["No Stroke", "Stroke"])
        figure.savefig(os.path.join(self.output().path, "tree.png"))
예제 #8
0
class ConvertPV(luigi.Task):
    """
    Convert Landsat-8 data from Biome or 38-Clouds dataset to same spectral and spatial resolution as Proba-V.
    Following the transformation proposed in:
    Transferring deep learning models for cloud detection between Landsat-8 and Proba-V
    https://www.sciencedirect.com/science/article/abs/pii/S0924271619302801

    Landsat-8 file is exported in the same format as Proba-V (HDF5 files)

    python convert_landsat_probav.py ConvertPV --l8img BC/LC80010112014080LGN00

    """
    l8img = luigi.Parameter(description="Folder with Landsat 8 image")
    outfolder = luigi.Parameter(default="landsataspv")
    resolution = luigi.ChoiceParameter(choices=["333M", "100M", "1KM"],
                                       default="333M")
    type_product = luigi.ChoiceParameter(
        choices=["biome", "38c", "landsat8"],
        default="landsat8",
        description=
        "Flag that indicates if the product has a manually annotated "
        "cloud mask from Biome or 38-Clouds dataset")

    def l8obj(self):
        """Return the image wrapper for ``l8img``, creating it on first use.

        The wrapper class depends on ``type_product``; the instance is cached
        on the task as ``l8obj_computed``.
        """
        if hasattr(self, "l8obj_computed"):
            return self.l8obj_computed

        if self.type_product == "biome":
            wrapper_cls = l8image.Biome
        elif self.type_product == "38c":
            wrapper_cls = l8image.L8_38Clouds
        else:
            wrapper_cls = l8image.L8Image

        self.l8obj_computed = wrapper_cls(self.l8img)
        return self.l8obj_computed

    def output(self):
        """Target ``<outfolder>/<image name>_<resolution>.HDF5``."""
        image = self.l8obj()
        file_name = image.name + "_" + str(self.resolution) + ".HDF5"
        return luigi.LocalTarget(os.path.join(self.outfolder, file_name))

    def run(self):
        """Create the output directory and run the conversion into it."""
        image = self.l8obj()
        target = self.output()
        target.makedirs()
        landsat_as_pv.convert_landsat_to_probav(image,
                                                self.resolution,
                                                file_out=target.path)
예제 #9
0
class dbg2olc(luigi.Task):
    """Shell out to ``DBG2OLC`` using the contigs under ``GenomeAssembly/MINIA``.

    All paths are resolved relative to the current working directory.
    """

    projectName = luigi.Parameter(default="GenomeAssembly")
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"], var_type=str)
    read_library_type = luigi.ChoiceParameter(description="Choose From['pe-lr: paired-end and long read',"
                                                "'pe-mp-lr: paired-end, mate-pair and long read'",
                                    choices=["pe-lr","pe-mp-lr"], var_type=str)

    def requires(self):
        """Depend on ``minia`` for either supported library type."""
        if self.read_library_type in ("pe-lr", "pe-mp-lr"):
            return [minia(read_library_type=self.read_library_type,
                          pre_process_reads=self.pre_process_reads)]

    def output(self):
        """Target ``GenomeAssembly/DBG2OLC/DBG2OLC_contigs.fa`` under the key ``'out'``."""
        assembly_dir = os.path.join(os.getcwd(), "GenomeAssembly", "DBG2OLC/")
        return {'out': luigi.LocalTarget(assembly_dir + "DBG2OLC_contigs.fa")}

    def run(self):
        """Build the DBG2OLC command line and execute it through ``run_cmd``."""
        cwd = os.getcwd()
        minia_dir = os.path.join(cwd, "GenomeAssembly", "MINIA/")
        assembly_dir = os.path.join(cwd, "GenomeAssembly", "DBG2OLC/")
        log_dir = os.path.join(cwd, "log", "GenomeAssembly", "DBG2OLC/")

        # k-mer size is derived from the MINIA file-of-filenames.
        fofn_path = os.path.join(os.getcwd(), "GenomeAssembly", "MINIA", "minia.fofn")
        kmer = minia_kmer(fofn_path)

        print("Optimal Kmer: ", kmer)

        # Long-read arguments are produced from the long-read sample list.
        long_read_list = os.path.join(os.getcwd(), "sample_list", "lr_samples.lst")
        long_read_args = dbg2olc_formater(long_read_list)

        command = (
            f"[ -d  {assembly_dir} ] || mkdir -p {assembly_dir}; "
            f"mkdir -p {log_dir}; cd {assembly_dir}; "
            "/usr/bin/time -v DBG2OLC "
            f"k {kmer} Contigs {minia_dir}minia.contigs.fa "
            "KmerCovTh 2 MinOverlap 20 AdaptiveTh 0.005 "
            f"{long_read_args} "
            f"2>&1 | tee {log_dir}dbg2olc_assembly.log "
        )

        if self.read_library_type in ("pe-lr", "pe-mp-lr"):
            print("****** NOW RUNNING COMMAND ******: " + command)
            print(run_cmd(command))
예제 #10
0
class DownloadSample(TaskWithOutputMixin, WrapperTask):
    """
    Generic task for downloading an individual sample in an experiment.

    The concrete download task is chosen from ``source``.  Note that the
    'gemma' source does not provide individual samples; it is routed to the
    same task as 'geo'.
    """
    experiment_id = luigi.Parameter()
    sample_id = luigi.Parameter()

    source = luigi.ChoiceParameter(
        default='local',
        choices=['gemma', 'geo', 'arrayexpress', 'local', 'sra'],
        positional=False)

    def requires(self):
        """Return the source-specific download task.

        Raises:
            ValueError: if ``source`` matches none of the handled values.
        """
        origin = self.source

        if origin in ('geo', 'gemma'):
            return DownloadGeoSample(self.sample_id)
        if origin == 'sra':
            return DownloadSraExperiment(self.sample_id)
        if origin == 'arrayexpress':
            return DownloadArrayExpressSample(self.experiment_id,
                                              self.sample_id)
        if origin == 'local':
            return DownloadLocalSample(self.experiment_id, self.sample_id)

        message = 'Unknown source for sample: {}.'.format(self.source)
        raise ValueError(message)
예제 #11
0
class Merge(BcftoolsTask):
    """
    Merge the samples of two or more VCF files
    """
    input_file = luigi.ListParameter()

    filter_logic = luigi.ChoiceParameter(default='+',
                                         choices=['x', '+'],
                                         positional=False)
    info_rules = luigi.ListParameter(default=[], positional=False)

    output_file = luigi.Parameter()
    output_format = luigi.Parameter(positional=False, default='z')

    def subcommand_args(self):
        """Return the ``merge`` subcommand followed by its options.

        ``--info-rules`` is emitted only when ``info_rules`` is non-empty,
        with the rules joined by commas.
        """
        if self.info_rules:
            info_rule_args = ['--info-rules', ','.join(self.info_rules)]
        else:
            info_rule_args = []

        return [
            'merge',
            '--filter-logic', self.filter_logic,
            *info_rule_args,
            '--output-type', self.output_format,
            '--output', self.output_file,
        ]

    def subcommand_input_args(self):
        """Return the input files exactly as given in ``input_file``."""
        return self.input_file

    def output(self):
        """Local target at ``output_file``."""
        return luigi.LocalTarget(self.output_file)
예제 #12
0
class FollowFilteredEdgelist(luigi.Task):
    '''Edge list with the edges whose left-hand side is "unknown" removed,
    leaving only data that comes from users with an assigned home location.

    Args:
        --name  Name used in the save path so that the LocationUserList and
                UnknownList it belongs to can be identified
        --month
    '''
    month = luigi.MonthParameter()
    name = luigi.Parameter()
    type = luigi.ChoiceParameter(choices=['followers', 'following'])
    sources = luigi.TupleParameter(default=('followers', 'following'))

    def requires(self):
        """Raw edge list for the month/type plus the home-location table."""
        raw_edges = TwitterFollowRawEdgelist(month=self.month, type=self.type)
        home_locations = RemainedHomeLocation(name=self.name, month=self.month)
        return {'edgelist': raw_edges, 'hl': home_locations}

    def output(self):
        """Target ``NETWORK_DIR/filtered/<name>/<YYYYMM>_<type>.tsv.gz``."""
        # The type is spliced into the strftime pattern before formatting.
        pattern = f'%Y%m_{self.type}.tsv.gz'
        file_name = self.month.strftime(pattern)
        return luigi.LocalTarget(
            os.path.join(NETWORK_DIR, 'filtered', self.name, file_name))

    def run(self):
        """Pipe the raw edge list through the edge filter into a gzip file.

        The result is written to a temporary path supplied by the target.
        """
        with self.output().temporary_path() as temp_output_path:
            inputs = self.input()
            edgelist_path = inputs['edgelist'].path
            home_location_path = inputs['hl'].path
            cmd = (f'zcat {edgelist_path} | python -m snlocest.scripts.edgefilter'
                   f' -i {home_location_path} | gzip > {temp_output_path}')
            run(cmd, shell=True, check=True)
예제 #13
0
class DownloadImageResults(ee_ipl_uv.luigi_utils.DownloadImage):
    """Download an image together with cloud-score, prediction and mask bands.

    ``split`` selects the polygon within the image's location splits and
    ``method`` is handed to ``CloudClusterScore`` as ``method_pred``.
    """

    split = luigi.Parameter()
    method = luigi.ChoiceParameter(choices=["percentile","persistence","linear","kernel"],
                                   var_type=str,
                                   default="percentile")

    def output(self):
        """Raster target ``<basepath>/<image_index>_<split>_<method>``."""
        target_name = "_".join([self.image_index, self.split, self.method])
        return ee_ipl_uv.luigi_utils.RasterTarget(
            os.path.join(self.basepath, target_name))

    def load_region_of_interest(self):
        """Return the split's polygon with the two coordinates of each point swapped."""
        split_polygons = get_location_splits()[str(self.image_index)][str(self.split)]
        return [[point[1], point[0]] for point in split_polygons[0]]

    def load_image(self):
        """Build the multi-band image to download.

        Returns:
            A ``(image, properties)`` tuple: the clipped float image and the
            list of property names to keep.
        """
        source_image = ee.Image('LANDSAT/LC8_L1T_TOA_FMASK/' + str(self.image_index))

        # Select region of interest (lng,lat)
        polygon = self.load_region_of_interest()
        region = ee.Geometry.Polygon(polygon)

        cloud_score, prediction = multitemporal_cloud_masking.CloudClusterScore(
            source_image, region, method_pred=self.method)

        mask_image = ee.Image("users/gonzmg88/LANDSAT8_CLOUDS/" + self.image_index + "_fixedmask")

        # Bands are appended in order: cloud score, fixed mask, prediction.
        with_score = source_image.addBands(
            cloud_score.select(["cluster"], ["cloudscore"]))
        with_mask = with_score.addBands(
            mask_image.select(["b1"], ["fixedmask"]))
        with_prediction = with_mask.addBands(prediction)
        image_download = with_prediction.clip(region).toFloat()

        return image_download, ["system:time_start", 'system:index']
예제 #14
0
class compositionProfiling(luigi.Task):
	"""Fan out one ``metaphlan`` task per sample and log completion.

	Sample names are read from ``config/pe_samples.lst`` under the current
	working directory, one per line.
	"""
	project_name=GlobalParameter().projectName
	adapter = GlobalParameter().adapter
	threads = GlobalParameter().threads
	max_memory = GlobalParameter().maxMemory
	pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"], var_type=str)
	read_library_type = GlobalParameter().seq_platforms

	def requires(self):
		"""Return one ``metaphlan`` task per line of ``config/pe_samples.lst``."""
		sample_list_path = os.path.join(os.getcwd(), "config", "pe_samples.lst")

		# Read the whole list inside a context manager so the file handle is
		# closed before any task objects are built.
		with open(sample_list_path) as sample_list:
			sample_names = [line.strip() for line in sample_list]

		return [metaphlan(pre_process_reads=self.pre_process_reads,
				sampleName=name)
				for name in sample_names]

	def output(self):
		"""Return a log-file target under ``task_logs`` named with the current time."""
		# NOTE(review): file name and message mention "binning" although this
		# task schedules metaphlan -- confirm whether that is intentional.
		timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
		return luigi.LocalTarget(os.path.join(os.getcwd(),"task_logs",'task.genome.binning.complete.{t}'.format(
			t=timestamp)))

	def run(self):
		"""Write a one-line completion message to the output target."""
		timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
		with self.output().open('w') as outfile:
			outfile.write('Metagenome binning finished at {t}'.format(t=timestamp))
예제 #15
0
class profileTaxonomy(luigi.Task):
	"""Run ``ampvis.r`` on the OTU table found under ``metaphlan_analysis``.

	Paths are built from the current working directory and the global
	``projectName``.
	"""
	project_name=GlobalParameter().projectName
	adapter = GlobalParameter().adapter
	threads = GlobalParameter().threads
	max_memory = GlobalParameter().maxMemory
	pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"], var_type=str)
	read_library_type = GlobalParameter().seq_platforms
	condition_column=luigi.Parameter(default="conditions")

	def requires(self):
		"""Depend on a single ``graphlan`` task."""
		return [graphlan(pre_process_reads=self.pre_process_reads)]

	def output(self):
		"""Target ``family_heatmap.tiff`` in the figures folder, keyed ``'out1'``."""
		figures_dir = os.path.join(os.getcwd(), GlobalParameter().projectName, "metaphlan_analysis", "figures/")
		heatmap_path = figures_dir + "/" + "family_heatmap.tiff"
		return {'out1': luigi.LocalTarget(heatmap_path)}

	def run(self):
		"""Assemble the ampvis command line and execute it through ``run_cmd``."""
		cwd = os.getcwd()
		project = GlobalParameter().projectName
		work_dir = os.path.join(cwd, project, "metaphlan_analysis/")
		data_dir = os.path.join(cwd, project, "metaphlan_analysis", "data_for_images/")
		mapping_file = os.path.join(cwd, "config", "metagenome_condition.tsv")

		# Create the working directory if needed, cd into it, then run ampvis.
		cmd_run_ampvis = (
			f"[ -d {work_dir} ] || mkdir -p {work_dir} ;  cd {work_dir} ;"
			f"ampvis.r -t {mapping_file} "
			f"-v {self.condition_column} "
			f"-a {data_dir}/otu_table_ampvis2.txt"
		)
		print("****NOW RUNNING COMMAND****:" + cmd_run_ampvis)
		print(run_cmd(cmd_run_ampvis))
예제 #16
0
class DownloadExperiment(TaskWithPriorityMixin, TaskWithOutputMixin,
                         WrapperTask):
    """
    Generic task that dispatches to the download task matching ``source`` so
    that downstream tasks can process an experiment regardless of the data
    source.

    :experiment_id: Identifier passed unchanged to the source-specific task.
    :source: Origin of the experiment. Must be one of the declared choices;
    defaults to ``'local'``.
    """
    experiment_id = luigi.Parameter()

    source = luigi.ChoiceParameter(
        default='local',
        choices=['gemma', 'geo', 'sra', 'arrayexpress', 'local'],
        positional=False)

    def requires(self):
        """Return the download task for ``source``.

        Raises:
            ValueError: if ``source`` matches none of the handled values; the
                message names the offending source.
        """
        if self.source == 'gemma':
            return DownloadGemmaExperiment(self.experiment_id)
        elif self.source == 'geo':
            return DownloadGeoSeries(self.experiment_id)
        elif self.source == 'sra':
            return DownloadSraProject(self.experiment_id)
        elif self.source == 'arrayexpress':
            return DownloadArrayExpressExperiment(self.experiment_id)
        elif self.source == 'local':
            return DownloadLocalExperiment(self.experiment_id)
        else:
            # Fill the placeholder so the error reports the actual source
            # instead of a literal "{}".
            raise ValueError(
                'Unknown download source for experiment: {}.'.format(
                    self.source))
예제 #17
0
class QualityControlExperiment(TaskWithPriorityMixin,
                               DynamicTaskWithOutputMixin, DynamicWrapperTask):
    """
    Quality control all the samples in a given experiment.
    """
    experiment_id = luigi.Parameter()
    source = luigi.ChoiceParameter(
        default='local',
        choices=['gemma', 'geo', 'sra', 'arrayexpress', 'local'],
        positional=False)

    def _source_download_task(self):
        """Return the source-specific task that ``DownloadExperiment`` requires."""
        wrapper = DownloadExperiment(self.experiment_id, source=self.source)
        return wrapper.requires()

    def requires(self):
        """Require whatever the source-specific download task itself requires."""
        return self._source_download_task().requires()

    def run(self):
        """Yield one ``QualityControlSample`` per sample download task.

        The sample download tasks are the first value produced by the
        source-specific download task's ``run()``.
        """
        download_sample_tasks = next(self._source_download_task().run())

        qc_tasks = []
        for sample_task in download_sample_tasks:
            qc_tasks.append(
                QualityControlSample(self.experiment_id,
                                     sample_task.sample_id,
                                     source=self.source))
        yield qc_tasks
예제 #18
0
class MercadoLivreTraining(SupervisedModelTraining):
    """Supervised training task with an optional class-weighted loss."""

    loss_function: str = luigi.ChoiceParameter(choices=["ce", "custom_ce"],
                                               default="ce")

    def class_weights(self):
        """Return inverse-frequency class weights preceded by three zeros.

        Frequencies are counted over the output column of the train and
        validation data frames combined, ordered by class value.
        """
        target = self.project_config.output_column.name

        train_labels = self.train_dataset._data_frame[[target]]
        val_labels = self.val_dataset._data_frame[[target]]
        labels = pd.concat([train_labels, val_labels])

        class_counts = labels[target].value_counts().sort_index().values
        inverse_frequency = 1 / class_counts

        # Three zero weights are placed ahead of the per-class weights.
        return np.array([0, 0, 0] + list(inverse_frequency))

    def _get_loss_function(self):
        """Instantiate the configured loss from ``TORCH_LOSS_FUNCTIONS``.

        For ``'custom_ce'`` the loss parameters are copied into a plain dict
        and extended with ``class_weights`` before the loss is built.
        """
        if self.loss_function == 'custom_ce':
            self.loss_function_params = dict(self.loss_function_params)
            self.loss_function_params.update(
                class_weights=self.class_weights())

        loss_factory = TORCH_LOSS_FUNCTIONS[self.loss_function]
        return loss_factory(**self.loss_function_params)
예제 #19
0
class PrepareReference(ScheduledExternalProgramTask):
    """Build the argument list for ``rsem-prepare-reference``."""

    task_namespace = 'rsem'

    annotation_file = luigi.Parameter()
    reference_fasta_files = luigi.ListParameter()
    reference_name = luigi.Parameter()

    aligner = luigi.ChoiceParameter(choices=['star'], positional=False)

    star_path = luigi.OptionalParameter(default=None, positional=False)

    def program_args(self):
        """Return the command line as a list.

        Order: executable, ``--gtf``, aligner flags, ``-p``, the reference
        FASTA files, and finally the reference name.
        """
        aligner_flags = []
        if self.aligner == 'star':
            aligner_flags.append('--star')
            # An explicit STAR location is passed only when one was given.
            if self.star_path is not None:
                aligner_flags += ['--star-path', self.star_path]

        return [
            join(cfg.rsem_dir, 'rsem-prepare-reference'),
            '--gtf', self.annotation_file,
            *aligner_flags,
            '-p', self.cpus,
            *self.reference_fasta_files,
            self.reference_name,
        ]

    def output(self):
        """Target representing the prepared reference named ``reference_name``."""
        return RsemReference(self.reference_name)
예제 #20
0
class Pull_data(lu.Task):
    """Download the StreetEasy CSV for one borough and store it locally.

    The output path encodes the environment (``prod``/``staging``), the
    borough and the version ``v``, relative to ``this_folder``.
    """
    v = lu.NumericalParameter(default=0.1,
                              var_type=float,
                              min_value=0,
                              max_value=100)

    boro = lu.ChoiceParameter(default='Queens',
                              var_type=str,
                              choices=['Queens', 'Brooklyn', 'Manhattan'])

    prod = lu.BoolParameter()

    def output(self):
        """Return the local CSV target for the current parameters."""
        prod_ = "prod" if self.prod else 'staging'
        path = f'data/{prod_}/{self.boro}/raw_{self.v}.csv'
        path = str(this_folder / path)

        return lu.LocalTarget(path)

    def run(self):
        """Fetch the borough CSV and write it to the output atomically."""
        source = f'https://raw.githubusercontent.com/Codecademy/datasets/master/streeteasy/{self.boro.lower()}.csv'
        data = pd.read_csv(source)

        target = self.output()
        target.makedirs()
        # Write to a temporary path that is renamed on success, so a failed
        # write never leaves a partial file that would mark the task complete.
        with target.temporary_path() as tmp_path:
            data.to_csv(tmp_path)
class binRefinement(luigi.Task):
    """Wrapper task: require ``refineM`` and write a completion marker.

    The marker is a file under ``<cwd>/task_logs`` whose name embeds the
    current time.
    """
    project_name = GlobalParameter().projectName
    adapter = GlobalParameter().adapter
    threads = GlobalParameter().threads
    max_memory = GlobalParameter().maxMemory
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    read_library_type = GlobalParameter().seq_platforms
    # The default must be an int: luigi does not parse parameter defaults,
    # so a string default would be forwarded to refineM unconverted.
    min_contig_length = luigi.IntParameter(default=1500)

    def requires(self):
        """Depend on ``refineM`` with this task's settings."""
        return [
            refineM(pre_process_reads=self.pre_process_reads,
                    min_contig_length=self.min_contig_length)
        ]

    def output(self):
        """Return the timestamped marker target.

        The file name has one-second resolution, so calls made in different
        seconds return different targets.
        NOTE(review): this looks intended to make the wrapper re-run on
        every invocation -- confirm before changing.
        """
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget(
            os.path.join(
                os.getcwd(), "task_logs",
                'task.bin.refinement.complete.{t}'.format(t=timestamp)))

    def run(self):
        """Write the completion message into the marker file."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write('Bin Refinement finished at {t}'.format(t=timestamp))
예제 #22
0
class ScheduledExternalProgramTask(ExternalProgramTask):
    """
    Variant of :class:`luigi.contrib.external_program.ExternalProgramTask` that
    executes the task with a :class:`Scheduler`.

    With the ``local`` scheduler the parent implementation runs the program;
    any other scheduler name is looked up in ``_schedulers`` and the run is
    delegated to it.
    """
    scheduler = luigi.ChoiceParameter(
        default=cfg.scheduler,
        choices=['local'] + list(_schedulers),
        positional=False,
        significant=False,
        description='Scheduler to use for running the task')
    scheduler_partition = luigi.OptionalParameter(
        default=cfg.scheduler_partition,
        positional=False,
        significant=False,
        description='Scheduler partition (or queue) to use if supported')
    scheduler_extra_args = luigi.ListParameter(
        default=cfg.scheduler_extra_args,
        positional=False,
        significant=False,
        description='Extra arguments to pass to the scheduler')

    walltime = luigi.TimeDeltaParameter(
        default=datetime.timedelta(),
        positional=False,
        significant=False,
        description='Amout of time to allocate for the task, default value of zero implies unlimited time')
    cpus = luigi.IntParameter(
        default=1,
        positional=False,
        significant=False,
        description='Number of CPUs to allocate for the task')
    memory = luigi.FloatParameter(
        default=1,
        positional=False,
        significant=False,
        description='Amount of memory (in gigabyte) to allocate for the task')

    def __init__(self, *args, **kwargs):
        """Initialise the task and resolve the non-local scheduler, if any.

        Raises:
            ValueError: if ``scheduler`` is not ``'local'`` and has no entry
                in ``_schedulers``.
        """
        super(ScheduledExternalProgramTask, self).__init__(*args, **kwargs)
        if self.scheduler == 'local':
            # Local runs never use ``self._scheduler``.
            return
        try:
            self._scheduler = _schedulers[self.scheduler]
        except KeyError:
            raise ValueError('Unsupported scheduler {}'.format(self.scheduler))

    @property
    def resources(self):
        """Resources claimed by the task, depending on the scheduler."""
        if self.scheduler != 'local':
            return {'{}_jobs'.format(self.scheduler): 1}
        # local_jobs is actually constrained by the number of workers
        return {'cpus': self.cpus, 'memory': self.memory}

    def run(self):
        """Run locally through the parent class, or through the scheduler."""
        if self.scheduler != 'local':
            return self._scheduler.run_task(self)
        return super(ScheduledExternalProgramTask, self).run()
예제 #23
0
class LeaveOneOutPrediction(luigi.Task):
    """Run the ``snlocest.scripts.loocv`` script and capture its stdout.

    The command is executed through the shell with stdout redirected into a
    temporary path that becomes the task output on success.
    """
    name = luigi.Parameter()  # dataset name
    edgetype = luigi.ChoiceParameter(
        choices=['linked', 'mutual', 'followee', 'follower'])
    method = luigi.Parameter()
    extra = luigi.BoolParameter(default=False)

    def requires(self):
        """Require the edge list and the ground-truth home locations."""
        return dict(
            edgelist=Edgelist(name=self.name, edgetype=self.edgetype),
            truth=HomeLocation(name=self.name),
        )

    def output(self):
        """Return the TSV target for this dataset, method and edge type."""
        file_name = 'f_{}.tsv'.format(self.edgetype)
        return luigi.LocalTarget(
            os.path.join('data/experiments/loocv/predicted', self.name,
                         self.method, file_name))

    def run(self):
        """Format the shell command and execute it, failing on non-zero exit."""
        extra_cmd = '--extra' if self.extra else ''
        cmd = 'python -m snlocest.scripts.loocv {edgelist.path} {truth.path} {} --fast {} > {}'
        with self.output().temporary_path() as temp_output_path:
            # Positional fields: method, optional flag, redirect destination;
            # the named fields come from the input targets.
            shell_line = cmd.format(self.method, extra_cmd, temp_output_path,
                                    **self.input())
            run(shell_line, shell=True, check=True)
예제 #24
0
class CloudMask(luigi.Task):
    """Base task that predicts a cloud mask and saves it as a GeoTIFF.

    Subclasses must override :meth:`satobj` and :meth:`satname`; the base
    implementations raise ``NotImplementedError``.
    """
    namemodel = luigi.ChoiceParameter(
        description="name to save the binary cloud mask",
        choices=["rgbi", "rgbiswir"],
        default="rgbiswir")

    def satobj(self):
        """Return the satellite object to process (must be overridden)."""
        # The message names the hook that is actually missing.
        raise NotImplementedError("Must add a satobj")

    def satname(self):
        """Return the satellite name for the model (must be overridden)."""
        raise NotImplementedError("Must add a satname")

    def cloud_detection_model(self):
        """Return the cloud model, creating and caching it on first use."""
        if not hasattr(self, "model_clouds"):
            self.model_clouds = utils.Model(satname=self.satname(),
                                            namemodel=self.namemodel)

        return self.model_clouds

    def output(self):
        """Return the mask target inside the satellite object's folder."""
        path_img = os.path.join(self.satobj().folder,
                                "dluvclouds_" + self.namemodel + ".tif")
        return luigi.LocalTarget(path_img)

    def run(self):
        """Predict the cloud mask and write it to the output path."""
        satobj = self.satobj()
        model = self.cloud_detection_model()
        cloud_prob_bin = model.predict(satobj)

        # Save the cloud mask
        utils.save_cloud_mask(satobj, cloud_prob_bin, self.output().path)
예제 #25
0
class AlignExperiment(TaskWithPriorityMixin, DynamicTaskWithOutputMixin,
                      DynamicWrapperTask):
    """
    Align all the samples in a given experiment.

    The output is one sample alignment output per sample contained in the
    experiment.

    The sample list is obtained at run time from the task that
    ``DownloadExperiment`` requires, and one ``AlignSample`` task is yielded
    per entry.
    """
    experiment_id = luigi.Parameter()
    source = luigi.ChoiceParameter(
        default='local',
        choices=['gemma', 'geo', 'sra', 'arrayexpress', 'local'],
        positional=False)
    taxon = luigi.Parameter(default='human', positional=False)
    reference_id = luigi.Parameter(default='hg38_ncbi', positional=False)
    scope = luigi.Parameter(default='genes', positional=False)

    def requires(self):
        """Return the requirements two levels below ``DownloadExperiment``."""
        download_experiment = DownloadExperiment(self.experiment_id,
                                                 source=self.source)
        return download_experiment.requires().requires()

    def run(self):
        """Yield one ``AlignSample`` task per downloaded sample."""
        download_experiment = DownloadExperiment(self.experiment_id,
                                                 source=self.source)
        # The first value produced by the upstream ``run`` is the collection
        # of per-sample download tasks.
        download_sample_tasks = next(download_experiment.requires().run())

        align_tasks = [
            AlignSample(self.experiment_id,
                        sample_task.sample_id,
                        source=self.source,
                        taxon=self.taxon,
                        reference_id=self.reference_id,
                        scope=self.scope)
            for sample_task in download_sample_tasks
        ]
        yield align_tasks
예제 #26
0
class DTA(luigi.Task):
    """Wrapper task: run the selected assembler and write a marker file.

    The marker is a file under ``<cwd>/task_logs`` whose name embeds the
    current time.
    """
    project_name = luigi.Parameter(default="RNASeqAnalysis")
    adapter = GlobalParameter().adapter
    organism_domain = GlobalParameter().organism_domain
    threads = GlobalParameter().threads
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    read_library_type = GlobalParameter().read_library_type
    rnaseq_assembler = luigi.ChoiceParameter(
        choices=["trinity", "spades", "rockhopper"], var_type=str)

    def requires(self):
        """Return the assembler task matching ``rnaseq_assembler``.

        ``rockhopper`` is only selected for prokaryotes; for any other
        organism domain that choice falls through and nothing is required.
        """
        if (self.organism_domain == "prokaryote"
                and self.rnaseq_assembler == "rockhopper"):
            return [
                rockhopper(project_name=self.project_name,
                           pre_process_reads=self.pre_process_reads)
            ]

        if self.rnaseq_assembler == "spades":
            return [
                # Pass the configured project name, as the other branches do,
                # instead of a hard-coded copy of the default value.
                spades(project_name=self.project_name,
                       read_library_type=self.read_library_type,
                       mode="rna",
                       pre_process_reads=self.pre_process_reads)
            ]

        if self.rnaseq_assembler == "trinity":
            return [
                trinity(project_name=self.project_name,
                        pre_process_reads=self.pre_process_reads)
            ]

    def output(self):
        """Return the timestamped marker target.

        The file name has one-second resolution, so calls made in different
        seconds return different targets.
        """
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget(
            os.path.join(
                os.getcwd(), "task_logs",
                'task.assemble.transcript.complete.{t}'.format(t=timestamp)))

    def run(self):
        """Write the completion message into the marker file."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write(
                'transcript assembly finished at {t}'.format(t=timestamp))
예제 #27
0
class ParameterizedTask(luigi.Task):
    """Task that declares one parameter of each of several luigi types.

    The class only carries parameter declarations; it defines no
    ``requires``, ``output`` or ``run`` of its own.
    """
    example_str = luigi.Parameter(default='foo')
    example_bool = luigi.BoolParameter(default=True)
    example_int = luigi.IntParameter(default=0)
    example_float = luigi.FloatParameter(default=10.5)
    example_dict = luigi.DictParameter(default={'fizz': 'buzz'})
    # The default date is computed once, when this class body is executed.
    example_date = luigi.DateParameter(default=datetime.date.today())
    # No default is declared for this parameter.
    example_choice = luigi.ChoiceParameter(choices=[1, 2, 3], var_type=int)
예제 #28
0
class quantifyDAT(luigi.Task):
    """Wrapper task: quantify every listed sample, then write a marker file.

    Sample names are read from ``<cwd>/sample_list/pe_samples.lst`` or
    ``se_samples.lst`` depending on ``read_library_type``.
    """

    project_name = luigi.Parameter(default="RNASeqAnalysis")
    organism_domain = GlobalParameter().organism_domain
    read_library_type = GlobalParameter().read_library_type
    pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"],
                                              var_type=str)
    adapter = GlobalParameter().adapter
    genome_name = GlobalParameter().genome_name
    rnaseq_assembler = luigi.ChoiceParameter(
        choices=["trinity", "spades", "rockhopper"], var_type=str)
    threads = GlobalParameter().threads

    def requires(self):
        """Return one ``denovoQuant`` task per line of the sample list.

        Library types other than ``"pe"`` and ``"se"`` yield no requirement.
        """
        if self.read_library_type not in ("pe", "se"):
            return None

        sample_list = os.path.join(
            os.getcwd(), "sample_list",
            "{}_samples.lst".format(self.read_library_type))
        # Read inside a context manager so the file handle is always closed.
        with open(sample_list) as handle:
            sample_names = [line.strip() for line in handle]

        return [
            denovoQuant(rnaseq_assembler=self.rnaseq_assembler,
                        pre_process_reads=self.pre_process_reads,
                        project_name=self.project_name,
                        sampleName=sample_name)
            for sample_name in sample_names
        ]

    def output(self):
        """Return the timestamped marker target in the working directory.

        The file name has one-second resolution, so calls made in different
        seconds return different targets.
        """
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        return luigi.LocalTarget('workflow.complete.{t}'.format(t=timestamp))

    def run(self):
        """Write the completion message into the marker file."""
        timestamp = time.strftime('%Y%m%d.%H%M%S', time.localtime())
        with self.output().open('w') as outfile:
            outfile.write('workflow finished at {t}'.format(t=timestamp))
예제 #29
0
class TrivagoModelTrainingMixin(object):
    """Mixin holding model hyper-parameters and a ``SimpleLinearModel`` factory.

    ``window_hist_size`` and ``metadata_size`` are derived from data frames
    provided by the host class and cached on the instance after first use.
    """
    recommender_module_class: str = None
    recommender_extra_params: dict = None

    loss_function: str = luigi.ChoiceParameter(
        choices=TORCH_LOSS_FUNCTIONS.keys(), default="crm")
    n_factors: int = luigi.IntParameter(default=128)
    weight_init: str = luigi.ChoiceParameter(
        choices=TORCH_WEIGHT_INIT.keys(), default="lecun_normal")
    dropout_prob: float = luigi.FloatParameter(default=0.1)
    dropout_module: str = luigi.ChoiceParameter(
        choices=TORCH_DROPOUT_MODULES.keys(), default="alpha")
    activation_function: str = luigi.ChoiceParameter(
        choices=TORCH_ACTIVATION_FUNCTIONS.keys(), default="selu")
    filter_sizes: List[int] = luigi.ListParameter(default=[1, 3, 5])
    num_filters: int = luigi.IntParameter(default=64)

    @property
    def window_hist_size(self):
        """``window_hist_size`` of the first training row, as an int."""
        if hasattr(self, "_window_hist_size"):
            return self._window_hist_size

        first_row = self.train_data_frame.iloc[0]
        self._window_hist_size = int(first_row["window_hist_size"])
        return self._window_hist_size

    @property
    def metadata_size(self):
        """Column count of the metadata frame minus three, as an int."""
        if hasattr(self, "_meta_data_size"):
            return self._meta_data_size

        # NOTE(review): the 3 subtracted columns are presumably non-feature
        # columns of the metadata frame -- confirm.
        column_count = self.metadata_data_frame.shape[1]
        self._meta_data_size = int(column_count - 3)
        return self._meta_data_size

    def create_module(self) -> nn.Module:
        """Build the ``SimpleLinearModel`` from the configured parameters."""
        dropout_cls = TORCH_DROPOUT_MODULES[self.dropout_module]

        module_kwargs = dict(
            project_config=self.project_config,
            index_mapping=self.index_mapping,
            window_hist_size=self.window_hist_size,
            vocab_size=self.vocab_size,
            metadata_size=self.metadata_size,
            n_factors=self.n_factors,
            filter_sizes=self.filter_sizes,
            num_filters=self.num_filters,
            dropout_prob=self.dropout_prob,
            dropout_module=dropout_cls,
        )
        return SimpleLinearModel(**module_kwargs)
예제 #30
0
class metaphlan(luigi.Task):
	"""Profile one paired-end sample with the ``metaphlan`` command.

	Results are written to
	``<cwd>/<projectName>/metaphlan_analysis/profiled_samples`` as
	``<sampleName>.txt`` and ``<sampleName>.out``.
	"""
	project_name=luigi.Parameter(default="MetagenomeAnalysis")
	adapter = GlobalParameter().adapter
	threads = GlobalParameter().threads
	max_memory = GlobalParameter().maxMemory
	pre_process_reads = luigi.ChoiceParameter(choices=["yes", "no"], var_type=str)
	read_library_type = GlobalParameter().seq_platforms
	sampleName = luigi.Parameter(description="name of the sample to be analyzed. (string)")

	def requires(self):
		"""Return one read-preparation task per line of ``config/pe_samples.lst``.

		``cleanFastq`` is used when ``pre_process_reads`` is ``"yes"`` and
		``reformat`` when it is ``"no"``. Library types other than ``"pe"``
		yield no requirement.
		"""
		if self.read_library_type != "pe":
			return None

		if self.pre_process_reads == "yes":
			upstream_task = cleanFastq
		elif self.pre_process_reads == "no":
			upstream_task = reformat
		else:
			return None

		sample_list = os.path.join(os.getcwd(), "config", "pe_samples.lst")
		# Read inside a context manager so the file handle is always closed.
		with open(sample_list) as handle:
			sample_names = [line.strip() for line in handle]

		return [upstream_task(sampleName=sample_name) for sample_name in sample_names]

	def output(self):
		"""Return the per-sample ``.txt`` and ``.out`` targets."""
		profiled_samples = os.path.join(os.getcwd(), GlobalParameter().projectName, "metaphlan_analysis", "profiled_samples")

		# LocalTarget's second positional argument is ``format``, not a path
		# component, so the file name has to be joined onto the directory.
		return {'out1': luigi.LocalTarget(os.path.join(profiled_samples, self.sampleName + ".txt")),
				'out2': luigi.LocalTarget(os.path.join(profiled_samples, self.sampleName + ".out"))}

	def run(self):
		"""Build the shell command for this sample and execute it with ``run_cmd``."""
		project_dir = os.path.join(os.getcwd(), GlobalParameter().projectName)
		# Trailing slashes are required: the command template concatenates
		# these folders directly with the sample name.
		profiled_samples = os.path.join(project_dir, "metaphlan_analysis", "profiled_samples" + "/")

		# Cleaned reads are used when pre-processing was requested,
		# verified reads otherwise.
		read_set = "CleanedReads" if self.pre_process_reads == "yes" else "VerifiedReads"
		pe_read_folder = os.path.join(project_dir, "ReadQC", read_set, "PE-Reads" + "/")

		run_metaphlan = (
			"[ -d {profiled_samples} ] || mkdir -p {profiled_samples} ; "
			"metaphlan --input_type fastq "
			"{pe_read_folder}{sample}_1.fastq,{pe_read_folder}{sample}_2.fastq "
			"--nproc {threads} "
			"--bt2_ps very-sensitive "
			"--bowtie2out {profiled_samples}{sample}.out 2>&1 | tee {profiled_samples}{sample}.txt "
		).format(
			pe_read_folder=pe_read_folder,
			profiled_samples=profiled_samples,
			sample=self.sampleName,
			threads=self.threads)

		print("****NOW RUNNING COMMAND****:" + run_metaphlan)
		print(run_cmd(run_metaphlan))