# Imports assumed by the functions below; in the repository these live at the
# top of their respective modules, alongside the project-local modules
# referenced here (solutiondescription, solution_templates, auto_solutions,
# problem_pb2, pipeline_pb2) and the helper is_primitive_reasonable.
import copy
import logging
import os
import sys
import uuid
from timeit import default_timer as timer


def get_ssl_rpi_solutions(self, add_floats, privileged, rows, dataset):
    """
    Get RPI-based pipelines for SSL tasks.
    """
    if self.types_present is None:
        return

    if 'AUDIO' in self.types_present or \
       'VIDEO' in self.types_present or \
       'TIMESERIES' in self.types_present or \
       'IMAGE' in self.types_present or \
       'TEXT' in self.types_present:
        return

    basic_sol = solutiondescription.SolutionDescription(self.problem)
    basic_sol.set_privileged(privileged)
    basic_sol.set_add_floats(add_floats)
    basic_sol.initialize_RPI_solution('NOTUNE_PIPELINE_RPI')

    outputstep = basic_sol.index_denormalize + 4
    if add_floats is not None and len(add_floats) > 0:
        outputstep = basic_sol.index_denormalize + 5

    sslVariants = solution_templates.sslVariants
    total_cols = self.total_cols
    if rows <= 100000:
        try:
            basic_sol.run_basic_solution(inputs=[dataset],
                                         output_step=outputstep,
                                         dataframe_step=basic_sol.index_denormalize + 1)
            total_cols = basic_sol.get_total_cols()
            logging.info("Total cols = %s", total_cols)
        except Exception:
            logging.error(sys.exc_info()[0])
            basic_sol = None
    else:
        # For very large datasets, restrict SSL variants to the cheaper models.
        sslVariants = ['d3m.primitives.classification.random_forest.SKlearn',
                       'd3m.primitives.classification.bagging.SKlearn']

    if basic_sol is None or total_cols > 200:
        return

    for python_path in sslVariants:
        valid = is_primitive_reasonable(python_path, rows, total_cols,
                                        self.types_present, self.task_name)
        if valid is False:
            continue

        pipe = copy.deepcopy(basic_sol)
        pipe.id = str(uuid.uuid4())
        pipe.add_step('d3m.primitives.semisupervised_classification.iterative_labeling.AutonBox',
                      outputstep=pipe.index_denormalize + 4)
        pipe.add_ssl_variant(python_path)
        self.solutions.append(pipe)

def get_solution_by_task_name(self, name, ML_prim=None, outputstep=3, dataframestep=1):
    """
    Get a single complete pipeline by task name.
    Used for less frequent tasks (not classification or regression).

    name: Name of the pipeline template
    ML_prim: ML primitive to be appended (mostly a classifier/regressor)
    outputstep: Step in the pipeline producing targets
    dataframestep: Dataframe step in the pipeline. Used for constructing predictions.
    """
    pipe = solutiondescription.SolutionDescription(self.problem)
    pipe.initialize_solution(name)
    pipe.id = str(uuid.uuid4())

    if ML_prim is not None:
        step1 = pipe.index_denormalize + outputstep
        step2 = pipe.index_denormalize + dataframestep
        pipe.add_step(ML_prim, outputstep=step1, dataframestep=step2)
    else:
        pipe.add_outputs()

    return pipe

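# A minimal usage sketch, mirroring the CONDITIONER call made in get_solutions()
# below (the template name and primitive path are the ones used elsewhere in
# this section, not new API):
#
#   pipe = self.get_solution_by_task_name(
#       'CONDITIONER',
#       ML_prim='d3m.primitives.classification.bagging.SKlearn')
#   self.solutions.append(pipe)
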
def search_solutions(self, request, dataset):
    """
    Populate potential solutions for TA3.
    """
    primitives = self._primitives
    problem = request.problem.problem
    template = request.template
    task_name = problem_pb2.TaskType.Name(problem.task_type)
    logging.info(task_name)

    solutions = []
    if template is not None and isinstance(template, pipeline_pb2.PipelineDescription) \
       and len(template.steps) > 0:
        basic_sol = solutiondescription.SolutionDescription(request.problem, None)
        basic_sol.create_from_pipelinedescription(pipeline_description=template)
        if basic_sol.contains_placeholder() is False:  # Fully defined
            solutions.append(basic_sol)
            return solutions

    taskname = task_name.replace('_', '')
    solutions = solution_templates.get_solutions(taskname, dataset, primitives, request.problem)
    return solutions

def search_solutions(self, request, dataset):
    """
    Populate potential solutions for TA3.
    """
    primitives = self._primitives
    problem = request.problem.problem
    template = request.template
    task_name = self.get_task_name(problem.task_keywords)
    logging.critical(task_name)

    solutions = []
    pipeline_placeholder_present = False
    basic_sol = None

    # TA3 has specified a pipeline
    if template is not None and isinstance(template, pipeline_pb2.PipelineDescription) \
       and len(template.steps) > 0:
        basic_sol = solutiondescription.SolutionDescription(request.problem)
        basic_sol.create_from_pipelinedescription(self._solutions, pipeline_description=template)
        if basic_sol.contains_placeholder() is False:  # Fully defined
            solutions.append(basic_sol)
            return (solutions, 0)
        else:  # Placeholder present
            pipeline_placeholder_present = True
            inputs = [dataset]
            new_dataset = basic_sol.fit(inputs=inputs, solution_dict=self._solutions)
            dataset = new_dataset
            logging.critical("New dataset from specified pipeline: %s", new_dataset)

    taskname = task_name.replace('_', '')
    logging.critical("taskname = %s", taskname)
    metric = request.problem.problem.performance_metrics[0].metric
    posLabel = request.problem.problem.performance_metrics[0].pos_label

    start = timer()
    automl = auto_solutions.auto_solutions(taskname, request.problem)
    solutions = automl.get_solutions(dataset)
    #solutions = solution_templates.get_solutions(taskname, dataset, primitives, metric, posLabel, request.problem)
    end = timer()
    time_used = end - start

    #try:
    #    keywords = None
    #    if request.problem.data_augmentation is not None and len(request.problem.data_augmentation) > 0:
    #        data_augment = request.problem.data_augmentation
    #        logging.critical("keywords = %s", data_augment[0].keywords)
    #        (augmented_solutions, augmented_time_used) = solution_templates.get_augmented_solutions(taskname, dataset, primitives, metric, posLabel, request.problem, data_augment)
    #        (solutions, time_used) = (augmented_solutions + solutions, augmented_time_used + time_used)
    #except:
    #    logging.critical(sys.exc_info()[0])

    logging.critical("Solutions = %s", solutions)

    if pipeline_placeholder_present is True:
        # Wrap each candidate as a subpipeline of the TA3-specified placeholder pipeline.
        new_solution_set = []
        for s in solutions:
            try:
                pipe = copy.deepcopy(basic_sol)
                pipe.id = str(uuid.uuid4())
                pipe.add_subpipeline(s)
                self._solutions[s.id] = s
                new_solution_set.append(pipe)
            except Exception:
                logging.critical(sys.exc_info()[0])

        logging.critical("%s", new_solution_set)
        return (new_solution_set, time_used)

    return (solutions, time_used)

def get_rpi_solutions(self, add_floats, privileged, rows, dataset):
    """
    Get RPI-based pipelines for classification/regression tasks.
    """
    if self.types_present is None:
        return

    if self.task_name != "REGRESSION" and self.task_name != "CLASSIFICATION":
        return

    if rows >= 100000 and self.total_cols > 12:
        return

    if 'AUDIO' in self.types_present or \
       'VIDEO' in self.types_present or \
       'TIMESERIES' in self.types_present or \
       'IMAGE' in self.types_present or \
       'TEXT' in self.types_present:
        return

    basic_sol = solutiondescription.SolutionDescription(self.problem)
    basic_sol.set_privileged(privileged)
    basic_sol.set_add_floats(add_floats)

    outputstep = basic_sol.index_denormalize + 4
    if add_floats is not None and len(add_floats) > 0:
        outputstep = basic_sol.index_denormalize + 5

    total_cols = self.total_cols
    if self.task_name == "REGRESSION":
        listOfSolutions = solution_templates.regressors_rpi
    else:
        listOfSolutions = solution_templates.classifiers_rpi

    if rows >= 25000:
        # No tuning
        basic_sol.initialize_RPI_solution('NOTUNE_PIPELINE_RPI')
        if self.task_name == "REGRESSION":
            listOfSolutions = ['d3m.primitives.regression.random_forest.SKlearn']
        else:
            listOfSolutions = ['d3m.primitives.classification.random_forest.SKlearn']
    else:
        # Grid-search over RPI's bin size and the model's number of estimators.
        # This can be expensive.
        basic_sol.initialize_RPI_solution('PIPELINE_RPI')

    try:
        basic_sol.run_basic_solution(inputs=[dataset],
                                     output_step=outputstep,
                                     dataframe_step=basic_sol.index_denormalize + 1)
        total_cols = basic_sol.get_total_cols()
        logging.info("Total cols = %s", total_cols)
    except Exception:
        logging.error(sys.exc_info()[0])
        basic_sol = None

    if basic_sol is None or total_cols > 200:
        return

    RPI_steps = ['d3m.primitives.feature_selection.joint_mutual_information.AutoRPI',
                 'd3m.primitives.feature_selection.simultaneous_markov_blanket.AutoRPI']

    for python_path in listOfSolutions:
        # Avoid expensive solutions!
        if 'gradient_boosting' in python_path and \
           ((rows > 1000 and total_cols > 50) or rows > 5000 or total_cols > 100):
            continue

        if rows >= 25000:
            pipe = copy.deepcopy(basic_sol)
            pipe.id = str(uuid.uuid4())
            pipe.add_step(python_path, outputstep)
            self.solutions.append(pipe)
        else:
            for step in RPI_steps:
                pipe = copy.deepcopy(basic_sol)
                pipe.id = str(uuid.uuid4())
                pipe.add_RPI_step(step, python_path, outputstep)
                self.solutions.append(pipe)

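# For illustration: below the 25000-row threshold, each candidate model in
# listOfSolutions is paired with both AutoRPI feature-selection steps, so k
# models yield 2*k pipelines; at or above the threshold, only a single untuned
# random-forest pipeline is produced for the task type.
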
def get_solutions(self, dataset):
    """
    Get a list of available solutions (pipelines) for the specified task.
    Used by both TA2 in the "search" phase and TA2-TA3.
    """
    if self.task_name == 'VERTEXNOMINATION' or self.task_name == 'VERTEXCLASSIFICATION':
        self.task_name = 'VERTEXCLASSIFICATION'
        self.get_vertex_classification_pipelines()
    elif self.task_name == 'COMMUNITYDETECTION':
        self.get_community_detection_pipelines()
    elif self.task_name == 'LINKPREDICTION':
        self.get_link_prediction_pipelines()
    elif self.task_name == 'GRAPHMATCHING':
        self.get_graph_matching_pipelines()
    elif self.task_name == 'CLUSTERING':
        self.get_clustering_pipelines()
    elif self.task_name == 'OBJECTDETECTION':
        self.get_object_detection_pipelines()
    elif self.task_name == 'COLLABORATIVEFILTERING':
        self.get_collaborative_filtering_pipelines()
    elif self.task_name == 'FORECASTING':
        self.get_forecasting_pipelines(dataset)
    elif self.task_name == 'LINKPREDICTIONTIMESERIES':
        self.get_link_prediction_timeseries_pipelines()
    else:  # CLASSIFICATION / REGRESSION / SEMISUPERVISED / FORE_REGRESSION
        # Initialize dataset properties, data types etc.
        self.create_basic_solutions(dataset)
        if self.basic_sol is None:  # Data profiling failed; nothing to build on
            return self.solutions

        # Run the common steps of the solutions before forking out processes
        # for the individual classifiers/regressors.
        output_step_index = self.basic_sol.index_denormalize + 3
        if 'AUDIO' in self.types_present:
            output_step_index = self.basic_sol.index_denormalize + 2

        if self.basic_sol.splitter_present() is False or \
           'IMAGE' in self.types_present or \
           'AUDIO' in self.types_present:
            try:
                self.basic_sol.run_basic_solution(inputs=[dataset],
                                                  output_step=output_step_index)
                self.total_cols = self.basic_sol.get_total_cols()
                logging.critical("Total cols = %s", self.total_cols)
            except Exception:
                logging.error(sys.exc_info()[0])
                self.basic_sol = None

        # Run the additional (secondary) solution, if any
        if self.addn_sol is not None and \
           (self.addn_sol.splitter_present() is False or 'IMAGE' in self.types_present):
            try:
                self.addn_sol.run_basic_solution(inputs=[dataset],
                                                 output_step=output_step_index)
            except Exception:
                logging.error(sys.exc_info()[0])
                self.addn_sol = None

        # Add primitives to basic solutions
        if self.task_name == 'CLASSIFICATION' or self.task_name == 'REGRESSION' or \
           self.task_name == 'FORE_REGRESSION':
            for sol in [self.basic_sol, self.addn_sol]:
                self.get_primitive_solutions(sol, self.rows)

            # Try RPI solutions (skipped for multi-column outputs)
            if self.basic_sol is not None and self.basic_sol.splitter is False:
                if self.is_multi_column_output(self.basic_sol, output_step_index) is False:
                    self.get_rpi_solutions(self.basic_sol.add_floats,
                                           self.basic_sol.privileged,
                                           self.rows, dataset)

            conditioner_prim = 'd3m.primitives.classification.bagging.SKlearn'
            if self.task_name != 'CLASSIFICATION':
                conditioner_prim = 'd3m.primitives.regression.bagging.SKlearn'
            pipe = self.get_solution_by_task_name('CONDITIONER', ML_prim=conditioner_prim)
            if self.rows < 1000000:
                self.solutions.append(pipe)

            self.create_imvadio_solution()
        else:
            # Add primitives for SSL solutions
            self.get_SSL_pipelines(self.rows)
            self.get_ssl_rpi_solutions(self.basic_sol.add_floats,
                                       self.basic_sol.privileged,
                                       self.rows, dataset)

        # Extra pipeline for time-series classification
        if self.task_name == 'CLASSIFICATION' and 'TIMESERIES' in self.types_present:
            pipe = solutiondescription.SolutionDescription(self.problem)
            pipe.initialize_solution('TIMESERIES2')
            pipe.id = str(uuid.uuid4())
            pipe.run_basic_solution(inputs=[dataset], input_step=1,
                                    output_step=3, dataframe_step=2)
            pipe.add_step('d3m.primitives.time_series_classification.k_neighbors.Kanine',
                          inputstep=1, outputstep=3, dataframestep=2)
            self.solutions.append(pipe)

    return self.solutions

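# A minimal usage sketch, following the call pattern in search_solutions()
# above (the problem and dataset objects are assumed to come from the TA2
# request-handling code):
#
#   automl = auto_solutions.auto_solutions('CLASSIFICATION', problem)
#   pipelines = automl.get_solutions(dataset)
#   logging.info("Generated %d candidate pipelines", len(pipelines))
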
def create_basic_solutions(self, dataset):
    """
    For tasks with multiple pipelines (classification/regression/SSL), the primary
    data processing/featurization steps are run only once and cached for all the
    pipelines.
    """
    if self.task_name != 'CLASSIFICATION' and self.task_name != 'REGRESSION' and \
       self.task_name != 'SEMISUPERVISED' and self.task_name != 'FORE_REGRESSION':
        return

    basic_sol = None
    try:
        # Set data types and metadata information to begin with
        basic_sol = solutiondescription.SolutionDescription(self.problem)
        basic_sol.initialize_solution(self.task_name)
        (self.types_present, self.total_cols, self.rows, categorical_atts,
         ordinal_atts, ok_to_denormalize, privileged, add_floats, ok_to_augment,
         profiler_needed) = solutiondescription.column_types_present(dataset)
        logging.critical(self.types_present)
        basic_sol.set_add_floats(add_floats)
        basic_sol.set_categorical_atts(categorical_atts)
        basic_sol.set_ordinal_atts(ordinal_atts)
        basic_sol.set_denormalize(ok_to_denormalize)
        basic_sol.set_privileged(privileged)
        basic_sol.profiler_needed = profiler_needed
        basic_sol.initialize_solution(self.task_name)
    except Exception:
        logging.error(sys.exc_info()[0])
        basic_sol = None
        self.types_present = None
        self.basic_sol = None
        return

    # If each data point is a file, treat it as a time series for featurization
    if len(self.types_present) == 1 and self.types_present[0] == 'FILES':
        self.types_present[0] = 'TIMESERIES'

    self.basic_sol = basic_sol

    # Initialize basic solutions based on the data types present
    if 'TIMESERIES' in self.types_present:
        self.basic_sol.initialize_solution('TIMESERIES')
    elif 'IMAGE' in self.types_present:
        if self.rows > 1200:
            self.basic_sol.add_splitter()
            self.rows = 1200
        self.basic_sol.initialize_solution('IMAGE')
        self.addn_sol = copy.deepcopy(basic_sol)
        self.addn_sol.initialize_solution('IMAGE2')
    elif 'TEXT' in self.types_present:
        if self.task_name == "FORE_REGRESSION":
            self.basic_sol.initialize_solution('DISTILTEXT')
            self.basic_sol.set_distil_text_hyperparam()
        else:
            self.basic_sol.initialize_solution('TEXT')
            self.addn_sol = copy.deepcopy(basic_sol)
            self.addn_sol.initialize_solution('DISTILTEXT')
            if self.task_name == 'REGRESSION':
                self.addn_sol.set_distil_text_hyperparam()
    elif 'AUDIO' in self.types_present:
        if self.rows > 1200:
            self.basic_sol.add_splitter()
            self.rows = 1200
        self.basic_sol.initialize_solution('AUDIO')
        logging.debug("%s", self.basic_sol.primitives_arguments)
    elif 'VIDEO' in self.types_present:
        self.basic_sol.initialize_solution('VIDEO')

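# For illustration: solutiondescription.column_types_present() returns the
# 10-tuple unpacked above. A hypothetical text dataset might yield
# types_present == ['TEXT'], with the remaining flags steering denormalization,
# privileged attributes, float conversion, augmentation, and profiling.
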
def get_solutions(task_name, dataset, primitives, problem):
    """
    Get a list of available solutions (pipelines) for the specified task.
    Used by both TA2 in the "search" phase and TA2-TA3.
    """
    solutions = []

    static_dir = os.environ.get('D3MSTATICDIR')

    basic_sol = solutiondescription.SolutionDescription(problem, static_dir)
    basic_sol.initialize_solution(task_name)

    if task_name == 'CLASSIFICATION' or task_name == 'REGRESSION':
        (types_present, total_cols, rows) = solutiondescription.column_types_present(dataset)

        if 'TIMESERIES' in types_present:
            basic_sol = solutiondescription.SolutionDescription(problem, static_dir)
            basic_sol.initialize_solution('TIMESERIES')
        elif 'IMAGE' in types_present:
            basic_sol = solutiondescription.SolutionDescription(problem, static_dir)
            basic_sol.initialize_solution('IMAGE')
        elif 'TEXT' in types_present:
            basic_sol = solutiondescription.SolutionDescription(problem, static_dir)
            basic_sol.initialize_solution('TEXT')
        elif 'AUDIO' in types_present:
            basic_sol = solutiondescription.SolutionDescription(problem, static_dir)
            basic_sol.initialize_solution('AUDIO')

        try:
            basic_sol.run_basic_solution(inputs=[dataset])
        except Exception:
            basic_sol = None

        logging.info("Total cols = %s", total_cols)

        # Iterate through primitives matching the task type to populate the pool of solutions
        for classname, p in primitives.items():
            if p.primitive_class.family == task_name and basic_sol is not None:
                python_path = p.primitive_class.python_path
                if 'd3m.primitives.sri.' in python_path or 'JHU' in python_path or \
                   'lupi_svm' in python_path or 'bbn' in python_path:
                    continue

                if 'Find_projections' in python_path and (total_cols > 20 or rows > 10000):
                    continue

                pipe = copy.deepcopy(basic_sol)
                pipe.id = str(uuid.uuid4())
                pipe.add_step(p.primitive_class.python_path)
                solutions.append(pipe)
    elif task_name == 'COLLABORATIVEFILTERING' or \
         task_name == 'VERTEXNOMINATION' or \
         task_name == 'COMMUNITYDETECTION' or \
         task_name == 'GRAPHMATCHING' or \
         task_name == 'LINKPREDICTION' or \
         task_name == 'CLUSTERING' or \
         task_name == 'TIMESERIESFORECASTING':
        if task_name == 'TIMESERIESFORECASTING':
            basic_sol.add_step('d3m.primitives.sri.psl.RelationalTimeseries')
        pipe = copy.deepcopy(basic_sol)
        pipe.id = str(uuid.uuid4())
        pipe.add_outputs()
        solutions.append(pipe)
    else:
        logging.info("No matching solutions")

    if task_name == 'GRAPHMATCHING':
        # Add an alternative solution for the graph matching problem
        basic_sol = solutiondescription.SolutionDescription(problem, static_dir)
        basic_sol.initialize_solution('GRAPHMATCHING2')
        pipe = copy.deepcopy(basic_sol)
        pipe.id = str(uuid.uuid4())
        pipe.add_outputs()
        solutions.append(pipe)

    # Always add a fallback pipeline
    basic_sol = solutiondescription.SolutionDescription(problem, static_dir)
    basic_sol.initialize_solution('FALLBACK1')
    pipe = copy.deepcopy(basic_sol)
    pipe.id = str(uuid.uuid4())
    pipe.add_outputs()
    solutions.append(pipe)

    return solutions

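# A minimal usage sketch (the primitives index and problem object are assumed
# to be supplied by the TA2 servicer, as in search_solutions() above):
#
#   solutions = get_solutions('CLASSIFICATION', dataset, primitives, problem)
#   # The FALLBACK1 pipeline is always appended, so len(solutions) >= 1.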