예제 #1
0
    def get_ssl_rpi_solutions(self, add_floats, privileged, rows, dataset):
        """
        Get RPI-based pipelines for SSL (semi-supervised) tasks.

        add_floats: Columns to be cast to floats; when present, the target
                    (output) step index shifts by one extra step.
        privileged: Privileged columns forwarded to the solution.
        rows: Number of dataset rows; gates the expensive pre-run.
        dataset: Input dataset used to pre-run the shared pipeline prefix.

        Appends candidate pipelines to self.solutions; returns None.
        """
        if self.types_present is None:
            return

        # RPI pipelines only apply to plain tabular data.
        if any(t in self.types_present
               for t in ('AUDIO', 'VIDEO', 'TIMESERIES', 'IMAGE', 'TEXT')):
            return

        basic_sol = solutiondescription.SolutionDescription(self.problem)
        basic_sol.set_privileged(privileged)
        basic_sol.set_add_floats(add_floats)
        basic_sol.initialize_RPI_solution('NOTUNE_PIPELINE_RPI')
        # The float-cast step (when present) pushes the targets one step later.
        outputstep = basic_sol.index_denormalize + 4
        if add_floats:
            outputstep = basic_sol.index_denormalize + 5

        sslVariants = solution_templates.sslVariants
        total_cols = self.total_cols
        if rows <= 100000:
            try:
                # Pre-run the shared prefix once; each variant deep-copies it.
                basic_sol.run_basic_solution(
                    inputs=[dataset],
                    output_step=outputstep,
                    dataframe_step=basic_sol.index_denormalize + 1)
                total_cols = basic_sol.get_total_cols()
                logging.info("Total cols = %s", total_cols)
            except Exception:  # narrowed from bare except: don't trap SystemExit
                logging.error(sys.exc_info()[0])
                basic_sol = None
        else:
            # Too many rows to pre-run: restrict to cheap model variants.
            sslVariants = [
                'd3m.primitives.classification.random_forest.SKlearn',
                'd3m.primitives.classification.bagging.SKlearn'
            ]

        if basic_sol is None or total_cols > 200:
            return

        for python_path in sslVariants:
            valid = is_primitive_reasonable(python_path, rows, total_cols,
                                            self.types_present, self.task_name)
            if valid is False:
                continue

            pipe = copy.deepcopy(basic_sol)
            pipe.id = str(uuid.uuid4())
            pipe.add_step(
                'd3m.primitives.semisupervised_classification.iterative_labeling.AutonBox',
                outputstep=pipe.index_denormalize + 4)
            pipe.add_ssl_variant(python_path)
            self.solutions.append(pipe)
예제 #2
0
 def get_solution_by_task_name(self,
                               name,
                               ML_prim=None,
                               outputstep=3,
                               dataframestep=1):
     """
     Build one complete pipeline from a named template.

     Used for less frequent tasks (not Classification or regression).
     name: Name of pipeline template
     ML_prim: ML primitive to be appended (classifier/regressor mostly)
     outputstep: Step in pipeline producing targets
     dataframestep: Dataframe step in pipeline. Used for constructing predictions.
     """
     solution = solutiondescription.SolutionDescription(self.problem)
     solution.initialize_solution(name)
     solution.id = str(uuid.uuid4())
     if ML_prim is None:
         # Nothing to append: just wire the template's outputs.
         solution.add_outputs()
     else:
         offset = solution.index_denormalize
         solution.add_step(ML_prim,
                           outputstep=offset + outputstep,
                           dataframestep=offset + dataframestep)
     return solution
예제 #3
0
    def search_solutions(self, request, dataset):
        """
        Populate potential solutions for TA3.

        request: TA3 SearchSolutions request; may carry a pipeline template.
        dataset: Input dataset to build pipelines for.
        Returns a list of SolutionDescription objects.
        """
        primitives = self._primitives
        problem = request.problem.problem
        template = request.template
        task_name = problem_pb2.TaskType.Name(problem.task_type)
        logging.info(task_name)

        solutions = []

        # TA3 may pre-specify a (partial or complete) pipeline template.
        if template is not None and isinstance(template, pipeline_pb2.PipelineDescription) and len(template.steps) > 0:
            basic_sol = solutiondescription.SolutionDescription(request.problem, None)
            basic_sol.create_from_pipelinedescription(pipeline_description=template)
            if not basic_sol.contains_placeholder():  # Fully defined
                solutions.append(basic_sol)
                return solutions

        # Task names arrive like 'LINK_PREDICTION'; templates use 'LINKPREDICTION'.
        taskname = task_name.replace('_', '')
        solutions = solution_templates.get_solutions(taskname, dataset, primitives, request.problem)

        return solutions
예제 #4
0
    def search_solutions(self, request, dataset):
        """
        Populate potential solutions for TA3.

        request: TA3 SearchSolutions request; may carry a pipeline template
                 (fully defined, or with a placeholder to be filled in).
        dataset: Input dataset to build pipelines for.
        Returns (solutions, time_used) where time_used is the seconds spent
        generating automatic solutions.
        """
        primitives = self._primitives
        problem = request.problem.problem
        template = request.template
        task_name = self.get_task_name(problem.task_keywords)
        logging.critical(task_name)

        solutions = []
        pipeline_placeholder_present = False
        basic_sol = None

        # TA3 has specified a pipeline
        if template is not None and isinstance(
                template,
                pipeline_pb2.PipelineDescription) and len(template.steps) > 0:
            basic_sol = solutiondescription.SolutionDescription(
                request.problem)
            basic_sol.create_from_pipelinedescription(
                self._solutions, pipeline_description=template)
            if not basic_sol.contains_placeholder():  # Fully defined
                solutions.append(basic_sol)
                return (solutions, 0)
            else:  # Placeholder present
                # Run the specified prefix and search over its output dataset.
                pipeline_placeholder_present = True
                inputs = [dataset]
                new_dataset = basic_sol.fit(inputs=inputs,
                                            solution_dict=self._solutions)
                dataset = new_dataset
                logging.critical("New datset from specified pipeline: %s",
                                 new_dataset)

        # Task names arrive like 'LINK_PREDICTION'; templates use 'LINKPREDICTION'.
        taskname = task_name.replace('_', '')
        logging.critical("taskname = %s", taskname)
        metric = request.problem.problem.performance_metrics[0].metric
        posLabel = request.problem.problem.performance_metrics[0].pos_label
        start = timer()
        automl = auto_solutions.auto_solutions(taskname, request.problem)
        solutions = automl.get_solutions(dataset)
        #solutions = solution_templates.get_solutions(taskname, dataset, primitives, metric, posLabel, request.problem)
        end = timer()
        time_used = end - start
        #try:
        #    keywords = None
        #    if request.problem.data_augmentation is not None and len(request.problem.data_augmentation) > 0:
        #        data_augment = request.problem.data_augmentation
        #        logging.critical("keywords = %s", data_augment[0].keywords)
        #        (augmented_solutions, augmented_time_used) = solution_templates.get_augmented_solutions(taskname, dataset, primitives, metric, posLabel, request.problem, data_augment)
        #        (solutions, time_used) = (augmented_solutions + solutions, augmented_time_used + time_used)
        #except:
        #    logging.critical(sys.exc_info()[0])

        logging.critical("Solutions = %s", solutions)
        if pipeline_placeholder_present:
            # Graft each auto-generated solution into the TA3 template's
            # placeholder as a sub-pipeline.
            new_solution_set = []
            for s in solutions:
                try:
                    pipe = copy.deepcopy(basic_sol)
                    pipe.id = str(uuid.uuid4())
                    pipe.add_subpipeline(s)
                    self._solutions[s.id] = s
                    new_solution_set.append(pipe)
                except Exception:  # narrowed from bare except
                    logging.critical(sys.exc_info()[0])
            logging.critical("%s", new_solution_set)
            return (new_solution_set, time_used)

        return (solutions, time_used)
예제 #5
0
    def get_rpi_solutions(self, add_floats, privileged, rows, dataset):
        """
        Get RPI-based pipelines for classification/regression tasks.

        add_floats: Columns to be cast to floats; when present, the target
                    (output) step index shifts by one extra step.
        privileged: Privileged columns forwarded to the solution.
        rows: Number of dataset rows; selects tuned vs. untuned templates.
        dataset: Input dataset used to pre-run the shared pipeline prefix.

        Appends candidate pipelines to self.solutions; returns None.
        """
        if self.types_present is None:
            return

        if self.task_name != "REGRESSION" and self.task_name != "CLASSIFICATION":
            return

        # Too big for RPI feature selection to be worthwhile.
        if rows >= 100000 and self.total_cols > 12:
            return

        # RPI pipelines only apply to plain tabular data.
        if any(t in self.types_present
               for t in ('AUDIO', 'VIDEO', 'TIMESERIES', 'IMAGE', 'TEXT')):
            return

        basic_sol = solutiondescription.SolutionDescription(self.problem)
        basic_sol.set_privileged(privileged)
        basic_sol.set_add_floats(add_floats)
        # The float-cast step (when present) pushes the targets one step later.
        outputstep = basic_sol.index_denormalize + 4
        if add_floats:
            outputstep = basic_sol.index_denormalize + 5

        total_cols = self.total_cols
        if self.task_name == "REGRESSION":
            listOfSolutions = solution_templates.regressors_rpi
        else:
            listOfSolutions = solution_templates.classifiers_rpi

        if rows >= 25000:
            # No tuning
            basic_sol.initialize_RPI_solution('NOTUNE_PIPELINE_RPI')
            if self.task_name == "REGRESSION":
                listOfSolutions = [
                    'd3m.primitives.regression.random_forest.SKlearn'
                ]
            else:
                listOfSolutions = [
                    'd3m.primitives.classification.random_forest.SKlearn'
                ]
        else:
            # Grid-search over RPI's binsize and model's no. of estimators. This can be expensive
            basic_sol.initialize_RPI_solution('PIPELINE_RPI')
            try:
                basic_sol.run_basic_solution(
                    inputs=[dataset],
                    output_step=outputstep,
                    dataframe_step=basic_sol.index_denormalize + 1)
                total_cols = basic_sol.get_total_cols()
                logging.info("Total cols = %s", total_cols)
            except Exception:  # narrowed from bare except: don't trap SystemExit
                logging.error(sys.exc_info()[0])
                basic_sol = None

        if basic_sol is None or total_cols > 200:
            return

        RPI_steps = [
            'd3m.primitives.feature_selection.joint_mutual_information.AutoRPI',
            'd3m.primitives.feature_selection.simultaneous_markov_blanket.AutoRPI'
        ]

        for python_path in listOfSolutions:
            # Avoid expensive solutions!!!
            if 'gradient_boosting' in python_path and (
                (rows > 1000 and total_cols > 50) or
                (rows > 5000) or total_cols > 100):
                continue

            if rows >= 25000:
                # Untuned template: append the model directly.
                pipe = copy.deepcopy(basic_sol)
                pipe.id = str(uuid.uuid4())
                pipe.add_step(python_path, outputstep)
                self.solutions.append(pipe)
            else:
                # Tuned template: one pipeline per RPI feature-selection step.
                for step in RPI_steps:
                    pipe = copy.deepcopy(basic_sol)
                    pipe.id = str(uuid.uuid4())
                    pipe.add_RPI_step(step, python_path, outputstep)
                    self.solutions.append(pipe)
예제 #6
0
    def get_solutions(self, dataset):
        """
        Get a list of available solutions(pipelines) for the specified task.
        Used by both TA2 in "search" phase and TA2-TA3.

        dataset: Input dataset.
        Returns self.solutions (list of SolutionDescription objects).
        """
        if self.task_name == 'VERTEXNOMINATION' or self.task_name == 'VERTEXCLASSIFICATION':
            self.task_name = 'VERTEXCLASSIFICATION'
            self.get_vertex_classification_pipelines()
        elif self.task_name == 'COMMUNITYDETECTION':
            self.get_community_detection_pipelines()
        elif self.task_name == 'LINKPREDICTION':
            self.get_link_prediction_pipelines()
        elif self.task_name == 'GRAPHMATCHING':
            self.get_graph_matching_pipelines()
        elif self.task_name == 'CLUSTERING':
            self.get_clustering_pipelines()
        elif self.task_name == 'OBJECTDETECTION':
            self.get_object_detection_pipelines()
        elif self.task_name == 'COLLABORATIVEFILTERING':
            self.get_collaborative_filtering_pipelines()
        elif self.task_name == 'FORECASTING':
            self.get_forecasting_pipelines(dataset)
        elif self.task_name == 'LINKPREDICTIONTIMESERIES':
            self.get_link_prediction_timeseries_pipelines()
        else:  # CLASSIFICATION / REGRESSION / SEMISUPERVISED / FORE_REGRESSION
            # Initialize dataset properties, data types etc.
            self.create_basic_solutions(dataset)

            # Bug fix: create_basic_solutions may leave basic_sol/types_present
            # as None on failure; the code below dereferenced them unguarded.
            if self.basic_sol is None or self.types_present is None:
                return self.solutions

            # Run common steps of solutions before forking out processes for classifiers/regressors
            output_step_index = self.basic_sol.index_denormalize + 3
            if 'AUDIO' in self.types_present:
                output_step_index = self.basic_sol.index_denormalize + 2
            if not self.basic_sol.splitter_present() or \
                'IMAGE' in self.types_present or \
                'AUDIO' in self.types_present:
                try:
                    self.basic_sol.run_basic_solution(
                        inputs=[dataset], output_step=output_step_index)
                    self.total_cols = self.basic_sol.get_total_cols()
                    logging.critical("Total cols = %s", self.total_cols)
                except Exception:  # narrowed from bare except
                    logging.error(sys.exc_info()[0])
                    self.basic_sol = None

            # Run additional (secondary) solution, if any
            if self.addn_sol is not None and (
                    not self.addn_sol.splitter_present()
                    or 'IMAGE' in self.types_present):
                try:
                    self.addn_sol.run_basic_solution(
                        inputs=[dataset], output_step=output_step_index)
                except Exception:  # narrowed from bare except
                    logging.error(sys.exc_info()[0])
                    self.addn_sol = None

            # Add primitives to basic solutions
            if self.task_name == 'CLASSIFICATION' or self.task_name == 'REGRESSION' or self.task_name == 'FORE_REGRESSION':
                for sol in [self.basic_sol, self.addn_sol]:
                    self.get_primitive_solutions(sol, self.rows)
                # Try RPI solutions
                if self.basic_sol is not None and self.basic_sol.splitter == False:
                    if self.is_multi_column_output(
                            self.basic_sol, output_step_index
                    ) is False:  # multi-column output?
                        self.get_rpi_solutions(self.basic_sol.add_floats,
                                               self.basic_sol.privileged,
                                               self.rows, dataset)
                conditioner_prim = 'd3m.primitives.classification.bagging.SKlearn'
                if self.task_name != 'CLASSIFICATION':
                    conditioner_prim = 'd3m.primitives.regression.bagging.SKlearn'
                pipe = self.get_solution_by_task_name('CONDITIONER',
                                                      ML_prim=conditioner_prim)
                if self.rows < 1000000:
                    self.solutions.append(pipe)
                self.create_imvadio_solution()
            else:
                # Add primitives for SSL solutions
                self.get_SSL_pipelines(self.rows)
                # Bug fix: basic_sol may have been reset to None above.
                if self.basic_sol is not None:
                    self.get_ssl_rpi_solutions(self.basic_sol.add_floats,
                                               self.basic_sol.privileged,
                                               self.rows, dataset)

            # Extra pipeline for TS classification
            if self.task_name == 'CLASSIFICATION' and 'TIMESERIES' in self.types_present:
                pipe = solutiondescription.SolutionDescription(self.problem)
                pipe.initialize_solution('TIMESERIES2')
                pipe.id = str(uuid.uuid4())
                pipe.run_basic_solution(inputs=[dataset],
                                        input_step=1,
                                        output_step=3,
                                        dataframe_step=2)
                pipe.add_step(
                    'd3m.primitives.time_series_classification.k_neighbors.Kanine',
                    inputstep=1,
                    outputstep=3,
                    dataframestep=2)
                self.solutions.append(pipe)

        return self.solutions
예제 #7
0
    def create_basic_solutions(self, dataset):
        """
        In case of tasks with multiple pipelines (classification/regression/SSL), primary data processing/featurization steps are
        run only once and cached for all the pipelines.

        Sets self.basic_sol (and possibly self.addn_sol), self.types_present,
        self.total_cols and self.rows. On failure, leaves self.basic_sol and
        self.types_present as None.
        """
        if self.task_name != 'CLASSIFICATION' and self.task_name != 'REGRESSION' and self.task_name != 'SEMISUPERVISED' and self.task_name != 'FORE_REGRESSION':
            return

        basic_sol = None
        try:
            # Set data types, and meta data information to begin with
            basic_sol = solutiondescription.SolutionDescription(self.problem)
            basic_sol.initialize_solution(self.task_name)
            (self.types_present, self.total_cols, self.rows, categorical_atts,
             ordinal_atts, ok_to_denormalize, privileged, add_floats,
             ok_to_augment, profiler_needed
             ) = solutiondescription.column_types_present(dataset)
            logging.critical(self.types_present)
            basic_sol.set_add_floats(add_floats)
            basic_sol.set_categorical_atts(categorical_atts)
            basic_sol.set_ordinal_atts(ordinal_atts)
            basic_sol.set_denormalize(ok_to_denormalize)
            basic_sol.set_privileged(privileged)
            basic_sol.profiler_needed = profiler_needed
            # Re-initialize now that metadata flags (denormalize etc.) are set.
            basic_sol.initialize_solution(self.task_name)
        except Exception:  # narrowed from bare except: don't trap SystemExit
            logging.error(sys.exc_info()[0])
            basic_sol = None
            self.types_present = None
            self.basic_sol = None
            return

        # For file in each data point, we treat as time series for featurization
        if len(self.types_present) == 1 and self.types_present[0] == 'FILES':
            self.types_present[0] = 'TIMESERIES'

        self.basic_sol = basic_sol
        # Initialize basic solutions based on data types
        if 'TIMESERIES' in self.types_present:
            self.basic_sol.initialize_solution('TIMESERIES')
        elif 'IMAGE' in self.types_present:
            # Cap expensive image featurization by sampling down to 1200 rows.
            if self.rows > 1200:
                self.basic_sol.add_splitter()
                self.rows = 1200
            self.basic_sol.initialize_solution('IMAGE')
            self.addn_sol = copy.deepcopy(basic_sol)
            self.addn_sol.initialize_solution('IMAGE2')
        elif 'TEXT' in self.types_present:
            if self.task_name == "FORE_REGRESSION":
                self.basic_sol.initialize_solution('DISTILTEXT')
                self.basic_sol.set_distil_text_hyperparam()
            else:
                self.basic_sol.initialize_solution('TEXT')
                self.addn_sol = copy.deepcopy(basic_sol)
                self.addn_sol.initialize_solution('DISTILTEXT')
                if self.task_name == 'REGRESSION':
                    self.addn_sol.set_distil_text_hyperparam()
        elif 'AUDIO' in self.types_present:
            # Cap expensive audio featurization by sampling down to 1200 rows.
            if self.rows > 1200:
                self.basic_sol.add_splitter()
                self.rows = 1200
            self.basic_sol.initialize_solution('AUDIO')
            # was a stray print(); use logging like the rest of the module
            logging.info(self.basic_sol.primitives_arguments)
        elif 'VIDEO' in self.types_present:
            self.basic_sol.initialize_solution('VIDEO')
예제 #8
0
def get_solutions(task_name, dataset, primitives, problem):
    """
    Get a list of available solutions(pipelines) for the specified task.
    Used by both TA2 in "search" phase and TA2-TA3.

    task_name: Task type string (e.g. 'CLASSIFICATION', 'GRAPHMATCHING').
    dataset: Input dataset.
    primitives: Mapping of classname -> primitive description, filtered by family.
    problem: Problem description forwarded to each SolutionDescription.
    Returns a list of SolutionDescription objects (always includes FALLBACK1).
    """
    solutions = []

    # was try/except KeyError around os.environ[...]
    static_dir = os.environ.get('D3MSTATICDIR')

    basic_sol = solutiondescription.SolutionDescription(problem, static_dir)
    basic_sol.initialize_solution(task_name)

    if task_name == 'CLASSIFICATION' or task_name == 'REGRESSION':
        (types_present, total_cols,
         rows) = solutiondescription.column_types_present(dataset)

        # Re-initialize with a data-type-specific template when applicable.
        for dtype in ('TIMESERIES', 'IMAGE', 'TEXT', 'AUDIO'):
            if dtype in types_present:
                basic_sol = solutiondescription.SolutionDescription(
                    problem, static_dir)
                basic_sol.initialize_solution(dtype)
                break

        try:
            basic_sol.run_basic_solution(inputs=[dataset])
        except Exception:  # narrowed from bare except; was silently discarded
            logging.error(sys.exc_info()[0])
            basic_sol = None

        logging.info("Total cols = %s", total_cols)  # was a stray print()

        # Iterate through primitives which match task type for populative pool of solutions
        for classname, p in primitives.items():
            if p.primitive_class.family == task_name and basic_sol is not None:
                python_path = p.primitive_class.python_path
                # Skip known-problematic primitive families.
                if 'd3m.primitives.sri.' in python_path or 'JHU' in python_path or 'lupi_svm' in python_path or 'bbn' in python_path:
                    continue

                # Find_projections is too expensive on wide/long data.
                if 'Find_projections' in python_path and (total_cols > 20
                                                          or rows > 10000):
                    continue

                pipe = copy.deepcopy(basic_sol)
                pipe.id = str(uuid.uuid4())
                pipe.add_step(p.primitive_class.python_path)
                solutions.append(pipe)
    elif task_name in ('COLLABORATIVEFILTERING',
                       'VERTEXNOMINATION',
                       'COMMUNITYDETECTION',
                       'GRAPHMATCHING',
                       'LINKPREDICTION',
                       'CLUSTERING',
                       'TIMESERIESFORECASTING'):
        if task_name == 'TIMESERIESFORECASTING':
            basic_sol.add_step('d3m.primitives.sri.psl.RelationalTimeseries')
        pipe = copy.deepcopy(basic_sol)
        pipe.id = str(uuid.uuid4())
        pipe.add_outputs()
        solutions.append(pipe)
    else:
        logging.info("No matching solutions")

    if task_name == 'GRAPHMATCHING':  # Add alternative solution for graph matching problem.
        basic_sol = solutiondescription.SolutionDescription(
            problem, static_dir)
        basic_sol.initialize_solution('GRAPHMATCHING2')
        pipe = copy.deepcopy(basic_sol)
        pipe.id = str(uuid.uuid4())
        pipe.add_outputs()
        solutions.append(pipe)

    # Always include a simple fallback pipeline.
    basic_sol = solutiondescription.SolutionDescription(problem, static_dir)
    basic_sol.initialize_solution('FALLBACK1')
    pipe = copy.deepcopy(basic_sol)
    pipe.id = str(uuid.uuid4())
    pipe.add_outputs()
    solutions.append(pipe)

    return solutions