예제 #1
0
    def f_materialize(self, f_node, old_record=None):
        """
        Produce the Feature and the FTransform for an f_node whose components are assumed to be ready.

        The function doesn't search db. Any searched result can be passed to the old_record parameter.
        Three cases are handled:
            - old_record with a non-empty "filepaths": both objects are loaded from storage
            - old_record with a missing or empty "filepaths": delegated to self.recover_with_existing_doc
            - no old_record: delegated to self.create_and_record

        :param f_node: FNode
        :param old_record: a document that matches the node, or None. It is read through the keys
                           "filepaths", "_id" and "essentials" -> "f_transform".
        :return: tuple (feature, f_transform)
        :raises TypeError: if f_node is not an FNode
        """
        if not isinstance(f_node, FNode):
            raise TypeError("The parameter f_node should be of the type FNode.")

        if not old_record:
            feature, f_transform = self.create_and_record(f_node)
            return feature, f_transform

        # A matching document is not guaranteed to carry a "filepaths" key; a missing key is
        # handled like an empty value instead of raising KeyError.
        filepaths = old_record.get("filepaths")
        if not filepaths:
            feature, f_transform = self.recover_with_existing_doc(f_node, old_record)
            return feature, f_transform

        feature_id = old_record["_id"]
        f_transform_id = old_record["essentials"]["f_transform"]

        ih = IOHandler()
        feature = ih.load_obj_from_file(feature_id, "Feature", filepaths)
        f_transform = ih.load_obj_from_file(f_transform_id, "FTransform", filepaths)

        return feature, f_transform
예제 #2
0
    def f_knit(self, f_node):
        """
        Resolve f_node into its Feature and FTransform, knitting the nodes it is fed by first.

        Every node in f_node.lst_fed goes through self.f_subknit and f_node.l_node (when set) goes
        through self.l_subknit. After that:
            - if f_node.filepaths is set, the Feature is loaded by f_node.obj_id and the FTransform by
              the id stored in the Feature's essentials
            - otherwise a document is collected through self.fc and the pair is materialized from it;
              f_node.filepaths is updated from the document when it has one, and f_node.obj_id is
              filled in only when it is still None

        :param f_node: the node to resolve
        :return: tuple (feature, f_transform)
        """
        if f_node.lst_fed:
            for upstream in f_node.lst_fed:
                self.f_subknit(upstream)
        if f_node.l_node:
            self.l_subknit(f_node.l_node)

        if f_node.filepaths:
            loader = IOHandler()
            feature = loader.load_obj_from_file(f_node.obj_id, "Feature", f_node.filepaths)
            f_transform = loader.load_obj_from_file(
                feature.essentials["f_transform"], "FTransform", f_node.filepaths
            )
            return feature, f_transform

        # TODO: Current f_materialize doesn't work with non-empty ftransform. Fix this and rm the next line
        self.fnode_has_empty_ftransform(f_node)

        record = self.fc.collect_doc(f_node)
        feature, f_transform = self.fc.f_materialize(f_node, record)
        if record and "filepaths" in record:
            # Update if the obj is already saved in the filepaths.
            # Whether or not to save a newly created one should be decided by a higher level function.
            f_node.filepaths = record["filepaths"]
            resolved_id = record["_id"]
        else:
            resolved_id = feature.obj_id

        if f_node.obj_id is None:
            f_node.obj_id = resolved_id

        return feature, f_transform
예제 #3
0
    def l_materialize(self, l_node, old_record=None):
        """
        Produce the Label and the LTransform for an l_node whose components are assumed to be ready.

        The function doesn't search db. Any searched result can be passed to the old_record parameter.
        Three cases are handled:
            - old_record with a non-empty "filepaths": both objects are loaded from storage
            - old_record with a missing or empty "filepaths": delegated to self.recover_with_existing_doc
            - no old_record: delegated to self.create_and_record

        :param l_node: LNode
        :param old_record: a document that matches the node, or None. It is read through the keys
                           "filepaths", "_id" and "essentials" -> "l_transform".
        :return: tuple (label, l_transform)
        :raises TypeError: if l_node is not an LNode
        """
        if not isinstance(l_node, LNode):
            raise TypeError("The parameter l_node should be of the type LNode.")

        if not old_record:
            label, l_transform = self.create_and_record(l_node)
            return label, l_transform

        # A matching document is not guaranteed to carry a "filepaths" key; a missing key is
        # handled like an empty value instead of raising KeyError.
        filepaths = old_record.get("filepaths")
        if not filepaths:
            label, l_transform = self.recover_with_existing_doc(l_node, old_record)
            return label, l_transform

        label_id = old_record["_id"]
        l_transform_id = old_record["essentials"]["l_transform"]

        ih = IOHandler()
        label = ih.load_obj_from_file(label_id, "Label", filepaths)
        l_transform = ih.load_obj_from_file(l_transform_id, "LTransform", filepaths)

        return label, l_transform
예제 #4
0
    def l_knit(self, l_node):
        """
        Resolve l_node into its Label and LTransform, knitting the node it is fed by first.

        l_node.lab_fed (when set) goes through self.l_subknit. After that:
            - if l_node.filepaths is set, the Label is loaded by l_node.obj_id and the LTransform by
              the id held in the Label's l_transform attribute
            - otherwise a document is collected through self.lc and the pair is materialized from it;
              l_node.filepaths is updated from the document when it has one, and l_node.obj_id is
              filled in only when it is still None

        :param l_node: the node to resolve
        :return: tuple (label, l_transform)
        """
        # TODO: need to prevent knitting fitted l_transform somehow
        if l_node.lab_fed:
            self.l_subknit(l_node.lab_fed)

        if l_node.filepaths:
            loader = IOHandler()
            label = loader.load_obj_from_file(l_node.obj_id, "Label", l_node.filepaths)
            l_transform = loader.load_obj_from_file(label.l_transform, "LTransform", l_node.filepaths)
            return label, l_transform

        record = self.lc.collect_doc(l_node)
        label, l_transform = self.lc.l_materialize(l_node, record)
        if record and "filepaths" in record:
            # Update if the obj is already saved in the filepaths.
            # Whether or not to save a newly created one should be decided by a higher level function.
            l_node.filepaths = record["filepaths"]
            resolved_id = record["_id"]
        else:
            resolved_id = label.obj_id

        if l_node.obj_id is None:
            l_node.obj_id = resolved_id

        return label, l_transform
예제 #5
0
    def f_collect_components(f_node):
        """
        Load the stored objects that f_node refers to.

        Everything is read from f_node.pipe_init.filepaths: the Frame identified by
        f_node.pipe_init.frame, the Label identified by f_node.l_node.obj_id and one Feature per
        node in f_node.lst_fed.

        :param f_node: node exposing pipe_init, l_node and lst_fed
        :return: tuple (frame, l_values, fed_values, prevstage) where l_values is the Label's values,
                 fed_values is the single Feature's values or, for any other number of Features,
                 their values concatenated along axis 1, and prevstage is the largest stage among
                 the loaded Features
        """
        loader = IOHandler()
        pipe_init = f_node.pipe_init

        frame = loader.load_obj_from_file(
            obj_id=pipe_init.frame, element="Frame", filepaths=pipe_init.filepaths
        )
        l_values = loader.load_obj_from_file(
            obj_id=f_node.l_node.obj_id, element="Label", filepaths=pipe_init.filepaths
        ).values

        features = [
            loader.load_obj_from_file(
                obj_id=node.obj_id, element="Feature", filepaths=pipe_init.filepaths
            )
            for node in f_node.lst_fed
        ]

        if len(features) == 1:
            fed_values = features[0].values
        else:
            fed_values = np.concatenate([fed.values for fed in features], axis=1)

        prevstage = max(fed.stage for fed in features)

        return frame, l_values, fed_values, prevstage
예제 #6
0
    def lst_fed(self):
        """
        Load the Feature objects whose ids are listed under the "lst_fed" key of the essentials.

        :return: list of the loaded objects, in the order of the stored ids
        """
        storage = self.__pipe_init.filepaths
        feature_ids = self.__essentials["lst_fed"]

        loader = IOHandler()
        return [
            loader.load_obj_from_file(feature_id, "Feature", storage)
            for feature_id in feature_ids
        ]
예제 #7
0
    def save_file(self, filepaths):
        """
        Record filepaths on this object through set_filepaths, then hand the object to
        IOHandler.save_obj2file.

        :param filepaths: list of dict
        :return: None
        """
        self.set_filepaths(filepaths)
        IOHandler().save_obj2file(self)
예제 #8
0
    def __init__(self,
                 data=None,
                 col_y=None,
                 lst_layers=None,
                 shuffle=False,
                 stratified=False,
                 col_selected=None,
                 tag=None,
                 db=None,
                 filepaths=None,
                 pipe_id=None):
        """
        The difference between PipeInit and ml_forest.core.constructions.core_init.CoreInit is that
            - PipeInit has initiating Nodes
            - CoreInit has initiating Features/Labels obj_id

        The wrapped core is either loaded from storage (when an ObjectId pipe_id and filepaths are
        both given) or created anew from the remaining parameters.

        :param data: pandas.DataFrame. This needs to be a pandas data frame with a label column
        :param col_y: The name of the label column
        :param lst_layers: list. This gives the "lst_layers" to the Frame
        :param shuffle: boolean.
        :param stratified: boolean. Should not be used to a regression problem
        :param col_selected: dict. Ex: {'num': ['colname1', 'colname2'], 'cate':['colname3'], ...}
        :param tag: passed through to CoreInit
        :param db: required in practice although it defaults to None; db["project"] is read to
                   register the project in root_database
        :param filepaths: passed to CoreInit, or used to load the stored CoreInit when pipe_id is given
        :param pipe_id: ObjectId of a stored CoreInit to load instead of creating a new one
        :raises TypeError: if db is None, or if pipe_id is given but is not an ObjectId
        """
        # db is subscripted unconditionally below; fail with an explicit message instead of the
        # opaque "'NoneType' object is not subscriptable".
        if db is None:
            raise TypeError("The parameter db is required and must provide the key 'project'.")

        project = db["project"]
        if project not in root_database:
            root_database[project] = {
                element: []
                for element in (
                    'Feature', 'FTransform', 'Label', 'LTransform', 'CoreInit',
                    'Frame', 'PipeTestData', 'TestFeature',
                )
            }

        if pipe_id and isinstance(pipe_id, ObjectId) and filepaths:
            ih = IOHandler()
            self.core = ih.load_obj_from_file(obj_id=pipe_id,
                                              element="CoreInit",
                                              filepaths=filepaths)
        elif pipe_id and not isinstance(pipe_id, ObjectId):
            raise TypeError("The pipe_id you passed is not an ObjectId.")
        else:
            # NOTE(review): an ObjectId pipe_id given without filepaths also lands here and a new
            # CoreInit is created; confirm this is intended.
            self.core = CoreInit(data, col_y, lst_layers, shuffle, stratified,
                                 col_selected, tag, db, filepaths)

        # NOTE(review): the values of the mapping returned by self.init_features are replaced in
        # place with FNode wrappers; confirm that attribute does not expose the core's own dict.
        init_fnodes = self.init_features
        for key in init_fnodes:
            init_fnodes[key] = FNode(self.core, obj_id=init_fnodes[key])
        self._init_fnodes = init_fnodes

        self._init_lnode = LNode(self.core, obj_id=self.label)
예제 #9
0
    def l_collect_components(l_node):
        """
        Load the stored objects that l_node refers to.

        Both objects are read from l_node.pipe_init.filepaths: the Frame identified by
        l_node.pipe_init.frame and the Label identified by l_node.lab_fed.obj_id.

        :param l_node: node exposing pipe_init and lab_fed
        :return: tuple (frame, values of the loaded Label)
        """
        loader = IOHandler()
        pipe_init = l_node.pipe_init

        frame = loader.load_obj_from_file(
            obj_id=pipe_init.frame, element="Frame", filepaths=pipe_init.filepaths
        )
        fed_label = loader.load_obj_from_file(
            obj_id=l_node.lab_fed.obj_id, element="Label", filepaths=pipe_init.filepaths
        )

        return frame, fed_label.values
예제 #10
0
    def fetch(self):
        """
        Load the object this node points to.

        The element type comes from self.decide_element() and the object is read by self.obj_id
        from self.core.filepaths.

        :return: the loaded object
        :raises ValueError: if self.obj_id or self.filepaths is None
        """
        if self.obj_id is None or self.filepaths is None:
            # The two literals were previously joined without a separating space ("whoselocation").
            msg = "The node doesn't have obj_id or filepaths yet. The function is designed to fetch an obj whose " \
                  "location is specified in a node."
            raise ValueError(msg)

        obj_id = self.obj_id
        element = self.decide_element()
        # NOTE(review): the guard above checks self.filepaths while the load uses
        # self.core.filepaths; confirm the two are meant to differ.
        filepaths = self.core.filepaths

        ih = IOHandler()
        obj_fetched = ih.load_obj_from_file(obj_id, element, filepaths)

        return obj_fetched
예제 #11
0
    def create_grid(self, grid_dict):
        """
        Build two empty data frames indexed by the product of the values in grid_dict.

        :param grid_dict: dict; its keys name the index levels and its values are the entries
                          combined through pandas.MultiIndex.from_product
        :return: tuple (r_grid, p_grid). r_grid has the columns "feature_id" and "f_transform_id";
                 p_grid has one column per pair of evaluator name (from self.__evaluators) and
                 entry returned by the Frame's create_structure(self.__layer)
        """
        frame_id = self.__essentials["frame"]
        storage = self.__pipe_init.filepaths
        frame = IOHandler().load_obj_from_file(frame_id, "Frame", storage)

        param_index = pd.MultiIndex.from_product(grid_dict.values(),
                                                 names=grid_dict.keys())
        structure = frame.create_structure(self.__layer)
        evaluator_names = [evaluator.__name__ for evaluator in self.__evaluators]
        score_columns = pd.MultiIndex.from_product([evaluator_names, structure])

        r_grid = pd.DataFrame(index=param_index, columns=["feature_id", "f_transform_id"])
        p_grid = pd.DataFrame(index=param_index, columns=score_columns)

        return r_grid, p_grid
예제 #12
0
    def search_for_scheme(self, db):
        """
        Search for documents matching this object's essentials and load the first match.

        :param db: passed through to DbHandler.search_by_essentials
        :return: the object loaded from the first matching document, or None when the search
                 result is empty
        """
        matches = DbHandler().search_by_essentials(self, db)
        if not bool(matches):
            return None

        first = matches[0]
        stored_id = first["_id"]
        storage = first["filepaths"]
        element = self.decide_element()

        return IOHandler().load_obj_from_file(stored_id, element, storage)
예제 #13
0
    def __go(l_node):
        """
        Compute the label values for l_node together with the l_transform that produced them.

        The Frame and the fed Label are always loaded from l_node.core.filepaths. When
        has_ref(l_transform) holds, the values come from l_transform.transform_with_ref(l_node);
        otherwise they come from LFlow.label_encoding_transform, which also returns the
        l_transform to hand back.

        :param l_node: node exposing core, lab_fed and l_transform
        :return: tuple (l_values, l_transform)
        """
        frame_id = l_node.core.frame
        fed_id = l_node.lab_fed.obj_id
        l_transform = l_node.l_transform
        storage = l_node.core.filepaths

        loader = IOHandler()
        frame = loader.load_obj_from_file(frame_id, "Frame", storage)
        lab_fed = loader.load_obj_from_file(fed_id, "Label", storage)

        # TODO: might need to refactor transform with ref better
        if not has_ref(l_transform):
            l_values, l_transform = LFlow().label_encoding_transform(frame, lab_fed, l_transform)
            return l_values, l_transform

        l_values = l_transform.transform_with_ref(l_node)
        return l_values, l_transform
예제 #14
0
    def __go(f_node, frame_id, filepaths, label_id):
        """
        Compute the feature values for f_node together with the f_transform and the stage.

        The Frame, the fed Features and (when label_id is truthy) the Label are loaded from
        filepaths. When has_ref(f_transform) holds, values and stage come from
        f_transform.transform_with_ref(f_node); otherwise FFlow is used, supervised when
        f_transform.rise == 1 and unsupervised in every other case.

        :param f_node: node exposing lst_fed and f_transform
        :param frame_id: id of the stored Frame
        :param filepaths: storage location passed to IOHandler.load_obj_from_file
        :param label_id: id of the stored Label; a falsy value means no Label is loaded
        :return: tuple (f_values, f_transform, stage)
        """
        fed_ids = [fed_node.obj_id for fed_node in f_node.lst_fed]
        loader = IOHandler()
        frame = loader.load_obj_from_file(frame_id, "Frame", filepaths)
        lst_fed = [loader.load_obj_from_file(fed_id, "Feature", filepaths) for fed_id in fed_ids]
        label = loader.load_obj_from_file(label_id, "Label", filepaths) if label_id else None
        f_transform = f_node.f_transform

        # TODO: might need to refactor transform with ref better
        if has_ref(f_transform):
            f_values, stage = f_transform.transform_with_ref(f_node)
            return f_values, f_transform, stage

        flow = FFlow()
        if f_transform.rise == 1:
            f_values, f_transform, stage = flow.supervised_fit_transform(frame, lst_fed, f_transform, label)
        else:
            f_values, f_transform, stage = flow.unsupervised_fit_transform(lst_fed, f_transform)

        return f_values, f_transform, stage
예제 #15
0
    def return_constant_params(self, key):
        """
        Return the value of essentials[key] shared by every FTransform listed in the result grid.

        At this point, self should be loaded from an old record/storage.
        For the keys that are not in grid_dict, the value is read from each stored FTransform's
        essentials[key].

        :param key: key to look up in each FTransform's essentials
        :return: the first collected value
        :raises ValueError: if the collected values are not all the same
        """
        storage = self.pipe_init.filepaths

        loader = IOHandler()
        collected = [
            loader.load_obj_from_file(ft_id, "FTransform", storage).essentials[key]
            for ft_id in self.result_grid["f_transform_id"]
        ]

        if len(set(collected)) <= 1:
            return collected[0]

        raise ValueError(
            "Something seriously wrong with the design of the Scheme family"
        )
예제 #16
0
    def frame(self):
        """
        Load the Frame whose id is stored under the "frame" key of the essentials.

        :return: the loaded object
        """
        storage = self.__pipe_init.filepaths
        frame_id = self.__essentials["frame"]

        return IOHandler().load_obj_from_file(frame_id, "Frame", storage)
예제 #17
0
    def label(self):
        """
        Load the Label whose id is stored under the "label" key of the essentials.

        :return: the loaded object
        """
        storage = self.__pipe_init.filepaths
        label_id = self.__essentials["label"]

        return IOHandler().load_obj_from_file(label_id, "Label", storage)
예제 #18
0
 def update_scheme(self):
     """
     Hand this object to IOHandler.save_obj2file.

     :return: None
     """
     IOHandler().save_obj2file(self)