예제 #1
0
    def set_filepaths(self, filepaths):
        """
        Store the file paths of this object in its db document and on the instance.

        The paths can only be set once, and only after the object has an obj_id.

        :param filepaths: list of dictionaries, each dictionary specifies where pkl file is saved.
            Currently supports below:
            [
                {'home': home, 'project':project_name},
                {'bucket': aws_bucket, 'project':project_name}
            ]
            A falsy value (e.g. None or []) skips the type checks and is stored as given.
        :return: None
        :raises AttributeError: if obj_id is None, or if self.filepaths is already truthy.
        :raises TypeError: if a truthy filepaths is not a list, or holds a non-dict entry.
        """
        if self.obj_id is None:
            # The two fragments are joined with a space so the message reads as one sentence.
            raise AttributeError(
                "The object doesn't have an obj_id, which means it's not saved in db yet, "
                "so it should not be saved in storage either."
            )

        if self.filepaths:
            raise AttributeError(
                "The set_filepaths method in Base does not allow reseting the file paths."
            )

        if filepaths:
            if not isinstance(filepaths, list):
                raise TypeError(
                    "Currently the collection of the file paths has to be of the list type"
                )
            if not all(isinstance(path, dict) for path in filepaths):
                raise TypeError(
                    "Currently the file paths have to be of the dictionary type"
                )

        # Update the stored document first; the attribute is only set if that succeeds.
        dh = DocsHandler()
        dh.update_doc(self, {"filepaths": filepaths})
        self.__filepaths = filepaths
예제 #2
0
    def save_db(self, db):
        """
        Set the db on this object, then create its document and keep the returned id.

        :param db: dict
        :return: None
        """
        self.set_db(db)
        handler = DocsHandler()
        # init_doc returns the id under which the document was created.
        self.obj_id = handler.init_doc(self)
예제 #3
0
    def get_docs_match_the_lnode(self, lst_l_transform):
        """
        Collect the documents matching a Label built from this node for each candidate transform.

        :param lst_l_transform: iterable of candidate values for the Label's l_transform
        :return: list of documents; those with a truthy "filepaths" come first
        """
        frame_id = self.core.frame
        fed_label_id = self.lab_fed.obj_id
        handler = DocsHandler()

        # One probe Label per candidate; its search results are flattened into one list.
        matches = [
            doc
            for candidate in lst_l_transform
            for doc in handler.search_by_essentials(
                Label(frame=frame_id,
                      l_transform=candidate,
                      raw_y=fed_label_id,
                      values=None),
                self.core.db)
        ]

        # False sorts before True, so documents that have file paths are moved to the front;
        # the sort is stable, which keeps the search order inside each group.
        matches.sort(key=lambda doc: not doc["filepaths"])
        return matches
예제 #4
0
    def get_docs_match_the_fnode(self, lst_f_transform):
        """
        Collect the documents matching a Feature built from this node for each candidate transform.

        :param lst_f_transform: iterable of candidate values for the Feature's f_transform
        :return: list of documents; those with a truthy "filepaths" come first
        """
        frame_id = self.core.frame
        fed_ids = [fed.obj_id for fed in self.lst_fed]
        handler = DocsHandler()

        matches = []
        for candidate in lst_f_transform:
            # The label is optional: without an l_node the probe is built with label=None.
            label_id = self.l_node.obj_id if self.l_node else None
            probe = Feature(frame=frame_id,
                            f_transform=candidate,
                            lst_fed=fed_ids,
                            label=label_id,
                            values=None)
            matches += handler.search_by_essentials(probe, self.core.db)

        # False sorts before True, so documents that have file paths are moved to the front;
        # the sort is stable, which keeps the search order inside each group.
        matches.sort(key=lambda doc: not doc["filepaths"])
        return matches
예제 #5
0
    def collect_doc(self, f_node):
        """
        Find the Feature document for f_node and remember its f_transform in self.matched.

        A node that already has an obj_id is looked up directly by that id. Otherwise the
        first document returned by the node's own search over the candidate transforms is used.

        :param f_node: FNode
        :return: the document found, or None when the candidate search returns nothing
        :raises TypeError: if f_node is not an FNode
        """
        if not isinstance(f_node, FNode):
            raise TypeError("The parameter f_node should of the type FNode.")

        if f_node.obj_id:
            node_id, node_db = f_node.obj_id, f_node.core.db
            doc = DocsHandler().search_by_obj_id(node_id, "Feature", node_db)
        else:
            candidates = self.get_f_transform_candidates(f_node)
            found = f_node.get_docs_match_the_fnode(candidates)
            doc = found[0] if found else None

        # Only a truthy transform id of a found document is recorded as matched.
        transform_id = doc["essentials"]["f_transform"] if doc else None
        if transform_id:
            self.matched.append(transform_id)

        return doc
예제 #6
0
    def collect_doc(self, l_node):
        """
        Find the Label document for l_node and remember its l_transform in self.matched.

        A node that already has an obj_id is looked up directly by that id. Otherwise the
        first document returned by the node's own search over the candidate transforms is used.

        :param l_node: LNode
        :return: the document found, or None when the candidate search returns nothing
        :raises TypeError: if l_node is not an LNode
        """
        if not isinstance(l_node, LNode):
            raise TypeError("The parameter l_node should be of the type LNode.")

        if l_node.obj_id:
            node_id, node_db = l_node.obj_id, l_node.core.db
            doc = DocsHandler().search_by_obj_id(node_id, "Label", node_db)
        else:
            candidates = self.get_l_transform_candidates(l_node)
            found = l_node.get_docs_match_the_lnode(candidates)
            doc = found[0] if found else None

        # Only a truthy transform id of a found document is recorded as matched.
        transform_id = doc["essentials"]["l_transform"] if doc else None
        if transform_id:
            self.matched.append(transform_id)

        return doc
예제 #7
0
    def __init__(self,
                 data,
                 col_y,
                 lst_layers,
                 shuffle=False,
                 stratified=False,
                 col_selected=None,
                 tag=None,
                 db=None,
                 filepaths=None):
        """
        Build the frame, the label and the initial features from a data frame and save each one.

        :param data: pandas.DataFrame. This needs to be a pandas data frame with a label column
        :param col_y: The name of the label column. With a falsy value no label is created.
        :param lst_layers: list. This gives the "lst_layers" to the Frame
        :param shuffle: boolean. If true, a random permutation of data.index is drawn and
            handed to shuffle_pddf_idx before the frame is built.
        :param stratified: boolean. Should not be used to a regression problem
        :param col_selected: dict. Ex: {'num': ['colname1', 'colname2'], 'cate':['colname3'], ...}
            A falsy value puts all columns of data into a single 'raw' feature.
        :param tag: passed to DocsHandler.insert_tag when the instance is exactly a CoreInit
        :param db: forwarded to every save_db_file call made here
        :param filepaths: forwarded to every save_db_file call made here
        :raises TypeError: if data is not a pandas.DataFrame, or a value of col_selected
            is not a list
        :raises KeyError: if col_y is given but is not in data
        :raises NotImplementedError: if col_selected is a non-empty list
        :raises ValueError: if col_selected passes the checks above but is not a dict
        """
        if not isinstance(data, pd.DataFrame):
            raise TypeError(
                "The data for initialization should be of the type pandas.DataFrame"
            )
        if col_y and col_y not in data:
            raise KeyError(
                "The column name of the target: col_y provided is not in the data"
            )
        if col_selected:
            # Reject lists here: the value check below would index the list with its own
            # elements and fail with an unrelated TypeError before the intended
            # NotImplementedError could ever be raised. Failing early also means nothing
            # has been saved yet.
            if isinstance(col_selected, list):
                raise NotImplementedError(
                    "Currently only support dictionary to initialize features")
            for key in col_selected:
                if not isinstance(col_selected[key], list):
                    raise TypeError(
                        "All the values in the dictionary col_selected have to be lists."
                    )

        super(CoreInit, self).__init__()
        self.__essentials = {}

        # Initializing the rows
        if shuffle:
            idx = np.random.choice(data.index, len(data.index), replace=False)
            data = self.shuffle_pddf_idx(data, idx)

        if stratified:
            data, frame = self.get_stratified_starter_and_frame(
                lst_layers, data, col_y)
        else:
            frame = self.get_regular_frame(lst_layers, data)
        frame.save_db_file(db=db, filepaths=filepaths)
        self.__frame = frame.obj_id

        # Initializing labels
        if col_y:
            self._y_name = col_y
            # Double brackets select a one-column frame, so values is a 2-D array.
            values = data[[col_y]].values
            label = Label(frame.obj_id, None, None, values)
            label.save_db_file(db=db, filepaths=filepaths)
            self.__label = label.obj_id
        else:
            self.__label = None

        # Initializing features (columns)
        # to collect dict like {'num': ['colname1', 'colname2'], 'cate':['colname3'], ...}
        self._column_groups = {}
        # {'num': obj_id(data['colname1', 'colname2']),
        #  'cate': obj_id(data['colname3']), ...}
        self._init_features = {}

        if isinstance(col_selected, dict):
            for key, cols in col_selected.items():
                self._column_groups[key] = cols

                values = data[cols].values
                feature = Feature(frame.obj_id,
                                  None,
                                  None,
                                  None,
                                  values=values)
                feature.stage = 0
                feature.save_db_file(db=db, filepaths=filepaths)
                self._init_features[key] = feature.obj_id
            self.col_selected = col_selected
        elif not col_selected:
            # NOTE(review): this uses every column of data, which includes col_y when it is
            # given, and unlike the dict branch it does not set feature.stage - confirm
            # that both are intended.
            cols = data.columns

            values = data[cols].values
            feature = Feature(frame.obj_id, None, None, None, values=values)
            feature.save_db_file(db=db, filepaths=filepaths)
            self._init_features['raw'] = feature.obj_id
            self.col_selected = cols
        else:
            # Non-empty lists were already rejected during validation, so only other
            # truthy non-dict values can end up here.
            raise ValueError(
                "Don't know what to do with the way you specified columns")

        # Exact type check: instances of subclasses are not saved or tagged here.
        if type(self) is CoreInit:
            self.save_db_file(db=db, filepaths=filepaths)
            DocsHandler.insert_tag(self, tag)
            print(self.obj_id)
예제 #8
0
    def get_f_transform_candidates(self, f_node):
        """
        Return the ids of the documents matching f_node.f_transform that are not in self.matched.

        :param f_node: node whose f_transform and core.db are used for the search
        :return: list of "_id" values, in the order the search returned them
        """
        handler = DocsHandler()
        found = handler.search_by_essentials(f_node.f_transform, f_node.core.db)

        candidate_ids = []
        for doc in found:
            doc_id = doc["_id"]
            # Skip transforms that were already collected for another node.
            if doc_id not in self.matched:
                candidate_ids.append(doc_id)

        return candidate_ids