def set_filepaths(self, filepaths):
    """
    :param filepaths: list of dictionaries, each dictionary specifying where the
        pkl file is saved. Currently supports the forms below:
        [
            {'home': home, 'project': project_name},
            {'bucket': aws_bucket, 'project': project_name}
        ]
    :return:
    """
    if self.obj_id is None:
        msg = "The object doesn't have an obj_id, which means it's not saved in the db yet, " \
              "so it should not be saved in storage either."
        raise AttributeError(msg)
    if self.filepaths:
        raise AttributeError(
            "The set_filepaths method in Base does not allow resetting the file paths."
        )
    if filepaths and not isinstance(filepaths, list):
        raise TypeError(
            "Currently the collection of the file paths has to be of the list type"
        )
    elif filepaths:
        for path in filepaths:
            if not isinstance(path, dict):
                raise TypeError(
                    "Currently the file paths have to be of the dictionary type"
                )
    dh = DocsHandler()
    dh.update_doc(self, {"filepaths": filepaths})
    self.__filepaths = filepaths
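# A minimal usage sketch for set_filepaths (assuming `obj` is an instance that has
# already been saved to the db, so obj.obj_id is set; the path and project values
# below are hypothetical placeholders):
#
#     obj.set_filepaths([
#         {'home': '/data/pkl_store', 'project': 'my_project'},
#         {'bucket': 'my-aws-bucket', 'project': 'my_project'},
#     ])
#
# Passing anything other than a list of dictionaries raises a TypeError, and calling
# it again once filepaths are set raises an AttributeError.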
def save_db(self, db):
    """
    Insert the object's document into the database and record the returned id
    on the object.

    :param db: dict
    :return:
    """
    self.set_db(db)
    dh = DocsHandler()
    obj_id = dh.init_doc(self)
    self.obj_id = obj_id
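# A hedged sketch of the expected call order (assuming `obj` is a freshly created
# instance and `db` is whatever dict DocsHandler expects for the target database;
# both names here are placeholders):
#
#     obj.save_db(db)           # registers the doc and sets obj.obj_id
#     obj.set_filepaths([...])  # only valid once obj_id exists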
def get_docs_match_the_lnode(self, lst_l_transform):
    frame = self.core.frame
    lab_fed = self.lab_fed.obj_id
    dh = DocsHandler()
    all_docs = []
    for l_tran in lst_l_transform:
        tmp = Label(frame=frame, l_transform=l_tran, raw_y=lab_fed, values=None)
        all_docs.extend(dh.search_by_essentials(tmp, self.core.db))
    # Order the matches so that docs which already have saved filepaths come first
    all_docs = sorted(all_docs, key=lambda d: not bool(d["filepaths"]))
    return all_docs
def get_docs_match_the_fnode(self, lst_f_transform):
    frame = self.core.frame
    lst_fed = [f.obj_id for f in self.lst_fed]
    dh = DocsHandler()
    all_docs = []
    if self.l_node:
        l_id = self.l_node.obj_id
    else:
        l_id = None
    for f_tran in lst_f_transform:
        tmp = Feature(frame=frame, f_transform=f_tran, lst_fed=lst_fed,
                      label=l_id, values=None)
        all_docs.extend(dh.search_by_essentials(tmp, self.core.db))
    # Order the matches so that docs which already have saved filepaths come first
    all_docs = sorted(all_docs, key=lambda d: not bool(d["filepaths"]))
    return all_docs
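# Illustration of the sort key used above (not part of the pipeline, just a sketch
# with made-up docs): documents that already have saved filepaths sort first, so a
# caller picking all_docs[0] prefers a doc with a persisted pkl file when one exists.
#
#     docs = [{"filepaths": []}, {"filepaths": [{"home": "/data", "project": "p"}]}]
#     sorted(docs, key=lambda d: not bool(d["filepaths"]))
#     # -> [{"filepaths": [{"home": "/data", "project": "p"}]}, {"filepaths": []}]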
def collect_doc(self, f_node):
    if not isinstance(f_node, FNode):
        raise TypeError("The parameter f_node should be of the type FNode.")
    if f_node.obj_id:
        obj_id = f_node.obj_id
        db = f_node.core.db
        dh = DocsHandler()
        doc = dh.search_by_obj_id(obj_id, "Feature", db)
    else:
        lst_f_transform = self.get_f_transform_candidates(f_node)
        all_docs = f_node.get_docs_match_the_fnode(lst_f_transform)
        if all_docs:
            doc = all_docs[0]
        else:
            doc = None
    if doc and doc["essentials"]["f_transform"]:
        self.matched.append(doc["essentials"]["f_transform"])
    return doc
def collect_doc(self, l_node):
    if not isinstance(l_node, LNode):
        raise TypeError("The parameter l_node should be of the type LNode.")
    if l_node.obj_id:
        obj_id = l_node.obj_id
        db = l_node.core.db
        dh = DocsHandler()
        doc = dh.search_by_obj_id(obj_id, "Label", db)
    else:
        lst_l_transform = self.get_l_transform_candidates(l_node)
        all_docs = l_node.get_docs_match_the_lnode(lst_l_transform)
        if all_docs:
            doc = all_docs[0]
        else:
            doc = None
    if doc and doc["essentials"]["l_transform"]:
        self.matched.append(doc["essentials"]["l_transform"])
    return doc
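# A hedged sketch of how collect_doc is typically driven (the `collector` and
# `l_node` names are placeholders, not a confirmed API): if the node already has an
# obj_id the doc is fetched directly; otherwise candidate transforms are searched
# and the best matching doc (one with filepaths, if any) is returned.
#
#     doc = collector.collect_doc(l_node)
#     if doc is None:
#         ...  # no matching Label doc in the db; the node must be computed and saved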
def __init__(self, data, col_y, lst_layers, shuffle=False, stratified=False,
             col_selected=None, tag=None, db=None, filepaths=None):
    """
    :param data: pandas.DataFrame. This needs to be a pandas data frame with a label column
    :param col_y: The name of the label column
    :param lst_layers: list. This gives the "lst_layers" to the Frame
    :param shuffle: boolean.
    :param stratified: boolean. Should not be used for a regression problem
    :param col_selected: dict. Ex: {'num': ['colname1', 'colname2'], 'cate': ['colname3'], ...}
    :param tag: optional tag inserted into the doc via DocsHandler.insert_tag
    :param db:
    :param filepaths:
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError(
            "The data for initialization should be of the type pandas.DataFrame"
        )
    if col_y and col_y not in data:
        raise KeyError(
            "The column name of the target: col_y provided is not in the data"
        )
    if col_selected:
        for key in col_selected:
            if not isinstance(col_selected[key], list):
                raise TypeError(
                    "All the values in the dictionary col_selected have to be lists."
                )
    super(CoreInit, self).__init__()
    self.__essentials = {}

    # Initializing the rows
    if shuffle:
        idx = np.random.choice(data.index, len(data.index), replace=False)
        data = self.shuffle_pddf_idx(data, idx)
    if stratified:
        data, frame = self.get_stratified_starter_and_frame(lst_layers, data, col_y)
    else:
        frame = self.get_regular_frame(lst_layers, data)
    frame.save_db_file(db=db, filepaths=filepaths)
    self.__frame = frame.obj_id

    # Initializing labels
    if col_y:
        self._y_name = col_y
        values = data[[col_y]].values
        label = Label(frame.obj_id, None, None, values)
        label.save_db_file(db=db, filepaths=filepaths)
        self.__label = label.obj_id
    else:
        self.__label = None

    # Initializing features (columns)
    # to collect dict like {'num': ['colname1', 'colname2'], 'cate': ['colname3'], ...}
    self._column_groups = {}
    # {'num': obj_id(data['colname1', 'colname2']), 'cate': obj_id(data['colname3']), ...}
    self._init_features = {}
    if isinstance(col_selected, dict):
        for key in col_selected:
            cols = col_selected[key]
            self._column_groups[key] = cols
            values = data[cols].values
            feature = Feature(frame.obj_id, None, None, None, values=values)
            feature.stage = 0
            feature.save_db_file(db=db, filepaths=filepaths)
            self._init_features[key] = feature.obj_id
        self.col_selected = col_selected
    elif not col_selected:
        cols = data.columns
        values = data[cols].values
        feature = Feature(frame.obj_id, None, None, None, values=values)
        feature.save_db_file(db=db, filepaths=filepaths)
        self._init_features['raw'] = feature.obj_id
        self.col_selected = cols
    elif isinstance(col_selected, list):
        raise NotImplementedError(
            "Currently only a dictionary is supported to initialize features"
        )
    else:
        raise ValueError(
            "Don't know what to do with the way you specified columns"
        )

    if type(self) == CoreInit:
        self.save_db_file(db=db, filepaths=filepaths)
        DocsHandler.insert_tag(self, tag)
        print(self.obj_id)
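# A minimal construction sketch for CoreInit (assuming a toy DataFrame; the column
# names, layer values and db settings below are hypothetical placeholders, since the
# exact shape of `lst_layers` and `db` is not documented here):
#
#     import pandas as pd
#
#     df = pd.DataFrame({
#         'age': [25, 32, 47, 51],
#         'income': [40e3, 52e3, 81e3, 95e3],
#         'segment': ['a', 'b', 'a', 'b'],
#         'target': [0, 1, 0, 1],
#     })
#     core = CoreInit(
#         data=df,
#         col_y='target',
#         lst_layers=[2, 2],
#         shuffle=True,
#         col_selected={'num': ['age', 'income'], 'cate': ['segment']},
#         db={'host': 'localhost', 'port': 27017, 'db_name': 'demo'},
#     )
#
# On construction the frame, the label and each initial feature group are saved via
# save_db_file, and the new obj_id is printed.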
def get_f_transform_candidates(self, f_node):
    dh = DocsHandler()
    lst_transform_ids = dh.search_by_essentials(f_node.f_transform, f_node.core.db)
    # Keep only the transform ids that have not been matched already
    lst_transform_ids = [x["_id"] for x in lst_transform_ids
                         if x["_id"] not in self.matched]
    return lst_transform_ids