def read_multiple_dataframes(
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: str = None,
    tag_column: str = None
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read train/valid/infer dataframes from the given paths.

    Each fold is loaded with ``merge_multiple_fold_csv``. When ``tag2class``,
    ``tag_column`` and ``class_column`` are all provided, every fold is
    additionally passed through ``map_dataframe``.

    Args:
        in_csv_train (str): paths to train csv separated by commas
        in_csv_valid (str): paths to valid csv separated by commas
        in_csv_infer (str): paths to infer csv separated by commas
        tag2class (Dict[str, int], optional): mapping from label names
            into int
        tag_column (str, optional): column with label names
        class_column (str, optional): column to use for split

    Returns:
        (tuple): tuple with 4 dataframes
            whole dataframe, train part, valid part and infer part
    """
    df_train = merge_multiple_fold_csv(fold_name="train", paths=in_csv_train)
    df_valid = merge_multiple_fold_csv(fold_name="valid", paths=in_csv_valid)
    df_infer = merge_multiple_fold_csv(fold_name="infer", paths=in_csv_infer)

    if args_are_not_none(tag2class, tag_column, class_column):
        df_train = map_dataframe(df_train, tag_column, class_column, tag2class)
        df_valid = map_dataframe(df_valid, tag_column, class_column, tag2class)
        df_infer = map_dataframe(df_infer, tag_column, class_column, tag2class)

    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` produces
    # the same train -> valid -> infer stacking with a fresh 0..n-1 index.
    result_dataframe = pd.concat(
        [df_train, df_valid, df_infer], ignore_index=True
    )

    return result_dataframe, df_train, df_valid, df_infer
def read_multiple_dataframes(
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: str = None,
    tag_column: str = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read train/valid/infer dataframes from the given paths.

    Only folds whose path argument is not ``None`` are loaded (with
    ``merge_multiple_fold_csv``). When ``tag2class``, ``tag_column`` and
    ``class_column`` are all provided, each loaded fold is additionally
    passed through ``map_dataframe``.

    Args:
        in_csv_train: paths to train csv separated by commas
        in_csv_valid: paths to valid csv separated by commas
        in_csv_infer: paths to infer csv separated by commas
        tag2class (Dict[str, int], optional): mapping from label names
            into int
        tag_column (str, optional): column with label names
        class_column (str, optional): column to use for split

    Returns:
        tuple: tuple with 4 dataframes
            whole dataframe, train part, valid part and infer part.
            A part is ``None`` when its path argument was not given.
    """
    assert any(
        x is not None for x in (in_csv_train, in_csv_valid, in_csv_infer)
    ), "at least one of in_csv_train, in_csv_valid, in_csv_infer is required"

    fold_paths = (
        ("train", in_csv_train),
        ("valid", in_csv_valid),
        ("infer", in_csv_infer),
    )
    # The mapping decision does not depend on the fold, so evaluate it once.
    needs_mapping = args_are_not_none(tag2class, tag_column, class_column)

    fold_dfs = {}
    for fold_name, paths in fold_paths:
        if paths is None:
            continue
        fold_df = merge_multiple_fold_csv(fold_name=fold_name, paths=paths)
        if needs_mapping:
            fold_df = map_dataframe(
                fold_df, tag_column, class_column, tag2class
            )
        fold_dfs[fold_name] = fold_df

    # Folds were inserted in train -> valid -> infer order.
    frames = list(fold_dfs.values())
    if not frames:
        # Only reachable when asserts are disabled (``python -O``).
        result_df = None
    elif len(frames) == 1:
        # A single fold is returned as-is (same object, index untouched).
        result_df = frames[0]
    else:
        # ``DataFrame.append`` was removed in pandas 2.0; one ``pd.concat``
        # call also avoids re-copying the accumulated frame per fold.
        result_df = pd.concat(frames, ignore_index=True)

    output = (
        result_df,
        fold_dfs.get("train", None),
        fold_dfs.get("valid", None),
        fold_dfs.get("infer", None),
    )

    return output
def split_dataframe(
    dataframe: pd.DataFrame,
    train_folds: List[int],
    valid_folds: Optional[List[int]] = None,
    infer_folds: Optional[List[int]] = None,
    tag2class: Optional[Dict[str, int]] = None,
    tag_column: str = None,
    class_column: str = None,
    seed: int = 42,
    n_folds: int = 5,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Assign folds to a Pandas DataFrame and cut it into train/valid/infer.

    Args:
        dataframe: input dataframe
        train_folds: folds that form the train part
        valid_folds (List[int], optional): folds that form the valid part.
            If ``None``, the fold values of all rows outside
            ``train_folds`` are used
        infer_folds (List[int], optional): folds that form the infer part.
            If ``None`` or empty, an empty list is passed to
            ``folds_to_list``
        tag2class (Dict[str, int], optional): mapping from label names
            into int
        tag_column (str, optional): column with label names
        class_column (str, optional): column to use for split; when given,
            the stratified splitter is used
        seed: seed for split
        n_folds: number of folds

    Returns:
        tuple: tuple with 4 dataframes
            whole dataframe, train part, valid part and infer part
    """
    if args_are_not_none(tag2class, tag_column, class_column):
        dataframe = map_dataframe(
            dataframe, tag_column, class_column, tag2class
        )

    # Plain split without a class column, stratified split otherwise.
    if class_column is None:
        df_all = split_dataframe_on_folds(
            dataframe, random_state=seed, n_folds=n_folds
        )
    else:
        df_all = split_dataframe_on_stratified_folds(
            dataframe,
            class_column=class_column,
            random_state=seed,
            n_folds=n_folds,
        )

    fold_of_row = df_all["fold"]

    train_ids = folds_to_list(train_folds)
    in_train = fold_of_row.isin(train_ids)
    df_train = df_all[in_train]

    if valid_folds is None:
        # Fall back to the fold values of every row outside the train part.
        valid_folds = df_all[~in_train]["fold"]
    valid_ids = folds_to_list(valid_folds)
    df_valid = df_all[fold_of_row.isin(valid_ids)]

    infer_ids = folds_to_list(infer_folds or [])
    df_infer = df_all[fold_of_row.isin(infer_ids)]

    return df_all, df_train, df_valid, df_infer
def read_csv_data(
    in_csv: str = None,
    train_folds: Optional[List[int]] = None,
    valid_folds: Optional[List[int]] = None,
    infer_folds: Optional[List[int]] = None,
    seed: int = 42,
    n_folds: int = 5,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: str = None,
    tag_column: str = None,
) -> Tuple[
    pd.DataFrame,
    Optional[List[dict]],
    Optional[List[dict]],
    Optional[List[dict]],
]:
    """Read a dataset from csv and return it with its train/valid/infer parts.

    From the given path ``in_csv`` reads a dataframe
    and splits it into train/valid/infer folds,
    or from several paths ``in_csv_train``, ``in_csv_valid``,
    ``in_csv_infer`` reads independent folds.

    Note:
        This function can be used with different combinations of params.
        The first block is used to get the dataset from one `csv`:
        in_csv, train_folds, valid_folds, infer_folds, seed, n_folds
        The second includes paths to different csv for train/valid
        and infer parts:
        in_csv_train, in_csv_valid, in_csv_infer
        The other params (tag2class, tag_column, class_column)
        are optional for any previous block

    Args:
        in_csv (str): path to the whole dataset
        train_folds (List[int]): train folds
        valid_folds (List[int], optional): valid folds
        infer_folds (List[int], optional): infer folds
        seed (int): seed for split
        n_folds (int): number of folds

        in_csv_train (str): paths to train csv separated by commas
        in_csv_valid (str): paths to valid csv separated by commas
        in_csv_infer (str): paths to infer csv separated by commas

        tag2class (Dict[str, int]): mapping from label names into ints
        tag_column (str): column with label names
        class_column (str): column to use for split

    Returns:
        (Tuple[pd.DataFrame, List[dict], List[dict], List[dict]]):
            tuple with 4 elements
            (whole dataframe,
            list with train data,
            list with valid data
            and list with infer data).
            A list is replaced by ``None`` when the corresponding fold
            dataframe is ``None``.

    Raises:
        ValueError: if both or neither of the two input modes are selected
            (``in_csv`` versus ``in_csv_train`` together with
            ``in_csv_valid``, or ``in_csv_infer``).
    """
    from_one_df: bool = in_csv is not None
    from_multiple_df: bool = (
        args_are_not_none(in_csv_train, in_csv_valid)
        or in_csv_infer is not None
    )

    # Exactly one of the two input modes must be selected.
    if from_one_df == from_multiple_df:
        raise ValueError("You should pass `in_csv` "
                         "or `in_csv_train` with `in_csv_valid` but not both!")

    if from_one_df:
        dataframe: pd.DataFrame = pd.read_csv(in_csv)
        dataframe, df_train, df_valid, df_infer = split_dataframe(
            dataframe,
            train_folds=train_folds,
            valid_folds=valid_folds,
            infer_folds=infer_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            seed=seed,
            n_folds=n_folds,
        )
    else:
        dataframe, df_train, df_valid, df_infer = read_multiple_dataframes(
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
        )

    # A fold may be None (e.g. only ``in_csv_infer`` was passed), so guard
    # before touching ``.columns``; the helper "fold" column is dropped
    # in place from every fold that has it.
    for data in (df_train, df_valid, df_infer):
        if data is not None and "fold" in data.columns:
            del data["fold"]

    result = (
        dataframe,
        dataframe_to_list(df_train) if df_train is not None else None,
        dataframe_to_list(df_valid) if df_valid is not None else None,
        dataframe_to_list(df_infer) if df_infer is not None else None,
    )

    return result
def test_args_are_not_none():
    """Check ``args_are_not_none`` with and without ``None`` arguments."""
    # No argument is None; a falsy value such as "" still counts as present.
    all_present = args_are_not_none(1, 2, 3, "")
    assert all_present

    # A single None among otherwise valid values makes the check fail.
    one_missing = args_are_not_none(-8, "", None, True)
    assert not one_missing

    # A lone None argument fails as well.
    only_none = args_are_not_none(None)
    assert not only_none