def read_multiple_dataframes(
    in_csv_train: Optional[str] = None,
    in_csv_valid: Optional[str] = None,
    in_csv_infer: Optional[str] = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: Optional[str] = None,
    tag_column: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read train/valid/infer dataframes from the given paths.

    Each ``in_csv_*`` argument is handed to ``merge_multiple_fold_csv``
    together with its fold name ("train", "valid" or "infer"). When
    ``tag2class``, ``tag_column`` and ``class_column`` are all provided,
    every fold is additionally passed through ``map_dataframe``.

    Args:
        in_csv_train (str): paths to train csv separated by commas
        in_csv_valid (str): paths to valid csv separated by commas
        in_csv_infer (str): paths to infer csv separated by commas
        tag2class (Dict[str, int], optional): mapping from label names
            into int
        class_column (str, optional): class column name forwarded to
            ``map_dataframe``
        tag_column (str, optional): column with label names

    Returns:
        (tuple): tuple with 4 dataframes
            whole dataframe, train part, valid part and infer part
    """
    df_train = merge_multiple_fold_csv(fold_name="train", paths=in_csv_train)
    df_valid = merge_multiple_fold_csv(fold_name="valid", paths=in_csv_valid)
    df_infer = merge_multiple_fold_csv(fold_name="infer", paths=in_csv_infer)

    # Label mapping is optional: it needs the mapping and both column names.
    if args_are_not_none(tag2class, tag_column, class_column):
        df_train = map_dataframe(df_train, tag_column, class_column, tag2class)
        df_valid = map_dataframe(df_valid, tag_column, class_column, tag2class)
        df_infer = map_dataframe(df_infer, tag_column, class_column, tag2class)

    # ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0;
    # ``pd.concat`` stacks the folds in the same train -> valid -> infer
    # order with a fresh 0..n-1 index.
    result_dataframe = pd.concat(
        [df_train, df_valid, df_infer], ignore_index=True
    )

    return result_dataframe, df_train, df_valid, df_infer
def read_multiple_dataframes(
    in_csv_train: Optional[str] = None,
    in_csv_valid: Optional[str] = None,
    in_csv_infer: Optional[str] = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: Optional[str] = None,
    tag_column: Optional[str] = None,
) -> Tuple[
    pd.DataFrame,
    Optional[pd.DataFrame],
    Optional[pd.DataFrame],
    Optional[pd.DataFrame],
]:
    """Read train/valid/infer dataframes from the given paths.

    Only folds whose ``in_csv_*`` argument is not ``None`` are read (via
    ``merge_multiple_fold_csv``). When ``tag2class``, ``tag_column`` and
    ``class_column`` are all provided, every fold that was read is
    additionally passed through ``map_dataframe``.

    Args:
        in_csv_train (str): paths to train csv separated by commas
        in_csv_valid (str): paths to valid csv separated by commas
        in_csv_infer (str): paths to infer csv separated by commas
        tag2class (Dict[str, int], optional): mapping from label names
            into int
        class_column (str, optional): class column name forwarded to
            ``map_dataframe``
        tag_column (str, optional): column with label names

    Returns:
        (tuple): tuple with 4 items
            whole dataframe, train part, valid part and infer part;
            a part is ``None`` when its ``in_csv_*`` argument was ``None``

    Raises:
        AssertionError: if all of ``in_csv_train``, ``in_csv_valid`` and
            ``in_csv_infer`` are ``None``
    """
    sources = (
        ("train", in_csv_train),
        ("valid", in_csv_valid),
        ("infer", in_csv_infer),
    )

    # Raised explicitly (instead of a bare ``assert``) so the check is not
    # stripped under ``python -O``; the exception type stays the same.
    if all(paths is None for _, paths in sources):
        raise AssertionError(
            "at least one of in_csv_train, in_csv_valid, in_csv_infer "
            "must be provided"
        )

    # Loop-invariant: decide once whether label mapping is requested.
    need_mapping = args_are_not_none(tag2class, tag_column, class_column)

    fold_dfs = {}
    frames = []
    for fold_name, paths in sources:
        if paths is None:
            continue
        fold_df = merge_multiple_fold_csv(fold_name=fold_name, paths=paths)
        if need_mapping:
            fold_df = map_dataframe(
                fold_df, tag_column, class_column, tag2class
            )
        fold_dfs[fold_name] = fold_df
        frames.append(fold_df)

    # ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0.
    # A single fold is returned as-is (matching the previous behavior);
    # several folds are stacked in train -> valid -> infer order.
    if len(frames) == 1:
        result_df = frames[0]
    else:
        result_df = pd.concat(frames, ignore_index=True)

    return (
        result_df,
        fold_dfs.get("train"),
        fold_dfs.get("valid"),
        fold_dfs.get("infer"),
    )
def test_args_are_not_none():
    """Exercise ``utils.args_are_not_none`` with and without ``None``."""
    check = utils.args_are_not_none

    # Falsy values other than None (here an empty string) are accepted.
    assert check(1, 2, 3, "")

    # A None among the arguments must make the result falsy.
    assert not check(-8, "", None, True)
    assert not check(None)
def split_dataframe(
    dataframe: pd.DataFrame,
    train_folds: List[int],
    valid_folds: Optional[List[int]] = None,
    infer_folds: Optional[List[int]] = None,
    tag2class: Optional[Dict[str, int]] = None,
    tag_column: str = None,
    class_column: str = None,
    seed: int = 42,
    n_folds: int = 5
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Assign rows of a Pandas DataFrame to folds and slice out the parts.

    The frame is first optionally label-mapped (``map_dataframe``), then
    split by ``stratified_fold_split`` when ``class_column`` is given and
    by ``default_fold_split`` otherwise. The split result must carry a
    ``"fold"`` column, which is used to select the train/valid/infer rows.

    Args:
        dataframe (pd.DataFrame): input dataframe
        train_folds (List[int]): train folds
        valid_folds (List[int], optional): valid folds.
            If none takes all folds not included in ``train_folds``
        infer_folds (List[int], optional): infer folds.
            If none (or empty) an empty list is passed to
            ``folds_to_list``, so no fold is requested for the infer part
        tag2class (Dict[str, int], optional): mapping from label names
            into int; applied only together with ``tag_column`` and
            ``class_column``
        tag_column (str, optional): column with label names
        class_column (str, optional): column to use for split
        seed (int): seed for split
        n_folds (int): number of folds

    Returns:
        (tuple): tuple with 4 dataframes
            whole dataframe, train part, valid part and infer part
    """
    mapping_requested = args_are_not_none(tag2class, tag_column, class_column)
    if mapping_requested:
        dataframe = map_dataframe(
            dataframe, tag_column, class_column, tag2class
        )

    # Without a class column there is nothing to stratify on.
    if class_column is None:
        folded = default_fold_split(
            dataframe, random_state=seed, n_folds=n_folds
        )
    else:
        folded = stratified_fold_split(
            dataframe,
            class_column=class_column,
            random_state=seed,
            n_folds=n_folds,
        )

    fold_of_row = folded["fold"]

    train_list = folds_to_list(train_folds)
    in_train = fold_of_row.isin(train_list)
    train_part = folded[in_train]

    # Default validation set: every row whose fold is not a train fold.
    valid_source = valid_folds
    if valid_source is None:
        valid_source = folded[~in_train]["fold"]
    valid_list = folds_to_list(valid_source)
    valid_part = folded[fold_of_row.isin(valid_list)]

    infer_list = folds_to_list(infer_folds or [])
    infer_part = folded[fold_of_row.isin(infer_list)]

    return folded, train_part, valid_part, infer_part
def test_args_are_not_none():
    """Exercise ``utils.args_are_not_none`` with and without ``None``."""
    is_all_set = utils.args_are_not_none

    # An empty string is falsy but is not None, so this call must pass.
    assert is_all_set(1, 2, 3, "")

    # Any None argument, alone or mixed in, must give a falsy result.
    assert not is_all_set(-8, "", None, True)
    assert not is_all_set(None)