Пример #1
0
def read_multiple_dataframes(
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: str = None,
    tag_column: str = None
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read train/valid/infer dataframes from the given paths.

    Every fold is loaded with ``merge_multiple_fold_csv``. When
    ``tag2class``, ``tag_column`` and ``class_column`` are all provided
    (as judged by ``args_are_not_none``), each fold is additionally passed
    through ``map_dataframe``.

    Args:
        in_csv_train (str): paths to train csv separated by commas
        in_csv_valid (str): paths to valid csv separated by commas
        in_csv_infer (str): paths to infer csv separated by commas
        tag2class (Dict[str, int], optional): mapping from label names into int
        class_column (str, optional): class column name handed to
            ``map_dataframe``
        tag_column (str, optional): column with label names

    Returns:
        (tuple): tuple with 4 dataframes
            whole dataframe (train, valid and infer rows stacked in that
            order with a fresh integer index), train part, valid part and
            infer part
    """
    df_train = merge_multiple_fold_csv(fold_name="train", paths=in_csv_train)
    df_valid = merge_multiple_fold_csv(fold_name="valid", paths=in_csv_valid)
    df_infer = merge_multiple_fold_csv(fold_name="infer", paths=in_csv_infer)

    if args_are_not_none(tag2class, tag_column, class_column):
        df_train = map_dataframe(df_train, tag_column, class_column, tag2class)
        df_valid = map_dataframe(df_valid, tag_column, class_column, tag2class)
        df_infer = map_dataframe(df_infer, tag_column, class_column, tag2class)

    # ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0;
    # a single ``pd.concat`` produces the same stacked, re-indexed frame.
    result_dataframe = pd.concat(
        [df_train, df_valid, df_infer], ignore_index=True
    )

    return result_dataframe, df_train, df_valid, df_infer
Пример #2
0
def read_multiple_dataframes(
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: str = None,
    tag_column: str = None
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read train/valid/infer dataframes from the given paths.

    Only folds whose path argument is not ``None`` are loaded (with
    ``merge_multiple_fold_csv``). When ``tag2class``, ``tag_column`` and
    ``class_column`` are all provided (as judged by ``args_are_not_none``),
    each loaded fold is additionally passed through ``map_dataframe``.

    Args:
        in_csv_train (str): paths to train csv separated by commas
        in_csv_valid (str): paths to valid csv separated by commas
        in_csv_infer (str): paths to infer csv separated by commas
        tag2class (Dict[str, int], optional): mapping from label names into int
        class_column (str, optional): class column name handed to
            ``map_dataframe``
        tag_column (str, optional): column with label names

    Returns:
        (tuple): tuple with 4 items
            whole dataframe (all loaded folds stacked in train, valid,
            infer order), train part, valid part and infer part;
            parts whose path was not given are ``None``

    Raises:
        AssertionError: if all three path arguments are ``None``
    """
    assert any(
        path is not None
        for path in (in_csv_train, in_csv_valid, in_csv_infer)
    ), "at least one of in_csv_train, in_csv_valid, in_csv_infer is required"

    # The mapping decision does not depend on the fold, so evaluate it once.
    apply_mapping = args_are_not_none(tag2class, tag_column, class_column)

    fold_dfs = {}
    for fold_name, paths in (
        ("train", in_csv_train),
        ("valid", in_csv_valid),
        ("infer", in_csv_infer),
    ):
        if paths is None:
            continue
        fold_df = merge_multiple_fold_csv(fold_name=fold_name, paths=paths)
        if apply_mapping:
            fold_df = map_dataframe(
                fold_df, tag_column, class_column, tag2class
            )
        fold_dfs[fold_name] = fold_df

    # ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0,
    # so the loaded folds are stacked with one ``pd.concat`` call instead.
    frames = list(fold_dfs.values())
    if not frames:
        # Only reachable when assertions are disabled (``python -O``).
        result_df = None
    elif len(frames) == 1:
        # A lone fold is returned as-is, keeping its original index.
        result_df = frames[0]
    else:
        result_df = pd.concat(frames, ignore_index=True)

    return (
        result_df,
        fold_dfs.get("train"),
        fold_dfs.get("valid"),
        fold_dfs.get("infer"),
    )
Пример #3
0
def test_args_are_not_none():
    """Verify ``utils.args_are_not_none`` with and without ``None`` inputs.

    A call with no ``None`` among its arguments must be truthy (an empty
    string still counts as a value); any ``None`` must make it falsy.
    """
    without_none = (1, 2, 3, "")
    with_none_inside = (-8, "", None, True)
    only_none = (None,)

    assert utils.args_are_not_none(*without_none)
    assert not utils.args_are_not_none(*with_none_inside)
    assert not utils.args_are_not_none(*only_none)
Пример #4
0
def split_dataframe(
    dataframe: pd.DataFrame,
    train_folds: List[int],
    valid_folds: Optional[List[int]] = None,
    infer_folds: Optional[List[int]] = None,
    tag2class: Optional[Dict[str, int]] = None,
    tag_column: str = None,
    class_column: str = None,
    seed: int = 42,
    n_folds: int = 5
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Assign folds to a Pandas DataFrame and cut it into train/valid/infer.

    The frame is first (optionally) passed through ``map_dataframe``, then
    given a ``"fold"`` column by ``stratified_fold_split`` (when
    ``class_column`` is set) or ``default_fold_split`` (otherwise), and
    finally filtered by fold membership.

    Args:
        dataframe (pd.DataFrame): input dataframe
        train_folds (List[int]): folds that form the train part
        valid_folds (List[int], optional): folds that form the valid part.
            If ``None``, every fold not listed in ``train_folds`` is used
        infer_folds (List[int], optional): folds that form the infer part.
            If ``None`` or empty, an empty list is handed to
            ``folds_to_list`` (so the infer part is presumably empty)
        tag2class (Dict[str, int], optional): mapping from label names into int
        tag_column (str, optional): column with label names
        class_column (str, optional): column to use for the stratified split
        seed (int): random state passed to the fold-splitting helper
        n_folds (int): number of folds
    Returns:
        (tuple): tuple with 4 dataframes
            whole dataframe (with fold assignment), train part,
            valid part and infer part
    """
    # Label mapping is applied only when all three pieces are supplied.
    if args_are_not_none(tag2class, tag_column, class_column):
        dataframe = map_dataframe(
            dataframe, tag_column, class_column, tag2class
        )

    if class_column is None:
        result_dataframe = default_fold_split(
            dataframe, random_state=seed, n_folds=n_folds
        )
    else:
        result_dataframe = stratified_fold_split(
            dataframe,
            class_column=class_column,
            random_state=seed,
            n_folds=n_folds,
        )

    fold_series = result_dataframe["fold"]

    # Train part; the membership mask is reused for the valid default below.
    train_folds = folds_to_list(train_folds)
    in_train = fold_series.isin(train_folds)
    df_train = result_dataframe[in_train]

    # Valid part: defaults to the fold ids of all rows outside train.
    if valid_folds is None:
        valid_folds = fold_series[~in_train]
    valid_folds = folds_to_list(valid_folds)
    df_valid = result_dataframe[fold_series.isin(valid_folds)]

    # Infer part: a missing/empty value is replaced by an empty list.
    infer_folds = folds_to_list(infer_folds if infer_folds else [])
    df_infer = result_dataframe[fold_series.isin(infer_folds)]

    return result_dataframe, df_train, df_valid, df_infer
Пример #5
0
def test_args_are_not_none():
    """Check ``utils.args_are_not_none`` on ``None``-free and ``None`` inputs.

    The result must be truthy when no argument is ``None`` and falsy as
    soon as at least one argument is ``None``.
    """
    assert utils.args_are_not_none(*(1, 2, 3, ""))

    # Each of these argument sets contains a ``None``.
    for arguments in ((-8, "", None, True), (None,)):
        assert not utils.args_are_not_none(*arguments)