示例#1
0
def read_multiple_dataframes(
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: str = None,
    tag_column: str = None
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read train/valid/infer dataframes from the given paths.

    Each fold is loaded with ``merge_multiple_fold_csv``. When ``tag2class``,
    ``tag_column`` and ``class_column`` are all provided, every fold is
    additionally passed through ``map_dataframe``.

    Args:
        in_csv_train (str): paths to train csv separated by commas
        in_csv_valid (str): paths to valid csv separated by commas
        in_csv_infer (str): paths to infer csv separated by commas
        tag2class (Dict[str, int], optional): mapping from label names into int
        tag_column (str, optional): column with label names
        class_column (str, optional): column to use for split

    Returns:
        (tuple): tuple with 4 dataframes
            whole dataframe (train, valid and infer rows stacked in that
            order with a fresh index), train part, valid part and infer part
    """
    df_train = merge_multiple_fold_csv(fold_name="train", paths=in_csv_train)
    df_valid = merge_multiple_fold_csv(fold_name="valid", paths=in_csv_valid)
    df_infer = merge_multiple_fold_csv(fold_name="infer", paths=in_csv_infer)

    if args_are_not_none(tag2class, tag_column, class_column):
        df_train = map_dataframe(df_train, tag_column, class_column, tag2class)
        df_valid = map_dataframe(df_valid, tag_column, class_column, tag2class)
        df_infer = map_dataframe(df_infer, tag_column, class_column, tag2class)

    # ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0;
    # ``pd.concat`` is the supported way to stack the folds.
    result_dataframe = pd.concat(
        [df_train, df_valid, df_infer], ignore_index=True
    )

    return result_dataframe, df_train, df_valid, df_infer
示例#2
0
def read_multiple_dataframes(
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: str = None,
    tag_column: str = None,
) -> Tuple[
    pd.DataFrame,
    Optional[pd.DataFrame],
    Optional[pd.DataFrame],
    Optional[pd.DataFrame],
]:
    """Read train/valid/infer dataframes from the given paths.

    Only folds whose path argument is not ``None`` are loaded (with
    ``merge_multiple_fold_csv``). When ``tag2class``, ``tag_column`` and
    ``class_column`` are all provided, each loaded fold is additionally
    passed through ``map_dataframe``.

    Args:
        in_csv_train: paths to train csv separated by commas
        in_csv_valid: paths to valid csv separated by commas
        in_csv_infer: paths to infer csv separated by commas
        tag2class (Dict[str, int], optional): mapping from label names into int
        tag_column (str, optional): column with label names
        class_column (str, optional): column to use for split

    Returns:
        tuple: tuple with 4 elements
            whole dataframe, train part, valid part and infer part;
            a part is ``None`` when its path argument was not given

    Raises:
        AssertionError: if all three path arguments are ``None``
    """
    assert any(
        x is not None for x in (in_csv_train, in_csv_valid, in_csv_infer)
    ), "at least one of in_csv_train, in_csv_valid, in_csv_infer is required"

    # Insertion order (train, valid, infer) defines the row order
    # of the combined dataframe.
    fold_dfs = {}
    fold_sources = (
        ("train", in_csv_train),
        ("valid", in_csv_valid),
        ("infer", in_csv_infer),
    )
    for fold_name, paths in fold_sources:
        if paths is None:
            continue
        fold_df = merge_multiple_fold_csv(fold_name=fold_name, paths=paths)
        if args_are_not_none(tag2class, tag_column, class_column):
            fold_df = map_dataframe(fold_df, tag_column, class_column,
                                    tag2class)
        fold_dfs[fold_name] = fold_df

    loaded = list(fold_dfs.values())
    if len(loaded) > 1:
        # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` is
        # the supported way to stack the folds.
        result_df = pd.concat(loaded, ignore_index=True)
    else:
        # A single fold is returned as-is (same object, index untouched).
        # ``loaded`` is only empty when asserts are disabled (``-O``).
        result_df = loaded[0] if loaded else None

    output = (
        result_df,
        fold_dfs.get("train"),
        fold_dfs.get("valid"),
        fold_dfs.get("infer"),
    )

    return output
示例#3
0
def split_dataframe(
    dataframe: pd.DataFrame,
    train_folds: List[int],
    valid_folds: Optional[List[int]] = None,
    infer_folds: Optional[List[int]] = None,
    tag2class: Optional[Dict[str, int]] = None,
    tag_column: str = None,
    class_column: str = None,
    seed: int = 42,
    n_folds: int = 5,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Assign rows of a dataframe to folds and slice out train/valid/infer.

    The dataframe is first passed through ``map_dataframe`` when
    ``tag2class``, ``tag_column`` and ``class_column`` are all provided.
    It is then split with ``split_dataframe_on_stratified_folds`` if
    ``class_column`` is given, otherwise with ``split_dataframe_on_folds``.
    The resulting ``"fold"`` column is used to select the parts.

    Args:
        dataframe: input dataframe
        train_folds: folds that make up the train part
        valid_folds (List[int], optional): folds that make up the valid part.
            If ``None``, every fold not listed in ``train_folds`` is used
        infer_folds (List[int], optional): folds that make up the infer part.
            If ``None`` (or empty), an empty fold list is used
        tag2class (Dict[str, int], optional): mapping from label names into int
        tag_column (str, optional): column with label names
        class_column (str, optional): column to use for split
        seed: seed for split
        n_folds: number of folds

    Returns:
        tuple: tuple with 4 dataframes
            whole dataframe (with the ``"fold"`` column), train part,
            valid part and infer part
    """
    should_map = args_are_not_none(tag2class, tag_column, class_column)
    if should_map:
        dataframe = map_dataframe(
            dataframe, tag_column, class_column, tag2class
        )

    # Plain split unless a class column is available for stratification.
    if class_column is None:
        df_all = split_dataframe_on_folds(
            dataframe, random_state=seed, n_folds=n_folds
        )
    else:
        df_all = split_dataframe_on_stratified_folds(
            dataframe,
            class_column=class_column,
            random_state=seed,
            n_folds=n_folds,
        )

    fold_of_row = df_all["fold"]

    train_folds = folds_to_list(train_folds)
    train_mask = fold_of_row.isin(train_folds)
    df_train = df_all[train_mask]

    # Default validation set: everything that is not used for training.
    if valid_folds is None:
        valid_folds = fold_of_row[~train_mask]
    valid_folds = folds_to_list(valid_folds)
    df_valid = df_all[fold_of_row.isin(valid_folds)]

    infer_folds = folds_to_list(infer_folds or [])
    df_infer = df_all[fold_of_row.isin(infer_folds)]

    return df_all, df_train, df_valid, df_infer
示例#4
0
def read_csv_data(
    in_csv: str = None,
    train_folds: Optional[List[int]] = None,
    valid_folds: Optional[List[int]] = None,
    infer_folds: Optional[List[int]] = None,
    seed: int = 42,
    n_folds: int = 5,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    tag2class: Optional[Dict[str, int]] = None,
    class_column: str = None,
    tag_column: str = None,
) -> Tuple[
    pd.DataFrame,
    Optional[List[dict]],
    Optional[List[dict]],
    Optional[List[dict]],
]:
    """Read a dataset from csv and return it split into train/valid/infer.

    Either reads one dataframe from ``in_csv`` and splits it into folds,
    or reads independent folds from ``in_csv_train``, ``in_csv_valid``
    and ``in_csv_infer``.

    Note:
        This function can be used with different combinations of params.
        First block is used to get dataset from one `csv`:
            in_csv, train_folds, valid_folds, infer_folds, seed, n_folds
        Second includes paths to different csv for train/valid and infer parts:
            in_csv_train, in_csv_valid, in_csv_infer
            (either both train and valid paths, or an infer path, or all)
        The other params (tag2class, tag_column, class_column) are optional
            for any previous block

    Args:
        in_csv (str): path to whole dataset
        train_folds (List[int]): train folds
        valid_folds (List[int], optional): valid folds;
            forwarded to ``split_dataframe``
        infer_folds (List[int], optional): infer folds;
            forwarded to ``split_dataframe``
        seed (int): seed for split
        n_folds (int): number of folds

        in_csv_train (str): paths to train csv separated by commas
        in_csv_valid (str): paths to valid csv separated by commas
        in_csv_infer (str): paths to infer csv separated by commas

        tag2class (Dict[str, int]): mapping from label names into ints
        tag_column (str): column with label names
        class_column (str): column to use for split

    Returns:
        (Tuple[pd.DataFrame, List[dict], List[dict], List[dict]]):
            tuple with 4 elements
            (whole dataframe,
            list with train data,
            list with valid data
            and list with infer data);
            a list is ``None`` when the corresponding fold was not loaded

    Raises:
        ValueError: if both or neither of the two input modes are selected
    """
    from_one_df: bool = in_csv is not None
    from_multiple_df: bool = \
        args_are_not_none(in_csv_train, in_csv_valid) \
        or in_csv_infer is not None

    # Exactly one of the two input modes must be selected.
    if from_one_df == from_multiple_df:
        raise ValueError("You should pass `in_csv` "
                         "or `in_csv_train` with `in_csv_valid` but not both!")

    if from_one_df:
        dataframe: pd.DataFrame = pd.read_csv(in_csv)
        dataframe, df_train, df_valid, df_infer = split_dataframe(
            dataframe,
            train_folds=train_folds,
            valid_folds=valid_folds,
            infer_folds=infer_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            seed=seed,
            n_folds=n_folds)
    else:
        dataframe, df_train, df_valid, df_infer = read_multiple_dataframes(
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column)

    # A fold can be ``None`` when its csv was not provided (e.g. only
    # ``in_csv_infer`` was passed), so guard before touching it.
    for data in [df_train, df_valid, df_infer]:
        if data is not None and "fold" in data.columns:
            del data["fold"]

    result = (
        dataframe,
        dataframe_to_list(df_train) if df_train is not None else None,
        dataframe_to_list(df_valid) if df_valid is not None else None,
        dataframe_to_list(df_infer) if df_infer is not None else None,
    )

    return result
示例#5
0
def test_args_are_not_none():
    """Check ``args_are_not_none`` on inputs with and without ``None``."""
    # Falsy-but-not-None values such as "" must still count as present.
    without_none = (1, 2, 3, "")
    assert args_are_not_none(*without_none)

    # Any ``None`` among the arguments must make the check fail.
    for with_none in ((-8, "", None, True), (None,)):
        assert not args_are_not_none(*with_none)