def drop_duplicate_rows(data: dd = None,
                        subset: List[str] = None,
                        keep: str = None) -> dd:
    """
    Drop rows containing duplicate data for the specified subset of columns
    :param data: dask dataframe; required in practice, because the ``None``
        default has no ``drop_duplicates`` method
    :param subset: list of column names, forwarded as ``subset``
    :param keep: which duplicate to keep; forwarded as ``keep`` when given.
        ``None`` (the default) leaves the choice to the dataframe library's
        own default instead of passing ``None`` through
    :return: modified dask dataframe
    """
    # Compare against None explicitly: False is a legitimate `keep` value in
    # the pandas-style API, so a truthiness test would wrongly discard it.
    if keep is None:
        # Forwarding keep=None is rejected by pandas-style drop_duplicates
        # (only "first", "last" or False are accepted), which would make
        # this function unusable with its own default arguments.
        return data.drop_duplicates(subset=subset)
    return data.drop_duplicates(subset=subset, keep=keep)
# Example no. 2
0
    def transform(self, X: dd, y=None):
        """
        Drop duplicate rows from ``X``.

        The columns used to detect duplicates are taken from ``self.subset``
        and handed to ``X.drop_duplicates``.

        Args:
            X (dd): Dataframe to deduplicate
            y (dd, optional): Target; accepted but not used by this method.
                Defaults to None.

        Returns:
            (dd): Result of ``X.drop_duplicates`` for the configured subset
        """
        columns = self.subset
        deduplicated = X.drop_duplicates(subset=columns)
        return deduplicated