Example #1
 def create_days_between_feature(self, df, feature_name_one, feature_name_two, result_name = None, absolute_value = True):
     """
     Takes two pandas.DataFrame datetime columns and calculates the number of
     days between the dates (absolute by default).
     
     Parameters
     ----------
     df : DataFrame
         The dataframe containing the dates to calculate days between.
     feature_name_one : str
         The name of the primary feature column.
     feature_name_two : str
         The name of the secondary feature column.
     result_name : str, optional. Default is None.
         Sets the name of the resulting column to the one provided.
     absolute_value : bool, optional. Default is True.
         If True, the absolute value of the difference between dates is used.
     
     Procedure
     ---------
         1. Ensure both features are pandas datetime columns.
         2. Take the difference between the primary and secondary dates and cast to days (absolute value if requested).
         3. Add the result to the dataframe as 'days_between_{feature_name_one}_and_{feature_name_two}', or as result_name if provided.
     """
     if is_datetime(df[feature_name_one]) and is_datetime(df[feature_name_two]):
         result_col_name = f'days_between_{feature_name_one}_and_{feature_name_two}'
         if result_name is not None:
             result_col_name = result_name
         df[result_col_name] = (df[feature_name_one] - df[feature_name_two]).dt.days
         if absolute_value:
             df[result_col_name] = abs(df[result_col_name])
     else:
         print(f'Expected datetime features, received 1: {df[feature_name_one].dtype} 2: {df[feature_name_two].dtype}')
     return
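Most snippets on this page assume the alias imported explicitly in Example #8, `from pandas.api.types import is_datetime64_any_dtype as is_datetime`. A minimal sketch of the core of the method above (the column names are invented for illustration):

import pandas as pd

df = pd.DataFrame({
    'signup': pd.to_datetime(['2021-01-01', '2021-03-15']),
    'churn': pd.to_datetime(['2021-02-01', '2021-03-01']),
})
# Equivalent to create_days_between_feature(df, 'signup', 'churn'):
# subtract the datetime columns, take .dt.days, then abs()
df['days_between_signup_and_churn'] = abs((df['signup'] - df['churn']).dt.days)
print(df['days_between_signup_and_churn'].tolist())  # [31, 14]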
Example #2
def _find_or_check_datetime_variables(X: pd.DataFrame,
                                      variables: Variables = None
                                      ) -> List[Union[str, int]]:
    """
    Checks that variables provided by the user are of type datetime.
    If None, finds all datetime variables in the DataFrame.

    Parameters
    ----------
    X : pandas DataFrame
    variables : variable or list of variables. Defaults to None.

    Returns
    -------
    variables : List of datetime variables.
    """

    if variables is None:
        variables = [
            column for column in X.select_dtypes(exclude="number").columns
            if is_datetime(X[column])
            or _is_categorical_and_is_datetime(X[column])
        ]

        if len(variables) == 0:
            raise ValueError("No datetime variables found in this dataframe.")

    elif isinstance(variables, (str, int)):

        if is_datetime(X[variables]) or (not is_numeric(X[variables])
                                         and _is_categorical_and_is_datetime(
                                             X[variables])):
            variables = [variables]
        else:
            raise TypeError("The variable entered is not datetime.")

    else:
        if len(variables) == 0:
            raise ValueError("The indicated list of variables is empty.")

        # check that the variables entered by the user are datetime
        else:
            vars_non_dt = [
                column
                for column in X[variables].select_dtypes(exclude="datetime")
                if is_numeric(X[column])
                or not _is_categorical_and_is_datetime(X[column])
            ]

            if len(vars_non_dt) > 0:
                raise TypeError("Some of the variables are not datetime.")

    return variables
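A hedged sketch of how the checker above behaves (it is feature_engine's internal variable handling; `is_numeric` and `_is_categorical_and_is_datetime` are that library's helpers and are not redefined here):

import pandas as pd

df = pd.DataFrame({
    'start': pd.to_datetime(['2020-01-01', '2020-06-01']),
    'amount': [10.0, 12.5],
})
# _find_or_check_datetime_variables(df)           -> ['start'] (auto-detect)
# _find_or_check_datetime_variables(df, 'start')  -> ['start']
# _find_or_check_datetime_variables(df, 'amount') -> raises TypeError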
Example #3
def series2col(s, name):
    kw = {
        'name': name,
        'kind': fpb.Column.SLICE,
    }

    if is_int_dtype(s.dtype):
        kw['dtype'] = fpb.INTEGER
        kw['ints'] = s
    elif is_float_dtype(s.dtype):
        kw['dtype'] = fpb.FLOAT
        kw['floats'] = s
    elif s.dtype == object:  # Pandas dtype for str is object (np.object was removed in NumPy 1.24)
        kw['strings'] = s
        kw['dtype'] = fpb.STRING
    elif s.dtype == bool:  # np.bool was likewise removed from NumPy
        kw['bools'] = s
        kw['dtype'] = fpb.BOOLEAN
    elif is_datetime(s.dtype):
        if s.dt.tz:
            try:
                s = s.dt.tz_localize(pytz.UTC)
            except TypeError:
                s = s.dt.tz_convert('UTC')
        kw['times'] = s.astype(np.int64)
        kw['dtype'] = fpb.TIME
    elif is_categorical_dtype(s.dtype):
        # We assume categorical data is strings
        kw['strings'] = s.astype(str)
        kw['dtype'] = fpb.STRING
    else:
        raise WriteError('{} - unsupported type - {}'.format(s.name, s.dtype))

    return fpb.Column(**kw)
Example #4
def get_datetime_column_names(df):
    column_names = []
    for column in df.columns:
        if is_datetime(df[column]):
            column_names.append(column)

    return column_names
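A quick check of the scanner above, assuming the usual is_datetime alias is imported:

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime

df = pd.DataFrame({
    'created': pd.to_datetime(['2022-01-01', '2022-01-02']),
    'value': [1, 2],
    'label': ['a', 'b'],
})
print(get_datetime_column_names(df))  # ['created']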
Example #5
    def import_static_data(self,
                           name: str,
                           dataframe: pd.DataFrame) -> str:

        if isinstance(dataframe, pd.DataFrame):
            if not os.path.exists('.data'):
                os.makedirs('./.data')
            for column in dataframe.columns:
                if is_datetime(dataframe[column]):
                    dataframe[column] = dataframe[column].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S.%f"))
            dataframe.to_csv('./.data/temporary-df.csv', index=False, header=True)
            import_guid = str(uuid.uuid4())
            object_name = f"notebook-imports/{import_guid}/import.csv"
            path = self.__get_file_path(object_name)
            if not self.file_repository.upload_file(self._import_bucket, path, './.data/temporary-df.csv'):
                raise Exception("Error Uploading file to bucket")
            return self.notebook_repository.import_static_data(
                project_guid=self.project_guid,
                name=name,
                path=path,
                delete_when_complete=True
            )

        else:
            raise Exception("Error: Must import as data frame")
Example #6
def df_to_datetime_ser(df, col_values, col_date='date', assert_filled=False):
    """
    Obtain the column `col_values` in `df` as a series with
    datetime index from `col_date`.

    Parameters
    ----------
    df : pandas.DataFrame
        Description
    col_values : str
        Column name with values
    col_date : str, default='date'
        Column name with datetime index
    assert_filled : bool, default=False
        Assert that all dates in between have values

    Returns
    -------
    ser : pandas.Series
        Resulting series
    """
    assert col_values in df.columns
    assert col_date in df.columns
    assert is_datetime(df[col_date])

    ser = pd.Series(df[col_values].values, df[col_date])
    # the index must have no duplicate entries
    assert ser.index.duplicated().sum() == 0
    ser.sort_index(inplace=True)
    if assert_filled:
        assert_filled_in_dates(ser)
    return ser
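A small usage sketch (assert_filled_in_dates, used when assert_filled=True, is the helper shown in Example #28 below):

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime

df = pd.DataFrame({
    'date': pd.to_datetime(['2021-01-03', '2021-01-01', '2021-01-02']),
    'sales': [30, 10, 20],
})
ser = df_to_datetime_ser(df, col_values='sales')
print(ser.iloc[0], ser.index.min().date())  # 10 2021-01-01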
Example #7
def unevenness(vis: Vis, ldf: LuxDataFrame, measure_lst: list,
               dimension_lst: list) -> int:
    """
    Measure the unevenness of a bar chart vis.
    If a bar chart is highly uneven across the possible values, then it may be interesting. (e.g., USA produces lots of cars compared to Japan and Europe)
    Likewise, if a bar chart shows that the measure is the same for any possible value the dimension attribute could take on, then it may not be very informative.
    (e.g., the cars produced across all Origins (Europe, Japan, and USA) have approximately the same average Acceleration.)

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame
    measure_lst : list
            List of measures
    dimension_lst : list
            List of dimensions
    Returns
    -------
    int
            Score describing how uneven the bar chart is.
    """
    v = vis.data[measure_lst[0].attribute]
    v = v / v.sum()  # normalize by total to get ratio
    v = v.fillna(0)  # Some bar values may be NaN
    attr = dimension_lst[0].attribute
    if isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
        # If timestamp, use the date repr (e.g., Timestamp('2020-04-05 00:00:00') --> '2020-04-05')
        attr = str(attr._date_repr)
    C = ldf.cardinality[attr]
    D = (0.9)**C  # cardinality-based discounting factor
    v_flat = pd.Series([1 / C] * len(v))
    if is_datetime(v):
        v = v.astype("int")
    return D * euclidean(v, v_flat)
Example #8
def numpy_type_2_xsd_type(value: Any) -> (Any, URIRef):
    from pandas.api.types import is_datetime64_any_dtype as is_datetime
    if isinstance(value, str):
        return value, XSD.string
    if isinstance(value, bool):
        return value, XSD.boolean
    if np.issubdtype(type(value), np.integer):
        return value, Literal(value).datatype
    if np.issubdtype(type(value), np.floating):  # np.float was removed in NumPy 1.24
        return value, Literal(value).datatype
    elif isinstance(
            value,
            Timestamp):  # has to come before the test for type date below
        date_time: datetime = value
        if date_time.hour == 0 and date_time.minute == 0 and date_time.second < 2 and date_time.microsecond < 32:
            return date_time.date(), XSD.date
        return value, XSD.dateTime  # xsd:dateTime is the correct capitalization
    if isinstance(value, date):
        return value, XSD.date
    if is_datetime(value):
        return value, XSD.dateTime
    elif isinstance(value, Timestamp):
        return value, XSD.dateTime
    warning(f"Unknown type in numpy_type_2_xsd_type: {type(value)}")
    return value, None
Example #9
 def format_dates(df, format='%Y-%m-%d'):
     date_cols = [
         column for column in df.columns if is_datetime(df[column])
     ]
     df.loc[:, date_cols] = df[date_cols].apply(
         lambda x: x.dt.strftime(format).replace('NaT', ''))
     return df
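For example (note that on recent pandas the whole-column .loc assignment of formatted strings into a datetime column may emit a dtype-casting FutureWarning):

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime

df = pd.DataFrame({'when': pd.to_datetime(['2021-05-01', '2021-06-15'])})
print(format_dates(df)['when'].tolist())  # ['2021-05-01', '2021-06-15']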
Example #10
def Mort_CompressColumn(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
Example #11
def unevenness(vis:Vis, ldf:LuxDataFrame, measure_lst:list, dimension_lst:list) -> int:
	"""
	Measure the unevenness of a bar chart vis.
	If a bar chart is highly uneven across the possible values, then it may be interesting. (e.g., USA produces lots of cars compared to Japan and Europe)
	Likewise, if a bar chart shows that the measure is the same for any possible value the dimension attribute could take on, then it may not be very informative.
	(e.g., the cars produced across all Origins (Europe, Japan, and USA) have approximately the same average Acceleration.)

	Parameters
	----------
	vis : Vis
	ldf : LuxDataFrame
	measure_lst : list
		List of measures
	dimension_lst : list
		List of dimensions
	Returns
	-------
	int
		Score describing how uneven the bar chart is.
	"""
	v = vis.data[measure_lst[0].attribute]
	v = v/v.sum() # normalize by total to get ratio
	C = ldf.cardinality[dimension_lst[0].attribute]
	D = (0.5) ** C # cardinality-based discounting factor
	v_flat = pd.Series([1 / C] * len(v))
	if (is_datetime(v)):
		v = v.astype('int')
	return D * euclidean(v, v_flat) 
Example #12
    def fit(self, X):
        """Fits the CustomTimestampFeaturizer.

        :param X: The dataset containing timestamp columns to featurize.
        :type X: numpy.array or pandas.DataFrame or iml.datatypes.DenseData or
            scipy.sparse.csr_matrix
        """
        # If the data was previously successfully summarized, then there are no
        # timestamp columns as it must be numeric.
        # Also, if the dataset is sparse, we can assume there are no timestamps
        if isinstance(X, DenseData) or issparse(X):
            return self
        tmp_dataset = X
        # If numpy array, temporarily convert to pandas for easier and uniform timestamp handling
        if isinstance(X, np.ndarray):
            tmp_dataset = pd.DataFrame(X, columns=self._features)
        self._time_col_names = [
            column for column in tmp_dataset.columns
            if is_datetime(tmp_dataset[column])
        ]
        # Calculate the min date for each column
        self._min = []
        for time_col_name in self._time_col_names:
            self._min.append(
                tmp_dataset[time_col_name].map(lambda x: x.timestamp()).min())
        return self
Example #13
def reduce_mem_usage(df: pd.DataFrame,
                     cols_exclude: List[str] = []) -> pd.DataFrame:
    '''Iterate through all the columns of a dataframe and modify
    the data type to reduce memory usage.

    Original code from
    https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
    '''

    start_mem = df.memory_usage().sum() / 1024**2

    cols = [c for c in df.columns if c not in cols_exclude]
    print("Reducing memory for the following columns: ", cols, sep='\n')

    for col in cols:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue

        print(f"Reducing memory for {col}")
        col_type = df[col].dtype

        if col_type != object:

            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min \
                        and c_max < np.iinfo(np.int8).max:

                    df[col] = df[col].astype(np.int8)

                elif c_min > np.iinfo(np.int16).min \
                        and c_max < np.iinfo(np.int16).max:

                    df[col] = df[col].astype(np.int16)

                elif c_min > np.iinfo(np.int32).min \
                        and c_max < np.iinfo(np.int32).max:

                    df[col] = df[col].astype(np.int32)

                elif c_min > np.iinfo(np.int64).min \
                        and c_max < np.iinfo(np.int64).max:

                    df[col] = df[col].astype(np.int64)

            else:
                df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage before: {start_mem:.2f} MB",
          f"Memory usage after: {end_mem:.2f} MB "
          f"({100 * (start_mem - end_mem) / start_mem:.1f}% decrease)",
          sep='\n')

    return df
Example #14
    def test_read_sources_file(self):
        """Sources tables are read in properly"""
        test_path = os.path.join("tests", "res", "test_sources.txt")
        test_sources = utils.read_sources_file(test_path)

        self.assertEqual(test_sources.shape, (2, len(utils.SOURCES_COLUMNS)))
        self.assertEqual(test_sources.loc[0, "website"], "Google")
        self.assertTrue(is_datetime(test_sources["date"]))
Example #15
def preprocess_dataframe(df, time_granularity="1s"): # pragma: no cover
    for feature in df:
        if df[feature].dtype == object:
            df[feature] = pd.Categorical(df[feature])
        elif is_datetime(df[feature]):
            df[feature] = ((df[feature] - df[feature].min()) / pd.Timedelta(time_granularity))

    return
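A brief check of the preprocessing above, which maps a datetime column to elapsed time units (here seconds):

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime

df = pd.DataFrame({'ts': pd.to_datetime(['2021-01-01 00:00:00',
                                         '2021-01-01 00:00:05'])})
preprocess_dataframe(df)   # mutates df in place
print(df['ts'].tolist())   # [0.0, 5.0]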
Example #16
def add_category_data(
    stock_data: pd.DataFrame,
) -> typing.Dict[str, typing.List[typing.List[typing.Union[float, int]]]]:
    if is_datetime(stock_data["date"]):
        data_times = stock_data["date"].dt.strftime("%Y-%m-%d").to_list()
    else:
        data_times = stock_data["date"].to_list()
    return dict(categoryData=data_times, )
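For instance (assuming the snippet's pandas/typing imports are in place):

import pandas as pd

stock_data = pd.DataFrame({'date': pd.to_datetime(['2021-01-04', '2021-01-05'])})
print(add_category_data(stock_data))
# {'categoryData': ['2021-01-04', '2021-01-05']}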
Example #17
    def compute_data_type(self, ldf: LuxDataFrame):
        for attr in list(ldf.columns):
            temporal_var_list = ["month", "year", "day", "date", "time"]
            if (isinstance(attr, pd._libs.tslibs.timestamps.Timestamp)):
                # If timestamp, make the dictionary keys the date repr (e.g., Timestamp('2020-04-05 00:00:00') --> '2020-04-05')
                ldf.data_type_lookup[attr] = "temporal"
            # elif any(var in str(attr).lower() for var in temporal_var_list):
            elif str(attr).lower() in temporal_var_list:
                ldf.data_type_lookup[attr] = "temporal"
            elif ldf.dtypes[attr] == "float64":
                ldf.data_type_lookup[attr] = "quantitative"
            elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
                # See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values
                if (ldf.pre_aggregated):
                    if (ldf.cardinality[attr] == len(ldf)):
                        ldf.data_type_lookup[attr] = "nominal"
                if ldf.cardinality[attr] / len(
                        ldf) < 0.4 and ldf.cardinality[attr] < 10:
                    ldf.data_type_lookup[attr] = "nominal"
                elif check_if_id_like(ldf, attr):
                    ldf.data_type_lookup[attr] = "id"
                else:
                    ldf.data_type_lookup[attr] = "quantitative"
            # Eliminate this clause because a single NaN value can cause the dtype to be object
            elif ldf.dtypes[attr] == "object":
                ldf.data_type_lookup[attr] = "nominal"
            elif is_datetime_series(
                    ldf.dtypes[attr]
            ):  #check if attribute is any type of datetime dtype
                ldf.data_type_lookup[attr] = "temporal"
        # for attr in list(df.dtypes[df.dtypes=="int64"].keys()):
        #   if self.cardinality[attr]>50:
        if (ldf.index.dtype != 'int64' and ldf.index.name):
            ldf.data_type_lookup[ldf.index.name] = "nominal"
        ldf.data_type = self.mapping(ldf.data_type_lookup)

        from pandas.api.types import is_datetime64_any_dtype as is_datetime
        non_datetime_attrs = []
        for attr in ldf.columns:
            if ldf.data_type_lookup[attr] == 'temporal' and not is_datetime(
                    ldf[attr]):
                non_datetime_attrs.append(attr)
        if len(non_datetime_attrs) == 1:
            warnings.warn(
                f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
                "In order to display visualizations for this attribute accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
                "Please consider converting this attribute using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
                "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
                "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
                stacklevel=2)
        elif len(non_datetime_attrs) > 1:
            warnings.warn(
                f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
                "In order to display visualizations for these attributes accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
                "Please consider converting these attributes using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
                "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
                "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
                stacklevel=2)
Example #18
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Finds datetime variables or checks that the variables selected by the user
        can be converted to datetime.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, default=None
            It is not needed in this transformer. You can pass y or None.
        """
        # check input dataframe
        X = check_X(X)

        # special case index
        if self.variables == "index":

            if not (
                is_datetime(X.index)
                or (
                    not is_numeric(X.index) and _is_categorical_and_is_datetime(X.index)
                )
            ):
                raise TypeError("The dataframe index is not datetime.")

            if self.missing_values == "raise":
                self._check_index_contains_na(X.index)

            self.variables_ = None

        else:
            # find or check for datetime variables
            self.variables_ = _find_or_check_datetime_variables(X, self.variables)

            # check if datetime variables contains na
            if self.missing_values == "raise":
                _check_contains_na(X, self.variables_)

        if self.features_to_extract is None:
            self.features_to_extract_ = FEATURES_DEFAULT
        elif isinstance(self.features_to_extract, str):
            self.features_to_extract_ = FEATURES_SUPPORTED
        else:
            self.features_to_extract_ = self.features_to_extract

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        # save train set shape
        self.n_features_in_ = X.shape[1]

        return self
Example #19
def compress_memory_usage(df_in: pd.DataFrame, replacer: dict = None):
    start_mem_usg = df_in.memory_usage().sum() / 1024**2
    cols_with_nas = []
    df = df_in.copy()  # Avoid changing input df
    for col in tqdm(df.columns, "DataFrame: compress_memory_usage"):
        if df[col].dtype != object and not is_datetime(df[col]):
            # make variables for Int, max and min
            is_int = False
            mx = df[col].max()
            mn = df[col].min()
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(df[col]).all():
                cols_with_nas.append(col)
                if replacer:
                    df[col].fillna(replacer[col], inplace=True)
                else:
                    df[col].fillna(mn - 1, inplace=True)

            as_int = df[col].fillna(0).astype(np.int64)
            result = (df[col] - as_int)
            result = result.sum()
            if -0.01 < result < 0.01:
                is_int = True
            # Make integer/unsigned integer types
            if is_int:
                if mn >= 0:
                    if mx < 255:
                        df[col] = df[col].astype(np.uint8)
                    elif mx < 65535:
                        df[col] = df[col].astype(np.uint16)
                    elif mx < 4294967295:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(
                            np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(
                            np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(
                            np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(
                            np.int64).max:
                        df[col] = df[col].astype(np.int64)
            # Make float data types 32 bit
            else:
                df[col] = df[col].astype(np.float32)

    mem_usg = df.memory_usage().sum() / 1024**2
    print('Memory usage pre-compression was {:.2f} MB'.format(start_mem_usg))
    print('Memory usage post-compression was {:.2f} MB'.format(mem_usg))
    print('This is {:.1f}% of the initial size'.format(100 * mem_usg /
                                                        start_mem_usg))
    return df, cols_with_nas
Example #20
def test_get_stations_info(param, expected_output):

    data_all = noaastn.get_stations_info(country=param)

    # check number of columns
    assert data_all.shape[1] == num_column

    # check type of the columns
    for i in range(9):
        assert data_all.dtypes[i] == object
    is_datetime(data_all["start"])
    is_datetime(data_all["end"])

    # match and check each col value pattern by comparing length
    row_df = data_all.sample(1)
    for col in col_len.keys():
        assert (pd.isna(row_df[col].values[0])
                or pd.isnull(row_df[col].values[0])
                or len(row_df[col].values[0]) == col_len[col])
Example #21
    def nan_filler(self):

        for col in self.df.columns:

            if is_datetime(self.df[col]) or is_categorical_dtype(self.df[col]):
                continue

            self.df[col].fillna(self.df[col].median(), inplace=True)

        return self.df
Example #22
    def reduce_mem_usage(self, use_float16: bool = False, info: bool = True):
        """
        Automatically distinguish the type of one single data and reset a suitable type.

        :param use_float16: use float16 or not
        :param info: stay True if a display of information is required
        """

        # the original memory usage
        start_mem = self.memory_usage().sum() / 1024**2

        # reduce the memory usage
        for col in self.columns:
            if is_datetime(self[col]) or is_categorical_dtype(self[col]):
                continue
            col_type = self[col].dtype

            if col_type != object:
                c_min = self[col].min()
                c_max = self[col].max()
                if str(col_type)[:3] == "int":
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                            np.int8).max:
                        self[col] = self[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
                            np.int16).max:
                        self[col] = self[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
                            np.int32).max:
                        self[col] = self[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
                            np.int64).max:
                        self[col] = self[col].astype(np.int64)
                else:
                    if use_float16 and c_min > np.finfo(
                            np.float16).min and c_max < np.finfo(
                                np.float16).max:
                        self[col] = self[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                            np.float32).max:
                        self[col] = self[col].astype(np.float32)
                    else:
                        self[col] = self[col].astype(np.float64)
            else:
                self[col] = self[col].astype("category")

        if info:
            end_mem = self.memory_usage().sum() / 1024**2
            print("Memory usage before optimization:\t{:.3f} MB".format(
                start_mem))
            print(
                "Memory usage after optimization:\t{:.3f} MB".format(end_mem))
            print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) /
                                                start_mem))
Example #23
def save_geodataframe(gdf, filename, output_dir, include_shp_files=False):
    if not gdf.empty:
        gdf = sanitiser.sanitise_geodataframe(gdf)
        persistence.ensure_dir(output_dir)
        gdf.to_file(os.path.join(output_dir, f'{filename}.geojson'), driver='GeoJSON')
        for col in [col for col in gdf.columns if is_datetime(gdf[col])]:
            gdf[col] = gdf[col].astype(str)
        if include_shp_files:
            shp_files = os.path.join(output_dir, 'shp_files')
            persistence.ensure_dir(shp_files)
            gdf.to_file(os.path.join(shp_files, f'{filename}.shp'))
Example #24
def reduce_mem_usage(df, use_float16=False, cols_exclude=[]):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.

    Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
    Modified to support timestamp type, categorical type
    Modified to add option to use float16
    """

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    cols = [c for c in df.columns if c not in cols_exclude]
    print(cols)

    for col in cols:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                        np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
                        np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
                        np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
                        np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(
                        np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) /
                                        start_mem))

    return df
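A quick run on a synthetic frame shows the downcasting (exact savings depend on the data):

import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime

df = pd.DataFrame({
    'small_int': np.arange(1000, dtype=np.int64),       # fits in int16
    'ratio': np.linspace(0, 1, 1000),                   # float64 -> float32
    'ts': pd.date_range('2021-01-01', periods=1000, freq='D'),  # skipped
})
df = reduce_mem_usage(df)
print(df.dtypes.astype(str).tolist())  # ['int16', 'float32', 'datetime64[ns]']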
Example #25
def validate_column(vals, n_not_null=N_NOT_NULL, min_observations_in_class=MIN_OBSERVATIONS_IN_CLASS):
    '''The function checks whether a columns can be modeled using ML models. It also classifies column into
    type of applicable models - binary, multiclass, regression or unknown.

    Args:
        vals: column to validate
        n_not_null: columns needs to have at least n_not_null values in order to be valid
        min_observations_in_class: the function will preserve only groups that have at least 
            `min_observations_in_class` observations

    Returns:
        boolean, str: assessment whether column is valid and either its classification or reason 
            for not being a valid column
    '''
    valid, reason = True, ''
    vals_not_na = vals.dropna()
    if vals_not_na.shape[0] < n_not_null:
        valid, reason = False, f'Not enough values ({vals_not_na.shape[0]}) provided, required: {n_not_null}'
        return valid, reason
    if is_datetime(vals_not_na):
        valid, reason = True, 'datetime'
        return valid, reason

    # determine column type
    n_unique = vals_not_na.unique().shape[0]
    if n_unique <= 1:
        valid, reason = False, 'Not enough classes (0 or 1).'
    elif n_unique > vals_not_na.shape[0]*0.9 and (vals_not_na.dtype != 'float64' and vals_not_na.dtype != 'int64'):
        valid, reason = True, 'identifier'
    elif n_unique == 2:
        valid, reason = True, 'binary'
    # 17 is somewhat arbitrary
    elif n_unique > 2 and ((n_unique < 17 and vals_not_na.dtype == 'float64') or \
                           (vals_not_na.dtype != 'float64' and vals_not_na.dtype != 'int64')):
        valid, reason = True, 'multiclass'
    elif n_unique > 2 and (vals_not_na.dtype == 'float64' or vals_not_na.dtype == 'int64'):
        valid, reason = True, 'regression'
    else:
        valid, reason = False, 'Type not known.'

    # second pass - this time we drop small groups for binary and multiclass variables
    # and check whether the type changed
    if reason == 'binary' or reason == 'multiclass':
        vals_not_na = drop_infrequent_groups(vals_not_na, min_observations_in_class)
        n_unique = vals_not_na.unique().shape[0]
        if n_unique <= 1:
            valid, reason = False, 'Not enough classes (0 or 1).'
        elif n_unique == 2:
            valid, reason = True, 'binary'
        else:  # otherwise the type - multiclass is preserved
            pass

    return valid, reason
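Illustrative calls (the defaults N_NOT_NULL and MIN_OBSERVATIONS_IN_CLASS, plus drop_infrequent_groups used in the second pass, come from the surrounding module; the examples below avoid the second pass):

import pandas as pd

dates = pd.Series(pd.date_range('2021-01-01', periods=100))
print(validate_column(dates, n_not_null=10, min_observations_in_class=5))
# (True, 'datetime')

nums = pd.Series(range(100), dtype='int64')
print(validate_column(nums, n_not_null=10, min_observations_in_class=5))
# (True, 'regression')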
Example #26
def reduce_mem_usage(df, use_float16=False):
    """
    Original function code is from:
        https://www.kaggle.com/aitude/ashrae-kfold-lightgbm-without-leak-1-08


    Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.
    """

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                        np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
                        np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
                        np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
                        np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(
                        np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) /
                                        start_mem))

    return df
Example #27
    def _coerce_dates(self, series: pd.Series, timezone_: timezone) -> pd.Series:
        """
        Method to parse dates in the pandas.DataFrame. Leverages the data timezone
        attribute to ensure correct comparison of dates.

        :param series: the series of dates to coerce
        :param timezone_: timezone to localize naive timestamps to
        :return: the series converted to UTC
        """
        if not is_datetime(series):
            series = pd.Series(series.map(lambda x: pd.Timestamp(x)))
        if not series.dt.tz:
            series = series.dt.tz_localize(timezone_)
        return series.dt.tz_convert(pytz.UTC)
Example #28
def assert_filled_in_dates(ser):
    """
    assert that a pandas.Series with datetime index has values for all days
    in between the minimum and maximum one.

    Parameters
    ----------
    ser : pandas.Series
        Series with datetime index
    """
    assert is_datetime(ser.index), 'ser index must be of dtype datetime'
    ndays = (ser.index.max() - ser.index.min()).days + 1
    assert ndays == len(ser), "there are gaps in dates index"
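For example, a series missing one interior day trips the assertion:

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime

full = pd.Series([1, 2, 3], index=pd.date_range('2021-01-01', periods=3))
assert_filled_in_dates(full)  # passes: 3 calendar days, 3 values

gappy = full.drop(pd.Timestamp('2021-01-02'))
# assert_filled_in_dates(gappy)  # AssertionError: there are gaps in dates index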
Example #29
def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        if col == 'id':
            df[col] = df[col].astype(str)
            continue
        if is_datetime(df[col]):
            # skip datetime type
            continue
        col_type = df[col].dtype
        if col_type == list:
            continue
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                        np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
                        np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
                        np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
                        np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min > np.finfo(
                        np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) /
                                        start_mem))

    return df
Example #30
def test_import_utils(test, data):
    output, renamed, missing = test._import_utils(name=deepcopy(data))
    assert output.shape == (3, 3)
    assert "ModelName" in output.columns
    assert list(output["ModelID"]) == list(data["pymodelid"])
    assert is_datetime(output["SnapshotTime"])
    assert "Performance" not in output.columns
    assert renamed == {
        "ModelID": "ModelID",
        "Name": "ModelName",
        "SnapshotTime": "SnapshotTime",
    }
    assert "Junk" not in output.columns
    assert "Treatment" not in output.columns