def test_dataframe():
    d2 = DirectAccessV2(
        api_key=DIRECTACCESS_API_KEY,
        client_id=DIRECTACCESS_CLIENT_ID,
        client_secret=DIRECTACCESS_CLIENT_SECRET,
        access_token=DIRECTACCESS_TOKEN,
    )
    df = d2.to_dataframe("rigs", pagesize=10000, deleteddate="null")

    # Check index is set to API endpoint "primary key"
    assert df.index.name == "RigID"

    # Check datetime64 dtypes
    assert is_datetime64_ns_dtype(df.CreatedDate)
    assert is_datetime64_ns_dtype(df.DeletedDate)
    assert is_datetime64_ns_dtype(df.SpudDate)
    assert is_datetime64_ns_dtype(df.UpdatedDate)

    # Check Int64 dtypes
    assert is_int64_dtype(df.PermitDepth)
    assert is_int64_dtype(df.FormationDepth)

    # Check float dtypes
    assert is_float_dtype(df.RigLatitudeWGS84)
    assert is_float_dtype(df.RigLongitudeWGS84)

def test_load_file_pandas_data_types(self):
    self.create_csv()
    csv_file = ForestCSVFile(self.csv_path())

    # Make sure the datetime fields are parsed as such
    self.assertTrue(
        ptypes.is_datetime64_ns_dtype(csv_file.data_frame.dtypes['Start Time']))
    self.assertTrue(
        ptypes.is_datetime64_ns_dtype(csv_file.data_frame.dtypes['End Time']))

    # Check that the boolean "Is Success" column is actually boolean
    self.assertTrue(
        ptypes.is_bool_dtype(csv_file.data_frame.dtypes['Is Success']))

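# A minimal sketch of how a loader can guarantee the dtypes asserted above,
# using pandas' own parsing options. ForestCSVFile's real implementation is
# not shown in this snippet and may differ.
import pandas as pd

class CSVFileSketch:
    def __init__(self, path):
        self.data_frame = pd.read_csv(
            path,
            parse_dates=['Start Time', 'End Time'],  # -> datetime64[ns]
            dtype={'Is Success': bool},              # -> bool
        )
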
def pandas_col_to_ibis_type(col):
    import numpy as np

    dty = col.dtype

    # datetime types
    if pdcom.is_datetime64tz_dtype(dty):
        return dt.Timestamp(str(dty.tz))

    if pdcom.is_datetime64_dtype(dty):
        if pdcom.is_datetime64_ns_dtype(dty):
            return dt.timestamp
        else:
            raise com.IbisTypeError(
                "Column {0} has dtype {1}, which is datetime64-like but "
                "does not use nanosecond units".format(col.name, dty))

    if pdcom.is_timedelta64_dtype(dty):
        print("Warning: encoding a timedelta64 as an int64")
        return dt.int64

    if pdcom.is_categorical_dtype(dty):
        return dt.Category(len(col.cat.categories))

    if pdcom.is_bool_dtype(dty):
        return dt.boolean

    # simple numerical types
    if issubclass(dty.type, np.int8):
        return dt.int8
    if issubclass(dty.type, np.int16):
        return dt.int16
    if issubclass(dty.type, np.int32):
        return dt.int32
    if issubclass(dty.type, np.int64):
        return dt.int64
    if issubclass(dty.type, np.float32):
        return dt.float
    if issubclass(dty.type, np.float64):
        return dt.double

    # unsigned ints are widened to the next signed type that can hold them;
    # uint64 has no signed superset and is rejected
    if issubclass(dty.type, np.uint8):
        return dt.int16
    if issubclass(dty.type, np.uint16):
        return dt.int32
    if issubclass(dty.type, np.uint32):
        return dt.int64
    if issubclass(dty.type, np.uint64):
        raise com.IbisTypeError(
            "Column {} is an unsigned int64".format(col.name))

    if pdcom.is_object_dtype(dty):
        return _infer_object_dtype(col)

    raise com.IbisTypeError("Column {0} is dtype {1}".format(col.name, dty))

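# Hypothetical usage sketch of pandas_col_to_ibis_type: derive an ibis type
# for every column of a DataFrame. schema_from_df is an invented helper name,
# not part of ibis' public API.
import pandas as pd

def schema_from_df(df: pd.DataFrame) -> dict:
    """Return a {column name: ibis dtype} mapping for df."""
    return {name: pandas_col_to_ibis_type(df[name]) for name in df.columns}

example = pd.DataFrame({
    'ts': pd.to_datetime(['2020-01-01', '2020-01-02']),
    'n': [1, 2],
})
print(schema_from_df(example))  # roughly {'ts': timestamp, 'n': int64}
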
def test_dataframe_v3(self):
    df = self.v3.to_dataframe("rigs", pagesize=1000, deleteddate="null")

    # Check index is set to API endpoint "primary keys"
    self.assertListEqual(df.index.names, ["CompletionID", "WellID"])

    # Check object dtypes
    self.assertTrue(is_object_dtype(df.API_UWI))
    self.assertTrue(is_object_dtype(df.ActiveStatus))

    # Check datetime64 dtypes
    self.assertTrue(is_datetime64_ns_dtype(df.DeletedDate))
    self.assertTrue(is_datetime64_ns_dtype(df.SpudDate))
    self.assertTrue(is_datetime64_ns_dtype(df.UpdatedDate))

    # Check Int64 dtypes
    self.assertTrue(is_int64_dtype(df.RatedWaterDepth))
    self.assertTrue(is_int64_dtype(df.RatedHP))

    # Check float dtypes
    self.assertTrue(is_float_dtype(df.RigLatitudeWGS84))
    self.assertTrue(is_float_dtype(df.RigLongitudeWGS84))

def test_convert_to_type_pos_01(self):
    df = pd.DataFrame({
        'date': ['05/06/2018', '05/04/2018'],
        'datetime': ['2018-06-05T10:07:31', '2018-04-05T21:56:14'],
        'number': ['1', '2.34'],
        'int': [4, 8103],
        'float': [4.0, 8103.0],
        'object': ['just some', 'strings'],
    })
    mapper = {
        'number': ['number'],
        'date': 'date',
        'datetime': ['datetime'],
        'integer': 'int',
        'float': ['float'],
    }

    res = convert_to_type(df, mapper, *mapper.keys())

    # Passing no explicit types should convert everything in the mapper
    assert_frame_equal(res, convert_to_type(df, mapper), check_like=True)

    self.assertTrue(ptypes.is_datetime64_ns_dtype(res['date'].dtype))
    self.assertTrue(ptypes.is_datetime64_ns_dtype(res['datetime'].dtype))
    self.assertTrue(ptypes.is_float_dtype(res['number'].dtype))
    self.assertTrue(ptypes.is_integer_dtype(res['int'].dtype))
    self.assertTrue(ptypes.is_float_dtype(res['float'].dtype))
    self.assertTrue(ptypes.is_object_dtype(res['object'].dtype))

from typing import Any, Generator, List, Optional, Tuple

import numpy as np
import pandas as pd
from pandas.api.types import (
    is_datetime64_any_dtype,
    is_datetime64_ns_dtype,
    is_timedelta64_dtype,
    is_timedelta64_ns_dtype,
)


def pandas_iter(
    df: pd.DataFrame,
    columns: List[str],
    mask: Optional[np.ndarray] = None,
) -> Generator[Tuple[Any, ...], None, None]:
    arrays = []
    for column in columns:
        srs = df.loc[:, column]
        if mask is not None:
            srs = srs[mask]
        if is_datetime64_any_dtype(srs) or is_datetime64_ns_dtype(srs):
            # re-box raw datetime64 values as pd.Timestamp
            arrays.append(map(pd.Timestamp, srs.values))
        elif is_timedelta64_dtype(srs) or is_timedelta64_ns_dtype(srs):
            # re-box raw timedelta64 values as pd.Timedelta
            arrays.append(map(pd.Timedelta, srs.values))
        else:
            arrays.append(srs.values)
    yield from zip(*arrays)

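# Hypothetical usage of pandas_iter: stream rows for two columns, keeping only
# rows where a boolean mask is True. Frame and column names are invented.
frame = pd.DataFrame({
    'when': pd.to_datetime(['2021-01-01', '2021-01-02', '2021-01-03']),
    'value': [10, 20, 30],
})
keep = frame['value'].to_numpy() > 10
for when, value in pandas_iter(frame, ['when', 'value'], mask=keep):
    print(when, value)  # 'when' arrives re-boxed as a pd.Timestamp
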
def test_convert_to_type_pos_02(self):
    df = pd.DataFrame({
        'date': ['05/06/2018', '05/04/2018'],
        'datetime': [1543844249621, 1543844249621],
        'number': ['1', '2.34'],
        'int': [4, 8103],
        'float': [4.0, 8103.0],
        'object': ['just some', 'strings'],
    })
    mapper = {
        'number': ['number'],
        'date': 'date',
        'datetime': ['datetime'],
        'integer': 'int',
        'float': ['float'],
    }
    # epoch milliseconds need an explicit unit for the datetime conversion
    kwargs_map = {'datetime': {'unit': 'ms'}}

    res = convert_to_type(df, mapper, *mapper.keys(), kwargs_map=kwargs_map)

    self.assertTrue(ptypes.is_datetime64_ns_dtype(res['datetime'].dtype))
    self.assertListEqual(res['datetime'].dt.year.tolist(), [2018, 2018])

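# A minimal sketch of the convert_to_type contract the two tests above
# exercise; the real implementation is not shown here and may differ. Assumes
# mapper maps a type name to a column name (or list of names), positional
# *types select which mapper entries to apply (all of them when omitted), and
# kwargs_map (keyed by type name here, a guess) forwards extra keyword
# arguments such as unit='ms' to the underlying pandas converter.
import pandas as pd

def convert_to_type_sketch(df, mapper, *types, kwargs_map=None):
    kwargs_map = kwargs_map or {}
    converters = {
        'date': pd.to_datetime,
        'datetime': pd.to_datetime,
        'number': pd.to_numeric,
        'float': pd.to_numeric,
        'integer': pd.to_numeric,
    }
    out = df.copy()
    for type_name in (types or mapper.keys()):
        columns = mapper[type_name]
        if isinstance(columns, str):
            columns = [columns]
        for column in columns:
            out[column] = converters[type_name](
                out[column], **kwargs_map.get(type_name, {}))
    return out
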
def stacked_bar_chart(
    df,
    cmap,
    value_key,
    group_by,
    title='',
    xlabel='Years',
    ylabel='Diff. Capacity [GW]',
    width=850,
    height=400,
    split_neg_pos_by=None,
    extra_lines=None,
    extra_lines_y_axis=None,
):
    """
    df: df to plot, in long format
    cmap: colour map dict with cmap[category key] = colour
    value_key: sets the height of the bar elements
    group_by: ['category', 'x-label'] -> category is used to stack and colour
        the bars, x-label distributes the stacked bars on the x axis
    title: graph title
    xlabel: x axis label
    ylabel: y axis label
    width: plot width in pixels
    height: plot height in pixels
    split_neg_pos_by: {column: [negative value, positive value]} filter for
        negative and positive values. Import/export NTC flows have both
        positive and negative values for each year, so the zero rows have to
        be dropped or the information cannot be plotted.
    extra_lines: dataframe of additional lines to plot
    extra_lines_y_axis: list of columns to put on the second y axis
    """
    # round the difference to 4 decimal places
    df = df.round(4)

    # set all positive numbers to 0
    all_negative = df.copy(deep=True)
    all_negative.loc[all_negative[value_key] >= 0, value_key] = 0
    all_negative.fillna(0, inplace=True)
    # keep only the rows flagged as negative by the split column
    if split_neg_pos_by:
        split_column = list(split_neg_pos_by.keys())[0]
        split_arguments = split_neg_pos_by[split_column]
        all_negative = all_negative[all_negative[split_column] == split_arguments[0]]
    all_negative.set_index(group_by, inplace=True)

    # set all negative numbers to 0
    all_positive = df.copy(deep=True)
    all_positive.loc[all_positive[value_key] <= 0, value_key] = 0
    all_positive.fillna(0, inplace=True)
    # keep only the rows flagged as positive by the split column
    if split_neg_pos_by:
        split_column = list(split_neg_pos_by.keys())[0]
        split_arguments = split_neg_pos_by[split_column]
        all_positive = all_positive[all_positive[split_column] == split_arguments[1]]
    all_positive.set_index(group_by, inplace=True)

    # groupings
    categories = sorted(list(df[group_by[0]].unique()))

    # x labels need to be strings, so convert datetimes first
    if is_datetime64_ns_dtype(df[group_by[1]]):
        xs = df[group_by[1]].dt.strftime('%Y.%m.%d - %H').unique()
    else:
        xs = df[group_by[1]].unique()
    xs = sorted(list(xs))  # maybe check if there is data for each tech and year?
    # create index
    df.set_index(group_by, inplace=True)
    idx = pd.IndexSlice

    # hover tools for the stacked bars and the extra lines
    hover_stack = HoverTool(tooltips=[
        ("%s: " % (group_by[0][0].upper() + group_by[0][1:]), "@cat"),
        ("%s: " % (group_by[1][0].upper() + group_by[1][1:]), "@x"),
        ("%s: " % (value_key[0].upper() + value_key[1:]), "@count"),
    ], names=['stack'])
    hover_lines = HoverTool(tooltips=[
        ("Type: ", "@type"),
        ("%s: " % (group_by[1][0].upper() + group_by[1][1:]), "@x"),
        ("Value: ", "@y"),
    ], names=['lines'])

    # plot tools
    tools = [
        PanTool, SaveTool, UndoTool, RedoTool,
        ZoomInTool, ZoomOutTool, BoxZoomTool, ResetTool,
    ]
    called_tools = [item() for item in tools] + [hover_stack, hover_lines]

    p = bplt.figure(plot_width=width, plot_height=height, title="",
                    x_range=xs, tools=called_tools, toolbar_location="above")

    # plot all the positive values
    lower_bound = np.array([0] * len(xs))  # lower bound for boxes
    upper_bound = np.array([0] * len(xs))  # upper bound for boxes
    positive_rs = []
    for index, cat in enumerate(categories):
        colour = cmap[cat]
        if cat in all_positive.index:
            values = all_positive.loc[idx[cat, :], value_key].values
        else:
            values = [0] * len(xs)
        upper_bound = lower_bound + values
        source = {
            'x': xs,
            'top': upper_bound,
            'bottom': lower_bound,
            'count': values,
            'cat': [cat] * len(xs),
        }
        positive_rs.append(
            p.vbar(source=source, x='x', top='top', bottom='bottom',
                   width=0.75, fill_color=colour, muted_color=colour,
                   muted_alpha=0.4, line_width=0.1, line_color="black",
                   name='stack'))
        # stack the next category on top of this one
        lower_bound = upper_bound

    # plot all the negative values
    lower_bound = np.array([0] * len(xs))
    upper_bound = np.array([0] * len(xs))
    negative_rs = []
    for index, cat in enumerate(categories):
        colour = cmap[cat]
        if cat in all_negative.index:
            values = all_negative.loc[idx[cat, :], value_key].values
        else:
            values = [0] * len(xs)
        upper_bound = lower_bound + values
        source = {
            'x': xs,
            'top': upper_bound,
            'bottom': lower_bound,
            'count': values,
            'cat': [cat] * len(xs),
        }
        negative_rs.append(
            p.vbar(source=source, x='x', top='top', bottom='bottom',
                   width=0.75, fill_color=colour, muted_color=colour,
                   muted_alpha=0.25, line_width=0.1, line_color="black",
                   name='stack'))
        lower_bound = upper_bound

    # plot extra lines if provided
    if extra_lines is not None:
        lines_to_plot = extra_lines

        # add extra y axis
        if extra_lines_y_axis is not None:
            # find min and max over the columns destined for the second axis
            _min = 99999
            _max = -99999
            for line in extra_lines_y_axis:
                if line in lines_to_plot.columns:
                    _min_column = lines_to_plot[line].min()
                    if _min_column < _min:
                        _min = _min_column
                    _max_column = lines_to_plot[line].max()
                    if _max_column > _max:
                        _max = _max_column
            # scale the window 10% larger than the actual min/max values
            if _min < 0:
                _min = 1.1 * _min
            else:
                _min = 0.9 * _min
            if _max > 0:
                _max = 1.1 * _max
            else:
                _max = 0.9 * _max
            # _min and _max must not be NaN
            if isnan(_min):
                _min = 0
            if isnan(_max):
                _max = 1
            p.extra_y_ranges = {"SecondYAxis": Range1d(start=_min, end=_max)}
            p.add_layout(LinearAxis(y_range_name="SecondYAxis"), 'right')

        # line colour map
        lines_cmap = Spectral[11]

        # retrieve the x values, converting a DatetimeIndex to strings
        if isinstance(lines_to_plot.index, pd.DatetimeIndex):
            x_all_values = list(lines_to_plot.index.strftime('%Y.%m.%d - %H'))
        else:
            x_all_values = list(lines_to_plot.index)

        # add a line and circle renderer per column
        for index, line in enumerate(lines_to_plot.columns):
            y = list(lines_to_plot[line].values)
            # get rid of NaN values
            xy = [item for item in zip(x_all_values, y) if not np.isnan(item[1])]
            source = {
                'x': [item[0] for item in xy],
                'y': [item[1] for item in xy],
                'type': [line] * len(xy),
            }
            colour = lines_cmap[index % len(lines_cmap)]
            if extra_lines_y_axis and line in extra_lines_y_axis:
                p.line(source=source, x='x', y='y', line_width=2,
                       color=colour, y_range_name='SecondYAxis', name='lines')
                p.circle(source=source, x='x', y='y', line_width=2,
                         color=colour, y_range_name='SecondYAxis', name='lines')
            else:
                p.line(source=source, x='x', y='y', line_width=2,
                       color=colour, name='lines')
                p.circle(source=source, x='x', y='y', line_width=2,
                         color=colour, name='lines')

    # create the legend
    legend_items = []
    for index, cat in enumerate(categories):
        legend_items.append((cat, [positive_rs[index], negative_rs[index]]))
    legend_items.reverse()
    legend = Legend(items=legend_items, location=(0, 0))
    legend.click_policy = "mute"
    p.add_layout(legend, 'right')

    if title:
        p.title.text = title

    # axes
    p.xaxis.axis_label = xlabel
    p.yaxis.axis_label = ylabel
    p.xaxis.major_label_orientation = pi / 2

    bplt.show(p)

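# Hypothetical call to stacked_bar_chart, assuming the Bokeh objects the
# function uses (bplt, HoverTool, Legend, the pre-3.0 plot_width/plot_height
# keywords, etc.) are in scope. Data and colour choices are invented.
import pandas as pd
from bokeh.palettes import Category10

diff = pd.DataFrame({
    'technology': ['wind', 'wind', 'solar', 'solar'],
    'year': ['2020', '2021', '2020', '2021'],
    'capacity_diff': [1.5, 2.1, -0.3, 0.8],
})
cmap = {'wind': Category10[3][0], 'solar': Category10[3][1]}
stacked_bar_chart(diff, cmap, value_key='capacity_diff',
                  group_by=['technology', 'year'],
                  title='Capacity changes by technology')
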
def test_to_utc(self):
    result = self.utils.to_utc(self.data.copy())
    self.assertTrue(is_datetime64_ns_dtype(result.index))
    self.assertTrue(is_datetime64tz_dtype(result.index))

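# A minimal to_utc sketch consistent with the two assertions above (the index
# ends up as a timezone-aware datetime64[ns] index in UTC); the real utility's
# behaviour may be richer.
import pandas as pd

def to_utc(df: pd.DataFrame) -> pd.DataFrame:
    index = pd.to_datetime(df.index)
    if index.tz is None:
        # assumption: naive timestamps are already UTC wall-clock times
        df.index = index.tz_localize('UTC')
    else:
        df.index = index.tz_convert('UTC')
    return df
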
def infer_fields_from_df(
    self,
    df: pd.DataFrame,
    entities: Optional[List[Entity]] = None,
    features: Optional[List[Feature]] = None,
    replace_existing_features: bool = False,
    replace_existing_entities: bool = False,
    discard_unused_fields: bool = False,
    rows_to_sample: int = 100,
):
    """
    Adds fields (Features or Entities) to a feature set based on the schema
    of a DataFrame. Only Pandas dataframes are supported. All columns are
    detected as features, so setting at least one entity manually is advised.

    Args:
        df: Pandas dataframe to read schema from
        entities: List of entities that will be set manually and not
            inferred. These will take precedence over any existing entities
            or entities found in the dataframe.
        features: List of features that will be set manually and not
            inferred. These will take precedence over any existing feature
            or features found in the dataframe.
        replace_existing_features: If true, will replace existing features
            in this feature set with features found in dataframe. If false,
            will skip conflicting features.
        replace_existing_entities: If true, will replace existing entities
            in this feature set with features found in dataframe. If false,
            will skip conflicting entities.
        discard_unused_fields: Boolean flag. Setting this to True will
            discard any existing fields that are not found in the dataset
            or provided by the user.
        rows_to_sample: Number of rows to sample to infer types. All rows
            must have consistent types, even values within list types must
            be homogeneous.
    """
    if entities is None:
        entities = list()
    if features is None:
        features = list()

    # Validate whether the datetime column exists with the right name
    if DATETIME_COLUMN not in df:
        raise Exception("No column 'datetime'")

    # Validate the data type for the datetime column
    if not is_datetime64_ns_dtype(df.dtypes[DATETIME_COLUMN]):
        raise Exception(
            "Column 'datetime' does not have the correct type: datetime64[ns]"
        )

    # Create dictionary of fields that will not be inferred (manually set)
    provided_fields = OrderedDict()
    for field in entities + features:
        if not isinstance(field, Field):
            raise Exception(f"Invalid field object type provided {type(field)}")
        if field.name not in provided_fields:
            provided_fields[field.name] = field
        else:
            raise Exception(f"Duplicate field name detected {field.name}.")

    new_fields = self._fields.copy()
    output_log = ""

    # Add in provided fields
    for name, field in provided_fields.items():
        if name in new_fields.keys():
            upsert_message = "updated (replacing an existing field)"
        else:
            upsert_message = "created"
        output_log += (
            f"{type(field).__name__} {field.name} "
            f"({field.dtype}) manually {upsert_message}.\n"
        )
        new_fields[name] = field

    # Iterate over all of the columns and create features
    for column in df.columns:
        column = column.strip()

        # Skip datetime column
        if DATETIME_COLUMN in column:
            continue

        # Skip user provided fields
        if column in provided_fields.keys():
            continue

        # Only overwrite conflicting fields if replacement is allowed
        if column in new_fields:
            if (
                isinstance(self._fields[column], Feature)
                and not replace_existing_features
            ):
                continue
            if (
                isinstance(self._fields[column], Entity)
                and not replace_existing_entities
            ):
                continue

        # Store this field as a feature
        new_fields[column] = Feature(
            name=column,
            dtype=_infer_pd_column_type(column, df[column], rows_to_sample),
        )
        output_log += (
            f"{type(new_fields[column]).__name__} {new_fields[column].name} "
            f"({new_fields[column].dtype}) added from dataframe.\n"
        )

    # Discard unused fields from feature set
    if discard_unused_fields:
        keys_to_remove = []
        for key in new_fields.keys():
            if not (key in df.columns or key in provided_fields.keys()):
                output_log += (
                    f"{type(new_fields[key]).__name__} {new_fields[key].name} "
                    f"({new_fields[key].dtype}) removed because it is unused.\n"
                )
                keys_to_remove.append(key)
        for key in keys_to_remove:
            del new_fields[key]

    # Update feature set
    self._fields = new_fields
    print(output_log)

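# Hypothetical usage of infer_fields_from_df, assuming an existing FeatureSet
# instance named feature_set and Feast's top-level Entity/ValueType exports.
# The 'datetime' column must already be datetime64[ns] or the call raises.
import pandas as pd
from feast import Entity, ValueType

events = pd.DataFrame({
    'datetime': pd.to_datetime(['2021-01-01', '2021-01-02']),
    'customer_id': [1, 2],
    'total_spend': [10.5, 3.2],
})
feature_set.infer_fields_from_df(
    events,
    entities=[Entity(name='customer_id', dtype=ValueType.INT64)],
)
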
def update_from_dataset(self, df: pd.DataFrame, column_mapping=None):
    """
    Updates Feature Set values based on the data set. Only Pandas dataframes
    are supported.

    :param column_mapping: Dictionary of column names to resource
        (entity, feature) mapping. Forces the interpretation of a column as
        either an entity or feature.
        Example: {"driver_id": Entity(name="driver", dtype=ValueType.INT64)}
    :param df: Pandas dataframe containing datetime column, entity columns,
        and feature columns.
    """
    fields = OrderedDict()
    existing_entities = self._client.entities if self._client is not None else None

    # Validate whether the datetime column exists with the right name
    if DATETIME_COLUMN not in df:
        raise Exception("No column 'datetime'")

    # Validate the data type for the datetime column
    if not is_datetime64_ns_dtype(df.dtypes[DATETIME_COLUMN]):
        raise Exception(
            "Column 'datetime' does not have the correct type: datetime64[ns]"
        )

    # Iterate over all of the columns and detect their class (feature, entity) and type
    for column in df.columns:
        column = column.strip()

        # Skip datetime column
        if DATETIME_COLUMN in column:
            continue

        # Use entity or feature value if provided by the column mapping
        if column_mapping and column in column_mapping:
            if issubclass(type(column_mapping[column]), Field):
                fields[column] = column_mapping[column]
                continue
            raise ValueError(
                "Invalid resource type specified at column name " + column
            )

        # Test whether this column is an existing entity (globally)
        if existing_entities and column in existing_entities:
            entity = existing_entities[column]
            # Test whether the registered entity type matches the user provided type
            if entity.dtype == dtype_to_value_type(df[column].dtype):
                # Store this field as an entity
                fields[column] = entity
                continue

        # Ignore fields that already exist
        if column in self._fields:
            continue

        # Store this field as a feature
        fields[column] = Feature(
            name=column, dtype=pandas_dtype_to_feast_value_type(df[column].dtype)
        )

    if len([field for field in fields.values() if type(field) == Entity]) == 0:
        raise Exception(
            "Could not detect entity column(s). Please provide entity column(s)."
        )
    if len([field for field in fields.values() if type(field) == Feature]) == 0:
        raise Exception(
            "Could not detect feature column(s). Please provide feature column(s)."
        )

    self._add_fields(list(fields.values()))

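# Hypothetical call mirroring the docstring example above: column_mapping
# forces 'driver_id' to be read as an entity instead of an inferred feature
# (the feature_set instance and the import path are assumptions).
import pandas as pd
from feast import Entity, ValueType

trips = pd.DataFrame({
    'datetime': pd.to_datetime(['2021-01-01', '2021-01-02']),
    'driver_id': [1001, 1002],
    'trips_today': [5, 8],
})
feature_set.update_from_dataset(
    trips,
    column_mapping={'driver_id': Entity(name='driver', dtype=ValueType.INT64)},
)
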
from pandas.api.types import is_datetime64_ns_dtype


def is_datetime_dtype(argument):
    return is_datetime64_ns_dtype(argument)

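# Quick sanity checks for the wrapper above. Note that is_datetime64_ns_dtype
# also accepts raw dtypes, and returns True for tz-aware datetime64[ns, tz].
import pandas as pd

print(is_datetime_dtype(pd.Series(pd.to_datetime(['2021-01-01']))))  # True
print(is_datetime_dtype(pd.Series([1, 2, 3])))                       # False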