Example #1
def test_dataframe():
    d2 = DirectAccessV2(
        api_key=DIRECTACCESS_API_KEY,
        client_id=DIRECTACCESS_CLIENT_ID,
        client_secret=DIRECTACCESS_CLIENT_SECRET,
        access_token=DIRECTACCESS_TOKEN,
    )
    df = d2.to_dataframe("rigs", pagesize=10000, deleteddate="null")

    # Check index is set to API endpoint "primary key"
    assert df.index.name == "RigID"

    # Check datetime64 dtypes
    assert is_datetime64_ns_dtype(df.CreatedDate)
    assert is_datetime64_ns_dtype(df.DeletedDate)
    assert is_datetime64_ns_dtype(df.SpudDate)
    assert is_datetime64_ns_dtype(df.UpdatedDate)

    # Check Int64 dtypes
    assert is_int64_dtype(df.PermitDepth)
    assert is_int64_dtype(df.FormationDepth)

    # Check float dtypes
    assert is_float_dtype(df.RigLatitudeWGS84)
    assert is_float_dtype(df.RigLongitudeWGS84)

    return
Example #2
    def test_load_file_pandas_data_types(self):
        self.create_csv()

        csv_file = ForestCSVFile(self.csv_path())
        # Making sure the datetime fields are parsed as such
        self.assertTrue(
            ptypes.is_datetime64_ns_dtype(
                csv_file.data_frame.dtypes['Start Time']))
        self.assertTrue(
            ptypes.is_datetime64_ns_dtype(
                csv_file.data_frame.dtypes['End Time']))
        # Checking if the boolean "is Success" column is actually boolean
        self.assertTrue(
            ptypes.is_bool_dtype(csv_file.data_frame.dtypes['Is Success']))
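Background for the dtype checks in Example #2: pandas.read_csv with parse_dates yields datetime64[ns] columns, and a column containing only True/False is read as bool. A minimal self-contained sketch, independent of the ForestCSVFile helper (column names copied from the test, data made up):

import io
import pandas as pd
from pandas.api import types as ptypes

csv = io.StringIO("Start Time,End Time,Is Success\n"
                  "2021-01-01 08:00,2021-01-01 09:00,True\n"
                  "2021-01-02 08:00,2021-01-02 09:30,False\n")
frame = pd.read_csv(csv, parse_dates=["Start Time", "End Time"])

assert ptypes.is_datetime64_ns_dtype(frame.dtypes["Start Time"])
assert ptypes.is_datetime64_ns_dtype(frame.dtypes["End Time"])
assert ptypes.is_bool_dtype(frame.dtypes["Is Success"])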
Example #3
def pandas_col_to_ibis_type(col):
    import numpy as np
    dty = col.dtype

    # datetime types
    if pdcom.is_datetime64tz_dtype(dty):
        return dt.Timestamp(str(dty.tz))

    if pdcom.is_datetime64_dtype(dty):
        if pdcom.is_datetime64_ns_dtype(dty):
            return dt.timestamp
        else:
            raise com.IbisTypeError("Column {0} has dtype {1}, which is "
                                    "datetime64-like but does "
                                    "not use nanosecond units".format(
                                        col.name, dty))
    if pdcom.is_timedelta64_dtype(dty):
        print("Warning: encoding a timedelta64 as an int64")
        return dt.int64

    if pdcom.is_categorical_dtype(dty):
        return dt.Category(len(col.cat.categories))

    if pdcom.is_bool_dtype(dty):
        return dt.boolean

    # simple numerical types
    if issubclass(dty.type, np.int8):
        return dt.int8
    if issubclass(dty.type, np.int16):
        return dt.int16
    if issubclass(dty.type, np.int32):
        return dt.int32
    if issubclass(dty.type, np.int64):
        return dt.int64
    if issubclass(dty.type, np.float32):
        return dt.float
    if issubclass(dty.type, np.float64):
        return dt.double
    if issubclass(dty.type, np.uint8):
        return dt.int16
    if issubclass(dty.type, np.uint16):
        return dt.int32
    if issubclass(dty.type, np.uint32):
        return dt.int64
    if issubclass(dty.type, np.uint64):
        raise com.IbisTypeError("Column {} is an unsigned int64".format(
            col.name))

    if pdcom.is_object_dtype(dty):
        return _infer_object_dtype(col)

    raise com.IbisTypeError("Column {0} is dtype {1}".format(col.name, dty))
Example #4
    def test_dataframe_v3(self):
        df = self.v3.to_dataframe("rigs", pagesize=1000, deleteddate="null")

        # Check index is set to API endpoint "primary keys"
        self.assertListEqual(df.index.names, ["CompletionID", "WellID"])

        # Check object dtypes
        self.assertTrue(is_object_dtype(df.API_UWI))
        self.assertTrue(is_object_dtype(df.ActiveStatus))

        # Check datetime64 dtypes
        self.assertTrue(is_datetime64_ns_dtype(df.DeletedDate))
        self.assertTrue(is_datetime64_ns_dtype(df.SpudDate))
        self.assertTrue(is_datetime64_ns_dtype(df.UpdatedDate))

        # Check Int64 dtypes
        self.assertTrue(is_int64_dtype(df.RatedWaterDepth))
        self.assertTrue(is_int64_dtype(df.RatedHP))

        # Check float dtypes
        self.assertTrue(is_float_dtype(df.RigLatitudeWGS84))
        self.assertTrue(is_float_dtype(df.RigLongitudeWGS84))
Example #5
 def test_convert_to_type_pos_01(self):
     df = pd.DataFrame({
         'date': ['05/06/2018', '05/04/2018'],
         'datetime': ['2018-06-05T10:07:31', '2018-04-05T21:56:14'],
         'number': ['1', '2.34'],
         'int': [4, 8103],
         'float': [4.0, 8103.0],
         'object': ['just some', 'strings']
     })
     mapper = {
         'number': ['number'],
         'date': 'date',
         'datetime': ['datetime'],
         'integer': 'int',
         'float': ['float']
     }
     res = convert_to_type(df, mapper, *mapper.keys())
     assert_frame_equal(res, convert_to_type(df, mapper), check_like=True)
     self.assertTrue(ptypes.is_datetime64_ns_dtype(res['date'].dtype))
     self.assertTrue(ptypes.is_datetime64_ns_dtype(res['datetime'].dtype))
     self.assertTrue(ptypes.is_float_dtype(res['number'].dtype))
     self.assertTrue(ptypes.is_integer_dtype(res['int'].dtype))
     self.assertTrue(ptypes.is_float_dtype(res['float'].dtype))
     self.assertTrue(ptypes.is_object_dtype(res['object'].dtype))
Example #6
def pandas_iter(
        df: pd.DataFrame,
        columns: List[str],
        mask: Optional[np.ndarray] = None
) -> Generator[List[Any], None, None]:
    arrays = []

    for column in columns:
        srs = df.loc[:, column]

        if mask is not None:
            srs = srs[mask]

        if is_datetime64_any_dtype(srs) or is_datetime64_ns_dtype(srs):
            arrays.append(map(pd.Timestamp, srs.values))
        elif is_timedelta64_dtype(srs) or is_timedelta64_ns_dtype(srs):
            arrays.append(map(pd.Timedelta, srs.values))
        else:
            arrays.append(srs.values)
    yield from zip(*arrays)
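A hypothetical call to the pandas_iter generator above (the frame and column names below are made up): datetime columns arrive as pd.Timestamp objects rather than raw numpy datetime64 values.

import pandas as pd

df = pd.DataFrame({
    "ts": pd.to_datetime(["2020-01-01", "2020-01-02"]),
    "value": [1, 2],
})
for ts, value in pandas_iter(df, ["ts", "value"]):
    print(type(ts).__name__, value)  # prints "Timestamp 1" then "Timestamp 2"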
Example #7
 def test_convert_to_type_pos_02(self):
     df = pd.DataFrame({
         'date': ['05/06/2018', '05/04/2018'],
         'datetime': [1543844249621, 1543844249621],
         'number': ['1', '2.34'],
         'int': [4, 8103],
         'float': [4.0, 8103.0],
         'object': ['just some', 'strings']
     })
     mapper = {
         'number': ['number'],
         'date': 'date',
         'datetime': ['datetime'],
         'integer': 'int',
         'float': ['float']
     }
     kwargs_map = {'datetime': {'unit': 'ms'}}
     res = convert_to_type(df,
                           mapper,
                           *mapper.keys(),
                           kwargs_map=kwargs_map)
     self.assertTrue(ptypes.is_datetime64_ns_dtype(res['datetime'].dtype))
     self.assertListEqual(res['datetime'].dt.year.tolist(), [2018, 2018])
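Background for the kwargs_map above: pd.to_datetime with unit="ms" interprets integers as epoch milliseconds, which is what lets 1543844249621 resolve to a 2018 timestamp. A minimal sketch independent of the convert_to_type helper:

import pandas as pd

ms = pd.Series([1543844249621, 1543844249621])
converted = pd.to_datetime(ms, unit="ms")
print(converted.dt.year.tolist())  # [2018, 2018]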
Example #8
def stacked_bar_chart(
    df,
    cmap,
    value_key,
    group_by,
    title='',
    xlabel='Years',
    ylabel='Diff. Capacity [GW]',
    width=850,
    height=400,
    split_neg_pos_by=None,
    extra_lines=None,
    extra_lines_y_axis=None,
):
    """
    df: dataframe to plot, in long format
    cmap: colour map dict with [category key] = colour
    value_key: sets the height of the bar elements
    group_by: ['category', 'x-label'] -> category is used to stack the bars and colour them, x-label distributes the stacked bars on the x axis
    title: Graph title
    xlabel:
    ylabel:
    width:
    height:
    split_neg_pos_by: values are filtered into positive and negative parts. For import/export NTC flows each year has both positive and negative values,
                so the 0 values have to be dropped, otherwise the information cannot be plotted.
    extra_lines: dataframe like for line plot
    extra_lines_y_axis: list of columns to go to the second y axis

    """

    # round the values to 4 decimal places
    df = df.round(4)

    # set all positive numbers to 0
    all_negative = df.copy(deep=True)
    all_negative.loc[all_negative[value_key] >= 0, value_key] = 0
    all_negative.fillna(0, inplace=True)
    # if split_neg_pos_by is given, keep only the rows matching the negative split value
    if split_neg_pos_by:
        split_column = list(split_neg_pos_by.keys())[0]
        split_arguments = split_neg_pos_by[split_column]
        all_negative = all_negative[all_negative[split_column] ==
                                    split_arguments[0]]
    all_negative.set_index(group_by, inplace=True)

    # set all negative numbers to 0
    all_positive = df.copy(deep=True)
    all_positive.loc[all_positive[value_key] <= 0, value_key] = 0
    all_positive.fillna(0, inplace=True)
    # if split_neg_pos_by is given, keep only the rows matching the positive split value
    if split_neg_pos_by:
        split_column = list(split_neg_pos_by.keys())[0]
        split_arguments = split_neg_pos_by[split_column]
        all_positive = all_positive[all_positive[split_column] ==
                                    split_arguments[1]]
    all_positive.set_index(group_by, inplace=True)

    # groupings
    categories = sorted(list(
        df[group_by[0]].unique()))  # sorted for a stable, deterministic order

    # check the type of x labels - need to be converted to strings
    if is_datetime64_ns_dtype(df[group_by[1]]):
        # convert datetime to string
        xs = df[group_by[1]].dt.strftime('%Y.%m.%d - %H').unique()

    else:
        xs = df[group_by[1]].unique()

    xs = sorted(
        list(xs))  # maybe check if there is data for each tech and year?

    # create index
    df.set_index(group_by, inplace=True)

    idx = pd.IndexSlice

    # create the figure handle

    hover_stack = HoverTool(tooltips=[
        ("%s: " % (group_by[0][0].upper() + group_by[0][1:]), "@cat"),
        ("%s: " % (group_by[1][0].upper() + group_by[1][1:]), "@x"),
        ("%s: " % (value_key.upper() + value_key[1][1:]), "@count"),
    ],
                            names=['stack'])

    hover_lines = HoverTool(tooltips=[
        ("Type: ", "@type"),
        ("%s: " % (group_by[1][0].upper() + group_by[1][1:]), "@x"),
        ("Value: ", "@y"),
    ],
                            names=['lines'])

    # plot tools
    tools = [
        PanTool, SaveTool, UndoTool, RedoTool, ZoomInTool, ZoomOutTool,
        BoxZoomTool, ResetTool
    ]
    called_tools = [item() for item in tools] + [hover_stack, hover_lines]

    p = bplt.figure(plot_width=width,
                    plot_height=height,
                    title="",
                    x_range=xs,
                    tools=called_tools,
                    toolbar_location="above")

    # plot all the positive values
    lower_bound = np.array([0] * len(xs))  # lower bound for boxes
    upper_bound = np.array([0] * len(xs))  # upper bound for boxes
    positive_rs = []
    for index, cat in enumerate(categories):
        # if df.loc[idx[cat,:], value_key].sum() != 0:

        colour = cmap[cat]

        if cat in all_positive.index:
            values = all_positive.loc[idx[cat, :], value_key].values
        else:
            values = [0] * len(xs)

        upper_bound = lower_bound + values

        source = {
            'x': xs,
            'top': upper_bound,
            'bottom': lower_bound,
            'count': values,
            'cat': [cat] * len(xs)
        }

        positive_rs.append \
            (p.vbar(source=source, x='x', top='top', bottom='bottom', width=0.75, fill_color=colour, muted_color=colour, muted_alpha=0.4, line_width=0.1, line_color="black", name='stack'))

        # set lower_bound to upper_bound
        lower_bound = upper_bound

    # plot all the negative values
    lower_bound = np.array([0] * len(xs))  # lower bound for boxes
    upper_bound = np.array([0] * len(xs))  # upper bound for boxes
    negative_rs = []
    for index, cat in enumerate(categories):
        # if df.loc[idx[cat,:], value_key].sum() != 0:
        colour = cmap[cat]

        if cat in all_negative.index:
            values = all_negative.loc[idx[cat, :], value_key].values
        else:
            values = [0] * len(xs)

        upper_bound = lower_bound + values

        source = {
            'x': xs,
            'top': upper_bound,
            'bottom': lower_bound,
            'count': values,
            'cat': [cat] * len(xs)
        }

        # negative_rs.append(p.vbar(xs, 0.7, upper_bound, lower_bound, fill_color=colour, line_color="black", name=cat, source=source))
        negative_rs.append \
            (p.vbar(source=source, x='x', top='top', bottom='bottom', width=0.75, fill_color=colour, muted_color=colour, muted_alpha=0.25, line_width=0.1, line_color="black", name='stack'))

        # set lower_bound to upper_bound
        lower_bound = upper_bound

    # plot extra lines if provided
    if extra_lines is not None:
        lines_to_plot = extra_lines

        # add extra y axis
        if extra_lines_y_axis is not None:
            # get min, max for
            _min = 99999
            _max = -99999
            for line in extra_lines_y_axis:
                if line in lines_to_plot.columns:
                    _min_column = lines_to_plot[line].min()
                    if _min_column < _min:
                        _min = _min_column

                    _max_column = lines_to_plot[line].max()
                    if _max_column > _max:
                        _max = _max_column

            # scale the window 10% larger than the actual min, max values to be plotted
            if _min < 0:
                _min = 1.1 * _min
            else:
                _min = 0.9 * _min

            if _max > 0:
                _max = 1.1 * _max
            else:
                _max = 0.9 * _max

            # check that _min, _max cannot be nan
            if isnan(_min):
                _min = 0

            if isnan(_max):
                _max = 1

            p.extra_y_ranges = {"SecondYAxis": Range1d(start=_min, end=_max)}
            p.add_layout(LinearAxis(y_range_name="SecondYAxis"), 'right')

        # lines colour map
        # setup the colour map
        lines_cmap = Spectral[11]

        # retrieve the x values

        # convert index to string if necessary
        if isinstance(lines_to_plot.index, pd.DatetimeIndex):
            x_all_values = list(lines_to_plot.index.strftime('%Y.%m.%d - %H'))
        else:
            x_all_values = list(lines_to_plot.index)

        # add a line renderer
        legend_items = []
        for index, line in enumerate(lines_to_plot.columns):
            new_line = []

            y = list(lines_to_plot[line].values)

            # get rid of NaN values
            xy = [
                item for item in zip(x_all_values, y) if not np.isnan(item[1])
            ]
            # x = [item[0] for item in xy]
            # y = [item[1] for item in xy]
            source = {
                'x': [item[0] for item in xy],
                'y': [item[1] for item in xy],
                'type': [line] * len(xy)
            }

            # change to source and change hover tool for circles!

            if extra_lines_y_axis and line in extra_lines_y_axis:
                new_line.append(
                    p.line(source=source,
                           x='x',
                           y='y',
                           line_width=2,
                           color=lines_cmap[index % len(lines_cmap)],
                           y_range_name='SecondYAxis',
                           name='lines'))

                new_line.append(
                    p.circle(source=source,
                             x='x',
                             y='y',
                             line_width=2,
                             color=lines_cmap[index % len(lines_cmap)],
                             y_range_name='SecondYAxis',
                             name='lines'))

            else:
                new_line.append(
                    p.line(source=source,
                           x='x',
                           y='y',
                           line_width=2,
                           color=lines_cmap[index % len(lines_cmap)],
                           name='lines'))

                new_line.append(
                    p.circle(source=source,
                             x='x',
                             y='y',
                             line_width=2,
                             color=lines_cmap[index % len(lines_cmap)],
                             name='lines'))

    # create the legend
    legend_items = []
    for index, cat in enumerate(categories):
        new_item = (cat, [positive_rs[index], negative_rs[index]])
        legend_items.append(new_item)

    legend_items.reverse()

    legend = Legend(items=legend_items, location=(0, 0))

    # legend.legend.location = 'top_left'
    legend.click_policy = "mute"

    p.add_layout(legend, 'right')

    if title:
        p.title.text = title

    # axes
    p.xaxis.axis_label = xlabel
    p.yaxis.axis_label = ylabel
    p.xaxis.major_label_orientation = pi / 2

    bplt.show(p)
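A hypothetical shape for the input that stacked_bar_chart above expects: a long-format frame where group_by[0] is the stacking category and group_by[1] is the x-axis label. All names and values below are illustrative, not taken from the source project:

import pandas as pd

df_long = pd.DataFrame({
    "technology": ["wind", "solar", "wind", "solar"],
    "year": [2020, 2020, 2021, 2021],
    "capacity_diff": [1.5, -0.3, 2.0, 0.8],
})
cmap = {"wind": "#1f77b4", "solar": "#ff7f0e"}
# stacked_bar_chart(df_long, cmap, value_key="capacity_diff",
#                   group_by=["technology", "year"])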
Example #9
 def test_to_utc(self):
     result = self.utils.to_utc(self.data.copy())
     self.assertTrue(is_datetime64_ns_dtype(result.index))
     self.assertTrue(is_datetime64tz_dtype(result.index))
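Why both assertions in test_to_utc can pass at once: a UTC-localized DatetimeIndex has dtype datetime64[ns, UTC], which satisfies is_datetime64_ns_dtype as well as is_datetime64tz_dtype. A minimal sketch (the index below is made up):

import pandas as pd
from pandas.api.types import is_datetime64_ns_dtype, is_datetime64tz_dtype

idx = pd.date_range("2021-01-01", periods=3, freq="D", tz="UTC")
assert is_datetime64_ns_dtype(idx)
assert is_datetime64tz_dtype(idx)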
Example #10
    def infer_fields_from_df(
        self,
        df: pd.DataFrame,
        entities: Optional[List[Entity]] = None,
        features: Optional[List[Feature]] = None,
        replace_existing_features: bool = False,
        replace_existing_entities: bool = False,
        discard_unused_fields: bool = False,
        rows_to_sample: int = 100,
    ):
        """

        Adds fields (Features or Entities) to a feature set based on the schema
        of a dataframe. Only Pandas dataframes are supported. All columns are
        detected as features, so setting at least one entity manually is
        advised.

        Args:
            df: Pandas dataframe to read schema from
            entities: List of entities that will be set manually and not
                inferred. These will take precedence over any existing entities
                or entities found in the dataframe.
            features: List of features that will be set manually and not
                inferred. These will take precedence over any existing feature
                or features found in the dataframe.
            replace_existing_features: If true, will replace
                existing features in this feature set with features found in
                dataframe. If false, will skip conflicting features.
            replace_existing_entities: If true, will replace existing entities
                in this feature set with features found in dataframe. If false,
                will skip conflicting entities.
            discard_unused_fields: Boolean flag. Setting this to True will
                discard any existing fields that are not found in the dataset or
                provided by the user
            rows_to_sample: Number of rows to sample to infer types. All rows
                must have consistent types, even values within list types must
                be homogeneous
        """
        if entities is None:
            entities = list()
        if features is None:
            features = list()

        # Validate whether the datetime column exists with the right name
        if DATETIME_COLUMN not in df:
            raise Exception("No column 'datetime'")

        # Validate the data type for the datetime column
        if not is_datetime64_ns_dtype(df.dtypes[DATETIME_COLUMN]):
            raise Exception(
                "Column 'datetime' does not have the correct type: datetime64[ns]"
            )

        # Create dictionary of fields that will not be inferred (manually set)
        provided_fields = OrderedDict()

        for field in entities + features:
            if not isinstance(field, Field):
                raise Exception(f"Invalid field object type provided {type(field)}")
            if field.name not in provided_fields:
                provided_fields[field.name] = field
            else:
                raise Exception(f"Duplicate field name detected {field.name}.")

        new_fields = self._fields.copy()
        output_log = ""

        # Add in provided fields
        for name, field in provided_fields.items():
            if name in new_fields.keys():
                upsert_message = "updated (replacing an existing field)"
            else:
                upsert_message = "created"

            output_log += (
                f"{type(field).__name__} {field.name}"
                f"({field.dtype}) manually {upsert_message}.\n"
            )
            new_fields[name] = field

        # Iterate over all of the columns and create features
        for column in df.columns:
            column = column.strip()

            # Skip datetime column
            if DATETIME_COLUMN in column:
                continue

            # Skip user provided fields
            if column in provided_fields.keys():
                continue

            # Only overwrite conflicting fields if replacement is allowed
            if column in new_fields:
                if (
                    isinstance(self._fields[column], Feature)
                    and not replace_existing_features
                ):
                    continue

                if (
                    isinstance(self._fields[column], Entity)
                    and not replace_existing_entities
                ):
                    continue

            # Store this field as a feature
            new_fields[column] = Feature(
                name=column,
                dtype=_infer_pd_column_type(column, df[column], rows_to_sample),
            )

            output_log += f"{type(new_fields[column]).__name__} {new_fields[column].name} ({new_fields[column].dtype}) added from dataframe.\n"

        # Discard unused fields from feature set
        if discard_unused_fields:
            keys_to_remove = []
            for key in new_fields.keys():
                if not (key in df.columns or key in provided_fields.keys()):
                    output_log += f"{type(new_fields[key]).__name__} {new_fields[key].name} ({new_fields[key].dtype}) removed because it is unused.\n"
                    keys_to_remove.append(key)
            for key in keys_to_remove:
                del new_fields[key]

        # Update feature set
        self._fields = new_fields
        print(output_log)
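The validation above requires a column literally named datetime with dtype datetime64[ns]. A minimal sketch of preparing such a frame before calling infer_fields_from_df (the other column names are made up):

import pandas as pd

df = pd.DataFrame({
    "datetime": pd.to_datetime(["2021-01-01", "2021-01-02"]),
    "driver_id": [1, 2],
    "trips_today": [5, 7],
})
assert str(df.dtypes["datetime"]) == "datetime64[ns]"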
Example #11
    def update_from_dataset(self, df: pd.DataFrame, column_mapping=None):
        """
        Updates Feature Set values based on the data set. Only Pandas dataframes are supported.
        :param column_mapping: Dictionary of column names to resource (entity, feature) mapping. Forces the interpretation
        of a column as either an entity or feature. Example: {"driver_id": Entity(name="driver", dtype=ValueType.INT64)}
        :param df: Pandas dataframe containing datetime column, entity columns, and feature columns.
        """

        fields = OrderedDict()
        existing_entities = self._client.entities if self._client is not None else None

        # Validate whether the datetime column exists with the right name
        if DATETIME_COLUMN not in df:
            raise Exception("No column 'datetime'")

        # Validate the data type for the datetime column
        if not is_datetime64_ns_dtype(df.dtypes[DATETIME_COLUMN]):
            raise Exception(
                "Column 'datetime' does not have the correct type: datetime64[ns]"
            )

        # Iterate over all of the columns and detect their class (feature, entity) and type
        for column in df.columns:
            column = column.strip()

            # Skip datetime column
            if DATETIME_COLUMN in column:
                continue

            # Use entity or feature value if provided by the column mapping
            if column_mapping and column in column_mapping:
                if issubclass(type(column_mapping[column]), Field):
                    fields[column] = column_mapping[column]
                    continue
                raise ValueError(
                    "Invalid resource type specified at column name " + column
                )

            # Test whether this column is an existing entity (globally).
            if existing_entities and column in existing_entities:
                entity = existing_entities[column]

                # test whether registered entity type matches user provided type
                if entity.dtype == dtype_to_value_type(df[column].dtype):
                    # Store this field as an entity
                    fields[column] = entity
                    continue

            # Ignore fields that already exist
            if column in self._fields:
                continue

            # Store this field as a feature
            fields[column] = Feature(
                name=column, dtype=pandas_dtype_to_feast_value_type(df[column].dtype)
            )

        if len([field for field in fields.values() if type(field) == Entity]) == 0:
            raise Exception(
                "Could not detect entity column(s). Please provide entity column(s)."
            )
        if len([field for field in fields.values() if type(field) == Feature]) == 0:
            raise Exception(
                "Could not detect feature column(s). Please provide feature column(s)."
            )
        self._add_fields(list(fields.values()))
Example #12
def is_datetime_dtype(argument):
    return is_datetime64_ns_dtype(argument)
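For reference, what is_datetime64_ns_dtype itself reports for a few common inputs (a minimal sketch; only the public pandas API is assumed):

import pandas as pd
from pandas.api.types import is_datetime64_ns_dtype

naive = pd.Series(pd.to_datetime(["2021-01-01", "2021-01-02"]))  # datetime64[ns]
strings = pd.Series(["2021-01-01", "2021-01-02"])                # object dtype

print(is_datetime64_ns_dtype(naive))    # True
print(is_datetime64_ns_dtype(strings))  # False: object dtype, not datetime64[ns]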