Example #1
def read_clinical_data(clinical_data_directory: PathLike) -> DataFrame:
    import pandas as pd

    dataframe = find_clinical_data(clinical_data_directory)

    if dataframe is None:
        raise FileNotFoundError("Clinical data not found")

    # Compute participant and session IDs.
    dataframe = dataframe.rename_axis(index={
        "loni_id": "participant_id",
        "visit_number": "session_id"
    })
    dataframe.index = dataframe.index.map(lambda x: (
        f"sub-NIFD{x[0].replace('_', '')}", f"ses-M{(6 * (x[1] - 1)):02d}"))

    # Keep relevant columns and rename them.
    dataframe = (
        dataframe[["dx", "site", "education", "race", "cdr_box_score", "mmse_tot"]]
        .rename(columns={"dx": "diagnosis", "cdr_box_score": "cdr", "mmse_tot": "mmse"})
        .astype(dtype={
            "diagnosis": pd.CategoricalDtype(
                ["BV", "CON", "L_SD", "PATIENT (OTHER)", "PNFA", "SV"]),
            "site": pd.CategoricalDtype(["UCSF", "MAYO", "MGH"]),
            "education": pd.Int64Dtype(),
            "race": pd.Int64Dtype(),
            "cdr": pd.Float64Dtype(),
            "mmse": pd.Float64Dtype(),
        })
        .replace({
            "education": {99: pd.NA},
            "race": {50: pd.NA, 99: pd.NA},
        }))

    # Keep positive MMSE values only.
    dataframe.mmse = dataframe.mmse.mask(dataframe.mmse < 0)

    return dataframe
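A minimal sketch (not part of the original source, using a hypothetical LONI identifier) of the index mapping above: visit number 1 maps to ses-M00 and visit number 3 to ses-M12.

to_ids = lambda x: (f"sub-NIFD{x[0].replace('_', '')}", f"ses-M{(6 * (x[1] - 1)):02d}")
print(to_ids(("1_S_0001", 1)))  # ('sub-NIFD1S0001', 'ses-M00')
print(to_ids(("1_S_0001", 3)))  # ('sub-NIFD1S0001', 'ses-M12')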
Example #2
def test_dwd_observation_data_result_missing_data():
    """Test for DataFrame having empty values for dates where the station should not
    have values"""
    Settings.tidy = True
    Settings.humanize = True
    Settings.si_units = True

    request = DwdObservationRequest(
        parameter=[DwdObservationDataset.CLIMATE_SUMMARY],
        resolution=DwdObservationResolution.DAILY,
        start_date="1933-12-27",  # few days before official start
        end_date="1934-01-04",  # few days after official start,
    ).filter_by_station_id(station_id=[1048], )

    # Drop the quality column so that "value" is the only column that may contain NaN
    df = request.values.all().df.drop("quality", axis=1)

    df_1933 = df[df["date"].dt.year == 1933]
    df_1934 = df[df["date"].dt.year == 1934]

    assert not df_1933.empty and df_1933.dropna().empty
    assert not df_1934.empty and not df_1934.dropna().empty

    request = DwdObservationRequest(
        parameter=DwdObservationParameter.HOURLY.TEMPERATURE_AIR_MEAN_200,
        resolution=DwdObservationResolution.HOURLY,
        start_date="2020-06-09 12:00:00",  # no data at this time (reason unknown)
        end_date="2020-06-09 12:00:00",
    ).filter_by_station_id(station_id=["03348"])

    df = request.values.all().df

    assert_frame_equal(
        df,
        pd.DataFrame({
            "station_id": pd.Categorical(["03348"]),
            "dataset": pd.Categorical(["temperature_air"]),
            "parameter": pd.Categorical(["temperature_air_mean_200"]),
            "date": [datetime(2020, 6, 9, 12, 0, 0, tzinfo=pytz.UTC)],
            "value": pd.Series([pd.NA], dtype=pd.Float64Dtype()).astype(float),
            "quality": pd.Series([pd.NA], dtype=pd.Float64Dtype()).astype(float),
        }),
        check_categorical=False,
    )
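Side note (not part of the original test): casting a nullable Float64 series to plain float turns pd.NA into np.nan, which is what the expected "value" and "quality" columns above rely on.

import numpy as np
import pandas as pd

s = pd.Series([pd.NA], dtype=pd.Float64Dtype()).astype(float)
print(s.dtype)              # float64
print(np.isnan(s.iloc[0]))  # True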
Example #3
 def test_astype(self):
     pdf, psdf = self.pdf, self.psdf
     for col in self.numeric_df_cols:
         pser, psser = pdf[col], psdf[col]
         self.assert_eq(pser.astype(int), psser.astype(int))
         self.assert_eq(pser.astype(float), psser.astype(float))
         self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
         self.assert_eq(pser.astype(np.int32), psser.astype(np.int32))
         self.assert_eq(pser.astype(np.int16), psser.astype(np.int16))
         self.assert_eq(pser.astype(np.int8), psser.astype(np.int8))
         self.assert_eq(pser.astype(str), psser.astype(str))
         self.assert_eq(pser.astype(bool), psser.astype(bool))
         self.assert_eq(pser.astype("category"), psser.astype("category"))
         cat_type = CategoricalDtype(categories=[2, 1, 3])
         self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
     self.assertRaisesRegex(
         ValueError,
         "Cannot convert fractions with missing values to integer",
         lambda: self.float_withnan_psser.astype(int),
     )
     self.assertRaisesRegex(
         ValueError,
         "Cannot convert fractions with missing values to integer",
         lambda: self.float_withnan_psser.astype(np.int32),
     )
     self.assert_eq(self.float_withnan_psser.astype(str), self.float_withnan_psser.astype(str))
     self.assert_eq(self.float_withnan_psser.astype(bool), self.float_withnan_psser.astype(bool))
     self.assert_eq(
         self.float_withnan_psser.astype("category"), self.float_withnan_psser.astype("category")
     )
     if extension_object_dtypes_available and extension_float_dtypes_available:
         pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
         psser = ps.from_pandas(pser)
         self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
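Plain-pandas sketch (independent of the pandas-on-Spark test above) of the categorical cast it exercises; the category order follows the dtype definition, not the data.

import pandas as pd
from pandas import CategoricalDtype

cat_type = CategoricalDtype(categories=[2, 1, 3])
s = pd.Series([1, 2, 3]).astype(cat_type)
print(list(s.cat.categories))  # [2, 1, 3]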
Example #4
    def test_astype(self):
        pdf, psdf = self.pdf, self.psdf
        for col in self.numeric_df_cols:
            pser, psser = pdf[col], psdf[col]

            for int_type in [int, np.int32, np.int16, np.int8]:
                if not pser.hasnans:
                    self.assert_eq(pser.astype(int_type),
                                   psser.astype(int_type))
                else:
                    self.assertRaisesRegex(
                        ValueError,
                        "Cannot convert %s with missing "
                        "values to integer" % psser._dtype_op.pretty_name,
                        lambda: psser.astype(int_type),
                    )

            # TODO(SPARK-37039): the np.nan series.astype(bool) should be True
            if not pser.hasnans:
                self.assert_eq(pser.astype(bool), psser.astype(bool))

            self.assert_eq(pser.astype(float), psser.astype(float))
            self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
            self.assert_eq(pser.astype(str), psser.astype(str))
            self.assert_eq(pser.astype("category"), psser.astype("category"))
            cat_type = CategoricalDtype(categories=[2, 1, 3])
            self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
        if extension_object_dtypes_available and extension_float_dtypes_available:
            pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]),
                             dtype=pd.Float64Dtype())
            psser = ps.from_pandas(pser)
            self.assert_eq(pser.astype(pd.BooleanDtype()),
                           psser.astype(pd.BooleanDtype()))
Example #5
 def _coerce_integers(series: pd.Series) -> pd.Series:
     """Method to parse integers for type coercion. Uses pandas.Int64Dtype() to
     allow missing values."""
     return (
         pd.to_numeric(series, errors="coerce")
         .astype(pd.Float64Dtype())
         .astype(pd.Int64Dtype())
     )
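Hedged usage sketch (the raw input values are made up): pd.to_numeric produces NaN for unparsable or missing entries, the intermediate Float64 cast turns NaN into pd.NA, and the final Int64 cast yields a nullable integer series.

import pandas as pd

raw = pd.Series(["1", "2", None, "oops"])
coerced = _coerce_integers(raw)
print(coerced.dtype)     # Int64
print(coerced.tolist())  # [1, 2, <NA>, <NA>]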
Example #6
def test_astype_to_floating_array():
    # astype to FloatingArray
    arr = pd.array([0.0, 1.0, None], dtype="Float64")

    result = arr.astype("Float64")
    tm.assert_extension_array_equal(result, arr)
    result = arr.astype(pd.Float64Dtype())
    tm.assert_extension_array_equal(result, arr)
    result = arr.astype("Float32")
    expected = pd.array([0.0, 1.0, None], dtype="Float32")
    tm.assert_extension_array_equal(result, expected)
Example #7
    def _coerce_meta_fields(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method that coerces meta fields. Those fields are expected to be found in the
        DataFrame in a columnar shape. Thore are basically the station id and the date
        fields. Furthermore if the data is tidied parameter can be found as well as
        quality. For station id, parameter and quality those columns are additionally
        coerced to categories to reduce consumption of the DataFrame.

        :param df: pandas.DataFrame with the "fresh" data
        :return: pandas.DataFrame with meta fields being coerced
        """
        df.loc[:, Columns.STATION_ID.value] = self._parse_station_id(df[Columns.STATION_ID.value]).astype("category")
        df.loc[:, Columns.DATASET.value] = self._coerce_strings(df[Columns.DATASET.value]).astype("category")

        if self.sr.stations.tidy:
            df.loc[:, Columns.PARAMETER.value] = self._coerce_strings(df[Columns.PARAMETER.value]).astype("category")
            df.loc[:, Columns.VALUE.value] = df[Columns.VALUE.value].astype(pd.Float64Dtype()).astype(float)
            df.loc[:, Columns.QUALITY.value] = df[Columns.QUALITY.value].astype(pd.Float64Dtype()).astype(float)

        return df
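Rough sketch (not part of the original method) of why the id, parameter and quality columns are cast to "category": repeated string values are stored once, shrinking the frame.

import pandas as pd

s = pd.Series(["temperature_air_mean_200"] * 10_000)
print(s.memory_usage(deep=True) > s.astype("category").memory_usage(deep=True))  # True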
Example #8
 def _fix_int_dtypes(df: pd.DataFrame) -> None:
     """Mutate DataFrame to set dtypes for int columns containing NaN values."""
     for col in df:
         if "float" in df[col].dtype.name and df[col].hasnans:
             # inspect values to determine if dtype of non-null values is int or float
             notna_series = df[col].dropna().values
             if np.equal(notna_series, notna_series.astype(int)).all():
                 # set to dtype that retains integers and supports NaNs
                 df[col] = np.where(df[col].isnull(), None, df[col])
                 df[col] = df[col].astype(pd.Int64Dtype())
             elif np.isclose(notna_series, notna_series.astype(int)).all():
                 # set to float dtype that retains floats and supports NaNs
                 df[col] = np.where(df[col].isnull(), None, df[col])
                 df[col] = df[col].astype(pd.Float64Dtype())
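Hedged usage sketch (made-up data): a float column whose non-null values are whole numbers is rewritten in place as nullable Int64, with NaN becoming pd.NA.

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, np.nan]})
_fix_int_dtypes(df)
print(df["a"].dtype)     # Int64
print(df["a"].tolist())  # [1, 2, <NA>]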
Example #9
File: parquet.py  Project: tnir/pandas
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> DataFrame:
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
                self.api.float32(): pd.Float32Dtype(),
                self.api.float64(): pd.Float64Dtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns,
                **kwargs).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
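Sketch (independent of the pandas internals above) of the types_mapper mechanism: pyarrow calls the mapper for each Arrow type and uses the returned pandas extension dtype whenever it is not None.

import pandas as pd
import pyarrow as pa

mapping = {pa.float64(): pd.Float64Dtype(), pa.int64(): pd.Int64Dtype()}
table = pa.table({"x": [1.5, None], "n": [1, None]})
df = table.to_pandas(types_mapper=mapping.get)
print(df.dtypes)  # x: Float64, n: Int64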
Example #10
 def test_numeric_nullable_dtypes(self):
     dtypes = [
         pd.StringDtype(),
         pd.BooleanDtype(),
         pd.Float64Dtype(),
         pd.Float32Dtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
         pd.StringDtype(),
     ]
     # TODO: Re-add (".xml", "xml"),
     # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
     for suffix, fn in [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]:
         # TODO: include xml
         for dtype in dtypes:
             with tmpfile(suffix) as path:
                 try:
                     df = Ind2Col2.convert(
                         Ind2Col2(
                             sample_data_ind2_col2_pd_na())).astype(dtype)
                     assert list(df.index.names) == ["qqq", "rrr"]
                     assert list(df.columns) == ["abc", "xyz"]
                     getattr(df, "to_" + fn)(path)
                     df2 = getattr(Ind2Col2, "read_" + fn)(path)
                     assert list(df2.index.names) == ["qqq", "rrr"]
                     assert list(df2.columns) == ["abc", "xyz"]
                 except Exception:
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise
Example #11
    def test_replace_nullable_numeric(self):
        # GH#40732, GH#44940

        floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype())
        assert floats.replace({1.0: 9}).dtype == floats.dtype
        assert floats.replace(1.0, 9).dtype == floats.dtype
        assert floats.replace({1.0: 9.0}).dtype == floats.dtype
        assert floats.replace(1.0, 9.0).dtype == floats.dtype

        res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0])
        assert res.dtype == floats.dtype

        ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype())
        assert ints.replace({1: 9}).dtype == ints.dtype
        assert ints.replace(1, 9).dtype == ints.dtype
        assert ints.replace({1: 9.0}).dtype == ints.dtype
        assert ints.replace(1, 9.0).dtype == ints.dtype
Example #12
def test_from_pandas():
    dd_dict = {
        'boolean': [True, True, False, None, True],
        'text': ['This', 'is', 'some', 'text', 'so...'],
        'text_missing': pd.Series(['Some', 'parts', None, 'missing', None], dtype='string'),
        'float': [1, 30, -2, 1.5, 0.000],
        'float_missing': [1, None, -2, 1.5, 0.000],
        'float_missing_masked': pd.Series([1, None, -2, 1.5, 0.000], dtype=pd.Float64Dtype()),
        'int_missing': pd.Series([1, None, 5, 1, 10], dtype='Int64'),
        'datetime_1': [pd.NaT, datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1), datetime.datetime(2019, 1, 1, 1, 1, 1)],
        'datetime_2': [pd.NaT, None, pd.NaT, pd.NaT, pd.NaT],
        'datetime_3': [pd.Timedelta('1M'), pd.Timedelta('1D'), pd.Timedelta('100M'), pd.Timedelta('2D'), pd.Timedelta('1H')],
        'datetime_4': [pd.Timestamp('2001-1-1 2:2:11'), pd.Timestamp('2001-12'), pd.Timestamp('2001-10-1'), pd.Timestamp('2001-03-1 2:2:11'), pd.Timestamp('2001-1-1 2:2:11')],
        'datetime_5': [datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1), datetime.date(2010, 1, 1)],
        'datetime_6': [datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1), datetime.time(21, 1, 1)],
    }

    # Get pandas dataframe
    pandas_df = pd.DataFrame(dd_dict)
    pandas_df['datetime_7'] = pd.to_timedelta(pandas_df['datetime_2'] - pandas_df['datetime_1'])
    vaex_df = vaex.from_pandas(pandas_df)
    repr_value = repr(vaex_df)
    str_value = str(vaex_df)

    assert 'NaT' in repr_value
    assert 'NaT' in str_value
    assert '--' in repr_value
    assert '--' in str_value

    # string columns are now arrows arrays
    # assert vaex_df.text_missing.is_masked == True
    assert vaex_df.int_missing.is_masked == True
    assert vaex_df.float_missing.is_masked == False
    assert vaex_df.float_missing_masked.is_masked == True
    assert vaex_df.int_missing.tolist() == [1, None, 5, 1, 10]
    assert vaex_df.text_missing.tolist() == ['Some', 'parts', None, 'missing', None]
    assert vaex_df.float_missing.values[[0, 2, 3, 4]].tolist() == [1.0, -2.0, 1.5, 0.0]
    assert np.isnan(vaex_df.float_missing.values[1])
    assert vaex_df.float_missing_masked.tolist() == [1.0, None, -2.0, 1.5, 0.0]
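Plain-pandas sketch (no vaex involved) of the distinction the assertions rely on: the plain float column stores its missing entry as NaN, while the nullable Float64 column carries a real mask and keeps pd.NA.

import numpy as np
import pandas as pd

plain = pd.Series([1, None, -2, 1.5, 0.0])
masked = pd.Series([1, None, -2, 1.5, 0.0], dtype=pd.Float64Dtype())
print(np.isnan(plain[1]))  # True
print(masked[1] is pd.NA)  # True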
Example #13
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
    if with_na:
        data[3] = pd.NA

    df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
    grouped = df.groupby("key")

    if action == "always_int":
        # always Int64
        expected_dtype = pd.Int64Dtype()
    elif action == "large_int":
        # for any int/bool use Int64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        elif is_integer_dtype(data.dtype):
            # match the numpy dtype we'd get with the non-nullable analogue
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Int64Dtype()
    elif action == "always_float":
        # for any int/bool use Float64, for float preserve dtype
        if is_float_dtype(data.dtype):
            expected_dtype = data.dtype
        else:
            expected_dtype = pd.Float64Dtype()
    elif action == "preserve":
        expected_dtype = data.dtype

    result = getattr(grouped, op_name)()
    assert result["col"].dtype == expected_dtype

    result = grouped.aggregate(op_name)
    assert result["col"].dtype == expected_dtype

    result = getattr(grouped["col"], op_name)()
    assert result.dtype == expected_dtype

    result = grouped["col"].aggregate(op_name)
    assert result.dtype == expected_dtype
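Plain-pandas sketch (the test fixtures are not reproduced here) of one of the dtype expectations checked above: aggregating a nullable Float64 column through groupby preserves the Float64 dtype.

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"],
                   "col": pd.array([1.0, 2.0, pd.NA, 4.0], dtype="Float64")})
print(df.groupby("key")["col"].sum().dtype)  # Float64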
Example #14
    def _fix_dtypes(df: pd.DataFrame) -> None:
        """
        Mutate DataFrame to set dtypes for float columns containing NaN values.
        Set dtype of object to str to allow for downstream transformations.
        """
        for col in df:

            if df[col].dtype.name == 'object':
                # if the type wasn't identified or converted, change it to a string so it
                # can still be processed.
                df[col] = df[col].astype(str)

            if "float" in df[col].dtype.name and df[col].hasnans:
                # inspect values to determine if dtype of non-null values is int or float
                notna_series = df[col].dropna().values
                if np.equal(notna_series, notna_series.astype(int)).all():
                    # set to dtype that retains integers and supports NaNs
                    df[col] = np.where(df[col].isnull(), None, df[col])
                    df[col] = df[col].astype(pd.Int64Dtype())
                elif np.isclose(notna_series, notna_series.astype(int)).all():
                    # set to float dtype that retains floats and supports NaNs
                    df[col] = np.where(df[col].isnull(), None, df[col])
                    df[col] = df[col].astype(pd.Float64Dtype())
Example #15
    def test_replace_nullable_numeric(self):
        # GH#40732, GH#44940

        floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype())
        assert floats.replace({1.0: 9}).dtype == floats.dtype
        assert floats.replace(1.0, 9).dtype == floats.dtype
        assert floats.replace({1.0: 9.0}).dtype == floats.dtype
        assert floats.replace(1.0, 9.0).dtype == floats.dtype

        res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0])
        assert res.dtype == floats.dtype

        ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype())
        assert ints.replace({1: 9}).dtype == ints.dtype
        assert ints.replace(1, 9).dtype == ints.dtype
        assert ints.replace({1: 9.0}).dtype == ints.dtype
        assert ints.replace(1, 9.0).dtype == ints.dtype

        # nullable (for now) raises instead of casting
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace({1: 9.5})
        with pytest.raises(TypeError, match="Invalid value"):
            ints.replace(1, 9.5)
Example #16
    def test_to_table_nullable(self):
        boolean_array = pd.array([True, False, None], dtype=pd.BooleanDtype())
        int8_array = pd.array([1, 2, None], dtype=pd.Int8Dtype())
        int16_array = pd.array([1, 2, None], dtype=pd.Int16Dtype())
        int32_array = pd.array([1, 2, None], dtype=pd.Int32Dtype())
        int64_array = pd.array([1, 2, None], dtype=pd.Int64Dtype())
        float_array = pd.array([1.1, 2.2, None], dtype=pd.Float32Dtype())
        double_array = pd.array([1.1, 2.2, None], dtype=pd.Float64Dtype())
        string_array = pd.array(["s11", "s22", None], dtype=pd.StringDtype())
        object_array = pd.array([pd.NA, "s22", None], dtype=object)

        df = pd.DataFrame({
            "NullableBoolean": boolean_array,
            "NullableInt8": int8_array,
            "NullableInt16": int16_array,
            "NullableInt32": int32_array,
            "NullableInt64": int64_array,
            "NullableFloat": float_array,
            "NullableDouble": double_array,
            "NullableString": string_array,
            "NullableObject": object_array,
        })

        table = to_table(df)
        self.assertIs(table.columns[0].data_type, dtypes.bool_)
        self.assertIs(table.columns[1].data_type, dtypes.int8)
        self.assertIs(table.columns[2].data_type, dtypes.int16)
        self.assertIs(table.columns[3].data_type, dtypes.int32)
        self.assertIs(table.columns[4].data_type, dtypes.int64)
        self.assertIs(table.columns[5].data_type, dtypes.float32)
        self.assertIs(table.columns[6].data_type, dtypes.double)
        self.assertIs(table.columns[7].data_type, dtypes.string)
        self.assertIs(table.columns[8].data_type, dtypes.PyObject)
        self.assertEqual(table.size, 3)
        table_string = table.to_string()
        self.assertEqual(9, table_string.count("null"))
Example #17
def run_one(agent1, agent2, game, testset, seed=None):
    sender = agent1
    receiver = agent2
    role_setting = 0

    metrics = "episode role_setting images symbol guess success".split(" ")
    dtypes = [
        pd.Int32Dtype(), bool, object,
        pd.Int32Dtype(),
        pd.Int32Dtype(),
        pd.Float64Dtype()
    ]
    test_log = pd.DataFrame(columns=metrics)
    for column, dtype in zip(metrics, dtypes):
        test_log[column] = test_log[column].astype(dtype)

    if seed is not None:
        set_seed(seed)

    episode = 0
    exit_status = "full"
    error = False

    batch_log = {metric: [] for metric in metrics}
    for test in testset:
        episode += 1
        game.reset()

        try:
            # Sender turn
            sender_ids = test["sender_ids"]
            sender_state = game.get_sender_state_from_ids(ids=sender_ids,
                                                          expand=True)
            sender_probs = np.squeeze(sender.predict(state=sender_state))
            sender_action = sender.choose_action(sender_probs)

            # Receiver turn
            receiver_ids = test["receiver_ids"]
            receiver_pos = test["receiver_pos"]
            receiver_state = game.get_receiver_state_from_ids(receiver_ids,
                                                              receiver_pos,
                                                              sender_action,
                                                              expand=True)
            receiver_probs = np.squeeze(receiver.predict(state=receiver_state))
            receiver_action = receiver.choose_action(receiver_probs)
        except Exception as e:
            print("\n", "ERROR", e)
            error = True
            break

        # Evaluate turn and remember
        sender_reward, receiver_reward, success = game.evaluate_guess(
            receiver_action)

        batch_log["episode"].append(episode)
        batch_log["role_setting"].append(role_setting)
        batch_log["images"].append(sender_ids)
        batch_log["symbol"].append(sender_action)
        batch_log["guess"].append(receiver_action)
        batch_log["success"].append(success)

        if not episode % 200:
            print(f"\r{episode} games played", end="")

    test_log = test_log.append(pd.DataFrame(batch_log))
    if error:
        return test_log, "error"

    print()

    return test_log, exit_status
Example #18
File: ssf.py  Project: anarkiwi/desidulate
def add_freq_notes_df(sid, ssfs_df):
    real_freqs = {freq: freq * sid.freq_scaler for freq in ssfs_df['freq1'].unique() if pd.notna(freq)}
    closest_notes = {real_freq: closest_midi(real_freq)[1] for real_freq in real_freqs.values()}
    freq_map = [(freq, real_freq, closest_notes[real_freq]) for freq, real_freq in real_freqs.items()]
    freq_map.extend([(pd.NA, pd.NA, pd.NA)])
    freq_notes_df = pd.DataFrame.from_records(freq_map, columns=['freq1', 'real_freq', 'closest_note']).astype(pd.Float64Dtype())
    freq_notes_df['freq1'] = freq_notes_df['freq1'].astype(pd.UInt16Dtype())
    freq_notes_df['closest_note'] = freq_notes_df['closest_note'].astype(pd.UInt8Dtype())
    return set_sid_dtype(ssfs_df).merge(freq_notes_df, how='left', on='freq1')
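Minimal sketch (values are made up) of the dtype handling above: pd.NA survives the cast from the nullable Float64 column to the narrower nullable UInt16 column.

import pandas as pd

freq = pd.Series([440.0, pd.NA], dtype=pd.Float64Dtype()).astype(pd.UInt16Dtype())
print(freq.dtype)     # UInt16
print(freq.tolist())  # [440, <NA>]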
Example #19
def run_one(*, out_dir, dataset, number_of_images, embedding_size,
            vocabulary_size, sender_type, temperature, number_of_episodes,
            batch_size, analysis_window, optimizer, memory_sampling_mode,
            algorithm, max_memory, exploration_start, exploration_decay,
            exploration_floor, early_stopping_patience, early_stopping_minimum,
            role_mode, shared_embedding, shared_experience, seed, **kwargs):
    CHECKPOINT_EVERY = 1000
    ERROR_PATIENCE = 5

    # TODO: refactor into settings parser
    # LOAD DATASET
    loaded = False
    try:
        from utils.dataprep import load_emb_pickled
        metadata, embeddings = load_emb_pickled(dataset)
        filenames = metadata.get("fnames")
        categories = metadata.get("categories")
        loaded = True
    except FileNotFoundError:
        loaded = False
    if not loaded:
        from utils.dataprep import load_emb_gz, make_categories
        _, filenames, embeddings = load_emb_gz(dataset)
        categories = make_categories(filenames, sep="\\")
    image_shape = [len(embeddings[0])]

    # CREATE GAME
    game_settings = {
        "images": embeddings,
        "categories": categories,
        "images_filenames": filenames
    }
    from game import Game
    game = Game(**game_settings)

    # SET UP AGENTS
    learning_rate = 0.1
    optimizers = {
        "adam": (
            optim.Adam,
            {
                # "amsgrad": True,
                "clipnorm": 1.0
            }),
        "sgd": (optim.SGD, {
            "clipnorm": 1.0
        }),
        "adadelta": (optim.Adadelta, {
            "clipnorm": 1.0
        }),
        "rmsprop": (optim.RMSprop, {
            "clipnorm": 1.0
        })
    }

    agent_settings = {
        "n_images": number_of_images,
        "input_image_shape": image_shape,
        "embedding_size": embedding_size,
        "vocabulary_size": vocabulary_size,
        "temperature": temperature,
        "optimizer": optimizers[optimizer][0](lr=learning_rate,
                                              **optimizers[optimizer][1]),
        "sender_type": sender_type,
        #     "sender_type": "informed",
        #     "n_informed_filters": 20,
        "max_memory": max_memory,
        "exploration_start": exploration_start,
        "exploration_decay": exploration_decay,
        "exploration_floor": exploration_floor
    }

    if role_mode != "switch":
        shared_experience = False

    tensorflow.keras.backend.clear_session()
    if algorithm == "reinforce":
        from agent.reinforce import Sender, Receiver, MultiAgent
    elif algorithm == "qlearning":
        from agent.qlearning import Sender, Receiver, MultiAgent
    else:
        raise ValueError(
            f"Expected 'reinforce' or 'qlearning' algorithm, got '{algorithm}'"
        )

    if role_mode == "switch":
        agent1 = MultiAgent(active_role="sender",
                            shared_embedding=shared_embedding,
                            **agent_settings)
        agent2 = MultiAgent(active_role="receiver",
                            shared_embedding=shared_embedding,
                            **agent_settings)
    elif role_mode == "static":
        agent1 = Sender(**agent_settings)
        agent2 = Receiver(**agent_settings)
    else:
        raise ValueError(
            f"Role mode must be either 'static' or 'switch', not '{role_mode}'"
        )

    metrics = "episode role_setting images symbol guess success sender_loss receiver_loss".split(
        " ")
    if shared_experience:
        metrics.extend(["sender_loss_2", "receiver_loss_2"])

    dtypes = [
        pd.Int32Dtype(), bool, object,
        pd.Int32Dtype(),
        pd.Int32Dtype(),
        pd.Float64Dtype(),
        pd.Float64Dtype(),
        pd.Float64Dtype()
    ]
    training_log = pd.DataFrame(columns=metrics)
    for column, dtype in zip(metrics, dtypes):
        training_log[column] = training_log[column].astype(dtype)

    episode = 0
    early_stopping = EarlyStopping(patience=early_stopping_patience,
                                   min_episodes=early_stopping_minimum)

    set_seed(seed)

    sender = agent1
    receiver = agent2
    role_setting = 0

    next_checkpoint_episode = CHECKPOINT_EVERY
    error_encountered = False
    remaining_errors = ERROR_PATIENCE
    exit_status = "full"
    while episode < number_of_episodes:
        batch_log = {metric: [] for metric in metrics}
        while True:
            episode += 1
            if error_encountered:
                error_encountered = False
                try:
                    print(f"Loading checkpoint")
                    agent1.load(os.path.join(out_dir, "agent1"))
                    agent2.load(os.path.join(out_dir, "agent2"))
                except:
                    pass

            game.reset()

            try:
                # Sender turn
                sender_state, img_ids = game.get_sender_state(
                    n_images=number_of_images,
                    unique_categories=True,
                    expand=True,
                    return_ids=True)
                sender_probs = np.squeeze(sender.predict(state=sender_state))
                sender_action = sender.choose_action(sender_probs)

                # Receiver turn
                receiver_state = game.get_receiver_state(sender_action,
                                                         expand=True)
                receiver_probs = np.squeeze(
                    receiver.predict(state=receiver_state))
                receiver_action = receiver.choose_action(receiver_probs)
            except Exception as e:
                print("\n", e)
                error_encountered = True
                remaining_errors -= 1
                if remaining_errors < 0:
                    exit_status = "error"
                    break
                continue

            # Evaluate turn and remember
            sender_reward, receiver_reward, success = game.evaluate_guess(
                receiver_action)
            sender.remember(state=sender_state,
                            action=np.asarray([sender_action]),
                            action_probs=sender_probs,
                            reward=np.asarray([sender_reward]))
            receiver.remember(state=receiver_state,
                              action=np.asarray([receiver_action]),
                              action_probs=receiver_probs,
                              reward=np.asarray([receiver_reward]))

            if shared_experience:
                receiver.components["sender"].remember(
                    state=sender_state,
                    action=np.asarray([sender_action]),
                    action_probs=sender_probs,
                    reward=np.asarray([sender_reward]))
                sender.components["receiver"].remember(
                    state=receiver_state,
                    action=np.asarray([receiver_action]),
                    action_probs=receiver_probs,
                    reward=np.asarray([receiver_reward]))

            batch_log["episode"].append(episode)
            batch_log["role_setting"].append(role_setting)
            batch_log["images"].append(img_ids)
            batch_log["symbol"].append(sender_action)
            batch_log["guess"].append(receiver_action)
            batch_log["success"].append(success)

            if not episode % 500:
                stats = compute_live_stats(training_log=training_log,
                                           analysis_window=500,
                                           overwrite_line=False)
                if early_stopping.check(episode, stats["mean_success"]):
                    exit_status = "early"
                    break

            if episode % batch_size == 0:
                break
        if exit_status == "error":
            break
        if exit_status == "early":
            break

        # Train on batch
        try:
            # Save before updating
            if episode > next_checkpoint_episode:
                agent1.save(os.path.join(out_dir, "agent1"))
                agent2.save(os.path.join(out_dir, "agent2"))
                next_checkpoint_episode += CHECKPOINT_EVERY

            # Update
            batch_log["sender_loss"] = sender.update_on_batch(
                batch_size, memory_sampling_mode=memory_sampling_mode)
            batch_log["receiver_loss"] = receiver.update_on_batch(
                batch_size, memory_sampling_mode=memory_sampling_mode)
            if shared_experience:
                batch_log["sender_loss_2"] = receiver.components[
                    "sender"].update_on_batch(
                        batch_size, memory_sampling_mode=memory_sampling_mode)
                batch_log["receiver_loss_2"] = sender.components[
                    "receiver"].update_on_batch(
                        batch_size, memory_sampling_mode=memory_sampling_mode)

            training_log = training_log.append(pd.DataFrame(batch_log))
        except Exception as e:
            print("\n", e)
            return training_log, "error"

        stats = compute_live_stats(training_log=training_log,
                                   analysis_window=analysis_window)

        if role_mode == "switch":
            sender.switch_role()
            receiver.switch_role()
            sender, receiver = receiver, sender
            role_setting ^= 1

    print()
    if exit_status != "error":
        agent1.save(os.path.join(out_dir, "agent1"))
        agent2.save(os.path.join(out_dir, "agent2"))

    return training_log, exit_status
Example #20
def test_uses_pandas_na():
    a = pd.array([1, None], dtype=pd.Float64Dtype())
    assert a[1] is pd.NA
Example #21
pandas_dtypes_to_cudf_dtypes = {
    pd.UInt8Dtype(): np.dtype("uint8"),
    pd.UInt16Dtype(): np.dtype("uint16"),
    pd.UInt32Dtype(): np.dtype("uint32"),
    pd.UInt64Dtype(): np.dtype("uint64"),
    pd.Int8Dtype(): np.dtype("int8"),
    pd.Int16Dtype(): np.dtype("int16"),
    pd.Int32Dtype(): np.dtype("int32"),
    pd.Int64Dtype(): np.dtype("int64"),
    pd.BooleanDtype(): np.dtype("bool_"),
    pd.StringDtype(): np.dtype("object"),
}

if PANDAS_GE_120:
    cudf_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype()
    cudf_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype()
    pandas_dtypes_to_cudf_dtypes[pd.Float32Dtype()] = np.dtype("float32")
    pandas_dtypes_to_cudf_dtypes[pd.Float64Dtype()] = np.dtype("float64")

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES
NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES
DATETIME_TYPES = {
    "datetime64[s]",
    "datetime64[ms]",
    "datetime64[us]",
    "datetime64[ns]",
}
Example #22
def test_digitize():
    # np.nan and pd.NA get digitized to -1, suffix should be added
    df = pd.DataFrame(
        [["chr1", 0, 10, np.nan]],
        columns=["chrom", "start", "end", "value"],
    )
    digitized = saddle.digitize(df, 10, vrange=(-1, 1), digitized_suffix=".test")[0]
    assert -1 == digitized["value.test"].values

    df = pd.DataFrame(
        [["chr1", 0, 10, pd.NA]],
        columns=["chrom", "start", "end", "value"],
    ).astype({"value": pd.Float64Dtype()})
    digitized = saddle.digitize(df, 10, vrange=(-1, 1), digitized_suffix=".test")[0]
    assert -1 == digitized["value.test"].values

    n_bins = 10
    digitized = saddle.digitize(df, n_bins, vrange=(-1, 1))[0]
    # the dtype of the returned column should be a categorical
    assert type(digitized["value.d"].dtype) is pd.core.dtypes.dtypes.CategoricalDtype

    # the number of categories should be equal to the number of bins +3
    assert (n_bins + 3) == digitized["value.d"].dtype.categories.shape[0]

    df = pd.DataFrame(
        [
            ["chr1", 0, 10, -0.5],
            ["chr1", 10, 20, 0.5],
        ],
        columns=["chrom", "start", "end", "value"],
    )

    # values out of the range should be in the 0 and n+1 bins
    digitized = saddle.digitize(df, n_bins, vrange=(-0.1, 0.1))[0]
    assert 0 == digitized["value.d"].values[0]
    assert (n_bins + 1) == digitized["value.d"].values[1]

    # for an input dataframe of ten elements between -1 and 1,
    # and 5 bins, each bin should have 2 digitized values
    # this test will need an update after input checking
    df_linspace = pd.DataFrame(
        (np.linspace(-1, 1, 10) * np.ones((4,))[:, None]).T,
        columns=["chrom", "start", "end", "value"],
    )
    p = (np.arange(0, 100, 10) * np.ones((2,))[:, None]).T  # .shape
    p[:, 1] += 10
    df_linspace.iloc[:, 1:3] = p
    df_linspace["chrom"] = "chrX"
    df_linspace = df_linspace.astype({"chrom": "str", "start": int, "end": int})

    x = saddle.digitize(df_linspace, 5, vrange=(-1, 1.001),)[
        0
    ]["value.d"]
    assert (2 == np.histogram(x, np.arange(1, 7))[0]).all()

    # if the bottom and top quantiles are 25 and 75 with 3 bins, then
    # the low outlier and high outlier bins should each have 3 values
    x = saddle.digitize(df_linspace, 1, qrange=(0.25, 0.75),)[
        0
    ]["value.d"]
    assert 3 == np.sum(x == 0)
    assert 3 == np.sum(x == 2)

    # bins[-1] max value should remain in bin N,
    # not get pushed to outlier bin.

    # raises error if not provided with a track
    # (i.e. bedframe with a numeric fourth column)
    df_not_track = pd.DataFrame(
        [["chr1", 20, 40, "non-numeric"]],
        columns=["chrom", "start", "end", "value"],
    )
    with pytest.raises(ValueError):
        saddle.digitize(df_not_track, n_bins, vrange=(0, 2))

    df_not_track = pd.DataFrame(
        [[0, 20, 40, 0]],
        columns=["chrom", "start", "end", "value"],
    )
    with pytest.raises(ValueError):
        saddle.digitize(df_not_track, n_bins, vrange=(0, 2))

    # raises error if both or none of vrange, qrange provided
    with pytest.raises(ValueError):
        saddle.digitize(df, n_bins, vrange=(0, 2), qrange=(0.1, 0.9))
    with pytest.raises(ValueError):
        saddle.digitize(df, n_bins, vrange=None, qrange=None)

    # raises error if vrange lo>hi, qrange lo >hi, or qrange out of (0,1)
    with pytest.raises(ValueError):
        saddle.digitize(df, n_bins, vrange=(2, 1))
    with pytest.raises(ValueError):
        saddle.digitize(df, n_bins, qrange=(0, 2.1))
    with pytest.raises(ValueError):
        saddle.digitize(df, n_bins, qrange=(0.5, 0.25))
Example #23
    class FLOAT64(DataType, dtypes.Float):
        """Semantic representation of a :class:`pandas.Float64Dtype`."""

        type = pd.Float64Dtype()
        bit_width: int = 64
Example #24
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool8: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
}

if FLOAT_NAN_IMPLEMENTED:  # pragma: no cover
    _PYTHON_TO_SQL.update({
        pd.Float32Dtype(): SqlTypeName.FLOAT,
        pd.Float64Dtype(): SqlTypeName.FLOAT
    })

# Default mapping between SQL types and python types
# for values
_SQL_TO_PYTHON_SCALARS = {
    "DOUBLE": np.float64,
    "FLOAT": np.float32,
    "DECIMAL": np.float32,
    "BIGINT": np.int64,
    "INTEGER": np.int32,
    "SMALLINT": np.int16,
    "TINYINT": np.int8,
    "BOOLEAN": np.bool8,
    "VARCHAR": str,
    "CHAR": str,
Example #25
    bit_width: int = 8


# ###############################################################################
# # float
# ###############################################################################

_register_numpy_numbers(
    builtin_name="float",
    pandera_name="Float",
    sizes=[128, 64, 32, 16] if FLOAT_128_AVAILABLE else [64, 32, 16],
)

if PANDAS_1_2_0_PLUS:

    @Engine.register_dtype(equivalents=[pd.Float64Dtype, pd.Float64Dtype()])
    @immutable
    class FLOAT64(DataType, dtypes.Float):
        """Semantic representation of a :class:`pandas.Float64Dtype`."""

        type = pd.Float64Dtype()
        bit_width: int = 64

    @Engine.register_dtype(equivalents=[pd.Float32Dtype, pd.Float32Dtype()])
    @immutable
    class FLOAT32(FLOAT64):
        """Semantic representation of a :class:`pandas.Float32Dtype`."""

        type = pd.Float32Dtype()
        bit_width: int = 32
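Hedged sketch of what the registered equivalents enable from pandera's public API (the schema and column names here are illustrative): a column can be declared with the pandas nullable Float64 dtype directly.

import pandas as pd
import pandera as pa

schema = pa.DataFrameSchema({"x": pa.Column(pd.Float64Dtype(), nullable=True)})
df = pd.DataFrame({"x": pd.array([1.0, None], dtype="Float64")})
print(schema.validate(df)["x"].dtype)  # Float64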
Example #26
class DataMapping:
    """
    Map primary data between different supported data frameworks, preserving equivalent data types.

    DataMapping is for primary data; to map metadata types and values, use
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.TypeMapping>` and
    :py:class:`MetadataCodec <tracdap.rt.impl.type_system.MetadataCodec>`.
    """

    __log = _util.logger_for_namespace(_DataInternal.__module__ +
                                       ".DataMapping")

    # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

    __TRAC_DECIMAL_PRECISION = 38
    __TRAC_DECIMAL_SCALE = 12
    __TRAC_TIMESTAMP_UNIT = "ms"
    __TRAC_TIMESTAMP_ZONE = None

    __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
        _meta.BasicType.BOOLEAN: pa.bool_(),
        _meta.BasicType.INTEGER: pa.int64(),
        _meta.BasicType.FLOAT: pa.float64(),
        _meta.BasicType.DECIMAL: pa.decimal128(__TRAC_DECIMAL_PRECISION,
                                               __TRAC_DECIMAL_SCALE),
        _meta.BasicType.STRING: pa.utf8(),
        _meta.BasicType.DATE: pa.date32(),
        _meta.BasicType.DATETIME: pa.timestamp(__TRAC_TIMESTAMP_UNIT,
                                               __TRAC_TIMESTAMP_ZONE)
    }

    # Check that the Pandas dtypes for handling floats are available before setting up the type mapping
    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype

    # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
    __ARROW_TO_PANDAS_TYPE_MAPPING = {
        pa.bool_(): pd.BooleanDtype(),
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        pa.float16(): pd.Float32Dtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.utf8(): pd.StringDtype()
    }

    @staticmethod
    def arrow_to_python_type(arrow_type: pa.DataType) -> type:

        if pa.types.is_boolean(arrow_type):
            return bool

        if pa.types.is_integer(arrow_type):
            return int

        if pa.types.is_floating(arrow_type):
            return float

        if pa.types.is_decimal(arrow_type):
            return decimal.Decimal

        if pa.types.is_string(arrow_type):
            return str

        if pa.types.is_date(arrow_type):
            return dt.date

        if pa.types.is_timestamp(arrow_type):
            return dt.datetime

        raise _ex.ETracInternal(
            f"No Python type mapping available for Arrow type [{arrow_type}]")

    @classmethod
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:

        if python_type == bool:
            return pa.bool_()

        if python_type == int:
            return pa.int64()

        if python_type == float:
            return pa.float64()

        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                                 cls.__TRAC_DECIMAL_SCALE)

        if python_type == str:
            return pa.utf8()

        if python_type == dt.date:
            return pa.date32()

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT,
                                cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(
            f"No Arrow type mapping available for Python type [{python_type}]")

    @classmethod
    def trac_to_arrow_type(cls,
                           trac_type: _meta.TypeDescriptor) -> pa.DataType:

        return cls.trac_to_arrow_basic_type(trac_type.basicType)

    @classmethod
    def trac_to_arrow_basic_type(
            cls, trac_basic_type: _meta.BasicType) -> pa.DataType:

        arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(
            trac_basic_type)

        if arrow_type is None:
            raise _ex.ETracInternal(
                f"No Arrow type mapping available for TRAC type [{trac_basic_type}]"
            )

        return arrow_type

    @classmethod
    def trac_to_arrow_schema(cls,
                             trac_schema: _meta.SchemaDefinition) -> pa.Schema:

        if trac_schema.schemaType != _meta.SchemaType.TABLE:
            raise _ex.ETracInternal(
                f"Schema type [{trac_schema.schemaType}] cannot be converted for Apache Arrow"
            )

        arrow_fields = [(f.fieldName,
                         cls.trac_to_arrow_basic_type(f.fieldType))
                        for f in trac_schema.table.fields]

        return pa.schema(arrow_fields, metadata={})

    @classmethod
    def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:

        return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                             cls.__TRAC_DECIMAL_SCALE)

    @classmethod
    def pandas_datetime_type(cls):
        return cls.__PANDAS_DATETIME_TYPE

    @classmethod
    def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:

        deltas = view.parts.get(part)

        # Sanity checks

        if not view.arrow_schema:
            raise _ex.ETracInternal(f"Data view schema not set")

        if not deltas:
            raise _ex.ETracInternal(
                f"Data view for part [{part.opaque_key}] does not contain any items"
            )

        if len(deltas) == 1:
            return cls.item_to_pandas(deltas[0])

        batches = {
            batch
            for delta in deltas for batch in (
                delta.batches if delta.batches else delta.table.to_batches())
        }

        table = pa.Table.from_batches(batches)  # noqa
        return table.to_pandas()

    @classmethod
    def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:

        if item.pandas is not None:
            return item.pandas.copy()

        if item.table is not None:
            return cls.arrow_to_pandas(item.table)

        if item.batches is not None:
            table = pa.Table.from_batches(item.batches, item.schema)  # noqa
            return cls.arrow_to_pandas(table)

        raise _ex.ETracInternal(f"Data item does not contain any usable data")

    @classmethod
    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:

        return table.to_pandas(
            ignore_metadata=True,  # noqa
            date_as_object=False,  # noqa
            timestamp_as_object=False,  # noqa
            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get)

    @classmethod
    def pandas_to_view(cls, df: pd.DataFrame, prior_view: DataView,
                       part: DataPartKey):

        item = cls.pandas_to_item(df, prior_view.arrow_schema)
        return cls.add_item_to_view(prior_view, part, item)

    @classmethod
    def pandas_to_item(cls, df: pd.DataFrame,
                       schema: tp.Optional[pa.Schema]) -> DataItem:

        table = cls.pandas_to_arrow(df, schema)
        return DataItem(table.schema, table)

    @classmethod
    def pandas_to_arrow(cls,
                        df: pd.DataFrame,
                        schema: tp.Optional[pa.Schema] = None) -> pa.Table:

        # Here we convert the whole Pandas df and then pass it to conformance
        # An optimization would be to filter columns before applying conformance
        # To do this, we'd need the case-insensitive field matching logic, including output of warnings

        # Also, note that schema is not applied in from_pandas
        # This is because the conformance logic allows for a wider range of conversions
        # Applying the schema directly would fail for some types where casting is possible

        if len(df) == 0:
            df_schema = pa.Schema.from_pandas(df, preserve_index=False)  # noqa
            table = pa.Table.from_batches(list(), df_schema)  # noqa
        else:
            table = pa.Table.from_pandas(df, preserve_index=False)  # noqa

        # If there is no explicit schema, give back the table exactly as it was received from Pandas
        # There could be an option here to coerce types to the appropriate TRAC standard types
        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type

        if schema is None:
            return table
        else:
            return DataConformance.conform_to_schema(table, schema, df.dtypes)

    @classmethod
    def add_item_to_view(cls, view: DataView, part: DataPartKey,
                         item: DataItem) -> DataView:

        prior_deltas = view.parts.get(part) or list()
        deltas = [*prior_deltas, item]
        parts = {**view.parts, part: deltas}

        return DataView(view.trac_schema, view.arrow_schema, parts)
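Standalone sketch (simplified, outside the DataMapping class) of the zero-row branch in pandas_to_arrow: an empty batch list plus the schema inferred from the empty frame still yields a typed, zero-row Arrow table.

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"x": pd.Series(dtype="float64")})
df_schema = pa.Schema.from_pandas(df, preserve_index=False)
table = pa.Table.from_batches([], df_schema)
print(table.num_rows, table.schema.field("x").type)  # 0 double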