Example #1
def test_jamsframe_from_df():

    df = pd.DataFrame(data=[[0.0, 1.0, 'a', 0.0],
                            [1.0, 2.0, 'b', 0.0]],
                      columns=['time', 'duration', 'value', 'confidence'])

    jf = jams.JamsFrame.from_dataframe(df)

    # 1. type check
    assert isinstance(jf, jams.JamsFrame)

    # 2. check field order
    eq_(list(jf.keys().values),
        jams.JamsFrame.fields())

    # 3. check field types
    assert jf['time'].dtype == np.dtype('<m8[ns]')
    assert jf['duration'].dtype == np.dtype('<m8[ns]')

    # 4. Check the values
    eq_(list(jf['time']),
        list(pd.to_timedelta([0.0, 1.0], unit='s')))
    eq_(list(jf['duration']), 
        list(pd.to_timedelta([1.0, 2.0], unit='s')))
    eq_(list(jf['value']), ['a', 'b'])
    eq_(list(jf['confidence']), [0.0, 0.0])
Example #2
def _decode_datetime_with_pandas(flat_num_dates, units, calendar):
    if calendar not in _STANDARD_CALENDARS:
        raise OutOfBoundsDatetime(
            'Cannot decode times from a non-standard calendar, {!r}, using '
            'pandas.'.format(calendar))

    delta, ref_date = _unpack_netcdf_time_units(units)
    delta = _netcdf_to_numpy_timeunit(delta)
    try:
        ref_date = pd.Timestamp(ref_date)
    except ValueError:
        # ValueError is raised by pd.Timestamp for non-ISO timestamp
        # strings, in which case we fall back to using cftime
        raise OutOfBoundsDatetime

    # fixes: https://github.com/pydata/pandas/issues/14068
    # these lines check whether the lowest or the highest value in dates
    # causes an OutOfBoundsDatetime (overflow) error
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', 'invalid value encountered',
                                RuntimeWarning)
        pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
        pd.to_timedelta(flat_num_dates.max(), delta) + ref_date

    # Cast input dates to integers of nanoseconds because `pd.to_datetime`
    # works much faster when dealing with integers
    # make _NS_PER_TIME_DELTA an array to ensure type upcasting
    flat_num_dates_ns_int = (flat_num_dates.astype(np.float64) *
                             _NS_PER_TIME_DELTA[delta]).astype(np.int64)

    return (pd.to_timedelta(flat_num_dates_ns_int, 'ns') + ref_date).values
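
The fast path above boils down to timedelta arithmetic against a parsed reference date. A minimal, self-contained sketch of that idea, with made-up hourly offsets and reference date (not the xarray helpers themselves):

import numpy as np
import pandas as pd

# Hypothetical inputs: offsets in hours since a reference date.
flat_num_dates = np.array([0.0, 6.0, 12.0])
ref_date = pd.Timestamp('2000-01-01')
ns_per_hour = 3_600_000_000_000  # nanoseconds per hour

# Cast to integer nanoseconds first, mirroring the snippet above, since
# pd.to_timedelta is faster on integers than on floats.
num_dates_ns = (flat_num_dates * ns_per_hour).astype(np.int64)
dates = pd.to_timedelta(num_dates_ns, 'ns') + ref_date
print(dates)  # DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 06:00:00', ...])
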
Example #3
    def test_timedelta_ops_scalar(self):
        # GH 6808
        base = pd.to_datetime("20130101 09:01:12.123456")
        expected_add = pd.to_datetime("20130101 09:01:22.123456")
        expected_sub = pd.to_datetime("20130101 09:01:02.123456")

        for offset in [
            pd.to_timedelta(10, unit="s"),
            timedelta(seconds=10),
            np.timedelta64(10, "s"),
            np.timedelta64(10000000000, "ns"),
            pd.offsets.Second(10),
        ]:
            result = base + offset
            self.assertEqual(result, expected_add)

            result = base - offset
            self.assertEqual(result, expected_sub)

        base = pd.to_datetime("20130102 09:01:12.123456")
        expected_add = pd.to_datetime("20130103 09:01:22.123456")
        expected_sub = pd.to_datetime("20130101 09:01:02.123456")

        for offset in [
            pd.to_timedelta("1 day, 00:00:10"),
            pd.to_timedelta("1 days, 00:00:10"),
            timedelta(days=1, seconds=10),
            np.timedelta64(1, "D") + np.timedelta64(10, "s"),
            pd.offsets.Day() + pd.offsets.Second(10),
        ]:
            result = base + offset
            self.assertEqual(result, expected_add)

            result = base - offset
            self.assertEqual(result, expected_sub)
Example #4
def show_overlap_aligned(pc, mc, start=0, length_or_end=100,
                         **dvo_kwargs):
    """pc and mc are aligned dataframes
    start is either an integer (row number) or a string (datetime)
    length_or_end is either an integer (row numbers, or seconds) or a
        string (like '10 min', '5 s', or '12:45:43')

    dvo_kwargs are keyword arguments
        for utils.plotting.categorical.dummy_variable_overlaps
    """
    if isinstance(start, str):
        start = _read_or_infer_datetime(start, pc.index)

        if isinstance(length_or_end, str):
            try:
                end = _read_or_infer_datetime(length_or_end, pc.index)
            except ValueError:
                end = start + pd.to_timedelta(length_or_end)
        else:
            end = start + pd.to_timedelta(length_or_end, unit='s')

        pc_sub = pc[start:end]
        mc_sub = mc[start:end]
    else:
        end = start + length_or_end
        pc_sub = pc.iloc[start:end]
        mc_sub = mc.iloc[start:end]

    kwargs = {'drop_empty_rows': False,
              'drop_empty_cols': False}
    kwargs.update(dvo_kwargs)
    plotcat.dummy_variable_overlaps(pc_sub, mc_sub, 'pc', 'mc',
                                    x_label='AlmNr', y_label='Time',
                                    **kwargs)
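
The datetime-string branch above reduces to label slicing on a DatetimeIndex with an end point built via pd.to_timedelta. A small self-contained sketch of that slicing, using a made-up frame in place of the aligned pc/mc:

import pandas as pd

idx = pd.date_range('2020-01-01 12:00:00', periods=120, freq='s')
pc = pd.DataFrame({'AlmNr': range(120)}, index=idx)

start = pd.Timestamp('2020-01-01 12:00:30')
end = start + pd.to_timedelta('10 s')   # string offset, as in the except branch
print(len(pc[start:end]))               # 11 rows: label slices include both endpoints
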
Example #5
def test_val_to_num():
    assert val_to_num('7') == 7
    assert val_to_num('.7') == .7
    assert val_to_num('0.7') == .7
    assert val_to_num('07') == 7
    assert val_to_num('0') == 0
    assert val_to_num('00') == 0
    assert val_to_num('-20') == -20
    assert val_to_num(7) == 7
    assert val_to_num(0.7) == 0.7
    assert val_to_num(0) == 0
    assert val_to_num('NOW') == 'NOW'
    assert val_to_num('now') == 'now'
    assert val_to_num('TODAY') == 'TODAY'
    assert val_to_num('') == ''
    assert val_to_num('2018-10-10') == pd.to_datetime('2018-10-10')
    assert val_to_num('2018-10-09') == pd.to_datetime('2018-10-09')
    assert val_to_num('2017-12') == pd.to_datetime('2017-12')
    assert val_to_num('5e+6') == 5e6
    assert val_to_num('5e-6') == 5e-6
    assert val_to_num('0xabc') == '0xabc'
    assert val_to_num('hello world') == 'hello world'
    # The following tests document an idiosyncrasy of val_to_num which is difficult
    # to avoid while timedeltas are supported.
    assert val_to_num('50+20') == pd.to_timedelta('50+20')
    assert val_to_num('50-20') == pd.to_timedelta('50-20')
Example #6
File: core.py  Project: hendriks73/jams
    def __init__(self, data=None, index=None, columns=None, dtype=None):
        '''Construct a new JamsFrame object.

        Parameters
        ----------
        data
            Optional data for the new JamsFrame, in any format supported
            by `pandas.DataFrame.__init__`.

            Fields must be `['time', 'duration', 'value', 'confidence']`.

            `time` and `duration` fields must be floating point types,
            measured in seconds.

        index
            Optional index on `data`.

        columns
        dtype
            These parameters are ignored by JamsFrame, but are allowed
            for API compatibility with `pandas.DataFrame`.

        See Also
        --------
        from_dict
        from_dataframe
        pandas.DataFrame.__init__

        '''
        super(JamsFrame, self).__init__(data=data, index=index,
                                        columns=self.fields())

        self.time = pd.to_timedelta(self.time, unit='s')
        self.duration = pd.to_timedelta(self.duration, unit='s')
Example #7
def readCSV(allFiles, saveCSV):
    """ Read CSV files that record piston dendrometer displacement """ 
    dfs = dict() # create a blank dictionary

    for file_ in allFiles:
        # Key: filename; Value: pandas data frame
        dfs[file_] = pd.read_csv(file_, header=None, parse_dates={"Year" : [1]})
        # Parse the year / day of year / time column to a single datetime64 index
        dfs[file_].index =(dfs[file_].Year +
                           pd.to_timedelta(dfs[file_][2],unit='D') +  
                           pd.to_timedelta(dfs[file_][3]//100-1,unit='H') + 
                           #loggers on DLS; minus one hour to set to UTC-8
                           pd.to_timedelta(dfs[file_][3]%100,unit='m'))
        del dfs[file_]['Year'], dfs[file_][0], dfs[file_][2], dfs[file_][3]
        
        # Relabel columns with file name + column number + 1
        # to match the campbell SE channel in the data logger
        for column in dfs[file_]:
            dfs[file_].rename(columns = {column : file_ + 
                                         str(dfs[file_].columns.get_loc(column) + 1)}, 
                              inplace=True)
        
    # Merge each dataframe in the dictionary, by datetime stamp
    merge = functools.partial(pd.merge, left_index=True, right_index=True, how='outer')
    radius = functools.reduce(merge, dfs.values())


    if saveCSV:
        radius.to_csv('Data\merged_radius.csv')

    return radius
Example #8
 def test_infer_timedelta_units(self):
     for deltas, expected in [
             (pd.to_timedelta(['1 day', '2 days']), 'days'),
             (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'),
             (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'),
             (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]:
         assert expected == coding.times.infer_timedelta_units(deltas)
Example #9
File: core.py  Project: hendriks73/jams
    def from_dataframe(cls, frame):
        '''Convert a pandas DataFrame into a JamsFrame.

        Note: this operation is destructive, in that the input
        DataFrame will have its type and data altered.

        Parameters
        ----------
        frame : pandas.DataFrame
            The input DataFrame.  Must have the appropriate JamsFrame fields:
            'time', 'duration', 'value', and 'confidence'.

            'time' and 'duration' fields should be of type `float` and measured
            in seconds.

        Returns
        -------
        jams_frame : JamsFrame
            The input `frame` modified to form a JamsFrame.

        See Also
        --------
        from_dict
        '''
        # Encode time properly
        frame.time = pd.to_timedelta(frame.time, unit='s')
        frame.duration = pd.to_timedelta(frame.duration, unit='s')

        # Properly order the columns
        frame = frame[cls.fields()]

        # Clobber the class attribute
        frame.__class__ = cls
        return frame
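
As the docstring notes, the conversion is just pd.to_timedelta applied in place to float-second columns. A tiny stand-alone illustration of that step on a plain DataFrame (without the JamsFrame class itself):

import pandas as pd

frame = pd.DataFrame({'time': [0.0, 1.5], 'duration': [1.0, 0.5],
                      'value': ['a', 'b'], 'confidence': [1.0, 1.0]})
frame.time = pd.to_timedelta(frame.time, unit='s')
frame.duration = pd.to_timedelta(frame.duration, unit='s')
print(frame.dtypes)  # time and duration become timedelta64[ns]
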
Example #10
    def test_timedelta_ops_scalar(self):
        # GH 6808
        base = pd.to_datetime('20130101 09:01:12.123456')
        expected_add = pd.to_datetime('20130101 09:01:22.123456')
        expected_sub = pd.to_datetime('20130101 09:01:02.123456')

        for offset in [pd.to_timedelta(10, unit='s'), timedelta(seconds=10),
                       np.timedelta64(10, 's'),
                       np.timedelta64(10000000000, 'ns'),
                       pd.offsets.Second(10)]:
            result = base + offset
            assert result == expected_add

            result = base - offset
            assert result == expected_sub

        base = pd.to_datetime('20130102 09:01:12.123456')
        expected_add = pd.to_datetime('20130103 09:01:22.123456')
        expected_sub = pd.to_datetime('20130101 09:01:02.123456')

        for offset in [pd.to_timedelta('1 day, 00:00:10'),
                       pd.to_timedelta('1 days, 00:00:10'),
                       timedelta(days=1, seconds=10),
                       np.timedelta64(1, 'D') + np.timedelta64(10, 's'),
                       pd.offsets.Day() + pd.offsets.Second(10)]:
            result = base + offset
            assert result == expected_add

            result = base - offset
            assert result == expected_sub
Example #11
File: time.py  Project: EQ4/muda
    def deform_annotation(self, annotation, state):
        '''Deform the annotation'''

        track_duration = state['duration']

        # Get the time in seconds
        t = np.asarray([x.total_seconds() for x in annotation.data.time])
        if self.time:
            # Deform
            t += np.random.normal(loc=self.mean,
                                  scale=self.sigma,
                                  size=t.shape)

        # Clip to the track duration
        t = np.clip(t, 0, track_duration)
        annotation.data.time = pd.to_timedelta(t, unit='s')

        # Get the time in seconds
        d = np.asarray([x.total_seconds() for x in annotation.data.duration])
        if self.duration:
            # Deform
            d += np.random.normal(loc=self.mean,
                                  scale=self.sigma,
                                  size=d.shape)

        # Clip to the track duration - interval start
        d = [np.clip(d_i, 0, track_duration - t_i) for (d_i, t_i) in zip(d, t)]
        annotation.data.duration = pd.to_timedelta(d, unit='s')
Example #12
    def test_nat_converters(self):
        assert to_timedelta('nat', box=False).astype('int64') == iNaT
        assert to_timedelta('nan', box=False).astype('int64') == iNaT

        def testit(unit, transform):

            # array
            result = to_timedelta(np.arange(5), unit=unit)
            expected = TimedeltaIndex([np.timedelta64(i, transform(unit))
                                       for i in np.arange(5).tolist()])
            tm.assert_index_equal(result, expected)

            # scalar
            result = to_timedelta(2, unit=unit)
            expected = Timedelta(np.timedelta64(2, transform(unit)).astype(
                'timedelta64[ns]'))
            assert result == expected

        # validate all units
        # GH 6855
        for unit in ['Y', 'M', 'W', 'D', 'y', 'w', 'd']:
            testit(unit, lambda x: x.upper())
        for unit in ['days', 'day', 'Day', 'Days']:
            testit(unit, lambda x: 'D')
        for unit in ['h', 'm', 's', 'ms', 'us', 'ns', 'H', 'S', 'MS', 'US',
                     'NS']:
            testit(unit, lambda x: x.lower())

        # offsets

        # m
        testit('T', lambda x: 'm')

        # ms
        testit('L', lambda x: 'ms')
Example #13
    def test_timedelta(self):
        converter = lambda x: pd.to_timedelta(x, unit="ms")

        s = Series([timedelta(23), timedelta(seconds=5)])
        self.assertEqual(s.dtype, "timedelta64[ns]")
        # index will be float dtype
        assert_series_equal(s, pd.read_json(s.to_json(), typ="series").apply(converter), check_index_type=False)

        s = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1], dtype=float))
        self.assertEqual(s.dtype, "timedelta64[ns]")
        assert_series_equal(s, pd.read_json(s.to_json(), typ="series").apply(converter))

        frame = DataFrame([timedelta(23), timedelta(seconds=5)])
        self.assertEqual(frame[0].dtype, "timedelta64[ns]")
        assert_frame_equal(
            frame, pd.read_json(frame.to_json()).apply(converter), check_index_type=False, check_column_type=False
        )

        frame = DataFrame(
            {
                "a": [timedelta(days=23), timedelta(seconds=5)],
                "b": [1, 2],
                "c": pd.date_range(start="20130101", periods=2),
            }
        )

        result = pd.read_json(frame.to_json(date_unit="ns"))
        result["a"] = pd.to_timedelta(result.a, unit="ns")
        result["c"] = pd.to_datetime(result.c)
        assert_frame_equal(frame, result, check_index_type=False)
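
The converter trick used above (round-tripping timedeltas through JSON as integer milliseconds) can be reproduced on its own; a minimal sketch, assuming default to_json settings:

from io import StringIO
from datetime import timedelta

import pandas as pd

s = pd.Series([timedelta(seconds=5), timedelta(days=1)])
payload = s.to_json()  # timedeltas are written as integer milliseconds
back = pd.read_json(StringIO(payload), typ='series').apply(
    lambda x: pd.to_timedelta(x, unit='ms'))
print(back.dtype)      # timedelta64[ns]
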
Example #14
    def test_timedelta_ops(self):
        _skip_if_numpy_not_friendly()

        # GH4984
        # make sure ops return timedeltas
        s = Series([Timestamp('20130101') + timedelta(seconds=i*i) for i in range(10) ])
        td = s.diff()

        result = td.mean()[0]
        # TODO This should have returned a scalar to begin with. Hack for now.
        expected = to_timedelta(timedelta(seconds=9))
        tm.assert_almost_equal(result, expected)

        result = td.quantile(.1)
        # This properly returned a scalar.
        expected = to_timedelta('00:00:02.6')
        tm.assert_almost_equal(result, expected)

        result = td.median()[0]
        # TODO This should have returned a scalar to begin with. Hack for now.
        expected = to_timedelta('00:00:08')
        tm.assert_almost_equal(result, expected)

        # GH 6462
        # consistency in returned values for sum
        result = td.sum()[0]
        expected = to_timedelta('00:01:21')
        tm.assert_almost_equal(result, expected)
Example #15
    def test_timedelta_ops_scalar(self):
        _skip_if_numpy_not_friendly()

        # GH 6808
        base = pd.to_datetime('20130101 09:01:12.123456')
        expected_add = pd.to_datetime('20130101 09:01:22.123456')
        expected_sub = pd.to_datetime('20130101 09:01:02.123456')

        for offset in [pd.to_timedelta(10, unit='s'),
                       timedelta(seconds=10),
                       np.timedelta64(10, 's'),
                       np.timedelta64(10000000000, 'ns'),
                       pd.offsets.Second(10)]:
            result = base + offset
            self.assertEquals(result, expected_add)

            result = base - offset
            self.assertEquals(result, expected_sub)

        base = pd.to_datetime('20130102 09:01:12.123456')
        expected_add = pd.to_datetime('20130103 09:01:22.123456')
        expected_sub = pd.to_datetime('20130101 09:01:02.123456')

        for offset in [pd.to_timedelta('1 day, 00:00:10'),
                       pd.to_timedelta('1 days, 00:00:10'),
                       timedelta(days=1, seconds=10),
                       np.timedelta64(1, 'D') + np.timedelta64(10, 's'),
                       pd.offsets.Day() + pd.offsets.Second(10)]:
            result = base + offset
            self.assertEquals(result, expected_add)

            result = base - offset
            self.assertEquals(result, expected_sub)
Example #16
 def test_infer_timedelta_units(self):
     for deltas, expected in [
             (pd.to_timedelta(['1 day', '2 days']), 'days'),
             (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'),
             (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'),
             (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]:
         self.assertEqual(expected, conventions.infer_timedelta_units(deltas))
Example #17
    def test_timedelta(self):
        converter = lambda x: pd.to_timedelta(x, unit='ms')

        s = Series([timedelta(23), timedelta(seconds=5)])
        self.assertEqual(s.dtype, 'timedelta64[ns]')

        result = pd.read_json(s.to_json(), typ='series').apply(converter)
        assert_series_equal(result, s)

        s = Series([timedelta(23), timedelta(seconds=5)],
                   index=pd.Index([0, 1]))
        self.assertEqual(s.dtype, 'timedelta64[ns]')
        result = pd.read_json(s.to_json(), typ='series').apply(converter)
        assert_series_equal(result, s)

        frame = DataFrame([timedelta(23), timedelta(seconds=5)])
        self.assertEqual(frame[0].dtype, 'timedelta64[ns]')
        assert_frame_equal(frame, pd.read_json(frame.to_json())
                           .apply(converter))

        frame = DataFrame({'a': [timedelta(days=23), timedelta(seconds=5)],
                           'b': [1, 2],
                           'c': pd.date_range(start='20130101', periods=2)})

        result = pd.read_json(frame.to_json(date_unit='ns'))
        result['a'] = pd.to_timedelta(result.a, unit='ns')
        result['c'] = pd.to_datetime(result.c)
        assert_frame_equal(frame, result)
Example #18
def compute_sparseness(row, ds_sep):
    """
    Computes the "sparseness" of a sparse row. There are three levels of sparseness:
    SPARSE1: SPARSITY_MIN <= row.mean_sep / ds_sep < SPARSITY_MID
    SPARSE2: SPARSITY_MID <= row.mean_sep / ds_sep < SPARSITY_MAX
    SPARSE3: SPARSITY_MAX <= row.mean_sep / ds_sep
    :param row: The dataset row containing the sparse data
    :param ds_sep: average separation of data points in the dataset
    :return: The appropriate "sparseness" level
    """
    # calculate the row's data density ratio
    if 'mean_sep' in row:
        sep_ratio = row.mean_sep / pd.to_timedelta(ds_sep, 's')
    else:
        # calculate the density from the row
        interval = row.last - row.first
        mean_sep = interval / row.count
        sep_ratio = mean_sep / pd.to_timedelta(ds_sep, 's')

    # in case this method is called on a row that isn't sparse, default to having data present.
    ret_val = PRESENT

    if sep_ratio >= SPARSITY_MAX:
        ret_val = SPARSE3
    elif sep_ratio >= SPARSITY_MID:
        ret_val = SPARSE2
    elif sep_ratio >= SPARSITY_MIN:
        ret_val = SPARSE1
    return ret_val
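
The ratio the docstring describes is just a Timedelta divided by a Timedelta; a quick, self-contained sketch with made-up timestamps and an assumed 60-second dataset separation:

import pandas as pd

times = pd.to_datetime(['2021-01-01 00:00', '2021-01-01 00:05', '2021-01-01 00:12'])
mean_sep = (times[-1] - times[0]) / (len(times) - 1)  # average gap between points
sep_ratio = mean_sep / pd.to_timedelta(60, 's')       # compare against the dataset separation
print(sep_ratio)  # 6.0, so this row would land in whichever SPARSITY_* band 6.0 falls into
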
Example #19
def ymd_to_dt(df, utc=True):
    return (pd.to_datetime(df["year"].astype(str) + "-"
                           + df["month"].astype(str) + "-"
                           + df["day"].astype(str),
                           utc=utc)
            + pd.to_timedelta(df["hour"].astype(str) + "H")
            + pd.to_timedelta(df["minute"].astype(str) + "M")
            + pd.to_timedelta(df["second"].astype(str) + "S"))
Example #20
    def test_nat_converters(self):
        result = to_timedelta('nat', box=False)
        assert result.dtype.kind == 'm'
        assert result.astype('int64') == iNaT

        result = to_timedelta('nan', box=False)
        assert result.dtype.kind == 'm'
        assert result.astype('int64') == iNaT
Example #21
    def test_nat_converters(self):
        result = to_timedelta('nat').to_numpy()
        assert result.dtype.kind == 'M'
        assert result.astype('int64') == iNaT

        result = to_timedelta('nan').to_numpy()
        assert result.dtype.kind == 'M'
        assert result.astype('int64') == iNaT
Example #22
File: times.py  Project: benbovy/xarray
def decode_cf_datetime(num_dates, units, calendar=None):
    """Given an array of numeric dates in netCDF format, convert it into a
    numpy array of date time objects.

    For standard (Gregorian) calendars, this function uses vectorized
    operations, which makes it much faster than cftime.num2date. In such a
    case, the returned array will be of type np.datetime64.

    Note that the time unit in `units` must not be smaller than microseconds
    and not larger than days.

    See also
    --------
    cftime.num2date
    """
    num_dates = np.asarray(num_dates)
    flat_num_dates = num_dates.ravel()
    if calendar is None:
        calendar = 'standard'

    delta, ref_date = _unpack_netcdf_time_units(units)

    try:
        if calendar not in _STANDARD_CALENDARS:
            raise OutOfBoundsDatetime

        delta = _netcdf_to_numpy_timeunit(delta)
        try:
            ref_date = pd.Timestamp(ref_date)
        except ValueError:
            # ValueError is raised by pd.Timestamp for non-ISO timestamp
            # strings, in which case we fall back to using cftime
            raise OutOfBoundsDatetime

        # fixes: https://github.com/pydata/pandas/issues/14068
        # these lines check whether the lowest or the highest value in dates
        # causes an OutOfBoundsDatetime (overflow) error
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', 'invalid value encountered',
                                    RuntimeWarning)
            pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
            pd.to_timedelta(flat_num_dates.max(), delta) + ref_date

        # Cast input dates to integers of nanoseconds because `pd.to_datetime`
        # works much faster when dealing with integers
        # make _NS_PER_TIME_DELTA an array to ensure type upcasting
        flat_num_dates_ns_int = (flat_num_dates.astype(np.float64) *
                                 _NS_PER_TIME_DELTA[delta]).astype(np.int64)

        dates = (pd.to_timedelta(flat_num_dates_ns_int, 'ns') +
                 ref_date).values

    except (OutOfBoundsDatetime, OverflowError):
        dates = _decode_datetime_with_cftime(
            flat_num_dates.astype(np.float), units, calendar)

    return dates.reshape(num_dates.shape)
Example #23
    def test_to_timedelta_box_deprecated(self):
        result = np.timedelta64(0, 'ns')

        # Deprecated - see GH24416
        with tm.assert_produces_warning(FutureWarning):
            to_timedelta(0, box=False)

        expected = to_timedelta(0).to_timedelta64()
        assert result == expected
Example #24
    def test_timedelta_ops(self):
        # GH#4984
        # make sure ops return Timedelta
        s = Series([Timestamp('20130101') + timedelta(seconds=i * i)
                    for i in range(10)])
        td = s.diff()

        result = td.mean()
        expected = to_timedelta(timedelta(seconds=9))
        assert result == expected

        result = td.to_frame().mean()
        assert result[0] == expected

        result = td.quantile(.1)
        expected = Timedelta(np.timedelta64(2600, 'ms'))
        assert result == expected

        result = td.median()
        expected = to_timedelta('00:00:09')
        assert result == expected

        result = td.to_frame().median()
        assert result[0] == expected

        # GH#6462
        # consistency in returned values for sum
        result = td.sum()
        expected = to_timedelta('00:01:21')
        assert result == expected

        result = td.to_frame().sum()
        assert result[0] == expected

        # std
        result = td.std()
        expected = to_timedelta(Series(td.dropna().values).std())
        assert result == expected

        result = td.to_frame().std()
        assert result[0] == expected

        # invalid ops
        for op in ['skew', 'kurt', 'sem', 'prod']:
            msg = "reduction operation '{}' not allowed for this dtype"
            with pytest.raises(TypeError, match=msg.format(op)):
                getattr(td, op)()

        # GH#10040
        # make sure NaT is properly handled by median()
        s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07')])
        assert s.diff().median() == timedelta(days=4)

        s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'),
                    Timestamp('2015-02-15')])
        assert s.diff().median() == timedelta(days=6)
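
The reductions exercised in this test can be tried directly on a small timedelta Series; a quick sketch with made-up values:

import pandas as pd

td = pd.Series(pd.to_timedelta([1, 2, 6], unit='s'))
print(td.mean())    # 0 days 00:00:03
print(td.median())  # 0 days 00:00:02
print(td.sum())     # 0 days 00:00:09
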
Example #25
    def test_contains(self):
        # Checking for any NaT-like objects
        # GH 13603
        td = to_timedelta(range(5), unit='d') + pd.offsets.Hour(1)
        for v in [pd.NaT, None, float('nan'), np.nan]:
            assert not (v in td)

        td = to_timedelta([pd.NaT])
        for v in [pd.NaT, None, float('nan'), np.nan]:
            assert (v in td)
Example #26
File: time.py  Project: EQ4/muda
    def deform_times(ann, state):
        '''Deform time values for all annotations.'''

        ann.data.time = [pd.to_timedelta(x.total_seconds() / state['rate'],
                                         unit='s')
                         for x in ann.data.time]

        ann.data.duration = [pd.to_timedelta(x.total_seconds() / state['rate'],
                                             unit='s')
                             for x in ann.data.duration]
Example #27
    def get_timeseries(self, tail_n=-1):
        """ Convert the captured values into a pandas.Series with a
            TimedeltaIndex. If `tail_n` is positive, only the last
            `tail_n` samples are returned.
        """
        # Clamp the requested tail length to the number of captured samples.
        tail_n = min(len(self.values), tail_n)

        if tail_n > 0:
            return pd.Series(self.values[-tail_n:],
                             index=pd.to_timedelta(self.indices[-tail_n:], unit="s"))
        else:
            return pd.Series(self.values,
                             index=pd.to_timedelta(self.indices, unit="s"))
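
For reference, the Series construction the method relies on works the same with plain lists; a minimal sketch with hypothetical sample values at second offsets:

import pandas as pd

values = [0.1, 0.4, 0.9]
indices = [0, 1, 2]  # seconds since capture started
ts = pd.Series(values, index=pd.to_timedelta(indices, unit='s'))
print(ts.index)  # TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02'], ...)
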
Example #28
    def __test(data):
        ann = Annotation(namespace='onset')

        # Bypass the safety checks in add_observation
        ann.data.loc[0] = {'time': pd.to_timedelta(data['time'], unit='s'),
                           'duration': pd.to_timedelta(data['duration'],
                                                       unit='s'),
                           'value': None,
                           'confidence': None}

        ann.validate()
Example #29
    def test_timedelta_ops(self):
        # GH4984
        # make sure ops return Timedelta
        s = Series([Timestamp('20130101') + timedelta(seconds=i * i)
                    for i in range(10)])
        td = s.diff()

        result = td.mean()
        expected = to_timedelta(timedelta(seconds=9))
        self.assertEqual(result, expected)

        result = td.to_frame().mean()
        self.assertEqual(result[0], expected)

        result = td.quantile(.1)
        expected = Timedelta(np.timedelta64(2600, 'ms'))
        self.assertEqual(result, expected)

        result = td.median()
        expected = to_timedelta('00:00:09')
        self.assertEqual(result, expected)

        result = td.to_frame().median()
        self.assertEqual(result[0], expected)

        # GH 6462
        # consistency in returned values for sum
        result = td.sum()
        expected = to_timedelta('00:01:21')
        self.assertEqual(result, expected)

        result = td.to_frame().sum()
        self.assertEqual(result[0], expected)

        # std
        result = td.std()
        expected = to_timedelta(Series(td.dropna().values).std())
        self.assertEqual(result, expected)

        result = td.to_frame().std()
        self.assertEqual(result[0], expected)

        # invalid ops
        for op in ['skew', 'kurt', 'sem', 'prod']:
            self.assertRaises(TypeError, getattr(td, op))

        # GH 10040
        # make sure NaT is properly handled by median()
        s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07')])
        self.assertEqual(s.diff().median(), timedelta(days=4))

        s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'),
                    Timestamp('2015-02-15')])
        self.assertEqual(s.diff().median(), timedelta(days=6))
Example #30
        def testit(unit, transform):

            # array
            result = to_timedelta(np.arange(5), unit=unit)
            expected = Series([np.timedelta64(i, transform(unit))
                               for i in np.arange(5).tolist()])
            tm.assert_series_equal(result, expected)

            # scalar
            result = to_timedelta(2, unit=unit)
            expected = np.timedelta64(2, transform(unit)).astype('timedelta64[ns]')
            self.assert_numpy_array_equal(result, expected)
Example #31
    _, window, cutoff_date_str = sys.argv
    cutoff_date = pd.to_datetime(cutoff_date_str)

    # gsheet: Forecasting/Tix Targets
    # only on rolling forecasts
    fin = '~/Forecasts/rolling/par/adj_r_xls_' + window + '_' + cutoff_date_str + '.par'  # input file
    f_df = pd.read_parquet(fin)  # load last adjusted fcast
    if f_df is None:
        s_ut.my_print('ERROR: could not find ' + fin)
        sys.exit()

    # week_starting patch
    df_cols_ = f_df.columns
    if 'ds_week_ending' in df_cols_ and 'ds_week_starting' not in df_cols_:
        f_df['ds_week_ending'] = pd.to_datetime(f_df['ds_week_ending'])
        f_df['ds_week_starting'] = f_df['ds_week_ending'] - pd.to_timedelta(
            6, unit='D')

    dates = [
        str(pd.to_datetime(x).date())
        for x in f_df['ds_week_starting'].unique()
    ]

    # China
    print('************** China ***************')
    cn_plan = prepare_plan('Initiatives - China.csv', cutoff_date, True)
    cn_df = adjust_fcast(cn_plan, f_df, 'China')

    # Homes
    print('************** Homes ***************')
    hm_plan = prepare_plan('Initiatives - Homes.csv', cutoff_date, False)
    print('tix: ' + str(f_df[
Example #32
class TestDataFrameUnaryOperators(object):
    # __pos__, __neg__, __inv__

    @pytest.mark.parametrize(
        'df,expected',
        [(pd.DataFrame({'a': [-1, 1]}), pd.DataFrame({'a': [1, -1]})),
         (pd.DataFrame({'a': [False, True]}), pd.DataFrame(
             {'a': [True, False]})),
         (pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}),
          pd.DataFrame({'a': pd.Series(pd.to_timedelta([1, -1]))}))])
    def test_neg_numeric(self, df, expected):
        assert_frame_equal(-df, expected)
        assert_series_equal(-df['a'], expected['a'])

    @pytest.mark.parametrize('df, expected', [
        (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)),
        ([Decimal('1.0'), Decimal('2.0')], [Decimal('-1.0'),
                                            Decimal('-2.0')]),
    ])
    def test_neg_object(self, df, expected):
        # GH#21380
        df = pd.DataFrame({'a': df})
        expected = pd.DataFrame({'a': expected})
        assert_frame_equal(-df, expected)
        assert_series_equal(-df['a'], expected['a'])

    @pytest.mark.parametrize('df', [
        pd.DataFrame({'a': ['a', 'b']}),
        pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}),
    ])
    def test_neg_raises(self, df):
        with pytest.raises(TypeError):
            (-df)
        with pytest.raises(TypeError):
            (-df['a'])

    def test_invert(self):
        _seriesd = tm.getSeriesData()
        df = pd.DataFrame(_seriesd)

        assert_frame_equal(-(df < 0), ~(df < 0))

    @pytest.mark.parametrize('df', [
        pd.DataFrame({'a': [-1, 1]}),
        pd.DataFrame({'a': [False, True]}),
        pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}),
    ])
    def test_pos_numeric(self, df):
        # GH#16073
        assert_frame_equal(+df, df)
        assert_series_equal(+df['a'], df['a'])

    @pytest.mark.parametrize(
        'df',
        [
            # numpy changing behavior in the future
            pytest.param(pd.DataFrame({'a': ['a', 'b']}),
                         marks=[pytest.mark.filterwarnings("ignore")]),
            pd.DataFrame({'a': np.array([-1, 2], dtype=object)}),
            pd.DataFrame({'a': [Decimal('-1.0'),
                                Decimal('2.0')]}),
        ])
    def test_pos_object(self, df):
        # GH#21380
        assert_frame_equal(+df, df)
        assert_series_equal(+df['a'], df['a'])

    @pytest.mark.parametrize('df', [
        pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}),
    ])
    def test_pos_raises(self, df):
        with pytest.raises(TypeError):
            (+df)
        with pytest.raises(TypeError):
            (+df['a'])
Example #33
 def to_Timedelta(days: int) -> _pd.Timedelta:
     return _pd.to_timedelta(str(days) + 'D')
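
A call such as the following (using the same string-building approach as the helper above) yields a whole-day Timedelta:

import pandas as pd

print(pd.to_timedelta(str(3) + 'D'))  # 3 days 00:00:00
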
Example #34
     ],
 ),
 html.Div(
     id="info-container",
     className="six columns",
     children=[
         html.Div(
             id="dropout1",
             className="six columns inner-row",
             style={'width': '35%'},
             children=[
                 html.Div(children='''Период'''),
                 dcc.DatePickerRange(
                     id='date-picker-range',
                     start_date=datetime.now() -
                     pd.to_timedelta('2H'),
                     display_format='DD-MM-YYYY',
                     end_date=datetime.now() +
                     pd.to_timedelta('0.5H'),
                 ),
             ],
         ),
         html.Div(
             id="subject-dropout-box",
             className="six columns",
             style={'width': '20%'},
             children=[
                 html.Div(children='''Сервис'''),
                 dcc.Dropdown(
                     id="subject-dropout",
                     options=[{
Example #35
    data = data[['TowerLon', 'TowerLat']]
    from sklearn.cluster import KMeans
    model = KMeans(n_clusters=clusters)
    model.fit(data)

    return model


#
# TODO: Load up the dataset and take a peek at its head and dtypes.
# Convert the date using pd.to_datetime, and the time using pd.to_timedelta
#
# .. your code here ..
df = pd.read_csv('Datasets/CDR.csv')
df['CallDate'] = pd.to_datetime(df['CallDate'])
df['CallTime'] = pd.to_timedelta(df['CallTime'])

#
# TODO: Create a unique list of the phone-number values (users) stored in the
# "In" column of the dataset, and save it to a variable called `unique_numbers`.
# Manually check through unique_numbers to ensure the numbers appear in the
# same (unique) order as they appear in your dataset:
#
# .. your code here ..
In = df['In']
unique_numbers = In.unique()

#
# INFO: The locations map above should be too "busy" to really wrap your head around. This
# is where domain expertise comes into play. Your intuition tells you that people are likely
# to behave differently on weekends:
Example #36
    # Store the destination and starting cells
    chunk["DESTINATION"] = chunk.GRID_POLYLINE.map(lambda x: x[-1])
    chunk["START_CELL"] = chunk.GRID_POLYLINE.map(lambda x: x[0])
    
    # Loop through the cutoff dates. For every cutoff date:
    # 1. The active trips are removed from the chunk (training) set
    # 2. These trips are truncated and saved into the test set.
    # 3. Iteratively, the size of chunk is reduced until we have passed
    #    through all the cutoff dates.
    # 4. This final remainder forms the training set.
    
    for cutoff_date in cutoff_dates:
      # Allocate the inactive trips to the training set. Add 30 seconds for
      # the boundary cases (if the time difference between cutoff_date and
      # TIMESTAMP is less than 30 seconds, there is not enough data to truncate).
      active = (((chunk.TIMESTAMP + pd.to_timedelta(30, unit='s')) <= cutoff_date) &
                ((chunk.TIMESTAMP + pd.to_timedelta(chunk.DURATION, unit='s')) >= cutoff_date))
      
      # For the active trips, the trip is truncated at the cutoff time
      if np.sum(active) > 0:
        validation = chunk[active].reset_index(drop = True)

        # Compute elapsed time in seconds
        elapsed = np.abs((cutoff_date.astype(np.int64) - (validation.TIMESTAMP.astype(np.int64) // 10 ** 9))) # astype(np.int64) returns unix in nanoseconds!
        
        # Get the (integer) cutoff point from the elapsed time. (15 seconds between each measurement)
        validation["CUTOFF"] = np.floor(elapsed/15).astype(int) 
        
        # Truncate the paths (UGLY WAY)
        validation["TRUNC_POLYLINE"] = None
        validation["TRUNC_GRID_POLYLINE"] = None
        
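The cutoff arithmetic in this loop reduces to elapsed seconds divided by the 15-second sampling interval; a small stand-alone sketch of that step with hypothetical timestamps:

import numpy as np
import pandas as pd

timestamp = pd.Timestamp('2014-07-01 12:00:00')   # trip start
cutoff_date = pd.Timestamp('2014-07-01 12:03:45')

elapsed = (cutoff_date - timestamp).total_seconds()  # 225 seconds elapsed
cutoff_point = int(np.floor(elapsed / 15))           # index of the last observed GPS point
print(cutoff_point)  # 15
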
Example #37
def main(loc_id, loc_name, output_version):
    print('Reading in short-term outcomes...')
    ## Read in short-term outcomes
    # region -------------------------------------------------------------------

    # Durations and proportions
    dp = pd.read_csv(
        '{}WORK/12_bundle/covid/data/long_covid/long_covid_proportions_durations_with_overlaps.csv'
        .format(roots['j']))

    # Mild/Moderate
    print('  mild/moderate...')
    midmod = Dataset(loc_id,
                     loc_name,
                     output_version,
                     'midmod',
                     nf_type='long')

    # Hospital
    print('  hospital...')
    hospital = Dataset(loc_id,
                       loc_name,
                       output_version,
                       'hsp_admit',
                       nf_type='long')

    # Icu
    print('  icu...')
    icu = Dataset(loc_id,
                  loc_name,
                  output_version,
                  'icu_admit',
                  nf_type='long')

    # endregion ----------------------------------------------------------------

    print('Calculating mild/moderate incidence & prevalence...')
    ## Mild/Moderate Incidence & Prevalence
    # region -------------------------------------------------------------------
    # Shift hospitalizations 7 days
    lag_hsp = copy.deepcopy(hospital)
    lag_hsp.data = lag_hsp.data.drop(columns=['hospital_deaths'])
    lag_hsp.data.date = lag_hsp.data.date + pd.to_timedelta(
        roots['defaults']['symp_to_hsp_admit_duration'], unit='D')

    # Merge midmod and lag_hsp
    midmod.data = pd.merge(
        midmod.data,
        lag_hsp.data,
        how='left',
        on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date'])
    del lag_hsp

    # mild/moderate at risk number = (mild/moderate incidence - hospital admissions|7 days later) |
    #                                 shift forward by {incubation period + mild/moderate duration|no hospital}
    midmod.data[
        'midmod_risk_num'] = midmod.data.midmod_inc - midmod.data.hospital_inc
    midmod.data.date = midmod.data.date + pd.to_timedelta(
        (roots['defaults']['incubation_period'] +
         roots['defaults']['midmod_duration_no_hsp']),
        unit='D')

    # Calculate the incidence of each symptom and overlap, regardless of co-occurrence of additional symptoms (not mutually exclusive)
    # mild/moderate long-term incidence = mild/moderate number at risk * proportion of mild/moderate with each long-term symptom cluster
    midmod.data['midmod_cog_inc'] = (
        midmod.data.midmod_risk_num *
        dp.loc[(dp.outcome == 'cognitive') &
               (dp.population == 'midmod'), 'proportion_mean'].values[0])
    midmod.data['midmod_fat_inc'] = (
        midmod.data.midmod_risk_num *
        dp.loc[(dp.outcome == 'fatigue') &
               (dp.population == 'midmod'), 'proportion_mean'].values[0])
    midmod.data['midmod_resp_inc'] = (
        midmod.data.midmod_risk_num *
        dp.loc[(dp.outcome == 'respiratory') &
               (dp.population == 'midmod'), 'proportion_mean'].values[0])
    midmod.data['midmod_cog_fat_inc'] = (
        midmod.data.midmod_risk_num *
        dp.loc[(dp.outcome == 'cognitive_fatigue') &
               (dp.population == 'midmod'), 'proportion_mean'].values[0])
    midmod.data['midmod_cog_resp_inc'] = (
        midmod.data.midmod_risk_num *
        dp.loc[(dp.outcome == 'cognitive_respiratory') &
               (dp.population == 'midmod'), 'proportion_mean'].values[0])
    midmod.data['midmod_fat_resp_inc'] = (
        midmod.data.midmod_risk_num *
        dp.loc[(dp.outcome == 'fatigue_respiratory') &
               (dp.population == 'midmod'), 'proportion_mean'].values[0])
    midmod.data['midmod_cog_fat_resp_inc'] = (
        midmod.data.midmod_risk_num *
        dp.loc[(dp.outcome == 'cognitive_fatigue_respiratory') &
               (dp.population == 'midmod'), 'proportion_mean'].values[0])

    # Creating mutually exclusive categories of symptoms
    # cog_inc = cog_inc - (cog_fat_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc
    midmod.data.midmod_cog_inc = (midmod.data.midmod_cog_inc -
                                  (midmod.data.midmod_cog_fat_inc -
                                   midmod.data.midmod_cog_fat_resp_inc) -
                                  (midmod.data.midmod_cog_resp_inc -
                                   midmod.data.midmod_cog_fat_resp_inc) -
                                  midmod.data.midmod_cog_fat_resp_inc)

    # fat_inc = fat_inc - (cog_fat_inc - cog_fat_resp_inc) -  (fat_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc
    midmod.data.midmod_fat_inc = (midmod.data.midmod_fat_inc -
                                  (midmod.data.midmod_cog_fat_inc -
                                   midmod.data.midmod_cog_fat_resp_inc) -
                                  (midmod.data.midmod_fat_resp_inc -
                                   midmod.data.midmod_cog_fat_resp_inc) -
                                  midmod.data.midmod_cog_fat_resp_inc)

    # resp_inc = resp_inc - (fat_resp_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc
    midmod.data.midmod_resp_inc = (midmod.data.midmod_resp_inc -
                                   (midmod.data.midmod_fat_resp_inc -
                                    midmod.data.midmod_cog_fat_resp_inc) -
                                   (midmod.data.midmod_cog_resp_inc -
                                    midmod.data.midmod_cog_fat_resp_inc) -
                                   midmod.data.midmod_cog_fat_resp_inc)

    # cog_fat_inc = cog_fat_inc - cog_fat_resp_inc
    midmod.data.midmod_cog_fat_inc = (midmod.data.midmod_cog_fat_inc -
                                      midmod.data.midmod_cog_fat_resp_inc)

    # cog_resp_inc = cog_resp_inc - cog_fat_resp_inc
    midmod.data.midmod_cog_resp_inc = (midmod.data.midmod_cog_resp_inc -
                                       midmod.data.midmod_cog_fat_resp_inc)

    # fat_resp_inc = fat_resp_inc - cog_fat_resp_inc
    midmod.data.midmod_fat_resp_inc = (midmod.data.midmod_fat_resp_inc -
                                       midmod.data.midmod_cog_fat_resp_inc)

    # mild/moderate long-term prevalence = mild/moderate long-term incidence * [duration]
    midmod.data = calc_prev(df=midmod.data,
                            dp=dp,
                            dst_population='midmod',
                            dst_outcome='cognitive',
                            calc_col_stub='midmod_cog_')
    midmod.data = calc_prev(df=midmod.data,
                            dp=dp,
                            dst_population='midmod',
                            dst_outcome='fatigue',
                            calc_col_stub='midmod_fat_')
    midmod.data = calc_prev(df=midmod.data,
                            dp=dp,
                            dst_population='midmod',
                            dst_outcome='respiratory',
                            calc_col_stub='midmod_resp_')
    midmod.data = calc_prev(df=midmod.data,
                            dp=dp,
                            dst_population='midmod',
                            dst_outcome='cognitive_fatigue',
                            calc_col_stub='midmod_cog_fat_')
    midmod.data = calc_prev(df=midmod.data,
                            dp=dp,
                            dst_population='midmod',
                            dst_outcome='cognitive_respiratory',
                            calc_col_stub='midmod_cog_resp_')
    midmod.data = calc_prev(df=midmod.data,
                            dp=dp,
                            dst_population='midmod',
                            dst_outcome='fatigue_respiratory',
                            calc_col_stub='midmod_fat_resp_')
    midmod.data = calc_prev(df=midmod.data,
                            dp=dp,
                            dst_population='midmod',
                            dst_outcome='cognitive_fatigue_respiratory',
                            calc_col_stub='midmod_cog_fat_resp_')

    # Drop unneeded cols
    midmod.data = midmod.data.drop(
        columns=['midmod_inc', 'hospital_inc', 'midmod_risk_num'])

    # endregion ----------------------------------------------------------------

    print('Calculating severe incidence and prevalence...')
    ## Severe Incidence & Prevalence
    # region -------------------------------------------------------------------

    # Shift icu admissions
    lag_icu = copy.deepcopy(icu)
    lag_icu.data = lag_icu.data.drop(columns=['icu_deaths'])
    lag_icu.data.date = lag_icu.data.date + pd.to_timedelta(
        roots['defaults']['icu_to_death_duration'], unit='D')

    # Shift hospital deaths
    lag_hsp = copy.deepcopy(hospital)
    lag_hsp.data = lag_hsp.data.drop(columns=['hospital_inc'])
    lag_hsp.data.date = lag_hsp.data.date + pd.to_timedelta(
        roots['defaults']['hsp_no_icu_death_duration'], unit='D')

    # Merge lagged datasets
    lag = pd.merge(
        lag_icu.data,
        lag_hsp.data,
        how='left',
        on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date'])
    del lag_icu, lag_hsp
    hospital.data = pd.merge(
        hospital.data.drop(columns=['hospital_deaths']),
        lag,
        how='left',
        on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date'])
    del lag

    # severe at risk number = (hospital admissions - ICU admissions|3 days later - hospital deaths|6 days later) |
    #                          shift forward by {hospital duration if no ICU no death + hospital mild moderate duration after discharge}
    hospital.data['hospital_risk_num'] = (hospital.data.hospital_inc -
                                          hospital.data.icu_inc -
                                          hospital.data.hospital_deaths)
    hospital.data.date = hospital.data.date + pd.to_timedelta(
        (roots['defaults']['hsp_no_icu_no_death_duration'] +
         roots['defaults']['hsp_midmod_after_discharge_duration']),
        unit='D')

    # Calculate the incidence of each symptom and overlap, regardless of co-occurrence of additional symptoms (not mutually exclusive)
    # severe long-term incidence = severe at risk number * proportion of severe survivors with each long-term symptom cluster
    hospital.data['hospital_cog_inc'] = (
        hospital.data.hospital_risk_num *
        dp.loc[(dp.outcome == 'cognitive') &
               (dp.population == 'hospital'), 'proportion_mean'].values[0])
    hospital.data['hospital_fat_inc'] = (
        hospital.data.hospital_risk_num *
        dp.loc[(dp.outcome == 'fatigue') &
               (dp.population == 'hospital'), 'proportion_mean'].values[0])
    hospital.data['hospital_resp_inc'] = (
        hospital.data.hospital_risk_num *
        dp.loc[(dp.outcome == 'respiratory') &
               (dp.population == 'hospital'), 'proportion_mean'].values[0])
    hospital.data['hospital_cog_fat_inc'] = (
        hospital.data.hospital_risk_num *
        dp.loc[(dp.outcome == 'cognitive_fatigue') &
               (dp.population == 'hospital'), 'proportion_mean'].values[0])
    hospital.data['hospital_cog_resp_inc'] = (
        hospital.data.hospital_risk_num *
        dp.loc[(dp.outcome == 'cognitive_respiratory') &
               (dp.population == 'hospital'), 'proportion_mean'].values[0])
    hospital.data['hospital_fat_resp_inc'] = (
        hospital.data.hospital_risk_num *
        dp.loc[(dp.outcome == 'fatigue_respiratory') &
               (dp.population == 'hospital'), 'proportion_mean'].values[0])
    hospital.data['hospital_cog_fat_resp_inc'] = (
        hospital.data.hospital_risk_num *
        dp.loc[(dp.outcome == 'cognitive_fatigue_respiratory') &
               (dp.population == 'hospital'), 'proportion_mean'].values[0])

    # Creating mutually exclusive categories of symptoms
    # cog_inc = cog_inc - (cog_fat_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc
    hospital.data.hospital_cog_inc = (
        hospital.data.hospital_cog_inc -
        (hospital.data.hospital_cog_fat_inc -
         hospital.data.hospital_cog_fat_resp_inc) -
        (hospital.data.hospital_cog_resp_inc -
         hospital.data.hospital_cog_fat_resp_inc) -
        hospital.data.hospital_cog_fat_resp_inc)

    # fat_inc = fat_inc - (cog_fat_inc - cog_fat_resp_inc) -  (fat_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc
    hospital.data.hospital_fat_inc = (
        hospital.data.hospital_fat_inc -
        (hospital.data.hospital_cog_fat_inc -
         hospital.data.hospital_cog_fat_resp_inc) -
        (hospital.data.hospital_fat_resp_inc -
         hospital.data.hospital_cog_fat_resp_inc) -
        hospital.data.hospital_cog_fat_resp_inc)

    # resp_inc = resp_inc - (fat_resp_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc
    hospital.data.hospital_resp_inc = (
        hospital.data.hospital_resp_inc -
        (hospital.data.hospital_fat_resp_inc -
         hospital.data.hospital_cog_fat_resp_inc) -
        (hospital.data.hospital_cog_resp_inc -
         hospital.data.hospital_cog_fat_resp_inc) -
        hospital.data.hospital_cog_fat_resp_inc)

    # cog_fat_inc = cog_fat_inc - cog_fat_resp_inc
    hospital.data.hospital_cog_fat_inc = (
        hospital.data.hospital_cog_fat_inc -
        hospital.data.hospital_cog_fat_resp_inc)

    # cog_resp_inc = cog_resp_inc - cog_fat_resp_inc
    hospital.data.hospital_cog_resp_inc = (
        hospital.data.hospital_cog_resp_inc -
        hospital.data.hospital_cog_fat_resp_inc)

    # fat_resp_inc = fat_resp_inc - cog_fat_resp_inc
    hospital.data.hospital_fat_resp_inc = (
        hospital.data.hospital_fat_resp_inc -
        hospital.data.hospital_cog_fat_resp_inc)

    # severe long-term prevalence = severe long-term incidence * [duration]
    hospital.data = calc_prev(df=hospital.data,
                              dp=dp,
                              dst_population='hospital',
                              dst_outcome='cognitive',
                              calc_col_stub='hospital_cog_')
    hospital.data = calc_prev(df=hospital.data,
                              dp=dp,
                              dst_population='hospital',
                              dst_outcome='fatigue',
                              calc_col_stub='hospital_fat_')
    hospital.data = calc_prev(df=hospital.data,
                              dp=dp,
                              dst_population='hospital',
                              dst_outcome='respiratory',
                              calc_col_stub='hospital_resp_')
    hospital.data = calc_prev(df=hospital.data,
                              dp=dp,
                              dst_population='hospital',
                              dst_outcome='cognitive_fatigue',
                              calc_col_stub='hospital_cog_fat_')
    hospital.data = calc_prev(df=hospital.data,
                              dp=dp,
                              dst_population='hospital',
                              dst_outcome='cognitive_respiratory',
                              calc_col_stub='hospital_cog_resp_')
    hospital.data = calc_prev(df=hospital.data,
                              dp=dp,
                              dst_population='hospital',
                              dst_outcome='fatigue_respiratory',
                              calc_col_stub='hospital_fat_resp_')
    hospital.data = calc_prev(df=hospital.data,
                              dp=dp,
                              dst_population='hospital',
                              dst_outcome='cognitive_fatigue_respiratory',
                              calc_col_stub='hospital_cog_fat_resp_')

    # Remove unneeded cols
    hospital.data = hospital.data.drop(columns=[
        'hospital_inc', 'icu_inc', 'hospital_deaths', 'hospital_risk_num'
    ])

    # endregion ----------------------------------------------------------------

    print('Calculating critical incidence and prevalence...')
    ## Critical Incidence & Prevalence
    # region -------------------------------------------------------------------

    # Shift icu deaths
    lag_icu = copy.deepcopy(icu)
    lag_icu.data = lag_icu.data.drop(columns='icu_inc')
    lag_icu.data.date = lag_icu.data.date + pd.to_timedelta(
        roots['defaults']['icu_to_death_duration'], unit='D')

    # Merge icu and lag_icu
    icu.data = pd.merge(
        icu.data.drop(columns='icu_deaths'),
        lag_icu.data,
        how='left',
        on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date'])
    del lag_icu

    # critical at risk number = (ICU admissions - ICU deaths|3 days later) |
    #                            shift forward by {ICU duration if no death + ICU mild moderate duration after discharge}
    icu.data['icu_risk_num'] = icu.data.icu_inc - icu.data.icu_deaths
    icu.data.date = icu.data.date + pd.to_timedelta(
        (roots['defaults']['icu_no_death_duration'] +
         roots['defaults']['icu_midmod_after_discharge_duration']),
        unit='D')

    # Calculate the incidence of each symptom and overlap, regardless of co-occurrence of additional symptoms (not mutually exclusive)
    # critical long-term incidence = critical number at risk * proportion of critical with each long-term symptom cluster
    icu.data['icu_cog_inc'] = (
        icu.data.icu_risk_num *
        dp.loc[(dp.outcome == 'cognitive') &
               (dp.population == 'icu'), 'proportion_mean'].values[0])
    icu.data['icu_fat_inc'] = (
        icu.data.icu_risk_num *
        dp.loc[(dp.outcome == 'fatigue') &
               (dp.population == 'icu'), 'proportion_mean'].values[0])
    icu.data['icu_resp_inc'] = (
        icu.data.icu_risk_num *
        dp.loc[(dp.outcome == 'respiratory') &
               (dp.population == 'icu'), 'proportion_mean'].values[0])
    icu.data['icu_cog_fat_inc'] = (
        icu.data.icu_risk_num *
        dp.loc[(dp.outcome == 'cognitive_fatigue') &
               (dp.population == 'icu'), 'proportion_mean'].values[0])
    icu.data['icu_cog_resp_inc'] = (
        icu.data.icu_risk_num *
        dp.loc[(dp.outcome == 'cognitive_respiratory') &
               (dp.population == 'icu'), 'proportion_mean'].values[0])
    icu.data['icu_fat_resp_inc'] = (
        icu.data.icu_risk_num *
        dp.loc[(dp.outcome == 'fatigue_respiratory') &
               (dp.population == 'icu'), 'proportion_mean'].values[0])
    icu.data['icu_cog_fat_resp_inc'] = (
        icu.data.icu_risk_num *
        dp.loc[(dp.outcome == 'cognitive_fatigue_respiratory') &
               (dp.population == 'icu'), 'proportion_mean'].values[0])

    # Creating mutually exclusive categories of symptoms
    # cog_inc = cog_inc - (cog_fat_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc
    icu.data.icu_cog_inc = (
        icu.data.icu_cog_inc -
        (icu.data.icu_cog_fat_inc - icu.data.icu_cog_fat_resp_inc) -
        (icu.data.icu_cog_resp_inc - icu.data.icu_cog_fat_resp_inc) -
        icu.data.icu_cog_fat_resp_inc)

    # fat_inc = fat_inc - (cog_fat_inc - cog_fat_resp_inc) -  (fat_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc
    icu.data.icu_fat_inc = (
        icu.data.icu_fat_inc -
        (icu.data.icu_cog_fat_inc - icu.data.icu_cog_fat_resp_inc) -
        (icu.data.icu_fat_resp_inc - icu.data.icu_cog_fat_resp_inc) -
        icu.data.icu_cog_fat_resp_inc)

    # resp_inc = resp_inc - (fat_resp_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc
    icu.data.icu_resp_inc = (
        icu.data.icu_resp_inc -
        (icu.data.icu_fat_resp_inc - icu.data.icu_cog_fat_resp_inc) -
        (icu.data.icu_cog_resp_inc - icu.data.icu_cog_fat_resp_inc) -
        icu.data.icu_cog_fat_resp_inc)

    # cog_fat_inc = cog_fat_inc - cog_fat_resp_inc
    icu.data.icu_cog_fat_inc = (icu.data.icu_cog_fat_inc -
                                icu.data.icu_cog_fat_resp_inc)

    # cog_resp_inc = cog_resp_inc - cog_fat_resp_inc
    icu.data.icu_cog_resp_inc = (icu.data.icu_cog_resp_inc -
                                 icu.data.icu_cog_fat_resp_inc)

    # fat_resp_inc = fat_resp_inc - cog_fat_resp_inc
    icu.data.icu_fat_resp_inc = (icu.data.icu_fat_resp_inc -
                                 icu.data.icu_cog_fat_resp_inc)

    # critical long-term prevalence = critical long-term incidence * [duration]
    icu.data = calc_prev(df=icu.data,
                         dp=dp,
                         dst_population='icu',
                         dst_outcome='cognitive',
                         calc_col_stub='icu_cog_')
    icu.data = calc_prev(df=icu.data,
                         dp=dp,
                         dst_population='icu',
                         dst_outcome='fatigue',
                         calc_col_stub='icu_fat_')
    icu.data = calc_prev(df=icu.data,
                         dp=dp,
                         dst_population='icu',
                         dst_outcome='respiratory',
                         calc_col_stub='icu_resp_')
    icu.data = calc_prev(df=icu.data,
                         dp=dp,
                         dst_population='icu',
                         dst_outcome='cognitive_fatigue',
                         calc_col_stub='icu_cog_fat_')
    icu.data = calc_prev(df=icu.data,
                         dp=dp,
                         dst_population='icu',
                         dst_outcome='cognitive_respiratory',
                         calc_col_stub='icu_cog_resp_')
    icu.data = calc_prev(df=icu.data,
                         dp=dp,
                         dst_population='icu',
                         dst_outcome='fatigue_respiratory',
                         calc_col_stub='icu_fat_resp_')
    icu.data = calc_prev(df=icu.data,
                         dp=dp,
                         dst_population='icu',
                         dst_outcome='cognitive_fatigue_respiratory',
                         calc_col_stub='icu_cog_fat_resp_')

    # Remove unneeded cols
    icu.data = icu.data.drop(columns=['icu_inc', 'icu_deaths', 'icu_risk_num'])
    del dp

    # endregion ----------------------------------------------------------------

    print('Aggregating severities...')
    ## Aggregate Severities
    # region -------------------------------------------------------------------

    df = copy.deepcopy(midmod)
    del midmod
    df.data = pd.merge(
        df.data,
        hospital.data,
        how='outer',
        on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date'])
    del hospital
    df.data = pd.merge(
        df.data,
        icu.data,
        how='outer',
        on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date'])
    del icu

    # Incidence
    df.data['cognitive_inc'] = df.data[[
        'midmod_cog_inc', 'hospital_cog_inc', 'icu_cog_inc'
    ]].sum(axis=1)
    df.data.drop(columns=['midmod_cog_inc', 'hospital_cog_inc', 'icu_cog_inc'],
                 inplace=True)
    df.data['fatigue_inc'] = df.data[[
        'midmod_fat_inc', 'hospital_fat_inc', 'icu_fat_inc'
    ]].sum(axis=1)
    df.data.drop(columns=['midmod_fat_inc', 'hospital_fat_inc', 'icu_fat_inc'],
                 inplace=True)
    df.data['respiratory_inc'] = df.data[[
        'midmod_resp_inc', 'hospital_resp_inc', 'icu_resp_inc'
    ]].sum(axis=1)
    df.data.drop(
        columns=['midmod_resp_inc', 'hospital_resp_inc', 'icu_resp_inc'],
        inplace=True)
    df.data['cognitive_fatigue_inc'] = df.data[[
        'midmod_cog_fat_inc', 'hospital_cog_fat_inc', 'icu_cog_fat_inc'
    ]].sum(axis=1)
    df.data.drop(columns=[
        'midmod_cog_fat_inc', 'hospital_cog_fat_inc', 'icu_cog_fat_inc'
    ],
                 inplace=True)
    df.data['cognitive_respiratory_inc'] = df.data[[
        'midmod_cog_resp_inc', 'hospital_cog_resp_inc', 'icu_cog_resp_inc'
    ]].sum(axis=1)
    df.data.drop(columns=[
        'midmod_cog_resp_inc', 'hospital_cog_resp_inc', 'icu_cog_resp_inc'
    ],
                 inplace=True)
    df.data['fatigue_respiratory_inc'] = df.data[[
        'midmod_fat_resp_inc', 'hospital_fat_resp_inc', 'icu_fat_resp_inc'
    ]].sum(axis=1)
    df.data.drop(columns=[
        'midmod_fat_resp_inc', 'hospital_fat_resp_inc', 'icu_fat_resp_inc'
    ],
                 inplace=True)
    df.data['cognitive_fatigue_respiratory_inc'] = df.data[[
        'midmod_cog_fat_resp_inc', 'hospital_cog_fat_resp_inc',
        'icu_cog_fat_resp_inc'
    ]].sum(axis=1)
    df.data.drop(columns=[
        'midmod_cog_fat_resp_inc', 'hospital_cog_fat_resp_inc',
        'icu_cog_fat_resp_inc'
    ],
                 inplace=True)

    # Prevalence
    df.data['cognitive_prev'] = df.data[[
        'midmod_cog_prev', 'hospital_cog_prev', 'icu_cog_prev'
    ]].sum(axis=1)
    df.data.drop(
        columns=['midmod_cog_prev', 'hospital_cog_prev', 'icu_cog_prev'],
        inplace=True)
    df.data['fatigue_prev'] = df.data[[
        'midmod_fat_prev', 'hospital_fat_prev', 'icu_fat_prev'
    ]].sum(axis=1)
    df.data.drop(
        columns=['midmod_fat_prev', 'hospital_fat_prev', 'icu_fat_prev'],
        inplace=True)
    df.data['respiratory_prev'] = df.data[[
        'midmod_resp_prev', 'hospital_resp_prev', 'icu_resp_prev'
    ]].sum(axis=1)
    df.data.drop(
        columns=['midmod_resp_prev', 'hospital_resp_prev', 'icu_resp_prev'],
        inplace=True)
    df.data['cognitive_fatigue_prev'] = df.data[[
        'midmod_cog_fat_prev', 'hospital_cog_fat_prev', 'icu_cog_fat_prev'
    ]].sum(axis=1)
    df.data.drop(columns=[
        'midmod_cog_fat_prev', 'hospital_cog_fat_prev', 'icu_cog_fat_prev'
    ],
                 inplace=True)
    df.data['cognitive_respiratory_prev'] = df.data[[
        'midmod_cog_resp_prev', 'hospital_cog_resp_prev', 'icu_cog_resp_prev'
    ]].sum(axis=1)
    df.data.drop(columns=[
        'midmod_cog_resp_prev', 'hospital_cog_resp_prev', 'icu_cog_resp_prev'
    ],
                 inplace=True)
    df.data['fatigue_respiratory_prev'] = df.data[[
        'midmod_fat_resp_prev', 'hospital_fat_resp_prev', 'icu_fat_resp_prev'
    ]].sum(axis=1)
    df.data.drop(columns=[
        'midmod_fat_resp_prev', 'hospital_fat_resp_prev', 'icu_fat_resp_prev'
    ],
                 inplace=True)
    df.data['cognitive_fatigue_respiratory_prev'] = df.data[[
        'midmod_cog_fat_resp_prev', 'hospital_cog_fat_resp_prev',
        'icu_cog_fat_resp_prev'
    ]].sum(axis=1)
    df.data.drop(columns=[
        'midmod_cog_fat_resp_prev', 'hospital_cog_fat_resp_prev',
        'icu_cog_fat_resp_prev'
    ],
                 inplace=True)

    # endregion ----------------------------------------------------------------

    print('Aggregating by year...')
    ## Aggregate by year
    # region -------------------------------------------------------------------

    # Subset to 2020
    df.data = df.data[(df.data.date >= datetime.datetime(2020, 1, 1))
                      & (df.data.date <= datetime.datetime(2020, 12, 31))]

    # Sum over all days of the year
    df.collapse(
        agg_function='sum',
        group_cols=['location_id', 'age_group_id', 'sex_id', 'draw_var'],
        calc_cols=[
            'cognitive_inc', 'cognitive_prev', 'fatigue_inc', 'fatigue_prev',
            'respiratory_inc', 'respiratory_prev', 'cognitive_fatigue_inc',
            'cognitive_fatigue_prev', 'cognitive_respiratory_inc',
            'cognitive_respiratory_prev', 'fatigue_respiratory_inc',
            'fatigue_respiratory_prev', 'cognitive_fatigue_respiratory_inc',
            'cognitive_fatigue_respiratory_prev'
        ])

    # Divide prevalence by 366 (2020 is a leap year) to convert the summed daily prevalence into an annual average
    df.data.cognitive_prev = df.data.cognitive_prev / 366
    df.data.fatigue_prev = df.data.fatigue_prev / 366
    df.data.respiratory_prev = df.data.respiratory_prev / 366
    df.data.cognitive_fatigue_prev = df.data.cognitive_fatigue_prev / 366
    df.data.cognitive_respiratory_prev = df.data.cognitive_respiratory_prev / 366
    df.data.fatigue_respiratory_prev = df.data.fatigue_respiratory_prev / 366
    df.data.cognitive_fatigue_respiratory_prev = df.data.cognitive_fatigue_respiratory_prev / 366

    # Ensure incidence and prevalence aren't negative
    df.check_neg(calc_cols=[
        'cognitive_inc', 'cognitive_prev', 'fatigue_inc', 'fatigue_prev',
        'respiratory_inc', 'respiratory_prev', 'cognitive_fatigue_inc',
        'cognitive_fatigue_prev', 'cognitive_respiratory_inc',
        'cognitive_respiratory_prev', 'fatigue_respiratory_inc',
        'fatigue_respiratory_prev', 'cognitive_fatigue_respiratory_inc',
        'cognitive_fatigue_respiratory_prev'
    ])

    # endregion ----------------------------------------------------------------

    print('Calculating rates...')
    ## Calculate rates
    # region -------------------------------------------------------------------

    # Pull population
    pop = get_population(age_group_id=roots['age_groups'],
                         single_year_age=False,
                         location_id=loc_id,
                         location_set_id=35,
                         year_id=roots['gbd_year'],
                         sex_id=[1, 2],
                         gbd_round_id=roots['gbd_round'],
                         status='best',
                         decomp_step=roots['decomp_step'])
    pop.drop(columns=['year_id', 'run_id'], inplace=True)

    # Merge population
    df.data = pd.merge(df.data,
                       pop,
                       how='left',
                       on=['location_id', 'age_group_id', 'sex_id'])

    # Calculate rates
    df.data['cognitive_inc_rate'] = df.data.cognitive_inc / df.data.population
    df.data['fatigue_inc_rate'] = df.data.fatigue_inc / df.data.population
    df.data[
        'respiratory_inc_rate'] = df.data.respiratory_inc / df.data.population
    df.data[
        'cognitive_fatigue_inc_rate'] = df.data.cognitive_fatigue_inc / df.data.population
    df.data[
        'cognitive_respiratory_inc_rate'] = df.data.cognitive_respiratory_inc / df.data.population
    df.data[
        'fatigue_respiratory_inc_rate'] = df.data.fatigue_respiratory_inc / df.data.population
    df.data[
        'cognitive_fatigue_respiratory_inc_rate'] = df.data.cognitive_fatigue_respiratory_inc / df.data.population

    df.data[
        'cognitive_prev_rate'] = df.data.cognitive_prev / df.data.population
    df.data['fatigue_prev_rate'] = df.data.fatigue_prev / df.data.population
    df.data[
        'respiratory_prev_rate'] = df.data.respiratory_prev / df.data.population
    df.data[
        'cognitive_fatigue_prev_rate'] = df.data.cognitive_fatigue_prev / df.data.population
    df.data[
        'cognitive_respiratory_prev_rate'] = df.data.cognitive_respiratory_prev / df.data.population
    df.data[
        'fatigue_respiratory_prev_rate'] = df.data.fatigue_respiratory_prev / df.data.population
    df.data[
        'cognitive_fatigue_respiratory_prev_rate'] = df.data.cognitive_fatigue_respiratory_prev / df.data.population
    # endregion ----------------------------------------------------------------

    print('Calculating YLDs...')
    ## Calculate YLDs
    # region -------------------------------------------------------------------

    # Read in disability weights
    dw = pd.read_csv('{}dws.csv'.format(roots['disability_weight']))

    # Temporary values: apply a placeholder disability weight of 0.01 (the weights read into dw are not used here)
    df.data['cognitive_YLD'] = df.data.cognitive_prev_rate * 0.01
    df.data['fatigue_YLD'] = df.data.fatigue_prev_rate * 0.01
    df.data['respiratory_YLD'] = df.data.respiratory_prev_rate * 0.01
    df.data[
        'cognitive_fatigue_YLD'] = df.data.cognitive_fatigue_prev_rate * 0.01
    df.data[
        'cognitive_respiratory_YLD'] = df.data.cognitive_respiratory_prev_rate * 0.01
    df.data[
        'fatigue_respiratory_YLD'] = df.data.fatigue_respiratory_prev_rate * 0.01
    df.data[
        'cognitive_fatigue_respiratory_YLD'] = df.data.cognitive_fatigue_respiratory_prev_rate * 0.01

    del dw

    # endregion ----------------------------------------------------------------

    print('Saving datasets and running diagnostics...')
    ## Save datasets & run diagnostics
    # region -------------------------------------------------------------------

    # Cognitive
    df.save_data(output_cols=[
        'location_id', 'age_group_id', 'sex_id', 'draw_var', 'cognitive_inc',
        'cognitive_prev', 'cognitive_inc_rate', 'cognitive_prev_rate',
        'cognitive_YLD'
    ],
                 filename='cognitive',
                 stage='stage_2')

    # Fatigue
    df.save_data(output_cols=[
        'location_id', 'age_group_id', 'sex_id', 'draw_var', 'fatigue_inc',
        'fatigue_prev', 'fatigue_inc_rate', 'fatigue_prev_rate', 'fatigue_YLD'
    ],
                 filename='fatigue',
                 stage='stage_2')

    # Respiratory
    df.save_data(output_cols=[
        'location_id', 'age_group_id', 'sex_id', 'draw_var', 'respiratory_inc',
        'respiratory_prev', 'respiratory_inc_rate', 'respiratory_prev_rate',
        'respiratory_YLD'
    ],
                 filename='respiratory',
                 stage='stage_2')

    # Cognitive Fatigue
    df.save_data(output_cols=[
        'location_id', 'age_group_id', 'sex_id', 'draw_var',
        'cognitive_fatigue_inc', 'cognitive_fatigue_prev',
        'cognitive_fatigue_inc_rate', 'cognitive_fatigue_prev_rate',
        'cognitive_fatigue_YLD'
    ],
                 filename='cognitive_fatigue',
                 stage='stage_2')

    # Cognitive Respiratory
    df.save_data(output_cols=[
        'location_id', 'age_group_id', 'sex_id', 'draw_var',
        'cognitive_respiratory_inc', 'cognitive_respiratory_prev',
        'cognitive_respiratory_inc_rate', 'cognitive_respiratory_prev_rate',
        'cognitive_respiratory_YLD'
    ],
                 filename='cognitive_respiratory',
                 stage='stage_2')

    # Fatigue Respiratory
    df.save_data(output_cols=[
        'location_id', 'age_group_id', 'sex_id', 'draw_var',
        'fatigue_respiratory_inc', 'fatigue_respiratory_prev',
        'fatigue_respiratory_inc_rate', 'fatigue_respiratory_prev_rate',
        'fatigue_respiratory_YLD'
    ],
                 filename='fatigue_respiratory',
                 stage='stage_2')

    # Cognitive Fatigue Respiratory
    df.save_data(output_cols=[
        'location_id', 'age_group_id', 'sex_id', 'draw_var',
        'cognitive_fatigue_respiratory_inc',
        'cognitive_fatigue_respiratory_prev',
        'cognitive_fatigue_respiratory_inc_rate',
        'cognitive_fatigue_respiratory_prev_rate',
        'cognitive_fatigue_respiratory_YLD'
    ],
                 filename='cognitive_fatigue_respiratory',
                 stage='stage_2')
Example #38
0
def main():
    """

    Will generate a dictionary as follows:
        <key> patientid : <value> list of dicts, where each dict contains admission data
                                  [
                                  {<key> feature/label name : <value> feature/label value}
                                  ]

    """
    parser = argparse.ArgumentParser(description='Generate Text+Code dataset')
    parser.add_argument(
        '-p',
        '--path',
        default=None,
        type=str,
        help='path to pandas dataframe where rows are admissions')
    parser.add_argument(
        '-vp',
        '--vocab_path',
        default='',
        type=str,
        help=
        'path to directory where code vocabularies are stored; assumes the diagnoses vocab is named diag.vocab and the cpt vocab is named cpt.vocab'
    )
    parser.add_argument('-s',
                        '--save',
                        default='./',
                        type=str,
                        help='path to save pkl files')
    parser.add_argument('-et',
                        '--embed_text',
                        default=False,
                        action='store_true',
                        help='flag for whether to embed text')
    parser.add_argument('-cpb',
                        '--bert_config_path',
                        default=None,
                        type=str,
                        help='path to bert config')
    parser.add_argument('-vpb',
                        '--bert_vocab_path',
                        default=None,
                        type=str,
                        help='path to bert vocab ')
    parser.add_argument('-sdp',
                        '--state_dict_path',
                        default=None,
                        type=str,
                        help='path to bert state dict')
    parser.add_argument('-gpu', '--gpu', default=0, type=int)
    parser.add_argument('-bsl',
                        '--max_bert_seq_len',
                        default=512,
                        type=int,
                        help='maximum sequence length of bert model')
    parser.add_argument(
        '-tsld',
        '--text_seq_length_discharge',
        default=0,
        type=int,
        help=
        'pass this if the maximum text sequence length for the discharge text is known, to avoid long processing time'
    )
    parser.add_argument(
        '-tslr',
        '--text_seq_length_rest',
        default=0,
        type=int,
        help=
        'pass this if the maximum text sequence length for the remaining text (other than discharge) is known, to avoid long processing time'
    )
    parser.add_argument('-sc',
                        '--short_code',
                        default=False,
                        action='store_true',
                        help='flag for using short codes ')
    parser.add_argument('-diag',
                        '--diagnoses',
                        default=False,
                        action='store_true',
                        help='flag for including diagnoses codes')
    parser.add_argument('-proc',
                        '--procedures',
                        default=False,
                        action='store_true',
                        help='flag for including procedures codes')
    parser.add_argument('-med',
                        '--medications',
                        default=False,
                        action='store_true',
                        help='flag for including medication codes')
    parser.add_argument('-cpt',
                        '--cpts',
                        default=False,
                        action='store_true',
                        help='flag for including cpt codes')

    parser.add_argument('-ma', '--min_adm', default=0, type=int)

    args = parser.parse_args()
    df = pd.read_pickle(args.path)
    df_orig = df
    # remove organ donor admissions
    if ('DIAGNOSIS' in df.columns):

        REMOVE_DIAGNOSIS = ~((df['DIAGNOSIS'] == 'ORGAN DONOR ACCOUNT') | (df['DIAGNOSIS'] == 'ORGAN DONOR') | \
                       (df['DIAGNOSIS'] == 'DONOR ACCOUNT'))
        df = df[REMOVE_DIAGNOSIS]

    df = df[~df['ICD9_CODE'].isna()]  # drop admissions with no ICD9 code
    df = df[~(df['TEXT_DISCHARGE'].isna() | df['TEXT_REST'].isna())]

    if ('TIMEDELTA' in df.columns):
        df['TIMEDELTA'] = df['TIMEDELTA'].fillna(pd.to_timedelta("0"))
        df['TIMEDELTA'] = pd.to_timedelta(df['TIMEDELTA'])
        df['TIMEDELTA'] = df['TIMEDELTA'].apply(lambda x: x.seconds)
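        # NOTE: .seconds keeps only the within-day remainder; use
        # df['TIMEDELTA'].dt.total_seconds() if the full elapsed time
        # (including whole days) is wanted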

    pids = list(set(df['SUBJECT_ID'].tolist()))

    # lambda
    demographic_cols = {
        'AGE': [],
        'GENDER': [],
        'LAST_CAREUNIT': [],
        'MARITAL_STATUS': [],
        'ETHNICITY': [],
        'DISCHARGE_LOCATION': []
    }

    df.loc[:, 'MARITAL_STATUS'], demographic_cols[
        'MARITAL_STATUS'] = pd.factorize(df['MARITAL_STATUS'])
    df.loc[:, 'ETHNICITY'], demographic_cols['ETHNICITY'] = pd.factorize(
        df['ETHNICITY'])
    df.loc[:, 'DISCHARGE_LOCATION'], demographic_cols[
        'DISCHARGE_LOCATION'] = pd.factorize(df['DISCHARGE_LOCATION'])
    df.loc[:,
           'LAST_CAREUNIT'], demographic_cols['LAST_CAREUNIT'] = pd.factorize(
               df['LAST_CAREUNIT'])
    df.loc[:,
           'GENDER'], demographic_cols['GENDER'] = pd.factorize(df['GENDER'])
    df.loc[:, 'AGE'] = df['AGE'].astype(int)
    los_bins = [1, 2, 3, 4, 5, 6, 7, 8, 14, float('inf')]
    los_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    df.loc[:, 'LOS'] = pd.cut(df['LOS'], bins=los_bins, labels=los_labels)

    temp_data = []
    data = {}

    diag_vocab = Vocab()
    cpt_vocab = Vocab()
    med_vocab = Vocab()
    proc_vocab = Vocab()

    if (args.vocab_path != ''):
        #to use below checkout https://github.com/sajaddarabi/HCUP-US-EHR
        if (args.diagnoses):
            diag_vocab._build_from_file(
                os.path.join(args.vocab_path, 'diag.vocab'))
        if (args.cpts):
            cpt_vocab._build_from_file(
                os.path.join(args.vocab_path, 'cpt.vocab'))
        #if (args.procedures):
        #    proc_vocab._build_from_file(os.path.join(args.vocab_path, 'proc.vocab'))
        #if (args.med):
        #med_vocab._build_from_file(os.path.join(args.vocab_path, 'med.vocab'))

    if (os.path.exists(os.path.join(args.save, 'data.pkl'))):
        temp_data = pickle.load(open(os.path.join(args.save, 'data.pkl'),
                                     'rb'))
        temp_data = temp_data['data']

        t = list(temp_data.keys())
        t = t[0]
        d = 'text_embedding' in temp_data[t][0]

        if (not d):
            temp_data = []
        else:
            model = None
            bert_config = None
            torch.cuda.empty_cache()

    if args.embed_text:
        tokenizer = BertTokenizer(args.bert_vocab_path)

    if args.embed_text and (len(temp_data) == 0):
        bert_config = BertConfig(args.bert_config_path)
        model = BertTextModel(bert_config)
        state_dict = torch.load(args.state_dict_path)
        model.init_bert_weights(state_dict)
        device, _ = _prepare_device(args.gpu)
        model = model.to(device)
        max_seq_len_text_d = args.text_seq_length_discharge
        max_seq_len_text_r = args.text_seq_length_rest

        if max_seq_len_text_d == 0:
            max_seq_len_text = compute_max_seq_len_text(
                df, 'TEXT_DISCHARGE', tokenizer)
            max_seq_len_text = max_seq_len_text // args.max_bert_seq_len + 1
            max_seq_len_text_d = max_seq_len_text
            print("text sequence discharge length: {}".format(
                max_seq_len_text_d))

        if max_seq_len_text_r == 0:
            max_seq_len_text = compute_max_seq_len_text(
                df, 'TEXT_REST', tokenizer)
            max_seq_len_text = max_seq_len_text // args.max_bert_seq_len + 1
            max_seq_len_text_r = max_seq_len_text
            print("text sequence rest length: {}".format(max_seq_len_text_r))
    try:
        for pid in tqdm(pids):
            pid_df = df[df['SUBJECT_ID'] == pid]
            pid_df = pid_df.sort_values('ADMITTIME').reset_index()
            if (len(pid_df) < 1):  # skip patients with no admission records
                continue
            data[pid] = []

            t = 0
            hadm_ids = set(df['HADM_ID'])
            for i, r in pid_df.iterrows():
                # filter notes from prior to n days and concatenate them
                # leave the discharge summary separate
                admit_data = {}
                demographics = [r['AGE'], r['GENDER'], r['MARITAL_STATUS']]

                icu_unit = np.zeros((demographic_cols['LAST_CAREUNIT'].size, ),
                                    dtype=int)
                icu_unit[r['LAST_CAREUNIT']] = 1
                demographics += list(icu_unit)

                ethnicity = np.zeros((demographic_cols['ETHNICITY'].size, ),
                                     dtype=int)
                ethnicity[r['ETHNICITY']] = 1
                demographics += list(ethnicity)

                ethnicity = np.zeros((demographic_cols['ETHNICITY'].size, ),
                                     dtype=int)
                ethnicity[r['ETHNICITY']] = 1
                demographics += list(ethnicity)
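                # NOTE: the block above repeats the ethnicity one-hot encoding;
                # a DISCHARGE_LOCATION one-hot (see demographic_cols) may have
                # been intended here instead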

                admit_data['demographics'] = demographics
                dtok, ptok, mtok, ctok = [], [], [], []
                diagnosis_codes, proc_codes, med_codes, cpt_codes = np.nan, np.nan, np.nan, np.nan
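                # the x == x checks below are False for NaN, so admissions with
                # missing codes are skipped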

                if args.diagnoses:
                    diagnosis_codes = r['ICD9_CODE']

                if (diagnosis_codes == diagnosis_codes):
                    dtok = diag_vocab.convert_to_ids(diagnosis_codes, 'D',
                                                     args.short_code)

                if (args.procedures):
                    proc_codes = r['ICD9_CODE_PROCEDURE']

                if (proc_codes == proc_codes):
                    ptok = proc_vocab.convert_to_ids(proc_codes, 'P',
                                                     args.short_code)

                if args.medications:
                    med_codes = r[
                        'NDC']  # NOTE: unclear which NDC mapping version is used

                if (med_codes == med_codes):
                    mtok = med_vocab.convert_to_ids(med_codes, 'M')

                if args.cpts:
                    cpt_codes = r['CPT_CD']

                if (cpt_codes == cpt_codes):
                    ctok = cpt_vocab.convert_to_ids(cpt_codes, 'C')

                admit_data['diagnoses'] = dtok
                admit_data['procedures'] = ptok
                admit_data['medications'] = mtok
                admit_data['cptproc'] = ctok

                if (r['TIMEDELTA'] == r['TIMEDELTA']):
                    t += r['TIMEDELTA']

                admit_data['timedelta'] = t

                text_discharge = r['TEXT_DISCHARGE']
                text_rest = r['TEXT_REST']

                ttokd = tokenizer.tokenize(text_discharge)
                ttokd = tokenizer.convert_tokens_to_ids(ttokd)
                ttokr = tokenizer.tokenize(text_rest)
                ttokr = tokenizer.convert_tokens_to_ids(ttokr)

                admit_data['text_discharge_raw'] = text_discharge
                admit_data['text_rest_raw'] = text_rest

                admit_data['text_discharge_len'] = len(ttokd)
                admit_data['text_rest_len'] = len(ttokr)

                admit_data['text_discharge_token'] = ttokd
                admit_data['text_rest_token'] = ttokr

                if len(temp_data) == 0:
                    if (args.embed_text):
                        ttokd = embed_text(ttokd, device, model,
                                           args.max_bert_seq_len,
                                           max_seq_len_text_d)
                        ttokd = ttokd.cpu().numpy()
                        ttokr = embed_text(ttokr, device, model,
                                           args.max_bert_seq_len,
                                           max_seq_len_text_r)
                        ttokr = ttokr.cpu().numpy()
                else:
                    ttok = temp_data[pid][i]['text_embedding']

                admit_data['text_embedding_discharge'] = ttokd
                admit_data['text_embedding_rest'] = ttokr

                admit_data['los'] = r['LOS']
                admit_data['readmission'] = r['readmission_label']
                admit_data['mortality'] = r['DEATHTIME'] == r['DEATHTIME']
                data[pid].append(admit_data)

    except Exception as error:
        print(error)
        import pdb
        pdb.set_trace()

    if (not os.path.exists(args.save)):
        os.makedirs(args.save)

    # temporarily save data in case something goes wrong ...
    try:
        with open(os.path.join(args.save, 'data.pkl'), 'wb') as handle:
            data_dict = {}
            data_dict['data'] = data
            pickle.dump(data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    except:
        import pdb
        pdb.set_trace()

    pids = list(data.keys())
    flatten = lambda x: [item for sublist in x for item in sublist]

    data_info = {}
    num_icd9_codes, num_proc_codes, num_med_codes = 0, 0, 0
    data_info['num_patients'] = len(pids)
    data_info['max_seq_len_text_d'] = max_seq_len_text_d
    data_info['max_seq_len_text_r'] = max_seq_len_text_r

    data_info['num_icd9_codes'] = 0
    data_info['num_proc_codes'] = 0
    data_info['num_med_codes'] = 0

    if (args.diagnoses):
        num_icd9_codes = len(set(flatten(df_orig['ICD9_CODE'].dropna())))

    data_info['num_icd9_codes'] = num_icd9_codes

    if (args.procedures):
        num_proc_codes = len(
            set(flatten(df_orig['ICD9_CODE_PROCEDURE'].dropna())))

    data_info['num_proc_codes'] = num_proc_codes

    if (args.medications):
        num_med_codes = len(set(flatten(df_orig['NDC'].dropna())))

    data_info['num_med_codes'] = num_med_codes
    data_info['demographics_shape'] = len(data[pids[0]][0]['demographics'])
    data_info['demographic_cols'] = demographic_cols
    data_info['total_codes'] = data_info['num_icd9_codes'] + data_info[
        'num_proc_codes'] + data_info['num_med_codes']

    if (not os.path.exists(args.save)):
        os.makedirs(args.save)

    with open(os.path.join(args.save, 'data.pkl'), 'wb') as handle:
        data_dict = {}
        data_dict['info'] = data_info
        data_dict['data'] = data
        pickle.dump(data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join(args.save, 'cpt_vocab.pkl'), 'wb') as handle:
        pickle.dump(cpt_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join(args.save, 'diag_vocab.pkl'), 'wb') as handle:
        pickle.dump(diag_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join(args.save, 'med_vocab.pkl'), 'wb') as handle:
        pickle.dump(med_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join(args.save, 'proc_vocab.pkl'), 'wb') as handle:
        pickle.dump(proc_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
def main(locator, weather_path, scenario, parameter_set, time_start, time_end,
         time_step_ts, set_temperature_goal, constant_temperature):
    # Preliminary step - time
    date_and_time_prediction = pd.date_range(
        start=time_start, end=time_end, freq=pd.to_timedelta(time_step_ts))
    time_step = date_and_time_prediction[1] - date_and_time_prediction[0]

    time_end_object = datetime.datetime.strptime(time_end, '%Y-%m-%d %H:%M:%S')
    last_step_plus_1 = time_end_object + time_step
    last_step_plus_1_str = datetime.datetime.strftime(last_step_plus_1,
                                                      '%Y-%m-%d %H:%M:%S')
    date_and_time_prediction_plus_1 = pd.date_range(
        start=time_start,
        end=last_step_plus_1_str,
        freq=pd.to_timedelta(time_step_ts))

    # Getting and writing general data
    (internal_loads_df, indoor_comfort_df, construction_envelope_systems_df,
     leakage_envelope_systems_df, window_envelope_systems_df,
     roofs_envelope_systems_df, wall_envelope_systems_df,
     shading_envelope_systems_df, emission_systems_heating_df,
     emission_systems_cooling_df, emission_systems_controller_df,
     system_controls_ini_df, cooling_generation_df, zone_occupancy_df, zone_df,
     architecture_df, technical_systems_df, supply_systems_df,
     weather_general_info, weather_timeseries_initial_df, occupancy_types_full,
     occupancy_types, buildings_names, building_geometry_all,
     occupancy_types_full_cardinal, buildings_cardinal,
     occupancy_types_cardinal, occupants_probability_dic,
     lighting_appliances_probability_dic, processes_probability_dic,
     monthly_use_probability_df, occupancy_density_m2_p, footprint,
     gross_floor_area_m2, floors_cardinal_df, total_gross_floor_area_m2,
     mean_floor_height_m, system_controls_df, supply_temperature_df,
     emissions_cooling_type_dic, emissions_controller_type_dic,
     generation_cooling_code_dic, occupancy_per_building_cardinal,
     occupancy_per_building_list, T_int_cea_dic,
     T_ext_cea_df) = building_extract_cea_data.main(locator, weather_path,
                                                    time_start, time_end)

    (date_and_time, year, wet_bulb_temperature_df,
     occupancy_probability_df) = building_write_definitions.main(
         locator, scenario, date_and_time_prediction, time_start, time_end,
         time_step, parameter_set, internal_loads_df,
         construction_envelope_systems_df, leakage_envelope_systems_df,
         window_envelope_systems_df, roofs_envelope_systems_df,
         wall_envelope_systems_df, shading_envelope_systems_df,
         zone_occupancy_df, architecture_df, weather_general_info,
         weather_timeseries_initial_df, occupancy_types,
         occupancy_types_cardinal, buildings_names, building_geometry_all,
         occupants_probability_dic, lighting_appliances_probability_dic,
         processes_probability_dic, monthly_use_probability_df,
         occupancy_density_m2_p, gross_floor_area_m2, mean_floor_height_m,
         DELTA_P_DIM, HE_E, H_I, DENSITY_AIR, HEAT_CAPACITY_AIR,
         supply_temperature_df, emissions_cooling_type_dic)

    (prediction_horizon, center_interval_temperatures_dic,
     set_setback_temperatures_dic, setback_boolean_dic, heating_boolean,
     cooling_boolean, set_temperatures_dic) = building_setup_district.main(
         date_and_time_prediction, time_step, set_temperature_goal,
         constant_temperature, buildings_names, system_controls_df,
         occupancy_per_building_cardinal, occupancy_per_building_list,
         occupancy_probability_df, indoor_comfort_df, T_int_cea_dic)

    electricity_prices_MWh = pd.read_excel(locator.get_electricity_costs(),
                                           "ELECTRICITY")
    electricity_prices_MWh[
        "PRICE ($/MWh)"] = electricity_prices_MWh["cost_kWh"] * 1000
    electricity_prices_MWh["our_datetime"] = pd.date_range(
        start='1/1/2005', periods=HOURS_IN_YEAR)
    electricity_prices_MWh.set_index('our_datetime', inplace=True)

    (
        Qcsmax_Wm2_dic,
        em_efficiency_mean_dic,
    ) = building_process_hvac_efficiencies.main(
        locator, buildings_names, footprint, buildings_cardinal,
        cooling_generation_df, emission_systems_cooling_df,
        emission_systems_controller_df, generation_cooling_code_dic,
        emissions_cooling_type_dic, emissions_controller_type_dic,
        set_temperatures_dic, T_ext_cea_df, wet_bulb_temperature_df,
        prediction_horizon, date_and_time_prediction,
        occupancy_per_building_cardinal, occupancy_per_building_list,
        supply_temperature_df, PHI_5_MAX, FB, HP_ETA_EX_COOL, HP_AUXRATIO)

    return (prediction_horizon, date_and_time_prediction,
            date_and_time_prediction_plus_1, time_step, year, buildings_names,
            buildings_cardinal, center_interval_temperatures_dic,
            set_setback_temperatures_dic, setback_boolean_dic, heating_boolean,
            cooling_boolean, set_temperatures_dic,
            occupancy_per_building_cardinal, occupancy_per_building_list,
            gross_floor_area_m2, total_gross_floor_area_m2, indoor_comfort_df,
            occupancy_density_m2_p, occupancy_probability_df,
            em_efficiency_mean_dic, Qcsmax_Wm2_dic, electricity_prices_MWh)
Example #40
0
def interpolate_weather_file(weather_file_path,
                             weather_data_type,
                             datetime_start,
                             datetime_end,
                             interpolation_freq,
                             remove_leapyear):
    """Interpolate the data from a weather file to a new frequency."""
    debug_plotting = False  # Show a plot to check the interpolation result
#    debug_plotting = True  # Show a plot to check the interpolation result

#    plot_value = 'IBEAM_H'
#    plot_value = 'IDIFF_H'
    plot_value = 'TAMB'
#    plot_value = 'WSPEED'
#    plot_value = 'RHUM'
#    plot_value = 'WDIR'
#    plot_value = 'CCOVER'
#    plot_value = 'PAMB'

    weather_file = os.path.basename(weather_file_path)

    # Read the file and store it in a DataFrame
    if weather_data_type == 'IGS' or weather_data_type == 'TRNSYS':
        weather_data = read_IGS_weather_file(weather_file_path)
    elif weather_data_type == 'DWD':
        weather_data = read_DWD_weather_file(weather_file_path)
    else:
        logger.error('Weather data type "'+weather_data_type+'" unknown!')
        exit()

    # Assumption: The IGS weather files always start at January 01.
    current_year = datetime_start.year
    newyear = datetime.datetime(current_year, 1, 1)
    # Convert hours of year to DateTime and make that the index of DataFrame
    weather_data.index = pd.to_timedelta(weather_data['HOUR'],
                                         unit='h') + newyear

    # Infer the time frequency of the original data
    original_freq = pd.infer_freq(weather_data.index, warn=True)
    original_freq = pd.to_timedelta(1, unit=original_freq)
#    logger.debug('Inferred freqency = '+str(original_freq))

    if debug_plotting is True:  # Plot the original data (Ambient temperature)
        fig = plt.figure()
        fig.suptitle(weather_file)
        weather_data[plot_value].plot(marker='.', label=plot_value+' orig')

    if interpolation_freq != original_freq:
        # Perform interpolation to new index of hours
        # Definition:
        # "Column value is a mean value related to the time interval delta t
        # ending at the time corresponding to actual weather_data line."

        # Thus during interpolation, a value must move to the middle of the
        # previous timestep

        # If the new frequency is larger (i.e. we are downsampling the data),
        # we need to use 'resample' to take the mean of the time intervals we
        # combine
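        # Illustration (hypothetical hourly input): a value stamped 01:00 is
        # the mean over 00:00-01:00, so after the shift it is treated as the
        # value at 00:30 before interpolating to the new index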
        if interpolation_freq > original_freq:
            weather_data = weather_data.resample(interpolation_freq,
                                                 label='right',
                                                 closed='right').mean()
        # Now we can do the interpolation (upsampling). If we downsampled
        # before, this now only affects the start and end of the data

        # Create a shifted index to interpolate to
        interpolate_index = pd.date_range(
                start=datetime_start + pd.Timedelta(original_freq)/2  # shift
                + pd.Timedelta(interpolation_freq),  # prevent "0 h" time stamp
                end=datetime_end + pd.Timedelta(original_freq)/2,  # shift
                freq=interpolation_freq)

        weather_data = weather_data.reindex(interpolate_index)
        if interpolation_freq < original_freq:
            # Shift the correct number of steps to set a value to the middle
            # of the time step
            weather_data = weather_data.shift(
                    freq=-pd.Timedelta(original_freq)/2)
        weather_data = weather_data.interpolate(method='time')

        # The interpolation will generate NaN on the lines before the first
        # original line (hours = 1). Fill those NaN 'backwards' with the last
        # valid values:
        weather_data.fillna(method='backfill', inplace=True)

        # Cloud cover is given in integers, so interpolated values need to be
        # rounded
        weather_data['CCOVER'] = weather_data['CCOVER'].round(decimals=0)

        # Convert DateTime index to hours of the year
        weather_data['HOUR'] = (weather_data.index - datetime_start) / \
            np.timedelta64(1, 'h')

        if debug_plotting is True:  # Plot the interpolated data
            weather_data[plot_value].plot(marker='x',
                                          label=plot_value+' intpl.')
    else:
        # No interpolation required. But we need to slice from start to end
        weather_data = weather_data[datetime_start:datetime_end]

    # Remove leapyear from DataFrame (optional)
    if calendar.isleap(current_year) is True:
        logger.warn(str(current_year)+' is a leap year. Be careful!')
    if remove_leapyear is True:
        weather_data = weather_data[~((weather_data.index.month == 2) &
                                      (weather_data.index.day == 29))]

    # Now show the plots, including their legend
    if debug_plotting is True:
        plt.legend()
        plt.show(block=False)

    return weather_data
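
# A minimal usage sketch for interpolate_weather_file (file name, dates and
# frequency below are illustrative placeholders, not from the original project):
# weather_df = interpolate_weather_file('weather_TRY.dat', 'DWD',
#                                       datetime.datetime(2017, 1, 1),
#                                       datetime.datetime(2017, 12, 31, 23),
#                                       pd.Timedelta(minutes=15),
#                                       remove_leapyear=False)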
Example #41
0
train1_mergeDataset_add = pd.merge(train1_mergeDataset,
                                   train1_dayofweek_dataset,
                                   on=['user_id', 'day_of_week'],
                                   how='left')
#train1_mergeDataset_add = train1_mergeDataset
#train1_mergeDataset_add.drop(['DOW_power_sum', 'DOW_allsum', 'power_sum', 'DOW_power_mean', 'DOW_power_std', 'DOW_powaer_rate', 'power_mean', 'power_std', 'power_rate'], axis=1,inplace=True)
#train1_mergeDataset_add.drop(['DOW_power_sum', 'DOW_allsum', 'power_sum', 'DOW_power_mean', 'DOW_powaer_rate', 'power_mean', 'power_rate'], axis=1,inplace=True)
#train1_mergeDataset_add.drop(['DOW_power_sum', 'DOW_allsum', 'power_sum'], axis=1,inplace=True)
train1_mergeDataset_add.drop([
    'DOW_power_sum', 'DOW_allsum', 'power_sum', 'DOW_power_rate', 'power_rate'
],
                             axis=1,
                             inplace=True)
train1_Y = handledataset[
    (handledataset.record_date >=
     (pd.to_datetime('2015-01-01') + pd.to_timedelta(7 * 81, unit='D')))
    & (handledataset.record_date <
       (pd.to_datetime('2015-01-01') + pd.to_timedelta(7 * 82, unit='D')))]
final_train1 = pd.merge(train1_mergeDataset_add,
                        train1_Y,
                        on=['user_id', 'day_of_week'],
                        how='left')

print "select train2 dataset ............."
train2 = pd.read_csv(
    u'/home/haven/Tianchi_power/Wavelet_Handle(2)/F3_Result/train2AndPredictY.csv'
)
train2_MeanStdSum = train2.groupby(['user_id'])['power_consumption'].agg({
    'power_mean':
    np.mean,
    'power_std':
Example #42
0
import matplotlib.dates as mdates

pdf = PdfPages("sg_electricity.pdf")

df = pd.read_csv("sg_electricity.csv")
df["date"] = pd.to_datetime(df.date)


# Clean up the hour variable and create a unified date variable
def f(x):
    u = x.split(":")
    return int(u[0]) + int(u[1]) / 60 - 0.5


df["hourofday"] = df.period_ending_time.apply(f)
df["date"] += pd.to_timedelta(df.hourofday, 'h')

# Below are functions to extract different elements of the date
# variable, in most cases using sin/cos to force continuous
# periodicity.


# Trend (non-periodic)
def q0(x):
    return (x - pd.to_datetime("2012-01-01")).dt.days


# Periodic cycle by year
def f1(x):
    return np.cos(2 * np.pi * x.dt.dayofyear / 365)
Example #43
0
 def test_string_indexing(self):
     # GH 16896
     df = pd.DataFrame({"x": range(3)}, index=pd.to_timedelta(range(3), unit="days"))
     expected = df.iloc[0]
     sliced = df.loc["0 days"]
     tm.assert_series_equal(sliced, expected)
Example #44
0
File: tools.py  Project: yiwen90/pandas
def _assemble_from_unit_mappings(arg, errors):
    """
    assemble the unit specified fields from the arg (DataFrame)
    Return a Series for actual parsing

    Parameters
    ----------
    arg : DataFrame
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'

        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaT
        - If 'ignore', then invalid parsing will return the input

    Returns
    -------
    Series
    """
    from pandas import to_timedelta, to_numeric, DataFrame
    arg = DataFrame(arg)
    if not arg.columns.is_unique:
        raise ValueError("cannot assemble with duplicate keys")

    # replace passed unit with _unit_map
    def f(value):
        if value in _unit_map:
            return _unit_map[value]

        # the unit 'm' is case-sensitive (month vs. minute), so check the exact key before lowercasing
        if value.lower() in _unit_map:
            return _unit_map[value.lower()]

        return value

    unit = {k: f(k) for k in arg.keys()}
    unit_rev = {v: k for k, v in unit.items()}

    # we require at least Ymd
    required = ['year', 'month', 'day']
    req = sorted(list(set(required) - set(unit_rev.keys())))
    if len(req):
        raise ValueError("to assemble mappings requires at "
                         "least that [year, month, day] be specified: "
                         "[{0}] is missing".format(','.join(req)))

    # keys we don't recognize
    excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values())))
    if len(excess):
        raise ValueError("extra keys have been passed "
                         "to the datetime assemblage: "
                         "[{0}]".format(','.join(excess)))

    def coerce(values):
        # coerce to numeric if the errors setting allows it
        values = to_numeric(values, errors=errors)

        # prevent overflow in case of int8 or int16
        if is_integer_dtype(values):
            values = values.astype('int64', copy=False)
        return values

    values = (coerce(arg[unit_rev['year']]) * 10000 +
              coerce(arg[unit_rev['month']]) * 100 +
              coerce(arg[unit_rev['day']]))
    try:
        values = to_datetime(values, format='%Y%m%d', errors=errors)
    except (TypeError, ValueError) as e:
        raise ValueError("cannot assemble the " "datetimes: {0}".format(e))

    for u in ['h', 'm', 's', 'ms', 'us', 'ns']:
        value = unit_rev.get(u)
        if value is not None and value in arg:
            try:
                values += to_timedelta(coerce(arg[value]),
                                       unit=u,
                                       errors=errors)
            except (TypeError, ValueError) as e:
                raise ValueError("cannot assemble the datetimes "
                                 "[{0}]: {1}".format(value, e))

    return values
Example #45
0
    def test_construction(self):

        expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8')
        assert Timedelta(10, unit='d').value == expected
        assert Timedelta(10.0, unit='d').value == expected
        assert Timedelta('10 days').value == expected
        assert Timedelta(days=10).value == expected
        assert Timedelta(days=10.0).value == expected

        expected += np.timedelta64(10, 's').astype('m8[ns]').view('i8')
        assert Timedelta('10 days 00:00:10').value == expected
        assert Timedelta(days=10, seconds=10).value == expected
        assert Timedelta(days=10, milliseconds=10 * 1000).value == expected
        assert (Timedelta(days=10,
                          microseconds=10 * 1000 * 1000).value == expected)

        # gh-8757: test construction with np dtypes
        timedelta_kwargs = {
            'days': 'D',
            'seconds': 's',
            'microseconds': 'us',
            'milliseconds': 'ms',
            'minutes': 'm',
            'hours': 'h',
            'weeks': 'W'
        }
        npdtypes = [
            np.int64, np.int32, np.int16, np.float64, np.float32, np.float16
        ]
        for npdtype in npdtypes:
            for pykwarg, npkwarg in timedelta_kwargs.items():
                expected = np.timedelta64(1,
                                          npkwarg).astype('m8[ns]').view('i8')
                assert Timedelta(**{pykwarg: npdtype(1)}).value == expected

        # rounding cases
        assert Timedelta(82739999850000).value == 82739999850000
        assert ('0 days 22:58:59.999850' in str(Timedelta(82739999850000)))
        assert Timedelta(123072001000000).value == 123072001000000
        assert ('1 days 10:11:12.001' in str(Timedelta(123072001000000)))

        # string conversion with/without leading zero
        # GH 9570
        assert Timedelta('0:00:00') == timedelta(hours=0)
        assert Timedelta('00:00:00') == timedelta(hours=0)
        assert Timedelta('-1:00:00') == -timedelta(hours=1)
        assert Timedelta('-01:00:00') == -timedelta(hours=1)

        # more strings & abbrevs
        # GH 8190
        assert Timedelta('1 h') == timedelta(hours=1)
        assert Timedelta('1 hour') == timedelta(hours=1)
        assert Timedelta('1 hr') == timedelta(hours=1)
        assert Timedelta('1 hours') == timedelta(hours=1)
        assert Timedelta('-1 hours') == -timedelta(hours=1)
        assert Timedelta('1 m') == timedelta(minutes=1)
        assert Timedelta('1.5 m') == timedelta(seconds=90)
        assert Timedelta('1 minute') == timedelta(minutes=1)
        assert Timedelta('1 minutes') == timedelta(minutes=1)
        assert Timedelta('1 s') == timedelta(seconds=1)
        assert Timedelta('1 second') == timedelta(seconds=1)
        assert Timedelta('1 seconds') == timedelta(seconds=1)
        assert Timedelta('1 ms') == timedelta(milliseconds=1)
        assert Timedelta('1 milli') == timedelta(milliseconds=1)
        assert Timedelta('1 millisecond') == timedelta(milliseconds=1)
        assert Timedelta('1 us') == timedelta(microseconds=1)
        assert Timedelta('1 micros') == timedelta(microseconds=1)
        assert Timedelta('1 microsecond') == timedelta(microseconds=1)
        assert Timedelta('1.5 microsecond') == Timedelta('00:00:00.000001500')
        assert Timedelta('1 ns') == Timedelta('00:00:00.000000001')
        assert Timedelta('1 nano') == Timedelta('00:00:00.000000001')
        assert Timedelta('1 nanosecond') == Timedelta('00:00:00.000000001')

        # combos
        assert Timedelta('10 days 1 hour') == timedelta(days=10, hours=1)
        assert Timedelta('10 days 1 h') == timedelta(days=10, hours=1)
        assert Timedelta('10 days 1 h 1m 1s') == timedelta(days=10,
                                                           hours=1,
                                                           minutes=1,
                                                           seconds=1)
        assert Timedelta('-10 days 1 h 1m 1s') == -timedelta(
            days=10, hours=1, minutes=1, seconds=1)
        assert Timedelta('-10 days 1 h 1m 1s') == -timedelta(
            days=10, hours=1, minutes=1, seconds=1)
        assert Timedelta('-10 days 1 h 1m 1s 3us') == -timedelta(
            days=10, hours=1, minutes=1, seconds=1, microseconds=3)
        assert Timedelta('-10 days 1 h 1.5m 1s 3us') == -timedelta(
            days=10, hours=1, minutes=1, seconds=31, microseconds=3)

        # Currently invalid as it has a - on the hh:mm:dd part
        # (only allowed on the days)
        pytest.raises(ValueError,
                      lambda: Timedelta('-10 days -1 h 1.5m 1s 3us'))

        # only leading neg signs are allowed
        pytest.raises(ValueError,
                      lambda: Timedelta('10 days -1 h 1.5m 1s 3us'))

        # no units specified
        pytest.raises(ValueError, lambda: Timedelta('3.1415'))

        # invalid construction
        tm.assert_raises_regex(ValueError, "cannot construct a Timedelta",
                               lambda: Timedelta())
        tm.assert_raises_regex(ValueError, "unit abbreviation w/o a number",
                               lambda: Timedelta('foo'))
        tm.assert_raises_regex(
            ValueError, "cannot construct a Timedelta from the "
            "passed arguments, allowed keywords are ",
            lambda: Timedelta(day=10))

        # round-trip both for string and value
        for v in [
                '1s', '-1s', '1us', '-1us', '1 day', '-1 day',
                '-23:59:59.999999', '-1 days +23:59:59.999999', '-1ns', '1ns',
                '-23:59:59.999999999'
        ]:

            td = Timedelta(v)
            assert Timedelta(td.value) == td

            # str does not normally display nanos
            if not td.nanoseconds:
                assert Timedelta(str(td)) == td
            assert Timedelta(td._repr_base(format='all')) == td

        # floats
        expected = np.timedelta64(
            10, 's').astype('m8[ns]').view('i8') + np.timedelta64(
                500, 'ms').astype('m8[ns]').view('i8')
        assert Timedelta(10.5, unit='s').value == expected

        # offset
        assert (to_timedelta(
            pd.offsets.Hour(2)) == Timedelta('0 days, 02:00:00'))
        assert (Timedelta(pd.offsets.Hour(2)) == Timedelta('0 days, 02:00:00'))
        assert (Timedelta(
            pd.offsets.Second(2)) == Timedelta('0 days, 00:00:02'))

        # gh-11995: unicode
        expected = Timedelta('1H')
        result = pd.Timedelta(u'1H')
        assert result == expected
        assert (to_timedelta(
            pd.offsets.Hour(2)) == Timedelta(u'0 days, 02:00:00'))

        pytest.raises(ValueError, lambda: Timedelta(u'foo bar'))
Example #46
0
def setup_interface_daily():
    b_d = "temp_daily"
    nam_file = "freyberg.nam"
    m = flopy.modflow.Modflow.load(nam_file,
                                   model_ws=b_d,
                                   check=False,
                                   forgive=False)

    # assign the executable name for the model
    m.exe_name = "mfnwt"

    # now let's run this in a new folder called temp so we don't overwrite the original data
    m.change_model_ws("temp", reset_external=True)

    # this writes all the MODFLOW files in the new location
    m.write_input()

    # the following helps get the dependencies (both python and executables) in the right place
    prep_deps.prep_template(t_d="temp")

    pyemu.os_utils.run("{0} {1}".format(m.exe_name, m.name + ".nam"),
                       cwd=m.model_ws)

    props = []
    paks = [
        "upw.hk", "upw.vka", "upw.ss", "upw.sy", "bas6.strt", "extra.prsity"
    ]  #"extra" because not a modflow parameter
    for k in range(m.nlay):
        props.extend([[p, k] for p in paks])
    const_props = props.copy()
    props.append(["rch.rech", None])

    for kper in range(m.nper):
        const_props.append(["rch.rech", kper])

    spatial_list_props = [
        ["wel.flux", 2], ["ghb.cond", 0], ["ghb.cond", 1], ["ghb.cond", 2]
    ]  # spatially by each list entry, across all stress periods
    temporal_list_props = [["wel.flux", kper] for kper in range(m.nper)
                           ]  # spatially uniform for each stress period

    spatial_list_props, temporal_list_props

    dry_kper = int(m.nper * 0.85)
    hds_kperk = [[kper, k] for k in range(m.nlay)
                 for kper in [0, dry_kper, m.nper - 1]]

    hds_kperk

    sfr_obs_dict = {}
    sfr_obs_dict["hw"] = np.arange(1, int(m.nrow / 2))
    sfr_obs_dict["tw"] = np.arange(int(m.nrow / 2), m.nrow)
    sfr_obs_dict["gage_1"] = [39]

    pst_helper = pyemu.helpers.PstFromFlopyModel(
        nam_file,
        new_model_ws=t_d,
        org_model_ws="temp",
        const_props=const_props,
        spatial_list_props=spatial_list_props,
        temporal_list_props=temporal_list_props,
        remove_existing=True,
        grid_props=props,
        pp_props=props,
        sfr_pars=["strk"],
        hds_kperk=hds_kperk,
        sfr_obs=sfr_obs_dict,
        build_prior=False,
        model_exe_name="mfnwt",
        pp_space=4)
    prep_deps.prep_template(t_d=pst_helper.new_model_ws)

    pst = pst_helper.pst

    # check out hydraulic conductivity parameters
    pst.parameter_data.loc[
        pst.parameter_data.parnme.apply(lambda x: "hk" in x), :]

    # what about observations? in particular, the sfr flow-out observations?
    pst.observation_data.loc[
        pst.observation_data.obgnme.apply(lambda x: "flout" in x), :]

    obs = pst.observation_data
    flout_obs = obs.loc[obs.obgnme.apply(lambda x: "flout" in x), "obsnme"]
    obs.loc[flout_obs,
            "obgnme"] = flout_obs.apply(lambda x: "_".join(x.split('_')[:-1]))

    obs_locs = pd.read_csv(
        os.path.join("..", "base_model_files", "obs_loc.csv"))
    #build obs names that correspond to the obsnme values in the control file
    obs_locs.loc[:, "site"] = obs_locs.apply(
        lambda x: "trgw_{0:03d}_{1:03d}".format(x.row - 1, x.col - 1), axis=1)
    kij_dict = {
        site: (2, r - 1, c - 1)
        for site, r, c in zip(obs_locs.site, obs_locs.row, obs_locs.col)
    }

    binary_file = os.path.join(pst_helper.m.model_ws,
                               nam_file.replace(".nam", ".hds"))
    frun_line, tr_hds_df = pyemu.gw_utils.setup_hds_timeseries(
        binary_file, kij_dict=kij_dict, include_path=True, model=pst_helper.m)
    pst_helper.frun_post_lines.append(frun_line)

    tr_hds_df.head()

    [f for f in os.listdir(pst_helper.m.model_ws) if f.endswith(".ins")]

    df = pst_helper.pst.add_observations(os.path.join(
        pst_helper.m.model_ws,
        nam_file.replace(".nam", ".hds_timeseries.processed.ins")),
                                         pst_path=".")
    obs = pst_helper.pst.observation_data
    obs.loc[df.index,
            "obgnme"] = df.index.map(lambda x: "_".join(x.split("_")[:-1]))
    obs.loc[df.index, "weight"] = 1.0

    mp_files = [f for f in os.listdir(b_d) if "mp" in f or "location" in f]
    for f in mp_files:
        shutil.copy2(os.path.join(b_d, f),
                     os.path.join(pst_helper.new_model_ws, f))

    pst_helper.frun_post_lines.append(
        "pyemu.os_utils.run('mp6 freyberg.mpsim >mp6.stdout')")
    pst_helper.tmp_files.append(
        "freyberg.mpenpt")  # placed at top of `forward_run.py`
    pst_helper.write_forward_run()

    out_file = "freyberg.mpenpt"
    ins_file = out_file + ".ins"
    with open(os.path.join(pst_helper.new_model_ws, ins_file), 'w') as f:
        f.write("pif ~\n")
        f.write("l7 w w w !part_status! w w !part_time!\n")

    df = pst_helper.pst.add_observations(os.path.join(pst_helper.new_model_ws,
                                                      ins_file),
                                         os.path.join(pst_helper.new_model_ws,
                                                      out_file),
                                         pst_path=".")
    for k in range(m.nlay):
        np.savetxt(os.path.join(pst_helper.new_model_ws, "arr_org",
                                "prsity_layer_{0}.ref".format(k + 1)),
                   np.zeros((m.nrow, m.ncol)) + 0.001,
                   fmt="%15.6E")

    par = pst.parameter_data
    tag_dict = {
        "hk": [0.1, 10.0],
        "vka": [0.1, 10],
        "strt": [0.95, 1.05],
        "pr": [0.8, 1.2],
        "rech": [0.8, 1.2]
    }
    for t, [l, u] in tag_dict.items():
        t_pars = par.loc[par.parnme.apply(lambda x: t in x), "parnme"]
        par.loc[t_pars, "parubnd"] = u
        par.loc[t_pars, "parlbnd"] = l

    arr_csv = os.path.join(pst_helper.new_model_ws, "arr_pars.csv")
    df = pd.read_csv(arr_csv, index_col=0)

    sy_pr = df.model_file.apply(lambda x: "sy" in x or "pr" in x)
    df.loc[:, "upper_bound"] = np.NaN
    df.loc[sy_pr, "upper_bound"] = 0.4
    df.to_csv(arr_csv)

    pst.control_data.noptmax = 0
    pst.write(os.path.join(pst_helper.new_model_ws, "freyberg.pst"))
    pyemu.os_utils.run("pestpp-ies freyberg.pst", cwd=pst_helper.new_model_ws)

    pst = pyemu.Pst(os.path.join(pst_helper.m.model_ws, "freyberg.pst"))

    pe = pst_helper.draw(100)
    pe.enforce()  # always a good idea!
    pe.to_binary(os.path.join(pst_helper.new_model_ws, "prior.jcb"))
    pst_helper.pst.write(
        os.path.join(pst_helper.m.model_ws, nam_file.replace(".nam", ".pst")))

    obs = pst_helper.pst.observation_data
    dts = pd.to_datetime(pst_helper.m.start_datetime) + pd.to_timedelta(
        np.cumsum(pst_helper.m.dis.perlen.array), unit='d')
    dts_str = list(dts.map(lambda x: x.strftime("%Y%m%d")).values)
    dry_kper = int(pst_helper.m.nper * 0.85)
    dry_dt = dts_str[dry_kper]
    print(dry_dt)
    swgw_forecasts = obs.loc[
        obs.obsnme.apply(lambda x: "fa" in x and
                         ("hw" in x or "tw" in x) and dry_dt in x),
        "obsnme"].tolist()
    hds_fore_name = "hds_00_{0:03d}_{1:03d}_{2:03d}".format(
        int(pst_helper.m.nrow / 3), int(pst_helper.m.ncol / 10), dry_kper)
    print(hds_fore_name)
    hds_forecasts = obs.loc[obs.obsnme.apply(lambda x: hds_fore_name in x),
                            "obsnme"].tolist()
    forecasts = swgw_forecasts
    forecasts.extend(hds_forecasts)
    forecasts.append("part_time")
    forecasts.append("part_status")
    pst_helper.pst.pestpp_options["forecasts"] = forecasts

    pst_helper.pst.write(
        os.path.join(pst_helper.m.model_ws, nam_file.replace(".nam", ".pst")))
    lst = flopy.utils.MfListBudget(
        os.path.join(pst_helper.m.model_ws, "freyberg.list"))
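
A note on the date handling above: the forecast block turns cumulative stress-period lengths (perlen) into calendar dates by adding a cumulative timedelta to the model start date. Below is a minimal, self-contained sketch of that pattern with made-up period lengths, so it can be run without a MODFLOW model; the variable names are illustrative only.

import numpy as np
import pandas as pd

# stand-in for m.dis.perlen.array: stress-period lengths in days (hypothetical values)
perlen = np.array([1.0, 30.0, 30.0, 31.0])

start_datetime = pd.to_datetime("2018-01-01")  # stand-in for m.start_datetime
# end date of each stress period = start date + cumulative elapsed days
period_ends = start_datetime + pd.to_timedelta(np.cumsum(perlen), unit="d")
print(period_ends.strftime("%Y%m%d").tolist())
# ['20180102', '20180201', '20180303', '20180403']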
Example #47
0
 def create_index(self):
     return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1)
Example #48
0
import pandas as pd
import numpy as np

df = pd.read_csv('timbu.csv',
                 parse_dates=True,
                 skiprows=12,
                 header=None,
                 dtype=str)
df

df2 = pd.DataFrame()
df2['temperatura_C'] = pd.to_numeric(df[3] + '.' + df[4])
df2['pressao_M'] = pd.to_numeric(df[5] + '.' + df[6])
df2.index = (pd.to_datetime(df[1], dayfirst=True) +
             pd.to_timedelta(df[2])).rename('datahora')

print(df2)

df3 = pd.read_csv('dados_20190525.csv', index_col='datahora', parse_dates=True)

df3
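
The example above builds a datetime index by adding a time-of-day column (parsed with pd.to_timedelta) to a date column (parsed with dayfirst=True). Here is a minimal sketch of the same pattern with inline data, so it runs without the CSV files; the column names and values are made up.

import pandas as pd

raw = pd.DataFrame({"date": ["25/05/2019", "25/05/2019"],
                    "time": ["00:10:00", "00:20:00"],
                    "temp_int": ["21", "22"],
                    "temp_dec": ["5", "0"]})

out = pd.DataFrame()
# rebuild the decimal value from the split integer/decimal string columns
out["temperatura_C"] = pd.to_numeric(raw["temp_int"] + "." + raw["temp_dec"])
# datetime index = calendar date + time-of-day offset
out.index = (pd.to_datetime(raw["date"], dayfirst=True) +
             pd.to_timedelta(raw["time"])).rename("datahora")
print(out)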
Example #49
0
 def test_iso_conversion(self):
     # GH #21877
     expected = Timedelta(1, unit="s")
     assert to_timedelta("P0DT0H0M1S") == expected
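
pd.to_timedelta also accepts ISO 8601 duration strings, as the test above (GH 21877) checks. A couple of additional hedged examples in the same spirit; exact coverage of the ISO grammar can vary between pandas versions.

import pandas as pd

assert pd.to_timedelta("P0DT0H0M1S") == pd.Timedelta(seconds=1)
assert pd.to_timedelta("P1DT2H30M0S") == pd.Timedelta(days=1, hours=2, minutes=30)
assert pd.to_timedelta("P0DT0H0M0.5S") == pd.Timedelta(milliseconds=500)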
Example #50
0
    def __init__(self,
                 index,
                 grouper=None,
                 obj=None,
                 name=None,
                 level=None,
                 sort=True,
                 observed=False,
                 in_axis=False):

        self.name = name
        self.level = level
        self.grouper = _convert_grouper(index, grouper)
        self.all_grouper = None
        self.index = index
        self.sort = sort
        self.obj = obj
        self.observed = observed
        self.in_axis = in_axis

        # right place for this?
        if isinstance(grouper, (Series, Index)) and name is None:
            self.name = grouper.name

        if isinstance(grouper, MultiIndex):
            self.grouper = grouper.values

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passing in level

        if level is not None:
            if not isinstance(level, int):
                if level not in index.names:
                    raise AssertionError('Level {} not in index'.format(level))
                level = index.names.index(level)

            if self.name is None:
                self.name = index.names[level]

            self.grouper, self._labels, self._group_index = \
                index._get_grouper_for_level(self.grouper, level)

        # a passed Grouper like, directly get the grouper in the same way
        # as single grouper groupby, use the group_info to get labels
        elif isinstance(self.grouper, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
            if self.name is None:
                self.name = grouper.result_index.name
            self.obj = self.grouper.obj
            self.grouper = grouper

        else:
            if self.grouper is None and self.name is not None:
                self.grouper = self.obj[self.name]

            elif isinstance(self.grouper, (list, tuple)):
                self.grouper = com.asarray_tuplesafe(self.grouper)

            # a passed Categorical
            elif is_categorical_dtype(self.grouper):

                from pandas.core.groupby.categorical import recode_for_groupby
                self.grouper, self.all_grouper = recode_for_groupby(
                    self.grouper, self.sort, observed)
                categories = self.grouper.categories

                # we make a CategoricalIndex out of the cat grouper
                # preserving the categories / ordered attributes
                self._labels = self.grouper.codes
                if observed:
                    codes = algorithms.unique1d(self.grouper.codes)
                    codes = codes[codes != -1]
                    if sort or self.grouper.ordered:
                        codes = np.sort(codes)
                else:
                    codes = np.arange(len(categories))

                self._group_index = CategoricalIndex(
                    Categorical.from_codes(codes=codes,
                                           categories=categories,
                                           ordered=self.grouper.ordered))

            # we are done
            if isinstance(self.grouper, Grouping):
                self.grouper = self.grouper.grouper

            # no level passed
            elif not isinstance(self.grouper,
                                (Series, Index, ExtensionArray, np.ndarray)):
                if getattr(self.grouper, 'ndim', 1) != 1:
                    t = self.name or str(type(self.grouper))
                    raise ValueError(
                        "Grouper for '{}' not 1-dimensional".format(t))
                self.grouper = self.index.map(self.grouper)
                if not (hasattr(self.grouper, "__len__")
                        and len(self.grouper) == len(self.index)):
                    errmsg = ('Grouper result violates len(labels) == '
                              'len(data)\nresult: %s' %
                              pprint_thing(self.grouper))
                    self.grouper = None  # Try for sanity
                    raise AssertionError(errmsg)

        # if we have a date/time-like grouper, make sure that we have
        # Timestamps like
        if getattr(self.grouper, 'dtype', None) is not None:
            if is_datetime64_dtype(self.grouper):
                from pandas import to_datetime
                self.grouper = to_datetime(self.grouper)
            elif is_timedelta64_dtype(self.grouper):
                from pandas import to_timedelta
                self.grouper = to_timedelta(self.grouper)
Example #51
0
    def test_fields(self):
        def check(value):
            # that we are int/long like
            assert isinstance(value, (int, compat.long))

        # compat to datetime.timedelta
        rng = to_timedelta('1 days, 10:11:12')
        assert rng.days == 1
        assert rng.seconds == 10 * 3600 + 11 * 60 + 12
        assert rng.microseconds == 0
        assert rng.nanoseconds == 0

        pytest.raises(AttributeError, lambda: rng.hours)
        pytest.raises(AttributeError, lambda: rng.minutes)
        pytest.raises(AttributeError, lambda: rng.milliseconds)

        # GH 10050
        check(rng.days)
        check(rng.seconds)
        check(rng.microseconds)
        check(rng.nanoseconds)

        td = Timedelta('-1 days, 10:11:12')
        assert abs(td) == Timedelta('13:48:48')
        assert str(td) == "-1 days +10:11:12"
        assert -td == Timedelta('0 days 13:48:48')
        assert -Timedelta('-1 days, 10:11:12').value == 49728000000000
        assert Timedelta('-1 days, 10:11:12').value == -49728000000000

        rng = to_timedelta('-1 days, 10:11:12.100123456')
        assert rng.days == -1
        assert rng.seconds == 10 * 3600 + 11 * 60 + 12
        assert rng.microseconds == 100 * 1000 + 123
        assert rng.nanoseconds == 456
        pytest.raises(AttributeError, lambda: rng.hours)
        pytest.raises(AttributeError, lambda: rng.minutes)
        pytest.raises(AttributeError, lambda: rng.milliseconds)

        # components
        tup = pd.to_timedelta(-1, 'us').components
        assert tup.days == -1
        assert tup.hours == 23
        assert tup.minutes == 59
        assert tup.seconds == 59
        assert tup.milliseconds == 999
        assert tup.microseconds == 999
        assert tup.nanoseconds == 0

        # GH 10050
        check(tup.days)
        check(tup.hours)
        check(tup.minutes)
        check(tup.seconds)
        check(tup.milliseconds)
        check(tup.microseconds)
        check(tup.nanoseconds)

        tup = Timedelta('-1 days 1 us').components
        assert tup.days == -2
        assert tup.hours == 23
        assert tup.minutes == 59
        assert tup.seconds == 59
        assert tup.milliseconds == 999
        assert tup.microseconds == 999
        assert tup.nanoseconds == 0
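
One point worth calling out from the test above: a Timedelta only exposes the datetime.timedelta-compatible fields (days, seconds, microseconds, nanoseconds); asking for hours or minutes raises AttributeError, and the full breakdown lives on .components. A compact sketch:

import pandas as pd

td = pd.to_timedelta("-1 days, 10:11:12")
# field access: days is signed, the sub-day part is always non-negative
print(td.days, td.seconds)              # -1 36672
# td.hours would raise AttributeError; use .components for the full breakdown
print(pd.to_timedelta(-1, unit="us").components)
# Components(days=-1, hours=23, minutes=59, seconds=59,
#            milliseconds=999, microseconds=999, nanoseconds=0)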
Example #52
0
    def test_fields(self):
        def check(value):
            # that we are int
            assert isinstance(value, int)

        # compat to datetime.timedelta
        rng = to_timedelta("1 days, 10:11:12")
        assert rng.days == 1
        assert rng.seconds == 10 * 3600 + 11 * 60 + 12
        assert rng.microseconds == 0
        assert rng.nanoseconds == 0

        msg = "'Timedelta' object has no attribute '{}'"
        with pytest.raises(AttributeError, match=msg.format("hours")):
            rng.hours
        with pytest.raises(AttributeError, match=msg.format("minutes")):
            rng.minutes
        with pytest.raises(AttributeError, match=msg.format("milliseconds")):
            rng.milliseconds

        # GH 10050
        check(rng.days)
        check(rng.seconds)
        check(rng.microseconds)
        check(rng.nanoseconds)

        td = Timedelta("-1 days, 10:11:12")
        assert abs(td) == Timedelta("13:48:48")
        assert str(td) == "-1 days +10:11:12"
        assert -td == Timedelta("0 days 13:48:48")
        assert -Timedelta("-1 days, 10:11:12").value == 49728000000000
        assert Timedelta("-1 days, 10:11:12").value == -49728000000000

        rng = to_timedelta("-1 days, 10:11:12.100123456")
        assert rng.days == -1
        assert rng.seconds == 10 * 3600 + 11 * 60 + 12
        assert rng.microseconds == 100 * 1000 + 123
        assert rng.nanoseconds == 456
        msg = "'Timedelta' object has no attribute '{}'"
        with pytest.raises(AttributeError, match=msg.format("hours")):
            rng.hours
        with pytest.raises(AttributeError, match=msg.format("minutes")):
            rng.minutes
        with pytest.raises(AttributeError, match=msg.format("milliseconds")):
            rng.milliseconds

        # components
        tup = to_timedelta(-1, "us").components
        assert tup.days == -1
        assert tup.hours == 23
        assert tup.minutes == 59
        assert tup.seconds == 59
        assert tup.milliseconds == 999
        assert tup.microseconds == 999
        assert tup.nanoseconds == 0

        # GH 10050
        check(tup.days)
        check(tup.hours)
        check(tup.minutes)
        check(tup.seconds)
        check(tup.milliseconds)
        check(tup.microseconds)
        check(tup.nanoseconds)

        tup = Timedelta("-1 days 1 us").components
        assert tup.days == -2
        assert tup.hours == 23
        assert tup.minutes == 59
        assert tup.seconds == 59
        assert tup.milliseconds == 999
        assert tup.microseconds == 999
        assert tup.nanoseconds == 0
Example #53
0
# %%
np.datetime64("2015-07-04 12:00")

# %%
np.datetime64("2015-07-04 12:59:59.50", "ns")

# %%
date = pd.to_datetime("4th of July, 2015")
date

# %%
date.strftime("%A")

# %%
date + pd.to_timedelta(np.arange(12), "D")

# %%
index = pd.DatetimeIndex(
    ["2014-07-04", "2014-08-04", "2015-07-04", "2015-08-04"])
data = pd.Series([0, 1, 2, 3], index=index)
data

# %%
data["2014-07-04":"2015-07-04"]

# %%
data["2015"]

# %%
dates = pd.to_datetime([
Example #54
0
def test_cf_timedelta_2d():
    timedeltas = ['1D', '2D', '3D']
    units = 'days'
    numbers = np.atleast_2d([1, 2, 3])

    timedeltas = np.atleast_2d(pd.to_timedelta(timedeltas, box=False))
    expected = timedeltas

    actual = coding.times.decode_cf_timedelta(numbers, units)
    assert_array_equal(expected, actual)
    assert expected.dtype == actual.dtype


@pytest.mark.parametrize(['deltas', 'expected'],
                         [(pd.to_timedelta(['1 day', '2 days']), 'days'),
                          (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'),
                          (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'),
                          (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')])
def test_infer_timedelta_units(deltas, expected):
    assert expected == coding.times.infer_timedelta_units(deltas)
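
Note that test_cf_timedelta_2d above passes box=False to pd.to_timedelta; that keyword was deprecated around pandas 0.25 and later removed. On newer pandas the same ndarray result can be obtained by converting the TimedeltaIndex explicitly, as in this small sketch:

import numpy as np
import pandas as pd

# old style:  pd.to_timedelta(['1D', '2D', '3D'], box=False)
timedeltas = np.atleast_2d(pd.to_timedelta(['1D', '2D', '3D']).to_numpy())
assert timedeltas.dtype == np.dtype('timedelta64[ns]')
assert timedeltas.shape == (1, 3)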


@pytest.mark.skipif(not has_cftime_or_netCDF4, reason='cftime not installed')
@pytest.mark.parametrize(
    ['date_args', 'expected'],
    [((1, 2, 3, 4, 5, 6), '0001-02-03 04:05:06.000000'),
     ((10, 2, 3, 4, 5, 6), '0010-02-03 04:05:06.000000'),
     ((100, 2, 3, 4, 5, 6), '0100-02-03 04:05:06.000000'),
     ((1000, 2, 3, 4, 5, 6), '1000-02-03 04:05:06.000000')])
def test_format_cftime_datetime(date_args, expected):
Example #55
0
def filterPrep(df, string, fltr, time):

    colNames = [
        'EVSE ID', 'Port Number', 'Port Type', 'Station Name',
        'Plug In Event Id', 'City', 'Latitude', 'Longitude', 'User ID',
        'Driver Postal Code', 'Start Date', 'End Date',
        'Total Duration (hh:mm:ss)', 'Charging Time (hh:mm:ss)',
        'Energy (kWh)', 'Ended By', 'Start SOC', 'End SOC'
    ]

    df = pd.DataFrame(df, index=np.arange(len(df)), columns=colNames)

    #filter for dfcf
    #df = df.loc[df['Port Type'] == 'DC Fast']

    df['Start Date'] = pd.to_datetime(df['Start Date'])
    df['End Date'] = pd.to_datetime(df['End Date'])
    df['Total Duration (hh:mm:ss)'] = pd.to_timedelta(
        df['Total Duration (hh:mm:ss)'])
    df['Charging Time (hh:mm:ss)'] = pd.to_timedelta(
        df['Charging Time (hh:mm:ss)'])

    #filter by City
    if fltr:
        df = df[df['City'].str.contains(string)]
        print("Filter for: ", string)
    else:
        print("No Filter")

    #clean data
    df = df.loc[df['Energy (kWh)'] > 0]
    df = df.loc[~pd.isnull(df['End Date'])]
    yr = 2017
    df = df.loc[(df['Start Date'] > datetime.date(yr, 12, 1))
                & (df['Start Date'] < datetime.date(yr + 2, 12, 1))]

    #update data types
    df['Duration (h)'] = df['Total Duration (hh:mm:ss)'].apply(
        lambda x: x.seconds / 3600)
    #df['Duration (h)'] = df['Duration (h)'].apply(lambda x: round(x * 4) / 4)
    df['Charging (h)'] = df['Charging Time (hh:mm:ss)'].apply(
        lambda x: x.seconds / 3600)
    #df['Charging (h)'] = df['Charging (h)'].apply(lambda x: round(x * 4) / 4)
    df['NoCharge (h)'] = df['Duration (h)'] - df['Charging (h)']
    df = df.loc[df['Duration (h)'] > 0]

    # Day of year 1 = Jan 1 and day of year 365 (366 in leap years) = Dec 31
    df['DayofYr'] = df['Start Date'].apply(lambda x: x.dayofyear)
    # Monday is 0 and Sunday is 6
    df['DayofWk'] = df['Start Date'].apply(lambda x: x.weekday())
    # Filter for weekdays
    df = df.loc[df['DayofWk'] <= 4]
    #df['isWeekday'] = df['DayofWk'].apply(lambda x: 1 if x <=4 else 0)
    #df = df.loc[df['isWeekday'] == 1]
    df['Year'] = df['Start Date'].apply(lambda x: x.year)
    df['StartHr'] = df['Start Date'].apply(lambda x: x.hour + x.minute / 60)
    df['EndHr'] = df['End Date'].apply(lambda x: x.hour + x.minute / 60)
    if time == 'hour':
        df['StartHr'] = df['StartHr'].apply(lambda x: np.floor(x))
        df['EndHr'] = df['EndHr'].apply(lambda x: np.floor(x))
    elif time == '15min':
        df['StartHr'] = df['StartHr'].apply(lambda x: round(x * 4) / 4)
        df['EndHr'] = df['EndHr'].apply(lambda x: round(x * 4) / 4)
    elif time == '5min':
        # 5-minute resolution: 12 intervals per hour
        df['StartHr'] = df['StartHr'].apply(lambda x: round(x * 12) / 12)
        df['EndHr'] = df['EndHr'].apply(lambda x: round(x * 12) / 12)
    df['AvgPwr'] = df['Energy (kWh)'] / df['Duration (h)']
    df['Date'] = df['Start Date'].apply(
        lambda x: str(x.year) + '-' + str(x.month) + '-' + str(x.day))

    #convert percent to float
    def p2f(s):
        if isinstance(s, str):
            x = s.strip('%')
            x = float(x) / 100
            return x
        else:
            return s

    df['Start SOC'] = df['Start SOC'].apply(lambda x: p2f(x))
    df['End SOC'] = df['End SOC'].apply(lambda x: p2f(x))

    # Sort Dataframe
    df.sort_values(['Start Date'], inplace=True)
    df = df.reset_index(drop=True)

    # Assign Day Count
    df['dayCount'] = 0

    days = list(df['Start Date'].apply(
        lambda x: str(x.year) + '-' + str(x.month) + '-' + str(x.day)))
    daysSet = sorted(set(days), key=days.index)

    c = 0
    for d in daysSet:
        # flag all rows belonging to this calendar day with the running counter
        df.loc[df['Date'] == d, 'dayCount'] = c
        c += 1

    return df
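
A small caveat on the duration handling in filterPrep: Timedelta.seconds (used via x.seconds / 3600 above) only reflects the sub-day part of a duration, so any session longer than 24 hours loses its whole days. The sketch below contrasts it with total_seconds(); the sample strings are made up.

import pandas as pd

durations = pd.to_timedelta(pd.Series(["00:45:00", "02:30:00", "1 days 01:00:00"]))
# .dt.seconds drops whole days (the approach used in filterPrep above)
print(durations.dt.seconds / 3600)          # 0.75, 2.5, 1.0
print(durations.dt.total_seconds() / 3600)  # 0.75, 2.5, 25.0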
Example #56
0
break_df = pd.merge(break_df, summary[['Login Time', 'Agent Name']])
break_df = pd.merge(break_df, agentlist[['Team Name (ID)', 'Agent Name']])

# Drop Code column
break_df = break_df.drop(columns='Code')

'''
Data wrangling and feature engineering
'''
# Remove all teams other than HERE:
break_df = break_df.where(
    break_df['Team Name (ID)'] == 'Here Navigation (205587)')
break_df = break_df.dropna()

# Change Login Time and Duration from str to time
break_df['Login Time'] = pd.to_timedelta(break_df['Login Time'])
break_df['Duration'] = pd.to_timedelta(break_df['Duration in Seconds'],
                                        unit='Seconds')

# Add new column for percent of time logged in spent in break
break_df['Percent'] = break_df['Duration']/break_df['Login Time']

# Create index for Web (Paper) Agents and drop them from dataframe
paper_agents_index = break_df[
    break_df['Agent Name'].str.contains('Paper')].index
break_df.drop(paper_agents_index, inplace=True)

# Clean agent names
break_df['Agent Name'] = break_df['Agent Name'].str.replace(
    '(', '', regex=False).str.replace(')', '', regex=False).str.replace(
    '0', '', regex=False).str.replace('1', '', regex=False).str.replace(
Example #57
0
def create_session_col(
    data: pd.DataFrame,
    user_identifier_cols: List[str],
    time_col: str,
    max_session_time_mins: int,
    max_event_separation_mins: int,
) -> pd.DataFrame:
    """
    Create a "session_ind" column in the dataframe.

    In particular, the session_ind column will be incremented each time a new session
    starts.

    Parameters
    ----------
    data: pd.DataFrame
        This dataframe should contain at least the following columns:
        - time stamp column
        - columns related to user name and/or computer name and/or ip address etc
    user_identifier_cols: List[str]
        Name of the columns which contain username and/or computer name and/or ip address etc.
        Each time the value of one of these columns changes, a new session will be started.
    time_col: str
        Name of the column which contains a time stamp.
        If this column is not already in datetime64[ns, UTC] format, it will be cast to it.
    max_session_time_mins: int
        The maximum length of a session in minutes. If a sequence of events for the same
        user_identifier_cols values exceeds this length, then a new session will be started.
    max_event_separation_mins: int
        The maximum length in minutes between two events in a session. If we have 2 events for
        the same user_identifier_cols values, and if those two events are more than
        `max_event_separation_mins` apart, then a new session will be started.

    Returns
    -------
    pd.DataFrame with an additional "session_ind" column

    """
    max_sep = pd.to_timedelta(max_event_separation_mins, unit="min")
    max_ses = pd.to_timedelta(max_session_time_mins, unit="min")

    df_with_sesind = data.copy()
    if not isinstance(df_with_sesind[time_col].dtype, DatetimeTZDtype):
        df_with_sesind[time_col] = pd.to_datetime(df_with_sesind[time_col])

    final_cols = list(df_with_sesind.columns) + ["session_ind"]

    if len(df_with_sesind) == 0:
        df_with_sesind["session_ind"] = None
        return df_with_sesind

    # Sessionising will not work properly with nans. Temporarily replace nan values with dummy_str.
    for col in user_identifier_cols:
        df_with_sesind[col] = df_with_sesind[col].fillna("dummy_str")

    df_with_sesind = df_with_sesind.sort_values(user_identifier_cols +
                                                [time_col]).reset_index(
                                                    drop=True)

    # initialise first row
    ses_ind = 0
    df_with_sesind.loc[0, "time_diff"] = pd.to_timedelta(0)
    df_with_sesind.loc[0, "cml_time"] = pd.to_timedelta(0)
    df_with_sesind.loc[0, "session_ind"] = ses_ind

    for i in range(1, len(df_with_sesind)):
        cur = df_with_sesind.iloc[i]
        prev = df_with_sesind.iloc[i - 1]

        # if any of the user_identifier_cols values change, a new session should start
        new_flag = False
        for col in user_identifier_cols:
            if cur[col] != prev[col]:
                new_flag = True
                break

        dif = cur[time_col] - prev[time_col]
        cml = prev["cml_time"] + dif
        # if the max session length is exceeded or the max separation between events is exceeded,
        # a new session should start
        if dif > max_sep or cml > max_ses:
            new_flag = True

        if new_flag:
            df_with_sesind.loc[i, "time_diff"] = pd.to_timedelta(0)
            df_with_sesind.loc[i, "cml_time"] = pd.to_timedelta(0)
            ses_ind += 1
            df_with_sesind.loc[i, "session_ind"] = ses_ind
        else:
            df_with_sesind.loc[i, "time_diff"] = dif
            df_with_sesind.loc[i, "cml_time"] = cml
            df_with_sesind.loc[i, "session_ind"] = ses_ind

    # replace dummy_str with nan values
    for col in user_identifier_cols:
        df_with_sesind[col] = df_with_sesind[col].replace("dummy_str", np.nan)

    return df_with_sesind[final_cols]
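
A minimal usage sketch for create_session_col; the event data, column names and thresholds below are made up purely for illustration.

import pandas as pd

events = pd.DataFrame({
    "UserId": ["alice", "alice", "alice", "bob"],
    "TimeGenerated": pd.to_datetime([
        "2021-01-01 09:00:00", "2021-01-01 09:05:00",
        "2021-01-01 11:00:00", "2021-01-01 09:00:00"]),
})

sessions = create_session_col(
    events,
    user_identifier_cols=["UserId"],
    time_col="TimeGenerated",
    max_session_time_mins=20,
    max_event_separation_mins=10,
)
# alice's third event is more than 10 minutes after her second, so it starts a
# new session; bob's event starts another one because the user changes.
print(sessions["session_ind"].tolist())  # [0, 0, 1, 2]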
Example #58
0
# read observations
obs = ps.read_dino('data/B58C0698001_1.csv')

# Create the time series model
ml = ps.Model(obs)

# read weather data
knmi = ps.read.knmi.KnmiStation.fromfile(
    'data/neerslaggeg_HEIBLOEM-L_967-2.txt')
rain = ps.TimeSeries(knmi.data['RD'], settings='prec')

evap = ps.read_knmi('data/etmgeg_380.txt', variables='EV24')
if True:
    # also add 9 hours to the evaporation
    s = evap.series_original
    s.index = s.index + pd.to_timedelta(9, 'h')
    evap.series_original = s

# Create stress
sm = ps.StressModel2(stress=[rain, evap],
                     rfunc=ps.Exponential,
                     name='recharge')
ml.add_stressmodel(sm)

# set the time-offset of the model. This should be done automatically in the future.
ml._set_time_offset()

## Solve
ml.solve(freq='D')
ml.plots.decomposition()
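
The 9-hour adjustment above is just index arithmetic with a timedelta; here is a self-contained sketch of the same shift that does not require the KNMI input files (values are made up).

import pandas as pd

s = pd.Series([1.2, 0.8],
              index=pd.to_datetime(["2019-05-01", "2019-05-02"]))
# shift the whole series 9 hours later, as done for the evaporation stress above
s.index = s.index + pd.to_timedelta(9, unit="h")
print(s.index)
# DatetimeIndex(['2019-05-01 09:00:00', '2019-05-02 09:00:00'], dtype='datetime64[ns]', freq=None)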
Example #59
0
class TestIntervalIndex(Base):
    _holder = IntervalIndex

    def setup_method(self, method):
        self.index = IntervalIndex.from_arrays([0, 1], [1, 2])
        self.index_with_nan = IntervalIndex.from_tuples([(0, 1), np.nan,
                                                         (1, 2)])
        self.indices = dict(intervalIndex=tm.makeIntervalIndex(10))

    def create_index(self, closed='right'):
        return IntervalIndex.from_breaks(range(11), closed=closed)

    def create_index_with_nan(self, closed='right'):
        mask = [True, False] + [True] * 8
        return IntervalIndex.from_arrays(np.where(mask, np.arange(10), np.nan),
                                         np.where(mask, np.arange(1, 11),
                                                  np.nan),
                                         closed=closed)

    @pytest.mark.parametrize('data', [
        Index([0, 1, 2, 3, 4]),
        Index(list('abcde')),
        date_range('2017-01-01', periods=5),
        date_range('2017-01-01', periods=5, tz='US/Eastern'),
        timedelta_range('1 day', periods=5)
    ])
    def test_constructors(self, data, closed, name):
        left, right = data[:-1], data[1:]
        ivs = [Interval(l, r, closed=closed) for l, r in lzip(left, right)]
        expected = IntervalIndex._simple_new(left=left,
                                             right=right,
                                             closed=closed,
                                             name=name)

        # validate expected
        assert expected.closed == closed
        assert expected.name == name
        assert expected.dtype.subtype == data.dtype
        tm.assert_index_equal(expected.left, data[:-1])
        tm.assert_index_equal(expected.right, data[1:])

        # validated constructors
        result = IntervalIndex(ivs, name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_intervals(ivs, name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_breaks(data, closed=closed, name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_arrays(left,
                                           right,
                                           closed=closed,
                                           name=name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_tuples(lzip(left, right),
                                           closed=closed,
                                           name=name)
        tm.assert_index_equal(result, expected)

        result = Index(ivs, name=name)
        assert isinstance(result, IntervalIndex)
        tm.assert_index_equal(result, expected)

        # idempotent
        tm.assert_index_equal(Index(expected), expected)
        tm.assert_index_equal(IntervalIndex(expected), expected)

        result = IntervalIndex.from_intervals(expected)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_intervals(expected.values,
                                              name=expected.name)
        tm.assert_index_equal(result, expected)

        left, right = expected.left, expected.right
        result = IntervalIndex.from_arrays(left,
                                           right,
                                           closed=expected.closed,
                                           name=expected.name)
        tm.assert_index_equal(result, expected)

        result = IntervalIndex.from_tuples(expected.to_tuples(),
                                           closed=expected.closed,
                                           name=expected.name)
        tm.assert_index_equal(result, expected)

        breaks = expected.left.tolist() + [expected.right[-1]]
        result = IntervalIndex.from_breaks(breaks,
                                           closed=expected.closed,
                                           name=expected.name)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize('data', [[np.nan], [np.nan] * 2, [np.nan] * 50])
    def test_constructors_nan(self, closed, data):
        # GH 18421
        expected_values = np.array(data, dtype=object)
        expected_idx = IntervalIndex(data, closed=closed)

        # validate the expected index
        assert expected_idx.closed == closed
        tm.assert_numpy_array_equal(expected_idx.values, expected_values)

        result = IntervalIndex.from_tuples(data, closed=closed)
        tm.assert_index_equal(result, expected_idx)
        tm.assert_numpy_array_equal(result.values, expected_values)

        result = IntervalIndex.from_breaks([np.nan] + data, closed=closed)
        tm.assert_index_equal(result, expected_idx)
        tm.assert_numpy_array_equal(result.values, expected_values)

        result = IntervalIndex.from_arrays(data, data, closed=closed)
        tm.assert_index_equal(result, expected_idx)
        tm.assert_numpy_array_equal(result.values, expected_values)

        if closed == 'right':
            # Can't specify closed for IntervalIndex.from_intervals
            result = IntervalIndex.from_intervals(data)
            tm.assert_index_equal(result, expected_idx)
            tm.assert_numpy_array_equal(result.values, expected_values)

    @pytest.mark.parametrize('data', [[],
                                      np.array([], dtype='int64'),
                                      np.array([], dtype='float64'),
                                      np.array([], dtype=object)])
    def test_constructors_empty(self, data, closed):
        # GH 18421
        expected_dtype = data.dtype if isinstance(data, np.ndarray) else object
        expected_values = np.array([], dtype=object)
        expected_index = IntervalIndex(data, closed=closed)

        # validate the expected index
        assert expected_index.empty
        assert expected_index.closed == closed
        assert expected_index.dtype.subtype == expected_dtype
        tm.assert_numpy_array_equal(expected_index.values, expected_values)

        result = IntervalIndex.from_tuples(data, closed=closed)
        tm.assert_index_equal(result, expected_index)
        tm.assert_numpy_array_equal(result.values, expected_values)

        result = IntervalIndex.from_breaks(data, closed=closed)
        tm.assert_index_equal(result, expected_index)
        tm.assert_numpy_array_equal(result.values, expected_values)

        result = IntervalIndex.from_arrays(data, data, closed=closed)
        tm.assert_index_equal(result, expected_index)
        tm.assert_numpy_array_equal(result.values, expected_values)

        if closed == 'right':
            # Can't specify closed for IntervalIndex.from_intervals
            result = IntervalIndex.from_intervals(data)
            tm.assert_index_equal(result, expected_index)
            tm.assert_numpy_array_equal(result.values, expected_values)

    def test_constructors_errors(self):

        # scalar
        msg = (r'IntervalIndex\(...\) must be called with a collection of '
               'some kind, 5 was passed')
        with tm.assert_raises_regex(TypeError, msg):
            IntervalIndex(5)

        # not an interval
        msg = ("type <(class|type) 'numpy.int64'> with value 0 "
               "is not an interval")
        with tm.assert_raises_regex(TypeError, msg):
            IntervalIndex([0, 1])

        with tm.assert_raises_regex(TypeError, msg):
            IntervalIndex.from_intervals([0, 1])

        # invalid closed
        msg = "invalid options for 'closed': invalid"
        with tm.assert_raises_regex(ValueError, msg):
            IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid')

        # mismatched closed within intervals
        msg = 'intervals must all be closed on the same side'
        with tm.assert_raises_regex(ValueError, msg):
            IntervalIndex.from_intervals(
                [Interval(0, 1), Interval(1, 2, closed='left')])

        with tm.assert_raises_regex(ValueError, msg):
            IntervalIndex([Interval(0, 1), Interval(2, 3, closed='left')])

        with tm.assert_raises_regex(ValueError, msg):
            Index([Interval(0, 1), Interval(2, 3, closed='left')])

        # mismatched closed inferred from intervals vs constructor.
        msg = 'conflicting values for closed'
        with tm.assert_raises_regex(ValueError, msg):
            iv = [Interval(0, 1, closed='both'), Interval(1, 2, closed='both')]
            IntervalIndex(iv, closed='neither')

        # no point in nesting periods in an IntervalIndex
        msg = 'Period dtypes are not supported, use a PeriodIndex instead'
        with tm.assert_raises_regex(ValueError, msg):
            IntervalIndex.from_breaks(pd.period_range('2000-01-01', periods=3))

        # decreasing breaks/arrays
        msg = 'left side of interval must be <= right side'
        with tm.assert_raises_regex(ValueError, msg):
            IntervalIndex.from_breaks(range(10, -1, -1))

        with tm.assert_raises_regex(ValueError, msg):
            IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1))

    @pytest.mark.parametrize('tz_left, tz_right', [(None, 'UTC'),
                                                   ('UTC', None),
                                                   ('UTC', 'US/Eastern')])
    def test_constructors_errors_tz(self, tz_left, tz_right):
        # GH 18537
        left = date_range('2017-01-01', periods=4, tz=tz_left)
        right = date_range('2017-01-02', periods=4, tz=tz_right)

        # don't need to check IntervalIndex(...) or from_intervals, since
        # mixed tz are disallowed at the Interval level
        with pytest.raises(ValueError):
            IntervalIndex.from_arrays(left, right)

        with pytest.raises(ValueError):
            IntervalIndex.from_tuples(lzip(left, right))

        with pytest.raises(ValueError):
            breaks = left.tolist() + [right[-1]]
            IntervalIndex.from_breaks(breaks)

    def test_properties(self, closed):
        index = self.create_index(closed=closed)
        assert len(index) == 10
        assert index.size == 10
        assert index.shape == (10, )

        tm.assert_index_equal(index.left, Index(np.arange(10)))
        tm.assert_index_equal(index.right, Index(np.arange(1, 11)))
        tm.assert_index_equal(index.mid, Index(np.arange(0.5, 10.5)))

        assert index.closed == closed

        ivs = [Interval(l, r, closed) for l, r in zip(range(10), range(1, 11))]
        expected = np.array(ivs, dtype=object)
        tm.assert_numpy_array_equal(np.asarray(index), expected)
        tm.assert_numpy_array_equal(index.values, expected)

        # with nans
        index = self.create_index_with_nan(closed=closed)
        assert len(index) == 10
        assert index.size == 10
        assert index.shape == (10, )

        expected_left = Index([0, np.nan, 2, 3, 4, 5, 6, 7, 8, 9])
        expected_right = expected_left + 1
        expected_mid = expected_left + 0.5
        tm.assert_index_equal(index.left, expected_left)
        tm.assert_index_equal(index.right, expected_right)
        tm.assert_index_equal(index.mid, expected_mid)

        assert index.closed == closed

        ivs = [
            Interval(l, r, closed) if notna(l) else np.nan
            for l, r in zip(expected_left, expected_right)
        ]
        expected = np.array(ivs, dtype=object)
        tm.assert_numpy_array_equal(np.asarray(index), expected)
        tm.assert_numpy_array_equal(index.values, expected)

    @pytest.mark.parametrize(
        'breaks',
        [[1, 1, 2, 5, 15, 53, 217, 1014, 5335, 31240, 201608],
         [-np.inf, -100, -10, 0.5, 1, 1.5, 3.8, 101, 202, np.inf],
         pd.to_datetime(['20170101', '20170202', '20170303', '20170404']),
         pd.to_timedelta(['1ns', '2ms', '3s', '4M', '5H', '6D'])])
    def test_length(self, closed, breaks):
        # GH 18789
        index = IntervalIndex.from_breaks(breaks, closed=closed)
        result = index.length
        expected = Index(iv.length for iv in index)
        tm.assert_index_equal(result, expected)

        # with NA
        index = index.insert(1, np.nan)
        result = index.length
        expected = Index(iv.length if notna(iv) else iv for iv in index)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize('breaks', [
        list('abcdefgh'),
        lzip(range(10), range(1, 11)),
        [['A', 'B'], ['a', 'b'], ['c', 'd'], ['e', 'f']],
        [Interval(0, 1),
         Interval(1, 2),
         Interval(3, 4),
         Interval(4, 5)]
    ])
    def test_length_errors(self, closed, breaks):
        # GH 18789
        index = IntervalIndex.from_breaks(breaks)
        msg = 'IntervalIndex contains Intervals without defined length'
        with tm.assert_raises_regex(TypeError, msg):
            index.length

    def test_with_nans(self, closed):
        index = self.create_index(closed=closed)
        assert not index.hasnans

        result = index.isna()
        expected = np.repeat(False, len(index))
        tm.assert_numpy_array_equal(result, expected)

        result = index.notna()
        expected = np.repeat(True, len(index))
        tm.assert_numpy_array_equal(result, expected)

        index = self.create_index_with_nan(closed=closed)
        assert index.hasnans

        result = index.isna()
        expected = np.array([False, True] + [False] * (len(index) - 2))
        tm.assert_numpy_array_equal(result, expected)

        result = index.notna()
        expected = np.array([True, False] + [True] * (len(index) - 2))
        tm.assert_numpy_array_equal(result, expected)

    def test_copy(self, closed):
        expected = self.create_index(closed=closed)

        result = expected.copy()
        assert result.equals(expected)

        result = expected.copy(deep=True)
        assert result.equals(expected)
        assert result.left is not expected.left

    def test_ensure_copied_data(self, closed):
        # exercise the copy flag in the constructor

        # not copying
        index = self.create_index(closed=closed)
        result = IntervalIndex(index, copy=False)
        tm.assert_numpy_array_equal(index.left.values,
                                    result.left.values,
                                    check_same='same')
        tm.assert_numpy_array_equal(index.right.values,
                                    result.right.values,
                                    check_same='same')

        # by-definition make a copy
        result = IntervalIndex.from_intervals(index.values, copy=False)
        tm.assert_numpy_array_equal(index.left.values,
                                    result.left.values,
                                    check_same='copy')
        tm.assert_numpy_array_equal(index.right.values,
                                    result.right.values,
                                    check_same='copy')

    def test_equals(self, closed):
        expected = IntervalIndex.from_breaks(np.arange(5), closed=closed)
        assert expected.equals(expected)
        assert expected.equals(expected.copy())

        assert not expected.equals(expected.astype(object))
        assert not expected.equals(np.array(expected))
        assert not expected.equals(list(expected))

        assert not expected.equals([1, 2])
        assert not expected.equals(np.array([1, 2]))
        assert not expected.equals(pd.date_range('20130101', periods=2))

        expected_name1 = IntervalIndex.from_breaks(np.arange(5),
                                                   closed=closed,
                                                   name='foo')
        expected_name2 = IntervalIndex.from_breaks(np.arange(5),
                                                   closed=closed,
                                                   name='bar')
        assert expected.equals(expected_name1)
        assert expected_name1.equals(expected_name2)

        for other_closed in {'left', 'right', 'both', 'neither'} - {closed}:
            expected_other_closed = IntervalIndex.from_breaks(
                np.arange(5), closed=other_closed)
            assert not expected.equals(expected_other_closed)

    def test_astype(self, closed):
        idx = self.create_index(closed=closed)
        result = idx.astype(object)
        tm.assert_index_equal(result, Index(idx.values, dtype='object'))
        assert not idx.equals(result)
        assert idx.equals(IntervalIndex.from_intervals(result))

        result = idx.astype('interval')
        tm.assert_index_equal(result, idx)
        assert result.equals(idx)

    @pytest.mark.parametrize('dtype', [
        np.int64, np.float64, 'period[M]', 'timedelta64', 'datetime64[ns]',
        'datetime64[ns, US/Eastern]'
    ])
    def test_astype_errors(self, closed, dtype):
        idx = self.create_index(closed=closed)
        msg = 'Cannot cast IntervalIndex to dtype'
        with tm.assert_raises_regex(TypeError, msg):
            idx.astype(dtype)

    @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series])
    def test_where(self, closed, klass):
        idx = self.create_index(closed=closed)
        cond = [True] * len(idx)
        expected = idx
        result = expected.where(klass(cond))
        tm.assert_index_equal(result, expected)

        cond = [False] + [True] * len(idx[1:])
        expected = IntervalIndex([np.nan] + idx[1:].tolist())
        result = idx.where(klass(cond))
        tm.assert_index_equal(result, expected)

    def test_delete(self, closed):
        expected = IntervalIndex.from_breaks(np.arange(1, 11), closed=closed)
        result = self.create_index(closed=closed).delete(0)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize('data', [
        interval_range(0, periods=10, closed='neither'),
        interval_range(1.7, periods=8, freq=2.5, closed='both'),
        interval_range(Timestamp('20170101'), periods=12, closed='left'),
        interval_range(Timedelta('1 day'), periods=6, closed='right'),
        IntervalIndex.from_tuples([('a', 'd'), ('e', 'j'), ('w', 'z')]),
        IntervalIndex.from_tuples([(1, 2), ('a', 'z'), (3.14, 6.28)])
    ])
    def test_insert(self, data):
        item = data[0]
        idx_item = IntervalIndex([item])

        # start
        expected = idx_item.append(data)
        result = data.insert(0, item)
        tm.assert_index_equal(result, expected)

        # end
        expected = data.append(idx_item)
        result = data.insert(len(data), item)
        tm.assert_index_equal(result, expected)

        # mid
        expected = data[:3].append(idx_item).append(data[3:])
        result = data.insert(3, item)
        tm.assert_index_equal(result, expected)

        # invalid type
        msg = 'can only insert Interval objects and NA into an IntervalIndex'
        with tm.assert_raises_regex(ValueError, msg):
            data.insert(1, 'foo')

        # invalid closed
        msg = 'inserted item must be closed on the same side as the index'
        for closed in {'left', 'right', 'both', 'neither'} - {item.closed}:
            with tm.assert_raises_regex(ValueError, msg):
                bad_item = Interval(item.left, item.right, closed=closed)
                data.insert(1, bad_item)

        # GH 18295 (test missing)
        na_idx = IntervalIndex([np.nan], closed=data.closed)
        for na in (np.nan, pd.NaT, None):
            expected = data[:1].append(na_idx).append(data[1:])
            result = data.insert(1, na)
            tm.assert_index_equal(result, expected)

    def test_take(self, closed):
        index = self.create_index(closed=closed)

        result = index.take(range(10))
        tm.assert_index_equal(result, index)

        result = index.take([0, 0, 1])
        expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2],
                                             closed=closed)
        tm.assert_index_equal(result, expected)

    def test_unique(self, closed):
        # unique non-overlapping
        idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)],
                                        closed=closed)
        assert idx.is_unique

        # unique overlapping - distinct endpoints
        idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)], closed=closed)
        assert idx.is_unique

        # unique overlapping - shared endpoints
        idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)],
                                           closed=closed)
        assert idx.is_unique

        # unique nested
        idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)], closed=closed)
        assert idx.is_unique

        # duplicate
        idx = IntervalIndex.from_tuples([(0, 1), (0, 1), (2, 3)],
                                        closed=closed)
        assert not idx.is_unique

        # unique mixed
        idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b')], closed=closed)
        assert idx.is_unique

        # duplicate mixed
        idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b'), (0, 1)],
                                        closed=closed)
        assert not idx.is_unique

        # empty
        idx = IntervalIndex([], closed=closed)
        assert idx.is_unique

    def test_monotonic(self, closed):
        # increasing non-overlapping
        idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)],
                                        closed=closed)
        assert idx.is_monotonic
        assert idx._is_strictly_monotonic_increasing
        assert not idx.is_monotonic_decreasing
        assert not idx._is_strictly_monotonic_decreasing

        # decreasing non-overlapping
        idx = IntervalIndex.from_tuples([(4, 5), (2, 3), (1, 2)],
                                        closed=closed)
        assert not idx.is_monotonic
        assert not idx._is_strictly_monotonic_increasing
        assert idx.is_monotonic_decreasing
        assert idx._is_strictly_monotonic_decreasing

        # unordered non-overlapping
        idx = IntervalIndex.from_tuples([(0, 1), (4, 5), (2, 3)],
                                        closed=closed)
        assert not idx.is_monotonic
        assert not idx._is_strictly_monotonic_increasing
        assert not idx.is_monotonic_decreasing
        assert not idx._is_strictly_monotonic_decreasing

        # increasing overlapping
        idx = IntervalIndex.from_tuples([(0, 2), (0.5, 2.5), (1, 3)],
                                        closed=closed)
        assert idx.is_monotonic
        assert idx._is_strictly_monotonic_increasing
        assert not idx.is_monotonic_decreasing
        assert not idx._is_strictly_monotonic_decreasing

        # decreasing overlapping
        idx = IntervalIndex.from_tuples([(1, 3), (0.5, 2.5), (0, 2)],
                                        closed=closed)
        assert not idx.is_monotonic
        assert not idx._is_strictly_monotonic_increasing
        assert idx.is_monotonic_decreasing
        assert idx._is_strictly_monotonic_decreasing

        # unordered overlapping
        idx = IntervalIndex.from_tuples([(0.5, 2.5), (0, 2), (1, 3)],
                                        closed=closed)
        assert not idx.is_monotonic
        assert not idx._is_strictly_monotonic_increasing
        assert not idx.is_monotonic_decreasing
        assert not idx._is_strictly_monotonic_decreasing

        # increasing overlapping shared endpoints
        idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)],
                                           closed=closed)
        assert idx.is_monotonic
        assert idx._is_strictly_monotonic_increasing
        assert not idx.is_monotonic_decreasing
        assert not idx._is_strictly_monotonic_decreasing

        # decreasing overlapping shared endpoints
        idx = pd.IntervalIndex.from_tuples([(2, 3), (1, 3), (1, 2)],
                                           closed=closed)
        assert not idx.is_monotonic
        assert not idx._is_strictly_monotonic_increasing
        assert idx.is_monotonic_decreasing
        assert idx._is_strictly_monotonic_decreasing

        # stationary
        idx = IntervalIndex.from_tuples([(0, 1), (0, 1)], closed=closed)
        assert idx.is_monotonic
        assert not idx._is_strictly_monotonic_increasing
        assert idx.is_monotonic_decreasing
        assert not idx._is_strictly_monotonic_decreasing

        # empty
        idx = IntervalIndex([], closed=closed)
        assert idx.is_monotonic
        assert idx._is_strictly_monotonic_increasing
        assert idx.is_monotonic_decreasing
        assert idx._is_strictly_monotonic_decreasing

    @pytest.mark.skip(reason='not a valid repr as we use interval notation')
    def test_repr(self):
        i = IntervalIndex.from_tuples([(0, 1), (1, 2)], closed='right')
        expected = ("IntervalIndex(left=[0, 1],"
                    "\n              right=[1, 2],"
                    "\n              closed='right',"
                    "\n              dtype='interval[int64]')")
        assert repr(i) == expected

        i = IntervalIndex.from_tuples(
            [(Timestamp('20130101'), Timestamp('20130102')),
             (Timestamp('20130102'), Timestamp('20130103'))],
            closed='right')
        expected = ("IntervalIndex(left=['2013-01-01', '2013-01-02'],"
                    "\n              right=['2013-01-02', '2013-01-03'],"
                    "\n              closed='right',"
                    "\n              dtype='interval[datetime64[ns]]')")
        assert repr(i) == expected

    @pytest.mark.skip(reason='not a valid repr as we use interval notation')
    def test_repr_max_seq_item_setting(self):
        super(TestIntervalIndex, self).test_repr_max_seq_item_setting()

    @pytest.mark.skip(reason='not a valid repr as we use interval notation')
    def test_repr_roundtrip(self):
        super(TestIntervalIndex, self).test_repr_roundtrip()

    # TODO: check this behavior is consistent with test_interval_new.py
    def test_get_item(self, closed):
        i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan),
                                      closed=closed)
        assert i[0] == Interval(0.0, 1.0, closed=closed)
        assert i[1] == Interval(1.0, 2.0, closed=closed)
        assert isna(i[2])

        result = i[0:1]
        expected = IntervalIndex.from_arrays((0., ), (1., ), closed=closed)
        tm.assert_index_equal(result, expected)

        result = i[0:2]
        expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed=closed)
        tm.assert_index_equal(result, expected)

        result = i[1:3]
        expected = IntervalIndex.from_arrays((1., np.nan), (2., np.nan),
                                             closed=closed)
        tm.assert_index_equal(result, expected)

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def test_get_loc_value(self):
        pytest.raises(KeyError, self.index.get_loc, 0)
        assert self.index.get_loc(0.5) == 0
        assert self.index.get_loc(1) == 0
        assert self.index.get_loc(1.5) == 1
        assert self.index.get_loc(2) == 1
        pytest.raises(KeyError, self.index.get_loc, -1)
        pytest.raises(KeyError, self.index.get_loc, 3)

        idx = IntervalIndex.from_tuples([(0, 2), (1, 3)])
        assert idx.get_loc(0.5) == 0
        assert idx.get_loc(1) == 0
        tm.assert_numpy_array_equal(idx.get_loc(1.5),
                                    np.array([0, 1], dtype='int64'))
        tm.assert_numpy_array_equal(np.sort(idx.get_loc(2)),
                                    np.array([0, 1], dtype='int64'))
        assert idx.get_loc(3) == 1
        pytest.raises(KeyError, idx.get_loc, 3.5)

        idx = IntervalIndex.from_arrays([0, 2], [1, 3])
        pytest.raises(KeyError, idx.get_loc, 1.5)

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def slice_locs_cases(self, breaks):
        # TODO: same tests for more index types
        index = IntervalIndex.from_breaks([0, 1, 2], closed='right')
        assert index.slice_locs() == (0, 2)
        assert index.slice_locs(0, 1) == (0, 1)
        assert index.slice_locs(1, 1) == (0, 1)
        assert index.slice_locs(0, 2) == (0, 2)
        assert index.slice_locs(0.5, 1.5) == (0, 2)
        assert index.slice_locs(0, 0.5) == (0, 1)
        assert index.slice_locs(start=1) == (0, 2)
        assert index.slice_locs(start=1.2) == (1, 2)
        assert index.slice_locs(end=1) == (0, 1)
        assert index.slice_locs(end=1.1) == (0, 2)
        assert index.slice_locs(end=1.0) == (0, 1)
        assert index.slice_locs(-1, -1) == (0, 0)

        index = IntervalIndex.from_breaks([0, 1, 2], closed='neither')
        assert index.slice_locs(0, 1) == (0, 1)
        assert index.slice_locs(0, 2) == (0, 2)
        assert index.slice_locs(0.5, 1.5) == (0, 2)
        assert index.slice_locs(1, 1) == (1, 1)
        assert index.slice_locs(1, 2) == (1, 2)

        index = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)],
                                          closed='both')
        assert index.slice_locs(1, 1) == (0, 1)
        assert index.slice_locs(1, 2) == (0, 2)

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def test_slice_locs_int64(self):
        self.slice_locs_cases([0, 1, 2])

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def test_slice_locs_float64(self):
        self.slice_locs_cases([0.0, 1.0, 2.0])

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def slice_locs_decreasing_cases(self, tuples):
        index = IntervalIndex.from_tuples(tuples)
        assert index.slice_locs(1.5, 0.5) == (1, 3)
        assert index.slice_locs(2, 0) == (1, 3)
        assert index.slice_locs(2, 1) == (1, 3)
        assert index.slice_locs(3, 1.1) == (0, 3)
        assert index.slice_locs(3, 3) == (0, 2)
        assert index.slice_locs(3.5, 3.3) == (0, 1)
        assert index.slice_locs(1, -3) == (2, 3)

        slice_locs = index.slice_locs(-1, -1)
        assert slice_locs[0] == slice_locs[1]

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def test_slice_locs_decreasing_int64(self):
        self.slice_locs_decreasing_cases([(2, 4), (1, 3), (0, 2)])

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def test_slice_locs_decreasing_float64(self):
        self.slice_locs_decreasing_cases([(2., 4.), (1., 3.), (0., 2.)])

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def test_slice_locs_fails(self):
        index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)])
        with pytest.raises(KeyError):
            index.slice_locs(1, 2)

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def test_get_loc_interval(self):
        assert self.index.get_loc(Interval(0, 1)) == 0
        assert self.index.get_loc(Interval(0, 0.5)) == 0
        assert self.index.get_loc(Interval(0, 1, 'left')) == 0
        pytest.raises(KeyError, self.index.get_loc, Interval(2, 3))
        pytest.raises(KeyError, self.index.get_loc, Interval(-1, 0, 'left'))

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def test_get_indexer(self):
        actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3])
        expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index.get_indexer(self.index)
        expected = np.array([0, 1], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

        index = IntervalIndex.from_breaks([0, 1, 2], closed='left')
        actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3])
        expected = np.array([-1, 0, 0, 1, 1, -1, -1], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index.get_indexer(index[:1])
        expected = np.array([0], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index.get_indexer(index)
        expected = np.array([-1, 1], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def test_get_indexer_subintervals(self):

        # TODO: is this right?
        # return indexers for wholly contained subintervals
        target = IntervalIndex.from_breaks(np.linspace(0, 2, 5))
        actual = self.index.get_indexer(target)
        expected = np.array([0, 0, 1, 1], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

        target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2])
        actual = self.index.get_indexer(target)
        expected = np.array([0, 0, 1, 1], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index.get_indexer(target[[0, -1]])
        expected = np.array([0, 1], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

        target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left')
        actual = self.index.get_indexer(target)
        expected = np.array([0, 0, 0], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def test_contains(self):
        # Only endpoints are valid.
        i = IntervalIndex.from_arrays([0, 1], [1, 2])

        # Invalid
        assert 0 not in i
        assert 1 not in i
        assert 2 not in i

        # Valid
        assert Interval(0, 1) in i
        assert Interval(0, 2) in i
        assert Interval(0, 0.5) in i
        assert Interval(3, 5) not in i
        assert Interval(-1, 0, closed='left') not in i

    # To be removed, replaced by test_interval_new.py (see #16316, #16386)
    def testcontains(self):
        # can select values that are IN the range of a value
        i = IntervalIndex.from_arrays([0, 1], [1, 2])

        assert i.contains(0.1)
        assert i.contains(0.5)
        assert i.contains(1)
        assert i.contains(Interval(0, 1))
        assert i.contains(Interval(0, 2))

        # these completely overlap the index
        assert i.contains(Interval(0, 3))
        assert i.contains(Interval(1, 3))

        assert not i.contains(20)
        assert not i.contains(-20)

    def test_dropna(self, closed):

        expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)],
                                             closed=closed)

        ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan], closed=closed)
        result = ii.dropna()
        tm.assert_index_equal(result, expected)

        ii = IntervalIndex.from_arrays([0, 1, np.nan], [1, 2, np.nan],
                                       closed=closed)
        result = ii.dropna()
        tm.assert_index_equal(result, expected)

    # TODO: check this behavior is consistent with test_interval_new.py
    def test_non_contiguous(self, closed):
        index = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed)
        target = [0.5, 1.5, 2.5]
        actual = index.get_indexer(target)
        expected = np.array([0, -1, 1], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

        assert 1.5 not in index

    def test_union(self, closed):
        index = self.create_index(closed=closed)
        other = IntervalIndex.from_breaks(range(5, 13), closed=closed)

        expected = IntervalIndex.from_breaks(range(13), closed=closed)
        result = index.union(other)
        tm.assert_index_equal(result, expected)

        result = other.union(index)
        tm.assert_index_equal(result, expected)

        tm.assert_index_equal(index.union(index), index)
        tm.assert_index_equal(index.union(index[:1]), index)

    def test_intersection(self, closed):
        index = self.create_index(closed=closed)
        other = IntervalIndex.from_breaks(range(5, 13), closed=closed)

        expected = IntervalIndex.from_breaks(range(5, 11), closed=closed)
        result = index.intersection(other)
        tm.assert_index_equal(result, expected)

        result = other.intersection(index)
        tm.assert_index_equal(result, expected)

        tm.assert_index_equal(index.intersection(index), index)

    def test_difference(self, closed):
        index = self.create_index(closed=closed)
        tm.assert_index_equal(index.difference(index[:1]), index[1:])

    def test_symmetric_difference(self, closed):
        idx = self.create_index(closed=closed)
        result = idx[1:].symmetric_difference(idx[:-1])
        expected = IntervalIndex([idx[0], idx[-1]])
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize(
        'op_name',
        ['union', 'intersection', 'difference', 'symmetric_difference'])
    def test_set_operation_errors(self, closed, op_name):
        index = self.create_index(closed=closed)
        set_op = getattr(index, op_name)

        # test errors
        msg = ('can only do set operations between two IntervalIndex objects '
               'that are closed on the same side')
        with tm.assert_raises_regex(ValueError, msg):
            set_op(Index([1, 2, 3]))

        for other_closed in {'right', 'left', 'both', 'neither'} - {closed}:
            other = self.create_index(closed=other_closed)
            with tm.assert_raises_regex(ValueError, msg):
                set_op(other)

    def test_isin(self, closed):
        index = self.create_index(closed=closed)

        expected = np.array([True] + [False] * (len(index) - 1))
        result = index.isin(index[:1])
        tm.assert_numpy_array_equal(result, expected)

        result = index.isin([index[0]])
        tm.assert_numpy_array_equal(result, expected)

        other = IntervalIndex.from_breaks(np.arange(-2, 10), closed=closed)
        expected = np.array([True] * (len(index) - 1) + [False])
        result = index.isin(other)
        tm.assert_numpy_array_equal(result, expected)

        result = index.isin(other.tolist())
        tm.assert_numpy_array_equal(result, expected)

        for other_closed in {'right', 'left', 'both', 'neither'}:
            other = self.create_index(closed=other_closed)
            expected = np.repeat(closed == other_closed, len(index))
            result = index.isin(other)
            tm.assert_numpy_array_equal(result, expected)

            result = index.isin(other.tolist())
            tm.assert_numpy_array_equal(result, expected)

    def test_comparison(self):
        actual = Interval(0, 1) < self.index
        expected = np.array([False, True])
        tm.assert_numpy_array_equal(actual, expected)

        actual = Interval(0.5, 1.5) < self.index
        expected = np.array([False, True])
        tm.assert_numpy_array_equal(actual, expected)
        actual = self.index > Interval(0.5, 1.5)
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index == self.index
        expected = np.array([True, True])
        tm.assert_numpy_array_equal(actual, expected)
        actual = self.index <= self.index
        tm.assert_numpy_array_equal(actual, expected)
        actual = self.index >= self.index
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index < self.index
        expected = np.array([False, False])
        tm.assert_numpy_array_equal(actual, expected)
        actual = self.index > self.index
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index == IntervalIndex.from_breaks([0, 1, 2], 'left')
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index == self.index.values
        tm.assert_numpy_array_equal(actual, np.array([True, True]))
        actual = self.index.values == self.index
        tm.assert_numpy_array_equal(actual, np.array([True, True]))
        actual = self.index <= self.index.values
        tm.assert_numpy_array_equal(actual, np.array([True, True]))
        actual = self.index != self.index.values
        tm.assert_numpy_array_equal(actual, np.array([False, False]))
        actual = self.index > self.index.values
        tm.assert_numpy_array_equal(actual, np.array([False, False]))
        actual = self.index.values > self.index
        tm.assert_numpy_array_equal(actual, np.array([False, False]))

        # invalid comparisons
        actual = self.index == 0
        tm.assert_numpy_array_equal(actual, np.array([False, False]))
        actual = self.index == self.index.left
        tm.assert_numpy_array_equal(actual, np.array([False, False]))

        with tm.assert_raises_regex(TypeError, 'unorderable types'):
            self.index > 0
        with tm.assert_raises_regex(TypeError, 'unorderable types'):
            self.index <= 0
        with pytest.raises(TypeError):
            self.index > np.arange(2)
        with pytest.raises(ValueError):
            self.index > np.arange(3)

    def test_missing_values(self, closed):
        idx = Index([
            np.nan,
            Interval(0, 1, closed=closed),
            Interval(1, 2, closed=closed)
        ])
        idx2 = IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2],
                                         closed=closed)
        assert idx.equals(idx2)

        with pytest.raises(ValueError):
            IntervalIndex.from_arrays([np.nan, 0, 1],
                                      np.array([0, 1, 2]),
                                      closed=closed)

        tm.assert_numpy_array_equal(isna(idx), np.array([True, False, False]))

    def test_sort_values(self, closed):
        index = self.create_index(closed=closed)

        result = index.sort_values()
        tm.assert_index_equal(result, index)

        result = index.sort_values(ascending=False)
        tm.assert_index_equal(result, index[::-1])

        # with nan
        index = IntervalIndex([Interval(1, 2), np.nan, Interval(0, 1)])

        result = index.sort_values()
        expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan])
        tm.assert_index_equal(result, expected)

        result = index.sort_values(ascending=False)
        expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)])
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize('tz', [None, 'US/Eastern'])
    def test_datetime(self, tz):
        start = Timestamp('2000-01-01', tz=tz)
        dates = date_range(start=start, periods=10)
        index = IntervalIndex.from_breaks(dates)

        # test mid
        start = Timestamp('2000-01-01T12:00', tz=tz)
        expected = date_range(start=start, periods=9)
        tm.assert_index_equal(index.mid, expected)

        # __contains__ doesn't check individual points
        assert Timestamp('2000-01-01', tz=tz) not in index
        assert Timestamp('2000-01-01T12', tz=tz) not in index
        assert Timestamp('2000-01-02', tz=tz) not in index
        iv_true = Interval(Timestamp('2000-01-01T08', tz=tz),
                           Timestamp('2000-01-01T18', tz=tz))
        iv_false = Interval(Timestamp('1999-12-31', tz=tz),
                            Timestamp('2000-01-01', tz=tz))
        assert iv_true in index
        assert iv_false not in index

        # .contains does check individual points
        assert not index.contains(Timestamp('2000-01-01', tz=tz))
        assert index.contains(Timestamp('2000-01-01T12', tz=tz))
        assert index.contains(Timestamp('2000-01-02', tz=tz))
        assert index.contains(iv_true)
        assert not index.contains(iv_false)

        # test get_indexer
        start = Timestamp('1999-12-31T12:00', tz=tz)
        target = date_range(start=start, periods=7, freq='12H')
        actual = index.get_indexer(target)
        expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

        start = Timestamp('2000-01-08T18:00', tz=tz)
        target = date_range(start=start, periods=7, freq='6H')
        actual = index.get_indexer(target)
        expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype='intp')
        tm.assert_numpy_array_equal(actual, expected)

    def test_append(self, closed):

        index1 = IntervalIndex.from_arrays([0, 1], [1, 2], closed=closed)
        index2 = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed)

        result = index1.append(index2)
        expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3],
                                             closed=closed)
        tm.assert_index_equal(result, expected)

        result = index1.append([index1, index2])
        expected = IntervalIndex.from_arrays([0, 1, 0, 1, 1, 2],
                                             [1, 2, 1, 2, 2, 3],
                                             closed=closed)
        tm.assert_index_equal(result, expected)

        msg = ('can only append two IntervalIndex objects that are closed '
               'on the same side')
        for other_closed in {'left', 'right', 'both', 'neither'} - {closed}:
            index_other_closed = IntervalIndex.from_arrays([0, 1], [1, 2],
                                                           closed=other_closed)
            with tm.assert_raises_regex(ValueError, msg):
                index1.append(index_other_closed)

    def test_is_non_overlapping_monotonic(self, closed):
        # Should be True in all cases
        tpls = [(0, 1), (2, 3), (4, 5), (6, 7)]
        idx = IntervalIndex.from_tuples(tpls, closed=closed)
        assert idx.is_non_overlapping_monotonic is True

        idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
        assert idx.is_non_overlapping_monotonic is True

        # Should be False in all cases (overlapping)
        tpls = [(0, 2), (1, 3), (4, 5), (6, 7)]
        idx = IntervalIndex.from_tuples(tpls, closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        # Should be False in all cases (non-monotonic)
        tpls = [(0, 1), (2, 3), (6, 7), (4, 5)]
        idx = IntervalIndex.from_tuples(tpls, closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        # Should be False for closed='both', otherwise True (GH16560)
        if closed == 'both':
            idx = IntervalIndex.from_breaks(range(4), closed=closed)
            assert idx.is_non_overlapping_monotonic is False
        else:
            idx = IntervalIndex.from_breaks(range(4), closed=closed)
            assert idx.is_non_overlapping_monotonic is True

    @pytest.mark.parametrize('tuples', [
        lzip(range(10), range(1, 11)),
        lzip(date_range('20170101', periods=10),
             date_range('20170101', periods=10)),
        lzip(timedelta_range('0 days', periods=10),
             timedelta_range('1 day', periods=10))
    ])
    def test_to_tuples(self, tuples):
        # GH 18756
        idx = IntervalIndex.from_tuples(tuples)
        result = idx.to_tuples()
        expected = Index(_asarray_tuplesafe(tuples))
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize('tuples', [
        lzip(range(10), range(1, 11)) + [np.nan],
        lzip(date_range('20170101', periods=10),
             date_range('20170101', periods=10)) + [np.nan],
        lzip(timedelta_range('0 days', periods=10),
             timedelta_range('1 day', periods=10)) + [np.nan]
    ])
    @pytest.mark.parametrize('na_tuple', [True, False])
    def test_to_tuples_na(self, tuples, na_tuple):
        # GH 18756
        idx = IntervalIndex.from_tuples(tuples)
        result = idx.to_tuples(na_tuple=na_tuple)

        # check the non-NA portion
        expected_notna = Index(_asarray_tuplesafe(tuples[:-1]))
        result_notna = result[:-1]
        tm.assert_index_equal(result_notna, expected_notna)

        # check the NA portion
        result_na = result[-1]
        if na_tuple:
            assert isinstance(result_na, tuple)
            assert len(result_na) == 2
            assert all(isna(x) for x in result_na)
        else:
            assert isna(result_na)
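
The following prepare_timetable example assumes pandas is imported as pd and
calls a format_timedelta helper that is not included in the snippet. Below is a
minimal sketch, under those assumptions, of the import and of what such a
helper might look like (rendering a Timedelta as an 'HH:MM:SS' string); the
helper name comes from the snippet, but its body here is hypothetical.

import pandas as pd


def format_timedelta(td):
    # Hypothetical implementation: render a pandas Timedelta as 'HH:MM:SS'.
    total_seconds = int(td.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds)
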
def prepare_timetable(line_car_timetable_path, station_code_path,
                      time_table_path, out_put_name, out_put_path):
    line_car_timetable = pd.read_excel(line_car_timetable_path).dropna()
    station_code = pd.read_excel(station_code_path).dropna()
    out_put_name = out_put_path + out_put_name + '.csv'
    # Combine the timetable files for different lines
    count = 0
    i = 1  # only referenced by the commented-out old version below
    # ------------------------NEW VERSION----------------
    for ix, lt in line_car_timetable.iterrows():
        # the third column of line_car_timetable holds the timetable file name
        file = time_table_path + '/' + lt[2]

        if count == 0:
            time_table = pd.read_excel(file, dtype={'Train_No': 'str', 'Trs_No': 'str', 'Trip_No': 'str'})
            columns_list = list(time_table.columns)
            for name in columns_list:
                time_table = time_table.rename(columns={name: name.replace(" ", "")})
                
            #print (pd.unique(time_table['Car_Num']))
            time_table[['Train_No', 'Trs_No', 'Trip_No']] = \
                    time_table[['Train_No', 'Trs_No', 'Trip_No']].astype('str')
            # print (time_table.columns)
            count += 1
        else:
            new_table = pd.read_excel(file, dtype={'Train_No': 'str', 'Trs_No': 'str', 'Trip_No': 'str',
                                                   'Arr_From': 'str','Dep_From': 'str','Arr_To': 'str','Dep_To': 'str'})
            # print(new_table.columns)
            columns_list = list(new_table.columns)
            for name in columns_list:
                new_table = new_table.rename(columns={name: name.replace(" ", "")})
            #print (pd.unique(new_table['Car_Num']))
            new_table[['Train_No', 'Trs_No', 'Trip_No']] = \
                new_table[['Train_No', 'Trs_No', 'Trip_No']].astype('str')
            try:
                time_table = pd.concat([time_table, new_table], sort=False)
            except Exception:
                # if concatenation fails, print the offending key columns for debugging
                print(new_table.loc[:, ['Train_No', 'Trs_No', 'Trip_No']])
    # -----------------OLD VERSION---------------
    # with open(out_put_name, 'w', newline='') as f:

        # writer = csv.writer(f, delimiter=',')
        # for ix, lt in line_car_timetable.iterrows():

            # df_table = pd.read_excel(file)
            #
            # if i:
            #     writer.writerow(df_table.columns)
            #     i = 0
            # writer.writerows(df_table.values)

    # line code
    # time_table = pd.read_csv(out_put_name)
    time_list = ['Arr_From', 'Dep_From', 'Arr_To', 'Dep_To']
    for time_name in time_list:
        # keep only the time portion of each cell, parse it as a Timedelta,
        # then wrap services running past 24:00 back into a single day
        time_table[time_name] = time_table[time_name].astype('str')
        time_table[time_name] = time_table[time_name].apply(lambda x: x.split(' ')[-1])
        time_table[time_name] = pd.to_timedelta(time_table[time_name])
        time_table.loc[time_table[time_name] > pd.Timedelta('1 days'), time_name] -= pd.Timedelta('1 days')
        time_table[time_name] = time_table[time_name].apply(format_timedelta)
    #time_table.to_csv('test.csv')
  
    time_table.drop(['Train_Trip', 'Train_KM'], axis=1, inplace=True)
    # print (time_table.columns)
    df_merged = time_table.merge(line_car_timetable, left_on='Line', right_on='LINE', how='left')
    
    # fill missing car counts with the line's default (DEFALT_CARS column)
    df_merged['Car_Num'].fillna(df_merged['DEFALT_CARS'], inplace=True)
    
    # station code for from station
    df_merged = df_merged.merge(station_code, left_on=['From', 'Line'], right_on=['STATION', 'LINE'], how='left')
    df_merged['From_ID'] = df_merged['CODE']
    df_merged.drop(['CODE'], axis=1, inplace=True)
    
    # station code for To station
    df_merged = df_merged.merge(station_code, left_on=['To', 'Line'], right_on=['STATION', 'LINE'], how='left')
    df_merged['To_ID'] = df_merged['CODE']
    
    # direction code (down = 2, up = 1)
    df_merged['Direction_ID'] = df_merged['Direction'].apply(lambda x: 1 if x == 'UP' else 2)
    
    # prepare the final outputs

    output = df_merged.loc[:,['Line', 'LINE_CODE', 'Train_No', 'Trs_No', 'Trip_No', 'Revenue_Y_N', 'Direction', 'Direction_ID',
                       'From', 'From_ID', 'Arr_From', 'Dep_From', 'To', 'To_ID', 'Arr_To', 'Dep_To', 'Car_Num']]


    # keep only revenue trips (Revenue_Y_N == 'Y')
    output = output[output.Revenue_Y_N == 'Y']
    output['Trip_No'] = 'T_' + output['Trip_No']  # prefix so Trip_No is always read as a string
    output = output.drop_duplicates()
    output.to_csv(out_put_name, index=False)
    return output
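
A hedged usage sketch for prepare_timetable; the file names, folder, and
output prefix below are placeholders, not paths from the original project.

if __name__ == '__main__':
    # Placeholder inputs: an Excel sheet listing one timetable workbook per
    # line, a station-code sheet, and a folder holding the per-line files.
    combined = prepare_timetable(
        line_car_timetable_path='line_car_timetable.xlsx',
        station_code_path='station_code.xlsx',
        time_table_path='timetables',
        out_put_name='combined_timetable',
        out_put_path='output/')
    print(combined.head())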